[med-svn] [crossbow] 04/07: New upstream version 1.2.0
Andreas Tille
tille at debian.org
Sun Dec 3 18:02:51 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository crossbow.
commit b74ffb2ce0861b6c66d845ddd6656a3685a668c6
Author: Andreas Tille <tille at debian.org>
Date: Sun Dec 3 18:57:13 2017 +0100
New upstream version 1.2.0
---
AWS.pm | 66 +
Align.pl | 490 +++++
BinSort.pl | 301 +++
CBFinish.pl | 205 ++
CheckDirs.pl | 112 +
Copy.pl | 619 ++++++
Counters.pl | 157 ++
Counters.pm | 164 ++
CrossbowIface.pm | 1495 +++++++++++++
Get.pm | 499 +++++
LICENSES | 12 +
LICENSE_ARTISTIC | 114 +
LICENSE_GPL3 | 674 ++++++
MANUAL | 1708 +++++++++++++++
MANUAL.markdown | 2144 ++++++++++++++++++
Makefile | 167 ++
MapWrap.pl | 296 +++
NEWS | 225 ++
ReduceWrap.pl | 459 ++++
Soapsnp.pl | 396 ++++
TOOLNAME | 1 +
TUTORIAL | 1 +
Tools.pm | 523 +++++
Util.pm | 137 ++
VERSION | 1 +
Wrap.pm | 67 +
cb_emr | 136 ++
cb_hadoop | 214 ++
cb_local | 249 +++
contrib/ForkManager.pm | 412 ++++
contrib/Sort.pm | 1081 ++++++++++
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 23 -
debian/copyright | 10 -
debian/rules | 6 -
debian/source/format | 1 -
debian/upstream/metadata | 11 -
debian/watch | 3 -
doc/images/AWS_cb_e_coli_fillin.png | Bin 0 -> 135083 bytes
doc/images/AWS_cb_mouse17_fillin.png | Bin 0 -> 139496 bytes
doc/images/AWS_console.png | Bin 0 -> 186671 bytes
doc/images/AWS_console_debug.png | Bin 0 -> 159560 bytes
doc/images/AWS_console_upper_left.png | Bin 0 -> 94221 bytes
doc/images/AWS_create_new.png | Bin 0 -> 62026 bytes
doc/manual.html | 3832 +++++++++++++++++++++++++++++++++
doc/strip_markdown.pl | 39 +
doc/style.css | 145 ++
doc/website/faq.shtml | 34 +
doc/website/faq.ssi | 15 +
doc/website/foot.ssi | 9 +
doc/website/index.html | 9 +
doc/website/index.shtml | 68 +
doc/website/manual.shtml | 33 +
doc/website/manual.ssi | 3821 ++++++++++++++++++++++++++++++++
doc/website/news.shtml | 35 +
doc/website/old_news.ssi | 214 ++
doc/website/push.sh | 17 +
doc/website/recent_news.ssi | 150 ++
doc/website/rhsidebar.ssi | 126 ++
doc/website/top.ssi | 12 +
doc/website/tutorial.shtml | 234 ++
doc/website/ui.html | 9 +
emr/util/pull_push.sh | 36 +
emr/util/push.sh | 26 +
example/e_coli/full.manifest | 4 +
example/e_coli/small.manifest | 3 +
example/mouse17/full.manifest | 11 +
example/mouse17/small.manifest | 3 +
reftools/chimp_ensembl.sh | 63 +
reftools/db2ssnp | 314 +++
reftools/db2ssnp_ce4 | 30 +
reftools/db2ssnp_ce6 | 30 +
reftools/db2ssnp_hg19 | 61 +
reftools/db2ssnp_mm9 | 54 +
reftools/db2ssnp_mm9_chr17 | 30 +
reftools/e_coli_jar | 74 +
reftools/ensembl_snps.pl | 234 ++
reftools/fasta_cmap.pl | 70 +
reftools/fly_ensembl.sh | 67 +
reftools/hg19_jar | 79 +
reftools/human_ensembl.sh | 58 +
reftools/mm9_chr17_jar | 51 +
reftools/mm9_jar | 79 +
reftools/mouse_ensembl.sh | 58 +
reftools/sanity_check.pl | 59 +
reftools/shared.sh | 156 ++
reftools/yeast_ensembl.sh | 55 +
soapsnp/COPYING | 674 ++++++
soapsnp/binarize.cc | 71 +
soapsnp/call_genotype.cc | 584 +++++
soapsnp/chromosome.cc | 254 +++
soapsnp/main.cc | 460 ++++
soapsnp/makefile | 30 +
soapsnp/matrix.cc | 67 +
soapsnp/normal_dis.cc | 24 +
soapsnp/prior.cc | 111 +
soapsnp/rank_sum.cc | 128 ++
soapsnp/readme | 233 ++
soapsnp/release | 9 +
soapsnp/soap_snp.h | 793 +++++++
util/build_soapsnp.sh | 174 ++
util/package.bash | 79 +
webui/S3Util.pm | 118 +
webui/crossbow.pl | 977 +++++++++
webui/fill_e_coli_generic.sh | 37 +
webui/fill_mm9chr17_generic.sh | 38 +
webui/push.sh | 30 +
webui/push_test.sh | 33 +
webui/setup.sh | 20 +
webui/wait.gif | Bin 0 -> 4178 bytes
111 files changed, 28576 insertions(+), 60 deletions(-)
diff --git a/AWS.pm b/AWS.pm
new file mode 100644
index 0000000..2aea33e
--- /dev/null
+++ b/AWS.pm
@@ -0,0 +1,66 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: 2/14/2010
+#
+# Routines for locating the AWS access key and secret key.
+#
+
+package AWS;
+use strict;
+use warnings;
+
+our $accessKey = "";
+our $secretKey = "";
+
+##
+# Make sure $accessKey and $secretKey are set: fall back first to the
+# AWS_* entries of $env, then to $HADOOP_HOME/conf/*.xml; die if unset.
+#
+sub ensureKeys($$$) {
+    my ($hadoop, $hadoop_arg, $env) = @_;
+    my $hadoopHome = $env->{HADOOP_HOME};
+    if(!defined($hadoopHome)) {
+        $hadoop = $hadoop_arg if $hadoop_arg ne "";
+        if(defined($hadoop) && -x $hadoop) { # derive the home dir from the hadoop binary
+            $hadoopHome = `dirname $hadoop`;
+            chomp($hadoopHome);
+            $hadoopHome .= "/..";
+        }
+    }
+    if($accessKey eq "") {
+        if(defined($env->{AWS_ACCESS_KEY_ID})) {
+            $accessKey = $env->{AWS_ACCESS_KEY_ID};
+        } elsif(defined($hadoopHome)) {
+            $accessKey = `grep fs.s3n.awsAccessKeyId $hadoopHome/conf/*.xml | sed 's/.*<value>//' | sed 's/<\\/value>.*//'`;
+            $accessKey =~ s/\s.*$//s; # Keep the first token only; /s lets .* swallow any extra lines
+            if($accessKey eq "") {
+                print STDERR "Couldn't get access key from $hadoopHome/conf/*.xml\n";
+            }
+        }
+        if($accessKey eq "") {
+            die "--accesskey was not specified, nor could the access ".
+                "key be retrived from an environment variable or from ".
+                "the \$HADOOP_HOME/conf directory\n";
+        }
+    }
+    if($secretKey eq "") {
+        if(defined($env->{AWS_SECRET_ACCESS_KEY})) {
+            $secretKey = $env->{AWS_SECRET_ACCESS_KEY};
+        } elsif(defined($hadoopHome)) {
+            $secretKey = `grep fs.s3n.awsSecretAccessKey $hadoopHome/conf/*.xml | sed 's/.*<value>//' | sed 's/<\\/value>.*//'`;
+            $secretKey =~ s/\s.*$//s; # Keep the first token only; /s lets .* swallow any extra lines
+            if($secretKey eq "") {
+                print STDERR "Couldn't get secret key from $hadoopHome/conf/*.xml\n";
+            }
+        }
+        if($secretKey eq "") {
+            die "--secretkey was not specified, nor could the secret ".
+                "key be retrived from an environment variable or from ".
+                "the \$HADOOP_HOME/conf directory\n";
+        }
+    }
+}
+
+1;
diff --git a/Align.pl b/Align.pl
new file mode 100755
index 0000000..497bf24
--- /dev/null
+++ b/Align.pl
@@ -0,0 +1,490 @@
+#!/usr/bin/perl -w
+
+##
+# Align.pl
+#
+# Align reads using Bowtie. Fetch reference jar (ensuring mutual
+# exclusion among mappers) if necessary.
+#
+# Author: Ben Langmead
+# Date: February 11, 2010
+#
+
+use strict;
+use warnings;
+use 5.004;
+use Carp;
+use Getopt::Long;
+use IO::File;
+use FindBin qw($Bin);
+use lib $Bin;
+use Get;
+use Counters;
+use Util;
+use Tools;
+use AWS;
+use File::Path qw(mkpath);
+use List::Util qw[min max];
+
+{
+ # Force stderr to flush immediately
+ my $ofh = select STDERR;
+ $| = 1;
+ select $ofh;
+}
+
+my @counterUpdates = ();
+
+sub counter($) {
+ my $c = shift;
+ defined($c) || croak("Undefined counter update");
+ print STDERR "reporter:counter:$c\n";
+}
+
+sub flushCounters() {
+ for my $c (@counterUpdates) { counter($c); }
+ @counterUpdates = ();
+}
+
+my $ref = "";
+my $dest_dir = "";
+my $sam_passthru = 0;
+my $partlen = 0;
+my $discardReads = 0;
+my $indexLocal = "";
+my $qual = "phred33";
+my $truncate = 0;
+my $discardSmall = 0;
+my $discardMate = 0;
+my $straightThrough = 0;
+my $test = 0;
+my $cntfn = "";
+
+sub dieusage {
+ my $msg = shift;
+ my $exitlevel = shift;
+ $exitlevel = $exitlevel || 1;
+ print STDERR "$msg\n";
+ exit $exitlevel;
+}
+
+sub msg($) {
+ my $m = shift;
+ defined($m) || croak("Undefined message");
+ $m =~ s/[\r\n]*$//;
+ print STDERR "Align.pl: $m\n";
+}
+
+Tools::initTools();
+my %env = %ENV;
+
+GetOptions (
+ "bowtie:s" => \$Tools::bowtie_arg,
+ "s3cmd:s" => \$Tools::s3cmd_arg,
+ "s3cfg:s" => \$Tools::s3cfg,
+ "jar:s" => \$Tools::jar_arg,
+ "accessid:s" => \$AWS::accessKey,
+ "secretid:s" => \$AWS::secretKey,
+ "hadoop:s" => \$Tools::hadoop_arg,
+ "wget:s" => \$Tools::wget_arg,
+ "refjar:s" => \$ref,
+ "partlen:i" => \$partlen,
+ "index-local:s" => \$indexLocal,
+ "discard-reads:f" => \$discardReads,
+ "qual:s" => \$qual,
+ "sampass" => \$sam_passthru,
+ "truncate:i" => \$truncate,
+ "discard-mate:i" => \$discardMate,
+ "discard-small" => \$discardSmall,
+ "straight-through"=> \$straightThrough,
+ "counters:s" => \$cntfn,
+ "destdir:s" => \$dest_dir,
+ "test" => \$test) || dieusage("Bad option", 1);
+
+Tools::purgeEnv();
+
+msg("s3cmd: found: $Tools::s3cmd, given: $Tools::s3cmd_arg");
+msg("jar: found: $Tools::jar, given: $Tools::jar_arg");
+msg("hadoop: found: $Tools::hadoop, given: $Tools::hadoop_arg");
+msg("wget: found: $Tools::wget, given: $Tools::wget_arg");
+msg("s3cfg: $Tools::s3cfg");
+msg("bowtie: found: $Tools::bowtie, given: $Tools::bowtie_arg");
+msg("partition len: $partlen");
+msg("ref: $ref");
+msg("quality: $qual");
+msg("truncate at: $truncate");
+msg("discard mate: $discardMate");
+msg("discard reads < truncate len: $discardSmall");
+msg("SAM passthrough: $sam_passthru");
+msg("Straight through: $straightThrough");
+msg("local index path: $indexLocal");
+msg("counters: $cntfn");
+msg("dest dir: $dest_dir");
+msg("bowtie args: @ARGV");
+msg("ls -al");
+msg(`ls -al`);
+
+if($sam_passthru) {
+    my $alsUnpaired = 0;    # unpaired alignments not yet reported to the counter
+    my $alsPaired = 0;
+    my $alsUnpairedTot = 0; # every unpaired alignment passed through so far
+    my $line = "";
+    my $skipped = 0;
+    my $downloaded = 0;
+    while(<STDIN>) {
+        next if /^\s*FAKE\s*$/;
+        next if /^\s*$/;
+        $downloaded++;
+        if($discardReads != 0 && rand() < $discardReads) {
+            $skipped++; next;
+        }
+        # Tokenize preprocessed read line
+        chomp;
+        my @ts = split(/\t/, $_);
+        $#ts == 2 || $#ts == 4 || die "Expected either 3 or 5 tokens, got:\n$_\n";
+        # Tokenize read name
+        my @ntok = split(/;/, $ts[0]);
+        for(my $i = 0; $i <= $#ntok; $i++) {
+            if($ntok[$i] =~ /^SM:/) {
+                # Tokenize SAM alignment details
+                my @stok = split(/,/, substr($ntok[$i], 3));
+                $#stok == 4 || die "Expected 5 SAM alignment tokens, got:\n$_\n";
+                my ($chr, $pos, $fw, $mapq, $cigar) = @stok;
+                my $oms = ($mapq == 0 ? 1 : 0);
+                length($cigar) > 0 || die "Expected CIGAR string of non-zero length:\n$_\n";
+                my $part = $pos / $partlen; # NOTE: dies on division by zero unless -partlen > 0
+                $fw eq "0" || $fw eq "1" || die "Bad SM:fw field: $fw\n$_\n";
+                $fw = ($fw ? "+" : "-");
+                my $mate = 0; # TODO: be smart about mates
+                #$line = sprintf("%s\t%010d\t%010d\t$fw\t%s\t%s\t$oms\t$cigar\t$mate\t", $chr, $part, $pos, $ts[1], $ts[2]);
+                # TODO: be smart about propagating some read and quality
+                # information forward
+                my $len = length($ts[0]);
+                $line = sprintf("%s\t%010d\t%010d\t$fw\t$len\t$oms\t$cigar\t$mate\t", $chr, $part, $pos);
+            }
+        }
+        $line =~ /[\n\r]/ && die "Bad whitespace in line:\n$line\n";
+        my @ls = split(/\t/, $line, -1); # LIMIT -1 keeps the empty trailing Lab field
+        # what <- list("", # Chr
+        # integer(0), # Part
+        # integer(0), # ChrOff
+        # "", # Orient
+        # integer(0), # SeqLen
+        # integer(0), # Oms
+        # "", # CIGAR
+        # "", # Mate
+        # "") # Lab
+        $#ls == 8 || die "Expected 9 fields in SAM passthroughput output:\n$line\n";
+        $ls[1] == int($ls[1]) || die "Expected 2nd field to be numeric:\n$line\n";
+        $ls[2] == int($ls[2]) || die "Expected 3rd field to be numeric:\n$line\n";
+        $ls[4] == int($ls[4]) || die "Expected 5th field to be numeric:\n$line\n";
+        $ls[5] == int($ls[5]) || die "Expected 6th field to be numeric:\n$line\n";
+        print "$line\n";
+        $alsUnpairedTot++;
+        if(++$alsUnpaired >= 10000) {
+            counter("Bowtie,Alignments (unpaired) passed through from SAM,".$alsUnpaired);
+            $alsUnpaired = 0;
+        }
+    }
+    counter("Bowtie,Alignments (unpaired) passed through from SAM,".$alsUnpaired); # remainder only; full batches were reported in the loop
+    counter("Bowtie,Alignments (paired) passed through from SAM,".$alsPaired);
+    counter("Bowtie,Alignments passed through from SAM,".($alsUnpairedTot+$alsPaired)); # reported once, so use the running total
+    counter("Bowtie,Reads skipped,".$skipped);
+    counter("Bowtie,Reads downloaded,".$downloaded);
+    # Note: SAM passthrough mode doesn't require that -refjar, -jar,
+    # -destdir, bowtie, etc be specified
+    exit 0;
+}
+
+$ref ne "" || $indexLocal ne "" || $test ||
+ die "Neither -ref nor -index-local specified; must specify one\n";
+$dest_dir = "." if $dest_dir eq "";
+
+mkpath($dest_dir);
+(-d $dest_dir) || die "-destdir $dest_dir does not exist or isn't a directory, and could not be created\n";
+
+my $bowtie = Tools::bowtie();
+
+##
+# Run bowtie, ensuring that index exists first.
+#
+my $jarEnsured = 0;
+sub runBowtie($$$) {
+ my ($fn, $efn, $env) = @_;
+ my $args = join(" ", @ARGV);
+ msg(" ...ensuring reference jar is installed first");
+ my $index_base;
+ if($indexLocal ne "") {
+ $index_base = $indexLocal;
+ } else {
+ if($ref ne "" && !$jarEnsured) {
+ Get::ensureFetched($ref, $dest_dir, \@counterUpdates, undef, undef, $env);
+ flushCounters();
+ $jarEnsured = 1;
+ }
+ # Find all index file sets
+ my @indexes = <$dest_dir/index/*.rev.1.ebwt>;
+ for(my $i = 0; $i < scalar(@indexes); $i++) {
+ # convert to basename
+ $indexes[$i] =~ s/\.rev\.1\.ebwt$//;
+ }
+ if(scalar(@indexes) > 1) {
+ # There was more than one index; pick the first one
+ msg("Warning: More than one index base: @indexes");
+ msg("ls -al $dest_dir");
+ msg(`ls -al $dest_dir\n`);
+ msg("ls -al $dest_dir/index");
+ msg(`ls -al $dest_dir/index\n`);
+ msg("Using $indexes[0]");
+ } elsif(scalar(@indexes) == 0) {
+ # There were no indexes; abort
+ msg("Could not find any files ending in .rev.1.ebwt in $dest_dir/index:");
+ msg("ls -al $dest_dir");
+ msg(`ls -al $dest_dir\n`);
+ msg("ls -al $dest_dir/index");
+ msg(`ls -al $dest_dir/index\n`);
+ die;
+ }
+ $index_base = "$indexes[0]";
+ }
+ # Check that all index files are present
+ for my $i ("1", "2", "3", "4", "rev.1", "rev.2") {
+ my $f = "$index_base.$i.ebwt";
+ (-f $f) || die "Did not successfully install index file $f\n";
+ }
+ (-s "$index_base.1.ebwt" == -s "$index_base.rev.1.ebwt") ||
+ die "Mismatched file sizes for .1.ebwt and rev.1.ebwt\n";
+ (-s "$index_base.2.ebwt" == -s "$index_base.rev.2.ebwt") ||
+ die "Mismatched file sizes for .2.ebwt and rev.2.ebwt\n";
+ # Set up bowtie invocation
+ my $cmd = "$bowtie $args --12 $fn $index_base 2>$efn";
+ msg("Running: $cmd");
+ return $cmd;
+}
+
+my $sthruCmd = ""; # command for bowtie in straight-through mode
+my $efn = ".tmp.Align.pl.$$.err"; # bowtie stderr dump
+if($straightThrough) {
+    $sthruCmd = runBowtie("-", $efn, \%env);
+    open(OUT, "| $sthruCmd") || die "Could not open '| $sthruCmd' for writing: $!"; # parens so || tests open(), not the string
+} else {
+    open(OUT, ">.tmp.$$") || die "Could not open .tmp.$$ for writing: $!"; # parens so || tests open(), not the string
+}
+my $records = 0;
+my $downloaded = 0;
+my $skipped = 0;
+my $truncSkipped = 0;
+my $pass = 0;
+my $unpairedPass = 0;
+my $pairedPass = 0;
+my $matesSkipped = 0;
+my $truncated = 0;
+
+##
+# q is a decoded solexa qual; return a decoded phred qual.
+#
+my @sol2phredMap = (
+ 0, 1, 1, 1, 1, 1, 1, 2, 2, 3, # -10
+ 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, # 0
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, # 10
+);
+sub sol2phred($) {
+ my $q = shift;
+ return 0 if $q < -10;
+ return $sol2phredMap[$q + 10] if $q < 20;
+ return $q;
+}
+
+##
+# Argument is a quality string. Update counters and convert to phred+33.
+#
+my %qualCnts = ();
+my %rawQualCnts = ();
+my $qualOff = $qual =~ /33$/ ? 33 : 64;
+my $qualSol = $qual =~ /^solexa/i;
+sub processQuals($) {
+ my $qs = shift;
+ my $ret = "";
+ for(my $i = 0; $i < length($qs); $i++) {
+ my $q = ord(substr($qs, $i, 1));
+ $rawQualCnts{int($q/10)}++;
+ $q -= $qualOff;
+ $q = sol2phred($q) if $qualSol;
+ $qualCnts{int($q/10)}++;
+ $ret .= chr($q+33);
+ }
+ return $ret;
+}
+
+if($test) {
+ $qualOff = 33;
+ $qualSol = 0;
+ my $q = processQuals("I");
+ $q eq "I" || die;
+ $qualOff = 64;
+ $qualSol = 1;
+ $q = processQuals('6789:;<=>?'.'@ABCDEFGHI'.'JKLMNOPQRS');
+ $q eq q|!""""""##$$%%&&'()*++,-./01234| || die;
+ $qualSol = 0;
+ $q = processQuals('ABCDEFGHIJ');
+ $q eq q|"#$%&'()*+| || die;
+ msg("PASSED all tests");
+ %qualCnts = ();
+ %rawQualCnts = ();
+}
+
+# Shunt all of the input to a file
+my %lens = ();
+my $first = 1;
+my $lastLine = "";
+while(<STDIN>) {
+ next if /^\s*FAKE\s*$/;
+ next if /^\s*$/;
+ msg("Read first line of stdin:\n$_") if $first;
+ $first = 0;
+ $lastLine = $_;
+ chomp;
+ $downloaded++;
+ if($discardReads != 0 && rand() < $discardReads) {
+ $skipped++; next;
+ }
+ my @altok = split(/\t/);
+ scalar(@altok) == 3 || scalar(@altok) == 5 || die "Bad number of read tokens ; expected 3 or 5:\n$_\n";
+ my $pe = (scalar(@altok) == 5);
+ my $len1 = length($altok[1]);
+ my $len2 = 0;
+ if($pe) {
+ if($discardMate > 0) {
+ if($discardMate == 1) {
+ # First mate is discarded, second is promoted to the
+ # first slot
+ $altok[1] = $altok[3];
+ $altok[2] = $altok[4];
+ $len1 = length($altok[1]);
+ } else {
+ # Second mate is discarded by virtue of $pe = 0
+ }
+ $matesSkipped++;
+ $pe = 0;
+ # $len2 remains =0
+ } else {
+ # Mate is intact; tally its length
+ $len2 = length($altok[3]); $lens{$len2}++;
+ }
+ }
+ $lens{$len1}++;
+ # Is it so small that we should discard it?
+ if($truncate > 0 && $discardSmall &&
+ ($len1 < $truncate || ($len2 > 0 && $len2 < $truncate)))
+ {
+ # Yes, discard
+ $truncSkipped++;
+ next;
+ }
+ # Print alignment after truncating it
+ my $nlen1 = $len1;
+ $nlen1 = min($truncate, $len1) if $truncate > 0;
+ $truncated++ if ($nlen1 < $len1);
+ if($pe) {
+ my $nlen2 = $len2;
+ $nlen2 = min($truncate, $len2) if $truncate > 0;
+ $truncated++ if ($nlen2 < $len2);
+ my ($nm, $s1, $q1, $s2, $q2) = (@altok);
+ ($q1, $q2) = (processQuals($q1), processQuals($q2));
+ $pass++; $pairedPass++;
+ print OUT "r\t".
+ substr($s1, 0, $nlen1)."\t".
+ substr($q1, 0, $nlen1)."\t".
+ substr($s2, 0, $nlen2)."\t".
+ substr($q2, 0, $nlen2)."\n";
+ } else {
+ $pass++; $unpairedPass++;
+ my ($nm, $s1, $q1) = (@altok);
+ $q1 = processQuals($q1);
+ print OUT "r\t".
+ substr($s1, 0, $nlen1)."\t".
+ substr($q1, 0, $nlen1)."\n";
+ }
+ $records++;
+}
+msg("Read last line of stdin:\n$lastLine");
+msg("$records reads downloaded\n");
+counter("Bowtie,Reads downloaded,$downloaded");
+counter("Bowtie,Reads (all) passing filters,$pass");
+counter("Bowtie,Reads (unpaired) passing filters,$unpairedPass");
+counter("Bowtie,Reads (paired) passing filters,$pairedPass");
+counter("Bowtie,Reads skipped due to -discard-reads,$skipped");
+counter("Bowtie,Reads skipped due to -truncate-discard,$truncSkipped");
+counter("Bowtie,Mates skipped due to -discard-mate,$matesSkipped");
+counter("Bowtie,Reads (mates) truncated due to -truncate*,$truncated");
+for my $len (keys %lens) {
+ counter("Bowtie,Reads of length $len,$lens{$len}");
+}
+for my $qual (keys %rawQualCnts) {
+ counter("Bowtie,Occurrences of raw quality value [".($qual*10).":".($qual*10+10)."),$rawQualCnts{$qual}");
+}
+for my $qual (keys %qualCnts) {
+ counter("Bowtie,Occurrences of phred-33 quality value [".($qual*10).":".($qual*10+10)."),$qualCnts{$qual}");
+}
+close(OUT);
+if($straightThrough) {
+ if($? != 0) {
+ msg("Fatal error: Bowtie exited with level $?:");
+ open(EFN, "$efn") || die "Could not open '$efn' for reading\n";
+ while(<EFN>) { msg($_); }
+ close(EFN);
+ die;
+ }
+}
+msg("$downloaded reads downloaded");
+
+if($records > 0 && !$straightThrough) {
+    # "Reads downloaded" was already reported after the input loop; repeating it here would double-count
+    # Print a bit of the reads file, for sanity-checking purposes
+    my $fn = ".tmp.$$";
+    msg("head -4 $fn:");
+    msg(`head -4 $fn`);
+    msg("tail -4 $fn:");
+    msg(`tail -4 $fn`);
+    my $cmd = runBowtie($fn, $efn, \%env);
+    my $ret = Util::run($cmd);
+    if($ret != 0) {
+        msg("Fatal error: Bowtie exited with level $?:");
+        open(EFN, "$efn") || die "Could not open '$efn' for reading\n";
+        while(<EFN>) { msg($_); } # relay bowtie's stderr dump before dying
+        close(EFN);
+        die;
+    }
+    unlink($fn);
+}
+if($records > 0) {
+    open(SUMM, $efn) || die "Could not open $efn for reading: $!\n"; # parens so || tests open(), not $efn
+    while(<SUMM>) {
+        if(/reads with at least one reported alignment/) {
+            /: ([0-9]+)/;
+            my $num = $1;
+            $num == int($num) || die "Expected number: $num\n$_";
+            counter("Bowtie,Reads with at least 1 reported alignment,$num");
+        } elsif(/reads that failed to align/) {
+            /: ([0-9]+)/;
+            my $num = $1;
+            $num == int($num) || die "Expected number: $num\n$_";
+            counter("Bowtie,Reads that failed to align,$num");
+        } elsif(/reads with alignments suppressed due to -m/) {
+            /: ([0-9]+)/;
+            my $num = $1;
+            $num == int($num) || die "Expected number: $num\n$_";
+            counter("Bowtie,Reads with alignments suppressed due to -m,$num");
+        } elsif(/reads with alignments sampled due to -M/) {
+            /: ([0-9]+)/;
+            my $num = $1;
+            $num == int($num) || die "Expected number: $num\n$_";
+            counter("Bowtie,Reads with alignments sampled due to -M,$num");
+        }
+    }
+    close(SUMM);
+    unlink($efn); # the stderr dump is no longer needed once its summary is parsed
+    msg("$records reads aligned");
+}
+print "FAKE\n";
+counter("Bowtie,Reads given to Bowtie,$records");
diff --git a/BinSort.pl b/BinSort.pl
new file mode 100755
index 0000000..09f7eb2
--- /dev/null
+++ b/BinSort.pl
@@ -0,0 +1,301 @@
+#!/usr/bin/perl
+
+##
+# BinSort.pl
+#
+# A utility for binning and sorting input data in parallel. Input
+# files and directories are given with -input or as extra arguments.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use FindBin qw($Bin);
+use lib $Bin;
+use lib "$Bin/contrib";
+use Cwd 'abs_path';
+use ForkManager;
+use IO::File;
+use List::Util qw[min max];
+
+my $input = "";
+my $output = "";
+my $intermediate = "";
+my $prefix = "";
+my $suffix = "";
+my $delim = "\t";
+my $sortSize = "";
+my $cores = 0;
+my $sortArgs = "";
+my $verbose = 0;
+my $force = 0;
+my $keep = 0;
+my $excludeUnmapped = 0;
+
+my @bin = ();
+my $binmapStr = "";
+my %binmap = ();
+
+GetOptions (
+ "input:s" => \$input,
+ "intermediate:s" => \$intermediate,
+ "output:s" => \$output,
+ "bin:s" => \@bin,
+ "sort:s" => \$sortArgs,
+ "delim:s" => \$delim,
+ "S:i" => \$sortSize,
+ "size:i" => \$sortSize,
+ "cores:i" => \$cores,
+ "bin-map:s" => \$binmapStr,
+ "binmap:s" => \$binmapStr,
+ "exclude-unmapped" => \$excludeUnmapped,
+ "prefix:s" => \$prefix,
+ "suffix:s" => \$suffix,
+ "keep-all" => \$keep,
+ "verbose" => \$verbose,
+ "force" => \$force) || die "Bad option\n";
+
+if(scalar(@ARGV) > 0) {
+ $input .= "," if $input ne "";
+ $input .= join(",", @ARGV);
+}
+
+# An empty -delim falls back to the default tab delimiter
+$delim = "\t" if $delim eq "";
+
+print STDERR "# parallel binners/sorters: $cores\n";
+print STDERR "Input: $input\n";
+print STDERR "Output: $output\n";
+print STDERR "Sort memory footprint (total): $sortSize\n";
+print STDERR "Output prefix/suffix: $prefix/$suffix\n";
+print STDERR "Delimiter (ascii): ".ord($delim)."\n";
+print STDERR "Options: [ ";
+print STDERR "-keep-all " if $keep;
+print STDERR "-force " if $force;
+print STDERR "]\n";
+
+sub checkDir($) {
+ my $dir = shift;
+ if(-d $dir) {
+ die "Output directory $dir already exists" unless $force;
+ if($force) {
+ print STDERR "Removing directory $dir due to -force\n";
+ system("rm -rf $dir >/dev/null 2>/dev/null");
+ -d $dir && die "Could not remove directory $dir";
+ }
+ }
+ system("mkdir -p $dir >/dev/null 2>/dev/null");
+ -d $dir || die "Could not create new directory $dir";
+}
+checkDir("$output");
+$intermediate = "$output.pre" if $intermediate eq "";
+my $binsOut = "$intermediate/bins";
+my $binsErr = "$intermediate/bins.err";
+checkDir($binsOut);
+checkDir($binsErr);
+$output = abs_path($output);
+
+##
+# Make a string into an acceptable filename.
+#
+sub fsSanitize($) {
+ my $f = shift;
+ my $ret = "";
+ for(my $i = 0; $i < length($f); $i++) {
+ my $c = substr($f, $i, 1);
+ if($c =~ /[.,#A-Za-z01-9_-]/) {
+ $ret .= $c;
+ } else {
+ $ret .= "_";
+ }
+ }
+ return $ret;
+}
+
+if($binmapStr ne "") {
+ open (BINMAP, $binmapStr) || die "Could not open $binmapStr for reading\n";
+ print "Bin map = {\n" if $verbose;
+ while(<BINMAP>) {
+ chomp;
+ my @s = split /\t/;
+ scalar(@s) == 2 || die "Expected key-tab-value, got:\n$_\n";
+ my ($k, $v) = @s;
+ defined($binmap{$k}) && print "WARNING: Key $k is mapped more than once\n";
+ $binmap{$k} = fsSanitize($v);
+ print " $k => $binmap{$k}\n" if $verbose;
+ }
+ print "}\n" if $verbose;
+ close(BINMAP);
+}
+
+print "Starting fork manager\n" if $verbose;
+my $pm = new Parallel::ForkManager($cores);
+
+# All bins must be >= 1
+for my $b (@bin) { $b > 0 || die "A -bin was $b, but must be > 0\n"; }
+
+# Setup a callback for when a child finishes up so we can
+# get its exit code
+my $childFailed = 0;
+my $childFailedPid = 0;
+$pm->run_on_finish(
+ sub {
+ my ($pid, $exit_code, $ident) = @_;
+ if($exit_code != 0) {
+ $childFailed = $exit_code;
+ $childFailedPid = $pid;
+ }
+ }
+);
+
+# First, determine the number of input files
+my $ninputs = 0;
+for my $inp (split(/,/, $input)) {
+    $inp = abs_path($inp) if $inp ne "-"; # "-" is accepted by the binning loop too, so don't resolve it as a path
+    -d $inp || -f $inp || $inp eq "-" || die "No such input file or directory as \"$inp\"\n";
+    my @fs = ();
+    if(-d $inp) { @fs = <$inp/*>; } # a directory contributes every entry in it
+    else { push @fs, $inp; }
+    $ninputs += scalar(@fs);
+}
+print STDERR "Found $ninputs input files\n";
+
+# For each input dir
+my %filesDone = ();
+my %bases = ();
+print STDERR "--- Bin ---\n";
+my $fi = 0;
+for my $inp (split(/,/, $input)) {
+    $inp = abs_path($inp) if $inp ne "-";
+    -d $inp || -f $inp || $inp eq "-" || die "No such input file or directory as \"$inp\"\n";
+    my @fs = ();
+    if(-d $inp) { @fs = <$inp/*>; }
+    else { push @fs, $inp; }
+    scalar(@fs) > 0 || die "No input files in directory \"$inp\"\n";
+    # For each input file (in current dir)
+    for my $f (@fs) {
+        my $base = `basename $f`;
+        chomp($base);
+        defined($bases{$base}) && die "Attempted to process file $base more than once\n";
+        $bases{$base} = 1; # parent keeps track of all the basenames
+        $fi++;
+        if($childFailed) {
+            print STDERR "Aborting master loop because child failed\n";
+            last;
+        }
+        $pm->start and next; # fork off a mapper for this input file
+        print STDERR "Pid $$ processing input $f [$fi of $ninputs]...\n";
+        if($f =~ /\.gz$/) {
+            open(INP, "gzip -dc $f |") || die "Could not open pipe 'gzip -dc $f |'"; # parens so || tests open()
+        } elsif($f =~ /\.bz2$/) {
+            open(INP, "bzip2 -dc $f |") || die "Could not open pipe 'bzip2 -dc $f |'"; # parens so || tests open()
+        } else {
+            open(INP, "$f") || die "Could not open $f for reading\n"; # parens so || tests open()
+        }
+        my $lastBin = undef;
+        my $lastBinval = undef;
+        my %outfhs = ();
+        while(<INP>) {
+            chomp;
+            my @s = split /$delim/;
+            my $binkey = "";
+            # For each binning dimension
+            for my $b (@bin) {
+                $b <= scalar(@s) || die "Bad bin index $b; line only had ".scalar(@s)." tokens:\n$_\n";
+                $binkey .= $s[$b-1];
+            }
+            if(defined($lastBin) && $binkey eq $lastBin) {
+                # Fast, common case; do what we did last time
+                defined($lastBinval) || die;
+                print {$outfhs{$lastBinval}} "$_\n";
+            } else {
+                # Use -binmap to map the bin key. If no mapping exists,
+                # keep the same key (but sanitized).
+                unless(defined($binmap{$binkey})) {
+                    next if $excludeUnmapped;
+                    # Make a mapping to a sanitized version of binkey
+                    $binmap{$binkey} = fsSanitize($binkey);
+                }
+                my $binval = $binmap{$binkey};
+                unless(defined($outfhs{$binval})) {
+                    system("mkdir -p $binsOut/$base");
+                    my $ofn = "$binsOut/$base/$binval";
+                    print STDERR "Opened filehandle $ofn" if $verbose;
+                    print STDERR "; ".scalar(keys %outfhs)." open in PID $$\n" if $verbose;
+                    $outfhs{$binval} = new IO::File($ofn, "w");
+                    $outfhs{$binval} || die "Could not open $ofn for writing\n";
+                }
+                print {$outfhs{$binval}} "$_\n";
+                $lastBin = $binkey;
+                $lastBinval = $binval;
+            }
+        }
+        # Close output handles
+        for my $bin (keys %outfhs) { $outfhs{$bin}->close() };
+        # Close input handle
+        close(INP);
+        $? == 0 || die "Bad exitlevel from input slurp: $?\n";
+        $pm->finish; # end of fork
+    }
+}
+print STDERR "Aborted master loop because child failed\n" if $childFailed;
+$pm->wait_all_children;
+if($childFailed) {
+ die "Aborting because child with PID $childFailedPid exited abnormally\nSee previous output\n";
+} else {
+ print STDERR "All children succeeded\n";
+}
+
+# Now collect a list of all the binvals. We couldn't have (easily)
+# collected them in the previous loop because the binvals were known
+# only to the child processes and not to the parent. But we can
+# reconstitute them based on the file names.
+my %binvals = ();
+for my $base (keys %bases) {
+ for my $f (<$binsOut/$base/*>) {
+ my $b = `basename $f`;
+ chomp($b);
+ $binvals{$b} = 1;
+ }
+}
+
+# Divide a fixed sort buffer budget among the sorts that can run at once (never divide by 0)
+$sortSize = int((3 * 1024 * 1024)/max(1, min($cores, scalar(keys %binvals))));
+my $bi = 0;
+my $sortCmd = "sort -S $sortSize $sortArgs";
+print STDERR "--- Sort ---\n";
+print STDERR "Sort command: $sortCmd\n";
+for my $binval (sort keys %binvals) {
+    $bi++;
+    if($childFailed) {
+        print STDERR "Aborting master loop because child failed\n";
+        last;
+    }
+    $pm->start and next; # fork off a sorter for this bin
+    print STDERR "Pid $$ processing bin $binval [$bi of ".scalar(keys %binvals)."]...\n";
+    my $inps = "";
+    for my $base (keys %bases) {
+        if(-f "$binsOut/$base/$binval") {
+            $inps .= "$binsOut/$base/$binval ";
+        }
+    }
+    my $ret = system("$sortCmd $inps >$output/$prefix$binval$suffix 2>$binsErr/$binval");
+    if($ret == 0 && !$keep) {
+        # Delete all the files that were inputs to the sort
+        system("rm -f $inps");
+    }
+    $pm->finish($ret == 0 ? 0 : 1); # a raw wait status such as 256 would wrap to exit code 0; finish() also returns when running inline
+}
+$pm->wait_all_children;
+if($childFailed) {
+    die "Aborting because child with PID $childFailedPid exited abnormally\nSee previous output\n";
+} else {
+    print STDERR "All children succeeded\n";
+}
+
+print STDERR "DONE\n";
+# No errors
+unless($keep) {
+ print STDERR "Removing $intermediate (to keep, specify -keep-all)\n";
+ system("rm -rf $intermediate");
+}
diff --git a/CBFinish.pl b/CBFinish.pl
new file mode 100755
index 0000000..53b65b6
--- /dev/null
+++ b/CBFinish.pl
@@ -0,0 +1,205 @@
+#!/usr/bin/perl -w
+
+##
+# CBFinish.pl
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: October 20, 2009
+#
+# Put a proper chromosome name back onto all Crossbow records.
+#
+# Author: Ben Langmead
+# Date: February 11, 2010
+#
+
+use strict;
+use warnings;
+use 5.004;
+use Getopt::Long;
+use IO::File;
+use Carp;
+use FindBin qw($Bin);
+use lib $Bin;
+use Counters;
+use Get;
+use Util;
+use Tools;
+use AWS;
+use File::Path qw(mkpath);
+
+{
+ # Force stderr to flush immediately
+ my $ofh = select STDERR;
+ $| = 1;
+ select $ofh;
+}
+
+sub run($) {
+ my $cmd = shift;
+ print STDERR "Postprocess.pl: Running \"$cmd\"\n";
+ return system($cmd);
+}
+
+# We want to manipulate counters before opening stdin, but Hadoop seems
+# to freak out when counter updates come before the first <STDIN>. So
+# instead, we append counter updates to this list.
+my @counterUpdates = ();
+
+sub counter($) {
+ my $c = shift;
+ defined($c) || croak("Undefined counter update");
+ print STDERR "reporter:counter:$c\n";
+}
+
+sub flushCounters() {
+ for my $c (@counterUpdates) { counter($c); }
+ @counterUpdates = ();
+}
+
+push @counterUpdates, "Postprocess,Invoked,1";
+
+my $cmap_file = "";
+my $cmap_jar = "";
+my $dest_dir = "";
+my $output = "";
+my $cntfn = "";
+
+sub dieusage {
+ my $msg = shift;
+ my $exitlevel = shift;
+ $exitlevel = $exitlevel || 1;
+ print STDERR "$msg\n";
+ exit $exitlevel;
+}
+
+sub msg($) {
+ my $m = shift;
+ defined($m) || croak("Undefined message");
+ $m =~ s/[\r\n]*$//;
+ print STDERR "CBFinish.pl: $m\n";
+}
+
+Tools::initTools();
+my %env = %ENV;
+
+GetOptions (
+ "output:s" => \$output,
+ "s3cmd:s" => \$Tools::s3cmd_arg,
+ "s3cfg:s" => \$Tools::s3cfg,
+ "jar:s" => \$Tools::jar_arg,
+ "accessid:s" => \$AWS::accessKey,
+ "secretid:s" => \$AWS::secretKey,
+ "hadoop:s" => \$Tools::hadoop_arg,
+ "wget:s" => \$Tools::wget_arg,
+ "cmap:s" => \$cmap_file,
+ "cmapjar:s" => \$cmap_jar,
+ "destdir:s" => \$dest_dir,
+ "counters:s" => \$cntfn) || dieusage("Bad option", 1);
+
+Tools::purgeEnv();
+
+$dest_dir = "." if $dest_dir eq "";
+
+msg("s3cmd: found: $Tools::s3cmd, given: $Tools::s3cmd_arg");
+msg("jar: found: $Tools::jar, given: $Tools::jar_arg");
+msg("hadoop: found: $Tools::hadoop, given: $Tools::hadoop_arg");
+msg("wget: found: $Tools::wget, given: $Tools::wget_arg");
+msg("s3cfg: $Tools::s3cfg");
+msg("cmap_file: $cmap_file");
+msg("cmap_jar: $cmap_jar");
+msg("local destination dir: $dest_dir");
+msg("Output dir: $output");
+msg("ls -al");
+msg(`ls -al`);
+
+if($cmap_jar ne "") {
+ mkpath($dest_dir);
+ (-d $dest_dir) || die "-destdir $dest_dir does not exist or isn't a directory, and could not be created\n";
+}
+if($cmap_file ne "" && ! -f $cmap_file) {
+ die "-cmap file $cmap_file doesn't exist or isn't readable\n";
+}
+
+sub pushResult($) {
+ my $fn = shift;
+ msg("Pushing $fn");
+ $output .= "/" unless $output =~ /\/$/;
+ if($output =~ /^s3/i) {
+ Get::do_s3_put($fn, $output, \@counterUpdates, \%env);
+ } elsif($output =~ /^hdfs/i) {
+ my $ret = Get::do_hdfs_put($fn, $output, \@counterUpdates);
+ if($ret != 0) {
+ msg("Fatal error: could not put result file $fn into HDFS directory $output");
+ exit 1;
+ }
+ } else {
+ mkpath($output);
+ (-d $output) || die "Could not create output directory: $output\n";
+ run("cp $fn $output") == 0 || die;
+ }
+}
+
+my %cmap = (); # second column of the cmap file => first column
+sub loadCmap($) {
+    my $f = shift;
+    if($f ne "" && -e $f) {
+        open(CMAP, "$f") || die "Could not open cmap file $f for reading: $!\n";
+        while(<CMAP>) {
+            chomp;
+            my @s = split;
+            next if $#s < 1 || $s[0] eq ""; # check the field count first so $s[0] is defined
+            $cmap{$s[1]} = $s[0];
+            push @counterUpdates, "Postprocess,Chromosome map entries loaded,1";
+        }
+        close(CMAP);
+    }
+}
+
+if($cmap_jar ne "") {
+ msg("Ensuring cmap jar is installed");
+ Get::ensureFetched($cmap_jar, $dest_dir, \@counterUpdates, undef, undef, \%env);
+ push @counterUpdates, "Postprocess,Calls to ensureJar,1";
+ $cmap_file = "$dest_dir/cmap.txt";
+ msg("Examining extracted files");
+ msg("find $dest_dir");
+ print STDERR `find $dest_dir`;
+ unless(-f $cmap_file) {
+ die "Extracting jar didn't create \"$dest_dir/cmap.txt\" file.\n";
+ }
+}
+
+loadCmap($cmap_file) if $cmap_file ne "";
+
+my %outfhs = ();
+my %recs = ();
+my $lines = 0;
+while(<STDIN>) {
+ next if /^\s*FAKE\s*$/;
+ next if /^\s*$/;
+ $lines++;
+ flushCounters() if scalar(@counterUpdates) > 0;
+ next unless $_ ne "";
+ my @ss = split(/\t/);
+ my $chr = $ss[0];
+ $chr = $cmap{$chr} if defined($cmap{$chr});
+ unless(defined($outfhs{$chr})) {
+ counter("Postprocess,Chromosomes observed,1");
+ $outfhs{$chr} = new IO::File(".tmp.CBFinish.pl.$$.$chr", "w");
+ }
+ $ss[0] = $chr;
+ $ss[1] = int($ss[1]); # remove leading 0s
+ print {$outfhs{$chr}} join("\t", @ss);
+ $recs{$chr}++;
+}
+msg("Read $lines lines of output");
+for my $chr (keys %outfhs) {
+ counter("Postprocess,SNPs for chromosome $chr,$recs{$chr}");
+ $outfhs{$chr}->close();
+ my $fn = ".tmp.CBFinish.pl.$$.$chr";
+ run("gzip -c < $fn > $chr.gz") == 0 || die "Couldn't gzip $fn\n";
+ $fn = "$chr.gz";
+ pushResult($fn);
+ counter("Postprocess,Chromosome files pushed,1");
+};
+counter("Postprocess,0-SNP invocations,1") if $lines == 0;
+flushCounters() if scalar(@counterUpdates) > 0;
diff --git a/CheckDirs.pl b/CheckDirs.pl
new file mode 100755
index 0000000..35f1565
--- /dev/null
+++ b/CheckDirs.pl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl
+
+##
+# MapWrap.pl
+#
+# Simple wrapper that mimics some of Hadoop's behavior during the
+# Map step of a MapReduce computation.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use FindBin qw($Bin);
+use lib $Bin;
+use lib "$Bin/contrib";
+use Cwd 'abs_path';
+use Wrap;
+use File::Path qw(mkpath);
+use POSIX qw/strftime/;
+
+ # Script settings; the directory names and $force are filled in from the
+ # command line by the GetOptions call in this script
+ my $input = "";
+ my $output = "";
+ my $intermediate = "";
+ my $force = 0;
+ my $verbose = 0;
+ # Contents of the VERSION file next to this script, with all whitespace removed
+ my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
+
+ # Text printed by mydie() after every fatal error message
+ my $support = qq!
+ When requesting support, please include the full output printed here.
+ If a child process was the cause of the error, the output should
+ include the relevant error message from the child's error log. You may
+ be asked to provide additional files as well.
+ !;
+
+##
+# Printer that prints to STDERR and, optionally, to a file for messages.
+#
+ my $msgfn = "";     # path of the optional message file ("" = none)
+ my $msgfh = undef;  # handle for the message file once it has been opened
+
+ # Emit one message line: trailing CR/LF characters are stripped, a single
+ # newline is appended, and the result goes to STDERR and, when a message
+ # file handle is open, to that handle too.
+ sub msg($) {
+     my ($text) = @_;
+     $text =~ s/[\r\n]*$//;
+     my $line = "$text\n";
+     print STDERR $line;
+     return unless defined($msgfh);
+     print {$msgfh} $line;
+ }
+
+##
+# Printer that prints to STDERR and, optionally, to a file for counters.
+#
+ my ($cntfn, $cntdir) = ("", "");  # counter file / counter directory settings
+ my $cntfh = undef;                 # handle for the counter file, if one is open
+
+ # Emit one counter line: trailing CR/LF characters are stripped, a single
+ # newline is appended, and the result goes to STDERR and, when a counter
+ # file handle is open, to that handle too.
+ sub cnt($) {
+     my ($text) = @_;
+     $text =~ s/[\r\n]*$//;
+     my $line = "$text\n";
+     print STDERR $line;
+     return unless defined($cntfh);
+     print {$cntfh} $line;
+ }
+
+##
+# Print an error message, a support message, then die with given
+# exitlevel.
+#
+ # Arguments: the error text and the process exit status to use.
+ # Reports through msg(), so the text also reaches the message file when
+ # one is open; never returns.
+ sub mydie($$) {
+     my $msg = shift;
+     my $lev = shift;
+     msg("Fatal error $VERSION:D$lev: $msg");
+     msg($support);
+     exit $lev;
+ }
+
+ # Parse the command line; every directory option takes an optional string
+ GetOptions (
+ "messages:s" => \$msgfn,
+ "counters:s" => \$cntdir,
+ "intermediate:s" => \$intermediate,
+ "input:s" => \$input,
+ "output:s" => \$output,
+ "force" => \$force) || die "Bad option\n";
+
+ # Open the message file in append mode so msg() can mirror output to it
+ if($msgfn ne "") {
+ open($msgfh, ">>$msgfn") || mydie("Could not open message-out file $msgfn for writing", 15);
+ }
+ # All four directory options are mandatory (exit status 10 when missing)
+ $input ne "" || mydie("Must specify input directory with --input", 10);
+ $intermediate ne "" || mydie("Must specify intermediate directory with --intermediate", 10);
+ $output ne "" || mydie("Must specify output directory with --output", 10);
+ $cntdir ne "" || mydie("Must specify counters directory with --counters", 10);
+
+ # Banner summarising the effective settings
+ msg("=== Directory checker ===");
+ msg("Time: ".strftime('%H:%M:%S %d-%b-%Y', localtime));
+ msg("Input: $input");
+ msg("Output: $output");
+ msg("Intermediate: $intermediate");
+ msg("Counters: $cntdir");
+ msg("Options: [ ".($force ? "--force " : "")."]");
+
+ ##
+ # Make sure directory $dir exists and is ready for use.
+ #
+ # If $dir already exists: without --force this is fatal (exit 20); with
+ # --force it is removed first, unless $forceoverride is true, in which
+ # case the existing directory is kept.  Afterwards the directory is
+ # created if necessary; failure to remove (exit 30) or create (exit 40)
+ # is fatal.
+ #
+ sub checkDir {
+     my ($dir, $forceoverride) = @_;
+     if(-d $dir) {
+         mydie("Output directory $dir already exists", 20) unless $force;
+         if($force && !$forceoverride) {
+             msg("Removing directory $dir due to --force");
+             # List-form system() bypasses the shell, so a path containing
+             # spaces or shell metacharacters removes exactly $dir and
+             # nothing else; "--" guards against names starting with "-".
+             system("rm", "-rf", "--", $dir);
+             -d $dir && mydie("Could not remove directory $dir", 30);
+         }
+     }
+     mkpath($dir);
+     (-d $dir) || mydie("Could not create new directory $dir", 40);
+ }
+ # Check/prepare each directory; $forceoverride is left undefined, so
+ # --force removes any existing directory before it is recreated
+ checkDir($output);
+ checkDir($intermediate);
+ if(defined($cntdir) && $cntdir ne "") {
+ checkDir($cntdir);
+ }
+ # The message handle was only opened when --messages was given
+ close($msgfh) if $msgfn ne "";
diff --git a/Copy.pl b/Copy.pl
new file mode 100755
index 0000000..3bb810e
--- /dev/null
+++ b/Copy.pl
@@ -0,0 +1,619 @@
+#!/usr/bin/perl -w
+
+##
+# Copy.pl
+#
+# Authors: Michael C. Schatz & Ben Langmead
+# Date: 6/26/2009
+#
+# Mapper for Crossbow bulk copies of FASTQ/SAM/BAM reads.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use POSIX ":sys_wait_h";
+use FindBin qw($Bin);
+use lib $Bin;
+use Counters;
+use Get;
+use Util;
+use AWS;
+use Tools;
+use File::Basename;
+use File::Path qw(mkpath rmtree);
+
+{
+ # Force stderr to flush immediately
+ my $ofh = select STDERR;
+ $| = 1;
+ select $ofh;
+}
+
+ # Counter increments accumulated in memory: counter name => pending amount
+ my %delayedCounters = ();
+
+ # Write one counter update to STDERR as a "reporter:counter:" line;
+ # $update is the text that follows that prefix.
+ sub counter($) {
+     my ($update) = @_;
+     print STDERR "reporter:counter:" . $update . "\n";
+ }
+
+ # Report every pending entry of %delayedCounters under group $name via
+ # counter(), removing each entry as it is reported so the hash ends up empty.
+ sub flushDelayedCounters($) {
+     my $name = shift;
+     for my $key (keys %delayedCounters) {
+         my $amount = $delayedCounters{$key};
+         counter("$name,$key,$amount");
+         delete $delayedCounters{$key};
+     }
+ }
+
+ # Settings, overridable from the command line via GetOptions
+ my $compress = "gzip"; # compression applied by pushBatch: gzip/gz, bzip2/bz2, none
+ my $push = ""; # destination for output batches; "" means do not push
+ my $helpflag = undef; # set by -h
+ my $skipfirst = undef; # set by -s: drop the first token of each manifest line
+ my $owner = undef; # if defined, pushed HDFS paths are chown'ed to this owner
+ my $stopAfter = 0; # stop after this many reads; 0 means no limit
+ my $maxPerFile = 0; # reads per output file; 0 is replaced by 500000 after option parsing
+ my $keep = 0; # keep fetched input files instead of deleting them
+ my $verbose = 0;
+ my $labReadGroup = 0; # label reads by their SAM RG:Z field
+ my $cntfn = ""; # set by --counters
+
+ # Print a "Copy.pl: "-prefixed diagnostic line on STDERR.  Undefined
+ # messages are ignored; trailing CR/LF characters are stripped before the
+ # single terminating newline is added.
+ sub msg($) {
+     my ($text) = @_;
+     defined($text) || return;
+     $text =~ s/[\r\n]*$//;
+     print STDERR "Copy.pl: " . $text . "\n";
+ }
+
+ Tools::initTools();
+ # Private copy of the environment, passed by reference to the Tools/Get helpers
+ my %env = %ENV;
+
+ # Command-line options; tool locations and AWS credentials are stored
+ # directly in the Tools:: and AWS:: package variables
+ GetOptions(
+ "compress:s" => \$compress,
+ "push:s" => \$push,
+ "samtools:s" => \$Tools::samtools_arg,
+ "fastq-dump:s" => \$Tools::fastq_dump_arg,
+ "s3cmd:s" => \$Tools::s3cmd_arg,
+ "s3cfg:s" => \$Tools::s3cfg,
+ "md5:s" => \$Tools::md5_arg,
+ "accessid:s" => \$AWS::accessKey,
+ "secretid:s" => \$AWS::secretKey,
+ "hadoop:s" => \$Tools::hadoop_arg,
+ "stop:i" => \$stopAfter,
+ "maxperfile:i" => \$maxPerFile,
+ "keep" => \$keep,
+ "h" => \$helpflag,
+ "s" => \$skipfirst,
+ "owner:s" => \$owner,
+ "label-rg" => \$labReadGroup,
+ "counters:s" => \$cntfn,
+ "verbose" => \$verbose)
+ || die "GetOptions failed\n";
+
+ Tools::purgeEnv();
+
+ # Number of warnings; reported as a counter at the end of the script
+ my $ws = 0;
+
+ # Normalise false/unset option values; a per-file limit of 0 becomes 500000
+ $labReadGroup = 0 unless ($labReadGroup);
+ $stopAfter = 0 unless($stopAfter);
+ $maxPerFile = 500000 unless($maxPerFile);
+
+ my $firstEnsureS3cmd = 1;
+ # Whether "s3cmd ls --list-md5" works; re-probed on every s3md5() call
+ my $s3cmdHasListMD5 = 1;
+
+ # Reverse any capitalization we may have done in cb.pl
+ $push =~ s/^S3N/s3n/;
+ $push =~ s/^S3/s3/;
+ $push =~ s/^HDFS/hdfs/;
+
+ # s3cmd is only required when pushing to an s3/s3n destination
+ if ($push =~ /^s3/) {
+ msg("Checking availability of s3cmd") if $verbose;
+ Tools::ensureS3cmd(\%env);
+ } else {
+ msg("s3cmd not needed") if $verbose;
+ }
+
+ # $unpaired/$paired count reads since the last counter report (reset every
+ # 100000); the tot* variables are running totals for the whole run
+ my $unpaired = 0;
+ my $paired = 0;
+ my $totunpaired = 0;
+ my $totpaired = 0;
+
+ # With --owner and a push destination, create the destination through
+ # "hadoop fs" and hand it to $owner; the exit statuses are not checked
+ if (defined $owner && $push ne "") {
+ my $hadoop = Tools::hadoop();
+ msg("Creating destination directory and setting owner") if $verbose;
+ Util::run("$hadoop fs -mkdir $push");
+ Util::run("$hadoop fs -chown $owner $push >&2");
+ }
+
+##
+# Calculate the md5 hash of an object in S3 using s3cmd.
+#
+ # Arguments: the S3 path of the object and the environment hashref.
+ # First probes whether this s3cmd accepts "ls --list-md5" (recording the
+ # answer in $s3cmdHasListMD5); returns "" when it does not.  Otherwise
+ # returns the 32-character MD5 listed for $path and dies if the listing
+ # yields anything else.
+ sub s3md5($$) {
+     my ($path, $env) = @_;
+     my $s3cmd = Tools::s3cmd($env);
+     my $probeStatus = system("$s3cmd ls --list-md5 >/dev/null 2>&1");
+     $s3cmdHasListMD5 = ($probeStatus == 0);
+     if(!$s3cmdHasListMD5) {
+         return "";
+     }
+     $path = Get::s3cmdify($path, $env);
+     # Fourth column of the listing is taken as the MD5
+     my $md = `$s3cmd --list-md5 ls $path | awk '{print \$4}'`;
+     chomp($md);
+     die "Bad MD5 obtained from s3: $md\n" unless length($md) == 32;
+     return $md;
+ }
+
+##
+# Push a file from the local filesystem to another filesystem (perhaps
+# HDFS, perhaps S3) using hadoop fs -cp.
+#
+sub pushBatch($$) {
+ my ($file, $env) = @_;
+ -e $file || die "No such file $file";
+ $push ne "" || die "pushBatch() called but no destination is set";
+ my $pushDest = "local filesystem";
+ if ($push =~ /^hdfs:/i) { $pushDest = "HDFS"; }
+ elsif($push =~ /^s3n?:/i) { $pushDest = "S3"; }
+ counter("Short read preprocessor,Read files pushed to $pushDest,".(-s $file));
+
+ if($compress eq "bzip2" || $compress eq "bz2") {
+ Util::runAndWait("bzip2 $file >&2", "bzip2") == 0 || die "bzip2 command failed";
+ $file .= ".bz2";
+ -e $file || die "No such file $file after bzip2 compression";
+ } elsif($compress eq "gzip" || $compress eq "gz") {
+ Util::runAndWait("gzip $file >&2", "gzip") == 0 || die "gzip command failed";
+ $file .= ".gz";
+ -e $file || die "No such file $file after compression";
+ } elsif($compress eq "none") {
+ ## nothing to do
+ } elsif($compress ne "") {
+ die "Did not recognize compression type $compress";
+ }
+ -e $file || die "No such file $file";
+
+ my $md5 = Tools::md5();
+ my $md = Util::trim(Util::backtickRun("cat $file | $md5 | cut -d' ' -f 1"));
+ length($md) == 32 || die "Bad MD5 calculated locally: $md";
+
+ if ($push =~ /^hdfs:/i) {
+ my $hadoop = Tools::hadoop();
+ Util::runAndWait("$hadoop fs -put $file $push >&2", "hadoop fs -put") == 0 ||
+ die "hadoop fs -put command failed";
+ if (defined $owner) {
+ Util::run("$hadoop fs -chown $owner $push/$file >&2") == 0 ||
+ die "hadoop fs -chown command failed";
+ }
+ } elsif($push =~ /^s3n?:/i) {
+ my $s3cmd = Tools::s3cmd($env);
+ # For s3cmd, change s3n -> s3 and remove login info
+ my $s3cmd_push = Get::s3cmdify($push, $env);
+ my $cmd = "$s3cmd put $file $s3cmd_push/$file >&2";
+ Util::run($cmd) == 0 || die "Command failed: $cmd";
+ my $rmd5 = s3md5("$push/$file", $env);
+ $rmd5 eq "" || $md eq $rmd5 || die "Local MD5 $md does not equal S3 md5 $rmd5 for file $s3cmd_push/$file";
+ } else {
+ $push .= "/" unless $push =~ /\/$/;
+ mkpath($push);
+ (-d $push) || die "Could not create -push destination directory $push\n";
+ my $cmd = "cp $file $push >&2 2>/dev/null";
+ Util::run($cmd) == 0 || die "Command failed: $cmd";
+ }
+
+ counter("Short read preprocessor,Read data pushed to $pushDest (compressed),".(-s $file)) if $compress ne "";
+}
+
+## Download a file with wget
+ # Arguments: local file name, URL, expected MD5 (unused here).
+ # Dies if the wget command reports a non-zero status.
+ sub wget($$$) {
+     my ($fname, $url, $md) = @_;
+     my $status = Util::run("wget -O $fname $url >&2");
+     if($status) {
+         die "wget failed: $url $status\n";
+     }
+ }
+
+## Download a file with hadoop fs -get
+ # Arguments: local file name, URL, expected MD5 (unused here).
+ # Dies if "hadoop fs -get" reports a non-zero status.
+ sub hadoopget($$$) {
+     my ($fname, $url, $md) = @_;
+     my $hadoop = Tools::hadoop();
+     my $status = Util::runAndWait("$hadoop fs -get $url $fname >&2", "hadoop fs -get");
+     if($status) {
+         die "hadoop get failed: $url $status\n";
+     }
+ }
+
+## Download a file with s3cmd get
+ # Arguments: local file name, URL, expected MD5 (unused here), and the
+ # environment hashref.  The URL is first rewritten by Get::s3cmdify.
+ # Dies if "s3cmd get" reports a non-zero status.
+ sub s3get($$$$) {
+     my ($fname, $url, $md, $env) = @_;
+     my $s3cmd = Tools::s3cmd($env);
+     $url = Get::s3cmdify($url, $env);
+     my $status = Util::run("$s3cmd get $url $fname >&2");
+     if($status) {
+         die "s3cmd get failed: $url $status\n";
+     }
+ }
+
+ ##
+ # Fetch a file and, if its name indicates a compressed or packed format,
+ # convert it to plain text.
+ #
+ # Arguments: $fname - local file name to fetch into
+ #            $url   - source (hdfs:, s3:/s3n:, ftp:, http(s): or local path)
+ #            $md    - expected MD5, or "0" to skip the check (must be defined)
+ #            $env   - environment hashref for the s3cmd helpers
+ # Returns the name of the usable local file: $fname itself, or the
+ # decompressed / BAM-to-SAM / SRA-to-FASTQ result.
+ #
+ sub fetch($$$$) {
+ my ($fname, $url, $md, $env) = @_;
+ defined($md) || die;
+ msg("Fetching $url $fname $md");
+
+ # Download and verification are skipped when $fname already exists locally
+ if(! -f $fname) {
+ if ($url =~ /^hdfs:/) { hadoopget($fname, $url, $md); }
+ elsif ($url =~ /^s3n?:/) { s3get($fname, $url, $md, $env); }
+ elsif ($url =~ /^ftp:/ || $url =~ /^https?:/) { wget($fname, $url, $md); }
+ elsif ($url ne $fname) { Util::run("cp $url ./$fname >&2"); }
+ -f $fname || die "Failed to copy $url to $fname\n";
+ (-s $fname) > 0 || die "File obtained from URL $url was empty; bad URL?\n";
+
+ if ($md ne "0") {
+ my $md5 = Tools::md5();
+ my $omd5 = `cat $fname | $md5 | cut -d' ' -f 1`;
+ chomp($omd5);
+ $omd5 eq $md || die "MD5 mismatch for $fname; expected \"$md\", got \"$omd5\"";
+ counter("Short read preprocessor,MD5s checked,2");
+ }
+ }
+
+ # Counter value is the fetched file's size in bytes
+ counter("Short read preprocessor,Read data fetched,".(-s $fname));
+
+ # Pick the conversion by file extension; $newfname is the converted name
+ my $newfname = $fname;
+ if($fname =~ /\.gz$/ || $fname =~ /\.gzip$/) {
+ $newfname =~ s/\.gzi?p?$//;
+ Util::runAndWait("gzip -dc $fname > $newfname", "gzip -dc") == 0 || die "Error while gunzipping $fname";
+ counter("Short read preprocessor,Read data fetched (uncompressed),".(-s $newfname));
+ counter("Short read preprocessor,Read data fetched (un-gzipped),".(-s $newfname));
+ } elsif($fname =~ /\.bz2$/ || $fname =~ /\.bzip2$/) {
+ $newfname =~ s/\.bzi?p?2$//;
+ Util::runAndWait("bzip2 -dc $fname > $newfname", "bzip2 -dc") == 0 || die "Error while bzip2 decompressing $fname";
+ counter("Short read preprocessor,Read data fetched (uncompressed),".(-s $newfname));
+ counter("Short read preprocessor,Read data fetched (un-bzip2ed),".(-s $newfname));
+ } elsif($fname =~ /\.bam$/) {
+ my $samtools = Tools::samtools();
+ $newfname =~ s/\.bam$/.sam/;
+ Util::runAndWait("$samtools view $fname > $newfname", "samtools") == 0 ||
+ die "Error performing BAM-to-SAM $fname";
+ counter("Short read preprocessor,Read data fetched (uncompressed),".(-s $newfname));
+ counter("Short read preprocessor,Read data fetched (BAM-to-SAM),".(-s $newfname));
+ } elsif($fname =~ /\.sra$/) {
+ my $fastq_dump = Tools::fastq_dump();
+ $newfname =~ s/\.sra$/.fastq/;
+ # fastq-dump writes into a scratch directory whose files are then
+ # concatenated into a single FASTQ file
+ mkpath("./sra_tmp");
+ Util::runAndWait("$fastq_dump $fname -O ./sra_tmp > /dev/null", "fastq-dump") == 0 ||
+ die "Error performing SRA-to-FASTQ $fname";
+ Util::runAndWait("cat ./sra_tmp/* > $newfname", "cat") == 0 ||
+ die "Error copying resuld of SRA-to-FASTQ $fname";
+ counter("Short read preprocessor,Read data fetched (uncompressed),".(-s $newfname));
+ counter("Short read preprocessor,Read data fetched (un-SRAed),".(-s $newfname));
+ rmtree("./sra_tmp");
+ }
+ return $newfname;
+ }
+
+##
+# Utility function that returns the reverse complement of its argument
+#
+ # Arguments: the sequence string and a colorspace flag.  The string is
+ # always reversed; the A<->T / C<->G complement (case-preserving) is
+ # applied only when $color is false.  Other characters are left as is.
+ sub revcomp($$) {
+     my ($r, $color) = @_;
+     my $flipped = reverse($r);
+     return $flipped if $color;
+     $flipped =~ tr/aAcCgGtT/tTgGcCaA/;
+     return $flipped;
+ }
+
+ # Fields of the most recently parsed read; the parse* subs set these
+ # file-level variables instead of returning values, and leave $name
+ # undefined once the input is exhausted
+ my ($name, $seq, $qual, $readGroup) = (undef, undef, undef, undef);
+ # Reads processed so far across all inputs (compared against $stopAfter)
+ my $rtot = 0;
+
+##
+# Parse optional fields from a SAM record.
+#
+ # Arguments: a whitespace-separated string of TAG:TYPE:VALUE fields and a
+ # hashref to fill.  Each field is stored as $hash->{"TAG:TYPE"} = VALUE,
+ # where VALUE may itself contain colons.  Dies on a field with fewer than
+ # three colon-separated parts.
+ sub parseSAMOptionals($$) {
+     my ($opts, $hash) = @_;
+     for my $field (split(/\s+/, $opts)) {
+         my ($nm, $ty, @value) = split(/:/, $field);
+         scalar(@value) >= 1 || die;
+         $hash->{"$nm:$ty"} = join(":", @value);
+     }
+ }
+
+##
+# Parse a record out of a SAM input file.
+#
+ # Arguments: an input file handle and a colorspace flag.
+ # Reads one SAM line from $fh and stores the read in the file-level
+ # $name/$seq/$qual/$readGroup variables; $name is left undefined at end
+ # of input.  Dies on a malformed line, and under --label-rg on a record
+ # that has no RG:Z field.
+ sub parseSAM($$) {
+     my ($fh, $color) = @_;
+     my $samLine = <$fh>;
+     unless(defined($samLine)) {
+         $name = undef;
+         return;
+     }
+     chomp($samLine);
+     my @stok = split(/\t/, $samLine);
+     defined($stok[10]) || die "Malformed SAM line; not enough tokens:\n$samLine\n";
+     ($name, $seq, $qual) = ($stok[0], $stok[9], $stok[10]);
+     my ($flags, $chr, $pos, $mapq, $cigar) = @stok[1..5];
+     $flags == int($flags) || die "SAM flags field must be an integer; was $flags\n$samLine\n";
+     # FLAG bit 0x10 set means SEQ and QUAL are stored reverse-complemented,
+     # so those are the records that must be flipped back to recover the
+     # read as sequenced.  (The test used to be inverted: it flipped the
+     # forward-strand records and left reverse-strand ones untouched.)
+     my $fw = (($flags & 16) == 0) ? 1 : 0;
+     unless($fw) {
+         $seq = revcomp($seq, $color);
+         $qual = reverse $qual;
+     }
+     # Optional fields are everything from column 12 onwards
+     my %opts;
+     parseSAMOptionals(join(" ", @stok[11..$#stok]), \%opts);
+     if($labReadGroup && defined($opts{"RG:Z"})) {
+         $readGroup = $opts{"RG:Z"};
+     } elsif($labReadGroup) {
+         # --label-rg requires every record to carry a read group; this is
+         # fatal (the old "no-group" fallback after die was unreachable)
+         $ws++;
+         msg("No read group for read $name\n$samLine\n$_");
+         die;
+     } else {
+         $readGroup = undef;
+     }
+     # Keep only the first whitespace-delimited token of the read name and
+     # append the original alignment as SM:chr,pos,fw,mapq,cigar
+     $name =~ s/\s.*//;
+     $name = "RN:$name;SM:$chr,$pos,$fw,$mapq,$cigar";
+ }
+
+##
+# Parse a record out of a FASTQ input file.
+#
+ # Arguments: an input file handle and a colorspace flag (unused here).
+ # Reads one four-line FASTQ record from $fh into the file-level
+ # $name/$seq/$qual variables.  $name is left undefined when the input
+ # ends before a complete record has been read.
+ sub parseFastq($$) {
+     my ($fh, $color) = @_;
+     # Line 1: read name
+     $name = <$fh>;
+     defined($name) || return;
+     chomp($name);
+     # Line 2: sequence
+     $seq = <$fh>;
+     if(!defined($seq)) { $name = undef; return; }
+     chomp($seq);
+     # Line 3: second name line; read only to advance past it
+     my $name2 = <$fh>;
+     if(!defined($name2)) { $name = undef; return; }
+     # Line 4: qualities
+     $qual = <$fh>;
+     if(!defined($qual)) { $name = undef; return; }
+     chomp($qual);
+     # Keep only the first whitespace-delimited token of the name
+     $name =~ s/\s.*//;
+     $name = "RN:" . $name;
+ }
+
+##
+# Parse a record from an input file. Could be many lines.
+#
+ # Arguments: input file handle, true if the input is SAM (otherwise
+ # FASTQ), and a colorspace flag.  Dispatches to the matching parser,
+ # which stores the record in the file-level read variables.
+ sub parseRead($$$) {
+     my ($fh, $sam, $color) = @_;
+     return $sam ? parseSAM($fh, $color)
+                 : parseFastq($fh, $color);
+ }
+
+##
+# Handle the copy for a single unpaired entry
+#
+sub doUnpairedUrl($$$$$$) {
+ my ($url, $md, $lab, $format, $color, $env) = @_;
+ my @path = split /\//, $url;
+ my $fn = $path[-1];
+ my $of;
+ my $sam = $format =~ /^sam$/i;
+ if(defined($lab)) {
+ $lab =~ /[:\s]/ && die "Label may not contain a colon or whitespace character; was \"$lab\"\n";
+ }
+
+ # fetch the file
+ my $origFn = $fn;
+ $fn = fetch($fn, $url, $md, $env);
+
+ # turn FASTQ entries into single-line reads
+ my $fh;
+ open($fh, $fn) || die "Could not open input file $fn";
+ my $r = 0;
+ my $fileno = 1;
+ open($of, ">${fn}_$fileno.out") || die "Could not open output file ${fn}_$fileno.out";
+ my $fn_nospace = $fn;
+ $fn_nospace =~ s/[\s]+//g;
+ my $rname = "FN:".$fn_nospace; # Add filename
+ while(1) {
+ last if($stopAfter != 0 && $rtot >= $stopAfter);
+ parseRead($fh, $sam, $color);
+ last unless(defined($name));
+ my $fullname = $rname;
+ if($labReadGroup) {
+ defined($readGroup) || die;
+ $fullname .= ";LB:$readGroup";
+ $delayedCounters{"Unpaired reads with label $readGroup"}++;
+ } elsif(defined($lab)) {
+ $fullname .= ";LB:$lab";
+ $delayedCounters{"Unpaired reads with label $lab"}++;
+ }
+ $fullname .= ";$name";
+ print $of "$fullname\t$seq\t$qual\n";
+ $r++; $rtot++;
+ if($maxPerFile > 0 && ($r % $maxPerFile) == 0) {
+ close($of);
+ if($push ne "") {
+ pushBatch("${fn}_$fileno.out", $env);
+ system("rm -f ${fn}_$fileno.out ${fn}_$fileno.out.* >&2");
+ }
+ $fileno++;
+ open($of, ">${fn}_$fileno.out") || die "Could not open output file ${fn}_$fileno.out";
+ }
+ $totunpaired++;
+ if(++$unpaired >= 100000) {
+ counter("Short read preprocessor,Unpaired reads,$unpaired");
+ $unpaired = 0;
+ }
+ }
+ counter("Short read preprocessor,Unpaired reads,$unpaired");
+ close($fh);
+ close($of);
+ flushDelayedCounters("Short read preprocessor");
+
+ # Remove input file
+ system("rm -f $fn $origFn >&2") unless $keep;
+ if($push ne "") {
+ # Push and remove output files
+ pushBatch("${fn}_$fileno.out", $env);
+ system("rm -f ${fn}_$fileno.out ${fn}_$fileno.out.* >&2");
+ } else {
+ # Just keep the output files around
+ }
+}
+
+##
+# Handle the copy for a single paired entry
+#
+sub doPairedUrl($$$$$$$$) {
+ my ($url1, $md51, $url2, $md52, $lab, $format, $color, $env) = @_;
+ my @path1 = split /\//, $url1;
+ my @path2 = split /\//, $url2;
+ my ($fn1, $fn2) = ($path1[-1], $path2[-1]);
+ my $origFn1 = $fn1;
+ my $origFn2 = $fn2;
+ $fn1 = fetch($fn1, $url1, $md51, $env);
+ $fn2 = fetch($fn2, $url2, $md52, $env);
+ my $sam = $format =~ /^sam$/i;
+ if(defined($lab)) {
+ $lab =~ /[:\s]/ && die "Label may not contain a colon or whitespace character; was \"$lab\"\n";
+ }
+
+ # turn FASTQ pairs into tuples
+ my ($fh1, $fh2);
+ open($fh1, $fn1) || die "Could not open input file $fn1";
+ open($fh2, $fn2) || die "Could not open input file $fn2";
+ my $r = 0;
+ my $fileno = 1;
+ my $of;
+ open($of, ">${fn1}_$fileno.out") || die;
+ my $fn1_nospace = $fn1;
+ $fn1_nospace =~ s/[\s]+//g;
+ my $rname .= "FN:".$fn1_nospace; # Add filename
+ while(1) {
+ last if($stopAfter != 0 && $rtot >= $stopAfter);
+ parseRead($fh1, $sam, $color);
+ my ($name1, $seq1, $qual1) = ($name, $seq, $qual);
+ parseRead($fh2, $sam, $color);
+ defined($name) == defined($name1) ||
+ die "Mate files didn't come together properly: $fn1,$fn2\n";
+ last unless defined($name);
+ my $fullname = $rname;
+ if($labReadGroup) {
+ defined($readGroup) || die;
+ $fullname .= ";LB:$readGroup";
+ $delayedCounters{"Pairs with label $readGroup"}++;
+ } elsif(defined($lab)) {
+ $fullname .= ";LB:$lab";
+ $delayedCounters{"Pairs with label $lab"}++;
+ }
+ $fullname .= ";$name";
+ print $of "$fullname\t$seq1\t$qual1\t$seq\t$qual\n";
+ $r++;
+ $rtot += 2;
+ if($maxPerFile > 0 && ($r % $maxPerFile) == 0) {
+ close($of);
+ if($push ne "") {
+ pushBatch("${fn1}_$fileno.out", $env);
+ system("rm -f ${fn1}_$fileno.out ${fn1}_$fileno.out.* >&2");
+ }
+ $fileno++;
+ open($of, ">${fn1}_$fileno.out") || die "Could not open output file ${fn1}_$fileno.out";
+ }
+ $totpaired++;
+ if(++$paired >= 100000) {
+ counter("Short read preprocessor,Paired reads,$paired");
+ $paired = 0;
+ }
+ }
+ counter("Short read preprocessor,Paired reads,$paired");
+ close($fh1);
+ close($fh2);
+ close($of);
+ flushDelayedCounters("Short read preprocessor");
+
+ # Remove input files
+ system("rm -f $fn1 $origFn1 >&2") unless $keep;
+ system("rm -f $fn2 $origFn2 >&2") unless $keep;
+ if($push ne "") {
+ # Push and remove output files
+ pushBatch("${fn1}_$fileno.out", $env);
+ system("rm -f ${fn1}_$fileno.out ${fn1}_$fileno.out.* >&2");
+ } else {
+ # Just keep the output files around
+ }
+}
+
+##
+# Add user's credentials to an s3 or s3n URI if necessary
+#
+ # Arguments: a URL and the environment hashref.  Non-s3/s3n URLs are
+ # returned untouched.  For s3/s3n URLs the AWS keys are looked up and,
+ # when the URL has no "@" after the scheme and an access key is defined,
+ # "accessKey:secretKey@" is inserted right after "s3://" or "s3n://".
+ sub addkey($$) {
+     my ($url, $env) = @_;
+     return $url if $url !~ /^s3n?:/i;
+     AWS::ensureKeys($Tools::hadoop, $Tools::hadoop_arg, $env);
+     my $lacksCredentials = ($url =~ /s3n?:\/\/[^\@]*$/);
+     if($lacksCredentials && defined($AWS::accessKey)) {
+         my $ec2key = $AWS::accessKey.":".$AWS::secretKey;
+         $url =~ s/s3:\/\//s3:\/\/$ec2key\@/;
+         $url =~ s/s3n:\/\//s3n:\/\/$ec2key\@/;
+     }
+     return $url;
+ }
+
+##
+# Give URL, return likely format string. Default to fastq.
+#
+ # Returns "sam" for URLs ending in .sam or .bam (case-insensitive) and
+ # "fastq" for everything else.
+ sub urlToFormat($) {
+     my ($url) = @_;
+     return ($url =~ /\.[sb]am$/i) ? "sam" : "fastq";
+ }
+
+ # Main loop: each input line is a manifest entry of whitespace-separated
+ # tokens, either
+ #   <url> <md5> [<label>]                  (unpaired)
+ #   <url1> <md5-1> <url2> <md5-2> [<label>] (paired)
+ while (<>) {
+ # Skip comments and whitespace lines
+ chomp;
+ my @s = split(/\s+/);
+ msg("Line: $_");
+ # With -s the first token of every line is discarded
+ if ($skipfirst) {
+ my $trimmed = shift @s;
+ msg("-s trimmed \"$trimmed\" from line:\n$_");
+ }
+ if(scalar(@s) == 0) { # Skip empty or whitespace-only lines
+ counter("Short read preprocessor,Empty lines,1");
+ next;
+ }
+ if($s[0] =~ /^\s*#/) { # Skip lines beginning with hash
+ counter("Short read preprocessor,Comment lines,1");
+ msg("Skipping comment line");
+ next;
+ } else {
+ msg("Not a comment line");
+ }
+ # At least a URL and an MD5 are required
+ unless(defined($s[1])) {
+ counter("Short read preprocessor,Malformed lines,1");
+ msg("Line malformed:\n$_");
+ msg("Skipping...");
+ next;
+ }
+ my ($url1, $md51) = (addkey($s[0], \%env), $s[1]);
+ my $color = 0; # TODO
+
+ # File-name part of the URL, used only in log messages
+ my $turl1 = fileparse($url1);
+ # Four or more tokens means a paired-end entry
+ if($#s >= 3) {
+ # If s[4] is defined, it contains the sample label
+ msg("Doing paired-end entry $turl1");
+ my ($url2, $md52) = (addkey($s[2], \%env), $s[3]);
+ doPairedUrl($url1, $md51, $url2, $md52, $s[4], urlToFormat($url1), $color, \%env);
+ counter("Short read preprocessor,Paired URLs,1");
+ } else {
+ # If s[2] is defined, it contains the sample label
+ msg("Doing unpaired entry $turl1");
+ doUnpairedUrl($url1, $md51, $s[2], urlToFormat($url1), $color, \%env);
+ counter("Short read preprocessor,Unpaired URLs,1");
+ }
+ msg("Total unpaired reads: $totunpaired");
+ msg("Total paired reads: $totpaired");
+ }
+ # The only line this script writes to STDOUT
+ print "FAKE\n";
+
+ counter("Short read preprocessor,Warnings,$ws");
+ msg("Warnings: $ws");
+ flushDelayedCounters("Short read preprocessor");
diff --git a/Counters.pl b/Counters.pl
new file mode 100755
index 0000000..f1feea1
--- /dev/null
+++ b/Counters.pl
@@ -0,0 +1,157 @@
+#!/usr/bin/perl -w
+
+##
+# Counters.pl
+#
+# Authors: Ben Langmead
+# Date: February 14, 2010
+#
+# Get all the counters and put them in the output directory.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use POSIX qw/strftime/;
+use FindBin qw($Bin);
+use lib $Bin;
+use Get;
+use Util;
+use Tools;
+use AWS;
+use File::Path qw(mkpath);
+
+{
+ # Force stderr to flush immediately
+ my $ofh = select STDERR;
+ $| = 1;
+ select $ofh;
+}
+
+ # Counter updates queued by helper routines until flushCounters() runs
+ my @counterUpdates = ();
+
+ # Write one counter update to STDERR as a "reporter:counter:" line;
+ # $update is the text that follows that prefix.
+ sub counter($) {
+     my ($update) = @_;
+     print STDERR "reporter:counter:" . $update . "\n";
+ }
+
+ # Report every queued update through counter(), in order, then empty the queue.
+ sub flushCounters() {
+     counter($_) foreach @counterUpdates;
+     @counterUpdates = ();
+ }
+
+ # Return a copy of the argument with leading and trailing whitespace removed.
+ sub trim($) {
+     my ($string) = @_;
+     for ($string) {
+         s/^\s+//;
+         s/\s+$//;
+     }
+     return $string;
+ }
+
+ my $ref = "";
+ my $dest_dir = ""; # set by --destdir; echoed in the startup banner
+ my $output = ""; # set by --output (required): where the counters file is pushed
+
+ # Arguments: a message and an optional exit status.  Prints the message
+ # on STDERR and exits; a missing or zero status becomes 1.
+ sub dieusage {
+     my ($msg, $exitlevel) = @_;
+     $exitlevel ||= 1;
+     print STDERR "$msg\n";
+     exit $exitlevel;
+ }
+
+ Tools::initTools();
+ # Private copy of the environment, passed by reference to the Get helpers
+ my %env = %ENV;
+
+ # Command-line options; tool locations and AWS credentials are stored
+ # directly in the Tools:: and AWS:: package variables
+ GetOptions (
+ "s3cmd:s" => \$Tools::s3cmd_arg,
+ "s3cfg:s" => \$Tools::s3cfg,
+ "jar:s" => \$Tools::jar_arg,
+ "accessid:s" => \$AWS::accessKey,
+ "secretid:s" => \$AWS::secretKey,
+ "hadoop:s" => \$Tools::hadoop_arg,
+ "wget:s" => \$Tools::wget_arg,
+ "destdir:s" => \$dest_dir,
+ "output:s" => \$output) || dieusage("Bad option", 1);
+
+ Tools::purgeEnv();
+
+ # Startup banner: effective tool settings plus a listing of the working directory
+ $output ne "" || die "Must specify non-empty -output\n";
+ print STDERR "s3cmd: found: $Tools::s3cmd, given: $Tools::s3cmd_arg\n";
+ print STDERR "jar: found: $Tools::jar, given: $Tools::jar_arg\n";
+ print STDERR "hadoop: found: $Tools::hadoop, given: $Tools::hadoop_arg\n";
+ print STDERR "wget: found: $Tools::wget, given: $Tools::wget_arg\n";
+ print STDERR "s3cfg: $Tools::s3cfg\n";
+ print STDERR "local destination dir: $dest_dir\n";
+ print STDERR "output url: $output\n";
+ print STDERR "ls -al\n";
+ print STDERR `ls -al`;
+
+ ##
+ # Push local file $fn to the destination in $output: via Get::do_s3_put
+ # for s3 URLs, Get::do_hdfs_put for hdfs URLs, and a plain "cp" into a
+ # local directory (created if needed) otherwise.  Appends a trailing "/"
+ # to the file-level $output when it lacks one.  Dies if the local
+ # directory cannot be created or the copy fails.
+ #
+ sub pushResult($) {
+     my $fn = shift;
+     print STDERR "Pushing $fn\n";
+     $output .= "/" unless $output =~ /\/$/;
+     if($output =~ /^s3/i) {
+         Get::do_s3_put($fn, $output, \@counterUpdates, \%env);
+     } elsif($output =~ /^hdfs/i) {
+         Get::do_hdfs_put($fn, $output, \@counterUpdates);
+     } else {
+         mkpath($output);
+         (-d $output) || die "Could not create push directory $output\n";
+         # This script defines no run() of its own, so call the Util
+         # helper by its full name (as Copy.pl does) rather than depend
+         # on it being exported into this package
+         Util::run("cp $fn $output") == 0 || die "Could not copy $fn to $output\n";
+     }
+ }
+
+ # Number of warnings issued so far; reported at the end of the script
+ my $warnings = 0;
+
+ # Print the given text on STDERR exactly as passed (no newline is added)
+ # and count it as a warning.
+ sub warning($) {
+     my ($text) = @_;
+     print STDERR $text;
+     $warnings++;
+ }
+
+ # Drain STDIN; its contents are not used
+ while(<STDIN>) { }
+
+ my $countersFn = "counters_".strftime('%Y_%H_%M_%S',localtime).".txt";
+ # Use 'or' so the check applies to open() itself: with '||' it bound to
+ # the filename string and a failed open went unnoticed
+ open(my $tmpFh, ">", $countersFn) or die "Could not open $countersFn for writing: $!\n";
+
+ my $counters = 0;
+ my $hadoop = Tools::hadoop();
+ # Ids of all jobs whose second column in "hadoop job -list all" is 2
+ my $jstr = `$hadoop job -list all | awk '\$1 ~ /^job/ && \$2 == 2 {print \$1}'`;
+ my @jobs = split(/[\n\r]+/, $jstr);
+ for my $job (@jobs) {
+     my $sstr = `$hadoop job -status $job`;
+     my @status = split(/[\n\r]+/, $sstr);
+     # Most recent line without "="; it labels the Key=Value lines after it
+     my $section = "";
+     for (@status) {
+         next if /^\s*$/; # skip blank lines
+         next if /^Job:/; # skip Job: lines
+         next if /^file:/; # skip file: lines
+         next if /^tracking URL:/;
+         if(/^map[(][)] completion: (.*)$/) {
+             $1 eq "1.0" || warning("Incomplete mappers:\n\"$_\"\n");
+         }
+         if(/^reduce[(][)] completion: (.*)$/) {
+             $1 eq "1.0" || warning("Incomplete reducers:\n\"$_\"\n");
+         }
+         next if /^Counters:/;
+         chomp;
+         my $l = trim($_);
+         if(/[=]/) {
+             # Key=Value line
+             $section ne "" || warning("No label before line:\n\"$_\"\n");
+             my @s = split(/[=]/, $l);
+             $#s == 1 || die;
+             # One output row per counter: job, section, key, value
+             print {$tmpFh} "$job\t$section\t$s[0]\t$s[1]\n";
+             counter("Get counters,Counters,1");
+             $counters++;
+         } else {
+             $section = $l;
+         }
+     }
+ }
+ # Buffered write errors surface at close, so check it before pushing
+ close($tmpFh) or die "Could not finish writing $countersFn: $!\n";
+
+ counter("Get counters,Counter files pushed,1");
+ print STDERR "Pushing counters file to $output\n";
+ pushResult($countersFn);
+
+ print STDERR "Collected $counters counters\n";
+ print STDERR "$warnings warnings\n";
diff --git a/Counters.pm b/Counters.pm
new file mode 100644
index 0000000..cdd43c6
--- /dev/null
+++ b/Counters.pm
@@ -0,0 +1,164 @@
+#!/usr/bin/perl -w
+
+##
+ # Counters.pm
+#
+# Authors: Ben Langmead
+# Date: February 14, 2010
+#
+# When it comes to counters, there are several complicating factors.
+# First, single-computer mode accesses counters in a very different way
+# from Hadoop or Cloud modes.
+#
+# Get all the counters and put them in the output directory.
+#
+
+package Counters;
+use strict;
+use warnings;
+use Fcntl qw(:DEFAULT :flock); # for locking
+use FindBin qw($Bin);
+use lib $Bin;
+use File::Path qw(mkpath);
+use Tools;
+use Util;
+use AWS;
+use Util;
+use Carp;
+
+##
+# Given a directory with stderr output from a single-computer-mode
+# stage ($dir), an output filename ($outfn), and a function to send
+# warning and error messages to ($msg), parse all the counter updates
+# into a counter hash and then write the hash to the file at $outfn.
+#
+ # Arguments: $dir   - directory whose files are scanned (.gz and .bz2
+ #                      files are decompressed on the fly)
+ #            $outfn - file the totals are appended to, one line per
+ #                      counter: "pid=<pid>\t<group>\t<name>\t<total>"
+ #            $msg   - coderef called with a warning string for
+ #                      ill-formed counter lines
+ # Dies if $dir is not a directory, if an input cannot be opened, if a
+ # decompressor exits non-zero, or if $outfn cannot be opened.
+ sub dumpLocalCounters($$$) {
+     my ($dir, $outfn, $msg) = @_;
+     -d $dir || die "No such input file or directory as \"$dir\"\n";
+     my @fs = <$dir/*>;
+     my %counters = ();
+     for my $f (@fs) {
+         # Open failures are now detected: the old 'open ... "str" || die'
+         # bound '||' to the string, so die could never fire.  List-form
+         # pipe opens also keep file names away from the shell.
+         my $inp;
+         my $isPipe = 0;
+         if($f =~ /\.gz$/) {
+             open($inp, "-|", "gzip", "-dc", $f) or die "Could not open pipe 'gzip -dc $f |'";
+             $isPipe = 1;
+         } elsif($f =~ /\.bz2$/) {
+             open($inp, "-|", "bzip2", "-dc", $f) or die "Could not open pipe 'bzip2 -dc $f |'";
+             $isPipe = 1;
+         } else {
+             open($inp, "<", $f) or die "Could not open $f for reading\n";
+         }
+         while(my $line = <$inp>) {
+             next unless $line =~ /^reporter:counter:/;
+             chomp($line);
+             my $update = substr($line, length("reporter:counter:"));
+             my @us = split(/,/, $update);
+             if(scalar(@us) != 3) {
+                 $msg->("Warning: Ill-formed counter updated line:\n$update");
+             }
+             # Accumulate <group>,<name>,<amount>
+             $counters{$us[0]}{$us[1]} += $us[2];
+         }
+         close($inp);
+         # $? is only meaningful after closing a pipe; for plain files it
+         # could hold the status of some unrelated earlier child
+         if($isPipe) {
+             $? == 0 || die "Bad exitlevel from input slurp: $?\n";
+         }
+     }
+     open(my $cnt, ">>", $outfn) or die "Could not open file '$outfn' for appending\n";
+     for my $k1 (sort keys %counters) {
+         for my $k2 (sort keys %{$counters{$k1}}) {
+             print {$cnt} "pid=$$\t$k1\t$k2\t$counters{$k1}{$k2}\n";
+         }
+     }
+     close($cnt);
+ }
+
+##
+# Use the 'hadoop' script to (a) determine what jobs have completed,
+# and (b) populate a hash with all the counter values.
+#
+# If we had information about job ids of previous jobs in this same job
+# flow, we wouldn't have to scan this whole list.
+#
+# Note: the caller has to know the job id of the .
+#
+ # Arguments: $cnth      - hashref filled as $cnth->{section}{key} = value
+ #            $selectjob - coderef called with a job id; jobs for which
+ #                          it returns false are skipped (undef selects
+ #                          every job)
+ #            $msg       - coderef used for progress and warning messages
+ #            $verbose   - true for per-job and per-section messages
+ sub getHadoopCounters($$$$) {
+     my ($cnth, $selectjob, $msg, $verbose) = @_;
+     $msg->("In getHadoopCounters:");
+     my $counters = 0; # overall
+     my $hadoop = Tools::hadoop();
+     # Get all finished jobs
+     my $jstr = `$hadoop job -list all | awk '\$1 ~ /^job/ && \$2 == 2 {print \$1}'`;
+     my @jobs = split(/[\n\r]+/, $jstr);
+     $selectjob = sub {return 1} unless defined($selectjob);
+     for my $job (@jobs) {
+         if(!$selectjob->($job)) {
+             $msg->(" Skipping job $job") if $verbose;
+             # Actually skip it: previously the job was reported as
+             # skipped but its counters were collected anyway
+             next;
+         }
+         $msg->(" Examining job $job") if $verbose;
+         my $sstr = `$hadoop job -status $job`;
+         my @status = split(/[\n\r]+/, $sstr);
+         my $seccounters = 0; # per section
+         # Most recent line without "="; it labels the Key=Value lines after it
+         my $section = "";
+         for (@status) {
+             next if /^\s*$/; # skip blank lines
+             next if /^Job:/; # skip Job: lines
+             next if /^file:/; # skip file: lines
+             next if /^tracking URL:/;
+             if(/^map[(][)] completion: (.*)$/) {
+                 $1 eq "1.0" || $msg->("Warning: Incomplete mappers:\n\"$_\"\n");
+             }
+             if(/^reduce[(][)] completion: (.*)$/) {
+                 $1 eq "1.0" || $msg->("Warning: Incomplete reducers:\n\"$_\"\n");
+             }
+             next if /^Counters:/;
+             chomp;
+             my $l = Util::trim($_);
+             if(/[=]/) {
+                 # Key=Value line
+                 $section ne "" || $msg->("No label before line:\n\"$_\"\n");
+                 my @s = split(/[=]/, $l);
+                 $#s == 1 || die;
+                 $cnth->{$section}{$s[0]} = $s[1];
+                 $counters++;
+                 $seccounters++;
+             } else {
+                 $msg->(" section had $seccounters counters") if $verbose && $section ne "";
+                 $section = $l;
+                 $seccounters = 0;
+                 $msg->(" Found section: $section") if $verbose;
+             }
+         }
+         $msg->(" section had $seccounters counters") if $verbose && $section ne "";
+     }
+ }
+
+##
+# Sift through a local directory of stderr output files, extract and
+# compile all the counter updates into the '$counters' hashref.
+#
+ # Arguments: $fn       - tab-separated counter file, three fields per line
+ #            $counters - hashref filled as $counters->{field 1}{field 2} = field 3
+ #            $msg, $verbose - accepted but not used here
+ # Lines are not chomped, so the stored value keeps the line's trailing
+ # newline.  Dies if the file cannot be opened or a line does not have
+ # exactly three fields.
+ sub getLocalCounters($$$$) {
+     my ($fn, $counters, $msg, $verbose) = @_;
+     open(CNTS, $fn) || die "Could not open counter file '$fn'";
+     while(<CNTS>) {
+         my ($group, $key, $value, @extra) = split(/\t/);
+         (defined($value) && !@extra) ||
+             die "Ill-formatted counter line; must have 3 fields:\n$_\n";
+         $counters->{$group}{$key} = $value;
+     }
+     close(CNTS);
+ }
+
+##
+# Get counters from previous stages.
+#
+ # Arguments: counter file name (may be undef or ""), hashref to fill,
+ # message coderef, verbosity flag.  With a non-empty file name the
+ # counters are read from that file; otherwise they are requested from Hadoop.
+ sub getCounters($$$$) {
+     my ($cntfn, $counters, $msg, $verbose) = @_;
+     my $haveFile = defined($cntfn) && $cntfn ne "";
+     # Try to get counters from specified file
+     return Counters::getLocalCounters($cntfn, $counters, $msg, $verbose) if $haveFile;
+     # Try to get counters from Hadoop
+     return Counters::getHadoopCounters($counters, undef, $msg, $verbose);
+ }
+
+1;
diff --git a/CrossbowIface.pm b/CrossbowIface.pm
new file mode 100644
index 0000000..9f71f04
--- /dev/null
+++ b/CrossbowIface.pm
@@ -0,0 +1,1495 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: February 11, 2010
+#
+# Use 'elastic-mapreduce' ruby script to invoke an EMR job described
+# in a dynamically-generated JSON file. Constructs the elastic-
+# mapreduce invocation from parameters/defaults/environment variables.
+#
+
+package CrossbowIface;
+use strict;
+use warnings;
+use Getopt::Long;
+use FindBin qw($Bin);
+use List::Util qw[min max];
+use Cwd 'abs_path';
+use lib $Bin;
+use Tools;
+use File::Path qw(mkpath);
+
+##
+# Function interface for invoking the generic Crossbow wrapper.
+#
+sub crossbow {
+
+scalar(@_) == 7 || die "Must specify 7 arguments";
+
+our @args = @{$_[0]};
+our $scr = $_[1];
+our $usage = $_[2];
+our $msg = $_[3];
+our $msgf = $_[4];
+our $emsg = $_[5];
+our $emsgf = $_[6];
+
+defined($msg) || ($msg = sub { print @_ });
+defined($msgf) || ($msgf = sub { printf @_ });
+defined($emsg) || ($emsg = sub { print STDERR @_ });
+defined($emsgf) || ($emsgf = sub { printf STDERR @_ });
+
+our $APP = "Crossbow";
+our $app = lc $APP;
+our $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
+if($VERSION eq "") {
+ $VERSION = `cat $Bin/VERSION_CROSSBOW`; $VERSION =~ s/\s//g;
+}
+
+our $umaskOrig = umask();
+
+sub dieusage($$$) {
+    # Send the error text and then the usage message, each followed
+    # by a newline, to the error-message callback; then exit with
+    # level $lev.
+    my ($text, $usage, $lev) = @_;
+    for my $part ($text, $usage) {
+        $emsg->("$part\n");
+    }
+    exit $lev;
+}
+
+our $warnings = 0;
+sub warning($) {
+    # Emit one warning line through the error-message callback and
+    # count it in $warnings.
+    my ($str) = @_;
+    $emsg->($str . "\n");
+    $warnings++;
+}
+
+# AWS params
+our $awsEnv = 0;
+our $emrScript = "";
+our $hadoopVersion = "";
+our $accessKey = "";
+our $secretKey = "";
+our $keypair = "";
+our $keypairFile = "";
+our $zone = "";
+our $credentials = "";
+our $swap = 0; # to add
+
+# EMR params
+our $dryrun = 0;
+our $name = "";
+our $waitJob = 0;
+our $instType = "";
+our $numNodes = 1;
+our $reducersPerNode = 0;
+our $emrArgs = "";
+our $noLogs = 0;
+our $logs = "";
+our $noEmrDebugging = 0;
+
+# Job params
+our $input = "";
+our $output = "";
+our $intermediate = "";
+our $partitionLen = 0;
+our $justAlign = 0;
+our $resumeAlign = 0;
+our $resumeSnps = 0;
+our $keepAll = 0;
+our $keepIntermediate = 0;
+
+# Local job params
+our $localJob = 0;
+our $test = 0;
+our $inputLocal = "";
+our $outputLocal = "";
+our $intermediateLocal = "";
+our $cores = 0;
+our $dontForce = 0;
+our $bowtie = "";
+our $samtools = "";
+our $fastq_dump = "";
+our $useSamtools = 0;
+our $useFastqDump = 0;
+our $soapsnp = "";
+our $externalSort = 0;
+our $maxSortRecords = 800000;
+our $maxSortFiles = 40;
+
+# Hadoop job params
+our $hadoopJob = 0;
+our $hadoop_arg = "";
+our $hadoopStreamingJar_arg = "";
+
+# Preprocessing
+our $preprocess = 0;
+our $justPreprocess = 0;
+our $preprocOutput = "";
+our $preprocCompress = "";
+our $preprocStop = 0;
+our $preprocMax = 0;
+
+# Crossbow params
+our $ref = "";
+our $bt_args = "";
+our $qual = "";
+our $discardAll = 0;
+our $discardReads = 0;
+our $discardRefBins = 0;
+our $indexLocal = "";
+our $truncate = 0;
+our $truncateDiscard = 0;
+our $cmapLocal = "";
+our $sequencesLocal = "";
+our $snpsLocal = "";
+our $ss_args = "";
+our $ss_hap_args = "";
+our $ss_dip_args = "";
+our $haploids = "";
+our $allHaploids = 0;
+
+# Other params
+our $tempdir = "";
+our $slaveTempdir = "";
+our $splitJars = 0;
+our $verbose = 0;
+
+sub absPath($) {
+    # Canonicalize a local path.  hdfs:/s3:/s3n: URLs and the empty
+    # string are returned untouched; a leading '~' is replaced with
+    # $ENV{HOME} before the path is handed to abs_path.
+    my ($path) = @_;
+    defined($path) || die;
+    return $path if $path eq "" || $path =~ /^hdfs:/i || $path =~ /^s3n?:/i;
+    $path =~ s/^~/$ENV{HOME}/;
+    my $ret = abs_path($path);
+    return $ret if defined($ret);
+    die "abs_path turned $path into undef\n";
+}
+
+##
+# A tiny log facility in case we need to report what we did to the user.
+#
+our $checkExeMsg = "";
+sub checkExeLog($) {
+    # Always append the text to the $checkExeMsg transcript; echo it
+    # through the error-message callback only in verbose mode.
+    my ($text) = @_;
+    $checkExeMsg .= $text;
+    if($verbose) {
+        $emsg->($text);
+    }
+}
+
+##
+# Can I run the executable and receive error 256? This is a little
+# more robust than -x, but it requires that the executable return 1
+# immediately if run without arguments.
+#
+sub canRun {
+    my ($nm, $f, $exitlevel) = @_;
+    $exitlevel = 0 unless defined($exitlevel);
+    # Run the candidate with no arguments and all output discarded;
+    # only the exit level (wait status shifted right by 8) is used.
+    my $ret = system("$f 2>/dev/null >/dev/null") >> 8;
+    return 1 if $ret == $exitlevel;
+    # Apart from the expected level, only 1 and 255 are accepted.
+    return 0 unless $ret == 1 || $ret == 255;
+    return 1 unless $nm eq "Rscript" || $nm eq "R";
+    # For R, additionally require that the listed packages load.
+    checkExeLog(" Checking whether R has appropriate R/Bioconductor packages...\n");
+    my $packages = "";
+    $packages .= "suppressPackageStartupMessages(library($_)); print('Found required package $_'); "
+        for ("lmtest", "multicore", "IRanges", "geneplotter");
+    my $out = `$f -e \"$packages print('All packages found')\" 2>&1`;
+    checkExeLog($out);
+    $ret = $? >> 8;
+    return $ret == $exitlevel;
+}
+
+##
+# Scan the bin subdirectory for a working version of the given program.
+#
+sub scanPrebuiltBin {
+    my ($nm, $base, $exitlevel) = @_;
+    defined($nm) || die;
+    defined($base) || die;
+    $exitlevel = 0 unless defined($exitlevel);
+    my @runnable;
+    # Each entry under $base/bin is searched for a file named $nm.
+    for my $dir (glob("$base/bin/*")) {
+        checkExeLog(" Scanning directory: $dir\n");
+        for my $cand (grep { -f $_ } glob("$dir/$nm")) {
+            checkExeLog(" Found candidate: $cand\n");
+            checkExeLog(" Runnable?...");
+            my $ok = canRun($nm, $cand, $exitlevel);
+            checkExeLog($ok ? "YES\n" : "no\n");
+            push(@runnable, $cand) if $ok;
+        }
+    }
+    # R gets one extra candidate at a fixed location below $Bin.
+    if($nm eq "Rscript" || $nm eq "R") {
+        my $rscript = "$Bin/R/bin/Rscript";
+        checkExeLog(" I'm searching for R or Rscript, so scanning directory: $rscript\n");
+        push(@runnable, $rscript) if canRun($nm, $rscript, $exitlevel);
+    }
+    if(!@runnable) {
+        checkExeLog(" No runnable candidates\n");
+        return "";
+    }
+    # Of several runnable candidates, the one sorting last wins.
+    my $best = (sort @runnable)[-1];
+    checkExeLog(" Settling on $best\n");
+    return $best;
+}
+
+##
+# Locate a runnable copy of executable '$nm' and return its path.
+#
+# Search order:
+#  1. '$path', the value given through command-line option '$arg';
+#     if it is set but not runnable, that is fatal.
+#  2. $ENV{$env}, extended by subdirectory '$sub' when that is non-empty.
+#  3. Prebuilt binaries below $Bin/bin (see scanPrebuiltBin).
+#  4. Whatever 'which $nm' reports.
+#
+# '$exitlevel' is the exit level the program is expected to return
+# when run without arguments (default 0); it is passed to every
+# canRun/scanPrebuiltBin probe.  When nothing is found an error is
+# reported; the script then exits with level 1 if '$dieOnFail' is
+# true, otherwise "" is returned.
+#
+sub checkExe {
+    my ($path, $nm, $env, $sub, $arg, $dieOnFail, $exitlevel) = @_;
+    $exitlevel = 0 unless defined($exitlevel);
+    $nm ne "" || die "Empty name\n";
+    defined($path) || die "Path for $nm undefined\n";
+    checkExeLog("Searching for '$nm' binary...\n");
+    checkExeLog(sprintf " Specified via $arg?....%s\n", (($path ne "") ? "YES" : "no"));
+    if($path ne "") {
+        # An explicitly requested path must work; no silent fallback.
+        my $cr = canRun($nm, $path, $exitlevel);
+        checkExeLog(sprintf(" Runnable?....%s\n", ($cr ? "YES" : "no")));
+        return $path if $cr;
+        die "Error: $arg specified, but path $path does not point to something $APP can execute\n";
+    }
+    my $envSpecified = defined($ENV{$env}) && $ENV{$env} ne "";
+    checkExeLog(sprintf " \$$env specified?....%s\n", ($envSpecified ? "YES ($ENV{$env})" : "no"));
+    if($envSpecified) {
+        my $envPath = $ENV{$env};
+        $envPath .= "/$sub" if $sub ne "";
+        $envPath .= "/$nm";
+        my $cr = canRun($nm, $envPath, $exitlevel);
+        checkExeLog(sprintf " Runnable?....%s\n", ($cr ? "YES" : "no"));
+        return $envPath if $cr;
+    }
+    checkExeLog(" Checking $Bin/bin...\n");
+    # Fixed: forward $exitlevel.  Without it the prebuilt scan probed
+    # with the default level 0, so a program expected to exit with a
+    # different level could never be accepted from $Bin/bin.
+    $path = scanPrebuiltBin($nm, $Bin, $exitlevel);
+    return $path if $path ne "";
+    checkExeLog(" Checking \$PATH...\n");
+    $path = `which $nm 2>/dev/null`;
+    if(defined($path)) {
+        chomp($path);
+        if($path) {
+            checkExeLog(" Found '$path'...\n");
+            my $cr = canRun($nm, $path, $exitlevel);
+            checkExeLog(sprintf " Runnable?....%s\n", ($cr ? "YES" : "no"));
+            return $path if $cr;
+        } else {
+            checkExeLog(" Didn't find anything...\n");
+        }
+    }
+    $emsg->("Error: Could not find '$nm' executable\n");
+    if($hadoopJob) {
+        $emsg->("Note: for Hadoop jobs, required executables must be located at the same path on all cluster nodes including the master.\n");
+    }
+    # In verbose mode the transcript was already echoed line by line.
+    unless($verbose) {
+        $emsg->("Here's what I tried:\n");
+        $emsg->($checkExeMsg);
+    }
+    exit 1 if $dieOnFail;
+    return "";
+}
+
+# Hand the caller-supplied argument list to GetOptions via @ARGV.
+# (The list archive's address obfuscation had turned '@ARGV' into
+# ' at ARGV', which is not valid Perl.)
+@ARGV = @args;
+
+my $help = 0;
+
+Getopt::Long::Configure("no_pass_through");
+GetOptions (
+# AWS params
+ "aws-env" => \$awsEnv,
+ "emr-script:s" => \$emrScript,
+ "elastic-mapreduce:s" => \$emrScript,
+ "hadoop-version:s" => \$hadoopVersion,
+ "accessid:s" => \$accessKey,
+ "secretid:s" => \$secretKey,
+ "keypair|key-pair:s" => \$keypair,
+ "key-pair-file:s" => \$keypairFile,
+ "zone|region:s" => \$zone,
+ "credentials:s" => \$credentials,
+# EMR params
+ "dryrun" => \$dryrun,
+ "dry-run" => \$dryrun,
+ "name:s" => \$name,
+ "instance-type:s" => \$instType,
+ "stay-alive" => \$waitJob,
+ "wait-on-fail" => \$waitJob,
+ "nodes:i" => \$numNodes,
+ "instances|num-instances:i" => \$numNodes,
+ "emr-args:s" => \$emrArgs,
+ "no-logs" => \$noLogs,
+ "logs:s" => \$logs,
+ "no-emr-debug" => \$noEmrDebugging,
+ "swap:i" => \$swap,
+# Job params
+ "input:s" => \$input,
+ "output:s" => \$output,
+ "intermediate:s" => \$intermediate,
+ "partition-len:i" => \$partitionLen,
+ "just-align" => \$justAlign,
+ "resume-align" => \$resumeAlign,
+ "resume-snps" => \$resumeSnps,
+ "local-job" => \$localJob,
+ "hadoop-job" => \$hadoopJob,
+ "keep-all" => \$keepAll,
+ "keep-intermediates" => \$keepIntermediate,
+ "test" => \$test,
+# Local job params
+ "input-local:s" => \$inputLocal,
+ "output-local:s" => \$outputLocal,
+ "intermediate-local:s" => \$intermediateLocal,
+ "cores:i" => \$cores,
+ "cpus:i" => \$cores,
+ "max-sort-records:i" => \$maxSortRecords,
+ "max-sort-files:i" => \$maxSortFiles,
+ "dont-overwrite" => \$dontForce,
+ "no-overwrite" => \$dontForce,
+ "bowtie:s" => \$bowtie,
+ "samtools:s" => \$samtools,
+ "fastq-dump:s" => \$fastq_dump,
+ "soapsnp:s" => \$soapsnp,
+ "external-sort" => \$externalSort,
+# Hadoop job params
+ "hadoop:s" => \$hadoop_arg,
+ "streaming-jar:s" => \$hadoopStreamingJar_arg,
+# Crossbow params
+ "reference:s" => \$ref,
+ "index-local:s" => \$indexLocal,
+ "quality|qual|quals:s" => \$qual,
+ "bowtie-args:s" => \$bt_args,
+ "discard-reads:f" => \$discardReads,
+ "discard-all:f" => \$discardAll,
+ "discard-ref-bins:f" => \$discardRefBins,
+ "truncate|truncate-length:i"=> \$truncate,
+ "truncate-discard:i" => \$truncateDiscard,
+ "cmap-local:s" => \$cmapLocal,
+ "sequences-local:s" => \$sequencesLocal,
+ "snps-local:s" => \$snpsLocal,
+ "ss-args:s" => \$ss_args,
+ "ss-hap-args:s" => \$ss_hap_args,
+ "ss-dip-args:s" => \$ss_dip_args,
+ "soapsnp-args:s" => \$ss_args,
+ "soapsnp-hap-args:s" => \$ss_hap_args,
+ "soapsnp-dip-args:s" => \$ss_dip_args,
+ "haploids:s" => \$haploids,
+ "all-haploids" => \$allHaploids,
+# Preprocessing params
+ "preprocess" => \$preprocess,
+ "just-preprocess" => \$justPreprocess,
+ "crossbow" => sub { $justPreprocess = 0 },
+ "pre-output:s" => \$preprocOutput,
+ "preproc-output:s" => \$preprocOutput,
+ "preprocess-output:s" => \$preprocOutput,
+ "pre-compress:s" => \$preprocCompress,
+ "preproc-compress:s" => \$preprocCompress,
+ "preprocess-compress:s" => \$preprocCompress,
+ "pre-stop:i" => \$preprocStop,
+ "pre-filemax:i" => \$preprocMax,
+# Other params
+ "tempdir:s" => \$tempdir,
+ "slave-tempdir:s" => \$slaveTempdir,
+ "split-jars" => \$splitJars,
+ "verbose" => \$verbose,
+ "version" => \$VERSION,
+ "help" => \$help
+) || dieusage("Error parsing options", $usage, 1);
+
+dieusage("", $usage, 0) if $help;
+
+# This function generates random strings of a given length
+sub randStr($) {
+    # Build a string of $len characters, each picked with rand() from
+    # the letters, digits and underscore.
+    my ($len) = @_;
+    my @alphabet = ('a'..'z', 'A'..'Z', '0'..'9', '_');
+    return join("", map { $alphabet[int(rand(scalar(@alphabet)))] } 1..$len);
+}
+srand(time ^ $$);
+my $randstr = randStr(10);
+
+# See http://aws.amazon.com/ec2/instance-types/
+
+our %instTypeNumCores = (
+ "m1.small" => 1,
+ "m1.large" => 2,
+ "m1.xlarge" => 4,
+ "c1.medium" => 2,
+ "c1.xlarge" => 8,
+ "m2.xlarge" => 2,
+ "m2.2xlarge" => 4,
+ "m2.4xlarge" => 8,
+ "cc1.4xlarge" => 8
+);
+
+our %instTypeSwap = (
+ "m1.small" => (2 *1024), # 1.7 GB
+ "m1.large" => (8 *1024), # 7.5 GB
+ "m1.xlarge" => (16*1024), # 15.0 GB
+ "c1.medium" => (2 *1024), # 1.7 GB
+ "c1.xlarge" => (8 *1024), # 7.0 GB
+ "m2.xlarge" => (16*1024), # 17.1 GB
+ "m2.2xlarge" => (16*1024), # 34.2 GB
+ "m2.4xlarge" => (16*1024), # 68.4 GB
+ "cc1.4xlarge" => (16*1024) # 23.0 GB
+);
+
+our %instTypeBitsMap = (
+ "m1.small" => 32,
+ "m1.large" => 64,
+ "m1.xlarge" => 64,
+ "c1.medium" => 32,
+ "c1.xlarge" => 64,
+ "m2.xlarge" => 64,
+ "m2.2xlarge" => 64,
+ "m2.4xlarge" => 64,
+ "cc1.4xlarge" => 64
+);
+
+##
+# Return the appropriate configuration string for setting the number of fields
+# to bin on. This depends on the Hadoop version.
+#
+sub partitionConf($) {
+    # $binFields: number of key fields to partition on.
+    # Returns the "name=value" configuration string for the Hadoop
+    # version held in $hadoopVersion; dies if that cannot be parsed.
+    my $binFields = shift;
+    # Split e.g. "0.20.2+737" into numeric components.  Empty fields
+    # are dropped so that a leading non-digit (e.g. "v0.20") does not
+    # produce an empty major version that compares equal to 0.
+    my @vers = grep { $_ ne "" } split(/[^0-9]+/, $hadoopVersion);
+    # Fixed: the upper bound was written 'scalar(@vers <= 4)', with
+    # the comparison inside scalar(); compare the count directly.
+    (scalar(@vers) >= 2 && scalar(@vers) <= 4) || die "Could not parse Hadoop version: \"$hadoopVersion\"\n";
+    my ($hadoopMajorVer, $hadoopMinorVer) = ($vers[0], $vers[1]);
+    # Versions before 0.19 take the older property name.
+    if($hadoopMajorVer == 0 && $hadoopMinorVer < 19) {
+        return "num.key.fields.for.partition=$binFields";
+    }
+    return "mapred.text.key.partitioner.options=-k1,$binFields";
+}
+
+##
+# Return the parameter used to configure Hadoop. In older versions it
+# was -jobconf; in newer versions, it's -D.
+#
+sub confParam() {
+    # Returns the option that introduces a configuration setting,
+    # followed by the text '", "' so that it can be spliced into a
+    # quoted argument list: -jobconf before Hadoop 0.19, -D otherwise.
+    # Dies if $hadoopVersion cannot be parsed.
+    # Empty fields are dropped so that a leading non-digit in the
+    # version string does not yield an empty major version.
+    my @vers = grep { $_ ne "" } split(/[^0-9]+/, $hadoopVersion);
+    # Fixed: the upper bound was written 'scalar(@vers <= 4)', with
+    # the comparison inside scalar(); compare the count directly.
+    (scalar(@vers) >= 2 && scalar(@vers) <= 4) || die "Could not parse Hadoop version: \"$hadoopVersion\"\n";
+    my ($hadoopMajorVer, $hadoopMinorVer) = ($vers[0], $vers[1]);
+    if($hadoopMajorVer == 0 && $hadoopMinorVer < 19) {
+        return "-jobconf\", \"";
+    }
+    return "-D\", \"";
+}
+
+##
+# Return the parameter used to ask streaming Hadoop to cache a file.
+#
+sub cacheFile() {
+    # Returns the streaming option used to cache a file.  The result
+    # is currently "-cacheFile" for every version; $hadoopVersion is
+    # still validated so that an unparseable version is fatal here
+    # just as in partitionConf and confParam.
+    # Empty fields are dropped so that a leading non-digit in the
+    # version string does not count as a version component.
+    my @vers = grep { $_ ne "" } split(/[^0-9]+/, $hadoopVersion);
+    # Fixed: the upper bound was written 'scalar(@vers <= 4)', with
+    # the comparison inside scalar(); compare the count directly.
+    (scalar(@vers) >= 2 && scalar(@vers) <= 4) || die "Could not parse Hadoop version: \"$hadoopVersion\"\n";
+    # A version-dependent switch to "-files" (for 0.19 and later) was
+    # left disabled by the original author:
+    #if($hadoopMajorVer == 0 && $hadoopMinorVer < 19) {
+    return "-cacheFile";
+    #}
+    #return "-files";
+}
+
+sub validateInstType($) {
+    # Die unless the instance type has an entry in %instTypeNumCores.
+    my ($type) = @_;
+    die "Bad --instance-type: \"$type\"\n" unless defined($instTypeNumCores{$type});
+}
+
+sub instanceTypeBits($) {
+    # Look up the value recorded for the instance type in
+    # %instTypeBitsMap; an unknown type is fatal.
+    my ($type) = @_;
+    my $bits = $instTypeBitsMap{$type};
+    defined($bits) || die "Bad --instance-type: \"$type\"\n";
+    return $bits;
+}
+
+$hadoopVersion = "0.20.205" if !defined($hadoopVersion) || $hadoopVersion eq "";
+my $appDir = "$app-emr/$VERSION";
+$accessKey = $ENV{AWS_ACCESS_KEY_ID} if
+ $accessKey eq "" && $awsEnv && defined($ENV{AWS_ACCESS_KEY_ID});
+$secretKey = $ENV{AWS_SECRET_ACCESS_KEY} if
+ $secretKey eq "" && $awsEnv && defined($ENV{AWS_SECRET_ACCESS_KEY});
+$name = "$APP-$VERSION" if $name eq "";
+$qual = "phred33" if $qual eq "";
+($qual eq "phred33" || $qual eq "phred64" || $qual eq "solexa64") ||
+ dieusage("Bad quality type: $qual", $usage, 1);
+$instType = "c1.xlarge" if $instType eq "";
+validateInstType($instType);
+$cores = 1 if $cores == 0 && $localJob;
+$cores = ($instTypeNumCores{$instType} || 1) if $cores == 0;
+$cores > 0 || die;
+$swap = ($instTypeSwap{$instType} || 0) if $swap == 0;
+$reducersPerNode = $cores if $reducersPerNode == 0;
+$reducersPerNode > 0 || die;
+$partitionLen = 1000000 if $partitionLen == 0;
+$bt_args = "-M 1" if $bt_args eq "";
+$ref eq "" || $ref =~ /\.jar$/ || dieusage("--reference must end with .jar", $usage, 1);
+$numNodes = 1 if !$numNodes;
+$haploids = "none" if $haploids eq "";
+$haploids = "all" if $allHaploids;
+$ss_args = "-2 -u -n -q" if $ss_args eq "";
+$ss_hap_args = "-r 0.0001" if $ss_hap_args eq "";
+$ss_dip_args = "-r 0.00005 -e 0.0001" if $ss_dip_args eq "";
+$justAlign = 0 unless(defined($justAlign));
+$resumeAlign = 0 unless(defined($resumeAlign));
+$preprocess = 0 unless(defined($preprocess));
+$justPreprocess = 0 unless(defined($justPreprocess));
+$preprocStop = 0 unless(defined($preprocStop));
+$preprocOutput eq "" || $preprocess ||
+ warning( "Warning: --pre-output is specified but --preprocess is not");
+$preprocCompress eq "" || $preprocess ||
+ warning("Warning: --pre-compress is specified but --preprocess is not");
+$preprocStop == 0 || $preprocess ||
+ warning("Warning: --pre-stop is specified but --preprocess is not");
+$preprocMax == 0 || $preprocess ||
+ warning("Warning: --pre-filemax is specified but --preprocess is not");
+$preprocCompress = "gzip" if $preprocCompress eq "";
+$preprocCompress = "gzip" if $preprocCompress eq "gz";
+$preprocMax = 500000 if !$preprocMax;
+$preprocCompress eq "gzip" || $preprocCompress eq "none" ||
+ dieusage("--pre-compress must be \"gzip\" or \"none\"", $usage, 1);
+$tempdir = "/tmp/$app-$randstr" unless $tempdir ne "";
+my $scriptTempdir = "$tempdir/invoke.scripts";
+mkpath($scriptTempdir);
+if(!$hadoopJob && !$localJob) {
+ $slaveTempdir = "/mnt/$$" if $slaveTempdir eq "";
+} else {
+ $slaveTempdir = "$tempdir" if $slaveTempdir eq "";
+}
+-d $tempdir || die "Could not create temporary directory \"$tempdir\"\n";
+if(!$hadoopJob && !$localJob) {
+ if($waitJob) {
+ $emrArgs .= " " if ($emrArgs ne "" && $emrArgs !~ /\s$/);
+ $emrArgs .= "--alive";
+ }
+ unless($noEmrDebugging) {
+ $emrArgs .= " " if ($emrArgs ne "" && $emrArgs !~ /\s$/);
+ $emrArgs .= "--enable-debugging";
+ }
+}
+
+my $failAction = "TERMINATE_JOB_FLOW";
+$failAction = "CANCEL_AND_WAIT" if $waitJob;
+
+($discardReads >= 0.0 && $discardReads <= 1.0) ||
+ die "--discard-reads must be in [0,1], was: $discardReads\n";
+length("$discardReads") > 0 || die "--discard-reads was empty\n";
+($discardRefBins >= 0.0 && $discardRefBins <= 1.0) ||
+ die "--discard-ref-bins must be in [0,1], was: $discardRefBins\n";
+length("$discardRefBins") > 0 || die "--discard-ref-bins was empty\n";
+($discardAll >= 0.0 && $discardAll <= 1.0) ||
+ die "--discard-all must be in [0,1], was: $discardAll\n";
+$discardReads = $discardAll if $discardReads == 0;
+$discardRefBins = $discardAll if $discardRefBins == 0;
+
+##
+# Parse a URL, extracting the protocol and type of program that will
+# be needed to download it.
+#
+sub parse_url($) {
+    # Classify a path or URL by its scheme: "s3" for s3: and s3n:,
+    # "hdfs" for hdfs:, and "local" for everything else (including
+    # the empty string).  Scheme matching is case-insensitive.
+    my $s = shift;
+    # Fixed: this used to call croak(), but Carp is not among the
+    # modules imported at the top of this file, so the guard itself
+    # would have failed with an undefined-subroutine error.
+    defined($s) || die "parse_url: undefined URL\n";
+    # Fixed: the scheme tests are anchored and require the ':' so
+    # that a local path merely containing "s3" or "hdfs" (such as
+    # /data/s3/reads) is no longer reported as remote.  These are the
+    # same patterns absPath uses.
+    return "s3" if $s =~ /^s3n?:/i;
+    return "hdfs" if $s =~ /^hdfs:/i;
+    return "local";
+}
+
+$input = absPath($input);
+$output = absPath($output);
+$intermediate = absPath($intermediate);
+$ref = absPath($ref);
+$indexLocal = absPath($indexLocal);
+$preprocOutput = absPath($preprocOutput);
+$tempdir = absPath($tempdir);
+
+my $resume = $resumeAlign || $resumeSnps;
+
+#
+# Work out which phases are going to be executed
+#
+my %stages = (
+ "preprocess" => 0,
+ "align" => 0,
+ "snps" => 0,
+ "postprocess" => 0
+);
+
+my ($firstStage, $lastStage) = ("", "");
+if($justPreprocess) {
+ $stages{preprocess} = 1;
+} elsif($justAlign) {
+ # --just-align specified. Either preprocess and align (input =
+ # manifest) or just align (input = preprocessed reads).
+ $stages{preprocess} = 1 if $preprocess;
+ $stages{align} = 1;
+} elsif($resumeAlign) {
+ $stages{snps} = 1;
+ $stages{postprocess} = 1;
+} elsif($resumeSnps) {
+ $stages{postprocess} = 1;
+} else {
+ $stages{preprocess} = 1 if $preprocess;
+ $stages{align} = 1;
+ $stages{snps} = 1;
+ $stages{postprocess} = 1;
+}
+# Determine first and last stages
+for my $s ("preprocess", "align", "snps", "postprocess") {
+ if(defined($stages{$s}) && $stages{$s} != 0) {
+ $firstStage = $s if $firstStage eq "";
+ $lastStage = $s;
+ }
+}
+$firstStage ne "" || die;
+$lastStage ne "" || die;
+my $numStages = 0;
+for my $k (keys %stages) { $numStages += $stages{$k}; }
+
+$useFastqDump = $stages{preprocess};
+$useSamtools = $stages{align} && 0;
+my $useBowtie = $stages{align};
+my $sraToolkit = $stages{preprocess};
+my $useSoapsnp = $stages{snps};
+my $pre = "CROSSBOW_";
+$bowtie =~ s/^~/$ENV{HOME}/;
+$samtools =~ s/^~/$ENV{HOME}/;
+$soapsnp =~ s/^~/$ENV{HOME}/;
+$fastq_dump =~ s/^~/$ENV{HOME}/;
+if($test) {
+ $verbose = 1;
+ my $failed = 0;
+ if($localJob || $hadoopJob) {
+ # Check for binaries
+ $bowtie = checkExe($bowtie, "bowtie", "${pre}BOWTIE_HOME", "", "--bowtie" , 0);
+ $samtools = checkExe($samtools, "samtools", "${pre}SAMTOOLS_HOME", "", "--samtools", 0) if $useSamtools;
+ $soapsnp = checkExe($soapsnp, "soapsnp", "${pre}SOAPSNP_HOME", "", "--soapsnp" , 0);
+ $fastq_dump = checkExe($fastq_dump, "fastq-dump","${pre}SRATOOLKIT_HOME", "", "--fastq-dump", 0, 4);
+ $msg->("Summary:\n");
+ $msgf->(" bowtie: %s\n", ($bowtie ne "" ? "INSTALLED at $bowtie" : "NOT INSTALLED"));
+ $msgf->(" samtools: %s\n", ($samtools ne "" ? "INSTALLED at $samtools" : "NOT INSTALLED")) if $useSamtools;
+ $msgf->(" soapsnp: %s\n", ($soapsnp ne "" ? "INSTALLED at $soapsnp" : "NOT INSTALLED"));
+ $msgf->(" fastq-dump: %s\n", ($fastq_dump ne "" ? "INSTALLED at $fastq_dump" : "NOT INSTALLED")) if $useFastqDump;
+ $msg->("Hadoop note: executables must be runnable via the SAME PATH on all nodes.\n") if $hadoopJob;
+ $failed = $bowtie eq "" || ($useSamtools && $samtools eq "") || $soapsnp eq ""; #|| $sra eq "";
+ if($failed) {
+ $msg->("FAILED install test\n");
+ } elsif($fastq_dump eq "") {
+ $msg->("PASSED WITH ***WARNING***: SRA toolkit fastq-dump not found; .sra inputs won't work but others will\n");
+ } else {
+ $msg->("PASSED install test\n");
+ }
+ } else {
+ $emrScript = checkExe($emrScript, "elastic-mapreduce", "${pre}EMR_HOME", "", "--emr-script", 0);
+ $msg->("Summary:\n");
+ $msgf->(" elastic-mapreduce: %s\n", ($emrScript ne "" ? "INSTALLED at $emrScript" : "NOT INSTALLED"));
+ $failed = $emrScript eq "";
+ $msg->($failed ? "FAILED install test\n" : "PASSED install test\n");
+ }
+ exit $failed ? 1 : 0;
+}
+if($localJob || $hadoopJob) {
+    # Check for binaries.  Each tool is looked up only when the stage
+    # that needs it will run; bowtie, samtools and soapsnp are
+    # mandatory (dieOnFail = 1).
+    $bowtie = checkExe($bowtie, "bowtie", "${pre}BOWTIE_HOME", "", "--bowtie" , 1) if $useBowtie;
+    $samtools = checkExe($samtools, "samtools", "${pre}SAMTOOLS_HOME", "", "--samtools", 1) if $useSamtools;
+    $soapsnp = checkExe($soapsnp, "soapsnp", "${pre}SOAPSNP_HOME", "", "--soapsnp" , 1) if $useSoapsnp;
+    # fastq-dump is optional (dieOnFail = 0) and is probed with an
+    # expected exit level of 4.
+    $fastq_dump = checkExe($fastq_dump, "fastq-dump", "${pre}SRATOOLKIT_HOME", "", "--fastq-dump", 0, 4) if $useFastqDump;
+    # Fixed: warn only when fastq-dump was actually looked for.  The
+    # test used to fire whenever $fastq_dump was empty, which is also
+    # the case when preprocessing is not part of the run.
+    if($useFastqDump && $fastq_dump eq "") {
+        print STDERR "***WARNING***\n";
+        print STDERR "***WARNING***: fastq-dump not found; .sra inputs won't work but others will\n";
+        print STDERR "***WARNING***\n";
+    }
+} else {
+    # Cloud mode: only the elastic-mapreduce script is needed locally.
+    $emrScript = checkExe($emrScript, "elastic-mapreduce", "${pre}EMR_HOME", "", "--emr-script", 1);
+}
+
+# Parse input, output and intermediate directories
+if($inputLocal eq "") {
+ defined($input) || die;
+ $input = "hdfs://$input" if parse_url($input) eq "local";
+} else {
+ parse_url($inputLocal) eq "local" || die "--input-local specified non-local URL: $inputLocal\n";
+ $input = $inputLocal;
+}
+if($outputLocal eq "") {
+ defined($output) || die;
+ $output = "hdfs://$output" if parse_url($output) eq "local";
+} else {
+ parse_url($outputLocal) eq "local" || die "--output-local specified non-local URL: $outputLocal\n";
+ $output = $outputLocal;
+}
+if(!$hadoopJob && !$localJob) {
+    # If the user hasn't specified --no-logs and hasn't specified a --log-uri
+    # via --emr-args, then specify a subdirectory of the output directory as
+    # the log dir.
+    $logs = "${output}_logs" if $logs eq "";
+    if(!$noLogs && $emrArgs !~ /-log-uri/) {
+        $emrArgs .= " " if ($emrArgs ne "" && $emrArgs !~ /\s$/);
+        $emrArgs .= "--log-uri $logs ";
+    }
+    my @vers = split(/[^0-9]+/, $hadoopVersion);
+    # Fixed: validate the number of version components before any of
+    # them is indexed (the check used to come after the comparison
+    # below), and compare the count itself rather than wrapping the
+    # comparison in scalar().
+    (scalar(@vers) >= 2 && scalar(@vers) <= 4) || die "Could not parse Hadoop version: \"$hadoopVersion\"\n";
+    if($vers[0] < 1 && $vers[1] < 20) {
+        # Fixed: the message named a different application ("Myrna").
+        die "Error: $APP not compatible with Hadoop versions before 0.20";
+    }
+    # Only these two version families are translated into EMR options.
+    if ($vers[0] == 0 && $vers[1] == 20 && scalar(@vers) > 2 && $vers[2] == 205) {
+        $emrArgs .= " " if ($emrArgs ne "" && $emrArgs !~ /\s$/);
+        $emrArgs .= "--hadoop-version=0.20.205 --ami-version 2.0 ";
+    } elsif($vers[0] == 0 && $vers[1] == 20) {
+        $emrArgs .= " " if ($emrArgs ne "" && $emrArgs !~ /\s$/);
+        $emrArgs .= "--hadoop-version=0.20 --ami-version 1.0 ";
+    } else {
+        print STDERR "Error: Expected Hadoop version 0.20 or 0.20.205, got $hadoopVersion\n";
+        exit 1;
+    }
+}
+my $intermediateSet = ($intermediate ne "" || $intermediateLocal ne "");
+if($intermediateLocal eq "") {
+ if($intermediate eq "") {
+ if($localJob) {
+ $intermediate = "$tempdir/$app/intermediate/$$";
+ } else {
+ $intermediate = "hdfs:///$app/intermediate/$$";
+ }
+ }
+} else {
+ parse_url($intermediateLocal) eq "local" || die "--intermediate-local specified non-local URL: $intermediateLocal\n";
+ $intermediate = $intermediateLocal;
+}
+
+$output ne "" || dieusage("Must specify --output", $usage, 1);
+if(!$localJob && !$hadoopJob) {
+ parse_url($output) eq "s3" || die "Error: In cloud mode, --output path must be an S3 path; was: $output\n";
+}
+if($resume && $intermediateSet) {
+ die "Cannot specify both --resume-* and --intermediate; specify intermediate directory\n".
+ "to be resumed using --input. --intermediate is automatically set to --input\n";
+}
+if($intermediate eq "" && $localJob) {
+ $intermediate = "$tempdir/$app/intermediate";
+} elsif($intermediate eq "") {
+ $intermediate = "hdfs:///tmp/$app" if $intermediate eq "";
+}
+$input ne "" || dieusage("Must specify --input", $usage, 1);
+if(!$localJob && !$hadoopJob) {
+ parse_url($input) eq "s3" || die "Error: In cloud mode, --input path must be an S3 path; was: $input\n";
+}
+if($localJob && !$justPreprocess) {
+ $snpsLocal ne "" || die "Must specify --snps-local when --local-job is specified\n";
+ $sequencesLocal ne "" || die "Must specify --sequences-local when --local-job is specified\n";
+ $cmapLocal ne "" || die "Must specify --cmap-local when --local-job is specified\n";
+ $indexLocal ne "" || die "Must specify --index-local when --local-job is specified\n";
+}
+
+sub checkArgs($$) {
+    # Validate the argument string given for option $param and return
+    # it with every space turned into an underscore.  Tabs and line
+    # breaks are fatal; underscores already present draw a message.
+    my ($args, $param) = @_;
+    die "$param \"$args\" has one or more illegal whitespace characters\n"
+        if $args =~ /[\t\n\r]/;
+    $emsg->("$param \"$args\" contains underscores; this may confuse $APP\n")
+        if $args =~ /[_]/;
+    (my $encoded = $args) =~ tr/ /_/;
+    die "$param still has whitespace after space conversion: \"$encoded\"\n"
+        if $encoded =~ /\s/;
+    return $encoded;
+}
+$ss_args = checkArgs($ss_args, "--ss-args");
+$ss_hap_args = checkArgs($ss_hap_args, "--ss-hap-args");
+$ss_dip_args = checkArgs($ss_dip_args, "--ss-dip-args");
+
+sub upperize($) {
+    # Upper-case a leading lower-case "s3n", "s3" or "hdfs"; the rest
+    # of the string is returned unchanged.
+    my ($url) = @_;
+    $url =~ s/^(s3n|s3|hdfs)/\U$1/;
+    return $url;
+}
+
+#
+# If the caller has provided all the relevant individual parameters,
+# bypass the credentials file.
+#
+my $credentialsFile = "";
+if($credentials eq "" && $accessKey ne "" && $secretKey ne "") {
+ my ($regionStr, $keypairStr, $keypairFileStr) = ("", "", "");
+ $regionStr = "--region=$zone" if $zone ne "";
+ $keypairStr = "--key-pair=$keypair" if $keypair ne "";
+ $keypairFileStr = "--key-pair-file=$keypairFile" if $keypairFile ne "";
+ $credentials = "--access-id=$accessKey --private-key=$secretKey $keypairStr $keypairFileStr $regionStr";
+} elsif($credentials ne "") {
+ $credentialsFile = $credentials;
+ $credentials = "-c $credentials";
+}
+
+my $intermediateUpper = upperize($intermediate);
+$ref ne "" || $justPreprocess || $localJob ||
+ dieusage("Must specify --reference OR --just-preprocess", $usage, 1);
+$ref eq "" || $ref =~ /\.jar$/ || dieusage("--reference must end with .jar", $usage, 1);
+$indexLocal eq "" || -f "$indexLocal.1.ebwt" || dieusage("--index-local \"$indexLocal\" path doesn't point to an index", $usage, 1);
+$sequencesLocal eq "" || -d $sequencesLocal || dieusage("--sequences-local \"$sequencesLocal\" path doesn't point to a directory", $usage, 1);
+$snpsLocal eq "" || -d $snpsLocal || dieusage("--snps-local \"$snpsLocal\" path doesn't point to a directory", $usage, 1);
+$cmapLocal eq "" || -f $cmapLocal || dieusage("--cmap-local \"$cmapLocal\" path doesn't point to a readable file", $usage, 1);
+
+if(!$localJob && !$hadoopJob && defined($ref) && $ref ne "") {
+ parse_url($ref) eq "s3" || die "Error: In cloud mode, --reference path must be an S3 path; was: $ref\n";
+}
+
+# Remove inline credentials from URLs
+# NOTE(review): '[^\/]' matches exactly one character, so these
+# substitutions only fire when a single character sits between '://'
+# and '@'; when they do fire, the '://' separator is removed as well.
+# Presumably '[^\/]*' with '://' kept in the replacement was intended -
+# confirm before changing, since existing jobs may depend on the
+# credentials staying in the URL.
+$input =~ s/:\/\/[^\/]@//;
+$output =~ s/:\/\/[^\/]@//;
+$ref =~ s/:\/\/[^\/]@//;
+my $refIdx = $ref;
+$refIdx =~ s/\.jar$/.idx.jar/ if $splitJars;
+my $refSnp = $ref;
+$refSnp =~ s/\.jar$/.snp.jar/ if $splitJars;
+my $refCmap = $ref;
+$refCmap =~ s/\.jar$/.cmap.jar/ if $splitJars;
+my $refSnpUpper = upperize($refSnp);
+my $refCmapUpper = upperize($refCmap);
+my $refIdxUpper = upperize($refIdx);
+
+# Remove trailing slashes from output
+$output =~ s/[\/]+$//;
+
+my $hadoop = "";
+my $hadoopStreamingJar = "";
+if(!$localJob && !$hadoopJob) {
+} elsif($hadoopJob) {
+ # Look for hadoop script here on the master
+ if($hadoop_arg eq "") {
+ if(defined($ENV{HADOOP_HOME})) {
+ $hadoop = "$ENV{HADOOP_HOME}/bin/hadoop";
+ chomp($hadoop);
+ }
+ if($hadoop eq "" || system("$hadoop version 2>/dev/null >/dev/null") != 0) {
+ $hadoop = `which hadoop 2>/dev/null`;
+ chomp($hadoop);
+ }
+ } else {
+ $hadoop = $hadoop_arg;
+ }
+ if(system("$hadoop version 2>/dev/null >/dev/null") != 0) {
+ if($hadoop_arg ne "") {
+ die "Specified --hadoop: '$hadoop_arg' cannot be run\n";
+ } else {
+ die "Cannot find working 'hadoop' in PATH or HADOOP_HOME/bin; please specify --hadoop\n";
+ }
+ }
+ # Now look for hadoop streaming jar file here on the master
+ my $hadoopHome;
+ if($hadoopStreamingJar_arg eq "") {
+ $hadoopHome = `dirname $hadoop`;
+ $hadoopHome = `dirname $hadoopHome`;
+ chomp($hadoopHome);
+ $hadoopStreamingJar = "";
+ my @hadoopStreamingJars;
+ @hadoopStreamingJars = <$hadoopHome/contrib/streaming/hadoop-*-streaming.jar>;
+ if(scalar(@hadoopStreamingJars) == 0) {
+ # Alternate naming scheme
+ @hadoopStreamingJars = <$hadoopHome/contrib/streaming/hadoop-streaming-*.jar>;
+ }
+ if(scalar(@hadoopStreamingJars) == 0) {
+ # Alternate naming scheme
+ @hadoopStreamingJars = <$hadoopHome/contrib/streaming/hadoop-streaming.jar>;
+ }
+ $hadoopStreamingJar = $hadoopStreamingJars[0] if scalar(@hadoopStreamingJars) > 0;
+ } else {
+ $hadoopStreamingJar = $hadoopStreamingJar_arg;
+ }
+ unless(-f $hadoopStreamingJar) {
+ if($hadoopStreamingJar_arg ne "") {
+ die "Specified --streaming-jar: '$hadoopStreamingJar_arg' cannot be found\n";
+ } else {
+ die "Cannot find streaming jar in $hadoopHome/contrib/streaming; please specify --streaming-jar\n";
+ }
+ }
+ # Derive the Hadoop version from the streaming jar's file name.
+ # Fixed: the capture is taken in list context, so a failed match
+ # yields undef instead of whatever an earlier match left in $1.
+ # Fixed: '[^\/]*' keeps the capture inside the file name; the greedy
+ # '.*' could start at a "hadoop-" in a directory name and swallow
+ # the path components in between.
+ ($hadoopVersion) = $hadoopStreamingJar =~ /hadoop-([^\/]*)-streaming\.jar$/;
+ if(!defined($hadoopVersion)) {
+ # Alternate naming scheme
+ ($hadoopVersion) = $hadoopStreamingJar =~ /hadoop-streaming-([^\/]*)\.jar$/;
+ }
+ defined($hadoopVersion) || die "Could not parse streaming jar name: $hadoopStreamingJar";
+ # Hadoop version might be as simple as 0.20 or as complex as 0.20.2+737
+ $emsg->("Detected Hadoop version '$hadoopVersion'") if $verbose;
+} elsif($localJob) {
+ system("sort < /dev/null") == 0 || die "Could not invoke 'sort'; is it in the PATH?\n";
+}
+
+# Set up the --samtools, --bowtie, --soapsnp and --fastq-dump arguments for each script invocation
+my $bowtie_arg = "";
+my $samtools_arg = "";
+my $soapsnp_arg = "";
+my $fastq_dump_arg = "";
+if($localJob || $hadoopJob) {
+ if($useSamtools) {
+ $samtools ne "" || die;
+ $msg->("$APP expects 'samtools' to be at path $samtools on the workers\n") if $hadoopJob;
+ $samtools_arg = "--samtools $samtools";
+ }
+
+ if($useBowtie) {
+ $bowtie ne "" || die;
+ $msg->("$APP expects 'bowtie' to be at path $bowtie on the workers\n") if $hadoopJob;
+ $bowtie_arg = "--bowtie $bowtie";
+ }
+
+ if($useSoapsnp) {
+ $soapsnp ne "" || die;
+ $msg->("$APP expects 'soapsnp' to be at path $soapsnp on the workers\n") if $hadoopJob;
+ $soapsnp_arg = "--soapsnp $soapsnp";
+ }
+
+ # fastq-dump is looked up with dieOnFail = 0 and a missing binary
+ # only draws a warning saying that non-.sra inputs still work.
+ # Fixed: a bare 'die' here contradicted that warning; the option is
+ # now passed on only when a binary was found.
+ if($useFastqDump && $fastq_dump ne "") {
+ $msg->("$APP expects 'fastq-dump' to be at path $fastq_dump on the workers\n") unless $localJob;
+ $fastq_dump_arg = "--fastq-dump $fastq_dump";
+ }
+}
+
+# Set up name of streaming jar for EMR mode
+my $emrStreamJar = "/home/hadoop/contrib/streaming/hadoop-streaming-$hadoopVersion.jar";
+if($hadoopVersion eq "0.20" || $hadoopVersion eq "0.18") {
+ $emrStreamJar = "/home/hadoop/contrib/streaming/hadoop-$hadoopVersion-streaming.jar";
+}
+
+# Set up some variables to save us some typing:
+
+my $cachef = cacheFile();
+my $ec2CacheFiles =
+qq! "$cachef", "s3n://$appDir/Get.pm#Get.pm",
+ "$cachef", "s3n://$appDir/Counters.pm#Counters.pm",
+ "$cachef", "s3n://$appDir/Util.pm#Util.pm",
+ "$cachef", "s3n://$appDir/Tools.pm#Tools.pm",
+ "$cachef", "s3n://$appDir/AWS.pm#AWS.pm"!;
+
+my $hadoopCacheFiles = qq! \\
+ -file '$Bin/Get.pm' \\
+ -file '$Bin/Counters.pm' \\
+ -file '$Bin/Util.pm' \\
+ -file '$Bin/Tools.pm' \\
+ -file '$Bin/AWS.pm' \\
+!;
+
+my $inputPreproc = $input;
+my $outputPreproc = ($preprocOutput ne "" ? $preprocOutput : "$intermediate/preproc");
+$outputPreproc = $output if $justPreprocess;
+my $outputPreprocUpper = upperize($outputPreproc);
+my $bits = instanceTypeBits($instType);
+$bits == 32 || $bits == 64 || die "Bad samtoolsBits: $bits\n";
+my $forceStr = ($dontForce ? "" : "--force");
+my $keepAllStr = $keepAll ? "--keep-all" : "";
+
+my $preprocArgs = "";
+$preprocArgs .= " --compress=$preprocCompress";
+$preprocArgs .= " --stop=$preprocStop";
+$preprocArgs .= " --maxperfile=$preprocMax";
+$preprocArgs .= " --s";
+$preprocArgs .= " --push=$outputPreprocUpper";
+
+my $samtoolsCacheFiles = qq!"$cachef", "s3n://$appDir/samtools$bits#samtools"!;
+my $sraCacheFiles = qq!"$cachef", "s3n://$appDir/fastq-dump$bits#fastq-dump"!;
+
+my $conf = confParam();
+
+my $preprocessJson = qq!
+{
+ "Name": "Preprocess short reads",
+ "ActionOnFailure": "$failAction",
+ "HadoopJarStep": {
+ "Jar": "$emrStreamJar",
+ "Args": [
+ "${conf}mapred.reduce.tasks=0",
+ "-input", "$inputPreproc",
+ "-output", "$outputPreproc",
+ "-mapper", "s3n://$appDir/Copy.pl $preprocArgs",
+ "-inputformat", "org.apache.hadoop.mapred.lib.NLineInputFormat",
+ $ec2CacheFiles,
+ $sraCacheFiles,
+ $samtoolsCacheFiles
+ ]
+ }
+}!;
+
+my $preprocessHadoop = qq!
+echo ==========================
+echo Stage \$phase of $numStages. Preprocess
+echo ==========================
+date
+$hadoop jar $hadoopStreamingJar \\
+ -D mapred.reduce.tasks=0 \\
+ -D mapred.job.name='Preprocess $inputPreproc' \\
+ -input $inputPreproc \\
+ -output $outputPreproc \\
+ -mapper '$Bin/Copy.pl $samtools_arg $fastq_dump_arg $preprocArgs' \\
+ $hadoopCacheFiles \\
+ -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat
+
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from Preprocess stage" && exit 1
+phase=`expr \$phase + 1`
+!;
+
+my $preprocessSh = qq!
+perl $Bin/MapWrap.pl \\
+ --stage \$phase \\
+ --num-stages $numStages \\
+ --name Preprocess \\
+ --input $inputPreproc \\
+ --output $outputPreproc \\
+ --counters ${output}_counters/counters.txt \\
+ --messages cb.local.\$\$.out \\
+ --line-by-line \\
+ --silent-skipping \\
+ $keepAllStr \\
+ $forceStr \\
+ --mappers $cores -- \\
+ perl $Bin/Copy.pl \\
+ --compress=$preprocCompress \\
+ --stop=$preprocStop \\
+ --maxperfile $preprocMax \\
+ $fastq_dump_arg \\
+ --push $outputPreproc \\
+ --counters ${output}_counters/counters.txt
+
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from Preprocess stage" && exit 1
+if [ \$phase -gt 1 -a $keepIntermediate -eq 0 -a $keepAll -eq 0 ] ; then
+ echo "Removing $inputPreproc (to keep, specify --keep-all or --keep-intermediates)"
+ rm -rf $inputPreproc
+fi
+phase=`expr \$phase + 1`
+!;
+
+my $inputAlign = (($firstStage eq "align") ? $input : $outputPreproc);
+my $outputAlign = (($lastStage eq "align") ? $output : "$intermediate/align");
+$truncate = max($truncate, $truncateDiscard);
+$truncateDiscard = $truncateDiscard > 0 ? "--discard-small" : "";
+
+my $alignArgs = "";
+$alignArgs .= " --discard-reads=$discardReads";
+$alignArgs .= " --ref=$refIdxUpper";
+$alignArgs .= " --destdir=$slaveTempdir";
+$alignArgs .= " --partlen=$partitionLen";
+$alignArgs .= " --qual=$qual";
+$alignArgs .= " --truncate=$truncate";
+$alignArgs .= " $truncateDiscard";
+$alignArgs .= " --";
+$alignArgs .= " --partition $partitionLen";
+$alignArgs .= " --mm -t --hadoopout --startverbose";
+$alignArgs .= " $bt_args";
+
+my $alignJson = qq!
+{
+ "Name": "$APP Step 1: Align with Bowtie",
+ "ActionOnFailure": "$failAction",
+ "HadoopJarStep": {
+ "Jar": "$emrStreamJar",
+ "Args": [
+ "${conf}mapred.reduce.tasks=0",
+ "-input", "$inputAlign",
+ "-output", "$outputAlign",
+ "-mapper", "s3n://$appDir/Align.pl $alignArgs",
+ "$cachef", "s3n://$appDir/bowtie$bits#bowtie",
+ $ec2CacheFiles
+ ]
+ }
+}!;
+
+my $alignHadoop = qq!
+echo ==========================
+echo Stage \$phase of $numStages. Align
+echo ==========================
+date
+$hadoop jar $hadoopStreamingJar \\
+ -D mapred.reduce.tasks=0 \\
+ -D mapred.job.name='Align $inputAlign' \\
+ -input $inputAlign \\
+ -output $outputAlign \\
+ -mapper '$Bin/Align.pl $bowtie_arg $alignArgs' \\
+ $hadoopCacheFiles
+
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from Align streaming job" && exit 1
+phase=`expr \$phase + 1`
+!;
+
+my $preprocOutputSpecified = $preprocOutput ne "" ? "1" : "0";
+
+my $alignSh = qq!
+perl $Bin/MapWrap.pl \\
+ --stage \$phase \\
+ --num-stages $numStages \\
+ --name Align \\
+ --input $inputAlign \\
+ --output $outputAlign \\
+ --counters ${output}_counters/counters.txt \\
+ --messages cb.local.\$\$.out \\
+ $keepAllStr \\
+ $forceStr \\
+ --mappers $cores -- \\
+ perl $Bin/Align.pl \\
+ $bowtie_arg \\
+ --discard-reads=$discardReads \\
+ --index-local=$indexLocal \\
+ --partlen=$partitionLen \\
+ --qual=$qual \\
+ --counters ${output}_counters/counters.txt \\
+ --truncate=$truncate \\
+ $truncateDiscard \\
+ -- \\
+ --partition $partitionLen \\
+ --mm -t --hadoopout --startverbose \\
+ $bt_args
+
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from Align stage" && exit 1
+if [ \$phase -gt 1 -a $keepIntermediate -eq 0 -a $keepAll -eq 0 -a $preprocOutputSpecified -eq 0 ] ; then
+ echo "Removing $inputAlign (to keep, specify --keep-all or --keep-intermediates)"
+ rm -rf $inputAlign
+fi
+phase=`expr \$phase + 1`
+!;
+
+my $snpInput = "$intermediate/align";
+my $snpOutput = "$intermediate/snps";
+
+my $snpTasks = $numNodes * $reducersPerNode * 4;
+my $snpArgs = "--discard-ref-bins=$discardRefBins ".
+ "--refjar=$refSnpUpper ".
+ "--destdir=$slaveTempdir ".
+ "--soapsnp=$soapsnp ".
+ "--args=$ss_args ".
+ "--haploid_args=$ss_hap_args ".
+ "--diploid_args=$ss_dip_args ".
+ "--basequal=\! ".
+ "--partition=$partitionLen ".
+ "--haploids=$haploids ".
+ "--replace-uscores";
+
+my $inputSnp = ($resumeAlign ? $input: "$intermediate/align");
+my $outputSnp = "$intermediate/snps";
+my $snpsPartitionConf = partitionConf(2);
+my $snpsJson = qq!
+{
+ "Name": "$APP Step 2: Call SNPs with SOAPsnp",
+ "ActionOnFailure": "$failAction",
+ "HadoopJarStep": {
+ "Jar": "$emrStreamJar",
+ "Args": [
+ "${conf}stream.num.map.output.key.fields=3",
+ "${conf}$snpsPartitionConf",
+ "${conf}mapred.reduce.tasks=$snpTasks",
+ "-input", "$snpInput",
+ "-output", "$snpOutput",
+ "-mapper", "cat",
+ "-reducer", "s3n://$appDir/Soapsnp.pl $snpArgs",
+ "-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner",
+ "$cachef", "s3n://$appDir/soapsnp$bits#soapsnp",
+ $ec2CacheFiles
+ ]
+ }
+}!;
+
+my $snpsHadoop = qq!
+echo ==========================
+echo Stage \$phase of $numStages. Call SNPs
+echo ==========================
+date
+$hadoop jar $hadoopStreamingJar \\
+ -D stream.num.map.output.key.fields=3 \\
+ -D $snpsPartitionConf \\
+ -D mapred.job.name='Soapsnp $inputSnp' \\
+ -D mapred.reduce.tasks=$snpTasks \\
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
+ -input $inputSnp \\
+ -output $outputSnp \\
+ -mapper 'cat' \\
+ -reducer '$Bin/Soapsnp.pl $soapsnp_arg $snpArgs' \\
+ $hadoopCacheFiles
+
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from Call SNPs streaming job" && exit 1
+phase=`expr \$phase + 1`
+!;
+
+$externalSort = $externalSort ? "--external-sort" : "";
+my $snpsSh = qq!
+perl $Bin/ReduceWrap.pl \\
+ --stage \$phase \\
+ --num-stages $numStages \\
+ --name "Call SNPs" \\
+ --input $snpInput \\
+ --output $snpOutput \\
+ --counters ${output}_counters/counters.txt \\
+ --messages cb.local.\$\$.out \\
+ --reducers $cores \\
+ --tasks $snpTasks \\
+ --bin-fields 2 \\
+ --sort-fields 3 \\
+ --max-sort-records $maxSortRecords \\
+ --max-sort-files $maxSortFiles \\
+ $externalSort \\
+ $keepAllStr \\
+ $forceStr \\
+ -- \\
+ perl $Bin/Soapsnp.pl \\
+ $soapsnp_arg \\
+ --discard-ref-bins=$discardRefBins \\
+ --args="$ss_args" \\
+ --snpdir="$snpsLocal" \\
+ --refdir="$sequencesLocal" \\
+ --haploid_args="$ss_hap_args" \\
+ --diploid_args="$ss_dip_args" \\
+ --basequal=\! \\
+ --partition=$partitionLen \\
+ --haploids="$haploids" \\
+ --counters ${output}_counters/counters.txt \\
+ --replace-uscores
+
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from SNP calling stage" && exit 1
+if [ \$phase -gt 1 -a $keepIntermediate -eq 0 -a $keepAll -eq 0 ] ; then
+ echo "Removing $snpInput (to keep, specify --keep-all or --keep-intermediates)"
+ rm -rf $snpInput
+fi
+phase=`expr \$phase + 1`
+!;
+
+my $inputDummy = "s3n://$app-emr/dummy-input";
+my $outputUpper = upperize($output);
+my $countersArgs = "";
+$countersArgs .= " --output=${outputUpper}_${app}_counters";
+
+my $countersJson = qq!
+{
+ "Name": "Get counters",
+ "ActionOnFailure": "$failAction",
+ "HadoopJarStep": {
+ "Jar": "$emrStreamJar",
+ "Args": [
+ "${conf}mapred.reduce.tasks=1",
+ "-input", "$inputDummy",
+ "-output", "${output}_${app}_counters/ignoreme1",
+ "-mapper", "cat",
+ "-reducer", "s3n://$appDir/Counters.pl $countersArgs",
+ $ec2CacheFiles
+ ]
+ }
+}!;
+my $countersSh = qq!
+!;
+
+my $inputPostproc = "$intermediate/snps";
+my $outputPostproc = "$output/${app}_results";
+
+my $postprocArgs = "";
+$postprocArgs .= " --cmapjar=$refCmapUpper";
+$postprocArgs .= " --destdir=$slaveTempdir";
+$postprocArgs .= " --output=$outputUpper";
+
+my $postprocPartitionConf = partitionConf(1);
+my $postprocJson = qq!
+{
+ "Name": "$APP Step 3: Postprocess",
+ "ActionOnFailure": "$failAction",
+ "HadoopJarStep": {
+ "Jar": "$emrStreamJar",
+ "Args": [
+ "${conf}stream.num.map.output.key.fields=2",
+ "${conf}$postprocPartitionConf",
+ "${conf}mapred.reduce.tasks=30",
+ "-input", "$inputPostproc",
+ "-output", "$output/ignoreme2",
+ "-mapper", "cat",
+ "-reducer", "s3n://$appDir/CBFinish.pl $postprocArgs",
+ "-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner",
+ $ec2CacheFiles
+ ]
+ }
+}!;
+
+my $postprocHadoop = qq!
+echo ==========================
+echo Stage \$phase of $numStages. Postprocess
+echo ==========================
+date
+$hadoop jar $hadoopStreamingJar \\
+ -D stream.num.map.output.key.fields=2 \\
+ -D $postprocPartitionConf \\
+ -D mapred.reduce.tasks=30 \\
+ -D mapred.job.name='Postprocess $inputPostproc' \\
+ -input $inputPostproc \\
+ -output $output/ignoreme2 \\
+ -mapper 'cat' \\
+ -reducer '$Bin/CBFinish.pl $postprocArgs' \\
+ $hadoopCacheFiles \\
+ -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
+
+rm -rf $output/ignoreme2
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from Postprocess streaming job" && exit 1
+phase=`expr \$phase + 1`
+!;
+
+my $postprocSh = qq!
+perl $Bin/ReduceWrap.pl \\
+ --stage \$phase \\
+ --num-stages $numStages \\
+ --name Postprocess \\
+ --input $inputPostproc \\
+ --output $outputPostproc \\
+ --counters ${output}_counters/counters.txt \\
+ --messages cb.local.\$\$.out \\
+ --reducers $cores \\
+ --tasks 1 \\
+ --bin-fields 1 \\
+ --sort-fields 2 \\
+ --max-sort-records $maxSortRecords \\
+ --max-sort-files $maxSortFiles \\
+ $externalSort \\
+ $keepAllStr \\
+ $forceStr \\
+ -- \\
+ perl $Bin/CBFinish.pl \\
+ --cmap=$cmapLocal \\
+ --counters ${output}_counters/counters.txt \\
+ --output="$outputPostproc"
+
+[ \$? -ne 0 ] && echo "Non-zero exitlevel from Postprocess stage" && exit 1
+if [ \$phase -gt 1 -a $keepIntermediate -eq 0 -a $keepAll -eq 0 ] ; then
+ echo "Removing $inputPostproc (to keep, specify --keep-all or --keep-intermediates)"
+ rm -rf $inputPostproc
+fi
+phase=`expr \$phase + 1`
+!;
+
+my $jsonFile = "$scriptTempdir/cb.$$.json";
+my $runJsonFile = "$scriptTempdir/cb.$$.json.sh";
+my $runHadoopFile = "$scriptTempdir/cb.$$.hadoop.sh";
+my $runLocalFile = "$scriptTempdir/cb.$$.sh";
+umask 0077;
+my $json = "";
+open JSON, ">$jsonFile" || die "Error: Could not open $jsonFile for writing\n";
+my $sh = "";
+open SH, ">$runLocalFile" || die "Error: Could not open $runLocalFile for writing\n";
+my $had = "";
+open HADOOP, ">$runHadoopFile" || die "Error: Could not open $runHadoopFile for writing\n";
+$json .= "[";
+$sh .= "#!/bin/sh\n\nphase=1\n";
+$sh .= "rm -f cb.local.\$\$.out\n";
+$sh .= qq!
+perl $Bin/CheckDirs.pl \\
+ --input $input \\
+ --intermediate $intermediate \\
+ --output $output \\
+ --counters ${output}_counters \\
+ --messages cb.local.\$\$.out \\
+ $forceStr
+!;
+$had .= "#!/bin/sh\n\nphase=1\n";
+#$had .= "rm -f cb.hadoop.\$\$.out\n";
+if($stages{preprocess}) {
+ $json .= "," if $json ne "[";
+ $json .= $preprocessJson;
+ $had .= $preprocessHadoop;
+ $sh .= $preprocessSh;
+}
+if($stages{align}) {
+ $json .= "," if $json ne "[";
+ $json .= $alignJson;
+ $had .= $alignHadoop;
+ $sh .= $alignSh;
+}
+if($stages{snps}) {
+ $json .= "," if $json ne "[";
+ $json .= $snpsJson;
+ $had .= $snpsHadoop;
+ $sh .= $snpsSh;
+}
+if($stages{postprocess}) {
+ $json .= "," if $json ne "[";
+ $json .= $postprocJson;
+ $had .= $postprocHadoop;
+ $sh .= $postprocSh;
+}
+$json .= "," if $json ne "[";
+$json .= $countersJson;
+$sh .= "echo \"All output to console recorded in cb.local.\$\$.out\"\n";
+$sh .= "date ; echo DONE\n";
+#$had .= "echo \"All output to console recorded in cb.hadoop.\$\$.out\"\n";
+$had .= "date ; echo DONE\n";
+$json .= "\n]\n";
+print JSON $json;
+close(JSON);
+print SH $sh;
+close(SH);
+print HADOOP $had;
+close(HADOOP);
+umask $umaskOrig;
+
+if(!$localJob && !$hadoopJob) {
+ $cores == 1 || $cores == 2 || $cores == 4 || $cores == 8 || die "Bad number of cores: $cores\n";
+}
+$name =~ s/"//g;
+(defined($emrScript) && $emrScript ne "") || $localJob || $hadoopJob || die;
+my $cmdJson = "$emrScript ".
+ "--create ".
+ "$credentials ".
+ "$emrArgs ".
+ "--name \"$name\" ".
+ "--num-instances $numNodes ".
+ "--instance-type $instType ".
+ "--json $jsonFile ".
+ "--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configurations/latest/memory-intensive ".
+ "--bootstrap-name \"Set memory-intensive mode\" ".
+ "--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configure-hadoop ".
+ "--bootstrap-name \"Configure Hadoop\" ".
+ "--args \"-s,mapred.job.reuse.jvm.num.tasks=1,-s,mapred.tasktracker.reduce.tasks.maximum=$cores,-s,io.sort.mb=100\" ".
+ "--bootstrap-action s3://elasticmapreduce/bootstrap-actions/add-swap ".
+ "--bootstrap-name \"Add Swap\" ".
+ "--args \"$swap\"";
+
+my $cmdSh = "sh $runLocalFile";
+my $cmdHadoop = "sh $runHadoopFile";
+
+if($dryrun) {
+ open RUN, ">$runJsonFile" || die "Error: Could not open $runJsonFile for writing\n";
+ print RUN "#!/bin/sh\n";
+ print RUN $cmdJson; # include argument passthrough
+ close(RUN);
+}
+
+$msg->("\n");
+$msg->("$APP job\n");
+$msg->("------------\n");
+$msg->("Job json in: $jsonFile\n") if (!$localJob && !$hadoopJob);
+$msg->("Job command in: $runJsonFile\n") if (!$localJob && !$hadoopJob && $dryrun);
+$msg->("Local commands in: $runLocalFile\n") if $localJob;
+$msg->("Hadoop streaming commands in: $runHadoopFile\n") if $hadoopJob;
+if($dryrun) {
+ $msg->("Exiting without running command because of --dryrun\n");
+} else {
+ my $ms = "";
+ my $pipe;
+ if($localJob) {
+ $pipe = "$cmdSh 2>&1 |";
+ $ms .= "$cmdSh\n" if $verbose;
+ } elsif($hadoopJob) {
+ $pipe = "$cmdHadoop 2>&1 |";
+ $ms .= "$cmdHadoop\n" if $verbose;
+ } else {
+ $pipe = "$cmdJson 2>&1 |";
+ $ms .= "$cmdJson\n" if $verbose;
+ }
+ $msg->($ms) if $verbose;
+ $msg->("Running...\n");
+ open(CMDP, $pipe) || die "Could not open pipe '$pipe' for reading\n";
+ while(<CMDP>) { $msg->($_); }
+ close(CMDP);
+ $msg->("elastic-mapreduce script completed with exitlevel $?\n");
+}
+$msg->("$warnings warnings\n") if $warnings > 0;
+
+}
+
+1;
diff --git a/Get.pm b/Get.pm
new file mode 100644
index 0000000..bc44b12
--- /dev/null
+++ b/Get.pm
@@ -0,0 +1,499 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: 2/14/2010
+#
+# Routines for getting and expanding jars from S3, HDFS, FTP/HTTP or the local filesystem
+#
+
+package Get;
+use strict;
+use warnings;
+use Fcntl qw(:DEFAULT :flock); # for locking
+use FindBin qw($Bin);
+use lib $Bin;
+use File::Path qw(mkpath);
+use File::Basename;
+use Tools;
+use AWS;
+use Util;
+use Carp;
+
+##
+# Parse a URL, extracting the protocol and the type of program that
+# will be needed to download it.
+#
+# Returns a two-element list ($proto, $type):
+#   $proto - lower-cased scheme ("s3", "s3n", "ftp", "http", "https" or
+#            "hdfs"), or "" when the argument is a local path
+#   $type  - one of "s3", "wget", "hdfs" or "local"
+#
+# Croaks if the argument is classified as local but is neither an
+# existing file nor an existing directory.
+#
+sub parse_url($) {
+    my ($ref) = @_;
+    $ref = "" unless defined($ref);
+    # Only text in front of the first ':' can be a scheme.  A path with no
+    # ':' at all is local, even if it happens to contain "s3" somewhere.
+    my $proto = "";
+    if($ref =~ /^([^:]*):/) {
+        $proto = lc $1;
+    }
+    # Anchored so that only the exact schemes s3 and s3n select s3cmd
+    if($proto =~ /^s3n?$/) {
+        return ($proto, "s3");
+    }
+    if($proto eq "ftp" || $proto eq "http" || $proto eq "https") {
+        return ($proto, "wget");
+    }
+    if($proto eq "hdfs") {
+        return ($proto, "hdfs");
+    }
+    # Anything else is taken to be a path on the local filesystem
+    (-f $ref || -d $ref) || croak("URL referring to local file $ref doesn't exist or cannot be read\n");
+    return ("", "local");
+}
+
+##
+# Rewrite an S3 URL into the form s3cmd expects: the scheme becomes
+# "s3:" and any embedded "accessKey:secretKey" credentials, together
+# with the character that follows them, are cut out.
+#
+sub s3cmdify($$) {
+    my ($path, $env) = @_;
+    # S3N:, S3: and s3n: all collapse to s3: (s3: itself is left alone)
+    $path =~ s/^(?:S3N|S3|s3n):/s3:/;
+    # The credentials are located with a literal substring search rather
+    # than a pattern: the secret key may itself contain slashes, which
+    # makes a correct regular expression hard to write.
+    AWS::ensureKeys($Tools::hadoop, $Tools::hadoop_arg, $env);
+    my $creds = $AWS::accessKey.":".$AWS::secretKey;
+    my $at = index($path, $creds);
+    # Remove ID:secret and the @ on the end
+    substr($path, $at, length($creds) + 1, "") if $at != -1;
+    return $path;
+}
+
+##
+# Run s3cmd with the given argument string.  The command line is echoed
+# to STDERR first.  Croaks if the exit status left in $? is non-zero;
+# otherwise returns the list ($?, $output).
+#
+sub do_s3cmd($$) {
+    my ($args, $env) = @_;
+    my $cmd = Tools::s3cmd($env)." $args";
+    print STDERR "Get.pm:do_s3cmd: $cmd\n";
+    my $out = Util::backtickAndWait($cmd, "s3cmd");
+    if($?) {
+        croak("Exitlevel from \"$cmd\" was $?\n");
+    }
+    return ($?, $out);
+}
+
+##
+# Fetch an S3 object into $dest_dir/$base with "s3cmd get".
+#
+#   $file     - S3 URL, passed through s3cmdify before use
+#   $base     - name to give the downloaded file
+#   $dest_dir - destination directory (created if necessary)
+#   $counters - ref to array that receives counter update strings
+#   $retries  - number of additional attempts allowed after a failure
+#   $env      - environment, handed on to s3cmdify and do_s3cmd
+#
+# Returns 0 on success, or the non-zero status of the last attempt once
+# every attempt has failed.  Croaks if s3cmd reports success but the
+# destination file does not exist.
+#
+sub do_s3_get($$$$$$) {
+    my ($file, $base, $dest_dir, $counters, $retries, $env) = @_;
+    $file = s3cmdify($file, $env);
+    mkpath($dest_dir);
+    my $cmd = "rm -f $dest_dir/$base >&2";
+    print STDERR "Get.pm:do_s3_get: $cmd\n";
+    system($cmd);
+    my $ret;
+    while($retries >= 0) {
+        # do_s3cmd croaks on a non-zero exit status.  Trap that here;
+        # otherwise the first failure escapes and this loop never gets
+        # the chance to retry.
+        my $ok = eval {
+            ($ret) = do_s3cmd("get --force $file $dest_dir/$base >&2", $env);
+            1;
+        };
+        if(!$ok) {
+            my $status = $?;
+            print STDERR "Get.pm:do_s3_get: attempt failed: $@";
+            $ret = $status ? $status : 1;
+        }
+        push @{$counters}, "Fetcher,s3cmd return $ret,1";
+        if($ret == 0) {
+            (-f "$dest_dir/$base") || croak("Did not create $dest_dir/$base - wrong URL?\n");
+            push @{$counters}, "Fetcher,Bytes obtained with s3cmd get,".(-s "$dest_dir/$base");
+            push @{$counters}, "Fetcher,Files obtained with s3cmd get,1";
+            return $ret;
+        }
+        # Discard a partial download before the next attempt.  Only the
+        # payload is removed: ensureFetched holds a lock on the
+        # ".$base.lock" file in this directory while we run, so the
+        # dot-files must stay in place.
+        unlink("$dest_dir/$base");
+        $retries--;
+    }
+    return $ret;
+}
+
+##
+# Upload local file $file into the S3 directory $dest with "s3cmd put",
+# keeping the file's base name, and append upload counters to
+# @$counters.
+#
+sub do_s3_put($$$$) {
+    my ($file, $dest, $counters, $env) = @_;
+    my $target = s3cmdify($dest, $env);
+    $target .= "/" if $target !~ /\/$/;
+    my $name = fileparse($file);
+    do_s3cmd("put $file $target$name >&2", $env);
+    push @{$counters},
+        "Fetcher,Bytes uploaded with s3cmd put,".(-s "$file"),
+        "Fetcher,Files uploaded with s3cmd put,1";
+}
+
+##
+# Copy $file out of HDFS into $dest_dir/$base with "hadoop dfs -get".
+# Counter updates are appended to @$counters; returns the value handed
+# back by Util::runAndWait.  Croaks if $base or $dest_dir is undefined.
+#
+sub do_hdfs_get($$$$) {
+    my ($file, $base, $dest_dir, $counters) = @_;
+    croak("Must define base\n") unless defined($base);
+    croak("Must define dest_dir\n") unless defined($dest_dir);
+    $file =~ s/^HDFS:/hdfs:/;
+    my $hadoop = Tools::hadoop();
+    mkpath($dest_dir);
+    my $target = "$dest_dir/$base";
+    my $cmd = "$hadoop dfs -get $file $target >&2";
+    print STDERR "Get.pm:do_hdfs_get: $cmd\n";
+    my $ret = Util::runAndWait($cmd, "hadoop dfs -get");
+    print STDERR "Get.pm:returned $ret\n";
+    push @{$counters},
+        "Fetcher,hadoop dfs -get return $ret,1",
+        "Fetcher,Bytes obtained with hadoop dfs -get,".(-s $target),
+        "Fetcher,Files obtained with hadoop dfs -get,1";
+    return $ret;
+}
+
+##
+# Put a local file into HDFS.
+#
+# Issues "hadoop dfs -mkdir" for $dest (its result is not checked),
+# uploads $file into that directory under its base name, appends counter
+# updates to @$counters and returns the value handed back by the upload.
+#
+sub do_hdfs_put($$$) {
+    my ($file, $dest, $counters) = @_;
+    $dest =~ s/^HDFS:/hdfs:/;
+    $dest .= "/" if $dest !~ /\/$/;
+    my $name = fileparse($file);
+    my $hadoop = Tools::hadoop();
+    # Make sure the target directory is there
+    Util::runAndWait("$hadoop dfs -mkdir $dest >&2", "$hadoop dfs -mkdir");
+    # Upload
+    my $put = "$hadoop dfs -put $file $dest$name >&2";
+    print STDERR "Get.pm:do_hdfs_put: $put\n";
+    my $ret = Util::runAndWait($put, "$hadoop dfs -put");
+    # Record what happened
+    push @{$counters},
+        "Fetcher,hadoop dfs -put return $ret,1",
+        "Fetcher,Bytes uploaded with hadoop dfs -put,".(-s $file),
+        "Fetcher,Files uploaded with hadoop dfs -put,1";
+    return $ret;
+}
+
+##
+# Copy a local file to $dest_dir/$base with cp, appending counter updates
+# to @$counters.  Returns the value handed back by Util::run.
+#
+sub do_local($$$$) {
+    my ($file, $base, $dest_dir, $counters) = @_;
+    mkpath($dest_dir);
+    my $target = "$dest_dir/$base";
+    my $cmd = "cp $file $target >&2";
+    print STDERR "Get.pm:do_local: $cmd\n";
+    my $ret = Util::run($cmd);
+    push @{$counters},
+        "Fetcher,cp return $ret,1",
+        "Fetcher,Bytes obtained with cp,".(-s $target),
+        "Fetcher,Files obtained with cp,1";
+    return $ret;
+}
+
+##
+# Workaround for the situation where the change of FTP dir is
+# forbidden, but fetching the file itself is permitted (this seems to
+# happen e.g. on the NCBI 1000genomes server sometimes).
+#
+# For an ftp URL with more than one path component, every "/" after the
+# host part is replaced by "%2f", so the whole path is requested as a
+# single name.  URLs with any other scheme, and ftp URLs with at most
+# one path component, are returned unchanged.
+#
+sub fix_wget_url($) {
+    my $url = shift;
+    # The %2f trick is an FTP workaround only.  HTTP servers generally do
+    # not treat an encoded slash as a path separator, so rewriting an
+    # http URL this way would request a different resource.
+    return $url unless $url =~ /^ftp:/i;
+    my @us = split(/\//, $url);
+    # Elements 0..2 are "ftp:", "" and the host; index 3 onward is the path
+    return $url if $#us <= 3;
+    my $ret = join("/", @us[0..2])."/";
+    $ret .= join("%2f", @us[3..$#us]);
+    return $ret;
+}
+
+##
+# Get a file over http or ftp using wget.
+#
+# The URL is first passed through fix_wget_url, then downloaded to
+# $dest_dir/$base.  Counter updates are appended to @$counters and the
+# value handed back by Util::run is returned.
+#
+sub do_wget($$$$) {
+    my ($file, $base, $dest_dir, $counters) = @_;
+    my $url = fix_wget_url($file);
+    my $wget = Tools::wget();
+    mkpath($dest_dir);
+    my $target = "$dest_dir/$base";
+    my $cmd = "$wget $url -O $target >&2";
+    print STDERR "Get.pm:do_wget: $cmd\n";
+    my $ret = Util::run($cmd);
+    push @{$counters},
+        "Fetcher,wget return $ret,1",
+        "Fetcher,Bytes obtained with wget,".(-s $target),
+        "Fetcher,Files obtained with wget,1";
+    return $ret;
+}
+
+##
+# List the entries of directory $dir and return them as a list.
+#
+# S3 listings come from "s3cmd ls", local listings from "ls -1" (each
+# name is prefixed with $dir), and every other URL type is listed with
+# "hadoop dfs -ls".  For S3 and Hadoop listings the entry is the last
+# whitespace-separated field of each output line.
+#
+sub lsDir($$) {
+    my ($dir, $env) = @_;
+    print STDERR "Get.pm:lsDir: About to parse URL $dir\n";
+    my ($proto, $type) = parse_url($dir);
+    # Last whitespace-separated field of one listing line
+    my $last_field = sub {
+        my @fields = split(/[\s]+/, $_[0]);
+        return $fields[-1];
+    };
+    my @fs = ();
+    if($type eq "s3") {
+        print STDERR "Get.pm:lsDir: About to handle S3\n";
+        $dir = s3cmdify($dir, $env);
+        $dir .= "/" unless $dir =~ /\/$/;
+        my ($ret, $out) = do_s3cmd("ls $dir", $env);
+        for my $line (split(/[\r\n]+/, $out)) {
+            # Skip the "Bucket ..." header line
+            next if $line =~ /^Bucket/;
+            push @fs, scalar($last_field->($line));
+        }
+    } elsif($type eq "local") {
+        print STDERR "Get.pm:lsDir: About to handle local\n";
+        my $out = Util::backtickRun("ls -1 $dir");
+        my @names = split(/[\r\n]+/, $out);
+        $dir =~ s/\/$//;
+        push @fs, map { "$dir/$_" } @names;
+    } else {
+        print STDERR "Get.pm:lsDir: About to handle HDFS\n";
+        my $hadoop = Tools::hadoop();
+        my $out = `$hadoop dfs -ls $dir`;
+        for my $line (split(/[\r\n]+/, $out)) {
+            # Skip the "Found N items" header line
+            next if $line =~ /^Found/;
+            my $f = $last_field->($line);
+            # Absolute paths from an hdfs listing get their scheme back
+            $f = "hdfs://".$f if ($f =~ /^\// && $type eq "hdfs");
+            push @fs, $f;
+        }
+    }
+    return @fs;
+}
+
+##
+# Ensure all of the files in the source directory have been copied into
+# dest_dir.
+#
+# A marker file "$dest_dir/.dir.<dir with / and : turned into _>" records
+# that the directory has already been processed; when it is present
+# nothing is listed or fetched.
+#
+sub ensureDirFetched($$$$) {
+    my ($dir, $dest_dir, $counters, $env) = @_;
+    $dir =~ s/^S3N/s3n/;
+    $dir =~ s/^S3/s3/;
+    $dir =~ s/^HDFS/hdfs/;
+    (my $flat = $dir) =~ s/[\/:]/_/g;
+    mkpath($dest_dir);
+    my $dirDoneFile = "$dest_dir/.dir.$flat";
+    unless(-f $dirDoneFile) {
+        $dir .= "/" if $dir !~ /\/$/;
+        my @files = lsDir($dir, $env);
+        # Announce everything first, then fetch one entry at a time
+        print STDERR map { "Get.pm:ensureDirFetched: About to be fetched: $_\n" } @files;
+        for my $f (@files) {
+            print STDERR "ensureDirFetched: Fetching directory file $f\n";
+            ensureFetched($f, $dest_dir, $counters, undef, undef, $env);
+        }
+        Util::run("touch $dirDoneFile");
+    }
+}
+
+##
+# Do not return until the given file has been obtained and the "done"
+# flag file has been installed.
+#
+# The process that manages to lock "$dest_dir/.<base>.lock" downloads the
+# file (with s3cmd, hadoop, wget or cp, depending on the URL type),
+# unpacks it if its name ends in .jar, .tar.gz, .tgz or .tar.bz2, and
+# then creates "$dest_dir/.<base>.done".  A process that does not get the
+# lock polls for the done file instead.
+#
+# If the thing being decompressed is an R installation, we do a little
+# ad-hoc fixup to ensure it likes the new directory it's in.
+#
+# Croaks if the download or the extraction fails, or if the done file is
+# missing on the way out.
+#
+sub ensureFetched($$$$$$) {
+    my (
+        $file,      # Path/URL of file to get
+        $dest_dir,  # Directory to copy it to and/or extract it in
+        $counters,  # Ref to array to store counter updates in
+        $doRfixup,  # If defined, names the R directory under $dest_dir
+                    # whose bin/R script gets its R_HOME_DIR rewritten
+        $lockSub,   # A parameterless subroutine to call if and
+                    # when we get the lock
+        $env) = @_; # environment
+
+    print STDERR "Get.pm:ensureFetched: called on \"$file\"\n";
+    $file =~ s/^S3N/s3n/;
+    $file =~ s/^S3/s3/;
+    $file =~ s/^HDFS/hdfs/;
+    my $base = fileparse($file);
+    print STDERR "Get.pm:ensureFetched: base name \"$base\"\n";
+    mkpath($dest_dir);
+    my $done_file = "$dest_dir/.$base.done";
+    my $lock_file = "$dest_dir/.$base.lock";
+    print STDERR "ls -al $dest_dir/*$base* $dest_dir/.*$base*\n";
+    print STDERR `ls -al $dest_dir/*$base* $dest_dir/.*$base*\n`;
+    my ($proto, $type) = parse_url($file);
+    print STDERR "Pid $$: Checking for done file $done_file\n";
+    if(! -f $done_file) {
+        print STDERR "Pid $$: Done file $done_file was NOT present\n";
+        #
+        # Use perl portable file locking to prevent race conditions when
+        # there are multiple mappers per machine.
+        #
+        system("touch $lock_file");
+        print STDERR "Pid $$: Attempting to obtain lock...\n";
+        open(my $lock_fh, "<", $lock_file) or croak("Can't open lock file \"$lock_file\": $!");
+        if(flock($lock_fh, LOCK_EX | LOCK_NB)) {
+            # Got the lock; it's up to me to download and explode the jar file
+            print STDERR "Pid $$: got the lock; downloading file...\n";
+            print STDERR "Pid $$: file name: $base\n";
+            my $cmd = "rm -f $dest_dir/$base >&2";
+            print STDERR "$cmd\n";
+            system($cmd);
+            my $ret;
+            print STDERR "Pid $$: downloading file...\n";
+            if($type eq "s3") {
+                $ret = do_s3_get($file, $base, $dest_dir, $counters, 3, $env);
+            } elsif($type eq "hdfs") {
+                $ret = do_hdfs_get($file, $base, $dest_dir, $counters);
+            } elsif($type eq "wget") {
+                # parse_url reports http and ftp URLs as type "wget";
+                # testing $type against /https?/ never matched, which sent
+                # http URLs to the "Bad type" croak below.
+                $ret = do_wget($file, $base, $dest_dir, $counters);
+            } else {
+                $type eq "local" || croak("Bad type: $type\n");
+                $ret = do_local($file, $base, $dest_dir, $counters);
+            }
+            print STDERR "ls -al $dest_dir/$base\n";
+            print STDERR `ls -al $dest_dir/$base`;
+            if($ret != 0) {
+                system("rm -f $dest_dir/$base* $dest_dir/.$base*");
+                flock($lock_fh, LOCK_UN);
+                close($lock_fh);
+                print STDERR "Return value from download task was $ret\n";
+                croak("Return value from download task was $ret\n");
+            }
+            if(! -f "$dest_dir/$base") {
+                flock($lock_fh, LOCK_UN);
+                close($lock_fh);
+                print STDERR "Return value from download task was $ret but the file $dest_dir/$base doesn't exist\n";
+                croak("Return value from download task was $ret but the file $dest_dir/$base doesn't exist\n");
+            }
+            if($base =~ /\.jar$/) {
+                print STDERR "Pid $$: extract jar\n";
+                #prefer unzip to jar
+                my $jar_exe = Tools::unzip();
+                my $jar_arguments = "";
+                if($jar_exe eq ""){
+                    # Must print to the STDERR handle; "$STDERR" is an
+                    # undefined scalar and using it as a filehandle dies.
+                    print STDERR "Could not find unzip, falling back to jar\n";
+                    $jar_exe = Tools::jar();
+                    $jar_arguments = "xf";
+                }
+                $cmd = "cd $dest_dir && $jar_exe $jar_arguments $base >&2";
+                print STDERR "$cmd\n";
+                $ret = Util::runAndWait($cmd, "$jar_exe $jar_arguments");
+            } elsif($base =~ /\.tar\.gz$/ || $base =~ /\.tgz$/) {
+                $cmd = "cd $dest_dir && tar zxf $base >&2";
+                print STDERR "$cmd\n";
+                $ret = Util::runAndWait($cmd, "tar zxf");
+            } elsif($base =~ /\.tar\.bz2$/) {
+                $cmd = "cd $dest_dir && tar jxf $base >&2";
+                print STDERR "$cmd\n";
+                $ret = Util::runAndWait($cmd, "tar jxf");
+            }
+            print STDERR "ls -al $dest_dir/$base\n";
+            print STDERR `ls -al $dest_dir/$base`;
+            if($ret != 0) {
+                system("rm -rf $dest_dir/$base* $dest_dir/.$base*");
+                flock($lock_fh, LOCK_UN);
+                close($lock_fh);
+                croak("Return value from extract task was $ret\n");
+            }
+            my $size = -s "$dest_dir/$base";
+            push @{$counters}, "Fetcher,File and size $base and $size,1";
+            push @{$counters}, "Fetcher,Bytes obtained,$size";
+            push @{$counters}, "Fetcher,Files obtained,1";
+            if(defined($doRfixup)) {
+                # This is a silly fixup we have to do if we want R and Rscript
+                # to run in their new home.
+                print STDERR "Setting RHOME = \"$dest_dir/$doRfixup\"\n";
+                print STDERR "Writing new \"$dest_dir/$doRfixup/bin/R\" script\n";
+                open(my $rsc, "<", "$dest_dir/$doRfixup/bin/R") ||
+                    croak("Could not open '$dest_dir/$doRfixup/bin/R' for reading");
+                open(my $rscn, ">", "$dest_dir/$doRfixup/bin/R.new") ||
+                    croak("Could not open '$dest_dir/$doRfixup/bin/R.new' for writing");
+                # A lexical line variable keeps the caller's $_ intact
+                while(my $line = <$rsc>) {
+                    if($line =~ /^R_HOME_DIR=/) {
+                        print STDERR "Modifying R_HOME_DIR\n";
+                        print $rscn "R_HOME_DIR=$dest_dir/$doRfixup\n";
+                    } else { print $rscn $line; }
+                }
+                close($rsc); close($rscn);
+                system("mv $dest_dir/$doRfixup/bin/R.new $dest_dir/$doRfixup/bin/R");
+                system("chmod a+x $dest_dir/$doRfixup/bin/R");
+                push @{$counters}, "Fetcher,R path fixups performed,1";
+            }
+            # Call user-supplied function
+            if(defined($lockSub)) { $lockSub->(); }
+            system("touch $done_file");
+        } else {
+            print STDERR "Pid $$: didn't get the lock; waiting for master to finish\n";
+            # NOTE(review): this loop has no timeout.  If the lock holder
+            # croaks, no done file is created and waiters keep polling.
+            my $sleeps = 0;
+            while(! -f $done_file) {
+                sleep(3);
+                if((++$sleeps % 10) == 0) {
+                    my $secs = $sleeps * 3;
+                    print STDERR "Pid $$: still waiting (it's been $secs seconds)\n";
+                }
+            }
+            print STDERR "Pid $$: master finished; continuing\n";
+        }
+        close($lock_fh);
+    } else {
+        print STDERR "Pid $$: done file $done_file was there already; continuing\n";
+    }
+    (-f $done_file) || croak("Pid $$: about to exit ensureFetched, but done file $done_file doesn't exist\n");
+}
+
+##
+# Check if a local, hdfs or s3 (or other Hadoop-supported fs) file or
+# directory exists.
+#
+# Local paths are probed with "stat", everything else with
+# "hadoop fs -stat" (after a leading "hdfs://" has been stripped).
+# Returns true when the probe command reports success (a false value
+# from Util::run), false otherwise.
+#
+sub fs_exists {
+    my $path = shift;
+    my $rc;
+    # Output is discarded with the POSIX spelling "> /dev/null 2>&1".
+    # The ">& /dev/null" shorthand is a bash/csh extension; a POSIX
+    # /bin/sh such as dash rejects it, which would make every probe fail.
+    # (Assumes Util::run hands the string to the system shell.)
+    if(Util::is_local($path)) {
+        $rc = Util::run("stat $path > /dev/null 2>&1");
+    } else {
+        my $hadoop = Tools::hadoop();
+        $path =~ s/^hdfs:\/\///i;
+        $rc = Util::run("($hadoop fs -stat $path) > /dev/null 2>&1");
+    }
+    return !$rc;
+}
+
+##
+# Put a file into a local, hdfs or s3 (or other Hadoop-supported fs)
+# path.
+#
+# $src must be a path to a file
+#
+# $dst must be a path to a directory; it can't specify the destination
+# filename - the basename from $src is preserved
+#
+# An existing destination file is removed first.  Dies if the final copy
+# or upload command reports failure.
+#
+sub fs_put {
+    my ($src, $dst) = @_;
+    my $base = fileparse($src);
+    $dst .= "/" if $dst !~ /\/$/;
+    my $fulldst = "$dst$base";
+    if(fs_exists($fulldst)) {
+        print STDERR "WARNING: replacing old $dst from hdfs\n";
+        if(Util::is_local($fulldst)) {
+            Util::run("rm -rf $fulldst >&2");
+        } else {
+            my $hadoop = Tools::hadoop();
+            # hdfs: URLs are removed with "dfs", anything else with "fs"
+            my $fscmd = ($fulldst =~ /^hdfs:/i) ? "dfs" : "fs";
+            Util::run("$hadoop $fscmd -rmr $fulldst >&2");
+        }
+    }
+    my $rc;
+    if(Util::is_local($src) && Util::is_local($dst)) {
+        mkpath($dst);
+        $rc = Util::run("cp $src $dst >&2");
+    } else {
+        my $hadoop = Tools::hadoop();
+        if($dst =~ /^hdfs:/i) {
+            # The upload target has its "hdfs://" prefix stripped; the
+            # mkdir is given $dst as passed in
+            (my $stripped = $fulldst) =~ s/^hdfs:\/\///i;
+            Util::run("$hadoop dfs -mkdir $dst >&2");
+            $rc = Util::run("$hadoop dfs -put $src $stripped >&2");
+        } else {
+            Util::run("$hadoop fs -mkdir $dst >&2");
+            $rc = Util::run("$hadoop fs -put $src $fulldst >&2");
+        }
+    }
+    die "Can't load $src to $dst ($rc)\n" if $rc;
+}
+
+##
+# Remove a file in a local, hdfs or s3 (or other Hadoop-supported fs)
+# path.
+#
+# Local paths are removed with "rm -rf"; anything else with
+# "hadoop fs -rmr" after a leading "hdfs://" has been stripped.
+# Returns the value handed back by Util::run.
+#
+sub fs_remove {
+    my ($path) = @_;
+    if(Util::is_local($path)) {
+        return scalar(Util::run("rm -rf $path >&2"));
+    }
+    my $hadoop = Tools::hadoop();
+    $path =~ s/^hdfs:\/\///i;
+    return scalar(Util::run("$hadoop fs -rmr $path >&2"));
+}
+
+1;
diff --git a/LICENSES b/LICENSES
new file mode 100644
index 0000000..4d5e5cd
--- /dev/null
+++ b/LICENSES
@@ -0,0 +1,12 @@
+Crossbow and Bowtie are both licensed under the Artistic License. You
+may not use the Crossbow and Bowtie files except in compliance with the
+license. A copy of the Artistic license can be found in the
+"LICENSE_ARTISTIC" file included with Crossbow.
+
+SOAPsnp version 1.02 is licensed under the GNU Public License, version
+3. You may not use the SOAPsnp files except in compliance with the
+license. A copy of the GNU Public License, version 3 can be found in
+the "LICENSE_GPL3" file included with Crossbow.
+
+The Perl modules included in the "contrib" subdirectory are under the
+same license as Perl itself.
diff --git a/LICENSE_ARTISTIC b/LICENSE_ARTISTIC
new file mode 100644
index 0000000..7cb8b7b
--- /dev/null
+++ b/LICENSE_ARTISTIC
@@ -0,0 +1,114 @@
+The Artistic License
+
+Preamble
+
+The intent of this document is to state the conditions under which a
+Package may be copied, such that the Copyright Holder maintains some
+semblance of artistic control over the development of the package,
+while giving the users of the package the right to use and distribute
+the Package in a more-or-less customary fashion, plus the right to
+make reasonable modifications.
+
+Definitions:
+ * "Package" refers to the collection of files distributed by the
+ Copyright Holder, and derivatives of that collection of files
+ created through textual modification.
+ * "Standard Version" refers to such a Package if it has not been
+ modified, or has been modified in accordance with the wishes of
+ the Copyright Holder.
+ * "Copyright Holder" is whoever is named in the copyright or
+ copyrights for the package.
+ * "You" is you, if you're thinking about copying or distributing
+ this Package.
+ * "Reasonable copying fee" is whatever you can justify on the
+ basis of media cost, duplication charges, time of people
+ involved, and so on. (You will not be required to justify it to
+ the Copyright Holder, but only to the computing community at
+ large as a market that must bear the fee.)
+ * "Freely Available" means that no fee is charged for the item
+ itself, though there may be fees involved in handling the
+ item. It also means that recipients of the item may redistribute
+ it under the same conditions they received it.
+
+1. You may make and give away verbatim copies of the source form of
+ the Standard Version of this Package without restriction, provided
+ that you duplicate all of the original copyright notices and
+ associated disclaimers.
+
+2. You may apply bug fixes, portability fixes and other modifications
+ derived from the Public Domain or from the Copyright Holder. A
+ Package modified in such a way shall still be considered the
+ Standard Version.
+
+3. You may otherwise modify your copy of this Package in any way,
+ provided that you insert a prominent notice in each changed file
+ stating how and when you changed that file, and provided that you
+ do at least ONE of the following:
+
+ a) place your modifications in the Public Domain or otherwise make
+ them Freely Available, such as by posting said modifications to
+ Usenet or an equivalent medium, or placing the modifications on a
+ major archive site such as ftp.uu.net, or by allowing the
+ Copyright Holder to include your modifications in the Standard
+ Version of the Package.
+
+ b) use the modified Package only within your corporation or
+ organization.
+
+ c) rename any non-standard executables so the names do not
+ conflict with standard executables, which must also be provided,
+ and provide a separate manual page for each non-standard
+ executable that clearly documents how it differs from the Standard
+ Version.
+
+ d) make other distribution arrangements with the Copyright Holder.
+
+4. You may distribute the programs of this Package in object code or
+ executable form, provided that you do at least ONE of the
+ following:
+
+ a) distribute a Standard Version of the executables and library
+ files, together with instructions (in the manual page or
+ equivalent) on where to get the Standard Version.
+
+ b) accompany the distribution with the machine-readable source of
+ the Package with your modifications.
+
+ c) accompany any non-standard executables with their corresponding
+ Standard Version executables, giving the non-standard executables
+ non-standard names, and clearly documenting the differences in
+ manual pages (or equivalent), together with instructions on where
+ to get the Standard Version.
+
+ d) make other distribution arrangements with the Copyright Holder.
+
+5. You may charge a reasonable copying fee for any distribution of
+ this Package. You may charge any fee you choose for support of this
+ Package. You may not charge a fee for this Package itself. However,
+ you may distribute this Package in aggregate with other (possibly
+ commercial) programs as part of a larger (possibly commercial)
+ software distribution provided that you do not advertise this
+ Package as a product of your own.
+
+6. The scripts and library files supplied as input to or produced as
+ output from the programs of this Package do not automatically fall
+ under the copyright of this Package, but belong to whomever
+ generated them, and may be sold commercially, and may be aggregated
+ with this Package.
+
+7. C or perl subroutines supplied by you and linked into this Package
+ shall not be considered part of this Package.
+
+8. The name of the Copyright Holder may not be used to endorse or
+ promote products derived from this software without specific prior
+ written permission.
+
+9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+ WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES
+ OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+The End
+This license is approved by the Open Source Initiative
+(www.opensource.org) for certifying software as OSI Certified Open
+Source.
+
diff --git a/LICENSE_GPL3 b/LICENSE_GPL3
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE_GPL3
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/MANUAL b/MANUAL
new file mode 100644
index 0000000..e70206d
--- /dev/null
+++ b/MANUAL
@@ -0,0 +1,1708 @@
+% Crossbow: Parallel short read genotyping in the cloud
+% Ben Langmead and Michael C. Schatz
+% http://bowtie-bio.sf.net/crossbow
+
+# What is Crossbow?
+
+[Crossbow] is a scalable, portable, and automatic Cloud Computing tool for
+finding SNPs from short read data. Crossbow employs [Bowtie] and a modified
+version of [SOAPsnp] to perform the short read alignment and SNP calling
+respectively. Crossbow is designed to be easy to run (a) in "the cloud" (in
+this case, Amazon's [Elastic MapReduce] service), (b) on any [Hadoop] cluster,
+or (c) on any single computer, without [Hadoop]. Crossbow exploits the
+availability of multiple computers and processors where possible.
+
+[Crossbow]: http://bowtie-bio.sf.net/crossbow
+[Bowtie]: http://bowtie-bio.sf.net
+[SOAPsnp]: http://soap.genomics.org.cn/soapsnp.html
+[Elastic MapReduce]: http://aws.amazon.com/elasticmapreduce "Amazon Elastic MapReduce"
+
+# A word of caution
+
+Renting resources from [Amazon Web Services] (AKA [AWS]) costs money,
+regardless of whether your experiment ultimately succeeds or fails. In some
+cases, Crossbow or its documentation may be partially to blame for a failed
+experiment. While we are happy to accept bug reports, we do not accept
+responsibility for financial damage caused by these errors. Crossbow is
+provided "as is" with no warranty. See the `LICENSES` file.
+
+[Amazon Web Services]: http://aws.amazon.com
+[Amazon EC2]: http://aws.amazon.com/ec2
+[Amazon S3]: http://aws.amazon.com/s3
+[Amazon EMR]: http://aws.amazon.com/elasticmapreduce
+[Amazon SimpleDB]: http://aws.amazon.com/simpledb
+[AWS]: http://aws.amazon.com
+
+# Crossbow modes and prerequisites
+
+Crossbow can be run in four different ways.
+
+1. **Via the [Crossbow web interface]**
+
+ In this case, the [Crossbow] code and the user interface are installed on EC2
+ web servers. Also, the computers running the Crossbow computation are rented
+ from Amazon, and the user must have [EC2], [EMR], [S3] and [SimpleDB]
+ accounts and must pay the [going rate] for the resources used. The user does
+ not need any special software besides a web browser and, in most cases, an
+ [S3 tool].
+
+[Crossbow web interface]: http://bowtie-bio.sf.net/crossbow/ui.html
+
+2. **On Amazon [Elastic MapReduce] via the command-line**
+
+ In this case, the Crossbow code is hosted by Amazon and the computers running
+ the Crossbow computation are rented from Amazon. However, the user must
+ install and run (a) the Crossbow scripts, which require [Perl] 5.6 or later,
+ (b) Amazon's `elastic-mapreduce` script, which requires Ruby 1.8 or later,
+ and (c) an [S3 tool]. The user must have [EC2], [EMR], [S3] and [SimpleDB]
+ accounts and must pay the [going rate] for the resources used.
+
+3. **On a [Hadoop] cluster via the command-line**
+
+ In this case, the Crossbow code is hosted on your [Hadoop] cluster, as are
+ supporting tools: [Bowtie], [SOAPsnp], and possibly `fastq-dump`.
+ Supporting tools must be installed on all cluster nodes, but the Crossbow
+ scripts need only be installed on the master. Crossbow was tested with
+ [Hadoop] versions 0.20 and 0.20.205, and might also be compatible with other
+ versions newer than 0.20. Crossbow scripts require [Perl] 5.6 or later.
+
+4. **On any computer via the command-line**
+
+ In this case, the Crossbow code and all supporting tools ([Bowtie],
+ [SOAPsnp], and possibly `fastq-dump`) must be installed on the computer
+ running Crossbow. Crossbow scripts require [Perl] 5.6 or later. The user
+ specifies the maximum number of CPUs that Crossbow should use at a time.
+ This mode does *not* require [Java] or [Hadoop].
+
+[Amazon EMR]: http://aws.amazon.com/elasticmapreduce
+[Elastic MapReduce]: http://aws.amazon.com/elasticmapreduce
+[EMR]: http://aws.amazon.com/elasticmapreduce
+[S3]: http://aws.amazon.com/s3
+[EC2]: http://aws.amazon.com/ec2
+[going rate]: http://aws.amazon.com/ec2/#pricing
+[Elastic MapReduce web interface]: https://console.aws.amazon.com/elasticmapreduce/home
+[AWS Console]: https://console.aws.amazon.com
+[AWS console]: https://console.aws.amazon.com
+[`elastic-mapreduce`]: http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1
+[Java]: http://java.sun.com/
+[Hadoop]: http://hadoop.apache.org/
+[R]: http://www.r-project.org/
+[Bioconductor]: http://www.bioconductor.org/
+[Perl]: http://www.perl.org/get.html
+
+# Preparing to run on Amazon Elastic MapReduce
+
+Before running Crossbow on [EMR], you must have an [AWS] account with the
+appropriate features enabled. You may also need to [install Amazon's
+`elastic-mapreduce` tool]. In addition, you may want to install an [S3 tool],
+though most users can simply use [Amazon's web interface for S3], which requires
+no installation.
+
+If you plan to run Crossbow exclusively on a single computer or on a [Hadoop]
+cluster, you can skip this section.
+
+[Amazon's web interface for S3]: https://console.aws.amazon.com/s3/home
+
+1. Create an AWS account by navigating to the [AWS page]. Click "Sign Up Now"
+ in the upper right-hand corner and follow the instructions. You will be asked
+ to accept the [AWS Customer Agreement].
+
+2. Sign up for [EC2] and [S3]. Navigate to the [Amazon EC2] page, click on
+ "Sign Up For Amazon EC2" and follow the instructions. This step requires you
+ to enter credit card information. Once this is complete, your AWS account
+ will be permitted to use [EC2] and [S3], which are required.
+
+3. Sign up for [EMR]. Navigate to the [Elastic MapReduce] page, click on "Sign
+ up for Elastic MapReduce" and follow the instructions. Once this is complete,
+ your AWS account will be permitted to use [EMR], which is required.
+
+4. Sign up for [SimpleDB]. With [SimpleDB] enabled, you have the option of
+ using the [AWS Console]'s [Job Flow Debugging] feature. This is a convenient
+ way to monitor your job's progress and diagnose errors.
+
+5. *Optional*: Request an increase to your instance limit. By default, Amazon
+ allows you to allocate EC2 clusters with up to 20 instances (virtual
+ computers). To be permitted to work with more instances, fill in the form on
+ the [Request to Increase] page. You may have to speak to an Amazon
+ representative and/or wait several business days before your request is
+ granted.
+
+To see a list of AWS services you've already signed up for, see your [Account
+Activity] page. If "Amazon Elastic Compute Cloud", "Amazon Simple Storage
+Service", "Amazon Elastic MapReduce" and "Amazon SimpleDB" all appear there, you
+are ready to proceed.
+
+Be sure to make a note of the various numbers and names associated with your
+accounts, especially your Access Key ID, Secret Access Key, and your EC2 key
+pair name. You will have to refer to these and other account details in the
+future.
+
+[AWS Customer Agreement]: http://aws.amazon.com/agreement/
+[Request to Increase]: http://aws.amazon.com/contact-us/ec2-request/
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+[SimpleDB]: http://aws.amazon.com/simpledb/
+[Account Activity]: http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary
+
+## Installing Amazon's `elastic-mapreduce` tool
+
+Read this section if you plan to run Crossbow on [Elastic MapReduce] via the
+command-line tool. Skip this section if you are not using [EMR] or if you plan
+to run exclusively via the [Crossbow web interface].
+
+To install Amazon's `elastic-mapreduce` tool, follow the instructions in Amazon
+Elastic MapReduce developer's guide for [How to Download and Install Ruby and
+the Command Line Interface]. That document describes:
+
+[How to Download and Install Ruby and the Command Line Interface]: http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1
+
+1. Installing an appropriate version of [Ruby], if necessary.
+
+2. Setting up an EC2 keypair, if necessary.
+
+3. Setting up a credentials file, which is used by the `elastic-mapreduce` tool
+ for authentication.
+
+ For convenience, we suggest you name the credentials file `credentials.json`
+ and place it in the same directory with the `elastic-mapreduce` script.
+ Otherwise you will have to specify the credential file path with the
+ `--credentials` option each time you run `cb_emr`.
+
+We strongly recommend using a version of the `elastic-mapreduce` Ruby script
+released on or after December 8, 2011. This is when the script switched to
+using Hadoop v0.20.205 by default, which is the preferred way of running Crossbow.
+
+[Ruby]: http://www.ruby-lang.org/
+[Setting up an EC2 keypair]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?download_ruby.html
+
+We also recommend that you add the directory containing the `elastic-mapreduce`
+tool to your `PATH`. This allows Crossbow to locate it automatically.
+Alternately, you can specify the path to the `elastic-mapreduce` tool via the
+`--emr-script` option when running `cb_emr`.
+
+[AWS]: http://aws.amazon.com/ "Amazon Web Services"
+[AWS page]: http://aws.amazon.com/ "Amazon Web Services"
+[AWS Getting Started Guide]: http://docs.amazonwebservices.com/AWSEC2/latest/GettingStartedGuide/
+
+## S3 tools
+
+Running on [EMR] requires exchanging files via the cloud-based [S3] filesystem.
+[S3] is organized as a collection of [S3 buckets] in a global namespace. [S3
+charges] are incurred when transferring data to and from [S3] (but transfers
+between [EC2] and [S3] are free), and a per-GB-per-month charge applies when
+data is stored in [S3] over time.
+
+To transfer files to and from [S3], use an S3 tool. Amazon's [AWS Console] has
+an [S3 tab] that provides a friendly web-based interface to [S3], and doesn't
+require any software installation. [s3cmd] is a very good command-line tool
+that requires [Python] 2.4 or later. [S3Fox Organizer] is another GUI tool that
+works as a [Firefox] extension. Other tools include [Cyberduck] (for Mac OS
+10.6 or later) and [Bucket Explorer] (for Mac, Windows or Linux, but commercial
+software).
+
+[S3]: http://aws.amazon.com/s3/
+[S3 tab]: https://console.aws.amazon.com/s3/home
+[s3cmd]: http://s3tools.org/s3cmd
+[Python]: http://www.python.org/download/
+[Firefox]: http://www.mozilla.com/firefox/
+[S3 buckets]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[S3 bucket]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[S3 charges]: http://aws.amazon.com/s3/#pricing
+[S3Fox Organizer]: http://www.s3fox.net/
+[Cyberduck]: http://cyberduck.ch/
+[Bucket Explorer]: http://www.bucketexplorer.com/
+
+# Installing Crossbow
+
+Crossbow consists of a set of [Perl] and shell scripts, plus supporting tools:
+[Bowtie] and [SOAPsnp]. If you plan to run Crossbow via the [Crossbow web
+interface] exclusively, there is nothing to install. Otherwise:
+
+1. Download the desired version of Crossbow from the [sourceforge site]
+
+2. [Extract the zip archive]
+
+3. Set the `CROSSBOW_HOME` environment variable to point to the extracted
+ directory (containing `cb_emr`)
+
+4. *If you plan to run on a local computer or [Hadoop] cluster*:
+
+ If using Linux or Mac OS 10.6 or later, you likely don't have to install
+ [Bowtie] or [SOAPsnp], as Crossbow comes with compatible versions of both
+ pre-installed. Test this by running:
+
+ $CROSSBOW_HOME/cb_local --test
+
+ If the install test passes, installation is complete.
+
+ If the install test indicates [Bowtie] is not installed, obtain or build a
+ `bowtie` binary v0.12.8 or higher and install it by setting the
+ `CROSSBOW_BOWTIE_HOME` environment variable to `bowtie`'s enclosing
+ directory. Alternately, add the enclosing directory to your `PATH` or
+ specify the full path to `bowtie` via the `--bowtie` option when running
+ Crossbow scripts.
+
+ If the install test indicates that [SOAPsnp] is not installed, build the
+ `soapsnp` binary using the sources and makefile in `CROSSBOW_HOME/soapsnp`.
+ You must have compiler tools such as GNU `make` and `g++` installed for this
+ to work. If you are using a Mac, you may need to install the [Apple
+ developer tools]. To build the `soapsnp` binary, run:
+
+ make -C $CROSSBOW_HOME/soapsnp
+
+ Now install `soapsnp` by setting the `CROSSBOW_SOAPSNP_HOME` environment
+ variable to `soapsnp`'s enclosing directory. Alternately, add the enclosing
+ directory to your `PATH` or specify the full path to `soapsnp` via the
+ `--soapsnp` option when running Crossbow scripts.
+
+5. *If you plan to run on a [Hadoop] cluster*, you may need to manually copy
+ the `bowtie` and `soapsnp` executables, and possibly also the `fastq-dump`
+ executable, to the same path on each of your [Hadoop] cluster nodes. You
+ can avoid this step by installing `bowtie`, `soapsnp` and `fastq-dump` on a
+ filesystem shared by all [Hadoop] nodes (e.g. an [NFS share]). You can also
+ skip this step if [Hadoop] is installed in [pseudo distributed] mode,
+ meaning that the cluster really consists of one node whose CPUs are treated
+ as distinct slaves.
+
+[NFS share]: http://en.wikipedia.org/wiki/Network_File_System_(protocol)
+[pseudo distributed]: http://hadoop.apache.org/common/docs/current/quickstart.html#PseudoDistributed
+
+## The SRA toolkit
+
+The [Sequence Read Archive] (SRA) is a resource at the [National Center for
+Biotechnology Information] (NCBI) for storing sequence data from modern
+sequencing instruments. Sequence data underlying many studies, including very
+large studies, can often be downloaded from this archive.
+
+The SRA uses a special file format to store archived read data. These files end
+in the extension `.sra`, and they can be specified as inputs to Crossbow's
+preprocessing step in exactly the same way as [FASTQ] files.
+
+However, if you plan to use `.sra` files as input to Crossbow in either
+[Hadoop] mode or in single-computer mode, you must first install the [SRA
+toolkit]'s `fastq-dump` tool appropriately. See the [SRA toolkit] page for
+details about how to download and install.
+
+When searching for the `fastq-dump` tool at runtime, Crossbow searches the
+following places in order:
+
+1. The path specified in the `--fastq-dump` option
+2. The directory specified in the `$CROSSBOW_SRATOOLKIT_HOME` environment
+ variable.
+3. In the system `PATH`
+
+[Sequence Read Archive]: http://www.ncbi.nlm.nih.gov/books/NBK47533/
+[National Center for Biotechnology Information]: http://www.ncbi.nlm.nih.gov/
+[SRA toolkit]: http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software
+
+# Running Crossbow
+
+The commands for invoking Crossbow from the command line are:
+
+`$CROSSBOW_HOME/cb_emr` (or just `cb_emr` if `$CROSSBOW_HOME` is in the `PATH`)
+for running on [EMR]. See [Running Crossbow on EMR via the command line] for
+details.
+
+`$CROSSBOW_HOME/cb_hadoop` (or just `cb_hadoop` if `$CROSSBOW_HOME` is in the
+`PATH`) for running on [Hadoop]. See [Running Crossbow on a Hadoop cluster via
+the command line] for details.
+
+`$CROSSBOW_HOME/cb_local` (or just `cb_local` if `$CROSSBOW_HOME` is in the
+`PATH`) for running locally on a single computer. See [Running Crossbow on a
+single computer via the command line] for details.
+
+[Apple developer tools]: http://developer.apple.com/technologies/tools/
+[NFS share]: http://en.wikipedia.org/wiki/Network_File_System_(protocol)
+[pseudo distributed]: http://hadoop.apache.org/common/docs/current/quickstart.html#PseudoDistributed
+[sourceforge site]: http://bowtie-bio.sf.net/crossbow
+[Extract the zip archive]: http://en.wikipedia.org/wiki/ZIP_(file_format)
+
+# Running Crossbow on EMR via the EMR web interface
+
+## Prerequisites
+
+1. Web browser
+2. [EC2], [S3], [EMR], and [SimpleDB] accounts. To check which ones you've
+ already enabled, visit your [Account Activity] page.
+3. A tool for browsing and exchanging files with [S3]
+ a. The [AWS Console]'s [S3 tab] is a good web-based tool that does not
+ require software installation
+ b. A good command line tool is [s3cmd]
+ c. A good GUI tool is [S3Fox Organizer], which is a Firefox Plugin
+ d. Others include [Cyberduck], [Bucket Explorer]
+4. Basic knowledge regarding:
+ a. [What S3 is], [what an S3 bucket is], how to create one, how to upload a
+ file to an S3 bucket from your computer (see your S3 tool's documentation).
+ b. How much AWS resources [will cost you]
+
+[Account Activity]: http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary
+[s3cmd]: http://s3tools.org/s3cmd
+[S3Fox Organizer]: http://www.s3fox.net/
+[Cyberduck]: http://cyberduck.ch/
+[Bucket Explorer]: http://www.bucketexplorer.com/
+[What S3 is]: http://aws.amazon.com/s3/
+[what an S3 bucket is]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[will cost you]: http://aws.amazon.com/ec2/#pricing
+
+## To run
+
+1. *If the input reads have not yet been preprocessed by Crossbow* (i.e. input
+ is [FASTQ] or `.sra`), then first (a) prepare a [manifest file] with URLs
+ pointing to the read files, and (b) upload it to an [S3] bucket that you
+ own. See your [S3] tool's documentation for how to create a bucket and
+ upload a file to it. The URL for the [manifest file] will be the input URL
+ for your [EMR] job.
+
+ *If the input reads have already been preprocessed by Crossbow*, make a note
+ of the [S3] URL where they're located. This will be the input URL for
+ your [EMR] job.
+
+2. *If you are using a pre-built reference jar*, make a note of its [S3] URL.
+ This will be the reference URL for your [EMR] job. See the [Crossbow
+ website] for a list of pre-built reference jars and their URLs.
+
+ *If you are not using a pre-built reference jar*, you may need to [build the
+ reference jars] and/or upload them to an [S3] bucket you own. See your [S3
+ tool]'s documentation for how to create a bucket and upload to it. The URL
+ for the main reference jar will be the reference URL for your [EMR] job.
+
+[Crossbow website]: http://bowtie-bio.sf.net/crossbow
+[`.sra`]: http://www.ncbi.nlm.nih.gov/books/NBK47540/
+
+3. In a web browser, go to the [Crossbow web interface].
+
+4. Fill in the form according to your job's parameters. We recommend filling in
+ and validating the "AWS ID" and "AWS Secret Key" fields first. Also, when
+ entering S3 URLs (e.g. "Input URL" and "Output URL"), we recommend that users
+ validate the entered URLs by clicking the link below it. This avoids failed
+ jobs due to simple URL issues (e.g. non-existence of the "Input URL"). For
+ examples of how to fill in this form, see the [E. coli EMR] and [Mouse
+ chromosome 17 EMR] examples.
+
+# Running Crossbow on EMR via the command line
+
+## Prerequisites
+
+1. [EC2], [S3], [EMR], and [SimpleDB] accounts. To check which ones you've
+ already enabled, visit your [Account Activity] page.
+2. A tool for browsing and exchanging files with [S3]
+ a. The [AWS Console]'s [S3 tab] is a good web-based tool that does not
+ require software installation
+ b. A good command line tool is [s3cmd]
+ c. A good GUI tool is [S3Fox Organizer], which is a Firefox Plugin
+ d. Others include [Cyberduck], [Bucket Explorer]
+3. Basic knowledge regarding:
+ a. [What S3 is], [what an S3 bucket is], how to create one, how to upload a
+ file to an S3 bucket from your computer (see your S3 tool's documentation).
+ b. How much AWS resources [will cost you]
+
+[Account Activity]: http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary
+[s3cmd]: http://s3tools.org/s3cmd
+[S3Fox Organizer]: http://www.s3fox.net/
+[Cyberduck]: http://cyberduck.ch/
+[Bucket Explorer]: http://www.bucketexplorer.com/
+[What S3 is]: http://aws.amazon.com/s3/
+[What an S3 bucket is]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[will cost you]: http://aws.amazon.com/ec2/#pricing
+
+## To run
+
+1. *If the input reads have not yet been preprocessed by Crossbow* (i.e. input
+ is [FASTQ] or `.sra`), then first (a) prepare a [manifest file] with URLs
+ pointing to the read files, and (b) upload it to an [S3] bucket that you
+ own. See your [S3] tool's documentation for how to create a bucket and
+ upload a file to it. The URL for the [manifest file] will be the input URL
+ for your [EMR] job.
+
+ *If the input reads have already been preprocessed by Crossbow*, make a note
+ of the [S3] URL where they're located. This will be the input URL for
+ your [EMR] job.
+
+2. *If you are using a pre-built reference jar*, make a note of its [S3] URL.
+ This will be the reference URL for your [EMR] job. See the [Crossbow
+ website] for a list of pre-built reference jars and their URLs.
+
+ *If you are not using a pre-built reference jar*, you may need to [build the
+ reference jars] and/or upload them to an [S3] bucket you own. See your [S3
+ tool]'s documentation for how to create a bucket and upload to it. The URL
+ for the main reference jar will be the reference URL for your [EMR] job.
+
+[Crossbow website]: http://bowtie-bio.sf.net/crossbow
+
+3. Run `$CROSSBOW_HOME/cb_emr` with the desired options. Options that are unique
+ to [EMR] jobs are described in the following section. Options that apply to
+ all running modes are described in the [General Crossbow options] section.
+ For examples of how to run `$CROSSBOW_HOME/cb_emr` see the [E. coli EMR] and
+ [Mouse chromosome 17 EMR] examples.
+
+## EMR-specific options
+
+ --reference <URL>
+
+[S3] URL where the reference jar is located. URLs for pre-built reference jars
+for some commonly studied species (including human and mouse) are available from
+the [Crossbow web site]. Note that a [Myrna] reference jar is not the same as a
+[Crossbow] reference jar. If your desired genome and/or SNP annotations are not
+available in pre-built form, you will have to make your own reference jar and
+upload it to one of your own S3 buckets (see [Reference jars]). This option
+must be specified.
+
+[Myrna]: http://bowtie-bio.sf.net/myrna
+[Crossbow web site]: http://bowtie-bio.sf.net/crossbow
+
+ --input <URL>
+
+[S3] URL where the input is located. If `--preprocess` or
+`--just-preprocess` are specified, `<URL>` should point to a [manifest file].
+Otherwise, `<URL>` should point to a directory containing preprocessed reads.
+This option must be specified.
+
+ --output <URL>
+
+[S3] URL where the output is to be deposited. If `--just-preprocess` is
+specified, the output consists of the preprocessed reads. Otherwise, the output
+consists of the SNP calls calculated by [SOAPsnp] for each chromosome in the
+[Crossbow output format], organized as one file per chromosome. This option
+must be specified.
+
+ --intermediate <URL>
+
+[S3] URL where all intermediate results should be deposited. This can be
+useful if you later want to resume the computation from partway through the
+pipeline (e.g. after alignment but before SNP calling). By default,
+intermediate results are stored in [HDFS] and disappear once the cluster is
+terminated.
+
+ --preprocess-output <URL>
+
+[S3] URL where the preprocessed reads should be stored. This can be useful if
+you later want to run Crossbow on the same input reads without having to re-run
+the preprocessing step (i.e. leaving `--preprocess` unspecified).
+
+ --credentials <id>
+
+Local path to the credentials file set up by the user when the
+`elastic-mapreduce` script was installed (see [Installing Amazon's
+`elastic-mapreduce` tool]). Default: use `elastic-mapreduce`'s default (i.e.
+the `credentials.json` file in the same directory as the `elastic-mapreduce`
+script). If `--credentials` is not specified and the default `credentials.json`
+file doesn't exist, `elastic-mapreduce` will abort with an error message.
+
+ --emr-script <path>
+
+Local path to the `elastic-mapreduce` script. By default, Crossbow looks first
+in the `$CROSSBOW_EMR_HOME` directory, then in the `PATH`.
+
+ --name <string>
+
+Specify the name by which the job will be identified in the [AWS Console].
+
+ --stay-alive
+
+By default, [EMR] will terminate the cluster as soon as (a) one of the stages
+fails, or (b) the job completes successfully. Specify this option to force [EMR]
+to keep the cluster alive in either case.
+
+ --instances <int>
+
+Specify the number of instances (i.e. virtual computers, also called nodes) to
+be allocated to your cluster. If set to 1, the 1 instance will function as both
+[Hadoop] master and slave node. If set greater than 1, one instance will
+function as a [Hadoop] master and the rest will function as [Hadoop] slaves. In
+general, the greater the value of `<int>`, the faster the Crossbow computation
+will complete. Consider the desired speed as well as the [going rate] when
+choosing a value for `<int>`. Default: 1.
+
+ --instance-type <type>
+
+Specify the type of [EC2] instance to use for the computation. See Amazon's
+[list of available instance types] and be sure to specify the "API name" of the
+desired type (e.g. `m1.small` or `c1.xlarge`). **The default of `c1.xlarge` is
+strongly recommended** because it has an appropriate mix of computing power and
+memory for a large breadth of problems. Choosing an instance type with less
+than 5GB of physical RAM can cause problems when the reference is large (e.g.
+a mammalian genome). Stick to the default unless you're pretty sure the
+specified instance type can handle your problem size.
+
+[list of available instance types]: http://aws.amazon.com/ec2/instance-types/
+`<instance-type>`: http://aws.amazon.com/ec2/instance-types/
+
+ --emr-args "<args>"
+
+Pass the specified extra arguments to the `elastic-mapreduce` script. See
+documentation for the `elastic-mapreduce` script for details.
+
+ --logs <URL>
+
+Causes [EMR] to copy the log files to `<URL>`. Default: [EMR] writes logs to
+the `logs` subdirectory of the `--output` URL. See also `--no-logs`.
+
+ --no-logs
+
+By default, Crossbow causes [EMR] to copy all cluster log files to the `log`
+subdirectory of the `--output` URL (or another destination, if `--logs` is
+specified). Specifying this option disables all copying of logs.
+
+ --no-emr-debug
+
+Disables [Job Flow Debugging]. If this is *not* specified, you must have a
+[SimpleDB] account for [Job Flow Debugging] to work. You will be subject to
+additional [SimpleDB-related charges] if [Job Flow Debugging] is enabled, but those fees
+are typically small or zero (depending on your account's [SimpleDB tier]).
+
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+[SimpleDB]: http://aws.amazon.com/simpledb/
+[SimpleDB-related charges]: http://aws.amazon.com/simpledb/#pricing
+[SimpleDB tier]: http://aws.amazon.com/simpledb/#pricing
+
+# Running Crossbow on a Hadoop cluster via the command line
+
+## Prerequisites
+
+1. Working installation of [Hadoop] v0.20.2 or v0.20.205. Other versions newer
+ than 0.20 might also work, but haven't been tested.
+
+2. A `bowtie` v0.12.8 executable must exist at the same path on all cluster
+ nodes (including the master). That path must be specified via the
+ `--bowtie` option OR located in the directory specified
+ in the `CROSSBOW_BOWTIE_HOME` environment variable, OR in a subdirectory of
+ `$CROSSBOW_HOME/bin` OR in the `PATH` (Crossbow looks in that order).
+ `$CROSSBOW_HOME/bin` comes with pre-built Bowtie binaries for Linux and Mac
+ OS X 10.5 or later. An executable from that directory is used automatically
+ unless the platform is not Mac or Linux or unless overridden by
+ `--bowtie` or by defining `CROSSBOW_BOWTIE_HOME`.
+
+3. A Crossbow-customized version of `soapsnp` v1.02 must be installed
+ at the same path on all cluster nodes (including the master). That
+ path must be specified via the `--soapsnp` option OR located in the
+ directory specified in the `CROSSBOW_SOAPSNP_HOME` environment
+ variable, OR in a subdirectory of `$CROSSBOW_HOME/bin` OR in the
+ `PATH` (Crossbow searches in that order). `$CROSSBOW_HOME/bin` comes
+ with pre-built SOAPsnp binaries for Linux and Mac OS X 10.6 or
+ later. An executable from that directory is used automatically
+ unless the platform is not Mac or Linux or unless overridden by
+ `--soapsnp` or by defining `CROSSBOW_SOAPSNP_HOME`.
+
+4. If any of your inputs are in [Sequence Read Archive] format (i.e. end in
+ `.sra`), then the `fastq-dump` tool from the [SRA Toolkit] must be installed
+ at the same path on all cluster nodes. The path to the `fastq-dump` tool
+ must be specified via the (`--fastq-dump`) option OR
+ `fastq-dump` must be located in the directory specified in the
+ `CROSSBOW_FASTQ_DUMP_HOME` environment variable, OR `fastq-dump` must be
+ found in the `PATH` (Crossbow searches in that order).
+
+5. Sufficient memory must be available on all [Hadoop] slave nodes to
+ hold the Bowtie index for the desired organism in addition to any
+ other loads placed on those nodes by [Hadoop] or other programs.
+ For mammalian genomes such as the human genome, this typically means
+ that slave nodes must have at least 5-6 GB of RAM.
+
+## To run
+
+Run `$CROSSBOW_HOME/cb_hadoop` with the desired options. Options that are
+unique to [Hadoop] jobs are described in the following subsection. Options that
+apply to all running modes are described in the [General Crossbow options]
+subsection. To see example invocations of `$CROSSBOW_HOME/cb_hadoop` see the
+[E. coli Hadoop] and [Mouse chromosome 17 Hadoop] examples.
+
+## Hadoop-specific options
+
+ --reference <URL>
+
+[HDFS] URL where the reference jar is located. Pre-built reference jars for
+some commonly studied species (including human and mouse) are available from the
+[Crossbow web site]; these can be downloaded and installed in HDFS using `hadoop
+dfs` commands. If your desired genome and/or SNP annotations are not available
+in pre-built form, you will have to make your own reference jars, install them
+in HDFS, and specify their HDFS path here. This option must be specified.
+
+[Crossbow web site]: http://bowtie-bio.sf.net/crossbow
+[HDFS]: http://hadoop.apache.org/common/docs/current/hdfs_design.html
+
+ --input <URL>
+
+[HDFS] URL where the input is located. If `--preprocess` or
+`--just-preprocess` are specified, `<URL>` should point to a manifest file.
+Otherwise, `<URL>` should point to a directory containing preprocessed reads.
+This option must be specified.
+
+ --output <URL>
+
+[HDFS] URL where the output is to be deposited. If `--just-preprocess` is
+specified, the output consists of the preprocessed reads. Otherwise, the output
+consists of the SNP calls calculated by SOAPsnp for each chromosome, organized
+as one file per chromosome. This option must be specified.
+
+ --intermediate <URL>
+
+[HDFS] URL where all intermediate results should be deposited. Default:
+`hdfs:///crossbow/intermediate/<PID>`.
+
+ --preprocess-output <URL>
+
+[HDFS] URL where the preprocessed reads should be stored. This can be useful if
+you later want to run Crossbow on the same input reads without having to re-run
+the preprocessing step (i.e. leaving `--preprocess` unspecified).
+
+ --bowtie <path>
+
+Local path to the [Bowtie] binary Crossbow should use. `bowtie` must be
+installed in this same directory on all [Hadoop] worker nodes. By default,
+Crossbow searches the `PATH` and in the directory pointed to by the
+`CROSSBOW_HOME` environment variable.
+
+ --fastq-dump <path>
+
+Path to the directory containing `fastq-dump`, which is part of the [SRA
+Toolkit]. This overrides all other ways that Crossbow searches for
+`fastq-dump`, including the `CROSSBOW_SRATOOLKIT_HOME` environment variable, the
+subdirectories of the `$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+ --soapsnp <path>
+
+Local path to the SOAPsnp executable to use when running the Call SNPs step.
+`soapsnp` must be installed in this same directory on all [Hadoop] worker nodes.
+This overrides all other ways that Crossbow searches for `soapsnp`, including
+the `CROSSBOW_SOAPSNP_HOME` environment variable, the subdirectories of the
+`$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+# Running Crossbow on a single computer via the command line
+
+## Prerequisites
+
+1. A `bowtie` v0.12.8 executable must exist on the local computer. The
+ path to `bowtie` must be specified via the `--bowtie` option OR be located
+ in the directory specified in the `$CROSSBOW_BOWTIE_HOME` environment
+ variable, OR in a subdirectory of `$CROSSBOW_HOME/bin` OR in the `PATH`
+ (search proceeds in that order). `$CROSSBOW_HOME/bin` comes with
+ pre-built Bowtie binaries for Linux and Mac OS X 10.6 or later, so most
+ Mac and Linux users do not need to install either tool.
+
+2. A Crossbow-customized version of `soapsnp` v1.02 must exist. The path
+ to `soapsnp` must be specified via the `--soapsnp` option OR be in
+ the directory specified in the `$CROSSBOW_SOAPSNP_HOME` environment
+ variable, OR in a subdirectory of `$CROSSBOW_HOME/bin` OR in the `PATH` (Crossbow searches in that order).
+ `$CROSSBOW_HOME/bin` comes with pre-built SOAPsnp binaries for Linux and
+ Mac OS X 10.6 or later. An executable from that directory is used
+ automatically unless the platform is not Mac or Linux or unless
+ overridden by `--soapsnp` or `$CROSSBOW_SOAPSNP_HOME`.
+
+3. If any of your inputs are in [Sequence Read Archive] format (i.e. end in
+ `.sra`), then the `fastq-dump` tool from the [SRA Toolkit] must be installed
+ on the local computer. The path to the `fastq-dump` tool must be specified
+ via the (`--fastq-dump`) option OR `fastq-dump` must be
+ located in the directory specified in the `CROSSBOW_FASTQ_DUMP_HOME` environment
+ variable, OR `fastq-dump` must be found in the `PATH` (Crossbow searches in that
+ order).
+
+4. Sufficient memory must be available on the local computer to hold one copy of
+ the Bowtie index for the desired organism *in addition* to all other running
+ workloads. For mammalian genomes such as the human genome, this typically
+ means that the local computer must have at least 5-6 GB of RAM.
+
+## To run
+
+Run `$CROSSBOW_HOME/cb_local` with the desired options. Options unique to local
+jobs are described in the following subsection. Options that apply to all
+running modes are described in the [General Crossbow options] subsection. To
+see example invocations of `$CROSSBOW_HOME/cb_local` see the [E. coli local] and
+[Mouse chromosome 17 local] examples.
+
+## Local-run-specific options
+
+ --reference <path>
+
+Local path where expanded reference jar is located. Specified path should have
+a `index` subdirectory with a set of Bowtie index files, a `sequences`
+subdirectory with a set of FASTA files, a `snps` subdirectory with 0 or more
+per-chromosome SNP description files, and a `cmap.txt` file. Pre-built
+reference jars for some commonly studied species (including human and mouse) are
+available from the [Crossbow web site]; these can be downloaded and expanded
+into a directory with the appropriate structure using an `unzip` utility. If
+your desired genome and/or SNP annotations are not available in pre-built form,
+you will have to make your own reference jars and specify the appropriate path.
+This option must be specified.
+
+[Crossbow web site]: http://bowtie-bio.sf.net/crossbow
+[HDFS]: http://hadoop.apache.org/common/docs/current/hdfs_design.html
+`unzip`: http://en.wikipedia.org/wiki/Unzip
+
+ --input <path>
+
+Local path where the input is located. If `--preprocess` or
+`--just-preprocess` are specified, this should point to a [manifest file].
+Otherwise, this should point to a directory containing preprocessed reads. This
+option must be specified.
+
+ --output <path>
+
+Local path where the output is to be deposited. If `--just-preprocess` is
+specified, the output consists of the preprocessed reads. Otherwise, the output
+consists of the SNP calls calculated by SOAPsnp for each chromosome, organized
+as one file per chromosome. This option must be specified.
+
+ --intermediate <path>
+
+Local path where all intermediate results should be kept temporarily (or
+permanently, if `--keep-intermediates` or `--keep-all` are specified).
+Default: `/tmp/crossbow/intermediate/<PID>`.
+
+ --preprocess-output <path>
+
+Local path where the preprocessed reads should be stored. This can be useful if
+you later want to run Crossbow on the same input reads without having to re-run
+the preprocessing step (i.e. leaving `--preprocess` unspecified).
+
+ --keep-intermediates
+
+Keep intermediate directories and files, i.e. the output from all stages prior
+to the final stage. By default these files are deleted as soon as possible.
+
+ --keep-all
+
+Keep all temporary files generated during the process of binning and sorting
+data records and moving them from stage to stage, as well as all intermediate
+results. By default these files are deleted as soon as possible.
+
+ --cpus <int>
+
+The maximum number of processors to use at any given time during the job.
+Crossbow will try to make maximal use of the processors allocated. Default: 1.
+
+ --max-sort-records <int>
+
+Maximum number of records to be dispatched to the sort routine at one time when
+sorting bins before each reduce step. For each child process, this number is
+effectively divided by the number of CPUs used (`--cpus`). The default is
+200000.
+
+ --max-sort-files <int>
+
+Maximum number of files that can be opened at once by the sort routine when
+sorting bins before each reduce step. For each child process, this number is
+effectively divided by the number of CPUs used (`--cpus`). The default is 40.
+
+ --bowtie <path>
+
+Path to the Bowtie executable to use when running the Align step. This
+overrides all other ways that Crossbow searches for `bowtie`, including the
+`CROSSBOW_BOWTIE_HOME` environment variable, the subdirectories of the
+`$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+ --fastq-dump <path>
+
+Path to the directory containing the programs in the [SRA toolkit], including
+`fastq-dump`. This overrides all other ways that Crossbow searches for
+`fastq-dump`, including the `CROSSBOW_SRATOOLKIT_HOME` environment variable, the
+subdirectories of the `$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+ --soapsnp <path>
+
+Path to the SOAPsnp executable to use when running the Call SNPs step. This
+overrides all other ways that Crossbow searches for `soapsnp`, including the
+`CROSSBOW_SOAPSNP_HOME` environment variable, the subdirectories of the
+`$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+# General Crossbow options
+
+The following options can be specified regardless of what mode ([EMR],
+[Hadoop] or local) Crossbow is run in.
+
+ --quality { phred33 | phred64 | solexa64 }
+
+Treat all input reads as having the specified quality encoding. `phred33`
+denotes the [Phred+33] or "Sanger" format whereby ASCII values 33-126 are used
+to encode qualities on the [Phred scale]. `phred64` denotes the [Phred+64] or
+"Illumina 1.3+" format whereby ASCII values 64-126 are used to encode qualities
+on the [Phred scale]. `solexa64` denotes the [Solexa+64] or "Solexa/Illumina
+1.0" format whereby ASCII values 59-126 are used to encode qualities on a
+[log-odds scale] that includes values as low as -5. Default: `phred33`.
+
+[Phred scale]: http://en.wikipedia.org/wiki/Phred_quality_score
+[Phred+33]: http://en.wikipedia.org/wiki/FASTQ_format#Encoding
+[Phred+64]: http://en.wikipedia.org/wiki/FASTQ_format#Encoding
+[Solexa+64]: http://en.wikipedia.org/wiki/FASTQ_format#Encoding
+[log-odds scale]: http://en.wikipedia.org/wiki/FASTQ_format#Variations
+
+ --preprocess
+
+The input path or URL refers to a [manifest file] rather than a directory of
+preprocessed reads. The first step in the Crossbow computation will be to
+preprocess the reads listed in the [manifest file] and store the preprocessed
+reads in the intermediate directory or in the `--preprocess-output` directory if
+it's specified. Default: off.
+
+ --just-preprocess
+
+The input path or URL refers to a [manifest file] rather than a directory of
+preprocessed reads. Crossbow will preprocess the reads listed in the [manifest
+file] and store the preprocessed reads in the `--output` directory and quit.
+Default: off.
+
+ --just-align
+
+Instead of running the Crossbow pipeline all the way through to the end, run the
+pipeline up to and including the align stage and store the results in the
+`--output` URL. To resume the run later, use `--resume-align`.
+
+ --resume-align
+
+Resume the Crossbow pipeline from just after the alignment stage. The
+`--input` URL must point to an `--output` URL from a previous run using
+`--just-align`.
+
+ --bowtie-args "<args>"
+
+Pass the specified arguments to [Bowtie] for the Align stage. Default: `-M
+1`. See the [Bowtie manual] for details on what options are available.
+
+`-M 1`: http://bowtie-bio.sf.net/manual.shtml#bowtie-options-M
+[Bowtie manual]: http://bowtie-bio.sf.net/manual.shtml
+
+ --discard-reads <fraction>
+
+Randomly discard a fraction of the input reads. E.g. specify `0.5` to discard
+50%. This applies to all input reads regardless of type (paired vs. unpaired)
+or length. This can be useful for debugging. Default: 0.0.
+
+ --discard-ref-bins <fraction>
+
+Randomly discard a fraction of the reference bins prior to SNP calling. E.g.
+specify `0.5` to discard 50% of the reference bins. This can be useful for
+debugging. Default: 0.0.
+
+ --discard-all <fraction>
+
+Equivalent to setting `--discard-reads` and `--discard-ref-bins` to
+`<fraction>`. Default: 0.0.
+
+ --soapsnp-args "<args>"
+
+Pass the specified arguments to [SOAPsnp] in the SNP calling stage. These
+options are passed to SOAPsnp regardless of whether the reference sequence under
+consideration is diploid or haploid. Default: `-2 -u -n -q`. See the [SOAPsnp
+manual] for details on what options are available.
+
+[SOAPsnp manual]: http://soap.genomics.org.cn/soapsnp.html
+
+ --soapsnp-hap-args "<args>"
+
+Pass the specified arguments to [SOAPsnp] in the SNP calling stage when the
+reference sequence under consideration is haploid. Default: `-r 0.0001`. See
+the [SOAPsnp manual] for details on what options are available.
+
+ --soapsnp-dip-args "<args>"
+
+Pass the specified arguments to [SOAPsnp] in the SNP calling stage when the
+reference sequence under consideration is diploid. Default: `-r 0.00005 -e
+0.0001`. See the [SOAPsnp manual] for details on what options are available.
+
+ --haploids <chromosome-list>
+
+The specified comma-separated list of chromosome names are to be treated as
+haploid by SOAPsnp. The rest are treated as diploid. Default: all chromosomes
+are treated as diploid.
+
+ --all-haploids
+
+If specified, all chromosomes are treated as haploid by SOAPsnp.
+
+ --partition-len <int>
+
+The bin size to use when binning alignments into partitions prior to SNP
+calling. If load imbalance occurs in the SNP calling step (some tasks taking
+far longer than others), try decreasing this. Default: 1,000,000.
+
+
+
+ --dry-run
+
+Just generate a script containing the commands needed to launch the job, but
+don't run it. The script's location will be printed so that you may run it
+later.
+
+ --test
+
+Instead of running Crossbow, just search for the supporting tools ([Bowtie] and
+[SOAPsnp]) and report whether and how they were found. If running in Cloud Mode,
+this just tests whether the `elastic-mapreduce` script is locatable and
+runnable. Use this option to debug your local Crossbow installation.
+
+ --tempdir <path>
+
+Local directory where temporary files (e.g. dynamically generated scripts)
+should be deposited. Default: `/tmp/Crossbow/invoke.scripts`.
+
+# Crossbow examples
+
+The following subsections guide you step-by-step through examples included with
+the Crossbow package. Because reads (and sometimes reference jars) must be
+obtained over the Internet, running these examples requires an active Internet
+connection.
+
+## E. coli (small)
+
+Data for this example is taken from the study by [Parkhomchuk et al].
+
+[Parkhomchuk et al]: http://www.pnas.org/content/early/2009/11/19/0906681106.abstract
+
+ EMR
+
+ Via web interface
+
+Identify an [S3] bucket to hold the job's input and output. You may
+need to create an [S3 bucket] for this purpose. See your [S3 tool]'s
+documentation.
+
+[S3 bucket]: http://docs.amazonwebservices.com/AmazonS3/latest/index.html?UsingBucket.html
+
+Use an [S3 tool] to upload `$CROSSBOW_HOME/example/e_coli/small.manifest` to
+the `example/e_coli` subdirectory in your bucket. You can do so with this
+[s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+
+Direct your web browser to the [Crossbow web interface] and fill in the form as
+below (substituting for `<YOUR-BUCKET>`):
+
+1. For **AWS ID**, enter your AWS Access Key ID
+2. For **AWS Secret Key**, enter your AWS Secret Access Key
+3. *Optional*: For **AWS Keypair name**, enter the name of
+ your AWS keypair. This is only necessary if you would like to be
+ able to [ssh] into the [EMR] cluster while it runs.
+4. *Optional*: Check that the AWS ID and Secret Key entered are
+ valid by clicking the "Check credentials..." link
+5. For **Job name**, enter `Crossbow-Ecoli`
+6. Make sure that **Job type** is set to "Crossbow"
+7. For **Input URL**, enter
+ `s3n://<YOUR-BUCKET>/example/e_coli/small.manifest`, substituting
+ for `<YOUR-BUCKET>`
+8. *Optional*: Check that the Input URL exists by clicking the
+ "Check that input URL exists..." link
+9. For **Output URL**, enter
+ `s3n://<YOUR-BUCKET>/example/e_coli/output_small`, substituting for
+ `<YOUR-BUCKET>`
+10. *Optional*: Check that the Output URL does not exist by
+ clicking the "Check that output URL doesn't exist..." link
+11. For **Input type**, select "Manifest file"
+12. For **Genome/Annotation**, select "E. coli" from the drop-down
+ menu
+13. For **Chromosome ploidy**, select "All are haploid"
+14. Click Submit
+
+This job typically takes about 30 minutes on 1 `c1.xlarge` [EC2] node. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/e_coli/output_small` directory.
+
+[ssh]: http://en.wikipedia.org/wiki/Secure_Shell
+
+ Via command line
+
+Test your Crossbow installation by running:
+
+ $CROSSBOW_HOME/cb_emr --test
+
+This will warn you if any supporting tools (`elastic-mapreduce` in this case)
+cannot be located or run.
+
+Identify an [S3] bucket to hold the job's input and output. You may need to
+create an [S3 bucket] for this purpose. See your [S3 tool]'s documentation.
+
+Use your [S3 tool] to upload `$CROSSBOW_HOME/example/e_coli/small.manifest` to
+the `example/e_coli` subdirectory in your bucket. You can do so with this
+[s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+
+Start the [EMR] job with the following command (substituting for
+`<YOUR-BUCKET>`):
+
+ $CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Ecoli" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/e_coli/small.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/e_coli/output_small \
+ --reference=s3n://crossbow-refs/e_coli.jar \
+ --all-haploids
+
+The `--reference` option instructs Crossbow to use a pre-built reference jar at
+URL `s3n://crossbow-refs/e_coli.jar`. The `--preprocess` option instructs
+Crossbow to treat the input as a [manifest file], rather than a directory of
+already-preprocessed reads. As the first stage of the pipeline, Crossbow
+downloads files specified in the manifest file and preprocesses them into
+Crossbow's read format. `--output` specifies where the final output is placed.
+
+This job typically takes about 30 minutes on 1 `c1.xlarge` [EC2] node. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/e_coli/output_small` directory.
+
+ Hadoop
+
+Log into the [Hadoop] master node and test your Crossbow installation by running:
+
+ $CROSSBOW_HOME/cb_hadoop --test
+
+This will tell you if any of the supporting tools or packages are missing on the
+master. *You must also ensure* that the same tools are installed in the same
+paths on all slave nodes, and are runnable by the slaves.
+
+From the master, download the file named `e_coli.jar` from the following URL:
+
+ http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+E.g. with this command:
+
+ wget http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+Equivalently, you can use an [S3 tool] to download the same file from this URL:
+
+ s3n://crossbow-refs/e_coli.jar
+
+E.g. with this [s3cmd] command:
+
+ s3cmd get s3://crossbow-refs/e_coli.jar
+
+Install `e_coli.jar` in [HDFS] (the [Hadoop] distributed filesystem) with the
+following commands. If the `hadoop` script is not in your `PATH`, either add it
+to your `PATH` (recommended) or specify the full path to the `hadoop` script in
+the following commands.
+
+ hadoop dfs -mkdir /crossbow-refs
+ hadoop dfs -put e_coli.jar /crossbow-refs/e_coli.jar
+
+The first creates a directory in [HDFS] (you will see a warning message if the
+directory already exists) and the second copies the local jar files into that
+directory. In this example, we deposit the jars in the `/crossbow-refs`
+directory, but any [HDFS] directory is fine.
+
+Remove the local `e_coli.jar` file to save space. E.g.:
+
+ rm -f e_coli.jar
+
+Next install the [manifest file] in [HDFS]:
+
+ hadoop dfs -mkdir /crossbow/example/e_coli
+ hadoop dfs -put $CROSSBOW_HOME/example/e_coli/small.manifest /crossbow/example/e_coli/small.manifest
+
+Now start the job by running:
+
+ $CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/e_coli/small.manifest \
+ --output=hdfs:///crossbow/example/e_coli/output_small \
+ --reference=hdfs:///crossbow-refs/e_coli.jar \
+ --all-haploids
+
+The `--preprocess` option instructs Crossbow to treat the input as a [manifest
+file]. As the first stage of the pipeline, Crossbow will download the files
+specified on each line of the manifest file and preprocess them into Crossbow's
+read format. The `--reference` option specifies the location of the reference
+jar contents. The `--output` option specifies where the final output is
+placed.
+
+ Single computer
+
+Test your Crossbow installation by running:
+
+ $CROSSBOW_HOME/cb_local --test
+
+This will warn you if any supporting tools (`bowtie` and `soapsnp` in this case)
+cannot be located or run.
+
+If you don't already have a `CROSSBOW_REFS` directory, choose one; it will be
+the default path Crossbow searches for reference jars. Permanently set the
+`CROSSBOW_REFS` environment variable to the selected directory.
+
+Create a subdirectory called `$CROSSBOW_REFS/e_coli`:
+
+ mkdir $CROSSBOW_REFS/e_coli
+
+Download `e_coli.jar` from the following URL to the new `e_coli` directory:
+
+ http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+E.g. with this command:
+
+ wget -O $CROSSBOW_REFS/e_coli/e_coli.jar http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+Equivalently, you can use an [S3 tool] to download the same file from this URL:
+
+ s3n://crossbow-refs/e_coli.jar
+
+E.g. with this [s3cmd] command:
+
+ s3cmd get s3://crossbow-refs/e_coli.jar $CROSSBOW_REFS/e_coli/e_coli.jar
+
+Change to the new `e_coli` directory and expand `e_coli.jar` using an `unzip` or
+`jar` utility:
+
+ cd $CROSSBOW_REFS/e_coli && unzip e_coli.jar
+
+Now you may remove `e_coli.jar` to save space:
+
+ rm -f $CROSSBOW_REFS/e_coli/e_coli.jar
+
+Now run Crossbow. Change to the `$CROSSBOW_HOME/example/e_coli` directory and
+start the job via the `cb_local` script:
+
+ cd $CROSSBOW_HOME/example/e_coli
+ $CROSSBOW_HOME/cb_local \
+ --input=small.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/e_coli \
+ --output=output_small \
+ --all-haploids \
+ --cpus=<CPUS>
+
+Substitute the number of CPUs you'd like to use for `<CPUS>`.
+
+The `--preprocess` option instructs Crossbow to treat the input as a [manifest
+file]. As the first stage of the pipeline, Crossbow will download the files
+specified on each line of the manifest file and "preprocess" them into a format
+understood by Crossbow. The `--reference` option specifies the location of
+the reference jar contents. The `--output` option specifies where the final
+output is placed. The `--cpus` option enables Crossbow to use up to the
+specified number of CPUs at any given time.
+
+## Mouse chromosome 17 (large)
+
+Data for this example is taken from the study by [Sudbury, Stalker et al].
+
+[Sudbury, Stalker et al]: http://genomebiology.com/2009/10/10/R112
+
+ EMR
+
+ Via web interface
+
+First we build a reference jar for the mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches for a
+`bowtie-build` executable with the same rules Crossbow uses to search for
+`bowtie`. See [Installing Crossbow] for details. Because one of the steps
+executed by the script builds a Bowtie index of the chromosome, it should be run
+on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17 from
+ [dbSNP].
+4. Arrange this information in the directory structure expected by Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+
+Next, use an [S3 tool] to upload the `mm9_chr17.jar` file to the `crossbow-refs`
+subdirectory in your bucket. E.g. with this [s3cmd] command (substituting for
+`<YOUR-BUCKET>`):
+
+ s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+
+You may wish to remove the locally-generated reference jar files to save space.
+E.g.:
+
+ rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+
+Use an [S3 tool] to upload `$CROSSBOW_HOME/example/mouse17/full.manifest` to the
+`example/mouse17` subdirectory in your bucket. E.g. with this [s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+
+Direct your web browser to the [Crossbow web interface] and fill in the form as
+below (substituting for `<YOUR-BUCKET>`):
+
+1. For **AWS ID**, enter your AWS Access Key ID
+2. For **AWS Secret Key**, enter your AWS Secret Access Key
+3. *Optional*: For **AWS Keypair name**, enter the name of your AWS keypair.
+ This is only necessary if you would like to be able to [ssh] into the [EMR]
+ cluster while it runs.
+4. *Optional*: Check that the AWS ID and Secret Key entered are valid by
+ clicking the "Check credentials..." link
+5. For **Job name**, enter `Crossbow-Mouse17`
+6. Make sure that **Job type** is set to "Crossbow"
+7. For **Input URL**, enter
+ `s3n://<YOUR-BUCKET>/example/mouse17/full.manifest`, substituting for
+ `<YOUR-BUCKET>`
+8. *Optional*: Check that the Input URL exists by clicking the "Check that
+ input URL exists..." link
+9. For **Output URL**, enter `s3n://<YOUR-BUCKET>/example/mouse17/output_full`,
+ substituting for `<YOUR-BUCKET>`
+10. *Optional*: Check that the Output URL does not exist by clicking the "Check
+ that output URL doesn't exist..." link
+11. For **Input type**, select "Manifest file"
+12. For **Genome/Annotation**, check the box labeled "Specify reference jar
+ URL:" and enter `s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar` in the
+ text box below
+13. *Optional*: Check that the reference jar URL exists by clicking the "Check
+ that reference jar URL exists..." link
+14. For **Chromosome ploidy**, select "All are diploid"
+15. Click Submit
+
+This job typically takes about 45 minutes on 8 `c1.xlarge` [EC2] instances. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/mouse17/output_full` directory.
+
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+[ssh]: http://en.wikipedia.org/wiki/Secure_Shell
+
+ Via command line
+
+First we build a reference jar for the mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches for a
+`bowtie-build` executable with the same rules Crossbow uses to search for
+`bowtie`. See [Installing Crossbow] for details. Because one of the steps
+executed by the script builds an index of the mouse chromosome, it should be
+run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or
+more).
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17 from
+ [dbSNP].
+4. Arrange this information in the directory structure expected by Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+
+Next, use an [S3 tool] to upload the `mm9_chr17.jar` file to the `crossbow-refs`
+subdirectory in your bucket. E.g. with this [s3cmd] command (substituting for
+`<YOUR-BUCKET>`):
+
+ s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+
+You may wish to remove the locally-generated reference jar files to save space.
+E.g.:
+
+ rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+
+Use an [S3 tool] to upload `$CROSSBOW_HOME/example/mouse17/full.manifest` to the
+`example/mouse17` subdirectory in your bucket. E.g. with this [s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+
+To start the [EMR] job, run the following command (substituting for
+`<YOUR-BUCKET>`):
+
+ $CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Mouse17" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/mouse17/full.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/mouse17/output_full \
+ --reference=s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar \
+ --instances 8
+
+This job typically takes about 45 minutes on 8 `c1.xlarge` [EC2] instances. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/mouse17/output_full` directory.
+
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+
+ Hadoop
+
+First we build a reference jar for the mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches for a
+`bowtie-build` executable with the same rules Crossbow uses to search for
+`bowtie`. See [Installing Crossbow] for details. Because one of the steps
+executed by the script builds an index of the mouse chromosome, it should be
+run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or
+more).
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17 from
+ [dbSNP].
+4. Arrange this information in the directory structure expected by Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+Next, use the `hadoop` script to put the `mm9_chr17.jar` file in the
+`crossbow-refs` [HDFS] directory. Note that if `hadoop` is not in your `PATH`,
+you must specify `hadoop`'s full path instead:
+
+ hadoop dfs -mkdir /crossbow-refs
+ hadoop dfs -put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar /crossbow-refs/mm9_chr17.jar
+
+The first command will yield a warning if the directory already exists; ignore
+this. In this example, we deposit the jars in the `/crossbow-refs` directory,
+but any [HDFS] directory is fine.
+
+You may wish to remove the locally-generated reference jar files to save space.
+E.g.:
+
+ rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+
+Now install the [manifest file] in [HDFS]:
+
+ hadoop dfs -mkdir /crossbow/example/mouse17
+ hadoop dfs -put $CROSSBOW_HOME/example/mouse17/full.manifest /crossbow/example/mouse17/full.manifest
+
+To start the [Hadoop] job, run the following command:
+
+ $CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/mouse17/full.manifest \
+ --output=hdfs:///crossbow/example/mouse17/output_full \
+ --reference=hdfs:///crossbow-refs/mm9_chr17.jar
+
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+
+ Single computer
+
+First we build a reference jar for the mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches
+for a `bowtie-build` executable with the same rules Crossbow uses to
+search for `bowtie`. See [Installing Crossbow] for details. Because one
+of the steps executed by the script builds an index of the mouse
+chromosome, it should be run on a computer with plenty of memory (at
+least 4 gigabytes, preferably 6 or more).
+
+Run the following commands:
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from
+ [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17
+ from [dbSNP].
+4. Arrange this information in the directory structure expected by
+ Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+Move the directory containing the new reference jar into the
+`$CROSSBOW_REFS` directory:
+
+ mv $CROSSBOW_HOME/reftools/mm9_chr17 $CROSSBOW_REFS/
+
+Now change to the `$CROSSBOW_HOME/example/mouse17` directory and run
+Crossbow (substitute the number of CPUs you'd like to use for
+`<CPUS>`):
+
+ cd $CROSSBOW_HOME/example/mouse17
+ $CROSSBOW_HOME/cb_local \
+ --input=$CROSSBOW_HOME/example/mouse17/full.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/mm9_chr17 \
+ --output=output_full \
+ --cpus=<CPUS>
+
+[UCSC]: http://hgdownload.cse.ucsc.edu/downloads.html
+
+# Manifest files
+
+A manifest file describes a set of [FASTQ] or `.sra` formatted input
+files that might be located:
+
+[gzip]: http://en.wikipedia.org/wiki/Gzip
+[bzip2]: http://en.wikipedia.org/wiki/Bzip2
+
+1. On the local computer
+2. In [HDFS]
+3. In [S3]
+4. On an FTP or web server
+
+[FASTQ]: http://en.wikipedia.org/wiki/FASTQ_format
+
+A manifest file can contain any combination of URLs and local paths from these
+various types of sources.
+
+[FASTQ] files can be gzip or bzip2-compressed (i.e. with `.gz` or `.bz2` file
+extensions). If `.sra` files are specified in the manifest and Crossbow is
+being run in single-computer or [Hadoop] modes, then the `fastq-dump` tool must
+be installed and Crossbow must be able to locate it. See the `--fastq-dump`
+option and the [SRA Toolkit section of the manual].
+
+Each line in the manifest file represents either one file, for unpaired input
+reads, or a pair of files, for paired input reads. For a set of unpaired input
+reads, the line is formatted:
+
+ URL(tab)Optional-MD5
+
+Specifying an MD5 for the input file is optional. If it is specified, Crossbow
+will attempt to check the integrity of the file after downloading by comparing
+the observed MD5 to the user-provided MD5. To disable this checking, specify `0`
+in this field.
+
+For a set of paired input reads, the line is formatted:
+
+ URL-1(tab)Optional-MD5-1(tab)URL-2(tab)Optional-MD5-2
+
+Where `URL-1` and `URL-2` point to input files with all the #1 mates in `URL-1`
+and all the #2 mates in `URL-2`. The entries in the files must be arranged so
+that pairs "line up" in parallel. This is commonly the way public paired-end
+FASTQ datasets, such as those produced by the [1000 Genomes Project], are
+formatted. Typically these file pairs end in suffixes `_1.fastq.gz` and
+`_2.fastq.gz`.
+
+[1000 Genomes Project]: http://www.1000genomes.org/page.php
+
+Manifest files may have comment lines, which must start with the hash (`#`)
+symbol, and blank lines. Such lines are ignored by Crossbow.
+
+For examples of manifest files, see the files ending in `.manifest` in
+the `$CROSSBOW_HOME/example/e_coli` and
+`$CROSSBOW_HOME/example/mouse17` directories.
+
+# Reference jars
+
+All information about a reference sequence needed by Crossbow is encapsulated in
+a "reference jar" file. A reference jar includes a set of FASTA files encoding
+the reference sequences, a [Bowtie] index of the reference sequence, and a set
+of files encoding information about known SNPs for the species.
+
+A Crossbow reference jar is organized as:
+
+1. A `sequences` subdirectory containing one FASTA file per reference sequence.
+2. An `index` subdirectory containing the [Bowtie] index files for the reference
+ sequences.
+3. A `snps` subdirectory containing all of the SNP description files.
+
+The FASTA files in the `sequences` subdirectory must each be named `chrX.fa`,
+where `X` is the 0-based numeric id of the chromosome or sequence in the file.
+For example, for a human reference, chromosome 1's FASTA file could be named
+`chr0.fa`, chromosome 2 named `chr1.fa`, etc, all the way up to chromosomes 22,
+X and Y, named `chr21.fa`, `chr22.fa` and `chr23.fa`. Also, the names of the
+sequences within the FASTA files must match the number in the file name. I.e.,
+the first line of the FASTA file `chr0.fa` must be `>0`.
+
+The index files in the `index` subdirectory must have the basename `index`.
+I.e., the index subdirectory must contain these files:
+
+ index.1.ebwt
+ index.2.ebwt
+ index.3.ebwt
+ index.4.ebwt
+ index.rev.1.ebwt
+ index.rev.2.ebwt
+
+The index must be built using the `bowtie-build` tool distributed with
+[Bowtie]. When `bowtie-build` is executed, the FASTA files specified on the
+command line must be listed in ascending order of numeric id. For instance, for
+a set of FASTA files encoding human chromosomes 1,2,...,22,X,Y as
+`chr0.fa`,`chr1.fa`,...,`chr21.fa`, `chr22.fa`,`chr23.fa`, the command for
+`bowtie-build` must list the FASTA files in that order:
+
+ bowtie-build chr0.fa,chr1.fa,...,chr23.fa index
+
+The SNP description files in the `snps` subdirectory must also have names that
+match the corresponding FASTA files in the `sequences` subdirectory, but with
+extension `.snps`. E.g. if the sequence file for human Chromosome 1 is named
+`chr0.fa`, then the SNP description file for Chromosome 1 must be named
+`chr0.snps`. SNP description files may be omitted for some or all chromosomes.
+
+The format of the SNP description files must match the format expected by
+[SOAPsnp]'s `-s` option. The format consists of 1 SNP per line, with the
+following tab-separated fields per SNP:
+
+1. Chromosome ID
+2. 1-based offset into chromosome
+3. Whether SNP has allele frequency information (1 = yes, 0 = no)
+4. Whether SNP is validated by experiment (1 = yes, 0 = no)
+5. Whether SNP is actually an indel (1 = yes, 0 = no)
+6. Frequency of A allele, as a decimal number
+7. Frequency of C allele, as a decimal number
+8. Frequency of T allele, as a decimal number
+9. Frequency of G allele, as a decimal number
+10. SNP id (e.g. a [dbSNP] id such as `rs9976767`)
+
+Once these three subdirectories have been created and populated, they can be
+combined into a single [jar file] with a command like this:
+
+[jar file]: http://en.wikipedia.org/wiki/JAR_(file_format)
+
+ jar cf ref-XXX.jar sequences snps index
+
+To use `ref-XXX.jar` with Crossbow, you must copy it to a location where it can
+be downloaded over the internet via HTTP, FTP, or S3. Once it is placed in such
+a location, make a note of its URL.
+
+[`bowtie-build`]: http://bowtie-bio.sourceforge.net/manual.shtml#indx
+[dbSNP]: http://www.ncbi.nlm.nih.gov/projects/SNP/
+
+## Building a reference jar using automatic scripts
+
+The `reftools` subdirectory of the Crossbow package contains scripts that assist
+in building reference jars, including scripts that handle the entire process of
+building reference jars for [hg18] (UCSC human genome build 18) and [mm9] (UCSC
+mouse genome build 9). The `db2ssnp` script combines SNP and allele frequency
+information from [dbSNP] to create a `chrX.snps` file for the `snps`
+subdirectory of the reference jar. The `db2ssnp_*` scripts drive the `db2ssnp`
+script for each chromosome in the [hg18] and [mm9] genomes. The `*_jar` scripts
+drive the entire reference-jar building process, including downloading reference
+FASTA files, building a Bowtie index, and using `db2ssnp` to generate the `.snps`
+files for [hg18] and [mm9].
+
+[hg18]: http://hgdownload.cse.ucsc.edu/downloads.html#human
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+[dbSNP]: http://www.ncbi.nlm.nih.gov/projects/SNP/
+
+# Monitoring, debugging and logging
+
+## Single computer
+
+Single-computer runs of Crossbow are relatively easy to monitor and debug.
+Progress messages are printed to the console as the job runs. When there is a
+fatal error, Crossbow usually indicates exactly which log file on the local
+filesystem contains the relevant error message. Additional debugging is possible
+when intermediate and temporary files are kept rather than discarded; see
+`--keep-intermediates` and `--keep-all`. All output and logs are stored on
+the local filesystem; see `--intermediate` and
+`--output` options.
+
+## Hadoop
+
+The simplest way to monitor Crossbow [Hadoop] jobs is via the Hadoop JobTracker.
+The JobTracker is a web server that provides a point-and-click interface for
+monitoring jobs and reading output and other log files generated by those jobs,
+including after they've finished.
+
+When a job fails, you can often find the relevant error message by "drilling
+down" from the "step" level through the "job" level and "task" levels, and
+finally to the "attempt" level. To diagnose why an attempt failed, click
+through to the "stderr" ("standard error") log and scan for the relevant error
+message.
+
+See your version of Hadoop's documentation for details on how to use the web
+interface. Amazon has a brief document describing [How to Use the Hadoop User
+Interface], though some of the instructions are specific to clusters rented from
+Amazon. [Hadoop, the Definitive Guide] is also an excellent reference.
+
+[How to Use the Hadoop User Interface]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?UsingtheHadoopUserInterface.html
+[Hadoop, the Definitive Guide]: http://oreilly.com/catalog/9780596521981
+
+## EMR
+
+The recommended way to monitor EMR [Hadoop] jobs is via the [AWS Console]. The
+[AWS Console] allows you to see:
+
+1. The status for each job (e.g. "COMPLETED", "RUNNING" or "FAILED")
+2. The status for each step of each job
+3. How long a job has been running for and how many "compute units" have been
+ utilized so far.
+4. The exact Hadoop commands used to initiate each job step.
+5. The button for [Debugging Job Flows]
+
+The [AWS Console] also has a useful facility for [Debugging Job Flows], which is
+accessible via the "Debug" button on the "Elastic MapReduce" tab of the Console
+(labeled "5"). You must (a) have a [SimpleDB] account and (b) not have specified
+`--no-emr-debug` in order to use all of the [EMR Debug] interface's features.
+
+The debug interface is similar to Hadoop's JobTracker interface. When a job
+fails, you can often find the relevant error message by "drilling down" from the
+"job" level, through the "task" level, and finally to the "attempt" level. To
+diagnose why an attempt failed, click through to the "stderr" ("standard error")
+log and scan for the relevant error message.
+
+For more information, see Amazon's document on [Debugging Job Flows].
+
+[Debugging Job Flows]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html
+[EMR Debug]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html
+
+## AWS Management Console
+
+A simple way to monitor your EMR activity is via the [AWS Console]. The [AWS
+Console] summarizes current information regarding all your running [EC2] nodes
+and [EMR] jobs. Each job is listed in the "Amazon Elastic MapReduce" tab of the
+console, whereas individual [EC2] nodes are listed in the "Amazon EC2" tab.
+
+# Crossbow Output
+
+Once a Crossbow job completes successfully, the output is deposited in a
+`crossbow_results` subdirectory of the specified `--output` directory or URL.
+Within the `crossbow_results` subdirectory, results are organized as one gzipped
+result file per chromosome. E.g. if your run was against the [hg18] build of
+the human genome, the output files from your experiment will be named:
+
+ <output_url>/crossbow_results/chr1.gz
+ <output_url>/crossbow_results/chr2.gz
+ <output_url>/crossbow_results/chr3.gz
+ ...
+ <output_url>/crossbow_results/chr21.gz
+ <output_url>/crossbow_results/chr22.gz
+ <output_url>/crossbow_results/chrX.gz
+ <output_url>/crossbow_results/chrY.gz
+ <output_url>/crossbow_results/chrM.gz
+
+Each individual record is in the [SOAPsnp] output format. SOAPsnp's format
+consists of 1 SNP per line with several tab-separated fields per SNP. The
+fields are:
+
+1. Chromosome ID
+2. 1-based offset into chromosome
+3. Reference genotype
+4. Subject genotype
+5. Quality score of subject genotype
+6. Best base
+7. Average quality score of best base
+8. Count of uniquely aligned reads corroborating the best base
+9. Count of all aligned reads corroborating the best base
+10. Second best base
+11. Average quality score of second best base
+12. Count of uniquely aligned reads corroborating second best base
+13. Count of all aligned reads corroborating second best base
+14. Overall sequencing depth at the site
+15. Sequencing depth of just the paired alignments at the site
+16. Rank sum test P-value
+17. Average copy number of nearby region
+18. Whether the site is a known SNP from the file specified with `-s`
+
+Note that field 15 was added in Crossbow and is not output by unmodified SOAPsnp.
+
+For further details, see the [SOAPsnp] manual.
+
+# Other reading
+
+The [Crossbow paper] discusses the broad design philosophy of both [Crossbow]
+and [Myrna] and why cloud computing can be considered a useful trend for
+comparative genomics applications. The [Bowtie paper] discusses the alignment
+algorithm underlying [Bowtie].
+
+[Bowtie paper]: http://genomebiology.com/2009/10/3/R25
+[Crossbow]: http://bowtie-bio.sf.net/crossbow
+[Crossbow paper]: http://genomebiology.com/2009/10/11/R134
+
+For additional information regarding Amazon EC2, S3, EMR, and related
+services, see Amazon's [AWS Documentation]. Some helpful screencasts
+are posted on the [AWS Console] home page.
+
+[AWS Documentation]: http://aws.amazon.com/documentation/
+
+For additional information regarding Hadoop, see the [Hadoop web site] and
+[Cloudera's Getting Started with Hadoop] document. [Cloudera's training virtual
+machine] for [VMWare] is an excellent way to get acquainted with Hadoop without
+having to install it on a production cluster.
+
+[Cloudera's Getting Started with Hadoop]: http://www.cloudera.com/resource/getting_started_with_hadoop
+[Cloudera's training virtual machine]: http://www.cloudera.com/developers/downloads/virtual-machine/
+[VMWare]: http://www.vmware.com/
+[Hadoop web site]: http://hadoop.apache.org/
+
+# Acknowledgements
+
+[Crossbow] software is by [Ben Langmead] and [Michael C. Schatz].
+
+[Bowtie] software is by [Ben Langmead] and [Cole Trapnell].
+
+[SOAPsnp] is by Ruiqiang Li, Yingrui Li, Xiaodong Fang, Huanming Yang, Jian
+Wang, Karsten Kristiansen, and Jun Wang.
+
+[Ben Langmead]: http://faculty.jhsph.edu/default.cfm?faculty_id=2209&grouped=false&searchText=&department_id=3&departmentName=Biostatistics
+[Michael C. Schatz]: http://www.cbcb.umd.edu/~mschatz/
+[Cole Trapnell]: http://www.cs.umd.edu/~cole/
diff --git a/MANUAL.markdown b/MANUAL.markdown
new file mode 100644
index 0000000..a741032
--- /dev/null
+++ b/MANUAL.markdown
@@ -0,0 +1,2144 @@
+% Crossbow: Parallel short read genotyping in the cloud
+% Ben Langmead and Michael C. Schatz
+% http://bowtie-bio.sf.net/crossbow
+
+# What is Crossbow?
+
+[Crossbow] is a scalable, portable, and automatic Cloud Computing tool for
+finding SNPs from short read data. Crossbow employs [Bowtie] and a modified
+version of [SOAPsnp] to perform the short read alignment and SNP calling
+respectively. Crossbow is designed to be easy to run (a) in "the cloud" (in
+this case, Amazon's [Elastic MapReduce] service), (b) on any [Hadoop] cluster,
+or (c) on any single computer, without [Hadoop]. Crossbow exploits the
+availability of multiple computers and processors where possible.
+
+[Crossbow]: http://bowtie-bio.sf.net/crossbow
+[Bowtie]: http://bowtie-bio.sf.net
+[SOAPsnp]: http://soap.genomics.org.cn/soapsnp.html
+[Elastic MapReduce]: http://aws.amazon.com/elasticmapreduce "Amazon Elastic MapReduce"
+
+# A word of caution
+
+Renting resources from [Amazon Web Services] (AKA [AWS]) costs money,
+regardless of whether your experiment ultimately succeeds or fails. In some
+cases, Crossbow or its documentation may be partially to blame for a failed
+experiment. While we are happy to accept bug reports, we do not accept
+responsibility for financial damage caused by these errors. Crossbow is
+provided "as is" with no warranty. See `LICENSE` file.
+
+[Amazon Web Services]: http://aws.amazon.com
+[Amazon EC2]: http://aws.amazon.com/ec2
+[Amazon S3]: http://aws.amazon.com/s3
+[Amazon EMR]: http://aws.amazon.com/elasticmapreduce
+[Amazon SimpleDB]: http://aws.amazon.com/simpledb
+[AWS]: http://aws.amazon.com
+
+# Crossbow modes and prerequisites
+
+Crossbow can be run in four different ways.
+
+1. **Via the [Crossbow web interface]**
+
+ In this case, the [Crossbow] code and the user interface are installed on EC2
+ web servers. Also, the computers running the Crossbow computation are rented
+ from Amazon, and the user must have [EC2], [EMR], [S3] and [SimpleDB]
+ accounts and must pay the [going rate] for the resources used. The user does
+ not need any special software besides a web browser and, in most cases, an
+ [S3 tool].
+
+[S3 tool]: #s3-tools
+[Crossbow web interface]: http://bowtie-bio.sf.net/crossbow/ui.html
+
+2. **On Amazon [Elastic MapReduce] via the command-line**
+
+ In this case, the Crossbow code is hosted by Amazon and the computers running
+ the Crossbow computation are rented from Amazon. However, the user must
+ install and run (a) the Crossbow scripts, which require [Perl] 5.6 or later,
+ (b) Amazon's [`elastic-mapreduce`] script, which requires Ruby 1.8 or later,
+ and (c) an [S3 tool]. The user must have [EC2], [EMR], [S3] and [SimpleDB]
+ accounts and must pay the [going rate] for the resources used.
+
+[S3 tool]: #s3-tools
+
+3. **On a [Hadoop] cluster via the command-line**
+
+ In this case, the Crossbow code is hosted on your [Hadoop] cluster, as are
+ supporting tools: [Bowtie], [SOAPsnp], and possibly [`fastq-dump`].
+ Supporting tools must be installed on all cluster nodes, but the Crossbow
+ scripts need only be installed on the master. Crossbow was tested with
+ [Hadoop] versions 0.20 and 0.20.205, and might also be compatible with other
+ versions newer than 0.20. Crossbow scripts require [Perl] 5.6 or later.
+
+4. **On any computer via the command-line**
+
+ In this case, the Crossbow code and all supporting tools ([Bowtie],
+ [SOAPsnp], and possibly [`fastq-dump`]) must be installed on the computer
+ running Crossbow. Crossbow scripts require [Perl] 5.6 or later. The user
+ specifies the maximum number of CPUs that Crossbow should use at a time.
+ This mode does *not* require [Java] or [Hadoop].
+
+[Amazon EMR]: http://aws.amazon.com/elasticmapreduce
+[Elastic MapReduce]: http://aws.amazon.com/elasticmapreduce
+[EMR]: http://aws.amazon.com/elasticmapreduce
+[S3]: http://aws.amazon.com/s3
+[EC2]: http://aws.amazon.com/ec2
+[going rate]: http://aws.amazon.com/ec2/#pricing
+[Elastic MapReduce web interface]: https://console.aws.amazon.com/elasticmapreduce/home
+[AWS Console]: https://console.aws.amazon.com
+[AWS console]: https://console.aws.amazon.com
+[`elastic-mapreduce`]: http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1
+[Java]: http://java.sun.com/
+[Hadoop]: http://hadoop.apache.org/
+[R]: http://www.r-project.org/
+[Bioconductor]: http://www.bioconductor.org/
+[Perl]: http://www.perl.org/get.html
+
+# Preparing to run on Amazon Elastic MapReduce
+
+Before running Crossbow on [EMR], you must have an [AWS] account with the
+appropriate features enabled. You may also need to [install Amazon's
+`elastic-mapreduce` tool]. In addition, you may want to install an [S3 tool],
+though most users can simply use [Amazon's web interface for S3], which requires
+no installation.
+
+If you plan to run Crossbow exclusively on a single computer or on a [Hadoop]
+cluster, you can skip this section.
+
+[Amazon's web interface for S3]: https://console.aws.amazon.com/s3/home
+[Installing Amazon's `elastic-mapreduce` tool]: #installing-amazons-elastic-mapreduce-tool
+
+1. Create an AWS account by navigating to the [AWS page]. Click "Sign Up Now"
+ in the upper right-hand corner and follow the instructions. You will be asked
+ to accept the [AWS Customer Agreement].
+
+2. Sign up for [EC2] and [S3]. Navigate to the [Amazon EC2] page, click on
+ "Sign Up For Amazon EC2" and follow the instructions. This step requires you
+ to enter credit card information. Once this is complete, your AWS account
+ will be permitted to use [EC2] and [S3], which are required.
+
+3. Sign up for [EMR]. Navigate to the [Elastic MapReduce] page, click on "Sign
+ up for Elastic MapReduce" and follow the instructions. Once this is complete,
+ your AWS account will be permitted to use [EMR], which is required.
+
+4. Sign up for [SimpleDB]. With [SimpleDB] enabled, you have the option of
+ using the [AWS Console]'s [Job Flow Debugging] feature. This is a convenient
+ way to monitor your job's progress and diagnose errors.
+
+5. *Optional*: Request an increase to your instance limit. By default, Amazon
+ allows you to allocate EC2 clusters with up to 20 instances (virtual
+ computers). To be permitted to work with more instances, fill in the form on
+ the [Request to Increase] page. You may have to speak to an Amazon
+ representative and/or wait several business days before your request is
+ granted.
+
+To see a list of AWS services you've already signed up for, see your [Account
+Activity] page. If "Amazon Elastic Compute Cloud", "Amazon Simple Storage
+Service", "Amazon Elastic MapReduce" and "Amazon SimpleDB" all appear there, you
+are ready to proceed.
+
+Be sure to make a note of the various numbers and names associated with your
+accounts, especially your Access Key ID, Secret Access Key, and your EC2 key
+pair name. You will have to refer to these and other account details in the
+future.
+
+[install Amazon's `elastic-mapreduce` tool]: #installing-amazons-elastic-mapreduce-tool
+[AWS Customer Agreement]: http://aws.amazon.com/agreement/
+[Request to Increase]: http://aws.amazon.com/contact-us/ec2-request/
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+[SimpleDB]: http://aws.amazon.com/simpledb/
+[Account Activity]: http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary
+
+## Installing Amazon's `elastic-mapreduce` tool
+
+Read this section if you plan to run Crossbow on [Elastic MapReduce] via the
+command-line tool. Skip this section if you are not using [EMR] or if you plan
+to run exclusively via the [Crossbow web interface].
+
+To install Amazon's `elastic-mapreduce` tool, follow the instructions in Amazon
+Elastic MapReduce developer's guide for [How to Download and Install Ruby and
+the Command Line Interface]. That document describes:
+
+[How to Download and Install Ruby and the Command Line Interface]: http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1
+
+1. Installing an appropriate version of [Ruby], if necessary.
+
+2. Setting up an EC2 keypair, if necessary.
+
+3. Setting up a credentials file, which is used by the `elastic-mapreduce` tool
+ for authentication.
+
+ For convenience, we suggest you name the credentials file `credentials.json`
+ and place it in the same directory with the `elastic-mapreduce` script.
+ Otherwise you will have to specify the credential file path with the
+ [`--credentials`] option each time you run `cb_emr`.
+
+We strongly recommend using a version of the `elastic-mapreduce` Ruby script
+released on or after December 8, 2011. This is when the script switched to
+using Hadoop v0.20.205 by default, which is the preferred way of running
+Crossbow.
+
+[Ruby]: http://www.ruby-lang.org/
+[Setting up an EC2 keypair]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?download_ruby.html
+
+We also recommend that you add the directory containing the `elastic-mapreduce`
+tool to your `PATH`. This allows Crossbow to locate it automatically.
+Alternately, you can specify the path to the `elastic-mapreduce` tool via the
+[`--emr-script`] option when running `cb_emr`.
+
+[AWS]: http://aws.amazon.com/ "Amazon Web Services"
+[AWS page]: http://aws.amazon.com/ "Amazon Web Services"
+[AWS Getting Started Guide]: http://docs.amazonwebservices.com/AWSEC2/latest/GettingStartedGuide/
+
+## S3 tools
+
+Running on [EMR] requires exchanging files via the cloud-based [S3] filesystem.
+[S3] is organized as a collection of [S3 buckets] in a global namespace. [S3
+charges] are incurred when transferring data to and from [S3] (but transfers
+between [EC2] and [S3] are free), and a per-GB-per-month charge applies when
+data is stored in [S3] over time.
+
+To transfer files to and from [S3], use an S3 tool. Amazon's [AWS Console] has
+an [S3 tab] that provides a friendly web-based interface to [S3], and doesn't
+require any software installation. [s3cmd] is a very good command-line tool
+that requires [Python] 2.4 or later. [S3Fox Organizer] is another GUI tool that
+works as a [Firefox] extension. Other tools include [Cyberduck] (for Mac OS
+10.6 or later) and [Bucket Explorer] (for Mac, Windows or Linux, but commercial
+software).
+
+[S3]: http://aws.amazon.com/s3/
+[S3 tab]: https://console.aws.amazon.com/s3/home
+[s3cmd]: http://s3tools.org/s3cmd
+[Python]: http://www.python.org/download/
+[Firefox]: http://www.mozilla.com/firefox/
+[S3 buckets]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[S3 bucket]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[S3 charges]: http://aws.amazon.com/s3/#pricing
+[S3Fox Organizer]: http://www.s3fox.net/
+[Cyberduck]: http://cyberduck.ch/
+[Bucket Explorer]: http://www.bucketexplorer.com/
+
+# Installing Crossbow
+
+[Installing Crossbow]: #installing-crossbow
+
+Crossbow consists of a set of [Perl] and shell scripts, plus supporting tools:
+[Bowtie] and [SOAPsnp]. If you plan to run Crossbow via the [Crossbow web
+interface] exclusively, there is nothing to install. Otherwise:
+
+1. Download the desired version of Crossbow from the [sourceforge site]
+
+2. [Extract the zip archive]
+
+3. Set the `CROSSBOW_HOME` environment variable to point to the extracted
+ directory (containing `cb_emr`)
+
+4. *If you plan to run on a local computer or [Hadoop] cluster*:
+
+ If using Linux or Mac OS 10.6 or later, you likely don't have to install
+ [Bowtie] or [SOAPsnp], as Crossbow comes with compatible versions of both
+ pre-installed. Test this by running:
+
+ $CROSSBOW_HOME/cb_local --test
+
+ If the install test passes, installation is complete.
+
+ If the install test indicates [Bowtie] is not installed, obtain or build a
+ `bowtie` binary v0.12.8 or higher and install it by setting the
+ `CROSSBOW_BOWTIE_HOME` environment variable to `bowtie`'s enclosing
+ directory. Alternately, add the enclosing directory to your `PATH` or
+ specify the full path to `bowtie` via the `--bowtie` option when running
+ Crossbow scripts.
+
+ If the install test indicates that [SOAPsnp] is not installed, build the
+ `soapsnp` binary using the sources and makefile in `CROSSBOW_HOME/soapsnp`.
+ You must have compiler tools such as GNU `make` and `g++` installed for this
+ to work. If you are using a Mac, you may need to install the [Apple
+ developer tools]. To build the `soapsnp` binary, run:
+
+ make -C $CROSSBOW_HOME/soapsnp
+
+ Now install `soapsnp` by setting the `CROSSBOW_SOAPSNP_HOME` environment
+ variable to `soapsnp`'s enclosing directory. Alternately, add the enclosing
+ directory to your `PATH` or specify the full path to `soapsnp` via the
+ `--soapsnp` option when running Crossbow scripts.
+
+5. *If you plan to run on a [Hadoop] cluster*, you may need to manually copy
+ the `bowtie` and `soapsnp` executables, and possibly also the `fastq-dump`
+ executable, to the same path on each of your [Hadoop] cluster nodes. You
+ can avoid this step by installing `bowtie`, `soapsnp` and `fastq-dump` on a
+ filesystem shared by all [Hadoop] nodes (e.g. an [NFS share]). You can also
+ skip this step if [Hadoop] is installed in [pseudo distributed] mode,
+ meaning that the cluster really consists of one node whose CPUs are treated
+ as distinct slaves.
+
+[NFS share]: http://en.wikipedia.org/wiki/Network_File_System_(protocol)
+[pseudo distributed]: http://hadoop.apache.org/common/docs/current/quickstart.html#PseudoDistributed
+
+## The SRA toolkit
+
+The [Sequence Read Archive] (SRA) is a resource at the [National Center for
+Biotechnology Information] (NCBI) for storing sequence data from modern
+sequencing instruments. Sequence data underlying many studies, including very
+large studies, can often be downloaded from this archive.
+
+The SRA uses a special file format to store archived read data. These files end
+in the extension [`.sra`], and they can be specified as inputs to Crossbow's
+preprocessing step in exactly the same way as [FASTQ] files.
+
+However, if you plan to use [`.sra`] files as input to Crossbow in either
+[Hadoop] mode or in single-computer mode, you must first install the [SRA
+toolkit]'s `fastq-dump` tool appropriately. See the [SRA toolkit] page for
+details about how to download and install.
+
+When searching for the `fastq-dump` tool at runtime, Crossbow searches the
+following places in order:
+
+1. The path specified in the [`--fastq-dump`] option
+2. The directory specified in the `$CROSSBOW_SRATOOLKIT_HOME` environment
+ variable.
+3. In the system `PATH`
+
+[Sequence Read Archive]: http://www.ncbi.nlm.nih.gov/books/NBK47533/
+[National Center for Biotechnology Information]: http://www.ncbi.nlm.nih.gov/
+[SRA toolkit]: http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software
+
+# Running Crossbow
+
+The commands for invoking Crossbow from the command line are:
+
+`$CROSSBOW_HOME/cb_emr` (or just `cb_emr` if `$CROSSBOW_HOME` is in the `PATH`)
+for running on [EMR]. See [Running Crossbow on EMR via the command line] for
+details.
+
+`$CROSSBOW_HOME/cb_hadoop` (or just `cb_hadoop` if `$CROSSBOW_HOME` is in the
+`PATH`) for running on [Hadoop]. See [Running Crossbow on a Hadoop cluster via
+the command line] for details.
+
+`$CROSSBOW_HOME/cb_local` (or just `cb_local` if `$CROSSBOW_HOME` is in the
+`PATH`) for running locally on a single computer. See [Running Crossbow on a
+single computer via the command line] for details.
+
+[Apple developer tools]: http://developer.apple.com/technologies/tools/
+[NFS share]: http://en.wikipedia.org/wiki/Network_File_System_(protocol)
+[pseudo distributed]: http://hadoop.apache.org/common/docs/current/quickstart.html#PseudoDistributed
+[sourceforge site]: http://bowtie-bio.sf.net/crossbow
+[Extract the zip archive]: http://en.wikipedia.org/wiki/ZIP_(file_format)
+[Running Crossbow on EMR via the command line]: #running-crossbow-on-emr-via-the-command-line
+[Running Crossbow on a Hadoop cluster via the command line]: #running-crossbow-on-a-hadoop-cluster-via-the-command-line
+[Running Crossbow on a single computer via the command line]: #running-crossbow-on-a-single-computer-via-the-command-line
+
+# Running Crossbow on EMR via the EMR web interface
+
+## Prerequisites
+
+1. Web browser
+2. [EC2], [S3], [EMR], and [SimpleDB] accounts. To check which ones you've
+ already enabled, visit your [Account Activity] page.
+3. A tool for browsing and exchanging files with [S3]
+ a. The [AWS Console]'s [S3 tab] is a good web-based tool that does not
+ require software installation
+ b. A good command line tool is [s3cmd]
+ c. A good GUI tool is [S3Fox Organizer], which is a Firefox Plugin
+ d. Others include [Cyberduck], [Bucket Explorer]
+4. Basic knowledge regarding:
+ a. [What S3 is], [what an S3 bucket is], how to create one, how to upload a
+ file to an S3 bucket from your computer (see your S3 tool's documentation).
+ b. How much AWS resources [will cost you]
+
+[Account Activity]: http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary
+[s3cmd]: http://s3tools.org/s3cmd
+[S3Fox Organizer]: http://www.s3fox.net/
+[Cyberduck]: http://cyberduck.ch/
+[Bucket Explorer]: http://www.bucketexplorer.com/
+[What S3 is]: http://aws.amazon.com/s3/
+[what an S3 bucket is]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[will cost you]: http://aws.amazon.com/ec2/#pricing
+
+## To run
+
+1. *If the input reads have not yet been preprocessed by Crossbow* (i.e. input
+ is [FASTQ] or [`.sra`]), then first (a) prepare a [manifest file] with URLs
+ pointing to the read files, and (b) upload it to an [S3] bucket that you
+ own. See your [S3] tool's documentation for how to create a bucket and
+ upload a file to it. The URL for the [manifest file] will be the input URL
+ for your [EMR] job.
+
+ *If the input reads have already been preprocessed by Crossbow*, make a note
+ of the [S3] URL where they're located. This will be the input URL for
+ your [EMR] job.
+
+2. *If you are using a pre-built reference jar*, make a note of its [S3] URL.
+ This will be the reference URL for your [EMR] job. See the [Crossbow
+ website] for a list of pre-built reference jars and their URLs.
+
+ *If you are not using a pre-built reference jar*, you may need to [build the
+ reference jars] and/or upload them to an [S3] bucket you own. See your [S3
+ tool]'s documentation for how to create a bucket and upload to it. The URL
+ for the main reference jar will be the reference URL for your [EMR] job.
+
+[Crossbow website]: http://bowtie-bio.sf.net/crossbow
+[build the reference jars]: #reference-jars
+[S3 tool]: #s3-tools
+[`.sra`]: http://www.ncbi.nlm.nih.gov/books/NBK47540/
+
+3. In a web browser, go to the [Crossbow web interface].
+
+4. Fill in the form according to your job's parameters. We recommend filling in
+ and validating the "AWS ID" and "AWS Secret Key" fields first. Also, when
+ entering S3 URLs (e.g. "Input URL" and "Output URL"), we recommend that users
+ validate the entered URLs by clicking the link below it. This avoids failed
+ jobs due to simple URL issues (e.g. non-existence of the "Input URL"). For
+ examples of how to fill in this form, see the [E. coli EMR] and [Mouse
+ chromosome 17 EMR] examples.
+
+[Monitoring your EMR jobs]: #monitoring-your-emr-jobs
+
+# Running Crossbow on EMR via the command line
+
+## Prerequisites
+
+1. [EC2], [S3], [EMR], and [SimpleDB] accounts. To check which ones you've
+ already enabled, visit your [Account Activity] page.
+2. A tool for browsing and exchanging files with [S3]
+ a. The [AWS Console]'s [S3 tab] is a good web-based tool that does not
+ require software installation
+ b. A good command line tool is [s3cmd]
+ c. A good GUI tool is [S3Fox Organizer], which is a Firefox Plugin
+ d. Others include [Cyberduck], [Bucket Explorer]
+3. Basic knowledge regarding:
+ a. [What S3 is], [what an S3 bucket is], how to create one, how to upload a
+ file to an S3 bucket from your computer (see your S3 tool's documentation).
+ b. How much AWS resources [will cost you]
+
+[Account Activity]: http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary
+[s3cmd]: http://s3tools.org/s3cmd
+[S3Fox Organizer]: http://www.s3fox.net/
+[Cyberduck]: http://cyberduck.ch/
+[Bucket Explorer]: http://www.bucketexplorer.com/
+[What S3 is]: http://aws.amazon.com/s3/
+[What an S3 bucket is]: http://docs.amazonwebservices.com/AmazonS3/latest/gsg/
+[will cost you]: http://aws.amazon.com/ec2/#pricing
+
+## To run
+
+1. *If the input reads have not yet been preprocessed by Crossbow* (i.e. input
+ is [FASTQ] or [`.sra`]), then first (a) prepare a [manifest file] with URLs
+ pointing to the read files, and (b) upload it to an [S3] bucket that you
+ own. See your [S3] tool's documentation for how to create a bucket and
+ upload a file to it. The URL for the [manifest file] will be the input URL
+ for your [EMR] job.
+
+ *If the input reads have already been preprocessed by Crossbow*, make a note
+ of the [S3] URL where they're located. This will be the input URL for
+ your [EMR] job.
+
+2. *If you are using a pre-built reference jar*, make a note of its [S3] URL.
+ This will be the reference URL for your [EMR] job. See the [Crossbow
+ website] for a list of pre-built reference jars and their URLs.
+
+ *If you are not using a pre-built reference jar*, you may need to [build the
+ reference jars] and/or upload them to an [S3] bucket you own. See your [S3
+ tool]'s documentation for how to create a bucket and upload to it. The URL
+ for the main reference jar will be the reference URL for your [EMR] job.
+
+[Crossbow website]: http://bowtie-bio.sf.net/crossbow
+[build the reference jars]: #reference-jars
+[S3 tool]: #s3-tools
+
+3. Run `$CROSSBOW_HOME/cb_emr` with the desired options. Options that are unique
+ to [EMR] jobs are described in the following section. Options that apply to
+ all running modes are described in the [General Crossbow options] section.
+ For examples of how to run `$CROSSBOW_HOME/cb_emr` see the [E. coli EMR] and
+ [Mouse chromosome 17 EMR] examples.
+
+[General Crossbow options]: #general-crossbow-options
+[E. coli EMR]: #cb-example-e-coli-emr
+[Mouse chromosome 17 EMR]: #cb-example-mouse17-emr
+
+## EMR-specific options
+
+<table>
+
+<tr><td id="cb-emr-reference">
+
+[`--reference`]: #cb-emr-reference
+
+ --reference <URL>
+
+</td><td>
+
+[S3] URL where the reference jar is located. URLs for pre-built reference jars
+for some commonly studied species (including human and mouse) are available from
+the [Crossbow web site]. Note that a [Myrna] reference jar is not the same as a
+[Crossbow] reference jar. If your desired genome and/or SNP annotations are not
+available in pre-built form, you will have to make your own reference jar and
+upload it to one of your own S3 buckets (see [Reference jars]). This option
+must be specified.
+
+[Myrna]: http://bowtie-bio.sf.net/myrna
+[Reference jars]: #reference-jars
+[Crossbow web site]: http://bowtie-bio.sf.net/crossbow
+
+</td></tr><tr><td id="cb-emr-input">
+
+[`--input`]: #cb-emr-input
+
+ --input <URL>
+
+</td><td>
+
+[S3] URL where the input is located. If [`--preprocess`] or
+[`--just-preprocess`] are specified, `<URL>` should point to a [manifest file].
+Otherwise, `<URL>` should point to a directory containing preprocessed reads.
+This option must be specified.
+
+</td></tr><tr><td id="cb-emr-output">
+
+[`--output`]: #cb-emr-output
+
+ --output <URL>
+
+</td><td>
+
+[S3] URL where the output is to be deposited. If [`--just-preprocess`] is
+specified, the output consists of the preprocessed reads. Otherwise, the output
+consists of the SNP calls calculated by [SOAPsnp] for each chromosome in the
+[Crossbow output format], organized as one file per chromosome. This option
+must be specified.
+
+[Crossbow output format]: #cb-output
+
+</td></tr><tr><td id="cb-emr-intermediate">
+
+[`--intermediate`]: #cb-emr-intermediate
+
+ --intermediate <URL>
+
+</td><td>
+
+[S3] URL where all intermediate results should be deposited. This can be
+useful if you later want to resume the computation from partway through the
+pipeline (e.g. after alignment but before SNP calling). By default,
+intermediate results are stored in [HDFS] and disappear once the cluster is
+terminated.
+
+</td></tr><tr><td id="cb-emr-preprocess-output">
+
+[`--preprocess-output`]: #cb-emr-preprocess-output
+
+ --preprocess-output <URL>
+
+</td><td>
+
+[S3] URL where the preprocessed reads should be stored. This can be useful if
+you later want to run Crossbow on the same input reads without having to re-run
+the preprocessing step (i.e. leaving [`--preprocess`] unspecified).
+
+</td></tr><tr><td id="cb-emr-credentials">
+
+[`--credentials`]: #cb-emr-credentials
+
+ --credentials <id>
+
+</td><td>
+
+Local path to the credentials file set up by the user when the
+[`elastic-mapreduce`] script was installed (see [Installing Amazon's
+`elastic-mapreduce` tool]). Default: use `elastic-mapreduce`'s default (i.e.
+the `credentials.json` file in the same directory as the `elastic-mapreduce`
+script). If `--credentials` is not specified and the default `credentials.json`
+file doesn't exist, `elastic-mapreduce` will abort with an error message.
+
+[Installing Amazon's `elastic-mapreduce` tool]: #installing-amazons-elastic-mapreduce-tool
+
+</td></tr><tr><td id="cb-emr-script">
+
+[`--emr-script`]: #cb-emr-script
+
+ --emr-script <path>
+
+</td><td>
+
+Local path to the `elastic-mapreduce` script. By default, Crossbow looks first
+in the `$CROSSBOW_EMR_HOME` directory, then in the `PATH`.
+
+</td></tr><tr><td id="cb-emr-name">
+
+[`--name`]: #cb-emr-name
+
+ --name <string>
+
+</td><td>
+
+Specify the name by which the job will be identified in the [AWS Console].
+
+</td></tr><tr><td id="cb-emr-stay-alive">
+
+[`--stay-alive`]: #cb-emr-stay-alive
+
+ --stay-alive
+
+</td><td>
+
+By default, [EMR] will terminate the cluster as soon as (a) one of the stages
+fails, or (b) the job completes successfully. Specify this option to force [EMR]
+to keep the cluster alive in either case.
+
+</td></tr><tr><td id="cb-emr-instances">
+
+[`--instances`]: #cb-emr-instances
+
+ --instances <int>
+
+</td><td>
+
+Specify the number of instances (i.e. virtual computers, also called nodes) to
+be allocated to your cluster. If set to 1, the 1 instance will function as both
+[Hadoop] master and slave node. If set greater than 1, one instance will
+function as a [Hadoop] master and the rest will function as [Hadoop] slaves. In
+general, the greater the value of `<int>`, the faster the Crossbow computation
+will complete. Consider the desired speed as well as the [going rate] when
+choosing a value for `<int>`. Default: 1.
+
+</td></tr><tr><td id="cb-emr-instance-type">
+
+[`--instance-type`]: #cb-emr-instance-type
+
+ --instance-type <type>
+
+</td><td>
+
+Specify the type of [EC2] instance to use for the computation. See Amazon's
+[list of available instance types] and be sure to specify the "API name" of the
+desired type (e.g. `m1.small` or `c1.xlarge`). **The default of `c1.xlarge` is
+strongly recommended** because it has an appropriate mix of computing power and
+memory for a large breadth of problems. Choosing an instance type with less
+than 5GB of physical RAM can cause problems when the reference is large (e.g.
+a mammalian genome). Stick to the default unless you're pretty sure the
+specified instance type can handle your problem size.
+
+[list of available instance types]: http://aws.amazon.com/ec2/instance-types/
+[`<instance-type>`]: http://aws.amazon.com/ec2/instance-types/
+
+</td></tr><tr><td id="cb-emr-args">
+
+[`--emr-args`]: #cb-emr-args
+
+ --emr-args "<args>"
+
+</td><td>
+
+Pass the specified extra arguments to the `elastic-mapreduce` script. See
+documentation for the `elastic-mapreduce` script for details.
+
+</td></tr><tr><td id="cb-logs">
+
+[`--logs`]: #cb-logs
+
+ --logs <URL>
+
+</td><td>
+
+Causes [EMR] to copy the log files to `<URL>`. Default: [EMR] writes logs to
+the `logs` subdirectory of the [`--output`] URL. See also [`--no-logs`].
+
+</td></tr><tr><td id="cb-no-logs">
+
+[`--no-logs`]: #cb-no-logs
+
+ --no-logs
+
+</td><td>
+
+By default, Crossbow causes [EMR] to copy all cluster log files to the `log`
+subdirectory of the [`--output`] URL (or another destination, if [`--logs`] is
+specified). Specifying this option disables all copying of logs.
+
+</td></tr><tr><td id="cb-no-emr-debug">
+
+[`--no-emr-debug`]: #cb-no-emr-debug
+
+ --no-emr-debug
+
+</td><td>
+
+Disables [Job Flow Debugging]. If this is *not* specified, you must have a
+[SimpleDB] account for [Job Flow Debugging] to work. You will be subject to
+additional [SimpleDB-related charges] when [Job Flow Debugging] is enabled
+(i.e. when this option is *not* specified), but those fees are typically small
+or zero (depending on your account's [SimpleDB tier]).
+
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+[SimpleDB]: http://aws.amazon.com/simpledb/
+[SimpleDB-related charges]: http://aws.amazon.com/simpledb/#pricing
+[SimpleDB tier]: http://aws.amazon.com/simpledb/#pricing
+
+</td></tr>
+</table>
+
+# Running Crossbow on a Hadoop cluster via the command line
+
+## Prerequisites
+
+1. Working installation of [Hadoop] v0.20.2 or v0.20.205. Other versions newer
+ than 0.20 might also work, but haven't been tested.
+
+2. A `bowtie` v0.12.8 executable must exist at the same path on all cluster
+ nodes (including the master). That path must be specified via the
+ [`--bowtie`](#cb-hadoop-bowtie) option OR located in the directory specified
+ in the `CROSSBOW_BOWTIE_HOME` environment variable, OR in a subdirectory of
+ `$CROSSBOW_HOME/bin` OR in the `PATH` (Crossbow looks in that order).
+ `$CROSSBOW_HOME/bin` comes with pre-built Bowtie binaries for Linux and Mac
+ OS X 10.5 or later. An executable from that directory is used automatically
+ unless the platform is not Mac or Linux or unless overridden by
+ [`--bowtie`](#cb-hadoop-bowtie) or by defining `CROSSBOW_BOWTIE_HOME`.
+
+3. A Crossbow-customized version of `soapsnp` v1.02 must be installed
+ at the same path on all cluster nodes (including the master). That
+ path must be specified via the [`--soapsnp`](#cb-hadoop-soapsnp) option OR located in the
+ directory specified in the `CROSSBOW_SOAPSNP_HOME` environment
+ variable, OR in a subdirectory of `$CROSSBOW_HOME/bin` OR in the
+ `PATH` (Crossbow searches in that order). `$CROSSBOW_HOME/bin` comes
+ with pre-built SOAPsnp binaries for Linux and Mac OS X 10.6 or
+ later. An executable from that directory is used automatically
+ unless the platform is not Mac or Linux or unless overridden by
+ [`--soapsnp`](#cb-hadoop-soapsnp) or by defining `CROSSBOW_SOAPSNP_HOME`.
+
+4. If any of your inputs are in [Sequence Read Archive] format (i.e. end in
+ `.sra`), then the `fastq-dump` tool from the [SRA Toolkit] must be installed
+ at the same path on all cluster nodes. The path to the `fastq-dump` tool
+ must be specified via the ([`--fastq-dump`](#cb-hadoop-fastq-dump)) option OR
+ `fastq-dump` must be located in the directory specified in the
+ `CROSSBOW_SRATOOLKIT_HOME` environment variable, OR `fastq-dump` must be
+ found in the `PATH` (Crossbow searches in that order).
+
+5. Sufficient memory must be available on all [Hadoop] slave nodes to
+ hold the Bowtie index for the desired organism in addition to any
+ other loads placed on those nodes by [Hadoop] or other programs.
+ For mammalian genomes such as the human genome, this typically means
+ that slave nodes must have at least 5-6 GB of RAM.
+
+## To run
+
+Run `$CROSSBOW_HOME/cb_hadoop` with the desired options. Options that are
+unique to [Hadoop] jobs are described in the following subsection. Options that
+apply to all running modes are described in the [General Crossbow options]
+subsection. To see example invocations of `$CROSSBOW_HOME/cb_hadoop` see the
+[E. coli Hadoop] and [Mouse chromosome 17 Hadoop] examples.
+
+[General Crossbow options]: #general-crossbow-options
+[E. coli Hadoop]: #cb-example-e-coli-hadoop
+[Mouse chromosome 17 Hadoop]: #cb-example-mouse17-hadoop
+
+## Hadoop-specific options
+
+<table>
+
+<tr><td id="cb-hadoop-reference">
+
+[`--reference`]: #cb-hadoop-reference
+
+ --reference <URL>
+
+</td><td>
+
+[HDFS] URL where the reference jar is located. Pre-built reference jars for
+some commonly studied species (including human and mouse) are available from the
+[Crossbow web site]; these can be downloaded and installed in HDFS using `hadoop
+dfs` commands. If your desired genome and/or SNP annotations are not available
+in pre-built form, you will have to make your own reference jars, install them
+in HDFS, and specify their HDFS path here. This option must be specified.
+
+[Crossbow web site]: http://bowtie-bio.sf.net/crossbow
+[HDFS]: http://hadoop.apache.org/common/docs/current/hdfs_design.html
+
+</td></tr><tr><td id="cb-hadoop-input">
+
+[`--input`]: #cb-hadoop-input
+
+ --input <URL>
+
+</td><td>
+
+[HDFS] URL where the input is located. If [`--preprocess`] or
+[`--just-preprocess`] are specified, `<URL>` should point to a manifest file.
+Otherwise, `<URL>` should point to a directory containing preprocessed reads.
+This option must be specified.
+
+</td></tr><tr><td id="cb-hadoop-output">
+
+[`--output`]: #cb-hadoop-output
+
+ --output <URL>
+
+</td><td>
+
+[HDFS] URL where the output is to be deposited. If [`--just-preprocess`] is
+specified, the output consists of the preprocessed reads. Otherwise, the output
+consists of the SNP calls calculated by SOAPsnp for each chromosome, organized
+as one file per chromosome. This option must be specified.
+
+</td></tr><tr><td id="cb-hadoop-intermediate">
+
+[`--intermediate`]: #cb-hadoop-intermediate
+
+ --intermediate <URL>
+
+</td><td>
+
+[HDFS] URL where all intermediate results should be deposited. Default:
+`hdfs:///crossbow/intermediate/<PID>`.
+
+</td></tr><tr><td id="cb-hadoop-preprocess-output">
+
+[`--preprocess-output`]: #cb-hadoop-preprocess-output
+
+ --preprocess-output <URL>
+
+</td><td>
+
+[HDFS] URL where the preprocessed reads should be stored. This can be useful if
+you later want to run Crossbow on the same input reads without having to re-run
+the preprocessing step (i.e. leaving [`--preprocess`] unspecified).
+
+</td></tr><tr><td id="cb-hadoop-bowtie">
+
+[`--bowtie`]: #cb-hadoop-bowtie
+
+ --bowtie <path>
+
+</td><td>
+
+Local path to the [Bowtie] binary Crossbow should use. `bowtie` must be
+installed in this same directory on all [Hadoop] worker nodes. By default,
+Crossbow searches the `PATH` and in the directory pointed to by the
+`CROSSBOW_HOME` environment variable.
+
+</td></tr><tr><td id="cb-hadoop-fastq-dump">
+
+[`--fastq-dump`]: #cb-hadoop-fastq-dump
+
+ --fastq-dump <path>
+
+</td><td>
+
+Path to the directory containing `fastq-dump`, which is part of the [SRA
+Toolkit]. This overrides all other ways that Crossbow searches for
+`fastq-dump`, including the `CROSSBOW_SRATOOLKIT_HOME` environment variable, the
+subdirectories of the `$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+</td></tr><tr><td id="cb-hadoop-soapsnp">
+
+[`--soapsnp`]: #cb-hadoop-soapsnp
+
+ --soapsnp <path>
+
+</td><td>
+
+Local path to the SOAPsnp executable to use when running the Call SNPs step.
+`soapsnp` must be installed in this same directory on all [Hadoop] worker nodes.
+This overrides all other ways that Crossbow searches for `soapsnp`, including
+the `CROSSBOW_SOAPSNP_HOME` environment variable, the subdirectories of the
+`$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+</td></tr>
+</table>
+
+# Running Crossbow on a single computer via the command line
+
+## Prerequisites
+
+1. A `bowtie` v0.12.8 executable must exist on the local computer. The
+ path to `bowtie` must be specified via the [`--bowtie`](#cb-local-bowtie) option OR be located
+ in the directory specified in the `$CROSSBOW_BOWTIE_HOME` environment
+ variable, OR in a subdirectory of `$CROSSBOW_HOME/bin` OR in the `PATH`
+ (search proceeds in that order). `$CROSSBOW_HOME/bin` comes with
+ pre-built Bowtie binaries for Linux and Mac OS X 10.6 or later, so most
+ Mac and Linux users do not need to install either tool.
+
+2. A Crossbow-customized version of `soapsnp` v1.02 must exist. The path
+ to `soapsnp` must be specified via the [`--soapsnp`](#cb-local-soapsnp) option OR be in
+ the directory specified in the `$CROSSBOW_SOAPSNP_HOME` environment
+ variable, OR in a subdirectory of `$CROSSBOW_HOME/bin` OR in the `PATH` (Crossbow searches in that order).
+ `$CROSSBOW_HOME/bin` comes with pre-built SOAPsnp binaries for Linux and
+ Mac OS X 10.6 or later. An executable from that directory is used
+ automatically unless the platform is not Mac or Linux or unless
+ overridden by [`--soapsnp`](#cb-local-soapsnp) or `$CROSSBOW_SOAPSNP_HOME`.
+
+3. If any of your inputs are in [Sequence Read Archive] format (i.e. end in
+ `.sra`), then the `fastq-dump` tool from the [SRA Toolkit] must be installed
+ on the local computer. The path to the `fastq-dump` tool must be specified
+ via the ([`--fastq-dump`](#cb-local-fastq-dump)) option OR `fastq-dump` must be
+ located in the directory specified in the `CROSSBOW_SRATOOLKIT_HOME`
+ environment variable, OR `fastq-dump` must be found in the `PATH` (Crossbow
+ searches in that order).
+
+4. Sufficient memory must be available on the local computer to hold one copy of
+ the Bowtie index for the desired organism *in addition* to all other running
+ workloads. For mammalian genomes such as the human genome, this typically
+ means that the local computer must have at least 5-6 GB of RAM.
+
+## To run
+
+Run `$CROSSBOW_HOME/cb_local` with the desired options. Options unique to local
+jobs are described in the following subsection. Options that apply to all
+running modes are described in the [General Crossbow options] subsection. To
+see example invocations of `$CROSSBOW_HOME/cb_local` see the [E. coli local] and
+[Mouse chromosome 17 local] examples.
+
+[General Crossbow options]: #general-crossbow-options
+[E. coli local]: #cb-example-e-coli-local
+[Mouse chromosome 17 local]: #cb-example-mouse17-local
+
+## Local-run-specific options
+
+<table>
+
+<tr><td id="cb-local-reference">
+
+[`--reference`]: #cb-local-reference
+
+ --reference <path>
+
+</td><td>
+
+Local path where expanded reference jar is located. Specified path should have
+an `index` subdirectory with a set of Bowtie index files, a `sequences`
+subdirectory with a set of FASTA files, a `snps` subdirectory with 0 or more
+per-chromosome SNP description files, and a `cmap.txt` file. Pre-built
+reference jars for some commonly studied species (including human and mouse) are
+available from the [Crossbow web site]; these can be downloaded and expanded
+into a directory with the appropriate structure using an [`unzip`] utility. If
+your desired genome and/or SNP annotations are not available in pre-built form,
+you will have to make your own reference jars and specify the appropriate path.
+This option must be specified.
+
+[Crossbow web site]: http://bowtie-bio.sf.net/crossbow
+[HDFS]: http://hadoop.apache.org/common/docs/current/hdfs_design.html
+[`unzip`]: http://en.wikipedia.org/wiki/Unzip
+
+</td></tr><tr><td id="cb-local-input">
+
+[`--input`]: #cb-local-input
+
+ --input <path>
+
+</td><td>
+
+Local path where the input is located. If [`--preprocess`] or
+[`--just-preprocess`] are specified, this should point to a [manifest file].
+Otherwise, this should point to a directory containing preprocessed reads. This
+option must be specified.
+
+</td></tr><tr><td id="cb-local-output">
+
+[`--output`]: #cb-local-output
+
+ --output <path>
+
+</td><td>
+
+Local path where the output is to be deposited. If [`--just-preprocess`] is
+specified, the output consists of the preprocessed reads. Otherwise, the output
+consists of the SNP calls calculated by SOAPsnp for each chromosome, organized
+as one file per chromosome. This option must be specified.
+
+</td></tr><tr><td id="cb-local-intermediate">
+
+[`--intermediate`]: #cb-local-intermediate
+
+ --intermediate <path>
+
+</td><td>
+
+Local path where all intermediate results should be kept temporarily (or
+permanently, if [`--keep-intermediates`] or [`--keep-all`] are specified).
+Default: `/tmp/crossbow/intermediate/<PID>`.
+
+</td></tr><tr><td id="cb-local-preprocess-output">
+
+[`--preprocess-output`]: #cb-local-preprocess-output
+
+ --preprocess-output <path>
+
+</td><td>
+
+Local path where the preprocessed reads should be stored. This can be useful if
+you later want to run Crossbow on the same input reads without having to re-run
+the preprocessing step (i.e. leaving [`--preprocess`] unspecified).
+
+</td></tr><tr><td id="cb-local-keep-intermediates">
+
+[`--keep-intermediates`]: #cb-local-keep-intermediates
+
+ --keep-intermediates
+
+</td><td>
+
+Keep intermediate directories and files, i.e. the output from all stages prior
+to the final stage. By default these files are deleted as soon as possible.
+
+</td></tr><tr><td id="cb-local-keep-all">
+
+[`--keep-all`]: #cb-local-keep-all
+
+ --keep-all
+
+</td><td>
+
+Keep all temporary files generated during the process of binning and sorting
+data records and moving them from stage to stage, as well as all intermediate
+results. By default these files are deleted as soon as possible.
+
+</td></tr><tr><td id="cb-local-cpus">
+
+[`--cpus`]: #cb-local-cpus
+
+ --cpus <int>
+
+</td><td>
+
+The maximum number of processors to use at any given time during the job.
+Crossbow will try to make maximal use of the processors allocated. Default: 1.
+
+</td></tr><tr><td id="cb-local-max-sort-records">
+
+[`--max-sort-records`]: #cb-local-max-sort-records
+
+ --max-sort-records <int>
+
+</td><td>
+
+Maximum number of records to be dispatched to the sort routine at one time when
+sorting bins before each reduce step. For each child process, this number is
+effectively divided by the number of CPUs used ([`--cpus`]). The default is
+200000.
+
+</td></tr><tr><td id="cb-local-max-sort-files">
+
+[`--max-sort-files`]: #cb-local-max-sort-files
+
+ --max-sort-files <int>
+
+</td><td>
+
+Maximum number of files that can be opened at once by the sort routine when
+sorting bins before each reduce step. For each child process, this number is
+effectively divided by the number of CPUs used ([`--cpus`]). The default is 40.
+
+</td></tr><tr><td id="cb-local-bowtie">
+
+[`--bowtie`]: #cb-local-bowtie
+
+ --bowtie <path>
+
+</td><td>
+
+Path to the Bowtie executable to use when running the Align step. This
+overrides all other ways that Crossbow searches for `bowtie`, including the
+`CROSSBOW_BOWTIE_HOME` environment variable, the subdirectories of the
+`$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+</td></tr><tr><td id="cb-local-fastq-dump">
+
+[`--fastq-dump`]: #cb-local-fastq-dump
+
+ --fastq-dump <path>
+
+</td><td>
+
+Path to the directory containing the programs in the [SRA toolkit], including
+`fastq-dump`. This overrides all other ways that Crossbow searches for
+`fastq-dump`, including the `CROSSBOW_SRATOOLKIT_HOME` environment variable, the
+subdirectories of the `$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+</td></tr><tr><td id="cb-local-soapsnp">
+
+[`--soapsnp`]: #cb-local-soapsnp
+
+ --soapsnp <path>
+
+</td><td>
+
+Path to the SOAPsnp executable to use when running the Call SNPs step. This
+overrides all other ways that Crossbow searches for `soapsnp`, including the
+`CROSSBOW_SOAPSNP_HOME` environment variable, the subdirectories of the
+`$CROSSBOW_HOME/bin` directory, and the `PATH`.
+
+</td></tr>
+
+</table>
+
+# General Crossbow options
+
+The following options can be specified regardless of what mode ([EMR],
+[Hadoop] or local) Crossbow is run in.
+
+<table>
+
+<tr><td id="cb-quality">
+
+[`--quality`]: #cb-quality
+
+ --quality { phred33 | phred64 | solexa64 }
+
+</td><td>
+
+Treat all input reads as having the specified quality encoding. `phred33`
+denotes the [Phred+33] or "Sanger" format whereby ASCII values 33-126 are used
+to encode qualities on the [Phred scale]. `phred64` denotes the [Phred+64] or
+"Illumina 1.3+" format whereby ASCII values 64-126 are used to encode qualities
+on the [Phred scale]. `solexa64` denotes the [Solexa+64] or "Solexa/Illumina
+1.0" format whereby ASCII values 59-126 are used to encode qualities on a
+[log-odds scale] that includes values as low as -5. Default: `phred33`.
+
+[Phred scale]: http://en.wikipedia.org/wiki/Phred_quality_score
+[Phred+33]: http://en.wikipedia.org/wiki/FASTQ_format#Encoding
+[Phred+64]: http://en.wikipedia.org/wiki/FASTQ_format#Encoding
+[Solexa+64]: http://en.wikipedia.org/wiki/FASTQ_format#Encoding
+[log-odds scale]: http://en.wikipedia.org/wiki/FASTQ_format#Variations
+
+</td></tr><tr><td id="cb-preprocess">
+
+[`--preprocess`]: #cb-preprocess
+
+ --preprocess
+
+</td><td>
+
+The input path or URL refers to a [manifest file] rather than a directory of
+preprocessed reads. The first step in the Crossbow computation will be to
+preprocess the reads listed in the [manifest file] and store the preprocessed
+reads in the intermediate directory or in the `--preprocess-output` directory if
+it's specified. Default: off.
+
+[manifest file]: #manifest-files
+
+</td></tr><tr><td id="cb-just-preprocess">
+
+[`--just-preprocess`]: #cb-just-preprocess
+
+ --just-preprocess
+
+</td><td>
+
+The input path or URL refers to a [manifest file] rather than a directory of
+preprocessed reads. Crossbow will preprocess the reads listed in the [manifest
+file] and store the preprocessed reads in the `--output` directory and quit.
+Default: off.
+
+[manifest file]: #manifest-files
+
+</td></tr><tr><td id="cb-just-align">
+
+[`--just-align`]: #cb-just-align
+
+ --just-align
+
+</td><td>
+
+Instead of running the Crossbow pipeline all the way through to the end, run the
+pipeline up to and including the align stage and store the results in the
+[`--output`] URL. To resume the run later, use [`--resume-align`].
+
+</td></tr><tr><td id="cb-resume-align">
+
+[`--resume-align`]: #cb-resume-align
+
+ --resume-align
+
+</td><td>
+
+Resume the Crossbow pipeline from just after the alignment stage. The
+[`--input`] URL must point to an [`--output`] URL from a previous run using
+[`--just-align`].
+
+</td></tr><tr><td id="cb-bowtie-args">
+
+[`--bowtie-args`]: #cb-bowtie-args
+
+ --bowtie-args "<args>"
+
+</td><td>
+
+Pass the specified arguments to [Bowtie] for the Align stage. Default: [`-M
+1`]. See the [Bowtie manual] for details on what options are available.
+
+[`-M 1`]: http://bowtie-bio.sf.net/manual.shtml#bowtie-options-M
+[Bowtie manual]: http://bowtie-bio.sf.net/manual.shtml
+
+</td></tr><tr><td id="cb-discard-reads">
+
+[`--discard-reads`]: #cb-discard-reads
+
+ --discard-reads <fraction>
+
+</td><td>
+
+Randomly discard a fraction of the input reads. E.g. specify `0.5` to discard
+50%. This applies to all input reads regardless of type (paired vs. unpaired)
+or length. This can be useful for debugging. Default: 0.0.
+
+</td></tr><tr><td id="cb-discard-ref-bins">
+
+[`--discard-ref-bins`]: #cb-discard-ref-bins
+
+ --discard-ref-bins <fraction>
+
+</td><td>
+
+Randomly discard a fraction of the reference bins prior to SNP calling. E.g.
+specify `0.5` to discard 50% of the reference bins. This can be useful for
+debugging. Default: 0.0.
+
+</td></tr><tr><td id="cb-discard-all">
+
+[`--discard-all`]: #cb-discard-all
+
+ --discard-all <fraction>
+
+</td><td>
+
+Equivalent to setting [`--discard-reads`] and [`--discard-ref-bins`] to
+`<fraction>`. Default: 0.0.
+
+</td></tr><tr><td id="cb-soapsnp-args">
+
+[`--soapsnp-args`]: #cb-soapsnp-args
+
+ --soapsnp-args "<args>"
+
+</td><td>
+
+Pass the specified arguments to [SOAPsnp] in the SNP calling stage. These
+options are passed to SOAPsnp regardless of whether the reference sequence under
+consideration is diploid or haploid. Default: `-2 -u -n -q`. See the [SOAPsnp
+manual] for details on what options are available.
+
+[SOAPsnp manual]: http://soap.genomics.org.cn/soapsnp.html
+
+</td></tr><tr><td id="cb-soapsnp-hap-args">
+
+[`--soapsnp-hap-args`]: #cb-soapsnp-hap-args
+
+ --soapsnp-hap-args "<args>"
+
+</td><td>
+
+Pass the specified arguments to [SOAPsnp] in the SNP calling stage when the
+reference sequence under consideration is haploid. Default: `-r 0.0001`. See
+the [SOAPsnp manual] for details on what options are available.
+
+</td></tr><tr><td id="cb-soapsnp-dip-args">
+
+[`--soapsnp-dip-args`]: #cb-soapsnp-dip-args
+
+ --soapsnp-dip-args "<args>"
+
+</td><td>
+
+Pass the specified arguments to [SOAPsnp] in the SNP calling stage when the
+reference sequence under consideration is diploid. Default: `-r 0.00005 -e
+0.0001`. See the [SOAPsnp manual] for details on what options are available.
+
+</td></tr><tr><td id="cb-haploids">
+
+[`--haploids`]: #cb-haploids
+
+ --haploids <chromosome-list>
+
+</td><td>
+
+The specified comma-separated list of chromosome names are to be treated as
+haploid by SOAPsnp. The rest are treated as diploid. Default: all chromosomes
+are treated as diploid.
+
+</td></tr><tr><td id="cb-all-haploids">
+
+[`--all-haploids`]: #cb-all-haploids
+
+ --all-haploids
+
+</td><td>
+
+If specified, all chromosomes are treated as haploid by SOAPsnp.
+
+</td></tr><tr><td id="cb-partition-len">
+
+[`--partition-len`]: #cb-partition-len
+
+ --partition-len <int>
+
+</td><td>
+
+The bin size to use when binning alignments into partitions prior to SNP
+calling. If load imbalance occurs in the SNP calling step (some tasks taking
+far longer than others), try decreasing this. Default: 1,000,000.
+
+</td></tr><tr><td id="cb-dry-run">
+
+[`--dry-run`]: #cb-dry-run
+
+ --dry-run
+
+</td><td>
+
+Just generate a script containing the commands needed to launch the job, but
+don't run it. The script's location will be printed so that you may run it
+later.
+
+</td></tr>
+
+<tr><td id="cb-test">
+
+[`--test`]: #cb-test
+
+ --test
+
+</td><td>
+
+Instead of running Crossbow, just search for the supporting tools ([Bowtie] and
+[SOAPsnp]) and report whether and how they were found. If running in Cloud Mode,
+this just tests whether the `elastic-mapreduce` script is locatable and
+runnable. Use this option to debug your local Crossbow installation.
+
+</td></tr><tr><td id="cb-tempdir">
+
+[`--tempdir`]: #cb-tempdir
+
+ --tempdir <path>
+
+</td><td>
+
+Local directory where temporary files (e.g. dynamically generated scripts)
+should be deposited. Default: `/tmp/Crossbow/invoke.scripts`.
+
+</td></tr>
+</table>
+
+# Crossbow examples
+
+The following subsections guide you step-by-step through examples included with
+the Crossbow package. Because reads (and sometimes reference jars) must be
+obtained over the Internet, running these examples requires an active Internet
+connection.
+
+## E. coli (small)
+
+Data for this example is taken from the study by [Parkhomchuk et al].
+
+[Parkhomchuk et al]: http://www.pnas.org/content/early/2009/11/19/0906681106.abstract
+
+### EMR
+
+<div id="cb-example-e-coli-emr" />
+
+#### Via web interface
+
+Identify an [S3] bucket to hold the job's input and output. You may
+need to create an [S3 bucket] for this purpose. See your [S3 tool]'s
+documentation.
+
+[S3 tool]: #s3-tools
+[S3 bucket]: http://docs.amazonwebservices.com/AmazonS3/latest/index.html?UsingBucket.html
+
+Use an [S3 tool] to upload `$CROSSBOW_HOME/example/e_coli/small.manifest` to
+the `example/e_coli` subdirectory in your bucket. You can do so with this
+[s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+
+Direct your web browser to the [Crossbow web interface] and fill in the form as
+below (substituting for `<YOUR-BUCKET>`):
+
+<div>
+<img src="images/AWS_cb_e_coli_fillin.png" alt="" />
+<p><i>Crossbow web form filled in for the small E. coli example.</i></p>
+</div>
+
+1. For **AWS ID**, enter your AWS Access Key ID
+2. For **AWS Secret Key**, enter your AWS Secret Access Key
+3. *Optional*: For **AWS Keypair name**, enter the name of
+ your AWS keypair. This is only necessary if you would like to be
+ able to [ssh] into the [EMR] cluster while it runs.
+4. *Optional*: Check that the AWS ID and Secret Key entered are
+ valid by clicking the "Check credentials..." link
+5. For **Job name**, enter `Crossbow-Ecoli`
+6. Make sure that **Job type** is set to "Crossbow"
+7. For **Input URL**, enter
+ `s3n://<YOUR-BUCKET>/example/e_coli/small.manifest`, substituting
+ for `<YOUR-BUCKET>`
+8. *Optional*: Check that the Input URL exists by clicking the
+ "Check that input URL exists..." link
+9. For **Output URL**, enter
+ `s3n://<YOUR-BUCKET>/example/e_coli/output_small`, substituting for
+ `<YOUR-BUCKET>`
+10. *Optional*: Check that the Output URL does not exist by
+ clicking the "Check that output URL doesn't exist..." link
+11. For **Input type**, select "Manifest file"
+12. For **Genome/Annotation**, select "E. coli" from the drop-down
+ menu
+13. For **Chromosome ploidy**, select "All are haploid"
+14. Click Submit
+
+This job typically takes about 30 minutes on 1 `c1.xlarge` [EC2] node. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/e_coli/output_small` directory.
+
+[ssh]: http://en.wikipedia.org/wiki/Secure_Shell
+
+#### Via command line
+
+Test your Crossbow installation by running:
+
+ $CROSSBOW_HOME/cb_emr --test
+
+This will warn you if any supporting tools (`elastic-mapreduce` in this case)
+cannot be located or run.
+
+Identify an [S3] bucket to hold the job's input and output. You may need to
+create an [S3 bucket] for this purpose. See your [S3 tool]'s documentation.
+
+[S3 tool]: #s3-tools
+
+Use your [S3 tool] to upload `$CROSSBOW_HOME/example/e_coli/small.manifest` to
+the `example/e_coli` subdirectory in your bucket. You can do so with this
+[s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+
+Start the [EMR] job with the following command (substituting for
+`<YOUR-BUCKET>`):
+
+ $CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Ecoli" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/e_coli/small.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/e_coli/output_small \
+ --reference=s3n://crossbow-refs/e_coli.jar \
+ --all-haploids
+
+The `--reference` option instructs Crossbow to use a pre-built reference jar at
+URL `s3n://crossbow-refs/e_coli.jar`. The [`--preprocess`] option instructs
+Crossbow to treat the input as a [manifest file], rather than a directory of
+already-preprocessed reads. As the first stage of the pipeline, Crossbow
+downloads files specified in the manifest file and preprocesses them into
+Crossbow's read format. [`--output`] specifies where the final output is placed.
+
+This job typically takes about 30 minutes on 1 `c1.xlarge` [EC2] node. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/e_coli/output_small` directory.
+
+[Monitoring your EMR jobs]: #monitoring-your-emr-jobs
+
+### Hadoop
+
+<div id="cb-example-e-coli-hadoop" />
+
+Log into the [Hadoop] master node and test your Crossbow installation by running:
+
+ $CROSSBOW_HOME/cb_hadoop --test
+
+This will tell you if any of the supporting tools or packages are missing on the
+master. *You must also ensure* that the same tools are installed in the same
+paths on all slave nodes, and are runnable by the slaves.
+
+From the master, download the file named `e_coli.jar` from the following URL:
+
+ http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+E.g. with this command:
+
+ wget http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+Equivalently, you can use an [S3 tool] to download the same file from this URL:
+
+ s3n://crossbow-refs/e_coli.jar
+
+E.g. with this [s3cmd] command:
+
+ s3cmd get s3://crossbow-refs/e_coli.jar
+
+Install `e_coli.jar` in [HDFS] (the [Hadoop] distributed filesystem) with the
+following commands. If the `hadoop` script is not in your `PATH`, either add it
+to your `PATH` (recommended) or specify the full path to the `hadoop` script in
+the following commands.
+
+ hadoop dfs -mkdir /crossbow-refs
+ hadoop dfs -put e_coli.jar /crossbow-refs/e_coli.jar
+
+The first creates a directory in [HDFS] (you will see a warning message if the
+directory already exists) and the second copies the local jar files into that
+directory. In this example, we deposit the jars in the `/crossbow-refs`
+directory, but any [HDFS] directory is fine.
+
+Remove the local `e_coli.jar` file to save space. E.g.:
+
+ rm -f e_coli.jar
+
+Next install the [manifest file] in [HDFS]:
+
+ hadoop dfs -mkdir /crossbow/example/e_coli
+ hadoop dfs -put $CROSSBOW_HOME/example/e_coli/small.manifest /crossbow/example/e_coli/small.manifest
+
+Now start the job by running:
+
+ $CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/e_coli/small.manifest \
+ --output=hdfs:///crossbow/example/e_coli/output_small \
+ --reference=hdfs:///crossbow-refs/e_coli.jar \
+ --all-haploids
+
+The [`--preprocess`] option instructs Crossbow to treat the input as a [manifest
+file]. As the first stage of the pipeline, Crossbow will download the files
+specified on each line of the manifest file and preprocess them into Crossbow's
+read format. The [`--reference`] option specifies the location of the reference
+jar contents. The [`--output`] option specifies where the final output is
+placed.
+
+### Single computer
+
+<div id="cb-example-e-coli-local" />
+
+Test your Crossbow installation by running:
+
+ $CROSSBOW_HOME/cb_local --test
+
+This will warn you if any supporting tools (`bowtie` and `soapsnp` in this case)
+cannot be located or run.
+
+If you don't already have a `CROSSBOW_REFS` directory, choose one; it will be
+the default path Crossbow searches for reference jars. Permanently set the
+`CROSSBOW_REFS` environment variable to the selected directory.
+
+Create a subdirectory called `$CROSSBOW_REFS/e_coli`:
+
+ mkdir $CROSSBOW_REFS/e_coli
+
+Download `e_coli.jar` from the following URL to the new `e_coli` directory:
+
+ http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+E.g. with this command:
+
+ wget -O $CROSSBOW_REFS/e_coli/e_coli.jar http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+
+Equivalently, you can use an [S3 tool] to download the same file from this URL:
+
+ s3n://crossbow-refs/e_coli.jar
+
+E.g. with this [s3cmd] command:
+
+ s3cmd get s3://crossbow-refs/e_coli.jar $CROSSBOW_REFS/e_coli/e_coli.jar
+
+Change to the new `e_coli` directory and expand `e_coli.jar` using an `unzip` or
+`jar` utility:
+
+ cd $CROSSBOW_REFS/e_coli && unzip e_coli.jar
+
+Now you may remove `e_coli.jar` to save space:
+
+ rm -f $CROSSBOW_REFS/e_coli/e_coli.jar
+
+Now run Crossbow. Change to the `$CROSSBOW_HOME/example/e_coli` directory and
+start the job via the `cb_local` script:
+
+ cd $CROSSBOW_HOME/example/e_coli
+ $CROSSBOW_HOME/cb_local \
+ --input=small.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/e_coli \
+ --output=output_small \
+ --all-haploids \
+ --cpus=<CPUS>
+
+Substitute the number of CPUs you'd like to use for `<CPUS>`.
+
+The [`--preprocess`] option instructs Crossbow to treat the input as a [manifest
+file]. As the first stage of the pipeline, Crossbow will download the files
+specified on each line of the manifest file and "preprocess" them into a format
+understood by Crossbow. The [`--reference`] option specifies the location of
+the reference jar contents. The [`--output`] option specifies where the final
+output is placed. The [`--cpus`] option enables Crossbow to use up to the
+specified number of CPUs at any given time.
+
+[manifest file]: #manifest-files
+
+## Mouse chromosome 17 (large)
+
+Data for this example is taken from the study by [Sudbury, Stalker et al].
+
+[Sudbury, Stalker et al]: http://genomebiology.com/2009/10/10/R112
+
+### EMR
+
+<div id="cb-example-mouse17-emr" />
+
+#### Via web interface
+
+First we build a reference jar for a mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches for a
+`bowtie-build` executable with the same rules Crossbow uses to search for
+`bowtie`. See [Installing Crossbow] for details. Because one of the steps
+executed by the script builds a Bowtie index of the reference sequence, it
+should be run on a computer with plenty of memory (at least 4 gigabytes,
+preferably 6 or more).
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17 from
+ [dbSNP].
+4. Arrange this information in the directory structure expected by Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+
+Next, use an [S3 tool] to upload the `mm9_chr17.jar` file to the `crossbow-refs`
+subdirectory in your bucket. E.g. with this [s3cmd] command (substituting for
+`<YOUR-BUCKET>`):
+
+ s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+
+[S3 tool]: #s3-tools
+
+You may wish to remove the locally-generated reference jar files to save space.
+E.g.:
+
+ rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+
+Use an [S3 tool] to upload `$CROSSBOW_HOME/example/mouse17/full.manifest` to the
+`example/mouse17` subdirectory in your bucket. E.g. with this [s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+
+Direct your web browser to the [Crossbow web interface] and fill in the form as
+below (substituting for `<YOUR-BUCKET>`):
+
+<div>
+<img src="images/AWS_cb_mouse17_fillin.png" alt="" />
+<p><i>Crossbow web form filled in for the large Mouse Chromosome 17 example.</i></p>
+</div>
+
+1. For **AWS ID**, enter your AWS Access Key ID
+2. For **AWS Secret Key**, enter your AWS Secret Access Key
+3. *Optional*: For **AWS Keypair name**, enter the name of your AWS keypair.
+ This is only necessary if you would like to be able to [ssh] into the [EMR]
+ cluster while it runs.
+4. *Optional*: Check that the AWS ID and Secret Key entered are valid by
+ clicking the "Check credentials..." link
+5. For **Job name**, enter `Crossbow-Mouse17`
+6. Make sure that **Job type** is set to "Crossbow"
+7. For **Input URL**, enter
+ `s3n://<YOUR-BUCKET>/example/mouse17/full.manifest`, substituting for
+ `<YOUR-BUCKET>`
+8. *Optional*: Check that the Input URL exists by clicking the "Check that
+ input URL exists..." link
+9. For **Output URL**, enter `s3n://<YOUR-BUCKET>/example/mouse17/output_full`,
+ substituting for `<YOUR-BUCKET>`
+10. *Optional*: Check that the Output URL does not exist by clicking the "Check
+ that output URL doesn't exist..." link
+11. For **Input type**, select "Manifest file"
+12. For **Genome/Annotation**, check the box labeled "Specify reference jar
+ URL:" and enter `s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar` in the
+ text box below
+13. *Optional*: Check that the reference jar URL exists by clicking the "Check
+ that reference jar URL exists..." link
+14. For **Chromosome ploidy**, select "All are diploid"
+15. Click Submit
+
+This job typically takes about 45 minutes on 8 `c1.xlarge` [EC2] instances. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/mouse17/output_full` directory.
+
+[Monitoring your EMR jobs]: #monitoring-your-emr-jobs
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+[ssh]: http://en.wikipedia.org/wiki/Secure_Shell
+
+#### Via command line
+
+First we build a reference jar for a mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches for a
+`bowtie-build` executable with the same rules Crossbow uses to search for
+`bowtie`. See [Installing Crossbow] for details. Because one of the steps
+executed by the script builds a Bowtie index of the reference sequence, it
+should be run on a computer with plenty of memory (at least 4 gigabytes,
+preferably 6 or more).
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17 from
+ [dbSNP].
+4. Arrange this information in the directory structure expected by Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+
+Next, use an [S3 tool] to upload the `mm9_chr17.jar` file to the `crossbow-refs`
+subdirectory in your bucket. E.g. with this [s3cmd] command (substituting for
+`<YOUR-BUCKET>`):
+
+ s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+
+[S3 tool]: #s3-tools
+
+You may wish to remove the locally-generated reference jar files to save space.
+E.g.:
+
+ rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+
+Use an [S3 tool] to upload `$CROSSBOW_HOME/example/mouse17/full.manifest` to the
+`example/mouse17` subdirectory in your bucket. E.g. with this [s3cmd] command:
+
+ s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+
+To start the [EMR] job, run the following command (substituting for
+`<YOUR-BUCKET>`):
+
+ $CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Mouse17" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/mouse17/full.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/mouse17/output_full \
+ --reference=s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar \
+ --instances 8
+
+This job typically takes about 45 minutes on 8 `c1.xlarge` [EC2] instances. See
+[Monitoring your EMR jobs] for information on how to track job progress. To
+download the results, use an [S3 tool] to retrieve the contents of the
+`s3n://<YOUR-BUCKET>/example/mouse17/output_full` directory.
+
+[Monitoring your EMR jobs]: #monitoring-your-emr-jobs
+[Job Flow Debugging]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html
+
+### Hadoop
+
+<div id="cb-example-mouse17-hadoop" />
+
+First we build a reference jar for a mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches for a
+`bowtie-build` executable with the same rules Crossbow uses to search for
+`bowtie`. See [Installing Crossbow] for details. Because one of the steps
+executed by the script builds a Bowtie index of the reference sequence, it
+should be run on a computer with plenty of memory (at least 4 gigabytes,
+preferably 6 or more).
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17 from
+ [dbSNP].
+4. Arrange this information in the directory structure expected by Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+Next, use the `hadoop` script to put the `mm9_chr17.jar` file in the
+`crossbow-refs` [HDFS] directory. Note that if `hadoop` is not in your `PATH`,
+you must specify `hadoop`'s full path instead:
+
+ hadoop dfs -mkdir /crossbow-refs
+ hadoop dfs -put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar /crossbow-refs/mm9_chr17.jar
+
+The first command will yield a warning if the directory already exists; ignore
+this. In this example, we deposit the jars in the `/crossbow-refs` directory,
+but any [HDFS] directory is fine.
+
+You may wish to remove the locally-generated reference jar files to save space.
+E.g.:
+
+ rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+
+Now install the [manifest file] in [HDFS]:
+
+[manifest file]: #manifest-files
+
+ hadoop dfs -mkdir /crossbow/example/mouse17
+ hadoop dfs -put $CROSSBOW_HOME/example/mouse17/full.manifest /crossbow/example/mouse17/full.manifest
+
+To start the [Hadoop] job, run the following command:
+
+ $CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/mouse17/full.manifest \
+ --output=hdfs:///crossbow/example/mouse17/output_full \
+ --reference=hdfs:///crossbow-refs/mm9_chr17.jar
+
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+
+### Single computer
+
+<div id="cb-example-mouse17-local" />
+
+First we build a reference jar for a mouse chromosome 17 assembly and
+annotations using scripts included with Crossbow. The script searches
+for a `bowtie-build` executable with the same rules Crossbow uses to
+search for `bowtie`. See [Installing Crossbow] for details. Because
+one of the steps executed by the script builds a Bowtie index of the
+reference sequence, it should be run on a computer with plenty of
+memory (at least 4 gigabytes, preferably 6 or more).
+
+Run the following commands:
+
+ cd $CROSSBOW_HOME/reftools
+ ./mm9_chr17_jar
+
+The `mm9_chr17_jar` script will automatically:
+
+1. Download the FASTA sequence for mouse (build [mm9]) chromosome 17 from
+ [UCSC].
+2. Build an index from that FASTA sequence.
+3. Download the known SNPs and SNP frequencies for mouse chromosome 17
+ from [dbSNP].
+4. Arrange this information in the directory structure expected by
+ Crossbow.
+5. Package the information in a [jar file] named `mm9_chr17.jar`.
+
+Move the directory containing the new reference jar into the
+`$CROSSBOW_REFS` directory:
+
+ mv $CROSSBOW_HOME/reftools/mm9_chr17 $CROSSBOW_REFS/
+
+Now change to the `$CROSSBOW_HOME/example/mouse17` directory and run
+Crossbow (substitute the number of CPUs you'd like to use for
+`<CPUS>`):
+
+ cd $CROSSBOW_HOME/example/mouse17
+ $CROSSBOW_HOME/cb_local \
+ --input=$CROSSBOW_HOME/example/mouse17/full.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/mm9_chr17 \
+ --output=output_full \
+ --cpus=<CPUS>
+
+[UCSC]: http://hgdownload.cse.ucsc.edu/downloads.html
+
+# Manifest files
+
+A manifest file describes a set of [FASTQ] or [`.sra`] formatted input
+files that might be located:
+
+[gzip]: http://en.wikipedia.org/wiki/Gzip
+[bzip2]: http://en.wikipedia.org/wiki/Bzip2
+
+1. On the local computer
+2. In [HDFS]
+3. In [S3]
+4. On an FTP or web server
+
+[FASTQ]: http://en.wikipedia.org/wiki/FASTQ_format
+
+A manifest file can contain any combination of URLs and local paths from these
+various types of sources.
+
+[FASTQ] files can be gzip or bzip2-compressed (i.e. with `.gz` or `.bz2` file
+extensions). If [`.sra`] files are specified in the manifest and Crossbow is
+being run in single-computer or [Hadoop] modes, then the `fastq-dump` tool must
+be installed and Crossbow must be able to locate it. See the [`--fastq-dump`]
+option and the [SRA Toolkit section of the manual].
+
+[SRA Toolkit section of the manual]: #the-fastq-dump
+
+Each line in the manifest file represents either one file, for unpaired input
+reads, or a pair of files, for paired input reads. For a set of unpaired input
+reads, the line is formatted:
+
+ URL(tab)Optional-MD5
+
+Specifying an MD5 for the input file is optional. If it is specified, Crossbow
+will attempt to check the integrity of the file after downloading by comparing
+the observed MD5 to the user-provided MD5. To disable this checking, specify `0`
+in this field.
+
+For a set of paired input reads, the line is formatted:
+
+ URL-1(tab)Optional-MD5-1(tab)URL-2(tab)Optional-MD5-2
+
+Where `URL-1` and `URL-2` point to input files with all the #1 mates in `URL-1`
+and all the #2 mates in `URL-2`. The entries in the files must be arranged so
+that pairs "line up" in parallel. This is commonly the way public paired-end
+FASTQ datasets, such as those produced by the [1000 Genomes Project], are
+formatted. Typically these file pairs end in suffixes `_1.fastq.gz` and
+`_2.fastq.gz`.
+
+[1000 Genomes Project]: http://www.1000genomes.org/page.php
+
+Manifest files may have comment lines, which must start with the hash (`#`)
+symbol, and blank lines. Such lines are ignored by Crossbow.
+
+For examples of manifest files, see the files ending in `.manifest` in
+the `$CROSSBOW_HOME/example/e_coli` and
+`$CROSSBOW_HOME/example/mouse17` directories.
+
+# Reference jars
+
+All information about a reference sequence needed by Crossbow is encapsulated in
+a "reference jar" file. A reference jar includes a set of FASTA files encoding
+the reference sequences, a [Bowtie] index of the reference sequence, and a set
+of files encoding information about known SNPs for the species.
+
+A Crossbow reference jar is organized as:
+
+1. A `sequences` subdirectory containing one FASTA file per reference sequence.
+2. An `index` subdirectory containing the [Bowtie] index files for the reference
+ sequences.
+3. A `snps` subdirectory containing all of the SNP description files.
+
+The FASTA files in the `sequences` subdirectory must each be named `chrX.fa`,
+where `X` is the 0-based numeric id of the chromosome or sequence in the file.
+For example, for a human reference, chromosome 1's FASTA file could be named
+`chr0.fa`, chromosome 2 named `chr1.fa`, etc, all the way up to chromosomes 22,
+X and Y, named `chr21.fa`, `chr22.fa` and `chr23.fa`. Also, the names of the
+sequences within the FASTA files must match the number in the file name. I.e.,
+the first line of the FASTA file `chr0.fa` must be `>0`.
+
+The index files in the `index` subdirectory must have the basename `index`.
+I.e., the index subdirectory must contain these files:
+
+ index.1.ebwt
+ index.2.ebwt
+ index.3.ebwt
+ index.4.ebwt
+ index.rev.1.ebwt
+ index.rev.2.ebwt
+
+The index must be built using the [`bowtie-build`] tool distributed with
+[Bowtie]. When `bowtie-build` is executed, the FASTA files specified on the
+command line must be listed in ascending order of numeric id. For instance, for
+a set of FASTA files encoding human chromosomes 1,2,...,22,X,Y as
+`chr0.fa`,`chr1.fa`,...,`chr21.fa`, `chr22.fa`,`chr23.fa`, the command for
+`bowtie-build` must list the FASTA files in that order:
+
+ bowtie-build chr0.fa,chr1.fa,...,chr23.fa index
+
+The SNP description files in the `snps` subdirectory must also have names that
+match the corresponding FASTA files in the `sequences` subdirectory, but with
+extension `.snps`. E.g. if the sequence file for human Chromosome 1 is named
+`chr0.fa`, then the SNP description file for Chromosome 1 must be named
+`chr0.snps`. SNP description files may be omitted for some or all chromosomes.
+
+The format of the SNP description files must match the format expected by
+[SOAPsnp]'s `-s` option. The format consists of 1 SNP per line, with the
+following tab-separated fields per SNP:
+
+1. Chromosome ID
+2. 1-based offset into chromosome
+3. Whether SNP has allele frequency information (1 = yes, 0 = no)
+4. Whether SNP is validated by experiment (1 = yes, 0 = no)
+5. Whether SNP is actually an indel (1 = yes, 0 = no)
+6. Frequency of A allele, as a decimal number
+7. Frequency of C allele, as a decimal number
+8. Frequency of T allele, as a decimal number
+9. Frequency of G allele, as a decimal number
+10. SNP id (e.g. a [dbSNP] id such as `rs9976767`)
+
+Once these three subdirectories have been created and populated, they can be
+combined into a single [jar file] with a command like this:
+
+[jar file]: http://en.wikipedia.org/wiki/JAR_(file_format)
+
+ jar cf ref-XXX.jar sequences snps index
+
+To use `ref-XXX.jar` with Crossbow, you must copy it to a location where it can
+be downloaded over the internet via HTTP, FTP, or S3. Once it is placed in such
+a location, make a note of its URL.
+
+[`bowtie-build`]: http://bowtie-bio.sourceforge.net/manual.shtml#indx
+[dbSNP]: http://www.ncbi.nlm.nih.gov/projects/SNP/
+
+## Building a reference jar using automatic scripts
+
+The `reftools` subdirectory of the Crossbow package contains scripts that assist
+in building reference jars, including scripts that handle the entire process of
+building reference jars for [hg18] (UCSC human genome build 18) and [mm9] (UCSC
+mouse genome build 9). The `db2ssnp` script combines SNP and allele frequency
+information from [dbSNP] to create a `chrX.snps` file for the `snps`
+subdirectory of the reference jar. The `db2ssnp_*` scripts drive the `db2ssnp`
+script for each chromosome in the [hg18] and [mm9] genomes. The `*_jar` scripts
+drive the entire reference-jar building process, including downloading reference
+FASTA files, building a Bowtie index, and using `db2ssnp` to generate the `.snps`
+files for [hg18] and [mm9].
+
+[hg18]: http://hgdownload.cse.ucsc.edu/downloads.html#human
+[mm9]: http://hgdownload.cse.ucsc.edu/downloads.html#mouse
+[dbSNP]: http://www.ncbi.nlm.nih.gov/projects/SNP/
+
+# Monitoring, debugging and logging
+
+## Single computer
+
+Single-computer runs of Crossbow are relatively easy to monitor and debug.
+Progress messages are printed to the console as the job runs. When there is a
+fatal error, Crossbow usually indicates exactly which log file on the local
+filesystem contains the relevant error message. Additional debugging is possible
+when intermediate and temporary files are kept rather than discarded; see
+[`--keep-intermediates`] and [`--keep-all`]. All output and logs are stored on
+the local filesystem; see [`--intermediate`](#cb-local-intermediate) and
+[`--output`](#cb-local-output) options.
+
+## Hadoop
+
+The simplest way to monitor Crossbow [Hadoop] jobs is via the Hadoop JobTracker.
+ The JobTracker is a web server that provides a point-and-click interface for
+monitoring jobs and reading output and other log files generated by those jobs,
+including after they've finished.
+
+When a job fails, you can often find the relevant error message by "drilling
+down" from the "step" level through the "job" level and "task" levels, and
+finally to the "attempt" level. To diagnose why an attempt failed, click
+through to the "stderr" ("standard error") log and scan for the relevant error
+message.
+
+See your version of Hadoop's documentation for details on how to use the web
+interface. Amazon has a brief document describing [How to Use the Hadoop User
+Interface], though some of the instructions are specific to clusters rented from
+Amazon. [Hadoop, the Definitive Guide] is also an excellent reference.
+
+[How to Use the Hadoop User Interface]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?UsingtheHadoopUserInterface.html
+[Hadoop, the Definitive Guide]: http://oreilly.com/catalog/9780596521981
+
+## EMR
+
+The recommended way to monitor EMR [Hadoop] jobs is via the [AWS Console]. The
+[AWS Console] allows you to see:
+
+1. The status for each job (e.g. "COMPLETED", "RUNNING" or "FAILED")
+2. The status for each step of each job
+3. How long a job has been running for and how many "compute units" have been
+ utilized so far.
+4. The exact Hadoop commands used to initiate each job step.
+5. The button for [Debugging Job Flows]
+
+<div>
+<img src="images/AWS_console.png" alt="Screen shot of AWS console with interface elements labeled" />
+<p><i>Screen shot of [AWS Console] interface with some relevant interface elements labeled</i></p>
+</div>
+
+The [AWS Console] also has a useful facility for [Debugging Job Flows], which is
+accessible via the "Debug" button on the "Elastic MapReduce" tab of the Console
+(labeled "5"). You must (a) have a [SimpleDB] account and (b) not have specified
+[`--no-emr-debug`] in order to use all of the [EMR Debug] interface's features:
+
+<div>
+<img src="images/AWS_console_debug.png" alt="Screen shot of AWS console debug interface" />
+<p><i>Screen shot of [EMR Debug] interface</i></p>
+</div>
+
+The debug interface is similar to Hadoop's JobTracker interface. When a job
+fails, you can often find the relevant error message by "drilling down" from the
+"job" level, through the "task" level, and finally to the "attempt" level. To
+diagnose why an attempt failed, click through to the "stderr" ("standard error")
+log and scan for the relevant error message.
+
+For more information, see Amazon's document on [Debugging Job Flows].
+
+[Debugging Job Flows]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html
+[EMR Debug]: http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html
+
+## AWS Management Console
+
+A simple way to monitor your EMR activity is via the [AWS Console]. The [AWS
+Console] summarizes current information regarding all your running [EC2] nodes
+and [EMR] jobs. Each job is listed in the "Amazon Elastic MapReduce" tab of the
+console, whereas individual [EC2] nodes are listed in the "Amazon EC2" tab.
+
+<div>
+<img src="images/AWS_console_upper_left.png" alt="Screen shot of AWS console tabs" />
+<p><i>Screen shot of [AWS console]; note tabs for "Amazon Elastic MapReduce" and "Amazon EC2"</i></p>
+</div>
+
+# Crossbow Output
+
+Once a Crossbow job completes successfully, the output is deposited in a
+`crossbow_results` subdirectory of the specified `--output` directory or URL.
+Within the `crossbow_results` subdirectory, results are organized as one gzipped
+result file per chromosome. E.g. if your run was against the [hg18] build of
+the human genome, the output files from your experiment will be named:
+
+ <output_url>/crossbow_results/chr1.gz
+ <output_url>/crossbow_results/chr2.gz
+ <output_url>/crossbow_results/chr3.gz
+ ...
+ <output_url>/crossbow_results/chr21.gz
+ <output_url>/crossbow_results/chr22.gz
+ <output_url>/crossbow_results/chrX.gz
+ <output_url>/crossbow_results/chrY.gz
+ <output_url>/crossbow_results/chrM.gz
+
+Each individual record is in the [SOAPsnp] output format. SOAPsnp's format
+consists of 1 SNP per line with several tab-separated fields per SNP. The
+fields are:
+
+1. Chromosome ID
+2. 1-based offset into chromosome
+3. Reference genotype
+4. Subject genotype
+5. Quality score of subject genotype
+6. Best base
+7. Average quality score of best base
+8. Count of uniquely aligned reads corroborating the best base
+9. Count of all aligned reads corroborating the best base
+10. Second best base
+11. Average quality score of second best base
+12. Count of uniquely aligned reads corroborating second best base
+13. Count of all aligned reads corroborating second best base
+14. Overall sequencing depth at the site
+15. Sequencing depth of just the paired alignments at the site
+16. Rank sum test P-value
+17. Average copy number of nearby region
+18. Whether the site is a known SNP from the file specified with `-s`
+
+Note that field 15 was added in Crossbow and is not output by unmodified SOAPsnp.
+
+For further details, see the [SOAPsnp] manual.
+
+# Other reading
+
+The [Crossbow paper] discusses the broad design philosophy of both [Crossbow]
+and [Myrna] and why cloud computing can be considered a useful trend for
+comparative genomics applications. The [Bowtie paper] discusses the alignment
+algorithm underlying [Bowtie].
+
+[Bowtie paper]: http://genomebiology.com/2009/10/3/R25
+[Crossbow]: http://bowtie-bio.sf.net/crossbow
+[Crossbow paper]: http://genomebiology.com/2009/10/11/R134
+
+For additional information regarding Amazon EC2, S3, EMR, and related
+services, see Amazon's [AWS Documentation]. Some helpful screencasts
+are posted on the [AWS Console] home page.
+
+[AWS Documentation]: http://aws.amazon.com/documentation/
+
+For additional information regarding Hadoop, see the [Hadoop web site] and
+[Cloudera's Getting Started with Hadoop] document. [Cloudera's training virtual
+machine] for [VMWare] is an excellent way to get acquainted with Hadoop without
+having to install it on a production cluster.
+
+[Cloudera's Getting Started with Hadoop]: http://www.cloudera.com/resource/getting_started_with_hadoop
+[Cloudera's training virtual machine]: http://www.cloudera.com/developers/downloads/virtual-machine/
+[VMWare]: http://www.vmware.com/
+[Hadoop web site]: http://hadoop.apache.org/
+
+# Acknowledgements
+
+[Crossbow] software is by [Ben Langmead] and [Michael C. Schatz].
+
+[Bowtie] software is by [Ben Langmead] and [Cole Trapnell].
+
+[SOAPsnp] is by Ruiqiang Li, Yingrui Li, Xiaodong Fang, Huanming Yang, Jian
+Wang, Karsten Kristiansen, and Jun Wang.
+
+[Ben Langmead]: http://faculty.jhsph.edu/default.cfm?faculty_id=2209&grouped=false&searchText=&department_id=3&departmentName=Biostatistics
+[Michael C. Schatz]: http://www.cbcb.umd.edu/~mschatz/
+[Cole Trapnell]: http://www.cs.umd.edu/~cole/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..138c0fe
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,167 @@
+all: doc bin package
+
+CROSSBOW_VERSION=$(shell cat VERSION)
+SF_BOWTIE_BASE=https://sourceforge.net/projects/bowtie-bio/files/bowtie
+SF_BOWTIE_MID=
+BOWTIE_VERSION=0.12.8
+MACOS_VERSION=10.6
+
+doc: doc/manual.html MANUAL
+.PHONY: doc
+
+doc/manual.html: MANUAL.markdown
+ echo "<h1>Table of Contents</h1>" > .tmp.head
+ pandoc -T "Crossbow $(CROSSBOW_VERSION) Manual" -B .tmp.head \
+ --css style.css -o $@ \
+ --from markdown --to HTML \
+ --table-of-contents $^
+
+MANUAL: MANUAL.markdown
+ perl doc/strip_markdown.pl < $^ > $@
+
+.PHONY: bin32
+bin32: bin/linux32/bowtie \
+ bin/linux32/bowtie-build \
+ bin/linux32/bowtie-debug \
+ bin/linux32/bowtie-build-debug \
+ bin/linux32/soapsnp \
+ bin/linux32/soapsnp-debug \
+ bin/mac32/bowtie \
+ bin/mac32/bowtie-build \
+ bin/mac32/bowtie-debug \
+ bin/mac32/bowtie-build-debug \
+ bin/mac32/soapsnp \
+ bin/mac32/soapsnp-debug
+
+.PHONY: bin64
+bin64: bin/linux64/bowtie \
+ bin/linux64/bowtie-build \
+ bin/linux64/bowtie-debug \
+ bin/linux64/bowtie-build-debug \
+ bin/linux64/soapsnp \
+ bin/linux64/soapsnp-debug \
+ bin/mac64/bowtie \
+ bin/mac64/bowtie-build \
+ bin/mac64/bowtie-debug \
+ bin/mac64/bowtie-build-debug \
+ bin/mac64/soapsnp \
+ bin/mac64/soapsnp-debug
+
+.PHONY: bin
+bin: bin64
+
+#bin/linux32/bowtie: bowtie-$(BOWTIE_VERSION)-linux-i386.zip
+# mkdir -p bin/linux32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie
+# mv bowtie-$(BOWTIE_VERSION)/bowtie $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+#bin/linux32/bowtie-build: bowtie-$(BOWTIE_VERSION)-linux-i386.zip
+# mkdir -p bin/linux32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build
+# mv bowtie-$(BOWTIE_VERSION)/bowtie-build $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+#bin/linux32/bowtie-debug: bowtie-$(BOWTIE_VERSION)-linux-i386.zip
+# mkdir -p bin/linux32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-debug
+# mv bowtie-$(BOWTIE_VERSION)/bowtie-debug $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+#bin/linux32/bowtie-build-debug: bowtie-$(BOWTIE_VERSION)-linux-i386.zip
+# mkdir -p bin/linux32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build-debug
+# mv bowtie-$(BOWTIE_VERSION)/bowtie-build-debug $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+
+bin/linux64/bowtie: bowtie-$(BOWTIE_VERSION)-linux-x86_64.zip
+ mkdir -p bin/linux64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie
+ mv bowtie-$(BOWTIE_VERSION)/bowtie $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+bin/linux64/bowtie-build: bowtie-$(BOWTIE_VERSION)-linux-x86_64.zip
+ mkdir -p bin/linux64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build
+ mv bowtie-$(BOWTIE_VERSION)/bowtie-build $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+bin/linux64/bowtie-debug: bowtie-$(BOWTIE_VERSION)-linux-x86_64.zip
+ mkdir -p bin/linux64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-debug
+ mv bowtie-$(BOWTIE_VERSION)/bowtie-debug $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+bin/linux64/bowtie-build-debug: bowtie-$(BOWTIE_VERSION)-linux-x86_64.zip
+ mkdir -p bin/linux64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build-debug
+ mv bowtie-$(BOWTIE_VERSION)/bowtie-build-debug $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+
+#bin/mac32/bowtie: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-i386.zip
+# mkdir -p bin/mac32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie
+# mv bowtie-$(BOWTIE_VERSION)/bowtie $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+#bin/mac32/bowtie-build: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-i386.zip
+# mkdir -p bin/mac32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build
+# mv bowtie-$(BOWTIE_VERSION)/bowtie-build $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+#bin/mac32/bowtie-debug: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-i386.zip
+# mkdir -p bin/mac32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-debug
+# mv bowtie-$(BOWTIE_VERSION)/bowtie-debug $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+#bin/mac32/bowtie-build-debug: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-i386.zip
+# mkdir -p bin/mac32
+# unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build-debug
+# mv bowtie-$(BOWTIE_VERSION)/bowtie-build-debug $@
+# rm -rf bowtie-$(BOWTIE_VERSION)
+
+
+bin/mac64/bowtie: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-x86_64.zip
+ mkdir -p bin/mac64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie
+ mv bowtie-$(BOWTIE_VERSION)/bowtie $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+bin/mac64/bowtie-build: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-x86_64.zip
+ mkdir -p bin/mac64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build
+ mv bowtie-$(BOWTIE_VERSION)/bowtie-build $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+bin/mac64/bowtie-debug: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-x86_64.zip
+ mkdir -p bin/mac64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-debug
+ mv bowtie-$(BOWTIE_VERSION)/bowtie-debug $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+bin/mac64/bowtie-build-debug: bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-x86_64.zip
+ mkdir -p bin/mac64
+ unzip $^ bowtie-$(BOWTIE_VERSION)/bowtie-build-debug
+ mv bowtie-$(BOWTIE_VERSION)/bowtie-build-debug $@
+ rm -rf bowtie-$(BOWTIE_VERSION)
+
+# SourceForge URLs end in ".../download"; -O $@ stores the archive under the target's own name
+bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-i386.zip:
+	wget --no-check-certificate -O $@ $(SF_BOWTIE_BASE)$(SF_BOWTIE_MID)/$(BOWTIE_VERSION)/bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-i386.zip/download || { rm -f $@; exit 1; }
+
+bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-x86_64.zip:
+	wget --no-check-certificate -O $@ $(SF_BOWTIE_BASE)$(SF_BOWTIE_MID)/$(BOWTIE_VERSION)/bowtie-$(BOWTIE_VERSION)-macos-$(MACOS_VERSION)-x86_64.zip/download || { rm -f $@; exit 1; }
+
+bowtie-$(BOWTIE_VERSION)-linux-i386.zip:
+	wget --no-check-certificate -O $@ $(SF_BOWTIE_BASE)$(SF_BOWTIE_MID)/$(BOWTIE_VERSION)/bowtie-$(BOWTIE_VERSION)-linux-i386.zip/download || { rm -f $@; exit 1; }
+
+bowtie-$(BOWTIE_VERSION)-linux-x86_64.zip:
+	wget --no-check-certificate -O $@ $(SF_BOWTIE_BASE)$(SF_BOWTIE_MID)/$(BOWTIE_VERSION)/bowtie-$(BOWTIE_VERSION)-linux-x86_64.zip/download || { rm -f $@; exit 1; }
+
+.PHONY: package
+package: bin
+ bash util/package.bash
diff --git a/MapWrap.pl b/MapWrap.pl
new file mode 100755
index 0000000..7be2884
--- /dev/null
+++ b/MapWrap.pl
@@ -0,0 +1,296 @@
+#!/usr/bin/perl
+
+##
+# MapWrap.pl
+#
+# Simple wrapper that mimics some of Hadoop's behavior during the
+# Map step of a MapReduce computation.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use FindBin qw($Bin);
+use lib $Bin;
+use lib "$Bin/contrib";
+use Cwd 'abs_path';
+use ForkManager;
+use Wrap;
+use File::Path qw(mkpath);
+use POSIX qw/strftime/;
+
+my $name = "";
+my $stage = -1;
+my $numStages = -1;
+my $nmap = 1;
+my $input = "";
+my $output = "";
+my $intermediate = "";
+my $lineByLine = 0;
+my $silentSkipping = 0;
+my $force = 0;
+my $keep = 0;
+my $verbose = 0;
+my $retries = 3;
+my $delay = 5;
+my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
+
+my $support = qq!
+When requesting support, please include the full output printed here.
+If a child process was the cause of the error, the output should
+include the relevant error message from the child's error log. You may
+be asked to provide additional files as well.
+!;
+
+##
+# Write one line to STDERR and, if a message log is open, to $msgfh too.
+#
+my $msgfn = "";
+my $msgfh = undef;
+sub msg($) {
+    (my $line = $_[0]) =~ s/[\r\n]*$//; # drop any trailing CR/LF run
+    $line .= "\n";                      # then terminate with exactly one newline
+    print STDERR $line;
+    print {$msgfh} $line if defined($msgfh);
+}
+
+##
+# Write one counter line to STDERR and, if a counters file is open, to $cntfh too.
+#
+my $cntfn = "";
+my $cntfh = undef;
+sub cnt($) {
+    (my $line = $_[0]) =~ s/[\r\n]*$//; # drop any trailing CR/LF run
+    $line .= "\n";                      # then terminate with exactly one newline
+    print STDERR $line;
+    print {$cntfh} $line if defined($cntfh);
+}
+
+##
+# Report a fatal error (tagged with the version and exit level), print the
+# support notice, then end the process with the given exit level.
+#
+sub mydie($$) {
+    my ($why, $code) = @_;
+    my $banner = "Fatal error $VERSION:M$code: $why";
+    msg($_) for ($banner, $support);
+    exit $code;
+}
+
+GetOptions (
+ "name:s" => \$name,
+ "stage:i" => \$stage,
+ "num-stages:i" => \$numStages,
+ "mappers:i" => \$nmap,
+ "output:s" => \$output,
+ "messages:s" => \$msgfn,
+ "counters:s" => \$cntfn,
+ "intermediate:s" => \$intermediate,
+ "input:s" => \$input,
+ "retries:i" => \$retries,
+ "delay:i" => \$delay,
+ "force" => \$force,
+ "line-by-line" => \$lineByLine,
+ "silent-skipping" => \$silentSkipping,
+ "keep-all" => \$keep) || die "Bad option\n";
+
+if($msgfn ne "") { # --messages given: append to that file (3-arg open keeps the name literal)
+    open($msgfh, ">>", $msgfn) || mydie("Could not open message-out file $msgfn for writing", 15);
+}
+$input ne "" || mydie("Must specify input directory with --input", 10);
+$intermediate = "$output.map.pre" if $intermediate eq "";
+
+if($name ne "") {
+ msg("==========================");
+ msg("Stage $stage of $numStages. $name");
+ msg("==========================");
+}
+msg("Time: ".strftime('%H:%M:%S %d-%b-%Y', localtime));
+
+msg("=== Map ===");
+msg("# parallel mappers: $nmap");
+msg("Input: $input");
+msg("Output: $output");
+msg("Intermediate: $intermediate");
+msg("Retries / delay: $retries / $delay");
+msg("Options: [ " .
+ ($lineByLine ? "--line-by-line " : "").
+ ($keep ? "--keep-all " : "").
+ ($force ? "--force " : "")."]");
+
+# Make $dir a fresh, empty directory.  An existing directory is removed
+# first, which is only permitted when --force was given.
+sub checkDir($) {
+    my $dir = shift;
+    if(-d $dir) {
+        mydie("Output directory $dir already exists", 20) unless $force;
+        msg("Removing directory $dir due to --force");
+        system("rm", "-rf", $dir); # list form: no shell, so any characters in $dir are safe
+        -d $dir && mydie("Could not remove directory $dir", 30);
+    }
+    eval { mkpath($dir); 1 } || msg("mkpath failed: $@"); # mkpath croaks; report via mydie below
+    (-d $dir) || mydie("Could not create new directory $dir", 40);
+}
+
+checkDir($output);
+my $errDir = "$intermediate/map.err";
+checkDir($errDir);
+my $workingDir = "$intermediate/map.wds";
+checkDir($workingDir);
+if(defined($cntfn) && $cntfn ne "") {
+ open($cntfh, ">>", "$cntfn") || mydie("Could not open counters file $cntfn", 45);
+}
+
+my $cmd = join(" ", @ARGV);
+msg("Starting $nmap mappers with command:\n$cmd");
+
+my $pm = Parallel::ForkManager->new($nmap); # direct method call instead of the ambiguous indirect-object form
+
+# Setup a callback for when a child finishes up so we can
+# get its exit code
+my $childFailed = 0;
+my $childFailedPid = 0;
+$pm->run_on_finish(
+ sub {
+ my ($pid, $exit_code, $ident) = @_;
+ if($exit_code != 0) {
+ $childFailed = $exit_code;
+ $childFailedPid = $pid;
+ }
+ }
+);
+
+# Expand --input (comma-separated files and/or directories) into @inputs.
+# Without --line-by-line each entry is a file name; with it, each entry is
+# one line read from those files (.gz/.bz2 are decompressed on the fly).
+my @inputs = ();
+my $linewise = 0;
+for my $spec (split(/,/, $input)) {
+    # abs_path yields undef for a missing path, so report the name as given
+    my $inp = abs_path($spec);
+    (defined($inp) && (-d $inp || -f $inp)) || mydie("No such input file or directory as \"$spec\"", 50);
+    # A directory contributes every entry directly inside it
+    my @fs = (-d $inp) ? glob("$inp/*") : ($inp);
+    if($lineByLine) {
+        $linewise = 1;
+        for my $f (@fs) {
+            # Lexical handle and list-form pipe open: no shell parses $f
+            my $fh;
+            if($f =~ /\.gz$/) {
+                open($fh, "-|", "gzip", "-dc", $f) || mydie("Could not open pipe 'gzip -dc $f |'", 60);
+            } elsif($f =~ /\.bz2$/) {
+                open($fh, "-|", "bzip2", "-dc", $f) || mydie("Could not open pipe 'bzip2 -dc $f |'", 70);
+            } else {
+                open($fh, "<", $f) || mydie("Could not open $f for reading", 80);
+            }
+            while(my $line = <$fh>) {
+                # --silent-skipping drops blank lines and '#' comment lines
+                next if $silentSkipping && ($line =~ /^\s*$/ || $line =~ /^#/);
+                push @inputs, $line;
+            }
+            # close() fails when a decompressor exits non-zero ($? is then set)
+            close($fh) || mydie("Bad exitlevel from input slurp: $?", 90);
+        }
+    } else {
+        push @inputs, @fs;
+    }
+}
+
+# Map from PIDs to the file(s) where the error message is likely to be
+# if and when they fail
+my %pidToErrfiles = ();
+my %pidToInputs = ();
+my $alreadyDumped = 0;
+sub failDump() { # print the failed child's pid, input and error log; runs at most once
+    return if $alreadyDumped;
+    msg("******");
+    msg("* Aborting master loop because child $childFailedPid failed");
+    msg("* (other children may also have failed)");
+    msg("* Input file or string was:");
+    msg("* $pidToInputs{$childFailedPid}:");
+    msg("* Error message is in file: ".$pidToErrfiles{$childFailedPid}.", also printed below");
+    msg("******");
+    if(open(my $err, "<", $pidToErrfiles{$childFailedPid})) { # 3-arg open, lexical handle
+        while(my $line = <$err>) { msg("* $line"); }
+        close($err);
+    } else {
+        msg("* (could not open)");
+    }
+    msg("******");
+    $alreadyDumped = 1;
+}
+
+my $fi = 0; # 1-based index of the input being dispatched, used in progress messages
+for my $f (@inputs) {
+ $fi++;
+ if($childFailed) { failDump(); last; } # stop launching work once a child has reported failure
+ my $childPid = $pm->start; # non-zero (the child's pid) in the parent, zero in the child
+ if($childPid != 0) {
+ # I'm the parent: remember where this child's stderr goes and what it was given
+ my $ofn = sprintf "map-%05d", $childPid; # same name the child derives from its own $$
+ $pidToErrfiles{$childPid} = "$errDir/$ofn";
+ $pidToInputs{$childPid} = "$f";
+ next; # spawn the next child
+ }
+ # I'm the child: run the mapper command on this one input
+ exit 0 if $childFailed;
+ chomp($f); # line-by-line inputs still carry their newline
+ msg("Pid $$ processing input $f [$fi of ".scalar(@inputs)."]...");
+ my $ofn = sprintf "map-%05d", $$;
+ my $redir = ">$output/$ofn 2>$errDir/$ofn"; # stdout to the output dir, stderr to the error dir
+ my $wd = "$workingDir/$$"; # per-child working directory, named by pid
+ mkpath($wd);
+ (-d $wd) || mydie("Could not create working directory $wd", 100);
+ chdir($wd) || mydie("Could not change to working directory $wd", 110);
+ for(my $i = 0; $i <= $retries; $i++) { # one initial attempt plus up to $retries retries
+ if($linewise) {
+ my $pipe = "| $cmd $redir"; # the input line itself is fed to the command on stdin
+ open(CMD, $pipe) || mydie("Could not open pipe '$pipe' for writing", 120);
+ print CMD "$f\n";
+ close(CMD);
+ if($? != 0) { # closing the pipe leaves the command's exit status in $?
+ msg("Non-zero return ($?) after closing pipe '$pipe'");
+ msg("Retrying in $delay seconds...");
+ sleep($delay);
+ next;
+ }
+ } else {
+ my $ret = 1;
+ my $fullcmd = "";
+ if($f =~ /\.gz$/) { # pick the decompressor from the file extension
+ $fullcmd = "gzip -dc $f | $cmd $redir";
+ } elsif($f =~ /\.bz2$/) {
+ $fullcmd = "bzip2 -dc $f | $cmd $redir";
+ } else {
+ $fullcmd = "cat $f | $cmd $redir";
+ }
+ $ret = system($fullcmd);
+ if($ret != 0) {
+ msg("Non-zero return ($ret) after executing command '$fullcmd'");
+ msg("Retrying in $delay seconds...");
+ sleep($delay);
+ next;
+ }
+ }
+ $pm->finish; # only reached after a successful attempt; presumably ends the child here
+ }
+ mydie("Out of retries; aborting...", 130); # every attempt failed
+}
+msg("Aborting master loop because child failed") if $childFailed;
+$pm->wait_all_children; # let every child that was started complete before judging the outcome
+if($childFailed) {
+ failDump();
+ mydie("Aborting because child with PID $childFailedPid exited abnormally", 140);
+} else {
+ msg("All children succeeded");
+}
+
+msg("-- Map counters --");
+Wrap::getAndPrintLocalCounters($errDir, \&msg);
+Wrap::getAndPrintLocalCounters($errDir, \&cnt) if defined($cntfh);
+
+# No errors: discard the intermediate directory unless --keep-all was given
+unless($keep) {
+    msg("Removing $intermediate (to keep, specify --keep-all)");
+    system("rm", "-rf", $intermediate); # list form: the path reaches rm verbatim, no shell
+}
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..8fc20cf
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,225 @@
+Crossbow: Parallel short read genotyping in the cloud
+http://bowtie-bio.sf.net/crossbow
+
+Crossbow NEWS
+=============
+
+ Crossbow is now available for download. 0.1.0 was the first version
+to be released, and it was released under the OSI Artistic License (see
+LICENSE_ARTISTIC file) and freely available to the public for download.
+Portions of the Crossbow package are borrowed from other sources, and
+those portions are subject to other licenses (see LICENSES file). The
+current version is 1.2.0.
+
+Reporting Issues
+================
+
+Please report any issues using the Sourceforge bug tracker:
+
+ https://sourceforge.net/tracker/?group_id=236897&atid=1101606
+
+Version Release History
+=======================
+
+Version 1.2.0 - July 20, 2012
+ * Added support for Hadoop version 0.20.205.
+ * Dropped support for Hadoop versions prior to 0.20.
+ * Updated default Hadoop version for EMR jobs to 0.20.205.
+ * Updated Bowtie version used to 0.12.8.
+ * Fixed issues with streaming jar version parsing
+ * Fixed documentation bugs regarding --sra-toolkit option, which is
+ superseded by the --fastq-dump option.
+
+Version 1.1.2 - May 23, 2011
+ * Added --just-align and --resume-align options. --just-align
+ causes Crossbow to put the results of the Alignment phase in the
+ --output directory and quit after the alignment phase. You can
+ later "resume" Crossbow by specifying this directory as the
+ --input directory and specifying the --resume-align option.
+ * Fixed issue with .sra input whereby status output from fastq-dump
+ would be interpreted as a read.
+ * Other minor bugfixes.
+
+Version 1.1.1 - February 7, 2011
+ * Added support for the .sra file format, used by the Sequence Read
+ Archive. These files can now be specified in the manifest.
+ Crossbow uses the fastq-convert tool from the SRA Toolkit to
+ convert .sra files to .fastq files in the preprocess stage.
+ * The examples that included defunct SRA FASTQ files were updated to
+ point to new .sra files instead.
+
+Version 1.1.0 - October 11, 2010
+ * Added --discard-ref-bin and --discard-all options, which can be
+ helpful to reduce Crossbow running time when a run's chief purpose
+ is to test whether it runs all the way through.
+ * Fixed a bug in soapsnp that caused a segmentation fault in the
+ last partition of a chromosome when chromosome length is a
+ multiple of 64.
+ * Revamped the reference jar scripts (in $CROSSBOW_HOME/reftools).
+ The new scripts use Ensembl rather than UCSC & dbSNP. The old
+ scripts (db2ssnp* and *_jar) are still there, but are likely to be
+ deprecated soon.
+ * Fixed a few bugs in the hg19_jar and db2ssnp_hg19 scripts.
+ * Removed the hg18_jar script, which was broken by a reorganization
+ of the dbSNP site.
+ * Uses Bowtie 0.12.7 instead of 0.12.5.
+ * Switched Mouse17 example's manifest files back to use .gz
+ extension instead of .bz2.
+
+Version 1.0.9 - September 13, 2010
+ * Fixed example manifests that point to Short Read Archive files to
+ use .bz2 instead of .gz extensions.
+
+Version 1.0.8 - September 4, 2010
+ * Set the memory cap on the sort task to be inversely proportional
+ to --cpus, to avoid memory footprint blowup on computers with more
+ processors.
+ * Fixed a final issue that affected how Crossbow handles quality
+ value conversion.
+ * Fixed issue whereby bzip2'ed data would be handled incorrectly by
+ the preprocessor.
+ * Fixed counter in Preprocess step that would erroneously refer to
+ unpaired reads as paired. Also "Read data fetched to EC2" has
+ been changed to "Read data fetched".
+ * In EMR mode, updated where user credentials are found; Amazon
+ changed their path sometime around 8/30/2010.
+ * In EMR mode, updated the manner in which the bootstrap action is
+ specified; the old way was disabled by Amazon sometime around
+ 8/30/2010.
+ * Fixed issue whereby ReduceWrap.pl would crash in cases with a
+ large number of bins (>10 million) .
+ * NOTE: The Short Read Archive (SRA) seems to be in the midst of a
+ reorganization that includes files that were previously gzipped
+ being replaced with versions zipped with bzip2. The files will
+ sometimes disappear for a while. If you are having problems with
+ an example where input reads come from the SRA, try renaming files
+ in the manifest file as appropriate. If that doesn't work, please
+ contact us.
+
+Version 1.0.7 - August 27, 2010
+ * Fixed issue whereby the order of the arguments to bowtie would
+ result in a crash when POSIXLY_CORRECT is set.
+ * Fixed --keep-all option, which was causing a crash.
+ * Fixed a lingering quality bug whereby qualities were converted
+ immediately to phred33 but phred64 or solexa64 flags would be
+ spuriously passed to Bowtie.
+
+Version 1.0.6 - August 26, 2010
+ * Single-computer mode now copies the output that it writes to the
+ console to a file 'cb.local.(pid).out'. Please include the
+ contents of this file when reporting issues.
+ * Sorting in single-computer mode is now more portable; switched
+ from command-line sort to pure-Perl File::Sort.
+ * Fixed bug whereby the quality setting would be propagated to
+ Bowtie but not to SOAPsnp, causing SOAPsnp to operate with
+ incorrect (over-optimistic) quality values when Phred+64 or
+ Solexa+64 modes were used.
+ * More helpful output from MapWrap.pl and ReduceWrap.pl to make it
+ easier to debug issues in single-computer-mode runs.
+ * Fixed issue where web form would incorrectly convert + signs in
+ AWS secret key to spaces, causing some good credentials to fail
+ verification.
+ * Fixed issue in preprocessor that mishandles copies when user's AWS
+ secret key contains slash characters.
+
+Version 1.0.5 - August 15, 2010
+ * Fixed issue that prevented CROSSBOW_EMR_HOME environment variable
+ from working.
+ * Fixed issue whereby Align.pl script fails to report a count for
+ the number of reads with alignments sampled due to Bowtie's -M
+ option.
+ * Fixed issue whereby scripts in the $CROSSBOW_HOME/reftools
+ directory had `#!/bin/sh` headers but were actually bash scripts.
+ * Fixed issue that made it difficult to specify a space-separated
+ list of arguments to the --bowtie-args and other --*-args
+ parameters.
+ * Fixed issue whereby most documentation referred to arguments with
+ a single-dash prefix, whereas users with the POSIXLY_CORRECT
+ environment variable set must use a double-dash prefix.
+ Documentation and code have been updated to always use double-dash
+ prefixes.
+
+Major revision: Version 1.0.4 - July 21, 2010
+ * Crossbow has been largely rewritten as an Amazon Elastic MapReduce
+ (EMR) application, as opposed to an Elastic Compute Cloud (EC2)
+ application. EMR runs on top of EC2 and is a more appropriate way
+ to run Crossbow for several reasons, including:
+ + The AWS Console's Elastic MapReduce tab, together with EMR's
+ Debug Job Flow feature, provide a much friendlier interface for
+ monitoring and manipulating jobs.
+ + The elaborate scripts for automating cluster setup, teardown,
+ proxy connection, etc., are all gone. They are either
+ irrelevant now, or else are handled automatically by EMR.
+ * A web-based GUI for composing and submitting EMR jobs has been
+ added. Most helpfully, the web GUI has features for
+ sanity-checking inputs; e.g. whether the user's credentials as
+ entered are valid, whether the input URL exists, etc.
+ * Crossbow is now fully "tri-mode", with separate cloud, Hadoop, and
+ single-computer operating modes. All three modes share a great
+ deal of common infrastructure, making all three modes easier to
+ maintain going forward.
+ + Crossbow's Hadoop mode is now much improved, having an interface
+ very similar to cloud and single-computer modes.
+ + A new single-computer operating mode has been added that (a)
+ uses many processors/cores to shorten computation time, and (b)
+ does not require the user to have a cloud account or a Hadoop
+ installation. It also doesn't require Java; just appropriate
+ versions of Bowtie, SOAPsnp (some of which are included), Perl,
+ and other tools. Its interface is very similar to cloud and
+ Hadoop modes.
+ * The manual is entirely rewritten. It now contains information
+ about running in all three modes (cloud, Hadoop, single-computer),
+ and gives multiple examples for how to run in each mode.
+ * Fixed a bug whereby allele frequency columns in the provided
+ reference jars had T and G columns switched.
+ * SOAPsnp reduce step now outputs more counter and status
+ information.
+ * SOAPsnp reduce step outputs an additional column per SNP
+ indicating paired-end coverage.
+ * Compatible with Bowtie versions 0.12.0 and above. Bowtie 0.12.5
+ now included.
+ * Many other new options and features. See manual.
+
+Version 0.1.3 - October 21, 2009
+ * cb-local now gives the user clear feedback when worker nodes fail
+ to confirm the MD5 signature of the reference jar. If this
+ failure occurs several times per node across all nodes, the
+ supplied MD5 is probably incorrect.
+ * An extra Reduce step was added to the end of the Crossbow job to
+ bin and sort SNPs before they are downloaded to the user's computer. This
+ step also renames output files by chromosome and deletes empty
+ output files.
+ * Added another example that uses recently-published mouse
+ chromosome 17 data (sequenced by Sudbery et al). The TUTORIAL
+ file now points to this new example.
+ * More and clearer messages in the output from cb-local.
+
+Version 0.1.2 - October 12, 2009
+ * Many fixes for the scripts that automate the reference-jar
+ building process.
+ * Added two utility scripts, dist_mfa and sanity_check, to the
+ reftools subdirectory. See their documentation for details.
+ * Added scripts for building a reference jar for C. elegans using
+ UCSC's ce6 (WormBase's WS190) assembly and information from dbSNP.
+ This small genome is used in the new TUTORIAL.
+ * New TUTORIAL steps the user through preprocessing reads from the
+ NCBI Short Read Archive, creating a reference .jar from a UCSC
+ assembly (ce6 in this case) and a set of SNP descriptions from
+ dbSNP, then running Crossbow and examining the resulting SNPs.
+ * Extended the preprocess-and-copy infrastructure to allow output
+ from a single input file to be split over many output files. This
+ is critical for achieving good load balance across a breadth of
+ datasets.
+
+Version 0.1.1 - October 9, 2009
+ * Added scripts that automate the reference-jar building process for
+ UCSC genomes hg18 and mm9. These scripts can be adapted to other
+ species. See the new "Using Automatic Scripts" subsection of the
+ "Building a Reference Jar" section of the MANUAL for details.
+ * License agreement files are now organized better. All licenses
+ applying to all software included in Crossbow are in "LICENSE*"
+ files in the Crossbow root directory.
+ * Minor updates to MANUAL
+
+Version 0.1.0 - October 3, 2009
+ * First stable release of Crossbow.
diff --git a/ReduceWrap.pl b/ReduceWrap.pl
new file mode 100755
index 0000000..9607cfd
--- /dev/null
+++ b/ReduceWrap.pl
@@ -0,0 +1,459 @@
+#!/usr/bin/perl
+
+##
+# ReduceWrap.pl
+#
+# Simple wrapper that mimics some of Hadoop's behavior during the
+# Reduce step of a MapReduce computation.
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use FindBin qw($Bin);
+use lib $Bin;
+use lib "$Bin/contrib";
+use Cwd 'abs_path';
+use ForkManager;
+use Sort;
+use Wrap;
+use File::Path qw(mkpath);
+use POSIX qw/strftime/;
+use List::Util qw[min max];
+
+my $name = "";
+my $stage = -1;
+my $numStages = -1;
+my $nred = 1;
+my $ntasks = 1;
+my $input = "";
+my $output = "";
+my $intermediate = "";
+my $binFields = 0;
+my $sortFields = 0;
+my $sortSize = 0;
+my $maxRecords = 800000;
+my $maxFiles = 40;
+my $force = 0;
+my $keep = 0;
+my $externalSort = 0;
+my $verbose = 0;
+my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
+
+my $support = qq!
+When requesting support, please include the full output printed here.
+If a child process was the cause of the error, the output should
+include the relevant error message from the child's error log. You may
+be asked to provide additional files as well.
+!;
+
+##
+# Printer that prints to STDERR and, optionally, to a file for messages.
+#
+my $msgfn = "";
+my $msgfh = undef;
+sub msg($) {
+    # Emit one message line: drop any trailing CR/LF characters, then write
+    # the text to STDERR and, when a message log has been opened, to that
+    # log as well.
+    my ($text) = @_;
+    $text =~ s/[\r\n]*$//;
+    my @sinks = (\*STDERR);
+    push @sinks, $msgfh if defined($msgfh);
+    for my $fh (@sinks) {
+        print {$fh} "$text\n";
+    }
+}
+
+##
+# Printer that prints to STDERR and, optionally, to a file for counters.
+#
+my $cntfn = "";
+my $cntfh = undef;
+sub cnt($) {
+    # Emit one counter line: trailing CR/LF characters are removed, the line
+    # always goes to STDERR, and it is also appended to the counters file
+    # when that handle is open.
+    my ($line) = @_;
+    $line =~ s/[\r\n]*$//;
+    $line .= "\n";
+    print STDERR $line;
+    if(defined($cntfh)) {
+        print {$cntfh} $line;
+    }
+}
+
+##
+# Print an error message, a support message, then die with given
+# exitlevel.
+#
+sub mydie($$) {
+    # Report a fatal error tagged with the tool version and the exit level,
+    # follow it with the support blurb, then terminate with that level.
+    my $reason = $_[0];
+    my $code   = $_[1];
+    for my $line ("Fatal error $VERSION:R$code: $reason", $support) {
+        msg($line);
+    }
+    exit $code;
+}
+
+GetOptions (
+ "name:s" => \$name,
+ "stage:i" => \$stage,
+ "num-stages:i" => \$numStages,
+ "input:s" => \$input,
+ "output:s" => \$output,
+ "messages:s" => \$msgfn,
+ "counters:s" => \$cntfn,
+ "intermediate:s" => \$intermediate,
+ "reducers:i" => \$nred,
+ "tasks:i" => \$ntasks,
+ "bin-fields:i" => \$binFields,
+ "sort-fields:i" => \$sortFields,
+ "external-sort" => \$externalSort,
+ "S:i" => \$sortSize,
+ "size:i" => \$sortSize,
+ "max-sort-records:i"=> \$maxRecords,
+ "max-sort-files:i" => \$maxFiles,
+ "force" => \$force,
+ "keep-all" => \$keep,
+ "verbose" => \$verbose) || mydie("Bad option", 1);
+
+# Default the scratch area to a sibling of the output directory.
+$intermediate = "$output.reduce.pre" if $intermediate eq "";
+if($msgfn ne "") {
+    # Three-argument open so that unusual characters in the file name cannot
+    # be interpreted as part of the open mode.
+    open($msgfh, ">>", $msgfn) || mydie("Could not open message-out file $msgfn for writing", 15);
+}
+
+if($name ne "") {
+    msg("==========================");
+    msg("Stage $stage of $numStages. $name");
+    msg("==========================");
+}
+msg("Time: ".strftime('%H:%M:%S %d-%b-%Y', localtime));
+
+msg("=== Reduce ===");
+msg("# parallel reducers: $nred");
+msg("# reduce tasks: $ntasks");
+msg("Input: $input");
+msg("Output: $output");
+msg("Intermediate: $intermediate");
+msg("# bin, sort fields: $binFields, $sortFields");
+msg("Total allowed sort memory footprint: $sortSize");
+msg("Options: [ ".
+    ($keep ? "--keep-all " : "").
+    ($force ? "--force " : "")."]");
+
+# The sort budget below is divided by the reducer count, so reject a
+# non-positive count with a proper error instead of a division by zero.
+$nred >= 1 || mydie("--reducers must be >= 1", 5);
+
+# Each parallel reducer gets an equal share of a fixed budget. This replaces
+# whatever was given with -S/--size.
+# NOTE(review): confirm whether a user-supplied size should take precedence.
+$sortSize = int((3 * 1024 * 1024)/$nred);
+
+$input ne "" || mydie("Must specify input directory with --input", 10);
+$output ne "" || mydie("Must specify output directory with --output", 20);
+# --input may name several comma-separated directories (it is split on
+# commas wherever it is read), so each component is checked on its own.
+for my $dir (split(/,/, $input)) {
+    -d $dir || mydie("Input directory doesn't exist: \"$dir\"", 30);
+}
+$sortFields >= $binFields || mydie("--sort-fields must be >= --bin-fields", 40);
+$sortFields >= 1 || mydie("--sort-fields must be >= 1", 50);
+$binFields >= 1 || mydie("--bin-fields must be >= 1", 60);
+
+##
+# Make sure $dir exists as a fresh directory and return its absolute path.
+# An existing directory is fatal unless --force was given, in which case it
+# is removed first. Any failure ends the program through mydie.
+#
+sub checkDir($) {
+    my $dir = shift;
+    if(-d $dir) {
+        mydie("Output directory $dir already exists", 70) unless $force;
+        msg("Removing directory $dir due to --force");
+        # Remove in-process rather than through a shell command line, so
+        # paths containing spaces or shell metacharacters are handled
+        # literally. File::Path is already loaded for mkpath.
+        File::Path::rmtree($dir);
+        -d $dir && mydie("Could not remove directory $dir", 80);
+    }
+    mkpath($dir);
+    (-d $dir) || mydie("Could not create new directory $dir", 90);
+    return abs_path($dir);
+}
+# Create (or, with --force, recreate) the output directory and every scratch
+# subdirectory, keeping the absolute path of each.
+$output = checkDir($output);
+my $errDir        = checkDir("$intermediate/reduce.err");
+my $taskDir       = checkDir("$intermediate/reduce.tasks");
+my $sortedTaskDir = checkDir("$intermediate/reduce.stasks");
+my $workingDir    = checkDir("$intermediate/reduce.wds");
+my $binSizeDir    = checkDir("$intermediate/reduce.binsz");
+
+# Counters are appended to their own file when one was requested.
+unless(!defined($cntfn) || $cntfn eq "") {
+    open($cntfh, ">>", "$cntfn") || mydie("Could not open counters file $cntfn", 95);
+}
+
+# Whatever is left on the command line after option parsing is the reducer
+# command itself.
+my $cmd = join(" ", @ARGV);
+msg("Command:\n$cmd");
+
+########################################
+# Stage 1. Partition bins into tasks
+########################################
+
+my @taskFhs = ();
+my @taskFns = ();
+
+# At most $nred children run at once.
+my $pm = Parallel::ForkManager->new($nred);
+
+# Set up a callback for when a child finishes so we can record its exit
+# code. The failure is latched: once any child has exited non-zero, the flag
+# and the offending PID stay set, so a later successful child cannot erase
+# the evidence of an earlier failure.
+my $childFailed = 0;
+my $childFailedPid = 0;
+$pm->run_on_finish(
+    sub {
+        my ($pid, $exit_code, $ident) = @_;
+        return if $childFailed || $exit_code == 0;
+        $childFailed = 1;
+        $childFailedPid = $pid;
+    }
+);
+
+##
+# Count size of bins in each input file in parallel.
+#
+# First pass: count the input files across all comma-separated input
+# directories, purely so progress messages can say "[i of n]".
+# Second pass: fork one child per input file. Each child tallies how many
+# lines fall into each bin (the first $binFields tab-separated fields) and
+# writes "key<TAB>count" lines to a per-child file in $binSizeDir.
+#
+msg("Calculating per-input bin counts in parallel");
+my $ninfiles = 0;
+for my $dir (split(/,/, $input)) {
+ $dir = abs_path($dir);
+ -d $dir || mydie("No such input directory as \"$dir\"", 100);
+ my @fs = <$dir/*>;
+ $ninfiles += scalar(@fs);
+}
+my $fi = 0;
+for my $dir (split(/,/, $input)) {
+ $dir = abs_path($dir);
+ -d $dir || mydie("No such input directory as \"$dir\"", 110);
+ for my $f (<$dir/*>) {
+ $fi++;
+ # start() returns the child's PID in the parent, so the parent moves on
+ # to the next file; everything below runs in the child only.
+ $pm->start and next;
+ msg("Pid $$ processing input $f [$fi of $ninfiles]...");
+ my %binSizes = ();
+ # Compressed inputs are read through a decompressor pipe, chosen by
+ # file extension.
+ if($f =~ /\.gz$/) {
+ open(F, "gzip -dc $f |") || mydie("Could not open gz file \"$f\" for reading", 120);
+ } elsif($f =~ /\.bz2$/) {
+ open(F, "bzip2 -dc $f |") || mydie("Could not open bzip2 file \"$f\" for reading", 130);
+ } else {
+ open(F, "$f") || mydie("Could not open \"$f\" for reading", 140);
+ }
+ while(<F>) {
+ chomp;
+ my @s = split(/\t/);
+ # Bin key: the first $binFields fields, or all fields if the line is
+ # shorter than that.
+ my $joined = join("\t", @s[0..min($binFields-1, $#s)]);
+ # A line whose bin key is exactly "FAKE" is exempt from the field-count
+ # check.
+ scalar(@s) >= $sortFields || $joined eq "FAKE" || mydie("$sortFields sort fields, but line doesn't have that many:\n$_", 150);
+ my $k = $joined;
+ $binSizes{$k}++;
+ }
+ # The subtotal file is named after the child's PID.
+ # NOTE(review): a reused PID would overwrite an earlier child's subtotals
+ # because the file is opened with ">" - confirm this cannot happen.
+ my $ofn = sprintf "$binSizeDir/sizes-%05d", $$;
+ open (COUT, ">$ofn") || mydie("Could not open \"$ofn\" for writing", 160);
+ for my $k (keys %binSizes) {
+ print COUT "$k\t$binSizes{$k}\n";
+ }
+ close(COUT);
+ $pm->finish;
+ }
+}
+# Block until every counting child has exited.
+$pm->wait_all_children;
+
+##
+# Merge the per-child subtotal files into one table of bin key -> total
+# record count. Each subtotal line is the bin key fields followed by a
+# final count field, all tab-separated.
+#
+msg("Summing per-input counts");
+my %binSizes = ();
+for my $sizeFile (glob("$binSizeDir/*")) {
+    open (F, $sizeFile) || mydie("Could not open \"$sizeFile\" for reading", 170);
+    while(my $line = <F>) {
+        chomp($line);
+        my @fields = split(/\t/, $line);
+        if(scalar(@fields) < 2) {
+            mydie("Too few fields in subtotal line in $sizeFile:\n$line", 180);
+        }
+        # The last field is the count; everything before it is the key.
+        my $count = pop @fields;
+        my $key = join("\t", @fields);
+        unless($count == int($count)) {
+            mydie("Malformed subtotal line in $sizeFile; final field isn't integer:\n$count", 190);
+        }
+        $binSizes{$key} += $count;
+    }
+    close(F);
+}
+
+##
+# Assign every bin to a task in a single pass. Bins are visited from
+# largest to smallest and each one goes to whichever task currently holds
+# the fewest records (the lowest-numbered task wins ties).
+#
+msg("Factoring input into $ntasks tasks");
+my %tasks = ();
+my $nonemptyTasks = 0;
+my @taskSzs = (0) x $ntasks;
+my @binsBySizeDesc = sort { $binSizes{$b} <=> $binSizes{$a} } keys %binSizes;
+for my $k (@binsBySizeDesc) {
+    # Find the index of the lightest task so far.
+    my $lightest;
+    for my $i (0 .. $#taskSzs) {
+        next if defined($lightest) && $taskSzs[$i] >= $taskSzs[$lightest];
+        $lightest = $i;
+    }
+    $tasks{$k} = $lightest if defined($lightest);
+    defined($tasks{$k}) || mydie("Couldn't map key \"$k\" to a task; sizes: @taskSzs", 200);
+    # A task becomes non-empty the first time a bin lands in it.
+    $nonemptyTasks++ if $taskSzs[$lightest] == 0;
+    $taskSzs[$lightest] += $binSizes{$k};
+}
+
+# Allocate and write bins
+#
+# Fork one child per input file. Each child creates its own directory
+# $taskDir/<pid> holding one file per task, then copies every input line
+# into the file of the task its bin was assigned to. The parent remembers
+# the PIDs it spawned so the sort stage can later find those directories.
+$fi = 0;
+my %binPids = ();
+for my $dir (split(/,/, $input)) {
+ $dir = abs_path($dir);
+ -d $dir || mydie("No such input directory as \"$dir\"", 210);
+ for my $f (<$dir/*>) {
+ $fi++;
+ my $pid = $pm->start;
+ # Recorded before the parent/child split, so the child's private copy of
+ # the hash also gains an entry for key 0.
+ $binPids{$pid} = 1;
+ next if $pid;
+ # Child only from here on.
+ msg("Pid $$ processing input $f [$fi of $ninfiles]...");
+ mkpath("$taskDir/$$");
+ # Open one output file per task; handle i corresponds to task i.
+ for(my $i = 0; $i < $ntasks; $i++) {
+ my $nfn = sprintf "task-%05d", $i;
+ push @taskFns, "$taskDir/$$/$nfn";
+ my $cmd2 = ">$taskFns[-1]";
+ push @taskFhs, undef;
+ open ($taskFhs[-1], $cmd2) || mydie("Could not open pipe for writing: \"$cmd2\"", 220);
+ }
+ # Compressed inputs are read through a decompressor pipe, chosen by
+ # file extension.
+ if($f =~ /\.gz$/) {
+ open(F, "gzip -dc $f |") || mydie("Could not open gz file \"$f\" for reading", 230);
+ } elsif($f =~ /\.bz2$/) {
+ open(F, "bzip2 -dc $f |") || mydie("Could not open bzip2 file \"$f\" for reading", 240);
+ } else {
+ open(F, "$f") || mydie("Could not open \"$f\" for reading", 250);
+ }
+ while(<F>) {
+ chomp;
+ my @s = split(/\t/);
+ # Same bin key as in the counting pass: the first $binFields fields.
+ my $k = join("\t", @s[0..min($binFields-1, $#s)]);
+ defined($tasks{$k}) || mydie("Bin \"$k\" wasn't assigned a task!", 260);
+ print {$taskFhs[$tasks{$k}]} "$_\n";
+ }
+ close(F);
+ # Close task pipes.
+ for(my $i = 0; $i < $ntasks; $i++) { close($taskFhs[$i]); }
+ $pm->finish;
+ }
+}
+# Block until every writing child has exited.
+$pm->wait_all_children;
+msg("Factored $ninfiles files into $nonemptyTasks non-empty tasks");
+
+########################################
+# Stage 2. Sort and reduce each task
+########################################
+
+my @srPids = ();
+my $reduceProcs = 0;
+
+##
+# Sort each bin of tuples prior to calling the reducer.
+#
+##
+# Sort one task's records into $sortedTaskDir/stask-<pid of caller>.
+#
+#   $task     - task index whose per-writer files should be merged
+#   $ntasks   - number of tasks being sorted; used with $nred to scale the
+#               in-memory limits of the internal sorter
+#   $external - true to shell out to sort(1), false to use File::Sort
+#
+# The inputs are the task-NNNNN files written by every stage-1 child. Dies
+# through mydie if an input is missing or the sorted file already exists.
+#
+sub doSort($$$) {
+    my ($task, $ntasks, $external) = @_;
+    my @nfn = (); # bin inputs
+    for my $k (keys %binPids) {
+        my $subtask = sprintf "$taskDir/$k/task-%05d", $task;
+        -f $subtask || mydie("No such input file as $subtask", 270);
+        push @nfn, $subtask;
+    }
+    my $sfn = sprintf "$sortedTaskDir/stask-%05d", $$;
+    -f $sfn && mydie("Sorted version of input file $sfn already exists", 280);
+    length("$sortSize") > 0 || mydie("sortSize has length 0", 281);
+    length("$sortFields") > 0 || mydie("sortFields has length 0", 282);
+    if($external) {
+        # NOTE(review): no -t option is passed here, whereas the internal
+        # sorter below splits on tabs - confirm both order records the same.
+        my $nfnstr = join(' ', @nfn);
+        my $cmd = "sort -S $sortSize -k1,$sortFields $nfnstr > $sfn";
+        system($cmd) == 0 || mydie("Sort command: '$cmd' failed", 284);
+    } else {
+        # Share the record and file limits among the sorts that may run at
+        # the same time, with a floor on each.
+        my $denom = min($nred, $ntasks);
+        File::Sort::sort_file({
+            I => \@nfn,
+            o => $sfn,
+            t => "\t",
+            k => "1,$sortFields",
+            y => max(int($maxRecords/$denom), 100),
+            F => max(int($maxFiles/$denom), 3)
+        });
+    }
+}
+
+##
+# Construct command for reducing the reduce task.
+#
+sub cmdifyReducer($) {
+    # Build the shell pipeline that feeds this process's sorted task file to
+    # the reducer command, sending stdout to a part file and stderr to an
+    # error file, all named after the current PID. Returns undef (and logs
+    # a skip message) when the sorted input is empty.
+    my ($task) = @_;
+    my $pid = $$;
+    my $sortedIn = sprintf("$sortedTaskDir/stask-%05d", $pid);
+    mydie("Sorted version of input file $sortedIn doesn't exist", 285) unless -f $sortedIn;
+    my $inputIsEmpty = (-s $sortedIn == 0);
+    my $outFile = sprintf("$output/part-%05d", $pid);
+    mydie("Output file $outFile already exists", 290) if -f $outFile;
+    my $errFile = sprintf("$errDir/epart-%05d", $pid);
+    mydie("Error file $errFile already exists", 300) if -f $errFile;
+    if($inputIsEmpty) {
+        msg("Pid $$ skipping task $task; input is empty");
+        return undef;
+    }
+    $reduceProcs++;
+    return "cat $sortedIn | $cmd > $outFile 2> $errFile";
+}
+
+# Map from PIDs to the file(s) where the error message is likely to be
+# if and when they fail
+my %pidToErrFiles = ();
+my %pidToInputs = ();
+my $alreadyDumped = 0;
+##
+# Print a one-time report about the failed child: its input, the location
+# of its error log, and the contents of that log. Later calls are no-ops.
+#
+sub failDump() {
+    return if $alreadyDumped;
+    # The PID maps are only filled in for sort/reduce children. A child that
+    # failed in an earlier stage has no entry, so fall back to placeholders
+    # instead of interpolating undef.
+    my $failedInput = $pidToInputs{$childFailedPid};
+    my $failedErrFn = $pidToErrFiles{$childFailedPid};
+    $failedInput = "(unknown)" unless defined($failedInput);
+    msg("******");
+    msg("* Aborting master loop because child $childFailedPid failed");
+    msg("* (other children may also have failed)");
+    msg("* Input file or string was:");
+    msg("* $failedInput");
+    if(defined($failedErrFn)) {
+        msg("* Error message is in file: ".$failedErrFn.", also printed below");
+    } else {
+        msg("* No error file was recorded for this child");
+    }
+    msg("******");
+    my $errFh;
+    if(!defined($failedErrFn) || !open($errFh, "<", $failedErrFn)) {
+        msg("* (could not open)");
+    } else {
+        while(<$errFh>) { msg("* $_"); }
+        close($errFh);
+    }
+    msg("******");
+    $alreadyDumped = 1;
+}
+
+##
+# Open sort/reduce pipes.
+#
+# Fork one child per non-empty task. The parent records, per child PID,
+# where that child's error log and sorted input will be, so a failure can
+# be reported later. Each child sorts its task and then replaces itself
+# with the reducer pipeline.
+#
+for(my $i = 0; $i < $nonemptyTasks; $i++) {
+ # Stop spawning as soon as a finished child is known to have failed.
+ if($childFailed) { failDump(); last; }
+ my $childPid = $pm->start;
+ if($childPid != 0) {
+ # I'm the parent
+ my $efn = sprintf "$errDir/epart-%05d", $childPid;
+ $pidToErrFiles{$childPid} = $efn;
+ $pidToInputs{$childPid} = sprintf "$sortedTaskDir/stask-%05d", $childPid;
+ next; # spawn the next child
+ }
+ # I'm the child
+ exit 0 if $childFailed;
+ my $nfn = sprintf "task-%05d", $i;
+ # Each child works in its own directory, named after its PID.
+ my $wd = "$workingDir/$$";
+ mkpath($wd);
+ (-d $wd) || mydie("Could not create working directory $wd", 310);
+ chdir($wd) || mydie("Could not change to working directory $wd", 320);
+ my $cmd;
+ #if($nonemptyTasks > 0) {
+ msg("Pid $$ sorting task $nfn [".($i+1)." of ".max($nonemptyTasks, 1)."]...");
+ doSort($i, $nonemptyTasks, $externalSort);
+ #} else {
+ # # Make dummy input file
+ # my $sfn = sprintf "$sortedTaskDir/stask-%05d", $$;
+ # open(TMP, ">$sfn") || mydie("Could not touch dummy input file $sfn", 325);
+ # close(TMP);
+ #}
+ $cmd = cmdifyReducer($i);
+ msg("Pid $$ reducing task $nfn [".($i+1)." of ".$nonemptyTasks."]...");
+ # exec replaces this child with the reducer pipeline; when the task was
+ # empty there is no command and the child simply exits successfully.
+ exec($cmd) if defined($cmd);
+ exit 0;
+}
+$pm->wait_all_children;
+if($childFailed) {
+ failDump(); # Dump offending file if we haven't already
+ mydie("Aborting because child with PID $childFailedPid exited abnormally", 330);
+}
+if($nonemptyTasks == 0) {
+ msg("WARNING: There was no input data");
+}
+# Counters are gathered from the error directory and printed to the message
+# stream, and again to the counters file when one is open.
+msg("-- Reduce counters --");
+Wrap::getAndPrintLocalCounters($errDir, \&msg);
+Wrap::getAndPrintLocalCounters($errDir, \&cnt) if defined($cntfh);
+
+# No errors
+unless($keep) {
+ msg("Removing $intermediate (to keep, specify --keep-all)");
+ system("rm -rf $intermediate");
+}
diff --git a/Soapsnp.pl b/Soapsnp.pl
new file mode 100755
index 0000000..266a608
--- /dev/null
+++ b/Soapsnp.pl
@@ -0,0 +1,396 @@
+#!/usr/bin/perl -w
+
+##
+# Soapsnp.pl
+#
+# Batch alignments streaming in on STDIN and send them to SOAPsnp.
+# Alignments are binned by partition and sorted by reference offset.
+# Fetch reference jar (ensuring mutual exclusion among reducers) if
+# necessary.
+#
+# Author: Ben Langmead
+# Date: February 11, 2010
+#
+
+use strict;
+use warnings;
+use 5.004;
+use Carp;
+use Getopt::Long;
+use FindBin qw($Bin);
+use lib $Bin;
+use Counters;
+use Get;
+use Tools;
+use Util;
+use AWS;
+use File::Path qw(mkpath);
+
+{
+ # Force stderr to flush immediately
+ my $ofh = select STDERR;
+ $| = 1;
+ select $ofh;
+}
+
+my @counterUpdates = ();
+
+##
+# Print a message to STDERR, prefixed with the script name and with
+# trailing whitespace removed. Croaks if the message is undefined.
+#
+sub msg($) {
+    my $m = shift;
+    # Validate before touching the value; the substitution below would
+    # otherwise operate on undef before the check could fire.
+    defined($m) || croak("Undefined message");
+    $m =~ s/\s*$//;
+    print STDERR "Soapsnp.pl: $m\n";
+}
+
+sub counter($) {
+    # Write one counter update to STDERR in "reporter:counter:<update>"
+    # form; an undefined update is a caller error.
+    my ($update) = @_;
+    croak("Undefined counter update") unless defined($update);
+    print STDERR "reporter:counter:$update\n";
+}
+
+sub flushCounters() {
+    # Report every queued counter update in order, then empty the queue.
+    foreach my $idx (0 .. $#counterUpdates) {
+        counter($counterUpdates[$idx]);
+    }
+    splice(@counterUpdates);
+}
+
+my $ref = "";
+my $type = "s3";
+my $file = "";
+my $dest_dir = "";
+my $plen = 2000000;
+my $args = "";
+my $refdir = "";
+my $snpdir = "";
+my $haploidstr = "";
+my $dryRun = 0;
+my $baseQual = '!';
+my $diploid_args = "";
+my $haploid_args = "";
+my $replaceUnderscores = 0;
+my $discardRefBins = 0;
+my $cntfn = "";
+
+Tools::initTools();
+my %env = %ENV;
+
+sub dieusage {
+    # Print the message to STDERR and exit with the given level; a missing
+    # or zero level becomes 1.
+    my ($msg, $exitlevel) = @_;
+    $exitlevel ||= 1;
+    print STDERR "$msg\n";
+    exit $exitlevel;
+}
+
+GetOptions (
+ "soapsnp:s" => \$Tools::soapsnp_arg,
+ "s3cmd:s" => \$Tools::s3cmd_arg,
+ "s3cfg:s" => \$Tools::s3cfg,
+ "jar:s" => \$Tools::jar_arg,
+ "accessid:s" => \$AWS::accessKey,
+ "secretid:s" => \$AWS::secretKey,
+ "hadoop:s" => \$Tools::hadoop_arg,
+ "wget:s" => \$Tools::wget_arg,
+ "refjar:s" => \$ref,
+ "destdir:s" => \$dest_dir,
+ "refdir:s" => \$refdir,
+ "snpdir:s" => \$snpdir,
+ "partition=i" => \$plen,
+ "args:s" => \$args,
+ "diploid_args:s" => \$diploid_args,
+ "haploid_args:s" => \$haploid_args,
+ "haploids:s" => \$haploidstr,
+ "dryrun" => \$dryRun,
+ "replace-uscores" => \$replaceUnderscores,
+ "counters:s" => \$cntfn,
+ "basequal=s" => \$baseQual,
+ "discard-ref-bins=f" => \$discardRefBins) || dieusage("Bad option", 1);
+
+Tools::purgeEnv();
+
+# With --replace-uscores, every underscore in the three argument strings is
+# turned into a space.
+if($replaceUnderscores) {
+ $args =~ s/_/ /g;
+ $diploid_args =~ s/_/ /g;
+ $haploid_args =~ s/_/ /g;
+}
+
+# Fall back to built-in rate arguments when none were given; "-m" is always
+# appended to the haploid arguments.
+$diploid_args = "-r 0.00005 -e 0.0001" if $diploid_args eq "";
+$haploid_args = "-r 0.0001" if $haploid_args eq "";
+$haploid_args .= " -m";
+
+# Log the resolved tool locations and settings, plus a listing of the
+# current directory.
+msg("soapsnp: found: $Tools::soapsnp, given: $Tools::soapsnp_arg");
+msg("s3cmd: found: $Tools::s3cmd, given: $Tools::s3cmd_arg");
+msg("jar: found: $Tools::jar, given: $Tools::jar_arg");
+msg("hadoop: found: $Tools::hadoop, given: $Tools::hadoop_arg");
+msg("wget: found: $Tools::wget, given: $Tools::wget_arg");
+msg("s3cfg: $Tools::s3cfg");
+msg("soapsnp args: $args");
+msg("refdir: $refdir");
+msg("snpdir: $snpdir");
+msg("partition length: $plen");
+msg("haploid ids: $haploidstr");
+msg("haploid arguments: $haploid_args");
+msg("diploid arguments: $diploid_args");
+msg("base quality value: $baseQual");
+msg("discard SNP bins: $discardRefBins");
+msg("dryrun: $dryRun");
+msg("ls -al");
+print STDERR `ls -al`;
+
+# The sequence and SNP directories must each either be given explicitly or
+# be derivable from a reference URL plus a destination directory, in which
+# case they default to subdirectories of the destination.
+$refdir ne "" || $ref ne "" || die "Must specify either -refdir <path> or -ref <url> and -destdir\n";
+$refdir ne "" || $dest_dir ne "" || die "Must specify either -refdir <path> or -ref <url> and -destdir\n";
+$snpdir ne "" || $ref ne "" || die "Must specify either -snpdir <path> or -ref <url> and -destdir\n";
+$snpdir ne "" || $dest_dir ne "" || die "Must specify either -snpdir <path> or -ref <url> and -destdir\n";
+$refdir = "$dest_dir/sequences" if $refdir eq "";
+$snpdir = "$dest_dir/snps" if $snpdir eq "";
+# Create the destination directory if it was named but does not exist yet.
+$dest_dir eq "" || (-d $dest_dir) || mkpath($dest_dir);
+$dest_dir eq "" || (-d $dest_dir) || die "-destdir $dest_dir does not exist or isn't a directory, and could not be created\n";
+
+my $soapsnp = Tools::soapsnp();
+
+# State carried from one input line to the next: the chromosome and
+# partition of the previous line, alignments buffered for the current
+# partition, alignments seen overall, and ranges emitted so far.
+my $lchr = -1;
+my $lpart = -1;
+my $als = 0;
+my $alstot = 0;
+my $ranges = 0;
+
+# Record which chromosomes are haploid; assume all others are diploid
+# ("none" and "all" are special values and are not split into ids).
+my %hapHash = ();
+if($haploidstr ne "none" && $haploidstr ne "all") {
+ my @haploids = split /[,]/, $haploidstr;
+ for my $h (@haploids) { $hapHash{$h} = 1; }
+}
+
+##
+# Debugging aid: log a long listing of the known-SNPs file for the
+# chromosome currently being processed. Takes no arguments (the empty
+# prototype forbids them), so the previously shifted, never-used local has
+# been dropped.
+#
+sub lookAtFile() {
+    msg("ls -l $snpdir/chr$lchr.snps");
+    print STDERR `ls -l $snpdir/chr$lchr.snps`;
+}
+
+##
+# Flush quality range counters.
+#
+my %qualCnts = ();
+my $qualCntsToFlush = 0;
+sub flushQualCounters() {
+    # Report the tally of each quality bucket as a counter (bucket b covers
+    # values [10b, 10b+10)), zero the tallies, and reset the pending count.
+    foreach my $bucket (keys %qualCnts) {
+        my $lo = $bucket * 10;
+        my $hi = $lo + 10;
+        counter("SOAPsnp,Occurrences of quality value [${lo}:${hi}) in wrapper,$qualCnts{$bucket}");
+        $qualCnts{$bucket} = 0;
+    }
+    $qualCntsToFlush = 0;
+}
+
+##
+# Argument is a quality string. Update counters.
+#
+sub processQuals($) {
+    # Tally each character of the quality string into a bucket of width 10,
+    # using its character code minus 33 as the value. The tallies are
+    # flushed once more than 10000 characters are pending.
+    my $lastPos = length($_[0]) - 1;
+    for my $pos (0 .. $lastPos) {
+        my $qval = ord(substr($_[0], $pos, 1)) - 33;
+        $qualCnts{int($qval/10)}++;
+        $qualCntsToFlush++;
+    }
+    flushQualCounters() if $qualCntsToFlush > 10000;
+}
+
+##
+# Main loop. Alignments arrive on STDIN, already grouped by chromosome and
+# partition. Each run of lines sharing a (chromosome, partition) pair is
+# spooled to .tmp.<plen>.<partition>; when the pair changes, or input ends,
+# SOAPsnp is run on the spooled file and the SNPs it calls inside the
+# partition's range are printed to STDOUT.
+#
+# The three opens for writing below use the low-precedence "or". With "||"
+# the die was attached to the file-name string and could never fire.
+#
+my $maxlen = 1; # per-partition maximum read length
+open(TMP, ">.tmp.$plen.0") or die "Could not open .tmp.$plen.0 for writing: $!";
+my $jarEnsured = 0;
+while(1) {
+    # Extract the chromosome and partition key
+    my $line = <STDIN>;
+    if(defined($line)) {
+        # Comments, FAKE placeholder records and blank lines carry no data
+        next if substr($line, 0, 1) eq '#';
+        next if $line =~ /^\s*FAKE\s*$/;
+        next if $line =~ /^\s*$/;
+    }
+    my $chromo;
+    my $parti;
+    # Remember the maximum read length of the partition being closed, since
+    # $maxlen is about to be updated for the line just read.
+    my $lmaxlen = $maxlen;
+    if(defined($line)) {
+        # Parse chromosome and partition for this alignment
+        my @s = split(/[\t]/, $line);
+        ($chromo, $parti) = ($s[0], $s[1]);
+        my $len = length($s[4]);
+        processQuals($s[5]);
+        if($parti != $lpart || $chromo != $lchr) {
+            # New partition so start a separate tally
+            $maxlen = $len;
+        } else {
+            $maxlen = $len if $len > $maxlen;
+        }
+    } else {
+        # No more input; last partition was final
+        print STDERR "Read the last line of input\n";
+        last if $als == 0; # bail if there are no alignments to flush
+        $parti = $lpart+1; # force alignments to flush
+    }
+    # If either the partition or the chromosome is different...
+    if($parti != $lpart || $chromo != $lchr) {
+        close(TMP);
+        # If there are any alignments to flush...
+        if($als > 0) {
+            if($discardRefBins == 0 || rand() > $discardRefBins) {
+                #
+                # Set up range based on partition id and partition length
+                #
+                my $irange = $plen * int($lpart);
+                my $frange = $irange + $plen;
+                my $rname = ".range_".$irange."_$frange";
+                $ranges++;
+                open(RANGE, ">$rname") or die "Could not open $rname for writing: $!";
+                print RANGE "$lchr\t$irange\t$frange\n";
+                close(RANGE);
+
+                counter("SOAPsnp wrapper,Ranges processed,1");
+                counter("SOAPsnp wrapper,Alignments processed,$als");
+
+                #
+                # Run SOAPsnp
+                #
+                my $date = `date`; chomp($date);
+                msg("Genotyping chromosome $lchr $irange-$frange using $als alignments: $date");
+                my $ploid = $diploid_args;
+                if(defined($hapHash{$lchr}) || $haploidstr eq "all") {
+                    msg(" chromosome $lchr is haploid; using args \"$haploid_args\"");
+                    $ploid = $haploid_args;
+                } else {
+                    msg(" chromosome $lchr is diploid; using args \"$diploid_args\"");
+                }
+
+                msg("head -4 .tmp.$plen.$lpart:");
+                print STDERR `head -4 .tmp.$plen.$lpart`;
+                msg("tail -4 .tmp.$plen.$lpart:");
+                print STDERR `tail -4 .tmp.$plen.$lpart`;
+                # Fetch the reference jar at most once, and only when there
+                # is actually something to genotype.
+                if($ref ne "" && !$jarEnsured) {
+                    Get::ensureFetched($ref, $dest_dir, \@counterUpdates, undef, undef, \%env);
+                    flushCounters();
+                    $jarEnsured = 1;
+                    unless(-d "$dest_dir/sequences") {
+                        msg("Extracting jar didn't create 'sequences' subdirectory.");
+                        msg("find $dest_dir");
+                        print STDERR `find $dest_dir`;
+                        exit 1;
+                    }
+                }
+
+                # Known SNPs are optional: try chr<id>.snps, then <id>.snps,
+                # and carry on without them if neither exists.
+                my $snpsArg = "";
+                my $snpsFn = "$snpdir/chr$lchr.snps";
+                if(! -f $snpsFn) {
+                    $snpsFn = "$snpdir/$lchr.snps";
+                    if(! -f $snpsFn) {
+                        counter("SOAPsnp wrapper,SNP files missing,1");
+                        msg("Warning: $snpsFn doesn't exist");
+                        msg("ls -l $snpdir");
+                        msg(`ls -l $snpdir`);
+                        $snpsFn = "";
+                    }
+                }
+                if($snpsFn ne "") {
+                    msg("ls -l $snpsFn");
+                    msg(`ls -l $snpsFn`);
+                    $snpsArg = "-s $snpsFn";
+                } else {
+                    msg("Warning: neither $snpdir/chr$lchr.snps nor $snpdir/$lchr.snps exist; not using known SNPs");
+                    msg("ls -al $snpdir");
+                    msg(`ls -al $snpdir`);
+                }
+
+                # The reference sequence is mandatory: try chr<id>.fa, then
+                # <id>.fa, and die if neither exists.
+                my $refFn = "$refdir/chr$lchr.fa";
+                if(! -f $refFn) {
+                    $refFn = "$refdir/$lchr.fa";
+                    if(! -f $refFn) {
+                        counter("SOAPsnp wrapper,Sequence files missing,1");
+                        msg("Fatal error: $refFn doesn't exist");
+                        msg("ls -l $refdir");
+                        print STDERR `ls -l $refdir`;
+                        die;
+                    }
+                }
+
+                my $partFn = ".tmp.$plen.$lpart";
+                if(! -f $partFn) {
+                    counter("SOAPsnp wrapper,Alignment files missing,1");
+                    msg("Warning: $partFn doesn't exist");
+                    msg("ls -al");
+                    print STDERR `ls -al`;
+                }
+
+                my $cmd = "${soapsnp} ".
+                          "-i $partFn ". # alignments
+                          "-d $refFn ". # reference sequence
+                          "-o .tmp.snps ". # output file
+                          "$snpsArg ". # known SNP file
+                          "-z '$baseQual' ". # base quality value
+                          "-L $lmaxlen ". # maximum read length
+                          "-c ". # Crossbow
+                          "-H ". # Hadoop output
+                          "-T $rname ". # region
+                          "$ploid ". # ploidy/rate args
+                          "$args ". # other arguments
+                          ">.soapsnp.$$.stdout ".
+                          "2>.soapsnp.$$.stderr";
+                msg("$cmd");
+
+                my $ret = $dryRun ? 0 : system($cmd);
+
+                # Echo everything SOAPsnp produced before acting on its
+                # exit status, so a failure is visible in the log.
+                msg("soapsnp returned $ret");
+                msg("command: $cmd");
+                open OUT, ".soapsnp.$$.stdout";
+                msg("stdout from soapsnp:");
+                while(<OUT>) { print STDERR $_; } close(OUT);
+                open ERR, ".soapsnp.$$.stderr";
+                msg("stderr from soapsnp:");
+                while(<ERR>) { print STDERR $_; } close(ERR);
+                msg("range: $lchr\t$irange\t$frange");
+
+                msg("head -4 .tmp.snps:");
+                print STDERR `head -4 .tmp.snps`;
+                msg("tail -4 .tmp.snps:");
+                print STDERR `tail -4 .tmp.snps`;
+
+                die "Dying following soapsnp returning non-zero $ret" if $ret;
+
+                #
+                # Read and print called SNPs
+                #
+                my $snpsreported = 0;
+                open SNPS, ".tmp.snps";
+                while(<SNPS>) {
+                    chomp;
+                    my @ss = split(/\t/);
+                    # A leading "K" field marks a known SNP; set it aside
+                    # while the offset is checked and reformatted.
+                    my $known = $ss[0] eq 'K';
+                    shift @ss if $known;
+                    my $snpoff = $ss[1];
+                    $snpoff == int($snpoff) || die "SNP offset isn't a number: $snpoff";
+                    if($snpoff < $irange || $snpoff >= $frange) {
+                        counter("SOAPsnp wrapper,Out-of-range SNPs trimmed,1");
+                        msg("Skipping $snpoff because it's outside [$irange, $frange) $_");
+                        next;
+                    }
+                    # Zero-pad the offset to a fixed width of 11 digits
+                    $ss[1] = sprintf "%011d", $snpoff;
+                    print "K\t" if $known;
+                    print join("\t", @ss)."\n";
+                    $snpsreported++;
+                }
+                close(SNPS);
+                counter("SOAPsnp wrapper,SNPs reported,$snpsreported");
+                msg("Reported $snpsreported SNPs");
+            } else {
+                counter("SOAPsnp wrapper,SNP bins skipped,1");
+                msg("Skipped bin with $als alignments");
+            }
+            $als = 0;
+        }
+        open(TMP, ">.tmp.$plen.$parti") or die "Could not open .tmp.$plen.$parti for writing: $!";
+        $lpart = $parti;
+        $lchr = $chromo;
+    }
+    last unless defined($line);
+    print TMP "$line";
+    $als++;
+    $alstot++;
+}
+counter("SOAPsnp,0-range invocations,1") if $ranges == 0;
+counter("SOAPsnp,0-alignment invocations,1") if $alstot == 0;
+close(TMP);
+flushQualCounters();
+flushCounters() if scalar(@counterUpdates) > 0;
diff --git a/TOOLNAME b/TOOLNAME
new file mode 100644
index 0000000..de66ea7
--- /dev/null
+++ b/TOOLNAME
@@ -0,0 +1 @@
+Crossbow
\ No newline at end of file
diff --git a/TUTORIAL b/TUTORIAL
new file mode 100644
index 0000000..56c07c1
--- /dev/null
+++ b/TUTORIAL
@@ -0,0 +1 @@
+See MANUAL's examples section.
diff --git a/Tools.pm b/Tools.pm
new file mode 100644
index 0000000..03cfd28
--- /dev/null
+++ b/Tools.pm
@@ -0,0 +1,523 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: 2/14/2010
+#
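+# Routines for locating and checking the external tools (hadoop, bowtie,
+# SOAPsnp, samtools, fastq-dump, s3cmd, md5, wget, jar, Rscript, unzip)
+# that the scripts invoke.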
+#
+
+package Tools;
+use strict;
+use warnings;
+use AWS;
+use FindBin qw($Bin);
+
+# Prefix to use for environment variables. E.g. in Myrna, we dont look for
+# MYRNA_FASTQ_DUMP_HOME before we look for FASTQ_DUMP_HOME.
+our $pre = "";
+
+our $s3cmd_arg = "";
+our $s3cmd = "";
+our $s3cfg = "";
+our $hadoop_arg = "";
+our $hadoop = "";
+our $fastq_dump_arg = "";
+our $fastq_dump = "";
+our $soapsnp_arg = "";
+our $soapsnp = "";
+our $samtools_arg = "";
+our $samtools = "";
+our $bowtie_arg = "";
+our $bowtie = "";
+our $jar = "";
+our $jar_arg = "";
+our $wget = "";
+our $wget_arg = "";
+our $md5 = "";
+our $md5_arg = "";
+our $r = "";
+our $r_arg = "";
+our $unzip = "";
+
+# Set once the hadoop executable has been verified, so the probe runs at
+# most once.
+my $hadoopEnsured = 0;
+
+##
+# Settle on the hadoop executable (an explicit --hadoop argument overrides
+# the discovered one) and confirm it runs by invoking it with -version.
+# Dies with a message that depends on whether an argument was supplied.
+#
+sub ensureHadoop() {
+    return if $hadoopEnsured;
+    $hadoop = $hadoop_arg unless $hadoop_arg eq "";
+    my $status = system("$hadoop -version >&2");
+    if($status != 0) {
+        die "--hadoop argument \"$hadoop\" doesn't exist or isn't executable\n"
+            if $hadoop_arg ne "";
+        die "hadoop could not be found in HADOOP_HOME or PATH; please specify --hadoop\n";
+    }
+    $hadoopEnsured = 1;
+}
+
+# Verified path of the hadoop executable.
+sub hadoop() {
+    ensureHadoop();
+    return $hadoop;
+}
+
+# Bowtie
+# Set once the bowtie executable has been verified.
+my $bowtieEnsured = 0;
+
+##
+# Settle on the bowtie executable (an explicit --bowtie argument overrides
+# the discovered one) and require it to be an executable file. Dies with a
+# message that depends on whether an argument was supplied.
+#
+sub ensureBowtie() {
+    return if $bowtieEnsured;
+    $bowtie = $bowtie_arg unless $bowtie_arg eq "";
+    unless(-x $bowtie) {
+        die "--bowtie argument \"$bowtie\" doesn't exist or isn't executable\n"
+            if $bowtie_arg ne "";
+        die "bowtie could not be found in BOWTIE_HOME or PATH; please specify --bowtie\n";
+    }
+    $bowtieEnsured = 1;
+}
+
+# Verified path of the bowtie executable.
+sub bowtie() {
+    ensureBowtie();
+    return $bowtie;
+}
+
+# SOAPsnp
+# Set once the soapsnp executable has been verified.
+my $soapsnpEnsured = 0;
+
+##
+# Settle on the soapsnp executable (an explicit --soapsnp argument
+# overrides the discovered one) and require it to be an executable file.
+# Dies with a message that depends on whether an argument was supplied.
+#
+sub ensureSoapsnp() {
+    return if $soapsnpEnsured;
+    $soapsnp = $soapsnp_arg unless $soapsnp_arg eq "";
+    unless(-x $soapsnp) {
+        die "--soapsnp argument \"$soapsnp\" doesn't exist or isn't executable\n"
+            if $soapsnp_arg ne "";
+        die "soapsnp could not be found in SOAPSNP_HOME or PATH; please specify --soapsnp\n";
+    }
+    $soapsnpEnsured = 1;
+}
+
+# Verified path of the soapsnp executable.
+sub soapsnp() {
+    ensureSoapsnp();
+    return $soapsnp;
+}
+
+# Set once the samtools executable has been verified.
+my $samtoolsEnsured = 0;
+
+##
+# Settle on the samtools executable (an explicit --samtools argument
+# overrides the discovered one) and require it to be an executable file.
+# Dies with a message that depends on whether an argument was supplied.
+#
+sub ensureSamtools() {
+    return if $samtoolsEnsured;
+    $samtools = $samtools_arg unless $samtools_arg eq "";
+    unless(-x $samtools) {
+        die "--samtools argument \"$samtools\" doesn't exist or isn't executable\n"
+            if $samtools_arg ne "";
+        die "samtools could not be found in SAMTOOLS_HOME or PATH; please specify --samtools\n";
+    }
+    $samtoolsEnsured = 1;
+}
+
+# Verified path of the samtools executable.
+sub samtools() {
+    ensureSamtools();
+    return $samtools;
+}
+
+# Set once the fastq-dump executable has been verified.
+my $fqdumpEnsured = 0;
+
+##
+# Settle on the fastq-dump executable (an explicit --fastq-dump argument
+# overrides the discovered one) and probe it by running it with -H. Only an
+# exit status of 4 from that probe is accepted; with no candidate path the
+# probe is skipped and the check fails. Dies with a message that depends on
+# whether an argument was supplied.
+#
+sub ensureFastqDump() {
+    return if $fqdumpEnsured;
+    $fastq_dump = $fastq_dump_arg unless $fastq_dump_arg eq "";
+    my $ret = ($fastq_dump ne "")
+        ? (system("$fastq_dump -H >&2 >/dev/null") >> 8)
+        : 0;
+    unless($ret == 4) {
+        die "--fastq-dump argument \"$fastq_dump\" doesn't exist or isn't executable\n"
+            if $fastq_dump_arg ne "";
+        die "fastq-dump could not be found in FASTQ_DUMP_HOME or PATH; please specify --fastq-dump\n";
+    }
+    $fqdumpEnsured = 1;
+}
+
+# Verified path of the fastq-dump executable.
+sub fastq_dump() {
+    ensureFastqDump();
+    return $fastq_dump;
+}
+
+##
+# Write a temporary s3cfg file with appropriate keys.
+#
+##
+# Write ".s3cfg" in the current directory, filled in with the AWS access
+# and secret keys (which are ensured first via AWS::ensureKeys).
+#
+#   $env - environment hash reference, passed through to AWS::ensureKeys
+#
+# Dies if the file cannot be opened or fully written. The open uses a
+# lexical handle and the low-precedence "or"; with "||" the die was
+# attached to the file-name string and could never fire.
+#
+sub writeS3cfg($) {
+    my ($env) = @_;
+    AWS::ensureKeys($hadoop, $hadoop_arg, $env);
+    my $cfgText = qq{
+[default]
+access_key = $AWS::accessKey
+secret_key = $AWS::secretKey
+acl_public = False
+bucket_location = US
+debug_syncmatch = False
+default_mime_type = binary/octet-stream
+delete_removed = False
+dry_run = False
+encrypt = False
+force = False
+gpg_command = /usr/bin/gpg
+gpg_decrypt = \%(gpg_command)s -d --verbose --no-use-agent --batch --yes --passphrase-fd \%(passphrase_fd)s -o \%(output_file)s \%(input_file)s
+gpg_encrypt = \%(gpg_command)s -c --verbose --no-use-agent --batch --yes --passphrase-fd \%(passphrase_fd)s -o \%(output_file)s \%(input_file)s
+gpg_passphrase =
+guess_mime_type = False
+host_base = s3.amazonaws.com
+host_bucket = \%(bucket)s.s3.amazonaws.com
+human_readable_sizes = False
+preserve_attrs = True
+proxy_host =
+proxy_port = 0
+recv_chunk = 4096
+send_chunk = 4096
+simpledb_host = sdb.amazonaws.com
+use_https = False
+verbosity = WARNING
+};
+    open(my $cfgFh, ">", ".s3cfg") or die "Could not open .s3cfg\n";
+    print {$cfgFh} $cfgText;
+    # Buffered write errors only surface at close time.
+    close($cfgFh) or die "Could not finish writing .s3cfg\n";
+}
+
+# Set once s3cmd and its configuration file have been verified.
+my $s3cmdEnsured = 0;
+
+##
+# Settle on the s3cmd executable (an explicit -s3cmd argument overrides the
+# discovered one), confirm it runs with --version, and make sure a
+# configuration file is selected. When none was given, ".s3cfg" in the
+# current directory is used and is written first if it does not exist.
+#
+#   $env - environment hash reference, handed to writeS3cfg when needed
+#
+sub ensureS3cmd($) {
+    my ($env) = @_;
+    return if $s3cmdEnsured;
+    $s3cmd = $s3cmd_arg unless $s3cmd_arg eq "";
+    my $status = system("$s3cmd --version >&2");
+    if($status != 0) {
+        die "-s3cmd argument \"$s3cmd\" doesn't exist or isn't executable\n"
+            if $s3cmd_arg ne "";
+        die "s3cmd could not be found in S3CMD_HOME or PATH; please specify -s3cmd\n";
+    }
+    if($s3cfg eq "") {
+        writeS3cfg($env) if ! -f ".s3cfg";
+        $s3cfg = ".s3cfg";
+    }
+    $s3cmdEnsured = 1;
+}
+
+# Verified s3cmd command prefix, including the -c <config file> option.
+sub s3cmd($) {
+    ensureS3cmd($_[0]);
+    return "$s3cmd -c $s3cfg";
+}
+
+# Set once the md5 executable has been verified.
+my $md5Ensured = 0;
+
+##
+# Settle on the md5 executable (an explicit -md5 argument overrides the
+# discovered one) and require it to be an executable file. Dies with a
+# message that depends on whether an argument was supplied.
+#
+sub ensureMd5() {
+    return if $md5Ensured;
+    $md5 = $md5_arg unless $md5_arg eq "";
+    if(! -x $md5) {
+        die "-md5 argument \"$md5\" doesn't exist or isn't executable\n"
+            if $md5_arg ne "";
+        die "md5 or md5sum could not be found in PATH; please specify -md5\n";
+    }
+    $md5Ensured = 1;
+}
+
+# Verified path of the md5 executable.
+sub md5() {
+    ensureMd5();
+    return $md5;
+}
+
+# Set once the wget executable has been verified.
+my $wgetEnsured = 0;
+
+##
+# Settle on the wget executable (an explicit -wget argument overrides the
+# discovered one) and require it to be an executable file. Dies with a
+# message that depends on whether an argument was supplied.
+#
+sub ensureWget() {
+    return if $wgetEnsured;
+    $wget = $wget_arg unless $wget_arg eq "";
+    if(! -x $wget) {
+        die "-wget argument \"$wget_arg\" doesn't exist or isn't executable\n"
+            if $wget_arg ne "";
+        die "wget could not be found in PATH; please specify -wget\n";
+    }
+    $wgetEnsured = 1;
+}
+
+# Verified path of the wget executable.
+sub wget() {
+    ensureWget();
+    return $wget;
+}
+
+# Set once the jar executable has been verified.
+my $jarEnsured = 0;
+
+##
+# Settle on the jar executable (an explicit -jar argument overrides the
+# discovered one) and require it to be an executable file. Dies with a
+# message that depends on whether an argument was supplied.
+#
+sub ensureJar() {
+    return if $jarEnsured;
+    $jar = $jar_arg unless $jar_arg eq "";
+    if(! -x $jar) {
+        die "-jar argument \"$jar_arg\" doesn't exist or isn't executable\n"
+            if $jar_arg ne "";
+        die "jar could not be found in PATH; please specify -jar\n";
+    }
+    $jarEnsured = 1;
+}
+
+# Verified path of the jar executable.
+sub jar() {
+    ensureJar();
+    return $jar;
+}
+
+# Rscript
+# Set once the Rscript executable has been verified.
+my $rscriptEnsured = 0;
+
+##
+# Settle on the Rscript executable (an explicit --R argument overrides the
+# discovered one) and require it to be an executable file. Dies with a
+# message that depends on whether an argument was supplied.
+#
+sub ensureRscript() {
+    return if $rscriptEnsured;
+    $r = $r_arg unless $r_arg eq "";
+    unless(-x $r) {
+        die "--R argument \"$r_arg\" doesn't exist or isn't executable\n"
+            if $r_arg ne "";
+        die "Rscript could not be found in R_HOME or PATH; please specify --R\n";
+    }
+    $rscriptEnsured = 1;
+}
+
+# Verified path of the Rscript executable.
+sub Rscript() {
+    ensureRscript();
+    return $r;
+}
+
+sub unzip(){ return $unzip; }
+
+##
+# Ask `which` for an executable on the PATH.  Returns the path it
+# reports if that path is executable, otherwise "".
+#
+sub _whichTool {
+    my ($exe) = @_;
+    my $found = `which $exe 2>/dev/null`;
+    chomp($found);
+    return (-x $found) ? $found : "";
+}
+
+##
+# Resolve one tool from its "home" environment variable, then the PATH.
+#
+#   $current  the tool's present setting (kept if no home variable is set
+#             and it is non-empty)
+#   $envName  unprefixed home variable, e.g. "BOWTIE_HOME"; the variable
+#             "${pre}$envName" takes precedence when $pre is non-empty and
+#             that variable is defined
+#   $relPath  location of the executable below the home directory
+#   $exe      name handed to `which` when nothing usable was found
+#
+# Returns the resolved path, or "" if the tool could not be found.
+#
+sub _findTool {
+    my ($current, $envName, $relPath, $exe) = @_;
+    my $home = ($pre ne "" && defined($ENV{$pre.$envName}))
+        ? $ENV{$pre.$envName}
+        : $ENV{$envName};
+    if(defined($home)) {
+        $current = "$home/$relPath";
+        $current = "" unless -x $current;
+    }
+    $current = _whichTool($exe) if $current eq "";
+    return $current;
+}
+
+##
+# Use a copy of $exe sitting in the current directory, if there is one.
+# The file is chmod'ed 0777 first so that a copy that arrived without its
+# executable bit can still be used.  Returns "./$exe" or "".
+#
+sub _localToolCopy {
+    my ($exe) = @_;
+    my $here = "./$exe";
+    return "" unless -f $here;
+    chmod 0777, $here;
+    return (-x $here) ? $here : "";
+}
+
+##
+# Locate every external tool this module knows about and record the
+# results in the file-level tool variables ($jar, $unzip, $s3cmd,
+# $hadoop, $fastq_dump, $bowtie, $soapsnp, $samtools, $r, $md5, $wget).
+# A tool that cannot be found is left as "".
+#
+# Also sets $pre, the environment-variable prefix: the upper-cased
+# contents of the TOOLNAME file next to this script plus "_", or "" when
+# that file is missing or empty.
+#
+sub initTools() {
+
+    # Read the tool name from the 'TOOLNAME' file.  We'll use an all-caps
+    # version of this as our environment variable prefix.
+    if(open(my $nameFh, "<", "$Bin/TOOLNAME")) {
+        my $name = <$nameFh>;
+        close($nameFh);
+        # An empty file yields undef; treat that like a missing name
+        # instead of building a bare "_" prefix.
+        $name = "" unless defined($name);
+        $name =~ s/^\s*//;
+        $name =~ s/\s*$//;
+        $pre = ($name ne "") ? uc($name)."_" : "";
+    } else {
+        $pre = "";
+        print STDERR "Warning: No TOOLNAME file in tool directory: $Bin\n";
+    }
+
+    # Tools that live under <HOME>/bin
+    $jar    = _findTool($jar,    "JAVA_HOME",   "bin/jar",    "jar");
+    $hadoop = _findTool($hadoop, "HADOOP_HOME", "bin/hadoop", "hadoop");
+
+    # unzip has no home variable; only the PATH is searched
+    $unzip = _whichTool("unzip") if $unzip eq "";
+
+    # s3cmd
+    $s3cmd = _findTool($s3cmd, "S3CMD_HOME", "s3cmd", "s3cmd");
+
+    # fastq-dump: last resort is an already-executable copy in the
+    # current directory
+    $fastq_dump = _findTool($fastq_dump, "FASTQ_DUMP_HOME", "fastq-dump", "fastq-dump");
+    if($fastq_dump eq "") {
+        $fastq_dump = (-x "./fastq-dump") ? "./fastq-dump" : "";
+    }
+
+    # bowtie and soapsnp: last resort is a copy in the current directory,
+    # which is made executable if necessary
+    $bowtie = _findTool($bowtie, "BOWTIE_HOME", "bowtie", "bowtie");
+    $bowtie = _localToolCopy("bowtie") if $bowtie eq "";
+
+    $soapsnp = _findTool($soapsnp, "SOAPSNP_HOME", "soapsnp", "soapsnp");
+    $soapsnp = _localToolCopy("soapsnp") if $soapsnp eq "";
+
+    # samtools: same fallback as fastq-dump
+    $samtools = _findTool($samtools, "SAMTOOLS_HOME", "samtools", "samtools");
+    if($samtools eq "") {
+        $samtools = (-x "./samtools") ? "./samtools" : "";
+    }
+
+    # Rscript
+    $r = _findTool($r, "R_HOME", "bin/Rscript", "Rscript");
+    if($r eq "" && -x "Rscript") {
+        $r = "Rscript";
+    }
+
+    # md5/md5sum, for checking integrity of downloaded files
+    $md5 = _whichTool("md5");
+    $md5 = _whichTool("md5sum") if $md5 eq "";
+
+    # wget, for downloading files over http or ftp
+    $wget = _whichTool("wget");
+
+    # expand s3cmd if it's present
+    if(-f "s3cmd.tar.gz") {
+        system("tar zxvf s3cmd.tar.gz >/dev/null");
+    }
+}
+
+##
+# Find an executable.  Candidates are tried in this order and the first
+# one that is executable is returned:
+#   (a) "$ENV{$env}/$envsub", when environment variable $env is defined;
+#   (b) whatever `which $exe` reports;
+#   (c) "./$exe" in the current directory.
+# Returns "" when none of them is usable.
+#
+sub lookFor($$$) {
+    my ($exe, $env, $envsub) = @_;
+    if(defined($ENV{$env})) {
+        my $underHome = "$ENV{$env}/$envsub";
+        return $underHome if -x $underHome;
+    }
+    my $onPath = `which $exe 2>/dev/null`;
+    chomp($onPath);
+    return $onPath if -x $onPath;
+    return (-x "./$exe") ? "./$exe" : "";
+}
+
+##
+# Strip the environment down to a short whitelist (PATH, PWD, HOME, USER,
+# TERM, JAVA_HOME) and then force SHELL to /bin/sh.  This fixes an issue
+# whereby some environment changes made by hadoop.sh mess with future
+# invocations of hadoop.
+#
+sub purgeEnv() {
+    my %keep = map { $_ => 1 } qw(PATH PWD HOME USER TERM JAVA_HOME);
+    for my $name (keys %ENV) {
+        delete $ENV{$name} unless $keep{$name};
+    }
+    $ENV{SHELL} = "/bin/sh";
+}
+
+##
+# Placeholder validator for a bowtie argument string.  No checks are
+# implemented yet: both arguments are ignored and 1 is always returned.
+#
+sub checkBowtieParams($$) {
+    my $args    = shift;
+    my $version = shift;
+    return 1;
+}
+
+##
+# Placeholder validator for a SOAPsnp argument string.  No checks are
+# implemented yet: both arguments are ignored and 1 is always returned.
+#
+sub checkSoapsnpParams($$) {
+    my $args    = shift;
+    my $version = shift;
+    return 1;
+}
+
+1;
diff --git a/Util.pm b/Util.pm
new file mode 100644
index 0000000..9ebf4ef
--- /dev/null
+++ b/Util.pm
@@ -0,0 +1,137 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: 3/12/2010
+#
+# Various utility functions.
+#
+
+package Util;
+use strict;
+use warnings;
+use POSIX ":sys_wait_h";
+use FindBin qw($Bin);
+use lib $Bin;
+use Tools;
+
+##
+# Classify a URL by its protocol.  Returns "s3" when the text before the
+# first ':' starts with "s3" or "s3n", "hdfs" when it starts with "hdfs"
+# (both case-insensitive), and "local" for everything else.
+#
+# A string with no ':' has no protocol and is always "local", so relative
+# paths such as "s3_reads/sample.fq" or "hdfs_out" are not mistaken for
+# remote URLs.  An undefined argument is also "local".
+#
+sub parse_url_proto($) {
+    my ($url) = @_;
+    return "local" unless defined($url);
+    my ($scheme) = $url =~ /^([^:]*):/;
+    return "local" unless defined($scheme);
+    return "s3"    if $scheme =~ /^s3n?/i;
+    return "hdfs"  if $scheme =~ /^hdfs/i;
+    return "local";
+}
+
+##
+# Return true iff parse_url_proto() classifies the given url as "local".
+#
+sub is_local($) {
+    my ($url) = @_;
+    my $proto = parse_url_proto($url);
+    return $proto eq "local";
+}
+
+##
+# Echo the command to stderr, execute it with system(), and hand back
+# system()'s return value unchanged.
+#
+sub run($) {
+    my ($cmd) = @_;
+    print STDERR "$cmd\n";
+    my $status = system($cmd);
+    return $status;
+}
+
+##
+# Run given command and wait for it to finish, printing wait messages
+# to stderr periodically.  Return its exitlevel: the wait status that
+# closing the command's pipe left in $?, so 0 means success.
+#
+# The command runs in a forked child, which copies the command's stdout
+# to our stdout and writes the status to the file ".Util.pm.<child pid>"
+# in the current directory; the parent polls every 5 seconds, then reads
+# and removes that file.  Dies if the fork itself fails.  If the child
+# could not report a status, a non-zero value is returned.
+#
+sub runAndWait($$) {
+    my ($cmd, $shortname) = @_;
+    print STDERR "$cmd\n";
+    my $f = fork();
+    # fork() returns undef on failure.  Without this check "undef == 0"
+    # would send the parent down the child branch below and make it exit.
+    defined($f) || die "Could not fork to run \"$cmd\": $!\n";
+    if($f == 0) {
+        my $ret;
+        # Run the command, echoing its stdout to our stdout
+        if(open(my $cmdFh, "-|", $cmd)) {
+            while(<$cmdFh>) { print $_; }
+            close($cmdFh);
+            # Check its exitlevel
+            $ret = $?;
+        } else {
+            print STDERR "Could not open pipe '$cmd |': $!\n";
+            $ret = 1 << 8; # report as "exited with code 1"
+        }
+        # Write its exitlevel to a file.  TODO: is there a better way
+        # to do this?
+        open(my $out, ">", ".Util.pm.$$") || die "Could not open .Util.pm.$$ for writing\n";
+        print $out "$ret\n";
+        close($out);
+        # A raw wait status such as 256 would wrap around to exit code 0,
+        # so turn it into a real 8-bit exit code.
+        my $code = ($ret > 0) ? ($ret >> 8) : 0;
+        $code = 1 if $code == 0 && $ret != 0;
+        exit $code;
+    }
+    print STDERR "runAndWait: Child's PID is $f\n";
+    my $cnt = 0;
+    while(1) {
+        # Wait for our own child only, so that children started by other
+        # code are not reaped behind its back.  -1 means the child is
+        # already gone; polling any longer would never terminate.
+        my $reaped = waitpid($f, WNOHANG());
+        last if $reaped == $f || $reaped == -1;
+        sleep (5);
+        my $secs = ++$cnt * 5;
+        print STDERR "Waiting for $shortname (it's been $secs secs)...\n";
+    }
+    my $waitStatus = $?;
+    my $statusFile = ".Util.pm.$f";
+    my $lev;
+    if(open(my $in, "<", $statusFile)) {
+        my $line = <$in>;
+        close($in);
+        $lev = int($line) if defined($line) && $line =~ /^\s*-?\d+/;
+    }
+    unlink($statusFile);
+    # No readable status means the child died before it could report;
+    # never let that look like success.
+    $lev = ($waitStatus != 0 ? $waitStatus : -1) unless defined($lev);
+    return $lev;
+}
+
+##
+# Echo the command to stderr, then execute it and return whatever it
+# wrote to stdout (a string in scalar context, a list of lines in list
+# context, as with backticks).
+#
+sub backtickRun($) {
+    my $cmd = shift;
+    print STDERR $cmd, "\n";
+    return qx{$cmd};
+}
+
+##
+# Run given command and wait for it to finish, printing wait messages
+# to stderr periodically.  Return its output (a string in scalar context,
+# a list of lines in list context).
+#
+# The command runs in a forked child, which spools the command's stdout
+# into ".tmp.Get.pm.<child pid>" in the current directory; the parent
+# polls every 5 seconds, then reads and removes that file.  The pid in
+# the name keeps concurrent calls from overwriting each other's output.
+# Dies if the fork itself fails.
+#
+sub backtickAndWait($$) {
+    my ($cmd, $shortname) = @_;
+    print STDERR "$cmd\n";
+    my $f = fork();
+    # fork() returns undef on failure.  Without this check "undef == 0"
+    # would send the parent down the child branch below and make it exit.
+    defined($f) || die "Could not fork to run \"$cmd\": $!\n";
+    if($f == 0) {
+        open(my $tmp, ">", ".tmp.Get.pm.$$") || die "Could not open .tmp.Get.pm.$$ for writing: $!\n";
+        open(my $cmdFh, "-|", $cmd) || die "Could not open pipe '$cmd |': $!\n";
+        while(<$cmdFh>) { print $tmp $_; }
+        close($cmdFh);
+        my $ret = $?;
+        close($tmp);
+        # A raw wait status such as 256 would wrap around to exit code 0,
+        # so turn it into a real 8-bit exit code.
+        my $code = ($ret > 0) ? ($ret >> 8) : 0;
+        $code = 1 if $code == 0 && $ret != 0;
+        exit $code;
+    }
+    print STDERR "backtickAndWait: Child's PID is $f\n";
+    my $cnt = 0;
+    while(1) {
+        # Wait for our own child only; -1 means it is already gone and
+        # polling any longer would never terminate.
+        my $reaped = waitpid($f, WNOHANG());
+        last if $reaped == $f || $reaped == -1;
+        sleep (5);
+        my $secs = ++$cnt * 5;
+        print STDERR "Waiting for $shortname (it's been $secs secs)...\n";
+    }
+    my $spool = ".tmp.Get.pm.$f";
+    my @lines = ();
+    if(open(my $in, "<", $spool)) {
+        @lines = <$in>;
+        close($in);
+    }
+    unlink($spool);
+    return wantarray ? @lines : join("", @lines);
+}
+
+##
+# Return a copy of the argument with whitespace stripped from both
+# ends.  The caller's variable is not modified.
+#
+sub trim($) {
+    my ($text) = @_;
+    for ($text) {
+        s/^\s+//;
+        s/\s+$//;
+    }
+    return $text;
+}
+
+1;
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..867e524
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.2.0
\ No newline at end of file
diff --git a/Wrap.pm b/Wrap.pm
new file mode 100644
index 0000000..001031e
--- /dev/null
+++ b/Wrap.pm
@@ -0,0 +1,67 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: 3/28/2010
+#
+# Helpful utilities for Hadoop-like wrappers.
+#
+
+package Wrap;
+
+##
+# Pretty-print a two-level hash of counters.  For every group and every
+# counter name within it, both in sorted order, the callback $msg is
+# invoked with one "group<TAB>name<TAB>value" string.
+#
+sub printCounters($$) {
+    my ($counters, $msg) = @_;
+    foreach my $group (sort keys %$counters) {
+        my $byName = $counters->{$group};
+        foreach my $name (sort keys %$byName) {
+            $msg->(join("\t", $group, $name, $byName->{$name}));
+        }
+    }
+}
+
+##
+# Parse all counter updates in a directory of Hadoop-like output.
+#
+#   $dir       directory to scan; every non-hidden file directly inside it
+#              is read (subdirectories are skipped).  Names ending in .gz
+#              or .bz2 are decompressed through gzip/bzip2.
+#   $counters  hash ref; each line "reporter:counter:<group>,<name>,<n>"
+#              adds <n> to $counters->{<group>}{<name>}
+#   $msg       callback invoked with a warning string for each counter
+#              line that does not have exactly three comma-separated
+#              fields; such lines are skipped
+#
+# Dies if $dir is not a directory, if a file cannot be opened, or if a
+# decompressor exits with a non-zero status.
+#
+sub getLocalCounters($$$) {
+    my ($dir, $counters, $msg) = @_;
+    -d $dir || die "No such input file or directory as \"$dir\"\n";
+    # readdir instead of glob, so a $dir containing spaces is not split
+    # into several patterns; dot files are excluded as "*" would do.
+    opendir(my $dh, $dir) || die "Could not open directory \"$dir\": $!\n";
+    my @fs = map { "$dir/$_" } sort grep { !/^\./ } readdir($dh);
+    closedir($dh);
+    my $tag = "reporter:counter:";
+    for my $f (@fs) {
+        next if -d $f;
+        my $inp;
+        my $piped = 0;
+        # open(...) with parentheses, so that "|| die" tests open's result
+        # and not the (always true) string argument.  The list form keeps
+        # the file name away from the shell.
+        if($f =~ /\.gz$/) {
+            open($inp, "-|", "gzip", "-dc", $f) || die "Could not open pipe 'gzip -dc $f |'";
+            $piped = 1;
+        } elsif($f =~ /\.bz2$/) {
+            open($inp, "-|", "bzip2", "-dc", $f) || die "Could not open pipe 'bzip2 -dc $f |'";
+            $piped = 1;
+        } else {
+            open($inp, "<", $f) || die "Could not open $f for reading\n";
+        }
+        while(my $line = <$inp>) {
+            next unless $line =~ /^reporter:counter:/;
+            chomp($line);
+            my $update = substr($line, length($tag));
+            my @us = split(/,/, $update);
+            if(scalar(@us) != 3) {
+                $msg->("Warning: Ill-formed counter updated line:\n$update");
+                next; # nothing sensible to accumulate
+            }
+            $counters->{$us[0]}{$us[1]} += $us[2];
+        }
+        if($piped) {
+            # close() on a pipe waits for the decompressor and sets $?
+            close($inp);
+            $? == 0 || die "Bad exitlevel from input slurp: $?\n";
+        } else {
+            # $? is meaningless for a plain file; check close() itself
+            close($inp) || die "Bad exitlevel from input slurp: $!\n";
+        }
+    }
+}
+
+##
+# Collect all counter updates found under $dir with getLocalCounters()
+# into a fresh hash, then report the totals through $msg with
+# printCounters().
+#
+sub getAndPrintLocalCounters($$) {
+    my ($dir, $msg) = @_;
+    my $totals = {};
+    getLocalCounters($dir, $totals, $msg);
+    printCounters($totals, $msg);
+}
+
+1;
diff --git a/cb_emr b/cb_emr
new file mode 100755
index 0000000..021bbe6
--- /dev/null
+++ b/cb_emr
@@ -0,0 +1,136 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: February 11, 2010
+#
+# Use 'elastic-mapreduce' ruby script to invoke an EMR job described
+# in a dynamically-generated JSON file. Constructs the elastic-
+# mapreduce invocation from parameters/defaults/environment variables.
+#
+
+use strict;
+use warnings;
+
+use FindBin qw($Bin);
+use lib $Bin;
+use CrossbowIface;
+
+my $APP = "Crossbow";
+my $app = lc $APP;
+my $SCRIPT = "cb_emr";
+my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
+
+my $usage = qq{
+$SCRIPT: Run $APP v$VERSION as an Elastic MapReduce job
+
+Usage: perl $SCRIPT --input <url> --output <url> \
+ [--reference <url> | --just-preprocess ] [options]
+
+Options (defaults in []):
+
+ EMR params:
+
+ --credentials <path> Path to credentials.json file [elastic-mapreduce
+ script's default]
+ --emr-script <path> Path to 'elastic-mapreduce' script [First under
+ \$${APP}_EMR, then in \$PATH]
+ --hadoop-version <ver> Hadoop version to use on EMR; for now, the options are
+ 0.18 and 0.20 [0.20]
+ --dry-run Produce job's .json and .sh files and print
+ 'elastic-mapreduce' command but don't run it
+ --name <name> Name for EMR job ["$APP"]
+ --stay-alive Keep cluster running even if it completes or a step
+ fails [off]
+ --instance-type <type> EC2 instance type [c1.xlarge (highly recommended)]
+ --nodes <int> Number of nodes (instances) to allocate [1]
+ --emr-args "<args>" Extra arguments for 'elastic-mapreduce' script [none]
+ --logs <url> By default, logs are deposited in 'log' directory
+ alongside the --output URL. Override by specifying
+ --logs and an S3 URL. Can't be a subdirectory of
+ --output.
+ --no-logs Disables copying of logs entirely [off]
+ --no-emr-debug Disable SimpleDB-based debugging. SimpleDB-based
+ debugging enables the "Debug" button in the AWS
+ Console. User must have SimpleDB account when this is
+ *not* specified. [off]
+
+ Job params (don't affect results):
+
+ --input <url> S3 URL for input. Usually a directory with
+ preprocessed reads. If --preprocess or
+ --just-preprocess are specified, URL is a manifest
+ file.
+ --output <url> Final output (S3)
+ --intermediate <url> Intermediate output (can be HDFS, S3). Use an S3 URL
+ if you'd like keep to keep intermediate results after
+ cluster is deallocated. [hdfs:///$app/intermediate]
+ --partition-len <int> Partition length in bases [1 million]
+
+ $APP params (affect results):
+
+ --reference <url> Reference jar (can be HDFS, S3, local, HTTP, FTP)
+ --quality <type> Encoding for sequence quality values; one of: phred33,
+ phred64, solexa64 [phred33]
+ --just-align Don't do SNP calling; --output will contain alignments
+ --resume-align --input URL is a directory of output from the Crossbow
+ alignment step (obtained e.g. using --intermediate);
+ pipeline resumes at the SNP calling step
+ --resume-snps --input URL is a directory of output from the Crossbow
+ SNP calling step (obtained e.g. using --intermediate);
+ pipeline resumes at post-SNP-calling sort step
+ --bowtie-args "<args>" Arguments for Bowtie [-M 1] (Note: --partition --mm -t
+ --hadoopout --startverbose are always set by Crossbow)
+ --ss-args "<args>" Arguments for SOAPsnp [-2 -u -n -q] (Note: -i -d -o -s
+ -z -L -T are always set by Crossbow)
+ --ss-hap-args "<args>" Additional SOAPsnp arguments when reference is haploid
+ [-r 0.0001] (Note: -m is always set by Crossbow)
+ --ss-dip-args "<args>" Additional SOAPsnp arguments when reference is diploid
+ [-r 0.00005 -e 0.0001]
+ --haploids "<chrs>" Comma-separated names of references that are haploid.
+ Others are considered diploid. [All diploid].
+ --all-haploids Consider all chromosomes to be haploid when calling
+ SNPs. [All diploid]
+ --discard-reads <frac> Randomly discard specified fraction of input reads.
+ [off]
+ --truncate <int> Truncate reads longer than <int> bases to <int> bases
+ by trimming from the 3' end [off]
+ --truncate-discard <int> Same as --truncate except that reads shorter than
+ <int> bases are discarded. [off]
+
+ Preprocessing params (not necessary if --input points to preprocessed reads):
+
+ --preprocess --input URL is a manifest file describing a set of
+ unpreprocessed, FASTQ read files; preprocess them
+ before running $APP [off]
+ --just-preprocess Like --preprocess but $APP isn't run; --output
+ contains preprocessed reads [off]
+ --pre-output <url> If --preprocess is on, put preprocessed output here
+ instead of in the intermediate directory [off]. Has
+ no effect if --just-preprocess is specifeid (use
+ --output). Useful if future jobs will make use of the
+ same input.
+ --pre-compress <type> Compression type; one of: gzip, none [gzip]
+ --pre-stop <int> Stop preprocessing after <int> reads/mates [no limit]
+ --pre-filemax <int> Split preprocessed output such that there are no more
+ than <int> reads/mates per preprocessed read file;
+ 0 = no limit. [500,000]
+
+ Other params:
+
+ --test Try to locate all necessary software; print a helpful
+ message showing what was found and quit [off]
+ --tempdir <path> Put temporary scripts and files in <path> [/tmp]
+
+};
+
+# Let the user write --bowtie-args "-n 3 -l 35" without an equals sign:
+# any argument of the form -...-args (other than the last one) absorbs
+# the argument that follows it, becoming <option>="<value>".
+my $argIdx = 0;
+while($argIdx < $#ARGV) {
+    if($ARGV[$argIdx] =~ /^-.*-args$/) {
+        my ($value) = splice(@ARGV, $argIdx + 1, 1);
+        $ARGV[$argIdx] .= "=\"$value\"";
+    }
+    $argIdx++;
+}
+
+# Everything else is handled by the shared interface module.
+CrossbowIface::crossbow(
+    \@ARGV, $SCRIPT, $usage,
+    undef, undef, undef, undef);
diff --git a/cb_hadoop b/cb_hadoop
new file mode 100755
index 0000000..018ec80
--- /dev/null
+++ b/cb_hadoop
@@ -0,0 +1,214 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: March 28, 2010
+#
+# Initiate a Hadoop streaming Crossbow job. Must be on the Hadoop master node.
+#
+
+use strict;
+use warnings;
+use Getopt::Long qw(:config pass_through);
+use FindBin qw($Bin);
+use lib $Bin;
+use CrossbowIface;
+use Cwd 'abs_path';
+
+my $APP = "Crossbow";
+my $app = lc $APP;
+my $SCRIPT = "cb_hadoop";
+my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
+
+my $usage = qq{
+$SCRIPT: Run $APP v$VERSION as a Hadoop job
+
+Usage: perl $SCRIPT --input <url> --output <url> \
+ [--reference <url> | --just-preprocess ] [options]
+
+Options (defaults in []):
+
+ Job params:
+
+ --dry-run Produce and print path to a script for running the
+ Hadoop job, but don't run it.
+ --input <url> HDFS or S3 URL for input. URL is typically a
+ directory containing preprocessed reads. If
+ --preprocess or --just-preprocess are enabled, URL is
+ a manifest file. If --resume-align is enabled, URL is
+ a directory containing $APP alignments.
+ --output <url> Final output (can be HDFS, S3)
+ --intermediate <url> Intermediate output (can be HDFS, S3). Use an S3 URL
+ if you'd like keep to keep intermediate results after
+ cluster is deallocated. [hdfs:///crossbow]
+ --partition-len <int> Partition length in bases [1 million]
+ --fastq-dump <path> Path to fastq-dump binary on slaves [search \$PATH locally]
+ --bowtie <path> Path to bowtie binary on slaves [search
+ \$MYRNA_BOWTIE_HOME, \$MYRNA_HOME/bin, \$PATH locally]
+ --soapsnp <path> Path to soapsnp binary on slaves [search
+ \$MYRNA_SOAPSNP_HOME, \$MYRNA_HOME/bin, \$PATH locally]
+
+ $APP params (affect results):
+
+ --reference <url> Reference jar (can be HDFS, S3)
+ --just-align Don't do SNP calling; --output will contain alignments
+ --resume-align --input URL is a directory of output from the Crossbow
+ alignment step (obtained e.g. using --intermediate);
+ pipeline resumes at the SNP calling step
+ --resume-snps --input URL is a directory of output from the Crossbow
+ SNP calling step (obtained e.g. using --intermediate);
+ pipeline resumes at post-SNP-calling sort step
+ --bowtie-args "<args>" Arguments for Bowtie [-M 1] (Note: --partition --mm -t
+ --hadoopout --startverbose are always set by Crossbow)
+ --ss-args "<args>" Arguments for SOAPsnp [-2 -u -n -q] (Note: -i -d -o -s
+ -z -L -T are always set by Crossbow)
+ --ss-hap-args "<args>" Additional SOAPsnp arguments when reference is haploid
+ [-r 0.0001] (Note: -m is always set by Crossbow)
+ --ss-dip-args "<args>" Additional SOAPsnp arguments when reference is diploid
+ [-r 0.00005 -e 0.0001]
+ --haploids "<chrs>" Comma-separated names of references to be considered
+ haploid. Others are considered diploid. [None]
+ --all-haploids Consider all chromosomes to be haploid when calling
+ SNPs. [All diploid]
+ --quality <type> Encoding for sequence quality values; one of: phred33,
+ phred64, solexa64 [phred33]
+ --discard-reads <frac> Randomly discard specified fraction of input reads.
+ [off]
+ --truncate <int> Truncate reads longer than <int> bases to <int> bases
+ by trimming from the 3' end.
+ --truncate-discard <int> Same as --truncate except that reads shorter than
+ <int> bases are discarded.
+
+ Preprocessing params (not necessary if --input points to preprocessed reads):
+
+ --preprocess --input URL is a manifest file describing a set of
+ unpreprocessed, FASTQ read files; preprocess them
+ before running $APP [off]
+ --just-preprocess Like --preprocess but $APP isn't run; --output
+ contains preprocessed reads [off]
+ --pre-output <url> If --preprocess is on, put preprocessed output here
+ instead of in the intermediate directory [off]. Has
+ no effect if --just-preprocess is specified (--output
+ is used instead). Useful if future jobs use same
+ input.
+ --pre-compress <type> Compression type; one of: gzip, none [gzip]
+ --pre-stop <int> Stop preprocessing after <int> reads/mates [no limit]
+ --pre-filemax <int> Split preprocessed output such that there are no more
+ than <int> reads/mates per preprocessed read file;
+ 0 = no limit. [500,000]
+
+ Other params:
+
+ --test Try to locate all necessary software; print a helpful
+ message showing what was found and quit [off]
+ --tempdir <path> Put temporary scripts in <path>
+ [/tmp/$APP/invoke.scripts]
+ (umask 0077 used to protect credentials)
+
+};
+
+##
+# Print the usage message followed by an error message to stderr, then
+# exit with status $lev.
+#
+sub dieusage($$$) {
+    my ($text, $usage, $lev) = @_;
+    print STDERR "$usage\nError:\n$text\n\n";
+    exit $lev;
+}
+
+# Let the user write --bowtie-args "-n 3 -l 35" without an equals sign:
+# any argument of the form -...-args (other than the last one) absorbs
+# the argument that follows it, becoming <option>="<value>".
+my $argIdx = 0;
+while($argIdx < $#ARGV) {
+    if($ARGV[$argIdx] =~ /^-.*-args$/) {
+        my ($value) = splice(@ARGV, $argIdx + 1, 1);
+        $ARGV[$argIdx] .= "=\"$value\"";
+    }
+    $argIdx++;
+}
+
+# Options this wrapper inspects itself.  Getopt::Long runs in
+# pass_through mode, so anything not listed here stays in @ARGV.
+my ($input, $output, $intermediate) = ("", "", "");
+my ($bowtie, $ref, $soapsnp, $samtools) = ("", "", "", "");
+my ($verbose, $test) = (0, 0);
+
+my @optionSpec = (
+    "input:s"        => \$input,
+    "output:s"       => \$output,
+    "intermediate:s" => \$intermediate,
+    "reference:s"    => \$ref,
+    "soapsnp:s"      => \$soapsnp,
+    "bowtie:s"       => \$bowtie,
+    "samtools:s"     => \$samtools,
+    "test"           => \$test,
+    "verbose"        => \$verbose,
+);
+GetOptions(@optionSpec);
+
+##
+# Take a path and make it absolute.  If it has a protocol (s3:/, s3n:/,
+# hdfs:/ or file:/), assume it's already absolute and return it as is.
+#
+#   $path   path to convert; a leading "~" or "~/" is replaced by $HOME
+#           ("~user" forms are left alone)
+#   $check  if true, die unless $path is an existing file or directory
+#   $name   option name used in the error message
+#
+# Always returns a defined path: when abs_path() cannot resolve $path
+# (for instance an output directory whose parents don't exist yet) the
+# path is made absolute textually instead.
+#
+sub absPath($$$) {
+    my ($path, $check, $name) = @_;
+    return $path if $path =~ /^s3n?:\//i;
+    return $path if $path =~ /^hdfs:\//i;
+    return $path if $path =~ /^file:\//i;
+    # Expand only a bare "~"; "~user" must not become "$HOME" . "user"
+    $path =~ s{^~(?=/|\z)}{$ENV{HOME}} if defined($ENV{HOME});
+    die "Error: $name path doesn't exist: $path" unless (!$check || -f $path || -d $path);
+    my $abs = eval { abs_path($path) };
+    unless(defined($abs)) {
+        require File::Spec;
+        $abs = File::Spec->rel2abs($path);
+    }
+    return $abs;
+}
+
+# With --verbose, print every path option under the given heading.
+my $reportPaths = sub {
+    my ($heading) = @_;
+    return unless $verbose;
+    print STDERR "$heading\n";
+    print STDERR " input: $input\n";
+    print STDERR " output: $output\n";
+    print STDERR " intermediate: $intermediate\n";
+    print STDERR " reference: $ref\n";
+    print STDERR " soapsnp: $soapsnp\n";
+    print STDERR " bowtie: $bowtie\n";
+    print STDERR " samtools: $samtools\n";
+};
+
+$reportPaths->("Relative paths:");
+
+# Make every path that was given absolute.  The middle element says
+# whether the path must already exist; outputs need not.
+for my $spec (
+    [\$input,        1, "--input"],
+    [\$output,       0, "--output"],
+    [\$intermediate, 0, "--intermediate"],
+    [\$ref,          1, "--ref"],
+    [\$soapsnp,      1, "--soapsnp"],
+    [\$bowtie,       1, "--bowtie"],
+    [\$samtools,     1, "--samtools"])
+{
+    my ($var, $mustExist, $flag) = @$spec;
+    $$var = absPath($$var, $mustExist, $flag) if $$var ne "";
+}
+
+$reportPaths->("Absolute paths:");
+
+# --just-preprocess is not one of the options parsed by this script, so
+# in pass_through mode it is still sitting in @ARGV.  The usage text
+# offers it as an alternative to --reference, so look for it here.
+my $justPreprocess = scalar(grep { /^--?just-preprocess$/ } @ARGV);
+
+if(!$test) {
+    $input ne "" || dieusage("Must specify --input", $usage, 1);
+    $output ne "" || dieusage("Must specify --output", $usage, 1);
+}
+
+# Rebuild the argument list for CrossbowIface: the job type, then the
+# (now absolute) options parsed here, then everything left in @ARGV.
+my @args = ();
+
+push @args, "--hadoop-job";
+push @args, ("--input", $input) if $input ne "";
+push @args, ("--output", $output) if $output ne "";
+push @args, ("--intermediate", $intermediate) if $intermediate ne "";
+push @args, ("--reference", $ref) if $ref ne "";
+push @args, ("--soapsnp", $soapsnp) if $soapsnp ne "";
+push @args, ("--bowtie", $bowtie) if $bowtie ne "";
+push @args, ("--samtools", $samtools) if $samtools ne "";
+push @args, "--verbose" if $verbose;
+push @args, "--test" if $test;
+
+# A reference is required unless we are only testing the installation or
+# only preprocessing reads.
+$ref ne "" || $test || $justPreprocess || die "Must specify --reference\n";
+
+push @args, @ARGV;
+
+CrossbowIface::crossbow(\@args, $SCRIPT, $usage, undef, undef, undef, undef);
diff --git a/cb_local b/cb_local
new file mode 100755
index 0000000..2526cb8
--- /dev/null
+++ b/cb_local
@@ -0,0 +1,249 @@
+#!/usr/bin/perl -w
+
+##
+# Author: Ben Langmead
+# Date: March 17, 2010
+#
+# Use perl Hadoop wrappers to initiate a local Crossbow computation
+# similar to the Hadoop version (but without using Hadoop or Java).
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use FindBin qw($Bin);
+use lib $Bin;
+use CrossbowIface;
+use Cwd 'abs_path';
+Getopt::Long::Configure ("pass_through", "no_auto_abbrev", "permute", "prefix_pattern=(--|-)");
+
+my $APP = "Crossbow";
+my $app = lc $APP;
+my $SCRIPT = "cb_local";
+my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
+
+my $usage = qq{
+$SCRIPT: Run $APP v$VERSION on a single computer
+
+Usage: perl $SCRIPT --input <url> --output <url> \
+ [--reference <url> | --just-preprocess ] [options]
+
+Options (defaults in []):
+
+ Job params:
+
+ --input <path> Local path to input. Typically points to a directory
+ containing preprocessed reads, but if --preprocess or
+ --just-preprocess are enabled, points to a manifest
+ file. If --resume-* is specified, <path> is the
+ directory with the intermediate results to be resumed.
+ --output <path> Path to store final output
+ --intermediate <path> Path to store intermediate output
+ [/tmp/$APP/intermediate]
+ --dry-run Produce and print path to a script for running the
+ local job, but don't run it.
+ --bowtie <path> Path to bowtie binary [search BOWTIE_HOME, PATH]
+ --soapsnp <path> Path to soapsnp binary [search SOAPSNP_HOME, PATH]
+
+ $APP params:
+
+ --reference <path> Path to directory with reference information (must be
+ local, with "index", "sequences", "snps" subdirs)
+ --just-align Don't do SNP calling; --output will contain alignments
+ --bowtie-args "<args>" Arguments for Bowtie [-M 1] (Note: --partition --mm -t
+ --hadoopout --startverbose are always set by Crossbow)
+ --ss-args "<args>" Arguments for SOAPsnp [-2 -u -n -q] (Note: -i -d -o -s
+ -z -L -T are always set by Crossbow)
+ --ss-hap-args "<args>" Additional SOAPsnp arguments when reference is haploid
+ [-r 0.0001] (Note: -m is always set by Crossbow)
+ --ss-dip-args "<args>" Additional SOAPsnp arguments when reference is diploid
+ [-r 0.00005 -e 0.0001]
+ --haploids "<chrs>" Comma-separated names of references to be considered
+ haploid. Others are considered diploid. [None]
+ --all-haploids Consider all chromosomes to be haploid when calling
+ SNPs. [All diploid]
+ --quality <type> Encoding for sequence quality values; one of: phred33,
+ phred64, solexa64 [phred33]
+ --discard-reads <frac> Randomly discard specified fraction of input reads.
+ [off]
+ --truncate <int> Truncate reads longer than <int> bases to <int> bases
+ by trimming from the 3' end.
+ --truncate-discard <int> Same as --truncate except that reads shorter than
+ <int> bases are discarded.
+ --partition-len <int> Partition length in bases [1 million]
+
+ Preprocessing params (not necessary if --input contains preprocessed reads):
+
+ --preprocess --input URL is a manifest file describing a set of
+ unpreprocessed, FASTQ read files; preprocess them
+ before running $APP [off]
+ --just-preprocess Like --preprocess but $APP isn't run; --output
+ contains preprocessed reads [off]
+ --pre-output <url> If --preprocess is on, put preprocessed output here
+ instead of in the intermediate directory [off]. Has
+ no effect if --just-preprocess is specified (--output
+ is used instead). Useful if future jobs use same
+ input.
+ --pre-compress <type> Compression type; one of: gzip, none [gzip]
+ --pre-stop <int> Stop preprocessing after <int> reads/mates [no limit]
+ --pre-filemax <int> Split preprocessed output such that there are no more
+ than <int> reads/mates per preprocessed read file;
+ 0 = no limit. [500,000]
+
+ Other params:
+
+ --cpus <int> # CPUs to use (in parallel) for this job [1].
+ --max-sort-records <int> Maximum # records dispatched to sort routine at one
+ time when sorting bins before reduce step. In each
+ child process, this number is effectively divided by
+ --cpus. [200000]
+ --max-sort-files <int> Maximum # total files opened by sort routines. In
+ each child process, this number is effectively divided
+ by --cpus. [40]
+ --dont-overwrite Abort if an intermediate or output directory already
+ exists, instead of removing it [off]
+ --tempdir <path> Put temporary scripts in <path>
+ [/tmp/$APP/invoke.scripts]
+ (umask 0077 used to protect credentials)
+
+};
+
+# Let the user write --bowtie-args "-n 3 -l 35" without an equals sign:
+# any argument of the form -...-args (other than the last one) absorbs
+# the argument that follows it, becoming <option>="<value>".
+my $argIdx = 0;
+while($argIdx < $#ARGV) {
+    if($ARGV[$argIdx] =~ /^-.*-args$/) {
+        my ($value) = splice(@ARGV, $argIdx + 1, 1);
+        $ARGV[$argIdx] .= "=\"$value\"";
+    }
+    $argIdx++;
+}
+
+# Path-valued options start out empty, flags start out off.
+my ($input, $output, $intermediate, $ref) = ("", "", "", "");
+my ($index, $sequences, $snps, $cmap) = ("", "", "", "");
+my ($justPreprocess, $verbose, $test) = (0, 0, 0);
+
+##
+# Print the usage message followed by an error message to stderr, then
+# exit with status $lev.
+#
+sub dieusage($$$) {
+    my ($text, $usage, $lev) = @_;
+    print STDERR "$usage\nError:\n$text\n\n";
+    exit $lev;
+}
+
+# Parse the options this wrapper inspects itself.  Getopt::Long is
+# configured with pass_through, so anything not listed stays in @ARGV.
+my @optionSpec = (
+    "input:s"         => \$input,
+    "output:s"        => \$output,
+    "intermediate:s"  => \$intermediate,
+    "reference:s"     => \$ref,
+    "index:s"         => \$index,
+    "sequences:s"     => \$sequences,
+    "snps:s"          => \$snps,
+    "cmap:s"          => \$cmap,
+    "just-preprocess" => \$justPreprocess,
+    "test"            => \$test,
+    "verbose"         => \$verbose,
+);
+GetOptions(@optionSpec) || die "Error parsing options";
+
+##
+# Take a path and make it absolute.  If it has a protocol (s3:/, s3n:/,
+# hdfs:/ or file:/), assume it's already absolute and return it as is.
+#
+#   $path   path to convert; a leading "~" or "~/" is replaced by $HOME
+#           ("~user" forms are left alone)
+#   $check  if true, die unless $path is an existing file or directory
+#   $name   option name used in the error message
+#
+# Always returns a defined path: when abs_path() cannot resolve $path
+# (for instance an output directory whose parents don't exist yet) the
+# path is made absolute textually instead.
+#
+sub absPath($$$) {
+    my ($path, $check, $name) = @_;
+    return $path if $path =~ /^s3n?:\//i;
+    return $path if $path =~ /^hdfs:\//i;
+    return $path if $path =~ /^file:\//i;
+    # Expand only a bare "~"; "~user" must not become "$HOME" . "user"
+    $path =~ s{^~(?=/|\z)}{$ENV{HOME}} if defined($ENV{HOME});
+    die "Error: $name path doesn't exist: $path" unless (!$check || -f $path || -d $path);
+    my $abs = eval { abs_path($path) };
+    unless(defined($abs)) {
+        require File::Spec;
+        $abs = File::Spec->rel2abs($path);
+    }
+    return $abs;
+}
+
+# With --verbose, print every path option under the given heading.
+my $reportPaths = sub {
+    my ($heading) = @_;
+    return unless $verbose;
+    print STDERR "$heading\n";
+    print STDERR " input: $input\n";
+    print STDERR " output: $output\n";
+    print STDERR " intermediate: $intermediate\n";
+    print STDERR " reference: $ref\n";
+    print STDERR " index: $index\n";
+    print STDERR " sequences: $sequences\n";
+    print STDERR " snps: $snps\n";
+    print STDERR " cmap: $cmap\n";
+};
+
+$reportPaths->("Relative paths:");
+
+# Make every path that was given absolute.  The middle element says
+# whether the path must already exist; outputs need not.
+for my $spec (
+    [\$input,        1, "--input"],
+    [\$output,       0, "--output"],
+    [\$intermediate, 0, "--intermediate"],
+    [\$index,        1, "--index"],
+    [\$ref,          1, "--reference"],
+    [\$sequences,    1, "--sequences"],
+    [\$snps,         1, "--snps"],
+    [\$cmap,         1, "--cmap"])
+{
+    my ($var, $mustExist, $flag) = @$spec;
+    $$var = absPath($$var, $mustExist, $flag) if $$var ne "";
+}
+
+$reportPaths->("Absolute paths:");
+
+# --input and --output are mandatory unless we are only testing.
+if(!$test) {
+    dieusage("Must specify --input", $usage, 1) if $input eq "";
+    dieusage("Must specify --output", $usage, 1) if $output eq "";
+}
+
+# Rebuild the argument list for CrossbowIface, using the "-local"
+# flavours of the path options.
+my @args = ("--local-job");
+push @args, ("--input-local", $input) if $input ne "";
+push @args, ("--output-local", $output) if $output ne "";
+push @args, ("--intermediate-local", $intermediate) if $intermediate ne "";
+push @args, "--verbose" if $verbose;
+
+if($ref ne "") {
+    # --reference names a directory with a fixed layout; derive the four
+    # individual locations from it (replacing any given separately).
+    ($index, $sequences, $snps, $cmap) =
+        ("$ref/index", "$ref/sequences", "$ref/snps", "$ref/cmap.txt");
+    for my $need (["Index", $index], ["Sequences", $sequences], ["SNPs", $snps]) {
+        my ($label, $subdir) = @$need;
+        -d $subdir || die "$label directory under --reference directory, $subdir doesn't exist\n";
+    }
+    -f $cmap || die "cmap files under --reference directory, $cmap doesn't exist\n";
+    # Exactly one index must be present; its basename is whatever
+    # precedes ".rev.1.ebwt".
+    my @indexes = glob("$index/*.rev.1.ebwt");
+    die "Error: there seems to be more than one index in $index\n" if scalar(@indexes) > 1;
+    die "Error: no index found in $index\n" if scalar(@indexes) == 0;
+    my $indexBase = (split(/\//, $indexes[0]))[-1];
+    $indexBase =~ s/\.rev\.1\.ebwt$//;
+    push @args, (
+        "--index-local",     "$index/$indexBase",
+        "--sequences-local", $sequences,
+        "--snps-local",      $snps,
+        "--cmap-local",      $cmap);
+} else {
+    push @args, ("--index-local", $index) if $index ne "";
+    push @args, ("--sequences-local", $sequences) if $sequences ne "";
+    push @args, ("--cmap-local", $cmap) if $cmap ne "";
+    push @args, ("--snps-local", $snps) if $snps ne "";
+}
+
+# Reference data is required unless we are only preprocessing reads or
+# only testing the installation.
+unless($justPreprocess || $test) {
+    die "Must specify --reference or --index\n" if $index eq "";
+    die "Must specify --reference or --sequences\n" if $sequences eq "";
+    die "Must specify --reference or --snps\n" if $snps eq "";
+    die "Must specify --reference or --cmap\n" if $cmap eq "";
+}
+
+push @args, "--test" if $test;
+push @args, "--just-preprocess" if $justPreprocess;
+push @args, @ARGV;
+
+CrossbowIface::crossbow(\@args, $SCRIPT, $usage, undef, undef, undef, undef);
diff --git a/contrib/ForkManager.pm b/contrib/ForkManager.pm
new file mode 100644
index 0000000..331c59c
--- /dev/null
+++ b/contrib/ForkManager.pm
@@ -0,0 +1,412 @@
+=head1 NAME
+
+Parallel::ForkManager - A simple parallel processing fork manager
+
+=head1 SYNOPSIS
+
+ use Parallel::ForkManager;
+
+ $pm = new Parallel::ForkManager($MAX_PROCESSES);
+
+ foreach $data (@all_data) {
+ # Forks and returns the pid for the child:
+ my $pid = $pm->start and next;
+
+ ... do some work with $data in the child process ...
+
+ $pm->finish; # Terminates the child process
+ }
+
+=head1 DESCRIPTION
+
+This module is intended for use in operations that can be done in parallel
+where the number of processes to be forked off should be limited. Typical
+use is a downloader which will be retrieving hundreds/thousands of files.
+
+The code for a downloader would look something like this:
+
+ use LWP::Simple;
+ use Parallel::ForkManager;
+
+ ...
+
+ @links=(
+ ["http://www.foo.bar/rulez.data","rulez_data.txt"],
+ ["http://new.host/more_data.doc","more_data.doc"],
+ ...
+ );
+
+ ...
+
+ # Max 30 processes for parallel download
+ my $pm = new Parallel::ForkManager(30);
+
+ foreach my $linkarray (@links) {
+ $pm->start and next; # do the fork
+
+ my ($link,$fn) = @$linkarray;
+ warn "Cannot get $fn from $link"
+ if getstore($link,$fn) != RC_OK;
+
+ $pm->finish; # do the exit in the child process
+ }
+ $pm->wait_all_children;
+
+First you need to instantiate the ForkManager with the "new" constructor.
+You must specify the maximum number of processes to be created. If you
+specify 0, then NO fork will be done; this is good for debugging purposes.
+
+Next, use $pm->start to do the fork. $pm->start returns 0 for the child process,
+and child pid for the parent process (see also L<perlfunc(1p)/fork()>).
+The "and next" skips the internal loop in the parent process. NOTE:
+$pm->start dies if the fork fails.
+
+$pm->finish terminates the child process (assuming a fork was done in the
+"start").
+
+NOTE: You cannot use $pm->start if you are already in the child process.
+If you want to manage another set of subprocesses in the child process,
+you must instantiate another Parallel::ForkManager object!
+
+=head1 METHODS
+
+=over 5
+
+=item new $processes
+
+Instantiate a new Parallel::ForkManager object. You must specify the maximum
+number of children to fork off. If you specify 0 (zero), then no children
+will be forked. This is intended for debugging purposes.
+
+=item start [ $process_identifier ]
+
+This method does the fork. It returns the pid of the child process for
+the parent, and 0 for the child process. If the $processes parameter
+for the constructor is 0 then, assuming you're in the child process,
+$pm->start simply returns 0.
+
+An optional $process_identifier can be provided to this method... It is used by
+the "run_on_finish" callback (see CALLBACKS) for identifying the finished
+process.
+
+=item finish [ $exit_code ]
+
+Closes the child process by exiting and accepts an optional exit code
+(default exit code is 0) which can be retrieved in the parent via callback.
+If you use the program in debug mode ($processes == 0), this method doesn't
+do anything.
+
+=item set_max_procs $processes
+
+Allows you to set a new maximum number of children to maintain. Returns
+the new setting.
+
+=item wait_all_children
+
+You can call this method to wait for all the processes which have been
+forked. This is a blocking wait.
+
+=back
+
+=head1 CALLBACKS
+
+You can define callbacks in the code, which are called on events like starting
+a process or upon finish.
+
+The callbacks can be defined with the following methods:
+
+=over 4
+
+=item run_on_finish $code [, $pid ]
+
+You can define a subroutine which is called when a child is terminated. It is
+called in the parent process.
+
+The parameters of the $code are the following:
+
+ - pid of the process, which is terminated
+ - exit code of the program
+ - identification of the process (if provided in the "start" method)
+ - exit signal (0-127: signal name)
+ - core dump (1 if there was core dump at exit)
+
+=item run_on_start $code
+
+You can define a subroutine which is called when a child is started. It is called
+after the successful startup of a child in the parent process.
+
+The parameters of the $code are the following:
+
+ - pid of the process which has been started
+ - identification of the process (if provided in the "start" method)
+
+=item run_on_wait $code, [$period]
+
+You can define a subroutine which is called when the child process needs to wait
+for the startup. If $period is not defined, then one call is done per
+child. If $period is defined, then $code is called periodically and the
+module waits for $period seconds between the two calls. Note, $period can be
+fractional number also. The exact "$period seconds" is not guaranteed,
+signals can shorten and the process scheduler can make it longer (on busy
+systems).
+
+The $code is also called in the "start" and the "wait_all_children" methods.
+
+No parameters are passed to the $code on the call.
+
+=back
+
+=head1 EXAMPLE
+
+=head2 Parallel get
+
+This small example can be used to get URLs in parallel.
+
+ use Parallel::ForkManager;
+ use LWP::Simple;
+ my $pm=new Parallel::ForkManager(10);
+ for my $link (@ARGV) {
+ $pm->start and next;
+ my ($fn)= $link =~ /^.*\/(.*?)$/;
+ if (!$fn) {
+ warn "Cannot determine filename from $link\n";
+ } else {
+ $0.=" ".$fn;
+ print "Getting $fn from $link\n";
+ my $rc=getstore($link,$fn);
+ print "$link downloaded. response code: $rc\n";
+ };
+ $pm->finish;
+ };
+
+=head2 Callbacks
+
+Example of a program using callbacks to get child exit codes:
+
+ use strict;
+ use Parallel::ForkManager;
+
+ my $max_procs = 5;
+ my @names = qw( Fred Jim Lily Steve Jessica Bob Dave Christine Rico Sara );
+ # hash to resolve PID's back to child specific information
+
+ my $pm = new Parallel::ForkManager($max_procs);
+
+ # Setup a callback for when a child finishes up so we can
+ # get its exit code
+ $pm->run_on_finish(
+ sub { my ($pid, $exit_code, $ident) = @_;
+ print "** $ident just got out of the pool ".
+ "with PID $pid and exit code: $exit_code\n";
+ }
+ );
+
+ $pm->run_on_start(
+ sub { my ($pid,$ident)=@_;
+ print "** $ident started, pid: $pid\n";
+ }
+ );
+
+ $pm->run_on_wait(
+ sub {
+ print "** Have to wait for one children ...\n"
+ },
+ 0.5
+ );
+
+ foreach my $child ( 0 .. $#names ) {
+ my $pid = $pm->start($names[$child]) and next;
+
+ # This code is the child process
+ print "This is $names[$child], Child number $child\n";
+ sleep ( 2 * $child );
+ print "$names[$child], Child $child is about to get out...\n";
+ sleep 1;
+ $pm->finish($child); # pass an exit code to finish
+ }
+
+ print "Waiting for Children...\n";
+ $pm->wait_all_children;
+ print "Everybody is out of the pool!\n";
+
+=head1 BUGS AND LIMITATIONS
+
+Do not use Parallel::ForkManager in an environment, where other child
+processes can affect the run of the main program, so using this module
+is not recommended in an environment where fork() / wait() is already used.
+
+If you want to use more than one copy of the Parallel::ForkManager, then
+you have to make sure that all children processes are terminated, before you
+use the second object in the main program.
+
+You are free to use a new copy of Parallel::ForkManager in the child
+processes, although I don't think it makes sense.
+
+=head1 COPYRIGHT
+
+Copyright (c) 2000 Szabó, Balázs (dLux)
+
+All rights reserved. This program is free software; you can redistribute it
+and/or modify it under the same terms as Perl itself.
+
+=head1 AUTHOR
+
+ dLux (Szabó, Balázs) <dlux at kapu.hu>
+
+=head1 CREDITS
+
+ Noah Robin <sitz at onastick.net> (documentation tweaks)
+ Chuck Hirstius <chirstius at megapathdsl.net> (callback exit status, example)
+ Grant Hopwood <hopwoodg at valero.com> (win32 port)
+ Mark Southern <mark_southern at merck.com> (bugfix)
+
+=cut
+
+package Parallel::ForkManager;
+use POSIX ":sys_wait_h";
+use strict;
+use vars qw($VERSION);
+$VERSION='0.7.5';
+
+# Constructor. $limit is the maximum number of children that may run at the
+# same time (0 means "never fork"). May be invoked on a class name or on an
+# existing object, in which case the new object shares that object's class.
+sub new {
+    my ($invocant, $limit) = @_;
+    my $class = ref($invocant) || $invocant;
+
+    return bless {
+        max_proc  => $limit,
+        processes => {},    # pid => identification of every live child
+        in_child  => 0,     # set to 1 inside a forked child
+    }, $class;
+};
+
+# Fork a new child, first waiting until the pool has a free slot.
+# Returns the child's pid in the parent and 0 in the child. When max_proc is
+# 0 no fork happens: the current process is registered under its own pid and
+# 0 is returned, as if we were the child. Dies when called from inside a
+# child or when fork() fails.
+sub start {
+    my ($self, $ident) = @_;
+
+    if ($self->{in_child}) {
+        die "Cannot start another process while you are in the child process";
+    }
+
+    # Pool is full: run the wait callback and reap until a slot frees up.
+    while ($self->{max_proc}
+        && scalar(keys %{ $self->{processes} }) >= $self->{max_proc}) {
+        $self->on_wait;
+        $self->wait_one_child(defined $self->{on_wait_period} ? WNOHANG() : undef);
+    }
+
+    # Opportunistically collect any other children that already exited.
+    $self->wait_children;
+
+    unless ($self->{max_proc}) {
+        $self->{processes}->{$$} = $ident;
+        $self->on_start($$, $ident);
+        return 0; # Simulating the child which returns 0
+    }
+
+    my $pid = fork();
+    die "Cannot fork: $!" unless defined $pid;
+
+    if ($pid) {
+        # Parent: remember the child and fire the start callback.
+        $self->{processes}->{$pid} = $ident;
+        $self->on_start($pid, $ident);
+    }
+    else {
+        # Child: forbid nested start() calls on this object.
+        $self->{in_child} = 1;
+    }
+    return $pid;
+}
+
+# End the unit of work begun by start().
+# In a forked child this exits the process with $exit_code (default 0).
+# In debug mode (max_proc == 0) it fires the finish callback for the current
+# pid with that code, signal 0 and no core dump, then forgets the pid.
+# Returns 0 whenever it returns at all.
+sub finish {
+    my ($self, $exit_code) = @_;
+
+    exit($exit_code || 0) if $self->{in_child};
+
+    return 0 unless $self->{max_proc} == 0;
+
+    my $ident = $self->{processes}->{$$};
+    $self->on_finish($$, $exit_code, $ident, 0, 0);
+    delete $self->{processes}->{$$};
+
+    return 0;
+}
+
+# Reap, without blocking, every tracked child that has already exited.
+# Does nothing when no children are being tracked.
+sub wait_children {
+    my ($self) = @_;
+    return unless keys %{ $self->{processes} };
+
+    my $reaped;
+    do {
+        $reaped = $self->wait_one_child(WNOHANG());
+    } until $reaped <= 0 && $reaped >= -1; # AS 5.6/Win32 returns negative PIDs
+};
+
+# Wait for one child that belongs to this manager.
+# $flags is handed to waitpid (defaults to 0, i.e. a blocking wait). Pids that
+# are not in the pool are ignored and the wait is repeated. For a pool member
+# the entry is removed and the finish callback receives
+# (pid, exit code, identification, signal number, core-dump flag).
+# Returns the reaped pid, or 0 / -1 exactly as reported by waitpid.
+sub wait_one_child {
+    my ($self, $flags) = @_;
+    $flags ||= 0;
+
+    my $pid;
+    for (;;) {
+        $pid = $self->_waitpid(-1, $flags);
+        last if $pid == 0 || $pid == -1; # AS 5.6/Win32 returns negative PIDs
+
+        # Somebody else's child: keep waiting.
+        next unless exists $self->{processes}->{$pid};
+
+        my $ident  = delete $self->{processes}->{$pid};
+        my $status = $?;
+        $self->on_finish(
+            $pid,
+            $status >> 8,
+            $ident,
+            $status & 0x7f,
+            ($status & 0x80) ? 1 : 0,
+        );
+        last;
+    }
+    return $pid;
+};
+
+# Wait until every tracked child has been reaped. Before each wait the
+# run_on_wait callback is given a chance to run; when a wait period is
+# configured the wait itself is non-blocking, otherwise it blocks.
+sub wait_all_children {
+    my ($self) = @_;
+
+    until (!keys %{ $self->{processes} }) {
+        $self->on_wait;
+        my $flags = defined $self->{on_wait_period} ? WNOHANG() : undef;
+        $self->wait_one_child($flags);
+    }
+}
+
+# Register $callback to be run when a child terminates. With a $pid the
+# callback applies to that child only; without one it is stored under key 0,
+# which on_finish uses as the fallback for every child.
+sub run_on_finish {
+    my ($self, $callback, $pid) = @_;
+    my $slot = $pid || 0;
+    return $self->{on_finish}->{$slot} = $callback;
+}
+
+# Dispatch the finish callback for child $pid.
+# A callback registered for that specific pid wins; otherwise the general one
+# (stored under key 0) is used. The callback is called with ($pid, @par) and
+# its result is returned. Returns 0 when no callback is registered.
+sub on_finish {
+    my ($s, $pid, @par) = @_;
+    my $code = $s->{on_finish}->{$pid} || $s->{on_finish}->{0} or return 0;
+    $code->($pid, @par);
+};
+
+# Register the callback that runs while the manager is waiting for a child,
+# together with an optional polling interval in seconds.
+sub run_on_wait {
+    my ($self, $callback, $interval) = @_;
+    $self->{on_wait} = $callback;
+    return $self->{on_wait_period} = $interval;
+}
+
+# Run the wait callback, if one is registered, and then sleep for the
+# configured wait period (when there is one).
+sub on_wait {
+    my ($self) = @_;
+    my $callback = $self->{on_wait};
+
+    if (ref($callback) eq 'CODE') {
+        $callback->();
+
+        my $pause = $self->{on_wait_period};
+        if (defined $pause) {
+            # Install a no-op CHLD handler when none is set; presumably so
+            # that a terminating child interrupts the sleep below.
+            local $SIG{CHLD} = sub { } unless defined $SIG{CHLD};
+            select(undef, undef, undef, $pause);
+        }
+    }
+};
+
+# Register the callback that the parent runs right after a child is started.
+sub run_on_start {
+    my ($self, $callback) = @_;
+    return $self->{on_start} = $callback;
+}
+
+# Invoke the start callback, if one is registered, forwarding every argument
+# after the object itself (start() passes the pid and the identification).
+sub on_start {
+    my ($s, @par) = @_;
+    $s->{on_start}->(@par) if ref($s->{on_start}) eq 'CODE';
+};
+
+# Change the maximum number of simultaneously running children.
+# Returns the value just stored.
+sub set_max_procs {
+    my ($self, $limit) = @_;
+    return $self->{max_proc} = $limit;
+}
+
+# OS dependent code follows...
+
+# Default implementation: call waitpid() in the standard Unix fashion.
+# The object itself (first argument) is not used.
+sub _waitpid {
+    my (undef, $pid, $flags) = @_;
+    return waitpid($pid, $flags);
+}
+
+# On ActiveState Perl 5.6/Win32 build 625, waitpid(-1, &WNOHANG) always
+# blocks unless an actual PID other than -1 is given.
+#
+# Replacement for _waitpid on that platform. A blocking request is passed
+# straight to waitpid; a non-blocking one polls every pid in the pool in turn
+# and returns the first non-zero answer (or -1 when the pool is empty).
+sub _NT_waitpid {
+    my ($self, $pid, $flags) = @_;
+
+    # Normal waitpid() call.
+    return waitpid($pid, $flags) unless $flags == WNOHANG();
+
+    # Need to nonblock on each of our PIDs in the pool.
+    my @pool = keys %{ $self->{processes} };
+
+    # Simulate -1 (no processes awaiting cleanup.)
+    return -1 if !@pool;
+
+    # Check each PID in the pool.
+    my $result;
+    for my $candidate (@pool) {
+        $result = waitpid($candidate, $flags);
+        last if $result != 0; # AS 5.6/Win32 returns negative PIDs.
+    }
+    return $result;
+}
+
+# On Windows ('NT' or 'MSWin32' in $^O) replace _waitpid with the polling
+# implementation _NT_waitpid by assigning to the glob.
+# $^W is zeroed for the duration of this block, presumably to silence the
+# "subroutine redefined" warning from the glob assignment.
+{
+ local $^W = 0;
+ if ($^O eq 'NT' or $^O eq 'MSWin32') {
+ *_waitpid = \&_NT_waitpid;
+ }
+}
+
+1;
diff --git a/contrib/Sort.pm b/contrib/Sort.pm
new file mode 100644
index 0000000..57a36f1
--- /dev/null
+++ b/contrib/Sort.pm
@@ -0,0 +1,1081 @@
+package File::Sort;
+use Carp;
+use Fcntl qw(O_RDONLY O_WRONLY O_CREAT O_TRUNC);
+use Symbol qw(gensym);
+use strict;
+use locale;
+use vars qw($VERSION *sortsub *sort1 *sort2 *map1 *map2 %fh);
+
+# Exporter wiring: sort_file is the only symbol available for import.
+require Exporter;
+use vars qw(@ISA @EXPORT_OK);
+@ISA = 'Exporter';
+@EXPORT_OK = 'sort_file';
+$VERSION = '1.01';
+
+# Public entry point. Two calling conventions are accepted:
+#
+#   sort_file($input, $output)   - plain file names
+#   sort_file(\%options)         - option hash (see the POD)
+#
+# In the hash form, obsolete "+pos1 -pos2" specifications given under 'pos'
+# are translated to the equivalent 'k' key definitions and appended to any
+# 'k' values already present. The result of _sort_file is returned.
+sub sort_file {
+    my @args = @_;
+
+    unless (ref $args[0]) {
+        return _sort_file({I => $args[0], o => $args[1]});
+    }
+
+    my $opts = $args[0];
+
+    # fix pos to look like k
+    if (exists $opts->{'pos'}) {
+        my $specs = $opts->{'pos'};
+        $specs = [$specs] unless ref $specs;
+
+        # make sure 'k' is an array reference we can push onto
+        if (!exists $opts->{'k'}) {
+            $opts->{'k'} = [];
+        }
+        elsif (!ref $opts->{'k'}) {
+            $opts->{'k'} = [$opts->{'k'}];
+        }
+
+        for my $spec (@$specs) {
+            # specifications that do not parse are silently skipped
+            next unless $spec =~ /^\+(\d+)(?:\.(\d+))?([bdfinr]+)?
+                (?:\s+\-(\d+)(?:\.(\d+))?([bdfinr]+)?)?$/x;
+
+            # pos counts from 0, k counts from 1
+            my $keydef = $1 + 1;
+            $keydef .= '.' . ($2 + 1) if defined $2;
+            $keydef .= $3 if $3;
+
+            if (defined $4) {
+                my $end = defined $5 ? ($4 + 1) . ".$5" : $4;
+                $keydef .= "," . $end;
+                $keydef .= $6 if $6;
+            }
+            push @{ $opts->{'k'} }, $keydef;
+        }
+    }
+
+    return _sort_file(@args);
+}
+
+# Worker behind sort_file(). Takes one options hashref and, depending on the
+# flags in it, does one of three things:
+#   c - only checks that the first input file is sorted (and, with u, that no
+#       two consecutive records have equal keys); returns 1 or 0
+#   m - opens every input file and merges them, assuming they are sorted
+#   otherwise - reads every input, writing a sorted temp file each time 'y'
+#       records have accumulated, then merges temp files and leftover records
+# The merged result goes to the 'o' output (STDOUT when empty). Returns 1 on
+# success and dies when a file cannot be opened.
+sub _sort_file {
+ local $\; # don't mess up our prints
+ my($opts, @fh, @recs) = shift;
+
+ # record separator, default to \n
+ local $/ = $opts->{R} ? $opts->{R} : "\n";
+
+ # get input files into anon array if not already
+ $opts->{I} = [$opts->{I}] unless ref $opts->{I};
+
+ usage() unless @{$opts->{I}};
+
+ # "K" == "no k", for later
+ $opts->{K} = $opts->{k} ? 0 : 1;
+ $opts->{k} = $opts->{k} ? [$opts->{k}] : [] if !ref $opts->{k};
+
+ # set output and other defaults
+ $opts->{o} = !$opts->{o} ? '' : $opts->{o};
+ $opts->{'y'} ||= $ENV{MAX_SORT_RECORDS} || 200000; # default max records
+ $opts->{F} ||= $ENV{MAX_SORT_FILES} || 40; # default max files
+
+
+ # see big ol' mess below
+ # (installs the sort1/sort2 or sortsub/map1/map2 subs used from here on)
+ _make_sort_sub($opts);
+
+ # only check to see if file is sorted
+ if ($opts->{c}) {
+ local *F;
+ my $last;
+
+ # '-' goes through two-argument open, every other name through sysopen
+ if ($opts->{I}[0] eq '-') {
+ open(F, $opts->{I}[0])
+ or die "Can't open `$opts->{I}[0]' for reading: $!";
+ } else {
+ sysopen(F, $opts->{I}[0], O_RDONLY)
+ or die "Can't open `$opts->{I}[0]' for reading: $!";
+ }
+
+ while (defined(my $rec = <F>)) {
+ # fail if -u and keys are not unique (assume sorted)
+ if ($opts->{u} && $last) {
+ return 0 unless _are_uniq($opts->{K}, $last, $rec);
+ }
+
+ # fail if records not in proper sort order
+ if ($last) {
+ my @foo;
+ # sort the pair (new record first) and see whether the previous
+ # record still comes out ahead of the new one
+ if ($opts->{K}) {
+ local $^W;
+ @foo = sort sort1 ($rec, $last);
+ } else {
+ local $^W;
+ @foo = map {$_->[0]} sort sortsub
+ map &map1, ($rec, $last);
+ }
+ return 0 if $foo[0] ne $last || $foo[1] ne $rec;
+ }
+
+ # save value of last record
+ $last = $rec;
+ }
+
+ # success, yay
+ return 1;
+
+ # if merging sorted files
+ } elsif ($opts->{'m'}) {
+
+ foreach my $filein (@{$opts->{I}}) {
+
+ # just open files and get array of handles
+ my $sym = gensym();
+
+ sysopen($sym, $filein, O_RDONLY)
+ or die "Can't open `$filein' for reading: $!";
+
+ push @fh, $sym;
+ }
+
+ # ooo, get ready, get ready
+ } else {
+
+ # once for each input file
+ foreach my $filein (@{$opts->{I}}) {
+ local *F;
+ my $count = 0;
+
+ _debug("Sorting file $filein ...\n") if $opts->{D};
+
+ if ($filein eq '-') {
+ open(F, $filein)
+ or die "Can't open `$filein' for reading: $!";
+ } else {
+ sysopen(F, $filein, O_RDONLY)
+ or die "Can't open `$filein' for reading: $!";
+ }
+
+ while (defined(my $rec = <F>)) {
+ push @recs, $rec;
+ $count++; # keep track of number of records
+
+ if ($count >= $opts->{'y'}) { # don't go over record limit
+
+ _debug("$count records reached in `$filein'\n")
+ if $opts->{D};
+
+ # save to temp file, add new fh to array
+ push @fh, _write_temp(\@recs, $opts);
+
+ # reset record count and record array
+ ($count, @recs) = (0);
+
+ # do a merge now if at file limit
+ if (@fh >= $opts->{F}) {
+
+ # get filehandle and restart array with it
+ @fh = (_merge_files($opts, \@fh, [], _get_temp()));
+
+ _debug("\nCreating temp files ...\n") if $opts->{D};
+ }
+ }
+ }
+
+ close F;
+ }
+
+ # records leftover, didn't reach record limit
+ if (@recs) {
+ _debug("\nSorting leftover records ...\n") if $opts->{D};
+ _check_last(\@recs);
+ if ($opts->{K}) {
+ local $^W;
+ @recs = sort sort1 @recs;
+ } else {
+ local $^W;
+ @recs = map {$_->[0]} sort sortsub map &map1, @recs;
+ }
+ }
+ }
+
+ # do the merge thang, uh huh, do the merge thang
+ my $close = _merge_files($opts, \@fh, \@recs, $opts->{o});
+ close $close unless fileno($close) == fileno('STDOUT'); # don't close STDOUT
+
+ _debug("\nDone!\n\n") if $opts->{D};
+ return 1; # yay
+}
+
+# take optional arrayref of handles of sorted files,
+# plus optional arrayref of sorted scalars
+#
+# Merges them into $file, which may be '' (STDOUT), a path (opened for
+# writing, truncating) or an already open handle. The package hash %fh holds
+# the current head record of every source, keyed by an arbitrary number per
+# handle and by 'X' for the in-memory records; the smallest head is printed
+# and then replaced by that source's next record until all are exhausted.
+# With the u option a record is printed only when its key differs from the
+# previously considered record. Returns the output handle, rewound to the
+# start.
+sub _merge_files {
+ # we need the options, filehandles, and output file
+ my($opts, $fh, $recs, $file) = @_;
+ my($uniq, $first, $o, %oth);
+
+ # arbitrarily named keys, store handles as values
+ %oth = map {($o++ => $_)} @$fh;
+
+ # match handle key in %oth to next record of the handle
+ %fh = map {
+ my $fh = $oth{$_};
+ ($_ => scalar <$fh>);
+ } keys %oth;
+
+ # extra records, special X "handle"
+ $fh{X} = shift @$recs if @$recs;
+
+ _debug("\nCreating sorted $file ...\n") if $opts->{D};
+
+ # output to STDOUT if no output file provided
+ if ($file eq '') {
+ $file = \*STDOUT;
+
+ # if output file is a path, not a reference to a file, open
+ # file and get a reference to it
+ } elsif (!ref $file) {
+ my $tfh = gensym();
+ sysopen($tfh, $file, O_WRONLY|O_CREAT|O_TRUNC)
+ or die "Can't open `$file' for writing: $!";
+ $file = $tfh;
+ }
+
+ # make the output handle the default for the bare print calls below
+ my $oldfh = select $file;
+ $| = 0; # just in case, use the buffer, you knob
+
+ while (keys %fh) {
+ # don't bother sorting keys if only one key remains!
+ if (!$opts->{u} && keys %fh == 1) {
+ ($first) = keys %fh;
+ my $curr = $oth{$first};
+ my @left = $first eq 'X' ? @$recs : <$curr>;
+ print $fh{$first}, @left;
+ delete $fh{$first};
+ last;
+ }
+
+ {
+ # $first is arbitrary number assigned to first fh in sort
+ if ($opts->{K}) {
+ local $^W;
+ ($first) = (sort sort2 keys %fh);
+ } else {
+ local $^W;
+ ($first) = (map {$_->[0]} sort sortsub
+ map &map2, keys %fh);
+ }
+ }
+
+ # don't print if -u and not unique
+ if ($opts->{u}) {
+ print $fh{$first} if
+ (!$uniq || _are_uniq($opts->{K}, $uniq, $fh{$first}));
+ $uniq = $fh{$first};
+ } else {
+ print $fh{$first};
+ }
+
+ # get current filehandle
+ my $curr = $oth{$first};
+
+ # use @$recs, not filehandles, if key is X
+ my $rec = $first eq 'X' ? shift @$recs : scalar <$curr>;
+
+ if (defined $rec) { # bring up next record for this filehandle
+ $fh{$first} = $rec;
+
+ } else { # we don't need you anymore
+ delete $fh{$first};
+ }
+ }
+
+ seek $file, 0, 0; # might need to read back from it
+ select $oldfh;
+ return $file;
+}
+
+# Make sure the last record of @$recs ends with the current record separator
+# ($/), appending it when missing. The array is modified in place.
+#
+# The separator is compared literally. Interpolating it into a regex would
+# treat a custom separator such as '|' or '.' as a metacharacter, the test
+# would then always succeed, and the separator would never be appended.
+sub _check_last {
+    my ($recs) = @_;
+    return unless @$recs;
+
+    my $sep = $/;
+    return unless defined $sep && length $sep;
+
+    # add new record separator if not one there
+    my $len  = length $sep;
+    my $last = $recs->[-1];
+    my $tail = length($last) >= $len ? substr($last, -$len) : '';
+    $recs->[-1] .= $sep if $tail ne $sep;
+}
+
+# Sort the records in @$records and write them to a fresh temporary file.
+# The last record is terminated with the record separator first. Returns the
+# temp file handle, rewound to the start. Dies if no temp file can be created.
+sub _write_temp {
+    my ($records, $options) = @_;
+    my $tmp_fh = _get_temp() or die "Can't get temp file: $!";
+
+    _check_last($records);
+
+    _debug("New tempfile: $tmp_fh\n") if $options->{D};
+
+    {
+        local $^W;
+        my @sorted;
+        if ($options->{K}) {
+            # no key definitions: compare whole records
+            @sorted = sort sort1 @{$records};
+        }
+        else {
+            # keyed sort: decorate with map1, sort, then undecorate
+            @sorted = map {$_->[0]} sort sortsub map &map1, @{$records};
+        }
+        print $tmp_fh @sorted;
+    }
+
+    seek $tmp_fh, 0, 0; # might need to read back from it
+    return $tmp_fh;
+}
+
+# Parse one 'k' key definition of the form
+#
+#   field_start[.first_char][type][,field_end[.last_char][type]]
+#
+# and return a new hashref holding every entry of %$topts plus:
+#   ksf / ksc / kst - start field, start char (both zero-based), start flags
+#   kff / kfc / kft - end field (zero-based, undef when absent), end char
+#                     (left as given), end flags
+# When the key carries no flags of its own, the global b d f i n r options
+# from %$topts are used; otherwise every flag except b found on one end is
+# applied to both ends.
+sub _parse_keydef {
+    my($k, $topts) = @_;
+
+    # Capture straight into lexicals. $1..$6 keep the values of the caller's
+    # last successful match when this match fails, so reading them unchecked
+    # would turn a malformed keydef into an arbitrary key. A failed match now
+    # leaves every piece undefined and the defaults below apply.
+    my($ksf, $ksc, $kst, $kff, $kfc, $kft) = $k =~
+        /^(\d+)(?:\.(\d+))?([bdfinr]+)?
+        (?:,(\d+)(?:\.(\d+))?([bdfinr]+)?)?$/x;
+
+    # set defaults at zero or undef
+    my %opts = (
+        %$topts,            # get other options
+        ksf => $ksf || 0,   # start field
+        ksc => $ksc || 0,   # start field char start
+        kst => $kst || '',  # start field type
+        kff => $kff,        # end field
+        kfc => $kfc || 0,   # end field char end
+        kft => $kft || '',  # end field type
+    );
+
+    # their idea of 1 is not ours
+    for (qw(ksf ksc kff)) { # kfc stays same
+        $opts{$_}-- if $opts{$_};
+    }
+
+    # if nothing in kst or kft, use other flags possibly passed
+    if (!$opts{kst} && !$opts{kft}) {
+        foreach (qw(b d f i n r)) {
+            $opts{kst} .= $_ if $topts->{$_};
+            $opts{kft} .= $_ if $topts->{$_};
+        }
+
+    # except for b, flags on one apply to the other
+    } else {
+        foreach (qw(d f i n r)) {
+            $opts{kst} .= $_ if ($opts{kst} =~ /$_/ || $opts{kft} =~ /$_/);
+            $opts{kft} .= $_ if ($opts{kst} =~ /$_/ || $opts{kft} =~ /$_/);
+        }
+    }
+
+    return \%opts;
+}
+
+# Build the comparison code for the options in %$topts and install it, via
+# string eval and glob assignment, as package subs:
+#   no key definitions (K true):  sort1 compares $a/$b directly and sort2
+#     compares $fh{$a}/$fh{$b}
+#   key definitions:  map1/map2 turn a record ($_ or $fh{$_}) into
+#     [$_, record, key1, key2, ...] and sortsub compares those arrays key by
+#     key, falling back to a whole-record compare unless u is set
+# The global flags b, d, f or i without a key definition are handled as a
+# key over the whole record (the special key 'K'). Dies if the generated
+# code fails to compile.
+sub _make_sort_sub {
+ my($topts, @sortsub, @mapsub, @sort1, @sort2) = shift;
+
+ # if no keydefs set
+ if ($topts->{K}) {
+ $topts->{kst} = '';
+ foreach (qw(b d f i n r)) {
+ $topts->{kst} .= $_ if $topts->{$_};
+ }
+
+ # more complex stuff, act like we had -k defined
+ if ($topts->{kst} =~ /[bdfi]/) {
+ $topts->{K} = 0;
+ $topts->{k} = ['K']; # special K ;-)
+ }
+ }
+
+ # if no keydefs set
+ if ($topts->{K}) {
+ _debug("No keydef set\n") if $topts->{D};
+
+ # defaults for main sort sub components
+ my($cmp, $aa, $bb, $fa, $fb) = qw(cmp $a $b $fh{$a} $fh{$b});
+
+ # reverse sense
+ ($bb, $aa, $fb, $fa) = ($aa, $bb, $fa, $fb) if $topts->{r};
+
+ # do numeric sort
+ $cmp = '<=>' if $topts->{n};
+
+ # add finished expression to array
+ my $sort1 = "sub { $aa $cmp $bb }\n";
+ my $sort2 = "sub { $fa $cmp $fb }\n";
+
+ _debug("$sort1\n$sort2\n") if $topts->{D};
+
+ {
+ local $^W;
+ *sort1 = eval $sort1;
+ die "Can't create sort sub: $@" if $@;
+ *sort2 = eval $sort2;
+ die "Can't create sort sub: $@" if $@;
+ }
+
+ } else {
+
+ # get text separator or use whitespace
+ # (X is used as a regex as given, t is quoted to match literally)
+ $topts->{t} =
+ defined $topts->{X} ? $topts->{X} :
+ defined $topts->{t} ? quotemeta($topts->{t}) :
+ '\s+';
+ $topts->{t} =~ s|/|\\/|g if defined $topts->{X};
+
+ foreach my $k (@{$topts->{k}}) {
+ my($opts, @fil) = ($topts);
+
+ # defaults for main sort sub components
+ my($cmp, $ab_, $fab_, $aa, $bb) = qw(cmp $_ $fh{$_} $a $b);
+
+ # skip stuff if special K
+ $opts = $k eq 'K' ? $topts : _parse_keydef($k, $topts);
+
+ # @fil collects source text for the expression(s) that extract this key
+ # from the split record @tmp inside the generated map subs
+ if ($k ne 'K') {
+ my($tmp1, $tmp2) = ("\$tmp[$opts->{ksf}]",
+ ($opts->{kff} ? "\$tmp[$opts->{kff}]" : ''));
+
+ # skip leading spaces
+ if ($opts->{kst} =~ /b/) {
+ $tmp1 = "($tmp1 =~ /(\\S.*)/)[0]";
+ }
+
+ if ($opts->{kft} =~ /b/) {
+ $tmp2 = "($tmp2 =~ /(\\S.*)/)[0]";
+ }
+
+ # simpler if one field, goody for us
+ if (! defined $opts->{kff} || $opts->{ksf} == $opts->{kff}) {
+
+ # simpler if chars are both 0, wicked pissah
+ if ($opts->{ksc} == 0 &&
+ (!$opts->{kfc} || $opts->{kfc} == 0)) {
+ @fil = "\$tmp[$opts->{ksf}]";
+
+ # hmmmmm
+ } elsif (!$opts->{kfc}) {
+ @fil = "substr($tmp1, $opts->{ksc})";
+
+ # getting out of hand now
+ } else {
+ @fil = "substr($tmp1, $opts->{ksc}, ".
+ ($opts->{kfc} - $opts->{ksc}) . ')';
+ }
+
+ # try again, shall we?
+ } else {
+
+ # if spans two fields, but chars are both 0
+ # and neither has -b, alrighty
+ if ($opts->{kfc} == 0 && $opts->{ksc} == 0 &&
+ $opts->{kst} !~ /b/ && $opts->{kft} !~ /b/) {
+ @fil = "join(''," .
+ "\@tmp[$opts->{ksf} .. $opts->{kff}])";
+
+ # if only one field away
+ } elsif (($opts->{kff} - $opts->{ksf}) == 1) {
+ @fil = "join('', substr($tmp1, $opts->{ksc}), " .
+ "substr($tmp2, 0, $opts->{kfc}))";
+
+ # fine, have it your way! hurt me! love me!
+ } else {
+ @fil = "join('', substr($tmp1, $opts->{ksc}), " .
+ "\@tmp[" . ($opts->{ksf} + 1) . " .. " .
+ ($opts->{kff} - 1) . "], " .
+ "substr($tmp2, 0, $opts->{kfc}))";
+ }
+ }
+ } else {
+ @fil = $opts->{kst} =~ /b/ ?
+ "(\$tmp[0] =~ /(\\S.*)/)[0]" : "\$tmp[0]";
+ }
+
+ # fold to upper case
+ if ($opts->{kst} =~ /f/) {
+ $fil[0] = "uc($fil[0])";
+ }
+
+ # only alphanumerics and whitespace, override -i
+ if ($opts->{kst} =~ /d/) {
+ $topts->{DD}++;
+ push @fil, "\$tmp =~ s/[^\\w\\s]+//g", '"$tmp"';
+
+ # only printable characters
+ } elsif ($opts->{kst} =~ /i/) {
+ require POSIX;
+ $fil[0] = "join '', grep {POSIX::isprint \$_} " .
+ "split //,\n$fil[0]";
+ }
+
+ $fil[0] = "\$tmp = $fil[0]" if $opts->{kst} =~ /d/;
+
+
+ # reverse sense
+ ($bb, $aa) = ($aa, $bb) if ($opts->{kst} =~ /r/);
+
+ # do numeric sort
+ $cmp = '<=>' if ($opts->{kst} =~ /n/);
+
+ # add finished expressions to arrays
+ # (slots 0 and 1 of the mapped array hold $_ and the record, so the
+ # first key lands in slot 2)
+ my $n = @sortsub + 2;
+ push @sortsub, sprintf "%s->[$n] %s %s->[$n]",
+ $aa, $cmp, $bb;
+
+ if (@fil > 1) {
+ push @mapsub, " (\n" .
+ join(",\n", map {s/^/ /mg; $_} @fil),
+ "\n )[-1],\n ";
+ } else {
+ push @mapsub, " " . $fil[0] . ",\n ";
+ }
+ }
+
+ # if not -u
+ if (! $topts->{u} ) {
+ # do straight compare if all else is equal
+ push @sortsub, sprintf "%s->[1] %s %s->[1]",
+ $topts->{r} ? qw($b cmp $a) : qw($a cmp $b);
+ }
+
+ my(%maps, $sortsub, $mapsub) = (map1 => '$_', map2 => '$fh{$_}');
+
+ $sortsub = "sub {\n " . join(" || \n ", @sortsub) . "\n}\n";
+
+ # build the source of map1 and map2; they differ only in where the record
+ # text comes from ($_ itself or $fh{$_})
+ for my $m (keys %maps) {
+ my $k = $maps{$m};
+ $maps{$m} = sprintf "sub {\n my \@tmp = %s;\n",
+ $topts->{k}[0] eq 'K' ? $k : "split(/$topts->{t}/, $k)";
+
+ $maps{$m} .= " my \$tmp;\n" if $topts->{DD};
+ $maps{$m} .= "\n [\$_, $k";
+ $maps{$m} .= ",\n " . join('', @mapsub) if @mapsub;
+ $maps{$m} .= "]\n}\n";
+ }
+
+ _debug("$sortsub\n$maps{map1}\n$maps{map2}\n") if $topts->{D};
+
+ {
+ local $^W;
+ *sortsub = eval $sortsub;
+ die "Can't create sort sub: $@" if $@;
+ *map1 = eval $maps{map1};
+ die "Can't create sort sub: $@" if $@;
+ *map2 = eval $maps{map2};
+ die "Can't create sort sub: $@" if $@;
+ }
+ }
+}
+
+
+# Return a handle on a new anonymous temporary file (IO::File->new_tmpfile).
+# IO::File is loaded on first use.
+sub _get_temp {
+    require IO::File;
+    my $handle = IO::File->new_tmpfile;
+    return $handle;
+}
+
+# Compare two records with the currently installed sort subs and return the
+# comparison result, so a false (0) result means "keys are equal".
+# The first argument says whether no key definitions are in use: if true the
+# raw records are compared with sort1, otherwise they are first run through
+# map1 and compared with sortsub. Warnings are off during the comparison.
+sub _are_uniq {
+    my $no_keys = shift;
+    local $^W;
+
+    unless ($no_keys) {
+        ($a, $b) = map &map1, @_;
+        return &sortsub;
+    }
+
+    ($a, $b) = @_;
+    return &sort1;
+}
+
+# Write the given debugging message(s) to STDERR.
+sub _debug {
+    my @messages = @_;
+    return print STDERR @messages;
+}
+
+# Die with a usage message taken from this file's own POD: everything between
+# the "=head1 SYNOPSIS" line and the next line starting with "=" is read from
+# the DATA handle, rewound to the start of the file.
+sub usage {
+    local $/ = "\n"; # in case changed
+    my $text;
+
+    seek DATA, 0, 0;
+
+    # skip ahead to the synopsis heading
+    while (<DATA>) {
+        last if m/^=head1 SYNOPSIS$/;
+    }
+
+    # collect lines up to the next POD directive
+    while (<DATA>) {
+        last if m/^=/;
+        $text .= $_;
+    }
+
+    # drop the first newline only
+    $text =~ s/\n//;
+
+    die "Usage:$text";
+
+}
+
+__END__
+
+=head1 NAME
+
+File::Sort - Sort a file or merge sort multiple files
+
+
+=head1 SYNOPSIS
+
+ use File::Sort qw(sort_file);
+ sort_file({
+ I => [qw(file_1 file_2)],
+ o => 'file_new', k => '5.3,5.5rn', t => '|'
+ });
+
+ sort_file('file1', 'file1.sorted');
+
+
+=head1 DESCRIPTION
+
+This module sorts text files by lines (or records). Comparisons
+are based on one or more sort keys extracted from each line of input,
+and are performed lexicographically. By default, if keys are not given,
+sort regards each input line as a single field. The sort is a merge
+sort. If you don't like that, feel free to change it.
+
+
+=head2 Options
+
+The following options are available, and are passed in the hash
+reference passed to the function in the format:
+
+ OPTION => VALUE
+
+Where an option can take multiple values (like C<I>, C<k>, and C<pos>),
+values may be passed via an anonymous array:
+
+ OPTION => [VALUE1, VALUE2]
+
+Where the OPTION is a switch, it should be passed a boolean VALUE
+of 1 or 0.
+
+This interface will always be supported, though a more perlish
+interface may be offered in the future, as well. This interface
+is basically a mapping of the command-line options to the Unix
+sort utility.
+
+
+=over 4
+
+=item C<I> I<INPUT>
+
+Pass in the input file(s). This can be either a single string with the
+filename, or an array reference containing multiple filename strings.
+
+=item C<c>
+
+Check that single input file is ordered as specified by the arguments and
+the collating sequence of the current locale. No output is produced;
+only the exit code is affected.
+
+=item C<m>
+
+Merge only; the input files are assumed to already be sorted.
+
+=item C<o> I<OUTPUT>
+
+Specify the name of an I<OUTPUT> file to be used instead of the standard
+output.
+
+=item C<u>
+
+Unique: Suppresses all but one in each set of lines having equal keys.
+If used with the B<c> option, checks that there are no consecutive
+lines with duplicate keys, in addition to checking that the
+input file is sorted.
+
+=item C<y> I<MAX_SORT_RECORDS>
+
+Maximum number of lines (records) read before writing to temp file.
+Default is 200,000. This may eventually change to be kbytes instead of
+lines. Lines was easier to implement. Can also specify with
+MAX_SORT_RECORDS environment variable.
+
+=item C<F> I<MAX_SORT_FILES>
+
+Maximum number of temp files to be held open at once. Default to 40,
+as older Windows ports had quite a small limit. Can also specify
+with MAX_SORT_FILES environment variable. No temp files will be used
+at all if MAX_SORT_RECORDS is never reached.
+
+=item C<D>
+
+Send debugging information to STDERR. Behavior subject to change.
+
+=back
+
+
+The following options override the default ordering rules. When ordering
+options appear independent of any key field specifications, the requested
+field ordering rules are applied globally to all sort keys. When attached
+to a specific key (see B<k>), the specified ordering options override all
+global ordering options for that key.
+
+
+=over 4
+
+=item C<d>
+
+Specify that only blank characters and alphanumeric characters,
+according to the current locale setting, are significant in comparisons.
+B<d> overrides B<i>.
+
+=item C<f>
+
+Consider all lower-case characters that have upper-case equivalents,
+according to the current locale setting, to be the upper-case equivalent
+for the purposes of comparison.
+
+=item C<i>
+
+Ignores all characters that are non-printable, according to the current
+locale setting.
+
+=item C<n>
+
+Does numeric instead of string compare, using whatever perl considers to
+be a number in numeric comparisons.
+
+=item C<r>
+
+Reverse the sense of the comparisons.
+
+=item C<b>
+
+Ignore leading blank characters when determining the starting and ending
+positions of a restricted sort key. If the B<b> option is specified
+before the first B<k> option, it is applied to all B<k> options.
+Otherwise, the B<b> option can be attached independently to each
+field_start or field_end option argument (see below).
+
+=item C<t> I<STRING>
+
+Use I<STRING> as the field separator character; char is not considered
+to be part of a field (although it can be included in a sort key). Each
+occurrence of char is significant (for example,
+E<lt>charE<gt>E<lt>charE<gt> delimits an empty field). If B<t> is not
+specified, blank characters are used as default field separators; each
+maximal non-empty sequence of blank characters that follows a non-blank
+character is a field separator.
+
+=item C<X> I<STRING>
+
+Same as B<t>, but I<STRING> is interpreted as a Perl regular expression
+instead. Do not escape any characters (C</> characters need to be
+escaped internally, and will be escaped for you).
+
+The string matched by I<STRING> is not included in the fields
+themselves, unless demanded by perl's regex and split semantics (e.g.,
+regexes in parentheses will add that matched expression as an extra
+field). See L<perlre> and L<perlfunc/split>.
+
+=item C<R> I<STRING>
+
+Record separator, defaults to newline.
+
+=item C<k> I<pos1[,pos2]>
+
+The keydef argument is a restricted sort key field definition. The
+format of this definition is:
+
+ field_start[.first_char][type][,field_end[.last_char][type]]
+
+where field_start and field_end define a key field restricted to a
+portion of the line, and type is a modifier from the list of characters
+B<b>, B<d>, B<f>, B<i>, B<n>, B<r>. The b modifier behaves like the
+B<b> option, but applies only to the field_start or field_end to which
+it is attached. The other modifiers behave like the corresponding
+options, but apply only to the key field to which they are attached;
+they have this effect if specified with field_start, field_end, or both.
+If any modifier is attached to a field_start or a field_end, no option
+applies to either.
+
+Occurrences of the B<k> option are significant in command line order.
+If no B<k> option is specified, a default sort key of the entire line
+is used. When there are multiple keys fields, later keys are compared
+only after all earlier keys compare equal.
+
+Except when the B<u> option is specified, lines that otherwise compare
+equal are ordered as if none of the options B<d>, B<f>, B<i>, B<n>
+or B<k> were present (but with B<r> still in effect, if it was
+specified) and with all bytes in the lines significant to the
+comparison. The order in which lines that still compare equal are
+written is unspecified.
+
+
+=item C<pos> I<+pos1 [-pos2]>
+
+Similar to B<k>, these are mostly obsolete switches, but some people
+like them and want to use them. Usage is:
+
+ +field_start[.first_char][type] [-field_end[.last_char][type]]
+
+Where field_end in B<k> specified the last position to be included,
+it specifies the last position to NOT be included. Also, numbers
+are counted from 0 instead of 1. B<pos2> must immediately follow
+corresponding B<+pos1>. The rest should be the same as the B<k> option.
+
+Mixing B<+pos1> B<pos2> with B<k> is allowed, but will result in all of
+the B<+pos1> B<pos2> options being ordered AFTER the B<k> options.
+It is best if you Don't Do That. Pick one and stick with it.
+
+Here are some equivalencies:
+
+ pos => '+1 -2' -> k => '2,2'
+ pos => '+1.1 -1.2' -> k => '2.2,2.2'
+ pos => ['+1 -2', '+3 -5'] -> k => ['2,2', '4,5']
+ pos => ['+2', '+0b -1'] -> k => ['3', '1b,1']
+ pos => '+2.1 -2.4' -> k => '3.2,3.4'
+ pos => '+2.0 -3.0' -> k => '3.1,4.0'
+
+=back
+
+
+=head2 Not Implemented
+
+If the options are not listed as implemented above, or are not
+listed in TODO below, they are not in the plan for implementation.
+This includes B<T> and B<z>.
+
+
+=head1 EXAMPLES
+
+Sort file by straight string compare of each line, sending
+output to STDOUT.
+
+ use File::Sort qw(sort_file);
+ sort_file('file');
+
+Sort contents of file by second key in file.
+
+ sort_file({k => 2, I => 'file'});
+
+Sort, in reverse order, contents of file1 and file2, placing
+output in outfile and using second character of second field
+as the sort key.
+
+ sort_file({
+ r => 1, k => '2.2,2.2', o => 'outfile',
+ I => ['file1', 'file2']
+ });
+
+Same sort but sorting numerically on characters 3 through 5 of
+the fifth field first, and only return records with unique keys.
+
+ sort_file({
+ u => 1, r => 1, k => ['5.3,5.5rn', '2.2,2.2'],
+ o => 'outfile', I => ['file1', 'file2']
+ });
+
+Print passwd(4) file sorted by numeric user ID.
+
+ sort_file({t => ':', k => '3n', I => '/etc/passwd'});
+
+For the anal sysadmin, check that passwd(4) file is sorted by numeric
+user ID.
+
+ sort_file({c => 1, t => ':', k => '3n', I => '/etc/passwd'});
+
+
+=head1 ENVIRONMENT
+
+Note that if you change the locale settings after the program has started
+up, you must call setlocale() for the new settings to take effect. For
+example:
+
+ # get constants
+ use POSIX 'locale_h';
+
+ # e.g., blank out locale
+ $ENV{LC_ALL} = $ENV{LANG} = '';
+
+ # use new ENV settings
+ setlocale(LC_CTYPE, '');
+ setlocale(LC_COLLATE, '');
+
+=over 4
+
+=item LC_COLLATE
+
+Determine the locale for ordering rules.
+
+=item LC_CTYPE
+
+Determine the locale for the interpretation of sequences of bytes of
+text data as characters (for example, single- versus multi-byte
+characters in arguments and input files) and the behaviour of
+character classification for the B<b>, B<d>, B<f>, B<i> and B<n>
+options.
+
+=item MAX_SORT_RECORDS
+
+Default is 200,000. Maximum number of records to use before writing
+to a temp file. Overridden by B<y> option.
+
+=item MAX_SORT_FILES
+
+Maximum number of open temp files to use before merging open temp
+files. Overridden by B<F> option.
+
+=back
+
+
+=head1 EXPORT
+
+Exports C<sort_file> on request.
+
+
+=head1 TODO
+
+=over 4
+
+=item Better debugging and error reporting
+
+=item Performance hit with -u
+
+=item Do bytes instead of lines
+
+=item Better test suite
+
+=item Switch for turning off locale ... ?
+
+=back
+
+
+=head1 HISTORY
+
+=over 4
+
+=item v1.01, Monday, January 14, 2002
+
+Change license to be that of Perl.
+
+=item v1.00, Tuesday, November 13, 2001
+
+Long overdue release.
+
+Add O_TRUNC to output open (D'oh!).
+
+Played with some of the -k options (Marco A. Romero).
+
+Fix filehandle close test of STDOUT (Gael Marziou).
+
+Some cleanup.
+
+=item v0.91, Saturday, February 12, 2000
+
+Closed all files in test.pl so they could be unlinked on some
+platforms. (Hubert Toullec)
+
+Documented C<I> option. (Hubert Toullec)
+
+Removed O_EXCL flag from C<sort_file>.
+
+Fixed bug in sorting multiple files. (Paul Eckert)
+
+
+=item v0.90, Friday, April 30, 1999
+
+Complete rewrite. Took the code from this module to write sort
+utility for PPT project, then brought changes back over. As a result
+the interface has changed slightly, mostly in regard to what letters
+are used for options, but there are also some key behavioral differences.
+If you need the old interface, the old module will remain on CPAN, but
+will not be supported. Sorry for any inconvenience this may cause.
+The good news is that it should not be too difficult to update your
+code to use the new interface.
+
+
+=item v0.20
+
+Fixed bug with unique option (didn't work :).
+
+Switched to sysopen for better portability.
+
+Print to STDOUT if no output file supplied.
+
+Added c option to check sorting.
+
+
+=item v0.18 (31 January 1999)
+
+Tests 3 and 4 failed because we hit the open file limit in the
+standard Windows port of perl5.004_02 (50). Adjusted the default
+for total number of temp files from 50 to 40 (leave room for other open
+files), changed docs. (Mike Blazer, Gurusamy Sarathy)
+
+=item v0.17 (30 December 1998)
+
+Fixed bug in C<_merge_files> that tried to C<open> a passed
+C<IO::File> object.
+
+Fixed up docs and did some more tests and benchmarks.
+
+=item v0.16 (24 December 1998)
+
+One year between releases was too long. I made changes Miko O'Sullivan
+wanted, and I didn't even know I had made them.
+
+Also now use C<IO::File> to create temp files, so the TMPDIR option is
+no longer supported. Hopefully made the whole thing more robust and
+faster, while supporting more options for sorting, including delimited
+sorts, and arbitrary sorts.
+
+Made CHUNK default a lot larger, which improves performance. On
+low-memory systems, or where (e.g.) the MacPerl binary is not allocated
+much RAM, it might need to be lowered.
+
+
+=item v0.11 (04 January 1998)
+
+More cleanup; fixed special case of no linebreak on last line; wrote test
+suite; fixed warning for redefined subs (sort1 and sort2).
+
+=item v0.10 (03 January 1998)
+
+Some cleanup; made it not subject to system file limitations; separated
+many parts out into separate functions.
+
+=item v0.03 (23 December 1997)
+
+Added reverse and numeric sorting options.
+
+=item v0.02 (19 December 1997)
+
+Added unique and merge-only options.
+
+=item v0.01 (18 December 1997)
+
+First release.
+
+=back
+
+
+=head1 THANKS
+
+Mike Blazer E<lt>blazer at mail.nevalink.ruE<gt>,
+Vicki Brown E<lt>vlb at cfcl.comE<gt>,
+Tom Christiansen E<lt>tchrist at perl.comE<gt>,
+Albert Dvornik E<lt>bert at mit.eduE<gt>,
+Paul Eckert E<lt>peckert at epicrealm.comE<gt>,
+Gene Hsu E<lt>gene at moreinfo.comE<gt>,
+Andrew M. Langmead E<lt>aml at world.std.comE<gt>,
+Gael Marziou E<lt>gael_marziou at hp.comE<gt>,
+Brian L. Matthews E<lt>blm at halcyon.comE<gt>,
+Rich Morin E<lt>rdm at cfcl.comE<gt>,
+Matthias Neeracher E<lt>neeri at iis.ee.ethz.chE<gt>,
+Miko O'Sullivan E<lt>miko at idocs.comE<gt>,
+Tom Phoenix E<lt>rootbeer at teleport.comE<gt>,
+Marco A. Romero E<lt>mromero at iglou.comE<gt>,
+Gurusamy Sarathy E<lt>gsar at activestate.comE<gt>,
+Hubert Toullec E<lt>Hubert.Toullec at wanadoo.frE<gt>.
+
+
+=head1 AUTHOR
+
+Chris Nandor E<lt>pudge at pobox.comE<gt>, http://pudge.net/
+
+Copyright (c) 1997-2002 Chris Nandor. All rights reserved. This program
+is free software; you can redistribute it and/or modify it under the same
+terms as Perl itself.
+
+
+=head1 VERSION
+
+v1.01, Monday, January 14, 2002
+
+
+=head1 SEE ALSO
+
+sort(1), locale, PPT project, <URL:http://sf.net/projects/ppt/>.
+
+=cut
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index efc5b4b..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-crossbow (1.1.2-1) unstable; urgency=low
-
- * Initial release
-
- -- Andreas Tille <tille at debian.org> Sun, 27 May 2012 09:24:36 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 4ea52da..0000000
--- a/debian/control
+++ /dev/null
@@ -1,23 +0,0 @@
-Source: crossbow
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: science
-Priority: optional
-Build-Depends: debhelper (>= 10)
-Standards-Version: 3.9.8
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/crossbow/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/crossbow/trunk/
-Homepage: http://bowtie-bio.sourceforge.net/crossbow
-
-Package: crossbow
-Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends}
-Description: Genotyping from short reads using cloud computing
- Crossbow is a scalable software pipeline for whole genome resequencing
- analysis. It combines Bowtie, an ultrafast and memory efficient short read
- aligner, and SoapSNP, an accurate genotyper, within Hadoop to distribute and
- accelerate the computation with many nodes. The pipeline can accurately analyze
- over 35x coverage of a human genome in one day on a 10-node local cluster, or
- in 3 hours for about $100 using a 40-node, 320-core cluster rented from
- Amazon's EC2 utility computing service.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 532654e..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,10 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Source: https://github.com/BenLangmead/crossbow/releases
-
-Files: *
-Copyright: © 2010-2011 <upstream>
-License: Artistic
-
-Files: debian/*
-Copyright: © 2012 Andreas Tille <tille at debian.org>
-License: Artistic
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 6055261..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/make -f
-
-# DH_VERBOSE := 1
-
-%:
- dh $@
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
deleted file mode 100644
index f09c6b3..0000000
--- a/debian/upstream/metadata
+++ /dev/null
@@ -1,11 +0,0 @@
-Reference:
- Author: Ben Langmead and Michael C Schatz and Jimmy Lin and Mihai Pop and Steven L Salzberg
- Title: Searching for SNPs with cloud computing
- Journal: Genome Biology
- Year: 2009
- Volume: 10
- Pages: R134
- DOI: 10.1186/gb-2009-10-11-r134
- PMID: 19930550
- URL: http://genomebiology.com/2009/10/11/R134
- eprint: http://genomebiology.com/content/pdf/gb-2009-10-11-r134.pdf
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index cece8fb..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-version=4
-
-https://github.com/BenLangmead/crossbow/releases .*/archive/v(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
diff --git a/doc/images/AWS_cb_e_coli_fillin.png b/doc/images/AWS_cb_e_coli_fillin.png
new file mode 100644
index 0000000..9561a40
Binary files /dev/null and b/doc/images/AWS_cb_e_coli_fillin.png differ
diff --git a/doc/images/AWS_cb_mouse17_fillin.png b/doc/images/AWS_cb_mouse17_fillin.png
new file mode 100644
index 0000000..c011d9c
Binary files /dev/null and b/doc/images/AWS_cb_mouse17_fillin.png differ
diff --git a/doc/images/AWS_console.png b/doc/images/AWS_console.png
new file mode 100644
index 0000000..13d928a
Binary files /dev/null and b/doc/images/AWS_console.png differ
diff --git a/doc/images/AWS_console_debug.png b/doc/images/AWS_console_debug.png
new file mode 100644
index 0000000..781b4b0
Binary files /dev/null and b/doc/images/AWS_console_debug.png differ
diff --git a/doc/images/AWS_console_upper_left.png b/doc/images/AWS_console_upper_left.png
new file mode 100644
index 0000000..ad1a6ce
Binary files /dev/null and b/doc/images/AWS_console_upper_left.png differ
diff --git a/doc/images/AWS_create_new.png b/doc/images/AWS_create_new.png
new file mode 100644
index 0000000..b580dd5
Binary files /dev/null and b/doc/images/AWS_create_new.png differ
diff --git a/doc/manual.html b/doc/manual.html
new file mode 100644
index 0000000..24f2b0f
--- /dev/null
+++ b/doc/manual.html
@@ -0,0 +1,3832 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+ <title>Crossbow 1.2.0 Manual - Crossbow: Parallel short read genotyping in the cloud</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta name="generator" content="pandoc" />
+ <meta name="author" content="Ben Langmead and Michael C. Schatz" />
+ <meta name="date" content="http://bowtie-bio.sf.net/crossbow" />
+ <link rel="stylesheet" href="style.css" type="text/css" />
+</head>
+<body>
+<h1>Table of Contents</h1>
+<h1 class="title">Crossbow: Parallel short read genotyping in the cloud</h1>
+<div id="TOC"
+><ul
+ ><li
+ ><a href="#what-is-crossbow"
+ >What is Crossbow?</a
+ ></li
+ ><li
+ ><a href="#a-word-of-caution"
+ >A word of caution</a
+ ></li
+ ><li
+ ><a href="#crossbow-modes-and-prerequisites"
+ >Crossbow modes and prerequisites</a
+ ></li
+ ><li
+ ><a href="#preparing-to-run-on-amazon-elastic-mapreduce"
+ >Preparing to run on Amazon Elastic MapReduce</a
+ ><ul
+ ><li
+ ><a href="#installing-amazons-elastic-mapreduce-tool"
+ >Installing Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ ></li
+ ><li
+ ><a href="#s3-tools"
+ >S3 tools</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#installing-crossbow"
+ >Installing Crossbow</a
+ ><ul
+ ><li
+ ><a href="#the-sra-toolkit"
+ >The SRA toolkit</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow"
+ >Running Crossbow</a
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-emr-via-the-emr-web-interface"
+ >Running Crossbow on EMR via the EMR web interface</a
+ ><ul
+ ><li
+ ><a href="#prerequisites"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run"
+ >To run</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-emr-via-the-command-line"
+ >Running Crossbow on EMR via the command line</a
+ ><ul
+ ><li
+ ><a href="#prerequisites-1"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run-1"
+ >To run</a
+ ></li
+ ><li
+ ><a href="#emr-specific-options"
+ >EMR-specific options</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-a-hadoop-cluster-via-the-command-line"
+ >Running Crossbow on a Hadoop cluster via the command line</a
+ ><ul
+ ><li
+ ><a href="#prerequisites-2"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run-2"
+ >To run</a
+ ></li
+ ><li
+ ><a href="#hadoop-specific-options"
+ >Hadoop-specific options</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-a-single-computer-via-the-command-line"
+ >Running Crossbow on a single computer via the command line</a
+ ><ul
+ ><li
+ ><a href="#prerequisites-3"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run-3"
+ >To run</a
+ ></li
+ ><li
+ ><a href="#local-run-specific-options"
+ >Local-run-specific options</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#general-crossbow-options"
+ >General Crossbow options</a
+ ></li
+ ><li
+ ><a href="#crossbow-examples"
+ >Crossbow examples</a
+ ><ul
+ ><li
+ ><a href="#e.-coli-small"
+ >E. coli (small)</a
+ ><ul
+ ><li
+ ><a href="#emr"
+ >EMR</a
+ ><ul
+ ><li
+ ><a href="#via-web-interface"
+ >Via web interface</a
+ ></li
+ ><li
+ ><a href="#via-command-line"
+ >Via command line</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#hadoop"
+ >Hadoop</a
+ ></li
+ ><li
+ ><a href="#single-computer"
+ >Single computer</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#mouse-chromosome-17-large"
+ >Mouse chromosome 17 (large)</a
+ ><ul
+ ><li
+ ><a href="#emr-1"
+ >EMR</a
+ ><ul
+ ><li
+ ><a href="#via-web-interface-1"
+ >Via web interface</a
+ ></li
+ ><li
+ ><a href="#via-command-line-1"
+ >Via command line</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#hadoop-1"
+ >Hadoop</a
+ ></li
+ ><li
+ ><a href="#single-computer-1"
+ >Single computer</a
+ ></li
+ ></ul
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#manifest-files"
+ >Manifest files</a
+ ></li
+ ><li
+ ><a href="#reference-jars"
+ >Reference jars</a
+ ><ul
+ ><li
+ ><a href="#building-a-reference-jar-using-automatic-scripts"
+ >Building a reference jar using automatic scripts</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#monitoring-debugging-and-logging"
+ >Monitoring, debugging and logging</a
+ ><ul
+ ><li
+ ><a href="#single-computer-2"
+ >Single computer</a
+ ></li
+ ><li
+ ><a href="#hadoop-2"
+ >Hadoop</a
+ ></li
+ ><li
+ ><a href="#emr-2"
+ >EMR</a
+ ></li
+ ><li
+ ><a href="#aws-management-console"
+ >AWS Management Console</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#crossbow-output"
+ >Crossbow Output</a
+ ></li
+ ><li
+ ><a href="#other-reading"
+ >Other reading</a
+ ></li
+ ><li
+ ><a href="#acknowledgements"
+ >Acknowledgements</a
+ ></li
+ ></ul
+ ></div
+>
+<h1 id="what-is-crossbow"
+><a href="#TOC"
+ >What is Crossbow?</a
+ ></h1
+><p
+><a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > is a scalable, portable, and automatic Cloud Computing tool for finding SNPs from short read data. Crossbow employs <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > and a modified version of <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > to perform the short read alignment and SNP calling respectively. Crossbow is designed to be easy to run (a) in "the cloud" (in this case, Amazon's <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > service), (b) on any <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster, or (c) on any single computer, without <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ >. Crossbow exploits the availability of multiple computers and processors where possible.</p
+><h1 id="a-word-of-caution"
+><a href="#TOC"
+ >A word of caution</a
+ ></h1
+><p
+>Renting resources from <a href="http://aws.amazon.com"
+ >Amazon Web Services</a
+ > (AKA <a href="http://aws.amazon.com/" title="Amazon Web Services"
+ >AWS</a
+ >), costs money, regardless of whether your experiment ultimately succeeds or fails. In some cases, Crossbow or its documentation may be partially to blame for a failed experiment. While we are happy to accept bug reports, we do not accept responsibility for financial damage caused by these errors. Crossbow is provided "as is" with no warranty. See <code
+ >LICENSE</code
+ > file.</p
+><h1 id="crossbow-modes-and-prerequisites"
+><a href="#TOC"
+ >Crossbow modes and prerequisites</a
+ ></h1
+><p
+>Crossbow can be run in four different ways.</p
+><ol style="list-style-type: decimal;"
+><li
+ ><strong
+ >Via the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ ></strong
+ ></li
+ ></ol
+><p
+>In this case, the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > code and the user interface are installed on EC2 web servers. Also, the computers running the Crossbow computation are rented from Amazon, and the user must have <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts and must pay the <a href="http://aws.amazon.com/ec2/#pricing"
+ >going rate</a
+ > for the resources used. The user does not need any special software besides a web browser and, in most cases, an <a href="#s3-tools"
+ >S3 tool</a
+ >.</p
+><ol start="2" style="list-style-type: decimal;"
+><li
+ ><strong
+ >On Amazon <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > via the command-line</strong
+ ></li
+ ></ol
+><p
+>In this case, the Crossbow code is hosted by Amazon and the computers running the Crossbow computation are rented from Amazon. However, the user must install and run (a) the Crossbow scripts, which require <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > 5.6 or later, (b) Amazon's <a href="http://aws.amazon.com/developertools/2264?_encoding=UTF8&amp;jiveRedirect=1"
+ ><code
+ >elastic-mapreduce</code
+ ></a
+ > script, which requires Ruby 1.8 or later, and (c) an <a href="#s3-tools"
+ >S3 tool</a
+ >. The user must have <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts and must pay the <a href="http://aws.amazon.com/ec2/#pricing"
+ >going rate</a
+ > for the resources used.</p
+><ol start="3" style="list-style-type: decimal;"
+><li
+ ><strong
+ >On a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster via the command-line</strong
+ ></li
+ ></ol
+><p
+>In this case, the Crossbow code is hosted on your <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster, as are supporting tools: <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >, <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >, and possibly [<code
+ >fastq-dump</code
+ >]. Supporting tools must be installed on all cluster nodes, but the Crossbow scripts need only be installed on the master. Crossbow was tested with <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > versions 0.20 and 0.20.205, and might also be compatible with other versions newer than 0.20. Crossbow scripts require <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > 5.6 or later.</p
+><ol start="4" style="list-style-type: decimal;"
+><li
+ ><strong
+ >On any computer via the command-line</strong
+ ></li
+ ></ol
+><p
+>In this case, the Crossbow code and all supporting tools (<a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >, <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >, and possibly [<code
+ >fastq-dump</code
+ >]) must be installed on the computer running Crossbow. Crossbow scripts require <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > 5.6 or later. The user specifies the maximum number of CPUs that Crossbow should use at a time. This mode does <em
+ >not</em
+ > require <a href="http://java.sun.com/"
+ >Java</a
+ > or <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ >.</p
+><h1 id="preparing-to-run-on-amazon-elastic-mapreduce"
+><a href="#TOC"
+ >Preparing to run on Amazon Elastic MapReduce</a
+ ></h1
+><p
+>Before running Crossbow on <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, you must have an <a href="http://aws.amazon.com/" title="Amazon Web Services"
+ >AWS</a
+ > account with the appropriate features enabled. You may also need to <a href="#installing-amazons-elastic-mapreduce-tool"
+ >install Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ >. In addition, you may want to install an <a href="#s3-tools"
+ >S3 tool</a
+ >, though most users can simply use <a href="https://console.aws.amazon.com/s3/home"
+ >Amazon's web interface for S3</a
+ >, which requires no installation.</p
+><p
+>If you plan to run Crossbow exclusively on a single computer or on a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster, you can skip this section.</p
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Create an AWS account by navigating to the <a href="http://aws.amazon.com/" title="Amazon Web Services"
+ >AWS page</a
+ >. Click "Sign Up Now" in the upper right-hand corner and follow the instructions. You will be asked to accept the <a href="http://aws.amazon.com/agreement/"
+ >AWS Customer Agreement</a
+ >.</p
+ ></li
+ ><li
+ ><p
+ >Sign up for <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > and <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >. Navigate to the <a href="http://aws.amazon.com/ec2"
+ >Amazon EC2</a
+ > page, click on "Sign Up For Amazon EC2" and follow the instructions. This step requires you to enter credit card information. Once this is complete, your AWS account will be permitted to use <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > and <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, which are required.</p
+ ></li
+ ><li
+ ><p
+ >Sign up for <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >. Navigate to the <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > page, click on "Sign up for Elastic MapReduce" and follow the instructions. Once this is complete, your AWS account will be permitted to use <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, which is required.</p
+ ></li
+ ><li
+ ><p
+ >Sign up for <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ >. With <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > enabled, you have the option of using the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >'s <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html"
+ >Job Flow Debugging</a
+ > feature. This is a convenient way to monitor your job's progress and diagnose errors.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >Optional</em
+ >: Request an increase to your instance limit. By default, Amazon allows you to allocate EC2 clusters with up to 20 instances (virtual computers). To be permitted to work with more instances, fill in the form on the <a href="http://aws.amazon.com/contact-us/ec2-request/"
+ >Request to Increase</a
+ > page. You may have to speak to an Amazon representative and/or wait several business days before your request is granted.</p
+ ></li
+ ></ol
+><p
+>To see a list of AWS services you've already signed up for, see your <a href="http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&amp;action=activity-summary"
+ >Account Activity</a
+ > page. If "Amazon Elastic Compute Cloud", "Amazon Simple Storage Service", "Amazon Elastic MapReduce" and "Amazon SimpleDB" all appear there, you are ready to proceed.</p
+><p
+>Be sure to make a note of the various numbers and names associated with your accounts, especially your Access Key ID, Secret Access Key, and your EC2 key pair name. You will have to refer to these and other account details in the future.</p
+><h2 id="installing-amazons-elastic-mapreduce-tool"
+><a href="#TOC"
+ >Installing Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ ></h2
+><p
+>Read this section if you plan to run Crossbow on <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > via the command-line tool. Skip this section if you are not using <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > or if you plan to run exclusively via the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ >.</p
+><p
+>To install Amazon's <code
+ >elastic-mapreduce</code
+ > tool, follow the instructions in the Amazon Elastic MapReduce developer's guide for <a href="http://aws.amazon.com/developertools/2264?_encoding=UTF8&amp;jiveRedirect=1"
+ >How to Download and Install Ruby and the Command Line Interface</a
+ >. That document describes:</p
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Installing an appropriate version of <a href="http://www.ruby-lang.org/"
+ >Ruby</a
+ >, if necessary.</p
+ ></li
+ ><li
+ ><p
+ >Setting up an EC2 keypair, if necessary.</p
+ ></li
+ ><li
+ ><p
+ >Setting up a credentials file, which is used by the <code
+ >elastic-mapreduce</code
+ > tool for authentication.</p
+ ></li
+ ></ol
+><p
+>For convenience, we suggest you name the credentials file <code
+ >credentials.json</code
+ > and place it in the same directory with the <code
+ >elastic-mapreduce</code
+ > script. Otherwise you will have to specify the credential file path with the <a href="#cb-emr-credentials"
+ ><code
+ >--credentials</code
+ ></a
+ > option each time you run <code
+ >cb_emr</code
+ >.</p
+><p
+>We strongly recommend using a version of the <code
+ >elastic-mapreduce</code
+ > Ruby script released on or after December 8, 2011. This is when the script switched to using Hadoop v0.20.205 by default, which is the preferred way of running Crossbow.</p
+><p
+>We also recommend that you add the directory containing the <code
+ >elastic-mapreduce</code
+ > tool to your <code
+ >PATH</code
+ >. This allows Crossbow to locate it automatically. Alternately, you can specify the path to the <code
+ >elastic-mapreduce</code
+ > tool via the <a href="#cb-emr-script"
+ ><code
+ >--emr-script</code
+ ></a
+ > option when running <code
+ >cb_emr</code
+ >.</p
+><h2 id="s3-tools"
+><a href="#TOC"
+ >S3 tools</a
+ ></h2
+><p
+>Running on <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > requires exchanging files via the cloud-based <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > filesystem. <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > is organized as a collection of <a href="http://docs.amazonwebservices.com/AmazonS3/latest/gsg/"
+ >S3 buckets</a
+ > in a global namespace. <a href="http://aws.amazon.com/s3/#pricing"
+ >S3 charges</a
+ > are incurred when transferring data to and from <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > (but transfers between <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > and <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > are free), and a per-GB-per-month charge applies when data is stored in <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > over time.</p
+><p
+>To transfer files to and from <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, use an S3 tool. Amazon's <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > has an <a href="https://console.aws.amazon.com/s3/home"
+ >S3 tab</a
+ > that provides a friendly web-based interface to <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, and doesn't require any software installation. <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > is a very good command-line tool that requires <a href="http://www.python.org/download/"
+ >Python</a
+ > 2.4 or later. <a href="http://www.s3fox.net/"
+ >S3Fox Organizer</a
+ > is another GUI tool that works as a <a href="http://www.mozilla.com/firefox/"
+ >Firefox</a
+ > extension. Other tools include <a href="http://cyberduck.ch/"
+ >Cyberduck</a
+ > (for Mac OS 10.6 or later) and <a href="http://www.bucketexplorer.com/"
+ >Bucket Explorer</a
+ > (for Mac, Windows or Linux, but commercial software).</p
+><h1 id="installing-crossbow"
+><a href="#TOC"
+ >Installing Crossbow</a
+ ></h1
+><p
+>Crossbow consists of a set of <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > and shell scripts, plus supporting tools: <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > and <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > . If you plan to run Crossbow via the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ > exclusively, there is nothing to install. Otherwise:</p
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Download the desired version of Crossbow from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >sourceforge site</a
+ ></p
+ ></li
+ ><li
+ ><p
+ ><a href="http://en.wikipedia.org/wiki/ZIP_(file_format)"
+ >Extract the zip archive</a
+ ></p
+ ></li
+ ><li
+ ><p
+ >Set the <code
+ >CROSSBOW_HOME</code
+ > environment variable to point to the extracted directory (containing <code
+ >cb_emr</code
+ >)</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you plan to run on a local computer or <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster</em
+ >:</p
+ ><p
+ >If using Linux or Mac OS 10.6 or later, you likely don't have to install <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > or <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >, as Crossbow comes with compatible versions of both pre-installed. Test this by running:</p
+ ><pre
+ ><code
+ >$CROSSBOW_HOME/cb_local --test
+</code
+ ></pre
+ ><p
+ >If the install test passes, installation is complete.</p
+ ><p
+ >If the install test indicates <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > is not installed, obtain or build a <code
+ >bowtie</code
+ > binary v0.12.8 or higher and install it by setting the <code
+ >CROSSBOW_BOWTIE_HOME</code
+ > environment variable to <code
+ >bowtie</code
+ >'s enclosing directory. Alternately, add the enclosing directory to your <code
+ >PATH</code
+ > or specify the full path to <code
+ >bowtie</code
+ > via the <code
+ >--bowtie</code
+ > option when running Crossbow scripts.</p
+ ><p
+ >If the install test indicates that <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > is not installed, build the <code
+ >soapsnp</code
+ > binary using the sources and makefile in <code
+ >CROSSBOW_HOME/soapsnp</code
+ >. You must have compiler tools such as GNU <code
+ >make</code
+ > and <code
+ >g++</code
+ > installed for this to work. If you are using a Mac, you may need to install the <a href="http://developer.apple.com/technologies/tools/"
+ >Apple developer tools</a
+ >. To build the <code
+ >soapsnp</code
+ > binary, run:</p
+ ><pre
+ ><code
+ >make -C $CROSSBOW_HOME/soapsnp
+</code
+ ></pre
+ ><p
+ >Now install <code
+ >soapsnp</code
+ > by setting the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable to <code
+ >soapsnp</code
+ >'s enclosing directory. Alternately, add the enclosing directory to your <code
+ >PATH</code
+ > or specify the full path to <code
+ >soapsnp</code
+ > via the <code
+ >--soapsnp</code
+ > option when running Crossbow scripts.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you plan to run on a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster</em
+ >, you may need to manually copy the <code
+ >bowtie</code
+ > and <code
+ >soapsnp</code
+ > executables, and possibly also the <code
+ >fastq-dump</code
+ > executable, to the same path on each of your <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster nodes. You can avoid this step by installing <code
+ >bowtie</code
+ >, <code
+ >soapsnp</code
+ > and <code
+ >fastq-dump</code
+ > on a filesystem shared by all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > nodes (e.g. an <a href="http://en.wikipedia.org/wiki/Network_File_System_(protocol)"
+ >NFS share</a
+ >). You can also skip this step if <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > is installed in <a href="http://hadoop.apache.org/common/docs/current/quickstart.html#PseudoDistributed"
+ >pseudo distributed</a
+ > mode, meaning that the cluster really consists of one node whose CPUs are treated as distinct slaves.</p
+ ></li
+ ></ol
+><h2 id="the-sra-toolkit"
+><a href="#TOC"
+ >The SRA toolkit</a
+ ></h2
+><p
+>The <a href="http://www.ncbi.nlm.nih.gov/books/NBK47533/"
+ >Sequence Read Archive</a
+ > (SRA) is a resource at the <a href="http://www.ncbi.nlm.nih.gov/"
+ >National Center for Biotechnology Information</a
+ > (NCBI) for storing sequence data from modern sequencing instruments. Sequence data underlying many studies, including very large studies, can often be downloaded from this archive.</p
+><p
+>The SRA uses a special file format to store archived read data. These files end in extensions <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ >, and they can be specified as inputs to Crossbow's preprocessing step in exactly the same way as <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > files.</p
+><p
+>However, if you plan to use <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ > files as input to Crossbow in either <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > mode or in single-computer mode, you must first install the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software"
+ >SRA toolkit</a
+ >'s <code
+ >fastq-dump</code
+ > tool appropriately. See the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software"
+ >SRA toolkit</a
+ > page for details about how to download and install.</p
+><p
+>When searching for the <code
+ >fastq-dump</code
+ > tool at runtime, Crossbow searches the following places in order:</p
+><ol style="list-style-type: decimal;"
+><li
+ >The path specified in the <a href="#cb-local-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ > option</li
+ ><li
+ >The directory specified in the <code
+ >$CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable.</li
+ ><li
+ >In the system <code
+ >PATH</code
+ ></li
+ ></ol
+><h1 id="running-crossbow"
+><a href="#TOC"
+ >Running Crossbow</a
+ ></h1
+><p
+>The commands for invoking Crossbow from the command line are:</p
+><p
+><code
+ >$CROSSBOW_HOME/cb_emr</code
+ > (or just <code
+ >cb_emr</code
+ > if <code
+ >$CROSSBOW_HOME</code
+ > is in the <code
+ >PATH</code
+ >) for running on <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >. See <a href="#running-crossbow-on-emr-via-the-command-line"
+ >Running Crossbow on EMR via the command line</a
+ > for details.</p
+><p
+><code
+ >$CROSSBOW_HOME/cb_hadoop</code
+ > (or just <code
+ >cb_hadoop</code
+ > if <code
+ >$CROSSBOW_HOME</code
+ > is in the <code
+ >PATH</code
+ >) for running on <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ >. See <a href="#running-crossbow-on-a-hadoop-cluster-via-the-command-line"
+ >Running Crossbow on a Hadoop cluster via the command line</a
+ > for details.</p
+><p
+><code
+ >$CROSSBOW_HOME/cb_local</code
+ > (or just <code
+ >cb_local</code
+ > if <code
+ >$CROSSBOW_HOME</code
+ > is in the <code
+ >PATH</code
+ >) for running locally on a single computer. See <a href="#running-crossbow-on-a-single-computer-via-the-command-line"
+ >Running Crossbow on a single computer via the command line</a
+ > for details.</p
+><h1 id="running-crossbow-on-emr-via-the-emr-web-interface"
+><a href="#TOC"
+ >Running Crossbow on EMR via the EMR web interface</a
+ ></h1
+><h2 id="prerequisites"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ >Web browser</li
+ ><li
+ ><a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts. To check which ones you've already enabled, visit your <a href="http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary"
+ >Account Activity</a
+ > page.</li
+ ><li
+ >A tool for browsing and exchanging files with <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ ><ol style="list-style-type: lower-alpha;"
+ ><li
+ >The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >'s <a href="https://console.aws.amazon.com/s3/home"
+ >S3 tab</a
+ > is a good web-based tool that does not require software installation</li
+ ><li
+ >A good command line tool is <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ ></li
+ ><li
+ >A good GUI tool is <a href="http://www.s3fox.net/"
+ >S3Fox Organizer</a
+ >, which is a Firefox Plugin</li
+ ><li
+ >Others include <a href="http://cyberduck.ch/"
+ >Cyberduck</a
+ >, <a href="http://www.bucketexplorer.com/"
+ >Bucket Explorer</a
+ ></li
+ ></ol
+ ></li
+ ><li
+ >Basic knowledge regarding:<ol style="list-style-type: lower-alpha;"
+ ><li
+ ><a href="http://aws.amazon.com/s3/"
+ >What S3 is</a
+ >, <a href="http://docs.amazonwebservices.com/AmazonS3/latest/gsg/"
+ >what an S3 bucket is</a
+ >, how to create one, how to upload a file to an S3 bucket from your computer (see your S3 tool's documentation).</li
+ ><li
+ >How much AWS resources <a href="http://aws.amazon.com/ec2/#pricing"
+ >will cost you</a
+ ></li
+ ></ol
+ ></li
+ ></ol
+><h2 id="to-run"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ ><em
+ >If the input reads have not yet been preprocessed by Crossbow</em
+ > (i.e. input is <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > or <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ >), then first (a) prepare a <a href="#manifest-files"
+ >manifest file</a
+ > with URLs pointing to the read files, and (b) upload it to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket that you own. See your <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > tool's documentation for how to create a bucket and upload a file to it. The URL for the <a href="#manifest-files"
+ >manifest file</a
+ > will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ><p
+ ><em
+ >If the input reads have already been preprocessed by Crossbow</em
+ >, make a note of the <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where they're located. This will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you are using a pre-built reference jar</em
+ >, make a note of its <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL. This will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job. See the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow website</a
+ > for a list of pre-built reference jars and their URLs.</p
+ ><p
+ ><em
+ >If you are not using a pre-built reference jar</em
+ >, you may need to <a href="#reference-jars"
+ >build the reference jars</a
+ > and/or upload them to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket you own. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation for how to create a bucket and upload to it. The URL for the main reference jar will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ >In a web browser, go to the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ >.</p
+ ></li
+ ><li
+ ><p
+ >Fill in the form according to your job's parameters. We recommend filling in and validating the "AWS ID" and "AWS Secret Key" fields first. Also, when entering S3 URLs (e.g. "Input URL" and "Output URL"), we recommend that users validate the entered URLs by clicking the link below it. This avoids failed jobs due to simple URL issues (e.g. non-existence of the "Input URL"). For examples of how to fill in this form, see the <a href="#cb-example-e-coli-emr"
+ >E. coli EMR</a
+ > and <a href="#cb-example-mouse17-emr"
+ >Mouse chromosome 17 EMR</a
+ > examples.</p
+ ></li
+ ></ol
+><h1 id="running-crossbow-on-emr-via-the-command-line"
+><a href="#TOC"
+ >Running Crossbow on EMR via the command line</a
+ ></h1
+><h2 id="prerequisites-1"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts. To check which ones you've already enabled, visit your <a href="http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary"
+ >Account Activity</a
+ > page.</li
+ ><li
+ >A tool for browsing and exchanging files with <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ ><ol style="list-style-type: lower-alpha;"
+ ><li
+ >The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >'s <a href="https://console.aws.amazon.com/s3/home"
+ >S3 tab</a
+ > is a good web-based tool that does not require software installation</li
+ ><li
+ >A good command line tool is <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ ></li
+ ><li
+ >A good GUI tool is <a href="http://www.s3fox.net/"
+ >S3Fox Organizer</a
+ >, which is a Firefox Plugin</li
+ ><li
+ >Others include <a href="http://cyberduck.ch/"
+ >Cyberduck</a
+ >, <a href="http://www.bucketexplorer.com/"
+ >Bucket Explorer</a
+ ></li
+ ></ol
+ ></li
+ ><li
+ >Basic knowledge regarding:<ol style="list-style-type: lower-alpha;"
+ ><li
+ ><a href="http://aws.amazon.com/s3/"
+ >What S3 is</a
+ >, <a href="http://docs.amazonwebservices.com/AmazonS3/latest/gsg/"
+ >what an S3 bucket is</a
+ >, how to create one, how to upload a file to an S3 bucket from your computer (see your S3 tool's documentation).</li
+ ><li
+ >How much AWS resources <a href="http://aws.amazon.com/ec2/#pricing"
+ >will cost you</a
+ ></li
+ ></ol
+ ></li
+ ></ol
+><h2 id="to-run-1"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ ><em
+ >If the input reads have not yet been preprocessed by Crossbow</em
+ > (i.e. input is <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > or <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ >), then first (a) prepare a <a href="#manifest-files"
+ >manifest file</a
+ > with URLs pointing to the read files, and (b) upload it to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket that you own. See your <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > tool's documentation for how to create a bucket and upload a file to it. The URL for the <a href="#manifest-files"
+ >manifest file</a
+ > will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ><p
+ ><em
+ >If the input reads have already been preprocessed by Crossbow</em
+ >, make a note of the <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where they're located. This will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you are using a pre-built reference jar</em
+ >, make a note of its <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL. This will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job. See the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow website</a
+ > for a list of pre-built reference jars and their URLs.</p
+ ><p
+ ><em
+ >If you are not using a pre-built reference jar</em
+ >, you may need to <a href="#reference-jars"
+ >build the reference jars</a
+ > and/or upload them to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket you own. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation for how to create a bucket and upload to it. The URL for the main reference jar will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ >Run <code
+ >$CROSSBOW_HOME/cb_emr</code
+ > with the desired options. Options that are unique to <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > jobs are described in the following section. Options that apply to all running modes are described in the <a href="#general-crossbow-options"
+ >General Crossbow options</a
+ > section. For examples of how to run <code
+ >$CROSSBOW_HOME/cb_emr</code
+ > see the <a href="#cb-example-e-coli-emr"
+ >E. coli EMR</a
+ > and <a href="#cb-example-mouse17-emr"
+ >Mouse chromosome 17 EMR</a
+ > examples.</p
+ ></li
+ ></ol
+><h2 id="emr-specific-options"
+><a href="#TOC"
+ >EMR-specific options</a
+ ></h2
+><table>
+
+<tr><td id="cb-emr-reference">
+
+
+<pre
+><code
+ >--reference <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the reference jar is located. URLs for pre-built reference jars for some commonly studied species (including human and mouse) are available from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow web site</a
+ >. Note that a <a href="http://bowtie-bio.sf.net/myrna"
+ >Myrna</a
+ > reference jar is not the same as a <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > reference jar. If your desired genome and/or SNP annotations are not available in pre-built form, you will have to make your own reference jar and upload it to one of your own S3 buckets (see <a href="#reference-jars"
+ >Reference jars</a
+ >). This option must be specified.</p
+></td></tr><tr><td id="cb-emr-input">
+
+
+<pre
+><code
+ >--input <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the input is located. If <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > or <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > are specified, <code
+ ><URL></code
+ > should point to a <a href="#manifest-files"
+ >manifest file</a
+ >. Otherwise, <code
+ ><URL></code
+ > should point to a directory containing preprocessed reads. This option must be specified.</p
+></td></tr><tr><td id="cb-emr-output">
+
+
+<pre
+><code
+ >--output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the output is to be deposited. If <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > is specified, the output consists of the preprocessed reads. Otherwise, the output consists of the SNP calls calculated by <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > for each chromosome in the <a href="#cb-output"
+ >Crossbow output format</a
+ >, organized as one file per chromosome. This option must be specified.</p
+></td></tr><tr><td id="cb-emr-intermediate">
+
+
+<pre
+><code
+ >--intermediate <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where all intermediate results should be deposited. This can be useful if you later want to resume the computation from partway through the pipeline (e.g. after alignment but before SNP calling). By default, intermediate results are stored in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > and disappear once the cluster is terminated.</p
+></td></tr><tr><td id="cb-emr-preprocess-output">
+
+
+<pre
+><code
+ >--preprocess-output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the preprocessed reads should be stored. This can be useful if you later want to run Crossbow on the same input reads without having to re-run the preprocessing step (i.e. leaving <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > unspecified).</p
+></td></tr><tr><td id="cb-emr-credentials">
+
+
+<pre
+><code
+ >--credentials <id>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the credentials file set up by the user when the <a href="http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1"
+ ><code
+ >elastic-mapreduce</code
+ ></a
+ > script was installed (see <a href="#installing-amazons-elastic-mapreduce-tool"
+ >Installing Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ >). Default: use <code
+ >elastic-mapreduce</code
+ >'s default (i.e. the <code
+ >credentials.json</code
+ > file in the same directory as the <code
+ >elastic-mapreduce</code
+ > script). If <code
+ >--credentials</code
+ > is not specified and the default <code
+ >credentials.json</code
+ > file doesn't exist, <code
+ >elastic-mapreduce</code
+ > will abort with an error message.</p
+></td></tr><tr><td id="cb-emr-script">
+
+
+<pre
+><code
+ >--emr-script <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the <code
+ >elastic-mapreduce</code
+ > script. By default, Crossbow looks first in the <code
+ >$CROSSBOW_EMR_HOME</code
+ > directory, then in the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-emr-name">
+
+
+<pre
+><code
+ >--name <string>
+</code
+ ></pre
+></td><td>
+<p
+>Specify the name by which the job will be identified in the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >.</p
+></td></tr><tr><td id="cb-emr-stay-alive">
+
+
+<pre
+><code
+ >--stay-alive
+</code
+ ></pre
+></td><td>
+<p
+>By default, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > will terminate the cluster as soon as (a) one of the stages fails, or (b) the job completes successfully. Specify this option to force <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > to keep the cluster alive in either case.</p
+></td></tr><tr><td id="cb-emr-instances">
+
+
+<pre
+><code
+ >--instances <int>
+</code
+ ></pre
+></td><td>
+<p
+>Specify the number of instances (i.e. virtual computers, also called nodes) to be allocated to your cluster. If set to 1, the single instance will function as both <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > master and slave node. If set greater than 1, one instance will function as a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > master and the rest will function as <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > slaves. In general, the greater the value of <code
+ ><int></code
+ >, the faster the Crossbow computation will complete. Consider the desired speed as well as the <a href="http://aws.amazon.com/ec2/#pricing"
+ >going rate</a
+ > when choosing a value for <code
+ ><int></code
+ >. Default: 1.</p
+></td></tr><tr><td id="cb-emr-instance-type">
+
+
+<pre
+><code
+ >--instance-type <type>
+</code
+ ></pre
+></td><td>
+<p
+>Specify the type of <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > instance to use for the computation. See Amazon's <a href="http://aws.amazon.com/ec2/instance-types/"
+ >list of available instance types</a
+ > and be sure to specify the "API name" of the desired type (e.g. <code
+ >m1.small</code
+ > or <code
+ >c1.xlarge</code
+ >). <strong
+ >The default of <code
+ >c1.xlarge</code
+ > is strongly recommended</strong
+ > because it has an appropriate mix of computing power and memory for a large breadth of problems. Choosing an instance type with less than 5GB of physical RAM can cause problems when the reference is large (e.g. a mammalian genome). Stick to the default unless you're pretty sure the specified instance type can handle your problem size.</p
+></td></tr><tr><td id="cb-emr-args">
+
+
+<pre
+><code
+ >--emr-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified extra arguments to the <code
+ >elastic-mapreduce</code
+ > script. See documentation for the <code
+ >elastic-mapreduce</code
+ > script for details.</p
+></td></tr><tr><td id="cb-logs">
+
+
+<pre
+><code
+ >--logs <URL>
+</code
+ ></pre
+></td><td>
+<p
+>Causes <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > to copy the log files to <code
+ ><URL></code
+ >. Default: <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > writes logs to the <code
+ >logs</code
+ > subdirectory of the <a href="#cb-emr-output"
+ ><code
+ >--output</code
+ ></a
+ > URL. See also <a href="#cb-no-logs"
+ ><code
+ >--no-logs</code
+ ></a
+ >.</p
+></td></tr><tr><td id="cb-no-logs">
+
+
+<pre
+><code
+ >--no-logs
+</code
+ ></pre
+></td><td>
+<p
+>By default, Crossbow causes <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > to copy all cluster log files to the <code
+ >log</code
+ > subdirectory of the <a href="#cb-emr-output"
+ ><code
+ >--output</code
+ ></a
+ > URL (or another destination, if <a href="#cb-logs"
+ ><code
+ >--logs</code
+ ></a
+ > is specified). Specifying this option disables all copying of logs.</p
+></td></tr><tr><td id="cb-no-emr-debug">
+
+
+<pre
+><code
+ >--no-emr-debug
+</code
+ ></pre
+></td><td>
+<p
+>Disables <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html"
+ >Job Flow Debugging</a
+ >. If this is <em
+ >not</em
+ > specified, you must have a <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > account for <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html"
+ >Job Flow Debugging</a
+ > to work. You will be subject to additional <a href="http://aws.amazon.com/simpledb/#pricing"
+ >SimpleDB-related charges</a
+ > if this option is enabled, but those fees are typically small or zero (depending on your account's <a href="http://aws.amazon.com/simpledb/#pricing"
+ >SimpleDB tier</a
+ >).</p
+></td></tr>
+</table>
+<h1 id="running-crossbow-on-a-hadoop-cluster-via-the-command-line"
+><a href="#TOC"
+ >Running Crossbow on a Hadoop cluster via the command line</a
+ ></h1
+><h2 id="prerequisites-2"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Working installation of <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > v0.20.2 or v0.20.205. Other versions newer than 0.20 might also work, but haven't been tested.</p
+ ></li
+ ><li
+ ><p
+ >A <code
+ >bowtie</code
+ > v0.12.8 executable must exist at the same path on all cluster nodes (including the master). That path must be specified via the <a href="#cb-hadoop-bowtie"
+ ><code
+ >--bowtie</code
+ ></a
+ > option OR located in the directory specified in the <code
+ >CROSSBOW_BOWTIE_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (Crossbow looks in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built Bowtie binaries for Linux and Mac OS X 10.5 or later. An executable from that directory is used automatically unless the platform is not Mac or Linux or unless overridden by <a href="#cb-hadoop-bowtie"
+ ><code
+ >--bowtie</code
+ ></a
+ > or by defining <code
+ >CROSSBOW_BOWTIE_HOME</code
+ >.</p
+ ></li
+ ><li
+ ><p
+ >A Crossbow-customized version of <code
+ >soapsnp</code
+ > v1.02 must be installed at the same path on all cluster nodes (including the master). That path must be specified via the <a href="#cb-hadoop-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > option OR located in the directory specified in the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (Crossbow searches in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built SOAPsnp binaries for Linux and Mac OS X 10.6 or later. An executable from that directory is used automatically unless the platform is not Mac or Linux or unless overridden by <a href="#cb-hadoop-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > or by defining <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ >.</p
+ ></li
+ ><li
+ ><p
+ >If any of your inputs are in <a href="http://www.ncbi.nlm.nih.gov/books/NBK47533/"
+ >Sequence Read Archive</a
+ > format (i.e. end in <code
+ >.sra</code
+ >), then the <code
+ >fastq-dump</code
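+ > tool from the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software">SRA Toolkit</a> must be installed at the same path on all cluster nodes. The path to the <code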
+ >fastq-dump</code
+ > tool must be specified via the (<a href="#cb-hadoop-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ >) option OR <code
+ >fastq-dump</code
+ > must be located in the directory specified in the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, OR <code
+ >fastq-dump</code
+ > must be found in the <code
+ >PATH</code
+ > (Crossbow searches in that order).</p
+ ></li
+ ><li
+ ><p
+ >Sufficient memory must be available on all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > slave nodes to hold the Bowtie index for the desired organism in addition to any other loads placed on those nodes by <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > or other programs. For mammalian genomes such as the human genome, this typically means that slave nodes must have at least 5-6 GB of RAM.</p
+ ></li
+ ></ol
+><h2 id="to-run-2"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><p
+>Run <code
+ >$CROSSBOW_HOME/cb_hadoop</code
+ > with the desired options. Options that are unique to <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > jobs are described in the following subsection. Options that apply to all running modes are described in the <a href="#general-crossbow-options"
+ >General Crossbow options</a
+ > subsection. To see example invocations of <code
+ >$CROSSBOW_HOME/cb_hadoop</code
+ > see the <a href="#cb-example-e-coli-hadoop"
+ >E. coli Hadoop</a
+ > and <a href="#cb-example-mouse17-hadoop"
+ >Mouse chromosome 17 Hadoop</a
+ > examples.</p
+><h2 id="hadoop-specific-options"
+><a href="#TOC"
+ >Hadoop-specific options</a
+ ></h2
+><table>
+
+<tr><td id="cb-hadoop-reference">
+
+
+<pre
+><code
+ >--reference <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the reference jar is located. Pre-built reference jars for some commonly studied species (including human and mouse) are available from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow web site</a
+ >; these can be downloaded and installed in HDFS using <code
+ >hadoop dfs</code
+ > commands. If your desired genome and/or SNP annotations are not available in pre-built form, you will have to make your own reference jars, install them in HDFS, and specify their HDFS path here. This option must be specified.</p
+></td></tr><tr><td id="cb-hadoop-input">
+
+
+<pre
+><code
+ >--input <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the input is located. If <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > or <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > are specified, <code
+ ><URL></code
+ > should point to a manifest file. Otherwise, <code
+ ><URL></code
+ > should point to a directory containing preprocessed reads. This option must be specified.</p
+></td></tr><tr><td id="cb-hadoop-output">
+
+
+<pre
+><code
+ >--output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the output is to be deposited. If <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > is specified, the output consists of the preprocessed reads. Otherwise, the output consists of the SNP calls calculated by SOAPsnp for each chromosome, organized as one file per chromosome. This option must be specified.</p
+></td></tr><tr><td id="cb-hadoop-intermediate">
+
+
+<pre
+><code
+ >--intermediate <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where all intermediate results should be deposited. Default: <code
+ >hdfs:///crossbow/intermediate/<PID></code
+ >.</p
+></td></tr><tr><td id="cb-hadoop-preprocess-output">
+
+
+<pre
+><code
+ >--preprocess-output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the preprocessed reads should be stored. This can be useful if you later want to run Crossbow on the same input reads without having to re-run the preprocessing step (i.e. leaving <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > unspecified).</p
+></td></tr><tr><td id="cb-hadoop-bowtie">
+
+
+<pre
+><code
+ >--bowtie <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > binary Crossbow should use. <code
+ >bowtie</code
+ > must be installed in this same directory on all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > worker nodes. By default, Crossbow searches the <code
+ >PATH</code
+ > and in the directory pointed to by the <code
+ >CROSSBOW_HOME</code
+ > environment variable.</p
+></td></tr><tr><td id="cb-hadoop-fastq-dump">
+
+
+<pre
+><code
+ >--fastq-dump <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the directory containing <code
+ >fastq-dump</code
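+ >, which is part of the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software">SRA Toolkit</a>. This overrides all other ways that Crossbow searches for <code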
+ >fastq-dump</code
+ >, including the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-hadoop-soapsnp">
+
+
+<pre
+><code
+ >--soapsnp <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the SOAPsnp executable to use when running the Call SNPs step. <code
+ >soapsnp</code
+ > must be installed in this same directory on all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > worker nodes. This overrides all other ways that Crossbow searches for <code
+ >soapsnp</code
+ >, including the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr>
+</table>
+<h1 id="running-crossbow-on-a-single-computer-via-the-command-line"
+><a href="#TOC"
+ >Running Crossbow on a single computer via the command line</a
+ ></h1
+><h2 id="prerequisites-3"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >A <code
+ >bowtie</code
+ > v0.12.8 executable must exist on the local computer. The path to <code
+ >bowtie</code
+ > must be specified via the <a href="#cb-local-bowtie"
+ ><code
+ >--bowtie</code
+ ></a
+ > option OR be located in the directory specified in the <code
+ >$CROSSBOW_BOWTIE_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (search proceeds in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built Bowtie binaries for Linux and Mac OS X 10.6 or later, so most Mac and Linux users do not need to install either tool.</p
+ ></li
+ ><li
+ ><p
+ >A Crossbow-customized version of <code
+ >soapsnp</code
+ > v1.02 must exist. The path to <code
+ >soapsnp</code
+ > must be specified via the <a href="#cb-local-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > option OR be in the directory specified in the <code
+ >$CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (Crossbow searches in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built SOAPsnp binaries for Linux and Mac OS X 10.6 or later. An executable from that directory is used automatically unless the platform is not Mac or Linux or unless overridden by <a href="#cb-local-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > or <code
+ >$CROSSBOW_SOAPSNP_HOME</code
+ >.</p
+ ></li
+ ><li
+ ><p
+ >If any of your inputs are in <a href="http://www.ncbi.nlm.nih.gov/books/NBK47533/"
+ >Sequence Read Archive</a
+ > format (i.e. end in <code
+ >.sra</code
+ >), then the <code
+ >fastq-dump</code
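+ > tool from the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software">SRA Toolkit</a> must be installed on the local computer. The path to the <code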
+ >fastq-dump</code
+ > tool must be specified via the (<a href="#cb-local-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ >) option OR <code
+ >fastq-dump</code
+ > must be located in the directory specified in the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, OR <code
+ >fastq-dump</code
+ > must be found in the <code
+ >PATH</code
+ > (Crossbow searches in that order).</p
+ ></li
+ ><li
+ ><p
+ >Sufficient memory must be available on the local computer to hold one copy of the Bowtie index for the desired organism <em
+ >in addition</em
+ > to all other running workloads. For mammalian genomes such as the human genome, this typically means that the local computer must have at least 5-6 GB of RAM.</p
+ ></li
+ ></ol
+><h2 id="to-run-3"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><p
+>Run <code
+ >$CROSSBOW_HOME/cb_local</code
+ > with the desired options. Options unique to local jobs are described in the following subsection. Options that apply to all running modes are described in the <a href="#general-crossbow-options"
+ >General Crossbow options</a
+ > subsection. To see example invocations of <code
+ >$CROSSBOW_HOME/cb_local</code
+ > see the <a href="#cb-example-e-coli-local"
+ >E. coli local</a
+ > and <a href="#cb-example-mouse17-local"
+ >Mouse chromosome 17 local</a
+ > examples.</p
+><h2 id="local-run-specific-options"
+><a href="#TOC"
+ >Local-run-specific options</a
+ ></h2
+><table>
+
+<tr><td id="cb-local-reference">
+
+
+<pre
+><code
+ >--reference <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where expanded reference jar is located. Specified path should have an <code
+ >index</code
+ > subdirectory with a set of Bowtie index files, a <code
+ >sequences</code
+ > subdirectory with a set of FASTA files, a <code
+ >snps</code
+ > subdirectory with 0 or more per-chromosome SNP description files, and a <code
+ >cmap.txt</code
+ > file. Pre-built reference jars for some commonly studied species (including human and mouse) are available from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow web site</a
+ >; these can be downloaded and expanded into a directory with the appropriate structure using an <a href="http://en.wikipedia.org/wiki/Unzip"
+ ><code
+ >unzip</code
+ ></a
+ > utility. If your desired genome and/or SNP annotations are not available in pre-built form, you will have to make your own reference jars and specify the appropriate path. This option must be specified.</p
+></td></tr><tr><td id="cb-local-input">
+
+
+<pre
+><code
+ >--input <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where the input is located. If <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > or <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > are specified, this should point to a <a href="#manifest-files"
+ >manifest file</a
+ >. Otherwise, this should point to a directory containing preprocessed reads. This option must be specified.</p
+></td></tr><tr><td id="cb-local-output">
+
+
+<pre
+><code
+ >--output <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where the output is to be deposited. If <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > is specified, the output consists of the preprocessed reads. Otherwise, the output consists of the SNP calls calculated by SOAPsnp for each chromosome, organized as one file per chromosome. This option must be specified.</p
+></td></tr><tr><td id="cb-local-intermediate">
+
+
+<pre
+><code
+ >--intermediate <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where all intermediate results should be kept temporarily (or permanently, if <a href="#cb-local-keep-intermediates"
+ ><code
+ >--keep-intermediates</code
+ ></a
+ > or <a href="#cb-local-keep-all"
+ ><code
+ >--keep-all</code
+ ></a
+ > are specified). Default: <code
+ >/tmp/crossbow/intermediate/<PID></code
+ >.</p
+></td></tr><tr><td id="cb-local-preprocess-output">
+
+
+<pre
+><code
+ >--preprocess-output <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where the preprocessed reads should be stored. This can be useful if you later want to run Crossbow on the same input reads without having to re-run the preprocessing step (i.e. leaving <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > unspecified).</p
+></td></tr><tr><td id="cb-local-keep-intermediates">
+
+
+<pre
+><code
+ >--keep-intermediates
+</code
+ ></pre
+></td><td>
+<p
+>Keep intermediate directories and files, i.e. the output from all stages prior to the final stage. By default these files are deleted as soon as possible.</p
+></td></tr><tr><td id="cb-local-keep-all">
+
+
+<pre
+><code
+ >--keep-all
+</code
+ ></pre
+></td><td>
+<p
+>Keep all temporary files generated during the process of binning and sorting data records and moving them from stage to stage, as well as all intermediate results. By default these files are deleted as soon as possible.</p
+></td></tr><tr><td id="cb-local-cpus">
+
+
+<pre
+><code
+ >--cpus <int>
+</code
+ ></pre
+></td><td>
+<p
+>The maximum number of processors to use at any given time during the job. Crossbow will try to make maximal use of the processors allocated. Default: 1.</p
+></td></tr><tr><td id="cb-local-max-sort-records">
+
+
+<pre
+><code
+ >--max-sort-records <int>
+</code
+ ></pre
+></td><td>
+<p
+>Maximum number of records to be dispatched to the sort routine at one time when sorting bins before each reduce step. For each child process, this number is effectively divided by the number of CPUs used (<a href="#cb-local-cpus"
+ ><code
+ >--cpus</code
+ ></a
+ >). The default is 200000.</p
+></td></tr><tr><td id="cb-local-max-sort-files">
+
+
+<pre
+><code
+ >--max-sort-files <int>
+</code
+ ></pre
+></td><td>
+<p
+>Maximum number of files that can be opened at once by the sort routine when sorting bins before each reduce step. For each child process, this number is effectively divided by the number of CPUs used (<a href="#cb-local-cpus"
+ ><code
+ >--cpus</code
+ ></a
+ >). The default is 40.</p
+></td></tr><tr><td id="cb-local-bowtie">
+
+
+<pre
+><code
+ >--bowtie <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the Bowtie executable to use when running the Align step. This overrides all other ways that Crossbow searches for <code
+ >bowtie</code
+ >, including the <code
+ >CROSSBOW_BOWTIE_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-local-fastq-dump">
+
+
+<pre
+><code
+ >--fastq-dump <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the directory containing the programs in the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software"
+ >SRA toolkit</a
+ >, including <code
+ >fastq-dump</code
+ >. This overrides all other ways that Crossbow searches for <code
+ >fastq-dump</code
+ >, including the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-local-soapsnp">
+
+
+<pre
+><code
+ >--soapsnp <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the SOAPsnp executable to use when running the Call SNPs step. This overrides all other ways that Crossbow searches for <code
+ >soapsnp</code
+ >, including the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr>
+
+</table>
+<h1 id="general-crossbow-options"
+><a href="#TOC"
+ >General Crossbow options</a
+ ></h1
+><p
+>The following options can be specified regardless of what mode (<a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > or local) Crossbow is run in.</p
+><table>
+
+<tr><td id="cb-quality">
+
+
+<pre
+><code
+ >--quality { phred33 | phred64 | solexa64 }
+</code
+ ></pre
+></td><td>
+<p
+>Treat all input reads as having the specified quality encoding. <code
+ >phred33</code
+ > denotes the <a href="http://en.wikipedia.org/wiki/FASTQ_format#Encoding"
+ >Phred+33</a
+ > or "Sanger" format whereby ASCII values 33-126 are used to encode qualities on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score"
+ >Phred scale</a
+ >. <code
+ >phred64</code
+ > denotes the <a href="http://en.wikipedia.org/wiki/FASTQ_format#Encoding"
+ >Phred+64</a
+ > or "Illumina 1.3+" format whereby ASCII values 64-126 are used to encode qualities on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score"
+ >Phred scale</a
+ >. <code
+ >solexa64</code
+ > denotes the <a href="http://en.wikipedia.org/wiki/FASTQ_format#Encoding"
+ >Solexa+64</a
+ > or "Solexa/Illumina 1.0" format whereby ASCII values 59-126 are used to encode qualities on a <a href="http://en.wikipedia.org/wiki/FASTQ_format#Variations"
+ >log-odds scale</a
+ > that includes values as low as -5. Default: <code
+ >phred33</code
+ >.</p
+></td></tr><tr><td id="cb-preprocess">
+
+
+<pre
+><code
+ >--preprocess
+</code
+ ></pre
+></td><td>
+<p
+>The input path or URL refers to a <a href="#manifest-files"
+ >manifest file</a
+ > rather than a directory of preprocessed reads. The first step in the Crossbow computation will be to preprocess the reads listed in the <a href="#manifest-files"
+ >manifest file</a
+ > and store the preprocessed reads in the intermediate directory or in the <code
+ >--preprocess-output</code
+ > directory if it's specified. Default: off.</p
+></td></tr><tr><td id="cb-just-preprocess">
+
+
+<pre
+><code
+ >--just-preprocess
+</code
+ ></pre
+></td><td>
+<p
+>The input path or URL refers to a <a href="#manifest-files"
+ >manifest file</a
+ > rather than a directory of preprocessed reads. Crossbow will preprocess the reads listed in the <a href="#manifest-files"
+ >manifest file</a
+ > and store the preprocessed reads in the <code
+ >--output</code
+ > directory and quit. Default: off.</p
+></td></tr><tr><td id="cb-just-align">
+
+
+<pre
+><code
+ >--just-align
+</code
+ ></pre
+></td><td>
+<p
+>Instead of running the Crossbow pipeline all the way through to the end, run the pipeline up to and including the align stage and store the results in the <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > URL. To resume the run later, use <a href="#cb-resume-align"
+ ><code
+ >--resume-align</code
+ ></a
+ >.</p
+></td></tr><tr><td id="cb-resume-align">
+
+
+<pre
+><code
+ >--resume-align
+</code
+ ></pre
+></td><td>
+<p
+>Resume the Crossbow pipeline from just after the alignment stage. The <a href="#cb-local-input"
+ ><code
+ >--input</code
+ ></a
+ > URL must point to an <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > URL from a previous run using <a href="#cb-just-align"
+ ><code
+ >--just-align</code
+ ></a
+ >.</p
+></td></tr><tr><td id="cb-bowtie-args">
+
+
+<pre
+><code
+ >--bowtie-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > for the Align stage. Default: <a href="http://bowtie-bio.sf.net/manual.shtml#bowtie-options-M"
+ ><code
+ >-M 1</code
+ ></a
+ >. See the <a href="http://bowtie-bio.sf.net/manual.shtml"
+ >Bowtie manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-discard-reads">
+
+
+<pre
+><code
+ >--discard-reads <fraction>
+</code
+ ></pre
+></td><td>
+<p
+>Randomly discard a fraction of the input reads. E.g. specify <code
+ >0.5</code
+ > to discard 50%. This applies to all input reads regardless of type (paired vs. unpaired) or length. This can be useful for debugging. Default: 0.0.</p
+></td></tr><tr><td id="cb-discard-ref-bins">
+
+
+<pre
+><code
+ >--discard-ref-bins <fraction>
+</code
+ ></pre
+></td><td>
+<p
+>Randomly discard a fraction of the reference bins prior to SNP calling. E.g. specify <code
+ >0.5</code
+ > to discard 50% of the reference bins. This can be useful for debugging. Default: 0.0.</p
+></td></tr><tr><td id="cb-discard-all">
+
+
+<pre
+><code
+ >--discard-all <fraction>
+</code
+ ></pre
+></td><td>
+<p
+>Equivalent to setting <a href="#cb-discard-reads"
+ ><code
+ >--discard-reads</code
+ ></a
+ > and <a href="#cb-discard-ref-bins"
+ ><code
+ >--discard-ref-bins</code
+ ></a
+ > to <code
+ ><fraction></code
+ >. Default: 0.0.</p
+></td></tr><tr><td id="cb-soapsnp-args">
+
+
+<pre
+><code
+ >--soapsnp-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > in the SNP calling stage. These options are passed to SOAPsnp regardless of whether the reference sequence under consideration is diploid or haploid. Default: <code
+ >-2 -u -n -q</code
+ >. See the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-soapsnp-hap-args">
+
+
+<pre
+><code
+ >--soapsnp-hap-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > in the SNP calling stage when the reference sequence under consideration is haploid. Default: <code
+ >-r 0.0001</code
+ >. See the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-soapsnp-dip-args">
+
+
+<pre
+><code
+ >--soapsnp-dip-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > in the SNP calling stage when the reference sequence under consideration is diploid. Default: <code
+ >-r 0.00005 -e 0.0001</code
+ >. See the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-haploids">
+
+
+<pre
+><code
+ >--haploids <chromosome-list>
+</code
+ ></pre
+></td><td>
+<p
+>The specified comma-separated list of chromosome names are to be treated as haploid by SOAPsnp. The rest are treated as diploid. Default: all chromosomes are treated as diploid.</p
+></td></tr><tr><td id="cb-all-haploids">
+
+
+<pre
+><code
+ >--all-haploids
+</code
+ ></pre
+></td><td>
+<p
+>If specified, all chromosomes are treated as haploid by SOAPsnp.</p
+></td></tr><tr><td id="cb-partition-len">
+
+
+<pre
+><code
+ >--partition-len <int>
+</code
+ ></pre
+></td><td>
+<p
+>The bin size to use when binning alignments into partitions prior to SNP calling. If load imbalance occurs in the SNP calling step (some tasks taking far longer than others), try decreasing this. Default: 1,000,000.</p
+></td></tr><tr><td id="cb-dry-run">
+
+
+<pre
+><code
+ >--dry-run
+</code
+ ></pre
+></td><td>
+<p
+>Just generate a script containing the commands needed to launch the job, but don't run it. The script's location will be printed so that you may run it later.</p
+></td></tr>
+
+<tr><td id="cb-test">
+
+
+<pre
+><code
+ >--test
+</code
+ ></pre
+></td><td>
+<p
+>Instead of running Crossbow, just search for the supporting tools (<a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > and <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >) and report whether and how they were found. If running in Cloud Mode, this just tests whether the <code
+ >elastic-mapreduce</code
+ > script is locatable and runnable. Use this option to debug your local Crossbow installation.</p
+></td></tr><tr><td id="cb-tempdir">
+
+
+<pre
+><code
+ >--tempdir <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local directory where temporary files (e.g. dynamically generated scripts) should be deposited. Default: <code
+ >/tmp/Crossbow/invoke.scripts</code
+ >.</p
+></td></tr>
+</table>
+<h1 id="crossbow-examples"
+><a href="#TOC"
+ >Crossbow examples</a
+ ></h1
+><p
+>The following subsections guide you step-by-step through examples included with the Crossbow package. Because reads (and sometimes reference jars) must be obtained over the Internet, running these examples requires an active Internet connection.</p
+><h2 id="e.-coli-small"
+><a href="#TOC"
+ >E. coli (small)</a
+ ></h2
+><p
+>Data for this example is taken from the study by <a href="http://www.pnas.org/content/early/2009/11/19/0906681106.abstract"
+ >Parkhomchuk et al</a
+ >.</p
+><h3 id="emr"
+><a href="#TOC"
+ >EMR</a
+ ></h3
+><div id="cb-example-e-coli-emr" />
+<h4 id="via-web-interface"
+><a href="#TOC"
+ >Via web interface</a
+ ></h4
+><p
+>Identify an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket to hold the job's input and output. You may need to create an <a href="http://docs.amazonwebservices.com/AmazonS3/latest/index.html?UsingBucket.html"
+ >S3 bucket</a
+ > for this purpose. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation.</p
+><p
+>Use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/e_coli/small.manifest</code
+ > to the <code
+ >example/e_coli</code
+ > subdirectory in your bucket. You can do so with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+</code
+ ></pre
+><p
+>Direct your web browser to the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ > and fill in the form as below (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><div><img src="images/AWS_cb_e_coli_fillin.png" alt="" /><p><i>Crossbow web form filled in for the small E. coli example.</i></p>
+</div>
+<ol style="list-style-type: decimal;"
+><li
+ >For <strong
+ >AWS ID</strong
+ >, enter your AWS Access Key ID</li
+ ><li
+ >For <strong
+ >AWS Secret Key</strong
+ >, enter your AWS Secret Access Key</li
+ ><li
+ ><em
+ >Optional</em
+ >: For <strong
+ >AWS Keypair name</strong
+ >, enter the name of your AWS keypair. This is only necessary if you would like to be able to <a href="http://en.wikipedia.org/wiki/Secure_Shell"
+ >ssh</a
+ > into the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > cluster while it runs.</li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the AWS ID and Secret Key entered are valid by clicking the "Check credentials..." link</li
+ ><li
+ >For <strong
+ >Job name</strong
+ >, enter <code
+ >Crossbow-Ecoli</code
+ ></li
+ ><li
+ >Make sure that <strong
+ >Job type</strong
+ > is set to "Crossbow"</li
+ ><li
+ >For <strong
+ >Input URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/small.manifest</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Input URL exists by clicking the "Check that input URL exists..." link</li
+ ><li
+ >For <strong
+ >Output URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/output_small</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Output URL does not exist by clicking the "Check that output URL doesn't exist..." link</li
+ ><li
+ >For <strong
+ >Input type</strong
+ >, select "Manifest file"</li
+ ><li
+ >For <strong
+ >Genome/Annotation</strong
+ >, select "E. coli" from the drop-down menu</li
+ ><li
+ >For <strong
+ >Chromosome ploidy</strong
+ >, select "All are haploid"</li
+ ><li
+ >Click Submit</li
+ ></ol
+><p
+>This job typically takes about 30 minutes on 1 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > node. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/output_small</code
+ > directory.</p
+><h4 id="via-command-line"
+><a href="#TOC"
+ >Via command line</a
+ ></h4
+><p
+>Test your Crossbow installation by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_emr --test
+</code
+ ></pre
+><p
+>This will warn you if any supporting tools (<code
+ >elastic-mapreduce</code
+ > in this case) cannot be located or run.</p
+><p
+>Identify an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket to hold the job's input and output. You may need to create an <a href="http://docs.amazonwebservices.com/AmazonS3/latest/index.html?UsingBucket.html"
+ >S3 bucket</a
+ > for this purpose. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation.</p
+><p
+>Use your <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/e_coli/small.manifest</code
+ > to the <code
+ >example/e_coli</code
+ > subdirectory in your bucket. You can do so with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+</code
+ ></pre
+><p
+>Start the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job with the following command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Ecoli" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/e_coli/small.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/e_coli/output_small \
+ --reference=s3n://crossbow-refs/e_coli.jar \
+ --all-haploids
+</code
+ ></pre
+><p
+>The <code
+ >--reference</code
+ > option instructs Crossbow to use a pre-built reference jar at URL <code
+ >s3n://crossbow-refs/e_coli.jar</code
+ >. The <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > option instructs Crossbow to treat the input as a <a href="#manifest-files"
+ >manifest file</a
+ >, rather than a directory of already-preprocessed reads. As the first stage of the pipeline, Crossbow downloads files specified in the manifest file and preprocesses them into Crossbow's read format. <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > specifies where the final output is placed.</p
+><p
+>This job typically takes about 30 minutes on 1 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > node. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/output_small</code
+ > directory.</p
+><h3 id="hadoop"
+><a href="#TOC"
+ >Hadoop</a
+ ></h3
+><div id="cb-example-e-coli-hadoop" />
+<p
+>Log into the <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > master node and test your Crossbow installation by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_hadoop --test
+</code
+ ></pre
+><p
+>This will tell you if any of the supporting tools or packages are missing on the master. <em
+ >You must also ensure</em
+ > that the same tools are installed in the same paths on all slave nodes, and are runnable by the slaves.</p
+><p
+>From the master, download the file named <code
+ >e_coli.jar</code
+ > from the following URL:</p
+><pre
+><code
+ >http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this command:</p
+><pre
+><code
+ >wget http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>Equivalently, you can use an <a href="#s3-tools"
+ >S3 tool</a
+ > to download the same file from this URL:</p
+><pre
+><code
+ >s3n://crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd get s3://crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>Install <code
+ >e_coli.jar</code
+ > in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > (the <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > distributed filesystem) with the following commands. If the <code
+ >hadoop</code
+ > script is not in your <code
+ >PATH</code
+ >, either add it to your <code
+ >PATH</code
+ > (recommended) or specify the full path to the <code
+ >hadoop</code
+ > script in the following commands.</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow-refs
+hadoop dfs -put e_coli.jar /crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>The first creates a directory in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > (you will see a warning message if the directory already exists) and the second copies the local jar files into that directory. In this example, we deposit the jars in the <code
+ >/crossbow-refs</code
+ > directory, but any <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > directory is fine.</p
+><p
+>Remove the local <code
+ >e_coli.jar</code
+ > file to save space. E.g.:</p
+><pre
+><code
+ >rm -f e_coli.jar
+</code
+ ></pre
+><p
+>Next install the <a href="#manifest-files"
+ >manifest file</a
+ > in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ >:</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow/example/e_coli
+hadoop dfs -put $CROSSBOW_HOME/example/e_coli/small.manifest /crossbow/example/e_coli/small.manifest
+</code
+ ></pre
+><p
+>Now start the job by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/e_coli/small.manifest \
+ --output=hdfs:///crossbow/example/e_coli/output_small \
+ --reference=hdfs:///crossbow-refs/e_coli.jar \
+ --all-haploids
+</code
+ ></pre
+><p
+>The <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > option instructs Crossbow to treat the input as a <a href="#manifest-files"
+ >manifest file</a
+ >. As the first stage of the pipeline, Crossbow will download the files specified on each line of the manifest file and preprocess them into Crossbow's read format. The <a href="#cb-local-reference"
+ ><code
+ >--reference</code
+ ></a
+ > option specifies the location of the reference jar contents. The <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > option specifies where the final output is placed.</p
+><h3 id="single-computer"
+><a href="#TOC"
+ >Single computer</a
+ ></h3
+><div id="cb-example-e-coli-local" />
+<p
+>Test your Crossbow installation by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_local --test
+</code
+ ></pre
+><p
+>This will warn you if any supporting tools (<code
+ >bowtie</code
+ > and <code
+ >soapsnp</code
+ > in this case) cannot be located or run.</p
+><p
+>If you don't already have a <code
+ >CROSSBOW_REFS</code
+ > directory, choose one; it will be the default path Crossbow searches for reference jars. Permanently set the <code
+ >CROSSBOW_REFS</code
+ > environment variable to the selected directory.</p
+><p
+>Create a subdirectory called <code
+ >$CROSSBOW_REFS/e_coli</code
+ >:</p
+><pre
+><code
+ >mkdir $CROSSBOW_REFS/e_coli
+</code
+ ></pre
+><p
+>Download <code
+ >e_coli.jar</code
+ > from the following URL to the new <code
+ >e_coli</code
+ > directory:</p
+><pre
+><code
+ >http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this command:</p
+><pre
+><code
+ >wget -O $CROSSBOW_REFS/e_coli/e_coli.jar http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>Equivalently, you can use an <a href="#s3-tools"
+ >S3 tool</a
+ > to download the same file from this URL:</p
+><pre
+><code
+ >s3n://crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd get s3://crossbow-refs/e_coli.jar $CROSSBOW_REFS/e_coli/e_coli.jar
+</code
+ ></pre
+><p
+>Change to the new <code
+ >e_coli</code
+ > directory and expand <code
+ >e_coli.jar</code
+ > using an <code
+ >unzip</code
+ > or <code
+ >jar</code
+ > utility:</p
+><pre
+><code
+ >cd $CROSSBOW_REFS/e_coli && unzip e_coli.jar
+</code
+ ></pre
+><p
+>Now you may remove <code
+ >e_coli.jar</code
+ > to save space:</p
+><pre
+><code
+ >rm -f $CROSSBOW_REFS/e_coli/e_coli.jar
+</code
+ ></pre
+><p
+>Now run Crossbow. Change to the <code
+ >$CROSSBOW_HOME/example/e_coli</code
+ > directory and start the job via the <code
+ >cb_local</code
+ > script:</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/example/e_coli
+$CROSSBOW_HOME/cb_local \
+ --input=small.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/e_coli \
+ --output=output_small \
+ --all-haploids \
+ --cpus=<CPUS>
+</code
+ ></pre
+><p
+>Substitute the number of CPUs you'd like to use for <code
+ ><CPUS></code
+ >.</p
+><p
+>The <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > option instructs Crossbow to treat the input as a <a href="#manifest-files"
+ >manifest file</a
+ >. As the first stage of the pipeline, Crossbow will download the files specified on each line of the manifest file and "preprocess" them into a format understood by Crossbow. The <a href="#cb-local-reference"
+ ><code
+ >--reference</code
+ ></a
+ > option specifies the location of the reference jar contents. The <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > option specifies where the final output is placed. The <a href="#cb-local-cpus"
+ ><code
+ >--cpus</code
+ ></a
+ > option enables Crossbow to use up to the specified number of CPUs at any given time.</p
+><h2 id="mouse-chromosome-17-large"
+><a href="#TOC"
+ >Mouse chromosome 17 (large)</a
+ ></h2
+><p
+>Data for this example is taken from the study by <a href="http://genomebiology.com/2009/10/10/R112"
+ >Sudbury, Stalker et al</a
+ >.</p
+><h3 id="emr-1"
+><a href="#TOC"
+ >EMR</a
+ ></h3
+><div id="cb-example-mouse17-emr" />
+<h4 id="via-web-interface-1"
+><a href="#TOC"
+ >Via web interface</a
+ ></h4
+><p
+>First we build a reference jar for a mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds an index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Next, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload the <code
+ >mm9_chr17.jar</code
+ > file to the <code
+ >crossbow-refs</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+</code
+ ></pre
+><p
+>You may wish to remove the locally-generated reference jar files to save space. E.g.:</p
+><pre
+><code
+ >rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+</code
+ ></pre
+><p
+>Use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/mouse17/full.manifest</code
+ > to the <code
+ >example/mouse17</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+</code
+ ></pre
+><p
+>Direct your web browser to the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ > and fill in the form as below (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><div><img src="images/AWS_cb_mouse17_fillin.png" alt="" /><p><i>Crossbow web form filled in for the large Mouse Chromosome 17 example.</i></p>
+</div>
+<ol style="list-style-type: decimal;"
+><li
+ >For <strong
+ >AWS ID</strong
+ >, enter your AWS Access Key ID</li
+ ><li
+ >For <strong
+ >AWS Secret Key</strong
+ >, enter your AWS Secret Access Key</li
+ ><li
+ ><em
+ >Optional</em
+ >: For <strong
+ >AWS Keypair name</strong
+ >, enter the name of your AWS keypair. This is only necessary if you would like to be able to <a href="http://en.wikipedia.org/wiki/Secure_Shell"
+ >ssh</a
+ > into the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > cluster while it runs.</li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the AWS ID and Secret Key entered are valid by clicking the "Check credentials..." link</li
+ ><li
+ >For <strong
+ >Job name</strong
+ >, enter <code
+ >Crossbow-Mouse17</code
+ ></li
+ ><li
+ >Make sure that <strong
+ >Job type</strong
+ > is set to "Crossbow"</li
+ ><li
+ >For <strong
+ >Input URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/full.manifest</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Input URL exists by clicking the "Check that input URL exists..." link</li
+ ><li
+ >For <strong
+ >Output URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/output_full</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Output URL does not exist by clicking the "Check that output URL doesn't exist..." link</li
+ ><li
+ >For <strong
+ >Input type</strong
+ >, select "Manifest file"</li
+ ><li
+ >For <strong
+ >Genome/Annotation</strong
+ >, check the box labeled "Specify reference jar URL:" and enter <code
+ >s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar</code
+ > in the text box below</li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the reference jar URL exists by clicking the "Check that reference jar URL exists..." link</li
+ ><li
+ >For <strong
+ >Chromosome ploidy</strong
+ >, select "All are diploid"</li
+ ><li
+ >Click Submit</li
+ ></ol
+><p
+>This job typically takes about 45 minutes on 8 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > instances. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/output_full</code
+ > directory.</p
+><h4 id="via-command-line-1"
+><a href="#TOC"
+ >Via command line</a
+ ></h4
+><p
+>First we build a reference jar for the mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds an index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Next, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload the <code
+ >mm9_chr17.jar</code
+ > file to the <code
+ >crossbow-refs</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+</code
+ ></pre
+><p
+>You may wish to remove the locally-generated reference jar files to save space. E.g.:</p
+><pre
+><code
+ >rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+</code
+ ></pre
+><p
+>Use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/mouse17/full.manifest</code
+ > to the <code
+ >example/mouse17</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+</code
+ ></pre
+><p
+>To start the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job, run the following command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Mouse17" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/mouse17/full.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/mouse17/output_full \
+ --reference=s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar \
+ --instances 8
+</code
+ ></pre
+><p
+>This job typically takes about 45 minutes on 8 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > instances. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/output_full</code
+ > directory.</p
+><h3 id="hadoop-1"
+><a href="#TOC"
+ >Hadoop</a
+ ></h3
+><div id="cb-example-mouse17-hadoop" />
+<p
+>First we build a reference jar for the mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds an index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Next, use the <code
+ >hadoop</code
+ > script to put the <code
+ >mm9_chr17.jar</code
+ > file in the <code
+ >crossbow-refs</code
+ > <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > directory. Note that if <code
+ >hadoop</code
+ > is not in your <code
+ >PATH</code
+ >, you must specify <code
+ >hadoop</code
+ >'s full path instead:</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow-refs
+hadoop dfs -put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar /crossbow-refs/mm9_chr17.jar
+</code
+ ></pre
+><p
+>The first command will yield a warning if the directory already exists; ignore this. In this example, we deposit the jars in the <code
+ >/crossbow-refs</code
+ > directory, but any <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > directory is fine.</p
+><p
+>You may wish to remove the locally-generated reference jar files to save space. E.g.:</p
+><pre
+><code
+ >rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+</code
+ ></pre
+><p
+>Now install the <a href="#manifest-files"
+ >manifest file</a
+ > in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ >:</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow/example/mouse17
+hadoop dfs -put $CROSSBOW_HOME/example/mouse17/full.manifest /crossbow/example/mouse17/full.manifest
+</code
+ ></pre
+><p
+>To start the <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > job, run the following command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/mouse17/full.manifest \
+ --output=hdfs:///crossbow/example/mouse17/output_full \
+ --reference=hdfs:///crossbow-refs/mm9_chr17.jar
+</code
+ ></pre
+><h3 id="single-computer-1"
+><a href="#TOC"
+ >Single computer</a
+ ></h3
+><div id="cb-example-mouse17-local" />
+<p
+>First we build a reference jar for the mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds an index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><p
+>Run the following commands:</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Move the directory containing the new reference jar into the <code
+ >$CROSSBOW_REFS</code
+ > directory:</p
+><pre
+><code
+ >mv $CROSSBOW_HOME/reftools/mm9_chr17 $CROSSBOW_REFS/
+</code
+ ></pre
+><p
+>Now change to the <code
+ >$CROSSBOW_HOME/example/mouse17</code
+ > directory and run Crossbow (substitute the number of CPUs you'd like to use for <code
+ ><CPUS></code
+ >):</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/example/mouse17
+$CROSSBOW_HOME/cb_local \
+ --input=$CROSSBOW_HOME/example/mouse17/full.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/mm9_chr17 \
+ --output=output_full \
+ --cpus=<CPUS>
+</code
+ ></pre
+><h1 id="manifest-files"
+><a href="#TOC"
+ >Manifest files</a
+ ></h1
+><p
+>A manifest file describes a set of <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > or <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ > formatted input files that might be located:</p
+><ol style="list-style-type: decimal;"
+><li
+ >On the local computer</li
+ ><li
+ >In <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ ></li
+ ><li
+ >In <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ ></li
+ ><li
+ >On an FTP or web server</li
+ ></ol
+><p
+>A manifest file can contain any combination of URLs and local paths from these various types of sources.</p
+><p
+><a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > files can be gzip or bzip2-compressed (i.e. with <code
+ >.gz</code
+ > or <code
+ >.bz2</code
+ > file extensions). If <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ > files are specified in the manifest and Crossbow is being run in single-computer or <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > modes, then the <code
+ >fastq-dump</code
+ > tool must be installed and Crossbow must be able to locate it. See the <a href="#cb-local-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ > option and the <a href="#the-fastq-dump"
+ >SRA Toolkit section of the manual</a
+ >.</p
+><p
+>Each line in the manifest file represents either one file, for unpaired input reads, or a pair of files, for paired input reads. For a set of unpaired input reads, the line is formatted:</p
+><pre
+><code
+ >URL(tab)Optional-MD5
+</code
+ ></pre
+><p
+>Specifying an MD5 for the input file is optional. If it is specified, Crossbow will attempt to check the integrity of the file after downloading by comparing the observed MD5 to the user-provided MD5. To disable this checking, specify <code
+ >0</code
+ > in this field.</p
+><p
+>For a set of paired input reads, the line is formatted:</p
+><pre
+><code
+ >URL-1(tab)Optional-MD5-1(tab)URL-2(tab)Optional-MD5-2
+</code
+ ></pre
+><p
+>Where <code
+ >URL-1</code
+ > and <code
+ >URL-2</code
+ > point to input files with all the #1 mates in <code
+ >URL-1</code
+ > and all the #2 mates in <code
+ >URL-2</code
+ >. The entries in the files must be arranged so that pairs "line up" in parallel. This is commonly the way public paired-end FASTQ datasets, such as those produced by the <a href="http://www.1000genomes.org/page.php"
+ >1000 Genomes Project</a
+ >, are formatted. Typically these file pairs end in suffixes <code
+ >_1.fastq.gz</code
+ > and <code
+ >_2.fastq.gz</code
+ >.</p
+><p
+>Manifest files may have comment lines, which must start with the hash (<code
+ >#</code
+ >) symbol, and blank lines. Such lines are ignored by Crossbow.</p
+><p
+>For examples of manifest files, see the files ending in <code
+ >.manifest</code
+ > in the <code
+ >$CROSSBOW_HOME/example/e_coli</code
+ > and <code
+ >$CROSSBOW_HOME/example/mouse17</code
+ > directories.</p
+><h1 id="reference-jars"
+><a href="#TOC"
+ >Reference jars</a
+ ></h1
+><p
+>All information about a reference sequence needed by Crossbow is encapsulated in a "reference jar" file. A reference jar includes a set of FASTA files encoding the reference sequences, a <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > index of the reference sequence, and a set of files encoding information about known SNPs for the species.</p
+><p
+>A Crossbow reference jar is organized as:</p
+><ol style="list-style-type: decimal;"
+><li
+ >A <code
+ >sequences</code
+ > subdirectory containing one FASTA file per reference sequence.</li
+ ><li
+ >An <code
+ >index</code
+ > subdirectory containing the <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > index files for the reference sequences.</li
+ ><li
+ >A <code
+ >snps</code
+ > subdirectory containing all of the SNP description files.</li
+ ></ol
+><p
+>The FASTA files in the <code
+ >sequences</code
+ > subdirectory must each be named <code
+ >chrX.fa</code
+ >, where <code
+ >X</code
+ > is the 0-based numeric id of the chromosome or sequence in the file. For example, for a human reference, chromosome 1's FASTA file could be named <code
+ >chr0.fa</code
+ >, chromosome 2 named <code
+ >chr1.fa</code
+ >, etc, all the way up to chromosomes 22, X and Y, named <code
+ >chr21.fa</code
+ >, <code
+ >chr22.fa</code
+ > and <code
+ >chr23.fa</code
+ >. Also, the names of the sequences within the FASTA files must match the number in the file name. I.e., the first line of the FASTA file <code
+ >chr0.fa</code
+ > must be <code
+ >>0</code
+ >.</p
+><p
+>The index files in the <code
+ >index</code
+ > subdirectory must have the basename <code
+ >index</code
+ >. I.e., the index subdirectory must contain these files:</p
+><pre
+><code
+ >index.1.ebwt
+index.2.ebwt
+index.3.ebwt
+index.4.ebwt
+index.rev.1.ebwt
+index.rev.2.ebwt
+</code
+ ></pre
+><p
+>The index must be built using the <a href="http://bowtie-bio.sourceforge.net/manual.shtml#indx"
+ ><code
+ >bowtie-build</code
+ ></a
+ > tool distributed with <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >. When <code
+ >bowtie-build</code
+ > is executed, the FASTA files specified on the command line must be listed in ascending order of numeric id. For instance, for a set of FASTA files encoding human chromosomes 1,2,...,22,X,Y as <code
+ >chr0.fa</code
+ >,<code
+ >chr1.fa</code
+ >,...,<code
+ >chr21.fa</code
+ >, <code
+ >chr22.fa</code
+ >,<code
+ >chr23.fa</code
+ >, the command for <code
+ >bowtie-build</code
+ > must list the FASTA files in that order:</p
+><pre
+><code
+ >bowtie-build chr0.fa,chr1.fa,...,chr23.fa index
+</code
+ ></pre
+><p
+>The SNP description files in the <code
+ >snps</code
+ > subdirectory must also have names that match the corresponding FASTA files in the <code
+ >sequences</code
+ > subdirectory, but with extension <code
+ >.snps</code
+ >. E.g. if the sequence file for human Chromosome 1 is named <code
+ >chr0.fa</code
+ >, then the SNP description file for Chromosome 1 must be named <code
+ >chr0.snps</code
+ >. SNP description files may be omitted for some or all chromosomes.</p
+><p
+>The format of the SNP description files must match the format expected by <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >'s <code
+ >-s</code
+ > option. The format consists of 1 SNP per line, with the following tab-separated fields per SNP:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Chromosome ID</li
+ ><li
+ >1-based offset into chromosome</li
+ ><li
+ >Whether SNP has allele frequency information (1 = yes, 0 = no)</li
+ ><li
+ >Whether SNP is validated by experiment (1 = yes, 0 = no)</li
+ ><li
+ >Whether SNP is actually an indel (1 = yes, 0 = no)</li
+ ><li
+ >Frequency of A allele, as a decimal number</li
+ ><li
+ >Frequency of C allele, as a decimal number</li
+ ><li
+ >Frequency of T allele, as a decimal number</li
+ ><li
+ >Frequency of G allele, as a decimal number</li
+ ><li
+ >SNP id (e.g. a <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ > id such as <code
+ >rs9976767</code
+ >)</li
+ ></ol
+><p
+>Once these three subdirectories have been created and populated, they can be combined into a single <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > with a command like this:</p
+><pre
+><code
+ >jar cf ref-XXX.jar sequences snps index
+</code
+ ></pre
+><p
+>To use <code
+ >ref-XXX.jar</code
+ > with Crossbow, you must copy it to a location where it can be downloaded over the internet via HTTP, FTP, or S3. Once it is placed in such a location, make a note of its URL.</p
+><h2 id="building-a-reference-jar-using-automatic-scripts"
+><a href="#TOC"
+ >Building a reference jar using automatic scripts</a
+ ></h2
+><p
+>The <code
+ >reftools</code
+ > subdirectory of the Crossbow package contains scripts that assist in building reference jars, including scripts that handle the entire process of building reference jars for <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > (UCSC human genome build 18) and <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ > (UCSC mouse genome build 9). The <code
+ >db2ssnp</code
+ > script combines SNP and allele frequency information from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ > to create a <code
+ >chrX.snps</code
+ > file for the <code
+ >snps</code
+ > subdirectory of the reference jar. The <code
+ >db2ssnp_*</code
+ > scripts drive the <code
+ >db2ssnp</code
+ > script for each chromosome in the <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > and <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ > genomes. The <code
+ >*_jar</code
+ > scripts drive the entire reference-jar building process, including downloading reference FASTA files, building a Bowtie index, and using <code
+ >db2ssnp</code
+ > to generate the <code
+ >.snps</code
+ > files for <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > and <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >.</p
+><h1 id="monitoring-debugging-and-logging"
+><a href="#TOC"
+ >Monitoring, debugging and logging</a
+ ></h1
+><h2 id="single-computer-2"
+><a href="#TOC"
+ >Single computer</a
+ ></h2
+><p
+>Single-computer runs of Crossbow are relatively easy to monitor and debug. Progress messages are printed to the console as the job runs. When there is a fatal error, Crossbow usually indicates exactly which log file on the local filesystem contains the relevant error message. Additional debugging is possible when intermediate and temporary files are kept rather than discarded; see <a href="#cb-local-keep-intermediates"
+ ><code
+ >--keep-intermediates</code
+ ></a
+ > and <a href="#cb-local-keep-all"
+ ><code
+ >--keep-all</code
+ ></a
+ >. All output and logs are stored on the local filesystem; see <a href="#cb-local-intermediate"
+ ><code
+ >--intermediate</code
+ ></a
+ > and <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > options.</p
+><h2 id="hadoop-2"
+><a href="#TOC"
+ >Hadoop</a
+ ></h2
+><p
+>The simplest way to monitor Crossbow <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > jobs is via the Hadoop JobTracker. The JobTracker is a web server that provides a point-and-click interface for monitoring jobs and reading output and other log files generated by those jobs, including after they've finished.</p
+><p
+>When a job fails, you can often find the relevant error message by "drilling down" from the "step" level through the "job" level and "task" levels, and finally to the "attempt" level. To diagnose why an attempt failed, click through to the "stderr" ("standard error") log and scan for the relevant error message.</p
+><p
+>See your version of Hadoop's documentation for details on how to use the web interface. Amazon has a brief document describing <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?UsingtheHadoopUserInterface.html"
+ >How to Use the Hadoop User Interface</a
+ >, though some of the instructions are specific to clusters rented from Amazon. <a href="http://oreilly.com/catalog/9780596521981"
+ >Hadoop, the Definitive Guide</a
+ > is also an excellent reference.</p
+><h2 id="emr-2"
+><a href="#TOC"
+ >EMR</a
+ ></h2
+><p
+>The recommended way to monitor EMR <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > jobs is via the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >. The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > allows you to see:</p
+><ol style="list-style-type: decimal;"
+><li
+ >The status for each job (e.g. "COMPLETED", "RUNNING" or "FAILED")</li
+ ><li
+ >The status for each step of each job</li
+ ><li
+ >How long a job has been running for and how many "compute units" have been utilized so far.</li
+ ><li
+ >The exact Hadoop commands used to initiate each job step.</li
+ ><li
+ >The button for <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >Debugging Job Flows</a
+ ></li
+ ></ol
+><div><img src="images/AWS_console.png" alt="Screen shot of AWS console with interface elements labeled" /><p><i>Screen shot of <a href="https://console.aws.amazon.com"
+>AWS Console</a
+> interface with some relevant interface elements labeled</i></p>
+</div>
+<p
+>The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > also has a useful facility for <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >Debugging Job Flows</a
+ >, which is accessible via the "Debug" button on the "Elastic MapReduce" tab of the Console (labeled "5"). You must (a) have a <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > account (b) not have specified <a href="#cb-no-emr-debug"
+ ><code
+ >--no-emr-debug</code
+ ></a
+ > in order to use all of the <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >EMR Debug</a
+ > interface's features:</p
+><div><img src="images/AWS_console_debug.png" alt="Screen shot of AWS console debug interface" /><p><i>Screen shot of <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+>EMR Debug</a
+> interface</i></p>
+</div>
+<p
+>The debug interface is similar to Hadoop's JobTracker interface. When a job fails, you can often find the relevant error message by "drilling down" from the "job" level, through the "task" level, and finally to the "attempt" level. To diagnose why an attempt failed, click through to the "stderr" ("standard error") log and scan for the relevant error message.</p
+><p
+>For more information, see Amazon's document on <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >Debugging Job Flows</a
+ >.</p
+><h2 id="aws-management-console"
+><a href="#TOC"
+ >AWS Management Console</a
+ ></h2
+><p
+>A simple way to monitor your EMR activity is via the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >. The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > summarizes current information regarding all your running <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > nodes and <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > jobs. Each job is listed in the "Amazon Elastic MapReduce" tab of the console, whereas individual <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > nodes are listed in the "Amazon EC2" tab.</p
+><div><img src="images/AWS_console_upper_left.png" alt="Screen shot of AWS console tabs" /><p><i>Screen shot of <a href="https://console.aws.amazon.com"
+>AWS console</a
+>; note tabs for "Amazon Elastic MapReduce" and "Amazon EC2"</i></p>
+</div>
+<h1 id="crossbow-output"
+><a href="#TOC"
+ >Crossbow Output</a
+ ></h1
+><p
+>Once a Crossbow job completes successfully, the output is deposited in a <code
+ >crossbow_results</code
+ > subdirectory of the specified <code
+ >--output</code
+ > directory or URL. Within the <code
+ >crossbow_results</code
+ > subdirectory, results are organized as one gzipped result file per chromosome. E.g. if your run was against the <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > build of the human genome, the output files from your experiment will be named:</p
+><pre
+><code
+ ><output_url>/crossbow_results/chr1.gz
+<output_url>/crossbow_results/chr2.gz
+<output_url>/crossbow_results/chr3.gz
+...
+<output_url>/crossbow_results/chr21.gz
+<output_url>/crossbow_results/chr22.gz
+<output_url>/crossbow_results/chrX.gz
+<output_url>/crossbow_results/chrY.gz
+<output_url>/crossbow_results/chrM.gz
+</code
+ ></pre
+><p
+>Each individual record is in the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > output format. SOAPsnp's format consists of 1 SNP per line with several tab-separated fields per SNP. The fields are:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Chromosome ID</li
+ ><li
+ >1-based offset into chromosome</li
+ ><li
+ >Reference genotype</li
+ ><li
+ >Subject genotype</li
+ ><li
+ >Quality score of subject genotype</li
+ ><li
+ >Best base</li
+ ><li
+ >Average quality score of best base</li
+ ><li
+ >Count of uniquely aligned reads corroborating the best base</li
+ ><li
+ >Count of all aligned reads corroborating the best base</li
+ ><li
+ >Second best base</li
+ ><li
+ >Average quality score of second best base</li
+ ><li
+ >Count of uniquely aligned reads corroborating second best base</li
+ ><li
+ >Count of all aligned reads corroborating second best base</li
+ ><li
+ >Overall sequencing depth at the site</li
+ ><li
+ >Sequencing depth of just the paired alignments at the site</li
+ ><li
+ >Rank sum test P-value</li
+ ><li
+ >Average copy number of nearby region</li
+ ><li
+ >Whether the site is a known SNP from the file specified with <code
+ >-s</code
+ ></li
+ ></ol
+><p
+>Note that field 15 was added in Crossbow and is not output by unmodified SOAPsnp.</p
+><p
+>For further details, see the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > manual.</p
+><h1 id="other-reading"
+><a href="#TOC"
+ >Other reading</a
+ ></h1
+><p
+>The <a href="http://genomebiology.com/2009/10/11/R134"
+ >Crossbow paper</a
+ > discusses the broad design philosophy of both <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > and <a href="http://bowtie-bio.sf.net/myrna"
+ >Myrna</a
+ > and why cloud computing can be considered a useful trend for comparative genomics applications. The <a href="http://genomebiology.com/2009/10/3/R25"
+ >Bowtie paper</a
+ > discusses the alignment algorithm underlying <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >.</p
+><p
+>For additional information regarding Amazon EC2, S3, EMR, and related services, see Amazon's <a href="http://aws.amazon.com/documentation/"
+ >AWS Documentation</a
+ >. Some helpful screencasts are posted on the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > home page.</p
+><p
+>For additional information regarding Hadoop, see the <a href="http://hadoop.apache.org/"
+ >Hadoop web site</a
+ > and <a href="http://www.cloudera.com/resource/getting_started_with_hadoop"
+ >Cloudera's Getting Started with Hadoop</a
+ > document. <a href="http://www.cloudera.com/developers/downloads/virtual-machine/"
+ >Cloudera's training virtual machine</a
+ > for <a href="http://www.vmware.com/"
+ >VMWare</a
+ > is an excellent way to get acquainted with Hadoop without having to install it on a production cluster.</p
+><h1 id="acknowledgements"
+><a href="#TOC"
+ >Acknowledgements</a
+ ></h1
+><p
+><a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > software is by <a href="http://faculty.jhsph.edu/default.cfm?faculty_id=2209&grouped=false&searchText=&department_id=3&departmentName=Biostatistics"
+ >Ben Langmead</a
+ > and <a href="http://www.cbcb.umd.edu/~mschatz/"
+ >Michael C. Schatz</a
+ >.</p
+><p
+><a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > software is by <a href="http://faculty.jhsph.edu/default.cfm?faculty_id=2209&grouped=false&searchText=&department_id=3&departmentName=Biostatistics"
+ >Ben Langmead</a
+ > and <a href="http://www.cs.umd.edu/~cole/"
+ >Cole Trapnell</a
+ >.</p
+><p
+><a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > is by Ruiqiang Li, Yingrui Li, Xiaodong Fang, Huanming Yang, Jian Wang, Karsten Kristiansen, and Jun Wang.</p
+>
+</body>
+</html>
diff --git a/doc/strip_markdown.pl b/doc/strip_markdown.pl
new file mode 100644
index 0000000..c0c86b4
--- /dev/null
+++ b/doc/strip_markdown.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl -w
+
+# Used to convert MANUAL.markdown to MANUAL.
+
+use strict;
+use warnings;
+
+my $prev_blank = 0;    # true when the most recently seen line was blank
+
+# Filter every input line: drop markup-only lines, strip link decorations,
+# and squeeze consecutive blank lines down to a single one.
+while (my $line = <>) {
+    # Drop HTML comment delimiters and lines whose first non-blank is '!'.
+    next if $line =~ /^\s*<!--/;
+    next if $line =~ /^\s*!/;
+    next if $line =~ /^\s*-->/;
+    # Drop internal link definitions such as "[name]: #anchor".
+    next if $line =~ /\[.*\]: #/;
+    # Drop lines that open with '<' (after <= 3 blanks) and end with '>'.
+    next if $line =~ /^\s?\s?\s?<.*>\s*$/;
+    # Drop table openers and cell closers even when text follows the tag.
+    next if $line =~ /^\s*<table/;
+    next if $line =~ /^\s*<\/td/;
+    # Unwrap bracketed code spans: [`...`] becomes `...`.
+    $line =~ s/\[`/`/g;
+    $line =~ s/`\]/`/g;
+    # Remove internal link targets of the form [#...] and (#...).
+    $line =~ s/\[#[^\]]*\]//g;
+    $line =~ s/\(#[^\)]*\)//g;
+    # Replace a leading '####' heading marker; only when that did not
+    # apply can the '###' rule still match at the start of the line.
+    $line =~ s/^####/ /;
+    $line =~ s/^###/ /;
+    # Emit at most one blank line in a row.
+    my $is_blank = ($line =~ /^\s*$/) ? 1 : 0;
+    next if $is_blank and $prev_blank;
+    $prev_blank = $is_blank;
+    print $line;
+}
diff --git a/doc/style.css b/doc/style.css
new file mode 100644
index 0000000..146c76d
--- /dev/null
+++ b/doc/style.css
@@ -0,0 +1,145 @@
+/*
+Stylesheet for the free sNews15_1 template
+from http://www.free-css-templates.com
+*/
+
+/* Reset all margins and paddings for browsers */
+* {
+ padding: 0;
+ margin: 0;
+}
+
+body {
+ font: .8em Verdana, Arial, Sans-Serif;
+ line-height: 1.6em;
+ margin: 0;
+ /* background-image: url(../images/bg.jpg); */
+ /* background-repeat: repeat */
+}
+
+#wrap { margin: 0 auto; width: 95% }
+
+/* TOP HEADER -------- */
+#top {
+ margin: 0 auto;
+ padding: 0;
+ color: #666;
+ background: #FFF url(../images/cbcbstrip.jpg) repeat-x top;
+ height: 81px;
+}
+#top h1 { padding: 10px 0 0 25px; color: #FFF; font-size: 240%; background: transparent;}
+#top h2 { padding: 0px 0 0 25px; color: #bbb; font-size: 100%; background: transparent;}
+#top .padding { padding-top: 5px; }
+
+/* SEARCH BOX AND BUTTON ----------*/
+#search { float: right; padding: 10px 25px 0 0; }
+
+#subheader {
+ clear: both;
+ border-top: 1px dotted #888;
+ border-bottom: 1px dotted #888;
+ background: #eaeaea;
+ color: #505050;
+ padding: 1em;
+ margin: 15px 0px 10px 0px;
+
+}
+#subheader a { text-decoration: none; /* border-bottom: 1px dashed #0066B3; */ }
+
+
+#main { background: #FFF; margin: 25px 0 15px 0; color: #666; }
+
+#main #rightside {
+ width: 300px;
+ float: right;
+ background: #FFF;
+ margin-right: 0px;
+ color: #555;
+
+}
+
+#main #rightside .box {
+ background: #efefef;
+ margin-bottom: 10px;
+ padding: 5px;
+ color: #555;
+}
+
+#main #rightside h2 {
+ font: bold 1.0em Arial, Arial, Sans-Serif;
+ height: 18px;
+ padding: 3px;
+ color: #666;
+}
+
+/* LEFT SIDE - ARTICLES AREA -------- */
+#leftside {
+ padding-left: 8px;
+ color: #555;
+ background: #FFF;
+ margin-right: 305px;
+ margin-left: 0px;
+
+}
+#leftside h1 { padding: 15px 0 10px 0 }
+#leftside h2 { padding: 15px 0 10px 0; color: #555; text-indent: 17px }
+/* #leftside h3 { font-size: 100%; margin-left: 15px; text-indent: 17px; background: #FFF url(../images/head.gif) no-repeat left; } */
+#leftside h3 { padding: 15px 0 10px 0; font-size: 100%; margin-left: 5px; text-indent: 17px; background: #FFF url(../images/head.gif) no-repeat left; }
+#leftside ul { margin-left: 24px; padding-left: 24px; list-style-type: circle } /* article-column bullet lists: indented, circle markers */
+#leftside li { }
+#leftside p { padding: 0px 0 10px 0 }
+
+#footer {
+ clear: both;
+ background: #FFF url(../images/footer.jpg) repeat-x;
+ height: 46px;
+ margin-left: 0px;
+ margin-right: 0px;
+ font-size: 75%;
+ color: #666;
+}
+#footer p { padding: 5px }
+#footer .rside { float: right; display: inline; padding: 5px; text-align: right}
+
+#toc {
+ margin: 15px 15px 15px 10px;
+}
+#toc ol { list-style: roman }
+#toc ul { padding: 0 0 0 20px }
+
+a { color: #0066B3; background: inherit; text-decoration: none }
+h1 { margin: 0 15px 10px 15px; padding: 10px 0 10px 0; font: bold 1.9em Arial, Arial, Sans-Serif }
+h2 { margin: 0 15px 10px 15px; padding: 10px 0 10px 0; font: bold 1.2em Arial, Arial, Sans-Serif }
+h3 { margin: 0 15px 10px 20px; padding: 10px 0 10px 0; font: 1.2em Arial, Arial, Sans-Serif }
+h4 { margin: 0 15px 10px 25px; padding: 10px 0 10px 0; font: 1.1em Arial, Arial, Sans-Serif }
+p { margin: 0 15px 10px 15px; color: #444 }
+table { margin-top: 15px }
+ul { margin: 0 15px 10px 15px; /* NOTE: superseded by the later "margin: 0" in this rule */
+ padding: 0;
+ padding-bottom: 10px; /* longhand after the shorthand so only the bottom edge is padded */
+ margin: 0; }
+pre { margin: 0 15px 15px 25px; font-family: "Courier New", Courier; }
+li { margin: 0 15px 1px 15px; color: #444 }
+ol { margin-left: 24px; /* numbered lists: indented, decimal markers, space below */
+ padding-left: 24px;
+ padding-bottom: 20px;
+ list-style: decimal }
+td { vertical-align: top; }
+blockquote { margin-left: 35px; font-family: "Courier New", Courier; }
+tt { font-family: "Courier New", Courier; }
+img { padding:8px; }
+code { font-family: "Courier New", Courier; }
+.date { border-top: 1px solid #e5e5e5; text-align: right; margin-bottom: 25px; margin-top: 5px;}
+#main #leftside .date a, #main #rightside a { border: 0; text-decoration: none; }
+
+.comment .date { text-align: left; border: 0;}
+
+#leftside #txt {width: 100%; height: 10em; padding: 3px 3px 3px 6px; margin-left:0em;}
+#leftside textarea { border: 1px solid #bbb; width: 100%; }
+
+
+/* SNEWS */
+#main #leftside fieldset { float: left; width: 100%; border: 1px solid #ccc; padding: 10px 8px; margin: 0 10px 8px 0; background: #FFF; color: #000; }
+#main #leftside fieldset p { width: 100%; }
+#main input { padding: 3px; margin: 0; border: 1px solid #bbb }
+
diff --git a/doc/website/faq.shtml b/doc/website/faq.shtml
new file mode 100644
index 0000000..493a620
--- /dev/null
+++ b/doc/website/faq.shtml
@@ -0,0 +1,34 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>Crossbow: Whole Genome Resequencing Analysis in the Clouds</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+<link rel="stylesheet" type="text/css" href="../css/style.css" media="screen" />
+<meta name="verify-v1" content="YJT1CfXN3kzE9cr+jvNB+Q73lTfHrv8eivoY+xjblc0=" />
+</head>
+<body>
+<div id="wrap">
+ <!--#include virtual="top.ssi" -->
+ <div id="main">
+ <!--#include virtual="rhsidebar.ssi" -->
+ <div id="leftside">
+ <h2>Frequently Asked Questions</h2>
+ <!--#include virtual="faq.ssi" -->
+ </div>
+ </div>
+ <!--#include virtual="foot.ssi" -->
+</div>
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-5334290-1");
+pageTracker._trackPageview();
+</script>
+<!-- End google analytics code -->
+
+</body>
+</html>
diff --git a/doc/website/faq.ssi b/doc/website/faq.ssi
new file mode 100644
index 0000000..916484f
--- /dev/null
+++ b/doc/website/faq.ssi
@@ -0,0 +1,15 @@
+<h2>Does Crossbow support colorspace?</h2>
+Yes, but note that the pre-built reference jars available from this
+site are currently all in nucleotide space. In the future, we will
+also post colorspace versions of the reference jars. That said,
+Crossbow works fine with colorspace reads: just ensure that you specify
+a colorspace reference jar and pass -C to Bowtie.
+
+<h2>Can Crossbow handle a mix of colorspace and normal (nucleotide-space) reads in a single job?</h2>
+No. As of now, Crossbow applies the same set of Bowtie arguments to
+every invocation of Bowtie, and so cannot be used on both types of reads
+in the same job.
+
+<h2>Can Crossbow output alignments in SAM or BAM format?</h2>
+No. While Bowtie itself can certainly do this, Crossbow does not yet have a
+facility for outputting a directory of SAM/BAM files either as the final result or as an intermediate result.
diff --git a/doc/website/foot.ssi b/doc/website/foot.ssi
new file mode 100644
index 0000000..1b1701c
--- /dev/null
+++ b/doc/website/foot.ssi
@@ -0,0 +1,9 @@
+ <div id="footer">
+ <table width="100%" cellspacing=15><tr><td>
+ This research was supported in part by NIH grants R01-LM006845, R01-GM083873 and P41HG004059.
+ </td><td align=right>
+ Administrator: <a href="mailto:langmead at umd.edu">Ben Langmead</a>. Design by <a href="http://www.free-css-templates.com" title="Design by David Herreman">David Herreman</a>
+ </td></tr>
+ </table>
+ <center><a href="http://www.sourceforge.net"><img src="../images/sflogo.png" alt="Sourceforge.net" style="border-style: none"></img></a></center>
+ </div>
diff --git a/doc/website/index.html b/doc/website/index.html
new file mode 100644
index 0000000..e05e5ea
--- /dev/null
+++ b/doc/website/index.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<html>
+<head>
+<title>Crossbow: Whole Genome Resequencing Analysis in the Clouds</title>
+<meta http-equiv="REFRESH" content="0;url=index.shtml"></HEAD>
+<BODY>
+Redirecting to Crossbow.
+</BODY>
+</HTML>
diff --git a/doc/website/index.shtml b/doc/website/index.shtml
new file mode 100644
index 0000000..6636d3f
--- /dev/null
+++ b/doc/website/index.shtml
@@ -0,0 +1,68 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>Crossbow: Whole Genome Resequencing Analysis in the Clouds</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+<link rel="stylesheet" type="text/css" href="../css/style.css" media="screen" />
+<meta name="verify-v1" content="YJT1CfXN3kzE9cr+jvNB+Q73lTfHrv8eivoY+xjblc0=" />
+</head>
+<body>
+<div id="wrap">
+ <!--#include virtual="top.ssi" -->
+ <div id="subheader">
+ <table width="100%"><tr>
+ <td>
+
+ <strong>Crossbow</strong> is a scalable software pipeline for whole genome resequencing analysis.
+ It combines <a href="http://bowtie-bio.sf.net">Bowtie</a>, an ultrafast and memory efficient short read
+ aligner, and <a href="http://soap.genomics.org.cn/soapsnp.html">SOAPsnp</a>, an accurate
+ genotyper.
+
+ These
+ tools are combined in an automatic, parallel pipeline that runs in the
+ cloud (<a href="http://aws.amazon.com/elasticmapreduce/">Elastic MapReduce</a> in this case), on a local
+ <a href="http://hadoop.apache.org/">Hadoop</a>
+ cluster, or on a single computer, exploiting
+ multiple computers and CPUs wherever possible.
+
+ The pipeline can
+ analyze over 35x coverage of a human genome in one day on a 10-node local cluster, or in 3 hours for about
+ $85 using a 40-node, 320-core cluster rented from <a href="http://aws.amazon.com">Amazon Web Services</a>.
+ </td>
+
+
+<td align=right valign=middle>
+ <a href="http://opensource.org"><img border=0 alt="Open Source Software" src="/images/osi-certified.gif"></img></a>
+ </td></tr>
+ </table>
+ </div>
+ <div id="main">
+ <!--#include virtual="rhsidebar.ssi" -->
+ <div id="leftside">
+ <!--#include virtual="recent_news.ssi" -->
+ </div>
+ </div>
+ <div id="footer">
+ <table width="100%" cellspacing=15><tr><td>
+ This research was supported in part by NIH grants R01-LM006845 and R01-GM083873.
+ </td><td align=right>
+ Administrator: <a href="mailto:langmead at umd.edu">Ben Langmead</a>. Design by <a href="http://www.free-css-templates.com" title="Design by David Herreman">David Herreman</a>
+ </td></tr>
+ </table>
+ <center><a href="http://www.sourceforge.net"><img src="/images/sflogo.png" alt="Sourceforge.net" style="border-style: none"></img></a></center>
+ </div>
+</div>
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-5334290-1");
+pageTracker._trackPageview();
+</script>
+<!-- End google analytics code -->
+
+</body>
+</html>
diff --git a/doc/website/manual.shtml b/doc/website/manual.shtml
new file mode 100644
index 0000000..bfbd53c
--- /dev/null
+++ b/doc/website/manual.shtml
@@ -0,0 +1,33 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>Crossbow: Manual</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+<link rel="stylesheet" type="text/css" href="../css/style.css" media="screen" />
+<meta name="verify-v1" content="YJT1CfXN3kzE9cr+jvNB+Q73lTfHrv8eivoY+xjblc0=" />
+</head>
+<body>
+<div id="wrap">
+ <!--#include virtual="top.ssi" -->
+ <div id="main">
+ <!--#include virtual="rhsidebar.ssi" -->
+ <div id="manual" style="margin-right: 310px; margin-left: 0px; width: auto;">
+ <!--#include virtual="manual.ssi" -->
+ </div>
+ </div>
+ <!--#include virtual="foot.ssi" -->
+</div>
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-5334290-1");
+pageTracker._trackPageview();
+</script>
+<!-- End google analytics code -->
+
+</body>
+</html>
diff --git a/doc/website/manual.ssi b/doc/website/manual.ssi
new file mode 100644
index 0000000..eb08410
--- /dev/null
+++ b/doc/website/manual.ssi
@@ -0,0 +1,3821 @@
+<h1>Table of Contents</h1>
+<p> Version <b>1.2.0</b></p>
+<!-- paste everything from div id="toc" on here -->
+<div id="TOC"
+><ul
+ ><li
+ ><a href="#what-is-crossbow"
+ >What is Crossbow?</a
+ ></li
+ ><li
+ ><a href="#a-word-of-caution"
+ >A word of caution</a
+ ></li
+ ><li
+ ><a href="#crossbow-modes-and-prerequisites"
+ >Crossbow modes and prerequisites</a
+ ></li
+ ><li
+ ><a href="#preparing-to-run-on-amazon-elastic-mapreduce"
+ >Preparing to run on Amazon Elastic MapReduce</a
+ ><ul
+ ><li
+ ><a href="#installing-amazons-elastic-mapreduce-tool"
+ >Installing Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ ></li
+ ><li
+ ><a href="#s3-tools"
+ >S3 tools</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#installing-crossbow"
+ >Installing Crossbow</a
+ ><ul
+ ><li
+ ><a href="#the-sra-toolkit"
+ >The SRA toolkit</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow"
+ >Running Crossbow</a
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-emr-via-the-emr-web-interface"
+ >Running Crossbow on EMR via the EMR web interface</a
+ ><ul
+ ><li
+ ><a href="#prerequisites"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run"
+ >To run</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-emr-via-the-command-line"
+ >Running Crossbow on EMR via the command line</a
+ ><ul
+ ><li
+ ><a href="#prerequisites-1"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run-1"
+ >To run</a
+ ></li
+ ><li
+ ><a href="#emr-specific-options"
+ >EMR-specific options</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-a-hadoop-cluster-via-the-command-line"
+ >Running Crossbow on a Hadoop cluster via the command line</a
+ ><ul
+ ><li
+ ><a href="#prerequisites-2"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run-2"
+ >To run</a
+ ></li
+ ><li
+ ><a href="#hadoop-specific-options"
+ >Hadoop-specific options</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#running-crossbow-on-a-single-computer-via-the-command-line"
+ >Running Crossbow on a single computer via the command line</a
+ ><ul
+ ><li
+ ><a href="#prerequisites-3"
+ >Prerequisites</a
+ ></li
+ ><li
+ ><a href="#to-run-3"
+ >To run</a
+ ></li
+ ><li
+ ><a href="#local-run-specific-options"
+ >Local-run-specific options</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#general-crossbow-options"
+ >General Crossbow options</a
+ ></li
+ ><li
+ ><a href="#crossbow-examples"
+ >Crossbow examples</a
+ ><ul
+ ><li
+ ><a href="#e.-coli-small"
+ >E. coli (small)</a
+ ><ul
+ ><li
+ ><a href="#emr"
+ >EMR</a
+ ><ul
+ ><li
+ ><a href="#via-web-interface"
+ >Via web interface</a
+ ></li
+ ><li
+ ><a href="#via-command-line"
+ >Via command line</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#hadoop"
+ >Hadoop</a
+ ></li
+ ><li
+ ><a href="#single-computer"
+ >Single computer</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#mouse-chromosome-17-large"
+ >Mouse chromosome 17 (large)</a
+ ><ul
+ ><li
+ ><a href="#emr-1"
+ >EMR</a
+ ><ul
+ ><li
+ ><a href="#via-web-interface-1"
+ >Via web interface</a
+ ></li
+ ><li
+ ><a href="#via-command-line-1"
+ >Via command line</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#hadoop-1"
+ >Hadoop</a
+ ></li
+ ><li
+ ><a href="#single-computer-1"
+ >Single computer</a
+ ></li
+ ></ul
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#manifest-files"
+ >Manifest files</a
+ ></li
+ ><li
+ ><a href="#reference-jars"
+ >Reference jars</a
+ ><ul
+ ><li
+ ><a href="#building-a-reference-jar-using-automatic-scripts"
+ >Building a reference jar using automatic scripts</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#monitoring-debugging-and-logging"
+ >Monitoring, debugging and logging</a
+ ><ul
+ ><li
+ ><a href="#single-computer-2"
+ >Single computer</a
+ ></li
+ ><li
+ ><a href="#hadoop-2"
+ >Hadoop</a
+ ></li
+ ><li
+ ><a href="#emr-2"
+ >EMR</a
+ ></li
+ ><li
+ ><a href="#aws-management-console"
+ >AWS Management Console</a
+ ></li
+ ></ul
+ ></li
+ ><li
+ ><a href="#crossbow-output"
+ >Crossbow Output</a
+ ></li
+ ><li
+ ><a href="#other-reading"
+ >Other reading</a
+ ></li
+ ><li
+ ><a href="#acknowledgements"
+ >Acknowledgements</a
+ ></li
+ ></ul
+ ></div
+>
+<h1 id="what-is-crossbow"
+><a href="#TOC"
+ >What is Crossbow?</a
+ ></h1
+><p
+><a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > is a scalable, portable, and automatic Cloud Computing tool for finding SNPs from short read data. Crossbow employs <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > and a modified version of <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > to perform the short read alignment and SNP calling respectively. Crossbow is designed to be easy to run (a) in "the cloud" (in this case, Amazon's <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > service), (b) on any <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster, or (c) on any single computer, without <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ >. Crossbow exploits the availability of multiple computers and processors where possible.</p
+><h1 id="a-word-of-caution"
+><a href="#TOC"
+ >A word of caution</a
+ ></h1
+><p
+>Renting resources from <a href="http://aws.amazon.com"
+ >Amazon Web Services</a
+ > (AKA <a href="http://aws.amazon.com/" title="Amazon Web Services"
+ >AWS</a
+ >), costs money, regardless of whether your experiment ultimately succeeds or fails. In some cases, Crossbow or its documentation may be partially to blame for a failed experiment. While we are happy to accept bug reports, we do not accept responsibility for financial damage caused by these errors. Crossbow is provided "as is" with no warranty. See <code
+ >LICENSE</code
+ > file.</p
+><h1 id="crossbow-modes-and-prerequisites"
+><a href="#TOC"
+ >Crossbow modes and prerequisites</a
+ ></h1
+><p
+>Crossbow can be run in four different ways.</p
+><ol style="list-style-type: decimal;"
+><li
+ ><strong
+ >Via the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ ></strong
+ ></li
+ ></ol
+><p
+>In this case, the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > code and the user interface are installed on EC2 web servers. Also, the computers running the Crossbow computation are rented from Amazon, and the user must have <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts and must pay the <a href="http://aws.amazon.com/ec2/#pricing"
+ >going rate</a
+ > for the resources used. The user does not need any special software besides a web browser and, in most cases, an <a href="#s3-tools"
+ >S3 tool</a
+ >.</p
+><ol start="2" style="list-style-type: decimal;"
+><li
+ ><strong
+ >On Amazon <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > via the command-line</strong
+ ></li
+ ></ol
+><p
+>In this case, the Crossbow code is hosted by Amazon and the computers running the Crossbow computation are rented from Amazon. However, the user must install and run (a) the Crossbow scripts, which require <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > 5.6 or later, (b) Amazon's <a href="http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1"
+ ><code
+ >elastic-mapreduce</code
+ ></a
+ > script, which requires Ruby 1.8 or later, and (c) an <a href="#s3-tools"
+ >S3 tool</a
+ >. The user must have <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts and must pay the <a href="http://aws.amazon.com/ec2/#pricing"
+ >going rate</a
+ > for the resources used.</p
+><ol start="3" style="list-style-type: decimal;"
+><li
+ ><strong
+ >On a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster via the command-line</strong
+ ></li
+ ></ol
+><p
+>In this case, the Crossbow code is hosted on your <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster, as are supporting tools: <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >, <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >, and possibly <code
+ >fastq-dump</code
+ >. Supporting tools must be installed on all cluster nodes, but the Crossbow scripts need only be installed on the master. Crossbow was tested with <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > versions 0.20 and 0.20.205, and might also be compatible with other versions newer than 0.20. Crossbow scripts require <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > 5.6 or later.</p
+><ol start="4" style="list-style-type: decimal;"
+><li
+ ><strong
+ >On any computer via the command-line</strong
+ ></li
+ ></ol
+><p
+>In this case, the Crossbow code and all supporting tools (<a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >, <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >, and possibly <code
+ >fastq-dump</code
+ >) must be installed on the computer running Crossbow. Crossbow scripts require <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > 5.6 or later. The user specifies the maximum number of CPUs that Crossbow should use at a time. This mode does <em
+ >not</em
+ > require <a href="http://java.sun.com/"
+ >Java</a
+ > or <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ >.</p
+><h1 id="preparing-to-run-on-amazon-elastic-mapreduce"
+><a href="#TOC"
+ >Preparing to run on Amazon Elastic MapReduce</a
+ ></h1
+><p
+>Before running Crossbow on <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, you must have an <a href="http://aws.amazon.com/" title="Amazon Web Services"
+ >AWS</a
+ > account with the appropriate features enabled. You may also need to <a href="#installing-amazons-elastic-mapreduce-tool"
+ >install Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ >. In addition, you may want to install an <a href="#s3-tools"
+ >S3 tool</a
+ >, though most users can simply use <a href="https://console.aws.amazon.com/s3/home"
+ >Amazon's web interface for S3</a
+ >, which requires no installation.</p
+><p
+>If you plan to run Crossbow exclusively on a single computer or on a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster, you can skip this section.</p
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Create an AWS account by navigating to the <a href="http://aws.amazon.com/" title="Amazon Web Services"
+ >AWS page</a
+ >. Click "Sign Up Now" in the upper right-hand corner and follow the instructions. You will be asked to accept the <a href="http://aws.amazon.com/agreement/"
+ >AWS Customer Agreement</a
+ >.</p
+ ></li
+ ><li
+ ><p
+ >Sign up for <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > and <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >. Navigate to the <a href="http://aws.amazon.com/ec2"
+ >Amazon EC2</a
+ > page, click on "Sign Up For Amazon EC2" and follow the instructions. This step requires you to enter credit card information. Once this is complete, your AWS account will be permitted to use <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > and <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, which are required.</p
+ ></li
+ ><li
+ ><p
+ >Sign up for <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >. Navigate to the <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > page, click on "Sign up for Elastic MapReduce" and follow the instructions. Once this is complete, your AWS account will be permitted to use <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, which is required.</p
+ ></li
+ ><li
+ ><p
+ >Sign up for <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ >. With <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > enabled, you have the option of using the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >'s <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html"
+ >Job Flow Debugging</a
+ > feature. This is a convenient way to monitor your job's progress and diagnose errors.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >Optional</em
+ >: Request an increase to your instance limit. By default, Amazon allows you to allocate EC2 clusters with up to 20 instances (virtual computers). To be permitted to work with more instances, fill in the form on the <a href="http://aws.amazon.com/contact-us/ec2-request/"
+ >Request to Increase</a
+ > page. You may have to speak to an Amazon representative and/or wait several business days before your request is granted.</p
+ ></li
+ ></ol
+><p
+>To see a list of AWS services you've already signed up for, see your <a href="http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary"
+ >Account Activity</a
+ > page. If "Amazon Elastic Compute Cloud", "Amazon Simple Storage Service", "Amazon Elastic MapReduce" and "Amazon SimpleDB" all appear there, you are ready to proceed.</p
+><p
+>Be sure to make a note of the various numbers and names associated with your accounts, especially your Access Key ID, Secret Access Key, and your EC2 key pair name. You will have to refer to these and other account details in the future.</p
+><h2 id="installing-amazons-elastic-mapreduce-tool"
+><a href="#TOC"
+ >Installing Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ ></h2
+><p
+>Read this section if you plan to run Crossbow on <a href="http://aws.amazon.com/elasticmapreduce"
+ >Elastic MapReduce</a
+ > via the command-line tool. Skip this section if you are not using <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > or if you plan to run exclusively via the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ >.</p
+><p
+>To install Amazon's <code
+ >elastic-mapreduce</code
+ > tool, follow the instructions in Amazon Elastic MapReduce developer's guide for <a href="http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1"
+ >How to Download and Install Ruby and the Command Line Interface</a
+ >. That document describes:</p
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Installing an appropriate version of <a href="http://www.ruby-lang.org/"
+ >Ruby</a
+ >, if necessary.</p
+ ></li
+ ><li
+ ><p
+ >Setting up an EC2 keypair, if necessary.</p
+ ></li
+ ><li
+ ><p
+ >Setting up a credentials file, which is used by the <code
+ >elastic-mapreduce</code
+ > tool for authentication.</p
+ ></li
+ ></ol
+><p
+>For convenience, we suggest you name the credentials file <code
+ >credentials.json</code
+ > and place it in the same directory with the <code
+ >elastic-mapreduce</code
+ > script. Otherwise you will have to specify the credential file path with the <a href="#cb-emr-credentials"
+ ><code
+ >--credentials</code
+ ></a
+ > option each time you run <code
+ >cb_emr</code
+ >.</p
+><p
+>We strongly recommend using a version of the <code
+ >elastic-mapreduce</code
+ > Ruby script released on or after December 8, 2011. This is when the script switched to using Hadoop v0.20.205 by default, which is the preferred way of running Myrna.</p
+><p
+>We also recommend that you add the directory containing the <code
+ >elastic-mapreduce</code
+ > tool to your <code
+ >PATH</code
+ >. This allows Crossbow to locate it automatically. Alternately, you can specify the path to the <code
+ >elastic-mapreduce</code
+ > tool via the <a href="#cb-emr-script"
+ ><code
+ >--emr-script</code
+ ></a
+ > option when running <code
+ >cb_emr</code
+ >.</p
+><h2 id="s3-tools"
+><a href="#TOC"
+ >S3 tools</a
+ ></h2
+><p
+>Running on <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > requires exchanging files via the cloud-based <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > filesystem. <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > is organized as a collection of <a href="http://docs.amazonwebservices.com/AmazonS3/latest/gsg/"
+ >S3 buckets</a
+ > in a global namespace. <a href="http://aws.amazon.com/s3/#pricing"
+ >S3 charges</a
+ > are incurred when transferring data to and from <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > (but transfers between <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > and <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > are free), and a per-GB-per-month charge applies when data is stored in <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > over time.</p
+><p
+>To transfer files to and from <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, use an S3 tool. Amazon's <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > has an <a href="https://console.aws.amazon.com/s3/home"
+ >S3 tab</a
+ > that provides a friendly web-based interface to <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, and doesn't require any software installation. <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > is a very good command-line tool that requires <a href="http://www.python.org/download/"
+ >Python</a
+ > 2.4 or later. <a href="http://www.s3fox.net/"
+ >S3Fox Organizer</a
+ > is another GUI tool that works as a <a href="http://www.mozilla.com/firefox/"
+ >Firefox</a
+ > extension. Other tools include <a href="http://cyberduck.ch/"
+ >Cyberduck</a
+ > (for Mac OS 10.6 or later) and <a href="http://www.bucketexplorer.com/"
+ >Bucket Explorer</a
+ > (for Mac, Windows or Linux, but commercial software).</p
+><h1 id="installing-crossbow"
+><a href="#TOC"
+ >Installing Crossbow</a
+ ></h1
+><p
+>Crossbow consists of a set of <a href="http://www.perl.org/get.html"
+ >Perl</a
+ > and shell scripts, plus supporting tools: <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > and <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >. If you plan to run Crossbow via the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ > exclusively, there is nothing to install. Otherwise:</p
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Download the desired version of Crossbow from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >sourceforge site</a
+ ></p
+ ></li
+ ><li
+ ><p
+ ><a href="http://en.wikipedia.org/wiki/ZIP_(file_format)"
+ >Extract the zip archive</a
+ ></p
+ ></li
+ ><li
+ ><p
+ >Set the <code
+ >CROSSBOW_HOME</code
+ > environment variable to point to the extracted directory (containing <code
+ >cb_emr</code
+ >)</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you plan to run on a local computer or <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster</em
+ >:</p
+ ><p
+ >If using Linux or Mac OS 10.6 or later, you likely don't have to install <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > or <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >, as Crossbow comes with compatible versions of both pre-installed. Test this by running:</p
+ ><pre
+ ><code
+ >$CROSSBOW_HOME/cb_local --test
+</code
+ ></pre
+ ><p
+ >If the install test passes, installation is complete.</p
+ ><p
+ >If the install test indicates <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > is not installed, obtain or build a <code
+ >bowtie</code
+ > binary v0.12.8 or higher and install it by setting the <code
+ >CROSSBOW_BOWTIE_HOME</code
+ > environment variable to <code
+ >bowtie</code
+ >'s enclosing directory. Alternately, add the enclosing directory to your <code
+ >PATH</code
+ > or specify the full path to <code
+ >bowtie</code
+ > via the <code
+ >--bowtie</code
+ > option when running Crossbow scripts.</p
+ ><p
+ >If the install test indicates that <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > is not installed, build the <code
+ >soapsnp</code
+ > binary using the sources and makefile in <code
+ >CROSSBOW_HOME/soapsnp</code
+ >. You must have compiler tools such as GNU <code
+ >make</code
+ > and <code
+ >g++</code
+ > installed for this to work. If you are using a Mac, you may need to install the <a href="http://developer.apple.com/technologies/tools/"
+ >Apple developer tools</a
+ >. To build the <code
+ >soapsnp</code
+ > binary, run:</p
+ ><pre
+ ><code
+ >make -C $CROSSBOW_HOME/soapsnp
+</code
+ ></pre
+ ><p
+ >Now install <code
+ >soapsnp</code
+ > by setting the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable to <code
+ >soapsnp</code
+ >'s enclosing directory. Alternately, add the enclosing directory to your <code
+ >PATH</code
+ > or specify the full path to <code
+ >soapsnp</code
+ > via the <code
+ >--soapsnp</code
+ > option when running Crossbow scripts.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you plan to run on a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster</em
+ >, you may need to manually copy the <code
+ >bowtie</code
+ > and <code
+ >soapsnp</code
+ > executables, and possibly also the <code
+ >fastq-dump</code
+ > executable, to the same path on each of your <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > cluster nodes. You can avoid this step by installing <code
+ >bowtie</code
+ >, <code
+ >soapsnp</code
+ > and <code
+ >fastq-dump</code
+ > on a filesystem shared by all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > nodes (e.g. an <a href="http://en.wikipedia.org/wiki/Network_File_System_(protocol)"
+ >NFS share</a
+ >). You can also skip this step if <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > is installed in <a href="http://hadoop.apache.org/common/docs/current/quickstart.html#PseudoDistributed"
+ >pseudo distributed</a
+ > mode, meaning that the cluster really consists of one node whose CPUs are treated as distinct slaves.</p
+ ></li
+ ></ol
+><h2 id="the-sra-toolkit"
+><a href="#TOC"
+ >The SRA toolkit</a
+ ></h2
+><p
+>The <a href="http://www.ncbi.nlm.nih.gov/books/NBK47533/"
+ >Sequence Read Archive</a
+ > (SRA) is a resource at the <a href="http://www.ncbi.nlm.nih.gov/"
+ >National Center for Biotechnology Information</a
+ > (NCBI) for storing sequence data from modern sequencing instruments. Sequence data underlying many studies, including very large studies, can often be downloaded from this archive.</p
+><p
+>The SRA uses a special file format to store archived read data. These files end in extensions <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ >, and they can be specified as inputs to Crossbow's preprocessing step in exactly the same way as <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > files.</p
+><p
+>However, if you plan to use <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ > files as input to Crossbow in either <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > mode or in single-computer mode, you must first install the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software"
+ >SRA toolkit</a
+ >'s <code
+ >fastq-dump</code
+ > tool appropriately. See the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software"
+ >SRA toolkit</a
+ > page for details about how to download and install.</p
+><p
+>When searching for the <code
+ >fastq-dump</code
+ > tool at runtime, Crossbow searches the following places in order:</p
+><ol style="list-style-type: decimal;"
+><li
+ >The path specified in the <a href="#cb-local-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ > option</li
+ ><li
+ >The directory specified in the <code
+ >$CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable.</li
+ ><li
+ >In the system <code
+ >PATH</code
+ ></li
+ ></ol
+><h1 id="running-crossbow"
+><a href="#TOC"
+ >Running Crossbow</a
+ ></h1
+><p
+>The commands for invoking Crossbow from the command line are:</p
+><p
+><code
+ >$CROSSBOW_HOME/cb_emr</code
+ > (or just <code
+ >cb_emr</code
+ > if <code
+ >$CROSSBOW_HOME</code
+ > is in the <code
+ >PATH</code
+ >) for running on <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >. See <a href="#running-crossbow-on-emr-via-the-command-line"
+ >Running Crossbow on EMR via the command line</a
+ > for details.</p
+><p
+><code
+ >$CROSSBOW_HOME/cb_hadoop</code
+ > (or just <code
+ >cb_hadoop</code
+ > if <code
+ >$CROSSBOW_HOME</code
+ > is in the <code
+ >PATH</code
+ >) for running on <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ >. See <a href="#running-crossbow-on-a-hadoop-cluster-via-the-command-line"
+ >Running Crossbow on a Hadoop cluster via the command line</a
+ > for details.</p
+><p
+><code
+ >$CROSSBOW_HOME/cb_local</code
+ > (or just <code
+ >cb_local</code
+ > if <code
+ >$CROSSBOW_HOME</code
+ > is in the <code
+ >PATH</code
+ >) for running locally on a single computer. See <a href="#running-crossbow-on-a-single-computer-via-the-command-line"
+ >Running Crossbow on a single computer via the command line</a
+ > for details.</p
+><h1 id="running-crossbow-on-emr-via-the-emr-web-interface"
+><a href="#TOC"
+ >Running Crossbow on EMR via the EMR web interface</a
+ ></h1
+><h2 id="prerequisites"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ >Web browser</li
+ ><li
+ ><a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts. To check which ones you've already enabled, visit your <a href="http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary"
+ >Account Activity</a
+ > page.</li
+ ><li
+ >A tool for browsing and exchanging files with <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ ><ol style="list-style-type: lower-alpha;"
+ ><li
+ >The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >'s <a href="https://console.aws.amazon.com/s3/home"
+ >S3 tab</a
+ > is a good web-based tool that does not require software installation</li
+ ><li
+ >A good command line tool is <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ ></li
+ ><li
+ >A good GUI tool is <a href="http://www.s3fox.net/"
+ >S3Fox Organizer</a
+ >, which is a Firefox Plugin</li
+ ><li
+ >Others include <a href="http://cyberduck.ch/"
+ >Cyberduck</a
+ >, <a href="http://www.bucketexplorer.com/"
+ >Bucket Explorer</a
+ ></li
+ ></ol
+ ></li
+ ><li
+ >Basic knowledge regarding:<ol style="list-style-type: lower-alpha;"
+ ><li
+ ><a href="http://aws.amazon.com/s3/"
+ >What S3 is</a
+ >, <a href="http://docs.amazonwebservices.com/AmazonS3/latest/gsg/"
+ >what an S3 bucket is</a
+ >, how to create one, how to upload a file to an S3 bucket from your computer (see your S3 tool's documentation).</li
+ ><li
+ >How much AWS resources <a href="http://aws.amazon.com/ec2/#pricing"
+ >will cost you</a
+ ></li
+ ></ol
+ ></li
+ ></ol
+><h2 id="to-run"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ ><em
+ >If the input reads have not yet been preprocessed by Crossbow</em
+ > (i.e. input is <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > or <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ >), then first (a) prepare a <a href="#manifest-files"
+ >manifest file</a
+ > with URLs pointing to the read files, and (b) upload it to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket that you own. See your <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > tool's documentation for how to create a bucket and upload a file to it. The URL for the <a href="#manifest-files"
+ >manifest file</a
+ > will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ><p
+ ><em
+ >If the input reads have already been preprocessed by Crossbow</em
+ >, make a note of the <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where they're located. This will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you are using a pre-built reference jar</em
+ >, make a note of its <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL. This will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job. See the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow website</a
+ > for a list of pre-built reference jars and their URLs.</p
+ ><p
+ ><em
+ >If you are not using a pre-built reference jar</em
+ >, you may need to <a href="#reference-jars"
+ >build the reference jars</a
+ > and/or upload them to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket you own. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation for how to create a bucket and upload to it. The URL for the main reference jar will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ >In a web browser, go to the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ >.</p
+ ></li
+ ><li
+ ><p
+ >Fill in the form according to your job's parameters. We recommend filling in and validating the "AWS ID" and "AWS Secret Key" fields first. Also, when entering S3 URLs (e.g. "Input URL" and "Output URL"), we recommend that users validate the entered URLs by clicking the link below it. This avoids failed jobs due to simple URL issues (e.g. non-existence of the "Input URL"). For examples of how to fill in this form, see the <a href="#cb-example-e-coli-emr"
+ >E. coli EMR</a
+ > and <a href="#cb-example-mouse17-emr"
+ >Mouse chromosome 17 EMR</a
+ > examples.</p
+ ></li
+ ></ol
+><h1 id="running-crossbow-on-emr-via-the-command-line"
+><a href="#TOC"
+ >Running Crossbow on EMR via the command line</a
+ ></h1
+><h2 id="prerequisites-1"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ >, <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ >, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, and <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > accounts. To check which ones you've already enabled, visit your <a href="http://aws-portal.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=activity-summary"
+ >Account Activity</a
+ > page.</li
+ ><li
+ >A tool for browsing and exchanging files with <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ ><ol style="list-style-type: lower-alpha;"
+ ><li
+ >The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >'s <a href="https://console.aws.amazon.com/s3/home"
+ >S3 tab</a
+ > is a good web-based tool that does not require software installation</li
+ ><li
+ >A good command line tool is <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ ></li
+ ><li
+ >A good GUI tool is <a href="http://www.s3fox.net/"
+ >S3Fox Organizer</a
+ >, which is a Firefox Plugin</li
+ ><li
+ >Others include <a href="http://cyberduck.ch/"
+ >Cyberduck</a
+ >, <a href="http://www.bucketexplorer.com/"
+ >Bucket Explorer</a
+ ></li
+ ></ol
+ ></li
+ ><li
+ >Basic knowledge regarding:<ol style="list-style-type: lower-alpha;"
+ ><li
+ ><a href="http://aws.amazon.com/s3/"
+ >What S3 is</a
+ >, <a href="http://docs.amazonwebservices.com/AmazonS3/latest/gsg/"
+ >what an S3 bucket is</a
+ >, how to create one, how to upload a file to an S3 bucket from your computer (see your S3 tool's documentation).</li
+ ><li
+ >How much AWS resources <a href="http://aws.amazon.com/ec2/#pricing"
+ >will cost you</a
+ ></li
+ ></ol
+ ></li
+ ></ol
+><h2 id="to-run-1"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ ><em
+ >If the input reads have not yet been preprocessed by Crossbow</em
+ > (i.e. input is <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > or <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ >), then first (a) prepare a <a href="#manifest-files"
+ >manifest file</a
+ > with URLs pointing to the read files, and (b) upload it to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket that you own. See your <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > tool's documentation for how to create a bucket and upload a file to it. The URL for the <a href="#manifest-files"
+ >manifest file</a
+ > will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ><p
+ ><em
+ >If the input reads have already been preprocessed by Crossbow</em
+ >, make a note of the <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where they're located. This will be the input URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ ><em
+ >If you are using a pre-built reference jar</em
+ >, make a note of its <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL. This will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job. See the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow website</a
+ > for a list of pre-built reference jars and their URLs.</p
+ ><p
+ ><em
+ >If you are not using a pre-built reference jar</em
+ >, you may need to <a href="#reference-jars"
+ >build the reference jars</a
+ > and/or upload them to an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket you own. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation for how to create a bucket and upload to it. The URL for the main reference jar will be the reference URL for your <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job.</p
+ ></li
+ ><li
+ ><p
+ >Run <code
+ >$CROSSBOW_HOME/cb_emr</code
+ > with the desired options. Options that are unique to <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > jobs are described in the following section. Options that apply to all running modes are described in the <a href="#general-crossbow-options"
+ >General Crossbow options</a
+ > section. For examples of how to run <code
+ >$CROSSBOW_HOME/cb_emr</code
+ > see the <a href="#cb-example-e-coli-emr"
+ >E. coli EMR</a
+ > and <a href="#cb-example-mouse17-emr"
+ >Mouse chromosome 17 EMR</a
+ > examples.</p
+ ></li
+ ></ol
+><h2 id="emr-specific-options"
+><a href="#TOC"
+ >EMR-specific options</a
+ ></h2
+><table>
+
+<tr><td id="cb-emr-reference">
+
+
+<pre
+><code
+ >--reference <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the reference jar is located. URLs for pre-built reference jars for some commonly studied species (including human and mouse) are available from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow web site</a
+ >. Note that a <a href="http://bowtie-bio.sf.net/myrna"
+ >Myrna</a
+ > reference jar is not the same as a <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > reference jar. If your desired genome and/or SNP annotations are not available in pre-built form, you will have to make your own reference jar and upload it to one of your own S3 buckets (see <a href="#reference-jars"
+ >Reference jars</a
+ >). This option must be specified.</p
+></td></tr><tr><td id="cb-emr-input">
+
+
+<pre
+><code
+ >--input <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the input is located. If <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > or <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > are specified, <code
+ ><URL></code
+ > should point to a <a href="#manifest-files"
+ >manifest file</a
+ >. Otherwise, <code
+ ><URL></code
+ > should point to a directory containing preprocessed reads. This option must be specified.</p
+></td></tr><tr><td id="cb-emr-output">
+
+
+<pre
+><code
+ >--output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the output is to be deposited. If <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > is specified, the output consists of the preprocessed reads. Otherwise, the output consists of the SNP calls calculated by <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > for each chromosome in the <a href="#cb-output"
+ >Crossbow output format</a
+ >, organized as one file per chromosome. This option must be specified.</p
+></td></tr><tr><td id="cb-emr-intermediate">
+
+
+<pre
+><code
+ >--intermediate <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where all intermediate results should be deposited. This can be useful if you later want to resume the computation from partway through the pipeline (e.g. after alignment but before SNP calling). By default, intermediate results are stored in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > and disappear once the cluster is terminated.</p
+></td></tr><tr><td id="cb-emr-preprocess-output">
+
+
+<pre
+><code
+ >--preprocess-output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > URL where the preprocessed reads should be stored. This can be useful if you later want to run Crossbow on the same input reads without having to re-run the preprocessing step (i.e. leaving <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > unspecified).</p
+></td></tr><tr><td id="cb-emr-credentials">
+
+
+<pre
+><code
+ >--credentials <id>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the credentials file set up by the user when the <a href="http://aws.amazon.com/developertools/2264?_encoding=UTF8&jiveRedirect=1"
+ ><code
+ >elastic-mapreduce</code
+ ></a
+ > script was installed (see <a href="#installing-amazons-elastic-mapreduce-tool"
+ >Installing Amazon's <code
+ >elastic-mapreduce</code
+ > tool</a
+ >). Default: use <code
+ >elastic-mapreduce</code
+ >'s default (i.e. the <code
+ >credentials.json</code
+ > file in the same directory as the <code
+ >elastic-mapreduce</code
+ > script). If <code
+ >--credentials</code
+ > is not specified and the default <code
+ >credentials.json</code
+ > file doesn't exist, <code
+ >elastic-mapreduce</code
+ > will abort with an error message.</p
+></td></tr><tr><td id="cb-emr-script">
+
+
+<pre
+><code
+ >--emr-script <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the <code
+ >elastic-mapreduce</code
+ > script. By default, Crossbow looks first in the <code
+ >$CROSSBOW_EMR_HOME</code
+ > directory, then in the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-emr-name">
+
+
+<pre
+><code
+ >--name <string>
+</code
+ ></pre
+></td><td>
+<p
+>Specify the name by which the job will be identified in the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >.</p
+></td></tr><tr><td id="cb-emr-stay-alive">
+
+
+<pre
+><code
+ >--stay-alive
+</code
+ ></pre
+></td><td>
+<p
+>By default, <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > will terminate the cluster as soon as (a) one of the stages fails, or (b) the job completes successfully. Specify this option to force <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > to keep the cluster alive in either case.</p
+></td></tr><tr><td id="cb-emr-instances">
+
+
+<pre
+><code
+ >--instances <int>
+</code
+ ></pre
+></td><td>
+<p
+>Specify the number of instances (i.e. virtual computers, also called nodes) to be allocated to your cluster. If set to 1, the 1 instance will function as both <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > master and slave node. If set greater than 1, one instance will function as a <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > master and the rest will function as <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > slaves. In general, the greater the value of <code
+ ><int></code
+ >, the faster the Crossbow computation will complete. Consider the desired speed as well as the <a href="http://aws.amazon.com/ec2/#pricing"
+ >going rate</a
+ > when choosing a value for <code
+ ><int></code
+ >. Default: 1.</p
+></td></tr><tr><td id="cb-emr-instance-type">
+
+
+<pre
+><code
+ >--instance-type <type>
+</code
+ ></pre
+></td><td>
+<p
+>Specify the type of <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > instance to use for the computation. See Amazon's <a href="http://aws.amazon.com/ec2/instance-types/"
+ >list of available instance types</a
+ > and be sure to specify the "API name" of the desired type (e.g. <code
+ >m1.small</code
+ > or <code
+ >c1.xlarge</code
+ >). <strong
+ >The default of <code
+ >c1.xlarge</code
+ > is strongly recommended</strong
+ > because it has an appropriate mix of computing power and memory for a large breadth of problems. Choosing an instance type with less than 5GB of physical RAM can cause problems when the reference is large (e.g. a mammalian genome). Stick to the default unless you're pretty sure the specified instance type can handle your problem size.</p
+></td></tr><tr><td id="cb-emr-args">
+
+
+<pre
+><code
+ >--emr-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified extra arguments to the <code
+ >elastic-mapreduce</code
+ > script. See documentation for the <code
+ >elastic-mapreduce</code
+ > script for details.</p
+></td></tr><tr><td id="cb-logs">
+
+
+<pre
+><code
+ >--logs <URL>
+</code
+ ></pre
+></td><td>
+<p
+>Causes <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > to copy the log files to <code
+ ><URL></code
+ >. Default: <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > writes logs to the <code
+ >logs</code
+ > subdirectory of the <a href="#cb-emr-output"
+ ><code
+ >--output</code
+ ></a
+ > URL. See also <a href="#cb-no-logs"
+ ><code
+ >--no-logs</code
+ ></a
+ >.</p
+></td></tr><tr><td id="cb-no-logs">
+
+
+<pre
+><code
+ >--no-logs
+</code
+ ></pre
+></td><td>
+<p
+>By default, Crossbow causes <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > to copy all cluster log files to the <code
+ >log</code
+ > subdirectory of the <a href="#cb-emr-output"
+ ><code
+ >--output</code
+ ></a
+ > URL (or another destination, if <a href="#cb-logs"
+ ><code
+ >--logs</code
+ ></a
+ > is specified). Specifying this option disables all copying of logs.</p
+></td></tr><tr><td id="cb-no-emr-debug">
+
+
+<pre
+><code
+ >--no-emr-debug
+</code
+ ></pre
+></td><td>
+<p
+>Disables <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html"
+ >Job Flow Debugging</a
+ >. If this is <em
+ >not</em
+ > specified, you must have a <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > account for <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/DebuggingJobFlows.html"
+ >Job Flow Debugging</a
+ > to work. You will be subject to additional <a href="http://aws.amazon.com/simpledb/#pricing"
+ >SimpleDB-related charges</a
+ > if this option is enabled, but those fees are typically small or zero (depending on your account's <a href="http://aws.amazon.com/simpledb/#pricing"
+ >SimpleDB tier</a
+ >).</p
+></td></tr>
+</table>
+<h1 id="running-crossbow-on-a-hadoop-cluster-via-the-command-line"
+><a href="#TOC"
+ >Running Crossbow on a Hadoop cluster via the command line</a
+ ></h1
+><h2 id="prerequisites-2"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >Working installation of <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > v0.20.2 or v0.20.205. Other versions newer than 0.20 might also work, but haven't been tested.</p
+ ></li
+ ><li
+ ><p
+ >A <code
+ >bowtie</code
+ > v0.12.8 executable must exist at the same path on all cluster nodes (including the master). That path must be specified via the <a href="#cb-hadoop-bowtie"
+ ><code
+ >--bowtie</code
+ ></a
+ > option OR located in the directory specified in the <code
+ >CROSSBOW_BOWTIE_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (Crossbow looks in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built Bowtie binaries for Linux and Mac OS X 10.5 or later. An executable from that directory is used automatically unless the platform is not Mac or Linux or unless overridden by <a href="#cb-hadoop-bowtie"
+ ><code
+ >--bowtie</code
+ ></a
+ > or by defining <code
+ >CROSSBOW_BOWTIE_HOME</code
+ >.</p
+ ></li
+ ><li
+ ><p
+ >A Crossbow-customized version of <code
+ >soapsnp</code
+ > v1.02 must be installed at the same path on all cluster nodes (including the master). That path must be specified via the <a href="#cb-hadoop-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > option OR located in the directory specified in the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (Crossbow searches in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built SOAPsnp binaries for Linux and Mac OS X 10.6 or later. An executable from that directory is used automatically unless the platform is not Mac or Linux or unless overridden by <a href="#cb-hadoop-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > or by defining <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ >.</p
+ ></li
+ ><li
+ ><p
+ >If any of your inputs are in <a href="http://www.ncbi.nlm.nih.gov/books/NBK47533/"
+ >Sequence Read Archive</a
+ > format (i.e. end in <code
+ >.sra</code
+ >), then the <code
+ >fastq-dump</code
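+ > tool from the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software">SRA toolkit</a> must be installed at the same path on all cluster nodes. The path to the <code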
+ >fastq-dump</code
+ > tool must be specified via the (<a href="#cb-hadoop-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ >) option OR <code
+ >fastq-dump</code
+ > must be located in the directory specified in the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, OR <code
+ >fastq-dump</code
+ > must be found in the <code
+ >PATH</code
+ > (Crossbow searches in that order).</p
+ ></li
+ ><li
+ ><p
+ >Sufficient memory must be available on all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > slave nodes to hold the Bowtie index for the desired organism in addition to any other loads placed on those nodes by <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > or other programs. For mammalian genomes such as the human genome, this typically means that slave nodes must have at least 5-6 GB of RAM.</p
+ ></li
+ ></ol
+><h2 id="to-run-2"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><p
+>Run <code
+ >$CROSSBOW_HOME/cb_hadoop</code
+ > with the desired options. Options that are unique to <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > jobs are described in the following subsection. Options that apply to all running modes are described in the <a href="#general-crossbow-options"
+ >General Crossbow options</a
+ > subsection. To see example invocations of <code
+ >$CROSSBOW_HOME/cb_hadoop</code
+ > see the <a href="#cb-example-e-coli-hadoop"
+ >E. coli Hadoop</a
+ > and <a href="#cb-example-mouse17-hadoop"
+ >Mouse chromosome 17 Hadoop</a
+ > examples.</p
+><h2 id="hadoop-specific-options"
+><a href="#TOC"
+ >Hadoop-specific options</a
+ ></h2
+><table>
+
+<tr><td id="cb-hadoop-reference">
+
+
+<pre
+><code
+ >--reference <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the reference jar is located. Pre-built reference jars for some commonly studied species (including human and mouse) are available from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow web site</a
+ >; these can be downloaded and installed in HDFS using <code
+ >hadoop dfs</code
+ > commands. If your desired genome and/or SNP annotations are not available in pre-built form, you will have to make your own reference jars, install them in HDFS, and specify their HDFS path here. This option must be specified.</p
+></td></tr><tr><td id="cb-hadoop-input">
+
+
+<pre
+><code
+ >--input <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the input is located. If <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > or <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > are specified, <code
+ ><URL></code
+ > should point to a manifest file. Otherwise, <code
+ ><URL></code
+ > should point to a directory containing preprocessed reads. This option must be specified.</p
+></td></tr><tr><td id="cb-hadoop-output">
+
+
+<pre
+><code
+ >--output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the output is to be deposited. If <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > is specified, the output consists of the preprocessed reads. Otherwise, the output consists of the SNP calls calculated by SOAPsnp for each chromosome, organized as one file per chromosome. This option must be specified.</p
+></td></tr><tr><td id="cb-hadoop-intermediate">
+
+
+<pre
+><code
+ >--intermediate <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where all intermediate results should be deposited. Default: <code
+ >hdfs:///crossbow/intermediate/<PID></code
+ >.</p
+></td></tr><tr><td id="cb-hadoop-preprocess-output">
+
+
+<pre
+><code
+ >--preprocess-output <URL>
+</code
+ ></pre
+></td><td>
+<p
+><a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > URL where the preprocessed reads should be stored. This can be useful if you later want to run Crossbow on the same input reads without having to re-run the preprocessing step (i.e. leaving <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > unspecified).</p
+></td></tr><tr><td id="cb-hadoop-bowtie">
+
+
+<pre
+><code
+ >--bowtie <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > binary Crossbow should use. <code
+ >bowtie</code
+ > must be installed in this same directory on all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > worker nodes. By default, Crossbow searches the <code
+ >PATH</code
+ > and in the directory pointed to by the <code
+ >CROSSBOW_HOME</code
+ > environment variable.</p
+></td></tr><tr><td id="cb-hadoop-fastq-dump">
+
+
+<pre
+><code
+ >--fastq-dump <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the directory containing <code
+ >fastq-dump</code
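+ >, which is part of the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software">SRA toolkit</a>. This overrides all other ways that Crossbow searches for <code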
+ >fastq-dump</code
+ >, including the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-hadoop-soapsnp">
+
+
+<pre
+><code
+ >--soapsnp <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path to the SOAPsnp executable to use when running the Call SNPs step. <code
+ >soapsnp</code
+ > must be installed in this same directory on all <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > worker nodes. This overrides all other ways that Crossbow searches for <code
+ >soapsnp</code
+ >, including the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr>
+</table>
+<h1 id="running-crossbow-on-a-single-computer-via-the-command-line"
+><a href="#TOC"
+ >Running Crossbow on a single computer via the command line</a
+ ></h1
+><h2 id="prerequisites-3"
+><a href="#TOC"
+ >Prerequisites</a
+ ></h2
+><ol style="list-style-type: decimal;"
+><li
+ ><p
+ >A <code
+ >bowtie</code
+ > v0.12.8 executable must exist on the local computer. The path to <code
+ >bowtie</code
+ > must be specified via the <a href="#cb-local-bowtie"
+ ><code
+ >--bowtie</code
+ ></a
+ > option OR be located in the directory specified in the <code
+ >$CROSSBOW_BOWTIE_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (search proceeds in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built Bowtie binaries for Linux and Mac OS X 10.6 or later, so most Mac and Linux users do not need to install either tool.</p
+ ></li
+ ><li
+ ><p
+ >A Crossbow-customized version of <code
+ >soapsnp</code
+ > v1.02 must exist. The path to <code
+ >soapsnp</code
+ > must be specified via the <a href="#cb-local-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > option OR be in the directory specified in the <code
+ >$CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, OR in a subdirectory of <code
+ >$CROSSBOW_HOME/bin</code
+ > OR in the <code
+ >PATH</code
+ > (Crossbow searches in that order). <code
+ >$CROSSBOW_HOME/bin</code
+ > comes with pre-built SOAPsnp binaries for Linux and Mac OS X 10.6 or later. An executable from that directory is used automatically unless the platform is not Mac or Linux or unless overridden by <a href="#cb-local-soapsnp"
+ ><code
+ >--soapsnp</code
+ ></a
+ > or <code
+ >$CROSSBOW_SOAPSNP_HOME</code
+ >.</p
+ ></li
+ ><li
+ ><p
+ >If any of your inputs are in <a href="http://www.ncbi.nlm.nih.gov/books/NBK47533/"
+ >Sequence Read Archive</a
+ > format (i.e. end in <code
+ >.sra</code
+ >), then the <code
+ >fastq-dump</code
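+ > tool from the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software">SRA Toolkit</a> must be installed on the local computer. The path to the <code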
+ >fastq-dump</code
+ > tool must be specified via the (<a href="#cb-local-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ >) option OR <code
+ >fastq-dump</code
+ > must be located in the directory specified in the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, OR <code
+ >fastq-dump</code
+ > must be found in the <code
+ >PATH</code
+ > (Crossbow searches in that order).</p
+ ></li
+ ><li
+ ><p
+ >Sufficient memory must be available on the local computer to hold one copy of the Bowtie index for the desired organism <em
+ >in addition</em
+ > to all other running workloads. For mammalian genomes such as the human genome, this typically means that the local computer must have at least 5-6 GB of RAM.</p
+ ></li
+ ></ol
+><h2 id="to-run-3"
+><a href="#TOC"
+ >To run</a
+ ></h2
+><p
+>Run <code
+ >$CROSSBOW_HOME/cb_local</code
+ > with the desired options. Options unique to local jobs are described in the following subsection. Options that apply to all running modes are described in the <a href="#general-crossbow-options"
+ >General Crossbow options</a
+ > subsection. To see example invocations of <code
+ >$CROSSBOW_HOME/cb_local</code
+ > see the <a href="#cb-example-e-coli-local"
+ >E. coli local</a
+ > and <a href="#cb-example-mouse17-local"
+ >Mouse chromosome 17 local</a
+ > examples.</p
+><h2 id="local-run-specific-options"
+><a href="#TOC"
+ >Local-run-specific options</a
+ ></h2
+><table>
+
+<tr><td id="cb-local-reference">
+
+
+<pre
+><code
+ >--reference <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where expanded reference jar is located. Specified path should have an <code
+ >index</code
+ > subdirectory with a set of Bowtie index files, a <code
+ >sequences</code
+ > subdirectory with a set of FASTA files, a <code
+ >snps</code
+ > subdirectory with 0 or more per-chromosome SNP description files, and a <code
+ >cmap.txt</code
+ > file. Pre-built reference jars for some commonly studied species (including human and mouse) are available from the <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow web site</a
+ >; these can be downloaded and expanded into a directory with the appropriate structure using an <a href="http://en.wikipedia.org/wiki/Unzip"
+ ><code
+ >unzip</code
+ ></a
+ > utility. If your desired genome and/or SNP annotations are not available in pre-built form, you will have to make your own reference jars and specify the appropriate path. This option must be specified.</p
+></td></tr><tr><td id="cb-local-input">
+
+
+<pre
+><code
+ >--input <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where the input is located. If <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > or <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > are specified, this should point to a <a href="#manifest-files"
+ >manifest file</a
+ >. Otherwise, this should point to a directory containing preprocessed reads. This option must be specified.</p
+></td></tr><tr><td id="cb-local-output">
+
+
+<pre
+><code
+ >--output <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where the output is to be deposited. If <a href="#cb-just-preprocess"
+ ><code
+ >--just-preprocess</code
+ ></a
+ > is specified, the output consists of the preprocessed reads. Otherwise, the output consists of the SNP calls calculated by SOAPsnp for each chromosome, organized as one file per chromosome. This option must be specified.</p
+></td></tr><tr><td id="cb-local-intermediate">
+
+
+<pre
+><code
+ >--intermediate <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where all intermediate results should be kept temporarily (or permanently, if <a href="#cb-local-keep-intermediates"
+ ><code
+ >--keep-intermediates</code
+ ></a
+ > or <a href="#cb-local-keep-all"
+ ><code
+ >--keep-all</code
+ ></a
+ > are specified). Default: <code
+ >/tmp/crossbow/intermediate/<PID></code
+ >.</p
+></td></tr><tr><td id="cb-local-preprocess-output">
+
+
+<pre
+><code
+ >--preprocess-output <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local path where the preprocessed reads should be stored. This can be useful if you later want to run Crossbow on the same input reads without having to re-run the preprocessing step (i.e. leaving <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > unspecified).</p
+></td></tr><tr><td id="cb-local-keep-intermediates">
+
+
+<pre
+><code
+ >--keep-intermediates
+</code
+ ></pre
+></td><td>
+<p
+>Keep intermediate directories and files, i.e. the output from all stages prior to the final stage. By default these files are deleted as soon as possible.</p
+></td></tr><tr><td id="cb-local-keep-all">
+
+
+<pre
+><code
+ >--keep-all
+</code
+ ></pre
+></td><td>
+<p
+>Keep all temporary files generated during the process of binning and sorting data records and moving them from stage to stage, as well as all intermediate results. By default these files are deleted as soon as possible.</p
+></td></tr><tr><td id="cb-local-cpus">
+
+
+<pre
+><code
+ >--cpus <int>
+</code
+ ></pre
+></td><td>
+<p
+>The maximum number of processors to use at any given time during the job. Crossbow will try to make maximal use of the processors allocated. Default: 1.</p
+></td></tr><tr><td id="cb-local-max-sort-records">
+
+
+<pre
+><code
+ >--max-sort-records <int>
+</code
+ ></pre
+></td><td>
+<p
+>Maximum number of records to be dispatched to the sort routine at one time when sorting bins before each reduce step. For each child process, this number is effectively divided by the number of CPUs used (<a href="#cb-local-cpus"
+ ><code
+ >--cpus</code
+ ></a
+ >). The default is 200000.</p
+></td></tr><tr><td id="cb-local-max-sort-files">
+
+
+<pre
+><code
+ >--max-sort-files <int>
+</code
+ ></pre
+></td><td>
+<p
+>Maximum number of files that can be opened at once by the sort routine when sorting bins before each reduce step. For each child process, this number is effectively divided by the number of CPUs used (<a href="#cb-local-cpus"
+ ><code
+ >--cpus</code
+ ></a
+ >). The default is 40.</p
+></td></tr><tr><td id="cb-local-bowtie">
+
+
+<pre
+><code
+ >--bowtie <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the Bowtie executable to use when running the Align step. This overrides all other ways that Crossbow searches for <code
+ >bowtie</code
+ >, including the <code
+ >CROSSBOW_BOWTIE_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-local-fastq-dump">
+
+
+<pre
+><code
+ >--fastq-dump <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the directory containing the programs in the <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software"
+ >SRA toolkit</a
+ >, including <code
+ >fastq-dump</code
+ >. This overrides all other ways that Crossbow searches for <code
+ >fastq-dump</code
+ >, including the <code
+ >CROSSBOW_SRATOOLKIT_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr><tr><td id="cb-local-soapsnp">
+
+
+<pre
+><code
+ >--soapsnp <path>
+</code
+ ></pre
+></td><td>
+<p
+>Path to the SOAPsnp executable to use when running the Call SNPs step. This overrides all other ways that Crossbow searches for <code
+ >soapsnp</code
+ >, including the <code
+ >CROSSBOW_SOAPSNP_HOME</code
+ > environment variable, the subdirectories of the <code
+ >$CROSSBOW_HOME/bin</code
+ > directory, and the <code
+ >PATH</code
+ >.</p
+></td></tr>
+
+</table>
+<h1 id="general-crossbow-options"
+><a href="#TOC"
+ >General Crossbow options</a
+ ></h1
+><p
+>The following options can be specified regardless of what mode (<a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ >, <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > or local) Crossbow is run in.</p
+><table>
+
+<tr><td id="cb-quality">
+
+
+<pre
+><code
+ >--quality { phred33 | phred64 | solexa64 }
+</code
+ ></pre
+></td><td>
+<p
+>Treat all input reads as having the specified quality encoding. <code
+ >phred33</code
+ > denotes the <a href="http://en.wikipedia.org/wiki/FASTQ_format#Encoding"
+ >Phred+33</a
+ > or "Sanger" format whereby ASCII values 33-126 are used to encode qualities on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score"
+ >Phred scale</a
+ >. <code
+ >phred64</code
+ > denotes the <a href="http://en.wikipedia.org/wiki/FASTQ_format#Encoding"
+ >Phred+64</a
+ > or "Illumina 1.3+" format whereby ASCII values 64-126 are used to encode qualities on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score"
+ >Phred scale</a
+ >. <code
+ >solexa64</code
+ > denotes the <a href="http://en.wikipedia.org/wiki/FASTQ_format#Encoding"
+ >Solexa+64</a
+ > or "Solexa/Illumina 1.0" format whereby ASCII values 59-126 are used to encode qualities on a <a href="http://en.wikipedia.org/wiki/FASTQ_format#Variations"
+ >log-odds scale</a
+ > that includes values as low as -5. Default: <code
+ >phred33</code
+ >.</p
+></td></tr><tr><td id="cb-preprocess">
+
+
+<pre
+><code
+ >--preprocess
+</code
+ ></pre
+></td><td>
+<p
+>The input path or URL refers to a <a href="#manifest-files"
+ >manifest file</a
+ > rather than a directory of preprocessed reads. The first step in the Crossbow computation will be to preprocess the reads listed in the <a href="#manifest-files"
+ >manifest file</a
+ > and store the preprocessed reads in the intermediate directory or in the <code
+ >--preprocess-output</code
+ > directory if it's specified. Default: off.</p
+></td></tr><tr><td id="cb-just-preprocess">
+
+
+<pre
+><code
+ >--just-preprocess
+</code
+ ></pre
+></td><td>
+<p
+>The input path or URL refers to a <a href="#manifest-files"
+ >manifest file</a
+ > rather than a directory of preprocessed reads. Crossbow will preprocess the reads listed in the <a href="#manifest-files"
+ >manifest file</a
+ > and store the preprocessed reads in the <code
+ >--output</code
+ > directory and quit. Default: off.</p
+></td></tr><tr><td id="cb-just-align">
+
+
+<pre
+><code
+ >--just-align
+</code
+ ></pre
+></td><td>
+<p
+>Instead of running the Crossbow pipeline all the way through to the end, run the pipeline up to and including the align stage and store the results in the <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > URL. To resume the run later, use <a href="#cb-resume-align"
+ ><code
+ >--resume-align</code
+ ></a
+ >.</p
+></td></tr><tr><td id="cb-resume-align">
+
+
+<pre
+><code
+ >--resume-align
+</code
+ ></pre
+></td><td>
+<p
+>Resume the Crossbow pipeline from just after the alignment stage. The <a href="#cb-local-input"
+ ><code
+ >--input</code
+ ></a
+ > URL must point to an <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > URL from a previous run using <a href="#cb-just-align"
+ ><code
+ >--just-align</code
+ ></a
+ >.</p
+></td></tr><tr><td id="cb-bowtie-args">
+
+
+<pre
+><code
+ >--bowtie-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > for the Align stage. Default: <a href="http://bowtie-bio.sf.net/manual.shtml#bowtie-options-M"
+ ><code
+ >-M 1</code
+ ></a
+ >. See the <a href="http://bowtie-bio.sf.net/manual.shtml"
+ >Bowtie manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-discard-reads">
+
+
+<pre
+><code
+ >--discard-reads <fraction>
+</code
+ ></pre
+></td><td>
+<p
+>Randomly discard a fraction of the input reads. E.g. specify <code
+ >0.5</code
+ > to discard 50%. This applies to all input reads regardless of type (paired vs. unpaired) or length. This can be useful for debugging. Default: 0.0.</p
+></td></tr><tr><td id="cb-discard-ref-bins">
+
+
+<pre
+><code
+ >--discard-ref-bins <fraction>
+</code
+ ></pre
+></td><td>
+<p
+>Randomly discard a fraction of the reference bins prior to SNP calling. E.g. specify <code
+ >0.5</code
+ > to discard 50% of the reference bins. This can be useful for debugging. Default: 0.0.</p
+></td></tr><tr><td id="cb-discard-all">
+
+
+<pre
+><code
+ >--discard-all <fraction>
+</code
+ ></pre
+></td><td>
+<p
+>Equivalent to setting <a href="#cb-discard-reads"
+ ><code
+ >--discard-reads</code
+ ></a
+ > and <a href="#cb-discard-ref-bins"
+ ><code
+ >--discard-ref-bins</code
+ ></a
+ > to <code
+ ><fraction></code
+ >. Default: 0.0.</p
+></td></tr><tr><td id="cb-soapsnp-args">
+
+
+<pre
+><code
+ >--soapsnp-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > in the SNP calling stage. These options are passed to SOAPsnp regardless of whether the reference sequence under consideration is diploid or haploid. Default: <code
+ >-2 -u -n -q</code
+ >. See the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-soapsnp-hap-args">
+
+
+<pre
+><code
+ >--soapsnp-hap-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > in the SNP calling stage when the reference sequence under consideration is haploid. Default: <code
+ >-r 0.0001</code
+ >. See the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-soapsnp-dip-args">
+
+
+<pre
+><code
+ >--soapsnp-dip-args "<args>"
+</code
+ ></pre
+></td><td>
+<p
+>Pass the specified arguments to <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > in the SNP calling stage when the reference sequence under consideration is diploid. Default: <code
+ >-r 0.00005 -e 0.0001</code
+ >. See the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp manual</a
+ > for details on what options are available.</p
+></td></tr><tr><td id="cb-haploids">
+
+
+<pre
+><code
+ >--haploids <chromosome-list>
+</code
+ ></pre
+></td><td>
+<p
+>The specified comma-separated list of chromosome names are to be treated as haploid by SOAPsnp. The rest are treated as diploid. Default: all chromosomes are treated as diploid.</p
+></td></tr><tr><td id="cb-all-haploids">
+
+
+<pre
+><code
+ >--all-haploids
+</code
+ ></pre
+></td><td>
+<p
+>If specified, all chromosomes are treated as haploid by SOAPsnp.</p
+></td></tr><tr><td id="cb-partition-len">
+
+
+<pre
+><code
+ >--partition-len <int>
+</code
+ ></pre
+></td><td>
+<p
+>The bin size to use when binning alignments into partitions prior to SNP calling. If load imbalance occurs in the SNP calling step (some tasks taking far longer than others), try decreasing this. Default: 1,000,000.</p
+></td></tr><tr><td id="cb-dry-run">
+
+
+<pre
+><code
+ >--dry-run
+</code
+ ></pre
+></td><td>
+<p
+>Just generate a script containing the commands needed to launch the job, but don't run it. The script's location will be printed so that you may run it later.</p
+></td></tr>
+
+<tr><td id="cb-test">
+
+
+<pre
+><code
+ >--test
+</code
+ ></pre
+></td><td>
+<p
+>Instead of running Crossbow, just search for the supporting tools (<a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > and <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >) and report whether and how they were found. If running in Cloud Mode, this just tests whether the <code
+ >elastic-mapreduce</code
+ > script is locatable and runnable. Use this option to debug your local Crossbow installation.</p
+></td></tr><tr><td id="cb-tempdir">
+
+
+<pre
+><code
+ >--tempdir <path>
+</code
+ ></pre
+></td><td>
+<p
+>Local directory where temporary files (e.g. dynamically generated scripts) should be deposited. Default: <code
+ >/tmp/Crossbow/invoke.scripts</code
+ >.</p
+></td></tr>
+</table>
+<h1 id="crossbow-examples"
+><a href="#TOC"
+ >Crossbow examples</a
+ ></h1
+><p
+>The following subsections guide you step-by-step through examples included with the Crossbow package. Because reads (and sometimes reference jars) must be obtained over the Internet, running these examples requires an active Internet connection.</p
+><h2 id="e.-coli-small"
+><a href="#TOC"
+ >E. coli (small)</a
+ ></h2
+><p
+>Data for this example is taken from the study by <a href="http://www.pnas.org/content/early/2009/11/19/0906681106.abstract"
+ >Parkhomchuk et al</a
+ >.</p
+><h3 id="emr"
+><a href="#TOC"
+ >EMR</a
+ ></h3
+><div id="cb-example-e-coli-emr" />
+<h4 id="via-web-interface"
+><a href="#TOC"
+ >Via web interface</a
+ ></h4
+><p
+>Identify an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket to hold the job's input and output. You may need to create an <a href="http://docs.amazonwebservices.com/AmazonS3/latest/index.html?UsingBucket.html"
+ >S3 bucket</a
+ > for this purpose. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation.</p
+><p
+>Use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/e_coli/small.manifest</code
+ > to the <code
+ >example/e_coli</code
+ > subdirectory in your bucket. You can do so with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+</code
+ ></pre
+><p
+>Direct your web browser to the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ > and fill in the form as below (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><div><img src="images/AWS_cb_e_coli_fillin.png" alt="" /><p><i>Crossbow web form filled in for the small E. coli example.</i></p>
+</div>
+<ol style="list-style-type: decimal;"
+><li
+ >For <strong
+ >AWS ID</strong
+ >, enter your AWS Access Key ID</li
+ ><li
+ >For <strong
+ >AWS Secret Key</strong
+ >, enter your AWS Secret Access Key</li
+ ><li
+ ><em
+ >Optional</em
+ >: For <strong
+ >AWS Keypair name</strong
+ >, enter the name of your AWS keypair. This is only necessary if you would like to be able to <a href="http://en.wikipedia.org/wiki/Secure_Shell"
+ >ssh</a
+ > into the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > cluster while it runs.</li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the AWS ID and Secret Key entered are valid by clicking the "Check credentials..." link</li
+ ><li
+ >For <strong
+ >Job name</strong
+ >, enter <code
+ >Crossbow-Ecoli</code
+ ></li
+ ><li
+ >Make sure that <strong
+ >Job type</strong
+ > is set to "Crossbow"</li
+ ><li
+ >For <strong
+ >Input URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/small.manifest</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Input URL exists by clicking the "Check that input URL exists..." link</li
+ ><li
+ >For <strong
+ >Output URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/output_small</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Output URL does not exist by clicking the "Check that output URL doesn't exist..." link</li
+ ><li
+ >For <strong
+ >Input type</strong
+ >, select "Manifest file"</li
+ ><li
+ >For <strong
+ >Genome/Annotation</strong
+ >, select "E. coli" from the drop-down menu</li
+ ><li
+ >For <strong
+ >Chromosome ploidy</strong
+ >, select "All are haploid"</li
+ ><li
+ >Click Submit</li
+ ></ol
+><p
+>This job typically takes about 30 minutes on 1 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > node. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/output_small</code
+ > directory.</p
+><h4 id="via-command-line"
+><a href="#TOC"
+ >Via command line</a
+ ></h4
+><p
+>Test your Crossbow installation by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_emr --test
+</code
+ ></pre
+><p
+>This will warn you if any supporting tools (<code
+ >elastic-mapreduce</code
+ > in this case) cannot be located or run.</p
+><p
+>Identify an <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ > bucket to hold the job's input and output. You may need to create an <a href="http://docs.amazonwebservices.com/AmazonS3/latest/index.html?UsingBucket.html"
+ >S3 bucket</a
+ > for this purpose. See your <a href="#s3-tools"
+ >S3 tool</a
+ >'s documentation.</p
+><p
+>Use your <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/e_coli/small.manifest</code
+ > to the <code
+ >example/e_coli</code
+ > subdirectory in your bucket. You can do so with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/e_coli/small.manifest s3://<YOUR-BUCKET>/example/e_coli/
+</code
+ ></pre
+><p
+>Start the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job with the following command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Ecoli" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/e_coli/small.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/e_coli/output_small \
+ --reference=s3n://crossbow-refs/e_coli.jar \
+ --all-haploids
+</code
+ ></pre
+><p
+>The <code
+ >--reference</code
+ > option instructs Crossbow to use a pre-built reference jar at URL <code
+ >s3n://crossbow-refs/e_coli.jar</code
+ >. The <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > option instructs Crossbow to treat the input as a <a href="#manifest-files"
+ >manifest file</a
+ >, rather than a directory of already-preprocessed reads. As the first stage of the pipeline, Crossbow downloads files specified in the manifest file and preprocesses them into Crossbow's read format. <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > specifies where the final output is placed.</p
+><p
+>This job typically takes about 30 minutes on 1 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > node. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/e_coli/output_small</code
+ > directory.</p
+><h3 id="hadoop"
+><a href="#TOC"
+ >Hadoop</a
+ ></h3
+><div id="cb-example-e-coli-hadoop" />
+<p
+>Log into the <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > master node and test your Crossbow installation by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_hadoop --test
+</code
+ ></pre
+><p
+>This will tell you if any of the supporting tools or packages are missing on the master. <em
+ >You must also ensure</em
+ > that the same tools are installed in the same paths on all slave nodes, and are runnable by the slaves.</p
+><p
+>From the master, download the file named <code
+ >e_coli.jar</code
+ > from the following URL:</p
+><pre
+><code
+ >http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this command:</p
+><pre
+><code
+ >wget http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>Equivalently, you can use an <a href="#s3-tools"
+ >S3 tool</a
+ > to download the same file from this URL:</p
+><pre
+><code
+ >s3n://crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd get s3://crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>Install <code
+ >e_coli.jar</code
+ > in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > (the <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > distributed filesystem) with the following commands. If the <code
+ >hadoop</code
+ > script is not in your <code
+ >PATH</code
+ >, either add it to your <code
+ >PATH</code
+ > (recommended) or specify the full path to the <code
+ >hadoop</code
+ > script in the following commands.</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow-refs
+hadoop dfs -put e_coli.jar /crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>The first creates a directory in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > (you will see a warning message if the directory already exists) and the second copies the local jar files into that directory. In this example, we deposit the jars in the <code
+ >/crossbow-refs</code
+ > directory, but any <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > directory is fine.</p
+><p
+>Remove the local <code
+ >e_coli.jar</code
+ > file to save space. E.g.:</p
+><pre
+><code
+ >rm -f e_coli.jar
+</code
+ ></pre
+><p
+>Next install the <a href="#manifest-files"
+ >manifest file</a
+ > in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ >:</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow/example/e_coli
+hadoop dfs -put $CROSSBOW_HOME/example/e_coli/small.manifest /crossbow/example/e_coli/small.manifest
+</code
+ ></pre
+><p
+>Now start the job by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/e_coli/small.manifest \
+ --output=hdfs:///crossbow/example/e_coli/output_small \
+ --reference=hdfs:///crossbow-refs/e_coli.jar \
+ --all-haploids
+</code
+ ></pre
+><p
+>The <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > option instructs Crossbow to treat the input as a <a href="#manifest-files"
+ >manifest file</a
+ >. As the first stage of the pipeline, Crossbow will download the files specified on each line of the manifest file and preprocess them into Crossbow's read format. The <a href="#cb-local-reference"
+ ><code
+ >--reference</code
+ ></a
+ > option specifies the location of the reference jar contents. The <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > option specifies where the final output is placed.</p
+><h3 id="single-computer"
+><a href="#TOC"
+ >Single computer</a
+ ></h3
+><div id="cb-example-e-coli-local" />
+<p
+>Test your Crossbow installation by running:</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_local --test
+</code
+ ></pre
+><p
+>This will warn you if any supporting tools (<code
+ >bowtie</code
+ > and <code
+ >soapsnp</code
+ > in this case) cannot be located or run.</p
+><p
+>If you don't already have a <code
+ >CROSSBOW_REFS</code
+ > directory, choose one; it will be the default path Crossbow searches for reference jars. Permanently set the <code
+ >CROSSBOW_REFS</code
+ > environment variable to the selected directory.</p
+><p
+>Create a subdirectory called <code
+ >$CROSSBOW_REFS/e_coli</code
+ >:</p
+><pre
+><code
+ >mkdir $CROSSBOW_REFS/e_coli
+</code
+ ></pre
+><p
+>Download <code
+ >e_coli.jar</code
+ > from the following URL to the new <code
+ >e_coli</code
+ > directory:</p
+><pre
+><code
+ >http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this command:</p
+><pre
+><code
+ >wget -O $CROSSBOW_REFS/e_coli/e_coli.jar http://crossbow-refs.s3.amazonaws.com/e_coli.jar
+</code
+ ></pre
+><p
+>Equivalently, you can use an <a href="#s3-tools"
+ >S3 tool</a
+ > to download the same file from this URL:</p
+><pre
+><code
+ >s3n://crossbow-refs/e_coli.jar
+</code
+ ></pre
+><p
+>E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd get s3://crossbow-refs/e_coli.jar $CROSSBOW_REFS/e_coli/e_coli.jar
+</code
+ ></pre
+><p
+>Change to the new <code
+ >e_coli</code
+ > directory and expand <code
+ >e_coli.jar</code
+ > using an <code
+ >unzip</code
+ > or <code
+ >jar</code
+ > utility:</p
+><pre
+><code
+ >cd $CROSSBOW_REFS/e_coli && unzip e_coli.jar
+</code
+ ></pre
+><p
+>Now you may remove <code
+ >e_coli.jar</code
+ > to save space:</p
+><pre
+><code
+ >rm -f $CROSSBOW_REFS/e_coli/e_coli.jar
+</code
+ ></pre
+><p
+>Now run Crossbow. Change to the <code
+ >$CROSSBOW_HOME/example/e_coli</code
+ > directory and start the job via the <code
+ >cb_local</code
+ > script:</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/example/e_coli
+$CROSSBOW_HOME/cb_local \
+ --input=small.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/e_coli \
+ --output=output_small \
+ --all-haploids \
+ --cpus=<CPUS>
+</code
+ ></pre
+><p
+>Substitute the number of CPUs you'd like to use for <code
+ ><CPUS></code
+ >.</p
+><p
+>The <a href="#cb-preprocess"
+ ><code
+ >--preprocess</code
+ ></a
+ > option instructs Crossbow to treat the input as a <a href="#manifest-files"
+ >manifest file</a
+ >. As the first stage of the pipeline, Crossbow will download the files specified on each line of the manifest file and "preprocess" them into a format understood by Crossbow. The <a href="#cb-local-reference"
+ ><code
+ >--reference</code
+ ></a
+ > option specifies the location of the reference jar contents. The <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > option specifies where the final output is placed. The <a href="#cb-local-cpus"
+ ><code
+ >--cpus</code
+ ></a
+ > option enables Crossbow to use up to the specified number of CPUs at any given time.</p
+><h2 id="mouse-chromosome-17-large"
+><a href="#TOC"
+ >Mouse chromosome 17 (large)</a
+ ></h2
+><p
+>Data for this example is taken from the study by <a href="http://genomebiology.com/2009/10/10/R112"
+ >Sudbury, Stalker et al</a
+ >.</p
+><h3 id="emr-1"
+><a href="#TOC"
+ >EMR</a
+ ></h3
+><div id="cb-example-mouse17-emr" />
+<h4 id="via-web-interface-1"
+><a href="#TOC"
+ >Via web interface</a
+ ></h4
+><p
+>First we build a reference jar for a mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds a Bowtie index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Next, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload the <code
+ >mm9_chr17.jar</code
+ > file to the <code
+ >crossbow-refs</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+</code
+ ></pre
+><p
+>You may wish to remove the locally-generated reference jar files to save space. E.g.:</p
+><pre
+><code
+ >rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+</code
+ ></pre
+><p
+>Use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/mouse17/full.manifest</code
+ > to the <code
+ >example/mouse17</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+</code
+ ></pre
+><p
+>Direct your web browser to the <a href="http://bowtie-bio.sf.net/crossbow/ui.html"
+ >Crossbow web interface</a
+ > and fill in the form as below (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><div><img src="images/AWS_cb_mouse17_fillin.png" alt="" /><p><i>Crossbow web form filled in for the large Mouse Chromosome 17 example.</i></p>
+</div>
+<ol style="list-style-type: decimal;"
+><li
+ >For <strong
+ >AWS ID</strong
+ >, enter your AWS Access Key ID</li
+ ><li
+ >For <strong
+ >AWS Secret Key</strong
+ >, enter your AWS Secret Access Key</li
+ ><li
+ ><em
+ >Optional</em
+ >: For <strong
+ >AWS Keypair name</strong
+ >, enter the name of your AWS keypair. This is only necessary if you would like to be able to <a href="http://en.wikipedia.org/wiki/Secure_Shell"
+ >ssh</a
+ > into the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > cluster while it runs.</li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the AWS ID and Secret Key entered are valid by clicking the "Check credentials..." link</li
+ ><li
+ >For <strong
+ >Job name</strong
+ >, enter <code
+ >Crossbow-Mouse17</code
+ ></li
+ ><li
+ >Make sure that <strong
+ >Job type</strong
+ > is set to "Crossbow"</li
+ ><li
+ >For <strong
+ >Input URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/full.manifest</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Input URL exists by clicking the "Check that input URL exists..." link</li
+ ><li
+ >For <strong
+ >Output URL</strong
+ >, enter <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/output_full</code
+ >, substituting for <code
+ ><YOUR-BUCKET></code
+ ></li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the Output URL does not exist by clicking the "Check that output URL doesn't exist..." link</li
+ ><li
+ >For <strong
+ >Input type</strong
+ >, select "Manifest file"</li
+ ><li
+ >For <strong
+ >Genome/Annotation</strong
+ >, check the box labeled "Specify reference jar URL:" and enter <code
+ >s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar</code
+ > in the text box below</li
+ ><li
+ ><em
+ >Optional</em
+ >: Check that the reference jar URL exists by clicking the "Check that reference jar URL exists..." link</li
+ ><li
+ >For <strong
+ >Chromosome ploidy</strong
+ >, select "All are diploid"</li
+ ><li
+ >Click Submit</li
+ ></ol
+><p
+>This job typically takes about 45 minutes on 8 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > instances. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/output_full</code
+ > directory.</p
+><h4 id="via-command-line-1"
+><a href="#TOC"
+ >Via command line</a
+ ></h4
+><p
+>First we build a reference jar for a mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds a Bowtie index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Next, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload the <code
+ >mm9_chr17.jar</code
+ > file to the <code
+ >crossbow-refs</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar s3://<YOUR-BUCKET>/crossbow-refs/
+</code
+ ></pre
+><p
+>You may wish to remove the locally-generated reference jar files to save space. E.g.:</p
+><pre
+><code
+ >rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+</code
+ ></pre
+><p
+>Use an <a href="#s3-tools"
+ >S3 tool</a
+ > to upload <code
+ >$CROSSBOW_HOME/example/mouse17/full.manifest</code
+ > to the <code
+ >example/mouse17</code
+ > subdirectory in your bucket. E.g. with this <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ > command:</p
+><pre
+><code
+ >s3cmd put $CROSSBOW_HOME/example/mouse17/full.manifest s3://<YOUR-BUCKET>/example/mouse17/
+</code
+ ></pre
+><p
+>To start the <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > job, run the following command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_emr \
+ --name "Crossbow-Mouse17" \
+ --preprocess \
+ --input=s3n://<YOUR-BUCKET>/example/mouse17/full.manifest \
+ --output=s3n://<YOUR-BUCKET>/example/mouse17/output_full \
+ --reference=s3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar \
+ --instances 8
+</code
+ ></pre
+><p
+>This job typically takes about 45 minutes on 8 <code
+ >c1.xlarge</code
+ > <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > instances. See <a href="#monitoring-your-emr-jobs"
+ >Monitoring your EMR jobs</a
+ > for information on how to track job progress. To download the results, use an <a href="#s3-tools"
+ >S3 tool</a
+ > to retrieve the contents of the <code
+ >s3n://<YOUR-BUCKET>/example/mouse17/output_full</code
+ > directory.</p
+><h3 id="hadoop-1"
+><a href="#TOC"
+ >Hadoop</a
+ ></h3
+><div id="cb-example-mouse17-hadoop" />
+<p
+>First we build a reference jar for a mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds a Bowtie index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Next, use the <code
+ >hadoop</code
+ > script to put the <code
+ >mm9_chr17.jar</code
+ > file in the <code
+ >crossbow-refs</code
+ > <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > directory. Note that if <code
+ >hadoop</code
+ > is not in your <code
+ >PATH</code
+ >, you must specify <code
+ >hadoop</code
+ >'s full path instead:</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow-refs
+hadoop dfs -put $CROSSBOW_HOME/reftools/mm9_chr17/mm9_chr17.jar /crossbow-refs/mm9_chr17.jar
+</code
+ ></pre
+><p
+>The first command will yield a warning if the directory already exists; ignore this. In this example, we deposit the jars in the <code
+ >/crossbow-refs</code
+ > directory, but any <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ > directory is fine.</p
+><p
+>You may wish to remove the locally-generated reference jar files to save space. E.g.:</p
+><pre
+><code
+ >rm -rf $CROSSBOW_HOME/reftools/mm9_chr17
+</code
+ ></pre
+><p
+>Now install the <a href="#manifest-files"
+ >manifest file</a
+ > in <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ >:</p
+><pre
+><code
+ >hadoop dfs -mkdir /crossbow/example/mouse17
+hadoop dfs -put $CROSSBOW_HOME/example/mouse17/full.manifest /crossbow/example/mouse17/full.manifest
+</code
+ ></pre
+><p
+>To start the <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > job, run the following command (substituting for <code
+ ><YOUR-BUCKET></code
+ >):</p
+><pre
+><code
+ >$CROSSBOW_HOME/cb_hadoop \
+ --preprocess \
+ --input=hdfs:///crossbow/example/mouse17/full.manifest \
+ --output=hdfs:///crossbow/example/mouse17/output_full \
+ --reference=hdfs:///crossbow-refs/mm9_chr17.jar
+</code
+ ></pre
+><h3 id="single-computer-1"
+><a href="#TOC"
+ >Single computer</a
+ ></h3
+><div id="cb-example-mouse17-local" />
+<p
+>First we build a reference jar for a mouse chromosome 17 assembly and annotations using scripts included with Crossbow. The script searches for a <code
+ >bowtie-build</code
+ > executable with the same rules Crossbow uses to search for <code
+ >bowtie</code
+ >. See <a href="#installing-crossbow"
+ >Installing Crossbow</a
+ > for details. Because one of the steps executed by the script builds a Bowtie index of mouse chromosome 17, it should be run on a computer with plenty of memory (at least 4 gigabytes, preferably 6 or more).</p
+><p
+>Run the following commands:</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/reftools
+./mm9_chr17_jar
+</code
+ ></pre
+><p
+>The <code
+ >mm9_chr17_jar</code
+ > script will automatically:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Download the FASTA sequence for mouse (build <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >) chromosome 17 from <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >.</li
+ ><li
+ >Build an index from that FASTA sequence.</li
+ ><li
+ >Download the known SNPs and SNP frequencies for mouse chromosome 17 from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ >.</li
+ ><li
+ >Arrange this information in the directory structure expected by Crossbow.</li
+ ><li
+ >Package the information in a <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > named <code
+ >mm9_chr17.jar</code
+ >.</li
+ ></ol
+><p
+>Move the directory containing the new reference jar into the <code
+ >$CROSSBOW_REFS</code
+ > directory:</p
+><pre
+><code
+ >mv $CROSSBOW_HOME/reftools/mm9_chr17 $CROSSBOW_REFS/
+</code
+ ></pre
+><p
+>Now change to the <code
+ >$CROSSBOW_HOME/example/mouse17</code
+ > directory and run Crossbow (substitute the number of CPUs you'd like to use for <code
+ ><CPUS></code
+ >):</p
+><pre
+><code
+ >cd $CROSSBOW_HOME/example/mouse17
+$CROSSBOW_HOME/cb_local \
+ --input=$CROSSBOW_HOME/example/mouse17/full.manifest \
+ --preprocess \
+ --reference=$CROSSBOW_REFS/mm9_chr17 \
+ --output=output_full \
+ --cpus=<CPUS>
+</code
+ ></pre
+><h1 id="manifest-files"
+><a href="#TOC"
+ >Manifest files</a
+ ></h1
+><p
+>A manifest file describes a set of <a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > or <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ > formatted input files that might be located:</p
+><ol style="list-style-type: decimal;"
+><li
+ >On the local computer</li
+ ><li
+ >In <a href="http://hadoop.apache.org/common/docs/current/hdfs_design.html"
+ >HDFS</a
+ ></li
+ ><li
+ >In <a href="http://aws.amazon.com/s3/"
+ >S3</a
+ ></li
+ ><li
+ >On an FTP or web server</li
+ ></ol
+><p
+>A manifest file can contain any combination of URLs and local paths from these various types of sources.</p
+><p
+><a href="http://en.wikipedia.org/wiki/FASTQ_format"
+ >FASTQ</a
+ > files can be gzip or bzip2-compressed (i.e. with <code
+ >.gz</code
+ > or <code
+ >.bz2</code
+ > file extensions). If <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"
+ ><code
+ >.sra</code
+ ></a
+ > files are specified in the manifest and Crossbow is being run in single-computer or <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > modes, then the <code
+ >fastq-dump</code
+ > tool must be installed and Crossbow must be able to locate it. See the <a href="#cb-local-fastq-dump"
+ ><code
+ >--fastq-dump</code
+ ></a
+ > option and the <a href="#the-fastq-dump"
+ >SRA Toolkit section of the manual</a
+ >.</p
+><p
+>Each line in the manifest file represents either one file, for unpaired input reads, or a pair of files, for paired input reads. For a set of unpaired input reads, the line is formatted:</p
+><pre
+><code
+ >URL(tab)Optional-MD5
+</code
+ ></pre
+><p
+>Specifying an MD5 for the input file is optional. If it is specified, Crossbow will attempt to check the integrity of the file after downloading by comparing the observed MD5 to the user-provided MD5. To disable this checking, specify <code
+ >0</code
+ > in this field.</p
+><p
+>For a set of paired input reads, the line is formatted:</p
+><pre
+><code
+ >URL-1(tab)Optional-MD5-1(tab)URL-2(tab)Optional-MD5-2
+</code
+ ></pre
+><p
+>Where <code
+ >URL-1</code
+ > and <code
+ >URL-2</code
+ > point to input files with all the #1 mates in <code
+ >URL-1</code
+ > and all the #2 mates in <code
+ >URL-2</code
+ >. The entries in the files must be arranged so that pairs "line up" in parallel. This is commonly the way public paired-end FASTQ datasets, such as those produced by the <a href="http://www.1000genomes.org/page.php"
+ >1000 Genomes Project</a
+ >, are formatted. Typically these file pairs end in suffixes <code
+ >_1.fastq.gz</code
+ > and <code
+ >_2.fastq.gz</code
+ >.</p
+><p
+>Manifest files may have comment lines, which must start with the hash (<code
+ >#</code
+ >) symbol, and blank lines. Such lines are ignored by Crossbow.</p
+><p
+>For examples of manifest files, see the files ending in <code
+ >.manifest</code
+ > in the <code
+ >$CROSSBOW_HOME/example/e_coli</code
+ > and <code
+ >$CROSSBOW_HOME/example/mouse17</code
+ > directories.</p
+><h1 id="reference-jars"
+><a href="#TOC"
+ >Reference jars</a
+ ></h1
+><p
+>All information about a reference sequence needed by Crossbow is encapsulated in a "reference jar" file. A reference jar includes a set of FASTA files encoding the reference sequences, a <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > index of the reference sequence, and a set of files encoding information about known SNPs for the species.</p
+><p
+>A Crossbow reference jar is organized as:</p
+><ol style="list-style-type: decimal;"
+><li
+ >A <code
+ >sequences</code
+ > subdirectory containing one FASTA file per reference sequence.</li
+ ><li
+ >An <code
+ >index</code
+ > subdirectory containing the <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > index files for the reference sequences.</li
+ ><li
+ >A <code
+ >snps</code
+ > subdirectory containing all of the SNP description files.</li
+ ></ol
+><p
+>The FASTA files in the <code
+ >sequences</code
+ > subdirectory must each be named <code
+ >chrX.fa</code
+ >, where <code
+ >X</code
+ > is the 0-based numeric id of the chromosome or sequence in the file. For example, for a human reference, chromosome 1's FASTA file could be named <code
+ >chr0.fa</code
+ >, chromosome 2 named <code
+ >chr1.fa</code
+ >, etc, all the way up to chromosomes 22, X and Y, named <code
+ >chr21.fa</code
+ >, <code
+ >chr22.fa</code
+ > and <code
+ >chr23.fa</code
+ >. Also, the names of the sequences within the FASTA files must match the number in the file name. I.e., the first line of the FASTA file <code
+ >chr0.fa</code
+ > must be <code
+ >>0</code
+ >.</p
+><p
+>The index files in the <code
+ >index</code
+ > subdirectory must have the basename <code
+ >index</code
+ >. I.e., the index subdirectory must contain these files:</p
+><pre
+><code
+ >index.1.ebwt
+index.2.ebwt
+index.3.ebwt
+index.4.ebwt
+index.rev.1.ebwt
+index.rev.2.ebwt
+</code
+ ></pre
+><p
+>The index must be built using the <a href="http://bowtie-bio.sourceforge.net/manual.shtml#indx"
+ ><code
+ >bowtie-build</code
+ ></a
+ > tool distributed with <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >. When <code
+ >bowtie-build</code
+ > is executed, the FASTA files specified on the command line must be listed in ascending order of numeric id. For instance, for a set of FASTA files encoding human chromosomes 1,2,...,22,X,Y as <code
+ >chr0.fa</code
+ >,<code
+ >chr1.fa</code
+ >,...,<code
+ >chr21.fa</code
+ >, <code
+ >chr22.fa</code
+ >,<code
+ >chr23.fa</code
+ >, the command for <code
+ >bowtie-build</code
+ > must list the FASTA files in that order:</p
+><pre
+><code
+ >bowtie-build chr0.fa,chr1.fa,...,chr23.fa index
+</code
+ ></pre
+><p
+>The SNP description files in the <code
+ >snps</code
+ > subdirectory must also have names that match the corresponding FASTA files in the <code
+ >sequences</code
+ > subdirectory, but with extension <code
+ >.snps</code
+ >. E.g. if the sequence file for human Chromosome 1 is named <code
+ >chr0.fa</code
+ >, then the SNP description file for Chromosome 1 must be named <code
+ >chr0.snps</code
+ >. SNP description files may be omitted for some or all chromosomes.</p
+><p
+>The format of the SNP description files must match the format expected by <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ >'s <code
+ >-s</code
+ > option. The format consists of 1 SNP per line, with the following tab-separated fields per SNP:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Chromosome ID</li
+ ><li
+ >1-based offset into chromosome</li
+ ><li
+ >Whether SNP has allele frequency information (1 = yes, 0 = no)</li
+ ><li
+ >Whether SNP is validated by experiment (1 = yes, 0 = no)</li
+ ><li
+ >Whether SNP is actually an indel (1 = yes, 0 = no)</li
+ ><li
+ >Frequency of A allele, as a decimal number</li
+ ><li
+ >Frequency of C allele, as a decimal number</li
+ ><li
+ >Frequency of T allele, as a decimal number</li
+ ><li
+ >Frequency of G allele, as a decimal number</li
+ ><li
+ >SNP id (e.g. a <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ > id such as <code
+ >rs9976767</code
+ >)</li
+ ></ol
+><p
+>Once these three subdirectories have been created and populated, they can be combined into a single <a href="http://en.wikipedia.org/wiki/JAR_(file_format)"
+ >jar file</a
+ > with a command like this:</p
+><pre
+><code
+ >jar cf ref-XXX.jar sequences snps index
+</code
+ ></pre
+><p
+>To use <code
+ >ref-XXX.jar</code
+ > with Crossbow, you must copy it to a location where it can be downloaded over the internet via HTTP, FTP, or S3. Once it is placed in such a location, make a note of its URL.</p
+><h2 id="building-a-reference-jar-using-automatic-scripts"
+><a href="#TOC"
+ >Building a reference jar using automatic scripts</a
+ ></h2
+><p
+>The <code
+ >reftools</code
+ > subdirectory of the Crossbow package contains scripts that assist in building reference jars, including scripts that handle the entire process of building reference jars for <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > (UCSC human genome build 18) and <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ > (UCSC mouse genome build 9). The <code
+ >db2ssnp</code
+ > script combines SNP and allele frequency information from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/"
+ >dbSNP</a
+ > to create a <code
+ >chrX.snps</code
+ > file for the <code
+ >snps</code
+ > subdirectory of the reference jar. The <code
+ >db2ssnp_*</code
+ > scripts drive the <code
+ >db2ssnp</code
+ > script for each chromosome in the <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > and <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ > genomes. The <code
+ >*_jar</code
+ > scripts drive the entire reference-jar building process, including downloading reference FASTA files, building a Bowtie index, and using <code
+ >db2ssnp</code
+ > to generate the <code
+ >.snps</code
+ > files for <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > and <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse"
+ >mm9</a
+ >.</p
+><h1 id="monitoring-debugging-and-logging"
+><a href="#TOC"
+ >Monitoring, debugging and logging</a
+ ></h1
+><h2 id="single-computer-2"
+><a href="#TOC"
+ >Single computer</a
+ ></h2
+><p
+>Single-computer runs of Crossbow are relatively easy to monitor and debug. Progress messages are printed to the console as the job runs. When there is a fatal error, Crossbow usually indicates exactly which log file on the local filesystem contains the relevant error message. Additional debugging is possible when intermediate and temporary files are kept rather than discarded; see <a href="#cb-local-keep-intermediates"
+ ><code
+ >--keep-intermediates</code
+ ></a
+ > and <a href="#cb-local-keep-all"
+ ><code
+ >--keep-all</code
+ ></a
+ >. All output and logs are stored on the local filesystem; see <a href="#cb-local-intermediate"
+ ><code
+ >--intermediate</code
+ ></a
+ > and <a href="#cb-local-output"
+ ><code
+ >--output</code
+ ></a
+ > options.</p
+><h2 id="hadoop-2"
+><a href="#TOC"
+ >Hadoop</a
+ ></h2
+><p
+>The simplest way to monitor Crossbow <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > jobs is via the Hadoop JobTracker. The JobTracker is a web server that provides a point-and-click interface for monitoring jobs and reading output and other log files generated by those jobs, including after they've finished.</p
+><p
+>When a job fails, you can often find the relevant error message by "drilling down" from the "step" level through the "job" level and "task" levels, and finally to the "attempt" level. To diagnose why an attempt failed, click through to the "stderr" ("standard error") log and scan for the relevant error message.</p
+><p
+>See your version of Hadoop's documentation for details on how to use the web interface. Amazon has a brief document describing <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?UsingtheHadoopUserInterface.html"
+ >How to Use the Hadoop User Interface</a
+ >, though some of the instructions are specific to clusters rented from Amazon. <a href="http://oreilly.com/catalog/9780596521981"
+ >Hadoop, the Definitive Guide</a
+ > is also an excellent reference.</p
+><h2 id="emr-2"
+><a href="#TOC"
+ >EMR</a
+ ></h2
+><p
+>The recommended way to monitor EMR <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ > jobs is via the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >. The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > allows you to see:</p
+><ol style="list-style-type: decimal;"
+><li
+ >The status for each job (e.g. "COMPLETED", "RUNNING" or "FAILED")</li
+ ><li
+ >The status for each step of each job</li
+ ><li
+ >How long a job has been running for and how many "compute units" have been utilized so far.</li
+ ><li
+ >The exact Hadoop commands used to initiate each job step.</li
+ ><li
+ >The button for <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >Debugging Job Flows</a
+ ></li
+ ></ol
+><div><img src="images/AWS_console.png" alt="Screen shot of AWS console with interface elements labeled" /><p><i>Screen shot of <a href="https://console.aws.amazon.com"
+>AWS Console</a
+> interface with some relevant interface elements labeled</i></p>
+</div>
+<p
+>The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > also has a useful facility for <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >Debugging Job Flows</a
+ >, which is accessible via the "Debug" button on the "Elastic MapReduce" tab of the Console (labeled "5"). You must (a) have a <a href="http://aws.amazon.com/simpledb/"
+ >SimpleDB</a
+ > account (b) not have specified <a href="#cb-no-emr-debug"
+ ><code
+ >--no-emr-debug</code
+ ></a
+ > in order to use all of the <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >EMR Debug</a
+ > interface's features:</p
+><div><img src="images/AWS_console_debug.png" alt="Screen shot of AWS console debug interface" /><p><i>Screen shot of <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+>EMR Debug</a
+> interface</i></p>
+</div>
+<p
+>The debug interface is similar to Hadoop's JobTracker interface. When a job fails, you can often find the relevant error message by "drilling down" from the "job" level, through the "task" level, and finally to the "attempt" level. To diagnose why an attempt failed, click through to the "stderr" ("standard error") log and scan for the relevant error message.</p
+><p
+>For more information, see Amazon's document on <a href="http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?DebuggingJobFlows.html"
+ >Debugging Job Flows</a
+ >.</p
+><h2 id="aws-management-console"
+><a href="#TOC"
+ >AWS Management Console</a
+ ></h2
+><p
+>A simple way to monitor your EMR activity is via the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ >. The <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > summarizes current information regarding all your running <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > nodes and <a href="http://aws.amazon.com/elasticmapreduce"
+ >EMR</a
+ > jobs. Each job is listed in the "Amazon Elastic MapReduce" tab of the console, whereas individual <a href="http://aws.amazon.com/ec2"
+ >EC2</a
+ > nodes are listed in the "Amazon EC2" tab.</p
+><div><img src="images/AWS_console_upper_left.png" alt="Screen shot of AWS console tabs" /><p><i>Screen shot of <a href="https://console.aws.amazon.com"
+>AWS console</a
+>; note tabs for "Amazon Elastic MapReduce" and "Amazon EC2"</i></p>
+</div>
+<h1 id="crossbow-output"
+><a href="#TOC"
+ >Crossbow Output</a
+ ></h1
+><p
+>Once a Crossbow job completes successfully, the output is deposited in a <code
+ >crossbow_results</code
+ > subdirectory of the specified <code
+ >--output</code
+ > directory or URL. Within the <code
+ >crossbow_results</code
+ > subdirectory, results are organized as one gzipped result file per chromosome. E.g. if your run was against the <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human"
+ >hg18</a
+ > build of the human genome, the output files from your experiment will be named:</p
+><pre
+><code
+ ><output_url>/crossbow_results/chr1.gz
+<output_url>/crossbow_results/chr2.gz
+<output_url>/crossbow_results/chr3.gz
+...
+<output_url>/crossbow_results/chr21.gz
+<output_url>/crossbow_results/chr22.gz
+<output_url>/crossbow_results/chrX.gz
+<output_url>/crossbow_results/chrY.gz
+<output_url>/crossbow_results/chrM.gz
+</code
+ ></pre
+><p
+>Each individual record is in the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > output format. SOAPsnp's format consists of 1 SNP per line with several tab-separated fields per SNP. The fields are:</p
+><ol style="list-style-type: decimal;"
+><li
+ >Chromosome ID</li
+ ><li
+ >1-based offset into chromosome</li
+ ><li
+ >Reference genotype</li
+ ><li
+ >Subject genotype</li
+ ><li
+ >Quality score of subject genotype</li
+ ><li
+ >Best base</li
+ ><li
+ >Average quality score of best base</li
+ ><li
+ >Count of uniquely aligned reads corroborating the best base</li
+ ><li
+ >Count of all aligned reads corroborating the best base</li
+ ><li
+ >Second best base</li
+ ><li
+ >Average quality score of second best base</li
+ ><li
+ >Count of uniquely aligned reads corroborating second best base</li
+ ><li
+ >Count of all aligned reads corroborating second best base</li
+ ><li
+ >Overall sequencing depth at the site</li
+ ><li
+ >Sequencing depth of just the paired alignments at the site</li
+ ><li
+ >Rank sum test P-value</li
+ ><li
+ >Average copy number of nearby region</li
+ ><li
+ >Whether the site is a known SNP from the file specified with <code
+ >-s</code
+ ></li
+ ></ol
+><p
+>Note that field 15 was added in Crossbow and is not output by unmodified SOAPsnp.</p
+><p
+>For further details, see the <a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > manual.</p
+><h1 id="other-reading"
+><a href="#TOC"
+ >Other reading</a
+ ></h1
+><p
+>The <a href="http://genomebiology.com/2009/10/11/R134"
+ >Crossbow paper</a
+ > discusses the broad design philosophy of both <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > and <a href="http://bowtie-bio.sf.net/myrna"
+ >Myrna</a
+ > and why cloud computing can be considered a useful trend for comparative genomics applications. The <a href="http://genomebiology.com/2009/10/3/R25"
+ >Bowtie paper</a
+ > discusses the alignment algorithm underlying <a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ >.</p
+><p
+>For additional information regarding Amazon EC2, S3, EMR, and related services, see Amazon's <a href="http://aws.amazon.com/documentation/"
+ >AWS Documentation</a
+ >. Some helpful screencasts are posted on the <a href="https://console.aws.amazon.com"
+ >AWS Console</a
+ > home page.</p
+><p
+>For additional information regarding Hadoop, see the <a href="http://hadoop.apache.org/"
+ >Hadoop web site</a
+ > and <a href="http://www.cloudera.com/resource/getting_started_with_hadoop"
+ >Cloudera's Getting Started with Hadoop</a
+ > document. <a href="http://www.cloudera.com/developers/downloads/virtual-machine/"
+ >Cloudera's training virtual machine</a
+ > for <a href="http://www.vmware.com/"
+ >VMWare</a
+ > is an excellent way to get acquainted with Hadoop without having to install it on a production cluster.</p
+><h1 id="acknowledgements"
+><a href="#TOC"
+ >Acknowledgements</a
+ ></h1
+><p
+><a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > software is by <a href="http://faculty.jhsph.edu/default.cfm?faculty_id=2209&grouped=false&searchText=&department_id=3&departmentName=Biostatistics"
+ >Ben Langmead</a
+ > and <a href="http://www.cbcb.umd.edu/~mschatz/"
+ >Michael C. Schatz</a
+ >.</p
+><p
+><a href="http://bowtie-bio.sf.net"
+ >Bowtie</a
+ > software is by <a href="http://faculty.jhsph.edu/default.cfm?faculty_id=2209&grouped=false&searchText=&department_id=3&departmentName=Biostatistics"
+ >Ben Langmead</a
+ > and <a href="http://www.cs.umd.edu/~cole/"
+ >Cole Trapnell</a
+ >.</p
+><p
+><a href="http://soap.genomics.org.cn/soapsnp.html"
+ >SOAPsnp</a
+ > is by Ruiqiang Li, Yingrui Li, Xiaodong Fang, Huanming Yang, Jian Wang, Karsten Kristiansen, and Jun Wang.</p
+>
+
diff --git a/doc/website/news.shtml b/doc/website/news.shtml
new file mode 100644
index 0000000..cc5f29f
--- /dev/null
+++ b/doc/website/news.shtml
@@ -0,0 +1,35 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>Crossbow: Whole Genome Resequencing Analysis in the Clouds</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+<link rel="stylesheet" type="text/css" href="../css/style.css" media="screen" />
+<meta name="verify-v1" content="YJT1CfXN3kzE9cr+jvNB+Q73lTfHrv8eivoY+xjblc0=" />
+</head>
+<body>
+<div id="wrap">
+ <!--#include virtual="top.ssi" -->
+ <div id="main">
+ <!--#include virtual="rhsidebar.ssi" -->
+ <div id="leftside">
+ <h2>News archive</h2>
+ <!--#include virtual="recent_news.ssi" -->
+ <!--#include virtual="old_news.ssi" -->
+ </div>
+ </div>
+ <!--#include virtual="foot.ssi" -->
+</div>
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-5334290-1");
+pageTracker._trackPageview();
+</script>
+<!-- End google analytics code -->
+
+</body>
+</html>
diff --git a/doc/website/old_news.ssi b/doc/website/old_news.ssi
new file mode 100644
index 0000000..d5742d8
--- /dev/null
+++ b/doc/website/old_news.ssi
@@ -0,0 +1,214 @@
+<h2>Version 1.0.6 - August 26, 2010</h2>
+<ul>
+ <li>
+ Single-computer mode now copies the output that it writes to the
+ console to a file <tt>cb.local.(pid).out</tt>. Please include the
+ contents of this file when reporting issues.
+ <li>
+ Sorting in single-computer mode is now more portable; switched
+ from command-line sort to pure-Perl <a href="http://search.cpan.org/~cnandor/File-Sort-1.01/Sort.pm">File::Sort</a>.
+ <li>
+ Fixed bug whereby the quality setting would be propagated to
+ Bowtie but not to SOAPsnp, causing SOAPsnp to operate with
+ incorrect (over-optimistic) quality values when Phred+64 or
+ Solexa+64 modes were used.
+ <li>
+ More helpful output from <tt>MapWrap.pl</tt> and <tt>ReduceWrap.pl</tt> to make it
+ easier to debug issues in single-computer-mode runs.
+ <li>
+ Fixed issue where web form would incorrectly convert + signs in
+ AWS secret key to spaces, causing some good credentials to fail
+ verification.
+ <li>
+ Fixed issue in preprocessor that mishandles copies when user's AWS
+ secret key contains slash characters.
+</ul>
+
+<h2>Version 1.0.5 - August 16, 2010</h2>
+<ul>
+ <li>
+ Fixed issue that prevented <tt>CROSSBOW_EMR_HOME</tt> environment variable
+ from working.
+ <li>
+ Fixed issue whereby <tt>Align.pl</tt> script fails to report a count for
+ the number of reads with alignments sampled due to Bowtie's <tt>-M</tt>
+ option.
+ <li>
+ Fixed issue whereby scripts in the <tt>$CROSSBOW_HOME/reftools</tt>
+ directory had <tt>#!/bin/sh</tt> headers but were actually <tt>bash</tt> scripts.
+ <li>
+ Fixed issue that made it difficult to specify a space-separated
+ list of arguments to the <tt>--bowtie-args</tt> and other <tt>--*-args</tt>
+ parameters.
+ <li>
+ Fixed issue whereby most documentation referred to arguments with
+ a single-dash prefix, whereas users with the <tt>POSIXLY_CORRECT</tt>
+ environment variable set must use a double-dash prefix.
+ Documentation and code have been updated to always use double-dash
+ prefixes.
+</ul>
+
+<h2>Myrna paper out - 8/11/10</h2>
+<ul>
+ <li>
+ The provisional version of the <a href="http://genomebiology.com/2010/11/8/R83/abstract">Myrna paper</a>
+ appeared today in <a href="http://genomebiology.com/">Genome Biology</a>. See the <a href="http://bowtie-bio.sourceforge.net/myrna">Myrna site</a>
+ for details about the software.
+</ul>
+
+<h2>Major revision: Version 1.0.4 - July 22, 2010</h2>
+<ul>
+ <li>Crossbow has been largely rewritten as an <a href="http://aws.amazon.com/elasticmapreduce/">Amazon Elastic MapReduce</a>
+ (EMR) application, as opposed to an Elastic Compute Cloud (EC2)
+ application. EMR runs on top of EC2 and is a more appropriate way
+ to run Crossbow for several reasons, including:
+ <ul>
+ <li>
+ The <a href="https://console.aws.amazon.com/elasticmapreduce/home">AWS Console's Elastic MapReduce tab</a>, together with EMR's
+ Debug Job Flow feature, provide a much friendlier interface for
+ monitoring and manipulating jobs.
+ <li>
+ The elaborate scripts for automating cluster setup, teardown,
+ proxy connection, etc., that were in old versions of Crossbow
+ are all gone. They are either no longer relevant, or else are
+ handled automatically by EMR.
+ </ul>
+ <li>A <a href="ui.html">web-based GUI</a> for composing and submitting EMR jobs has been
+ added. Most helpfully, the web GUI has features for
+ sanity-checking inputs; e.g. whether the user's credentials as
+ entered are valid, whether the input URL exists, etc.
+ <li>Crossbow is now fully "tri-mode", with separate cloud, <a href="http://hadoop.apache.org/">Hadoop</a>, and
+ single-computer operating modes. All three modes share a great
+ deal of common infrastructure, making all three modes easier to
+ maintain going forward.
+ <ul>
+ <li>
+ Crossbow's <a href="http://hadoop.apache.org/">Hadoop</a> mode is now much improved, having an interface
+ very similar to cloud and single-computer modes.
+ <li>
+ A new single-computer operating mode has been added that (a)
+ uses many processors/cores to shorten computation time, and (b)
+ does not require the user to have a cloud account or a Hadoop
+ installation. It also doesn't require Java; just appropriate
+ versions of <a href="http://bowtie-bio.sf.net">Bowtie</a>, SOAPsnp (some of which are included), Perl,
+ and other tools. Its interface is very similar to cloud and
+ <a href="http://hadoop.apache.org/">Hadoop</a> modes.
+ </ul>
+ <li>The <a href="manual.shtml">manual</a> is entirely rewritten. It now contains information
+ about running in all three modes (cloud, Hadoop, single-computer),
+ and gives multiple examples for how to run in each mode.
+ <li>Fixed a bug whereby allele frequency columns in the provided
+ reference jars had T and G columns switched.
+ <li>SOAPsnp reduce step now outputs more counter and status
+ information.
+ <li>SOAPsnp reduce step outputs an additional column per SNP
+ indicating paired-end coverage.
+ <li>Compatible with <a href="http://bowtie-bio.sf.net">Bowtie</a> versions 0.12.0 and above. Bowtie 0.12.5
+ now included.
+ <li>Compatible with <a href="http://hadoop.apache.org/">Hadoop</a> 0.18 or 0.20; both were tested.
+ <li>Many other new options and features. See <a href="manual.shtml">manual</a>.
+</ul>
+
+<h2>Crossbow 0.2.0 and Crossbow for Elastic MapReduce - Coming soon</h2>
+<ul>
+ <li>Crossbow has been rewritten to allow it to run as an
+ <a href="http://aws.amazon.com/elasticmapreduce/">Amazon Elastic MapReduce</a>
+ application. EMR is a much easier and slicker
+ way of running MapReduce programs on EC2. It will also be easier
+ for us to maintain.
+ <li>Fixed a bug whereby allele frequency columns in the provided
+ reference jars had T and G columns switched.
+ <li>SOAPsnp reduce step now outputs more counter and status
+ information.
+ <li>SOAPsnp reduce step outputs an additional column per SNP
+ indicating paired-end coverage.
+ <li>Compatible with <a href="http://bowtie-bio.sf.net">Bowtie</a>
+ versions 0.12.0 and above.
+</ul>
+
+<h2>Bowtie 0.12.0 - 12/23/09</h2>
+<ul>
+ <li>Note that <a href="http://bowtie-bio.sf.net">Bowtie version 0.12.0</a> is not compatible with Crossbow. A Crossbow update is coming soon.
+</ul>
+
+<h2>Crossbow paper out - 11/20/09</h2>
+<ul>
+ <li>
+ The provisional version of the <a href="http://genomebiology.com/2009/10/11/R134">Crossbow paper</a>
+ appeared today in <a href="http://genomebiology.com/">Genome Biology</a>.
+</ul>
+
+<h2>0.1.3 release - 10/21/09</h2>
+<ul>
+ <li>
+ <tt>cb-local</tt> now gives the user clear feedback when worker nodes fail
+ to confirm the MD5 signature of the reference jar. If this
+ failure occurs several times per node across all nodes, the
+ supplied MD5 is probably incorrect.
+ <li>
+ An extra Reduce step was added to the end of the Crossbow job to
+ bin and sort SNPs before they are downloaded to the user's computer. This
+ step also renames output files by chromosome and deletes empty
+ output files.
+ <li>Added another example that uses recently-published mouse
+ chromosome 17 data (<a href="http://genomebiology.com/2009/10/10/R112">sequenced by Sudbery <i>et al</i></a>). The TUTORIAL
+ file now points to this new example.
+ <li>More and clearer messages in the output from <tt>cb-local</tt>.
+</ul>
+
+<h2>Crossbow piece on Cloudera Blog - 10/15/09</h2>
+<ul>
+ <li><a href="http://www.cbcb.umd.edu/~mschatz/">Mike Schatz</a>,
+ Crossbow co-author, has a great
+ <a href="http://www.cloudera.com/blog/2009/10/15/analyzing-human-genomes-with-hadoop/">guest post</a> on Cloudera's
+ "Hadoop & Big Data" blog about Crossbow.
+</ul>
+
+<h2>0.1.2 release - 10/13/09</h2>
+<ul>
+ <li>Many fixes for the scripts that automate the reference-jar
+ building process.
+ <li>Added two utility scripts, <tt>dist_mfa</tt> and <tt>sanity_check</tt>, to the
+ <tt>reftools</tt> subdirectory. See their documentation for details.
+ <li>Added scripts for building a reference jar for C. elegans using
+ UCSC's <a href="http://hgdownload.cse.ucsc.edu/downloads.html#c_elegans">ce6</a> (<a href="http://www.wormbase.org/">WormBase</a>'s WS190) assembly and information from <a href="http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=6239">dbSNP</a>.
+ This small genome is used in the new <tt>TUTORIAL</tt>.
+ <li>New <tt>TUTORIAL</tt> steps the user through preprocessing reads from the
+ <a href="http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=main&m=main&s=main">NCBI Short Read Archive</a>, creating a reference jar from a UCSC
+ assembly (<a href="http://hgdownload.cse.ucsc.edu/downloads.html#c_elegans">ce6</a> in this case) and a set of SNP descriptions from
+ <a href="http://www.ncbi.nlm.nih.gov/projects/SNP/">dbSNP</a>, then running Crossbow and examining the resulting SNPs.
+ <li>Extended the preprocess-and-copy infrastructure to allow output
+ from a single input file to be split over many output files. This
+ is critical for achieving good load balance across a breadth of
+ datasets.
+</ul>
+
+<h2>0.1.1 release - 10/9/09</h2>
+<ul>
+ <li>Added scripts that automate the reference-jar building process for
+ UCSC genomes <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human">hg18</a> and <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse">mm9</a>. These scripts can be adapted to other
+ species. See the new "Using Automatic Scripts" subsection of the
+ "Building a Reference Jar" section of the <tt>MANUAL</tt> for details.
+ <li>License agreement files are now organized better. All licenses
+applying to all software included in Crossbow are in <tt>LICENSE</tt>*
+files in the Crossbow root directory.
+ <li>Minor updates to <tt>MANUAL</tt>
+</ul>
+
+
+
+<h2>0.1.0 release - 10/3/09</h2>
+ The first public release of Crossbow is now available for download.
+ This release includes:
+ <ul>
+ <li>A detailed manual (<tt>MANUAL</tt> in the expanded archive)
+ that includes step-by-step instructions for how to get
+ started with <a href="http://aws.amazon.com/">Amazon Web
+ Services</a> and Crossbow.
+ <li>Scripts for copying and preprocessing short-read FASTQ files
+ into Amazon's <a href="http://aws.amazon.com/s3/">S3 storage service</a>, including an easy-to-use
+ interactive script (<tt>cb-copy-interactive</tt>)</li>
+ <li>Scripts for running Crossbow either locally or on Amazon's <a href="http://aws.amazon.com/ec2/">EC2
+ utility computing service</a>, including an easy-to-use
+ interactive script (<tt>cb-interactive</tt>)
+ </ul>
diff --git a/doc/website/push.sh b/doc/website/push.sh
new file mode 100644
index 0000000..4c59e3e
--- /dev/null
+++ b/doc/website/push.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+##
+# push.sh
+#
+# Run this from the $CROSSBOW_HOME/doc/website subdirectory.
+#
+# Copies the files that comprise the website at
+# http://bowtie-bio.sourceforge.net/crossbow to sourceforge. You must
+# have the right sourceforge privileges to do this. The SF_USER
+# environment variable must be set appropriately.
+#
+
+[ -z "$SF_USER" ] && echo "Must set SF_USER" && exit 1
+
+scp -r * $SF_USER,bowtie-bio@web.sourceforge.net:/home/groups/b/bo/bowtie-bio/htdocs/crossbow
+scp -r ../images $SF_USER,bowtie-bio@web.sourceforge.net:/home/groups/b/bo/bowtie-bio/htdocs/crossbow/
diff --git a/doc/website/recent_news.ssi b/doc/website/recent_news.ssi
new file mode 100644
index 0000000..7844a96
--- /dev/null
+++ b/doc/website/recent_news.ssi
@@ -0,0 +1,150 @@
+<h2>Hiring Postdocs - 9/12/2012</h2>
+<ul>
+<li>
+The
+<a href="http://www.cs.jhu.edu/~langmea">Langmead</a>
+and
+<a href="http://bioinformatics.igm.jhmi.edu/salzberg">Salzberg</a>
+labs have open positions for postdoctoral researchers. See
+<a href="http://www.cs.jhu.edu/~langmea/positions.shtml">the posting</a>
+and please apply if you're interested in working with either or both of us.
+</li>
+</ul>
+
+<h2>Version 1.2.0 - July 20, 2012</h2>
+<ul>
+ <li>Added support for Hadoop version 0.20.205.</li>
+ <li>Dropped support for Hadoop versions prior to 0.20.</li>
+ <li>Updated default Hadoop version for EMR jobs to 0.20.205.</li>
+ <li>Updated Bowtie version used to 0.12.8.</li>
+ <li>Fixed issues with streaming jar version parsing.</li>
+ <li>Fixed documentation bugs regarding --sra-toolkit option, which is
+ superseded by the
+ <a href="manual.shtml#cb-local-fastq-dump"><tt>--fastq-dump</tt></a>
+ option.</li>
+
+</ul>
+
+<h2>Version 1.1.2 - May 23, 2011</h2>
+<ul>
+ <li>
+ Added
+ <a href="manual.shtml#cb-just-align"><tt>--just-align</tt></a>
+ and
+ <a href="manual.shtml#cb-resume-align"><tt>--resume-align</tt></a>
+ options.
+ <a href="manual.shtml#cb-just-align"><tt>--just-align</tt></a>
+ causes Crossbow to put the results of the Alignment phase in the
+ <tt>--output</tt>
+ directory and quit after the alignment phase. You can
+ later "resume" Crossbow by specifying this directory as the
+ <tt>--input</tt>
+ directory and specifying the
+ <a href="manual.shtml#cb-resume-align"><tt>--resume-align</tt></a>
+ option.
+ <li>
+ Fixed issue with <tt>.sra</tt> input whereby status output from fastq-dump
+ would be interpreted as a read.
+ <li>
+ Other minor bugfixes.
+</ul>
+
+<h2>Version 1.1.1 - February 7, 2011</h2>
+<ul>
+ <li>
+ Added support for the
+ <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"><tt>.sra</tt></a> file format, used by the
+ <a href="http://www.ncbi.nlm.nih.gov/books/NBK47533/">Sequence Read Archive</a>. These files can now be
+ specified in the manifest. Crossbow uses the <tt>fastq-convert</tt>
+ tool from the
+ <a href="http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software">SRA Toolkit</a> to convert
+ <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"><tt>.sra</tt></a>
+ files to FASTQ files in the preprocess stage.
+ <li>
+ The examples that included defunct SRA FASTQ files were updated to
+ point to new
+ <a href="http://www.ncbi.nlm.nih.gov/books/NBK47540/"><tt>.sra</tt></a>
+ files instead.
+</ul>
+
+<h2>Version 1.1.0 - October 12, 2010</h2>
+<ul>
+ <li>
+ Added
+ <a href="manual.shtml#cb-discard-ref-bins"><tt>--discard-ref-bins</tt></a>
+ and
+ <a href="manual.shtml#cb-discard-all"><tt>--discard-all</tt></a>
+ options, which can be
+ helpful to reduce Crossbow running time when a run's chief purpose
+ is to test whether it runs all the way through.
+ <li>
+ Fixed a bug in <tt>soapsnp</tt> that caused a segmentation fault in the
+ last partition of a chromosome when chromosome length is a
+ multiple of 64.
+ <li>
+ Revamped the reference jar scripts (in <tt>$CROSSBOW_HOME/reftools</tt>).
+ The new scripts use <a href="http://www.ensembl.org">Ensembl</a> rather than
+ <a href="http://genome.ucsc.edu/">UCSC</a> &
+ <a href="http://www.ncbi.nlm.nih.gov/projects/SNP">dbSNP</a>. The old
+ scripts (<tt>db2ssnp*</tt> and <tt>*_jar</tt>) are still there, but are likely to be
+ deprecated soon.
+ <li>
+ Fixed a few bugs in the <tt>hg19_jar</tt> and <tt>db2ssnp_hg19</tt> scripts.
+ <li>
+ Removed the <tt>hg18_jar</tt> script, which was broken by a reorganization
+ of the <a href="http://www.ncbi.nlm.nih.gov/projects/SNP">dbSNP</a> site.
+ <li>
+ Uses <a href="http://bowtie-bio.sf.net">Bowtie</a> 0.12.7 instead of 0.12.5.
+ <li>
+ Switched Mouse17 example's manifest files back to use <tt>.gz</tt>
+ extension instead of <tt>.bz2</tt>.
+</ul>
+
+<h2>Version 1.0.9 - September 13, 2010</h2>
+<ul>
+ <li>
+ Fixed example manifests that point to Short Read Archive files to
+ use <tt>.bz2</tt> instead of <tt>.gz</tt> extensions.
+</ul>
+
+<h2>Version 1.0.8 - September 4, 2010</h2>
+<ul>
+ <li>
+ Set the memory cap on the sort task to be inversely proportional
+ to <tt>--cpus</tt>, to avoid memory footprint blowup on computers with more
+ processors.
+ <li>Fixed a final issue that affected how Crossbow handles quality
+ value conversion.
+ <li>Fixed issue whereby <tt>bzip2</tt>'ed data would be handled incorrectly by
+ the preprocessor.
+ <li>Fixed counter in Preprocess step that would erroneously refer to
+ unpaired reads as paired. Also "Read data fetched to EC2" has
+ been changed to "Read data fetched".
+ <li>In EMR mode, updated where user credentials are found; Amazon
+ changed their path sometime around 8/30/2010.
+ <li>In EMR mode, updated the manner in which the bootstrap action is
+ specified; the old way was disabled by Amazon sometime around
+ 8/30/2010.
+ <li>Fixed issue whereby <tt>ReduceWrap.pl</tt> would crash in cases with a
+ large number of bins (>10 million).
+ <li>NOTE: The Short Read Archive (SRA) seems to be in the midst of a
+ reorganization that includes files that were previously gzipped
+ being replaced with versions zipped with bzip2. The files will
+ sometimes disappear for a while. If you are having problems with
+ an example where input reads come from the SRA, try renaming files
+ in the manifest file as appropriate. If that doesn't work, please
+ contact us.
+</ul>
+
+<h2>Version 1.0.7 - August 27, 2010</h2>
+<ul>
+ <li>
+ Fixed issue whereby the order of the arguments to <tt>bowtie</tt> would
+ result in a crash when <tt>POSIXLY_CORRECT</tt> was set.
+ <li>
+ Fixed <a href="manual.shtml#cb-local-keep-all"><tt>--keep-all</tt></a> option, which was causing a crash.
+ <li>
+ Fixed a lingering quality bug whereby qualities were converted
+ immediately to phred33 but phred64 or solexa64 flags would be
+ spuriously passed to Bowtie.
+</ul>
diff --git a/doc/website/rhsidebar.ssi b/doc/website/rhsidebar.ssi
new file mode 100644
index 0000000..ed1f1ed
--- /dev/null
+++ b/doc/website/rhsidebar.ssi
@@ -0,0 +1,126 @@
+<div id="rightside" style="width: 320px ; padding-left: 20px">
+ <h2>Site Map</h2>
+ <div class="box">
+ <ul>
+ <li><a href="index.shtml">Home</a></li>
+ <li><a href="ui.html" target="_blank">Web interface</a></li>
+ <li><a href="news.shtml">News archive</a></li>
+ <li><a href="manual.shtml#crossbow-examples">Getting Started</a></li>
+ <li><a href="manual.shtml">Manual</a></li>
+ <li><a href="faq.shtml">Frequently Asked Questions</a></li>
+ </ul>
+ </div>
+ <a href="http://sourceforge.net/projects/bowtie-bio/files/crossbow/"><h2><u>Latest Release</u></h2></a>
+ <div class="box">
+ <ul>
+ <table width="100%">
+ <tr>
+ <td>
+ <a href="http://sourceforge.net/projects/bowtie-bio/files/crossbow/">Crossbow 1.2.0</a>
+ </td>
+ <td align="right">
+ 7/20/12
+ </td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <li style="font-size: x-small; line-height: 130%">Please cite: Langmead B, Schatz MC, Lin J, Pop M, Salzberg SL. <a href="http://genomebiology.com/2009/10/11/R134">Searching for SNPs with cloud computing</a>. <i><a href="http://genomebiology.com">Genome Biol</a></i> 10:R134.
+ </td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <li style="font-size: x-small; line-height: 130%">For release updates, subscribe to the <a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce">mailing list</a>.
+ </td>
+ </tr>
+ </table>
+ </ul>
+ </div>
+ <h2>Related Tools</h2>
+ <div class="box">
+ <ul>
+ <table width="100%">
+ <tr><td><a href="http://bowtie-bio.sf.net">Bowtie</a>: Ultrafast short read alignment</td></tr>
+ <tr><td><a href="http://hadoop.apache.org">Hadoop</a>: Open Source MapReduce</td></tr>
+ <tr><td><a href="https://sourceforge.net/apps/mediawiki/contrail-bio/index.php?title=Contrail">Contrail</a>: Cloud-based <i>de novo</i> assembly</td></tr>
+ <tr><td><a href="https://sourceforge.net/apps/mediawiki/cloudburst-bio/index.php?title=CloudBurst">CloudBurst</a>: Sensitive MapReduce alignment</td></tr>
+ <tr><td><a href="http://bowtie-bio.sf.net/myrna">Myrna</a>: Cloud, differential gene expression</td></tr>
+ <tr><td><a href="http://tophat.cbcb.umd.edu">Tophat</a>: RNA-Seq splice junction mapper</td></tr>
+ <tr><td><a href="http://cufflinks.cbcb.umd.edu">Cufflinks</a>: Isoform assembly, quantitation</td></tr>
+ <tr><td><a href="http://soap.genomics.org.cn/soapsnp.html">SoapSNP</a>: Accurate SNP/consensus calling</td></tr>
+ </table>
+ </ul>
+ </div>
+ <h2>Reference jars</h2>
+ <div class="box">
+ <ul>
+ <table width="100%">
+ <tr><td></td></tr>
+ <tr><td>
+ <i>H. sapiens</i>: <a href="http://hgdownload.cse.ucsc.edu/downloads.html#human">hg18</a>/<a href="http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606">dbSNP 130</a>
+ </td></tr>
+ <tr><td style="font-size: x-small">
+ s3n://crossbow-refs/hg18.jar<br/>
+ </td></tr>
+
+ <tr><td>
+ <i>M. musculus</i>: <a href="http://hgdownload.cse.ucsc.edu/downloads.html#mouse">mm9</a>/<a href="http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=10090">dbSNP 128</a>
+ </td></tr>
+ <tr><td style="font-size: x-small">
+ s3n://crossbow-refs/mm9.jar<br/>
+ </td></tr>
+
+ <tr><td>
+ <i>E. coli</i>: O157:H7, NCBI (no SNPs)
+ </td></tr>
+ <tr><td style="font-size: x-small">
+ s3n://crossbow-refs/e_coli.jar<br/>
+ </td></tr>
+
+ </table>
+ </ul>
+ </div>
+ <h2>Related publications</h2>
+ <div class="box">
+ <ul>
+ <li style="font-size: x-small; line-height: 130%">Langmead B, Schatz M, Lin J, Pop M, Salzberg SL. <a href="http://genomebiology.com/2009/10/11/R134"><b>Searching for SNPs with cloud computing</b></a>. <i><a href="http://genomebiology.com">Genome Biology</a></i> <b>10</b>:R134.
+ <br/><br/>
+ <li style="font-size: x-small; line-height: 130%">Schatz M, Langmead B, Salzberg SL. <a href="http://www.nature.com/nbt/journal/v28/n7/abs/nbt0710-691.html"><b>Cloud computing and the DNA data race</b></a>. <i><a href="http://www.nature.com/nbt/index.html">Nature Biotechnology</a></i> 2010 Jul;28(7):691-3.
+ <br/><br/>
+ <li style="font-size: x-small; line-height: 130%">Langmead B, Hansen K, Leek J. <a href="http://genomebiology.com/2010/11/8/R83"><b>Cloud-scale RNA-sequencing differential expression analysis with Myrna</b></a>. <i><a href="http://genomebiology.com">Genome Biology</a></i> <b>11</b>:R83.
+ <br/><br/>
+ <li style="font-size: x-small; line-height: 130%">Langmead B, Trapnell C, Pop M, Salzberg SL. <a href="http://genomebiology.com/2009/10/3/R25"><b>Ultrafast and memory-efficient alignment of short DNA sequences to the human genome</b></a>. <i><a href="http://genomebiology.com">Genome Biology</a></i> <b>10</b>:R25.
+ <br/><br/>
+ <li style="font-size: x-small; line-height: 130%">Li R, Li Y, Fang X, Yang H, Wang J, Kristiansen K, Wang J. <a href="http://genome.cshlp.org/content/19/6/1124.abstract"><b>SNP detection for massively parallel whole-genome resequencing</b></a>. <i><a href="http://genome.cshlp.org/">Genome Res.</a></i> 2009. 19: 1124-1132.</li>
+ </ul>
+ </div>
+ <h2>Authors</h2>
+ <div class="box">
+ <ul>
+ <li><a href="http://www.cbcb.umd.edu/~langmead">Ben Langmead</a></li>
+ <li><a href="http://www.cbcb.umd.edu/~mschatz/">Michael Schatz</a></li>
+ </ul>
+ </div>
+ <h2>Other Documentation</h2>
+ <div class="box">
+ <ul>
+ <li>WABI 09 Poster (<a href="http://www.cbcb.umd.edu/~mschatz/Posters/Crossbow_WABI_Sept2009.pdf">.pdf</a>)</li>
+ </ul>
+ </div>
+ <h2>Links</h2>
+ <div class="box">
+ <ul>
+ <li>Bowtie <a href="https://sourceforge.net/projects/bowtie-bio/">sourceforge.net project</a></li>
+ <ul>
+ <li> <a href="https://sourceforge.net/tracker/?func=add&group_id=236897&atid=1101609">Request a feature</a></li>
+ <li> <a href="https://sourceforge.net/tracker/?func=add&group_id=236897&atid=1101606">Report a bug</a></li>
+ </ul>
+ <li><a href="http://seqanswers.com/">SEQanswers</a></li>
+ <li><a href="http://www.umd.edu/">University of Maryland</a></li>
+ <li> <a href="http://www.cbcb.umd.edu/">UMD CBCB</a></li>
+ <li> <a href="http://www.cs.umd.edu/">UMD Computer Science</a></li>
+ <li> <a href="http://www.umiacs.umd.edu/">UMIACS</a></li>
+ <li><a href="http://www.jhsph.edu/">J.H. Bloomberg School of Public Health</a></li>
+ <li> <a href="http://www.biostat.jhsph.edu/">JHSPH Biostatistics</a></li>
+ </ul>
+ </div>
+</div> <!-- End of "rightside" -->
diff --git a/doc/website/top.ssi b/doc/website/top.ssi
new file mode 100644
index 0000000..96ca953
--- /dev/null
+++ b/doc/website/top.ssi
@@ -0,0 +1,12 @@
+ <div id="top">
+ <div class="lefts">
+ <table width="100%" cellpadding="2">
+ <tr><td>
+ <a href="./index.shtml"><h1>Crossbow</h1></a>
+ <h2>Genotyping from short reads using cloud computing</h2>
+ </td><td align="right" valign="middle">
+ <h1><a href="http://www.cbcb.umd.edu/"><img border=0 src="../images/cbcblogo.gif"></a> <a href="http://www.biostat.jhsph.edu/"><img border=0 src="../images/jhsph_white.png"></img></a> </h1>
+ </td></tr>
+ </table>
+ </div>
+ </div>
diff --git a/doc/website/tutorial.shtml b/doc/website/tutorial.shtml
new file mode 100644
index 0000000..fb62cfa
--- /dev/null
+++ b/doc/website/tutorial.shtml
@@ -0,0 +1,234 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>Crossbow: Tutorial</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+<link rel="stylesheet" type="text/css" href="../css/style.css" media="screen" />
+<meta name="verify-v1" content="YJT1CfXN3kzE9cr+jvNB+Q73lTfHrv8eivoY+xjblc0=" />
+</head>
+<body>
+<div id="wrap">
+ <!--#include virtual="top.ssi" -->
+ <div id="main">
+ <!--#include virtual="rhsidebar.ssi" -->
+ <div id="leftside">
+ <h1> Getting started</h1>
+<div id="toc"
+><ul
+ ><li
+ ><a href="#mouse-chromosome-17-example" id="TOC-mouse-chromosome-17-example"
+ >Mouse Chromosome 17 Example</a
+ ><ul
+ ><li
+ ><a href="#step-1-preprocess-and-copy-the-reads-from-the-era" id="TOC-step-1-preprocess-and-copy-the-reads-from-the-era"
+ >Step 1. Preprocess and copy the reads from the ERA</a
+ ></li
+ ><li
+ ><a href="#step-2-create-and-upload-the-reference-jar" id="TOC-step-2-create-and-upload-the-reference-jar"
+ >Step 2. Create and upload the reference jar</a
+ ></li
+ ><li
+ ><a href="#step-3-start-the-crossbow-job" id="TOC-step-3-start-the-crossbow-job"
+ >Step 3. Start the Crossbow job</a
+ ></li
+ ><li
+ ><a href="#step-4-sanity-check-the-results" id="TOC-step-4-sanity-check-the-results"
+ >Step 4. Sanity-check the results</a
+ ></li
+ ></ul
+ ></li
+ ></ul
+ ></div
+><h1 id="mouse-chromosome-17-example"
+><a href="#TOC-mouse-chromosome-17-example"
+ >Mouse Chromosome 17 Example</a
+ ></h1
+><p
+>This example guides you through (a) preprocessing and copying a public short-read dataset from the <a href="http://www.ncbi.nlm.nih.gov/Traces/sra/"
+ >NCBI Short Read Archive</a
+ > into <a href="https://s3.amazonaws.com/"
+ >Amazon S3</a
+ >, (b) creating a reference jar using public data from <a href="http://www.ncbi.nlm.nih.gov/projects/SNP"
+ >dbSNP</a
+ > and <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >, then (c) running a <a href="http://bowtie-bio.sf.net/crossbow"
+ >Crossbow</a
+ > job that aligns and calls SNPs from that dataset. The datasets used here are for M. musculus chromosome 17. This example is intended to show how each step of the process works; it does not require much time or money to run. This example is not intended to highlight Crossbow's speed or scalability. Those features are far better demonstrated using much larger, whole-genome datasets.</p
+><p
+>This example assumes that you have already set up your <a href="http://aws.amazon.com/" title="Amazon Web Services"
+ >AWS</a
+ > accounts and credentials as described in the <a href="http://bowtie-bio.sf.net/crossbow/manual.shtml"
+ >Crossbow Manual</a
+ >.</p
+><p
+>Reads are taken from a <a href="http://genomebiology.com/2009/10/10/R112"
+ >mouse resequencing study</a
+ > by Ian Sudbery and colleagues.</p
+><h2 id="step-1-preprocess-and-copy-the-reads-from-the-era"
+><a href="#TOC-step-1-preprocess-and-copy-the-reads-from-the-era"
+ >Step 1. Preprocess and copy the reads from the ERA</a
+ ></h2
+><p
+>The quickest way to get started with a copy/preprocessing job is to simply run the <code
+ >cb-copy-interactive</code
+ > script and answer the prompts. When asked for the manifest file, specify the <code
+ >copy.manifest</code
+ > file in the <code
+ >examples/mouse17</code
+ > subdirectory of the Crossbow directory. Alternately use this command:</p
+><pre
+><code
+ >cb-copy-local \
+ -n 5 \
+ -r 50 \
+ -i <path-to-examples/mouse17>/copy.manifest \
+ -o s3n://<read-bucket-name>/mm9chr17 \
+ -m 2 \
+ -t c1.medium \
+ -M 500000 \
+ -c crossbowcopy<your-account-#-without-dashes>
+</code
+ ></pre
+><p
+>These are reasonable defaults. In our experiments, this job took about 15 minutes and cost about $2-3. Remember to terminate it when it completes.</p
+><p
+>Once the job is complete, you should see that the destination directory in S3 contains a set of files (about 115 of them if you used <code
+ >-M 500000</code
+ >) with names that start with <code
+ >ERR0028</code
+ > and end with <code
+ >gz</code
+ >.</p
+><h2 id="step-2-create-and-upload-the-reference-jar"
+><a href="#TOC-step-2-create-and-upload-the-reference-jar"
+ >Step 2. Create and upload the reference jar</a
+ ></h2
+><p
+>Scripts have already been created to create a reference jar from publicly available <em
+ >M. musculus</em
+ > genome data (at <a href="http://hgdownload.cse.ucsc.edu/downloads.html"
+ >UCSC</a
+ >) and SNP data (at <a href="http://www.ncbi.nlm.nih.gov/projects/SNP"
+ >dbSNP</a
+ >). Change to the <code
+ >reftools</code
+ > subdirectory and run the <code
+ >mm9_chr17_jar</code
+ > shell script. When the script completes, a <code
+ >mm9chr17</code
+ > subdirectory should have been created containing a jar file named <code
+ >mm9_chr17.jar</code
+ >. Upload this jar file to an S3 bucket (e.g. using <a href="http://s3tools.org/s3cmd"
+ >s3cmd</a
+ >'s <code
+ >put</code
+ > command, <a href="http://hadoop.apache.org/"
+ >Hadoop</a
+ >'s <code
+ >hadoop fs -put</code
+ > command, or a graphical user interface such as <a href="http://www.s3fox.net/"
+ >S3 Firefox Organizer</a
+ >, <a href="http://www.bucketexplorer.com/"
+ >Bucket Explorer</a
+ > or <a href="http://cyberduck.ch/"
+ >Cyberduck</a
+ >) and change its permissions to be readable by Everyone. You should now be able to access it through the following URL:</p
+><pre
+><code
+ >http://<bucket-name>.s3.amazonaws.com/<path-to-jar>
+</code
+ ></pre
+><p
+>Make a note of the URL for the next step. You may also want to make a note of the reference jar file's <a href="http://en.wikipedia.org/wiki/Md5"
+ >MD5</a
+ > checksum either by running a tool like <code
+ >md5sum</code
+ > on a local copy of <code
+ >mm9_chr17.jar</code
+ >, or by running something like <code
+ >s3cmd ls --list-md5</code
+ > on the jar in S3.</p
+><h2 id="step-3-start-the-crossbow-job"
+><a href="#TOC-step-3-start-the-crossbow-job"
+ >Step 3. Start the Crossbow job</a
+ ></h2
+><p
+>The quickest way to get started with a Crossbow job is to simply run the <code
+ >cb-interactive</code
+ > script and answer the prompts. When asked for the reference jar file, specify the URL from the previous step. For extra data integrity, also specify the <a href="http://en.wikipedia.org/wiki/Md5"
+ >MD5</a
+ > checksum from the previous step when prompted. The maximum read length in this dataset is 36. When prompted for an instance type, select option 5 (<code
+ >c1.xlarge</code
+ >). When prompted for number of nodes, select 3. Alternately, issue this command (portions in square brackets are optional):</p
+><pre
+><code
+ >cb-local \
+ -r http://<bucket-name>.s3.amazonaws.com/<path-to-jar>[::<MD5>] \
+ -n 8 \
+ -i s3n://<read-bucket-name>/mm9chr17 \
+ -t c1.xlarge \
+ -a "-v 2 --strata --best -m 1" \
+ -b "-2 -u -n -q" \
+ -L 36 \
+ -s 1000000 \
+ -q phred33 \
+ -c crossbow<your-account-#-without-dashes>
+</code
+ ></pre
+><p
+>These are reasonable defaults. In our experiments, this job took about 30-40 minutes to run and cost about
+ $2.50-$3.
+ Remember to terminate it when it completes.</p
+><p
+>Note that these are not reasonable defaults for genomes larger than a few hundred megabases. For larger genomes, always use the <code
+ >c1.xlarge</code
+ > instance type (option <code
+ >-t c1.xlarge</code
+ >).</p
+><p
+>See the manual for instructions on how to monitor your EC2 job.</p
+><h2 id="step-4-sanity-check-the-results"
+><a href="#TOC-step-4-sanity-check-the-results"
+ >Step 4. Sanity-check the results</a
+ ></h2
+><p
+>Results are automatically downloaded from the EC2 cluster into the directory from which Crossbow was run and saved in a tar archive with name <code
+ ><cluster-name>-output.tar</code
+ >. To unpack the tar archive, run:</p
+><pre
+><code
+ >tar xvf <cluster-name>-output.tar
+</code
+ ></pre
+><p
+>or a similar command. The archive will be unpacked to a subdirectory named <code
+ >output</code
+ >, which will contain a set of files named <code
+ >chrXX.gz</code
+ >, where <code
+ >XX</code
+ > is a chromosome name as specified in the chromosome map (cmap) file when the reference jar was built. Each <code
+ >chrXX.gz</code
+ > file contains all of the SNPs on chromosome <code
+ >XX</code
+ > sorted along the forward reference strand.</p
+>
+ </div>
+ </div>
+ <!--#include virtual="foot.ssi" -->
+</div>
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-5334290-1");
+pageTracker._trackPageview();
+</script>
+<!-- End google analytics code -->
+
+</body>
+</html>
diff --git a/doc/website/ui.html b/doc/website/ui.html
new file mode 100644
index 0000000..a998978
--- /dev/null
+++ b/doc/website/ui.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<html>
+<head>
+<title>Crossbow: Whole Genome Resequencing Analysis in the Clouds</title>
+<meta http-equiv="REFRESH" content="0;url=http://bio-cloud-1449786154.us-east-1.elb.amazonaws.com/cgi-bin/crossbow.pl"></HEAD>
+<BODY>
+Redirecting to Crossbow web UI.
+</BODY>
+</HTML>
diff --git a/emr/util/pull_push.sh b/emr/util/pull_push.sh
new file mode 100644
index 0000000..34484aa
--- /dev/null
+++ b/emr/util/pull_push.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+##
+# pull_push.sh
+#
+# Run from Crossbow root directory (i.e. sh emr/util/pull_push.sh X1.Y1.Z1
+# X2.Y2.Z2 where X1.Y1.Z1 is the source version and X2.Y2.Z2 is the
+# destination version). Copies all of the S3-resident files from
+# an S3 subdirectory corresponding to one version of Crossbow to
+# another S3 subdirectory corresponding to another (usually newer)
+# version. Once that copy is done, the Perl-script infrastructure is
+# copied from the local computer into the new S3 directory, overwriting
+# the older versions of those files. The S3CFG environment variable
+# must be set to an appropriate .s3cfg file (config file for s3cmd).
+#
+
+d=`dirname $0`
+d=$d/../..
+
+VERSION_OLD=$1
+[ -z "$VERSION_OLD" ] && echo "Must specify source version as argument" && exit 1
+shift
+VERSION_NEW=$1
+[ -z "$VERSION_NEW" ] && echo "Must specify destination version as argument" && exit 1
+[ -z "$S3CFG" ] && echo "S3CFG not set" && exit 1
+
+s3cmd -c $S3CFG \
+ --acl-public --recursive cp \
+ s3://crossbow-emr/$VERSION_OLD/ \
+ s3://crossbow-emr/$VERSION_NEW
+
+s3cmd -c $S3CFG \
+ --acl-public \
+ put \
+ $d/*.pl $d/*.pm \
+ s3://crossbow-emr/$VERSION_NEW/
diff --git a/emr/util/push.sh b/emr/util/push.sh
new file mode 100644
index 0000000..5869de9
--- /dev/null
+++ b/emr/util/push.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+##
+# push.sh
+#
+# Run from Crossbow root directory (i.e. sh emr/util/push.sh X.Y.Z
+# where X.Y.Z is version). Puts all of the Perl-script infrastructure
+# into place. Doesn't do anything about the binaries. You either have
+# to push those yourself or use the pull_push.sh script to move
+# everything from one version to another first. The S3CFG environment
+# variable must be set to an appropriate .s3cfg file (config file for
+# s3cmd).
+#
+
+d=`dirname $0`
+d=$d/../..
+
+VERSION=$1
+[ -z "$VERSION" ] && echo "Must specify version as argument" && exit 1
+[ -z "$S3CFG" ] && echo "S3CFG not set" && exit 1
+
+s3cmd -c $S3CFG \
+ --acl-public \
+ put \
+ $d/*.pl $d/*.pm \
+ s3://crossbow-emr/$VERSION/
diff --git a/example/e_coli/full.manifest b/example/e_coli/full.manifest
new file mode 100644
index 0000000..d3f5c50
--- /dev/null
+++ b/example/e_coli/full.manifest
@@ -0,0 +1,4 @@
+# Two runs from study "Use of high throughput sequencing to observe
+# genome dynamics at a single cell level" by Parkhomchuk et al.
+ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/litesra/SRR/SRR014/SRR014475/SRR014475.lite.sra 0
+ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/litesra/SRR/SRR014/SRR014476/SRR014476.lite.sra 0
diff --git a/example/e_coli/small.manifest b/example/e_coli/small.manifest
new file mode 100644
index 0000000..a2e35ac
--- /dev/null
+++ b/example/e_coli/small.manifest
@@ -0,0 +1,3 @@
+# One run from study "Use of high throughput sequencing to observe
+# genome dynamics at a single cell level" by Parkhomchuk et al.
+ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/litesra/SRR/SRR014/SRR014475/SRR014475.lite.sra 0
diff --git a/example/mouse17/full.manifest b/example/mouse17/full.manifest
new file mode 100644
index 0000000..9fe6b6c
--- /dev/null
+++ b/example/mouse17/full.manifest
@@ -0,0 +1,11 @@
+#
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002814/ERR002814_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002814/ERR002814_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002815/ERR002815_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002815/ERR002815_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002816/ERR002816_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002816/ERR002816_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002817/ERR002817_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002817/ERR002817_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002818/ERR002818_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002818/ERR002818_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002819/ERR002819_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002819/ERR002819_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002820/ERR002820_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002820/ERR002820_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002821/ERR002821_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002821/ERR002821_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002822/ERR002822_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002822/ERR002822_2.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002823/ERR002823_1.fastq.gz 0 ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002823/ERR002823_2.fastq.gz 0
diff --git a/example/mouse17/small.manifest b/example/mouse17/small.manifest
new file mode 100644
index 0000000..0ff2be3
--- /dev/null
+++ b/example/mouse17/small.manifest
@@ -0,0 +1,3 @@
+#
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002814/ERR002814_1.fastq.gz 0
+ftp://ftp.era.ebi.ac.uk/vol1/fastq/ERR002/ERR002814/ERR002814_2.fastq.gz 0
diff --git a/reftools/chimp_ensembl.sh b/reftools/chimp_ensembl.sh
new file mode 100755
index 0000000..72cfedf
--- /dev/null
+++ b/reftools/chimp_ensembl.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+
+##
+# chimp_ensembl.sh
+#
+# Build a chimp reference jar from scratch using info from the current
+# version of Ensembl. Put results in subdirectory called
+# "chimp_ensembl_(ver)" where (ver) is the Ensembl version used.
+#
+# To build a colorspace version, run 'chimp_ensembl.sh .c -C'.
+#
+# Needs appropriate helper scripts to exist in $CROSSBOW_HOME/reftools.
+#
+# Needs bowtie-build to be in the current dir, in the
+# $CROSSBOW_BOWTIE_HOME directory, or in the $PATH.
+#
+# Needs 'mysql' to be in the $PATH.
+#
+# Needs a good deal of scratch space (~15GB) on the current partition
+# so that the script has enough space to produce its output and make
+# copies of certain large inputs, such as fasta files.
+#
+
+SUFFIX=$1
+shift
+ENSEMBL_VER=67
+ENSEMBL_SNP_VER=214
+ENSEMBL_PREFIX=Pan_troglodytes.CHIMP2.1.4.$ENSEMBL_VER
+ENSEMBL_ORGANISM=ptroglodytes
+ENSEMBL_FTP=ftp://ftp.ensembl.org/pub/release-$ENSEMBL_VER/fasta/pan_troglodytes/dna
+ENSEMBL_SNP_DB=pan_troglodytes_variation_${ENSEMBL_VER}_${ENSEMBL_SNP_VER}
+INDEX=chimp_ensembl_${ENSEMBL_VER}$SUFFIX
+SIMPLE_NAME=$INDEX
+
+# Change to jar scratch directory
+mkdir -p $SIMPLE_NAME
+cd $SIMPLE_NAME
+
+# Compose the list of fasta files to download
+i=2
+BASE_CHRS="chromosome.1"
+while [ $i -lt 23 ] ; do
+ if [ $i -eq 2 ] ; then
+ # "Chromosome 2" comes in 2 parts
+ BASE_CHRS="$BASE_CHRS chromosome.2A chromosome.2B"
+ else
+ BASE_CHRS="$BASE_CHRS chromosome.$i"
+ fi
+ i=`expr $i + 1`
+done
+BASE_CHRS="$BASE_CHRS chromosome.X chromosome.Y nonchromosomal"
+CHRS_TO_INDEX=$BASE_CHRS
+
+[ -z "$CROSSBOW_HOME" ] && echo "CROSSBOW_HOME not set" && exit 1
+# Use the POSIX '.' builtin rather than 'source': this script runs under
+# /bin/sh, and 'source' is a bash extension that shells such as dash
+# do not provide.
+. "$CROSSBOW_HOME/reftools/shared.sh"
+
+# The functions below are not defined in this script; presumably they
+# are provided by shared.sh.
+check_prereqs
+find_bowtie_build
+# Pass the remaining command-line arguments through without re-splitting.
+do_index "$@"
+do_snps
+do_jar
+
+cd ..
diff --git a/reftools/db2ssnp b/reftools/db2ssnp
new file mode 100755
index 0000000..a18bad5
--- /dev/null
+++ b/reftools/db2ssnp
@@ -0,0 +1,314 @@
+#!/usr/bin/perl -w
+
+##
+# db2ssnp
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+
+use strict;
+use Getopt::Long;
+use FileHandle;
+
+my $usage = qq{
+Usage: db2ssnp -snps <file/URL> -cmap <file> -asm <name>
+
+Takes a dbSNP ASN.1 flat file and, optionally, a dbSNP genotype XML file,
+and converts to the 10-field format expected by SOAPsnp's -s option. If the
+genotyping XML file is specified, then allele frequency information is added to
+the output file wherever possible. The name of the reference assembly to use
+(e.g. "reference" for human or "C57BL/6J" for mouse) must be specified and must
+match the assembly. The cmap file maps chromosome names used in dbSNP to
+indexes used internally in Crossbow.
+
+Options except -freqs are mandatory. URLs may be http:// or ftp://.
+
+ -snps <file/URL> file/URL for dbSNP ASN.1 flat file to process; can be .gz
+ -freqs <file/URL> URL for dbSNP genotyping XML file to profess; can be .gz
+ -cmap <map> map dbSNP chromosome names to alternate names for output
+ -asm <name> name of genomic assembly to use in -snps, -freqs files
+};
+
+my $flatin = ""; # input files
+my $gtxml = ""; # input files
+my $cmapstr = ""; # chromosome map
+my $asm = "";
+
+my $result = GetOptions(
+ "snps=s" => \$flatin,
+ "freqs=s" => \$gtxml,
+ "cmap=s" => \$cmapstr,
+ "asm=s" => \$asm
+);
+
+$asm ne "" || die "Must specify -asm\n";
+$flatin =~ s/~/$ENV{HOME}/;
+$gtxml =~ s/~/$ENV{HOME}/;
+
+# Load the optional chromosome-name map. Each usable line holds two
+# whitespace-separated fields: a chromosome name as it appears in the
+# input, and the name to emit in its place.
+my %cmap = ();
+if($cmapstr ne "") {
+    # Fail loudly on a bad path instead of silently using an empty map.
+    open(CMAP, '<', $cmapstr) || die "Could not open cmap file '$cmapstr' for reading: $!\n";
+    while(<CMAP>) {
+        chomp;
+        my @s = split;
+        # Check the field count first so that $s[0] is never read while
+        # undefined (blank line), which would warn under -w.
+        next if $#s < 1 || $s[0] eq "";
+        $cmap{$s[0]} = $s[1];
+    }
+    close(CMAP);
+}
+
+##
+# Die unless 'curl --version' can be run through the shell and its
+# output mentions curl.
+#
+sub checkCurl {
+    # Discard stderr of curl itself (the command that may be missing)
+    # rather than of grep, so a missing curl yields only the message below.
+    my $r = system("curl --version 2>/dev/null | grep -i -q curl");
+    if($r != 0) {
+        die "'curl' executable not in PATH; please install curl and add to PATH\n";
+    }
+}
+
+##
+# Die unless 'gzip -h' exits successfully when run through the shell.
+#
+sub checkGzip {
+    my $failed = (system("gzip -h >/dev/null 2>/dev/null") != 0);
+    die "'gzip' executable not in PATH; please install gzip and add to PATH\n" if $failed;
+}
+
+##
+# Return true iff the argument, once upper-cased, is exactly one of the
+# single characters A, C, G or T.
+#
+sub isDna {
+    my $base = uc $_[0];
+    return scalar(grep { $base eq $_ } qw(A C G T)) > 0;
+}
+
+##
+# Return true iff the argument is a single quote or a forward slash,
+# the two characters treated as delimiters around an allele.
+#
+sub isAlleleSep {
+    my ($ch) = @_;
+    return ($ch eq '/' || $ch eq "'");
+}
+
+##
+# Return the complement of a single DNA character (case-insensitive
+# input, upper-case output). Dies on anything other than A, C, G or T.
+#
+sub comp {
+    my $base = uc $_[0];
+    my %complement = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A');
+    return $complement{$base} if exists $complement{$base};
+    die "Bad DNA char: $base\n";
+}
+
+##
+# Create a file handle from a filename, possibly representing a gzipped
+# file, or from an http or ftp URL.
+#
+# Names beginning with "http:" or "ftp:" are fetched with curl; names
+# ending in "gz" are decompressed with "gzip -dc". Returns an open
+# FileHandle for reading; dies if the handle cannot be opened.
+#
+sub toFileHandle {
+    my $f = shift;
+    my $fh = FileHandle->new;
+    my $isGz = ($f =~ /gz$/);
+    # Single-quote the name for the shell so that spaces, and characters
+    # such as '&' or '?' in a URL, are passed through literally.
+    (my $quoted = $f) =~ s/'/'\\''/g;
+    $quoted = "'$quoted'";
+    my $src;
+    if($f =~ /^(http|ftp):/) {
+        checkCurl();
+        checkGzip() if $isGz;
+        $src = "curl $quoted 2>/dev/null |";
+        $src .= " gzip -dc |" if $isGz;
+    } elsif($isGz) {
+        checkGzip();
+        # Hand the file to gzip as an argument. The previous form,
+        # "$f | gzip -dc |", asked the shell to execute the data file
+        # itself as a command.
+        $src = "gzip -dc $quoted |";
+    } else {
+        # Plain local file: opened directly, no shell involved.
+        $src = $f;
+    }
+    $fh->open($src) || die "Could not open '$f' for reading: $!\n";
+    return $fh;
+}
+
+# Output fields, expected by soapsnp -s
+#
+# 1) Chromosome ID
+# 2) 1-based offset into chromosome
+# 3) Whether SNP has allele frequency information (1 = yes, 0 = no)
+# 4) Whether SNP is validated by experiment (1 = yes, 0 = no)
+# 5) Whether SNP is actually an indel (1 = yes, 0 = no)
+# 6) Frequency of A allele, as a decimal number
+# 7) Frequency of C allele, as a decimal number
+# 8) Frequency of T allele, as a decimal number
+# 9) Frequency of G allele, as a decimal number
+# 10) SNP id (e.g. a dbSNP id such as "rs9976767")
+
+my $tot = 0;
+my $nval = 0;
+my %rss = ();
+my %popcnts = ();
+my %poplabs = ();
+my %isVal = ();
+
+my ($rs, $val, $als, $pop, $sz, $al, $freq, $ssToRsOrient);
+my @chrs;
+my @poss;
+my @orients;
+my $tmpfn = ".tmp.$$.dbsnp";
+open(TMP, ">$tmpfn") || die "Could not open '$tmpfn' for writing";
+my $gtLines = 0;
+# If a genotype XML file was given, scan it line by line. For every
+# closing </SnpInfo> tag, one record per collected location is written
+# to the temporary file, carrying the allele frequencies accumulated
+# for that SNP. The 'validated' column is written as a placeholder here;
+# the temporary file is re-read later to fill it in.
+if($gtxml ne "") {
+    my $fh = toFileHandle($gtxml);
+    my %als = ('A' => 0.0, 'C' => 0.0, 'G' => 0.0, 'T' => 0.0);
+    my $cumsz = 0;
+    $sz = 0;
+    while(<$fh>) {
+        $gtLines++;
+        chomp;
+        if(/<SnpInfo\s+rsId="([^"]*)"/) {
+            $rs = "rs$1";
+        } elsif(/<SnpLoc\s+genomicAssembly=".*$asm.*".*\schrom="([^"]*)".*\sstart="([^"]*)".*\srsOrientToChrom="([^"]*)"/) {
+            next if $1 eq '?';
+            next if $2 eq '?';
+            push @chrs, $1;
+            #$chrs[-1] eq $chrs[0] || die "Chrs has ".scalar(@chrs)." elts; first is $chrs[0], last is $chrs[-1]";
+            # Stored position is start+1 (output offsets are 1-based).
+            push @poss, ($2+1);
+            $2 == int($2) || die "start isn't a number: $_\n";
+            push @orients, $3;
+            $3 eq "fwd" || $3 eq "rev" || die;
+        } elsif(/<SsInfo.*\s+ssOrientToRs="([^"]*)"/) {
+            $ssToRsOrient = $1;
+            $1 eq "fwd" || $1 eq "rev" || die;
+        } elsif(/<ByPop\s+popId="([^"]*)".*\s+sampleSize="([^"]*)"/) {
+            $pop = $1;
+            # Fold the previous population's sample size into the running
+            # total before switching to the new population.
+            $cumsz += $sz;
+            $sz = $2;
+            $popcnts{$pop} += $sz;
+            $sz == int($sz) || die "Bad sample size: $_\n";
+        } elsif(/<AlleleFreq\s+allele="([^"]*)"\s+freq="([^"]*)"/) {
+            $al = $1;
+            next if !isDna($al);
+            $freq = $2;
+            $ssToRsOrient ne "" || die;
+            $al = comp($al) if $ssToRsOrient eq 'rev';
+            if($sz > 0) {
+                # Sample-size-weighted running average across populations:
+                # rescale the old average, then add this population's share.
+                $als{$al} *= (($cumsz * 1.0) / ($cumsz + $sz * 1.0));
+                if($freq > 0) {
+                    $als{$al} += (($freq * $sz) / ($cumsz + $sz * 1.0));
+                }
+            }
+        } elsif(/<\/SnpInfo>/) {
+            if($rs ne "") {
+                $rss{$rs} = 1;
+                for(my $i = 0; $i <= $#chrs; $i++) {
+                    # On the reverse orientation, report complemented alleles.
+                    my $fwd = ($orients[$i] eq 'fwd');
+                    my $af = ($fwd ? $als{A} : $als{T});
+                    my $cf = ($fwd ? $als{C} : $als{G});
+                    my $gf = ($fwd ? $als{G} : $als{C});
+                    my $tf = ($fwd ? $als{T} : $als{A});
+                    # Fall back to the input chromosome name when the cmap
+                    # has no entry for it. (The former
+                    # "my $chr = ... if defined(...)" left $chr undefined
+                    # or stale in that case.)
+                    my $chr = defined($cmap{$chrs[$i]}) ? $cmap{$chrs[$i]} : $chrs[$i];
+                    # Frequencies are emitted in A, C, T, G order.
+                    printf TMP "$chr\t$poss[$i]\t1\t1\t0\t%.4f\t%.4f\t%.4f\t%.4f\t$rs\n", $af, $cf, $tf, $gf;
+                }
+                $tot++;
+                $nval++;
+            }
+            # Reset per-SNP state for the next <SnpInfo> element.
+            %als = ('A' => 0.0, 'C' => 0.0, 'G' => 0.0, 'T' => 0.0);
+            @chrs = ();
+            @poss = ();
+            @orients = ();
+            $sz = 0;
+            $cumsz = 0;
+        } elsif(/<Population popId="([^"]*)".*\s+handle="([^"]*)".*\s+locPopId="([^"]*)"/) {
+            $2 ne "" || die;
+            $3 ne "" || die;
+            $poplabs{$1}{handle} = $2;
+            $poplabs{$1}{id} = $3;
+        }
+    }
+    close($fh);
+    print STDERR "Read allele frequncy information from $gtxml\n";
+}
+close(TMP);
+my $fh = toFileHandle($flatin);
+my %alhash = ();
+my $flatLines = 0;
+while(<$fh>) {
+ chomp;
+ $flatLines++;
+ if(/^rs/) {
+ $rs = substr($_, 0, index($_, " "));
+ $val = -1;
+ } elsif(/SNP\s+[|]\s+alleles=/) {
+ my $idx = 14;
+ substr($_, $idx, 1) eq "'" || die;
+ $idx++;
+ my $alleles = 0;
+ $als = "";
+ %alhash = ();
+ while(substr($_, $idx, 1) ne "'") {
+ my $c = substr($_, $idx, 1);
+ if(isDna($c) &&
+ isAlleleSep(substr($_, $idx-1, 1)) &&
+ isAlleleSep(substr($_, $idx+1, 1)))
+ {
+ $als .= $c;
+ $alhash{$c} = 1;
+ }
+ $idx++;
+ }
+ } elsif(/VAL\s+[|]\s+validated=/) {
+ $val = substr($_, 16, 1) eq 'Y' ? 1 : 0;
+ } elsif(/^CTG\s+[|]\s+assembly=.*$asm/) {
+ /chr=([^\s]+)/;
+ next if $1 eq '?';
+ my $chr = $1;
+ if(defined($cmap{$chr})) {
+ $chr = $cmap{$chr};
+ }
+ $chr ne '?' || die;
+ my $orient = substr($_, -1, 1);
+ $orient eq '-' || $orient eq '+' || die;
+ /chr[-]pos=([^ ]+)/;
+ my $pos = $1;
+ next if $pos eq '?';
+ $val != -1 || die;
+ if(length($als) > 1 && $rs ne "") {
+ if(defined($rss{$rs})) {
+ $isVal{$rs} = 1 if $val;
+ } else {
+ my $num = length($als);
+ my ($af, $cf, $gf, $tf) = (0, 0, 0, 0);
+ $af = 1.0/$num if defined($alhash{A});
+ $cf = 1.0/$num if defined($alhash{C});
+ $gf = 1.0/$num if defined($alhash{G});
+ $tf = 1.0/$num if defined($alhash{T});
+ if($orient eq '-') {
+ my $tmp;
+ $tmp = $af; $af = $tf; $tf = $tmp;
+ $tmp = $cf; $cf = $gf; $gf = $tmp;
+ }
+ print "$chr\t$pos\t0\t$val\t0";
+ printf "\t%0.3f\t%0.3f\t%0.3f\t%0.3f", $af, $cf, $tf, $gf;
+ print "\t$rs\n";
+ $tot++;
+ $nval++ if $val;
+ }
+ }
+ } elsif(/^\s*$/) {
+ $als = "";
+ }
+}
+print STDERR "Read genotype information from $flatin\n";
+close($fh);
+
+# Now go back and output SNPs with allele frequencies with the
+# correct 'validated' flag.
+open(TMP, $tmpfn) || die "Could not open '$tmpfn' for reading";
+my $tmpLines = 0;
+while(<TMP>) {
+ $tmpLines++;
+ chomp;
+ my @s = split(/\t/, $_);
+ my $val = $isVal{$s[9]} || 0;
+ printf "$s[0]\t$s[1]\t$s[2]\t$val\t$s[4]\t$s[5]\t$s[6]\t$s[7]\t$s[8]\t$s[9]\n";
+}
+print STDERR "Re-read temporary allele frequency info from $tmpfn\n";
+close(TMP);
+unlink($tmpfn);
+
+print STDERR "Total SNPs written: $tot\n";
+print STDERR " w/ allele frequency info: ".(scalar(keys %rss))."\n";
+print STDERR " validated: $nval\n";
+print STDERR " gt file lines read: $gtLines\n";
+print STDERR " flat file lines read: $flatLines\n";
+print STDERR " temp allele freq file lines read: $tmpLines\n";
+
diff --git a/reftools/db2ssnp_ce4 b/reftools/db2ssnp_ce4
new file mode 100755
index 0000000..4caa020
--- /dev/null
+++ b/reftools/db2ssnp_ce4
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+##
+# db2ssnp_ce4
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Invoke db2ssnp with appropriate arguments for each C. elegans
+# chromosome.
+
+cat <<EOF > .cmap.$$
+I 0
+II 1
+III 2
+IV 3
+V 4
+X 5
+EOF
+
+SNP_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/nematode_6239/ASN1_flat
+
+mkdir -p .ce4snps
+j=0
+for i in I II III IV V X ; do
+ perl db2ssnp -asm=reference -cmap=.cmap.$$ -snps=$SNP_BASE/ds_flat_ch$i.flat.gz > .ce4snps/chr$j.snps
+ j=`expr $j + 1`
+done
+mv .ce4snps ce4snps
+echo "Output in ce4snps"
diff --git a/reftools/db2ssnp_ce6 b/reftools/db2ssnp_ce6
new file mode 100755
index 0000000..7d91a99
--- /dev/null
+++ b/reftools/db2ssnp_ce6
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+##
+# db2ssnp_ce6
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Invoke db2ssnp with appropriate arguments for each C. elegans
+# chromosome.
+
+cat <<EOF > .cmap.$$
+I 0
+II 1
+III 2
+IV 3
+V 4
+X 5
+EOF
+
+SNP_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/nematode_6239/ASN1_flat
+
+mkdir -p .ce6snps
+j=0
+for i in I II III IV V X ; do
+ perl db2ssnp -asm=reference -cmap=.cmap.$$ -snps=$SNP_BASE/ds_flat_ch$i.flat.gz > .ce6snps/chr$j.snps
+ j=`expr $j + 1`
+done
+mv .ce6snps ce6snps
+echo "Output in ce6snps"
diff --git a/reftools/db2ssnp_hg19 b/reftools/db2ssnp_hg19
new file mode 100755
index 0000000..208b227
--- /dev/null
+++ b/reftools/db2ssnp_hg19
@@ -0,0 +1,61 @@
+#!/bin/sh
+
+##
+# db2ssnp_hg19
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Invoke db2ssnp with appropriate arguments for each human chromosome.
+
+NAME=hg19
+
+mkdir -p .${NAME}snps
+cat <<EOF > .${NAME}snps/cmap.txt
+1 0
+2 1
+3 2
+4 3
+5 4
+6 5
+7 6
+8 7
+9 8
+10 9
+11 10
+12 11
+13 12
+14 13
+15 14
+16 15
+17 16
+18 17
+19 18
+20 19
+21 20
+22 21
+X 22
+Y 23
+MT 24
+EOF
+
+GT_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/genotype
+SNP_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/ASN1_flat
+
+j=0
+for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y MT ; do
+ IN_NAME=$i
+ if [ "$i" = "MT" ] ; then OUT_NAME=M ; fi
+ if ! perl db2ssnp -asm=GRCh37 \
+ -cmap=.${NAME}snps/cmap.txt \
+ -snps=$SNP_BASE/ds_flat_ch$i.flat.gz \
+ -freqs=$GT_BASE/gt_chr$IN_NAME.xml.gz \
+ > .${NAME}snps/chr$j.snps
+ then
+ echo "Error running db2ssnp"
+ exit 1
+ fi
+ j=`expr $j + 1`
+done
+mv .${NAME}snps ${NAME}snps
+echo "Output in ${NAME}snps"
diff --git a/reftools/db2ssnp_mm9 b/reftools/db2ssnp_mm9
new file mode 100755
index 0000000..e11b615
--- /dev/null
+++ b/reftools/db2ssnp_mm9
@@ -0,0 +1,54 @@
+#!/bin/sh
+
+##
+# db2ssnp_mm9
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Invoke db2ssnp with appropriate arguments for each mouse chromosome.
+
+NAME=mm9
+
+mkdir -p .${NAME}snps
+cat <<EOF > .${NAME}snps/cmap.txt
+1 0
+2 1
+3 2
+4 3
+5 4
+6 5
+7 6
+8 7
+9 8
+10 9
+11 10
+12 11
+13 12
+14 13
+15 14
+16 15
+17 16
+18 17
+19 18
+X 19
+Y 20
+MT 21
+EOF
+
+GT_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/genotype
+SNP_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/ASN1_flat
+
+j=0
+for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y MT ; do
+ IN_NAME=$i
+ if [ "$i" = "MT" ] ; then OUT_NAME=M ; fi
+ perl db2ssnp -asm="C57BL/6J" \
+ -cmap=.${NAME}snps/snp_cmap.txt \
+ -snps=$SNP_BASE/ds_flat_ch$i.flat.gz \
+ -freqs=$GT_BASE/gt_chr$IN_NAME.xml.gz \
+ > .${NAME}snps/chr$j.snps
+ j=`expr $j + 1`
+done
+mv .${NAME}snps ${NAME}snps
+echo "Output in ${NAME}snps"
diff --git a/reftools/db2ssnp_mm9_chr17 b/reftools/db2ssnp_mm9_chr17
new file mode 100755
index 0000000..6e36ff0
--- /dev/null
+++ b/reftools/db2ssnp_mm9_chr17
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+##
+# db2ssnp_mm9_chr17
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Invoke db2ssnp with appropriate arguments for mouse chromosome 17.
+
+GT_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/genotype
+SNP_BASE=ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/ASN1_flat
+
+mkdir -p .mm9_chr17snps
+
+cat <<EOF > .mm9_chr17snps/cmap.txt
+17 0
+EOF
+
+j=0
+for i in 17 ; do
+ perl db2ssnp -asm="C57BL/6J" \
+ -cmap=.mm9_chr17snps/cmap.txt \
+ -snps=$SNP_BASE/ds_flat_ch$i.flat.gz \
+ -freqs=$GT_BASE/gt_chr$i.xml.gz \
+ > .mm9_chr17snps/chr$j.snps
+ j=`expr $j + 1`
+done
+mv .mm9_chr17snps mm9_chr17snps
+echo "Output in mm9_chr17snps"
diff --git a/reftools/e_coli_jar b/reftools/e_coli_jar
new file mode 100755
index 0000000..19ac852
--- /dev/null
+++ b/reftools/e_coli_jar
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+##
+# e_coli_jar
+#
+# Author: Ben Langmead
+# Date: 4/2/2010
+#
+# Driver script for building a reference jar for the E. coli genome.
+
+SUFFIX=$1
+shift
+OUT_DIR=.
+[ -n "$1" ] && OUT_DIR=$1
+
+BOWTIE_BUILD=bowtie-build
+if [ ! `which bowtie-build 2>/dev/null` --version ] ; then
+ if [ ! $BOWTIE_HOME/bowtie-build --version ] ; then
+ echo "Error: Could not find bowtie-build in PATH or BOWTIE_HOME"
+ exit 1
+ else
+ BOWTIE_BUILD=$BOWTIE_HOME/bowtie-build
+ fi
+fi
+
+echo "Using output directory $OUT_DIR"
+mkdir -p $OUT_DIR
+
+INDEX=e_coli$SUFFIX
+BASE=ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_O157H7
+FA=NC_002695.fna
+
+tmp=${TMPDIR-/tmp}
+tmp=$tmp/e_coli_jar.$RANDOM.$RANDOM.$$
+(umask 077 && mkdir $tmp) || {
+ echo "Could not create temporary directory! Exiting." 1>&2
+ exit 1
+}
+WD=$tmp/.$INDEX
+echo "Using temporary directory $WD"
+mkdir -p $WD
+mkdir -p $WD/sequences
+
+BUILD_INPUTS=$WD/sequences/chr0.fa
+
+cat <<EOF > $WD/cmap.txt
+complete_genome 0
+EOF
+
+if [ ! -f $WD/sequences/Escherichia_coli_O157H7.fa ] ; then
+ wget $BASE/$FA -O $WD/Escherichia_coli_O157H7.fa
+ sed -e "s/^>.*/>0/" $WD/Escherichia_coli_O157H7.fa > $WD/sequences/chr0.fa
+ rm -f $WD/Escherichia_coli_O157H7.fa
+ [ ! -f $WD/sequences/chr0.fa ] && echo "Didn't get $WD/sequences/chr0.fa" && exit 1
+fi
+
+mkdir -p $WD/index
+pushd $WD/index
+$BOWTIE_BUILD $* $BUILD_INPUTS index
+popd
+
+mkdir -p $WD/snps
+mv $WD $OUT_DIR/$INDEX
+[ ! -d $OUT_DIR/$INDEX ] && echo "Did not successfully move output to $OUT_DIR/$INDEX" && exit 1
+echo "Output in $OUT_DIR/$INDEX"
+mkdir -p $OUT_DIR/$INDEX
+pushd $OUT_DIR/$INDEX
+[ ! -f cmap.txt ] && echo "Did not successfully install cmap.txt in output dir $OUT_DIR/$INDEX" && exit 1
+echo "Running jar cf $INDEX.jar sequences snps index cmap.txt"
+jar cf ${INDEX}.jar sequences snps index cmap.txt
+#jar cf ${INDEX}.idx.jar index cmap.txt
+#jar cf ${INDEX}.snp.jar sequences snps cmap.txt
+#jar cf ${INDEX}.cmap.jar cmap.txt
+popd
diff --git a/reftools/ensembl_snps.pl b/reftools/ensembl_snps.pl
new file mode 100755
index 0000000..4161e10
--- /dev/null
+++ b/reftools/ensembl_snps.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/perl -w
+
+##
+# ensembl_snps.pl
+#
+
+# To explore Ensembl SNP database version numbers do:
+#
+# $ mysql
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Path qw(mkpath);
+
+my $user = "anonymous";
+my $host = "ensembldb.ensembl.org";
+my $port = 5306;
+my $database = "homo_sapiens_variation_67_37";
+my $crossbowOut = ""; # output dir for Crossbow-style output
+my $crossbowCmap = ""; # Crossbow-style chromosome name map
+my $limit = 0;
+
+my $noChr = 0;
+my $noOffset = 0;
+my $noName = 0;
+my $noAlleles = 0;
+my $noValidation = 0;
+my $noSummValid = 0;
+
+my $listDbs = "<off>";
+
+my $verbose = 0;
+my $dryRun = 0;
+
+my $printUsage = 0;
+
+my $usage = qq!
+Usage: perl ensembl_snps.pl [options]*
+
+Options (defaults in [ ]):
+ --user=<str> Let mysql user = <str> [anonymous]
+ --host=<str> mysql host to connect to [ensembldb.ensembl.org]
+ --port=<int> mysql port [5306]
+ --database=<str> mysql databse [homo_sapiens_variation_59_37d]
+ (This changes\! See notes below.)
+ --limit=<int> Limit number of results to max of <int> [no limit]
+
+ --list-dbs=<str> Just list databases with <str> in the name, exit.
+ If <str> is empty, lists all databases [off]
+
+ --no-chr Suppress chromosome name in output [off]
+ --no-off Suppress chromosome offset in output [off]
+ --no-name Suppress variant name in output [off]
+ --no-alleles Suppress allele string in output [off]
+ --no-validation Suppress validation status string [off]
+
+ --no-summ-valid Script summarizes validation status with "1" (at
+ least 1 validation type) or "0" (none) by default.
+ Specify this to see the raw validation string.
+
+ --cb-out=<path> Output Crossbow-style 'snps' dir to <path> [off]
+ (--no-* options are ignored)
+ --cb-cmap=<path> Use chromosome name map at <path>
+
+ --verbose Print queries and commands [off]
+ --dry-run Exit without making query; enables --verbose [off]
+
+TODO:
+ * Retrieve and print allele info
+
+See http://uswest.ensembl.org/info/data/mysql.html for info about hosts
+and ports.
+
+Use --list-dbs to determine available databases. E.g. to see all the
+human variation databases, try:
+
+ perl ensembl_snps.pl --list-dbs homo_sapiens_variation
+
+!;
+
+GetOptions (
+ "user:s" => \$user,
+ "host:s" => \$host,
+ "port:i" => \$port,
+ "database:s" => \$database,
+ "limit:i" => \$limit,
+ "no-chr" => \$noChr,
+ "no-off" => \$noOffset,
+ "no-name" => \$noName,
+ "no-alleles" => \$noAlleles,
+ "no-validation" => \$noValidation,
+ "no-summ-valid" => \$noSummValid,
+ "list-dbs:s" => \$listDbs,
+ "cb-out|crossbow-out:s" => \$crossbowOut,
+ "cb-cmap|crossbow-cmap:s" => \$crossbowCmap,
+ "verbose" => \$verbose,
+ "dryrun|dry-run" => \$dryRun,
+ "help|h|usage|?" => \$printUsage) || die "Bad option";
+
+my %cmap = ();
+if($crossbowCmap ne "") {
+ open(CMAP, $crossbowCmap) || die;
+ while(<CMAP>) {
+ chomp;
+ my @s = split(/\t/);
+ defined($s[1]) || die "Bad cmap line:\n$_\n";
+ $cmap{$s[0]} = $s[1];
+ }
+ close(CMAP);
+}
+
+$verbose = 1 if $dryRun;
+if($printUsage) { print $usage; exit 0; }
+
+mkpath($crossbowOut) if $crossbowOut ne "";
+
+if($listDbs ne "<off>") {
+ my $cmd = "mysql --user=$user --host=$host --port=$port -e \"show databases;\"";
+ open CMD, "$cmd |";
+ while(<CMD>) {
+ if($listDbs ne "") { next unless /$listDbs/i; }
+ print $_;
+ }
+ close(CMD);
+ exit 0;
+}
+
+my $limitStr = $limit > 0 ? "LIMIT $limit" : "";
+
+my $outputList = "";
+if(!$noName || $crossbowOut ne "") {
+ $outputList .= "," if $outputList ne "";
+ $outputList .= "vf.variation_name";
+}
+if(!$noChr || $crossbowOut ne "") {
+ $outputList .= "," if $outputList ne "";
+ $outputList .= "sq.name";
+}
+if(!$noOffset || $crossbowOut ne "") {
+ $outputList .= "," if $outputList ne "";
+ $outputList .= "vf.seq_region_start";
+}
+if(!$noAlleles || $crossbowOut ne "") {
+ $outputList .= "," if $outputList ne "";
+ $outputList .= "vf.allele_string";
+}
+if(!$noValidation || $crossbowOut ne "") {
+ $outputList .= "," if $outputList ne "";
+ $outputList .= "v.validation_status";
+}
+# TODO: get "validated" info
+if($outputList eq "") {
+ print STDERR "No fields selected, quitting\n";
+ exit 0;
+}
+
+my $query =
+ "SELECT CONCAT_WS(' ', $outputList) ".
+ "FROM variation_feature vf, seq_region sq, variation v ".
+ "WHERE vf.seq_region_id = sq.seq_region_id ".
+ "AND vf.seq_region_end = vf.seq_region_start ".
+ "AND vf.variation_id = v.variation_id ".
+ "$limitStr;";
+
+print STDERR "Query:\n$query\n" if $verbose;
+
+my $cmd =
+ "mysql --batch --user=$user --host=$host --port=$port ".
+ "-e \"use $database; $query\"";
+
+print STDERR "Command:\n$cmd\n" if $verbose;
+
+exit 0 if $dryRun;
+
+# Run the mysql command and process its output one record per line.
+# In Crossbow mode (--cb-out) each SNP is appended to a per-chromosome
+# "<chr>.snps" file; otherwise records are printed tab-delimited to
+# stdout.
+open(CMD, "$cmd |") || die "Could not run mysql command: $!\n";
+my $results = 0;
+my %fhs = ();   # output filename -> open filehandle (Crossbow mode)
+while(<CMD>) {
+    chomp;
+    next if /^CONCAT/;   # skip the column-header line
+    $results++;
+    # Remove mysql output crud and output as tab-delimited lines
+    s/\s+/\t/g;
+    if($crossbowOut ne "") {
+        # Parse record
+        my ($name, $chr, $offset, $alleles, $valstr) = split(/\t/);
+        defined($name) || die;
+        defined($offset) || die;
+        defined($chr) || die;
+        # Normalize the chromosome name before looking it up in the cmap:
+        # truncate at first whitespace, non-alphanumerics to underscores.
+        my $chrCmap = $chr;
+        $chrCmap =~ s/\s.*//;
+        $chrCmap =~ s/[^a-zA-Z01-9]/_/g;
+        $chr = $cmap{$chrCmap} if defined($cmap{$chrCmap});
+        my $fn = "$crossbowOut/$chr.snps";
+        unless(defined($fhs{$fn})) {
+            open($fhs{$fn}, ">$fn") || die "Could not open $fn for writing\n";
+        }
+        my @alss = split(/\//, $alleles);
+        my %als = ();
+        for my $a (@alss) {
+            $als{$a} = 1 if ($a eq "A" || $a eq "C" || $a eq "G" || $a eq "T");
+        }
+        next if scalar(keys %als) < 2; # not a SNP
+        # No real frequency data: split the weight evenly among the alleles.
+        for my $k (keys %als) { $als{$k} = ((1.0 * $als{$k}) / scalar(keys %als)); }
+        # Validated iff the validation-status field is present and
+        # non-empty. (This flag was previously inverted relative to the
+        # usage text and to the non-Crossbow branch below.)
+        my $val = ((defined($valstr) && $valstr ne "") ? "1" : "0");
+        # Crossbow file fields are:
+        # 1. Chromosome ID
+        # 2. 1-based offset into chromosome
+        # 3. Whether SNP has allele frequency information (1 = yes, 0 = no)
+        # 4. Whether SNP is validated by experiment (1 = yes, 0 = no)
+        # 5. Whether SNP is actually an indel (1 = yes, 0 = no)
+        # 6. Frequency of A allele, as a decimal number
+        # 7. Frequency of C allele, as a decimal number
+        # 8. Frequency of T allele, as a decimal number
+        # 9. Frequency of G allele, as a decimal number
+        # 10. SNP id (e.g. a dbSNP id such as rs9976767)
+        printf {$fhs{$fn}} "$chr\t$offset\t0\t$val\t0\t%0.4f\t%0.4f\t%0.4f\t%0.4f\t$name\n",
+            ($als{A} || 0), ($als{C} || 0), ($als{T} || 0), ($als{G} || 0);
+    } else {
+        my @s = split(/\t/);
+        if(!($noValidation || $noSummValid)) {
+            # A trailing field containing '/' is the allele string, i.e.
+            # no validation status followed it: summarize as "0".
+            # Otherwise the trailing field is the status: summarize as "1".
+            if($s[-1] =~ /\//) {
+                push @s, "0";
+            } else {
+                $s[-1] = "1";
+            }
+        }
+        print join("\t", @s)."\n";
+    }
+}
+print STDERR "$results results\n";
+close(CMD);
+for my $k (keys %fhs) { close($fhs{$k}); }
diff --git a/reftools/fasta_cmap.pl b/reftools/fasta_cmap.pl
new file mode 100755
index 0000000..c9b35f2
--- /dev/null
+++ b/reftools/fasta_cmap.pl
@@ -0,0 +1,70 @@
+#!/usr/bin/perl -w
+
+##
+# fasta_cmap.pl
+#
+# Scan a list of FASTA files in order. For each name line encountered
+# (again, in order), replace the name with a 0-based integer and add an
+# entry to a file (the cmap file) that maps the integer to the true
+# name. All of the non-alphanumeric characters in the name are first
+# converted to underscores before being stored in the cmap file. To
+# reduce peak disk usage, each FASTA file is deleted after it is
+# scanned.
+#
+# E.g.:
+#
+# perl fasta_cmap.pl --cmap=my.cmap -- chr1.fa chr2.fa chrMT.fa
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $out = "cmap.txt";
+my $outLong = "cmap_long.txt";
+my $suffix = ".cmap.fa";
+my $delete = 1;
+
+GetOptions(
+ "suffix=s" => \$suffix,
+ "cmap=s" => \$out,
+ "cmap-long=s" => \$outLong,
+ "no-delete" => sub {$delete = 0}) || die "Bad options";
+
+$out ne "" || die;
+open (CMAP, ">$out") || die "Could not open '$out' for writing";
+open (CMAPL, ">$outLong") || die "Could not open '$outLong' for writing";
+my $idx = 0;
+my $cmfafh = undef;
+for my $f (@ARGV) {
+ print STDERR "Processing fasta file $f...\n";
+ open (FA, (($f =~ /\.gz$/) ? "gzip -dc $f |" : $f)) || die "Could not open '$f' for reading";
+ while(<FA>) {
+ if(/^>/) {
+ my $oname = substr($_, 1);
+ chomp($oname);
+ my $name = $oname;
+ my $nameShort = $name;
+ $nameShort =~ s/\s.*//; # truncate short name at first whitespace
+ $name =~ s/[^a-zA-Z01-9]/_/g;
+ $nameShort =~ s/[^a-zA-Z01-9]/_/g;
+ print STDERR " Processing sequence '$oname' (converted to: '$name', short: '$nameShort')...\n";
+ close($cmfafh) if defined($cmfafh);
+ $cmfafh = undef;
+ open($cmfafh, ">$idx$suffix") || die "Could not open $idx$suffix for writing";
+ defined($cmfafh) || die "Filhandle not defined after opening $idx$suffix for writing";
+ print {$cmfafh} ">$idx\n";
+ print CMAP "$nameShort\t$idx\n";
+ print CMAPL "$name\t$idx\n";
+ $idx++;
+ } else {
+ defined($cmfafh) || die;
+ print {$cmfafh} $_;
+ }
+ }
+ close (FA);
+ unlink($f) if $delete;
+}
+close($cmfafh) if defined($cmfafh);
+close(CMAP);
+close(CMAPL);
diff --git a/reftools/fly_ensembl.sh b/reftools/fly_ensembl.sh
new file mode 100755
index 0000000..ef5303c
--- /dev/null
+++ b/reftools/fly_ensembl.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+
+##
+# fly_ensembl.sh
+#
+# Build a fly (D. melanogaster) reference jar from scratch using info
+# from the current version of Ensembl. Put results in subdirectory
+# called "fly_ensembl_(ver)" where (ver) is the Ensembl version used.
+#
+# To build a colorspace version, run 'fly_ensembl.sh .c -C'.
+#
+# Needs appropriate helper scripts to exist in $CROSSBOW_HOME/reftools.
+#
+# Needs bowtie-build to be in the current dir, in the
+# $CROSSBOW_BOWTIE_HOME directory, or in the $PATH.
+#
+# Needs 'mysql' to be in the $PATH.
+#
+# Needs a good deal of scratch space (~15GB) on the current partition
+# so that the script has enough space to produce its output and make
+# copies of certain large inputs, such as fasta files.
+#
+
+SUFFIX=$1 # optional suffix for the index name; any further arguments go to do_index
+[ $# -gt 0 ] && shift # a bare 'shift' with no arguments is fatal in POSIX shells such as dash
+ENSEMBL_VER=67
+ENSEMBL_SNP_VER=539
+ENSEMBL_PREFIX=Drosophila_melanogaster.BDGP5.$ENSEMBL_VER
+ENSEMBL_ORGANISM=dmelanogaster
+ENSEMBL_FTP=ftp://ftp.ensembl.org/pub/release-$ENSEMBL_VER/fasta/drosophila_melanogaster/dna
+ENSEMBL_SNP_DB=drosophila_melanogaster_variation_${ENSEMBL_VER}_${ENSEMBL_SNP_VER}
+INDEX=fly_ensembl_${ENSEMBL_VER}$SUFFIX
+SIMPLE_NAME=$INDEX
+
+# Change to jar scratch directory
+mkdir -p $SIMPLE_NAME
+cd $SIMPLE_NAME || exit 1
+
+# Compose the list of fasta files to download
+BASE_CHRS=""
+BASE_CHRS="$BASE_CHRS chromosome.2L"
+BASE_CHRS="$BASE_CHRS chromosome.2LHet"
+BASE_CHRS="$BASE_CHRS chromosome.2R"
+BASE_CHRS="$BASE_CHRS chromosome.2RHet"
+BASE_CHRS="$BASE_CHRS chromosome.3L"
+BASE_CHRS="$BASE_CHRS chromosome.3LHet"
+BASE_CHRS="$BASE_CHRS chromosome.3R"
+BASE_CHRS="$BASE_CHRS chromosome.3RHet"
+BASE_CHRS="$BASE_CHRS chromosome.4"
+BASE_CHRS="$BASE_CHRS chromosome.U"
+BASE_CHRS="$BASE_CHRS chromosome.Uextra"
+BASE_CHRS="$BASE_CHRS chromosome.X"
+BASE_CHRS="$BASE_CHRS chromosome.XHet"
+BASE_CHRS="$BASE_CHRS chromosome.YHet"
+BASE_CHRS="$BASE_CHRS chromosome.dmel_mitochondrion_genome"
+CHRS_TO_INDEX=$BASE_CHRS
+
+[ -z "$CROSSBOW_HOME" ] && echo "CROSSBOW_HOME not set" && exit 1
+. $CROSSBOW_HOME/reftools/shared.sh # '.' is the POSIX spelling; 'source' is a bashism
+
+check_prereqs
+find_bowtie_build
+do_index $*
+do_snps
+do_jar
+
+cd ..
diff --git a/reftools/hg19_jar b/reftools/hg19_jar
new file mode 100755
index 0000000..f2c49d6
--- /dev/null
+++ b/reftools/hg19_jar
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+##
+# hg19_jar
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Driver script for building a reference jar for the hg19 human genome
+# assembly.
+
+SUFFIX=$1 # first argument: suffix appended to the index name
+shift # the remaining arguments are later passed to bowtie-build via $*
+INDEX=hg19$SUFFIX
+BASE=ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes
+
+mkdir -p .${INDEX} # hidden scratch directory; renamed to ${INDEX} near the end of the script
+mkdir -p .${INDEX}/sequences
+
+BUILD_INPUTS= # comma-separated list of FASTA files, filled in by the download loop
+
+cat <<EOF > cmap.txt
+chr1 0
+chr2 1
+chr3 2
+chr4 3
+chr5 4
+chr6 5
+chr7 6
+chr8 7
+chr9 8
+chr10 9
+chr11 10
+chr12 11
+chr13 12
+chr14 13
+chr15 14
+chr16 15
+chr17 16
+chr18 17
+chr19 18
+chr20 19
+chr21 20
+chr22 21
+chrX 22
+chrY 23
+chrM 24
+EOF
+
+j=0
+for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M ; do
+    # Fetch chr$i and renumber it to $j unless a non-empty chr$j.fa is already there
+    if [ ! -s .${INDEX}/sequences/chr$j.fa ] ; then
+        # Stop on a failed download/unpack: sed's redirect would otherwise leave an
+        # empty chr$j.fa behind that satisfies the existence checks on every later run
+        wget $BASE/chr$i.fa.gz -O .${INDEX}/sequences/zchr$j.fa.gz || exit 1
+        gunzip .${INDEX}/sequences/zchr$j.fa.gz || exit 1
+        sed -e "s/^>.*/>$j/" .${INDEX}/sequences/zchr$j.fa > .${INDEX}/sequences/chr$j.fa
+        rm -f .${INDEX}/sequences/zchr$j.fa
+        [ ! -s .${INDEX}/sequences/chr$j.fa ] && echo "Didn't get .${INDEX}/sequences/chr$j.fa" && exit 1
+    fi
+    # Append to the comma-separated input list for bowtie-build
+    BUILD_INPUTS="${BUILD_INPUTS:+$BUILD_INPUTS,}.${INDEX}/sequences/chr$j.fa"
+    j=`expr $j + 1`
+done
+
+mkdir -p .${INDEX}/index
+bowtie-build $* $BUILD_INPUTS .${INDEX}/index/index
+
+sh db2ssnp_${INDEX}
+mv ${INDEX}snps .${INDEX}/snps
+mv .${INDEX} ${INDEX}
+echo "Output in ${INDEX}"
+cd ${INDEX} || exit 1
+cp ../cmap.txt .
+touch cmap.txt
+echo "Running jar cf ${INDEX}.jar sequences snps index cmap.txt"
+jar cf ${INDEX}.jar sequences snps index cmap.txt
+cd ..
diff --git a/reftools/human_ensembl.sh b/reftools/human_ensembl.sh
new file mode 100755
index 0000000..57198c8
--- /dev/null
+++ b/reftools/human_ensembl.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+
+##
+# human_ensembl.sh
+#
+# Build a human reference jar from scratch using info from the current
+# version of Ensembl. Put results in subdirectory called
+# "human_ensembl_(ver)" where (ver) is the Ensembl version used.
+#
+# To build a colorspace version, run 'human_ensembl.sh .c -C'.
+#
+# Needs appropriate helper scripts to exist in $CROSSBOW_HOME/reftools.
+#
+# Needs bowtie-build to be in the current dir, in the
+# $CROSSBOW_BOWTIE_HOME directory, or in the $PATH.
+#
+# Needs 'mysql' to be in the $PATH.
+#
+# Needs a good deal of scratch space (~15GB) on the current partition
+# so that the script has enough space to produce its output and make
+# copies of certain large inputs, such as fasta files.
+#
+
+SUFFIX=$1 # optional suffix for the index name; any further arguments go to do_index
+[ $# -gt 0 ] && shift # a bare 'shift' with no arguments is fatal in POSIX shells such as dash
+ENSEMBL_VER=67
+ENSEMBL_SNP_VER=37
+ENSEMBL_PREFIX=Homo_sapiens.GRCh37.$ENSEMBL_VER
+ENSEMBL_ORGANISM=hsapiens
+ENSEMBL_FTP=ftp://ftp.ensembl.org/pub/release-$ENSEMBL_VER/fasta/homo_sapiens/dna
+ENSEMBL_SNP_DB=homo_sapiens_variation_${ENSEMBL_VER}_${ENSEMBL_SNP_VER}
+INDEX=human_ensembl_${ENSEMBL_VER}$SUFFIX
+SIMPLE_NAME=$INDEX
+
+# Change to jar scratch directory
+mkdir -p $SIMPLE_NAME
+cd $SIMPLE_NAME || exit 1
+
+# Compose the list of fasta files to download
+i=2
+BASE_CHRS="chromosome.1"
+while [ $i -lt 23 ] ; do
+    BASE_CHRS="$BASE_CHRS chromosome.$i"
+    i=`expr $i + 1`
+done
+BASE_CHRS="$BASE_CHRS chromosome.X chromosome.Y chromosome.MT nonchromosomal"
+CHRS_TO_INDEX=$BASE_CHRS
+
+[ -z "$CROSSBOW_HOME" ] && echo "CROSSBOW_HOME not set" && exit 1
+. $CROSSBOW_HOME/reftools/shared.sh # '.' is the POSIX spelling; 'source' is a bashism
+
+check_prereqs
+find_bowtie_build
+do_index $*
+do_snps
+do_jar
+
+cd ..
diff --git a/reftools/mm9_chr17_jar b/reftools/mm9_chr17_jar
new file mode 100755
index 0000000..9704965
--- /dev/null
+++ b/reftools/mm9_chr17_jar
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+##
+# mm9_chr17_jar
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Driver script for building a reference jar for mm9 chromosome 17.
+
+SUFFIX=$1 # first argument: suffix appended to the index name
+shift # the remaining arguments are passed to bowtie-build via $*
+INDEX=mm9_chr17$SUFFIX
+BASE=ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
+
+mkdir -p .${INDEX}
+mkdir -p .${INDEX}/sequences
+
+BUILD_INPUTS=
+
+j=0
+for i in 17 ; do
+    # Stop on a failed download/unpack: sed's redirect would otherwise create an
+    # empty chr$j.fa that slips past the check below and ends up in the index
+    wget $BASE/chr$i.fa.gz -O .${INDEX}/sequences/zchr$j.fa.gz || exit 1
+    gunzip .${INDEX}/sequences/zchr$j.fa.gz || exit 1
+    sed -e "s/^>.*/>$j/" .${INDEX}/sequences/zchr$j.fa > .${INDEX}/sequences/chr$j.fa
+    rm -f .${INDEX}/sequences/zchr$j.fa
+    [ ! -s .${INDEX}/sequences/chr$j.fa ] && echo "Didn't get .${INDEX}/sequences/chr$j.fa" && exit 1
+    # Append to the comma-separated input list for bowtie-build
+    # (the ':+' expansion adds the comma only when the list is non-empty)
+    BUILD_INPUTS="${BUILD_INPUTS:+$BUILD_INPUTS,}.${INDEX}/sequences/chr$j.fa"
+    j=`expr $j + 1`
+done
+
+mkdir -p .${INDEX}/index
+bowtie-build $* $BUILD_INPUTS .${INDEX}/index/index
+
+sh db2ssnp_${INDEX}
+mv ${INDEX}snps .${INDEX}/snps
+mv .${INDEX} ${INDEX}
+echo "Output in ${INDEX}"
+pushd ${INDEX} || exit 1
+mv snps/cmap.txt .
+touch cmap.txt
+echo "Running jar cf ${INDEX}.jar sequences snps index cmap.txt"
+jar cf ${INDEX}.jar sequences snps index cmap.txt
+#jar cf ${INDEX}.idx.jar index cmap.txt
+#jar cf ${INDEX}.snp.jar sequences snps cmap.txt
+#jar cf ${INDEX}.cmap.jar cmap.txt
+popd
diff --git a/reftools/mm9_jar b/reftools/mm9_jar
new file mode 100755
index 0000000..378334f
--- /dev/null
+++ b/reftools/mm9_jar
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+##
+# mm9_jar
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/9/2009
+#
+# Driver script for building a reference jar for the mm9 mouse genome
+# assembly.
+
+SUFFIX=$1 # first argument: suffix appended to the index name
+shift # the remaining arguments are later passed to bowtie-build via $*
+INDEX=mm9$SUFFIX
+BASE=ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
+
+mkdir -p .${INDEX} # hidden scratch directory; renamed to ${INDEX} near the end of the script
+mkdir -p .${INDEX}/sequences
+
+BUILD_INPUTS= # comma-separated list of FASTA files, filled in by the download loop
+
+cat <<EOF > cmap.txt
+chr1 0
+chr2 1
+chr3 2
+chr4 3
+chr5 4
+chr6 5
+chr7 6
+chr8 7
+chr9 8
+chr10 9
+chr11 10
+chr12 11
+chr13 12
+chr14 13
+chr15 14
+chr16 15
+chr17 16
+chr18 17
+chr19 18
+chrX 19
+chrY 20
+chrM 21
+EOF
+
+j=0
+for i in `seq 1 19` X Y M ; do
+    # Fetch chr$i and renumber it to $j unless a non-empty chr$j.fa is already there
+    if [ ! -s .${INDEX}/sequences/chr$j.fa ] ; then
+        # Stop on a failed download/unpack: sed's redirect would otherwise leave an
+        # empty chr$j.fa behind that satisfies the existence checks on every later run
+        wget $BASE/chr$i.fa.gz -O .${INDEX}/sequences/zchr$j.fa.gz || exit 1
+        gunzip .${INDEX}/sequences/zchr$j.fa.gz || exit 1
+        sed -e "s/^>.*/>$j/" .${INDEX}/sequences/zchr$j.fa > .${INDEX}/sequences/chr$j.fa
+        rm -f .${INDEX}/sequences/zchr$j.fa
+        [ ! -s .${INDEX}/sequences/chr$j.fa ] && echo "Didn't get .${INDEX}/sequences/chr$j.fa" && exit 1
+    fi
+    # Append to the comma-separated input list for bowtie-build
+    BUILD_INPUTS="${BUILD_INPUTS:+$BUILD_INPUTS,}.${INDEX}/sequences/chr$j.fa"
+    j=`expr $j + 1`
+done
+
+mkdir -p .${INDEX}/index
+bowtie-build $* $BUILD_INPUTS .${INDEX}/index/index
+
+sh db2ssnp_${INDEX}
+mv ${INDEX}snps .${INDEX}/snps
+mv .${INDEX} ${INDEX}
+echo "Output in ${INDEX}"
+pushd ${INDEX} || exit 1
+cp ../cmap.txt .
+touch cmap.txt
+echo "Running jar cf ${INDEX}.jar sequences snps index cmap.txt"
+jar cf ${INDEX}.jar sequences snps index cmap.txt
+#jar cf ${INDEX}.idx.jar index cmap.txt
+#jar cf ${INDEX}.snp.jar sequences snps cmap.txt
+#jar cf ${INDEX}.cmap.jar cmap.txt
+popd
diff --git a/reftools/mouse_ensembl.sh b/reftools/mouse_ensembl.sh
new file mode 100755
index 0000000..3155958
--- /dev/null
+++ b/reftools/mouse_ensembl.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+
+##
+# mouse_ensembl.sh
+#
+# Build a mouse reference jar from scratch using info from the current
+# version of Ensembl. Put results in subdirectory called
+# "mouse_ensembl_(ver)" where (ver) is the Ensembl version used.
+#
+# To build a colorspace version, run 'mouse_ensembl.sh .c -C'.
+#
+# Needs appropriate helper scripts to exist in $CROSSBOW_HOME/reftools.
+#
+# Needs bowtie-build to be in the current dir, in the
+# $CROSSBOW_BOWTIE_HOME directory, or in the $PATH.
+#
+# Needs 'mysql' to be in the $PATH.
+#
+# Needs a good deal of scratch space (~15GB) on the current partition
+# so that the script has enough space to produce its output and make
+# copies of certain large inputs, such as fasta files.
+#
+
+SUFFIX=$1 # optional suffix for the index name; any further arguments go to do_index
+[ $# -gt 0 ] && shift # a bare 'shift' with no arguments is fatal in POSIX shells such as dash
+ENSEMBL_VER=67
+ENSEMBL_SNP_VER=37
+ENSEMBL_PREFIX=Mus_musculus.NCBIM37.$ENSEMBL_VER
+ENSEMBL_ORGANISM=mmusculus
+ENSEMBL_FTP=ftp://ftp.ensembl.org/pub/release-$ENSEMBL_VER/fasta/mus_musculus/dna
+ENSEMBL_SNP_DB=mus_musculus_variation_${ENSEMBL_VER}_${ENSEMBL_SNP_VER}
+INDEX=mouse_ensembl_${ENSEMBL_VER}$SUFFIX
+SIMPLE_NAME=$INDEX
+
+# Change to jar scratch directory
+mkdir -p $SIMPLE_NAME
+cd $SIMPLE_NAME || exit 1
+
+# Compose the list of fasta files to download
+i=2
+BASE_CHRS="chromosome.1"
+while [ $i -lt 20 ] ; do
+    BASE_CHRS="$BASE_CHRS chromosome.$i"
+    i=`expr $i + 1`
+done
+BASE_CHRS="$BASE_CHRS chromosome.X chromosome.Y chromosome.MT nonchromosomal"
+CHRS_TO_INDEX=$BASE_CHRS
+
+[ -z "$CROSSBOW_HOME" ] && echo "CROSSBOW_HOME not set" && exit 1
+. $CROSSBOW_HOME/reftools/shared.sh # '.' is the POSIX spelling; 'source' is a bashism
+
+check_prereqs
+find_bowtie_build
+do_index $*
+do_snps
+do_jar
+
+cd ..
diff --git a/reftools/sanity_check.pl b/reftools/sanity_check.pl
new file mode 100755
index 0000000..a34e909
--- /dev/null
+++ b/reftools/sanity_check.pl
@@ -0,0 +1,59 @@
+#!/usr/bin/perl
+
+#
+# sanity_check.pl
+#
+# Authors: Ben Langmead & Michael C. Schatz
+# Date: 10/12/2009
+#
+# Run from the root directory of an expanded reference jar to see how
+# often the reference character matches one of the SNP alleles at all
+# positions with SNPs. If only about half of them match, chances are
+# good that the reference FASTA files are mismatched or misaligned with
+# the dbSNP snps.
+#
+
+use warnings;
+use strict;
+
+# Walk every reference FASTA; glob() replaces the `ls` subshell and keeps odd file names intact
+for my $f (glob("sequences/*.fa")) {
+    my $bad = 0;
+    my %badc = ('A' => 0, 'C' => 0, 'G' => 0, 'T' => 0);
+    my $good = 0;
+    # Parenthesised so that '|| die' tests open()'s result rather than the (always true) file name
+    open(FA, "<", $f) || die "Could not open '$f' for reading: $!";
+    my $s = $f;
+    $s =~ s/\.fa$/.snps/;
+    $s =~ s/^sequences/snps/;
+    print STDERR "Processing $f/$s\n";
+    my $seq = "";
+    while(<FA>) {
+        chomp;
+        next if /^>/;
+        $seq .= $_;
+    }
+    close(FA);
+    open(SNPS, "<", $s) || die "Could not open '$s' for reading: $!";
+    while(<SNPS>) {
+        chomp;
+        my @s = split;
+        # Fields 6-9 are read as per-allele values in A, C, T, G order (presumably frequencies)
+        my %freq = ('A' => $s[5], 'C' => $s[6], 'T' => $s[7], 'G' => $s[8]);
+        # Field 2 is used as a 1-based offset into the concatenated sequence
+        my $refc = uc substr($seq, $s[1]-1, 1);
+        # "Bad" = the reference base is A/C/G/T yet the value for that allele is zero
+        if(exists $freq{$refc} && $freq{$refc} == 0.0) {
+            $badc{$refc}++; $bad++;
+        } else {
+            $good++;
+        }
+    }
+    close(SNPS);
+    # Per-file summary on stdout
+    print "Matched: $good, Mismatched: $bad\n";
+    print " Bad As: $badc{A}\n";
+    print " Bad Cs: $badc{C}\n";
+    print " Bad Gs: $badc{G}\n";
+    print " Bad Ts: $badc{T}\n";
+}
diff --git a/reftools/shared.sh b/reftools/shared.sh
new file mode 100755
index 0000000..2f121f0
--- /dev/null
+++ b/reftools/shared.sh
@@ -0,0 +1,156 @@
+#!/bin/sh
+
+##
+# shared.sh
+#
+# Shared routines for getting fasta files & SNP info, composing them
+# into the proper formats and indexes, and ultimately bundling them all
+# into a reference jar.
+#
+# Needs appropriate helper scripts to exist in $CROSSBOW_HOME/reftools.
+#
+# Needs bowtie-build to be in the current dir, in the
+# $CROSSBOW_BOWTIE_HOME directory, or in the $PATH.
+#
+# Needs 'mysql' to be in the $PATH.
+#
+# Needs a good deal of scratch space (~15GB) on the current partition
+# so that the script has enough space to produce its output and make
+# copies of certain large inputs, such as fasta files.
+#
+
+##
+# Get a file with either wget or curl (whichever is available, wget
+# being preferable)
+#
+get() {
+ file=$1
+ if ! wget --version >/dev/null 2>/dev/null ; then
+ if ! curl --version >/dev/null 2>/dev/null ; then
+ echo "Please install wget or curl somewhere in your PATH"
+ exit 1
+ fi
+ curl -o `basename $1` $1
+ return $?
+ else
+ wget -O `basename $1` $1
+ return $?
+ fi
+}
+
+##
+# Check that ensembl_snps.pl script is there and that 'mysql' is in the
+# path.
+#
+check_prereqs() {
+ SCRIPT_DIR=$CROSSBOW_HOME/reftools
+ [ -n "$1" ] && SCRIPT_DIR=$1
+ [ ! -f "$SCRIPT_DIR/ensembl_snps.pl" ] && echo "Can't find '$SCRIPT_DIR/ensembl_snps.pl'" && exit 1
+ [ ! -f "$SCRIPT_DIR/fasta_cmap.pl" ] && echo "Can't find '$SCRIPT_DIR/fasta_cmap.pl'" && exit 1
+ ! which mysql >/dev/null 2>/dev/null && echo "Can't find 'mysql' in path" && exit 1
+}
+
+##
+# Find a runnable bowtie-build binary.
+#
+find_bowtie_build() {
+ # Try current dir
+ BOWTIE_BUILD_EXE=./bowtie-build
+ if ! $BOWTIE_BUILD_EXE --version >/dev/null 2>/dev/null ; then
+ # Try $CROSSBOW_BOWTIE_HOME
+ BOWTIE_BUILD_EXE="$CROSSBOW_BOWTIE_HOME/bowtie-build"
+ if ! $BOWTIE_BUILD_EXE --version >/dev/null 2>/dev/null ; then
+ # Try $PATH
+ BOWTIE_BUILD_EXE=`which bowtie-build`
+ if ! $BOWTIE_BUILD_EXE --version >/dev/null 2>/dev/null ; then
+ echo "Error: Could not find runnable bowtie-build in current directory, in \$CROSSBOW_BOWTIE_HOME/bowtie-build, or in \$PATH"
+ exit 1
+ fi
+ fi
+ fi
+}
+
+##
+# Make the jar file.
+#
+do_jar() {
+ if [ ! -f jar/$INDEX.jar ]
+ then
+ # Jar it up
+ jar cf $INDEX.jar cmap.txt cmap_long.txt sequences index snps
+ else
+ echo "$INDEX.jar already present"
+ fi
+}
+
+##
+# Download every $CHRS_TO_INDEX FASTA from $ENSEMBL_FTP into sequences/,
+# run fasta_cmap.pl over them, append the resulting .fa paths to the
+# comma-separated $INPUTS, and move both cmap files up one directory.
+#
+do_get_fasta() {
+    mkdir -p sequences
+    cd sequences
+    dir=`pwd`
+    for ci in $CHRS_TO_INDEX ; do
+        c=$ENSEMBL_PREFIX.dna.$ci
+        F=${c}.fa.gz
+        if [ ! -f $F ] && ! get ${ENSEMBL_FTP}/$F ; then
+            echo "Error: Unable to get '${ENSEMBL_FTP}/$F'"
+            exit 1
+        fi
+    done
+    ARGS="--cmap=cmap.txt --cmap-long=cmap_long.txt --suffix=.fa"
+    # fasta_cmap.pl writes one <index>.fa per sequence plus the two cmap files
+    perl $SCRIPT_DIR/fasta_cmap.pl $ARGS -- $dir/*.fa.gz || {
+        echo "Error running: $SCRIPT_DIR/fasta_cmap.pl $ARGS -- $dir/*.fa.gz"
+        exit 1
+    }
+    # Gather output files into $INPUTS
+    for fa in `ls $dir/*.fa` ; do
+        INPUTS="${INPUTS:+$INPUTS,}$fa"
+    done
+    cd ..
+    if [ ! -f sequences/cmap.txt ] ; then echo "Error: no sequences/cmap.txt created" ; exit 1 ; fi
+    if [ ! -f sequences/cmap_long.txt ] ; then echo "Error: no sequences/cmap_long.txt created" ; exit 1 ; fi
+    mv sequences/cmap.txt .
+    mv sequences/cmap_long.txt .
+}
+
+##
+# Make the Bowtie index files.
+#
+do_index() {
+ if [ ! -f index/$INDEX.1.ebwt ] ; then
+ INPUTS=
+ do_get_fasta
+ mkdir -p index
+ cd index
+ CMD="$BOWTIE_BUILD_EXE $* $INPUTS $INDEX"
+ echo Running $CMD
+ if $CMD ; then
+ echo "$INDEX index built"
+ else
+ echo "Index building failed; see error message"
+ fi
+ cd ..
+ else
+ echo "$INDEX.*.ebwt files already present"
+ fi
+}
+
+##
+# Obtain SNPs for the organism using the ensembl_snps.pl script, which
+# in turn uses 'mysql' to query the Ensembl database.
+#
+do_snps() {
+ if [ ! -d snps ] ; then
+ # Create the SNP directory
+ if ! perl $SCRIPT_DIR/ensembl_snps.pl --database=$ENSEMBL_SNP_DB --cb-out=snps --cb-cmap=cmap.txt ; then
+ echo "Error: ensembl_snps.pl failed; aborting..."
+ exit 1
+ fi
+ else
+ echo "snps directory already present"
+ fi
+}
diff --git a/reftools/yeast_ensembl.sh b/reftools/yeast_ensembl.sh
new file mode 100755
index 0000000..1b62152
--- /dev/null
+++ b/reftools/yeast_ensembl.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+##
+# yeast_ensembl.sh
+#
+# Build a yeast (S. cerevisiae) reference jar from scratch using info
+# from the current version of Ensembl. Put results in subdirectory
+# called "yeast_ensembl_(ver)" where (ver) is the Ensembl version used.
+#
+# To build a colorspace version, run 'yeast_ensembl.sh .c -C'.
+#
+# Needs appropriate helper scripts to exist in $CROSSBOW_HOME/reftools.
+#
+# Needs bowtie-build to be in the current dir, in the
+# $CROSSBOW_BOWTIE_HOME directory, or in the $PATH.
+#
+# Needs 'mysql' to be in the $PATH.
+#
+# Needs a good deal of scratch space (~15GB) on the current partition
+# so that the script has enough space to produce its output and make
+# copies of certain large inputs, such as fasta files.
+#
+
+SUFFIX=$1 # optional suffix for the index name; any further arguments go to do_index
+[ $# -gt 0 ] && shift # a bare 'shift' with no arguments is fatal in POSIX shells such as dash
+ENSEMBL_VER=67
+ENSEMBL_SNP_VER=4
+ENSEMBL_PREFIX=Saccharomyces_cerevisiae.EF4.$ENSEMBL_VER
+ENSEMBL_ORGANISM=scerevisiae
+ENSEMBL_FTP=ftp://ftp.ensembl.org/pub/release-$ENSEMBL_VER/fasta/saccharomyces_cerevisiae/dna
+ENSEMBL_SNP_DB=saccharomyces_cerevisiae_variation_${ENSEMBL_VER}_${ENSEMBL_SNP_VER}
+INDEX=yeast_ensembl_${ENSEMBL_VER}$SUFFIX
+SIMPLE_NAME=$INDEX
+
+# Change to jar scratch directory
+mkdir -p $SIMPLE_NAME
+cd $SIMPLE_NAME || exit 1
+
+# Compose the list of fasta files to download
+BASE_CHRS=
+for i in I II III IV IX Mito V VI VII VIII X XI XII XIII XIV XV XVI ; do
+    BASE_CHRS="$BASE_CHRS chromosome.$i"
+done
+CHRS_TO_INDEX=$BASE_CHRS
+
+[ -z "$CROSSBOW_HOME" ] && echo "CROSSBOW_HOME not set" && exit 1
+. $CROSSBOW_HOME/reftools/shared.sh # '.' is the POSIX spelling; 'source' is a bashism
+
+check_prereqs
+find_bowtie_build
+do_index $*
+do_snps
+do_jar
+
+cd ..
diff --git a/soapsnp/COPYING b/soapsnp/COPYING
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/soapsnp/COPYING
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/soapsnp/binarize.cc b/soapsnp/binarize.cc
new file mode 100644
index 0000000..367b74c
--- /dev/null
+++ b/soapsnp/binarize.cc
@@ -0,0 +1,71 @@
+/*
+ * binarize.cc
+ *
+ * Created on: May 20, 2009
+ * Author: Ben Langmead
+ *
+ * Serialize binarized sequences to files so that they can be memory-
+ * mapped in future invocations.
+ */
+
+#include "soap_snp.h"
+#include <getopt.h>
+
+using namespace std;
+
+/*
+ * Write the binarize version/license banner to stderr and terminate the
+ * process with exit status 1.  The final return statement is never reached;
+ * it is only there to satisfy the int return type.
+ */
+int usage() {
+    cerr << "SoapSNP binarize version 1.02 " << endl
+         << "\nLicense GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>" << endl
+         << "This is free software: you are free to change and redistribute it." << endl
+         << "There is NO WARRANTY, to the extent permitted by law.\n" << endl;
+
+    exit(1);
+    return 0;
+}
+
+/*
+ * Help handler: shows exactly what usage() shows and hands back whatever
+ * usage() returns.
+ */
+int readme() {
+    const int status = usage();
+    return status;
+}
+
+/*
+ * Entry point of the binarize tool.
+ *
+ * Command-line options:
+ *   -d <file>  reference genome in FASTA format (required)
+ *   -s <file>  pre-formatted dbSNP table (optional)
+ *   -o <dir>   output directory (optional, default ".")
+ *   -2         accepted for command-line compatibility; nothing in this
+ *              tool reads the setting
+ *   -h, -?     print the banner and exit
+ *
+ * Opens the reference and dbSNP inputs, constructs a Genome from the two
+ * streams plus the output directory, and destroys it again.  All work
+ * happens inside the Genome constructor/destructor (per the file header,
+ * this is where the binarized sequences get serialized).
+ *
+ * Returns 0 on success; exits with status 1 when -d is missing or the
+ * reference file cannot be opened.
+ */
+int main(int argc, char **argv) {
+    int c;
+    string ref_seq, dbsnp, outdir = ".";
+    while((c = getopt(argc, argv, "d:s:o:2h?")) != -1) {
+        switch(c) {
+            case 'd': {
+                // The reference genome in fasta format
+                ref_seq = optarg;
+                break;
+            }
+            case 's': {
+                // Optional: A pre-formated dbSNP table
+                dbsnp = optarg;
+                break;
+            }
+            case 'o': {
+                // Optional: Output directory (default: .)
+                outdir = optarg;
+                break;
+            }
+            case '2': {
+                // Refine prior probability based on dbSNP information.
+                // The flag is still accepted, but binarize never used the
+                // value, so there is nothing to record.
+                break;
+            }
+            case 'h':readme();break;
+            case '?':usage();break;
+            default: cerr << "Unknown error in command line parameters" << endl;
+        }
+    }
+    if(ref_seq.empty()) {
+        cerr << "Error: Must specify reference sequence using -d" << endl;
+        usage();
+        // usage() exits on its own; this is a safety net should that change
+        exit(1);
+    }
+    ifstream ref_seq_in(ref_seq.c_str());
+    if(!ref_seq_in.is_open()) {
+        // Fail early instead of handing Genome a stream that cannot be read
+        cerr << "Error: Could not open reference sequence file " << ref_seq << endl;
+        exit(1);
+    }
+    // When -s is not given this opens the empty path, which leaves the
+    // stream in a failed state; that is how "no dbSNP table" is passed on.
+    ifstream dbsnp_in(dbsnp.c_str());
+    if(!dbsnp.empty() && !dbsnp_in.is_open()) {
+        // The table is optional, so only warn and carry on without it
+        cerr << "Warning: Could not open dbSNP table " << dbsnp << endl;
+    }
+    Genome * genome = new Genome(ref_seq_in, dbsnp_in, outdir.c_str());
+    delete genome;
+    return 0;
+}
diff --git a/soapsnp/call_genotype.cc b/soapsnp/call_genotype.cc
new file mode 100644
index 0000000..127acda
--- /dev/null
+++ b/soapsnp/call_genotype.cc
@@ -0,0 +1,584 @@
+#include "soap_snp.h"
+
+// Assign consecutive reference positions to every entry of sites: entry k
+// (for k in [0, read_len + win_size)) gets position start + k.
+// Always returns 1.
+int Call_win::initialize(ubit64_t start) {
+    std::string::size_type idx = 0;
+    while(idx != read_len + win_size) {
+        sites[idx].pos = idx + start;
+        ++idx;
+    }
+    return 1;
+}
+
+/*
+ * Prepare the sites buffer for the next window.
+ *
+ * sites holds read_len + win_size entries.  After this call the first
+ * read_len entries describe the start of the next window and the remaining
+ * win_size entries are cleared and numbered consecutively after them.
+ *
+ *   start == -1 and sites[win_size].depth > 0:
+ *       the read_len entries beginning at index win_size are copied down to
+ *       the front of the buffer, field by field.
+ *   start == -1 otherwise:
+ *       the first read_len entries are cleared; only their positions are
+ *       taken from the entries win_size further along.
+ *   start != -1:
+ *       the first read_len entries are cleared and numbered from start.
+ *
+ * Cleared entries get ori = 0xFF.  Always returns 1.
+ */
+int Call_win::recycle(int start) {
+    std::string::size_type i;
+    // Move the overhang (the read_len entries past the window) to the front
+    if(sites[win_size].depth > 0 && start == -1) {
+        for(i = 0; i != read_len ; i++) {
+            sites[i].pos = sites[i+win_size].pos;
+            sites[i].ori = sites[i+win_size].ori;
+            sites[i].depth = sites[i+win_size].depth;
+            sites[i].repeat_time = sites[i+win_size].repeat_time;
+            sites[i].dep_uni = sites[i+win_size].dep_uni;
+            // Each depth counter is carried over from its own field; these
+            // two used to be overwritten with dep_uni by mistake.
+            sites[i].dep_pair = sites[i+win_size].dep_pair;
+            sites[i].dep_uni_pair = sites[i+win_size].dep_uni_pair;
+#ifdef FAST_BOUNDS
+            sites[i].coordmin = sites[i+win_size].coordmin;
+            sites[i].coordmax = sites[i+win_size].coordmax;
+            sites[i].qmin = sites[i+win_size].qmin;
+            sites[i].qmax = sites[i+win_size].qmax;
+#endif
+            memcpy(sites[i].base_info, sites[i+win_size].base_info, sizeof(small_int)*4*2*64*256); // 4 types of bases, 2 strands, max quality score is 64, and max read length 256
+            memcpy(sites[i].count_uni, sites[i+win_size].count_uni, sizeof(int)*4);
+            memcpy(sites[i].q_sum, sites[i+win_size].q_sum, sizeof(int)*4);
+            memcpy(sites[i].count_all, sites[i+win_size].count_all, sizeof(int)*4);
+        }
+    } else {
+        // Nothing worth carrying over (or an explicit restart): wipe the front
+        Pos_info::clear(&sites[0], read_len);
+        if(start == -1) {
+            // Continue from where the previous window left off
+            for(i = 0; i != read_len ; i++) {
+                sites[i].ori = 0xFF;
+                sites[i].pos = sites[i+win_size].pos;
+            }
+        } else {
+            // Restart numbering at the caller-supplied position
+            for(i = 0; i != read_len ; i++) {
+                sites[i].ori = 0xFF;
+                sites[i].pos = start + i;
+            }
+        }
+    }
+    // Fill in a window's worth of 0s
+    Pos_info::clear(&sites[read_len], win_size);
+    for(i = read_len; i != read_len + win_size; i++) {
+        sites[i].ori = 0xFF;
+        // positions continue consecutively from the preceding entry
+        sites[i].pos = sites[i-1].pos+1;
+    }
+    return 1;
+}
+
+// Running tallies updated by Call_win::call_cns while it walks over reference
+// positions.  They are only declared here; the definitions live elsewhere.
+extern unsigned long poscalled; // positions called
+extern unsigned long poscalled_knownsnp; // ... where there was a known SNP
+extern unsigned long poscalled_uncov_uni; // ... uncovered by unique reads
+extern unsigned long poscalled_uncov; // ... uncovered by any reads
+extern unsigned long poscalled_n_no_depth; // ... where ref=N and there's no reads
+extern unsigned long poscalled_nonref; // ... where allele other than ref was called
+extern unsigned long poscalled_reported; // ... part of poscalled already covered by a progress report; grows in steps of report_every
+
+// A progress report is issued each time poscalled reaches a multiple of this value
+static unsigned long report_every = 100000;
+
+int Call_win::call_cns(Chr_name call_name,
+ Chr_info* call_chr,
+ ubit64_t call_length,
+ Prob_matrix * mat,
+ Parameter * para,
+ std::ofstream & consensus)
+{
+ std::string::size_type coord;
+ small_int k;
+ ubit64_t o_base, strand;
+ char allele1, allele2, genotype, type, type1/*best genotype*/, type2/*suboptimal genotype*/, base1, base2, base3;
+ int i, q_score, q_adjusted, qual1, qual2, qual3, q_cns, all_count1, all_count2, all_count3;
+ int global_dep_count, *pcr_dep_count;
+ pcr_dep_count = new int [para->read_length*2];
+ double rank_sum_test_value, binomial_test_value;
+ bool is_out;
+ double * real_p_prior = new double [16];
+
+ if(para->verbose) {
+ clog << " call_cns called with chr " << call_name
+ << ", first pos: " << sites[0].pos
+ << ", call length:" << call_length
+ << ", is SNP only: " << para->is_snp_only
+ << ", is region only: " << para->region_only
+ << ", get_regions().size(): " << call_chr->get_regions().size()
+ << ", <" << call_chr->get_regions()[0].first
+ << ", " << call_chr->get_regions()[0].second << ">" << endl;
+ }
+
+ // Special case: the user selected just one region in SNP-only
+ // mode; skip this window if it doesn't overlap that region
+ if(para->is_snp_only &&
+ para->region_only &&
+ call_chr->get_regions().size() == 1)
+ {
+ if(call_chr->get_regions()[0].first >= sites[0].pos + call_length) {
+ // Skip this window - too early
+ if(para->verbose) {
+ clog << " Skipping " << sites[0].pos << " because it's too early" << endl;
+ }
+ return -1;
+ }
+ if(call_chr->get_regions()[0].second <= sites[0].pos) {
+ // Skip this window - too late
+ if(para->verbose) {
+ clog << " Skipping " << sites[0].pos << " because it's too late" << endl;
+ }
+ return -2;
+ }
+ }
+ // Iterate over every reference position that we'd like to call
+ for(std::string::size_type j = 0; j != call_length; j++) {
+ if(para->region_only && !call_chr->is_in_region(sites[j].pos)) {
+ // Skip region that user asked us to skip using -T
+ continue;
+ }
+ if((++poscalled % report_every) == 0) {
+ poscalled_reported += report_every;
+ if(para->verbose) {
+ clog << " Processed " << poscalled << " positions" << endl;
+ }
+ if(para->hadoop_out) {
+ cerr << "reporter:counter:SOAPsnp,Positions called," << report_every << endl;
+ }
+ }
+ // Get "original" reference base
+ sites[j].ori = (call_chr->get_bin_base(sites[j].pos))&0xF;
+ // Check whether this is a known SNP that we should dump the
+ // consensus for even if -q is specified
+ bool known_snp = (((sites[j].ori & 0x8) != 0) && para->dump_dbsnp_evidence);
+ if((sites[j].ori & 0x8) != 0) poscalled_knownsnp++;
+
+ // Check whether we can skip this reference position entirely
+ // because (a) we're only interested in SNPs, and (b) the
+ // position is not covered by any evidence that we can use to
+ // call SNPs.
+ if(sites[j].dep_uni == 0) poscalled_uncov_uni++;
+ if(sites[j].depth == 0) poscalled_uncov++;
+ if(sites[j].dep_uni == 0 && para->is_snp_only) {
+ assert(sites[j].count_uni[0] == 0);
+ assert(sites[j].count_uni[1] == 0);
+ assert(sites[j].count_uni[2] == 0);
+ assert(sites[j].count_uni[3] == 0);
+ if(known_snp) {
+ // This is a known-SNP site that is not covered by any
+ // alignments; if the user asked us to dump all dbSNP
+ // evidence, then just print a brief record indicating
+ // there was no coverage at the site.
+ consensus << "K"
+ << '\t' << call_name // chromosome name
+ << '\t' << (sites[j].pos+1)
+ << '\t' << ("ACTGNNNN"[(sites[j].ori & 0x7)]) // ref allele
+ << '\t' << "no-coverage"
+ << endl;
+ }
+ continue;
+ }
+ // N on the reference, no "depth"
+ bool n_no_dep = ((sites[j].ori & 4) != 0)/*an N*/ && sites[j].depth == 0;
+ if(n_no_dep) poscalled_n_no_depth++;
+ if(!para->is_snp_only && n_no_dep) {
+ // CNS text format:
+ // ChrID\tPos\tRef\tCns\tQual\tBase1\tAvgQ1\tCountUni1\tCountAll1\tBase2\tAvgQ2\tCountUni2\tCountAll2\tDepth\tRank_sum\tCopyNum\tSNPstauts\n"
+ if(!para->glf_format) {
+ consensus << call_name
+ << '\t'
+ << (sites[j].pos+1)
+ << "\tN\tN\t0\tN\t0\t0\t0\tN\t0\t0\t0\t0\t1.000\t255.000\t0"
+ << endl;
+ }
+ else if (para->glf_format) {
+ consensus << (unsigned char)(0xF<<4|0) << (unsigned char)(0<<4|0xF)<<flush;
+ for(type=0;type!=10;type++) {
+ consensus<<(unsigned char)0;
+ }
+ consensus<<flush;
+ if(!consensus.good()) {
+ cerr<<"Broken ofstream after writting Position "<<(sites[j].pos+1)<<" at "<<call_name<<endl;
+ exit(255);
+ }
+ }
+ continue;
+ }
+ base1 = 0, base2 = 0, base3 = 0;
+ qual1 = -1, qual2 = -2, qual3 = -3;
+ all_count1 = 0, all_count2 = 0, all_count3 = 0;
+ // .dep_uni = Depth of unique bases?
+ if(sites[j].dep_uni) {
+ // This position is uniquely covered by at least one
+ // nucleotide. BTL: This loop seems to collect the most
+ // frequent three bases according to sum-of-Phred-calls
+ // for that base. sites[].q_sum is already calculated
+ for(i = 0; i != 4; i++) {
+ // i is four kind of alleles
+ if(sites[j].q_sum[i] >= qual1) {
+ base3 = base2;
+ qual3 = qual2;
+ base2 = base1;
+ qual2 = qual1;
+ base1 = i;
+ qual1 = sites[j].q_sum[i];
+ }
+ else if (sites[j].q_sum[i] >= qual2) {
+ base3 = base2;
+ qual3 = qual2;
+ base2 = i;
+ qual2 = sites[j].q_sum[i];
+ }
+ else if (sites[j].q_sum[i] >= qual3) {
+ base3 = i;
+ qual3 = sites[j].q_sum[i];
+ }
+ else {
+ ;
+ }
+ }
+ if(qual1 == 0) {
+ // Adjust the best base so that things won't look ugly
+ // if the pos is not covered
+ base1 = (sites[j].ori & 7);
+ }
+ else if(qual2 ==0 && base1 != (sites[j].ori & 7)) {
+ base2 = (sites[j].ori & 7);
+ }
+ else {
+ ;
+ }
+ } // if(sites[j].dep_uni)
+ else {
+ // This position is covered by all repeats
+ for(i = 0; i != 4; i++) {
+ if(sites[j].count_all[i] >= all_count1) {
+ base3 = base2;
+ all_count3 = all_count2;
+ base2 = base1;
+ all_count2 = all_count1;
+ base1 = i;
+ all_count1 = sites[j].count_all[i];
+ }
+ else if (sites[j].count_all[i] >= all_count2) {
+ base3 = base2;
+ all_count3 = all_count2;
+ base2 = i;
+ all_count2 = sites[j].count_all[i];
+ }
+ else if (sites[j].count_all[i] >= all_count3) {
+ base3 = i;
+ all_count3 = sites[j].count_all[i];
+ }
+ }
+ if(all_count1 == 0) {
+ // none found
+ base1 = (sites[j].ori&7);
+ }
+ else if(all_count2 == 0 && base1 != (sites[j].ori&7)) {
+ base2 = (sites[j].ori&7);
+ }
+ }
+
+ // Calculate likelihood
+ for(genotype = 0; genotype != 16; genotype++){
+ mat->type_likely[genotype] = 0.0;
+ }
+
+ //
+ // The next set of nested loops is looping over (a) the H, q
+ // and c dimensions of the 4-dim recal matrix, then (b) over
+ // all aligned bases matching that H, q and c, then (c) over
+ // all possible alleles for the current reference position.
+ // The result is that each aligned base's mojo gets spread
+ // across the candidate alleles according to the equations in
+ // the Genome Res paper.
+ //
+
+#ifdef FAST_BOUNDS
+ char qmin = (sites[j].qmin == 0 ? 1 : sites[j].qmin-1);
+ char qmax = (sites[j].qmax == 0 ? 0 : sites[j].qmax-1);
+ small_int coordmin = (sites[j].coordmin == 0 ? 1 : sites[j].coordmin-1);
+ small_int coordmax = (sites[j].coordmax == 0 ? 0 : sites[j].coordmax-1);
+#endif
+ // Looping over haplo-genotypes (H) in the 4-dim table?
+ for(o_base = 0; o_base != 4; o_base++) {
+ if(sites[j].count_uni[o_base] == 0) {
+ // No unique alignments with this reference haplotype
+ continue;
+ }
+ // Reset the
+ global_dep_count = -1;
+ memset(pcr_dep_count, 0, sizeof(int) * 2 * para->read_length);
+ // Looping over quality scores (q) in the 4-dim table
+#ifdef FAST_BOUNDS
+ for(q_score = qmax; q_score >= qmin; q_score--) {
+#else
+ for(q_score = para->q_max - para->q_min; q_score != -1; q_score--) {
+#endif
+ // Looping over cycles (c) in the 4-dim table
+#ifdef FAST_BOUNDS
+ for(coord = coordmin; coord <= coordmax; coord++) {
+#else
+ for(coord = 0; coord != para->read_length; coord++) {
+#endif
+ // Looping over reference strands
+ for(strand = 0; strand != 2; strand++) {
+ // Now iterate over all the aligned bases with:
+ // (a) character 'o_base'
+ // (b) ...aligned to reference strand 'strand'
+ // (c) ...with quality score 'q_score'
+ // (d) ...generated in sequencing cycle 'coord'
+ const int bi = o_base << 15 | strand << 14 | q_score << 8 | coord;
+ for(k = 0; k != sites[j].base_info[bi]; k++) {
+ // pcr_dep_count is indexed by coordinate,
+ // and cares about which strand was read
+ if(pcr_dep_count[strand*para->read_length+coord] == 0) {
+ global_dep_count += 1; // sets it to 0
+ }
+ pcr_dep_count[strand*para->read_length+coord] += 1;
+ // This is where the dependency coefficient
+ // is calculated and taken into account.
+ // q_score is iterated over in an outer
+ // loop.
+ q_adjusted = int( pow(10, (log10(q_score) +
+ (pcr_dep_count[strand*para->read_length+coord]-1) *
+ para->pcr_dependency +
+ global_dep_count*para->global_dependency)) + 0.5 );
+ if(q_adjusted < 1) {
+ q_adjusted = 1;
+ }
+ // For all 10 diploid alleles...
+ for(allele1 = 0; allele1 != 4; allele1++) {
+ for(allele2 = allele1; allele2 != 4; allele2++) {
+ // Here's where we calculate P(D|T)
+ // given all the P(dk|T)s
+ double hm = mat->p_matrix[((ubit64_t)q_adjusted << 12) | (coord << 4) | (allele1 << 2) | o_base];
+ double hn = mat->p_matrix[((ubit64_t)q_adjusted << 12) | (coord << 4) | (allele2 << 2) | o_base];
+ mat->type_likely[allele1 << 2 | allele2] +=
+ // Here's where we calculate
+ // P(dk|T) given P(dk|Hm) and
+ // P(dk|Hn); see p8 of the
+ // Genome Res paper
+ log10(0.5 * hm + 0.5 * hn);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ //
+ // The GLF format takes information about copy-number depth.
+ //
+ if(1==para->glf_format) {
+ // Generate GLFv2 format
+ int copy_num;
+ if(sites[j].depth == 0) {
+ copy_num = 15;
+ }
+ else {
+ copy_num = int(1.442695041*log(sites[j].repeat_time/sites[j].depth));
+ if(copy_num > 15) {
+ copy_num = 15;
+ }
+ }
+ if(sites[j].depth > 255) {
+ sites[j].depth = 255;
+ }
+ consensus << (unsigned char)(glf_base_code[sites[j].ori&7]<<4|((sites[j].depth>>4)&0xF))<<(unsigned char)((sites[j].depth&0xF)<<4|copy_num&0xF)<<flush;
+ type1 = 0;
+ // Find the largest likelihood
+ for (allele1=0; allele1!=4; allele1++) {
+ for (allele2=allele1; allele2!=4; allele2++) {
+ genotype = allele1 << 2 | allele2;
+ if (mat->type_likely[genotype] > mat->type_likely[type1]) {
+ type1 = genotype;
+ }
+ }
+ }
+ for(type = 0; type != 10; type++) {
+ if(mat->type_likely[type1] -
+ mat->type_likely[glf_type_code[type]] > 25.5)
+ {
+ consensus << (unsigned char)255;
+ } else {
+ consensus << (unsigned char)(unsigned int)
+ (10 * (mat->type_likely[type1] -
+ mat->type_likely[glf_type_code[type]]));
+ }
+ }
+ consensus << flush;
+ if(!consensus.good()) {
+ cerr << "Broken ofstream after writing Position " << (sites[j].pos+1) << " at " << call_name << endl;
+ exit(255);
+ }
+ continue;
+ }
+ // Calculate prior probability
+ memcpy(real_p_prior, &mat->p_prior[((ubit64_t)sites[j].ori&0x7)<<4], sizeof(double)*16);
+ if ( (sites[j].ori & 0x8) && para->refine_mode) {
+ // Refine the prior probability by taking into account that
+ // this position is the site of a known SNP
+ snp_p_prior_gen(real_p_prior, call_chr->find_snp(sites[j].pos), para, sites[j].ori);
+ }
+ // Given priors and likelihoods, calculate posteriors and keep
+ // the two genotypes with the highest posterior probabilities.
+ memset(mat->type_prob, 0, sizeof(rate_t) * 17);
+ type2 = type1 = 16;
+ for (allele1 = 0; allele1 != 4; allele1++) {
+ for (allele2 = allele1; allele2 != 4; allele2++) {
+ genotype = allele1 << 2 | allele2;
+ if (para->is_monoploid && allele1 != allele2) {
+ continue;
+ }
+ mat->type_prob[genotype] = mat->type_likely[genotype] + log10(real_p_prior[genotype]) ;
+
+ if (mat->type_prob[genotype] >= mat->type_prob[type1] || type1 == 16) {
+ type2 = type1;
+ type1 = genotype; // new most-likely genotype
+ }
+ else if (mat->type_prob[genotype] >= mat->type_prob[type2] || type2 ==16) {
+ type2 = genotype; // new second-most-likely genotype
+ }
+ }
+ }
+ if(2 == para->glf_format) {
+ // Generate GLFv2 format
+ int copy_num;
+ if(sites[j].depth == 0) {
+ copy_num = 15;
+ }
+ else {
+ copy_num = int(1.442695041*log(sites[j].repeat_time/sites[j].depth));
+ if(copy_num>15) {
+ copy_num = 15;
+ }
+ }
+ if(sites[j].depth >255) {
+ sites[j].depth = 255;
+ }
+ consensus<<(unsigned char)(glf_base_code[sites[j].ori&7]<<4|((sites[j].depth>>4)&0xF))<<(unsigned char)((sites[j].depth&0xF)<<4|copy_num&0xF)<<flush;
+ type1 = 0;
+ // Find the largest likelihood
+ for (allele1=0; allele1!=4; allele1++) {
+ for (allele2=allele1; allele2!=4; allele2++) {
+ genotype = allele1<<2|allele2;
+ if (mat->type_prob[genotype] > mat->type_prob[type1]) {
+ type1 = genotype;
+ }
+ }
+ }
+ for(type=0;type!=10;type++) {
+ if(mat->type_prob[type1]-mat->type_prob[glf_type_code[type]]>25.5) {
+ consensus<<(unsigned char)255;
+ }
+ else {
+ consensus<<(unsigned char)(unsigned int)(10*(mat->type_prob[type1]-mat->type_prob[glf_type_code[type]]));
+ }
+ }
+ consensus<<flush;
+ if(!consensus.good()) {
+ cerr<<"Broken ofstream after writting Position "<<(sites[j].pos+1)<<" at "<<call_name<<endl;
+ exit(255);
+ }
+ continue;
+ }
+ is_out = true; // Check if the position needs to be output, useful in snp-only mode
+
+ if (para->rank_sum_mode) {
+ rank_sum_test_value = rank_test(sites[j], type1, mat->p_rank, para);
+ }
+ else {
+ rank_sum_test_value = 1.0;
+ }
+
+ if(rank_sum_test_value == 0.0) {
+ // avoid double genotype overflow
+ q_cns = 0;
+ }
+ else {
+ // Quality of the consensus call is related to the
+ // difference between the probabilities of the first and
+ // second most probable calls.
+ q_cns = (int)(10*(mat->type_prob[type1] -
+ mat->type_prob[type2]) +
+ 10*log10(rank_sum_test_value));
+ }
+
+ if ((type1 & 3) == ((type1 >> 2) & 3)) { // Called Homozygous
+ if (qual1 > 0 && base1 != (type1 & 3)) {
+ // Wired: best base is not the consensus!
+ q_cns = 0;
+ }
+ else if (/*qual2>0 &&*/ q_cns > qual1-qual2) {
+ // Should not bigger than this
+ q_cns = qual1-qual2;
+ }
+ }
+ else { // Called Heterozygous
+ if(sites[j].q_sum[base1] > 0 &&
+ sites[j].q_sum[base2] > 0 &&
+ type1 == (base1 < base2 ? (base1 << 2 | base2) : (base2 << 2 | base1)))
+ {
+ // The best bases are in the heterozygote
+
+ // Quality is limited by the difference in quality
+ // between the second-best call and the third-best call
+ if (q_cns > qual2-qual3) {
+ q_cns = qual2-qual3;
+ }
+ }
+ else { // Ok, wired things happened
+ q_cns = 0;
+ }
+ }
+ if(q_cns > 99) {
+ q_cns = 99;
+ }
+ if (q_cns < 0) {
+ q_cns = 0;
+ }
+ // ChrID\tPos\tRef\tCns\tQual\tBase1\tAvgQ1\tCountUni1\tCountAll1\tBase2\tAvgQ2\tCountUni2\tCountAll2\tDepth\tRank_sum\tCopyNum\tSNPstauts\n"
+ bool non_ref = (abbv[type1] != "ACTGNNNN"[(sites[j].ori&0x7)] && sites[j].depth > 0);
+ if(non_ref) poscalled_nonref++;
+ if(!para->is_snp_only || known_snp || non_ref) {
+ if(base1 < 4 && base2 < 4) {
+ if(known_snp && !non_ref) consensus << "K\t";
+ consensus << call_name // chromosome name
+ << '\t' << (sites[j].pos+1) // position
+ << '\t' << ("ACTGNNNN"[(sites[j].ori & 0x7)]) // reference allele
+ << '\t' << abbv[type1] // called type
+ << '\t' << q_cns // quality of call
+ << '\t' << ("ACTGNNNN"[base1]) // base1 call
+ << '\t' << (sites[j].q_sum[base1] == 0 ? 0 : sites[j].q_sum[base1]/sites[j].count_uni[base1])
+ << '\t' << sites[j].count_uni[base1]
+ << '\t' << sites[j].count_all[base1]
+ << '\t' << ("ACTGNNNN"[base2]) // base2 call
+ << '\t' << (sites[j].q_sum[base2]==0?0:sites[j].q_sum[base2]/sites[j].count_uni[base2])
+ << '\t' << sites[j].count_uni[base2]
+ << '\t' << sites[j].count_all[base2]
+ << '\t' << sites[j].depth
+ << '\t' << sites[j].dep_pair
+ << '\t' << showpoint << rank_sum_test_value
+ << '\t' << (sites[j].depth == 0 ? 255 : (double)(sites[j].repeat_time)/sites[j].depth)
+ << '\t' << ((sites[j].ori & 8) ? 1 : 0) // dbSNP locus?
+ << endl;
+ }
+ else if(base1 < 4) {
+ if(known_snp && !non_ref) consensus << "K\t";
+ consensus << call_name // chromosome name
+ << '\t' << (sites[j].pos+1) // position
+ << '\t' << ("ACTGNNNN"[(sites[j].ori&0x7)]) // reference char
+ << '\t' << abbv[type1] // called type
+ << '\t' << q_cns // quality of call
+ << '\t' << ("ACTGNNNN"[base1]) // first heterozygous base
+ << '\t' << (sites[j].q_sum[base1] == 0 ? 0 : sites[j].q_sum[base1]/sites[j].count_uni[base1])
+ << '\t' << sites[j].count_uni[base1]
+ << '\t' << sites[j].count_all[base1]
+ << '\t' << "N\t0\t0\t0"
+ << '\t' << sites[j].depth
+ << '\t' << sites[j].dep_pair
+ << '\t' << showpoint << rank_sum_test_value
+ << '\t' << (sites[j].depth == 0 ? 255 : (double)(sites[j].repeat_time)/sites[j].depth)
+ << '\t' << ((sites[j].ori & 8) ? 1 : 0) // dbSNP locus?
+ << endl;
+ }
+ else {
+ if(known_snp && !non_ref) consensus << "K\t";
+ consensus << call_name
+ << '\t'
+ << (sites[j].pos+1)
+ << "\tN\tN\t0\tN\t0\t0\t0\tN\t0\t0\t0\t0\t0\t1.000\t255.000\t0"
+ << endl;
+ }
+ }
+ }
+ delete [] real_p_prior;
+ delete [] pcr_dep_count;
+ return 1;
+}
diff --git a/soapsnp/chromosome.cc b/soapsnp/chromosome.cc
new file mode 100644
index 0000000..8f9e5c3
--- /dev/null
+++ b/soapsnp/chromosome.cc
@@ -0,0 +1,254 @@
+#include "soap_snp.h"
+
+/**
+ * Insert a mapping from a chromosome name to a pointer to a freshly
+ * allocated chromosome info structure.
+ *
+ * @param name chromosome name used as the key in the chromosomes map
+ * @return true if the name was newly inserted; false if an entry with
+ *         that name already existed (the map is then left unchanged)
+ */
+bool Genome::add_chr(Chr_name & name) {
+    Chr_info * new_chr = new Chr_info;
+    pair<map<Chr_name, Chr_info*>::iterator, bool> insert_pair;
+    insert_pair = chromosomes.insert(pair<Chr_name, Chr_info*>(name, new_chr));
+    if(!insert_pair.second) {
+        // std::map::insert does not store the value on a duplicate key,
+        // so nothing else refers to new_chr; release it instead of
+        // leaking it.
+        delete new_chr;
+    }
+    return insert_pair.second;
+}
+
+/**
+ * Destructor. Walks the chromosome map but performs no per-entry
+ * cleanup, so the Chr_info objects allocated by add_chr are not freed
+ * here.
+ */
+Genome::~Genome(){
+    map<Chr_name, Chr_info*>::iterator entry = chromosomes.begin();
+    while(entry != chromosomes.end()) {
+        // Nothing to do for this entry.
+        entry++;
+    }
+}
+/**
+ * Copy constructor.
+ *
+ * Deep-copies the packed sequence words and, when the source has one,
+ * the region mask. The dbsnp map is copied by value, so the copy holds
+ * the same Snp_info pointers as the original (shallow copy of the
+ * pointed-to objects).
+ *
+ * @param other chromosome to copy from
+ */
+Chr_info::Chr_info(const Chr_info & other) {
+    dbsnp = other.dbsnp;
+    len = other.len;
+    elts = other.elts;
+
+    // Number of 64-bit words needed to hold len bases, rounded up so the
+    // trailing partial word is included in both the allocation and the
+    // copy.
+    const ubit64_t seq_words = (len%capacity==0) ? len/capacity : 1+len/capacity;
+    bin_seq = new ubit64_t [seq_words];
+    if(seq_words != 0) {
+        if(other.bin_seq != NULL) {
+            memcpy(bin_seq, other.bin_seq, sizeof(ubit64_t)*seq_words);
+        }
+        else {
+            // Source has a length but no sequence buffer; avoid copying
+            // from NULL and leave the copy zero-filled.
+            memset(bin_seq, 0, sizeof(ubit64_t)*seq_words);
+        }
+    }
+
+    // The mask holds one bit per base (same sizing as region_mask_ini).
+    // NULL means "no mask allocated", which is what read_region tests for.
+    region_mask = NULL;
+    if(other.region_mask != NULL) {
+        const ubit64_t mask_words = (len%64==0) ? len/64 : len/64+1;
+        region_mask = new ubit64_t [mask_words];
+        memcpy(region_mask, other.region_mask, sizeof(ubit64_t)*mask_words);
+    }
+    regions = other.regions;
+}
+
+/**
+ * Pack a chromosome's character sequence into bin_seq, 4 bits per base.
+ *
+ * The low three bits of each nibble are bits 1-3 of the character code
+ * ((c >> 1) & 7), which maps 'A'->0, 'C'->1, 'T'->2, 'G'->3 and 'N'->7.
+ * The fourth bit of each nibble is left clear here.
+ *
+ * @param seq the full sequence of the chromosome
+ * @return always 1
+ */
+int Chr_info::binarize(std::string & seq) {
+    len = seq.length();
+
+    // Words needed to hold len bases at `capacity` bases per word,
+    // rounded up for a trailing partial word.
+    elts = len/capacity;
+    if (len%capacity != 0) {
+        elts += 1;
+    }
+    bin_seq = new ubit64_t [elts];
+    memset(bin_seq, 0, sizeof(ubit64_t)*elts);
+
+    const std::string::size_type n_bases = seq.length();
+    for(std::string::size_type idx = 0; idx != n_bases; idx++) {
+        // 7 is 0b111: keep three bits of the character code
+        const ubit64_t code = (((ubit64_t)seq[idx]) >> 1) & 7;
+        bin_seq[idx/capacity] |= (code << (idx%capacity*4));
+    }
+    return 1;
+}
+
+/**
+ * Write the raw bin_seq words (elts of them) to the named file in
+ * binary mode.
+ *
+ * @param fn path of the file to write
+ */
+void Chr_info::dump_binarized(std::string fn) {
+    const ios_base::openmode mode = ios_base::binary | ios_base::out;
+    ofstream out_file(fn.c_str(), mode);
+    const char * raw_words = (const char *)bin_seq;
+    out_file.write(raw_words, elts*sizeof(ubit64_t));
+    out_file.close();
+}
+
+/**
+ * Record a known SNP at the given 0-based position and mark that
+ * position in the packed sequence (bit 3 of the position's nibble).
+ *
+ * @param pos      0-based position on this chromosome
+ * @param snp_form SNP record to copy into the dbsnp map
+ * @param quiet    suppress the duplicate / out-of-range warnings
+ * @return 1 if the SNP was inserted, 0 if it was rejected (position out
+ *         of range, position already occupied, or map insertion failed)
+ */
+int Chr_info::insert_snp(std::string::size_type pos, Snp_info & snp_form, bool quiet) {
+    // Marking the SNP writes bin_seq[pos/capacity]; refuse positions that
+    // fall outside the binarized sequence rather than write out of bounds.
+    if(pos >= len) {
+        if(!quiet) {
+            cerr << "Warning: SNP with name " << snp_form.get_name()
+                 << " at position " << pos << " is beyond the end of the chromosome" << endl;
+        }
+        return 0;
+    }
+    // Check for an existing entry before allocating, so the rejected
+    // paths do not leak a Snp_info.
+    map<ubit64_t, Snp_info*>::iterator existing = dbsnp.find(pos);
+    if(existing != dbsnp.end()) {
+        if(!quiet) {
+            cerr << "Warning: SNP has already been inserted at position " << pos << endl;
+            cerr << " new SNP: " << snp_form.get_name()
+                 << ", old SNP: " << existing->second->get_name() << endl;
+        }
+        return 0;
+    }
+    Snp_info * new_snp = new Snp_info;
+    *new_snp = snp_form;
+    pair<ubit64_t, Snp_info*> p(pos,new_snp);
+    pair<map<ubit64_t, Snp_info*>::iterator, bool> insert_pair = dbsnp.insert(p);
+    if(!insert_pair.second) {
+        // The map did not take ownership; free the copy.
+        delete new_snp;
+        cerr << "Warning: SNP insertion failed for SNP with name "
+             << snp_form.get_name() << " at position " << pos << endl;
+        return 0;
+    }
+    // Successful insertion
+    // Modify the binary sequence! Mark SNPs
+    bin_seq[pos/capacity] |= (1ULL<<(pos%capacity*4+3));
+    return 1;
+}
+
+/**
+ * Mark the inclusive range [start, end] of 0-based positions in the
+ * region mask and append it to the regions list.
+ *
+ * Both bounds are clamped to the chromosome before use. Within a mask
+ * word, position p is stored at bit (63 - p%64), i.e. most significant
+ * bit first. The mask must already have been allocated (region_mask_ini).
+ *
+ * Exits the program with status 255 if, after clamping, start > end.
+ *
+ * @param start first position of the region (clamped to [0, len])
+ * @param end   last position of the region (clamped to [0, len-1])
+ * @return always 1
+ */
+int Chr_info::set_region(int start, int end) {
+    if(start<0) {
+        start = 0;
+    }
+    else if (start >= len) {
+        start = len;
+    }
+
+    if(end<0) {
+        end = 0;
+    }
+    else if (end >= len) {
+        // BTL: Modified from 'end = len' per bug report
+        end = len - 1;
+    }
+    if (start > end) {
+        cerr<<"Invalid region: "<<start<<"-"<<end<<endl;
+        exit(255);
+    }
+    if(start/64 == end/64) {
+        // Region lies inside a single mask word.
+        const int width = end - start + 1;
+        // A full-word region (width == 64) must be special-cased:
+        // shifting a 64-bit value by 64 is undefined behaviour and in
+        // practice can yield an empty mask.
+        const ubit64_t ones = (width >= 64) ? ~(0ULL) : ~((~(0ULL))<<width);
+        region_mask[start/64] |= (ones<<(63-end%64));
+    }
+    else {
+        // First word: from start to the end of the word.
+        if(start % 64) {
+            region_mask[start/64] |= (~((~(0ULL))<<(64-start%64)));
+        }
+        else {
+            region_mask[start/64] = ~(0ULL);
+        }
+        // Last word: from the beginning of the word through end.
+        region_mask[end/64] |= ((~(0ULL))<<(63-end%64));
+        // Any whole words strictly in between are set entirely.
+        if(end/64-start/64>1) {
+            memset(region_mask+start/64+1, 0xFF, sizeof(ubit64_t)*(end/64-start/64-1));
+        }
+    }
+    regions.push_back(make_pair(start, end));
+    return 1;
+}
+
+/**
+ * Allocate the region mask, one bit per base (64 bases per word,
+ * rounded up), with every bit cleared.
+ *
+ * @return always 1
+ */
+int Chr_info::region_mask_ini(){
+    const bool exact_fit = (len%64==0);
+    if(exact_fit) {
+        region_mask = new ubit64_t [len/64];
+        memset(region_mask, 0, sizeof(ubit64_t)*(len/64));
+        return 1;
+    }
+    // One extra word for the trailing partial group of bases
+    region_mask = new ubit64_t [len/64+1];
+    memset(region_mask, 0, sizeof(ubit64_t)*(len/64+1));
+    return 1;
+}
+
+/**
+ * Read and parse a region file, specified via the -T option.
+ *
+ * Each line is read as three whitespace-separated fields: chromosome
+ * name, start, end. For every line naming a known chromosome the region
+ * passed to Chr_info::set_region is (start - para->read_length, end - 1).
+ * Lines naming an unknown chromosome are reported on stderr and skipped.
+ *
+ * @param region open stream over the region file
+ * @param para   program parameters; only read_length is used here
+ * @return 1 on success, 0 as soon as a line fails to parse (regions from
+ *         earlier lines have already been applied at that point)
+ */
+int Genome::read_region(std::ifstream & region, Parameter * para) {
+    Chr_name current_name(""), prev_name("");
+    int start, end;
+    // end() doubles as "no valid chromosome cached", so a lookup is
+    // always redone after an unknown-chromosome line.
+    map<Chr_name, Chr_info*>::iterator chr_iter = chromosomes.end();
+    // Lines appear to be formatted as: name, start, end
+    for(std::string buff; getline(region,buff); ) {
+        std::istringstream s(buff);
+        if(!(s >> current_name >> start >> end)) {
+            cerr<<"Wrong format in target region file"<<endl;
+            return 0;
+        }
+        if(current_name != prev_name || chr_iter == chromosomes.end()) {
+            chr_iter = chromosomes.find(current_name);
+            if(chr_iter == chromosomes.end()) {
+                // Chromosome was not known
+                cerr << "Unexpected Chromosome:" << current_name<<endl;
+                continue;
+            }
+            if(NULL == chr_iter->second->get_region()) {
+                chr_iter->second->region_mask_ini();
+            }
+        }
+        chr_iter->second->set_region(start-para->read_length, end-1);
+        prev_name = current_name;
+    }
+    return 1;
+}
+
+/**
+ * Read and parse a genome from a single fasta file, which is assumed
+ * to be organized by chromosome. Also read and parse the SNP file.
+ *
+ * @param fasta     open stream over the reference FASTA
+ * @param known_snp stream over the known-SNP table; skipped entirely if
+ *                  the stream is not in a good state
+ * @param quiet     forwarded to Chr_info::insert_snp; also suppresses
+ *                  the warning for unparseable SNP lines
+ */
+Genome::Genome(std::ifstream &fasta, std::ifstream & known_snp, bool quiet)
+{
+    // As we read in the characters, we store them in seq. We
+    // eventually binarize them into the bin_seq field of the
+    // respective Chr_info
+    std::string seq("");
+    Chr_name current_name("");
+    map<Chr_name, Chr_info*>::iterator chr_iter;
+    // Read the fasta file
+    size_t lines = 0, chars = 0;
+    for(std::string buff; getline(fasta,buff); ) {
+        lines++;
+        // Name line? (an empty line is treated as sequence data)
+        if(!buff.empty() && '>' == buff[0]) {
+            // Fasta id
+            // Deal with previous chromosome
+            chr_iter = chromosomes.find(current_name);
+            if(chr_iter != chromosomes.end()) {
+                // The previous chromosome is finished, so binarize it
+                chr_iter->second->binarize(seq);
+            }
+            // Insert new chromosome: the name is everything after '>'
+            // up to the first whitespace. The bounds test comes first so
+            // buff[i] is never read past the end, and the character is
+            // widened via unsigned char as isspace requires.
+            std::string::size_type i;
+            for(i = 1; i != buff.length() && !isspace((unsigned char)buff[i]); i++) {
+                ;
+            }
+            Chr_name new_chr_name(buff, 1, i-1);
+            if(!add_chr(new_chr_name)) {
+                std::cerr << "Insert Chromosome " << new_chr_name << " Failed!\n";
+            }
+            current_name = new_chr_name;
+            seq = "";
+        }
+        else {
+            // Append line to sequence
+            chars += buff.length();
+            seq += buff;
+        }
+    }
+    clog << "Read " << chars << " from " << lines << " lines of input FASTA sequence "; logTime(); clog << endl;
+    if(seq.length() != 0 && chromosomes.find(current_name) != chromosomes.end()) {
+        // Binarize the final chromosome
+        chr_iter = chromosomes.find(current_name);
+        chr_iter->second->binarize(seq);
+    }
+    clog << "Finished loading and binarizing chromosome "; logTime(); clog << endl;
+    lines = 0;
+    if(known_snp) {
+        // Read in the SNP file
+        Chr_name snp_chr_name;
+        Snp_info snp_form;
+        std::string::size_type pos;
+        for(std::string buff; getline(known_snp, buff); ) {
+            // Format: Chr\tPos\thapmap?\tvalidated?\tis_indel?\tA\tC\tT\tG\trsID\n
+            lines++;
+            std::istringstream s(buff);
+            // Read chromosome name and position. A line that does not
+            // parse, or whose 1-based position is 0, is skipped: using
+            // it would reuse stale values or underflow pos below.
+            if(!(s >> snp_chr_name >> pos) || pos == 0) {
+                if(!quiet) {
+                    cerr << "Warning: skipping malformed known-SNP line " << lines << endl;
+                }
+                continue;
+            }
+            // Snp_info has a special operator>> that reads the rest
+            // of the line; see soap_snp.h
+            s >> snp_form;
+            chr_iter = chromosomes.find(snp_chr_name);
+            if(chr_iter != chromosomes.end()) {
+                // The SNP is located on an valid chromosome
+                pos -= 1; // Coordinates starts from 0
+                // Stick the SNP in a chromosome-specific map that maps
+                // positions to SNP_Infos
+                chr_iter->second->insert_snp(pos, snp_form, quiet);
+            }
+        }
+        // Now possibly dump SNPs
+    }
+    clog << "Finished parsing " << lines << " known SNPs "; logTime(); clog << endl;
+}
diff --git a/soapsnp/main.cc b/soapsnp/main.cc
new file mode 100644
index 0000000..66de1d4
--- /dev/null
+++ b/soapsnp/main.cc
@@ -0,0 +1,460 @@
+#include "soap_snp.h"
+#include <getopt.h>
+
+using namespace std;
+
+/**
+ * Print the command-line help and license notice to stderr, then
+ * terminate the process with exit status 1.
+ *
+ * @return never returns; the trailing return only satisfies the int
+ *         signature
+ */
+int usage() {
+    cerr<<"SoapSNP version 1.02, Crossbow modifications (last changed 10/10/2010)"<<endl;
+    cerr<<"Compulsory Parameters:"<<endl;
+    cerr<<"-i <FILE> Input SORTED Soap Result"<<endl;
+    cerr<<"-d <FILE> Reference Sequence in fasta format"<<endl;
+    cerr<<"-o <FILE> Output consensus file"<<endl;
+    cerr<<"Optional Parameters:(Default in [])"<<endl;
+    cerr<<"-z <Char> ASCII chracter standing for quality==0 [@]"<<endl;
+    cerr<<"-g <Double> Global Error Dependency Coefficient, 0.0(complete dependent)~1.0(complete independent)[0.9]"<<endl;
+    cerr<<"-p <Double> PCR Error Dependency Coefficient, 0.0(complete dependent)~1.0(complete independent)[0.5]"<<endl;
+    cerr<<"-r <Double> novel altHOM prior probability [0.0005]"<<endl;
+    cerr<<"-e <Double> novel HET prior probability [0.0010]"<<endl;
+    cerr<<"-t set transition/transversion ratio to 2:1 in prior probability"<<endl;
+    cerr<<"-s <FILE> Pre-formated dbSNP information"<<endl;
+    cerr<<"-2 specify this option will REFINE SNPs using dbSNPs information [Off]"<<endl;
+    cerr<<"-a <Double> Validated HET prior, if no allele frequency known [0.1]"<<endl;
+    cerr<<"-b <Double> Validated altHOM prior, if no allele frequency known[0.05]"<<endl;
+    cerr<<"-j <Double> Unvalidated HET prior, if no allele frequency known [0.02]"<<endl;
+    cerr<<"-k <Double> Unvalidated altHOM rate, if no allele frequency known[0.01]"<<endl;
+    cerr<<"-u Enable rank sum test to give HET further penalty for better accuracy. [Off]"<<endl;
+    //cerr<<"-n Enable binomial probability calculation to give HET for better accuracy. [Off]"<<endl;
+    cerr<<"-m Enable monoploid calling mode, this will ensure all consensus as HOM and you probably should SPECIFY higher altHOM rate. [Off]"<<endl;
+    cerr<<"-q Only output potential SNPs. Useful in Text output mode. [Off]"<<endl;
+    cerr<<"-M <FILE> Output the quality calibration matrix; the matrix can be reused with -I if you rerun the program"<<endl;
+    cerr<<"-I <FILE> Input previous quality calibration matrix. It cannot be used simutaneously with -M"<<endl;
+    cerr<<"-L <short> maximum length of read [45]"<<endl;
+    cerr<<"-Q <short> maximum FASTQ quality score [40]"<<endl;
+    cerr<<"-F <int> Output format. 0: Text; 1: GLFv2; 2: GPFv2.[0]"<<endl;
+    // The default is an empty string; the inner quotes must be escaped,
+    // otherwise "[" and "]" are adjacent literals and the help shows [].
+    cerr<<"-E <String> Extra headers EXCEPT CHROMOSOME FIELD specified in GLFv2 output. Format is \"TypeName1:DataName1:TypeName2:DataName2\"[\"\"]"<<endl;
+    cerr<<"-T <FILE> Only call consensus on regions specified in FILE. Format: ChrName\\tStart\\tEnd."<<endl;
+    cerr<<"-c Use the crossbow input format [Off]"<<endl;
+    cerr<<"-K In -q mode, print consensus info for every dbsnp pos even if there's no SNP [Off]"<<endl;
+    //cerr<<"-S <FILE> Output summary of consensus"<<endl;
+    cerr<<"-H Print Hadoop status updates" << endl;
+    cerr<<"-v Verbose mode"<<endl;
+    cerr<<"-h Display this help"<<endl;
+
+    cerr<<"\nLicense GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>"<<endl;
+    cerr<<"This is free software: you are free to change and redistribute it."<<endl;
+    cerr<<"There is NO WARRANTY, to the extent permitted by law.\n"<<endl;
+
+    exit(1);
+    return 0;
+}
+
+/**
+ * Alias for usage(): prints the help text and forwards its result.
+ */
+int readme() {
+    const int status = usage();
+    return status;
+}
+
+// Global run statistics. main() prints them as Hadoop counters (-H) and
+// in the verbose summary (-v); they are updated by the calling code
+// elsewhere in the program.
+
+// Per-position counters. main() reports "Positions called" as
+// (poscalled - poscalled_reported).
+unsigned long poscalled = 0;
+// Reported as "Positions called with known SNP info".
+unsigned long poscalled_knownsnp = 0;
+// Reported as "Positions called uncovered by unique alignments".
+unsigned long poscalled_uncov_uni = 0;
+// Reported as "Positions called uncovered by any alignments".
+unsigned long poscalled_uncov = 0;
+// Not printed by main(); presumably counts positions with no depth -- TODO confirm.
+unsigned long poscalled_n_no_depth = 0;
+// Reported as "Positions with non-reference allele called".
+unsigned long poscalled_nonref = 0;
+// Subtracted from poscalled when main() reports "Positions called".
+unsigned long poscalled_reported = 0;
+
+// Per-alignment counters. main() resets alignments_read and
+// alignments_read_unique to 0 just before the consensus pass.
+unsigned long alignments_read = 0;
+unsigned long alignments_read_unique = 0;
+unsigned long alignments_read_unpaired = 0;
+unsigned long alignments_read_paired = 0;
+
+/**
+ * Program entry point.
+ *
+ * Parses the command line, loads the reference genome (and optional
+ * known-SNP table and target regions), writes the GLF/GPF header when a
+ * binary output format is selected, builds or reads the quality
+ * calibration matrix, then re-opens the alignment file and calls the
+ * consensus. Run statistics are printed at the end when -H or -v is set.
+ *
+ * @return 0 on success; the process exits with a non-zero status on
+ *         unusable input/output files or a broken output stream
+ */
+int main ( int argc, char * argv[]) {
+    // This part is the default values of all parameters
+    Parameter * para = new Parameter;
+    std::string alignment_name, consensus_name;
+    bool is_matrix_in = false; // Generate the matrix or just read it?
+    int c;
+    Files files;
+    while((c=getopt(argc,argv,"Ki:d:o:z:g:p:r:e:ts:2a:b:j:k:unmqM:I:L:Q:S:F:E:T:clhHv")) != -1) {
+        switch(c) {
+            case 'i':
+            {
+                // Soap Alignment Result
+                files.soap_result.clear();
+                files.soap_result.open(optarg);
+                if( ! files.soap_result) {
+                    cerr<<"No such file or directory:"<<optarg<<endl;
+                    exit(1);
+                }
+                alignment_name = optarg;
+                cerr << "-i is set to " << alignment_name << endl;
+                break;
+            }
+            case 'd':
+            {
+                // The reference genome in fasta format
+                files.ref_seq.clear();
+                files.ref_seq.open(optarg);
+                if( ! files.ref_seq) {
+                    cerr<<"No such file or directory:"<<optarg<<endl;
+                    exit(1);
+                }
+                files.ref_seq.clear();
+                cerr << "-d is set to " << optarg << endl;
+                break;
+            }
+            case 'o':
+            {
+                files.consensus.clear();
+                files.consensus.open(optarg);
+                if( ! files.consensus ) {
+                    cerr<<"Cannot creat file:" <<optarg <<endl;
+                    exit(1);
+                }
+                files.consensus.clear();
+                consensus_name = optarg;
+                cerr << "-o is set to " << consensus_name << endl;
+                break;
+            }
+            case 'z':
+            {
+                // The char stands for quality==0 in fastq format
+                para->q_min = optarg[0];
+                if(para->q_min == 33) {
+                    clog<<"Standard Fastq System Set"<<endl;
+                }
+                else if(para->q_min == 64) {
+                    clog<<"Illumina Fastq System Set"<<endl;
+                }
+                else {
+                    clog<<"Other types of Fastq files?? Are you sure?"<<endl;
+                }
+                para->q_max = para->q_min + 40;
+                break;
+            }
+            case 'g':
+            {
+                para->global_dependency= log10(atof(optarg));
+                cerr << "-g is set to " << para->global_dependency << endl;
+                break;
+            }
+            case 'p':
+            {
+                para->pcr_dependency= log10(atof(optarg));
+                cerr << "-p is set to " << para->pcr_dependency << endl;
+                break;
+            }
+            case 'r':
+            {
+                para->althom_novel_r = atof(optarg);
+                cerr << "-r is set to " << para->althom_novel_r << endl;
+                break;
+            }
+            case 'e':
+            {
+                para->het_novel_r=atof(optarg);
+                cerr << "-e is set to " << para->het_novel_r << endl;
+                break;
+            }
+            case 't':
+            {
+                cerr << "-t is set" << endl;
+                para->transition_dominant=true;
+                break;
+            }
+            case 'K':
+            {
+                cerr << "-K is set" << endl;
+                para->dump_dbsnp_evidence=true;
+                break;
+            }
+            case 's':
+            {
+                // Optional: A pre-formated dbSNP table
+                cerr << "-s is set" << endl;
+                files.dbsnp.clear();
+                files.dbsnp.open(optarg);
+                // Test the stream that was just opened (the dbSNP file),
+                // not the reference stream.
+                if(!files.dbsnp) {
+                    cerr << "No such file or directory:" << optarg << endl;
+                    exit(1);
+                }
+                files.dbsnp.clear();
+                break;
+            }
+            case '2':
+            {
+                // Refine prior probability based on dbSNP information
+                cerr << "-2 is set" << endl;
+                para->refine_mode = true;
+                break;
+            }
+            case 'a':
+            {
+                para->althom_val_r=atof(optarg);
+                cerr << "-a is set to " << para->althom_val_r << endl;
+                break;
+            }
+            case 'b':
+            {
+                para->het_val_r=atof(optarg);
+                cerr << "-b is set to " << para->het_val_r << endl;
+                break;
+            }
+            case 'j':
+            {
+                para->althom_unval_r=atof(optarg);
+                cerr << "-j is set to " << para->althom_unval_r << endl;
+                break;
+            }
+            case 'k':
+            {
+                para->het_unval_r=atof(optarg);
+                cerr << "-k is set to " << para->het_unval_r << endl;
+                break;
+            }
+            case 'u':
+            {
+                cerr << "-u is set" << endl;
+                para->rank_sum_mode = true;
+                break;
+            }
+            case 'n':
+            {
+                cerr << "-n is set" << endl;
+                para->binom_mode = true;
+                break;
+            }
+            case 'm':
+            {
+                cerr << "-m is set" << endl;
+                para->is_monoploid=1;
+                break;
+            }
+            case 'q':
+            {
+                cerr << "-q is set" << endl;
+                para->is_snp_only=1;
+                break;
+            }
+            case 'M':
+            {
+                files.matrix_file.close(); files.matrix_file.clear();
+                // Output the calibration matrix
+                files.matrix_file.open(optarg, fstream::out);
+                if( ! files.matrix_file) {
+                    cerr<<"Cannot creat file :"<<optarg<<endl;
+                    exit(1);
+                }
+                files.matrix_file.clear();
+                cerr << "-M is set to " << optarg << endl;
+                break;
+            }
+            case 'I':
+            {
+                files.matrix_file.close(); files.matrix_file.clear();
+                // Input the calibration matrix
+                files.matrix_file.open(optarg, fstream::in);
+                if( ! files.matrix_file) {
+                    cerr<<"No such file or directory:"<<optarg<<endl;
+                    exit(1);
+                }
+                files.matrix_file.clear();
+                is_matrix_in = true;
+                cerr << "-I is set to " << optarg << endl;
+                break;
+            }
+            case 'S':
+            {
+                //files.summary.open(optarg);
+                //// Output the summary of consensus
+                //if( ! files.summary ) {
+                //    cerr<<"No such file or directory: "<<optarg<<endl;
+                //    exit(1);
+                //}
+                break;
+            }
+            case 'L':
+            {
+                para->read_length = atoi(optarg);
+                cerr << "-L is set to " << (int)para->read_length << endl;
+                break;
+            }
+            case 'Q':
+            {
+                para->q_max = optarg[0];
+                if(para->q_max < para->q_min) {
+                    cerr<< "FASTQ quality character error: Q_MAX > Q_MIN" <<endl;
+                }
+                cerr << "-Q is set to " << para->q_max << endl;
+                break;
+            }
+            case 'F': {
+                para->glf_format = atoi(optarg);
+                cerr << "-F is set to " << optarg << endl;
+                break;
+            }
+            case 'E': {
+                para->glf_header = optarg;
+                cerr << "-E is set to " << optarg << endl;
+                break;
+            }
+            case 'l': {
+                cerr << "-l is set" << endl;
+                para->do_recal = false;
+                break;
+            }
+            case 'T': {
+                files.region.clear();
+                files.region.open(optarg);
+                // Fail like the other input files do: without this check
+                // an unreadable region file was silently ignored while
+                // region_only was still switched on.
+                if( ! files.region) {
+                    cerr<<"No such file or directory:"<<optarg<<endl;
+                    exit(1);
+                }
+                files.region.clear();
+                para->region_only = true;
+                cerr << "-T is set to " << optarg << endl;
+                break;
+            }
+            case 'c': {
+                para->format = CROSSBOW_FORMAT;
+                cerr << "-c is set" << endl;
+                break;
+            }
+            case 'v': para->verbose = true; break;
+            case 'H': para->hadoop_out = true; break;
+            case 'h':readme();break;
+            case '?':usage();break;
+            default: cerr<<"Unknown error in command line parameters"<<endl;
+        }
+    }
+    if( !files.consensus || !files.ref_seq || !files.soap_result ) {
+        // These are compulsory parameters
+        usage();
+    }
+    //Read the chromosomes into memory
+    Genome * genome = new Genome(files.ref_seq, files.dbsnp, true);
+    files.ref_seq.close();
+    files.dbsnp.close();
+    clog<<"Reading Chromosome and dbSNP information Done."<<endl;
+    if(para->region_only && files.region) {
+        // read_region returns 0 on a malformed line, after which only
+        // part of the requested regions would be in effect.
+        if(!genome->read_region(files.region, para)) {
+            cerr<<"Cannot parse the target region file."<<endl;
+            exit(1);
+        }
+        clog<<"Read target region done."<<endl;
+    }
+    if(para->glf_format) { // GLF or GPF
+        // Re-open the output in binary mode and write the file header:
+        // magic, two version ints, the -E tags, then the CHROMOSOMES tag
+        // followed by the chromosome count.
+        files.consensus.close();
+        files.consensus.clear();
+        files.consensus.open(consensus_name.c_str(), ios::binary);
+        if(!files.consensus) {
+            cerr<<"Cannot write result to the specified output file."<<endl;
+            exit(255);
+        }
+        if (1==para->glf_format) {
+            files.consensus<<'g'<<'l'<<'f';
+        }
+        else if (2==para->glf_format) {
+            files.consensus<<'g'<<'p'<<'f';
+        }
+        int major_ver = 0;
+        int minor_ver = 0;
+        files.consensus.write(reinterpret_cast<char*>(&major_ver), sizeof(major_ver));
+        files.consensus.write(reinterpret_cast<char*>(&minor_ver), sizeof(minor_ver));
+        if(!files.consensus.good()) {
+            cerr<<"Broken ofstream after version."<<endl;
+            exit(255);
+        }
+        // Each ':'-separated piece of the -E string is written as a
+        // length (including the terminating NUL) followed by the
+        // NUL-terminated text.
+        std::string temp("");
+        for(std::string::iterator iter=para->glf_header.begin();iter!=para->glf_header.end(); iter++) {
+            if (':'==(*iter)) {
+                int type_len(temp.size()+1);
+                files.consensus.write(reinterpret_cast<char*>(&type_len), sizeof(type_len));
+                files.consensus.write(temp.c_str(), temp.size()+1)<<flush;
+                temp = "";
+            }
+            else {
+                temp+=(*iter);
+            }
+        }
+        if(!files.consensus.good()) {
+            cerr<<"Broken ofstream after tags."<<endl;
+            exit(255);
+        }
+        if(temp != "") {
+            int type_len(temp.size()+1);
+            files.consensus.write(reinterpret_cast<char*>(&type_len), sizeof(type_len));
+            files.consensus.write(temp.c_str(), temp.size()+1)<<flush;
+            temp = "";
+        }
+        int temp_int(12);
+        files.consensus.write(reinterpret_cast<char*>(&temp_int), sizeof(temp_int));
+        files.consensus.write("CHROMOSOMES", 12);
+        temp_int = genome->chromosomes.size();
+        files.consensus.write(reinterpret_cast<char*>(&temp_int), sizeof(temp_int));
+        files.consensus<<flush;
+        if(!files.consensus.good()) {
+            cerr<<"Broken ofstream after writting header."<<endl;
+            exit(255);
+        }
+    }
+    Prob_matrix * mat = new Prob_matrix;
+    if(!is_matrix_in) {
+        // Read the soap result and give the calibration matrix
+        if(para->format == SOAP_FORMAT) {
+            clog << "Training correction matrix in SOAP format"; logTime(); clog << endl;
+            mat->matrix_gen<Soap_format>(files.soap_result, para, genome);
+        } else {
+            clog << "Training correction matrix in Crossbow format"; logTime(); clog << endl;
+            mat->matrix_gen<Crossbow_format>(files.soap_result, para, genome);
+        }
+        if (files.matrix_file) {
+            clog << "Writing correction matrix"; logTime(); clog << endl;
+            mat->matrix_write(files.matrix_file, para);
+        }
+    }
+    else {
+        clog << "Reading correction matrix"; logTime(); clog << endl;
+        mat->matrix_read(files.matrix_file, para);
+    }
+    files.matrix_file.close();
+    clog << "Correction Matrix Done "; logTime(); clog << endl;
+    mat->prior_gen(para);
+    if(para->verbose) clog << "Just did prior_gen" << endl;
+    mat->rank_table_gen();
+    if(para->verbose) clog << "Just did rank_table_gen" << endl;
+    Call_win *info = new Call_win(para->read_length, 1000);
+    if(para->verbose) clog << "Just allocated Call_win" << endl;
+    info->initialize(0);
+    //Call the consensus
+    // The alignment file is re-opened so the consensus pass starts from
+    // the beginning of the file again.
+    files.soap_result.close();
+    files.soap_result.clear();
+    files.soap_result.open(alignment_name.c_str());
+    files.soap_result.clear();
+    if(para->verbose) clog << "Just reopened alignment file" << endl;
+    alignments_read = 0;
+    alignments_read_unique = 0;
+    if(para->format == SOAP_FORMAT) {
+        info->soap2cns<Soap_format>(files.soap_result, files.consensus, genome, mat, para);
+    } else {
+        info->soap2cns<Crossbow_format>(files.soap_result, files.consensus, genome, mat, para);
+    }
+    if(para->verbose) clog << "Just called soap2cns" << endl;
+    files.soap_result.close();
+    files.consensus.close();
+    if(para->hadoop_out) {
+        cerr << "reporter:counter:SOAPsnp,Alignments read," << alignments_read << endl;
+        cerr << "reporter:counter:SOAPsnp,Unique alignments read," << alignments_read_unique << endl;
+        cerr << "reporter:counter:SOAPsnp,Unpaired alignments read," << alignments_read_unpaired << endl;
+        cerr << "reporter:counter:SOAPsnp,Paired alignments read," << alignments_read_paired << endl;
+        cerr << "reporter:counter:SOAPsnp,Positions called," << (poscalled-poscalled_reported) << endl;
+        cerr << "reporter:counter:SOAPsnp,Positions called with known SNP info," << poscalled_knownsnp << endl;
+        cerr << "reporter:counter:SOAPsnp,Positions called uncovered by unique alignments," << poscalled_uncov_uni << endl;
+        cerr << "reporter:counter:SOAPsnp,Positions called uncovered by any alignments," << poscalled_uncov << endl;
+        cerr << "reporter:counter:SOAPsnp,Positions with non-reference allele called," << poscalled_nonref << endl;
+    }
+    if(para->verbose) {
+        clog << "Alignments read: " << alignments_read << endl;
+        clog << "Unique alignments read: " << alignments_read_unique << endl;
+        clog << "Unpaired alignments read: " << alignments_read_unpaired << endl;
+        clog << "Paired alignments read: " << alignments_read_paired << endl;
+        clog << "Positions called: " << (poscalled-poscalled_reported) << endl;
+        clog << "Positions called with known SNP info: " << poscalled_knownsnp << endl;
+        clog << "Positions called uncovered by unique alignments: " << poscalled_uncov_uni << endl;
+        clog << "Positions called uncovered by any alignments: " << poscalled_uncov << endl;
+        clog << "Positions with non-reference allele called: " << poscalled_nonref << endl;
+    }
+    clog << "Consensus Done!"; logTime(); clog << endl;
+    return 0;
+}
+
diff --git a/soapsnp/makefile b/soapsnp/makefile
new file mode 100644
index 0000000..30df780
--- /dev/null
+++ b/soapsnp/makefile
@@ -0,0 +1,30 @@
+BITS_FLAG =
+ifeq (32,$(BITS))
+BITS_FLAG = -m32
+endif
+ifeq (64,$(BITS))
+BITS_FLAG = -m64
+endif
+# BITS_FLAG (set above) is -m32 or -m64 when make is run with BITS=32 or BITS=64, and empty otherwise.
+DEFINE =
+CXX = g++
+CXXFLAGS = #-MMD -MP -MF #-g3 -Wall -maccumulate-outgoing-args
+CXXFLAGS_RELEASE = -fomit-frame-pointer -O3 -ffast-math -funroll-loops -mmmx -msse -msse2 -msse3 -fmessage-length=0 -DNDEBUG -DFAST_BOUNDS
+CXXFLAGS_DEBUG = -g -g3 -O0
+LFLAGS =
+# Default goal: the optimised soapsnp binary.
+all: soapsnp
+.PHONY: all
+# Every target below compiles all of its sources in a single command; no object files are kept.
+soapsnp: call_genotype.cc chromosome.cc matrix.cc normal_dis.cc prior.cc rank_sum.cc main.cc soap_snp.h makefile
+ $(CXX) $(CXXFLAGS) $(CXXFLAGS_RELEASE) $(BITS_FLAG) call_genotype.cc chromosome.cc matrix.cc normal_dis.cc prior.cc rank_sum.cc main.cc -o $@ $(LFLAGS)
+# Same sources as soapsnp, built with the debug flag set instead of the release one.
+soapsnp-debug: call_genotype.cc chromosome.cc matrix.cc normal_dis.cc prior.cc rank_sum.cc main.cc soap_snp.h makefile
+ $(CXX) $(CXXFLAGS) $(CXXFLAGS_DEBUG) $(BITS_FLAG) call_genotype.cc chromosome.cc matrix.cc normal_dis.cc prior.cc rank_sum.cc main.cc -o $@ $(LFLAGS)
+# binarize uses the base CXXFLAGS only (no release/debug flag set and no BITS_FLAG).
+binarize: call_genotype.cc chromosome.cc matrix.cc normal_dis.cc prior.cc rank_sum.cc binarize.cc soap_snp.h makefile
+ $(CXX) $(CXXFLAGS) call_genotype.cc chromosome.cc matrix.cc normal_dis.cc prior.cc rank_sum.cc binarize.cc -o binarize $(LFLAGS)
+# clean removes every binary this makefile can produce, not only the default target.
+.PHONY: clean
+clean:
+ rm -f *.o soapsnp soapsnp-debug binarize
diff --git a/soapsnp/matrix.cc b/soapsnp/matrix.cc
new file mode 100644
index 0000000..225931a
--- /dev/null
+++ b/soapsnp/matrix.cc
@@ -0,0 +1,67 @@
+#include "soap_snp.h"
+Prob_matrix::Prob_matrix(){ // Allocates every probability table and fills it with a neutral starting value
+ int i;
+ // p_matrix has about 1 million entries (256*256*4*4 = 1,048,576); rate_t is a double
+ p_matrix = new rate_t [256*256*4*4]; // 8bit: q_max, 8bit: read_len, 4bit: number of types of all mismatch/match 4x4
+ p_prior = new rate_t [8*4*4]; // 8(ref ACTGNNNN) * diploid(4x4)
+ base_freq = new rate_t [4]; // 4 base
+ type_likely = new rate_t [16+1]; //The 17th element rate_t[16] will be used in comparison
+ type_prob = new rate_t [16+1]; // Same 16+1 layout as type_likely
+ p_rank = new rate_t [64*64*2048]; // Indexed as N<<17 | n1<<11 | T1: 6 bits N, 6 bits n1, 11 bits T1
+ p_binom = new rate_t [256*256]; // Total * case
+ for(i=0;i!=256*256*4*4;i++) { // Calibration matrix starts at 1.0 everywhere
+ p_matrix[i] = 1.0;
+ }
+ for(i=0;i!=8*4*4;i++) { // Priors start at 1.0 until prior_gen overwrites them
+ p_prior[i] = 1.0;
+ }
+ for(i=0;i!=4;i++) {
+ base_freq[i] = 1.0;
+ }
+ for(i=0;i!=16+1;i++) { // Includes the extra 17th comparison slot
+ type_likely[i] = 0.0; // LOG10 Scale
+ type_prob[i] = 0.0; // LOG10 Scale
+ }
+ for(i=0;i!=64*64*2048;i++) { // Rank-sum table starts at 1.0; rank_table_gen fills the reachable entries
+ p_rank[i] = 1.0;
+ }
+ for(i=0;i!=256*256;i++) {
+ p_binom[i] = 1.0;
+ }
+}
+
+Prob_matrix::~Prob_matrix(){ // Releases the seven arrays allocated by the constructor
+ delete [] p_matrix; // 8bit: q_max, 8bit: read_len, 4bit: number of types of all mismatch/match 4x4
+ delete [] p_prior; // 8(ref ACTGNNNN) * diploid(4x4)
+ delete [] base_freq; // 4 base
+ delete [] type_likely; //The 17th element rate_t[16] will be used in comparison
+ delete [] type_prob; // Same 16+1 layout as type_likely
+ delete [] p_rank; // Rank-sum table, indexed as N<<17 | n1<<11 | T1
+ delete [] p_binom; // Total * case;
+}
+// Load a calibration matrix from text: every row is "<quality index> <read coordinate>" followed by 16 values for p_matrix. para is unused. Always returns 1.
+int Prob_matrix::matrix_read(std::fstream &mat_in, Parameter * para) {
+ int q_char, type;
+ std::string::size_type coord;
+ for(std::string line; getline(mat_in, line);) {
+ std::istringstream s(line);
+ if(!(s>>q_char>>coord) || q_char<0 || q_char>255 || coord>255) { if(!line.empty()) std::cerr<<"Warning: skipping malformed calibration matrix line: "<<line<<std::endl; continue; } // both fields are 8-bit parts of the p_matrix index, so a bad row must not be stored
+ for(type=0;type!=16;type++) {
+ s>>p_matrix [ ((ubit64_t)q_char<<12) | (coord <<4) | type]; // index layout: quality<<12 | coordinate<<4 | type
+ }
+ }
+ return 1;
+}
+// Write the calibration matrix as text: one row per (quality index, read coordinate) with the 16 p_matrix values, tab separated, in the row layout matrix_read parses. Always returns 1.
+int Prob_matrix::matrix_write(std::fstream &mat_out, Parameter * para) {
+ for( int q_char = para->q_min; q_char <= para->q_max; q_char++ ) { // int counter: a char counter can never exceed q_max == 127 and would overflow instead of ending the loop
+ for( std::string::size_type coord=0; coord != para->read_length; coord++) {
+ mat_out<<((ubit64_t)q_char-para->q_min)<<'\t'<<coord; // quality is written relative to q_min
+ for(char type=0;type!=16;type++) {
+ mat_out<<'\t'<<scientific<<showpoint<<setprecision(16)<<p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | type];
+ }
+ mat_out<<endl;
+ }
+ }
+ return 1;
+}
diff --git a/soapsnp/normal_dis.cc b/soapsnp/normal_dis.cc
new file mode 100644
index 0000000..2ac0d8d
--- /dev/null
+++ b/soapsnp/normal_dis.cc
@@ -0,0 +1,24 @@
+#include "soap_snp.h"
+// Tail area of the standard normal distribution at z: the smaller of the two tails, or 0 once |z| exceeds 6.
+double Call_win::normal_value(double z) {
+ if (z>6.0 || z<-6.0) return 0.0;
+ // Polynomial coefficients (lowest order first), argument scale and 1/sqrt(2*pi) used by the CDF approximation.
+ const double coeff[5] = {0.31938153, -0.356563782, 1.781477937, -1.821255978, 1.330274429};
+ const double scale = 0.2316419;
+ const double inv_sqrt_2pi = 0.39894228;
+
+ const double k = 1.0/(1.0+fabs(z)*scale);
+ const double density = inv_sqrt_2pi*exp((-z)*(z/2.0));
+ // Horner evaluation starting from the highest-order coefficient.
+ double poly = coeff[4];
+ for (int idx = 3; idx >= 0; idx--) {
+ poly = poly*k + coeff[idx];
+ }
+ poly = poly*k;
+ // Approximate CDF at |z|, mirrored for negative z.
+ double cdf = 1.0 - density*poly;
+ if (z < 0.0) cdf = 1.0 - cdf;
+ // Fold onto [0, 0.5] so that either tail yields the same value.
+ if (cdf > 0.5) return 1-cdf;
+ return cdf;
+}
diff --git a/soapsnp/prior.cc b/soapsnp/prior.cc
new file mode 100644
index 0000000..4ffb72c
--- /dev/null
+++ b/soapsnp/prior.cc
@@ -0,0 +1,111 @@
+#include "soap_snp.h"
+// Fill p_prior with the novel-site genotype priors, indexed as ref<<4 | allele1<<2 | allele2 with allele1 <= allele2. Always returns 1.
+int Prob_matrix::prior_gen(Parameter * para) {
+ char t_base, allele1, allele2;
+ // Note, the above parameter should be changed to a more reasonable one
+ for(t_base=0;t_base!=4;t_base++) { // the four concrete reference bases (2-bit codes 0..3)
+ for(allele1=0;allele1!=4;allele1++) {
+ for(allele2=allele1;allele2!=4;allele2++) { // unordered genotypes only: allele2 starts at allele1
+ if(allele1 == t_base && allele2 == t_base) {
+ // refHOM
+ p_prior[t_base<<4|allele1<<2|allele2] = 1;
+ }
+ else if (allele1 == t_base || allele2 == t_base) {
+ // refHET: 1 ref 1 alt
+ p_prior[t_base<<4|allele1<<2|allele2] = para->het_novel_r;
+ }
+ else if (allele1 == allele2) {
+ // altHOM
+ p_prior[t_base<<4|allele1<<2|allele2] = para->althom_novel_r;
+ }
+ else {
+ // altHET: 2 diff alt base
+ p_prior[t_base<<4|allele1<<2|allele2] = para->het_novel_r * para->althom_novel_r;
+ }
+ if( para->transition_dominant && ((allele1^t_base) == 0x3 || (allele2^t_base) == 0x3)) {
+ // transition: with the 2-bit codes A=00, C=01, T=10, G=11 an XOR of 3 pairs A with G and C with T
+ p_prior[t_base<<4|allele1<<2|allele2] *= 4;
+ }
+ //std::cerr<<"ACTG"[t_base]<<"\t"<<"ACTG"[allele1]<<"ACTG"[allele2]<<"\t"<<p_prior[t_base<<4|allele1<<2|allele2]<<endl;
+ }
+ }
+ }
+ for(allele1=0;allele1!=4;allele1++) {
+ for(allele2=allele1;allele2!=4;allele2++) {
+ // Deal with N: reference codes 4..7 all receive the same prior, which does not depend on the reference base
+ p_prior[0x4<<4|allele1<<2|allele2] = (allele1==allele2? 1: (2*para->het_novel_r)) * 0.25 *0.25;
+ p_prior[0x5<<4|allele1<<2|allele2] = (allele1==allele2? 1: (2*para->het_novel_r)) * 0.25 *0.25;
+ p_prior[0x6<<4|allele1<<2|allele2] = (allele1==allele2? 1: (2*para->het_novel_r)) * 0.25 *0.25;
+ p_prior[0x7<<4|allele1<<2|allele2] = (allele1==allele2? 1: (2*para->het_novel_r)) * 0.25 *0.25;
+ }
+ }
+ return 1;
+}
+
+/**
+ * Generate a prior probability for each diploid genotype (allele1 <= allele2, stored at real_p_prior[allele1<<2|allele2]) given SNPdb
+ * allele frequency data. Returns 0 without touching real_p_prior for indel records and 1 otherwise; only genotypes whose two alleles both have freq > 0 are written.
+ */
+int Call_win::snp_p_prior_gen(double * real_p_prior, Snp_info* snp,
+ Parameter * para, char ref)
+{
+ if (snp->is_indel()) {
+ return 0;
+ }
+ char base, allele1, allele2;
+ int allele_count;
+ allele_count = 0;
+ for (base=0; base != 4; base ++) {
+ if(snp->get_freq(base)>0) {
+ // The base is found in dbSNP
+ allele_count += 1;
+ }
+ }
+ if(allele_count <= 1) { // intentionally a no-op: the old error exit is commented out below
+ // Should never occur
+
+ // BTL: Yes, this can occur, when all subjects in a HapMap
+ // population have different alleles from the reference.
+
+ //cerr<<"Previous Extract SNP error."<<endl;
+ //exit(255);
+ //return -1;
+ }
+ char t_base = (ref&0x3); // keep only the 2-bit base code of the reference
+ for(allele1=0;allele1!=4;allele1++) {
+ for(allele2=allele1;allele2!=4;allele2++) {
+
+ // Note: site are either HapMap or not HapMap. When sites
+ // are from HapMap, SOAPsnp trusts the allele frequencies.
+
+ if(!snp->is_hapmap()) {
+ // Non-HapMap sites: priors come from the validated/unvalidated rates in para
+ if(snp->get_freq(allele1) > 0 && snp->get_freq(allele2) > 0) {
+ // Here the frequency is just a tag to indicate SNP alleles in non-HapMap sites
+ if(allele1 == allele2 && allele1 == t_base) {
+ // refHOM
+ real_p_prior[allele1<<2|allele2] = 1;
+ }
+ else if (allele1 == t_base || allele2 == t_base) {
+ // refHET: 1 ref 1 alt
+ real_p_prior[allele1<<2|allele2] = snp->is_validated()?para->het_val_r:para->het_unval_r;
+ }
+ else if (allele1 == allele2) { // altHOM
+ real_p_prior[allele1<<2|allele2] = snp->is_validated()?para->althom_val_r:para->althom_unval_r;
+ }
+ else {
+ // altHET: 2 diff alt base
+ real_p_prior[allele1<<2|allele2] = snp->is_validated()?para->het_val_r:para->het_unval_r;
+ }
+ }
+ }
+ else {
+ // Real HapMap Sites
+ if(snp->get_freq(allele1) > 0 && snp->get_freq(allele2) > 0) {
+ real_p_prior[allele1<<2|allele2] = (allele1==allele2?1:(2*para->het_val_r))*snp->get_freq(allele1)*snp->get_freq(allele2); // product of the two allele frequencies, scaled by 2*het_val_r for heterozygotes
+ }
+ }
+ }
+ }
+ return 1;
+}
diff --git a/soapsnp/rank_sum.cc b/soapsnp/rank_sum.cc
new file mode 100644
index 0000000..abc009c
--- /dev/null
+++ b/soapsnp/rank_sum.cc
@@ -0,0 +1,128 @@
+#include "soap_snp.h"
+
+/*
+ * From the paper:
+ *
+ * "Since the quality scores of erroneous bases are lower than that for
+ * correct bases, we used the sum rank test to check the heterozygous
+ * sites of the called consensus. All observed appearances of the two
+ * alleles in the reads were ordered according to the quality score,
+ * then the sum rank of the less frequent allele was tested. The
+ * calculated P-value was integrated into the consensus quality score
+ * by subtracting -10log10(p)."
+ */
+// Build the exact rank-sum table p_rank[N<<17 | n1<<11 | T1] for every total N < 64, group size n1 <= N and rank sum T1. Always returns 1.
+int Prob_matrix::rank_table_gen() {
+ // When N <= 63, (so that n1<=31), use this table to test
+ ubit64_t i, n1, N, T1;
+ rate_t p_left, p_right;
+
+ // Calculate the factorials
+ double * fact = new double [64];
+ fact[0]=(double)1.0;
+ for(i=1;i!=64;i++) {
+ fact[i] = fact[i-1]*i;
+ }
+
+ ubit64_t * rank_sum= new ubit64_t [64*64*2048]; // 6bit: N; 6bit: n1; 11bit: T1 (same index layout as p_rank)
+ memset(rank_sum, 0, sizeof(ubit64_t)*64*64*2048);
+ rank_sum[0]=1; // base case for N=0, n1=0, T1=0
+ for(N=1;N!=64;N++) {
+ for(n1=0;n1<=N;n1++) {
+ for(T1=(1+n1)*n1/2;T1<=(N+N-n1+1)*n1/2;T1++) { // from the smallest (ranks 1..n1) to the largest (top n1 ranks) possible sum
+ // Dynamic programming to generate the table: the entry with rank N left out plus the entry with rank N included
+ rank_sum[N<<17|n1<<11|T1] = rank_sum[((N-1)<<17)|(n1<<11)|T1] + ((T1>=N && n1>0) ? rank_sum[((N-1)<<17)|((n1-1)<<11)|(T1-N)]:0);
+ // Here, the p_rank is not cumulative: the count divided by the binomial coefficient N choose n1
+ p_rank[(N<<17)|(n1<<11)|T1] = rank_sum[N<<17|n1<<11|T1] / (fact[N]/(fact[n1]*fact[N-n1]));
+ }
+ p_left = 0.0, p_right =1.0;
+ for(T1=(1+n1)*n1/2;T1<=(N+N-n1+1)*n1/2;T1++) { // second pass: replace each point probability by a tail probability
+ p_right = 1.0 - p_left; // probability of a sum >= T1 (taken before this T1 is added to p_left)
+ p_left += p_rank[(N<<17)|(n1<<11)|T1]; // probability of a sum <= T1
+ p_rank[N<<17|n1<<11|T1] = (p_left<p_right?p_left:p_right); // keep the smaller tail
+ }
+ }
+ }
+ delete [] rank_sum;
+ delete [] fact;
+ return 1;
+}
+// Normal approximation of the rank-sum test: standardises the rank sums T1 and T2 of groups of size n1 and n2 and returns normal_value() of the more extreme score.
+double Call_win::normal_test(int n1, int n2, double T1, double T2) {
+ double u1, u2;
+ u1 = (T1 - (double)n1*(n1+n2+1)/2.0) / sqrt((double)n1*n2*(n1+n2+1)/12.0); // mean and variance in floating point: no truncation of half-integer means, no int overflow at high depth
+ u2 = (T2 - (double)n2*(n1+n2+1)/2.0) / sqrt((double)n1*n2*(n1+n2+1)/12.0);
+ return normal_value(fabs(u1)>fabs(u2)?u1:u2);
+}
+// Exact rank-sum p-value from the precomputed table (index N<<17 | n<<11 | T) for the smaller group, interpolated linearly between the entries for floor(T) and floor(T)+1.
+double Call_win::table_test(rate_t *p_rank, int n1, int n2, double T1, double T2) {
+ if(n1<=n2) {
+ return p_rank[(n1+n2)<<17|n1<<11|(int)(T1)]+(T1-(int)T1)*(p_rank[(n1+n2)<<17|n1<<11|(int)(T1+1)]-p_rank[(n1+n2)<<17|n1<<11|(int)(T1)]); // the neighbour entry must use the same <<17 row shift as the base entry
+ }
+ else {
+ return p_rank[(n1+n2)<<17|n2<<11|(int)(T2)]+(T2-(int)T2)*(p_rank[(n1+n2)<<17|n2<<11|(int)(T2+1)]-p_rank[(n1+n2)<<17|n2<<11|(int)(T2)]);
+ }
+}
+// Rank-sum test on the quality scores of the two alleles of a heterozygous call. Returns 1.0 for a homozygous best_type, 0.0 when either allele has count_uni == 0, otherwise the table or normal-approximation p-value.
+double Call_win::rank_test(Pos_info & info, char best_type, rate_t * p_rank, Parameter * para) {
+ if( (best_type&3) == ((best_type>>2)&3) ) { // best_type packs the two alleles as 2-bit codes in bits 0-1 and bits 2-3
+ // HOM
+ return 1.0;
+ }
+ if( info.count_uni[best_type&3]==0 || info.count_uni[(best_type>>2)&3]==0) {
+ // HET with one allele...
+ return 0.0;
+ }
+ //cerr<<"RankSum:"<<info.pos<<endl;
+ //int * same_qual_count = new int [para->q_max-para->q_min+1];
+ //memset(same_qual_count, 0, sizeof(int)*(para->q_max-para->q_min+1));
+ //double * rank_array= new double [para->q_max-para->q_min+1];
+ //memset(rank_array, 0, sizeof(double)*(para->q_max-para->q_min+1));
+ int *same_qual_count = new int [64]; // observations per quality score, both alleles pooled; NOTE(review): the fixed size assumes q_max-q_min < 64 - confirm
+ double *rank_array = new double [64]; // rank given to every observation that has this quality score
+ memset(same_qual_count,0,sizeof(int)*64);
+ memset(rank_array,0,sizeof(double)*64);
+
+ int rank(0);
+ double T[4]={0.0, 0.0, 0.0, 0.0}; // rank sum per base
+ bool is_need[4] ={false,false,false,false}; // true for the two alleles of best_type
+ is_need[(best_type&3)]=true; is_need[((best_type>>2)&3)]=true;
+ std::string::size_type o_base, strand;
+ int q_score, coord;
+ for(o_base=0;o_base!=4;o_base++) { // pass 1: count observations per quality score
+ if(info.count_uni[o_base]==0 || !is_need[o_base]) continue;
+ for(q_score=para->q_max-para->q_min;q_score>=0;q_score--) {
+ for(coord=para->read_length-1;coord>=0;coord--) {
+ for(strand=0;strand<2;strand++) {
+ same_qual_count[q_score] += info.base_info[o_base<<15|strand<<14|q_score<<8|coord]; // base_info index: base<<15 | strand<<14 | quality<<8 | coordinate
+ //if(info.pos==1256 && info.base_info[o_base<<13|strand<<12|q_score<<6|coord]!=0) {
+ // cerr<<info.pos<<"\t"<<q_score<<"\t"<<same_qual_count[q_score]<<"\t"<<int(info.base_info[o_base<<13|strand<<12|q_score<<6|coord])<<endl;
+ //}
+ }
+ }
+ }
+ }
+ rank = 0;
+ for(q_score=0;q_score<=para->q_max-para->q_min;q_score++) { // pass 2: stop at the last quality slot that is ever read; the old bound ran one slot further behind an unsigned cast
+ rank_array[q_score]= rank+(1+same_qual_count[q_score])/2.0; // tied observations share the average of the ranks they occupy
+ rank += same_qual_count[q_score];
+ }
+ for(o_base=0;o_base!=4;o_base++) { // pass 3: sum the ranks of each allele's observations
+ if(info.count_uni[o_base]==0 || !is_need[o_base]) continue;
+ for(q_score=para->q_max-para->q_min;q_score>=0;q_score--) {
+ for(coord=para->read_length-1;coord>=0;coord--) {
+ for(strand=0;strand<2;strand++) {
+ T[o_base] += (rank_array[q_score] * info.base_info[o_base<<15|strand<<14|q_score<<8|coord]);
+ }
+ }
+ }
+ }
+ delete [] same_qual_count;
+ delete [] rank_array;
+ if (info.count_uni[best_type&3]+info.count_uni[(best_type>>2)&3]<64) { // the exact table only covers totals below 64
+ return table_test(p_rank, info.count_uni[best_type&3], info.count_uni[(best_type>>2)&3], T[best_type&3], T[(best_type>>2)&3]);
+ }
+ else {
+ return normal_test(info.count_uni[best_type&3], info.count_uni[(best_type>>2)&3],T[best_type&3], T[(best_type>>2)&3]);
+ }
+}
diff --git a/soapsnp/readme b/soapsnp/readme
new file mode 100644
index 0000000..0a3452d
--- /dev/null
+++ b/soapsnp/readme
@@ -0,0 +1,233 @@
+Program: SOAPsnp (Short Oligonucleotide Analysis Package for Single
+Nucleotide Polymorphism)
+
+Copyright (C) 2008, BGI Shenzhen.
+
+License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
+This is free software: you are free to change and redistribute it.
+There is NO WARRANTY, to the extent permitted by law.
+
+Author: BGI shenzhen
+Contact: soap at genomics.org.cn
+
+Introduction
+
+SOAPsnp is a member of the SOAP (Short Oligonucleotide Analysis
+Package). Despite its name, the program is a resequencing utility that
+can assemble consensus sequence for the genome of a newly sequenced
+individual based on the alignment of the raw sequencing reads on the
+known reference. The SNPs can then be identified on the consensus
+sequence through the comparison with the reference. In the first Asian
+genome re-sequencing project, evaluation of SOAPsnp results on Illumina
+HapMap 1M BeadChip Duo genotyping sites shows great accuracy. Over 99%
+of the genotyping sites are covered at over 99.9% consistency. Further
+PCR plus Sanger sequencing of the inconsistent SNP sites confirmed
+majority of the SOAPsnp results.
+
+SOAPsnp uses a method based on Bayes' theorem (the reverse probability
+model) to call consensus genotype by carefully considering the data
+quality, alignment, and recurring experimental errors. All these kinds
+of information are integrated into a single quality score for each base
+on the PHRED scale to measure the accuracy of consensus calling. Currently,
+it supports the alignment format of SoapMap.
+
+Download
+
+<Download link>
+
+System requirements
+
+SOAPsnp is a command line driven program written in C/C++ that
+generally runs under 64-bit Linux system. The program has been tested
+on various platforms like x86-64 Xeon with Linux kernel 2.6.9 and
+Loongson 2E/2F with Linux kernel 2.6.22. It is in principle portable to
+other architectures/systems as only standard C++ libraries were used.
+GNU Compiler Collection (version>=3.4) is recommended to compile the
+codes.
+
+The program needs about 500 MB of memory or less to run. However, its
+output can be very large and consume a lot of hard disk space. In
+text output mode, the output file may be as large as 60 times the
+genome size (e.g. 180G free space is required to run a human genome).
+In GLF output format (which is proposed by Prof. R. Durbin in Wellcome
+Trust Sanger Institute), the output file approximately requires a free
+disk space of 12 times the genome size to store.
+
+Installation
+
+1. Download the tarball of the latest SOAPsnp version from the link
+ above. (For example, SOAPsnp.tar.gz)
+2. In the Linux console:
+
+ tar zxvf /<PATH_WHERE_YOU_PUT_THE_TARBALL>/SOAPsnp.tar.gz
+ cd SOAPsnp/
+
+3. Change the 'makefile' if necessary. For example, you may want
+ to modify the compiler optimization parameters.
+4. In the Linux console:
+
+ make all
+
+ Then an executable of SOAPsnp will be generated in the directory.
+ In the Linux console, type:
+
+ ./soapsnp
+
+ or:
+
+ <Absolute path>/soapsnp
+
+ to run the program. You may copy the executable to /usr/bin/ or
+ other system paths defined in the environment variables so that you
+ can simply run the program by directly typing 'soapsnp' in the
+ console.
+
+Quick Start:
+
+For diploid genome resequencing:
+
+soapsnp -i <Alignment.soap.sort.chrN> \
+ -d <chrN.fasta> \
+ -o <chrN.consensus> \
+ -r 0.00005 \
+ -e 0.0001 \
+ -t -u \
+ -L <Maximum Read Length> \
+ -M <chrN.mat>
+
+For monoploid genome resequencing:
+
+soapsnp -i <Alignment.soap.sort.chrN> \
+ -d <chrN.fasta> \
+ -o <chrN.consensus> \
+ -r 0.0001 \
+ -t -u \
+ -L <Maximum Read Length> \
+ -M <chrN.mat> \
+ -m
+
+Usage
+
+Command line options:
+
+1. Required parameters:
+
+-i <FILE> Input SORTED SOAPaligner(soap) alignment result
+
+Note that here we say 'sorted' means alignments of each chromosome are
+sorted first by chromosome name lexicographically and then by
+coordinates on each chromosome numerically.
+
+-d <FILE> Reference DNA sequence in FASTA format
+
+-o <FILE> Output consensus file
+
+2. Optional parameters: (default in [])
+
+-z <Char> ASCII character that stands for quality score==0 [@]
+
+FASTQ files generated by Illumina base-calling pipeline use '@' as 0,
+but some institutes use '!' as 0.
+
+-g <Double> Global error dependency coefficient, 0.0(complete dependent)~1.0(complete independent)[0.9]
+
+-p <Double> PCR error dependency coefficient, 0.0(complete dependent)~1.0(complete independent)[0.5]
+Sequencing errors are found to be slightly repeatable (once an error occurs, additional errors also tend to occur) for various reasons. Therefore, observations of sequencing errors are not completely independent. The main source of repeatable errors is believed to be PCR amplification in the sequencing process. The proper values of the two parameters depend on the wet-lab process. Nonetheless, the default values generally work most of the time.
+
+-r <Double> novel altHOM prior probability [0.0005]
+
+-e <Double> novel HET prior probability [0.0010]
+The two are prior probabilities of homozygous SNPs (altHOM) and heterozygous SNPs (HET), which are used in Bayes formula calculation. Note these are prior probabilities of a new (novel) SNP. They are expected to be stringent. For different species, the two values should change if necessary.
+
+-t set transition/transversion ratio to 2:1 in prior probability
+
+-s <FILE> Pre-formatted known SNP information.
+
+The file consists of lines like this one:
+chr1 201979756 1 1 0 0.161 0 0 0.839 rs568
+The columns from left to right are:
+
+ name of chromosome,
+ coordinate on the chromosome,
+ whether the SNP has allele frequency information (1 is true, 0 is false),
+ whether the SNP is validated by experiment (1 is true, 0 is false),
+ whether the SNP is actually an indel (1 is true, 0 is false),
+ frequency of A,
+ frequency of C,
+ frequency of T,
+ frequency of G,
+ SNP id.
+
+ For known SNP sites that do not have allele frequency information,
+ the frequency information can be arbitrarily determined as any
+ positive values, which only imply what alleles have already been
+ deposited in the database.
+
+-2 specify this option will REFINE SNP calling using known SNP information [Off]
+
+-a <Double> Validated HET prior, if no allele frequency known [0.1]
+
+-b <Double> Validated altHOM prior, if no allele frequency known[0.05]
+
+-j <Double> Unvalidated HET prior, if no allele frequency known [0.02]
+
+-k <Double> Unvalidated altHOM rate, if no allele frequency known[0.01]
+The parameters are related to using external SNP information to alter prior probabilities for SNP calling. SOAPsnp will try using allele frequency information as prior probability in calling genotypes for each site. If the allele frequency information is absent, it will use the above 4 parameters as prior probability.
+
+-u Enable rank sum test (that checks whether the two alleles of a
+ possible HET call have the same sequencing quality) to give HET further
+ penalty for better accuracy. [Off]
+
+-n Enable binomial probability calculation (that checks whether the two
+ alleles are observed equally) to give HET further penalty for better
+ accuracy. [Off]
+
+-m Enable monoploid calling mode, this will ensure all consensus as HOM
+ and you probably should SPECIFY higher altHOM rate. [Off]
+
+-q Only output potential SNPs. Useful in Text output mode. [Off]
+
+-M <FILE> Output the quality calibration matrix; the matrix can be
+ reused with -I if you rerun the program
+
+-I <FILE> Input previous quality calibration matrix. It cannot be used
+ simultaneously with -M
+
+-L <short> maximum length of read [45]
+
+ Please note that the program will probably crash if the length of
+ any read exceeds this parameter.
+
+-Q <short> maximum FASTQ quality score [40]
+
+-F <int> Output format. 0: Text; 1: GLFv2; 2: GPFv2. [0]
+
+-E <String> Extra headers EXCEPT CHROMOSOME FIELD specified in GLFv2 output. Format is "TypeName1:DataName1:TypeName2:DataName2"[]
+
+-T <FILE> Only call consensus on regions specified in FILE. Format of this file is:
+ChrName\tStart\tEnd
+ChrName\tStart\tEnd
+...
+
+-h Display this help
+
+Output format
+1. Text format
+The result of SOAPsnp has 17 columns:
+1) Chromosome ID
+2) Coordinate on chromosome, start from 1
+3) Reference genotype
+4) Consensus genotype
+5) Quality score of consensus genotype
+6) Best base, average quality score of best base
+7) Count of uniquely mapped best base
+8) Count of all mapped best base
+9) Second best bases, average quality score of second best base
+10) Count of uniquely mapped second best base
+11) Count of all mapped second best base
+12) Sequencing depth of the site, rank sum test p_value
+13) Average copy number of nearby region
+14) Whether the site is a dbSNP.
+2. GLFv2 and GPFv2
+GLFv2 (Genome Likelihood Format v2) is a binary file format proposed by Prof. R. Durbin.
+
diff --git a/soapsnp/release b/soapsnp/release
new file mode 100644
index 0000000..8982e0a
--- /dev/null
+++ b/soapsnp/release
@@ -0,0 +1,9 @@
+
+release: SOAPsnp 1.01
+30-3-2009
+1. fix bugs
+
+release: SOAPsnp 1.00
+10-11-2008
+1. release the first stable version of SOAPsnp
+2. include 2 C library <cstdlib> and <cstring> for gcc-4.3
diff --git a/soapsnp/soap_snp.h b/soapsnp/soap_snp.h
new file mode 100644
index 0000000..e6a540c
--- /dev/null
+++ b/soapsnp/soap_snp.h
@@ -0,0 +1,793 @@
+#ifndef SOAP_SNP_HH_
+#define SOAP_SNP_HH_
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <cstring>
+#include <cstdlib>
+#include <map>
+#include <vector>
+#include <cmath>
+#include <iomanip>
+#include <cassert>
+#include <time.h>
+typedef unsigned long long ubit64_t;
+typedef unsigned int ubit32_t;
+typedef double rate_t;
+typedef unsigned char small_int;
+using namespace std;
+const size_t capacity = sizeof(ubit64_t)*8/4;
+const char abbv[17]={'A','M','W','R','M','C','Y','S','W','Y','T','K','R','S','K','G','N'};
+const ubit64_t glf_base_code[8]={1,2,8,4,15,15,15,15}; // A C T G
+const ubit64_t glf_type_code[10]={0,5,15,10,1,3,2,7,6,11};// AA,CC,GG,TT,AC,AG,AT,CG,CT,GT
+
+
+
+// Bundle of every input and output file stream used by the program
+class Files {
+public:
+ ifstream soap_result, ref_seq, dbsnp, region; // input streams
+ ofstream consensus, summary; // output streams
+ fstream matrix_file; // fstream, i.e. usable for reading or writing
+ Files(){ // NOTE(review): close() on a stream that was never opened sets failbit per the C++ standard - confirm callers open() the stream before testing its state
+ soap_result.close();
+ ref_seq.close();
+ dbsnp.close();
+ consensus.close();
+ summary.close();
+ matrix_file.close();
+ region.close();
+ };
+};
+// Alignment record formats; Parameter::format defaults to SOAP_FORMAT
+typedef enum {
+ SOAP_FORMAT = 1, // numbering starts at 1, so 0 is not a valid format
+ BOWTIE_FORMAT, // 2
+ CROSSBOW_FORMAT // 3
+} alignment_format;
+// Run-time options of the program; the constructor sets the default of every option
+class Parameter {
+public:
+ char q_min; // The char stands for 0 in fastq
+ char q_max; // max quality score
+ small_int read_length; // max read length
+ bool is_monoploid; // Is it an monoploid? chrX,Y,M in man.
+ bool is_snp_only; // Only output possible SNP sites?
+ bool refine_mode; // Refine prior probability using dbSNP
+ bool rank_sum_mode; // Use rank sum test to refine HET quality
+ bool binom_mode; // Use binomial test to refine HET quality
+ bool transition_dominant; // Consider transition/transversion ratio?
+ int glf_format; // Generate Output in GLF format
+ bool region_only; // Only report consensus in specified region
+ std::string glf_header; // Header of GLF format
+ rate_t althom_novel_r, het_novel_r; // Expected novel prior
+ rate_t althom_val_r, het_val_r; // Expected Validated dbSNP prior
+ rate_t althom_unval_r, het_unval_r; // Expected Unvalidated dbSNP prior
+ rate_t global_dependency, pcr_dependency; // Error dependencies, 1 is NO dependency
+ alignment_format format; // Which alignment record format is in use
+ bool do_recal, verbose, dump_dbsnp_evidence;
+ bool hadoop_out; // When set, main writes "reporter:counter:" lines to cerr
+// Default constructor
+ Parameter(){
+ q_min = 64; // ASCII 64 is '@'
+ q_max = 64+40; // q_min plus a maximum quality score of 40
+ read_length = 45;
+ is_monoploid = is_snp_only = refine_mode = rank_sum_mode = binom_mode = transition_dominant = region_only =false;
+ glf_format = 0;
+ glf_header = "";
+ althom_novel_r=0.0005, het_novel_r=0.0010;
+ althom_val_r=0.05, het_val_r=0.10;
+ althom_unval_r=0.01, het_unval_r=0.02;
+ global_dependency= log10(0.9), pcr_dependency= log10(0.5); // In Log10 Scale
+ format = SOAP_FORMAT;
+ do_recal = true;
+ verbose = false;
+ hadoop_out = false;
+ dump_dbsnp_evidence = false;
+ };
+};
+
+extern unsigned long alignments_read;
+extern unsigned long alignments_read_unique;
+extern unsigned long alignments_read_unpaired;
+extern unsigned long alignments_read_paired;
+// One alignment record in Crossbow's column order; exposes the same accessors as Soap_format
+class Crossbow_format {
+ // Crossbow alignment result
+ std::string read_id, read, qual, chr_name, mms;
+ int part, read_len, position, hit;
+ unsigned mate; // 0 is counted as unpaired, any value > 0 as paired
+ char strand;
+public:
+ Crossbow_format() { }
+ friend std::istringstream & operator>>(std::istringstream & alignment, Crossbow_format & bowf) { // parses one record and updates the global alignment counters
+ alignment >> bowf.chr_name
+ >> bowf.part
+ >> bowf.position
+ >> bowf.strand
+ >> bowf.read
+ >> bowf.qual
+ >> bowf.hit
+ >> bowf.mms
+ >> bowf.mate
+ >> bowf.read_id;
+ bowf.read_len = bowf.read.length(); // infer
+ bowf.hit++; // stored value is the input value plus one, so an input of 0 counts as unique
+ alignments_read++;
+ if(bowf.hit == 1) alignments_read_unique++;
+ if(bowf.mate == 0) alignments_read_unpaired++;
+ if(bowf.mate > 0) alignments_read_paired++;
+ return alignment;
+ }
+ friend std::ostream & operator<<(std::ostream & o, Crossbow_format & bowf) { // prints the record in the column order of Soap_format's operator<<
+ o << bowf.read_id << '\t'
+ << bowf.read << '\t'
+ << bowf.qual << '\t'
+ << bowf.hit << '\t'
+ << (bowf.mate < 3 ? "aab"[bowf.mate] : '?') << '\t' // mate 0 and 1 print 'a', 2 prints 'b', anything else '?'
+ << bowf.read_len << '\t'
+ << bowf.strand << '\t'
+ << bowf.chr_name << '\t'
+ << bowf.position << '\t'
+ << "0"; // constant in place of the SOAP mismatch column
+ return o;
+ }
+ char get_base(std::string::size_type coord) { // no bounds check on coord
+ return read[coord];
+ }
+ char get_qual(std::string::size_type coord) { // no bounds check on coord
+ return qual[coord];
+ }
+ bool is_fwd() {
+ return (strand=='+');
+ }
+ int get_read_len() {
+ return read_len;
+ }
+ inline int get_pos() {
+ return position;
+ }
+ std::string get_chr_name() {
+ return chr_name;
+ }
+ int get_hit() {
+ return hit;
+ }
+ bool is_unique() { // unique means exactly one hit after the increment in operator>>
+ return (hit==1);
+ }
+ bool is_N(int coord) {
+ return (read[coord] == 'N');
+ }
+ unsigned get_mate() const { return mate; }
+};
+
+/**
+ * Note that SOAPsnp does not read reference information from the
+ * alignment; it gets all reference information from the Genome
+ * structure.
+ */
+class Soap_format {
+ // Soap alignment result
+ std::string read_id, read, qual, chr_name;
+ int hit, read_len, position, mismatch;
+ char ab, strand;
+ unsigned mate; // always set to 0 by operator>>
+ // 'ab' is not used in consensus/SNP calling, just for printing out
+ // the alignment
+public:
+ Soap_format(){;};
+ friend std::istringstream & operator>>(std::istringstream & alignment, Soap_format & soap) { // parses one record and rewrites read/qual when the mismatch field encodes an indel
+ alignment >> soap.read_id
+ >> soap.read
+ >> soap.qual
+ >> soap.hit // # alignments w/ same # mms
+ >> soap.ab // whether it's mate a/b
+ >> soap.read_len
+ >> soap.strand
+ >> soap.chr_name
+ >> soap.position
+ >> soap.mismatch; // integer mismatch field; values above 100 or 200 encode an indel, handled below
+ if(soap.mismatch > 200) {
+ // Refine the read so that the read contains an insertion
+ // w/r/t reference (the code pads read and qual with mismatch-200 'N' characters at indel_pos)
+ int indel_pos,indel_len;
+ string temp("");
+ alignment >> indel_pos;
+ indel_len = soap.mismatch-200;
+ for(int i = 0; i != indel_len; i++) {
+ temp = temp+'N';
+ }
+ soap.read = soap.read.substr(0,indel_pos)+temp+soap.read.substr(indel_pos,soap.read_len-indel_pos);
+ soap.qual = soap.qual.substr(0,indel_pos)+temp+soap.qual.substr(indel_pos,soap.read_len-indel_pos);
+ }
+ else if (soap.mismatch > 100) {
+ // Refine the read so that the read contains an deletion
+ // w/r/t reference (the code removes mismatch-100 characters from read and qual at indel_pos)
+ int indel_pos,indel_len;
+ alignment >> indel_pos;
+ indel_len = soap.mismatch-100;
+ soap.read = soap.read.substr(0,indel_pos) + soap.read.substr(indel_pos+indel_len, soap.read_len-indel_pos-indel_len);
+ soap.qual = soap.qual.substr(0,indel_pos) + soap.qual.substr(indel_pos+indel_len, soap.read_len-indel_pos-indel_len);
+ }
+ soap.position -= 1; // stored position is one less than the value in the file
+ soap.mate = 0;
+ return alignment;
+ }
+ friend std::ostream & operator<<(std::ostream & o, Soap_format & soap) { // prints the ten fields tab separated, with the stored (decremented) position
+ o<<soap.read_id<<'\t'<<soap.read<<'\t'<<soap.qual<<'\t'<<soap.hit<<'\t'<<soap.ab<<'\t'<<soap.read_len<<'\t'<<soap.strand<<'\t'<<soap.chr_name<<'\t'<<soap.position<<'\t'<<soap.mismatch;
+ return o;
+ }
+ char get_base(std::string::size_type coord) { // no bounds check on coord
+ return read[coord];
+ }
+ char get_qual(std::string::size_type coord) { // no bounds check on coord
+ return qual[coord];
+ }
+ bool is_fwd(){
+ return (strand=='+');
+ }
+ int get_read_len(){ // value parsed from the file; read/qual may differ in length after indel rewriting
+ return read_len;
+ }
+ inline int get_pos(){
+ return position;
+ }
+ std::string get_chr_name(){
+ return chr_name;
+ }
+ int get_hit(){
+ return hit;
+ }
+ bool is_unique(){
+ return (hit==1);
+ }
+ bool is_N(int coord) {
+ return (read[coord] == 'N');
+ }
+ unsigned get_mate() const { return mate; }
+};
+
+// dbSNP information for one site; owns a 4-element frequency array that is deep-copied on copy construction and assignment
+class Snp_info {
+ bool validated;
+ bool hapmap_site;
+ bool indel_site;
+ rate_t * freq; // elements record frequency of ACTG
+ string name;
+public:
+ Snp_info(){ // all flags false, all four frequencies zero
+ validated=hapmap_site=indel_site=false;
+ freq = new rate_t [4];
+ memset(freq,0,sizeof(rate_t)*4);
+ }
+ Snp_info(const Snp_info & other) {
+ validated = other.validated;
+ hapmap_site = other.hapmap_site;
+ indel_site = other.indel_site;
+ name = other.name; // the SNP id used to be dropped by the copy constructor
+ freq = new rate_t [4];
+ memcpy(freq, other.freq, sizeof(rate_t)*4);
+ }
+ ~Snp_info(){
+ delete [] freq;
+ }
+ /**
+ * Here's where the SNP format is defined (beyond the first two
+ * fields, which hold the chromosome name and offset).
+ */
+ friend std::istringstream& operator>>(std::istringstream & s,
+ Snp_info & snp_form)
+ {
+ s >> snp_form.hapmap_site
+ >> snp_form.validated
+ >> snp_form.indel_site
+ >> snp_form.freq[0] // A
+ >> snp_form.freq[1] // C
+ >> snp_form.freq[2] // T
+ >> snp_form.freq[3] // G
+ >> snp_form.name;
+ return s;
+ }
+ Snp_info & operator=(const Snp_info& other) { // const reference also accepts const and temporary sources
+ if(this == &other) return *this; // self-assignment: nothing to copy
+ this->validated = other.validated;
+ this->hapmap_site = other.hapmap_site;
+ this->indel_site = other.indel_site;
+ this->name = other.name;
+ memcpy(this->freq, other.freq, sizeof(rate_t)*4); // every instance already owns a 4-element buffer, so copy into it instead of leaking it with a fresh allocation
+ return *this;
+ }
+ bool is_validated(){
+ return validated;
+ }
+ bool is_hapmap(){
+ return hapmap_site;
+ }
+ bool is_indel(){
+ return indel_site;
+ }
+ rate_t get_freq(char bin_base_2bit) { // bin_base_2bit must be 0..3 (A, C, T, G); not range checked
+ return freq[bin_base_2bit];
+ }
+ const string& get_name() {
+ return name;
+ }
+};
+
+// Chromosome(Reference) information
+class Chr_info {
+ ubit32_t len;
+ ubit32_t elts;
+ ubit64_t* bin_seq; // Sequence in binary format
+ bool bin_seq_is_mm; // bin_seq array is memory-mapped?
+ // region_mask is initialized lazily, only if the user specifies -T.
+ // The Parameter.region_only flag will be set iff region_mask is
+ // initialized.
+ ubit64_t* region_mask;
+ // 4bits for one base: 1 bit dbSNPstatus, 1bit for N, followed two bit of base A: 00, C: 01, T: 10, G:11,
+ // Every ubit64_t could store 16 bases
+ map<ubit64_t, Snp_info*> dbsnp;
+ vector<pair<int, int> > regions;
+public:
+ Chr_info(){
+ bin_seq_is_mm = false;
+ len = 0;
+ elts = 0;
+ bin_seq = NULL;
+ region_mask = NULL;
+ regions.clear();
+ };
+ Chr_info(const Chr_info & other);
+ ~Chr_info(){
+ if(!bin_seq_is_mm) { // a memory-mapped sequence was not allocated with new[], so it is not deleted here
+ delete [] bin_seq;
+ }
+ delete [] region_mask;
+ }
+ ubit32_t length() {
+ return len;
+ }
+ ubit64_t get_bin_base(std::string::size_type pos) { // capacity (16) bases per word, 4 bits each, lowest bits first
+ return (bin_seq[pos/capacity]>>(pos%capacity*4))&0xF; // All 4 bits
+ }
+ int binarize(std::string & seq);
+ void dump_binarized(std::string fn);
+ int insert_snp(std::string::size_type pos, Snp_info & new_snp, bool quiet);
+ int region_mask_ini();
+ bool is_in_region(std::string::size_type pos) { // without a mask every position is in region
+ if(region_mask == NULL) return true;
+ return (region_mask[pos/64]>>(63-pos%64))&1; // one bit per position, highest bit of each word first
+ }
+ int set_region(int start, int end);
+ /**
+ * The only place this is called is in Call_win::call_cns when it
+ * passes the result to snp_p_prior_gen in order to generate a
+ * prior probability for each diploid genotype. Returns NULL when no SNP is stored at pos.
+ */
+ Snp_info * find_snp(ubit64_t pos) {
+ map<ubit64_t, Snp_info*>::iterator hit = dbsnp.find(pos); return hit==dbsnp.end() ? NULL : hit->second; // never dereference end() for an unknown position
+ }
+ ubit64_t * get_region() {
+ return region_mask;
+ }
+ ubit64_t * get_bin_seq() {
+ return bin_seq;
+ }
+ ubit32_t get_elts() {
+ return elts;
+ }
+ const std::vector<pair<int, int> >& get_regions() {
+ return regions;
+ }
+};
+
+typedef std::string Chr_name;
+
+ // Collection of reference chromosomes, looked up by name.
+ class Genome {
+ public:
+ // Chromosome name -> chromosome record.  The values are raw pointers;
+ // presumably they are owned by Genome and released in ~Genome, which
+ // is defined outside this header -- TODO confirm.
+ map<Chr_name, Chr_info*> chromosomes;
+
+ // Construct from a FASTA stream and a known-SNP stream; quiet is a
+ // verbosity switch.  Defined outside this header.
+ Genome(ifstream & fasta, ifstream & known_snp, bool quiet);
+ ~Genome();
+
+ /// Add a new chromosome to the map
+ bool add_chr(Chr_name &);
+
+ /// Read in and parse a region file
+ int read_region(std::ifstream & region, Parameter * para);
+ };
+
+ // Probability tables used by the SNP caller.  Only matrix_gen is
+ // defined in this header; the constructor, destructor and the other
+ // members are defined elsewhere.
+ class Prob_matrix {
+ public:
+ rate_t *p_matrix, *p_prior; // Calibration matrix and prior probabilities
+ rate_t *base_freq, *type_likely, *type_prob; // Estimated base frequency, conditional probability, and posterior probability
+ rate_t *p_rank, *p_binom; // Ranksum test and binomial test on HETs
+ Prob_matrix();
+ ~Prob_matrix();
+ // Fill p_matrix from an alignment stream parsed as records of type T.
+ template<typename T> int matrix_gen(std::ifstream & alignment, Parameter * para, Genome * genome);
+ int matrix_read(std::fstream & mat_in, Parameter * para);
+ int matrix_write(std::fstream & mat_out, Parameter * para);
+ int prior_gen(Parameter * para);
+ int rank_table_gen();
+
+ };
+
+ /**
+  * Fill p_matrix, the calibration matrix of observation probabilities.
+  *
+  * Phase 1 (only when para->do_recal is set): every line of alignment
+  * that parses as a T and has a non-negative position is examined; for
+  * reads with is_unique() true, each read base that is not N and whose
+  * reference code has neither of the two upper bits set (ref&12 == 0)
+  * is tallied in count_matrix.  The count_matrix index is
+  *   (quality << 12) | (read cycle << 4) | (reference base << 2) | observed base
+  * where the cycle of a reverse-strand read is counted from the other
+  * end of the read.  The table has 256*256*4*4 entries, i.e. room for
+  * 8 bits of quality and 8 bits of cycle; nothing here checks that the
+  * read length stays below 256 -- NOTE(review): confirm callers
+  * guarantee that.
+  *
+  * Phase 2: for every quality char in [para->q_min, para->q_max] and
+  * every cycle below para->read_length, each (reference base, observed
+  * base) cell of p_matrix is set from the counts, using progressively
+  * coarser pooling when a count is not above sta_pow, and finally a
+  * value derived from the quality score when the result is exactly 0
+  * or 1.  p_matrix is indexed like count_matrix but with
+  * (quality char - q_min) in place of the raw quality char.
+  *
+  * The process exits with status 255 when a read names a chromosome
+  * that is not in genome or extends past the end of its reference.
+  * Always returns 1.
+  */
+ template<typename T>
+ int Prob_matrix::matrix_gen(std::ifstream & alignment, Parameter * para, Genome * genome) {
+ // Read Alignment files
+ T soap;
+ ubit64_t * count_matrix = new ubit64_t [256*256*4*4];
+ memset(count_matrix, 0, sizeof(ubit64_t)*256*256*4*4);
+ map<Chr_name, Chr_info*>::iterator current_chr;
+ current_chr = genome->chromosomes.end();
+ ubit64_t ref(0);
+ std::string::size_type coord;
+ if(para->do_recal) {
+ // For each alignment
+ for(std::string line; getline(alignment, line);) {
+ std::istringstream s(line);
+ // Parse the alignment; lines that fail to parse are skipped silently
+ if(s >> soap) {
+ if(soap.get_pos() < 0) {
+ continue;
+ }
+ // In the overloaded "+" above, soap.position will be subtracted by 1 so that coordinates start from 0
+ if (current_chr == genome->chromosomes.end() || current_chr->first != soap.get_chr_name()) {
+ current_chr = genome->chromosomes.find(soap.get_chr_name());
+ if(current_chr == genome->chromosomes.end()) {
+ // Unknown chromosome: list the known names, then abort
+ for(map<Chr_name, Chr_info*>::iterator test = genome->chromosomes.begin();test != genome->chromosomes.end();test++) {
+ cerr<<'!'<<(test->first)<<'!'<<endl;
+ }
+ cerr<<"Assertion Failed: Chromosome: !"<<soap.get_chr_name()<<"! NOT found"<<endl;
+ exit(255);
+ }
+ }
+ else {
+ ;
+ }
+ if (soap.is_unique()) {
+ for(coord = 0; coord != soap.get_read_len(); coord++) {
+ if (soap.is_N(coord)) {
+ ;
+ }
+ else {
+ if(! (soap.get_pos()+coord<current_chr->second->length())) {
+ cerr<<soap<<endl;
+ cerr<<"The program found the above read has exceed the reference length:\n";
+ cerr<<"The read is aligned to postion: "<<soap.get_pos()<<" with read length: "<<soap.get_read_len()<<endl;
+ cerr<<"Reference: "<<current_chr->first<<" FASTA Length: "<<current_chr->second->length()<<endl;
+ exit(255);
+ }
+ ref = current_chr->second->get_bin_base(soap.get_pos()+coord);
+ if ( (ref&12) !=0 ) {
+ // This is an N on reference or a dbSNP which should be excluded from calibration
+ ;
+ }
+ else {
+ // '&' binds tighter than '|', so the trailing "(...>>1)&3" is the 2-bit observed base
+ if(soap.is_fwd()) {
+ // forward strand
+ count_matrix[(((ubit64_t)soap.get_qual(coord))<<12) | (coord<<4) | ((ref&0x3)<<2) | (soap.get_base(coord)>>1)&3] += 1;
+ }
+ else {
+ // reverse strand
+ count_matrix[(((ubit64_t)soap.get_qual(coord))<<12) | ((soap.get_read_len()-1-coord)<<4) | ((ref&0x3)<<2) | (soap.get_base(coord)>>1)&3] += 1;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ ubit64_t o_base/*observed base*/, t_base/*theoretical (supposed) base*/, type, sum[4], same_qual_count_by_type[16], same_qual_count_by_t_base[4], same_qual_count_total, same_qual_count_mismatch;
+ char q_char/*fastq quality char*/;
+
+ const ubit64_t sta_pow=10; // minimum number to say statistically powerful
+ for(q_char=para->q_min; q_char<=para->q_max ;q_char++) {
+ // Pool the counts of this quality over all read cycles
+ memset(same_qual_count_by_type, 0, sizeof(ubit64_t)*16);
+ memset(same_qual_count_by_t_base, 0, sizeof(ubit64_t)*4);
+ same_qual_count_total = 0;
+ same_qual_count_mismatch = 0;
+ for(coord=0; coord != para->read_length ; coord++) {
+ for(type=0;type!=16;type++) {
+ // If the sample is small, then we will not consider the effect of read cycle.
+ same_qual_count_by_type[type] += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
+ same_qual_count_by_t_base[(type>>2)&3] += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
+ same_qual_count_total += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
+ // type = (ref<<2)|observed; the multiples of 5 (0, 5, 10, 15) are the cells where both are equal
+ if(type % 5 != 0) {
+ // Mismatches
+ same_qual_count_mismatch += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
+ }
+ }
+ }
+ for(coord=0; coord != para->read_length ; coord++) {
+ memset(sum, (ubit64_t)0, sizeof(ubit64_t)*4);
+ // Count of all ref base at certain coord and quality
+ for(type=0;type!=16;type++) {
+ sum[(type>>2)&3] += count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | type]; // (type>>2)&3: the ref base
+ }
+ for(t_base=0; t_base!=4; t_base++) {
+ for(o_base=0; o_base!=4; o_base++) {
+ if (count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | (t_base<<2) | o_base] > sta_pow) {
+ // Statistically powerful
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | (t_base<<2) | o_base]) / sum[t_base];
+ }
+ else if (same_qual_count_by_type[t_base<<2|o_base] > sta_pow) {
+ // Smaller sample, given up effect from read cycle
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)same_qual_count_by_type[t_base<<2|o_base]) / same_qual_count_by_t_base[t_base];
+ }
+ else if (same_qual_count_total > 0){
+ // Too small sample, given up effect of mismatch types
+ if (o_base == t_base) {
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)(same_qual_count_total-same_qual_count_mismatch))/same_qual_count_total;
+ }
+ else {
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)same_qual_count_mismatch)/same_qual_count_total;
+ }
+ }
+
+ // For these cases like:
+ // Ref: G o_base: G x10 Ax5. When calculate the probability of this allele to be A,
+ // If there's no A in reference gives observation of G, then the probability will be zero,
+ // And therefore exclude the possibility of this pos to have an A
+ // These cases should be avoid when the dataset is large enough
+ // If no base with certain quality is observed, it also doesn't matter
+ // The replacement below is computed from the quality score (q_char - q_min), clamped at 0.25
+ if( (p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base]==0) || p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] ==1) {
+ if (o_base == t_base) {
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = (1-pow(10, -((q_char-para->q_min)/10.0)));
+ if(p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base]<0.25) {
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = 0.25;
+ }
+ }
+ else {
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = (pow(10, -((q_char-para->q_min)/10.0))/3);
+ if(p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base]>0.25) {
+ p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = 0.25;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ delete [] count_matrix;
+
+ // Note: from now on, the first 8 bit of p_matrix is its quality score, not the FASTQ char
+ return 1;
+ }
+
+ // Per-position accumulator used while piling up reads in a calling
+ // window.
+ struct Pos_info {
+     unsigned char ori;
+     // One counter per (base, strand, quality, read cycle) combination.
+     small_int base_info[4*2*64*256];
+ #ifdef FAST_BOUNDS
+     small_int coordmin, coordmax;
+     char qmin, qmax;
+ #endif
+     int pos, depth, dep_uni, repeat_time;
+     int dep_pair, dep_uni_pair;
+     int count_uni[4];
+     int q_sum[4];
+     int count_all[4];
+
+     // Start out as an unused site: ori is 0xFF, pos is -1 and every
+     // counter is zero.
+     Pos_info(){
+         ori = 0xFF;
+         pos = -1;
+         depth = dep_uni = repeat_time = 0;
+         dep_pair = dep_uni_pair = 0;
+ #ifdef FAST_BOUNDS
+         coordmin = coordmax = 0;
+         qmin = qmax = 0;
+ #endif
+         memset(base_info, 0, sizeof(base_info));
+         memset(count_uni, 0, sizeof(count_uni));
+         memset(q_sum, 0, sizeof(q_sum));
+         memset(count_all, 0, sizeof(count_all));
+     }
+
+     // Zero-fill num consecutive Pos_info objects starting at p.  Unlike
+     // the constructor this leaves ori and pos at 0 rather than 0xFF
+     // and -1.
+     static void clear(Pos_info* p, int num) {
+         memset(static_cast<void*>(p), 0, sizeof(Pos_info) * num);
+     }
+ };
+
+ // Sliding window of Pos_info records over which consensus and SNP calls
+ // are made.  Only soap2cns is defined in this header.
+ class Call_win {
+ public:
+ ubit64_t win_size;
+ ubit64_t read_len;
+ // Window buffer of win_size + read_len entries (see the constructor).
+ // NOTE(review): each Pos_info embeds a 4*2*64*256-element base_info
+ // array, so it is far larger than the "50 bytes or so" an earlier
+ // comment claimed.
+ Pos_info * sites;
+ // Allocate room for one window plus one read length of overhang.
+ Call_win(ubit64_t read_length, ubit64_t window_size=1000) {
+ sites = new Pos_info [window_size+read_length];
+ win_size = window_size;
+ read_len = read_length;
+ }
+ ~Call_win(){
+ delete [] sites;
+ }
+
+ int initialize(ubit64_t start);
+ int recycle(int start = -1);
+ int call_cns(Chr_name call_name, Chr_info* call_chr, ubit64_t call_length, Prob_matrix * mat, Parameter * para, std::ofstream & consensus);
+ template<typename T> int soap2cns(std::ifstream & alignment, std::ofstream & consensus, Genome * genome, Prob_matrix * mat, Parameter * para);
+ int snp_p_prior_gen(double * real_p_prior, Snp_info* snp, Parameter * para, char ref);
+ double rank_test(Pos_info & info, char best_type, rate_t * p_rank, Parameter * para);
+ double normal_value(double z);
+ double normal_test(int n1, int n2, double T1, double T2);
+ double table_test(rate_t *p_rank, int n1, int n2, double T1, double T2);
+ };
+
+/**
+ * Loop over SNP-calling windows.
+ */
+template<typename T>
+int Call_win::soap2cns(std::ifstream & alignment, std::ofstream & consensus, Genome * genome, Prob_matrix * mat, Parameter * para) {
+ T soap;
+ map<Chr_name, Chr_info*>::iterator current_chr, prev_chr;
+ current_chr = prev_chr = genome->chromosomes.end();
+ int coord, sub;
+ int last_start(0);
+ int aln = 0;
+ for(std::string line; getline(alignment, line);) {
+ std::istringstream s(line);
+ if(s >> soap) {
+ aln++;
+ if(para->verbose) {
+ clog << "Processing alignment " << aln << endl;
+ }
+ if(soap.get_pos() < 0) {
+ continue;
+ }
+ if (current_chr == genome->chromosomes.end() ||
+ current_chr->first != soap.get_chr_name())
+ {
+ // Moved on to a new Chromosome
+ if(current_chr != genome->chromosomes.end()) {
+ // This it not the first chromosome, so we ha
+ while(current_chr->second->length() > sites[win_size-1].pos) {
+ call_cns(current_chr->first, current_chr->second, win_size, mat, para, consensus);
+ recycle();
+ last_start = sites[win_size-1].pos;
+ }
+ call_cns(current_chr->first, current_chr->second, current_chr->second->length()%win_size, mat, para, consensus);
+ recycle();
+ }
+ // Get the chromosome info corresponding to the next
+ // chunk of alignments
+ current_chr = genome->chromosomes.find(soap.get_chr_name());
+ initialize(0);
+ if(para->verbose) {
+ clog << "Returned from initialize(0) for chromosome " << current_chr->first << endl;
+ }
+ last_start = 0;
+ if(para->glf_format) {
+ cerr << "Processing " << current_chr->first << endl;
+ int temp_int(current_chr->first.size()+1);
+ consensus.write(reinterpret_cast<char *> (&temp_int), sizeof(temp_int));
+ consensus.write(current_chr->first.c_str(), current_chr->first.size()+1);
+ temp_int = current_chr->second->length();
+ consensus.write(reinterpret_cast<char *> (&temp_int), sizeof(temp_int));
+ consensus<<flush;
+ if (!consensus.good()) {
+ cerr<<"Broken IO stream after writing chromosome info."<<endl;
+ exit(255);
+ }
+ assert(consensus.good());
+ }
+ }
+ else {
+ ;
+ }
+ Chr_info *chr = current_chr->second;
+ if(para->region_only && !chr->is_in_region(soap.get_pos())) {
+ continue;
+ }
+ if(soap.get_pos() < last_start) {
+ cerr << "Errors in sorting:" << soap.get_pos() << "<" << last_start << endl;
+ exit(255);
+ }
+ // Call the previous window
+ int aln_win = soap.get_pos() / win_size;
+ int last_aln_win = last_start / win_size;
+ if (aln_win > last_aln_win) {
+ // We should call the base here
+ call_cns(current_chr->first, current_chr->second,
+ win_size, mat, para, consensus);
+ if(aln_win > last_aln_win+1) {
+ recycle(aln_win * win_size);
+ } else {
+ recycle();
+ }
+ last_start = sites[win_size-1].pos;
+ if((last_start + 1) / win_size == 1000) {
+ cerr << "Called " << last_start;
+ }
+ }
+ last_start = soap.get_pos();
+ // Commit the read information
+ for(coord = 0; coord < soap.get_read_len(); coord++) {
+ const int pos = soap.get_pos() + coord;
+ if(!chr->is_in_region(pos)) {
+ continue;
+ }
+ if(pos / win_size == soap.get_pos() / win_size ) {
+ // In the same sliding window
+ sub = pos % win_size;
+ }
+ else {
+ sub = pos % win_size + win_size; // Use the tail to store the info so that it won't intervene the uncalled bases
+ }
+ sites[sub].depth += 1;
+ if(soap.get_mate() > 0) sites[sub].dep_pair += 1;
+ sites[sub].repeat_time += soap.get_hit();
+ if((soap.is_N(coord)) ||
+ soap.get_qual(coord) < para->q_min ||
+ sites[sub].dep_uni >= 0xFF)
+ {
+ // An N, low quality or meaningless huge depth
+ continue;
+ }
+ if(soap.get_hit() == 1) {
+ sites[sub].dep_uni += 1;
+ if(soap.get_mate() > 0) sites[sub].dep_uni_pair += 1;
+ int rcoord = coord;
+ // Update the covering info: 4x2x64x64 matrix, base x strand x q_score x read_pos, 2-1-6-6 bits for each
+ if(soap.is_fwd()) {
+ // Binary strand: 0 for plus and 1 for minus
+ sites[sub].base_info[(((ubit64_t)(soap.get_base(coord)&0x6)|0))<<14 | ((ubit64_t)(soap.get_qual(coord)-para->q_min))<<8 | coord ] += 1;
+ } else {
+ rcoord = (soap.get_read_len()-1-coord);
+ sites[sub].base_info[(((ubit64_t)(soap.get_base(coord)&0x6)|1))<<14 | ((ubit64_t)(soap.get_qual(coord)-para->q_min))<<8 | rcoord ] += 1;
+ }
+#ifdef FAST_BOUNDS
+ char qu = soap.get_qual(coord) - para->q_min;
+ if(qu+1 > sites[sub].qmax || sites[sub].qmax == 0) sites[sub].qmax = qu+1;
+ if(qu+1 < sites[sub].qmin || sites[sub].qmin == 0) sites[sub].qmin = qu+1;
+ if(rcoord+1 > sites[sub].coordmax || sites[sub].coordmax == 0) sites[sub].coordmax = rcoord+1;
+ if(rcoord+1 < sites[sub].coordmin || sites[sub].coordmin == 0) sites[sub].coordmin = rcoord+1;
+#endif
+ // Update # of unique alignments having the given
+ // unambiguous base
+ sites[sub].count_uni[(soap.get_base(coord)>>1)&3] += 1;
+ // Update sum-of-Phreds
+ sites[sub].q_sum[(soap.get_base(coord)>>1)&3] += (soap.get_qual(coord)-para->q_min);
+ }
+ // Update # of alignments having the given unambiguous base
+ sites[sub].count_all[(soap.get_base(coord)>>1)&3] += 1;
+ }
+ }
+ } // end loop over alignments
+ if(aln == 0) {
+ cerr << "Error: did not read any alignments" << endl;
+ exit(1);
+ }
+ while(current_chr->second->length() > sites[win_size-1].pos) {
+ int ret = call_cns(current_chr->first, current_chr->second,
+ win_size, mat, para, consensus);
+ recycle();
+ last_start = sites[win_size-1].pos;
+ if(ret == -2) break;
+ }
+ call_cns(current_chr->first, current_chr->second,
+ current_chr->second->length() % win_size,
+ mat, para, consensus);
+ alignment.close();
+ consensus.close();
+ return 1;
+}
+
+ // Write the current local time to clog as HH:MM:SS (no newline).
+ static inline void logTime() {
+     time_t now = time(NULL);
+     const struct tm *current = localtime(&now);
+     // setfill is sticky, so setting it once covers all three fields;
+     // setw has to be repeated because it resets after each insertion.
+     clog << setfill('0')
+          << setw(2) << current->tm_hour << ":"
+          << setw(2) << current->tm_min << ":"
+          << setw(2) << current->tm_sec;
+ }
+
+#endif /*SOAP_SNP_HH_*/
diff --git a/util/build_soapsnp.sh b/util/build_soapsnp.sh
new file mode 100755
index 0000000..4d0897d
--- /dev/null
+++ b/util/build_soapsnp.sh
@@ -0,0 +1,174 @@
+#!/bin/sh
+
+##
+# build_crossbow_jar
+#
+# Author: Ben Langmead
+# Date: June 1, 2009
+#
+ # Build the bowtie/soapsnp binaries for Linux (and possibly Mac) and
+ # deposit them in the .bin directory.
+#
+# FIXME:
+# 1. Directories are hardcoded
+# 2. Assumes local machine is mac and remote is linux
+#
+
+usage() {
+cat > /dev/stdout <<EOF
+Usage: build_crossbow_jar [-m] [-u <URL>]
+
+ -b........do bowtie too
+ -m compile Mac versions of binaries first
+ -u <URL> compile sources from given URL instead of SVN
+ -h show usage message
+
+EOF
+}
+
+usagedie() {
+ usage ; exit 1
+}
+
+ # Command-line handling.
+ #
+ # The option string used to be "u:mh": -b was missing, so asking for a
+ # Bowtie build was rejected as an unknown option, and the argument of
+ # -u was parsed but never stored.  -b now sets DO_BOWTIE and -u sets
+ # SRC_URL (an SRC_URL inherited from the environment is still honoured
+ # when -u is not given).
+ DO_BOWTIE=0
+
+ while getopts bu:mh OPT; do
+     case "$OPT" in
+     b)  DO_BOWTIE=1
+         ;;
+     u)  SRC_URL="$OPTARG"
+         ;;
+     m)  # Accepted as before; nothing in this script reads it.
+         :
+         ;;
+     h)  usage ; exit 0
+         ;;
+     \?) # getopts issues an error message
+         usagedie
+         ;;
+     esac
+ done
+
+linux_host=privet.umiacs.umd.edu
+user=langmead
+
+# Ensure we're in the Crossbow checkout dir
+if [ ! -f util/build_soapsnp.sh ] ; then
+ echo Must run in crossbow checkout directory
+ exit 1
+fi
+
+# Bulldoze old .bin directory
+rm -rf .bin
+mkdir -p .bin
+
+# Bulldoze old .build directory
+rm -rf .build
+mkdir -p .build
+
+cd .build
+
+# SOAPsnp source always comes from svn
+svn co https://bowtie-bio.svn.sourceforge.net/svnroot/bowtie-bio/crossbow
+mv crossbow/soapsnp soapsnp
+rm -rf crossbow
+
+if [ $DO_BOWTIE -ne 0 ] ; then
+ # Bowtie source can come from CVS or from a URL
+ if [ -z "$SRC_URL" ] ; then
+ export CVS_RSH=ssh
+ cvs -d :ext:${user}@${linux_host}:/fs/szdevel/src/cvsroot co bowtie
+ else
+ wget --no-check-certificate $SRC_URL
+ unzip *.zip
+ rm -f *.zip
+ mv bowtie* bowtie
+ fi
+ #if ! make -C bowtie BITS=32 bowtie bowtie-debug ; then
+ # echo "Error bulding bowtie 32"
+ # exit 1
+ #fi
+fi
+
+#if ! make -C soapsnp BITS=32 soapsnp soapsnp-debug ; then
+# echo "Error bulding soapsnp 32"
+# exit 1
+#fi
+
+#mkdir -p ../.bin/mac32
+#if [ $DO_BOWTIE -ne 0 ] ; then
+# cp bowtie/bowtie ../.bin/mac32
+# cp bowtie/bowtie-debug ../.bin/mac32
+#fi
+#cp soapsnp/soapsnp ../.bin/mac32
+#cp soapsnp/soapsnp-debug ../.bin/mac32
+
+if [ $DO_BOWTIE -ne 0 ] ; then
+ make -C bowtie clean
+fi
+rm -f soapsnp/soapsnp soapsnp/soapsnp-debug
+
+if [ $DO_BOWTIE -ne 0 ] ; then
+ if ! make -C bowtie BITS=64 bowtie bowtie-debug ; then
+ echo "Error bulding bowtie 64"
+ exit 1
+ fi
+fi
+
+if ! make -C soapsnp BITS=64 soapsnp soapsnp-debug ; then
+ echo "Error bulding soapsnp 64"
+ exit 1
+fi
+
+mkdir -p ../.bin/mac64
+if [ $DO_BOWTIE -ne 0 ] ; then
+ cp bowtie/bowtie ../.bin/mac64
+ cp bowtie/bowtie-debug ../.bin/mac64
+fi
+cp soapsnp/soapsnp ../.bin/mac64
+cp soapsnp/soapsnp-debug ../.bin/mac64
+
+cd ..
+
+ # ---- Remote build ---------------------------------------------------
+ # Repeats the build over ssh on ${linux_host} in a scratch directory
+ # under /tmp and copies the resulting binaries to .bin/linux64.
+ #
+ # The previous version printed "PASSED" regardless of what happened on
+ # the remote side.  The remote make and the scp steps are now checked,
+ # and the script stops with status 1 when one of them fails (the remote
+ # scratch directory is left in place in that case).
+
+ # Prepare
+ ssh ${user}@${linux_host} \
+     "rm -rf /tmp/.build_crossbow_tmp && mkdir -p /tmp/.build_crossbow_tmp"
+
+ if [ $DO_BOWTIE -ne 0 ] ; then
+     # Get Bowtie source
+     if [ -z "$SRC_URL" ] ; then
+         ssh ${user}@${linux_host} \
+             "cd /tmp/.build_crossbow_tmp && cvs -d /fs/szdevel/src/cvsroot co bowtie"
+     else
+         ssh ${user}@${linux_host} \
+             "cd /tmp/.build_crossbow_tmp && wget --no-check-certificate $SRC_URL && unzip *.zip && rm -f *.zip && mv bowtie* bowtie"
+     fi
+     # Build Bowtie source; Get and build SOAPsnp source
+     #ssh ${user}@${linux_host} \
+     # "cd /tmp/.build_crossbow_tmp/bowtie && " \
+     # "make -j2 BITS=32 bowtie bowtie-debug"
+ fi
+
+ # Get and build SOAPsnp source
+ ssh ${user}@${linux_host} "cd /tmp/.build_crossbow_tmp && svn co https://bowtie-bio.svn.sourceforge.net/svnroot/bowtie-bio/crossbow"
+
+ #mkdir -p .bin/linux32
+ #if [ $DO_BOWTIE -ne 0 ] ; then
+ # scp ${user}@${linux_host}:/tmp/.build_crossbow_tmp/bowtie/bowtie \
+ # ${user}@${linux_host}:/tmp/.build_crossbow_tmp/bowtie/bowtie-debug .bin/linux32
+ #fi
+ #scp ${user}@${linux_host}:/tmp/.build_crossbow_tmp/crossbow/soapsnp/soapsnp* .bin/linux32
+
+ if [ $DO_BOWTIE -ne 0 ] ; then
+     if ! ssh ${user}@${linux_host} \
+         "cd /tmp/.build_crossbow_tmp/bowtie && rm -f bowtie bowtie-debug && make -j2 BITS=64 bowtie bowtie-debug" ; then
+         echo "Error building bowtie 64 on ${linux_host}"
+         exit 1
+     fi
+ fi
+
+ if ! ssh ${user}@${linux_host} \
+     "cd /tmp/.build_crossbow_tmp/crossbow/soapsnp && rm -f soapsnp soapsnp-debug && make -j2 BITS=64 soapsnp soapsnp-debug" ; then
+     echo "Error building soapsnp 64 on ${linux_host}"
+     exit 1
+ fi
+
+ mkdir -p .bin/linux64
+ if [ $DO_BOWTIE -ne 0 ] ; then
+     if ! scp ${user}@${linux_host}:/tmp/.build_crossbow_tmp/bowtie/bowtie \
+         ${user}@${linux_host}:/tmp/.build_crossbow_tmp/bowtie/bowtie-debug .bin/linux64 ; then
+         echo "Error copying bowtie binaries from ${linux_host}"
+         exit 1
+     fi
+ fi
+ if ! scp ${user}@${linux_host}:/tmp/.build_crossbow_tmp/crossbow/soapsnp/soapsnp* .bin/linux64 ; then
+     echo "Error copying soapsnp binaries from ${linux_host}"
+     exit 1
+ fi
+
+ ssh ${user}@${linux_host} "rm -rf /tmp/.build_crossbow_tmp"
+ echo "PASSED"
+ echo "Binaries in .bin subdirectory"
diff --git a/util/package.bash b/util/package.bash
new file mode 100755
index 0000000..7474e25
--- /dev/null
+++ b/util/package.bash
@@ -0,0 +1,79 @@
+#
+# Author: Ben Langmead
+# Date: 9/26/2009
+#
+# Package Crossbow files for release.
+#
+
+VERSION=`cat VERSION`
+PKG_BASE=.pkg
+APP=crossbow
+PKG=.pkg/$APP-${VERSION}
+
+echo "Should have already run 'make doc' to make documentation"
+
+rm -rf $PKG_BASE
+mkdir -p $PKG
+
+# Copy Crossbow sources
+cp *.pl *.pm $PKG/
+for i in cb ; do
+ for j in emr hadoop local ; do
+ cp ${i}_$j $PKG/
+ chmod a+x $PKG/${i}_$j
+ done
+done
+chmod a+x $PKG/*.pl
+
+# Copy modified-SOAPsnp sources
+mkdir -p $PKG/soapsnp
+cp soapsnp/*.cc \
+ soapsnp/*.h \
+ soapsnp/COPYING \
+ soapsnp/readme \
+ soapsnp/release \
+ soapsnp/makefile \
+ $PKG/soapsnp/
+
+# Include the Bowtie and SOAPsnp binaries for 32-bit and 64-bit Linux/Mac
+mkdir -p $PKG/bin/linux32
+mkdir -p $PKG/bin/linux64
+mkdir -p $PKG/bin/mac32
+mkdir -p $PKG/bin/mac64
+cp bin/linux32/* $PKG/bin/linux32/
+cp bin/linux64/* $PKG/bin/linux64/
+cp bin/mac32/* $PKG/bin/mac32/
+cp bin/mac64/* $PKG/bin/mac64/
+
+# Copy contrib dir
+mkdir -p $PKG/contrib
+cp contrib/* $PKG/contrib
+
+# Copy reftools dir
+mkdir -p $PKG/reftools
+rm -f reftools/*.jar
+cp reftools/* $PKG/reftools
+chmod a+x $PKG/reftools/*
+
+# Copy example dir
+mkdir -p $PKG/example
+for i in e_coli mouse17 ; do
+ mkdir -p $PKG/example/$i
+ cp example/$i/copy.manifest $PKG/example/$i/
+ cp example/$i/small.manifest $PKG/example/$i/
+ cp example/$i/full.manifest $PKG/example/$i/
+done
+
+# Copy doc dir
+mkdir -p $PKG/doc
+cp doc/*.html $PKG/doc
+cp doc/*.css $PKG/doc
+mkdir -p $PKG/doc/images
+cp -r doc/images/*.png $PKG/doc/images/
+
+cp VERSION NEWS MANUAL LICENSE* TUTORIAL $PKG/
+
+pushd $PKG_BASE
+zip -r $APP-${VERSION}.zip $APP-${VERSION}
+popd
+cp $PKG_BASE/$APP-${VERSION}.zip .
diff --git a/webui/S3Util.pm b/webui/S3Util.pm
new file mode 100644
index 0000000..ec372a0
--- /dev/null
+++ b/webui/S3Util.pm
@@ -0,0 +1,118 @@
+#!/usr/bin/perl -w
+
+##
+# S3Util.pm
+#
+# Utilities used by Crossbow and Myrna web interfaces.
+#
+
+package S3Util;
+
+use strict;
+use warnings;
+use Carp;
+use Net::Amazon::S3;
+
+##
+# Parse an S3 path into a (protocol, bucket, path) triple. Note that
+# the path may be empty.
+#
+sub parsePath($) {
+ my $s = shift;
+ my $proto = undef;
+ $proto = "s3n" if $s =~ /^s3n:\/\//i;
+ $proto = "s3" if $s =~ /^s3:\/\//i;
+ $proto || return undef;
+ $s =~ s/^s3n?:\/\///; # strip protocol
+ $s ne "" || return ($proto, undef, undef);
+ my @ss = split(/\//, $s);
+ scalar(@ss) > 0 || return ($proto, undef, undef);
+ my $bucket = shift @ss;
+ my $path = undef;
+ $path = join("/", @ss) if scalar(@ss) > 0;
+ return ($proto, $bucket, $path);
+}
+
+##
+# Get an S3 object.
+#
+sub s3($$) {
+ my ($awsId, $awsSecret) = @_;
+ return(Net::Amazon::S3->new(
+ aws_access_key_id => $awsId,
+ aws_secret_access_key => $awsSecret,
+ retry => 1
+ ));
+}
+
+##
+# Get an S3 client object.
+#
+sub client($$) {
+ my ($awsId, $awsSecret) = @_;
+ return Net::Amazon::S3::Client->new(s3 => s3($awsId, $awsSecret));
+}
+
+##
+# Check whether ID/password credentials are good.
+#
+sub checkCreds($$) {
+ my ($awsId, $awsSecret) = @_;
+ my $client = client($awsId, $awsSecret);
+ if(eval { $client->buckets() }) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+##
+# Check if an s3 file exists.
+#
+sub s3exists {
+ my ($awsId, $awsSecret, $path, $verbose) = @_;
+ my $s3 = s3($awsId, $awsSecret);
+ if(!eval { $s3->buckets() }) {
+ return (-1, "Bad AWS ID and/or Secret Key");
+ }
+ defined($s3) || return (-1, "Could not create client");
+ my ($pr, $bu, $pa) = parsePath($path);
+ defined($bu) || return (-1, "Could not parse path $path");
+ if(defined($pa)) {
+ my $l = $s3->list_bucket({bucket => $bu, prefix => $pa, max_keys => 1});
+ defined($l) || return (0, "list_bucket returned 0");
+ print Dumper($l) if $verbose;
+ if(scalar(@{$l->{keys}})) {
+ my $key = shift @{$l->{keys}};
+ $key = $key->{key};
+ substr($key, 0, length($pa)) eq $pa || die;
+ substr($key, 0, length($pa)) = "";
+ if($key eq "" || substr($key, 0, 1) eq "/") {
+ return (1, "remainder: $key");
+ } else {
+ return (0, "remainder: $key");
+ }
+ } else {
+ return (0, "");
+ }
+ } else {
+ return (1, "");
+ }
+}
+
+ # Self-test: runs only when this file is executed directly (the script
+ # name ends in S3Util.pm), not when it is loaded as a module.  Checks
+ # the credentials given with --aws-id and --aws-secret-key and prints
+ # the verdict; prints nothing unless both options are supplied.
+ if($0 =~ /S3Util\.pm$/) {
+ # "use" is processed at compile time, so Getopt::Long is loaded even
+ # when this branch is not taken.
+ use Getopt::Long;
+ my ($id, $key);
+ GetOptions (
+ "aws-id:s" => \$id,
+ "aws-secret-key:s" => \$key);
+ if(defined($id) && defined($key)) {
+ if(checkCreds($id, $key)) {
+ print "Creds OK\n";
+ } else {
+ print "BAD CREDS\n";
+ }
+ }
+ }
+
+1;
diff --git a/webui/crossbow.pl b/webui/crossbow.pl
new file mode 100644
index 0000000..d51e57e
--- /dev/null
+++ b/webui/crossbow.pl
@@ -0,0 +1,977 @@
+#!/usr/bin/perl -w
+
+##
+# Crossbow web interface. Requires S3Util.pm and CrossbowIface.pm in
+# the same directory.
+#
+
+use strict;
+use warnings;
+use CGI;
+use CGI::Ajax;
+use Net::Amazon::S3;
+use FindBin qw($Bin);
+use lib $Bin;
+use CrossbowIface;
+use S3Util;
+use CGI::Carp qw(fatalsToBrowser);
+
+ # Script-level setup: create the CGI object, register the Perl
+ # callbacks that the page's JavaScript may invoke through CGI::Ajax,
+ # and emit the page built by main().
+ my $VERSION = "1.2.0";
+ # Values above 0 make the check* callbacks echo their inputs in the
+ # messages they return; also passed to CGI::Ajax's JSDEBUG below.
+ my $debugLev = 0;
+ my $cgi = CGI->new();
+ # NOTE(review): checkS3URL is registered here but is not defined in
+ # the part of the file shown alongside the other callbacks -- confirm
+ # it exists further down.
+ my $ajax = CGI::Ajax->new(submitClicked => \&submitClicked,
+ checkS3URL => \&checkS3URL,
+ checkS3Creds => \&checkS3Creds,
+ checkRefURL => \&checkRefURL,
+ checkInputURL => \&checkInputURL,
+ checkOutputURL => \&checkOutputURL);
+ $ajax->js_encode_function('encodeURIComponent');
+ $ajax->JSDEBUG($debugLev);
+ print $ajax->build_html( $cgi, \&main );
+
+##
+# Verify that given input URL exists.
+#
+sub checkInputURL {
+ my ($awsId, $awsSecret, $url) = @_;
+ my ($ret, $err);
+ ($ret, $err) = eval { S3Util::s3exists($awsId, $awsSecret, $url); };
+ my $recheck = "(<a href=\"javascript:jsCheckInputURL()\">Re-check input URL...</a>)";
+ unless(defined($ret)) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: s3exists died with message \"$@\": \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: s3exists died with message \"$@\"</font> $recheck";
+ }
+ }
+ if($ret < -1 || $ret > 1) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: Return value from s3exists was $ret: \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: Return value from s3exists was $ret</font> $recheck";
+ }
+ }
+ if($ret == 1) {
+ if($debugLev > 0) {
+ return "<font color='green'>Verified: \"$url\"</font>";
+ } else {
+ return "<font color='green'>Verified</font>";
+ }
+ } elsif($ret == -1) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: $err: \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: $err</font> $recheck";
+ }
+ } else {
+ $ret == 0 || croak();
+ if($debugLev > 0) {
+ return "<font color='red'>Error: Input URL does not exist: \"$url\"</font> $recheck"
+ } else {
+ return "<font color='red'>Error: Input URL does not exist</font> $recheck"
+ }
+ }
+}
+
+##
+# Verify that given reference-jar URL exists.
+#
+sub checkRefURL {
+ my ($awsId, $awsSecret, $url) = @_;
+ my ($ret, $err);
+ ($ret, $err) = eval { S3Util::s3exists($awsId, $awsSecret, $url); };
+ my $recheck = "(<a href=\"javascript:jsCheckRefURL()\">Re-check reference URL...</a>)";
+ unless(defined($ret)) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: s3exists died with message \"$@\": \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: s3exists died with message \"$@\"</font> $recheck";
+ }
+ }
+ if($ret < -1 || $ret > 1) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: Return value from s3exists was $ret: \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: Return value from s3exists was $ret</font> $recheck";
+ }
+ }
+ if($ret == 1) {
+ if($debugLev > 0) {
+ return "<font color='green'>Verified: \"$url\"</font>";
+ } else {
+ return "<font color='green'>Verified</font>";
+ }
+ } elsif($ret == -1) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: $err: \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: $err</font> $recheck";
+ }
+ } else {
+ $ret == 0 || croak();
+ if($debugLev > 0) {
+ return "<font color='red'>Error: Reference jar URL does not exist: \"$url\"</font> $recheck"
+ } else {
+ return "<font color='red'>Error: Reference jar URL does not exist</font> $recheck"
+ }
+ }
+}
+
+##
+# Verify that given output URL does not exist.
+#
+sub checkOutputURL {
+ my ($awsId, $awsSecret, $url) = @_;
+ my ($ret, $err);
+ ($ret, $err) = eval { S3Util::s3exists($awsId, $awsSecret, $url); };
+ my $recheck = "(<a href=\"javascript:jsCheckOutputURL()\">Re-check output URL...</a>)";
+ unless(defined($ret)) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: s3exists died with message \"$@\": \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: s3exists died with message \"$@\"</font> $recheck";
+ }
+ }
+ if($ret < -1 || $ret > 1) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: Return value from s3exists was $ret: \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: Return value from s3exists was $ret</font> $recheck";
+ }
+ }
+ if($ret == 0) {
+ if($debugLev > 0) {
+ return "<font color='green'>Verified: \"$url\"</font>";
+ } else {
+ return "<font color='green'>Verified</font>";
+ }
+ } elsif($ret == -1) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: $err: \"$url\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: $err</font> $recheck";
+ }
+ } else {
+ $ret == 1 || croak();
+ if($debugLev > 0) {
+ return "<font color='red'>Error: Output URL already exists: \"$url\"</font> $recheck"
+ } else {
+ return "<font color='red'>Error: Output URL already exists</font> $recheck"
+ }
+ }
+}
+
+##
+# Check if the given S3 credentials work.
+#
+sub checkS3Creds {
+ my ($awsId, $awsSecret) = @_;
+ my $ret = eval { S3Util::checkCreds($awsId, $awsSecret); };
+ my $recheck = "(<a href=\"javascript:jsCheckS3Creds()\">Re-check credentials...</a>)";
+ unless(defined($ret)) {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: checkCreds died with message \"$@\": \"$awsId\", \"$awsSecret\"</font> $recheck";
+ } else {
+ return "<font color='red'>Error: checkCreds died with message \"$@\"</font> $recheck";
+ }
+ }
+ if($ret == 1) {
+ if($debugLev > 0) {
+ return "<font color='green'>Verified: \"$awsId\", \"$awsSecret\"</font>";
+ } else {
+ return "<font color='green'>Verified</font>";
+ }
+ } else {
+ if($debugLev > 0) {
+ return "<font color='red'>Error: Bad AWS ID and/or Secret Key: \"$awsId\", \"$awsSecret\"</font> ";
+ } else {
+ return "<font color='red'>Error: Bad AWS ID and/or Secret Key</font> $recheck";
+ }
+ }
+}
+
+#
+# Form elements:
+#
+# AWSId: text
+# AWSSecret: password
+# AWSKeyPair: text
+# JobName: text
+# JobType: radio (just-preprocess | crossbow)
+# InputURL: text
+# OutputURL: text
+# InputType: radio (manifest | preprocessed)
+# TruncateLength: text (blank or 0 = don't truncate)
+# TruncateDiscard: check
+# DiscardFraction: text (blank or 0 = don't discard)
+# QualityEncoding: dropdown (Phred+33 | Phred+64 | Solexa+64)
+# Genome: dropdown (bunch of genomes)
+# SpecifyRef: check
+# Ref: text
+# BowtieOpts: text
+# SoapsnpOpts: text
+# SoapsnpOptsHap: text
+# SoapsnpOptsDip: text
+# Haploids: text
+# HaploidsList: text
+# ClusterWait: check
+# NumNodes: text
+# InstanceType: dropdown (c1.xlarge)
+#
+
+##
+# Handle a click on the form's Submit button: translate the submitted
+# form values into a Crossbow driver argument list, run the driver via
+# CrossbowIface::crossbow, and return an HTML fragment describing the
+# outcome.
+#
+# Arguments arrive positionally, in the same order as the form elements
+# named in the page's submitClicked([...]) call (AWSId ... InstanceType).
+#
+# Returns the job ID message when the driver's standard output contains
+# "Created job flow <id>"; otherwise an error report listing the
+# arguments used and the driver's captured stdout/stderr.
+#
+# Croaks on an unknown JobType, a non-integer NumNodes, an unparsable
+# reference URL, an unknown genome short name, or an unknown Haploids
+# value.
+#
+sub submitClicked {
+    my ($awsId,
+        $awsSecret,
+        $keyPairName,
+        $name,
+        $jobType,
+        $inputURL,
+        $outputURL,
+        $inputType,
+        $truncLen,
+        $truncDiscard,
+        $discardFrac,
+        $qual,
+        $genome,
+        $specifyRef,
+        $ref,
+        $bowtieOpts,
+        $soapsnpOpts,
+        $soapsnpOptsHap,
+        $soapsnpOptsDip,
+        $haploids,
+        $haploidsList,
+        $clusterWait,
+        $numNodes,
+        $instanceType) = @_;
+
+    ##
+    # Map from short names to URLs for the pre-built reference jars.
+    #
+    my %refMap = (
+        "hg18_130" => "s3n://crossbow-refs/hg18.jar",
+        "mm9_130" => "s3n://crossbow-refs/mm9.jar",
+        "e_coli" => "s3n://crossbow-refs/e_coli.jar"
+    );
+
+    $name = "Crossbow" unless defined($name) && $name ne "";
+    $jobType eq "--just-preprocess" || $jobType eq "--crossbow" || croak("Bad JobType: $jobType");
+    $numNodes == int($numNodes) || croak("NumNodes is not an integer: $numNodes");
+
+    my @as = ();
+    push @as, "--accessid=$awsId";
+    push @as, "--secretid=$awsSecret";
+    push @as, "--key-pair=$keyPairName" if defined($keyPairName) && $keyPairName ne "";
+    push @as, "--emr-script=\"/var/www/cgi-bin/elastic-mapreduce\"";
+    push @as, "--name=\"$name\"";
+    push @as, "$jobType";
+    push @as, "--input=$inputURL";
+    push @as, "--output=$outputURL";
+    # The form submits the JobType value with its leading dashes (see the
+    # validation above), so the comparison must include them.
+    if($jobType eq "--just-preprocess") {
+        # Preprocess job
+    } else {
+        # Crossbow job
+        # An unchecked TruncateDiscard box means plain truncation.
+        $truncDiscard = "--truncate-length" unless defined($truncDiscard) && $truncDiscard ne "";
+        # The truncation option carries the truncate length; the discard
+        # option carries the discard fraction.
+        push @as, "$truncDiscard=$truncLen" if $truncLen > 0;
+        push @as, "--discard-reads=$discardFrac" if $discardFrac > 0;
+        push @as, "--quality=$qual";
+        push @as, "--preprocess" if $inputType eq "manifest";
+        if($specifyRef) {
+            # User-specified ref URL
+            my ($proto, $bucket, $path) = S3Util::parsePath($ref);
+            defined($proto) || croak("Could not parse reference path: $ref");
+            defined($bucket) || croak("Could not parse bucket in reference path: $ref");
+            defined($path) || croak("Could not parse path in reference path: $ref");
+            # TODO: check if reference exists
+            push @as, "--ref=$ref";
+        } else {
+            # Pre-built ref
+            defined($refMap{$genome}) || croak("Bad genome short name: \"$genome\"");
+            push @as, "--ref=$refMap{$genome}";
+        }
+        push @as, "--bowtie-args=$bowtieOpts";
+        push @as, "--soapsnp-args=$soapsnpOpts";
+        push @as, "--soapsnp-hap-args=$soapsnpOptsHap";
+        push @as, "--soapsnp-dip-args=$soapsnpOptsDip";
+        if($haploids eq "all-diploid") {
+            # no arg
+        } elsif($haploids eq "all-haploid") {
+            push @as, "--all-haploids";
+        } elsif($haploids eq "all-diploid-except") {
+            push @as, "--haploids=$haploidsList";
+        } else {
+            croak("Bad value for haplids: \"$haploids\"");
+        }
+    }
+    push @as, "$clusterWait";
+    push @as, "--instances=$numNodes";
+    push @as, "--verbose";
+    push @as, "--instance-type=$instanceType";
+
+    # Buffers that collect everything the driver prints.
+    my $stdout = "";
+    my $stderr = "";
+
+    # Plain and printf-style sinks handed to the driver for each stream.
+    my $stdoutf = sub { $stdout .= $_[0]; };
+    my $stdoutff = sub {
+        my $str = shift @_;
+        $stdout .= sprintf $str, @_;
+    };
+    my $stderrf = sub { $stderr .= $_[0]; };
+    my $stderrff = sub {
+        my $str = shift @_;
+        $stderr .= sprintf $str, @_;
+    };
+    if(!defined($ENV{HOME})) {
+        $stderr .= "Had to define HOME in myrna.pl\n";
+        $ENV{HOME} = "/var/www/cgi-bin";
+    }
+    CrossbowIface::crossbow(\@as, "crossbow.pl", "(no usage)", $stdoutf, $stdoutff, $stderrf, $stderrff);
+
+    # Only trust the capture group when the match actually succeeded.
+    my $jobid = "";
+    if($stdout =~ /Created job flow (.*)/) {
+        $jobid = $1;
+    }
+
+    my $resultHtml = "";
+    if($jobid eq "") {
+        # One non-empty argument per line, for the error report.
+        my $asStr = join("", map { "$_\n" } grep { $_ ne "" } @as);
+        # Error condition
+        $resultHtml .= <<HTML;
+ <font color="red"><b>Error invoking Crossbow. Job not submitted.</b></font>
+
+ <br>Arguments given to Crossbow driver script:
+ <pre>$asStr</pre>
+
+ Standard output from driver:
+ <pre>$stdout</pre>
+
+ Standard error from driver:
+ <pre>$stderr</pre>
+HTML
+    } else {
+        # Everything seemed to go fine
+        $resultHtml .= <<HTML;
+ <br>
+ Job created; MapReduce job ID = $jobid
+ <br>
+ Go to the
+ <a href="https://console.aws.amazon.com/elasticmapreduce" target="_blank">
+ AWS Console's Elastic MapReduce</a> tab to monitor your
+ job.
+HTML
+    }
+    return $resultHtml;
+}
+
+##
+# Build and return the complete HTML page for the Crossbow web form:
+# the stylesheet, the form table, and the client-side JavaScript.
+#
+# The page is one interpolating here-document, so $VERSION is
+# substituted and the backslash escapes in it are processed by Perl
+# before the text reaches the browser.
+#
+# NOTE(review): the JavaScript calls submitClicked(), checkS3Creds(),
+# checkInputURL(), checkOutputURL(), validate(), ddrivetip() and
+# hideddrivetip(), none of which are defined in this page; presumably
+# they come from generated bindings to the Perl subs and from the
+# included form.js -- confirm.
+#
+# Takes no arguments.  Returns the page as a single string.
+#
+sub main {
+    my $html = "";
+    $html .= <<HTML;
+<html>
+<head>
+</head>
+<body>
+<script src="http://jotform.com/js/form.js?v2.0.1347" type="text/javascript"></script>
+<style type="text/css">
+.main {
+ font-family:"Verdana";
+ font-size:11px;
+ color:#666666;
+}
+.tbmain{
+ /* Changes on the form */
+ background: white !important;
+}
+.left{
+ /* Changes on the form */
+ color: black !important;
+ font-family: Verdana !important;
+ font-size: 12px !important;
+}
+.right{
+ /* Changes on the form */
+ color: black !important;
+ font-family: Verdana !important;
+ font-size: 12px !important;
+}
+.check{
+ color: black !important;
+ font-family: Verdana !important;
+ font-size: 10px !important;
+}
+.head{
+ color:#333333;
+ font-size:20px;
+ text-decoration:underline;
+ font-family:"Verdana";
+}
+td.left {
+ font-family:"Verdana";
+ font-size:12px;
+ color:black;
+}
+.pagebreak{
+ font-family:"Verdana";
+ font-size:12px;
+ color:black;
+}
+.tbmain{
+ height:100%;
+ background:white;
+}
+span.required{
+ font-size: 13px !important;
+ color: red !important;
+}
+
+div.backButton{
+ background: transparent url("http://jotform.com//images/btn_back.gif") no-repeat scroll 0 0;
+ height:16px;
+ width:53px;
+ float:left;
+ margin-bottom:15px;
+ padding-right:5px;
+}
+div.backButton:hover{
+ background: transparent url("http://jotform.com//images/btn_back_over.gif") no-repeat scroll 0 0;
+}
+div.backButton:active{
+ background: transparent url("http://jotform.com//images/btn_back_down.gif") no-repeat scroll 0 0;
+}
+div.nextButton{
+ background: transparent url("http://jotform.com//images/btn_next.gif") no-repeat scroll 0 0;
+ height:16px;
+ width:53px;
+ float: left;
+ margin-bottom:15px;
+ padding-right:5px;
+}
+div.nextButton:hover{
+ background: transparent url("http://jotform.com//images/btn_next_over.gif") no-repeat scroll 0 0;
+}
+div.nextButton:active{
+ background: transparent url("http://jotform.com//images/btn_next_down.gif") no-repeat scroll 0 0;
+}
+.pageinfo{
+ padding-right:5px;
+ margin-bottom:15px;
+ float:left;
+}
+
+</style>
+<table width="100%" cellpadding="2" cellspacing="0" class="tbmain">
+<tr><td class="topleft" width="10" height="10"> </td>
+<td class="topmid"> </td>
+<td class="topright" width="10" height="10"> </td>
+ </tr>
+<tr>
+<td class="midleft" width="10"> </td>
+<td class="midmid" valign="top">
+<form accept-charset="utf-8" action="/crossbowform" method="post" name="form">
+<div id="main">
+<div class="pagebreak">
+<table width="520" cellpadding="5" cellspacing="0">
+ <tr >
+ <td class="left" colspan=2>
+ <h2>Crossbow $VERSION</h2>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label >AWS ID <span class="required">*</span></label>
+ </td>
+ <td class="right" >
+ <input type="text"
+ onblur="validate(this,'Required')"
+ onkeypress="jsResetCheckS3Creds()"
+ size="25" name="AWSId" class="text" value="" onmouseover="ddrivetip('Your AWS Access Key ID, usually 20 characters long (not your Secret Access Key or your Account ID).', 200)" onmouseout="hideddrivetip()" maxlength="100" maxsize="100"></input>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label >AWS Secret Key <span class="required">*</span></label>
+ </td>
+ <td class="right" >
+ <input type="password"
+ onblur="validate(this,'Required')"
+ onkeypress="jsResetCheckS3Creds()"
+ size="50" name="AWSSecret" class="text" value="" onmouseover="ddrivetip('Your AWS Secret Access Key, usually 40 characters long (not your Access Key ID or your Account ID).', 200)" onmouseout="hideddrivetip()" maxlength="100" maxsize="100"></input>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label >AWS Keypair Name</label>
+ </td>
+ <td class="right" >
+ <input type="text"
+ size="30" name="AWSKeyPair" class="text" value="gsg-keypair" onmouseover="ddrivetip('Name of the keypair that AWS should install on the cluster, allowing you to log in.', 200)" onmouseout="hideddrivetip()" maxlength="100" maxsize="100"></input>
+ <a href="https://console.aws.amazon.com/ec2/home#c=EC2&s=KeyPairs" target="_blank">Look it up</a>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ </td>
+ <td class="right" >
+ <span id="credcheck" class="check"><a href="javascript:jsCheckS3Creds()">Check credentials...</a></span>
+ </td>
+ </tr>
+ <tr >
+ <td colspan="2" >
+ <hr>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label >Job name</label>
+ </td>
+ <td class="right" >
+ <input type="text" size="30" name="JobName" class="text" value="Crossbow" onmouseover="ddrivetip('Name given to Elastic MapReduce job.', 200)" onmouseout="hideddrivetip()" maxlength="100" maxsize="100"></input>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top" >
+ <label>Job type</label>
+ </td>
+ <td class="right">
+ <input type="radio" class="other" name="JobType" onclick="enableApp()" onmouseover="ddrivetip('Run the Crossbow pipeline, starting with a manifest file or preprocessed reads, and ending with Crossbow results.', 200)" onmouseout="hideddrivetip()" value="--crossbow" checked />
+ <label class="left">Crossbow</label> <br />
+ <input type="radio" class="other" name="JobType" onclick="disableApp()" onmouseover="ddrivetip('Just run the Preprocess step and place preprocessed reads at Output URL.', 200)" onmouseout="hideddrivetip()" value="--just-preprocess" />
+ <label class="left">Just preprocess reads</label> <br />
+ </td>
+ </tr>
+ <tr >
+ <td colspan="2" >
+ <hr>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label >Input URL <span class="required">*</span></label>
+ </td>
+ <td class="right" >
+ <input type="text" size="60" name="InputURL"
+ onmouseover="ddrivetip('S3 URL where manifest file or preprocessed reads are located.', 200)"
+ onmouseout="hideddrivetip()"
+ class="text" value="s3n://"
+ onblur="validate(this,'Required')"
+ onkeypress="jsResetCheckInputURL()"
+ maxlength="400" maxsize="400" />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ </td>
+ <td class="right" >
+ <div id="inputcheck" class="check"><a href="javascript:jsCheckInputURL()">Check that input URL exists...</a></div>
+ </td>
+ </tr>
+
+ <tr >
+ <td width="165" class="left" >
+ <label >Output URL <span class="required">*</span></label>
+ </td>
+ <td class="right" >
+ <input type="text" size="60" name="OutputURL"
+ onmouseover="ddrivetip('S3 URL where Crossbow output should be placed.', 200)"
+ onmouseout="hideddrivetip()"
+ class="text" value="s3n://"
+ onblur="validate(this,'Required')"
+ onkeypress="jsResetCheckOutputURL()"
+ maxlength="400" maxsize="400" />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ </td>
+ <td class="right" >
+ <div id="outputcheck" class="check"><a href="javascript:jsCheckOutputURL()">Check that output URL doesn't exist...</a></div>
+ </td>
+ </tr>
+ <tr >
+ <td colspan="2" >
+ <hr>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top" >
+ <label id="app-input-type-label">Input type</label>
+ </td>
+ <td class="right">
+ <input type="radio" id="app-input-type-radio-preprocess" class="other" name="InputType" onmouseover="ddrivetip('Input URL points to a directory of files that have already been preprocessed by Crossbow.', 200)" onmouseout="hideddrivetip()" value="preprocessed" checked />
+ <label id="app-input-type-preprocess-label">Preprocessed reads</label> <br />
+ <input type="radio" id="app-input-type-radio-manifest" class="other" name="InputType" onmouseover="ddrivetip('Input URL points to a manifest file listing publicly-readable URLs of input FASTQ files; FASTQ files are both preprocessed and analyzed.', 200)" onmouseout="hideddrivetip()" value="manifest" />
+ <label id="app-input-type-manifest-label">Manifest file</label> <br />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label id="app-truncate-length-label">Truncate length</label>
+ </td>
+ <td class="right" >
+ <input type="text" size="5" id="app-truncate-length-text" class="text" name="TruncateLength" onmouseover="ddrivetip('Specifies N such that reads longer than N bases are truncated to length N by removing bases from the 3\\' end.', 200)" onmouseout="hideddrivetip()" value="0" onblur="validate(this,'Numeric')" maxlength="5" maxsize="5" />
+ <span class="main"> (If blank or 0, truncation is disabled)</span>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top" >
+ </td>
+ <td valign="top" class="right">
+ <input id="app-skip-truncate-check" type="checkbox" class="other"
+ name="TruncateDiscard"
+ value="--truncate-discard" />
+ <label id="app-skip-truncate-label">Skip reads shorter than truncate length</label> <br />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label id="app-discard-fraction-label">Discard fraction</label>
+ </td>
+ <td class="right" >
+ <input id="app-discard-fraction-text" type="text" size="5" name="DiscardFraction" onmouseover="ddrivetip('Randomly discard specified fraction of the input reads. Useful for testing purposes.', 200)" onmouseout="hideddrivetip()" class="text" value="0" onblur="validate(this,'Numeric')" maxlength="5" maxsize="5" />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top" >
+ <label id="app-quality-label">Quality encoding</label>
+ </td>
+ <td class="right">
+ <select id="app-quality-dropdown" class="other" name="QualityEncoding" onmouseover="ddrivetip('Quality value encoding scheme used for input reads.', 200)" onmouseout="hideddrivetip()">
+ <option value="phred33">Phred+33</option>
+ <option value="phred64">Phred+64</option>
+ <option value="solexa64">Solexa+64</option>
+ </select>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top" >
+ <label id="app-genome-label">Genome/Annotation</label>
+ </td>
+ <td class="right">
+ <select id="app-genome-dropdown" class="other" name="Genome" onmouseover="ddrivetip('Genome assembly to use as reference genome and annotation database to use for prior SNP probabilities.', 200)" onmouseout="hideddrivetip()" >
+ <option value="hg18_130">Human (v36, dbSNP 130)</option>
+ <option value="mm9_130">Mouse (v37, dbSNP 130)</option>
+ <option value="e_coli">E. coli O157:H7</option>
+ </select>
+ </td>
+ </tr>
+ <tr>
+ <td width="165" class="left" valign="top" >
+ </td>
+ <td class="right">
+ <input id="app-specify-ref-check" type="checkbox" onclick="updateElements()" onmouseover="ddrivetip('Specify an S3 url for a reference jar.', 200)" onmouseout="hideddrivetip()" class="other"
+ value="1"
+ name="SpecifyRef"
+ />
+ <label id="app-specify-ref-label">Specify reference jar URL:</label> <br />
+ <br/>
+ <!-- Reference URL text box -->
+ <input id="app-specify-ref-text"
+ disabled
+ type="text"
+ size="50"
+ name="Ref"
+ onblur="validate(this,'Required')"
+ onkeypress="jsResetCheckRefURL()"
+ onmouseover="ddrivetip('Specify an S3 url for a reference jar.', 200)"
+ onmouseout="hideddrivetip()"
+ value="s3n://" class="text" maxlength="100" maxsize="100" />
+ </td>
+ </tr>
+ <tr>
+ <td width="165" class="left" valign="top" >
+ </td>
+ <td class="right">
+ <div id="refcheck" class="check"><a href="javascript:jsCheckRefURL()">Check that reference jar URL exists...</a></div>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label id="app-bowtie-options-label">Bowtie options</label>
+ </td>
+ <td class="right" >
+ <input id="app-bowtie-options-text" type="text" size="50" name="BowtieOpts" onmouseover="ddrivetip('Options to pass to Bowtie in the Align stage.', 200)" onmouseout="hideddrivetip()" class="text" value="-m 1" maxlength="400" maxsize="400" />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label id="app-soapsnp-options-label">SOAPsnp options</label>
+ </td>
+ <td class="right" >
+ <input id="app-soapsnp-options-text" type="text" size="50" name="SoapsnpOpts" onmouseover="ddrivetip('Options to pass to SOAPsnp in the Call SNPs stage.', 200)" onmouseout="hideddrivetip()" class="text" value="-2 -u -n -q" maxlength="500" maxsize="500" />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label id="app-soapsnp-haploid-options-label">Additional SOAPsnp options for haploids</label>
+ </td>
+ <td class="right" >
+ <input id="app-soapsnp-haploid-options-text" type="text" size="50" name="SoapsnpOptsHap" onmouseover="ddrivetip('Options to pass to SOAPsnp in the Call SNPs stage when the reference chromosome is haploid.', 200)" onmouseout="hideddrivetip()" class="text" value="-r 0.0001" maxlength="500" maxsize="500" />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label id="app-soapsnp-diploid-options-label">Additional SOAPSNP options for diploids</label>
+ </td>
+ <td class="right" >
+ <input id="app-soapsnp-diploid-options-text" type="text" size="50" name="SoapsnpOptsDip" onmouseover="ddrivetip('Options to pass to SOAPsnp in the Call SNPs stage when the reference chromosome is diploid.', 200)" onmouseout="hideddrivetip()" class="text" value="-r 0.00005 -e 0.0001" maxlength="500" maxsize="500" />
+ </td>
+ </tr>
+ <tr >
+
+ <td width="165" class="left" valign="top" >
+ <label id="app-ploidy-label" >Chromosome ploidy</label>
+ </td>
+ <td class="right">
+ <input id="app-ploidy1-radio" type="radio" class="other" name="Haploids" onclick="updateElements()" value="all-diploid" checked />
+ <label id="app-ploidy1-label">All chromosomes are diploid</label> <br />
+ <input id="app-ploidy2-radio" type="radio" class="other" name="Haploids" onclick="updateElements()" value="all-haploid" />
+ <label id="app-ploidy2-label">All are haploid</label> <br />
+ <input id="app-ploidy3-radio" type="radio" class="other" name="Haploids" onclick="updateElements()" value="all-diploid-except" />
+ <label id="app-ploidy3-label">All are diploid except: </label>
+ <input id="app-ploidy-text" disabled type="text" size="50" name="HaploidsList" onmouseover="ddrivetip('Comma-separated list of names of chromosomes that should be considered haploid.', 200)" onmouseout="hideddrivetip()" class="text" value="" maxlength="100" maxsize="100" />
+ <br />
+ </td>
+ </tr>
+ <tr >
+ <td colspan="2" >
+ <hr>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top" >
+ <label id="options-label">Options</label>
+ </td>
+ <td valign="top" class="right">
+ <input id="wait-check" type="checkbox" onmouseover="ddrivetip('Typically the cluster is terminated as soon as the job either completes or aborts. Check this to keep the cluster running either way.', 200)" onmouseout="hideddrivetip()" class="other"
+ name="ClusterWait"
+ value="--stay-alive" />
+ <label id="wait-label">Keep cluster running after job finishes/aborts</label> <br />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" >
+ <label ># EC2 instances</label>
+ </td>
+ <td class="right" >
+ <input type="text" size="5" name="NumNodes" onmouseover="ddrivetip('Number of Amazon EC2 instances (virtual computers) to use for this computation.', 200)" onmouseout="hideddrivetip()" class="text" value="1" onblur="validate(this,'Numeric')" maxlength="5" maxsize="5" />
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top" >
+ <label><a href="http://aws.amazon.com/ec2/instance-types/" target="_blank">Instance type</a></label>
+ </td>
+ <td class="right">
+ <select class="other" name="InstanceType" onmouseover="ddrivetip('Type of EC2 instance (virtual computer) to use; c1.xlarge is strongly recommended.', 200)" onmouseout="hideddrivetip()">
+ <option value="c1.xlarge">c1.xlarge (recommended)</option>
+ <option value="c1.medium">c1.medium</option>
+
+ <option value="m2.xlarge">m2.xlarge</option>
+ <option value="m2.2xlarge">m2.2xlarge</option>
+ <option value="m2.4xlarge">m2.4xlarge</option>
+
+ <option value="m1.xlarge">m1.xlarge</option>
+ <option value="m1.large">m1.large</option>
+ <option value="m1.small">m1.small</option>
+ </select>
+ </td>
+ </tr>
+ <tr >
+ <td width="165" class="left" valign="top">
+ <span class="main">Made with the help of</span>
+ <br>
+ <a href="http://www.jotform.com/" target="_blank">
+ <img border=0 width=115
+ src="http://www.jotform.com/images/jotform.gif"
+ alt="Made with the help of JotForm" /></a>
+ </td>
+ <td class="right">
+ <input type="button" class="btn" value="Submit"
+ onclick="document.getElementById('result1').innerHTML = '<img border=0 src=\\'/wait.gif\\' /> Creating job, please wait ...' ;
+ submitClicked(
+ ['AWSId',
+ 'AWSSecret',
+ 'AWSKeyPair',
+ 'JobName',
+ 'JobType',
+ 'InputURL',
+ 'OutputURL',
+ 'InputType',
+ 'TruncateLength',
+ 'TruncateDiscard',
+ 'DiscardFraction',
+ 'QualityEncoding',
+ 'Genome',
+ 'SpecifyRef',
+ 'Ref',
+ 'BowtieOpts',
+ 'SoapsnpOpts',
+ 'SoapsnpOptsHap',
+ 'SoapsnpOptsDip',
+ 'Haploids',
+ 'HaploidsList',
+ 'ClusterWait',
+ 'NumNodes',
+ 'InstanceType'],
+ ['result1'])" />
+ </td>
+ </tr>
+ <tr >
+ <td colspan="2" class="right">
+ <span class="main"><b>Please cite</b>:
+ Langmead B, Schatz MC, Lin J, Pop M, Salzberg SL.
+ <a href="http://genomebiology.com/2009/10/11/R134">Searching for SNPs with cloud computing</a>. <i>Genome Biology</i> 10:R134.</span>
+ </td>
+ </tr>
+ <tr >
+ <td colspan="2" >
+ <hr> <!-- Horizontal rule -->
+ </td>
+ </tr>
+ <tr>
+ <td colspan=2 id="result1" class="right">
+ <!-- Insert result here -->
+ </td>
+ </tr>
+</table>
+</div>
+</div>
+</form>
+</td>
+<td class="midright" width="10"> </td>
+</tr>
+<tr>
+ <td class="bottomleft" width="10" height="10"> </td>
+ <td class="bottommid"> </td>
+ <td class="bottomright" width="10" height="10"> </td>
+</tr>
+</table>
+<script type="text/javascript">
+
+var isAppRegex=/^app-/;
+var isLabel=/-label\$/;
+
+function haploidTextEnabled() {
+ var sel;
+ for(var i = 0; i < document.form.Haploids.length; i++) {
+ if(document.form.Haploids[i].checked) {
+ sel = i;
+ break;
+ }
+ }
+ return sel == 2;
+}
+
+function updateElements() {
+ if(document.form.SpecifyRef.checked) {
+ document.form.Ref.disabled = false;
+ document.form.Ref.style.color = "black";
+ document.form.Genome.disabled = true;
+ } else {
+ document.form.Ref.disabled = true;
+ document.form.Ref.style.color = "gray";
+ document.form.Genome.disabled = false;
+ }
+ if(haploidTextEnabled()) {
+ document.form.HaploidsList.disabled = false;
+ document.form.HaploidsList.style.color = "black";
+ } else {
+ document.form.HaploidsList.disabled = true;
+ document.form.HaploidsList.style.color = "gray";
+ }
+}
+
+function checkS3ExistsWait(div) {
+ document.getElementById(div).innerHTML = '<img border=0 width=18 src=\\'/wait.gif\\' />';
+}
+
+function enableApp() {
+ var elts = document.getElementsByTagName('*');
+ var count = elts.length;
+ for(var i = 0; i < count; i++) {
+ var element = elts[i];
+ if(isAppRegex.test(element.id)) {
+ // Yes, this is an app-related form element that should be re-enabled
+ element.disabled = false;
+ if(isLabel.test(element.id) || element.type == "text") {
+ element.style.color = "black";
+ }
+ }
+ }
+ updateElements();
+}
+function disableApp() {
+ var elts = document.getElementsByTagName('*');
+ var count = elts.length;
+ for(var i = 0; i < count; i++) {
+ var element = elts[i];
+ if(isAppRegex.test(element.id)) {
+ // Yes, this is an app-related form element that should be disabled
+ element.disabled = true;
+ if(isLabel.test(element.id) || element.type == "text") {
+ element.style.color = "gray";
+ }
+ }
+ }
+}
+
+function jsResetCheckS3Creds() {
+ document.getElementById('credcheck').innerHTML = '<a href=\\'javascript:jsCheckS3Creds()\\'>Check credentials...</a>';
+}
+
+function jsCheckS3Creds() {
+ document.getElementById('credcheck').innerHTML = "Checking, please wait...";
+ checkS3Creds(['AWSId', 'AWSSecret'], ['credcheck']);
+}
+
+function jsResetCheckRefURL() {
+ document.getElementById('refcheck').innerHTML = '<a href=\\'javascript:jsCheckRefURL()\\'>Check that reference jar URL exists...</a>';
+}
+
+function jsCheckRefURL() {
+ document.getElementById('refcheck').innerHTML = "Checking, please wait...";
+ checkInputURL(['AWSId', 'AWSSecret', 'Ref'], ['refcheck']);
+}
+
+function jsResetCheckInputURL() {
+ document.getElementById('inputcheck').innerHTML = '<a href=\\'javascript:jsCheckInputURL()\\'>Check that input URL exists...</a>';
+}
+
+function jsCheckInputURL() {
+ document.getElementById('inputcheck').innerHTML = "Checking, please wait...";
+ checkInputURL(['AWSId', 'AWSSecret', 'InputURL'], ['inputcheck']);
+}
+
+function jsResetCheckOutputURL() {
+ document.getElementById('outputcheck').innerHTML = '<a href=\\'javascript:jsCheckOutputURL()\\'>Check that output URL doesn\\'t exist...</a>';
+}
+
+function jsCheckOutputURL() {
+ document.getElementById('outputcheck').innerHTML = "Checking, please wait...";
+ checkOutputURL(['AWSId', 'AWSSecret', 'OutputURL'], ['outputcheck']);
+}
+
+validate();
+
+</script>
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-5334290-1");
+pageTracker._trackPageview();
+</script>
+<!-- End google analytics code -->
+
+</body>
+</html>
+HTML
+    return $html;
+}
+
+# Terminate with a success status; perl does not compile anything that
+# follows the __END__ marker.
+exit 0;
+__END__
diff --git a/webui/fill_e_coli_generic.sh b/webui/fill_e_coli_generic.sh
new file mode 100644
index 0000000..7e08b6a
--- /dev/null
+++ b/webui/fill_e_coli_generic.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+#
+# fill_e_coli_generic.sh
+#
+# Uses Applescript/Safari to fill in the Crossbow Web UI form
+# generically (i.e. with placeholders for AWS credentials and bucket
+# name) for the E. coli example.
+#
+
+CROSSBOW_URL=http://ec2-184-73-43-172.compute-1.amazonaws.com/cgi-bin/crossbow.pl
+
+cat >.fill_e_coli.applescript <<EOF
+tell application "Safari"
+ activate
+ tell (make new document) to set URL to "$CROSSBOW_URL"
+ delay 6
+ set doc to document "$CROSSBOW_URL"
+ log (doc's name)
+ do JavaScript "document.forms['form']['AWSId'].value = '<YOUR-AWS-ID>'" in doc
+ do JavaScript "document.forms['form']['AWSSecret'].value = '<YOUR-AWS-SECRET-KEY>'" in doc
+ do JavaScript "document.forms['form']['JobName'].value = 'Crossbow-Ecoli'" in doc
+ do JavaScript "document.forms['form']['InputURL'].value = 's3n://<YOUR-BUCKET>/example/e_coli/small.manifest'" in doc
+ do JavaScript "document.forms['form']['OutputURL'].value = 's3n://<YOUR-BUCKET>/example/e_coli/output_small'" in doc
+ do JavaScript "document.forms['form']['InputType'][1].checked = 1" in doc
+ do JavaScript "document.forms['form']['InputType'][0].checked = 0" in doc
+ do JavaScript "document.forms['form']['QualityEncoding'].value = 'phred33'" in doc
+ do JavaScript "document.forms['form']['Genome'].value = 'e_coli'" in doc
+ do JavaScript "document.forms['form']['NumNodes'].value = '1'" in doc
+ do JavaScript "document.forms['form']['InstanceType'].value = 'c1.xlarge'" in doc
+ do JavaScript "document.forms['form']['Haploids'].value = 'all-haploid'" in doc
+ do JavaScript "document.forms['form']['Haploids'][1].checked = 1" in doc
+end tell
+EOF
+
+osascript .fill_e_coli.applescript
+rm -f .fill_e_coli.applescript
diff --git a/webui/fill_mm9chr17_generic.sh b/webui/fill_mm9chr17_generic.sh
new file mode 100644
index 0000000..a679c55
--- /dev/null
+++ b/webui/fill_mm9chr17_generic.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+#
+# fill_mm9chr17_generic.sh
+#
+# Uses Applescript/Safari to fill in the Crossbow Web UI form
+# generically (i.e. with placeholders for AWS credentials and bucket
+# name) for the E. coli example.
+#
+
+CROSSBOW_URL=http://ec2-184-73-43-172.compute-1.amazonaws.com/cgi-bin/crossbow.pl
+
+cat >.fill_mm9_chr17.applescript <<EOF
+tell application "Safari"
+ activate
+ tell (make new document) to set URL to "$CROSSBOW_URL"
+ delay 6
+ set doc to document "$CROSSBOW_URL"
+ log (doc's name)
+ do JavaScript "document.forms['form']['AWSId'].value = '<YOUR-AWS-ID>'" in doc
+ do JavaScript "document.forms['form']['AWSSecret'].value = '<YOUR-AWS-SECRET-KEY>'" in doc
+ do JavaScript "document.forms['form']['JobName'].value = 'Crossbow-Mouse17'" in doc
+ do JavaScript "document.forms['form']['InputURL'].value = 's3n://<YOUR-BUCKET>/example/mouse17/full.manifest'" in doc
+ do JavaScript "document.forms['form']['OutputURL'].value = 's3n://<YOUR-BUCKET>/example/mouse17/output_full'" in doc
+ do JavaScript "document.forms['form']['InputType'][1].checked = 1" in doc
+ do JavaScript "document.forms['form']['InputType'][0].checked = 0" in doc
+ do JavaScript "document.forms['form']['QualityEncoding'].value = 'phred33'" in doc
+ do JavaScript "document.forms['form']['SpecifyRef'].checked = '1'" in doc
+ do JavaScript "document.forms['form']['Ref'].value = 's3n://<YOUR-BUCKET>/crossbow-refs/mm9_chr17.jar'" in doc
+ do JavaScript "document.forms['form']['NumNodes'].value = '8'" in doc
+ do JavaScript "document.forms['form']['InstanceType'].value = 'c1.xlarge'" in doc
+ do JavaScript "document.forms['form']['Haploids'].value = 'all-diploid'" in doc
+ do JavaScript "document.forms['form']['Haploids'][1].checked = 0" in doc
+end tell
+EOF
+
+osascript .fill_mm9_chr17.applescript
+rm -f .fill_mm9_chr17.applescript
diff --git a/webui/push.sh b/webui/push.sh
new file mode 100644
index 0000000..04e7c1e
--- /dev/null
+++ b/webui/push.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+# push.sh
+#
+# Run from the crossbow base directory (i.e. sh webui/push.sh) to copy
+# the appropriate files to the EC2 web server. The EC2_KEYPAIR
+# environment variable must point to the id_rsa-gsg-keypair (or
+# similarly named) file with your keypair.
+#
+
+[ -z "$EC2_KEYPAIR" ] && echo "Must set EC2_KEYPAIR" && exit 1
+
+[ ! -d webui ] && echo "Run from CROSSBOW_HOME" && exit 1
+
+ARGS=$*
+[ -z "$ARGS" ] && ARGS="ec2-75-101-218-11.compute-1.amazonaws.com ec2-184-73-43-172.compute-1.amazonaws.com"
+
+for i in $ARGS ; do
+ echo $i
+
+ # Move perl scripts to cgi-bin
+ scp -i $EC2_KEYPAIR webui/crossbow.pl webui/S3Util.pm Tools.pm AWS.pm CrossbowIface.pm webui/wait.gif cb_emr root@$i:/var/www/cgi-bin/
+ scp -i $EC2_KEYPAIR VERSION root@$i:/var/www/cgi-bin/VERSION_CROSSBOW
+ scp -i $EC2_KEYPAIR webui/wait.gif root@$i:/home/webuser/helloworld/htdocs/
+ ssh -i $EC2_KEYPAIR root@$i chmod a+x /var/www/cgi-bin/*.pl
+ ssh -i $EC2_KEYPAIR root@$i rm -f /var/www/cgi-bin/VERSION
+
+ # URL to surf to
+ echo "http://$i/cgi-bin/crossbow.pl\n";
+done
diff --git a/webui/push_test.sh b/webui/push_test.sh
new file mode 100644
index 0000000..137a729
--- /dev/null
+++ b/webui/push_test.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# push.sh
+#
+# Run from the crossbow base directory (i.e. sh webui/push.sh) to copy
+# the appropriate files to the EC2 web server. The EC2_KEYPAIR
+# environment variable must point to the id_rsa-gsg-keypair (or
+# similarly named) file with your keypair.
+#
+
+[ -z "$EC2_KEYPAIR" ] && echo "Must set EC2_KEYPAIR" && exit 1
+
+[ ! -d webui ] && echo "Run from CROSSBOW_HOME" && exit 1
+
+ARGS=$*
+[ -z "$ARGS" ] && ARGS="ec2-75-101-218-11.compute-1.amazonaws.com ec2-184-73-43-172.compute-1.amazonaws.com"
+
+for i in $ARGS ; do
+ echo $i
+
+ # Move perl scripts to cgi-bin
+ sed -e 's/my $debugLev = 0;/my $debugLev = 2;/' < webui/crossbow.pl > webui/crossbow.test.pl
+ ssh -i $EC2_KEYPAIR root@$i mkdir -p /var/www/cgi-bin/test
+ scp -i $EC2_KEYPAIR webui/S3Util.pm Tools.pm AWS.pm CrossbowIface.pm webui/wait.gif cb_emr root@$i:/var/www/cgi-bin/test/
+ scp -i $EC2_KEYPAIR VERSION root@$i:/var/www/cgi-bin/test/VERSION_CROSSBOW
+ scp -i $EC2_KEYPAIR webui/crossbow.test.pl root@$i:/var/www/cgi-bin/test/crossbow.pl
+ scp -i $EC2_KEYPAIR webui/wait.gif root@$i:/home/webuser/helloworld/htdocs/
+ ssh -i $EC2_KEYPAIR root@$i chmod a+x /var/www/cgi-bin/test/*.pl
+ ssh -i $EC2_KEYPAIR root@$i rm -f /var/www/cgi-bin/test/VERSION
+
+ # URL to surf to
+ echo "http://$i/cgi-bin/test/crossbow.pl\n";
+done
diff --git a/webui/setup.sh b/webui/setup.sh
new file mode 100644
index 0000000..d141f3d
--- /dev/null
+++ b/webui/setup.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+##
+# setup.sh
+#
+# Not-quite-automated set of commands that should be run on a new EC2
+# instance to get it ready to run the Crossbow or Myrna web interfaces.
+#
+# EC2 changes pretty often, so your mileage may vary.
+#
+
+sudo yum -y install cpan gcc libxml2-devel
+
+sudo cpan
+#o conf prerequisites_policy follow
+#o conf commit
+#install CPAN::Bundle
+#reload cpan
+#install Class::Accessor CGI::Ajax Net::Amazon::S3 MIME::Types
+#install Net::Amazon::S3
diff --git a/webui/wait.gif b/webui/wait.gif
new file mode 100644
index 0000000..75c04d5
Binary files /dev/null and b/webui/wait.gif differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/crossbow.git
More information about the debian-med-commit
mailing list