[med-svn] [Git][med-team/centrifuge][upstream] New upstream version 1.0.3
Andreas Tille
gitlab at salsa.debian.org
Fri Mar 2 19:08:33 UTC 2018
Andreas Tille pushed to branch upstream at Debian Med / centrifuge
Commits:
f6f7b098 by Andreas Tille at 2018-03-02T19:56:18+01:00
New upstream version 1.0.3
- - - - -
13 changed files:
- MANUAL
- MANUAL.markdown
- Makefile
- aln_sink.h
- centrifuge
- centrifuge-download
- centrifuge-kreport
- + centrifuge-promote
- centrifuge.cpp
- classifier.h
- doc/manual.inc.html
- doc/sidebar.inc.shtml
- indices/Makefile
Changes:
=====================================
MANUAL
=====================================
--- a/MANUAL
+++ b/MANUAL
@@ -198,7 +198,83 @@ can be generated from a GI taxid dump:
### Custom database
-TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
+To build a custom database, you need to provide the following four files to `centrifuge-build`:
+
+ - `--conversion-table`: tab-separated file mapping sequence IDs to taxonomy IDs. Sequence IDs are the header up to the first space or second pipe (`|`).
+ - `--taxonomy-tree`: `\t|\t`-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree. When using NCBI taxonomy IDs, this will be the `nodes.dmp` from `ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz`.
+ - `--name-table`: `\t|\t`-separated file mapping taxonomy IDs to a name. A further column (typically column 4) must specify `scientific name`. When using NCBI taxonomy IDs, `names.dmp` is the appropriate file.
+ - reference sequences: The IDs of the sequences are the headers up to the first space or second pipe (`|`).
+
+When using custom taxonomy IDs, use only positive integers (greater than or equal to `1`) and use `1` for the root of the tree.
+
+#### More info on `--taxonomy-tree` and `--name-table`
+
+The format of these files is based on `nodes.dmp` and `names.dmp` from the NCBI taxonomy database dump.
+
+- Field terminator is `\t|\t`
+- Row terminator is `\t|\n`
+
+The `taxonomy-tree` / nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
+fields:
+
+ tax_id -- node id in GenBank taxonomy database
+ parent tax_id -- parent node id in GenBank taxonomy database
+ rank -- rank of this node (superkingdom, kingdom, ..., no rank)
+
+Further fields are ignored.
+
+The `name-table` / names.dmp is the taxonomy names file:
+
+ tax_id -- the id of node associated with this name
+ name_txt -- name itself
+ unique name -- the unique variant of this name if name not unique
+ name class -- (scientific name, synonym, common name, ...)
+
+`name class` **has** to be `scientific name` to be included in the build. All other lines are ignored.
+
+#### Example
+
+*Conversion table `ex.conv`*:
+
+ Seq1 11
+ Seq2 12
+ Seq3 13
+ Seq4 11
+
+*Taxonomy tree `ex.tree`*:
+
+ 1 | 1 | root
+ 10 | 1 | kingdom
+ 11 | 10 | species
+ 12 | 10 | species
+ 13 | 1 | species
+
+*Name table `ex.name`*:
+
+ 1 | root | | scientific name |
+ 10 | Bacteria | | scientific name |
+ 11 | Bacterium A | | scientific name |
+ 12 | Bacterium B | | scientific name |
+ 13 | Some other species | | scientific name |
+
+*Reference sequences `ex.fa`*:
+
+ >Seq1
+ AAAACGTACGA.....
+ >Seq2
+ AAAACGTACGA.....
+ >Seq3
+ AAAACGTACGA.....
+ >Seq4
+ AAAACGTACGA.....
+
+To build the database, call
+
+ centrifuge-build --conversion-table ex.conv \
+ --taxonomy-tree ex.tree --name-table ex.name \
+ ex.fa ex
+
+which results in three index files named `ex.1.cf`, `ex.2.cf` and `ex.3.cf`.
### Centrifuge classification output
=====================================
MANUAL.markdown
=====================================
--- a/MANUAL.markdown
+++ b/MANUAL.markdown
@@ -211,7 +211,84 @@ can be generated from a GI taxid dump:
### Custom database
-TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
+To build a custom database, you need to provide the following four files to `centrifuge-build`:
+
+ - `--conversion-table`: tab-separated file mapping sequence IDs to taxonomy IDs. Sequence IDs are the header up to the first space or second pipe (`|`).
+ - `--taxonomy-tree`: `\t|\t`-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree. When using NCBI taxonomy IDs, this will be the `nodes.dmp` from `ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz`.
+ - `--name-table`: `\t|\t`-separated file mapping taxonomy IDs to a name. A further column (typically column 4) must specify `scientific name`. When using NCBI taxonomy IDs, `names.dmp` is the appropriate file.
+ - reference sequences: The IDs of the sequences are the headers up to the first space or second pipe (`|`).
+
+When using custom taxonomy IDs, use only positive integers (greater than or equal to `1`) and use `1` for the root of the tree.
+
+#### More info on `--taxonomy-tree` and `--name-table`
+
+The format of these files is based on `nodes.dmp` and `names.dmp` from the NCBI taxonomy database dump.
+
+- Field terminator is `\t|\t`
+- Row terminator is `\t|\n`
+
+The `taxonomy-tree` / nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
+fields:
+
+ tax_id -- node id in GenBank taxonomy database
+ parent tax_id -- parent node id in GenBank taxonomy database
+ rank -- rank of this node (superkingdom, kingdom, ..., no rank)
+
+Further fields are ignored.
+
+The `name-table` / names.dmp is the taxonomy names file:
+
+ tax_id -- the id of node associated with this name
+ name_txt -- name itself
+ unique name -- the unique variant of this name if name not unique
+ name class -- (scientific name, synonym, common name, ...)
+
+`name class` **has** to be `scientific name` to be included in the build. All other lines are ignored.
+
+#### Example
+
+*Conversion table `ex.conv`*:
+
+ Seq1 11
+ Seq2 12
+ Seq3 13
+ Seq4 11
+
+
+*Taxonomy tree `ex.tree`*:
+
+ 1 | 1 | root
+ 10 | 1 | kingdom
+ 11 | 10 | species
+ 12 | 10 | species
+ 13 | 1 | species
+
+*Name table `ex.name`*:
+
+ 1 | root | | scientific name |
+ 10 | Bacteria | | scientific name |
+ 11 | Bacterium A | | scientific name |
+ 12 | Bacterium B | | scientific name |
+ 13 | Some other species | | scientific name |
+
+*Reference sequences `ex.fa`*:
+
+ >Seq1
+ AAAACGTACGA.....
+ >Seq2
+ AAAACGTACGA.....
+ >Seq3
+ AAAACGTACGA.....
+ >Seq4
+ AAAACGTACGA.....
+
+To build the database, call
+
+ centrifuge-build --conversion-table ex.conv \
+ --taxonomy-tree ex.tree --name-table ex.name \
+ ex.fa ex
+
+which results in three index files named `ex.1.cf`, `ex.2.cf` and `ex.3.cf`.
### Centrifuge classification output
=====================================
Makefile
=====================================
--- a/Makefile
+++ b/Makefile
@@ -191,6 +191,7 @@ CENTRIFUGE_SCRIPT_LIST = centrifuge \
centrifuge-build \
centrifuge-inspect \
centrifuge-download \
+ centrifuge-kreport \
$(wildcard centrifuge-*.pl)
=====================================
aln_sink.h
=====================================
--- a/aln_sink.h
+++ b/aln_sink.h
@@ -2305,9 +2305,9 @@ void AlnSinkSam<index_t>::appendMate(
case READ_ID: appendReadID(o, rd.name); break;
case SEQ_ID: appendSeqID(o, rs, ebwt.tree()); break;
case SEQ: o.append((string(rd.patFw.toZBuf()) +
- (rdo == NULL? "" : "N" + string(rdo->patFw.toZBuf()))).c_str()); break;
+ (rdo == NULL? "" : "_" + string(rdo->patFw.toZBuf()))).c_str()); break;
case QUAL: o.append((string(rd.qual.toZBuf()) +
- (rdo == NULL? "" : "I" + string(rdo->qual.toZBuf()))).c_str()); break;
+ (rdo == NULL? "" : "_" + string(rdo->qual.toZBuf()))).c_str()); break;
case SEQ1: o.append(rd.patFw.toZBuf()); break;
case QUAL1: o.append(rd.qual.toZBuf()); break;
=====================================
centrifuge
=====================================
--- a/centrifuge
+++ b/centrifuge
@@ -116,6 +116,13 @@ my %read_compress = ();
my $cap_out = undef; # Filename for passthrough
my $no_unal = 0;
my $large_idx = 0;
+
+# Variables handling the output format
+my $outputFmtSam = 0 ;
+my $tabFmtOptIdx = 0 ;
+my $needReadSeq = 0 ;
+my $removeSeqCols = 0 ;
+
# Remove whitespace
for my $i (0..$#bt2_args) {
$bt2_args[$i]=~ s/^\s+//; $bt2_args[$i] =~ s/\s+$//;
@@ -179,6 +186,7 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
}
for my $rarg ("un-conc", "al-conc", "un", "al") {
if($arg =~ /^--${rarg}$/ || $arg =~ /^--${rarg}-gz$/ || $arg =~ /^--${rarg}-bz2$/) {
+ $needReadSeq = 1 ;
$bt2_args[$i] = undef;
if(scalar(@args) > 1 && $args[1] ne "") {
$read_fns{$rarg} = $args[1];
@@ -193,7 +201,57 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
last;
}
}
+ if ($arg eq "--out-fmt" )
+ {
+ $i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
+ $i++;
+ if ( $bt2_args[$i] eq "sam" )
+ {
+ $outputFmtSam = 1 ;
+ }
+ #$bt2_args[$i] = undef;
+
+ }
+
+ if ( $arg eq "--tab-fmt-cols" )
+ {
+ $i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
+ $tabFmtOptIdx = $i + 1 ;
+ }
}
+
+# Determine whether we need to add two extra columns for seq and qual to out-fmt
+if ( $needReadSeq == 1 && ( $tabFmtOptIdx == 0 || $outputFmtSam == 1 ) )
+{
+ my $i ;
+ my $needAdd = 1 ;
+ if ( $tabFmtOptIdx != 0 )
+ {
+ my @cols = split /,/, $bt2_args[ $tabFmtOptIdx ] ;
+ foreach my $f (@cols)
+ {
+ if ( $f eq "readSeq" )
+ {
+ $needAdd = 0 ;
+ last ;
+ }
+ }
+
+ }
+ else
+ {
+ push @bt2_args, "--tab-fmt-cols" ;
+ push @bt2_args, "readID,seqID,taxID,score,2ndBestScore,hitLength,queryLength,numMatches" ;
+ $tabFmtOptIdx = scalar( @bt2_args ) - 1 ;
+ }
+
+ if ( $needAdd )
+ {
+ $removeSeqCols = 1 ;
+ $bt2_args[ $tabFmtOptIdx ] .= ",readSeq,readQual" ;
+ }
+}
+
# If the user asked us to redirect some reads to files, or to suppress
# unaligned reads, then we need to capture the output from Centrifuge and pass it
# through this wrapper.
@@ -423,6 +481,17 @@ my $cmd = "$align_prog$debug_str --wrapper basic-0 ".join(" ", @bt2_args);
# Possibly add read input on an anonymous pipe
$cmd = "$readpipe $cmd" if defined($readpipe);
+# The function removes the two extra columns that we added to get the read seq and qual
+sub RemoveSeqCols
+{
+ my $line = $_[0] ;
+ my @cols = split /\t/, $line ;
+ pop @cols ;
+ pop @cols ;
+ my $tab = "\t" ;
+ return join( $tab, @cols ) ;
+}
+
Info("$cmd\n");
my $ret;
if(defined($cap_out)) {
@@ -485,30 +554,87 @@ if(defined($cap_out)) {
}
}
}
+
+ my $seqIndex = -1 ;
+ my $qualIndex = -1 ;
+ my $readIdIndex = -1 ;
+ if ( $outputFmtSam == 0 )
+ {
+ my $outputHeader = <BT> ;
+ my @cols = split /\t/, $outputHeader ;
+ for ( my $i = 0 ; $i < scalar( @cols ) ; ++$i )
+ {
+ if ( $cols[$i] =~ /readSeq/ )
+ {
+ $seqIndex = $i ;
+ }
+ elsif ( $cols[$i] =~ /readQual/ )
+ {
+ $qualIndex = $i ;
+ }
+ elsif ( $cols[$i] =~ /readID/ )
+ {
+ $readIdIndex = $i ;
+ }
+ }
+ if ( $seqIndex == -1 && scalar( keys %read_fhs) == 0 )
+ {
+ Error( "Must use readSeq in --tabFmtCols in order to output unaligned reads." ) ;
+ }
+
+ $outputHeader = RemoveSeqCols( $outputHeader )."\n" if ( $removeSeqCols == 1 ) ;
+ print {$ofh} $outputHeader ;
+ }
+ else
+ {
+ $seqIndex = 9 ;
+ $qualIndex = 10 ;
+ $readIdIndex = 0 ;
+ }
+
while(<BT>) {
chomp;
my $filt = 0;
unless(substr($_, 0, 1) eq "@") {
# If we are supposed to output certain reads to files...
- my $tab1_i = index($_, "\t") + 1;
- my $tab2_i = index($_, "\t", $tab1_i);
- my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
- my $unal = ($fl & 4) != 0;
+ #my $tab1_i = index($_, "\t") + 1;
+ #my $tab2_i = index($_, "\t", $tab1_i);
+ #my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
+ my $unal = 0 ;
+ if ( /unclassified/ )
+ {
+ $unal = 1 ;
+ }
$filt = 1 if $no_unal && $unal;
if($passthru) {
- if(scalar(keys %read_fhs) == 0) {
+ if(scalar(keys %read_fhs) == 0 || $seqIndex == -1 ) {
# Next line is read with some whitespace escaped
- my $l = <BT>;
+ # my $l = <BT>;
} else {
- my $mate1 = (($fl & 64) != 0);
- my $mate2 = (($fl & 128) != 0);
- my $unp = !$mate1 && !$mate2;
- my $pair = !$unp;
+ my @cols = split /\t/ ;
+ my $isPaired = 0 ;
+ my $pair = 0 ;
+ if ( $cols[$seqIndex] =~ /_/ )
+ {
+ $pair = 1 ;
+ }
+ my $unp = !$pair ;
+
# Next line is read with some whitespace escaped
- my $l = <BT>;
- chomp($l);
- $l =~ s/%(..)/chr(hex($1))/eg;
+ #my $l = <BT>;
+ #chomp($l);
+ #$l =~ s/%(..)/chr(hex($1))/eg;
+
if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) {
+ my $l ;
+ if ( $qualIndex != -1 )
+ {
+ $l = "@".$cols[ $readIdIndex ]."\n".$cols[$seqIndex]."\n+\n".$cols[$qualIndex]."\n" ;
+ }
+ else
+ {
+ $l = ">".$cols[ $readIdIndex ]."\n".$cols[$seqIndex]."\n" ;
+ }
if($unal) {
# Failed to align
print {$read_fhs{un}} $l if defined($read_fhs{un});
@@ -517,21 +643,42 @@ if(defined($cap_out)) {
print {$read_fhs{al}} $l if defined($read_fhs{al});
}
}
+ my $warnedAboutLength = 0 ;
if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"})) && $pair) {
- my $conc = (($fl & 2) != 0);
- if ($conc && $mate1) {
- print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"});
- } elsif($conc && $mate2) {
- print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"});
- } elsif(!$conc && $mate1) {
- print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"});
- } elsif(!$conc && $mate2) {
- print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"});
+ my @seq = split /_/, $cols[$seqIndex] ;
+ my @qual = ( substr( $cols[$qualIndex], 0, length( $seq[0] ) ), substr( $cols[$qualIndex], length( $seq[0] ) + 1 ) ) ;
+
+ my $l1 ;
+ my $l2 ;
+ if ( $qualIndex != -1 )
+ {
+ $l1 = "@".$cols[ $readIdIndex ]."\n".$seq[0]."\n+\n".$qual[0]."\n" ;
+ }
+ else
+ {
+ $l1 = ">".$cols[ $readIdIndex ]."\n".$seq[0]."\n" ;
+ }
+ if ( $qualIndex != -1 )
+ {
+ $l2 = "@".$cols[ $readIdIndex ]."\n".$seq[1]."\n+\n".$qual[1]."\n" ;
+ }
+ else
+ {
+ $l2 = ">".$cols[ $readIdIndex ]."\n".$seq[1]."\n" ;
+ }
+
+ if ( !$unal) {
+ print {$read_fhs{"al-conc"}{1}} $l1 if defined($read_fhs{"al-conc"});
+ print {$read_fhs{"al-conc"}{2}} $l2 if defined($read_fhs{"al-conc"});
+ } else {
+ print {$read_fhs{"un-conc"}{1}} $l1 if defined($read_fhs{"un-conc"});
+ print {$read_fhs{"un-conc"}{2}} $l2 if defined($read_fhs{"un-conc"});
}
}
}
}
}
+ $_ = RemoveSeqCols( $_ ) if ( $removeSeqCols == 1 ) ;
print {$ofh} "$_\n" if !$filt;
}
for my $k (@fhs_to_close) { close($k); }
=====================================
centrifuge-download
=====================================
--- a/centrifuge-download
+++ b/centrifuge-download
@@ -11,10 +11,10 @@ if hash rsync 2>/dev/null; then
DL_MODE="rsync"
elif hash wget 2>/dev/null; then
DL_PROG="wget -N --reject=index.html -qO"
- DL_MODE="ftp"
+ DL_MODE="https"
else
DL_PROG="curl -s -o"
- DL_MODE="ftp"
+ DL_MODE="https"
fi
export DL_PROG DL_MODE
@@ -154,7 +154,7 @@ COMMON OPTIONS
WHEN USING database refseq OR genbank:
-d <domain> What domain to download. One or more of ${ALL_GENOMES// /, } (comma separated).
- -a <assembly level> Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'.
+ -a <assembly level> Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'. Use 'Any' for any assembly level.
-c <refseq category> Only download genomes in the specified refseq category. Default: any.
-t <taxids> Only download the specified taxonomy IDs, comma separated. Default: any.
-r Download RNA sequences, too.
@@ -262,8 +262,10 @@ SPECIES_TAXID_FIELD=7
VERSION_STATUS_FIELD=11
ASSEMBLY_LEVEL_FIELD=12
FTP_PATH_FIELD=20
+FTP_PATH_FIELD2=21 ## Needed for wrongly formatted virus files - hopefully just a temporary fix
-AWK_QUERY="\$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\" && \$$VERSION_STATUS_FIELD==\"latest\""
+AWK_QUERY="\$$VERSION_STATUS_FIELD==\"latest\""
+[[ "$ASSEMBLY_LEVEL" != "Any" ]] && AWK_QUERY="$AWK_QUERY && \$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\""
[[ "$REFSEQ_CATEGORY" != "" ]] && AWK_QUERY="$AWK_QUERY && \$$REFSEQ_CAT_FIELD==\"$REFSEQ_CATEGORY\""
TAXID=${TAXID//,/|}
@@ -320,11 +322,22 @@ for DOMAIN in $DOMAINS; do
N_EXPECTED=`cat "$ASSEMBLY_SUMMARY_FILE" | wc -l`
[[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }
- echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
- cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
- tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
+
+ if [[ "$DOMAIN" == "viral" ]]; then
+ ## Wrong columns in viral assembly summary files - the path is sometimes in field 20, sometimes 21
+ cut -f "$TAXID_FIELD,$FTP_PATH_FIELD,$FTP_PATH_FIELD2" "$ASSEMBLY_SUMMARY_FILE" | \
+ sed 's/^\(.*\)\t\(ftp:.*\)\t.*/\1\t\2/;s/^\(.*\)\t.*\t\(ftp:.*\)/\1\t\2/' | \
+ sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
+ tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
+ else
+ echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
+ cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
+ tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
+ fi
echo >&2
+
+
if [[ "$DOWNLOAD_RNA" == "1" && ! `echo $DOMAIN | egrep 'bacteria|viral|archaea'` ]]; then
echo "Downloadinging rna sequence files" >&2
cut -f $TAXID_FIELD,$FTP_PATH_FIELD "$ASSEMBLY_SUMMARY_FILE"| sed 's#\([^/]*\)$#\1/\1_rna.fna.gz#' |\
=====================================
centrifuge-kreport
=====================================
--- a/centrifuge-kreport
+++ b/centrifuge-kreport
@@ -9,12 +9,17 @@
use strict;
use warnings;
use Getopt::Long;
+use File::Basename;
+use Cwd;
+use Cwd 'cwd' ;
+use Cwd 'abs_path' ;
my ($centrifuge_index, $min_score, $min_length);
my $only_unique = 0;
my $show_zeros = 0;
my $is_cnts_table = 0;
my $PROG = "centrifuge-kreport";
+my $CWD = dirname( abs_path( $0 ) ) ;
GetOptions(
"help" => \&display_help,
@@ -71,9 +76,23 @@ if ($is_cnts_table) {
$seq_count += $count;
}
} else {
- <>;
+ my $header = <>;
+ my @cols = split /\s+/, $header ;
+ my %headerMap ;
+ for ( my $i = 0 ; $i < scalar( @cols ) ; ++$i )
+ {
+ $headerMap{ $cols[$i] } = $i ;
+ }
while (<>) {
- my (undef,$seqID,$taxid,$score, undef, $hitLength, $queryLength, $numMatches) = split /\t/;
+ #my (undef,$seqID,$taxid,$score, undef, $hitLength, $queryLength, $numMatches) = split /\t/;
+ my @cols = split /\s+/ ;
+ my $seqID = $cols[ $headerMap{ "seqID" } ] ;
+ my $taxid = $cols[ $headerMap{ "taxID" } ] ;
+ my $score = $cols[ $headerMap{ "score" } ] ;
+ my $hitLength = $cols[ $headerMap{ "hitLength" } ] ;
+ my $queryLength = $cols[ $headerMap{ "queryLength" } ] ;
+ my $numMatches = $cols[ $headerMap{ "numMatches" } ] ;
+
next if $only_unique && $numMatches > 1;
next if defined $min_length && $hitLength < $min_length;
next if defined $min_score && $score < $min_score;
@@ -150,7 +169,7 @@ sub dfs_summation {
sub load_taxonomy {
print STDERR "Loading names file ...\n";
- open NAMES, "-|", "centrifuge-inspect --name-table $centrifuge_index"
+ open NAMES, "-|", "$CWD/centrifuge-inspect --name-table $centrifuge_index"
or die "$PROG: can't open names file: $!\n";
while (<NAMES>) {
chomp;
@@ -162,7 +181,7 @@ sub load_taxonomy {
close NAMES;
print STDERR "Loading nodes file ...\n";
- open NODES, "-|", "centrifuge-inspect --taxonomy-tree $centrifuge_index"
+ open NODES, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index"
or die "$PROG: can't open nodes file: $!\n";
while (<NODES>) {
chomp;
=====================================
centrifuge-promote
=====================================
--- /dev/null
+++ b/centrifuge-promote
@@ -0,0 +1,122 @@
+#!/usr/bin/env perl
+
+use strict ;
+use warnings ;
+
+use File::Basename;
+use Cwd;
+use Cwd 'cwd' ;
+use Cwd 'abs_path' ;
+
+
+die "Usage: centrifuge-promote.pl centrifuge_index centrifuge_output level > output\n\n".
+ "Promote the taxonomy id to specified level in Centrifuge output.\n" if ( @ARGV == 0 ) ;
+
+my $CWD = dirname( abs_path( $0 ) ) ;
+# Go through the index to obtain the taxonomy tree
+my %taxParent ;
+my %taxIdToSeqId ;
+my %taxLevel ;
+
+my $centrifuge_index = $ARGV[0] ;
+open FP1, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index" or die "can't open $!\n" ;
+while ( <FP1> )
+{
+ chomp ;
+ my @cols = split /\t\|\t/;
+ $taxParent{ $cols[0] } = $cols[1] ;
+ $taxLevel{ $cols[0] } = $cols[2] ;
+}
+close FP1 ;
+open FP1, "-|", "$CWD/centrifuge-inspect --conversion-table $centrifuge_index" or die "can't open $!\n" ;
+while ( <FP1> )
+{
+ chomp ;
+ my @cols = split /\t/ ;
+ $taxIdToSeqId{ $cols[1] } = $cols[0] ;
+}
+close FP1 ;
+
+# Go through the output of centrifuge
+my $level = $ARGV[2] ;
+sub PromoteTaxId
+{
+ my $tid = $_[0] ;
+ return 0 if ( $tid <= 0 || !defined( $taxLevel{ $tid } ) ) ;
+
+ if ( $taxLevel{ $tid } eq $level )
+ {
+ return $tid ;
+ }
+ else
+ {
+ return 0 if ( $tid <= 1 ) ;
+ return PromoteTaxId( $taxParent{ $tid } ) ;
+ }
+}
+
+sub OutputPromotedLines
+{
+ my @lines = @{ $_[0] } ;
+ return if ( scalar( @lines ) <= 0 ) ;
+
+ my @newLines ;
+ my $i ;
+ my $numMatches = 0 ;
+ my %showedUpTaxId ;
+ my $tab = sprintf( "\t" ) ;
+ for ( $i = 0 ; $i < scalar( @lines ) ; ++$i )
+ {
+ my @cols = split /\t+/, $lines[ $i ] ;
+ my $newTid = PromoteTaxId( $cols[2] ) ;
+ if ( $newTid <= 1 )
+ {
+ $newTid = $cols[2] ;
+ }
+ my $newLevel = $cols[1] ;
+ $newLevel = $taxLevel{ $newTid } if ( $newTid >= 1 && defined $taxLevel{ $newTid } ) ;
+
+ next if ( defined $showedUpTaxId{ $newTid } ) ;
+
+ $showedUpTaxId{ $newTid } = 1 ;
+ ++$numMatches ;
+
+ $cols[2] = $newTid ;
+ $cols[1] = $newLevel ;
+ push @newLines, join( $tab, @cols ) ;
+ }
+
+ for ( $i = 0 ; $i < scalar( @newLines ) ; ++$i )
+ {
+ my @cols = split /\t+/, $newLines[$i] ;
+ $cols[-1] = $numMatches ;
+ print join( $tab, @cols ), "\n" ;
+ }
+}
+
+open FP1, $ARGV[1] ;
+my $header = <FP1> ;
+my $prevReadId = "" ;
+my @lines ;
+
+print $header ;
+while ( <FP1> )
+{
+ chomp ;
+ my @cols = split /\t/ ;
+ if ( $cols[0] eq $prevReadId )
+ {
+ push @lines, $_ ;
+ }
+ else
+ {
+ $prevReadId = $cols[0] ;
+
+ OutputPromotedLines( \@lines ) ;
+
+ undef @lines ;
+ push @lines, $_ ;
+ }
+}
+OutputPromotedLines( \@lines ) ;
+close FP1 ;
=====================================
centrifuge.cpp
=====================================
--- a/centrifuge.cpp
+++ b/centrifuge.cpp
@@ -501,7 +501,7 @@ static void resetOptions() {
col_name_map["CIGAR"] = PLACEHOLDER;
col_name_map["RNEXT"] = SEQ_ID;
col_name_map["PNEXT"] = PLACEHOLDER_ZERO;
- col_name_map["TLEN"] = PLACEHOLDER_ZERO;
+ col_name_map["TLEN"] = QUERY_LENGTH ; //PLACEHOLDER_ZERO;
col_name_map["SEQ"] = SEQ;
col_name_map["QUAL"] = QUAL;
=====================================
classifier.h
=====================================
--- a/classifier.h
+++ b/classifier.h
@@ -418,7 +418,10 @@ public:
if(!_tree_traverse) {
if(_hitMap.size() > (size_t)rp.khits)
+ {
+ reportUnclassified( sink ) ;
return 0;
+ }
}
uint8_t rank = 0;
@@ -511,7 +514,10 @@ public:
}
}
if(!only_host_taxIDs && _hitMap.size() > (size_t)rp.khits)
+ {
+ reportUnclassified( sink ) ;
return 0;
+ }
#if 0
// boost up the score if the assignment is unique
@@ -528,7 +534,7 @@ public:
max_score += (rdlen > 15 ? (rdlen - 15) * (rdlen - 15) : 0);
}
-
+ bool reported = false ;
for(size_t gi = 0; gi < _hitMap.size(); gi++) {
assert_gt(_hitMap[gi].score, 0);
HitCount<index_t>& hitCount = _hitMap[gi];
@@ -555,8 +561,13 @@ public:
hitCount.readPositions,
isFw);
sink.report(0, &rs);
+ reported = true ;
}
- return 0;
+
+ if ( reported == false )
+ reportUnclassified( sink ) ;
+
+ return 0;
}
bool getGenomeIdx(
@@ -968,62 +979,73 @@ private:
size_t offset,
size_t length)
{
- size_t idx = 0;
+ size_t idx = 0;
#ifdef LI_DEBUG
- cout << "Add " << taxID << " " << partialHitScore << " " << weightedHitLen << endl;
+ cout << "Add " << taxID << " " << partialHitScore << " " << weightedHitLen << endl;
#endif
- const TaxonomyPathTable& pathTable = ebwt.paths();
- pathTable.getPath(taxID, _tempPath);
- uint8_t rank = _classification_rank;
- if(rank > 0) {
- for(; rank < _tempPath.size(); rank++) {
- if(_tempPath[rank] != 0) {
- taxID = _tempPath[rank];
- break;
- }
- }
- }
-
- for(; idx < hitMap.size(); ++idx) {
- bool same = false;
- if(rank == 0) {
- same = (uniqueID == hitMap[idx].uniqueID);
- } else {
- same = (taxID == hitMap[idx].taxID);
- }
- if(same) {
- if(hitMap[idx].timeStamp != hi) {
- hitMap[idx].count += 1;
- hitMap[idx].scores[rdi][fwi] += partialHitScore;
- hitMap[idx].summedHitLens[rdi][fwi] += weightedHitLen;
- hitMap[idx].timeStamp = (uint32_t)hi;
- hitMap[idx].readPositions.push_back(make_pair(offset, length));
- }
- break;
- }
- }
-
- if(idx >= hitMap.size() && !considerOnlyIfPreviouslyObserved) {
- hitMap.expand();
- HitCount<index_t>& hitCount = hitMap.back();
- hitCount.reset();
- hitCount.uniqueID = uniqueID;
- hitCount.count = 1;
- hitCount.scores[rdi][fwi] = partialHitScore;
- hitCount.summedHitLens[rdi][fwi] = weightedHitLen;
- hitCount.timeStamp = (uint32_t)hi;
- hitCount.readPositions.clear();
- hitCount.readPositions.push_back(make_pair(offset, length));
- hitCount.path = _tempPath;
- hitCount.rank = rank;
- hitCount.taxID = taxID;
- }
+ const TaxonomyPathTable& pathTable = ebwt.paths();
+ pathTable.getPath(taxID, _tempPath);
+ uint8_t rank = _classification_rank;
+ if(rank > 0) {
+ for(; rank < _tempPath.size(); rank++) {
+ if(_tempPath[rank] != 0) {
+ taxID = _tempPath[rank];
+ break;
+ }
+ }
+ }
+
+ for(; idx < hitMap.size(); ++idx) {
+ bool same = false;
+ if(rank == 0) {
+ same = (uniqueID == hitMap[idx].uniqueID);
+ } else {
+ same = (taxID == hitMap[idx].taxID);
+ }
+ if(same) {
+ if(hitMap[idx].timeStamp != hi) {
+ hitMap[idx].count += 1;
+ hitMap[idx].scores[rdi][fwi] += partialHitScore;
+ hitMap[idx].summedHitLens[rdi][fwi] += weightedHitLen;
+ hitMap[idx].timeStamp = (uint32_t)hi;
+ hitMap[idx].readPositions.push_back(make_pair(offset, length));
+ }
+ break;
+ }
+ }
+
+ if(idx >= hitMap.size() && !considerOnlyIfPreviouslyObserved) {
+ hitMap.expand();
+ HitCount<index_t>& hitCount = hitMap.back();
+ hitCount.reset();
+ hitCount.uniqueID = uniqueID;
+ hitCount.count = 1;
+ hitCount.scores[rdi][fwi] = partialHitScore;
+ hitCount.summedHitLens[rdi][fwi] = weightedHitLen;
+ hitCount.timeStamp = (uint32_t)hi;
+ hitCount.readPositions.clear();
+ hitCount.readPositions.push_back(make_pair(offset, length));
+ hitCount.path = _tempPath;
+ hitCount.rank = rank;
+ hitCount.taxID = taxID;
+ }
- //if considerOnlyIfPreviouslyObserved and it was not found, genus Idx size is equal to the genus Map size
- //assert_lt(genusIdx, genusMap.size());
- return idx;
+ //if considerOnlyIfPreviouslyObserved and it was not found, genus Idx size is equal to the genus Map size
+ //assert_lt(genusIdx, genusMap.size());
+ return idx;
}
+ void reportUnclassified( AlnSinkWrap<index_t>& sink )
+ {
+ AlnRes rs ;
+ EList<pair<uint32_t,uint32_t> > dummy ;
+ dummy.push_back( make_pair( 0, 0 ) ) ;
+ rs.init( 0, 0, string( "unclassified" ), 0, 0, 0, dummy, true ) ;
+ sink.report( 0, &rs ) ;
+ }
+
+
+
// compare BWTHits by size, ascending, first, then by length, descending
// TODO: move this operator into BWTHits if that is the standard way we would like to sort
// TODO: this ordering does not necessarily give the best results
=====================================
doc/manual.inc.html
=====================================
--- a/doc/manual.inc.html
+++ b/doc/manual.inc.html
@@ -1,897 +0,0 @@
-<div id="TOC">
-<ul>
-<li><a href="#introduction">Introduction</a><ul>
-<li><a href="#what-is-centrifuge">What is Centrifuge?</a></li>
-</ul></li>
-<li><a href="#obtaining-centrifuge">Obtaining Centrifuge</a><ul>
-<li><a href="#building-from-source">Building from source</a></li>
-</ul></li>
-<li><a href="#running-centrifuge">Running Centrifuge</a><ul>
-<li><a href="#adding-to-path">Adding to PATH</a></li>
-<li><a href="#before-running-centrifuge">Before running Centrifuge</a></li>
-<li><a href="#database-download-and-index-building">Database download and index building</a><ul>
-<li><a href="#building-index-on-all-complete-bacterial-and-viral-genomes">Building index on all complete bacterial and viral genomes</a></li>
-<li><a href="#adding-human-or-mouse-genome-to-the-index">Adding human or mouse genome to the index</a></li>
-<li><a href="#nt-database">nt database</a></li>
-<li><a href="#custom-database">Custom database</a></li>
-<li><a href="#centrifuge-classification-output">Centrifuge classification output</a></li>
-<li><a href="#centrifuge-summary-output-the-default-filename-is-centrifuge_report.tsv">Centrifuge summary output (the default filename is centrifuge_report.tsv)</a></li>
-<li><a href="#kraken-style-report">Kraken-style report</a></li>
-</ul></li>
-<li><a href="#inspecting-the-centrifuge-index">Inspecting the Centrifuge index</a></li>
-<li><a href="#wrapper">Wrapper</a></li>
-<li><a href="#performance-tuning">Performance tuning</a></li>
-<li><a href="#command-line">Command Line</a><ul>
-<li><a href="#usage">Usage</a></li>
-<li><a href="#main-arguments">Main arguments</a></li>
-<li><a href="#options">Options</a></li>
-</ul></li>
-</ul></li>
-<li><a href="#the-centrifuge-build-indexer">The <code>centrifuge-build</code> indexer</a><ul>
-<li><a href="#command-line-1">Command Line</a><ul>
-<li><a href="#main-arguments-1">Main arguments</a></li>
-<li><a href="#options-1">Options</a></li>
-</ul></li>
-</ul></li>
-<li><a href="#the-centrifuge-inspect-index-inspector">The <code>centrifuge-inspect</code> index inspector</a><ul>
-<li><a href="#command-line-2">Command Line</a><ul>
-<li><a href="#main-arguments-2">Main arguments</a></li>
-<li><a href="#options-2">Options</a></li>
-</ul></li>
-</ul></li>
-<li><a href="#getting-started-with-centrifuge">Getting started with Centrifuge</a><ul>
-<li><a href="#indexing-a-reference-genome">Indexing a reference genome</a></li>
-<li><a href="#classifying-example-reads">Classifying example reads</a></li>
-</ul></li>
-</ul>
-</div>
-<!--
- ! This manual is written in "markdown" format and thus contains some
- ! distracting formatting clutter. See 'MANUAL' for an easier-to-read version
- ! of this text document, or see the HTML manual online.
- ! -->
-
-<h1 id="introduction">Introduction</h1>
-<h2 id="what-is-centrifuge">What is Centrifuge?</h2>
-<p><a href="http://www.ccb.jhu.edu/software/centrifuge">Centrifuge</a> is a novel microbial classification engine that enables rapid, accurate, and sensitive labeling of reads and quantification of species on desktop computers. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index, optimized specifically for the metagenomic classification problem. Centrifuge requires a relatively small index (5.8 GB for all complete bacterial and viral genomes plus the human genome) and classifies sequences at a very high speed, allowing it to process the millions of reads from a typical high-throughput DNA sequencing run within a few minutes. Together these advances enable timely and accurate analysis of large metagenomics data sets on conventional desktop computers.</p>
-<h1 id="obtaining-centrifuge">Obtaining Centrifuge</h1>
-<p>Download Centrifuge and binaries from the Releases sections on the right side. Binaries are available for Intel architectures (<code>x86_64</code>) running Linux, and Mac OS X.</p>
-<h2 id="building-from-source">Building from source</h2>
-<p>Building Centrifuge from source requires a GNU-like environment with GCC, GNU Make and other basics. It should be possible to build Centrifuge on most vanilla Linux installations or on a Mac installation with <a href="http://developer.apple.com/xcode/">Xcode</a> installed. Centrifuge can also be built on Windows using <a href="http://www.cygwin.com/">Cygwin</a> or <a href="http://www.mingw.org/">MinGW</a> (MinGW recommended). For a MinGW build the choice of what compiler is to be used is important since this will determine if a 32 or 64 bit code can be successfully compiled using it. If there is a need to generate both 32 and 64 bit on the same machine then a multilib MinGW has to be properly installed. <a href="http://www.mingw.org/wiki/msys">MSYS</a>, the <a href="http://cygwin.com/packages/mingw-zlib/">zlib</a> library, and depending on architecture <a href="http://sourceware.org/pthreads-win32/">pthreads</a> library are also required. We are recommending a 64 bit build since it has some clear advantages in real life research problems. In order to simplify the MinGW setup it might be worth investigating popular MinGW personal builds since these are coming already prepared with most of the toolchains needed.</p>
-<p>First, download the [source package] from the Releases secion on the right side. Unzip the file, change to the unzipped directory, and build the Centrifuge tools by running GNU <code>make</code> (usually with the command <code>make</code>, but sometimes with <code>gmake</code>) with no arguments. If building with MinGW, run <code>make</code> from the MSYS environment.</p>
-<p>Centrifuge is using the multithreading software model in order to speed up execution times on SMP architectures where this is possible. On POSIX platforms (like linux, Mac OS, etc) it needs the pthread library. Although it is possible to use pthread library on non-POSIX platform like Windows, due to performance reasons Centrifuge will try to use Windows native multithreading if possible.</p>
-<p>For the support of SRA data access in HISAT2, please download and install the <a href="https://github.com/ncbi/ngs/wiki/Downloads">NCBI-NGS</a> toolkit. When running <code>make</code>, specify additional variables as follow. <code>make USE_SRA=1 NCBI_NGS_DIR=/path/to/NCBI-NGS-directory NCBI_VDB_DIR=/path/to/NCBI-NGS-directory</code>, where <code>NCBI_NGS_DIR</code> and <code>NCBI_VDB_DIR</code> will be used in Makefile for -I and -L compilation options. For example, $(NCBI_NGS_DIR)/include and $(NCBI_NGS_DIR)/lib64 will be used.</p>
-<h1 id="running-centrifuge">Running Centrifuge</h1>
-<h2 id="adding-to-path">Adding to PATH</h2>
-<p>By adding your new Centrifuge directory to your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH environment variable</a>, you ensure that whenever you run <code>centrifuge</code>, <code>centrifuge-build</code>, <code>centrifuge-download</code> or <code>centrifuge-inspect</code> from the command line, you will get the version you just installed without having to specify the entire path. This is recommended for most users. To do this, follow your operating system's instructions for adding the directory to your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH</a>.</p>
-<p>If you would like to install Centrifuge by copying the Centrifuge executable files to an existing directory in your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH</a>, make sure that you copy all the executables, including <code>centrifuge</code>, <code>centrifuge-class</code>, <code>centrifuge-build</code>, <code>centrifuge-build-bin</code>, <code>centrifuge-download</code> <code>centrifuge-inspect</code> and <code>centrifuge-inspect-bin</code>. Furthermore you need the programs in the scripts/ folder if you opt for genome compression in the database construction.</p>
-<h2 id="before-running-centrifuge">Before running Centrifuge</h2>
-<p>Classification is considerably different from alignment in that classification is performed on a large set of genomes as opposed to on just one reference genome as in alignment. Currently, an enormous number of complete genomes are available at the GenBank (e.g. >4,000 bacterial genomes, >10,000 viral genomes, …). These genomes are organized in a taxonomic tree where each genome is located at the bottom of the tree, at the strain or subspecies level. On the taxonomic tree, genomes have ancestors usually situated at the species level, and those ancestors also have ancestors at the genus level and so on up the family level, the order level, class level, phylum, kingdom, and finally at the root level.</p>
-<p>Given the gigantic number of genomes available, which continues to expand at a rapid rate, and the development of the taxonomic tree, which continues to evolve with new advancements in research, we have designed Centrifuge to be flexible and general enough to reflect this huge database. We provide several standard indexes that will meet most of users’ needs (see the side panel - Indexes). In our approach our indexes not only include raw genome sequences, but also genome names/sizes and taxonomic trees. This enables users to perform additional analyses on Centrifuge’s classification output without the need to download extra database sources. This also eliminates the potential issue of discrepancy between the indexes we provide and the databases users may otherwise download. We plan to provide a couple of additional standard indexes in the near future, and update the indexes on a regular basis.</p>
-<p>We encourage first time users to take a look at and follow a <a href="#centrifuge-example"><code>small example</code></a> that illustrates how to build an index, how to run Centrifuge using the index, how to interpret the classification results, and how to extract additional genomic information from the index. For those who choose to build customized indexes, please take a close look at the following description.</p>
-<h2 id="database-download-and-index-building">Database download and index building</h2>
-<p>Centrifuge indexes can be built with arbritary sequences. Standard choices are all of the complete bacterial and viral genomes, or using the sequences that are part of the BLAST nt database. Centrifuge always needs the nodes.dmp file from the NCBI taxonomy dump to build the taxonomy tree, as well as a sequence ID to taxonomy ID map. The map is a tab-separated file with the sequence ID to taxonomy ID map.</p>
-<p>To download all of the complete archaeal, viral, and bacterial genomes from RefSeq, and build the index:</p>
-<p>Centrifuge indices can be build on arbritary sequences. Usually an ensemble of genomes is used - such as all complete microbial genomes in the RefSeq database, or all sequences in the BLAST nt database.</p>
-<p>To map sequence identifiers to taxonomy IDs, and taxonomy IDs to names and its parents, three files are necessary in addition to the sequence files:</p>
-<ul>
-<li>taxonomy tree: typically nodes.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their parents</li>
-<li>names file: typically names.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their scientific name</li>
-<li>a tab-separated sequence ID to taxonomy ID mapping</li>
-</ul>
-<p>When using the provided scripts to download the genomes, these files are automatically downloaded or generated. When using a custom taxonomy or sequence files, please refer to the section <code>TODO</code> to learn more about their format.</p>
-<h3 id="building-index-on-all-complete-bacterial-and-viral-genomes">Building index on all complete bacterial and viral genomes</h3>
-<p>Use <code>centrifuge-download</code> to download genomes from NCBI. The following two commands download the NCBI taxonomy to <code>taxonomy/</code> in the current directory, and all complete archaeal, bacterial and viral genomes to <code>library/</code>. Low-complexity regions in the genomes are masked after download (parameter <code>-m</code>) using blast+'s <code>dustmasker</code>. <code>centrifuge-download</code> outputs tab-separated sequence ID to taxonomy ID mappings to standard out, which are required by <code>centrifuge-build</code>.</p>
-<pre><code>centrifuge-download -o taxonomy taxonomy
-centrifuge-download -o library -m -d "archaea,bacteria,viral" refseq > seqid2taxid.map</code></pre>
-<p>To build the index, first concatenate all downloaded sequences into a single file, and then run <code>centrifuge-build</code>:</p>
-<pre><code>cat library/*/*.fna > input-sequences.fna
-
-## build centrifuge index with 4 threads
-centrifuge-build -p 4 --conversion-table seqid2taxid.map \
- --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \
- input-sequences.fna abv</code></pre>
-<p>After building the index, all files except the index *.[123].cf files may be removed. If you also want to include the human and/or the mouse genome, add their sequences to the library folder before building the index with one of the following commands:</p>
-<p>After the index building, all but the *.[123].cf index files may be removed. I.e. the files in the <code>library/</code> and <code>taxonomy/</code> directories are no longer needed.</p>
-<h3 id="adding-human-or-mouse-genome-to-the-index">Adding human or mouse genome to the index</h3>
-<p>The human and mouse genomes can also be downloaded using <code>centrifuge-download</code>. They are in the domain "vertebrate_mammalian" (argument <code>-d</code>), are assembled at the chromosome level (argument <code>-a</code>) and categorized as reference genomes by RefSeq (<code>-c</code>). The argument <code>-t</code> takes a comma-separated list of taxonomy IDs - e.g. <code>9606</code> for human and <code>10090</code> for mouse:</p>
-<pre><code># download mouse and human reference genomes
-centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606,10090 -c 'reference genome' >> seqid2taxid.map
-# only human
-centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606 -c 'reference genome' >> seqid2taxid.map
-# only mouse
-centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 10090 -c 'reference genome' >> seqid2taxid.map</code></pre>
-<h3 id="nt-database">nt database</h3>
-<p>NCBI BLAST's nt database contains all spliced non-redundant coding sequences from multiplpe databases, inferred from genommic sequences. Traditionally used with BLAST, a download of the FASTA is provided on the NCBI homepage. Building an index with any database requires the user to creates a sequence ID to taxonomy ID map that can be generated from a GI taxid dump:</p>
-<pre><code>wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz
-gunzip nt.gz && mv -v nt nt.fa
-
-# Get mapping file
-wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
-gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
-
-# build index using 16 cores and a small bucket size, which will require less memory
-centrifuge-build -p 16 --bmax 1342177280 --conversion-table gi_taxid_nucl.map \
- --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \
- nt.fa nt</code></pre>
-<h3 id="custom-database">Custom database</h3>
-<p>TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map</p>
-<h3 id="centrifuge-classification-output">Centrifuge classification output</h3>
-<p>The following example shows classification assignments for a read. The assignment output has 8 columns.</p>
-<pre><code>readID seqID taxID score 2ndBestScore hitLength queryLength numMatches
-1_1 gi|4 9646 4225 0 80 80 1
-
-The first column is the read ID from a raw sequencing read (e.g., 1_1 in the example).
-The second column is the sequence ID of the genomic sequence, where the read is classified (e.g., gi|4).
-The third column is the taxonomic ID of the genomic sequence in the second column (e.g., 9646).
-The fourth column is the score for the classification, which is the weighted sum of hits (e.g., 4225)
-The fifth column is the score for the next best classification (e.g., 0).
-The sixth column is a pair of two numbers: (1) an approximate number of base pairs of the read that match the genomic sequence and (2) the length of a read or the combined length of mate pairs (e.g., 80 / 80).
-The seventh column is a pair of two numbers: (1) an approximate number of base pairs of the read that match the genomic sequence and (2) the length of a read or the combined length of mate pairs (e.g., 80 / 80).
-The eighth column is the number of classifications for this read, indicating how many assignments were made (e.g.,1).</code></pre>
-<h3 id="centrifuge-summary-output-the-default-filename-is-centrifuge_report.tsv">Centrifuge summary output (the default filename is centrifuge_report.tsv)</h3>
-<p>The following example shows a classification summary for each genome or taxonomic unit. The assignment output has 7 columns.</p>
-<pre><code>name taxID taxRank genomeSize numReads numUniqueReads abundance
-Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis 36870 leaf 703004 5981 5964 0.0152317
-
-The first column is the name of a genome, or the name corresponding to a taxonomic ID (the second column) at a rank higher than the strain (e.g., Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis).
-The second column is the taxonomic ID (e.g., 36870).
-The third column is the taxonomic rank (e.g., leaf).
-The fourth column is the length of the genome sequence (e.g., 703004).
-The fifth column is the number of reads classified to this genomic sequence including multi-classified reads (e.g., 5981).
-The sixth column is the number of reads uniquely classified to this genomic sequence (e.g., 5964).
-The seventh column is the proportion of this genome normalized by its genomic length (e.g., 0.0152317).</code></pre>
-<p>As the GenBank database is incomplete (i.e., many more genomes remain to be identified and added), and reads have sequencing errors, classification programs including Centrifuge often report many false assignments. In order to perform more conservative analyses, users may want to discard assignments for reads having a matching length (8th column in the output of Centrifuge) of 40% or lower. It may be also helpful to use a score (4th column) for filtering out some assignments. Our future research plans include working on developing methods that estimate confidence scores for assignments.</p>
-<h3 id="kraken-style-report">Kraken-style report</h3>
-<p><code>centrifuge-kreport</code> can be used to make a Kraken-style report from the Centrifuge output including taxonomy information:</p>
-<p><code>centrifuge-kreport -x <centrifuge index> <centrifuge out file></code></p>
-<h2 id="inspecting-the-centrifuge-index">Inspecting the Centrifuge index</h2>
-<p>The index can be inspected with <code>centrifuge-inspect</code>. To extract raw sequences:</p>
-<pre><code>centrifuge-inspect <centrifuge index></code></pre>
-<p>Extract the sequence ID to taxonomy ID conversion table from the index</p>
-<pre><code>centrifuge-inspect --conversion-table <centrifuge index></code></pre>
-<p>Extract the taxonomy tree from the index:</p>
-<pre><code>centrifuge-inspect --taxonomy-tree <centrifuge index></code></pre>
-<p>Extract the lengths of the sequences from the index (each row has two columns: taxonomic ID and length):</p>
-<pre><code>centrifuge-inspect --size-table <centrifuge index></code></pre>
-<p>Extract the names from the index (each row has two columns: taxonomic ID and name):</p>
-<pre><code>centrifuge-inspect --name-table <centrifuge index></code></pre>
-<h2 id="wrapper">Wrapper</h2>
-<p>The <code>centrifuge</code>, <code>centrifuge-build</code> and <code>centrifuge-inspect</code> executables are actually wrapper scripts that call binary programs as appropriate. Also, the <code>centrifuge</code> wrapper provides some key functionality, like the ability to handle compressed inputs, and the functionality for [<code>--un</code>], [<code>--al</code>] and related options.</p>
-<p>It is recommended that you always run the centrifuge wrappers and not run the binaries directly.</p>
-<h2 id="performance-tuning">Performance tuning</h2>
-<ol style="list-style-type: decimal">
-<li><p>If your computer has multiple processors/cores, use <code>-p NTHREADS</code></p>
-<p>The <a href="#centrifuge-build-options-p"><code>-p</code></a> option causes Centrifuge to launch a specified number of parallel search threads. Each thread runs on a different processor/core and all threads find alignments in parallel, increasing alignment throughput by approximately a multiple of the number of threads (though in practice, speedup is somewhat worse than linear).</p></li>
-</ol>
-<h2 id="command-line">Command Line</h2>
-<h3 id="usage">Usage</h3>
-<pre><code>centrifuge [options]* -x <centrifuge-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [--report-file <report file name> -S <classification output file name>]</code></pre>
-<h3 id="main-arguments">Main arguments</h3>
-<table><tr><td>
-
-<pre><code>-x <centrifuge-idx></code></pre>
-</td><td>
-
-<p>The basename of the index for the reference genomes. The basename is the name of any of the index files up to but not including the final <code>.1.cf</code> / etc.<br /><code>centrifuge</code> looks for the specified index first in the current directory, then in the directory specified in the <code>CENTRIFUGE_INDEXES</code> environment variable.</p>
-</td></tr><tr><td>
-
-<pre><code>-1 <m1></code></pre>
-</td><td>
-
-<p>Comma-separated list of files containing mate 1s (filename usually includes <code>_1</code>), e.g. <code>-1 flyA_1.fq,flyB_1.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m2></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>centrifuge</code> will read the mate 1s from the "standard in" or "stdin" filehandle.</p>
-</td></tr><tr><td>
-
-<pre><code>-2 <m2></code></pre>
-</td><td>
-
-<p>Comma-separated list of files containing mate 2s (filename usually includes <code>_2</code>), e.g. <code>-2 flyA_2.fq,flyB_2.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m1></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>centrifuge</code> will read the mate 2s from the "standard in" or "stdin" filehandle.</p>
-</td></tr><tr><td>
-
-<pre><code>-U <r></code></pre>
-</td><td>
-
-<p>Comma-separated list of files containing unpaired reads to be aligned, e.g. <code>lane1.fq,lane2.fq,lane3.fq,lane4.fq</code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>centrifuge</code> gets the reads from the "standard in" or "stdin" filehandle.</p>
-</td></tr><tr><td>
-
-<pre><code>--sra-acc <SRA accession number></code></pre>
-</td><td>
-
-<p>Comma-separated list of SRA accession numbers, e.g. <code>--sra-acc SRR353653,SRR353654</code>. Information about read types is available at http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?sp=runinfo&acc=<b>sra-acc</b>&retmode=xml, where <b>sra-acc</b> is SRA accession number. If users run HISAT2 on a computer cluster, it is recommended to disable SRA-related caching (see the instruction at <a href="https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration">SRA-MANUAL</a>).</p>
-</td></tr><tr><td>
-
-<pre><code>-S <filename></code></pre>
-</td><td>
-
-<p>File to write classification results to. By default, assignments are written to the "standard out" or "stdout" filehandle (i.e. the console).</p>
-</td></tr><tr><td>
-
-<pre><code>--report-file <filename></code></pre>
-</td><td>
-
-<p>File to write a classification summary to (default: centrifuge_report.tsv).</p>
-</td></tr></table>
-
-<h3 id="options">Options</h3>
-<h4 id="input-options">Input options</h4>
-<table>
-<tr><td id="centrifuge-options-q">
-
-<pre><code>-q</code></pre>
-</td><td>
-
-<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTQ files. FASTQ files usually have extension <code>.fq</code> or <code>.fastq</code>. FASTQ is the default format. See also: <a href="#centrifuge-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#centrifuge-options-int-quals"><code>--int-quals</code></a>.</p>
-</td></tr>
-<tr><td id="centrifuge-options-qseq">
-
-<pre><code>--qseq</code></pre>
-</td><td>
-
-<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are QSEQ files. QSEQ files usually end in <code>_qseq.txt</code>. See also: <a href="#centrifuge-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#centrifuge-options-int-quals"><code>--int-quals</code></a>.</p>
-</td></tr>
-<tr><td id="centrifuge-options-f">
-
-<pre><code>-f</code></pre>
-</td><td>
-
-<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTA files. FASTA files usually have extension <code>.fa</code>, <code>.fasta</code>, <code>.mfa</code>, <code>.fna</code> or similar. FASTA files do not have a way of specifying quality values, so when <code>-f</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
-</td></tr>
-<tr><td id="centrifuge-options-r">
-
-<pre><code>-r</code></pre>
-</td><td>
-
-<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are files with one input sequence per line, without any other information (no read names, no qualities). When <code>-r</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
-</td></tr>
-<tr><td id="centrifuge-options-c">
-
-<pre><code>-c</code></pre>
-</td><td>
-
-<p>The read sequences are given on command line. I.e. <code><m1></code>, <code><m2></code> and <code><singles></code> are comma-separated lists of reads rather than lists of read files. There is no way to specify read names or qualities, so <code>-c</code> also implies <code>--ignore-quals</code>.</p>
-</td></tr>
-<tr><td id="centrifuge-options-s">
-
-<pre><code>-s/--skip <int></code></pre>
-</td><td>
-
-<p>Skip (i.e. do not align) the first <code><int></code> reads or pairs in the input.</p>
-</td></tr>
-<tr><td id="centrifuge-options-u">
-
-<pre><code>-u/--qupto <int></code></pre>
-</td><td>
-
-<p>Align the first <code><int></code> reads or read pairs from the input (after the <a href="#centrifuge-options-s"><code>-s</code>/<code>--skip</code></a> reads or pairs have been skipped), then stop. Default: no limit.</p>
-</td></tr>
-<tr><td id="centrifuge-options-5">
-
-<pre><code>-5/--trim5 <int></code></pre>
-</td><td>
-
-<p>Trim <code><int></code> bases from 5' (left) end of each read before alignment (default: 0).</p>
-</td></tr>
-<tr><td id="centrifuge-options-3">
-
-<pre><code>-3/--trim3 <int></code></pre>
-</td><td>
-
-<p>Trim <code><int></code> bases from 3' (right) end of each read before alignment (default: 0).</p>
-</td></tr><tr><td id="centrifuge-options-phred33-quals">
-
-<pre><code>--phred33</code></pre>
-</td><td>
-
-<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 33. This is also called the "Phred+33" encoding, which is used by the very latest Illumina pipelines.</p>
-</td></tr>
-<tr><td id="centrifuge-options-phred64-quals">
-
-<pre><code>--phred64</code></pre>
-</td><td>
-
-<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 64. This is also called the "Phred+64" encoding.</p>
-</td></tr>
-<tr><td id="centrifuge-options-solexa-quals">
-
-<pre><code>--solexa-quals</code></pre>
-</td><td>
-
-<p>Convert input qualities from <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Solexa</a> (which can be negative) to <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred</a> (which can't). This scheme was used in older Illumina GA Pipeline versions (prior to 1.3). Default: off.</p>
-</td></tr>
-<tr><td id="centrifuge-options-int-quals">
-
-<pre><code>--int-quals</code></pre>
-</td><td>
-
-<p>Quality values are represented in the read input file as space-separated ASCII integers, e.g., <code>40 40 30 40</code>..., rather than ASCII characters, e.g., <code>II?I</code>.... Integers are treated as being on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> scale unless <a href="#centrifuge-options-solexa-quals"><code>--solexa-quals</code></a> is also specified. Default: off.</p>
-</td></tr></table>
-
-<h4 id="classification">Classification</h4>
-<table>
-
-<tr><td id="centrifuge-options-min-hitlen">
-
-<pre><code>--min-hitlen <int></code></pre>
-</td><td>
-
-<p>Minimum length of partial hits, which must be greater than 15 (default: 22)"</p>
-</td></tr>
-
-<tr><td id="centrifuge-options-k">
-
-<pre><code>-k <int></code></pre>
-</td><td>
-
-<p>It searches for at most <code><int></code> distinct, primary assignments for each read or pair.<br />Primary assignments mean assignments whose assignment score is equal or higher than any other assignments. If there are more primary assignments than this value, the search will merge some of the assignments into a higher taxonomic rank. The assignment score for a paired-end assignment equals the sum of the assignment scores of the individual mates. Default: 5</p>
-</td></tr>
-
-<tr><td id="centrifuge-options-host-taxids">
-
-<pre><code>--host-taxids</code></pre>
-</td><td>
-
-<p>A comma-separated list of taxonomic IDs that will be preferred in classification procedure. The descendants from these IDs will also be preferred. In case some of a read's assignments correspond to these taxonomic IDs, only those corresponding assignments will be reported.</p>
-</td></tr>
-
-<tr><td id="centrifuge-options-exclude-taxids">
-
-<pre><code>--exclude-taxids</code></pre>
-</td><td>
-
-<p>A comma-separated list of taxonomic IDs that will be excluded in classification procedure. The descendants from these IDs will also be exclude.</p>
-</td></tr>
-
-</table>
-
-
-<!--
-#### Alignment options
-
-<table>
-
-<tr><td id="centrifuge-options-n-ceil">
-
-[`--n-ceil`]: #centrifuge-options-n-ceil
-
- --n-ceil <func>
-
-</td><td>
-
-Sets a function governing the maximum number of ambiguous characters (usually
-`N`s and/or `.`s) allowed in a read as a function of read length. For instance,
-specifying `-L,0,0.15` sets the N-ceiling function `f` to `f(x) = 0 + 0.15 * x`,
-where x is the read length. See also: [setting function options]. Reads
-exceeding this ceiling are [filtered out]. Default: `L,0,0.15`.
-
-[filtered out]: #filtering
-
-</td></tr>
-
-<tr><td id="centrifuge-options-ignore-quals">
-
-[`--ignore-quals`]: #centrifuge-options-ignore-quals
-
- --ignore-quals
-
-</td><td>
-
-When calculating a mismatch penalty, always consider the quality value at the
-mismatched position to be the highest possible, regardless of the actual value.
-I.e. input is treated as though all quality values are high. This is also the
-default behavior when the input doesn't specify quality values (e.g. in [`-f`],
-[`-r`], or [`-c`] modes).
-
-</td></tr>
-<tr><td id="centrifuge-options-nofw">
-
-[`--nofw`]: #centrifuge-options-nofw
-
- --nofw/--norc
-
-</td><td>
-
-If `--nofw` is specified, `centrifuge` will not attempt to align unpaired reads to
-the forward (Watson) reference strand. If `--norc` is specified, `centrifuge` will
-not attempt to align unpaired reads against the reverse-complement (Crick)
-reference strand. In paired-end mode, `--nofw` and `--norc` pertain to the
-fragments; i.e. specifying `--nofw` causes `centrifuge` to explore only those
-paired-end configurations corresponding to fragments from the reverse-complement
-(Crick) strand. Default: both strands enabled.
-
-</td></tr>
-
-</table>
-
-#### Paired-end options
-
-<table>
-
-<tr><td id="centrifuge-options-fr">
-
-[`--fr`/`--rf`/`--ff`]: #centrifuge-options-fr
-[`--fr`]: #centrifuge-options-fr
-[`--rf`]: #centrifuge-options-fr
-[`--ff`]: #centrifuge-options-fr
-
- --fr/--rf/--ff
-
-</td><td>
-
-The upstream/downstream mate orientations for a valid paired-end alignment
-against the forward reference strand. E.g., if `--fr` is specified and there is
-a candidate paired-end alignment where mate 1 appears upstream of the reverse
-complement of mate 2 and the fragment length constraints ([`-I`] and [`-X`]) are
-met, that alignment is valid. Also, if mate 2 appears upstream of the reverse
-complement of mate 1 and all other constraints are met, that too is valid.
-`--rf` likewise requires that an upstream mate1 be reverse-complemented and a
-downstream mate2 be forward-oriented. ` --ff` requires both an upstream mate 1
-and a downstream mate 2 to be forward-oriented. Default: `--fr` (appropriate
-for Illumina's Paired-end Sequencing Assay).
-
-</td></tr></table>
--->
-
-<h4 id="output-options">Output options</h4>
-<table>
-
-<tr><td id="centrifuge-options-t">
-
-<pre><code>-t/--time</code></pre>
-</td><td>
-
-<p>Print the wall-clock time required to load the index files and align the reads. This is printed to the "standard error" ("stderr") filehandle. Default: off.</p>
-</td></tr>
-
-<!--
-<tr><td id="centrifuge-options-un">
-
-[`--un`]: #centrifuge-options-un
-[`--un-gz`]: #centrifuge-options-un
-[`--un-bz2`]: #centrifuge-options-un
-
- --un <path>
- --un-gz <path>
- --un-bz2 <path>
-
-</td><td>
-
-Write unpaired reads that fail to align to file at `<path>`. These reads
-correspond to the SAM records with the FLAGS `0x4` bit set and neither the
-`0x40` nor `0x80` bits set. If `--un-gz` is specified, output will be gzip
-compressed. If `--un-bz2` is specified, output will be bzip2 compressed. Reads
-written in this way will appear exactly as they did in the input file, without
-any modification (same sequence, same name, same quality string, same quality
-encoding). Reads will not necessarily appear in the same order as they did in
-the input.
-
-</td></tr>
-<tr><td id="centrifuge-options-al">
-
-[`--al`]: #centrifuge-options-al
-[`--al-gz`]: #centrifuge-options-al
-[`--al-bz2`]: #centrifuge-options-al
-
- --al <path>
- --al-gz <path>
- --al-bz2 <path>
-
-</td><td>
-
-Write unpaired reads that align at least once to file at `<path>`. These reads
-correspond to the SAM records with the FLAGS `0x4`, `0x40`, and `0x80` bits
-unset. If `--al-gz` is specified, output will be gzip compressed. If `--al-bz2`
-is specified, output will be bzip2 compressed. Reads written in this way will
-appear exactly as they did in the input file, without any modification (same
-sequence, same name, same quality string, same quality encoding). Reads will
-not necessarily appear in the same order as they did in the input.
-
-</td></tr>
-<tr><td id="centrifuge-options-un-conc">
-
-[`--un-conc`]: #centrifuge-options-un-conc
-[`--un-conc-gz`]: #centrifuge-options-un-conc
-[`--un-conc-bz2`]: #centrifuge-options-un-conc
-
- --un-conc <path>
- --un-conc-gz <path>
- --un-conc-bz2 <path>
-
-</td><td>
-
-Write paired-end reads that fail to align concordantly to file(s) at `<path>`.
-These reads correspond to the SAM records with the FLAGS `0x4` bit set and
-either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2).
-`.1` and `.2` strings are added to the filename to distinguish which file
-contains mate #1 and mate #2. If a percent symbol, `%`, is used in `<path>`,
-the percent symbol is replaced with `1` or `2` to make the per-mate filenames.
-Otherwise, `.1` or `.2` are added before the final dot in `<path>` to make the
-per-mate filenames. Reads written in this way will appear exactly as they did
-in the input files, without any modification (same sequence, same name, same
-quality string, same quality encoding). Reads will not necessarily appear in
-the same order as they did in the inputs.
-
-</td></tr>
-<tr><td id="centrifuge-options-al-conc">
-
-[`--al-conc`]: #centrifuge-options-al-conc
-[`--al-conc-gz`]: #centrifuge-options-al-conc
-[`--al-conc-bz2`]: #centrifuge-options-al-conc
-
- --al-conc <path>
- --al-conc-gz <path>
- --al-conc-bz2 <path>
-
-</td><td>
-
-Write paired-end reads that align concordantly at least once to file(s) at
-`<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit
-unset and either the `0x40` or `0x80` bit set (depending on whether it's mate #1
-or #2). `.1` and `.2` strings are added to the filename to distinguish which
-file contains mate #1 and mate #2. If a percent symbol, `%`, is used in
-`<path>`, the percent symbol is replaced with `1` or `2` to make the per-mate
-filenames. Otherwise, `.1` or `.2` are added before the final dot in `<path>` to
-make the per-mate filenames. Reads written in this way will appear exactly as
-they did in the input files, without any modification (same sequence, same name,
-same quality string, same quality encoding). Reads will not necessarily appear
-in the same order as they did in the inputs.
-
-</td></tr>
--->
-
-<tr><td id="centrifuge-options-quiet">
-
-<pre><code>--quiet</code></pre>
-</td><td>
-
-<p>Print nothing besides alignments and serious errors.</p>
-</td></tr>
-<tr><td id="centrifuge-options-met-file">
-
-<pre><code>--met-file <path></code></pre>
-</td><td>
-
-<p>Write <code>centrifuge</code> metrics to file <code><path></code>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#centrifuge-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
-</td></tr>
-<tr><td id="centrifuge-options-met-stderr">
-
-<pre><code>--met-stderr</code></pre>
-</td><td>
-
-<p>Write <code>centrifuge</code> metrics to the "standard error" ("stderr") filehandle. This is not mutually exclusive with <a href="#centrifuge-options-met-file"><code>--met-file</code></a>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#centrifuge-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
-</td></tr>
-<tr><td id="centrifuge-options-met">
-
-<pre><code>--met <int></code></pre>
-</td><td>
-
-<p>Write a new <code>centrifuge</code> metrics record every <code><int></code> seconds. Only matters if either <a href="#centrifuge-options-met-stderr"><code>--met-stderr</code></a> or <a href="#centrifuge-options-met-file"><code>--met-file</code></a> are specified. Default: 1.</p>
-</td></tr>
-</table>
-
-<h4 id="performance-options">Performance options</h4>
-<table><tr>
-
-<td id="centrifuge-options-o">
-
-<pre><code>-o/--offrate <int></code></pre>
-</td><td>
-
-<p>Override the offrate of the index with <code><int></code>. If <code><int></code> is greater than the offrate used to build the index, then some row markings are discarded when the index is read into memory. This reduces the memory footprint of the aligner but requires more time to calculate text offsets. <code><int></code> must be greater than the value used to build the index.</p>
-</td></tr>
-<tr><td id="centrifuge-options-p">
-
-<pre><code>-p/--threads NTHREADS</code></pre>
-</td><td>
-
-<p>Launch <code>NTHREADS</code> parallel search threads (default: 1). Threads will run on separate processors/cores and synchronize when parsing reads and outputting alignments. Searching for alignments is highly parallel, and speedup is close to linear. Increasing <code>-p</code> increases Centrifuge's memory footprint. E.g. when aligning to a human genome index, increasing <code>-p</code> from 1 to 8 increases the memory footprint by a few hundred megabytes. This option is only available if <code>bowtie</code> is linked with the <code>pthreads</code> library (i.e. if <code>BOWTIE_PTHREADS=0</code> is not specified at build time).</p>
-</td></tr>
-<tr><td id="centrifuge-options-reorder">
-
-<pre><code>--reorder</code></pre>
-</td><td>
-
-<p>Guarantees that output records are printed in an order corresponding to the order of the reads in the original input file, even when <a href="#centrifuge-build-options-p"><code>-p</code></a> is set greater than 1. Specifying <code>--reorder</code> and setting <a href="#centrifuge-build-options-p"><code>-p</code></a> greater than 1 causes Centrifuge to run somewhat slower and use somewhat more memory then if <code>--reorder</code> were not specified. Has no effect if <a href="#centrifuge-build-options-p"><code>-p</code></a> is set to 1, since output order will naturally correspond to input order in that case.</p>
-</td></tr>
-<tr><td id="centrifuge-options-mm">
-
-<pre><code>--mm</code></pre>
-</td><td>
-
-<p>Use memory-mapped I/O to load the index, rather than typical file I/O. Memory-mapping allows many concurrent <code>bowtie</code> processes on the same computer to share the same memory image of the index (i.e. you pay the memory overhead just once). This facilitates memory-efficient parallelization of <code>bowtie</code> in situations where using <a href="#centrifuge-build-options-p"><code>-p</code></a> is not possible or not preferable.</p>
-</td></tr></table>
-
-<h4 id="other-options">Other options</h4>
-<table>
-<tr><td id="centrifuge-options-qc-filter">
-
-<pre><code>--qc-filter</code></pre>
-</td><td>
-
-<p>Filter out reads for which the QSEQ filter field is non-zero. Only has an effect when read format is <a href="#centrifuge-options-qseq"><code>--qseq</code></a>. Default: off.</p>
-</td></tr>
-<tr><td id="centrifuge-options-seed">
-
-<pre><code>--seed <int></code></pre>
-</td><td>
-
-<p>Use <code><int></code> as the seed for pseudo-random number generator. Default: 0.</p>
-</td></tr>
-<tr><td id="centrifuge-options-non-deterministic">
-
-<pre><code>--non-deterministic</code></pre>
-</td><td>
-
-<p>Normally, Centrifuge re-initializes its pseudo-random generator for each read. It seeds the generator with a number derived from (a) the read name, (b) the nucleotide sequence, (c) the quality sequence, (d) the value of the <a href="#centrifuge-options-seed"><code>--seed</code></a> option. This means that if two reads are identical (same name, same nucleotides, same qualities) Centrifuge will find and report the same classification(s) for both, even if there was ambiguity. When <code>--non-deterministic</code> is specified, Centrifuge re-initializes its pseudo-random generator for each read using the current time. This means that Centrifuge will not necessarily report the same classification for two identical reads. This is counter-intuitive for some users, but might be more appropriate in situations where the input consists of many identical reads.</p>
-</td></tr>
-<tr><td id="centrifuge-options-version">
-
-<pre><code>--version</code></pre>
-</td><td>
-
-<p>Print version information and quit.</p>
-</td></tr>
-<tr><td id="centrifuge-options-h">
-
-<pre><code>-h/--help</code></pre>
-</td><td>
-
-<p>Print usage information and quit.</p>
-</td></tr></table>
-
-
-<h1 id="the-centrifuge-build-indexer">The <code>centrifuge-build</code> indexer</h1>
-<p><code>centrifuge-build</code> builds a Centrifuge index from a set of DNA sequences. <code>centrifuge-build</code> outputs a set of 6 files with suffixes <code>.1.cf</code>, <code>.2.cf</code>, and <code>.3.cf</code>. These files together constitute the index: they are all that is needed to align reads to that reference. The original sequence FASTA files are no longer used by Centrifuge once the index is built.</p>
-<p>Use of Karkkainen's <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> allows <code>centrifuge-build</code> to trade off between running time and memory usage. <code>centrifuge-build</code> has two options governing how it makes this trade: <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>/<a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a>, and <a href="#centrifuge-build-options-dcv"><code>--dcv</code></a>. By default, <code>centrifuge-build</code> will automatically search for the settings that yield the best running time without exhausting memory. This behavior can be disabled using the <a href="#centrifuge-build-options-a"><code>-a</code>/<code>--noauto</code></a> option.</p>
-<p>The indexer provides options pertaining to the "shape" of the index, e.g. <a href="#centrifuge-build-options-o"><code>--offrate</code></a> governs the fraction of <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows that are "marked" (i.e., the density of the suffix-array sample; see the original <a href="http://en.wikipedia.org/wiki/FM-index">FM Index</a> paper for details). All of these options are potentially profitable trade-offs depending on the application. They have been set to defaults that are reasonable for most cases according to our experiments. See <a href="#performance-tuning">Performance tuning</a> for details.</p>
-<p>The Centrifuge index is based on the <a href="http://en.wikipedia.org/wiki/FM-index">FM Index</a> of Ferragina and Manzini, which in turn is based on the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> transform. The algorithm used to build the index is based on the <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> of Karkkainen.</p>
-<h2 id="command-line-1">Command Line</h2>
-<p>Usage:</p>
-<pre><code>centrifuge-build [options]* --conversion-table <table_in> --taxonomy-tree <taxonomy_in> --name-table <table_in2> <reference_in> <cf_base></code></pre>
-<h3 id="main-arguments-1">Main arguments</h3>
-<table><tr><td>
-
-<pre><code><reference_in></code></pre>
-</td><td>
-
-<p>A comma-separated list of FASTA files containing the reference sequences to be aligned to, or, if <a href="#centrifuge-build-options-c"><code>-c</code></a> is specified, the sequences themselves. E.g., <code><reference_in></code> might be <code>chr1.fa,chr2.fa,chrX.fa,chrY.fa</code>, or, if <a href="#centrifuge-build-options-c"><code>-c</code></a> is specified, this might be <code>GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA</code>.</p>
-</td></tr><tr><td>
-
-<pre><code><cf_base></code></pre>
-</td><td>
-
-<p>The basename of the index files to write. By default, <code>centrifuge-build</code> writes files named <code>NAME.1.cf</code>, <code>NAME.2.cf</code>, and <code>NAME.3.cf</code>, where <code>NAME</code> is <code><cf_base></code>.</p>
-</td></tr></table>
-
-<h3 id="options-1">Options</h3>
-<table><tr><td>
-
-<pre><code>-f</code></pre>
-</td><td>
-
-<p>The reference input files (specified as <code><reference_in></code>) are FASTA files (usually having extension <code>.fa</code>, <code>.mfa</code>, <code>.fna</code> or similar).</p>
-</td></tr><tr><td id="centrifuge-build-options-c">
-
-<pre><code>-c</code></pre>
-</td><td>
-
-<p>The reference sequences are given on the command line. I.e. <code><reference_in></code> is a comma-separated list of sequences rather than a list of FASTA files.</p>
-</td></tr>
-<tr><td id="centrifuge-build-options-a">
-
-<pre><code>-a/--noauto</code></pre>
-</td><td>
-
-<p>Disable the default behavior whereby <code>centrifuge-build</code> automatically selects values for the <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>, <a href="#centrifuge-build-options-dcv"><code>--dcv</code></a> and [<code>--packed</code>] parameters according to available memory. Instead, user may specify values for those parameters. If memory is exhausted during indexing, an error message will be printed; it is up to the user to try new parameters.</p>
-</td></tr><tr><td id="centrifuge-build-options-p">
-
-<pre><code>-p/--threads <int></code></pre>
-</td><td>
-
-<p>Launch <code>NTHREADS</code> parallel search threads (default: 1).</p>
-</td></tr><tr><td id="centrifuge-build-options-conversion-table">
-
-<pre><code>--conversion-table <file></code></pre>
-</td><td>
-
-<p>List of UIDs (unique ID) and corresponding taxonomic IDs.</p>
-</td></tr><tr><td id="centrifuge-build-options-taxonomy-tree">
-
-<pre><code>--taxonomy-tree <file></code></pre>
-</td><td>
-
-<p>Taxonomic tree (e.g. nodes.dmp).</p>
-</td></tr><tr><td id="centrifuge-build-options-name-table">
-
-<pre><code>--name-table <file></code></pre>
-</td><td>
-
-<p>Name table (e.g. names.dmp).</p>
-</td></tr><tr><td id="centrifuge-build-options-taxonomy-tree">
-
-<pre><code>--size-table <file></code></pre>
-</td><td>
-
-<p>List of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.</p>
-</td></tr><tr><td id="centrifuge-build-options-bmax">
-
-<pre><code>--bmax <int></code></pre>
-</td><td>
-
-<p>The maximum number of suffixes allowed in a block. Allowing more suffixes per block makes indexing faster, but increases peak memory usage. Setting this option overrides any previous setting for <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>, or <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default (in terms of the <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a> parameter) is <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a> 4. This is configured automatically by default; use <a href="#centrifuge-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
-</td></tr><tr><td id="centrifuge-build-options-bmaxdivn">
-
-<pre><code>--bmaxdivn <int></code></pre>
-</td><td>
-
-<p>The maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference. Setting this option overrides any previous setting for <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>, or <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default: <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a> 4. This is configured automatically by default; use <a href="#centrifuge-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
-</td></tr><tr><td id="centrifuge-build-options-dcv">
-
-<pre><code>--dcv <int></code></pre>
-</td><td>
-
-<p>Use <code><int></code> as the period for the difference-cover sample. A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. Default: 1024. This is configured automatically by default; use <a href="#centrifuge-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
-</td></tr><tr><td id="centrifuge-build-options-nodc">
-
-<pre><code>--nodc</code></pre>
-</td><td>
-
-<p>Disable use of the difference-cover sample. Suffix sorting becomes quadratic-time in the worst case (where the worst case is an extremely repetitive reference). Default: off.</p>
-</td></tr><tr><td id="centrifuge-build-options-o">
-
-<pre><code>-o/--offrate <int></code></pre>
-</td><td>
-
-<p>To map alignments back to positions on the reference sequences, it's necessary to annotate ("mark") some or all of the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows with their corresponding location on the genome. <a href="#centrifuge-build-options-o"><code>-o</code>/<code>--offrate</code></a> governs how many rows get marked: the indexer will mark every 2^<code><int></code> rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes).</p>
-</td></tr><tr><td>
-
-<pre><code>-t/--ftabchars <int></code></pre>
-</td><td>
-
-<p>The ftab is the lookup table used to calculate an initial <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> range with respect to the first <code><int></code> characters of the query. A larger <code><int></code> yields a larger lookup table but faster query times. The ftab has size 4^(<code><int></code>+1) bytes. The default setting is 10 (ftab is 4MB).</p>
-</td></tr><tr><td>
-
-<pre><code>--seed <int></code></pre>
-</td><td>
-
-<p>Use <code><int></code> as the seed for pseudo-random number generator.</p>
-</td></tr><tr><td>
-
-<pre><code>--kmer-count <int></code></pre>
-</td><td>
-
-<p>Use <code><int></code> as kmer-size for counting the distinct number of k-mers in the input sequences.</p>
-</td></tr><tr><td>
-
-<pre><code>-q/--quiet</code></pre>
-</td><td>
-
-<p><code>centrifuge-build</code> is verbose by default. With this option <code>centrifuge-build</code> will print only error messages.</p>
-</td></tr><tr><td>
-
-<pre><code>-h/--help</code></pre>
-</td><td>
-
-<p>Print usage information and quit.</p>
-</td></tr><tr><td>
-
-<pre><code>--version</code></pre>
-</td><td>
-
-<p>Print version information and quit.</p>
-</td></tr></table>
-
-<h1 id="the-centrifuge-inspect-index-inspector">The <code>centrifuge-inspect</code> index inspector</h1>
-<p><code>centrifuge-inspect</code> extracts information from a Centrifuge index about what kind of index it is and what reference sequences were used to build it. When run without any options, the tool will output a FASTA file containing the sequences of the original references (with all non-<code>A</code>/<code>C</code>/<code>G</code>/<code>T</code> characters converted to <code>N</code>s). It can also be used to extract just the reference sequence names using the <a href="#centrifuge-inspect-options-n"><code>-n</code>/<code>--names</code></a> option or a more verbose summary using the <a href="#centrifuge-inspect-options-s"><code>-s</code>/<code>--summary</code></a> option.</p>
-<h2 id="command-line-2">Command Line</h2>
-<p>Usage:</p>
-<pre><code>centrifuge-inspect [options]* <cf_base></code></pre>
-<h3 id="main-arguments-2">Main arguments</h3>
-<table><tr><td>
-
-<pre><code><cf_base></code></pre>
-</td><td>
-
-<p>The basename of the index to be inspected. The basename is name of any of the index files but with the <code>.X.cf</code> suffix omitted. <code>centrifuge-inspect</code> first looks in the current directory for the index files, then in the directory specified in the <code>Centrifuge_INDEXES</code> environment variable.</p>
-</td></tr></table>
-
-<h3 id="options-2">Options</h3>
-<table><tr><td>
-
-<pre><code>-a/--across <int></code></pre>
-</td><td>
-
-<p>When printing FASTA output, output a newline character every <code><int></code> bases (default: 60).</p>
-</td></tr><tr><td id="centrifuge-inspect-options-n">
-
-<pre><code>-n/--names</code></pre>
-</td><td>
-
-<p>Print reference sequence names, one per line, and quit.</p>
-</td></tr><tr><td id="centrifuge-inspect-options-s">
-
-<pre><code>-s/--summary</code></pre>
-</td><td>
-
-<p>Print a summary that includes information about index settings, as well as the names and lengths of the input sequences. The summary has this format:</p>
-<pre><code>Colorspace <0 or 1>
-SA-Sample 1 in <sample>
-FTab-Chars <chars>
-Sequence-1 <name> <len>
-Sequence-2 <name> <len>
-...
-Sequence-N <name> <len></code></pre>
-<p>Fields are separated by tabs. Colorspace is always set to 0 for Centrifuge.</p>
-</td></tr><tr><td id="centrifuge-inspect-options-conversion-table">
-
-<pre><code>--conversion-table</code></pre>
-</td><td>
-
-<p>Print a list of UIDs (unique ID) and corresponding taxonomic IDs.</p>
-</td></tr><tr><td id="centrifuge-inspect-options-taxonomy-tree">
-
-<pre><code>--taxonomy-tree</code></pre>
-</td><td>
-
-<p>Print taxonomic tree.</p>
-</td></tr><tr><td id="centrifuge-inspect-options-name-table">
-
-<pre><code>--name-table</code></pre>
-</td><td>
-
-<p>Print name table.</p>
-</td></tr><tr><td id="centrifuge-inspect-options-taxonomy-tree">
-
-<pre><code>--size-table</code></pre>
-</td><td>
-
-<p>Print a list of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.</p>
-</td></tr><tr><td>
-
-<pre><code>-v/--verbose</code></pre>
-</td><td>
-
-<p>Print verbose output (for debugging).</p>
-</td></tr><tr><td>
-
-<pre><code>--version</code></pre>
-</td><td>
-
-<p>Print version information and quit.</p>
-</td></tr><tr><td>
-
-<pre><code>-h/--help</code></pre>
-</td><td>
-
-<p>Print usage information and quit.</p>
-</td></tr></table>
-
-<h1 id="getting-started-with-centrifuge">Getting started with Centrifuge</h1>
-<p>Centrifuge comes with some example files to get you started. The example files are not scientifically significant; these files will simply let you start running Centrifuge and downstream tools right away.</p>
-<p>First follow the manual instructions to <a href="#obtaining-centrifuge">obtain Centrifuge</a>. Set the <code>CENTRIFUGE_HOME</code> environment variable to point to the new Centrifuge directory containing the <code>centrifuge</code>, <code>centrifuge-build</code> and <code>centrifuge-inspect</code> binaries. This is important, as the <code>CENTRIFUGE_HOME</code> variable is used in the commands below to refer to that directory.</p>
-<h2 id="indexing-a-reference-genome">Indexing a reference genome</h2>
-<p>To create an index for two small sequences included with Centrifuge, create a new temporary directory (it doesn't matter where), change into that directory, and run:</p>
-<pre><code>$CENTRIFUGE_HOME/centrifuge-build --conversion-table $CENTRIFUGE_HOME/example/reference/gi_to_tid.dmp --taxonomy-tree $CENTRIFUGE_HOME/example/reference/nodes.dmp --name-table $CENTRIFUGE_HOME/example/reference/names.dmp $CENTRIFUGE_HOME/example/reference/test.fa test</code></pre>
-<p>The command should print many lines of output then quit. When the command completes, the current directory will contain ten new files that all start with <code>test</code> and end with <code>.1.cf</code>, <code>.2.cf</code>, <code>.3.cf</code>. These files constitute the index - you're done!</p>
-<p>You can use <code>centrifuge-build</code> to create an index for a set of FASTA files obtained from any source, including sites such as <a href="http://genome.ucsc.edu/cgi-bin/hgGateway">UCSC</a>, <a href="http://www.ncbi.nlm.nih.gov/sites/genome">NCBI</a>, and <a href="http://www.ensembl.org/">Ensembl</a>. When indexing multiple FASTA files, specify all the files using commas to separate file names. For more details on how to create an index with <code>centrifuge-build</code>, see the <a href="#the-centrifuge-build-indexer">manual section on index building</a>. You may also want to bypass this process by obtaining a pre-built index.</p>
-<h2 id="classifying-example-reads">Classifying example reads</h2>
-<p>Stay in the directory created in the previous step, which now contains the <code>test</code> index files. Next, run:</p>
-<pre><code>$CENTRIFUGE_HOME/centrifuge -f -x test $CENTRIFUGE_HOME/example/reads/input.fa</code></pre>
-<p>This runs the Centrifuge classifier, which classifies a set of unpaired reads to the the genomes using the index generated in the previous step. The classification results are reported to stdout, and a short classification summary is written to centrifuge-species_report.tsv.</p>
-<p>You will see something like this:</p>
-<pre><code>readID seqID taxID score 2ndBestScore hitLength numMatches
-C_1 gi|7 9913 4225 4225 80 2
-C_1 gi|4 9646 4225 4225 80 2
-C_2 gi|4 9646 4225 4225 80 2
-C_2 gi|7 9913 4225 4225 80 2
-C_3 gi|7 9913 4225 4225 80 2
-C_3 gi|4 9646 4225 4225 80 2
-C_4 gi|4 9646 4225 4225 80 2
-C_4 gi|7 9913 4225 4225 80 2
-1_1 gi|4 9646 4225 0 80 1
-1_2 gi|4 9646 4225 0 80 1
-2_1 gi|7 9913 4225 0 80 1
-2_2 gi|7 9913 4225 0 80 1
-2_3 gi|7 9913 4225 0 80 1
-2_4 gi|7 9913 4225 0 80 1
-2_5 gi|7 9913 4225 0 80 1
-2_6 gi|7 9913 4225 0 80 1</code></pre>
=====================================
doc/sidebar.inc.shtml
=====================================
--- a/doc/sidebar.inc.shtml
+++ b/doc/sidebar.inc.shtml
@@ -52,7 +52,7 @@
<table width="100%"><tr><td>last updated:</td> <td align="right">12/06/2016</td></tr>
<tr>
<td>
- <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed.tar.gz"><i> Bacteria (compressed)</i></a>
+ <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed.tar.gz"><i> Bacteria, Archaea (compressed)</i></a>
</td>
<td align="right" style="font-size: x-small">
<b>4.4 GB</b>
@@ -60,7 +60,7 @@
</tr>
<tr>
<td>
- <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz"><i> Bacteria, Viruses, Human (compressed)</i></a>
+ <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz"><i> Bacteria, Archaea, Viruses, Human (compressed)</i></a>
</td>
<td align="right" style="font-size: x-small">
<b>5.4 GB</b>
@@ -68,7 +68,7 @@
</tr>
<tr>
<td>
- <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz"><i> Bacteria, Viruses, Human </i></a>
+ <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz"><i> Bacteria, Archaea, Viruses, Human </i></a>
</td>
<td align="right" style="font-size: x-small">
<b>7.9 GB</b>
=====================================
indices/Makefile
=====================================
--- a/indices/Makefile
+++ b/indices/Makefile
@@ -11,6 +11,7 @@ KEEP_FILES?=0
get_ref_file_names = $(addprefix $(REFERENCE_SEQUENCES_DIR)/, $(addsuffix $(1), \
$(addprefix all-,$(COMPLETE_GENOMES)) \
$(addprefix all-,$(addsuffix -chromosome_level,$(CHROMOSOME_LEVEL_GENOMES))) \
+ $(addprefix all-,$(addsuffix -any_level,$(ANY_LEVEL_GENOMES))) \
$(addprefix mammalian-reference-,$(MAMMALIAN_TAXIDS)) \
$(addprefix all-compressed-,$(COMPLETE_GENOMES_COMPRESSED)) \
$(if $(INCLUDE_CONTAMINANTS),contaminants)))
@@ -43,6 +44,10 @@ STANDARD TARGETS:
p+h+v As above, but with uncompressed bacterial genomes
+ p+v As p+h+v, but without the human genome
+
+ v Viral genomes only
+
Alternatively, a IDX_NAME and one or more genomes may be specified as
options to build a custom database.
@@ -61,12 +66,12 @@ EXAMPLES:
# same as:
make COMPLETE_GENOMES=archaea COMPLETE_GENOMES_COMPRESSED=bacteria IDX_NAME=p_compressed
-
+
# Make an index with just the human genome
make IDX_NAME=h MAMMALIAN_TAXIDS=9606
# All archaeal genomes and contaminant sequences from UniVec and EmVec
- make IDX_NAME=a COMPLETE_GENOMES=archaea INCLUDE_CONTAMINANTS=1
+ make IDX_NAME=a COMPLETE_GENOMES=archaea INCLUDE_CONTAMINANTS=1
endef
export USAGE
@@ -74,30 +79,42 @@ export USAGE
###################################################################################################
ifndef IDX_NAME
-all:
+all:
@echo "$$USAGE"
IDX_NAME?=$(shell basename $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))))
-INDICES=p+h+v p_compressed p_compressed+h+v refseq_microbial refseq_full nt
+INDICES=p+h+v p+v v p p_compressed p_compressed+h+v refseq_microbial refseq_full nt
-p+h+v: export COMPLETE_GENOMES:=archaea bacteria viral
+p+h+v: export ANY_LEVEL_GENOMES:=viral
+p+h+v: export COMPLETE_GENOMES:=archaea bacteria
p+h+v: export MAMMALIAN_TAXIDS:=9606
p+h+v: export INCLUDE_CONTAMINANTS:=1
p+h+v: export IDX_NAME:=p+h+v
-p_compressed: export COMPLETE_GENOMES:=
+p+v: export ANY_LEVEL_GENOMES:=viral
+p+v: export COMPLETE_GENOMES:=archaea bacteria
+p+v: export INCLUDE_CONTAMINANTS:=1
+p+v: export IDX_NAME:=p+v
+
+v: export ANY_LEVEL_GENOMES:=viral
+v: export IDX_NAME:=v
+
+p: export COMPLETE_GENOMES:=archaea bacteria
+p: export IDX_NAME:=p
+
p_compressed: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
p_compressed: export IDX_NAME:=p_compressed
-p_compressed+h+v: export COMPLETE_GENOMES:=viral
+p_compressed+h+v: export ANY_LEVEL_GENOMES:=viral
p_compressed+h+v: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
p_compressed+h+v: export MAMMALIAN_TAXIDS:=9606
p_compressed+h+v: export INCLUDE_CONTAMINANTS:=1
p_compressed+h+v: export IDX_NAME:=p_compressed+h+v
-refseq_microbial: export COMPLETE_GENOMES:=archaea bacteria fungi protozoa viral
+refseq_microbial: export COMPLETE_GENOMES:=archaea bacteria fungi protozoa
refseq_microbial: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
+refseq_microbial: export ANY_LEVEL_GENOMES:=viral
##refseq_microbial: export SMALL_GENOMES:=mitochondrion plasmid plastid # TODO
refseq_microbial: export MAMMALIAN_TAXIDS:=9606 10090
refseq_microbial: export INCLUDE_CONTAMINANTS:=1
@@ -106,6 +123,7 @@ refseq_microbial: export CF_BUILD_OPTS+=--ftabchars 14
refseq_full: export COMPLETE_GENOMES:=archaea bacteria fungi invertebrate plant protozoa vertebrate_mammalian vertebrate_other viral
refseq_full: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
+refseq_full: export ANY_LEVEL_GENOMES:=viral
refseq_full: export SMALL_GENOMES:=mitochondrion plasmid plastid
refseq_full: export MAMMALIAN_TAXIDS:=9606 10090
refseq_full: export INCLUDE_CONTAMINANTS:=1
@@ -125,7 +143,7 @@ DONT_DUSTMASK=
TAXONOMY_DOWNLOAD_OPTS?=
REFERENCE_SEQUENCES=$(call get_ref_file_names,.fna)
TAXID_MAPS=$(call get_ref_file_names,$(TAXID_SUFFIX))
-CF_BUILD_OPTS?=
+CF_BUILD_OPTS?=
ifeq (nt,$(IDX_NAME))
ifeq ($(strip $(DONT_DUSTMASK)),)
@@ -196,7 +214,7 @@ $(REFERENCE_SEQUENCES_DIR)/mammalian-reference-%.fna: | $(REFERENCE_SEQUENCES_DI
@[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
centrifuge-download -o $(TMP_DIR) -d "vertebrate_mammalian" -a "Chromosome" -t $* -c 'reference genome' -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX), $(notdir $@))
- cat $(TMP_DIR)/vertebrate_mammalian/*.fna > $@.tmp && mv $@.tmp $@
+ find $(TMP_DIR)/vertebrate_mammalian -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/vertebrate_mammalian ]] || mkdir -p $(DL_DIR)/vertebrate_mammalian
@@ -219,12 +237,12 @@ else
rm -rf $(TMP_DIR)
endif
-$(REFERENCE_SEQUENCES_DIR)/all-%.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
+$(REFERENCE_SEQUENCES_DIR)/all-%-chromosome_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
- centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -d "$*" -P $(THREADS) refseq > \
+ centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Chromosome" -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
- cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
+ find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
@@ -233,12 +251,12 @@ else
rm -rf $(TMP_DIR)
endif
-$(REFERENCE_SEQUENCES_DIR)/all-%-chromosome_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
+$(REFERENCE_SEQUENCES_DIR)/all-%-any_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
- centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Chromosome" -d "$*" -P $(THREADS) refseq > \
+ centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Any" -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
- cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
+ find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
@@ -247,12 +265,24 @@ else
rm -rf $(TMP_DIR)
endif
-
+$(REFERENCE_SEQUENCES_DIR)/all-%.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
+ [[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
+ @echo Downloading and dust-masking $*
+ centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -d "$*" -P $(THREADS) refseq > \
+ $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
+ find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
+ mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
+ifeq (1,$(KEEP_FILES))
+ [[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
+ mv $(TMP_DIR)/$*/* $(DL_DIR)/$*
+else
+ rm -rf $(TMP_DIR)
+endif
$(REFERENCE_SEQUENCES_DIR)/contaminants.fna: | $(REFERENCE_SEQUENCES_DIR)
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
centrifuge-download -o $(TMP_DIR) contaminants > $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
- cat $(TMP_DIR)/contaminants/*.fna > $@.tmp && mv $@.tmp $@
+ find $(TMP_DIR)/contaminants -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/contaminants ]] || mkdir -p $(DL_DIR)/contaminants
@@ -268,7 +298,7 @@ ifeq ($(strip $(DONT_DUSTMASK)),)
$(error dustmasker program does not exist. Install NCBI blast+, or set option DONT_DUSTMASK=1)
endif
endif
-
+
taxonomy/names.dmp: taxonomy/nodes.dmp
taxonomy/nodes.dmp: | .path-ok
View it on GitLab: https://salsa.debian.org/med-team/centrifuge/commit/f6f7b098137879381ca6b18e3e5ba7c4c327171f
---
View it on GitLab: https://salsa.debian.org/med-team/centrifuge/commit/f6f7b098137879381ca6b18e3e5ba7c4c327171f
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.alioth.debian.org/pipermail/debian-med-commit/attachments/20180302/7b60d338/attachment-0001.html>
More information about the debian-med-commit mailing list