[med-svn] [dnaclust] 01/02: Imported Upstream version 3

Andreas Tille tille at debian.org
Sat Feb 1 16:09:04 UTC 2014


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository dnaclust.

commit 4fcb863b70c3769b2afb3a0e47ff5dfcf6a75339
Author: Andreas Tille <tille at debian.org>
Date:   Sat Feb 1 17:08:39 2014 +0100

    Imported Upstream version 3
---
 Makefile                  |   30 +
 Makefile.OSX              |   29 +
 README                    |   26 +
 debian/changelog          |    5 -
 debian/compat             |    1 -
 debian/control            |   23 -
 debian/copyright          |   25 -
 debian/rules              |   24 -
 debian/source/format      |    1 -
 debian/upstream           |   12 -
 debian/watch              |    2 -
 dnaclust-abun.sh          |   74 ++
 dnaclust-ref              |   98 ++
 dnaclust.cpp              | 2188 +++++++++++++++++++++++++++++++++++++++++++++
 fasta.cpp                 |   80 ++
 fasta.hpp                 |   35 +
 fastaselect.cpp           |  115 +++
 fastasort.cpp             |   69 ++
 find-large-clusters       |   94 ++
 generate_test_clusters.sh |   15 +
 multi_dim.hpp             |  156 ++++
 search_include.cpp        |  209 +++++
 star-align                |   44 +
 ternary_sort.hpp          |   58 ++
 utility.hpp               |   20 +
 25 files changed, 3340 insertions(+), 93 deletions(-)

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c92c1b5
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,30 @@
+#PROFILE_FLAGS = -pg -fprofile-arcs -ftest-coverage -g -O1
+CXXFLAGS = -Wall -pedantic -std=c++98 -O2 -static
+CXX = g++ 
+#-DCHRIS_DEBUG
+
+all: dnaclust fastaselect fastasort
+
+dnaclust: dnaclust.o fasta.o
+	$(CXX) $(CXXFLAGS) $(PROFILE_FLAGS) dnaclust.o fasta.o -lboost_program_options -o $@ -lpthread
+
+dnaclust.o: dnaclust.cpp search_include.cpp utility.hpp ternary_sort.hpp multi_dim.hpp
+	$(CXX) -c $(CXXFLAGS) $(PROFILE_FLAGS) dnaclust.cpp
+
+fastaselect: fastaselect.o fasta.o
+	$(CXX) $(CXXFLAGS) $(PROFILE_FLAGS) fastaselect.o fasta.o -lboost_program_options -o $@
+
+fastaselect.o: fastaselect.cpp
+	$(CXX) -c $(CXXFLAGS) $(PROFILE_FLAGS) fastaselect.cpp
+
+fastasort: fastasort.o fasta.o
+	$(CXX) $(CXXFLAGS) $(PROFILE_FLAGS) $+ -lboost_program_options -o $@
+
+fastasort.o: fastasort.cpp
+	$(CXX) -c $(CXXFLAGS) $(PROFILE_FLAGS) $+ -o $@
+
+fasta.o: fasta.cpp fasta.hpp
+	$(CXX) -c $(CXXFLAGS) fasta.cpp
+
+clean:
+	rm -f dnaclust.o dnaclust fasta.o fastaselect.o fastaselect fastasort fastasort.o
diff --git a/Makefile.OSX b/Makefile.OSX
new file mode 100644
index 0000000..7ee60fe
--- /dev/null
+++ b/Makefile.OSX
@@ -0,0 +1,29 @@
+#PROFILE_FLAGS = -pg -fprofile-arcs -ftest-coverage -g -O1
+CXXFLAGS = -Wall -pedantic -std=c++98 -O2
+CXX = g++
+
+all: dnaclust fastaselect fastasort
+
+dnaclust: dnaclust.o fasta.o
+	$(CXX) $(CXXFLAGS) $(PROFILE_FLAGS) dnaclust.o fasta.o -lboost_program_options-mt -o $@ -lpthread
+
+dnaclust.o: dnaclust.cpp search_include.cpp utility.hpp ternary_sort.hpp multi_dim.hpp
+	$(CXX) -c $(CXXFLAGS) $(PROFILE_FLAGS) dnaclust.cpp
+
+fastaselect: fastaselect.o fasta.o
+	$(CXX) $(CXXFLAGS) $(PROFILE_FLAGS) fastaselect.o fasta.o -lboost_program_options-mt -o $@
+
+fastaselect.o: fastaselect.cpp
+	$(CXX) -c $(CXXFLAGS) $(PROFILE_FLAGS) fastaselect.cpp
+
+fastasort: fastasort.o fasta.o
+	$(CXX) $(CXXFLAGS) $(PROFILE_FLAGS) $+ -lboost_program_options-mt -o $@
+
+fastasort.o: fastasort.cpp
+	$(CXX) -c $(CXXFLAGS) $(PROFILE_FLAGS) $+ -o $@
+
+fasta.o: fasta.cpp fasta.hpp
+	$(CXX) -c $(CXXFLAGS) fasta.cpp
+
+clean:
+	rm -f dnaclust.o dnaclust fasta.o fastaselect.o fastaselect
diff --git a/README b/README
new file mode 100644
index 0000000..fbac14b
--- /dev/null
+++ b/README
@@ -0,0 +1,26 @@
+DNACLUST is a tool for clustering millions of short DNA sequences.
+
+Use 'make' to compile. (Use 'make -f Makefile.OSX' on Mac OS X.)
+You need a new version of Boost C++ Libraries and GCC C++ compiler in order to compile this program.
+
+
+Run './dnaclust --help' to see a description of command line options.
+For a more detailed manual read 'man doc/dnaclust.1'.
+
+The project web page is at http://dnaclust.sourceforge.net/
+If you have any problems, questions or suggestions, please contact Mohammadreza Ghodsi <ghodsi at cs.umd.edu>.
+
+DNACLUST Copyright (C) 2010 Mohammadreza Ghodsi
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 2ae80aa..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-dnaclust (0.0.1-1) UNRELEASED; urgency=low
-
-  * Initial release (Closes: #<bug>)
-
- -- DMPT <debian-med-packaging at lists.alioth.debian.org>  Thu, 24 May 2012 14:30:13 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 138a285..0000000
--- a/debian/control
+++ /dev/null
@@ -1,23 +0,0 @@
-Source: dnaclust
-Section: science
-Priority: optional
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Build-Depends: debhelper (>= 9)
-Standards-Version: 3.9.5
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/<pkg>/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/<pkg>/trunk/
-Homepage: http://dnaclust.sourceforge.net/
-
-Package: dnaclust
-Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: tool for clustering millions of short DNA sequences
- dnaclust is a tool for clustering large number of short DNA sequences.
- The clusters are created in such a way that the "radius" of each
- clusters is no more than the specified threshold.
- .
- The input sequences to be clustered should be in Fasta format. The id
- of each sequence is based on the first word of the seqeunce in the Fasta
- format. The first word is the prefix of the header up to the first
- occurance of white space charachters in the header. 
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 2497160..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,25 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: DNA Clust
-Source: http://sourceforge.net/projects/dnaclust/files/parallel_release_3/
-
-Files: *
-Copyright: 2010 Mohammadreza Ghodsi
-License: GPLv3+
-
-Files: debian/*
-Copyright: 2014 Andreas Tille <tille at debian.org>
-License: GPLv3+
-
-License: GPLv3+
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
- .
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
- .
- On Debian systems you can find the full text of the GNU General Public
- License at /usr/share/common-licenses/GPL-3.
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 969156e..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/make -f
-
-# DH_VERBOSE := 1
-
-# some helpful variables - uncomment them if needed
-# shamelessly stolen from http://jmtd.net/log/awk/
-#DEBVERS        := $(shell dpkg-parsechangelog | awk '/^Version:/ {print $$2}')
-#VERSION        := $(shell echo '$(DEBVERS)' | sed -e 's/^[0-9]*://' -e 's/-.*//')
-#DEBFLAVOR      := $(shell dpkg-parsechangelog | awk '/^Distribution:/ {print $$2}')
-#DEBPKGNAME     := $(shell dpkg-parsechangelog | awk '/^Source:/ {print $$2}')
-#DEBIAN_BRANCH  := $(shell awk 'BEGIN{FS="[= ]+"} /debian-branch/ {print $$2}' debian/gbp.conf)
-#GIT_TAG        := $(subst ~,_,$(VERSION))
-
-# alternatively to manually set those variables you can
-#  include /usr/share/cdbs/1/rules/buildvars.mk
-# and use what is set there.  Any hint whether dh might set variables in
-# a similar manner are welcome.
-
-%:
-	dh $@
-
-get-orig-source:
-	uscan --verbose --repack
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/upstream b/debian/upstream
deleted file mode 100644
index d8b5812..0000000
--- a/debian/upstream
+++ /dev/null
@@ -1,12 +0,0 @@
-Reference:
-  Author: 
-  Title: 
-  Journal: 
-  Year: 
-  Volume: 
-  Number: 
-  Pages: 
-  DOI: 
-  PMID:
-  URL: 
-  eprint: 
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 3774673..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=3
-http://sf.net/dnaclust/dnaclust_repo_release([\d.]+)\.zip
diff --git a/dnaclust-abun.sh b/dnaclust-abun.sh
new file mode 100644
index 0000000..02b3155
--- /dev/null
+++ b/dnaclust-abun.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+similarity=0.98
+threads=1
+
+print_help()
+{
+    fold --spaces <<EOF
+Usage: dnaclust-ref [OPTIONS...]
+DNACLUST helper script to cluster sequences using a reference database.
+
+  -c CENTERS         Fasta file of cluster centers/references.
+  -d                 After clustering with reference database, perform de novo clustering.
+  -r SIMILARITY      Set similarity between cluster center and cluster
+                     sequences (default=0.98)
+  -t THREADS         Set the number of threads to use
+  -i INPUT_FILE      Fasta file of sequences to be clustered.
+  -v                 Print verbose messages to standard error
+  -h                 Give this help list
+
+The sequences to be clustered are read from the STDIN. The cluster centers are written to STDOUT. Messages are written to STDERR.
+EOF
+}
+
+while getopts "c:i:dr:t:vhln" option
+do
+    case $option in
+  c) cluster_centers="$OPTARG";;
+  i) input="$OPTARG";;
+  d) de_novo_cluster=0;;
+    r) similarity="$OPTARG";;
+  t) threads="$OPTARG";;
+    v) verbose=0;;
+  l) left_gaps_allowed=0;;
+  n) no_overlap=0;;
+    h) print_help; exit 0;;
+    [?]) print_help; exit 1;;
+    esac
+done
+
+print_message()
+{
+    if [ $verbose ]
+    then
+    echo "`date +%T` $1" >&2
+    fi
+}
+
+parameters=""
+if [ $left_gaps_allowed ]
+then
+  parameters+=" --left-gaps-allowed "
+fi
+
+if [ $no_overlap ]
+then
+  parameters+=" --no-overlap "
+fi
+
+dnaclust_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+print_message "$dnaclust_path"
+#exit 1
+tempdir=`mktemp -d -p .`
+#tempdir="tmpref/"
+trap "rm -fr $tempdir" EXIT
+
+#sequences_sorted=`mktemp -p $tempdir`
+db_sorted=`basename ${cluster_centers} .fasta`.sorted.fasta
+# Reads the sequences from STDIN.
+print_message "Reading and sorting the database sequences: $tempdir/${db_sorted}" 
+#"$dnaclust_path/fastasort" --random-shuffle > $sequences_sorted 
+cat $cluster_centers | "$dnaclust_path/fastasort" > $tempdir/${db_sorted}
+
+print_message "Recruiting from the sequences, using database."
+"$dnaclust_path/dnaclust" $parameters -s $similarity -t $threads --no-k-mer-filter -i $input -p $tempdir/${db_sorted} -r | awk '{if (NF > 1) print $0}' > $input.db.clusters
\ No newline at end of file
diff --git a/dnaclust-ref b/dnaclust-ref
new file mode 100755
index 0000000..a9c66c7
--- /dev/null
+++ b/dnaclust-ref
@@ -0,0 +1,98 @@
+#!/bin/bash
+similarity=0.98
+threads=1
+
+print_help()
+{
+    fold -w 120 <<EOF
+Usage: dnaclust-ref [OPTIONS...]
+DNACLUST helper script to cluster sequences using a reference database.
+
+  -c CENTERS         Fasta file of cluster centers/references.
+  -d                 After clustering with reference database, perform de novo clustering.
+  -r SIMILARITY      Set similarity between cluster center and cluster
+                     sequences (default=0.98)
+  -t THREADS         Set the number of threads to use
+  -i INPUT_FILE      Fasta file of sequences to be clustered.
+  -v                 Print verbose messages to standard error
+  -h                 Give this help list
+
+The sequences to be clustered are read from the STDIN. The cluster centers are written to STDOUT. Messages are written to STDERR.
+EOF
+}
+
+while getopts "c:i:dr:t:vhln" option
+do
+    case $option in
+  c) cluster_centers="$OPTARG";;
+  i) input="$OPTARG";;
+  d) de_novo_cluster=0;;
+	r) similarity="$OPTARG";;
+  t) threads="$OPTARG";;
+	v) verbose=0;;
+  l) left_gaps_allowed=0;;
+  n) no_overlap=0;;
+	h) print_help; exit 0;;
+	[?]) print_help; exit 1;;
+    esac
+done
+
+print_message()
+{
+    if [ $verbose ]
+    then
+	echo "`date +%T` $1" >&2
+    fi
+}
+
+parameters=""
+if [ $left_gaps_allowed ]
+then
+  parameters+=" --left-gaps-allowed "
+fi
+
+if [ $no_overlap ]
+then
+  parameters+=" --no-overlap "
+fi
+
+dnaclust_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+print_message "$dnaclust_path"
+#exit 1
+UNAME=$( uname )
+tempdir=""
+if [ $UNAME = "Darwin" ]; 
+then
+  tempdir=`mktemp -d -t .`
+else
+  tempdir=`mktemp -d -p .`
+fi
+#tempdir="tmpref/"
+trap "rm -fr $tempdir" EXIT
+
+#sequences_sorted=`mktemp -p $tempdir`
+db_sorted=`basename ${cluster_centers} .fasta`.sorted.fasta
+# Reads the sequences from STDIN.
+print_message "Reading and sorting the database sequences: $tempdir/${db_sorted}" 
+#"$dnaclust_path/fastasort" --random-shuffle > $sequences_sorted 
+cat $cluster_centers | "$dnaclust_path/fastasort" > $tempdir/${db_sorted}
+
+print_message "Recruiting from the sequences, using database."
+"$dnaclust_path/dnaclust" $parameters -s $similarity -t $threads --no-k-mer-filter -i $input -p $tempdir/${db_sorted} -r | awk '{if (NF > 1) print $0}' > $input.db.clusters
+print_message "DB recruited sequences: $input.db.clusters"
+
+if [ $de_novo_cluster ]
+then
+  print_message "Run DNACLUST on remaining sequences."
+  unclustered_seq=`basename $input .fasta`.unclustered.fasta
+  awk '{ for (i = 1; i <= NF; i++) print $i}' $input.db.clusters > $tempdir/clustered_seqs
+
+  "$dnaclust_path/fastaselect" --everything-except -f ${input} < $tempdir/clustered_seqs > $tempdir/${unclustered_seq}
+  if [[ -s $tempdir/${unclustered_seq} ]] ; then
+    "$dnaclust_path/dnaclust" $parameters -s $similarity -t $threads --no-k-mer-filter -i $tempdir/${unclustered_seq} > $input.denovo.clusters  
+    print_message "Writing de novo clusters to: $input.denovo.clusters"
+  else
+    touch $input.denovo.clusters
+  fi
+fi
+exit 1
diff --git a/dnaclust.cpp b/dnaclust.cpp
new file mode 100644
index 0000000..9e854b1
--- /dev/null
+++ b/dnaclust.cpp
@@ -0,0 +1,2188 @@
+#include <sstream>
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <limits>
+#include <stack>
+#include <fstream>
+// #include <boost/bind.hpp>
+// #include <boost/date_time/posix_time/posix_time.hpp>
+// #include <boost/foreach.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <boost/program_options.hpp>		
+#include <stdint.h>
+//#include <boost/thread.hpp>
+//#include <boost/thread/condition_variable.hpp>
+#include <pthread.h>
+namespace po = boost::program_options;
+
+#include "ternary_sort.hpp"
+#include "fasta.hpp"
+
+#define MATRIX_RESIZABLE 1
+#include "multi_dim.hpp"
+#include "utility.hpp"
+
+
+typedef char CharType;
+typedef const CharType * ConstCharPointer;
+typedef std::vector<ConstCharPointer> ConstCharPointerVector;
+typedef int PositionType;
+typedef int SequenceNumber;
+typedef intptr_t Index;
+
+uint16_t running_threads = 0;
+size_t num_threads = 1;
+int threads_completed = 0;
+pthread_mutex_t running_mutex;
+
+struct thread_args
+ {
+    PositionType l;
+    PositionType r;
+    PositionType pos;
+    Index intervalIndex;
+    uint16_t tableIndex;
+    int globalOffset;
+};
+
+SequenceNumber numberOfInputSequences;
+
+bool outputMultipleAlignment = false;
+bool leftGapsAllowed = false;
+bool noKmerFilter = true;
+bool kmerFilter = false;
+#if OLD_CODE
+bool approximateFilter = false;
+double approximationFactor;
+#endif // OLD_CODE
+bool approximateFilter2 = false;
+bool noOverlap = false;
+bool recruit_only = false;
+bool useFullQueryHeader = false;
+
+ConstCharPointerVector sortedStrings;
+
+ConstCharPointerVector lexicographicSequencePointers;
+ConstCharPointerVector headerPointers;
+typedef std::vector<PositionType> PositionVector;
+PositionVector headerLengths;
+PositionVector sequenceLengths;
+
+
+ConstCharPointer querySequencePointer;
+PositionType querySequenceLength;
+ConstCharPointer queryHeaderPointer;
+PositionType queryHeaderLength;
+
+
+const CharType endOfStringChar = '\0';
+const CharType invalidChar = '^';
+const CharType gapChar = '-';
+
+const char HEADER_INDICATOR = '%';
+const char CLUSTER_INDICATOR = '#';
+
+typedef int32_t CostType;
+const CostType MAX_COST = std::numeric_limits<CostType>::max() / 2;
+
+struct Context {
+  int id;
+};
+
+const size_t MAX_THREADS = 60;
+pthread_t searchWorkers[MAX_THREADS];
+struct Context searchWorkersContext[MAX_THREADS];
+struct thread_args searchWorkersArgs[MAX_THREADS];
+pthread_mutex_t searchWorkersMutexes[MAX_THREADS];
+pthread_cond_t searchWorkersConds[MAX_THREADS];
+pthread_cond_t searchWorkersCompleted;
+pthread_mutex_t allThreadsMutex;
+
+const size_t MAX_LENGTH = 4500;
+std::vector< std::vector<CharType> > word(MAX_THREADS, std::vector<CharType>(MAX_LENGTH + 1));
+std::vector< std::vector<PositionType> > wordMinLength(MAX_THREADS, std::vector<PositionType>(MAX_LENGTH + 1));
+std::vector< std::vector<CostType> > minCost(MAX_THREADS, std::vector<CostType>(MAX_LENGTH + 1));
+std::vector< std::vector<PositionType> > left(MAX_THREADS, std::vector<PositionType>(MAX_LENGTH + 1));
+std::vector< std::vector<PositionType> > right(MAX_THREADS, std::vector<PositionType>(MAX_LENGTH + 1));
+
+CostType radious;
+
+float DEFAULT_SIMILARITY = 0.99;
+float similarity;
+
+float DEFAULT_MISMATCHES = -1.0;
+float mismatches;
+
+bool printInvertedIndex = false;
+bool assignAmbiguous = false;
+std::map< std::string, std::vector<std::string> > final_clusters;
+std::map< std::string, std::vector<std::string> > inverted_index;
+std::map< std::string, int > center_nonambig_counts;
+
+template<typename T>
+std::ostream & operator<<(std::ostream & os, std::vector<T> vec)
+{
+    os<<"{ ";
+    std::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(os, " "));
+    os<<"}";
+    return os;
+}
+
+template <typename BaseType, typename ExponentType>
+BaseType power(const BaseType base, const ExponentType exponent)
+{
+  BaseType result = 1;
+  for (ExponentType i = 0; i < exponent; ++i)
+    result *= base;
+  return result;
+}
+
+
+const int MAXIMUM_K_MER_LENGTH = 6;
+#if OLD_CODE
+const int DEFAULT_K_MER_LENGTH = 4;
+#endif // OLD_CODE
+int k_mer_length;
+int random_seed = 0;
+
+enum Nucleotides{A, C, G, T, NUMBER_OF_NUCLEOTIDES};
+
+typedef int16_t KmerNumber;
+
+#if FIXED_K
+const KmerNumber NUMBER_OF_K_MERS = power(static_cast<int>(NUMBER_OF_NUCLEOTIDES), K_MER_LENGTH);
+#endif // FIXED_K
+KmerNumber number_of_k_mers;
+
+// Exception thrown when program gets an unkown nucleotide letter.
+class UnknownNucleotide : public std::exception
+{
+public:
+  UnknownNucleotide(CharType nucleotide) throw()
+    : nucleotide_(nucleotide)
+  { }
+
+  virtual const char *what() const throw()
+  {
+    std::ostringstream messageStream;
+    messageStream << "Sequence contains unknown nucleotide: " << nucleotide_ << ".";
+    return messageStream.str().c_str();
+  }
+
+private:
+  const CharType nucleotide_;
+};
+
+// Return an integer number in range 0..(NUMBER_OF_NUCLEOTIDES - 1) for a given nucleotide character.
+int numberFromNucleotide(const CharType nucleotide)
+{
+  switch(tolower(nucleotide)) {
+  case 'a':
+    return A;
+  case 'c':
+    return C;
+  case 'g':
+    return G;
+  case 't':
+    return T;
+  default:
+    return A;
+    throw UnknownNucleotide(nucleotide);
+  }
+}
+
+typedef uint16_t KmerCount;
+const KmerCount MAX_K_MER_COUNT = std::numeric_limits<KmerCount>::max();
+
+// k-mer spectrum is a vector of integers counting how many of each k-mer there are in a sequence.
+typedef std::vector<KmerCount> KmerSpectrum;
+
+// Count the number of each k-mer in a sequence and return a vector of counts.
+KmerSpectrum countKmers(ConstCharPointer sequence, PositionType length)
+{
+  KmerSpectrum spectrum(number_of_k_mers);
+
+  if (length >= k_mer_length) {
+    int i = 0;
+    int kMerNumber = 0;
+    for (; i < k_mer_length; ++i) {
+      kMerNumber *= NUMBER_OF_NUCLEOTIDES;
+      kMerNumber += numberFromNucleotide(sequence[i]);
+    }
+    
+    ++spectrum[kMerNumber];
+    
+    for (; i < length; ++i) {
+      kMerNumber *= NUMBER_OF_NUCLEOTIDES;
+      kMerNumber %= number_of_k_mers;
+      kMerNumber += numberFromNucleotide(sequence[i]);
+      
+      if (spectrum[kMerNumber] < MAX_K_MER_COUNT)
+	++spectrum[kMerNumber];
+    }
+      
+    
+  }
+  
+  return spectrum;
+}
+
+
+
+typedef std::vector<Index> IndexVector;
+
+IndexVector lexicographicIndexOfSortedStringsIndexes;
+
+typedef multi_dim::Matrix<KmerCount> CountMatrix;
+CountMatrix spectrumMatrix;
+
+IndexVector lexicographicIndexOfSpectrumIndexes;
+
+
+IndexVector spectrumSearchResults;
+
+KmerSpectrum querySpectrum;
+KmerSpectrum queryLowSpectrum;
+
+IndexVector spectrumSplitGuide;
+PositionVector spectrumRemainingGuide;
+
+/*
+  Search for potential matches based on their k-mer spectrum.
+  The query's k-mer spectrum should in global variable 'querySpectrum'.
+  The k-mer spectrums for the sequences to be clustered are in global variable 'specturms'.
+  Results will be in global variable ''.
+*/
+
+
+
+
+typedef std::pair<KmerSpectrum, Index> SpectrumIndexPair;
+typedef std::vector<SpectrumIndexPair> SpectrumIndexVector;
+    
+SpectrumIndexVector spectrumIndexes;
+
+
+enum BackPointer {LEFT, DOWN, DOWN_LEFT, ROOT};
+
+typedef multi_dim::Matrix<CostType> DpTable;
+
+DpTable *tables[MAX_THREADS];
+//DpTable table(MAX_LENGTH + 1, MAX_LENGTH + 1);
+
+typedef multi_dim::Matrix<BackPointer> BackPointerTable;
+
+BackPointerTable backTable(MAX_LENGTH + 1, MAX_LENGTH + 1);
+
+
+inline const CostType cost(const CharType c1, const CharType c2)
+{
+  return (c1 != c2) ? 1 : 0;
+}
+
+#if COST_MATRIX
+namespace cost_matrix {
+  const size_t MAX_CHAR = 256;
+  typedef multi_dim::Array2D<CostType, MAX_CHAR, MAX_CHAR> CostMatrix;
+
+  CostMatrix cost;
+  
+}
+#endif // COST_MATRIX
+
+void initialize()
+{
+  // initialize() implementation depends on whether we allow gaps on both ends.
+  
+  for(uint16_t i = 0; i < num_threads; ++i){
+    if (mismatches >= 0)
+      left[i][0] = - static_cast<PositionType>(mismatches);
+    else
+      left[i][0] = - static_cast<PositionType>(floor(MAX_LENGTH * (1 - similarity)));
+    right[i][0] = std::numeric_limits<PositionType>::max() / 2;
+    std::fill(word[i].begin(), word[i].end(), invalidChar);
+
+    tables[i] = new DpTable(MAX_LENGTH + 1, MAX_LENGTH + 1);
+    (*tables[i])(0, 0) = 0;
+    backTable(0, 0) = ROOT;
+
+    for (size_t j = 1; j <  MAX_LENGTH + 1; ++j) {
+      (*tables[i])(0, j) = leftGapsAllowed ? 0 : (*tables[i])(0, j - 1) + cost(gapChar, invalidChar);    
+      backTable(0, j) = LEFT;
+    }
+  }
+} // void initialize()
+
+
+typedef std::vector<PositionType> AlignmentFunction;
+typedef boost::tuple<CostType, AlignmentFunction> AlignmentCostAndFunction;
+
+
+AlignmentCostAndFunction findAlignmentFunction(const PositionType len1, const PositionType len2, const PositionType pos2)
+{
+
+  AlignmentFunction alignmentFunc(len2 + 1);
+
+  // TODO(cmhill): Remove
+  CostType minCost = 0; // table(len1, pos2);
+
+
+  PositionType i = len1;
+  PositionType j = len2;
+
+  assert(minCost <= radious);
+
+  while (j > pos2){
+    alignmentFunc[j] = i;
+    --j;
+  }
+
+  while (backTable(i, j) != ROOT){
+
+    switch(backTable(i, j)){
+    case DOWN:
+      alignmentFunc[j] = i;
+      --i;
+      break;
+    case LEFT:
+      alignmentFunc[j] = i;
+      --j;
+      break;
+    case DOWN_LEFT:
+      alignmentFunc[j] = i;
+      --i;
+      --j;
+      break;
+    case ROOT:
+    default:
+      std::stringstream errorStream;
+      errorStream << "Invalid back pointer value; backTable(" << i << "," << j << ")=" << backTable(i, j) << ".";
+      throw std::logic_error(errorStream.str());
+    } // switch
+  } // while (backTable(i, j) != ROOT)
+
+  alignmentFunc[j] = i;
+    
+  return boost::make_tuple(minCost, alignmentFunc);
+} // findAlignmentFunction2
+
+struct SearchResult
+{
+  SequenceNumber number;
+  CostType cost;
+  AlignmentFunction func;
+};
+
+typedef std::vector<SearchResult> SearchResultVector;
+
+SearchResultVector searchResults;
+
+std::vector< SearchResultVector > threadSearchResults(MAX_THREADS);
+std::vector< PositionVector > sortedStringsLengthsMinTree(MAX_THREADS);
+std::vector< PositionVector > sortedStringsLengthsMaxTree(MAX_THREADS);
+int resultsCounter[MAX_THREADS];
+
+template <class Iterator1, class Iterator2>
+Iterator2 buildMaxTree(Iterator1 first, Iterator1 last, Iterator2 intervalIterator)
+{
+  if (last > first) {
+    if (last == first + 1) {
+      *intervalIterator = *first;
+    } else {
+      Iterator1 mid = first + (last - 1 - first) / 2 + 1;
+      *intervalIterator = std::max(*buildMaxTree(first,  mid, intervalIterator + 1),
+				   *buildMaxTree(mid,   last, intervalIterator + 2 * (mid - first)));
+
+    }
+  }
+  return intervalIterator;
+}
+
+
+template <class Iterator1, class Iterator2>
+Iterator2 buildMinTree(Iterator1 first, Iterator1 last, Iterator2 intervalIterator)
+{
+  if (last > first) {
+    if (last == first + 1) {
+      *intervalIterator = *first;
+    } else {
+      Iterator1 mid = first + (last - 1 - first) / 2 + 1;
+      *intervalIterator = std::min(*buildMinTree(first,  mid, intervalIterator + 1),
+				   *buildMinTree(mid,   last, intervalIterator + 2 * (mid - first)));
+
+    }
+  }
+  return intervalIterator;
+}
+
+
+/*
+  The implementation of search function is seperated in another file.
+  The reason for this is to reuse the code to make two search functions:
+  The first search function does not keep track of back pointers and would not find the alignment but is supposedly faster.
+  The second search function will find the alignment.
+*/
+
+#define ALIGN_WITH_BACKPOINTER 0
+#include "search_include.cpp"
+
+#undef ALIGN_WITH_BACKPOINTER
+#define ALIGN_WITH_BACKPOINTER 1
+#include "search_include.cpp"
+
+
+
+template <class InputIterator>
+std::string firstWord(InputIterator first, InputIterator last)
+{
+  return std::string(first, std::find_if(first, last, isspace));
+}
+
+
+// Using std::vector<char> instead of std::vector<bool> .
+typedef std::vector<char> BoolVector;
+BoolVector marked;
+BoolVector markedGrey;
+int numberOfMarked;
+int remainingAtLastDbUpdate;
+
+sequence::Fasta inputFasta;
+sequence::Fasta predeterminedCentersFasta;
+
+bool lessSpectrumIndexPair(const SpectrumIndexPair &x, const SpectrumIndexPair &y, KmerNumber dimension)
+{
+  return x.first[dimension] < y.first[dimension];
+}
+
+typedef std::vector<KmerCount> KmerCountVector;
+
+template <class RandomAccessIterator>
+RandomAccessIterator median(RandomAccessIterator first, RandomAccessIterator last)
+{
+  RandomAccessIterator mid = first + ((last - first) / 2);
+  std::nth_element(first, mid, last);
+  return mid;
+}
+
+typedef const KmerCount * const ConstKmerCountArray;
+typedef KmerCount * const KmerCountArray;
+
+Index twoMeans(const Index l, const Index r)
+{
+
+#if MY_DEBUG
+  // debug.
+  std::cerr << "\n\ttwoMeans(" << l << ",\t" << r  << ")\n" << std::flush;
+#endif // MY_DEBUG
+
+#if OLD_CODE
+  std::sort(spectrumIndexes.begin() + l, spectrumIndexes.begin() + r);
+#endif // OLD_CODE
+
+#if OLD_CODE
+  typedef std::vector<float> FloatVector;
+  FloatVector center1(spectrumIndexes[l].first.begin(), spectrumIndexes[l].first.end());
+  FloatVector center2(center1.size());
+#endif // OLD_CODE
+
+  KmerCountVector center1(spectrumIndexes[l].first.begin(), spectrumIndexes[l].first.end());
+  KmerCountVector center2(center1.size());
+
+
+  {
+    Index i = l + 1;
+    while (spectrumIndexes[l].first == spectrumIndexes[i].first)
+      ++i;
+    
+    std::copy(spectrumIndexes[i].first.begin(), spectrumIndexes[i].first.end(), center2.begin());
+  }
+
+  assert(center1 != center2);
+
+  typedef std::vector<int> IntVector;
+  IntVector clusterNumber(r - l, 0);
+  clusterNumber.front() = 1;
+  clusterNumber.back() = 0;
+
+  int clustersChanged = 100000000;
+
+
+  // debug.
+  int clusterChanges = 0;
+
+
+  KmerCountVector counts1;
+  KmerCountVector counts2;
+
+  counts1.reserve(r - l);
+  counts2.reserve(r - l);
+
+  while((clustersChanged > 0) && ((clusterChanges <= 1) || (clustersChanged > (r - l) / 10))) {
+
+#if MY_DEBUG
+    // debug.
+    std::cerr << '.' << std::flush;
+    if ((l == 215699) && (r == 216534)) {
+      for (Index i = l; i < r; ++i)
+	std::cerr << (clusterNumber[i - l]);
+      std::cerr << '\n';
+    }
+#endif // MY_DEBUG
+    ++clusterChanges;
+
+
+    clustersChanged = 0;
+    
+    for (Index i = l; i < r; ++i) {
+      int d1 = 0;
+      int d2 = 0;
+
+      ConstKmerCountArray spectrum = spectrumIndexes[i].first.data();
+      for (KmerNumber j = 0; j < number_of_k_mers; ++j) {
+	d1 += abs(static_cast<int>(center1[j]) - static_cast<int>(spectrum[j]));
+	d2 += abs(static_cast<int>(center2[j]) - static_cast<int>(spectrum[j]));
+      }
+
+      if (d1 != d2) {
+	int newClusterNumber = (d1 < d2) ? 1 : 0;
+	if (newClusterNumber != clusterNumber[i - l]) {
+	  clusterNumber[i - l] = newClusterNumber;
+	  ++clustersChanged;
+	} // if
+      }
+    } // for
+
+#if OLD_CODE
+
+    {
+
+      int sizeOfCluster1 = 0;
+      int sizeOfCluster2 = 0;
+
+      for (Index i = l; i < r; ++i) 
+	if (clusterNumber[i - l] == 1) 
+	  ++sizeOfCluster1;
+	else 
+	  ++sizeOfCluster2;
+
+
+      if (sizeOfCluster1 == 0) {
+	
+	// debug.
+	std::cerr << "sizeOfCluster1 == 0\n" << std::flush;
+
+	float maxD2 = 0;
+	Index maxIndex = -1;
+	for (Index i = l; i < r; ++i) {
+	  float d2 = 0;
+	  for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j) {
+	    d2 += fabsf(center2[j] - spectrumIndexes[i].first[j]);
+	  }
+	  if (d2 > maxD2) {
+	    maxD2 = d2;
+	    maxIndex = i;
+	  }
+	} // for
+
+	assert(l <= maxIndex);
+	assert(maxIndex < r);
+
+	clusterNumber[maxIndex - l] = 1;
+	clustersChanged = true;
+      }
+      if (sizeOfCluster2 == 0) {
+
+	// debug.
+	std::cerr << "sizeOfCluster2 == 0\n" << std::flush;
+
+	float maxD1 = 0;
+	Index maxIndex = -1;
+	for (Index i = l; i < r; ++i) {
+	  float d1 = 0;
+	  for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j) {
+	    d1 += fabsf(center1[j] - spectrumIndexes[i].first[j]);
+	  }
+	  if (d1 > maxD1) {
+	    maxD1 = d1;
+	    maxIndex = i;
+	  }
+	} // for
+
+	assert(l <= maxIndex);
+	assert(maxIndex < r);
+
+	clusterNumber[maxIndex - l] = 0;
+	clustersChanged = true;
+      }
+	
+
+    }
+#endif // OLD_CODE
+ 
+
+    if (clustersChanged) { // Find new centers.
+
+#if OLD_CODE
+      int sizeOfCluster1 = 0;
+      int sizeOfCluster2 = 0;
+
+      std::fill(center1.begin(), center1.end(), 0);
+      std::fill(center2.begin(), center2.end(), 0);
+
+      for (Index i = l; i < r; ++i) {
+	if (clusterNumber[i - l] == 1) {
+	  ++sizeOfCluster1;
+	  for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j)
+	    center1[j] += spectrumIndexes[i].first[j];
+	} else {
+	  ++sizeOfCluster2;
+	  for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j)
+	    center2[j] += spectrumIndexes[i].first[j];
+	}
+
+      }
+
+      assert(sizeOfCluster1 > 0);
+      assert(sizeOfCluster2 > 0);
+
+      for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j) {
+	center1[j] /= sizeOfCluster1;
+	center2[j] /= sizeOfCluster2;
+      } // for
+#endif // OLD_CODE
+
+
+#if MY_DEBUG
+      std::cerr << '(' << std::flush;
+#endif // MY_DEBUG
+
+      for (KmerNumber j = 0; j < number_of_k_mers; ++j) {
+
+	counts1.clear();
+	counts2.clear();
+
+	for (Index i = l; i < r; ++i) {
+	  if (clusterNumber[i - l] == 1) {
+	    counts1.push_back(spectrumIndexes[i].first[j]);
+	  } else {
+	    counts2.push_back(spectrumIndexes[i].first[j]);
+	  } // if (clusterNumber[i - l] == 1)
+	} // for (Index i
+
+#if MY_DEBUG
+	assert(counts1.size() > 0);
+	assert(counts2.size() > 0);
+#endif // MY_DEBUG
+
+	center1[j] = *median(counts1.begin(), counts1.end());
+	center2[j] = *median(counts2.begin(), counts2.end());
+
+      } // for (KmerNumber j
+
+#if MY_DEBUG
+      std::cerr << ')' << std::flush;
+#endif // MY_DEBUG
+
+    } // Find new center.
+
+
+  } // while
+
+#if MY_DEBUG
+  std::cerr << 'S' << std::flush;
+#endif // MY_DEBUG
+
+  { // Split.
+    Index i = l;
+    Index j = r - 1;
+
+    KmerCountVector temp(number_of_k_mers);
+    
+    while (j > i) {
+      while (clusterNumber[i - l] == 1)
+	++i;
+      while (clusterNumber[j - l] == 0)
+	--j;
+      if (j > i) {
+	std::swap(spectrumIndexes[i], spectrumIndexes[j]);
+	/*
+	{
+	  // std::swap(spectrumIndexes[i].first, spectrumIndexes[j].first);
+	  std::copy(spectrumIndexes[i].first.begin(), spectrumIndexes[i].first.end(), temp.begin());
+	  std::copy(spectrumIndexes[j].first.begin(), spectrumIndexes[j].first.end(), spectrumIndexes[i].first.begin());
+	  std::copy(temp.begin(), temp.end(), spectrumIndexes[i].first.begin());
+
+	  std::swap(spectrumIndexes[i].second, spectrumIndexes[j].second);
+	}
+	*/
+	std::swap(clusterNumber[i - l], clusterNumber[j - l]);
+	++i;
+	--j;
+      } // if
+    } // while
+
+    // debug.
+    
+  } // Split.
+
+#if MY_DEBUG
+  std::cerr << 'R' << std::flush;
+#endif // MY_DEBUG
+
+  Index result = l;
+  while (clusterNumber[result - l] == 1)
+    ++result;
+
+  for (Index i = result; i < r; ++i)
+    assert(clusterNumber[i - l] == 0);
+
+#if MY_DEBUG
+  // debug.
+  std::cerr << "clusterChanges=" << clusterChanges << ". done.\n" << std::flush;
+#endif // MY_DEBUG
+#if MY_DEBUG
+  std::cerr << 'Q' << std::flush;
+#endif // MY_DEBUG
+
+  return result;
+  
+  
+} // twoMeans
+
+
+namespace cluster_information
+{
+  Index *l = 0;
+  Index *r = 0;
+  Index *sizeOfLeftCluster = 0;
+
+
+  Index *numberOfKmers = 0;
+  KmerCount **allMinCounts = 0;
+  KmerCount **allMaxCounts = 0;
+
+  int **counts = 0;
+}
+
+
+
+const KmerCount MAX_COUNT = std::numeric_limits<KmerCount>::max();
+const KmerCount MIN_COUNT = 0;
+
+const Index NULL_INDEX = -1;
+
+
+void clusterSort(const Index l, const Index r, const Index index)
+{
+
+#if MY_DEBUG
+  if (r - l > 10000)
+    std::cerr << "clusterSort(" << l << ",\t" << r << ") " << std::flush;
+#endif // MY_DEBUG
+
+#if OLD_CODE
+  ClusterInformation &info = clusters_new[index];
+
+  clusters_new[index].l = l;
+  clusters_new[index].r = r;
+
+  KmerCountArray minCounts = clusters_new[index].minCounts;
+  KmerCountArray maxCounts = clusters_new[index].maxCounts;
+
+  std::fill_n(minCounts, number_of_k_mers, MAX_COUNT);
+  std::fill_n(maxCounts, number_of_k_mers, MIN_COUNT);
+
+  for (Index i = l; i < r; ++i)
+    for (KmerNumber j = 0; j < number_of_k_mers; ++j) {
+      KmerCount count = spectrumIndexes[i].first[j];
+
+      if (count < minCounts[j])
+	minCounts[j] = count;
+      if (count > maxCounts[j])
+	maxCounts[j] = count;
+    }
+
+
+#endif // OLD_CODE
+
+#if MY_DEBUG
+  std::cerr << '1' << std::flush;
+#endif // MY_DEBUG
+
+  {
+    // Set 'cluster_infomation' of this cluster.
+    // \for_all k \in {1...k_mer_length}: cluster_information::allMinCounts[k][index * cluster_information::numberOfKmers[k] ... index * cluster_information::numberOfKmers[k] + cluster_information::numberOfKmers[k] - 1]
+    
+
+    for (int k = 0; k < k_mer_length; ++k) {
+      std::fill_n(&cluster_information::allMinCounts[k][index * cluster_information::numberOfKmers[k]], cluster_information::numberOfKmers[k], MAX_COUNT);
+      std::fill_n(&cluster_information::allMaxCounts[k][index * cluster_information::numberOfKmers[k]], cluster_information::numberOfKmers[k], MIN_COUNT);
+    }
+
+#if MY_DEBUG
+    std::cerr << '2' << std::flush;
+#endif // MY_DEBUG
+
+    for (Index i = l; i < r; ++i) {
+
+
+      for (int k = 0; k < k_mer_length; ++k)
+	std::fill_n(cluster_information::counts[k], cluster_information::numberOfKmers[k], 0);
+    
+      std::copy (spectrumIndexes[i].first.begin(), spectrumIndexes[i].first.end(), cluster_information::counts[k_mer_length - 1]);
+
+      for (int k = k_mer_length - 1; k > 0; --k)
+	for (int j = 0; j < cluster_information::numberOfKmers[k]; ++j) {
+	  int j_prime = j / static_cast<int>(NUMBER_OF_NUCLEOTIDES);
+	  cluster_information::counts[k - 1][j_prime] += cluster_information::counts[k][j];
+	}
+
+      for (int k = 0; k < k_mer_length; ++k)
+	for (int j = 0; j < cluster_information::numberOfKmers[k]; ++j) {
+	  KmerCount count = MAX_COUNT;
+	  if (cluster_information::counts[k][j] < MAX_COUNT)
+	    count = cluster_information::counts[k][j];
+
+	  KmerCount &minCount = cluster_information::allMinCounts[k][index * cluster_information::numberOfKmers[k] + j];
+	  KmerCount &maxCount = cluster_information::allMaxCounts[k][index * cluster_information::numberOfKmers[k] + j];
+
+	  if (minCount > count)
+	    minCount = count;
+	  if (maxCount < count)
+	    maxCount = count;
+	}
+    } // for (Index i = l; i < r; ++i)
+
+
+    cluster_information::l[index] = l;
+    cluster_information::r[index] = r;
+
+  }
+
+#if MY_DEBUG
+  {
+
+    std::cerr << '\n';
+
+    for (int k = 0; k < k_mer_length; ++k) {
+      for (int j = 0; j < cluster_information::numberOfKmers[k]; ++j) {
+	std::cerr << static_cast<int>(cluster_information::allMinCounts[k][index * cluster_information::numberOfKmers[k] + j]) << '\t';
+      }
+      std::cerr << '\n';
+      for (int j = 0; j < cluster_information::numberOfKmers[k]; ++j) {
+	std::cerr << static_cast<int>(cluster_information::allMaxCounts[k][index * cluster_information::numberOfKmers[k] + j]) << '\t';
+      }
+      std::cerr << '\n';
+      
+
+    }
+
+    std::cerr << std::flush;
+
+  }
+
+  std::cerr << '3' << std::flush;
+#endif // MY_DEBUG
+
+
+#if MEDIAN_COUNTS
+  // Calculate 'medianCounts' and 'averageDeviations'.
+  KmerCountArray medianCounts = info.medianCounts;
+  {
+    //    KmerCountArray averageDeviations = info.averageDeviations;
+
+    KmerCountVector counts(r - l);
+    KmerCount count;
+
+    for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j) {
+      for (Index i = l; i < r; ++i) {
+	count = spectrumIndexes[i].first[j];
+	counts[i - l] = count;
+      } // for (Index i = l; i < r; ++i)
+    
+      medianCounts[j] = *median(counts.begin(), counts.end());
+
+#if OLD_CODE
+      int deviation = 0;
+
+      for (Index i = l; i < r; ++i) {
+	count = spectrumIndexes[i].first[j];
+	deviation += abs(medianCounts[j] - count);
+      } // for (Index i = l; i < r; ++i)
+
+      averageDeviations[j] = deviation / (r - l);
+#endif // OLD_CODE
+
+    } // for (KmerNumber k = 0; k < NUMBER_OF_K_MERS; ++k)
+  } // Calculate 'medianCounts' and 'averageDeviations'.
+#endif // MEDIAN_COUNTS
+
+#if OLD_CODE
+  // Store indexes of different new counts.
+  info.numberOfDifferentCounts = 0;
+  for (KmerNumber j = 0; j < number_of_k_mers; ++j) 
+    if ((minCounts[j] > prevMinCounts[j]) || 
+	(maxCounts[j] < prevMaxCounts[j])
+
+#if MEDIAN_COUNTS
+	||
+	(medianCounts[j] != prevMedianCounts[j])
+#endif // MEDIAN_COUNTS
+
+	) {
+      info.indexOfDifferentCounts[info.numberOfDifferentCounts] = j;
+      ++info.numberOfDifferentCounts;
+    }
+
+#endif // OLD_CODE
+
+#if MY_DEBUG
+  std::cerr << "l=" << l << "\tr=" << r << "\ti=" << index << '\n' << std::flush;
+  for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j)
+    std::cerr << static_cast<int>(minCounts[j]) << '\t';
+  std::cerr << '\n' << std::flush;
+  for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j)
+    std::cerr << static_cast<int>(maxCounts[j]) << '\t';
+  std::cerr << '\n' << std::flush;
+#endif // MY_DEBUG
+
+  bool allEqual = true;
+
+  {
+    for (KmerNumber j = 0; j < number_of_k_mers; ++j) {
+
+      KmerCount &minCount = cluster_information::allMinCounts[k_mer_length - 1][index * cluster_information::numberOfKmers[k_mer_length - 1] + j];
+      KmerCount &maxCount = cluster_information::allMaxCounts[k_mer_length - 1][index * cluster_information::numberOfKmers[k_mer_length - 1] + j];
+
+      if (minCount < maxCount)
+	allEqual = false;
+    }
+  }
+
+
+  if (allEqual) {
+
+#if OLD_CODE
+    clusters_new[index].sizeOfLeftCluster = NULL_INDEX;
+#endif // OLD_CODE
+
+    cluster_information::sizeOfLeftCluster[index] = NULL_INDEX;
+
+#if MY_DEBUG
+    std::cerr << ".\n" << std::flush;
+#endif // MY_DEBUG
+
+    return;
+  }
+
+#if MY_DEBUG
+  std::cerr << '4' << std::flush;
+#endif // MY_DEBUG
+
+  Index mid = twoMeans(l, r);
+
+#if MY_DEBUG
+  std::cerr << ".\n" << std::flush;
+#endif // MY_DEBUG
+
+#if OLD_CODE
+  clusters_new[index].sizeOfLeftCluster = mid - l;
+#endif // OLD_CODE
+
+  cluster_information::sizeOfLeftCluster[index] = mid - l;
+
+  clusterSort(l, mid, index + 1);
+  clusterSort(mid, r, index + 2 * (mid - l));
+}
+
+IndexVector clusterSearchResults;
+
+#if OLD_CODE
+typedef const ClusterInformation * ConstClusterInfoPointer;
+#endif // OLD_CODE
+
+typedef const KmerCount * const ConstKmerCountArray;
+
+typedef int Threshold;
+
+Threshold maxMoreThreshold;
+Threshold minMoreThreshold;
+
+
+
+namespace layered {
+
+  int layeredMaxMoreThresholds[MAXIMUM_K_MER_LENGTH];
+  int layeredMinMoreThresholds[MAXIMUM_K_MER_LENGTH];
+
+  int **allQueryCounts = 0;
+
+  int layeredClusterSearch(Index index, int k)
+  {
+    
+#if MY_DEBUG
+    std::cerr << "layeredClusterSearch(" << index << ", " << k << ")\n" << std::flush;
+#endif // MY_DEBUG
+
+    int maxMore = 0;
+    int minMore = 0;
+
+    int numberOfKmers = cluster_information::numberOfKmers[k];
+    KmerCount *minCounts = &cluster_information::allMinCounts[k][index * numberOfKmers];
+    KmerCount *maxCounts = &cluster_information::allMaxCounts[k][index * numberOfKmers];
+
+    int minMoreThreshold = layeredMinMoreThresholds[k];
+    int maxMoreThreshold = layeredMaxMoreThresholds[k];
+
+
+    int *queryCounts = allQueryCounts[k];
+
+#if MY_DEBUG
+    for (Index i = 0; i < numberOfKmers; ++i) 
+      std::cerr << queryCounts[i] << '\t';
+    std::cerr << '\n';
+    for (Index i = 0; i < numberOfKmers; ++i) 
+      std::cerr << static_cast<int>(minCounts[i]) << '\t';
+    std::cerr << '\n';
+    for (Index i = 0; i < numberOfKmers; ++i) 
+      std::cerr << static_cast<int>(maxCounts[i]) << '\t';
+    std::cerr << '\n' << std::flush;
+#endif // MY_DEBUG
+
+    {
+      // Compute 'minMore' and 'maxMore'.
+
+      int minCount;
+      int maxCount;
+      int queryCount;
+
+      for (Index i = 0; i < numberOfKmers; ++i) {
+
+	maxCount = maxCounts[i];
+	queryCount = queryCounts[i];
+	
+	if (maxCount > queryCount) {
+	  maxMore += maxCount - queryCount;
+	  
+	  minCount = minCounts[i];
+	  if (minCount > queryCount) { 
+	    minMore += minCount - queryCount;
+
+	    if (minMore > minMoreThreshold) {
+#if MY_DEBUG								
+	      std::cerr << "minMore=" << minMore << " > " << layeredMinMoreThresholds[k] << "=layeredMinMoreThresholds[k]\n" << std::flush;
+#endif // MY_DEBUG				
+	      return 0;
+	    } // if (minMore > minMoreThreshold)
+
+
+	    // Second type of approximate filter
+	    if (approximateFilter2 && minMore * (cluster_information::r[index] - cluster_information::l[index]) > minMoreThreshold)
+	      return 0;
+
+	  } // if (minCount > queryCount)
+
+	} // if (maxMore > queryCount)
+	  
+#if OLD_CODE
+	if (minCounts[i] > queryCounts[i])
+	  minMore += static_cast<int>(minCounts[i]) - queryCounts[i];
+	if (maxCounts[i] > queryCounts[i])
+	  maxMore += static_cast<int>(maxCounts[i]) - queryCounts[i];
+#endif // OLD_CODE
+      } // for (Index i = 0; i < numberOfKmers; ++i)
+    }
+
+    if (maxMore <= maxMoreThreshold) {
+#if MY_DEBUG
+      std::cerr << "maxMore=" << maxMore << " <= " << layeredMaxMoreThresholds[k] << "=layeredMaxMoreThresholds[k]\n" << std::flush;
+#endif // MY_DEBUG
+      if (k + 1 < k_mer_length) {
+	return layeredClusterSearch(index, k + 1);
+      }
+
+#if MY_DEBUG
+      std::cerr << "Return results:\n" << std::flush;
+#endif // MY_DEBUG
+
+      for (Index i = cluster_information::l[index]; i < cluster_information::r[index]; ++i)
+	clusterSearchResults.push_back(i);
+      return cluster_information::r[index] - cluster_information::l[index];
+    }
+
+
+    return (layeredClusterSearch(index + 1, k) +
+	    layeredClusterSearch(index + 2 * cluster_information::sizeOfLeftCluster[index], k));
+
+  }
+
+} // namespace layered
+
+
+void *threaded_search_without_backpointer(void *read_args) {
+    struct thread_args *args = static_cast<struct thread_args *>(read_args);
+
+    const PositionType l = args->l;
+    const PositionType r = args->r;
+    PositionType pos = args->pos;
+    const Index intervalIndex = args->intervalIndex;
+    const uint16_t tableIndex = args->tableIndex;
+    const int globalOffset = args->globalOffset;
+    search_without_backpointer(l, r, pos, intervalIndex, tableIndex, globalOffset);
+    return NULL;
+}
+
+
+void * threadWorker(void *context) {
+  int id = (static_cast<struct Context *>(context))->id;
+
+  pthread_mutex_lock(&searchWorkersMutexes[id]);
+  while (1) {
+    // Then wait for more work.
+    pthread_cond_wait(&searchWorkersConds[id], &searchWorkersMutexes[id]);
+
+    // Received work:
+#if CHRIS_DEBUG
+    std::cout<< "[THREAD " << id << "] received work.  Table index: " << searchWorkersArgs[id].tableIndex << "\n";
+    flush(std::cout);
+#endif // CHRIS_DEBUG
+
+    
+    threaded_search_without_backpointer(static_cast<void *>(&searchWorkersArgs[id]));
+    pthread_mutex_lock(&allThreadsMutex);
+    
+    ++threads_completed;
+
+    if (threads_completed >= running_threads) {
+      pthread_cond_signal(&searchWorkersCompleted);
+    }
+    pthread_mutex_unlock(&allThreadsMutex);
+  }
+
+  return NULL;
+}
+
+
+/*
+  Input: 'queryString', 'queryLength', 'queryHeader', 'queryHeaderLength' should be set.
+  Database: 'sortedStrings' contains the strings to be clusterd in lexicographically sorted order. After making a cluster the sequences are removed from this vector.
+  Output: Prints the cluster to STDOUT. The format of the output is determined by the value of global boolean variable 'outputMultipleAlignment'.
+*/
+
+void makeCluster()
+{
+#if MY_DEBUG
+  std::cerr << "makeCluster()\n" << std::flush;
+#endif // MY_DEBUG
+
+  radious = static_cast<CostType>(floor((1 - similarity) * querySequenceLength));
+  if (mismatches >= 0)
+    radious = static_cast<CostType>(mismatches);
+
+  if (noOverlap)
+    radious *= 2;
+
+  if (! noKmerFilter) {
+    querySpectrum = countKmers(querySequencePointer, querySequenceLength);
+  } // if (! noKmerFilter)
+
+
+
+  if (! noKmerFilter) {   
+    // Set allQueryCounts
+
+    std::copy(querySpectrum.begin(), querySpectrum.end(), layered::allQueryCounts[k_mer_length - 1]);
+    
+    for (int k = 0; k < k_mer_length - 1; ++k)
+      std::fill_n(layered::allQueryCounts[k], cluster_information::numberOfKmers[k], 0);
+
+
+    for (int k = k_mer_length - 1; k > 0; --k)
+      for (int j = 0; j < cluster_information::numberOfKmers[k]; ++j) {
+	int j_prime = j / static_cast<int>(NUMBER_OF_NUCLEOTIDES);
+	layered::allQueryCounts[k - 1][j_prime] += layered::allQueryCounts[k][j];
+      }
+
+
+#if MY_DEBUG
+    for (int k = 0; k < k_mer_length; ++k) {
+      for (int j = 0; j < cluster_information::numberOfKmers[k]; ++j)
+	std::cerr << layered::allQueryCounts[k][j] << '\t';
+      std::cerr << '\n' << std::flush;
+    }
+#endif // MY_DEBUG
+
+    for (int k = 0; k < k_mer_length - 1; ++k)
+      for (int j = 0; j < cluster_information::numberOfKmers[k]; ++j)
+	if (layered::allQueryCounts[k][j] > MAX_COUNT)
+	  layered::allQueryCounts[k][j] = MAX_COUNT;
+
+
+  } // if (! noKmerFilter)
+
+#if OLD_CODE
+  callSplitSearch();
+#endif // OLD_CODE
+
+#if MY_DEBUG
+  std::cerr << "splitSearchResults.size() == " << splitSearchResults.size() << '\n' << std::flush;
+#endif // MY_DEBUG
+
+
+#if MY_DEBUG
+  naiveSearchResults.clear();
+  
+  naiveSpectrumSearch(radious * K_MER_LENGTH);
+
+  assert(naiveSearchResults == splitSearchResults);
+
+  std::cerr << "navieSearchResults == splitSearchResults\n" << std::flush;
+#endif // MY_DEBUG
+
+#if MY_DEBUG
+  for (int i = 0; i < NUMBER_OF_K_MERS; ++i)
+    std::cerr << static_cast<int>(querySpectrum[i]) << '\t';
+  std::cerr << '\n' << std::flush;
+
+  for (int j = 0; j < splitSearchResults.size(); ++j) {
+    for (int i = 0; i < NUMBER_OF_K_MERS; ++i)
+      std::cerr << static_cast<int>(spectrumIndexes[splitSearchResults[j]].first[i]) << '\t';
+    std::cerr << '\n';
+  }
+#endif // MY_DEBUG
+
+  clusterSearchResults.clear();
+
+  if (! noKmerFilter) {
+    // Set k-mer filter thresholds
+    {
+      for (int i = 0; i < k_mer_length; ++i)
+	if (radious == 0) {
+	  layered::layeredMaxMoreThresholds[i] = 0;
+	  layered::layeredMinMoreThresholds[i] = 0;
+	} else {
+	  // radious * sqrt(i + 1)
+	  // Approximation minMoreThreshold <= maxMoreThreshold
+	  layered::layeredMaxMoreThresholds[i] = std::max(radious, static_cast<CostType>(querySequenceLength * 0.04)) * (i + 1);
+#if OLD_CODE
+	  layered::layeredMinMoreThresholds[i] = (approximateFilter ? static_cast<int>(radious * exp(approximationFactor * log(i + 1)))  : (radious * (i + 1)));
+#endif // OLD_CODE
+	  layered::layeredMinMoreThresholds[i] = radious * (i + 1);
+	}
+    } 
+
+#if MY_DEBUG
+    std::cerr << "Before layeredClusterSearch(0, 0)\n" << std::flush;
+#endif //MY_DEBUG
+
+    layered::layeredClusterSearch(0, 0);
+
+#if MY_DEBUG
+    for (int i = 0; i < static_cast<int>(clusterSearchResults.size()); ++i)
+      std::cerr << clusterSearchResults[i] << '\t';
+    std::cerr << '\n' << std::flush;
+
+    exit(EXIT_FAILURE);
+#endif // MY_DEBUG
+
+
+#if NEW_CODE
+    clusterSearch(clusters.data());
+#endif // NEW_CODE
+  } // if (! noKmerFilter)
+
+  IndexVector unmarkeds;
+
+
+  if (! noKmerFilter) { 
+    // Remove already marked sequences.
+    unmarkeds.reserve(clusterSearchResults.size());
+
+    for(IndexVector::const_iterator it = clusterSearchResults.begin(); it != clusterSearchResults.end(); ++it){
+      Index lexicographicIndex = spectrumIndexes[*it].second;
+      if (!marked[lexicographicIndex])
+	unmarkeds.push_back(lexicographicIndex);
+    }
+    const Index numberOfUnmarkeds = unmarkeds.size();
+  
+    std::sort(unmarkeds.begin(), unmarkeds.end());
+
+#if MY_DEBUG							
+    std::cout << "Query:\t" << querySequencePointer << '\n';
+  
+    for (Index i = 0; i < numberOfUnmarkeds; ++i)
+      std::cout << lexicographicSequencePointers[unmarkeds[i]] << '\n';
+#endif // MY_DEBUG
+
+    sortedStrings.resize(numberOfUnmarkeds);
+
+    PositionVector sortedStringsLengths(sortedStrings.size());
+  
+    for (Index i = 0; i < numberOfUnmarkeds; ++i) {
+      sortedStrings[i] = lexicographicSequencePointers[unmarkeds[i]];
+      sortedStringsLengths[i] = sequenceLengths[unmarkeds[i]];
+    }
+
+    searchResults.clear();
+
+    for(uint16_t i = 0; i < num_threads; ++i){
+      word[i][1] = invalidChar;
+      wordMinLength[i][1] = MAX_LENGTH + 1;
+    }
+
+    
+    size_t num_threads_to_use = num_threads;
+    int interval_size = static_cast<int>(ceil(sortedStrings.size() / static_cast<double>(num_threads)));//sortedStrings.size() / num_threads_to_use;
+
+    if (sortedStrings.size() < num_threads) {
+      num_threads_to_use = 1;
+      interval_size = sortedStrings.size();
+    }
+
+#if CHRIS_DEBUG
+      std::cout << "Size: " <<  sortedStrings.size() << "\n";
+      //std::cout << "Building first trees for table: " << i << " from: " <<  << " to " << end << ", interval_size " << chunkIndexes[i].second << "\n" << std::flush;
+#endif // CHRIS_DEBUG
+
+    running_threads = 0;
+    for (unsigned int i = 0, currentTable = 0; i < sortedStrings.size(); i += interval_size, ++currentTable) {
+      unsigned int end = i + interval_size;
+      if (i + interval_size > sortedStrings.size()) {
+        end = sortedStrings.size();
+      }
+      else {
+        end = i + interval_size;
+      }
+
+      sortedStringsLengthsMinTree[currentTable].resize(2 * (end - i));
+      sortedStringsLengthsMaxTree[currentTable].resize(2 * (end - i));
+
+#if CHRIS_DEBUG
+        std::cout << "Kmer: Building second trees from: " << i << " to " << end << "\n" << std::flush; 
+        std::cout << "Kmer: Size of sorted string min/max: " << sortedStringsLengthsMinTree[currentTable].size() << ", " 
+            << sortedStringsLengthsMaxTree[currentTable].size() << "\n" << std::flush;
+#endif // CHRIS_DEBUG
+
+      buildMinTree(sortedStringsLengths.begin() + i, sortedStringsLengths.begin() + end, sortedStringsLengthsMinTree[currentTable].begin());
+      buildMaxTree(sortedStringsLengths.begin() + i, sortedStringsLengths.begin() + end, sortedStringsLengthsMaxTree[currentTable].begin());
+    }
+
+    if (numberOfUnmarkeds) {
+      if (outputMultipleAlignment)
+	search(0, numberOfUnmarkeds - 1, 0, 0, 0, 0);
+      else {
+      
+      running_threads = 0;
+      threads_completed = 0;
+
+#if CHRIS_DEBUG
+        std::cout << "Size: " << sortedStrings.size() << ", numberOfUnmarkeds: " << numberOfUnmarkeds 
+        << ", interval_size" << interval_size<< "\n" <<std::flush;
+#endif // CHRIS_DEBUG
+
+      for (int i = 0; i < numberOfUnmarkeds; i += interval_size) {
+        ++running_threads;
+        pthread_mutex_lock(&searchWorkersMutexes[running_threads-1]);
+
+        struct thread_args *args = &searchWorkersArgs[running_threads-1];
+        args->l = 0;
+
+        if (i + interval_size > numberOfUnmarkeds) {
+          args->r = (numberOfUnmarkeds - 1) - i;
+        }
+        else {
+          args->r = interval_size - 1;
+        }
+        args->pos = 0;
+        args->intervalIndex = 0;
+        args->tableIndex = running_threads-1;
+        args->globalOffset = i;
+
+#if CHRIS_DEBUG
+        std::cout << "  Assigning work to thread: " << i << ", l: " << args->l
+            << ", r: " << args->r << "\n" <<std::flush;
+#endif // CHRIS_DEBUG
+
+        pthread_cond_signal(&searchWorkersConds[running_threads-1]);
+        pthread_mutex_unlock(&searchWorkersMutexes[running_threads-1]);
+
+      }
+
+      // Wait for all threads to complete.
+      pthread_mutex_lock(&allThreadsMutex);
+      while (threads_completed < running_threads) {
+        pthread_cond_wait(&searchWorkersCompleted, &allThreadsMutex);
+      }
+      pthread_mutex_unlock(&allThreadsMutex);
+
+      for(int i = 0; i < running_threads; i++)  {
+        searchResults.insert(searchResults.end(), threadSearchResults[i].begin(), threadSearchResults[i].end());
+        threadSearchResults[i].clear();
+      }
+    }
+
+	//search_without_backpointer(0, numberOfUnmarkeds - 1, 0, 0, 0, 0);
+    }
+
+  } // if (! noKmerFilter)
+  else {
+    // No k-mer filter
+
+    for(uint16_t i = 0; i < num_threads; ++i){
+      word[i][1] = invalidChar;
+      wordMinLength[i][1] = MAX_LENGTH + 1;
+    }
+    searchResults.clear();
+
+    if (outputMultipleAlignment)
+      search(0, sortedStrings.size() - 1, 0, 0, 0, 0);
+    else {
+      int num_threads_to_use = num_threads;
+      int interval_size = static_cast<int>(ceil(sortedStrings.size() / static_cast<double>(num_threads)));
+      
+      if (sortedStrings.size() < num_threads) {
+        num_threads_to_use = 1;
+        interval_size = sortedStrings.size();
+      }
+
+      running_threads = 0;
+      threads_completed = 0;
+
+      for (unsigned int i = 0; i < sortedStrings.size(); i += interval_size) {
+        ++running_threads;
+
+        // Create the thread variables.
+        pthread_mutex_lock(&searchWorkersMutexes[running_threads - 1]);
+
+        struct thread_args *args = &searchWorkersArgs[running_threads - 1];
+        args->l = 0;
+
+        if (i + interval_size > sortedStrings.size()) {
+          args->r = (sortedStrings.size() - 1) - i;
+        }
+        else {
+          args->r = interval_size - 1;
+        }
+        args->pos = 0;
+        args->intervalIndex = 0;
+        args->tableIndex = running_threads-1;
+        args->globalOffset = i;
+
+#if CHRIS_DEBUG
+        std::cout << "Assigning work to thread: " << i << ", l: " << args->l
+            << ", r: " << args->r << "\n" <<std::flush;
+#endif // CHRIS_DEBUG
+
+
+        pthread_cond_signal(&searchWorkersConds[running_threads-1]);
+        pthread_mutex_unlock(&searchWorkersMutexes[running_threads-1]);
+      }
+
+      // Wait for all threads to complete.
+      pthread_mutex_lock(&allThreadsMutex);
+      while (threads_completed < running_threads) {
+        pthread_cond_wait(&searchWorkersCompleted, &allThreadsMutex);
+      }
+      pthread_mutex_unlock(&allThreadsMutex);
+
+      for(int i = 0; i < running_threads; i++)  {
+        searchResults.insert(searchResults.end(), threadSearchResults[i].begin(), threadSearchResults[i].end());
+        threadSearchResults[i].clear();
+      }
+
+      //search_without_backpointer(0, sortedStrings.size() - 1, 0, 0);
+    }
+  }
+
+
+  { // Output cluster alignment.
+	
+
+    // Output cluster size.
+    if (outputMultipleAlignment) {
+      std::cout << CLUSTER_INDICATOR << searchResults.size() + 1 << '\n';
+    }
+
+
+    typedef std::vector<PositionType> CountsVector;
+
+
+    // This vector will contain the maximum number of gaps that happen after position [i] in query string in ANY of the alignments to search results.
+    CountsVector gapCounts(querySequenceLength);
+	
+	
+    if (outputMultipleAlignment) { // Compute gapCounts[0..].
+
+      for (SearchResultVector::const_iterator resultIterator = searchResults.begin(); resultIterator != searchResults.end(); ++resultIterator) {
+	for (PositionType i = 0; i < static_cast<PositionType>(resultIterator->func.size() - 1); ++i)
+	  gapCounts[i] = std::max(gapCounts[i], resultIterator->func[i + 1] - resultIterator->func[i] - 1);
+
+      } // for
+    } // if (outputMultipleAlignment)
+
+    // This string will contain the query sequence with appropriate number of gaps inserted at each position to produce a multiple alignment of the cluster.
+    std::string clusterCenterSequenceWithGaps;
+
+    if (outputMultipleAlignment) { // Compute clusterCenterSequenceWithGaps.
+      for (PositionType positionInQuery = 0; positionInQuery < querySequenceLength; ++positionInQuery) {
+	clusterCenterSequenceWithGaps.append(gapCounts[positionInQuery], gapChar);
+	clusterCenterSequenceWithGaps.push_back(querySequencePointer[positionInQuery]);
+      } // for
+    } // if (outputMultipleAlignment)
+
+    // Output cluster center sequence with gaps from multiple alignment.
+    if (outputMultipleAlignment) {
+      std::cout << '>' << queryHeaderPointer << '\n'
+		<< clusterCenterSequenceWithGaps << '\n';
+    } else {
+      if (useFullQueryHeader) {
+        if (!assignAmbiguous) {
+          std::cout << queryHeaderPointer << '\t';
+        } else {
+          if (!printInvertedIndex)
+            center_nonambig_counts[queryHeaderPointer] = 1;
+        }
+      } else {
+        if (!assignAmbiguous) {
+          std::cout << firstWord(queryHeaderPointer, queryHeaderPointer + queryHeaderLength) << '\t';
+        } else {
+          if (!printInvertedIndex)
+            center_nonambig_counts[firstWord(queryHeaderPointer, queryHeaderPointer + queryHeaderLength)] = 1;
+        }
+      }
+    } 
+	
+    // Output multiple alignment of sequences in current cluster.
+    for (SearchResultVector::const_iterator resultIterator = searchResults.begin(); resultIterator != searchResults.end(); ++resultIterator) {
+
+      Index lexicographicIndex;
+      if (! noKmerFilter) {
+	lexicographicIndex = unmarkeds[resultIterator->number];
+      } else {
+	lexicographicIndex = lexicographicIndexOfSortedStringsIndexes[resultIterator->number];
+      }
+      ConstCharPointer s = sortedStrings[resultIterator->number];
+
+      if (!assignAmbiguous)
+        markedGrey[lexicographicIndex] = true;
+
+      if ((!noOverlap) || resultIterator->cost < radious / 2) {
+      
+	if (!marked[lexicographicIndex]) {
+    if (!assignAmbiguous) {
+  	  marked[lexicographicIndex] = true;
+  	  ++numberOfMarked;
+    }
+
+	  ConstCharPointer headerPointer = headerPointers[lexicographicIndex];
+	  PositionType headerLength = headerLengths[lexicographicIndex];
+
+	  if (outputMultipleAlignment) {
+	    std::string alignedSequence; 
+	    for (PositionType i = 0; i < static_cast<PositionType>(gapCounts.size()); ++i) {
+	      for (PositionType j = resultIterator->func[i + 1] - resultIterator->func[i]; j < gapCounts[i] + 1; ++j)
+		alignedSequence.push_back(gapChar);
+	      for (PositionType j = resultIterator->func[i]; j < resultIterator->func[i + 1]; ++j)
+		alignedSequence.push_back(s[j]);
+	    } // for (PositionType i = 0; i < gapCounts.size() - 1; ++i)
+
+	    std::cout << '>' << headerPointer << '\n'
+		      << alignedSequence << '\n';
+	  } else {
+      if (useFullQueryHeader) {
+        if (!assignAmbiguous) {
+          std::cout << headerPointer << '\t';
+        }
+        else {
+          if (printInvertedIndex) {
+            std::cout << headerPointer << '\t' << queryHeaderPointer << '\n';
+          } else {
+            inverted_index[headerPointer].push_back(queryHeaderPointer);
+            if (inverted_index[headerPointer].size() == 1) {
+              center_nonambig_counts[queryHeaderPointer] += 1;
+            } else if (inverted_index[headerPointer].size() == 2) {
+              center_nonambig_counts[inverted_index[headerPointer][0]] -= 1;
+            }
+          }
+        }
+
+      } else {
+        if (!assignAmbiguous) {
+          std::cout << firstWord(headerPointer, headerPointer + headerLength) << '\t';
+        } else {
+          if (printInvertedIndex) {
+            std::cout << firstWord(headerPointer, headerPointer + headerLength) << '\t' 
+                << firstWord(queryHeaderPointer, queryHeaderPointer + queryHeaderLength) << '\n';
+          } else {
+            inverted_index[firstWord(headerPointer, headerPointer + headerLength)].push_back(
+                firstWord(queryHeaderPointer, queryHeaderPointer + queryHeaderLength));
+
+            //inverted_index[firstWord(headerPointer, headerPointer + headerLength)].push_back(
+            //  firstWord(queryHeaderPointer, queryHeaderPointer + queryHeaderLength));
+            if (inverted_index[firstWord(headerPointer, headerPointer + headerLength)].size() == 1) {
+              center_nonambig_counts[firstWord(queryHeaderPointer, queryHeaderPointer + queryHeaderLength)] += 1;
+            } else if (inverted_index[firstWord(headerPointer, headerPointer + headerLength)].size() == 2) {
+              //std::cout << inverted_index[firstWord(headerPointer, headerPointer + headerLength)];
+              //std::cout << "Prev index: " << inverted_index[firstWord(headerPointer, headerPointer + headerLength)][0] << "+";
+              center_nonambig_counts[inverted_index[firstWord(headerPointer, headerPointer + headerLength)][0]] -= 1;
+            }
+          }
+        }
+      }
+
+	  }
+	    
+	} // if (!marked[lexicographicIndex])
+
+      } // if ((!noOverlap) || resultIterator->cost < radious / 2)
+
+    } // for
+	
+    if (!assignAmbiguous)
+      std::cout << '\n';
+
+  } // Output cluster alignment.
+
+}
+
+
+int main(int argc, char *argv[])
+{
+
+  { // Parse command line options.
+    
+    const std::string PROGRAM_NAME = "dnaclust";
+    const std::string USAGE_DESCRIPTION =
+      "  The output is written to STDOUT."
+      " Each line will contain the ids of the sequences in each cluster, and the first id of each line is the cluster representative.\n"
+      "  Example: To cluster a set of 16S rRNA fragments at 0.98 similarity use:\n"
+      "  ./dnaclust file.fasta -l -s 0.98 > clusters \n"
+      "  You can optionally specify a k-mer length for the filter."
+      " The longer k-mers use more memory. "
+      " Also the filter will be more specific with longer k-mers."
+      " The default log_4(median length) should be good for most cases.\n"
+      ;
+#if OLD_CODE
+	  "  For the fastest running time please specify k-mer length around log_4(sequence-length).\n"
+	  "  Example: To cluster a set of 16S RNA fragments of length around 250 at 0.98 similarity use.\n"
+	  "  ./dnaclust file.fasta -l -s 0.98 -k 3 > clusters \n"
+#endif // OLD_CODE
+
+
+    po::options_description desc("Options");
+
+    std::string inputFileName;
+    std::string predeterminedCentersFileName;
+    
+    desc.add_options()
+      ("help,h", "produce help message")
+      ("similarity,s", po::value<float>(&similarity)->default_value(DEFAULT_SIMILARITY), "set similarity between cluster center and cluster sequences")
+      ("input-file,i", po::value<std::string>(&inputFileName), "input file")
+      ("predetermined-cluster-centers,p", po::value<std::string>(&predeterminedCentersFileName), "file containing predetermined cluster centers")
+      ("recruit-only,r", po::bool_switch(&recruit_only), "when used with 'predetermined-cluster-centers' option, only clusters the input sequences that are similar to the predetermined centers")
+      //("multiple-alignment,m", po::bool_switch(&outputMultipleAlignment)->default_value(false), "produce multiple alignment for each cluster")
+      ("header,d", "output header line indicating run options")
+      ("left-gaps-allowed,l", "allow for gaps on the left of shorter string in semi-global alignment")
+      ("k-mer-length,k", po::value<int>(&k_mer_length)/*->default_value(DEFAULT_K_MER_LENGTH) */, "length of k-mer for filtering")
+#if OLD_CODE
+      ("approximate-filter,a", po::bool_switch(&approximateFilter)->default_value(false), "use faster approximate k-mer filter")
+      ("approximation-factor,f", po::value<double>(&approximationFactor)->default_value(0.5), "determine amount of approximation, by a floating point value between zero and one. smaller values result in more approximation.")
+      ("approximate-filter2", po::bool_switch(&approximateFilter2)->default_value(false), "use second type of approximate k-mer filter")
+#endif // OLC_CODE
+      ("approximate-filter", po::bool_switch(&approximateFilter2)->default_value(false), "use faster approximate k-mer filter")
+      ("k-mer-filter", po::bool_switch(&kmerFilter)->default_value(false), "use k-mer filter")
+      ("no-k-mer-filter", po::bool_switch(&noKmerFilter)->default_value(true), "do not use k-mer filter")
+      ("no-overlap", po::bool_switch(&noOverlap)->default_value(false), "cluster some of sequences such that the cluster centers are at distance at least two times the radius of the clusters")
+      ("threads,t", po::value<size_t>(&num_threads), "number of threads")
+      ("use-full-query-header,u", po::bool_switch(&useFullQueryHeader)->default_value(false), "use the full query header instead of the first word")
+      ("mismatches,m", po::value<float>(&mismatches)->default_value(DEFAULT_MISMATCHES), "number of mismatches allowed from cluster center")
+      ("assign-ambiguous,a", po::bool_switch(&assignAmbiguous), "assign ambiguous reads to clusters based on abundances of non-ambiguous reads")
+      ("random-seed,e", po::value<int>(&random_seed), "Seed for random number generator")
+      ("print-inverted-index", po::bool_switch(&printInvertedIndex), "Print mapping from sequence to each center")
+      ;
+
+    po::positional_options_description pod;
+    pod.add("input-file", -1);
+
+    typedef std::vector<std::string> StringVector;
+    StringVector options(argv, argv + argc);
+    
+    po::variables_map vm;
+    po::store(po::command_line_parser(argc, argv).options(desc).positional(pod).run(), vm);
+    notify(vm);
+    
+    if(vm.count("help") || 
+       (inputFileName.empty()))
+      {
+	std::cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] FastaFile\n"
+		  << USAGE_DESCRIPTION
+		  << desc << '\n';
+	exit(EXIT_FAILURE);
+      }
+
+    if(kmerFilter)
+      noKmerFilter = false;
+
+    std::ifstream inputFile(inputFileName.c_str());
+    inputFile >> inputFasta;
+
+    numberOfInputSequences = inputFasta.size();
+
+    // Initialize all threads.
+    for (size_t i = 0; i < MAX_THREADS; ++i) {
+      pthread_mutex_init(&searchWorkersMutexes[i], NULL);
+      pthread_cond_init(&searchWorkersConds[i], NULL);
+
+      //struct thread_args *args = &args_array[running_threads-1];
+      searchWorkersContext[i].id = i;
+      pthread_create(&searchWorkers[i], NULL, threadWorker, static_cast<void *>(&searchWorkersContext[i]));
+    }
+    pthread_mutex_init(&allThreadsMutex, NULL);
+    pthread_cond_init(&searchWorkersCompleted, NULL);
+    pthread_mutex_init(&running_mutex, NULL);
+
+    if (vm.count("predetermined-cluster-centers")) {
+      std::ifstream predeterminedCentersFile(predeterminedCentersFileName.c_str());
+      predeterminedCentersFile >> predeterminedCentersFasta;
+    }
+
+
+    if (vm.count("header")){ // Output command line options at the first line of output.
+      std::cout << HEADER_INDICATOR;
+      for (StringVector::const_iterator i = options.begin(); i != options.end(); ++i)
+	std::cout << ' ' << *i;
+      std::cout << '\n';
+    }
+
+    if (vm.count("left-gaps-allowed")) {
+      leftGapsAllowed = true;
+    }
+
+    if (vm.count("k-mer-length") == 0) {
+      // Try to guess appropriate k-mer length based on median sequence length.
+
+      typedef std::vector<size_t> size_vector;
+      size_vector sizes;
+      for (sequence::Fasta::const_iterator i = inputFasta.begin(); i != inputFasta.end(); ++i)
+	sizes.push_back(i->sequence.length());
+      size_t median_size = *median(sizes.begin(), sizes.end());
+      k_mer_length = static_cast<int>(floor(log(median_size) / log(NUMBER_OF_NUCLEOTIDES)));
+    }
+
+    // Initializing this at the beginning is crucial.
+    number_of_k_mers = power(static_cast<int>(NUMBER_OF_NUCLEOTIDES), k_mer_length);
+
+  }
+
+
+  {
+
+    { // Calculate global vectors indexed by 'lexicographical index': lexicographicalSequencePointers, sequenceLengths, headerPointers, headerLengths
+
+      typedef std::pair<ConstCharPointer, Index> PointerIndexPair;
+      typedef std::vector<PointerIndexPair> PointerIndexVector;
+    
+      PointerIndexVector pointerFastaIndexes(numberOfInputSequences);
+      lexicographicSequencePointers.resize(numberOfInputSequences);
+
+      for (Index i = 0; i < numberOfInputSequences; ++i) {
+	lexicographicSequencePointers[i] = inputFasta[i].sequence.c_str();
+
+	pointerFastaIndexes[i].first = inputFasta[i].sequence.c_str();
+	pointerFastaIndexes[i].second = i;
+      }
+
+      ternary_sort::ternarySort(lexicographicSequencePointers.data(), numberOfInputSequences);
+
+      std::sort(pointerFastaIndexes.begin(), pointerFastaIndexes.end());
+    
+      PointerIndexVector pointerLexicographicIndexes(numberOfInputSequences);
+
+      for (Index i = 0; i < numberOfInputSequences; ++i) {
+	pointerLexicographicIndexes[i].first = lexicographicSequencePointers[i];
+	pointerLexicographicIndexes[i].second = i;
+      }
+
+      std::sort(pointerLexicographicIndexes.begin(), pointerLexicographicIndexes.end());
+
+      IndexVector fastaIndexOfLexicographicIndexes(numberOfInputSequences);
+      IndexVector lexicographicIndexOfFastaIndexes(numberOfInputSequences);
+
+      for (Index i = 0; i < numberOfInputSequences; ++i) {
+	assert(pointerFastaIndexes[i].first == pointerLexicographicIndexes[i].first);
+      
+	fastaIndexOfLexicographicIndexes[pointerLexicographicIndexes[i].second] = pointerFastaIndexes[i].second;
+	lexicographicIndexOfFastaIndexes[pointerFastaIndexes[i].second] = pointerLexicographicIndexes[i].second;
+
+      }
+
+      headerPointers.resize(numberOfInputSequences);
+      headerLengths.resize(numberOfInputSequences);
+      sequenceLengths.resize(numberOfInputSequences);
+
+      for (Index i = 0; i < numberOfInputSequences; ++i) {
+	Index fastaIndex = fastaIndexOfLexicographicIndexes[i];
+	headerPointers[i] = inputFasta[fastaIndex].header.c_str();
+	headerLengths[i] = inputFasta[fastaIndex].header.length();
+	sequenceLengths[i] = inputFasta[fastaIndex].sequence.length();
+      }
+    } // Calculate global vectors indexed by 'lexicographical index': lexicographicalSequencePointers, sequenceLengths, headerPointers, headerLengths
+
+
+
+    typedef std::pair<PositionType, Index> LengthIndexPair;
+    typedef std::vector<LengthIndexPair> LengthIndexVector;
+#if MY_DEBUG
+    typedef std::pair<std::pair<PositionType, ConstCharPointer>, Index> LengthIndexDebugPair;
+    typedef std::vector<LengthIndexDebugPair> LengthIndexVector;
+#endif // MY_DEBUG
+    LengthIndexVector lengthLexicographicIndexes(numberOfInputSequences);
+    
+    for (Index i = 0; i < numberOfInputSequences; ++i) {
+
+      lengthLexicographicIndexes[i].first = sequenceLengths[i];
+      lengthLexicographicIndexes[i].second = i;
+#if MY_DEBUG
+      // Assume i is fastaIndex;
+      Index lexicographicIndex = lexicographicIndexOfFastaIndexes[i];
+      lengthLexicographicIndexes[i].first.first = sequenceLengths[lexicographicIndex];
+      lengthLexicographicIndexes[i].first.second = lexicographicSequencePointers[lexicographicIndex];
+      lengthLexicographicIndexes[i].second = lexicographicIndex;
+#endif // MY_DEBUG
+    }
+
+
+    std::sort(lengthLexicographicIndexes.begin(), lengthLexicographicIndexes.end(), std::greater<LengthIndexPair>());
+#if MY_DEBUG
+    std::sort(lengthLexicographicIndexes.begin(), lengthLexicographicIndexes.end(), std::greater<LengthIndexDebugPair>());
+#endif // MY_DEBUG
+
+    if (! noKmerFilter) { 
+
+      spectrumIndexes.resize(numberOfInputSequences);
+
+      for (Index i = 0; i < numberOfInputSequences; ++i) {
+	spectrumIndexes[i].first = countKmers(lexicographicSequencePointers[i], sequenceLengths[i]);
+	spectrumIndexes[i].second = i;
+      }
+
+
+      
+      { // Sorting the k-mer spectrums in a clever way.
+	
+	{ // Allocate cluster_information
+	  cluster_information::l = new Index[2 * numberOfInputSequences];
+	  cluster_information::r = new Index[2 * numberOfInputSequences];
+	  cluster_information::sizeOfLeftCluster = new Index[2 * numberOfInputSequences];
+
+	  cluster_information::numberOfKmers = new Index[k_mer_length];
+	  for (int i = 0; i < k_mer_length; ++i)
+	    cluster_information::numberOfKmers[i] = power(static_cast<int>(NUMBER_OF_NUCLEOTIDES), i + 1);
+
+	  cluster_information::allMinCounts = new KmerCount *[k_mer_length];
+	  for (int i = 0; i < k_mer_length; ++i)
+	    cluster_information::allMinCounts[i] = new KmerCount[cluster_information::numberOfKmers[i] * numberOfInputSequences * 2];
+
+	  cluster_information::allMaxCounts = new KmerCount *[k_mer_length];
+	  for (int i = 0; i < k_mer_length; ++i)
+	    cluster_information::allMaxCounts[i] = new KmerCount[cluster_information::numberOfKmers[i] * numberOfInputSequences * 2];
+
+	  cluster_information::counts = new int *[k_mer_length];
+	  for(int i = 0; i < k_mer_length; ++i)
+	    cluster_information::counts[i] = new int[cluster_information::numberOfKmers[i]];
+
+	}
+
+	// Allocate allQueryCounts
+	{
+	  layered::allQueryCounts = new int *[k_mer_length];
+	  for (int i = 0; i < k_mer_length; ++i)
+	    layered::allQueryCounts[i] = new int[cluster_information::numberOfKmers[i]];
+	}
+
+#if MY_DEBUG
+	std::cerr << "Before clusterSort()\n" << std::flush;
+#endif // MY_DEBUG
+
+	{
+	  clusterSort(0, numberOfInputSequences, 0);
+
+
+#if MY_DEBUG
+	  std::cerr << "After clusterSort.\n" << std::flush;
+#endif // MY_DEBUG
+
+	  remainingAtLastDbUpdate = numberOfInputSequences;
+	}
+
+      }
+
+
+
+#if MY_DEBUG
+      for (Index i = 0; i < numberOfInputSequences; ++i) {
+	std::cout << i << ":\t";
+	for (KmerNumber j = 0; j < NUMBER_OF_K_MERS; ++j)
+	  std::cout << static_cast<int>(spectrumIndexes[i].first[j]) << '\t';
+	std::cout << '\n';
+      }
+
+    exit(EXIT_FAILURE);
+#endif // MY_DEBUG
+
+    } // if (! noKmerFilter)
+
+    if (noKmerFilter) {
+      sortedStrings = lexicographicSequencePointers;
+      lexicographicIndexOfSortedStringsIndexes.resize(sortedStrings.size());
+      for (Index i = 0; i < static_cast<Index>(sortedStrings.size()); ++i)
+	lexicographicIndexOfSortedStringsIndexes[i] = i;
+
+      // Go through each thread and resize all the threads
+      int num_threads_to_use = num_threads;
+      int interval_size = static_cast<int>(ceil(sortedStrings.size() / static_cast<double>(num_threads)));//sortedStrings.size() / num_threads_to_use;
+
+      if (sortedStrings.size() < num_threads) {
+        num_threads_to_use = 1; //sortedStrings.size(); //1; //sortedStrings.size();
+        interval_size = sortedStrings.size();//1;//sortedStrings.size();
+      }
+
+      //std::cout << "Using threads: " <<  num_threads_to_use << "\n";
+      //flush(std::cout);
+      
+      
+      running_threads = 0;
+      //std::cout << "Sorted strings size: " << sortedStrings.size() << "\n"; 
+      //flush(std::cout);
+
+#if CHRIS_DEBUG
+       std::cout << "Size: " <<  sortedStrings.size() << "\n";
+      //std::cout << "Building first trees for table: " << i << " from: " << chunkIndexes[i].first << " to " << end << ", interval_size " << chunkIndexes[i].second << "\n" << std::flush;
+#endif // CHRIS_DEBUG
+
+      for (unsigned int i = 0, currentTable = 0; i < sortedStrings.size(); i += interval_size, ++currentTable) {
+        unsigned int end = i + interval_size;
+        if (i + interval_size > sortedStrings.size()) {
+          end = sortedStrings.size();
+        }
+        else {
+          end = i + interval_size;
+        }
+
+        sortedStringsLengthsMinTree[currentTable].resize(2 * (end - i));
+        sortedStringsLengthsMaxTree[currentTable].resize(2 * (end - i));
+  
+#if CHRIS_DEBUG
+        std::cout << "Building first trees from: " << i << " to " << end << "\n"; 
+#endif // CHRIS_DEBUG
+
+        
+        buildMinTree(sequenceLengths.begin() + i, sequenceLengths.begin() + end, sortedStringsLengthsMinTree[currentTable].begin());
+        buildMaxTree(sequenceLengths.begin() + i, sequenceLengths.begin() + end, sortedStringsLengthsMaxTree[currentTable].begin());
+      }
+
+      remainingAtLastDbUpdate = numberOfInputSequences;
+    }
+
+    marked.resize(numberOfInputSequences);
+    std::fill(marked.begin(), marked.end(), false);
+
+    markedGrey.resize(numberOfInputSequences);
+    std::fill(markedGrey.begin(), markedGrey.end(), false);
+    
+    numberOfMarked = 0;
+    
+
+    // This MUST be called before any call to any of the search functions.
+    initialize();
+    
+
+    // Loop over predetermined cluster centers.
+    for (sequence::Fasta::const_iterator predeterminedIterator = predeterminedCentersFasta.begin(); predeterminedIterator != predeterminedCentersFasta.end();  ++predeterminedIterator) {
+      querySequencePointer = predeterminedIterator->sequence.c_str();
+      querySequenceLength = predeterminedIterator->sequence.length();
+      queryHeaderPointer = predeterminedIterator->header.c_str();
+      queryHeaderLength = predeterminedIterator->header.length();
+
+      makeCluster();
+
+    } // for
+
+
+    if (recruit_only) {
+      // If we are assigning ambiguous reads based on abundance, handle it here.
+      if (assignAmbiguous) {
+        /* initialize random seed: */
+        if (random_seed == 0) {
+          srand(time(NULL));
+        } else {
+          srand(random_seed);
+        }
+//        std::cout << "********" << "\n";
+      
+        typedef std::map< std::string, std::vector<std::string> >::const_iterator MapIterator;
+        for (MapIterator iter = inverted_index.begin(); iter != inverted_index.end(); iter++)
+        {
+            //std::cout << "Key: " << iter->first << std::endl << "Values:" << std::endl;
+
+            if (iter->second.size() == 1) {
+              final_clusters[iter->second[0]].push_back(iter->first);
+            } 
+            else{
+              std::vector<std::string> bins;
+              typedef std::vector<std::string>::const_iterator ListIterator;
+              for (ListIterator center_iter = iter->second.begin(); center_iter != iter->second.end(); center_iter++) {
+                  for (int z = 0; z < center_nonambig_counts[*center_iter]; ++z) {
+                    bins.push_back(*center_iter);
+                  }
+              }
+
+              int index = rand() % bins.size();
+
+#if ABUN_DEBUG
+              std::cout << "Bins: " << bins << ", selected index: " << index << 
+                  "(" << bins[index] << ")" << "\n";
+#endif // ABUN_DEBUG
+
+              final_clusters[bins[index]].push_back(iter->first);
+  //            std::cout << " " << *list_iter << std::endl;
+            }
+        }
+
+        for (MapIterator iter = final_clusters.begin(); iter != final_clusters.end(); iter++) {
+          std::cout << iter->first << "\t";
+
+          typedef std::vector<std::string>::const_iterator ListIterator;
+          for (ListIterator seq_iter = iter->second.begin(); seq_iter != iter->second.end(); seq_iter++) {
+            std::cout << *seq_iter << "\t";
+          }
+
+          std::cout << "\n";
+        }
+      }
+
+      return EXIT_SUCCESS;
+    }
+
+#if TIMING
+    boost::posix_time::ptime beginning = boost::posix_time::second_clock::local_time();
+#endif // TIMING
+
+    for (Index i = 0; i < numberOfInputSequences; ++i) {
+
+      Index lexicographicIndex = lengthLexicographicIndexes[i].second;
+      if (!marked[lexicographicIndex]) {
+	if (!markedGrey[lexicographicIndex]) {
+	  markedGrey[lexicographicIndex] = true;
+	  marked[lexicographicIndex] = true;
+	  ++numberOfMarked;
+
+
+	if ((numberOfInputSequences - numberOfMarked < remainingAtLastDbUpdate * 0.5) && (remainingAtLastDbUpdate > numberOfInputSequences * 0.05) && (numberOfInputSequences - numberOfMarked > 1)) {
+	  if (! noKmerFilter) {
+
+	    Index i = 0;
+	    Index j = remainingAtLastDbUpdate - 1;
+	  
+	    while (i < j) {
+	      while (!marked[spectrumIndexes[i].second])
+		++i;
+	      while (marked[spectrumIndexes[j].second])
+		--j;
+	      
+	      if (i < j) {
+		std::swap(spectrumIndexes[i], spectrumIndexes[j]);
+
+		++i;
+		--j;
+	      } // if (i < j)
+	    } // while (i < j)
+
+	    remainingAtLastDbUpdate = numberOfInputSequences - numberOfMarked;
+
+#if TIMING								
+	    std::cerr << (boost::posix_time::second_clock::local_time() - beginning)  << '\t' << "Database update.\n" << std::flush;
+#endif // TIMING
+							
+	    clusterSort(0, remainingAtLastDbUpdate, 0);
+	    
+#if TIMING								
+	    std::cerr << (boost::posix_time::second_clock::local_time() - beginning)  << '\t' << "...done.\n" << std::flush;
+#endif // TIMING				
+	  } // if (! noKmerFilter)
+	  else { // No k-mer filter
+
+	    ConstCharPointerVector newSortedStrings;
+	    PositionVector newSortedStringsLengths;
+	    IndexVector newLexicographicIndexOfSortedStringsIndexes;
+	  
+
+	    for (Index i = 0; i < static_cast<Index>(sortedStrings.size()); ++i) {
+	      Index lexicographicIndex = lexicographicIndexOfSortedStringsIndexes[i];
+	      if (! marked[lexicographicIndex]){
+		newSortedStrings.push_back(lexicographicSequencePointers[lexicographicIndex]);
+		newLexicographicIndexOfSortedStringsIndexes.push_back(lexicographicIndex);
+		newSortedStringsLengths.push_back(sequenceLengths[lexicographicIndex]);
+	      }
+	    }
+					 
+	    sortedStrings.swap(newSortedStrings);
+	    lexicographicIndexOfSortedStringsIndexes.swap(newLexicographicIndexOfSortedStringsIndexes);
+	  
+      size_t num_threads_to_use = num_threads;
+      int interval_size = static_cast<int>(ceil(sortedStrings.size() / static_cast<double>(num_threads)));//sortedStrings.size() / num_threads_to_use;
+
+      if (sortedStrings.size() < num_threads) {
+        num_threads_to_use = 1;//sortedStrings.size(); //1;
+        interval_size = sortedStrings.size();//1; //sortedStrings.size();
+      }
+      
+#if CHRIS_DEBUG
+       std::cout << "Size: " <<  sortedStrings.size() << "\n";
+      //std::cout << "Building first trees for table: " << i << " from: " << chunkIndexes[i].first << " to " << end << ", interval_size " << chunkIndexes[i].second << "\n" << std::flush;
+#endif // CHRIS_DEBUG
+      
+      for (unsigned int i = 0, currentTable = 0; i < sortedStrings.size() ; i += interval_size, ++currentTable) {
+
+        unsigned int end = i + interval_size;
+        if (i + interval_size > sortedStrings.size()) {
+          end = sortedStrings.size();
+        }
+        else {
+          end = i + interval_size;
+        }
+        sortedStringsLengthsMinTree[currentTable].resize(2 * (end - i));
+        sortedStringsLengthsMaxTree[currentTable].resize(2 * (end - i));
+
+#if CHRIS_DEBUG
+        std::cout << "Building second trees from: " << i << " to " << end << "\n" << std::flush; 
+        std::cout << "Size of sorted string min/max: " << sortedStringsLengthsMinTree[currentTable].size() << ", " 
+            << sortedStringsLengthsMaxTree[currentTable].size() << "\n" << std::flush;
+        //std::cout << "Building first trees for table: " << i << " from: " << chunkIndexes[i].first << " to " << end << ", interval_size " << chunkIndexes[i].second << "\n" << std::flush;
+#endif // CHRIS_DEBUG
+        buildMinTree(newSortedStringsLengths.begin() + i, newSortedStringsLengths.begin() + end, sortedStringsLengthsMinTree[currentTable].begin());
+        buildMaxTree(newSortedStringsLengths.begin() + i, newSortedStringsLengths.begin() + end, sortedStringsLengthsMaxTree[currentTable].begin());
+      }
+
+	    remainingAtLastDbUpdate = numberOfInputSequences - numberOfMarked;
+	  } // else
+	} // if ((numberOfInputSequences - numberOfMarked < remainingAtLastDbUpdate * 0.5) && (remainingAtLastDbUpdate > numberOfInputSequences * 0.05))
+	querySequencePointer = lexicographicSequencePointers[lexicographicIndex];
+	querySequenceLength = sequenceLengths[lexicographicIndex];
+	queryHeaderPointer = headerPointers[lexicographicIndex];
+	queryHeaderLength = headerLengths[lexicographicIndex];
+
+	makeCluster();
+	
+	} // if (!markedGrey[lexicographicIndex])
+      } // if (!marked[lexicographicIndex])
+
+
+    } // for (Index i = 0; i < numberOfInputSequences; ++i)
+
+    return EXIT_SUCCESS;
+  }
+} // main
diff --git a/fasta.cpp b/fasta.cpp
new file mode 100644
index 0000000..139a9b0
--- /dev/null
+++ b/fasta.cpp
@@ -0,0 +1,80 @@
+#include "fasta.hpp"
+
+#include <boost/algorithm/string/trim.hpp>
+using boost::algorithm::trim;
+
+#include <algorithm>
+using std::min;
+
+#include <ostream>
+using std::endl;
+
+#include <stdexcept>
+using std::runtime_error;
+
+
+namespace sequence{
+
+  FastaRecord::FastaRecord(const string &h)
+    : header(h)
+  {
+  }
+  
+  FastaRecord::FastaRecord()
+  {
+  }
+
+  istream &operator>>(istream &inputStream, FastaRecord &fr)
+  {
+    fr.header.clear();
+    fr.sequence.clear();
+
+    if('>' != inputStream.get())
+      throw runtime_error("Error in FASTA file format.");
+    getline(inputStream, fr.header);
+    while(inputStream.good() && '>' != inputStream.peek()){
+      string line;
+      getline(inputStream, line);
+      fr.sequence += line;
+    }
+    return inputStream;
+  }
+
+  ostream &operator<<(ostream &outputStream, const FastaRecord &fr)
+  {
+    outputStream << '>' << fr.header;
+    
+    for(string::size_type i = 0; i < fr.sequence.length(); ++i){
+      if(0 == i % CHARS_PER_LINE)
+	outputStream << '\n';
+      outputStream << fr.sequence[i];
+    }
+
+    outputStream << endl;
+
+    return outputStream;
+  }
+  
+  istream &operator>>(istream &inputStream, Fasta &fa)
+  {
+    fa.clear();
+    string line;
+    while(getline(inputStream, line))
+      if(not line.empty() and '>' == line[0])
+	fa.push_back(FastaRecord(line.substr(1)));
+      else{
+       	trim(line);
+	if(not line.empty())
+	  fa.rbegin()->sequence += line;
+      }
+    return inputStream;
+  }
+
+  ostream &operator<<(ostream &outputStream, const Fasta &fa)
+  {
+    for(Fasta::const_iterator i = fa.begin(); i != fa.end(); ++i)
+      outputStream << *i;
+    return outputStream;
+  }
+  
+}
diff --git a/fasta.hpp b/fasta.hpp
new file mode 100644
index 0000000..6d533f6
--- /dev/null
+++ b/fasta.hpp
@@ -0,0 +1,35 @@
+#ifndef FASTA_HPP
+#define FASTA_HPP
+
+#include <string>
+#include <istream>
+#include <ostream>
+#include <vector>
+
+namespace sequence
+{
+  using std::string;
+  using std::vector;
+  using std::istream;
+  using std::ostream;
+
+  const int CHARS_PER_LINE = 60;
+  
+  struct FastaRecord
+  {
+    FastaRecord();
+    FastaRecord(const string &);
+    string header;
+    string sequence;
+  };
+
+  istream &operator>>(istream &, FastaRecord &);
+  ostream &operator<<(ostream &, const FastaRecord &);
+  
+  typedef vector<FastaRecord> Fasta;
+  
+  istream &operator>>(istream &, Fasta &);
+  ostream &operator<<(ostream &, const Fasta &);
+}
+
+#endif
diff --git a/fastaselect.cpp b/fastaselect.cpp
new file mode 100644
index 0000000..72cf7ab
--- /dev/null
+++ b/fastaselect.cpp
@@ -0,0 +1,115 @@
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <map>
+#include <algorithm>
+#include "fasta.hpp"
+#include <boost/program_options.hpp>		
+namespace po = boost::program_options;
+
+template <class InputIterator>
+std::string firstWord(InputIterator first, InputIterator last)
+{
+  return std::string(first, std::find_if(first, last, isspace));
+}
+
+int main(int argc, char *argv[])
+{
+
+  bool produce_help;
+  bool cluster_centers;
+  bool all_clusters;
+  bool everything_except;
+  bool cluster_sequences;
+  std::string fasta_file_name;
+  std::string clusters_file_name_prefix;
+  const std::string FASTA_SUFFIX = "fasta";
+
+  po::options_description desc("Options");
+  desc.add_options()
+    ("help,h", po::bool_switch(&produce_help), "Produce help message.")
+    ("fasta-file,f", po::value<std::string>(&fasta_file_name), "REQUIRED - File containing sequences in FASTA format.")
+    ("cluster-centers,c", po::bool_switch(&cluster_centers), "Write the sequences of all cluster centers to standard output in FASTA format.")
+    ("all-clusters,a", po::bool_switch(&all_clusters), "Write the sequences of each cluster to a seperate FASTA file. The name of the files will be the given path and prefix, folloed by cluster number andy a \'.fasta\' suffix.")
+    ("file-name-prefix,p", po::value<std::string>(&clusters_file_name_prefix)->default_value("./cluster-"), "Specify the path and prefix for the cluster FASTA file names.")
+    ("everything-except", po::bool_switch(&everything_except), "Write all sequences except cluster centers to the standard output in FASTA format.")
+    ("cluster-sequences", po::bool_switch(&cluster_sequences), "Write the cluster sequences (but not the cluster center sequence) to the standard output in FASTA format.")
+    ;
+
+  po::variables_map vm;
+  po::store(po::command_line_parser(argc, argv).options(desc).run(), vm);
+  notify(vm);
+
+  if (produce_help || fasta_file_name.empty()) {
+    std::cerr << "Usage: fastaselect FastaFile\n"
+	      << " The ids are read from standard input. Each line corresponds to a cluster with the first id being the cluster center.\n"
+	      << " The output sequences are written to standard output in FASTA format.\n"
+	      << desc << '\n';
+    exit(EXIT_FAILURE);
+  }
+
+
+  std::ifstream fasta_file(fasta_file_name.c_str());
+  sequence::Fasta sequences;
+  fasta_file >> sequences;
+
+  typedef std::map<std::string, size_t> StringIndexMap;
+
+  StringIndexMap index_of_id;
+
+  for (size_t i = 0; i < sequences.size(); ++i) {
+    const std::string &header = sequences[i].header;
+    index_of_id[firstWord(header.begin(), header.end())] = i;
+  }
+
+
+  if (cluster_centers) {
+    std::string line;
+    while (getline(std::cin, line)) {
+      std::string center_id = firstWord(line.begin(), line.end());
+      std::cout << sequences[index_of_id[center_id]];
+    }
+
+  } else if (everything_except) {
+    std::set<std::string> center_ids;
+    {
+      std::string line;
+      while (getline(std::cin, line)) 
+	center_ids.insert(firstWord(line.begin(), line.end()));
+    }
+    for (size_t i = 0; i < sequences.size(); ++i) {
+      const std::string &header = sequences[i].header;
+      std::string id = firstWord(header.begin(), header.end());
+      if (center_ids.find(id) == center_ids.end())
+	std::cout << sequences[i];
+    }
+  } else if (cluster_sequences) {
+    std::string line;
+    while (getline(std::cin, line)) {
+      std::istringstream line_stream(line);
+      std::string id;
+      line_stream >> id;
+      while (line_stream >> id)
+	std::cout << sequences[index_of_id[id]];
+    }
+  }
+
+  if (all_clusters) {
+    std::string line;
+    int cluster_number = 0;
+    while (getline(std::cin, line)) {
+      std::ostringstream cluster_file_name_stream;
+      cluster_file_name_stream << clusters_file_name_prefix << ++cluster_number << '.' << FASTA_SUFFIX;
+      std::ofstream cluster_file(cluster_file_name_stream.str().c_str());
+
+      std::istringstream line_stream(line);
+      std::string id;
+      while (line_stream >> id)
+	cluster_file << sequences[index_of_id[id]];
+      
+    }
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/fastasort.cpp b/fastasort.cpp
new file mode 100644
index 0000000..1c3735b
--- /dev/null
+++ b/fastasort.cpp
@@ -0,0 +1,69 @@
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <map>
+#include <algorithm>
+#include "fasta.hpp"
+#include <boost/program_options.hpp>		
+namespace po = boost::program_options;
+
+
+struct longer_sequence
+{
+  bool operator()(const sequence::FastaRecord &r1, const sequence::FastaRecord &r2)
+  {
+    return r1.sequence.length() > r2.sequence.length();
+  }
+};
+
+int main(int argc, char *argv[])
+{
+
+  bool produce_help = false;
+  bool do_random_shuffle = false;
+  // bool lexicographic = false;
+
+  po::options_description desc("Options");
+  desc.add_options()
+    ("help,h", po::bool_switch(&produce_help), "Produce help message.")
+    ("random-shuffle,r", po::bool_switch(&do_random_shuffle), "Randomly shuffle sequences.")
+    // ("lexicographic,l", po::bool_switch(&lexicographic), "Sort sequences in lexicographical order.")
+    ;
+
+  po::variables_map vm;
+  po::store(po::command_line_parser(argc, argv).options(desc).run(), vm);
+  notify(vm);
+
+  if (produce_help) {
+    std::cerr << "Usage: fastasort [OPTIONS]\n"
+	      << " The FASTA sequences are read from the STDIN.\n"
+	      << " The sorted sequences are written to the STDOUT in FASTA format.\n"
+	      << desc << '\n';
+    exit(EXIT_FAILURE);
+  }
+
+
+  sequence::Fasta sequences;
+  std::cin >> sequences;
+
+  if (do_random_shuffle) {
+    std::random_shuffle(sequences.begin(), sequences.end());
+    std::cout << sequences;
+  } else {
+    // sort by length. the first sequence is the longest. 
+    typedef std::pair<size_t, size_t> length_index_pair;
+
+    std::vector<length_index_pair> length_indexes;
+
+    for (size_t i = 0; i < sequences.size(); ++i)
+      length_indexes.push_back(std::make_pair(sequences[i].sequence.length(), i));
+
+    std::sort(length_indexes.begin(), length_indexes.end(), std::greater<length_index_pair>());
+    
+    for (size_t i = 0; i < length_indexes.size(); ++i)
+      std::cout << sequences[length_indexes[i].second];
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/find-large-clusters b/find-large-clusters
new file mode 100755
index 0000000..3b4e2e9
--- /dev/null
+++ b/find-large-clusters
@@ -0,0 +1,94 @@
+#!/bin/bash
+minimum_cluster_size=3
+similarity=0.98
+threads=1
+
+print_help()
+{
+    fold --spaces <<EOF
+Usage: find-large-clusters [OPTIONS...]
+Fast and approximate method for finding large clusters. It first clusters a small subset of sequences, then uses the centers of the clusters that are larger than the specified threshold to recruit from the rest of the sequences. Note that this method does not cluster all of the sequences. Hopefully, however, we will find most the of large clusters.
+
+  -m SIZE            The minimum size of the clusters whose centers are
+                     uesd in the second phase (default=3)
+  -s SIZE            The size of set of sample sequences for finding the
+                     centers of large clusters (default=n^0.8)
+  -r SIMILARITY      Set similarity between cluster center and cluster
+                     sequences (default=0.98)
+  -t THREADS         Set the number of threads to use
+  -a                 Produce multiple sequence alignment for each cluster
+  -v                 Print verbose messages to standard error
+  -h                 Give this help list
+
+The sequences to be clustered are read from the STDIN. The cluster centers are written to STDOUT. Messages are written to STDERR.
+EOF
+}
+
+while getopts "m:s:r:t:avh" option
+do
+    case $option in
+	m) minimum_cluster_size="$OPTARG";;
+	s) sample_count="$OPTARG";;
+	r) similarity="$OPTARG";;
+  t) threads="$OPTARG";;
+	a) multiple_alignment="--multiple-alignment";;
+	v) verbose=0;;
+	h) print_help; exit 0;;
+	[?]) print_help; exit 1;;
+    esac
+done
+
+print_message()
+{
+    if [ $verbose ]
+    then
+	echo "`date +%T` $1" >&2
+    fi
+}
+
+dnaclust_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+tempdir=`mktemp -d -p .`
+#trap "rm -fr $tempdir" EXIT
+
+sequences_sorted=`mktemp -p $tempdir`
+# Reads the sequences from STDIN.
+print_message "Reading and sorting the input sequences."
+"$dnaclust_path/fastasort" --random-shuffle > $sequences_sorted 
+
+input_count=`grep -c '^>' $sequences_sorted`
+if [ $input_count -le 1 ]
+then
+    echo "ERROR: Input sequences must contain more than one sequence." >&2
+    exit 1
+fi
+
+if [ -z $sample_count ]
+then
+    sample_count_float=`echo "e(l($input_count)*0.8)" | bc -l`
+    sample_count=`echo "($sample_count_float * 2 + 1) / 2" | bc`
+fi
+
+sample_lines=`awk --assign=count=$sample_count '/^>/{if (++n > count) {print FNR - 1; exit}}' $sequences_sorted`
+
+
+print_message "Splitting $input_count input sequences into a sample of size $sample_count, and the rest."
+sequences_sorted_head=`mktemp -p $tempdir`
+head -n $sample_lines $sequences_sorted > $sequences_sorted_head
+
+large_clusters=`mktemp -p $tempdir`
+print_message "Clustering the set of sample sequences, and extracting centers of large clusters."
+"$dnaclust_path/dnaclust" -s $similarity -t $threads -i $sequences_sorted_head \
+    | awk --assign=minimum=$minimum_cluster_size '{if (NF >= minimum) print $0}' > $large_clusters
+
+sequences_sorted_head_large_centers=`mktemp -p $tempdir`
+"$dnaclust_path/fastaselect" -c -f $sequences_sorted_head < $large_clusters > $sequences_sorted_head_large_centers
+
+sequences_minus_centers=`mktemp -p $tempdir`
+"$dnaclust_path/fastaselect" --everything-except -f $sequences_sorted < $large_clusters > $sequences_minus_centers
+
+centers_count=`grep -c '^>' $sequences_sorted_head_large_centers`
+print_message "Recruiting from the rest of the sequences, using $centers_count centers."
+"$dnaclust_path/dnaclust" -s $similarity -t $threads $multiple_alignment --no-k-mer-filter -i $sequences_minus_centers -p $sequences_sorted_head_large_centers -r
+
+print_message "Done."
diff --git a/generate_test_clusters.sh b/generate_test_clusters.sh
new file mode 100755
index 0000000..a720e93
--- /dev/null
+++ b/generate_test_clusters.sh
@@ -0,0 +1,15 @@
+#! /usr/bin/sh
+
+
+for test_name in small_test.fna test_600k.fna ; do 
+    for threads in 1 2 3 4 ; do
+    
+        #run_cmd="${cmd} --no-k-mer-filter "
+        eval time ./dnaclust --no-k-mer-filter -t ${threads} ../dnaclust/${test_name} > results/${test_name}.nk.${threads}.clusters
+        awk '{print NF}' results/${test_name}.nk.${threads}.clusters > results/${test_name}.nk.${threads}.counts 
+
+        eval time ./dnaclust -t ${threads} ../dnaclust/${test_name} > results/${test_name}.k.${threads}.clusters
+        awk '{print NF}' results/${test_name}.k.${threads}.clusters > results/${test_name}.k.${threads}.counts 
+
+    done
+done
diff --git a/multi_dim.hpp b/multi_dim.hpp
new file mode 100644
index 0000000..9dac7f5
--- /dev/null
+++ b/multi_dim.hpp
@@ -0,0 +1,156 @@
+#ifndef MULTI_DIM_HPP
+#define MULTI_DIM_HPP
+
+#include <boost/array.hpp>
+#include <vector>
+#include <cassert>
+#include <stdexcept>
+
+namespace multi_dim{
+  using boost::array;
+  using std::vector;
+  using std::range_error;
+
+  template<typename ElementType_, std::size_t SIZE1_, std::size_t SIZE2_>
+  class Array2D{
+    
+  private:
+    typedef array<ElementType_, SIZE1_ * SIZE2_> Container_;
+
+  public:
+    typedef typename Container_::value_type      value_type;
+    typedef typename Container_::reference       reference;
+    typedef typename Container_::const_reference const_reference;
+    typedef typename Container_::size_type       size_type;
+
+  public:
+    reference at(const size_type i, const size_type j)
+    {
+      rangeCheck_(i, j);
+      return buffer_[i * SIZE2_ + j];
+    }
+
+    const_reference at(const size_type i, const size_type j) const
+    {
+      rangeCheck_(i, j);
+      return buffer_[i * SIZE2_ + j];
+    }
+
+    inline reference operator()(const size_type i, const size_type j)
+    {
+      return buffer_[i * SIZE2_ + j];
+    }
+
+    inline const_reference operator()(const size_type i, const size_type j) const
+    {
+      return buffer_[i * SIZE2_ + j];
+    }
+
+    size_type size1() const
+    {
+      return SIZE1_;
+    }
+    size_type size2() const
+    {
+      return SIZE2_;
+    }
+
+  private:
+    void rangeCheck_(const size_type i, const size_type j) const
+    {
+      if((i >= SIZE1_) || (j >= SIZE2_))
+	throw range_error();
+    }
+      
+
+  private:
+    Container_ buffer_;
+  };
+
+  template<typename ElementType_>
+  class Matrix{
+  private:
+    typedef std::vector<ElementType_>   Container_;
+  public:
+    typedef typename Container_::value_type              value_type;
+    typedef typename Container_::reference               reference;
+    typedef typename Container_::const_reference         const_reference;
+    typedef typename Container_::size_type               size_type;
+
+
+  public:
+    Matrix(const size_type nrows, const size_type ncols)
+      : buffer_(nrows * ncols), size1_(nrows), size2_(ncols)
+    {}
+
+
+#if MATRIX_RESIZABLE
+    
+    Matrix()
+      : buffer_(), size1_(0), size2_(0)
+    {}
+
+    void clear()
+    {
+      size1_ = 0;
+      size2_ = 0;
+      buffer_.clear();
+    }
+
+    void resize(const size_type nrows, const size_type ncols)
+    {
+      size1_ = nrows;
+      size2_ = ncols;
+      buffer_.resize(nrows * ncols);
+    }
+#endif // MATRIX_RESIZABLE
+
+    reference at(const size_type i, const size_type j)
+    {
+      rangeCheck_(i, j);
+      return buffer_[i * size2_ + j];
+    }
+    
+    const_reference at(const size_type i, const size_type j) const
+    {
+      rangeCheck_(i, j);
+      return buffer_[i * size2_ + j];
+    }
+
+    inline reference operator()(const size_type i, const size_type j)
+    {
+      return buffer_[i * size2_ + j];
+    }
+    
+    inline const_reference operator()(const size_type i, const size_type j) const
+    {
+      return buffer_[i * size2_ + j];
+    }
+
+    size_type size1() const
+    {
+      return size1_;
+    }
+    
+    size_type size2() const
+    {
+      return size2_;
+    }
+
+  private:
+    void rangeCheck_(const size_type i, const size_type j) const
+    {
+      if((i >= size1_) || (j >= size2_))
+	throw range_error();
+    }
+      
+
+      
+  private:
+    Container_ buffer_;
+    size_type size1_, size2_;
+  };
+
+}
+
+#endif
diff --git a/search_include.cpp b/search_include.cpp
new file mode 100644
index 0000000..4d95744
--- /dev/null
+++ b/search_include.cpp
@@ -0,0 +1,209 @@
+
+#undef SEARCH_FUNCTION_NAME
+
+#if ALIGN_WITH_BACKPOINTER
+#define SEARCH_FUNCTION_NAME search
+#else // ALIGN_WITH_BACKPOINTER is false
+#define SEARCH_FUNCTION_NAME search_without_backpointer
+#endif // ALIGN_WITH_BACKPOINTER
+
+/*
+  Input:
+  Global: radious, similarity, sortedStrings.
+
+  Output:
+  Global: searchResults.
+*/
+
+
+void SEARCH_FUNCTION_NAME(const PositionType l, const PositionType r, PositionType pos, const Index intervalIndex, const uint16_t tableIndex, const int globalOffset)
+{
+
+  DpTable *table = tables[tableIndex];
+  std::vector<CharType> *localWord = &word[tableIndex];
+  std::vector<PositionType> *localWordMinLength = &wordMinLength[tableIndex];
+  std::vector<PositionType> *localLeft = &left[tableIndex];
+  std::vector<PositionType> *localRight = &right[tableIndex];
+  std::vector<PositionType> *localMinCost = &minCost[tableIndex];
+
+  CostType maxLengthRadious = std::min(static_cast<CostType>(floor((1 - similarity) * sortedStringsLengthsMaxTree[tableIndex][intervalIndex])), radious);
+  if (mismatches >= 0)
+    maxLengthRadious = static_cast<CostType>(radious);
+
+  PositionType minSortedStringsLength = sortedStringsLengthsMinTree[tableIndex][intervalIndex];
+  PositionType maxRight = querySequenceLength - minSortedStringsLength + radious;
+
+  do {
+
+
+    if (sortedStrings[globalOffset + l][pos] != sortedStrings[globalOffset + r][pos]) {
+      PositionType mid = (l + r) / 2;
+      Index leftIntervalIndex = intervalIndex + 1;
+      Index rightIntervalIndex = 2 * (mid - l + 1) + intervalIndex;
+      if (sortedStringsLengthsMinTree[tableIndex][leftIntervalIndex] < sortedStringsLengthsMinTree[tableIndex][rightIntervalIndex]) {
+	SEARCH_FUNCTION_NAME(l,       mid, pos, leftIntervalIndex, tableIndex, globalOffset);
+	SEARCH_FUNCTION_NAME(mid + 1, r,   pos, rightIntervalIndex, tableIndex, globalOffset);
+      } else {
+	SEARCH_FUNCTION_NAME(mid + 1, r,   pos, rightIntervalIndex, tableIndex, globalOffset);
+	SEARCH_FUNCTION_NAME(l,       mid, pos, leftIntervalIndex, tableIndex, globalOffset);
+      }
+      return;
+    } 
+
+    if (endOfStringChar == sortedStrings[globalOffset + l][pos]) {
+      SearchResult result;
+
+      PositionType minPos = std::max(0, (*localLeft)[pos] + pos);
+      for(PositionType i = minPos + 1; i <= (*localRight)[pos] + pos; ++i)
+	if ((*table)(pos, i) < (*table)(pos, minPos))
+	  minPos = i;
+      
+      // Check the number of differences is less than or equal to similarity times the length of the shorter sequence.
+      float acceptable_differences = pos * (1 - similarity);
+      if (mismatches >= 0)
+        acceptable_differences = mismatches;
+      if ((*table)(pos, minPos) <= acceptable_differences) {
+
+	if (outputMultipleAlignment) {
+	  boost::tie(result.cost, result.func) = findAlignmentFunction(pos, querySequenceLength, minPos);
+	}
+
+	// Try to lower the amount of time we try to claim the lock.
+	//if (l <= r){
+		//pthread_mutex_lock(&running_mutex);
+		for (PositionType i = l; i <= r; ++i) {
+		  result.number = i + globalOffset;
+		  threadSearchResults[tableIndex].push_back(result);
+		  //++resultsCounter[tableIndex];
+		} // for (PositionType i = l;; i <= r; ++i)
+
+      } // if (table(pos, minPos) <= pos * (1 - similarity))
+
+      return;
+    }
+    
+
+    if ((*localWord)[pos + 1] != sortedStrings[globalOffset + l][pos]
+	|| (*localWordMinLength)[pos + 1] > minSortedStringsLength
+	) {
+      // Row table(pos + 1, *) must be updated.
+      // Thus minCost[pos + 1] must be updated.
+
+      (*localWord)[pos + 1] = sortedStrings[globalOffset + l][pos];
+      (*localWord)[pos + 2] = invalidChar;
+
+      (*localWordMinLength)[pos + 1] = minSortedStringsLength;
+
+      {  // Update the row table(pos + 1, *)
+	const PositionType minColumn = std::max(0, (*localLeft)[pos] + (pos + 1));
+	const PositionType maxColumn = std::min(querySequenceLength, (*localRight)[pos] + (pos + 1));
+	CostType currentMinCost = MAX_COST;
+
+
+	CostType currentCost;
+#if ALIGN_WITH_BACKPOINTER
+	BackPointer currentPointer;
+#endif // ALIGN_WITH_BACKPOINTER
+	const PositionType i = pos + 1;
+	const CharType c1 = (*localWord)[i];
+	CharType c2; // = querySequencePointer[j - 1]
+	CostType matchCost, gapCost2, gapCost1;
+	
+	CostType leftCost, downLeftCost, downCost;
+
+
+	leftCost = MAX_COST;
+	if (minColumn > 0) {
+	  downLeftCost = (*table)(i - 1, minColumn - 1);
+	} else {
+	  downLeftCost = MAX_COST;
+	}
+	
+
+	for (PositionType j = minColumn; j <= maxColumn; ++j) {
+
+	  { // Update table(i, j) = table(pos + 1, j)
+	    
+#if ALIGN_WITH_BACKPOINTER
+	    currentPointer = ROOT;
+#endif // ALIGN_WITH_BACKPOINTER
+	    currentCost = MAX_COST;
+	    
+	    // c1 == word[i]
+	    c2 = querySequencePointer[j - 1];
+	    
+	    matchCost = downLeftCost + cost(c1, c2);
+	    if (matchCost < currentCost) {
+	      currentCost = matchCost;
+#if ALIGN_WITH_BACKPOINTER
+	      currentPointer = DOWN_LEFT;
+#endif // ALIGN_WITH_BACKPOINTER
+	    }
+	      
+	    gapCost2 = leftCost + cost(gapChar, c2);
+	    if (gapCost2 < currentCost) {
+	      currentCost = gapCost2;
+#if ALIGN_WITH_BACKPOINTER
+	      currentPointer = LEFT;
+#endif // ALIGN_WITH_BACKPOINTER
+	    }
+	    
+	    downCost = (*table)(i - 1, j);
+
+	    gapCost1 = downCost + cost(c1, gapChar);
+	    if (gapCost1 < currentCost) {
+	      currentCost = gapCost1;
+#if ALIGN_WITH_BACKPOINTER
+	      currentPointer = DOWN;
+#endif // ALIGN_WITH_BACKPOINTER
+	    }
+	    
+	    (*table)(i, j) = currentCost;
+#if ALIGN_WITH_BACKPOINTER
+	    backTable(i, j) = currentPointer;
+#endif // ALIGN_WITH_BACKPOINTER
+
+	    leftCost = currentCost;
+	    downLeftCost = downCost;
+	    
+	  } // Update table(i, j) = table(pos + 1, j)
+
+	  
+	  if (currentCost < currentMinCost)
+	    currentMinCost = currentCost;
+	} // for (PositionType j...
+
+
+	(*localMinCost)[pos + 1] = currentMinCost;
+      }
+
+      if ((*localRight)[pos] + (pos + 1) < querySequenceLength)
+	(*table)(pos + 1, (*localRight)[pos] + (pos + 1) + 1) = MAX_COST;
+      
+      
+      { // Find left[pos + 1].
+	PositionType j = std::max(-(pos + 1), (*localLeft)[pos]);
+	PositionType maxIndex = std::min(querySequenceLength - (pos + 1), (*localRight)[pos]);
+	while ((j <= maxIndex) && ((*table)(pos + 1, j + (pos + 1)) > radious))
+	  ++j;
+	if (j == -(pos + 1))
+	  (*localLeft)[pos + 1] = j - 1;
+	else
+	  (*localLeft)[pos + 1] = j;
+      }
+
+      { // Find right[pos + 1].
+	PositionType j = std::min(querySequenceLength - (pos + 1), (*localRight)[pos]);
+	PositionType minIndex = std::max(-(pos + 1), (*localLeft)[pos]);
+	while ((j >= minIndex) && ((*table)(pos + 1, j + (pos + 1)) > radious))
+	  --j;
+	
+	(*localRight)[pos + 1] = std::min(j, maxRight);
+
+      } // Find right[pos + 1]
+    
+    } // if (word[pos + 1] != sortedStrings[l][pos])
+    
+  } while ((*localMinCost)[++pos] <= maxLengthRadious);
+
+}
diff --git a/star-align b/star-align
new file mode 100755
index 0000000..9ff2714
--- /dev/null
+++ b/star-align
@@ -0,0 +1,44 @@
+#!/bin/bash
+similarity=0.80
+
+print_help()
+{
+    fold --spaces <<EOF
+Usage: star-align [OPTIONS...]
+Build a multiple sequence alignment (MSA) for a set of sequences, using star alignment heuristic.
+
+  -i FILE                    The file containing the sequences in FASTA format. 
+                             Note that the MSAs are built based on the ids
+                             read from STDIN.
+  -h                         Give this help list
+
+The ids of the sequnces to be aligned are read from STDIN. The MSAs are written to STDOUT. One MSA is generated for each line. The first sequence id on each line is used as the center of the 'star'.
+
+Example: To build a MSA for the first cluster generated by 'dnaclust' do:
+head -n 1 sequences.cluster | ./star-align -i sequences.fasta > alignment.fasta
+EOF
+}
+
+while getopts "i:h" option
+do 
+    case $option in
+	i) input_fasta="$OPTARG";;
+	h) print_help; exit 0;;
+	[?]) print_help; exit 1;;
+    esac
+done
+
+dnaclust_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+tempdir=`mktemp -d -p .`
+trap "rm -fr $tempdir" EXIT
+
+center_fasta=`mktemp -p $tempdir`
+sequences_fasta=`mktemp -p $tempdir`
+
+while read line
+do
+    echo $line | "$dnaclust_path/fastaselect" -f $input_fasta --cluster-centers > $center_fasta
+    echo $line | "$dnaclust_path/fastaselect" -f $input_fasta --cluster-sequences > $sequences_fasta
+    "$dnaclust_path/dnaclust" -s $similarity --multiple-alignment --no-k-mer-filter -i $sequences_fasta -p $center_fasta -r
+done
\ No newline at end of file
diff --git a/ternary_sort.hpp b/ternary_sort.hpp
new file mode 100644
index 0000000..2a1ef2d
--- /dev/null
+++ b/ternary_sort.hpp
@@ -0,0 +1,58 @@
+#ifndef TERNARY_SORT_HPP
+#define TERNARY_SORT_HPP
+
+#include <algorithm>
+
+namespace ternary_sort
+{
+
+  void vecswap(int i, int j, int n, const char *x[])
+  {   
+    while (n-- > 0) {
+      std::swap(x[i], x[j]);
+      i++;
+      j++;
+    }
+  } // void vecswap(int i, int j, int n, const char *x[])
+
+  void ssort1(const char *x[], int n, int depth)
+  {
+    int    a, b, c, d, r, v;
+    if (n <= 1)
+      return;
+    // Randomness here means that outputs will not always be the same. Maybe.
+    a = rand() % n;
+    std::swap(x[0], x[a]);
+    v = x[0][depth];
+    a = b = 1;
+    c = d = n-1;
+    while (true) {
+      while (b <= c && (r = x[b][depth]-v) <= 0) {
+	if (r == 0) { std::swap(x[a], x[b]); a++; }
+	b++;
+      }
+      while (b <= c && (r = x[c][depth]-v) >= 0) {
+	if (r == 0) { std::swap(x[c], x[d]); d--; }
+	c--;
+      }
+      if (b > c) break;
+      std::swap(x[b], x[c]);
+      b++;
+      c--;
+    }
+    r = std::min(a, b-a);     vecswap(0, b-r, r, x);
+    r = std::min(d-c, n-d-1); vecswap(b, n-r, r, x);
+    r = b-a; ssort1(x, r, depth);
+    if (x[r][depth] != 0)
+      ssort1(x + r, a + n-d-1, depth+1);
+    r = d-c; ssort1(x + n-r, r, depth);
+  } // void ssort1(const char *x[], int n, int depth)
+
+  void ternarySort(const char *x[], int n)
+  { 
+    ssort1(x, n, 0); 
+  } // void ternarySort(const char *x[], int n)
+
+} // namespace ternary_sort
+
+#endif // TERNARY_SORT_HPP
diff --git a/utility.hpp b/utility.hpp
new file mode 100644
index 0000000..3e2ef05
--- /dev/null
+++ b/utility.hpp
@@ -0,0 +1,20 @@
+#ifndef UTILITY_HPP
+#define UTILITY_HPP
+
+#include <vector>
+
+template<typename ElementType>
+inline const ElementType *getPointerOfVector(const std::vector<ElementType> &v)
+{
+  return v.empty() ? 0 : &v[0];
+}
+
+
+template<typename ElementType>
+inline ElementType *getPointerOfVector(std::vector<ElementType> &v)
+{
+  return v.empty() ? 0 : &v[0];
+}
+
+
+#endif // UTILITY_HPP

-- 
Alioth's /git/debian-med/git-commit-notice on /srv/git.debian.org/git/debian-med/dnaclust.git



More information about the debian-med-commit mailing list