[med-svn] [fsm-lite] 01/02: Imported Upstream version 0.0+20151109

Andreas Tille tille at debian.org
Fri Apr 8 19:18:30 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository fsm-lite.

commit 17b96ac19ef1ecba195e8923b58cbf18b96c4784
Author: Andreas Tille <tille at debian.org>
Date:   Fri Apr 8 21:17:29 2016 +0200

    Imported Upstream version 0.0+20151109
---
 Makefile          |  21 +++++
 README.md         |  33 ++++++++
 configuration.cpp | 136 +++++++++++++++++++++++++++++++
 configuration.h   |  30 +++++++
 default.h         |   4 +
 dependencies.mk   |  99 +++++++++++++++++++++++
 fsm-lite.cpp      | 207 +++++++++++++++++++++++++++++++++++++++++++++++
 input_reader.cpp  | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 input_reader.h    |  70 ++++++++++++++++
 9 files changed, 836 insertions(+)

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3ba0e35
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+SDSL_INSTALL_PREFIX=/Users/nvalimak/code/sdsl-lite-2.0.3
+
+CC=g++
+CPPFLAGS=-std=c++11 -I${SDSL_INSTALL_PREFIX}/include 
+DISBALEDOPTIMIZATIONFLAGS = -DNDEBUG -O3 -msse4.2
+LIBS=-lsdsl -ldivsufsort -ldivsufsort64
+OBJ = configuration.o input_reader.o 
+
+fsm-lite: fsm-lite.o $(OBJ)
+	$(CC) $(CPPFLAGS) -L${SDSL_INSTALL_PREFIX}/lib -o fsm-lite fsm-lite.o $(OBJ) $(LIBS)
+
+test: fsm-lite
+	./fsm-lite -l test.list -t tmp -v --debug -m 1
+
+clean:
+	rm -f fsm-lite *.o *~
+
+depend:
+	g++ -MM -std=c++11 -I${SDSL_INSTALL_PREFIX}/include *.cpp > dependencies.mk
+
+include dependencies.mk
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1b31975
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+Frequency-based String Mining (lite)
+===
+
+A singe-core implemetation of frequency-based substring mining. This
+implementation requires the https://github.com/simongog/sdsl-lite
+library (tested using the release `sdsl-lite-2.0.3`).
+
+1. Download and extract https://github.com/simongog/sdsl-lite/archive/v2.0.3.tar.gz
+2. install SDSL by running `./install.sh /install/path/sdsl-lite-2.0.3`, where `/install/path` need to be specified,
+3. update the correct SDSL installation path into the `fsm-lite/Makefile`,
+4. turn on preferred compiler optimization in `fsm-lite/Makefile`, and
+5. run `make depend && make` under the directory `fsm-lite`.
+
+For command-line options, see `./fsm-lite --help`.
+
+Usage example
+---
+
+Input files are given as a list of `<data-identifier>` `<data-filename>` pairs. The `<data-identifier>`'s are assumed to be unique. Here's an example how to construct such a list out of all `/input/dir/*.fasta` files:
+
+  `for f in /input/dir/*.fasta; do id=$(basename "$f" .fasta); echo $id $f; done > input.list` 
+
+The files can then be processed by 
+
+  `./fsm-lite -l input.list -t tmp | gzip - > output.txt.gz`
+
+where `tmp` is a prefix filename for storing temporary index files.
+
+TODO
+---
+1. Optimize the time and space usage.
+2. Multi-threading.
+3. Support for gzip compressed input.
diff --git a/configuration.cpp b/configuration.cpp
new file mode 100644
index 0000000..142e2d8
--- /dev/null
+++ b/configuration.cpp
@@ -0,0 +1,136 @@
+#include "configuration.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <cstdlib> // std::exit()
+#include <cassert>
+#include <getopt.h>
+using namespace std;
+
+configuration::configuration(int argc, char **argv)
+    : prog(argv[0]), listfile(""), tmpfile(""), inputformat(input_reader::input_fasta),
+      minlength(9), maxlength(100), minfreq(1), minsupport(2), maxsupport(~0u),
+      verbose(false), debug(false), good(false)
+{
+    assert (argc > 0);
+    good = parse(argc, argv);
+}
+
+int atoi_min(char const *value, int min, char const *parameter, char const *name)
+{
+    istringstream iss(value);
+    int i;
+    char c;
+    if (!(iss >> i) || iss.get(c))
+    {
+        cerr << "error: argument of " << parameter << " must be of type <int>, and greater than or equal to " << min << endl
+             << "Check " << name << " --help' for more information." << endl;
+        exit(1);
+    }
+
+    if (i < min)
+    {
+        cerr << "error: argument of " << parameter << " must be greater than or equal to " << min << endl
+             << "Check `" << name << " --help' for more information." << endl;
+        exit(1);
+    }
+    return i;
+}
+
+void configuration::print_short_usage() const
+{
+    cerr << "See " << prog << " --help for more information." << endl;
+    exit(1);
+}
+
+void configuration::print_usage() const
+{
+    cerr << "usage: " << prog << " -l <file> -t <file> [options]" << endl
+         << "  -l,--list <file>     Text file that lists all input files as whitespace-separated pairs "<< endl
+         << "                            <data-name> <data-filename>" << endl
+         << "                       where <data-name> is unique identifier (without whitespace)" << endl
+         << "                       and <data-filename> is full path to each input file." << endl
+         << "                       Default data file format is FASTA (uncompressed)." << endl
+         << "  -t,--tmp <file>      Store temporary index data" << endl
+         << "options:" << endl
+         << "  -m,--min <int>       Minimum length to report (default 9)" << endl
+         << "  -M,--max <int>       Maximum length to report (default 100)" << endl
+         << "  -f,--freq <int>      Minimum frequency per input file to report (default 1)" << endl
+         << "  -s,--minsupp <int>   Minimum number of input files with support to report (default 2)" << endl
+         << "  -S,--maxsupp <int>   Maximum number of input files with support to report (default inf)" << endl
+         << "  -v,--verbose         Verbose output" << endl;
+    exit(1);
+}
+
+bool configuration::parse(int argc, char **argv)
+{    
+    static struct option long_options[] =
+        {
+            {"list",      required_argument, 0, 'l'},
+            {"tmp",       required_argument, 0, 't'},
+            {"min",       required_argument, 0, 'm'},
+            {"max",       required_argument, 0, 'M'},
+            {"freq",      required_argument, 0, 'f'},
+            {"minsupp",   required_argument, 0, 's'},
+            {"maxsupp",   required_argument, 0, 'S'},
+            {"verbose",   no_argument,       0, 'v'},
+            {"debug",     no_argument,       0, 'D'},
+            {"help",      no_argument,       0, 'h'},
+            {0, 0, 0, 0}
+        };
+    int option_index = 0;
+    int c;
+    while ((c = getopt_long(argc, argv, "l:t:m:M:f:s:S:vDh",
+                            long_options, &option_index)) != -1) 
+    {
+        switch(c) 
+        {
+        case 'l':
+            listfile = string(optarg); break;
+        case 't':
+            tmpfile = string(optarg); break;
+        case 'm':
+            minlength = atoi_min(optarg, 1, "-m,--min", argv[0]); break;
+        case 'M':
+            maxlength = atoi_min(optarg, 1, "-M,--max", argv[0]); break;
+        case 'f':
+            minfreq = atoi_min(optarg, 1, "-f,--freq", argv[0]); break;
+        case 's':
+            minsupport = atoi_min(optarg, 1, "-s,--minsupp", argv[0]); break;
+        case 'S':
+            maxsupport = atoi_min(optarg, 1, "-S,--maxsupp", argv[0]); break;
+        case 'v':
+            verbose = true; break;
+        case 'D':
+            debug = true; break;
+        case 'h':
+            print_usage(); return 1;
+        default:
+            cerr << "error: invalid parameter!" << endl;
+            return false;
+        }
+    }
+    if (listfile == "")
+    {
+        cerr << "error: -l,--list is a required parameter" << endl;
+        return false;
+    }
+    if (tmpfile == "")
+    {
+        cerr << "error: -t,--tmp is a required parameter" << endl;
+        return false;
+    }
+
+    if (minlength > maxlength)
+    {
+        cerr << "error: -m,--min must be smaller than or equal to -M,--max" << endl;
+        return false;
+    }
+    if (minsupport > maxsupport)
+    {
+        cerr << "error: -s,--minsupp must be smaller than or equal to -S,--maxsupp" << endl;
+        return false;
+    }
+
+    return true;
+}
diff --git a/configuration.h b/configuration.h
new file mode 100644
index 0000000..d75e0aa
--- /dev/null
+++ b/configuration.h
@@ -0,0 +1,30 @@
+#ifndef CONFIGURATION_H
+#define CONFIGURATION_H
+#include "input_reader.h"
+#include <string>
+
+
+class configuration
+{
+public:
+    configuration(int, char **);
+    bool parse(int, char **);
+    void print_short_usage() const;
+    void print_usage() const;
+
+    std::string prog;
+    std::string listfile;
+    std::string tmpfile;
+    int inputformat;
+    unsigned minlength;
+    unsigned maxlength;
+    unsigned minfreq;
+    unsigned minsupport;
+    unsigned maxsupport;
+    bool verbose;
+    bool debug;
+    bool good;
+private:
+    configuration();
+};
+#endif
diff --git a/default.h b/default.h
new file mode 100644
index 0000000..055b8bc
--- /dev/null
+++ b/default.h
@@ -0,0 +1,4 @@
+
+// Compile time setting, which determines the maximum number of input files.
+// Note: Maximun is 2^DBITS, and increasing DBITS will increase memory usage.
+#define DBITS 12
diff --git a/dependencies.mk b/dependencies.mk
new file mode 100644
index 0000000..abde7f8
--- /dev/null
+++ b/dependencies.mk
@@ -0,0 +1,99 @@
+configuration.o: configuration.cpp configuration.h input_reader.h
+fsm-lite.o: fsm-lite.cpp default.h configuration.h input_reader.h \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_trees.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sdsl_concepts.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/uintx_t.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_arrays.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_bitcompressed.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/int_vector.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bits.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/structure_tree.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/config.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/util.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sfstream.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/ram_fs.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/ram_filebuf.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/io.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/memory_management.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/int_vector_buffer.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/iterators.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_array_helper.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_sampling_strategy.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_alphabet_strategy.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sd_vector.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/select_support_mcl.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/select_support.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support_v.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support_v5.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support_scan.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/select_support_scan.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_wt.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wavelet_trees.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_pc.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bit_vectors.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bit_vector_il.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rrr_vector.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rrr_helper.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/uint128_t.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/uint256_t.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rrr_vector_15.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_helper.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_blcd.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_gmr.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/vectors.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/enc_vector.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_fibonacci.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_elias_delta.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_elias_gamma.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_comma.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/vlc_vector.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/dac_vector.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_huff.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_hutu.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_int.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wm_int.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_rlmn.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_lcp.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_isa.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_bwt.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_algorithm.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_lcp_helper.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_sa.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/divsufsort.h \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/divsufsort64.h \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/qsufsort.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_sa_se.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_config.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/fast_cache.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_sada.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_array_algorithm.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_tree_algorithm.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_tree_helper.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sorted_multi_stack_support.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sorted_stack_support.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/cst_sct3.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_support_sada.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_byte.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_wt.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_vlc.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_dac.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_bitcompressed.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_support_tree.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_support_tree2.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_g.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/nearest_neighbour_dictionary.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_support.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_support_sparse_table.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_succinct_sct.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_sada.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_algorithm.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_succinct_sada.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_gg.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/cst_iterators.hpp \
+  /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/cst_sada.hpp
+input_reader.o: input_reader.cpp default.h input_reader.h configuration.h
diff --git a/fsm-lite.cpp b/fsm-lite.cpp
new file mode 100644
index 0000000..bcd06c5
--- /dev/null
+++ b/fsm-lite.cpp
@@ -0,0 +1,207 @@
+#include "default.h"
+#include "configuration.h"
+#include "input_reader.h"
+#include <sdsl/suffix_trees.hpp> // TODO: replace with csa+lcp array
+#include <sdsl/wt_algorithm.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib> // std::exit()
+using namespace std;
+
+typedef sdsl::cst_sct3<> cst_t;
+typedef sdsl::wt_int<> wt_t;
+typedef sdsl::bit_vector bitv_t;
+typedef cst_t::char_type char_type;
+typedef cst_t::node_type node_type;
+typedef wt_t::size_type size_type;
+
+/**
+ * Construct the sequence labels
+ *
+ * Assumes that the number of input files is less than 2^DBITS.
+ * The value of DBITS has to be set at compile time (in defaults.h).
+ * Large DBITS values result in large memory requirements for wt_init().
+ */
+void wt_init(wt_t &wt, bitv_t &separator, cst_t &cst, input_reader *ir, configuration &config)
+{
+    uint64_t n = cst.csa.size();
+    sdsl::int_vector<DBITS> labels(n, ~0u);
+    separator = bitv_t(n, 0);
+    uint64_t k = ir->size()-1;
+    uint64_t j = cst.csa.wavelet_tree.select(1, 0);
+    if (config.debug)
+        cerr << "bwt end marker pos = " << j << endl;
+    uint64_t bwtendpos = j;
+    j = cst.csa.lf[j];
+    labels[j] = 0;  // Label of last byte
+    separator[n-1] = 0;
+    separator[n-2] = 1;
+    j = cst.csa.lf[j];
+    for (uint64_t i = n-2; i > 0; i--) {
+        char_type c = cst.csa.bwt[j];
+        labels[j] = k;
+        if (c == '$')
+            k --;
+        if (c == '$' || c == '#')
+            separator[i-1] = 1;
+        
+        j = cst.csa.lf[j];
+    }
+    labels[j] = k;
+    if (j != bwtendpos || k != 0) // Assert
+    {
+        cerr << "Labeling failed, j = " << j << ", k = " << k << endl;
+        exit(1);
+    }
+
+    //TODO cleanup
+    /*for (uint64_t i = 0; i < n; ++i) 
+        cerr << cst.csa.text[i];
+    cerr << endl;
+    for (uint64_t i = 0; i < n; ++i) 
+        cerr << separator[i];
+    cerr << endl;
+    for (uint64_t i = 0; i < n; ++i) 
+        cerr << labels[cst.csa.isa[i]];
+    cerr << endl;
+    */  
+        
+    std::string tmp_file = sdsl::ram_file_name(sdsl::util::to_string(sdsl::util::pid())+"_"+sdsl::util::to_string(sdsl::util::id()));
+    sdsl::store_to_file(labels, tmp_file);
+    sdsl::int_vector_buffer<DBITS> text_buf(tmp_file);
+    wt = wt_t(text_buf, labels.size());
+    if (config.debug)
+        cerr << "wt size = " << wt.size() << ", n = " << n << endl;
+    j = 0;
+    for (uint64_t i = 0; i < ir->size(); ++i)
+        j += wt.rank(n, i);
+    if (j != n) // Assert
+    {
+        cerr << "Label sum failed, j = " << j << ", n = " << n << endl;
+        exit(1);
+    }
+    
+}
+    
+int main(int argc, char ** argv)
+{
+    configuration config(argc, argv);
+    if (!config.good)
+        config.print_short_usage();
+
+    if (config.verbose)
+        cerr << "Reading input files..." << endl;
+    input_reader *ir = input_reader::build(config);
+    if (config.verbose)
+        cerr << "Read " << ir->size() << " input files and " << ir->total_seqs() << " sequences of total length " << ir->total_size() << " (includes rev.compl. sequences)" << endl;
+    
+    /**
+     * Initialize the data structures
+     */
+    if (config.verbose)
+        cerr << "Constructing the data structures..." << endl;
+    cst_t cst;    
+    construct(cst, config.tmpfile + ".tmp", 1);
+    if (!cst.csa.size())
+    {
+        cerr << "error: unable to construct the data structure; out of memory?" << endl; 
+        abort();
+    }
+    
+    wt_t label_wt;
+    bitv_t separator;
+    wt_init(label_wt, separator, cst, ir, config);
+
+    bitv_t::rank_1_type sep_rank1(&separator);
+    //bitv_t::select_1_type sep_select1(&separator); TODO Remove?
+    assert(sep_rank1(cst.size()) == ir->total_seqs());
+    
+    size_type support = 0;
+    vector<wt_t::value_type> labels(ir->size(), 0);
+    vector<size_type> rank_sp(ir->size(), 0);
+    vector<size_type> rank_ep(ir->size(), 0);
+
+    if (config.verbose)
+        cerr << "Construction complete, the main index requires " << size_in_mega_bytes(cst) << " MiB plus " << size_in_mega_bytes(label_wt) << " MiB for labels." << endl;
+    
+    /**
+     * Main loop
+     */
+    node_type root = cst.root();
+    vector<node_type> buffer;
+    buffer.reserve(1024*1024);
+    for (auto& child: cst.children(root))
+        buffer.push_back(child);
+    while (!buffer.empty())
+    {
+        node_type const node = buffer.back();
+        buffer.pop_back();        
+        unsigned depth = cst.depth(node);
+        if (depth < config.maxlength)
+            for (auto& child: cst.children(node))
+                buffer.push_back(child);
+        if (depth < config.minlength)
+            continue;
+        if (cst.is_leaf(node))
+            continue;
+        
+        // Process the candidate node
+        size_type sp = cst.lb(node);
+        size_type ep = cst.rb(node);
+        node_type wn = cst.wl(node, cst.csa.bwt[sp]);
+        /*if (config.debug)
+        {
+            size_type pos = cst.csa[sp];
+            auto s = extract(cst.csa, pos, pos + depth - 1);
+            cerr << "at node = " << depth << "-[" << sp << "," << ep << "], wl = " << (wn != root);
+            if (wn!=root)
+                cerr << "[" << cst.rb(wn)-cst.lb(wn) << " vs " << ep-sp << "]";
+            cerr << ", seq = " << s << endl;
+            }*/
+        if (wn == root && config.debug)
+        {
+            cerr << "warning: no Weiner-link at " << depth << "-[" << sp << "," << ep << "]" << endl;
+            continue;
+        }
+        if (depth < config.maxlength && cst.rb(wn)-cst.lb(wn) == ep-sp)
+            continue; // not left-branching
+
+        sdsl::interval_symbols(label_wt, sp, ep+1, support, labels, rank_sp, rank_ep);
+        if (support < config.minsupport || support > config.maxsupport)
+            continue;
+        
+        size_type truesupp = 0;
+        for (size_type i = 0; i < support; ++i)
+            if (config.minfreq <= rank_ep[i]-rank_sp[i])
+                ++truesupp;
+        if (truesupp < config.minsupport)
+            continue;
+        
+        if (depth > config.maxlength)
+            depth = config.maxlength;
+        size_type pos = cst.csa[sp];
+        // Check for separator symbol TODO cleanup
+        /*unsigned p_depth = cst.depth(cst.parent(node));
+        if (sep_rank1(pos) != sep_rank1(pos + p_depth))
+            continue; // Separator occurs above parent node
+        if (sep_rank1(pos) != sep_rank1(pos + depth))
+            depth = sep_select1(sep_rank1(pos)+1) - pos +1; // Separator above current node      
+        */
+        
+        if (sep_rank1(pos) != sep_rank1(pos + depth))
+            continue;
+        auto s = extract(cst.csa, pos, pos + depth - 1);
+        if (input_reader::smaller_than_rev_cmpl(s))
+            continue;
+        cout << s + " |";
+        for (size_type i = 0; i < support; ++i)
+            if (config.minfreq <= rank_ep[i]-rank_sp[i])
+                cout << ' ' << ir->id(labels[i]) << ':' << rank_ep[i]-rank_sp[i];
+        cout << '\n';
+    }
+    
+    if (config.verbose)
+        cerr << "All done." << endl;    
+    delete ir; ir = 0;
+    return 0;
+}
diff --git a/input_reader.cpp b/input_reader.cpp
new file mode 100644
index 0000000..51ca830
--- /dev/null
+++ b/input_reader.cpp
@@ -0,0 +1,236 @@
+#include "default.h"
+#include "input_reader.h"
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <cstdlib> // abort()
+#include <cassert>
+using namespace std;
+
+input_reader * input_reader::build(configuration const &config)
+{
+    switch (config.inputformat)
+    {
+    case input_fasta:
+        return new fasta_input_reader(config);
+        break;
+    default:
+        cerr << "Error: invalid file type at InputReader::build()" << endl; 
+        abort();
+    }
+}
+
+    std::istream *fp;
+    std::vector<std::string> id_;
+    std::vector<std::size_t> size_;
+    std::size_t total_size_;
+    bool good_;
+
+input_reader::input_reader(string file)
+    : fp(0), id_(), size_(), total_size_(0), total_seqs_(0)
+{
+    // Open file handle, "-" uses standard input
+    if (file == "-")
+        fp = &std::cin;
+    else
+        fp = new ifstream(file.c_str());
+}
+
+void error_reading_input(configuration const &config, char const *message)
+{
+    cerr << message << endl;
+    config.print_short_usage();
+    exit(1);
+}
+
+/**
+ * Simple FASTA input
+ */
+void normalize(std::string &t)
+{
+    for (std::string::iterator it = t.begin(); it != t.end(); ++it)
+    {
+        switch (*it)
+        {
+        case('a'):
+            *it = 'A';
+            break;
+        case('c'):
+            *it = 'C';
+            break;
+        case('g'):
+            *it = 'G';
+            break;
+        case('t'):
+            *it = 'T';
+            break;
+        case('n'):
+            *it = 'N';
+            break;
+        case('A'):
+        case('C'):
+        case('G'):
+        case('T'):
+        case('N'):
+            break;
+        case('0'):
+        case('1'):
+        case('2'):
+        case('3'):
+        case('.'):
+            break;
+        default:
+            *it = 'N';
+            break;
+        }
+    }
+}
+
+void revcmpl(std::string &t)
+{
+    char c;
+    std::size_t n = t.size();
+    for (std::size_t i = 0; i < n / 2; ++i) {
+        c = t[i];
+        t[i] = t[n - i - 1];
+        t[n - i - 1] = c;
+    }
+    for (std::string::iterator it = t.begin(); it != t.end(); ++it)
+    {
+        switch (*it)
+        {
+        case('T'):
+            *it = 'A'; break;
+        case('G'):
+            *it = 'C'; break;
+        case('C'):
+            *it = 'G'; break;
+        case('A'):
+            *it = 'T'; break;
+        }
+    }
+}
+
+bool input_reader::smaller_than_rev_cmpl(string const& path)
+{
+    // Check if given path is smaller than its rev.compl.
+    string rc(path);
+    revcmpl(rc);
+    if (path < rc)
+        return true;
+    return false;
+}
+
+size_t append(ofstream &ofs, string const &fname, unsigned &nseqs)
+{
+    string row;
+    string buffer;
+    buffer.reserve(1024*1024);
+    ifstream ifs(fname.c_str());
+    bool first = true;
+    size_t n = 0;
+    while (std::getline(ifs, row).good())
+    {
+        if (row.empty())
+            continue;
+        if (row[0] == '>')
+        {
+            if (!first)
+            {
+                ofs << '#';                
+                nseqs++;
+                n ++;
+                revcmpl(buffer);
+                ofs << buffer << '#';
+                nseqs++;
+                n += buffer.size() + 1;
+                buffer.clear();
+            }
+            first = false;
+            continue;
+        }
+        if (row[row.size()-1] == '\n')
+            row.pop_back();
+        if (row[row.size()-1] == '\r')
+            row.pop_back();
+        normalize(row);
+        ofs << row;
+        buffer += row;
+        n += row.size();
+    }
+    if (n == 0)
+    {
+        cerr << "error: Unable to read the input file " << fname << endl;
+        exit(1);
+    }
+    ofs << '#';
+    nseqs ++;
+    n ++;
+    revcmpl(buffer);
+    ofs << buffer << '$';
+    nseqs++;
+    n += buffer.size() + 1;
+    buffer.clear();
+    return n;
+}
+
+bool file_exists (string const &name)
+{
+    ifstream f(name.c_str());
+    if (f.good())
+    {
+        f.close();
+        return true;
+    }
+    else
+    {
+        f.close();
+        return false;
+    }   
+}
+
+fasta_input_reader::fasta_input_reader(configuration const &config)
+    : input_reader(config.listfile)
+{
+    string tmpfile = config.tmpfile + ".tmp";
+    string metafile = config.tmpfile + ".meta";
+//    if (file_exists(tmpfile) && file_exists(metafile))
+// TODO        cerr << "warning: tmp files under " << config.tmpfile << " will be rebuilt" << endl;
+        
+    ofstream ofs(tmpfile.c_str());
+    ofstream ofsmeta(metafile.c_str());
+    // Parse the list file
+    string id, fname;
+    while (fp->good())
+    {
+        *fp >> id;
+        if (id.empty() || !fp->good())
+            break;
+        *fp >> fname;
+        if (fname.empty())
+            error_reading_input(config, "error: Unable to read the -l,--list input file: please check the list file format.");
+
+        id_.push_back(id);
+        size_t s = append(ofs, fname, total_seqs_);
+        size_.push_back(s);
+        total_size_ += s;
+        ofsmeta << id << "\t" << s << "\t" << endl;
+    }
+    if (id_.empty() || total_size_ == 0) 
+        error_reading_input(config, "error: Unable to read the -l,--list input file: please check the list file format.");
+    if (!ofs.good()) 
+        error_reading_input(config, "error: Unable to write the temporary files under the location -i,--index.");
+    if (!ofsmeta.good()) 
+        error_reading_input(config, "error: Unable to write the temporary files under the location -i,--index.");
+    
+    if ((1 << DBITS) <= size_.size())
+    {
+        cerr << "error: increase DBITS value to support more than " << (1<<DBITS)
+             << " input files; see the file default.h for more details and recompile." << endl;
+        exit(1);
+    }
+
+    ofsmeta.close();
+    ofs.close();
+}
+
diff --git a/input_reader.h b/input_reader.h
new file mode 100644
index 0000000..6d95282
--- /dev/null
+++ b/input_reader.h
@@ -0,0 +1,70 @@
+#ifndef INPUT_READER_H
+#define INPUT_READER_H
+#include "configuration.h"
+#include <string>
+#include <vector>
+#include <iostream>
+
+class configuration;
+
+/**
+ * Base class for input
+ *
+ * Use input_reader::build() to construct an instance.
+ */
+class input_reader
+{
+public:
+    enum input_format_t { input_unset, input_fasta };
+
+    // Constructor
+    static input_reader* build(configuration const &);
+    
+    // Accessors for metadata
+    std::string const & id(std::size_t i) const
+        { return id_[i]; }
+    std::size_t size(std::size_t i) const
+        { return size_[i]; }
+    std::size_t size() const
+        { return size_.size(); }
+    std::size_t total_size() const
+        { return total_size_; }
+    std::size_t total_seqs() const
+        { return total_seqs_; }
+
+    
+    virtual ~input_reader()
+    { if (fp != &std::cin) delete fp; fp = 0; }
+    
+    static bool smaller_than_rev_cmpl(std::string const &);
+protected:
+    explicit input_reader(std::string);
+    std::istream *fp;
+    std::vector<std::string> id_;
+    std::vector<std::size_t> size_;
+    std::size_t total_size_;
+    unsigned total_seqs_;
+private:
+    input_reader();
+    // No copy constructor or assignment
+    input_reader(input_reader const&);
+    input_reader& operator = (input_reader const&);
+};
+
+
+/**
+ * Simple FASTA reader
+ */
+class fasta_input_reader : public input_reader
+{
+public:
+    explicit fasta_input_reader(configuration const &);
+    virtual ~fasta_input_reader()
+    {  }
+private:
+    fasta_input_reader();
+    // No copy constructor or assignment
+    fasta_input_reader(fasta_input_reader const&);
+    fasta_input_reader& operator = (fasta_input_reader const&);
+};
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fsm-lite.git



More information about the debian-med-commit mailing list