[med-svn] [fsm-lite] 01/02: Imported Upstream version 0.0+20151109
Andreas Tille
tille at debian.org
Fri Apr 8 19:18:30 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository fsm-lite.
commit 17b96ac19ef1ecba195e8923b58cbf18b96c4784
Author: Andreas Tille <tille at debian.org>
Date: Fri Apr 8 21:17:29 2016 +0200
Imported Upstream version 0.0+20151109
---
Makefile | 21 +++++
README.md | 33 ++++++++
configuration.cpp | 136 +++++++++++++++++++++++++++++++
configuration.h | 30 +++++++
default.h | 4 +
dependencies.mk | 99 +++++++++++++++++++++++
fsm-lite.cpp | 207 +++++++++++++++++++++++++++++++++++++++++++++++
input_reader.cpp | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
input_reader.h | 70 ++++++++++++++++
9 files changed, 836 insertions(+)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3ba0e35
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+SDSL_INSTALL_PREFIX=/Users/nvalimak/code/sdsl-lite-2.0.3
+
+CC=g++
+CPPFLAGS=-std=c++11 -I${SDSL_INSTALL_PREFIX}/include
+DISBALEDOPTIMIZATIONFLAGS = -DNDEBUG -O3 -msse4.2
+LIBS=-lsdsl -ldivsufsort -ldivsufsort64
+OBJ = configuration.o input_reader.o
+
+fsm-lite: fsm-lite.o $(OBJ)
+ $(CC) $(CPPFLAGS) -L${SDSL_INSTALL_PREFIX}/lib -o fsm-lite fsm-lite.o $(OBJ) $(LIBS)
+
+test: fsm-lite
+ ./fsm-lite -l test.list -t tmp -v --debug -m 1
+
+clean:
+ rm -f fsm-lite *.o *~
+
+depend:
+ g++ -MM -std=c++11 -I${SDSL_INSTALL_PREFIX}/include *.cpp > dependencies.mk
+
+include dependencies.mk
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1b31975
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+Frequency-based String Mining (lite)
+===
+
+A singe-core implemetation of frequency-based substring mining. This
+implementation requires the https://github.com/simongog/sdsl-lite
+library (tested using the release `sdsl-lite-2.0.3`).
+
+1. Download and extract https://github.com/simongog/sdsl-lite/archive/v2.0.3.tar.gz
+2. install SDSL by running `./install.sh /install/path/sdsl-lite-2.0.3`, where `/install/path` need to be specified,
+3. update the correct SDSL installation path into the `fsm-lite/Makefile`,
+4. turn on preferred compiler optimization in `fsm-lite/Makefile`, and
+5. run `make depend && make` under the directory `fsm-lite`.
+
+For command-line options, see `./fsm-lite --help`.
+
+Usage example
+---
+
+Input files are given as a list of `<data-identifier>` `<data-filename>` pairs. The `<data-identifier>`'s are assumed to be unique. Here's an example how to construct such a list out of all `/input/dir/*.fasta` files:
+
+ `for f in /input/dir/*.fasta; do id=$(basename "$f" .fasta); echo $id $f; done > input.list`
+
+The files can then be processed by
+
+ `./fsm-lite -l input.list -t tmp | gzip - > output.txt.gz`
+
+where `tmp` is a prefix filename for storing temporary index files.
+
+TODO
+---
+1. Optimize the time and space usage.
+2. Multi-threading.
+3. Support for gzip compressed input.
diff --git a/configuration.cpp b/configuration.cpp
new file mode 100644
index 0000000..142e2d8
--- /dev/null
+++ b/configuration.cpp
@@ -0,0 +1,136 @@
+#include "configuration.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <cstdlib> // std::exit()
+#include <cassert>
+#include <getopt.h>
+using namespace std;
+
+configuration::configuration(int argc, char **argv)
+ : prog(argv[0]), listfile(""), tmpfile(""), inputformat(input_reader::input_fasta),
+ minlength(9), maxlength(100), minfreq(1), minsupport(2), maxsupport(~0u),
+ verbose(false), debug(false), good(false)
+{
+ assert (argc > 0);
+ good = parse(argc, argv);
+}
+
+int atoi_min(char const *value, int min, char const *parameter, char const *name)
+{
+ istringstream iss(value);
+ int i;
+ char c;
+ if (!(iss >> i) || iss.get(c))
+ {
+ cerr << "error: argument of " << parameter << " must be of type <int>, and greater than or equal to " << min << endl
+ << "Check " << name << " --help' for more information." << endl;
+ exit(1);
+ }
+
+ if (i < min)
+ {
+ cerr << "error: argument of " << parameter << " must be greater than or equal to " << min << endl
+ << "Check `" << name << " --help' for more information." << endl;
+ exit(1);
+ }
+ return i;
+}
+
+void configuration::print_short_usage() const
+{
+ cerr << "See " << prog << " --help for more information." << endl;
+ exit(1);
+}
+
+void configuration::print_usage() const
+{
+ cerr << "usage: " << prog << " -l <file> -t <file> [options]" << endl
+ << " -l,--list <file> Text file that lists all input files as whitespace-separated pairs "<< endl
+ << " <data-name> <data-filename>" << endl
+ << " where <data-name> is unique identifier (without whitespace)" << endl
+ << " and <data-filename> is full path to each input file." << endl
+ << " Default data file format is FASTA (uncompressed)." << endl
+ << " -t,--tmp <file> Store temporary index data" << endl
+ << "options:" << endl
+ << " -m,--min <int> Minimum length to report (default 9)" << endl
+ << " -M,--max <int> Maximum length to report (default 100)" << endl
+ << " -f,--freq <int> Minimum frequency per input file to report (default 1)" << endl
+ << " -s,--minsupp <int> Minimum number of input files with support to report (default 2)" << endl
+ << " -S,--maxsupp <int> Maximum number of input files with support to report (default inf)" << endl
+ << " -v,--verbose Verbose output" << endl;
+ exit(1);
+}
+
+bool configuration::parse(int argc, char **argv)
+{
+ static struct option long_options[] =
+ {
+ {"list", required_argument, 0, 'l'},
+ {"tmp", required_argument, 0, 't'},
+ {"min", required_argument, 0, 'm'},
+ {"max", required_argument, 0, 'M'},
+ {"freq", required_argument, 0, 'f'},
+ {"minsupp", required_argument, 0, 's'},
+ {"maxsupp", required_argument, 0, 'S'},
+ {"verbose", no_argument, 0, 'v'},
+ {"debug", no_argument, 0, 'D'},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+ int option_index = 0;
+ int c;
+ while ((c = getopt_long(argc, argv, "l:t:m:M:f:s:S:vDh",
+ long_options, &option_index)) != -1)
+ {
+ switch(c)
+ {
+ case 'l':
+ listfile = string(optarg); break;
+ case 't':
+ tmpfile = string(optarg); break;
+ case 'm':
+ minlength = atoi_min(optarg, 1, "-m,--min", argv[0]); break;
+ case 'M':
+ maxlength = atoi_min(optarg, 1, "-M,--max", argv[0]); break;
+ case 'f':
+ minfreq = atoi_min(optarg, 1, "-f,--freq", argv[0]); break;
+ case 's':
+ minsupport = atoi_min(optarg, 1, "-s,--minsupp", argv[0]); break;
+ case 'S':
+ maxsupport = atoi_min(optarg, 1, "-S,--maxsupp", argv[0]); break;
+ case 'v':
+ verbose = true; break;
+ case 'D':
+ debug = true; break;
+ case 'h':
+ print_usage(); return 1;
+ default:
+ cerr << "error: invalid parameter!" << endl;
+ return false;
+ }
+ }
+ if (listfile == "")
+ {
+ cerr << "error: -l,--list is a required parameter" << endl;
+ return false;
+ }
+ if (tmpfile == "")
+ {
+ cerr << "error: -t,--tmp is a required parameter" << endl;
+ return false;
+ }
+
+ if (minlength > maxlength)
+ {
+ cerr << "error: -m,--min must be smaller than or equal to -M,--max" << endl;
+ return false;
+ }
+ if (minsupport > maxsupport)
+ {
+ cerr << "error: -s,--minsupp must be smaller than or equal to -S,--maxsupp" << endl;
+ return false;
+ }
+
+ return true;
+}
diff --git a/configuration.h b/configuration.h
new file mode 100644
index 0000000..d75e0aa
--- /dev/null
+++ b/configuration.h
@@ -0,0 +1,30 @@
+#ifndef CONFIGURATION_H
+#define CONFIGURATION_H
+#include "input_reader.h"
+#include <string>
+
+
+class configuration
+{
+public:
+ configuration(int, char **);
+ bool parse(int, char **);
+ void print_short_usage() const;
+ void print_usage() const;
+
+ std::string prog;
+ std::string listfile;
+ std::string tmpfile;
+ int inputformat;
+ unsigned minlength;
+ unsigned maxlength;
+ unsigned minfreq;
+ unsigned minsupport;
+ unsigned maxsupport;
+ bool verbose;
+ bool debug;
+ bool good;
+private:
+ configuration();
+};
+#endif
diff --git a/default.h b/default.h
new file mode 100644
index 0000000..055b8bc
--- /dev/null
+++ b/default.h
@@ -0,0 +1,4 @@
+
+// Compile time setting, which determines the maximum number of input files.
+// Note: Maximun is 2^DBITS, and increasing DBITS will increase memory usage.
+#define DBITS 12
diff --git a/dependencies.mk b/dependencies.mk
new file mode 100644
index 0000000..abde7f8
--- /dev/null
+++ b/dependencies.mk
@@ -0,0 +1,99 @@
+configuration.o: configuration.cpp configuration.h input_reader.h
+fsm-lite.o: fsm-lite.cpp default.h configuration.h input_reader.h \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_trees.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sdsl_concepts.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/uintx_t.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_arrays.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_bitcompressed.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/int_vector.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bits.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/structure_tree.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/config.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/util.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sfstream.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/ram_fs.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/ram_filebuf.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/io.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/memory_management.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/int_vector_buffer.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/iterators.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_array_helper.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_sampling_strategy.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_alphabet_strategy.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sd_vector.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/select_support_mcl.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/select_support.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support_v.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support_v5.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rank_support_scan.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/select_support_scan.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_wt.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wavelet_trees.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_pc.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bit_vectors.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bit_vector_il.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rrr_vector.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rrr_helper.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/uint128_t.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/uint256_t.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rrr_vector_15.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_helper.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_blcd.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_gmr.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/vectors.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/enc_vector.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_fibonacci.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_elias_delta.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_elias_gamma.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/coder_comma.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/vlc_vector.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/dac_vector.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_huff.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_hutu.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_int.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wm_int.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_rlmn.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_lcp.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_isa.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_bwt.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/wt_algorithm.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_lcp_helper.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_sa.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/divsufsort.h \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/divsufsort64.h \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/qsufsort.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_sa_se.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/construct_config.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/fast_cache.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/csa_sada.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_array_algorithm.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_tree_algorithm.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/suffix_tree_helper.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sorted_multi_stack_support.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/sorted_stack_support.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/cst_sct3.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_support_sada.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_byte.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_wt.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_vlc.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_dac.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_bitcompressed.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_support_tree.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/lcp_support_tree2.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_g.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/nearest_neighbour_dictionary.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_support.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_support_sparse_table.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_succinct_sct.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_sada.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_algorithm.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/rmq_succinct_sada.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/bp_support_gg.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/cst_iterators.hpp \
+ /Users/nvalimak/code/sdsl-lite-2.0.3/include/sdsl/cst_sada.hpp
+input_reader.o: input_reader.cpp default.h input_reader.h configuration.h
diff --git a/fsm-lite.cpp b/fsm-lite.cpp
new file mode 100644
index 0000000..bcd06c5
--- /dev/null
+++ b/fsm-lite.cpp
@@ -0,0 +1,207 @@
+#include "default.h"
+#include "configuration.h"
+#include "input_reader.h"
+#include <sdsl/suffix_trees.hpp> // TODO: replace with csa+lcp array
+#include <sdsl/wt_algorithm.hpp>
+#include <iostream>
+#include <vector>
+#include <cstdlib> // std::exit()
+using namespace std;
+
+typedef sdsl::cst_sct3<> cst_t;
+typedef sdsl::wt_int<> wt_t;
+typedef sdsl::bit_vector bitv_t;
+typedef cst_t::char_type char_type;
+typedef cst_t::node_type node_type;
+typedef wt_t::size_type size_type;
+
+/**
+ * Construct the sequence labels
+ *
+ * Assumes that the number of input files is less than 2^DBITS.
+ * The value of DBITS has to be set at compile time (in defaults.h).
+ * Large DBITS values result in large memory requirements for wt_init().
+ */
+void wt_init(wt_t &wt, bitv_t &separator, cst_t &cst, input_reader *ir, configuration &config)
+{
+ uint64_t n = cst.csa.size();
+ sdsl::int_vector<DBITS> labels(n, ~0u);
+ separator = bitv_t(n, 0);
+ uint64_t k = ir->size()-1;
+ uint64_t j = cst.csa.wavelet_tree.select(1, 0);
+ if (config.debug)
+ cerr << "bwt end marker pos = " << j << endl;
+ uint64_t bwtendpos = j;
+ j = cst.csa.lf[j];
+ labels[j] = 0; // Label of last byte
+ separator[n-1] = 0;
+ separator[n-2] = 1;
+ j = cst.csa.lf[j];
+ for (uint64_t i = n-2; i > 0; i--) {
+ char_type c = cst.csa.bwt[j];
+ labels[j] = k;
+ if (c == '$')
+ k --;
+ if (c == '$' || c == '#')
+ separator[i-1] = 1;
+
+ j = cst.csa.lf[j];
+ }
+ labels[j] = k;
+ if (j != bwtendpos || k != 0) // Assert
+ {
+ cerr << "Labeling failed, j = " << j << ", k = " << k << endl;
+ exit(1);
+ }
+
+ //TODO cleanup
+ /*for (uint64_t i = 0; i < n; ++i)
+ cerr << cst.csa.text[i];
+ cerr << endl;
+ for (uint64_t i = 0; i < n; ++i)
+ cerr << separator[i];
+ cerr << endl;
+ for (uint64_t i = 0; i < n; ++i)
+ cerr << labels[cst.csa.isa[i]];
+ cerr << endl;
+ */
+
+ std::string tmp_file = sdsl::ram_file_name(sdsl::util::to_string(sdsl::util::pid())+"_"+sdsl::util::to_string(sdsl::util::id()));
+ sdsl::store_to_file(labels, tmp_file);
+ sdsl::int_vector_buffer<DBITS> text_buf(tmp_file);
+ wt = wt_t(text_buf, labels.size());
+ if (config.debug)
+ cerr << "wt size = " << wt.size() << ", n = " << n << endl;
+ j = 0;
+ for (uint64_t i = 0; i < ir->size(); ++i)
+ j += wt.rank(n, i);
+ if (j != n) // Assert
+ {
+ cerr << "Label sum failed, j = " << j << ", n = " << n << endl;
+ exit(1);
+ }
+
+}
+
+int main(int argc, char ** argv)
+{
+ configuration config(argc, argv);
+ if (!config.good)
+ config.print_short_usage();
+
+ if (config.verbose)
+ cerr << "Reading input files..." << endl;
+ input_reader *ir = input_reader::build(config);
+ if (config.verbose)
+ cerr << "Read " << ir->size() << " input files and " << ir->total_seqs() << " sequences of total length " << ir->total_size() << " (includes rev.compl. sequences)" << endl;
+
+ /**
+ * Initialize the data structures
+ */
+ if (config.verbose)
+ cerr << "Constructing the data structures..." << endl;
+ cst_t cst;
+ construct(cst, config.tmpfile + ".tmp", 1);
+ if (!cst.csa.size())
+ {
+ cerr << "error: unable to construct the data structure; out of memory?" << endl;
+ abort();
+ }
+
+ wt_t label_wt;
+ bitv_t separator;
+ wt_init(label_wt, separator, cst, ir, config);
+
+ bitv_t::rank_1_type sep_rank1(&separator);
+ //bitv_t::select_1_type sep_select1(&separator); TODO Remove?
+ assert(sep_rank1(cst.size()) == ir->total_seqs());
+
+ size_type support = 0;
+ vector<wt_t::value_type> labels(ir->size(), 0);
+ vector<size_type> rank_sp(ir->size(), 0);
+ vector<size_type> rank_ep(ir->size(), 0);
+
+ if (config.verbose)
+ cerr << "Construction complete, the main index requires " << size_in_mega_bytes(cst) << " MiB plus " << size_in_mega_bytes(label_wt) << " MiB for labels." << endl;
+
+ /**
+ * Main loop
+ */
+ node_type root = cst.root();
+ vector<node_type> buffer;
+ buffer.reserve(1024*1024);
+ for (auto& child: cst.children(root))
+ buffer.push_back(child);
+ while (!buffer.empty())
+ {
+ node_type const node = buffer.back();
+ buffer.pop_back();
+ unsigned depth = cst.depth(node);
+ if (depth < config.maxlength)
+ for (auto& child: cst.children(node))
+ buffer.push_back(child);
+ if (depth < config.minlength)
+ continue;
+ if (cst.is_leaf(node))
+ continue;
+
+ // Process the candidate node
+ size_type sp = cst.lb(node);
+ size_type ep = cst.rb(node);
+ node_type wn = cst.wl(node, cst.csa.bwt[sp]);
+ /*if (config.debug)
+ {
+ size_type pos = cst.csa[sp];
+ auto s = extract(cst.csa, pos, pos + depth - 1);
+ cerr << "at node = " << depth << "-[" << sp << "," << ep << "], wl = " << (wn != root);
+ if (wn!=root)
+ cerr << "[" << cst.rb(wn)-cst.lb(wn) << " vs " << ep-sp << "]";
+ cerr << ", seq = " << s << endl;
+ }*/
+ if (wn == root && config.debug)
+ {
+ cerr << "warning: no Weiner-link at " << depth << "-[" << sp << "," << ep << "]" << endl;
+ continue;
+ }
+ if (depth < config.maxlength && cst.rb(wn)-cst.lb(wn) == ep-sp)
+ continue; // not left-branching
+
+ sdsl::interval_symbols(label_wt, sp, ep+1, support, labels, rank_sp, rank_ep);
+ if (support < config.minsupport || support > config.maxsupport)
+ continue;
+
+ size_type truesupp = 0;
+ for (size_type i = 0; i < support; ++i)
+ if (config.minfreq <= rank_ep[i]-rank_sp[i])
+ ++truesupp;
+ if (truesupp < config.minsupport)
+ continue;
+
+ if (depth > config.maxlength)
+ depth = config.maxlength;
+ size_type pos = cst.csa[sp];
+ // Check for separator symbol TODO cleanup
+ /*unsigned p_depth = cst.depth(cst.parent(node));
+ if (sep_rank1(pos) != sep_rank1(pos + p_depth))
+ continue; // Separator occurs above parent node
+ if (sep_rank1(pos) != sep_rank1(pos + depth))
+ depth = sep_select1(sep_rank1(pos)+1) - pos +1; // Separator above current node
+ */
+
+ if (sep_rank1(pos) != sep_rank1(pos + depth))
+ continue;
+ auto s = extract(cst.csa, pos, pos + depth - 1);
+ if (input_reader::smaller_than_rev_cmpl(s))
+ continue;
+ cout << s + " |";
+ for (size_type i = 0; i < support; ++i)
+ if (config.minfreq <= rank_ep[i]-rank_sp[i])
+ cout << ' ' << ir->id(labels[i]) << ':' << rank_ep[i]-rank_sp[i];
+ cout << '\n';
+ }
+
+ if (config.verbose)
+ cerr << "All done." << endl;
+ delete ir; ir = 0;
+ return 0;
+}
diff --git a/input_reader.cpp b/input_reader.cpp
new file mode 100644
index 0000000..51ca830
--- /dev/null
+++ b/input_reader.cpp
@@ -0,0 +1,236 @@
+#include "default.h"
+#include "input_reader.h"
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <cstdlib> // abort()
+#include <cassert>
+using namespace std;
+
+input_reader * input_reader::build(configuration const &config)
+{
+ switch (config.inputformat)
+ {
+ case input_fasta:
+ return new fasta_input_reader(config);
+ break;
+ default:
+ cerr << "Error: invalid file type at InputReader::build()" << endl;
+ abort();
+ }
+}
+
+ std::istream *fp;
+ std::vector<std::string> id_;
+ std::vector<std::size_t> size_;
+ std::size_t total_size_;
+ bool good_;
+
+input_reader::input_reader(string file)
+ : fp(0), id_(), size_(), total_size_(0), total_seqs_(0)
+{
+ // Open file handle, "-" uses standard input
+ if (file == "-")
+ fp = &std::cin;
+ else
+ fp = new ifstream(file.c_str());
+}
+
+void error_reading_input(configuration const &config, char const *message)
+{
+ cerr << message << endl;
+ config.print_short_usage();
+ exit(1);
+}
+
+/**
+ * Simple FASTA input
+ */
+void normalize(std::string &t)
+{
+ for (std::string::iterator it = t.begin(); it != t.end(); ++it)
+ {
+ switch (*it)
+ {
+ case('a'):
+ *it = 'A';
+ break;
+ case('c'):
+ *it = 'C';
+ break;
+ case('g'):
+ *it = 'G';
+ break;
+ case('t'):
+ *it = 'T';
+ break;
+ case('n'):
+ *it = 'N';
+ break;
+ case('A'):
+ case('C'):
+ case('G'):
+ case('T'):
+ case('N'):
+ break;
+ case('0'):
+ case('1'):
+ case('2'):
+ case('3'):
+ case('.'):
+ break;
+ default:
+ *it = 'N';
+ break;
+ }
+ }
+}
+
+void revcmpl(std::string &t)
+{
+ char c;
+ std::size_t n = t.size();
+ for (std::size_t i = 0; i < n / 2; ++i) {
+ c = t[i];
+ t[i] = t[n - i - 1];
+ t[n - i - 1] = c;
+ }
+ for (std::string::iterator it = t.begin(); it != t.end(); ++it)
+ {
+ switch (*it)
+ {
+ case('T'):
+ *it = 'A'; break;
+ case('G'):
+ *it = 'C'; break;
+ case('C'):
+ *it = 'G'; break;
+ case('A'):
+ *it = 'T'; break;
+ }
+ }
+}
+
+bool input_reader::smaller_than_rev_cmpl(string const& path)
+{
+ // Check if given path is smaller than its rev.compl.
+ string rc(path);
+ revcmpl(rc);
+ if (path < rc)
+ return true;
+ return false;
+}
+
+size_t append(ofstream &ofs, string const &fname, unsigned &nseqs)
+{
+ string row;
+ string buffer;
+ buffer.reserve(1024*1024);
+ ifstream ifs(fname.c_str());
+ bool first = true;
+ size_t n = 0;
+ while (std::getline(ifs, row).good())
+ {
+ if (row.empty())
+ continue;
+ if (row[0] == '>')
+ {
+ if (!first)
+ {
+ ofs << '#';
+ nseqs++;
+ n ++;
+ revcmpl(buffer);
+ ofs << buffer << '#';
+ nseqs++;
+ n += buffer.size() + 1;
+ buffer.clear();
+ }
+ first = false;
+ continue;
+ }
+ if (row[row.size()-1] == '\n')
+ row.pop_back();
+ if (row[row.size()-1] == '\r')
+ row.pop_back();
+ normalize(row);
+ ofs << row;
+ buffer += row;
+ n += row.size();
+ }
+ if (n == 0)
+ {
+ cerr << "error: Unable to read the input file " << fname << endl;
+ exit(1);
+ }
+ ofs << '#';
+ nseqs ++;
+ n ++;
+ revcmpl(buffer);
+ ofs << buffer << '$';
+ nseqs++;
+ n += buffer.size() + 1;
+ buffer.clear();
+ return n;
+}
+
+bool file_exists (string const &name)
+{
+ ifstream f(name.c_str());
+ if (f.good())
+ {
+ f.close();
+ return true;
+ }
+ else
+ {
+ f.close();
+ return false;
+ }
+}
+
+fasta_input_reader::fasta_input_reader(configuration const &config)
+ : input_reader(config.listfile)
+{
+ string tmpfile = config.tmpfile + ".tmp";
+ string metafile = config.tmpfile + ".meta";
+// if (file_exists(tmpfile) && file_exists(metafile))
+// TODO cerr << "warning: tmp files under " << config.tmpfile << " will be rebuilt" << endl;
+
+ ofstream ofs(tmpfile.c_str());
+ ofstream ofsmeta(metafile.c_str());
+ // Parse the list file
+ string id, fname;
+ while (fp->good())
+ {
+ *fp >> id;
+ if (id.empty() || !fp->good())
+ break;
+ *fp >> fname;
+ if (fname.empty())
+ error_reading_input(config, "error: Unable to read the -l,--list input file: please check the list file format.");
+
+ id_.push_back(id);
+ size_t s = append(ofs, fname, total_seqs_);
+ size_.push_back(s);
+ total_size_ += s;
+ ofsmeta << id << "\t" << s << "\t" << endl;
+ }
+ if (id_.empty() || total_size_ == 0)
+ error_reading_input(config, "error: Unable to read the -l,--list input file: please check the list file format.");
+ if (!ofs.good())
+ error_reading_input(config, "error: Unable to write the temporary files under the location -i,--index.");
+ if (!ofsmeta.good())
+ error_reading_input(config, "error: Unable to write the temporary files under the location -i,--index.");
+
+ if ((1 << DBITS) <= size_.size())
+ {
+ cerr << "error: increase DBITS value to support more than " << (1<<DBITS)
+ << " input files; see the file default.h for more details and recompile." << endl;
+ exit(1);
+ }
+
+ ofsmeta.close();
+ ofs.close();
+}
+
diff --git a/input_reader.h b/input_reader.h
new file mode 100644
index 0000000..6d95282
--- /dev/null
+++ b/input_reader.h
@@ -0,0 +1,70 @@
+#ifndef INPUT_READER_H
+#define INPUT_READER_H
+#include "configuration.h"
+#include <string>
+#include <vector>
+#include <iostream>
+
+class configuration;
+
+/**
+ * Base class for input
+ *
+ * Use input_reader::build() to construct an instance.
+ */
+class input_reader
+{
+public:
+ enum input_format_t { input_unset, input_fasta };
+
+ // Constructor
+ static input_reader* build(configuration const &);
+
+ // Accessors for metadata
+ std::string const & id(std::size_t i) const
+ { return id_[i]; }
+ std::size_t size(std::size_t i) const
+ { return size_[i]; }
+ std::size_t size() const
+ { return size_.size(); }
+ std::size_t total_size() const
+ { return total_size_; }
+ std::size_t total_seqs() const
+ { return total_seqs_; }
+
+
+ virtual ~input_reader()
+ { if (fp != &std::cin) delete fp; fp = 0; }
+
+ static bool smaller_than_rev_cmpl(std::string const &);
+protected:
+ explicit input_reader(std::string);
+ std::istream *fp;
+ std::vector<std::string> id_;
+ std::vector<std::size_t> size_;
+ std::size_t total_size_;
+ unsigned total_seqs_;
+private:
+ input_reader();
+ // No copy constructor or assignment
+ input_reader(input_reader const&);
+ input_reader& operator = (input_reader const&);
+};
+
+
+/**
+ * Simple FASTA reader
+ */
+class fasta_input_reader : public input_reader
+{
+public:
+ explicit fasta_input_reader(configuration const &);
+ virtual ~fasta_input_reader()
+ { }
+private:
+ fasta_input_reader();
+ // No copy constructor or assignment
+ fasta_input_reader(fasta_input_reader const&);
+ fasta_input_reader& operator = (fasta_input_reader const&);
+};
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fsm-lite.git
More information about the debian-med-commit
mailing list