[med-svn] [libfastahack] 01/02: Imported Upstream version 0.0+20160309
Andreas Tille
tille at debian.org
Thu Jun 23 09:12:06 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository libfastahack.
commit 56f345e677c5523ea2ecb4ee6c8ec276e9c5075c
Author: Andreas Tille <tille at debian.org>
Date: Thu Jun 23 11:10:21 2016 +0200
Imported Upstream version 0.0+20160309
---
.gitignore | 2 +
.travis.yml | 13 ++
Fasta.cpp | 323 ++++++++++++++++++++++++++++++++++++++++
Fasta.h | 85 +++++++++++
FastaHack.cpp | 179 ++++++++++++++++++++++
LargeFileSupport.h | 16 ++
Makefile | 42 ++++++
README | 70 +++++++++
Region.h | 51 +++++++
disorder.c | 192 ++++++++++++++++++++++++
disorder.h | 62 ++++++++
libdisorder.LICENSE | 339 ++++++++++++++++++++++++++++++++++++++++++
split.cpp | 33 ++++
split.h | 20 +++
tests/correct.fasta | 30 ++++
tests/embedded_newline.fasta | 35 +++++
tests/mismatched_lines.fasta | 30 ++++
tests/trailing_newlines.fasta | 34 +++++
18 files changed, 1556 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1aa62e4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.o
+fastahack
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..d6add9e
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,13 @@
+# Control file for continuous integration testing at http://travis-ci.org/
+
+language: cpp
+compiler: gcc
+sudo: required
+dist: trusty
+script: make
+os:
+ - linux
+ - osx
+compiler:
+ - gcc
+ - clang
diff --git a/Fasta.cpp b/Fasta.cpp
new file mode 100644
index 0000000..572f2d0
--- /dev/null
+++ b/Fasta.cpp
@@ -0,0 +1,323 @@
+// ***************************************************************************
+// FastaIndex.cpp (c) 2010 Erik Garrison <erik.garrison at bc.edu>
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 9 February 2010 (EG)
+// ---------------------------------------------------------------------------
+
+#include "Fasta.h"
+
+FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len)
+ : name(name)
+ , length(length)
+ , offset(offset)
+ , line_blen(line_blen)
+ , line_len(line_len)
+{}
+
+FastaIndexEntry::FastaIndexEntry(void) // empty constructor
+{ clear(); }
+
+FastaIndexEntry::~FastaIndexEntry(void)
+{}
+
+void FastaIndexEntry::clear(void)
+{
+ name = "";
+ length = 0;
+ offset = -1; // no real offset will ever be below 0, so this allows us to
+ // check if we have already recorded a real offset
+ line_blen = 0;
+ line_len = 0;
+}
+
+ostream& operator<<(ostream& output, const FastaIndexEntry& e) {
+ // just write the first component of the name, for compliance with other tools
+ output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" <<
+ e.line_blen << "\t" << e.line_len;
+ return output; // for multiple << operators.
+}
+
+FastaIndex::FastaIndex(void)
+{}
+
+void FastaIndex::readIndexFile(string fname) {
+ string line;
+ long long linenum = 0;
+ indexFile.open(fname.c_str(), ifstream::in);
+ if (indexFile.is_open()) {
+ while (getline (indexFile, line)) {
+ ++linenum;
+ // the fai format defined in samtools is tab-delimited, every line being:
+ // fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len
+ vector<string> fields = split(line, '\t');
+ if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file
+ // note that fields[0] is the sequence name
+ char* end;
+ string name = split(fields[0], " \t").at(0); // key by first token of name
+ sequenceNames.push_back(name);
+ this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()),
+ strtoll(fields[2].c_str(), &end, 10),
+ atoi(fields[3].c_str()),
+ atoi(fields[4].c_str()))));
+ } else {
+ cerr << "Warning: malformed fasta index file " << fname <<
+ "does not have enough fields @ line " << linenum << endl;
+ cerr << line << endl;
+ exit(1);
+ }
+ }
+ } else {
+ cerr << "could not open index file " << fname << endl;
+ exit(1);
+ }
+}
+
+// for consistency this should be a class method
+bool fastaIndexEntryCompare ( FastaIndexEntry a, FastaIndexEntry b) { return (a.offset<b.offset); }
+
+ostream& operator<<(ostream& output, FastaIndex& fastaIndex) {
+ vector<FastaIndexEntry> sortedIndex;
+ for(vector<string>::const_iterator it = fastaIndex.sequenceNames.begin(); it != fastaIndex.sequenceNames.end(); ++it)
+ {
+ sortedIndex.push_back(fastaIndex[*it]);
+ }
+ sort(sortedIndex.begin(), sortedIndex.end(), fastaIndexEntryCompare);
+ for( vector<FastaIndexEntry>::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) {
+ output << *fit << endl;
+ }
+}
+
+void FastaIndex::indexReference(string refname) {
+ // overview:
+ // for line in the reference fasta file
+ // track byte offset from the start of the file
+ // if line is a fasta header, take the name and dump the last sequnece to the index
+ // if line is a sequence, add it to the current sequence
+ //cerr << "indexing fasta reference " << refname << endl;
+ string line;
+ FastaIndexEntry entry; // an entry buffer used in processing
+ entry.clear();
+ int line_length = 0;
+ long long offset = 0; // byte offset from start of file
+ long long line_number = 0; // current line number
+ bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file
+ // this will be used to raise an error
+ // if we have a line length change at
+ // any line other than the last line in
+ // the sequence
+ bool emptyLine = false; // flag to catch empty lines, which we allow for
+ // index generation only on the last line of the sequence
+ ifstream refFile;
+ refFile.open(refname.c_str());
+ if (refFile.is_open()) {
+ while (getline(refFile, line)) {
+ ++line_number;
+ line_length = line.length();
+ if (line[0] == ';') {
+ // fasta comment, skip
+ } else if (line[0] == '+') {
+ // fastq quality header
+ getline(refFile, line);
+ line_length = line.length();
+ offset += line_length + 1;
+ // get and don't handle the quality line
+ getline(refFile, line);
+ line_length = line.length();
+ } else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header
+ // if we aren't on the first entry, push the last sequence into the index
+ if (entry.name != "") {
+ mismatchedLineLengths = false; // reset line length error tracker for every new sequence
+ emptyLine = false;
+ flushEntryToIndex(entry);
+ entry.clear();
+ }
+ entry.name = line.substr(1, line_length - 1);
+ } else { // we assume we have found a sequence line
+ if (entry.offset == -1) // NB initially the offset is -1
+ entry.offset = offset;
+ entry.length += line_length;
+ if (entry.line_len) {
+ //entry.line_len = entry.line_len ? entry.line_len : line_length + 1;
+ if (mismatchedLineLengths || emptyLine) {
+ if (line_length == 0) {
+ emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
+ } else {
+ if (emptyLine) {
+ cerr << "ERROR: embedded newline";
+ } else {
+ cerr << "ERROR: mismatched line lengths";
+ }
+ cerr << " at line " << line_number << " within sequence " << entry.name <<
+ endl << "File not suitable for fasta index generation." << endl;
+ exit(1);
+ }
+ }
+ // this flag is set here and checked on the next line
+ // because we may have reached the end of the sequence, in
+ // which case a mismatched line length is OK
+ if (entry.line_len != line_length + 1) {
+ mismatchedLineLengths = true;
+ if (line_length == 0) {
+ emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
+ }
+ }
+ } else {
+ entry.line_len = line_length + 1; // first line
+ }
+ entry.line_blen = entry.line_len - 1;
+ }
+ offset += line_length + 1;
+ }
+ // we've hit the end of the fasta file!
+ // flush the last entry
+ flushEntryToIndex(entry);
+ } else {
+ cerr << "could not open reference file " << refname << " for indexing!" << endl;
+ exit(1);
+ }
+}
+
+void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) {
+ string name = split(entry.name, " \t").at(0); // key by first token of name
+ sequenceNames.push_back(name);
+ this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length,
+ entry.offset, entry.line_blen,
+ entry.line_len)));
+
+}
+
+void FastaIndex::writeIndexFile(string fname) {
+ //cerr << "writing fasta index file " << fname << endl;
+ ofstream file;
+ file.open(fname.c_str());
+ if (file.is_open()) {
+ file << *this;
+ } else {
+ cerr << "could not open index file " << fname << " for writing!" << endl;
+ exit(1);
+ }
+}
+
+FastaIndex::~FastaIndex(void) {
+ indexFile.close();
+}
+
+FastaIndexEntry FastaIndex::entry(string name) {
+ FastaIndex::iterator e = this->find(name);
+ if (e == this->end()) {
+ cerr << "unable to find FASTA index entry for '" << name << "'" << endl;
+ exit(1);
+ } else {
+ return e->second;
+ }
+}
+
+string FastaIndex::indexFileExtension() { return ".fai"; }
+
+/*
+FastaReference::FastaReference(string reffilename) {
+}
+*/
+
+void FastaReference::open(string reffilename) {
+ filename = reffilename;
+ if (!(file = fopen(filename.c_str(), "r"))) {
+ cerr << "could not open " << filename << endl;
+ exit(1);
+ }
+ index = new FastaIndex();
+ struct stat stFileInfo;
+ string indexFileName = filename + index->indexFileExtension();
+ // if we can find an index file, use it
+ if(stat(indexFileName.c_str(), &stFileInfo) == 0) {
+ index->readIndexFile(indexFileName);
+ } else { // otherwise, read the reference and generate the index file in the cwd
+ cerr << "index file " << indexFileName << " not found, generating..." << endl;
+ index->indexReference(filename);
+ index->writeIndexFile(indexFileName);
+ }
+}
+
+FastaReference::~FastaReference(void) {
+ if (file != NULL)
+ fclose(file);
+ if (index != NULL)
+ delete index;
+}
+
+string FastaReference::getSequence(string seqname) {
+ FastaIndexEntry entry = index->entry(seqname);
+ int newlines_in_sequence = entry.length / entry.line_blen;
+ int seqlen = newlines_in_sequence + entry.length;
+ char* seq = (char*) calloc (seqlen + 1, sizeof(char));
+ fseek64(file, entry.offset, SEEK_SET);
+ string s;
+ if (fread(seq, sizeof(char), seqlen, file)) {
+ seq[seqlen] = '\0';
+ char* pbegin = seq;
+ char* pend = seq + (seqlen/sizeof(char));
+ pend = remove(pbegin, pend, '\n');
+ pend = remove(pbegin, pend, '\0');
+ s = seq;
+ free(seq);
+ s.resize((pend - pbegin)/sizeof(char));
+ }
+ return s;
+}
+
+// TODO cleanup; odd function. use a map
+string FastaReference::sequenceNameStartingWith(string seqnameStart) {
+ try {
+ return (*index)[seqnameStart].name;
+ } catch (exception& e) {
+ cerr << e.what() << ": unable to find index entry for " << seqnameStart << endl;
+ exit(1);
+ }
+}
+
+string FastaReference::getTargetSubSequence(FastaRegion& target) {
+ if (target.startPos == -1) {
+ return getSequence(target.startSeq);
+ } else {
+ return getSubSequence(target.startSeq, target.startPos - 1, target.length());
+ }
+}
+
+string FastaReference::getSubSequence(string seqname, int start, int length) {
+ FastaIndexEntry entry = index->entry(seqname);
+ length = min(length, entry.length - start);
+ if (start < 0 || length < 1) {
+ //cerr << "Empty sequence" << endl;
+ return "";
+ }
+ // we have to handle newlines
+ // approach: count newlines before start
+ // count newlines by end of read
+ // subtracting newlines before start find count of embedded newlines
+ int newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0;
+ int newlines_by_end = (start + length - 1) / entry.line_blen;
+ int newlines_inside = newlines_by_end - newlines_before;
+ int seqlen = length + newlines_inside;
+ char* seq = (char*) calloc (seqlen + 1, sizeof(char));
+ fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET);
+ string s;
+ if (fread(seq, sizeof(char), (off_t) seqlen, file)) {
+ seq[seqlen] = '\0';
+ char* pbegin = seq;
+ char* pend = seq + (seqlen/sizeof(char));
+ pend = remove(pbegin, pend, '\n');
+ pend = remove(pbegin, pend, '\0');
+ s = seq;
+ free(seq);
+ s.resize((pend - pbegin)/sizeof(char));
+ }
+ return s;
+}
+
+long unsigned int FastaReference::sequenceLength(string seqname) {
+ FastaIndexEntry entry = index->entry(seqname);
+ return entry.length;
+}
+
diff --git a/Fasta.h b/Fasta.h
new file mode 100644
index 0000000..e2f44fe
--- /dev/null
+++ b/Fasta.h
@@ -0,0 +1,85 @@
+// ***************************************************************************
+// FastaIndex.h (c) 2010 Erik Garrison <erik.garrison at bc.edu>
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 5 February 2010 (EG)
+// ---------------------------------------------------------------------------
+
+#ifndef _FASTA_H
+#define _FASTA_H
+
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <stdint.h>
+#include <stdio.h>
+#include <algorithm>
+#include "LargeFileSupport.h"
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include "split.h"
+#include <stdlib.h>
+#include <ctype.h>
+#include <unistd.h>
+#include "Region.h"
+
+using namespace std;
+
+class FastaIndexEntry {
+ friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);
+ public:
+ FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len);
+ FastaIndexEntry(void);
+ ~FastaIndexEntry(void);
+ string name; // sequence name
+ int length; // length of sequence
+ long long offset; // bytes offset of sequence from start of file
+ int line_blen; // line length in bytes, sequence characters
+ int line_len; // line length including newline
+ void clear(void);
+};
+
+class FastaIndex : public map<string, FastaIndexEntry> {
+ friend ostream& operator<<(ostream& output, FastaIndex& i);
+ public:
+ FastaIndex(void);
+ ~FastaIndex(void);
+ vector<string> sequenceNames;
+ map<string, unsigned int> sequenceID;
+ void indexReference(string refName);
+ void readIndexFile(string fname);
+ void writeIndexFile(string fname);
+ ifstream indexFile;
+ FastaIndexEntry entry(string key);
+ void flushEntryToIndex(FastaIndexEntry& entry);
+ string indexFileExtension(void);
+};
+
+class FastaReference {
+ public:
+ void open(string reffilename);
+ bool usingmmap;
+ string filename;
+ FastaReference(void) : usingmmap(false) {
+ file = NULL;
+ index = NULL;
+ }
+ ~FastaReference(void);
+ FILE* file;
+ void* filemm;
+ size_t filesize;
+ FastaIndex* index;
+ vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart);
+ string getSequence(string seqname);
+ // potentially useful for performance, investigate
+ // void getSequence(string seqname, string& sequence);
+ string getSubSequence(string seqname, int start, int length);
+ string getTargetSubSequence(FastaRegion& target);
+ string sequenceNameStartingWith(string seqnameStart);
+ unsigned int getSequenceID(string seqname);
+ long unsigned int sequenceLength(string seqname);
+};
+
+#endif
diff --git a/FastaHack.cpp b/FastaHack.cpp
new file mode 100644
index 0000000..07cc445
--- /dev/null
+++ b/FastaHack.cpp
@@ -0,0 +1,179 @@
+#include "Fasta.h"
+#include <stdlib.h>
+#include <getopt.h>
+#include "disorder.h"
+#include "Region.h"
+
+void printSummary() {
+ cerr << "usage: fastahack [options] <fasta reference>" << endl
+ << endl
+ << "options:" << endl
+ << " -i, --index generate fasta index <fasta reference>.fai" << endl
+ << " -r, --region REGION print the specified region" << endl
+ << " -c, --stdin read a stream of line-delimited region specifiers on stdin" << endl
+ << " and print the corresponding sequence for each on stdout" << endl
+ << " -e, --entropy print the shannon entropy of the specified region" << endl
+ << " -d, --dump print the fasta file in the form 'seq_name <tab> sequence'" << endl
+ << endl
+ << "REGION is of the form <seq>, <seq>:<start>[sep]<end>, <seq1>:<start>[sep]<seq2>:<end>" << endl
+ << "where start and end are 1-based, and the region includes the end position." << endl
+ << "[sep] is \"-\" or \"..\"" << endl
+ << endl
+ << "Specifying a sequence name alone will return the entire sequence, specifying" << endl
+ << "range will return that range, and specifying a single coordinate pair, e.g." << endl
+ << "<seq>:<start> will return just that base." << endl
+ << endl
+ << "author: Erik Garrison <erik.garrison at bc.edu>" << endl;
+}
+
+
+int main (int argc, char** argv) {
+
+ string command;
+ string fastaFileName;
+ string seqname;
+ string longseqname;
+ long long start;
+ long long length;
+ bool dump = false;
+
+ bool buildIndex = false; // flag to force index building
+ bool printEntropy = false; // entropy printing
+ bool readRegionsFromStdin = false;
+ //bool printLength = false;
+ string region;
+
+ int c;
+
+ while (true) {
+ static struct option long_options[] =
+ {
+ /* These options set a flag. */
+ //{"verbose", no_argument, &verbose_flag, 1},
+ //{"brief", no_argument, &verbose_flag, 0},
+ {"help", no_argument, 0, 'h'},
+ {"index", no_argument, 0, 'i'},
+ //{"length", no_argument, &printLength, true},
+ {"entropy", no_argument, 0, 'e'},
+ {"region", required_argument, 0, 'r'},
+ {"stdin", no_argument, 0, 'c'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "hciedr:",
+ long_options, &option_index);
+
+ /* Detect the end of the options. */
+ if (c == -1)
+ break;
+
+ switch (c)
+ {
+ case 0:
+ /* If this option set a flag, do nothing else now. */
+ if (long_options[option_index].flag != 0)
+ break;
+ printf ("option %s", long_options[option_index].name);
+ if (optarg)
+ printf (" with arg %s", optarg);
+ printf ("\n");
+ break;
+
+ case 'e':
+ printEntropy = true;
+ break;
+
+ case 'c':
+ readRegionsFromStdin = true;
+ break;
+
+ case 'i':
+ buildIndex = true;
+ break;
+
+ case 'r':
+ region = optarg;
+ break;
+
+ case 'd':
+ dump = true;
+ break;
+
+ case 'h':
+ printSummary();
+ exit(0);
+ break;
+
+ case '?':
+ /* getopt_long already printed an error message. */
+ printSummary();
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ /* Print any remaining command line arguments (not options). */
+ if (optind < argc) {
+ //cerr << "fasta file: " << argv[optind] << endl;
+ fastaFileName = argv[optind];
+ } else {
+ cerr << "please specify a fasta file" << endl;
+ printSummary();
+ exit(1);
+ }
+
+ if (buildIndex) {
+ FastaIndex* fai = new FastaIndex();
+ //cerr << "generating fasta index file for " << fastaFileName << endl;
+ fai->indexReference(fastaFileName);
+ fai->writeIndexFile((string) fastaFileName + fai->indexFileExtension());
+ }
+
+ string sequence; // holds sequence so we can optionally process it
+
+ FastaReference fr;
+ fr.open(fastaFileName);
+
+ if (dump) {
+ for (vector<string>::iterator s = fr.index->sequenceNames.begin(); s != fr.index->sequenceNames.end(); ++s) {
+ cout << *s << "\t" << fr.getSequence(*s) << endl;
+ }
+ return 0;
+ }
+
+ if (region != "") {
+ FastaRegion target(region);
+ sequence = fr.getTargetSubSequence(target);
+ }
+
+ if (readRegionsFromStdin) {
+ string regionstr;
+ while (getline(cin, regionstr)) {
+ FastaRegion target(regionstr);
+ if (target.startPos == -1) {
+ cout << fr.getSequence(target.startSeq) << endl;
+ } else {
+ cout << fr.getSubSequence(target.startSeq, target.startPos - 1, target.length()) << endl;
+ }
+ }
+ } else {
+ if (sequence != "") {
+ if (printEntropy) {
+ if (sequence.size() > 0) {
+ cout << shannon_H((char*) sequence.c_str(), sequence.size()) << endl;
+ } else {
+ cerr << "please specify a region or sequence for which to calculate the shannon entropy" << endl;
+ }
+ } else { // if no statistical processing is requested, just print the sequence
+ cout << sequence << endl;
+ }
+ }
+ }
+
+ return 0;
+}
diff --git a/LargeFileSupport.h b/LargeFileSupport.h
new file mode 100644
index 0000000..b90e585
--- /dev/null
+++ b/LargeFileSupport.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#define _FILE_OFFSET_BITS 64
+#ifdef WIN32
+#define ftell64(a) _ftelli64(a)
+#define fseek64(a,b,c) _fseeki64(a,b,c)
+typedef __int64 off_type;
+#elif defined(__APPLE__) || defined(__FreeBSD__)
+#define ftell64(a) ftello(a)
+#define fseek64(a,b,c) fseeko(a,b,c)
+typedef off_t off_type;
+#else
+#define ftell64(a) ftello(a)
+#define fseek64(a,b,c) fseeko(a,b,c)
+typedef __off64_t off_type;
+#endif
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e397326
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,42 @@
+
+# Use ?= to allow overriding from the env or command-line
+CXX ?= g++
+CXXFLAGS ?= -O3
+PREFIX ?= ./stage
+STRIP_CMD ?= strip
+INSTALL ?= install -c
+MKDIR ?= mkdir -p
+
+# Required flags that we shouldn't override
+CXXFLAGS += -D_FILE_OFFSET_BITS=64
+
+OBJS = Fasta.o FastaHack.o split.o disorder.o
+
+all: fastahack
+
+fastahack: $(OBJS)
+ $(CXX) $(CXXFLAGS) $(OBJS) -o fastahack
+
+FastaHack.o: Fasta.h FastaHack.cpp
+ $(CXX) $(CXXFLAGS) -c FastaHack.cpp
+
+Fasta.o: Fasta.h Fasta.cpp
+ $(CXX) $(CXXFLAGS) -c Fasta.cpp
+
+split.o: split.h split.cpp
+ $(CXX) $(CXXFLAGS) -c split.cpp
+
+disorder.o: disorder.c disorder.h
+ $(CXX) $(CXXFLAGS) -c disorder.c
+
+install: fastahack
+ $(MKDIR) $(DESTDIR)$(PREFIX)/bin
+ $(INSTALL) fastahack $(DESTDIR)$(PREFIX)/bin
+
+install-strip: install
+ $(STRIP_CMD) $(DESTDIR)$(PREFIX)/bin/fastahack
+
+clean:
+ rm -rf fastahack *.o stage
+
+.PHONY: clean
diff --git a/README b/README
new file mode 100644
index 0000000..a33f5d9
--- /dev/null
+++ b/README
@@ -0,0 +1,70 @@
+fastahack --- *fast* FASTA file indexing, subsequence and sequence extraction
+
+Author: Erik Garrison <erik.garrison at bc.edu>, Marth Lab, Boston College
+Date: May 7, 2010
+
+
+Overview:
+
+fastahack is a small application for indexing and extracting sequences and
+subsequences from FASTA files. The included Fasta.cpp library provides a FASTA
+reader and indexer that can be embedded into applications which would benefit
+from directly reading subsequences from FASTA files. The library automatically
+handles index file generation and use.
+
+
+Features:
+
+ - FASTA index (.fai) generation for FASTA files
+ - Sequence extraction
+ - Subsequence extraction
+ - Sequence statistics (TODO: currently only entropy is provided)
+
+Sequence and subsequence extraction use fseek64 to provide fastest-possible
+extraction without RAM-intensive file loading operations. This makes fastahack
+a useful tool for bioinformaticists who need to quickly extract many
+subsequences from a reference FASTA sequence.
+
+
+Notes:
+
+The index files generated by this system should be numerically equivalent to
+those generated by samtools (http://samtools.sourceforge.net/). However, while
+samtools truncates sequence names in the index file, fastahack provides them
+completely.
+
+To simplify use, sequences can be addressed by first whitespace-separated
+field; e.g. "8 SN(Homo sapiens) GA(HG18) URI(NC_000008.9)" can be addressed
+simply as "8", provided "8" is a unique first-field name in the FASTA file.
+Thus, to extract 20bp starting at position 323202 in chromosome 8 from the
+human reference:
+
+ % fastahack -r 8:323202..20 h.sapiens.fasta
+ ACATTGTAATAGATCTCAGA
+
+Usage information is provided by running fastahack with no arguments:
+
+ % usage: fastahack [options] <fasta reference>
+
+ options:
+ -i, --index generate fasta index <fasta reference>.fai
+ -r, --region REGION print the specified region
+ -c, --stdin read a stream of line-delimited region specifiers on stdin
+ and print the corresponding sequence for each on stdout
+ -e, --entropy print the shannon entropy of the specified region
+
+ REGION is of the form <seq>, <seq>:<start>..<end>, <seq1>:<start>..<seq2>:<end>
+ where start and end are 1-based, and the region includes the end position.
+ Specifying a sequence name alone will return the entire sequence, specifying
+ range will return that range, and specifying a single coordinate pair, e.g.
+ <seq>:<start> will return just that base.
+
+
+Limitations:
+
+fastahack will only generate indexes for FASTA files in which the sequences
+have self-consistent line lengths. Trailing whitespace is allowed at the end
+of sequences, but not embedded within the sequence. These limitations are
+necessitated by the complexity of indexing sequences whose lines change in
+length--- the use of indexes is frustrated by such inconsistencies; each change
+in line length would require a new entry in the index file.
diff --git a/Region.h b/Region.h
new file mode 100644
index 0000000..cd5fb57
--- /dev/null
+++ b/Region.h
@@ -0,0 +1,51 @@
+#ifndef FASTA_REGION_H
+#define FASTA_REGION_H
+
+#include <string>
+#include <stdlib.h>
+
+using namespace std;
+
+class FastaRegion {
+public:
+ string startSeq;
+ int startPos;
+ int stopPos;
+
+ FastaRegion(string& region) {
+ startPos = -1;
+ stopPos = -1;
+ size_t foundFirstColon = region.find(":");
+ // we only have a single string, use the whole sequence as the target
+ if (foundFirstColon == string::npos) {
+ startSeq = region;
+ } else {
+ startSeq = region.substr(0, foundFirstColon);
+ size_t foundRangeDots = region.find("..", foundFirstColon);
+ size_t foundRangeDash = region.find("-", foundFirstColon);
+ if (foundRangeDots == string::npos && foundRangeDash == string::npos) {
+ startPos = atoi(region.substr(foundFirstColon + 1).c_str());
+ stopPos = startPos; // just print one base if we don't give an end
+ } else {
+ if (foundRangeDash == string::npos) {
+ startPos = atoi(region.substr(foundFirstColon + 1, foundRangeDots - foundRangeDots - 1).c_str());
+ stopPos = atoi(region.substr(foundRangeDots + 2).c_str()); // to the start of this chromosome
+ } else {
+ startPos = atoi(region.substr(foundFirstColon + 1, foundRangeDash - foundRangeDash - 1).c_str());
+ stopPos = atoi(region.substr(foundRangeDash + 1).c_str()); // to the start of this chromosome
+ }
+ }
+ }
+ }
+
+ int length(void) {
+ if (stopPos > 0) {
+ return stopPos - startPos + 1;
+ } else {
+ return 1;
+ }
+ }
+
+};
+
+#endif
diff --git a/disorder.c b/disorder.c
new file mode 100644
index 0000000..a5f7c35
--- /dev/null
+++ b/disorder.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+ * libdisorder: A Library for Measuring Byte Stream Entropy
+ * Copyright (C) 2010 Michael E. Locasto
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the:
+ * Free Software Foundation, Inc.
+ * 59 Temple Place, Suite 330
+ * Boston, MA 02111-1307 USA
+ *
+ * $Id$
+ **************************************************************************/
+
+#include <math.h> //for log2()
+#include <stdio.h> //for NULL
+#include "disorder.h"
+
+#if defined(__FreeBSD__)
+#define log2(x) (log((x)) * (1./M_LN2))
+#endif
+
+/** Frequecies for each byte */
+static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample
+static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing)
+static int m_num_tokens = 0; //actual number of `seen' tokens, max 256
+static float m_maxent = 0.0;
+static float m_ratio = 0.0;
+static int LIBDISORDER_INITIALIZED = 0;
+
+static void
+initialize_lib()
+{
+ int i = 0;
+ if(1==LIBDISORDER_INITIALIZED)
+ return;
+
+ m_num_tokens = 0;
+
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ m_token_freqs[i]=0;
+ m_token_probs[i]=0.0;
+ }
+
+ LIBDISORDER_INITIALIZED = 1;
+}
+
+/**
+ * Set m_num_tokens by iterating over m_token_freq[] and maintaining
+ * a counter of each position that does not hold the value of zero.
+ */
+static void
+count_num_tokens()
+{
+ int i = 0;
+ int counter = 0;
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ if(0!=m_token_freqs[i])
+ {
+ counter++;
+ }
+ }
+ m_num_tokens = counter;
+ return;
+}
+
+/**
+ * Sum frequencies for each token (i.e., byte values 0 through 255)
+ * We assume the `length' parameter is correct.
+ *
+ * This function is available only to functions in this file.
+ */
+static void
+get_token_frequencies(char* buf,
+ long long length)
+{
+ int i=0;
+ char* itr=NULL;
+ unsigned char c=0;
+
+ itr = buf;
+
+ //reset number of tokens
+ m_num_tokens = 0;
+
+ //make sure freqency and probability arrays are cleared
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ m_token_freqs[i] = 0;
+ m_token_probs[i] = 0.0;
+ }
+
+ for(i=0;i<length;i++)
+ {
+ c = (unsigned char)*itr;
+ //assert(0<=c<LIBDO_MAX_BYTES);
+ m_token_freqs[c]++;
+ itr++;
+ }
+}
+
+/**
+ * Return entropy (in bits) of this buffer of bytes. We assume that the
+ * `length' parameter is correct. This implementation is a translation
+ * of the PHP code found here:
+ *
+ * http://onlamp.com/pub/a/php/2005/01/06/entropy.html
+ *
+ * with a helpful hint on the `foreach' statement from here:
+ *
+ * http://php.net/manual/en/control-structures.foreach.php
+ */
+float
+shannon_H(char* buf,
+ long long length)
+{
+ int i = 0;
+ float bits = 0.0;
+ char* itr=NULL; //values of itr should be zero to 255
+ unsigned char token;
+ int num_events = 0; //`length' parameter
+ float freq = 0.0; //loop variable for holding freq from m_token_freq[]
+ float entropy = 0.0; //running entropy sum
+
+ if(NULL==buf || 0==length)
+ return 0.0;
+
+ if(0==LIBDISORDER_INITIALIZED)
+ initialize_lib();
+
+ itr = buf;
+ m_maxent = 0.0;
+ m_ratio = 0.0;
+ num_events = length;
+ get_token_frequencies(itr, num_events); //modifies m_token_freqs[]
+ //set m_num_tokens by counting unique m_token_freqs entries
+ count_num_tokens();
+
+ if(m_num_tokens>LIBDO_MAX_BYTES)
+ {
+ //report error somehow?
+ return 0.0;
+ }
+
+ //iterate through whole m_token_freq array, but only count
+ //spots that have a registered token (i.e., freq>0)
+ for(i=0;i<LIBDO_MAX_BYTES;i++)
+ {
+ if(0!=m_token_freqs[i])
+ {
+ token = i;
+ freq = ((float)m_token_freqs[token]);
+ m_token_probs[token] = (freq / ((float)num_events));
+ entropy += m_token_probs[token] * log2(m_token_probs[token]);
+ }
+ }
+
+ bits = -1.0 * entropy;
+ m_maxent = log2(m_num_tokens);
+ m_ratio = bits / m_maxent;
+
+ return bits;
+}
+
+int
+get_num_tokens()
+{
+ return m_num_tokens;
+}
+
+float
+get_max_entropy()
+{
+ return m_maxent;
+}
+
+float
+get_entropy_ratio()
+{
+ return m_ratio;
+}
diff --git a/disorder.h b/disorder.h
new file mode 100644
index 0000000..3458774
--- /dev/null
+++ b/disorder.h
@@ -0,0 +1,62 @@
+/***************************************************************************
+ * libdisorder: A Library for Measuring Byte Stream Entropy
+ * Copyright (C) 2010 Michael E. Locasto
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the:
+ * Free Software Foundation, Inc.
+ * 59 Temple Place, Suite 330
+ * Boston, MA 02111-1307 USA
+ *
+ * $Id$
+ **************************************************************************/
+
+#ifndef __DISORDER_H_
+#define __DISORDER_H_
+
+/** Max number of bytes (i.e., tokens) */
+#define LIBDO_MAX_BYTES 256
+
+/** A convienance value for clients of this library. Feel free to change
+ * if you plan to use a larger buffer. You can also safely ignore it, as
+ * libdisorder does not use this value internally; it relies on the
+ * client-supplied `length' parameter.
+ *
+ * NB: Might become deprecated because it is potentially misleading and
+ * has zero relationship to any library internal state.
+ */
+#define LIBDO_BUFFER_LEN 16384
+
+/**
+ * Given a pointer to an array of bytes, return a float indicating the
+ * level of entropy in bits (a number between zero and eight),
+ * assuming a space of 256 possible byte values. The second argument
+ * indicates the number of bytes in the sequence. If this sequence
+ * runs into unallocated memory, this function should fail with a
+ * SIGSEGV.
+ */
+float shannon_H(char*, long long);
+
+/** Report the number of (unique) tokens seen. This is _not_ the
+ number of individual events seen. For example, if the library sees
+ the string `aaab', the number of events is 4 and the number of
+ tokens is 2. */
+int get_num_tokens(void);
+
+/** Returns maximum entropy for byte distributions log2(256)=8 bits*/
+float get_max_entropy(void);
+
+/** Returns the ratio of entropy to maxentropy */
+float get_entropy_ratio(void);
+
+#endif
diff --git a/libdisorder.LICENSE b/libdisorder.LICENSE
new file mode 100644
index 0000000..8bbcff9
--- /dev/null
+++ b/libdisorder.LICENSE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/split.cpp b/split.cpp
new file mode 100644
index 0000000..5f1dc4e
--- /dev/null
+++ b/split.cpp
@@ -0,0 +1,33 @@
+#include "split.h"
+
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
+ std::stringstream ss(s);
+ std::string item;
+ while(std::getline(ss, item, delim)) {
+ elems.push_back(item);
+ }
+ return elems;
+}
+
+std::vector<std::string> split(const std::string &s, char delim) {
+ std::vector<std::string> elems;
+ return split(s, delim, elems);
+}
+
+std::vector<std::string> &split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
+ char* tok;
+ char cchars [s.size()+1];
+ char* cstr = &cchars[0];
+ strcpy(cstr, s.c_str());
+ tok = strtok(cstr, delims.c_str());
+ while (tok != NULL) {
+ elems.push_back(tok);
+ tok = strtok(NULL, delims.c_str());
+ }
+ return elems;
+}
+
+std::vector<std::string> split(const std::string &s, const std::string& delims) {
+ std::vector<std::string> elems;
+ return split(s, delims, elems);
+}
diff --git a/split.h b/split.h
new file mode 100644
index 0000000..bd5525d
--- /dev/null
+++ b/split.h
@@ -0,0 +1,20 @@
+#ifndef __SPLIT_H
+#define __SPLIT_H
+
+// functions to split a string by a specific delimiter
+#include <string>
+#include <vector>
+#include <sstream>
+#include <string.h>
+
+// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803
+
+// split a string on a single delimiter character (delim)
+std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems);
+std::vector<std::string> split(const std::string &s, char delim);
+
+// split a string on any character found in the string of delimiters (delims)
+std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems);
+std::vector<std::string> split(const std::string &s, const std::string& delims);
+
+#endif
diff --git a/tests/correct.fasta b/tests/correct.fasta
new file mode 100644
index 0000000..99af0c0
--- /dev/null
+++ b/tests/correct.fasta
@@ -0,0 +1,30 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT
diff --git a/tests/embedded_newline.fasta b/tests/embedded_newline.fasta
new file mode 100644
index 0000000..26b21e7
--- /dev/null
+++ b/tests/embedded_newline.fasta
@@ -0,0 +1,35 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+
+
+
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT
diff --git a/tests/mismatched_lines.fasta b/tests/mismatched_lines.fasta
new file mode 100644
index 0000000..56a7020
--- /dev/null
+++ b/tests/mismatched_lines.fasta
@@ -0,0 +1,30 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
diff --git a/tests/trailing_newlines.fasta b/tests/trailing_newlines.fasta
new file mode 100644
index 0000000..377513f
--- /dev/null
+++ b/tests/trailing_newlines.fasta
@@ -0,0 +1,34 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+
+
+
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libfastahack.git
More information about the debian-med-commit
mailing list