[med-svn] [Git][med-team/spaced][upstream] New upstream version 1.2.0-201605+dfsg
Andreas Tille
gitlab at salsa.debian.org
Mon Jul 9 12:08:28 BST 2018
Andreas Tille pushed to branch upstream at Debian Med / spaced
Commits:
3f1306ad by Andreas Tille at 2018-07-09T10:39:47+02:00
New upstream version 1.2.0-201605+dfsg
- - - - -
13 changed files:
- Makefile
- − extkey.cpp
- − extkey.h
- − patternset.cpp
- − patternset.h
- + src/patternset.cpp
- + src/patternset.h
- sort.h → src/sort.h
- spaced.cc → src/spaced.cc
- + src/variance.cpp
- + src/variance.h
- − variance.cpp
- − variance.h
Changes:
=====================================
Makefile
=====================================
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
all:
- g++ -O3 -std=c++11 spaced.cc variance.cpp patternset.cpp extkey.cpp -o spaced -fopenmp
+ g++ -O3 -fopenmp -std=c++11 src/spaced.cc src/variance.cpp src/patternset.cpp -o spaced
=====================================
extkey.cpp deleted
=====================================
--- a/extkey.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
- * It is possible to improve your patternset and read patterns from a file.
- *
- * extended key object file
- *
- * For theory please have a look at:
- *
- * B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
- * Estimating evolutionary distances between genomic sequences from spaced-word matches
- * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
- *
- *
- * @author: Lars Hahn - 26.10.2015, Georg-August-Universitaet Goettingen
- * @version: 1.0.2 11/2015
- */
-#include "extkey.h"
-
-/*---Variables---------------------------------------------------------------*/
-
-
-/*===Main-Part===============================================================*/
-/*---Constructor-& Init------------------------------------------------------*/
-extkey::extkey(){
- extkey(1,1);
-}
-
-extkey::extkey(int pos, double value){
- this->pos = pos;
- this->value = value;
-}
-
-
-/*---Functions---------------------------------------------------------------*/
-int extkey::GetPos(){
- return pos;
-}
-
-double extkey::GetValue(){
- return value;
-}
-
-void extkey::SetValue(double value){
- this->value = value;
-}
-
-bool extkey::operator < (extkey const& p) const{
- return (value < p.value);
-}
-
-bool extkey::operator <= (extkey const& p) const{
- return (value <= p.value);
-}
-
-bool extkey::operator > (extkey const& p) const{
- return (value > p.value);
-}
-
-bool extkey::operator >= (extkey const& p) const{
- return (value >= p.value);
-}
-
-bool extkey::operator == (extkey const& p) const{
- return (value == p.value);
-}
-
-bool extkey::operator || (extkey const& p) const{
- return (value || p.value);
-}
-
-bool extkey::operator != (extkey const& p) const{
- return (value != p.value);
-}
=====================================
extkey.h deleted
=====================================
--- a/extkey.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
- * It is possible to improve your patternset and read patterns from a file.
- *
- * extended key object header
- *
- * For theory please have a look at:
- *
- * B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
- * Estimating evolutionary distances between genomic sequences from spaced-word matches
- * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
- *
- *
- * @author: Lars Hahn - 26.10.2015, Georg-August-Universitaet Goettingen
- * @version: 1.0.2 11/2015
- */
-#ifndef EXTKEY_H_
-#define EXTKEY_H_
-
-class extkey{
- public:
- extkey();
- extkey(int pos, double value);
-
- int GetPos();
- double GetValue();
- void SetValue(double value);
-
- bool operator < (extkey const& p) const;
- bool operator <= (extkey const& p) const;
- bool operator > (extkey const& p) const;
- bool operator >= (extkey const& p) const;
- bool operator == (extkey const& p) const;
- bool operator != (extkey const& p) const;
- bool operator || (extkey const& p) const;
-
- private:
- int pos;
- double value;
-};
-#endif
=====================================
patternset.cpp deleted
=====================================
--- a/patternset.cpp
+++ /dev/null
@@ -1,1050 +0,0 @@
-/**
- * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
- * It is possible to improve your patternset and read patterns from a file.
- *
- * patternset object file
- *
- * For theory please have a look at:
- *
- * B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
- * Estimating evolutionary distances between genomic sequences from spaced-word matches
- * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
- *
- *
- * @author: Lars Hahn - 26.10.2015, Georg-August-Universitaet Goettingen
- * @version: 1.0.2 11/2015
- */
-#include "patternset.h"
-
-
-/*---Variables---------------------------------------------------------------*/
-
-std::default_random_engine generator(std::random_device{}());
-
-/*===Main-Part===============================================================*/
-/*---Constructor-------------------------------------------------------------*/
-/**
- * Default constructor, sets the default vaulues, pattern will be generated automatically.
- */
-patternset::patternset() {
- this->size = 10;
- this->length = new int[2];
- this->length[0] = 14;
- this->length[1] = 14;
- this->weight = 8;
- patternset(NULL, size, length, weight);
-}
-
-
-/**
- * File constructor, sets values only from files; resets automatically if there are problems.
- */
-patternset::patternset(char* pattern_file) {
- this->pattern_file = pattern_file;
- this->length = new int[2];
- this->length[0] = 14;
- this->length[1] = 14;
- patternset(pattern_file, 10, length, 8);
-}
-
-/**
- * Short constructor, sets some default vaulues, just pattern dimension is set.
- *
- * @param size
- * The amount of patterns; pattern number.
- *
- * @param length
- * The pattern length for each pattern of the pattern set.
- *
- * @param weigth
- * The weight (match positions; '1') for each pattern of the pattern set.
- */
-patternset::patternset(int size, int *length, int weight) {
- patternset(NULL, size, length, weight);
-}
-
-
-/**
- * Long constructor, sets the values; resets automatically, if there are problems.
- *
- * @param pattern_file File, that may contains submitted pattern
- *
- * @param align_file File, that may contains an alignment file to estimate p, q, l_hom, li and lj
- *
- * @param size The amount of patterns; pattern number.
- *
- * @param length The pattern length for each pattern of the pattern set.
- *
- * @param weigth The weight (match positions; '1') for each pattern of the pattern set.
- *
- * @param l_hom In theory, the amount of homologous positions of two sequences in an multiple alignment.
- *
- * @param l1 In theory, the first(represents each sequence i) sequence length of two observed sequences.
- *
- * @param l2 In theory, the second(represents each sequence j) sequence length of two observed sequences.
- *
- * @param p The match probability ( = #matches / #l_hom)
- *
- * @param q The background probability for each nucleotide A,C,G,T
- */
-patternset::patternset(char* pattern_file, int size, int *length, int weight) {
- this->pattern_file = pattern_file;
- this->size = size;
- this->length = length;
- this->weight = weight;
- update = false;
- improve = true;
- silent = false;
- randpatleng = false;
- ReInitPattern();
-}
-
-/**
- * Default destructor, deletes all vectors and matrices in the object.
- */
-patternset::~patternset() {
- delete[] length;
- Clear();
-}
-
-/*---Init--------------------------------------------------------------------*/
-
-/**
- * Creates for the submitted or default values a set of pattern and calculates the first variance.
- * If possible estimates p, q, all sequences lengths and all combinations of homologous sequence positions
- *
- * Also reset Pattern if needed, not necessary to create new object
- */
-void patternset::ReInitPattern() { /*====Main-TODO=====*/
- uint64_t tmp;
- int lgth, lgt;
- Clear();
-
- lgth = 0;
-
- if (pattern_file != NULL) {
- TestPattern();
- }
- if(string_pat.size()== 0){
- VerifyConditions();
- CreateRandomPattern();
- }
- else{
- lengths.clear();
- for(int i = 0; i < size; i++){
- tmp = ToBit(string_pat[i]);
- pattern_set.push_back(tmp);
- lgt = (int)(log2(tmp))+1;
- lengths.push_back(lgt);
- }
- length[0] = lengths[0];
- length[1] = length[0];
- for(int i = 1; i < size; i++){
- if(length[0] > lengths[i]){
- length[0] = lengths[i];
- }
- if(length[1] < lengths[i]){
- length[1] = lengths[i];
- }
- }
- }
- for(int i = 0; i < size; i++){
- lgth += lengths[i];
- }
- length_mean = lgth/size;
-
- return;
-}
-/**
- * Due to some cases, it is possible necessary to
- * reset some pattern parameters.
- *
- * This is not a complete delete, so therefore it
- * it is only a part of the destructor.
- */
-void patternset::Clear(){
- lengths.clear();
- pattern_set.clear();
- string_pat.clear();
-}
-
-
-/*---Functions---------------------------------------------------------------*/
-/**
- * If there is an submitted pattern_file, it will check, if it is containing
- * a possible pattern, and if this pattern is in the correct format.
- * Pattern can be parsed in a different formats
- */
-void patternset::TestPattern() {
- std::ifstream patternfile;
- std::vector<std::string> pattern_tmp;
- std::string tmp;
- char tokens[4] = { '.',' ',',',';' }; /*These tokens are allowed to seperate patterns*/
- size_t f_size;
- bool start;
-
- start = false;
-
- patternfile.open(pattern_file);
- patternfile.seekg(0, std::ios::end);
- f_size = (size_t)patternfile.tellg();
- if (!patternfile) {
- SecureMessage("file", -1);
- }
- else if (f_size == 0) { /*[FILE].eof() does not recognize empty files...*/
- SecureMessage("empty", -1);
- }
- else {
- patternfile.close(); /*..therefore it has also to be closed and opened -.-** */
- patternfile.open(pattern_file);
- if(!silent){
- std::cout << "Reading pattern from submitted patternfile ...\n" << std::endl;
- }
- patternfile >> tmp;
- while (!patternfile.eof()) {
- if (!ValidatePatternsFormat(tmp)) {
- patternfile >> tmp; /*Ignoring each incorrect pattern. It is easier to calculate with all the rests*/
- }
- else {
- pattern_tmp = SplitString(tmp, tokens);
- for (unsigned int i = 0; i < pattern_tmp.size(); i++) {
- string_pat.push_back(pattern_tmp[i]); /*Each pattern needs to be saved in its own std::string for comparison*/
- }
- patternfile >> tmp;
- }
- }
- if (size > 0) { /*For an empty set we do not habe to validate*/
- start = ValidatePatternConditions();
- }
- if (!start) { /*Some conditions, like changed weight is too much much amount of work*/
- SecureMessage("pattern", -1); /*Therefore we just use our default values*/
- size = 10;
- weight = 8;
- length = new int[2];
- length[0] = 14;
- length[1] = 14;
- string_pat.clear(); /*Do not forget to reset, or the pattern set will not replaces, just increased*/
- }
- else {
- size = (int )string_pat.size();
- length = PatternLength(string_pat);
- weight = PatternWeight(string_pat[0]);
- if(!silent){
- for (int i = 0; i < size - 1; i++) {
- std::cout << string_pat[i] << "\n";
- }
- std::cout << string_pat[size - 1] << std::endl;
- std::cout << "\n... Done!\n" << std::endl;
- }
- }
- }
- patternfile.close();
-}
-
-
-/**
- * It will Verify the pattern conditions: is the weight correct,
- * not negative values, weight not above length and so on..
- */
-void patternset::VerifyConditions(){
- std::vector<int> leng_new;
- int tmp, diff, leng_old;
-
- if(size <= 0){
- SecureMessage("size", -1);
- size = 10;
- update = true;
- }
- if(length[1] < length[0]){
- tmp = length[1];
- length[1] = length[0];
- length[0] = tmp;
- }
- if(weight < 2){
- SecureMessage("weight_pat", -1);
- weight = 8;
- update = true;
- }
-
- if(length[0] < 3 || length[1] < 3){
- SecureMessage("length",-1);
- diff = length[1] - length[0];
- length[0] = weight*2;
- length[1] = length[0]+diff;
- update = true;
- }
-
- diff = length[1]-length[0];
- leng_old = length[0];
- if(length[1] > 63){ /*max length of 64-Bit Integer*/
- SecureMessage("length",-1);
- if(diff >= 63-weight){
- diff = 63-weight-1;
- }
- length[1] = 63;
- length[0] = length[1]-diff;
- update = true;
- }
- if(length[0] < weight){
- SecureMessage("length",-1);
- if(length[0] != leng_old){
- length[0] = leng_old;
- }
- else{
- length[0] = weight+2;
- }
- if(length[0] > length[1]){
- length[0] = length[1];
- }
- update = true;
- }
-
- if(length[1] < weight || length[0] < weight || length[0] > 63 || length[1] > 63){
- SecureMessage("nkl", -1); /*...if everything is fucked up and rescue does not work...*/
- length[0] = 14;
- length[1] = 14;
- weight = 7;
- update = true;
- }
-
- if(length[0] == weight && length[1] == weight){
- if(size > 1){
- SecureMessage("max_number_pattern",-1);
- size = 1;
- update = true;
- }
- if(improve){
- improve = false;
- SecureMessage("noimprove",-1);
- update = true;
- }
- }
-
- CreateLengths();
-
- tmp = 1;
- diff = 0;
- if(lengths[0] == weight && size > 1){
- diff++;
- if(lengths[1] == lengths[0]){
- lengths[1]++;
- }
- }
-
- for(int i = 1+diff; i < size; i++){
- if(lengths[i] < lengths[i-1]){
- lengths[i] = lengths[i-1];
- }
- if(lengths[i] == lengths[i-1]){
- tmp++;
- if(tmp > MaxNumberPattern(weight-2, lengths[i]-2)){
- lengths[i]++;
- if(lengths[i] > length[1]){
- lengths[i] = length[1];
- size = i;
- if(size <= 0){
- size = 1;
- }
- }
- tmp = 1;
- }
- }
- else{
- tmp = 1;
- }
- }
-
-
- if(size != (int)lengths.size()){
- SecureMessage("max_number_pattern",-1);
- for(int i = 0; i < size; i++){
- leng_new.push_back(lengths[i]);
- }
- lengths.clear();
- lengths = leng_new;
- update=true;
- }
- return;
-}
-
-
-
-/*---Functions---------------------------------------------------------------*/
-/*---Func-Create-------------------------------------------------------------*/
-/**
- * Splits a string, read by a pattern file. Mayby each pattern does not get
- * a new line, it has to be parsed, when a pattern starts and ends
- *
- * @param pattern_split The string containing a few pattern
- *
- * @param tokens The allowed tokens which can be used to seperate patterns in a line
- */
-std::vector<std::string> patternset::SplitString(std::string pattern_split, char* tokens) {
- std::vector<std::string> patternset;
- std::string tmp = "";
- bool flag_token = false;
-
- for (unsigned int i = 0; i < pattern_split.length(); i++) {
- for (unsigned int j = 0; j < strlen(tokens); j++) {
- if (pattern_split[i] == tokens[j]) {
- flag_token = true;
- }
- }
- if (flag_token) { /*token found, which means in one line more patterns --> start new pattern, save last pattern*/
- patternset.push_back(tmp);
- tmp = "";
- }
- else {
- tmp = tmp + pattern_split[i]; /*concatenating patternparts*/
- }
- flag_token = false;
- }
- patternset.push_back(tmp);
- return patternset;
-}
-
-/**
- * Validates a pattern, if it contains only pattern symbols and seperation tokens
- *
- * @param pattern_form The pattern which has to be investigate for symbols and tokens
- *
- * @return Returns true if this one pattern is in right format, false else
- */
-bool patternset::ValidatePatternsFormat(std::string pattern_form) {
- bool flag = true;
-
- for (unsigned int i = 0; i < pattern_form.length(); i++) { /*allowed tokens in patternformat, also separating tokens*/
- if (pattern_form[i] != '1' && pattern_form[i] != '0' && pattern_form[i] != ' ' && pattern_form[i] != ',' && pattern_form[i] != '.' && pattern_form[i] != ';') {
- flag = false;
- SecureMessage("format", -1);
- break;
- }
- }
- if (pattern_form[0] != '1' || pattern_form[pattern_form.length() - 1] != '1') {
- flag = false;
- SecureMessage("startend", -1);
- }
-
- return flag;
-}
-
-/**
- * Validates a pattern set, if all patterns have the same length and weight
- *
- * @return Returns true if this one pattern is in right format, false else
- */
-bool patternset::ValidatePatternConditions() {
- int com_weight, com_size;
- bool condition;
-
- com_size = (int) string_pat.size();
- com_weight = PatternWeight(string_pat[0]); /*as fixpoint saving first patternweight*/
- condition = true;
-
- for (int i = 1; i < com_size; i++) {
- if (PatternWeight(string_pat[i]) != com_weight) {
- SecureMessage("weight", i);
- condition = false;
- }
- }
- return condition;
-}
-
-/**
- * Estimates the weight of a pattern
- *
- * @param pattern_str The pattern which has to be investigate for the weight
- *
- * @return Returns true if this one pattern is in right format, false else
- */
-int patternset::PatternWeight(std::string pattern_wght) {
- int str_weight;
- int str_length;
-
- str_weight = 0;
- str_length = (int) pattern_wght.length();
-
- for (int i = 0; i < str_length; i++) {
- if (pattern_wght[i] == '1') {
- str_weight++;
- }
- }
- return str_weight;
-}
-
-/**
- * Estimates the weight of a pattern
- *
- * @param pattern_str The pattern which has to be investigate for the weight
- *
- * @return Returns true if this one pattern is in right format, false else
- */
-int* patternset::PatternLength(std::vector<std::string> pattern_length) {
- int* pat_length;
- int minp, maxp;
-
- pat_length = new int[2];
- minp = (int) pattern_length[0].length();
- maxp = (int) pattern_length[0].length();
-
- for (unsigned int i = 1; i < pattern_length.size(); i++) {
- if ((int)pattern_length[i].length() > maxp) {
- maxp = (int) pattern_length[i].length();
- }
- if ((int)pattern_length[i].length() < minp) {
- minp = (int) pattern_length[i].length();
- }
- }
- pat_length[0] = minp;
- pat_length[1] = maxp;
- return pat_length;
-}
-
-/**
- * This function will create for our lenghts intervall
- * for each pattern a specific length!.
- *
- */
-void patternset::CreateLengths(){
- double step, leng2;
- int leng, one;
-
- one = 0;
- leng = length[1];
- if(length[0] != length[1]){
- leng--;
- }
- if(length[0] == weight){
- one = 1;
- }
- std::uniform_int_distribution<int> pat_leng(length[0]+one, leng);
-
- if(randpatleng){
- for (int i = 0; i < size-1; i++) {
- if (i < (size / 4)) {
- lengths.push_back(length[1]);
- }
- else {
- lengths.push_back(pat_leng(generator));
- }
- }
- lengths.push_back(length[0]);
- }
- else{
- step = ((double)(length[1]-length[0]+1)/(size-1));
- leng = length[0]+one;
- leng2 = (double) leng;
- if(step == 0){
- step++;
- }
- for(int i = one; i < size; i++){
- if(leng > length[1]){
- leng = length[1];
- }
- lengths.push_back(leng);
- leng2 += step;
- leng = (int)leng2;
- }
- if(one==1){ /*if one pattern has to be without don't care positions*/
- lengths.push_back(length[0]);
- }
- }
- std::sort(lengths.begin(),lengths.end());
-}
-
-/**
- * Creates random a set of pattern. For convention a pattern has to start and end with '1'
- * Reason 10010 ~ 1001
- *
- * @return Returns a randomly created pattern set
- */
-void patternset::CreateRandomPattern() {
- std::vector<uint64_t> tmp;
- uint64_t prototype, position, one;
- int counter;
-
- bool flag;
-
- one = 1;
-
- std::sort(lengths.rbegin(),lengths.rend());
-
- for (int i = 0; i < size; i++) {
- flag = true;
- while(flag){
- flag = false;
- prototype = 1;
- prototype = ((prototype << (lengths[i] - 1))| 1); /*Prototype, first and last position is a match!*/
- std::uniform_int_distribution<int> distribution(1, lengths[i] - 2); /*better random generater than time, uses likelihood for evenly random distribution*/
- counter = 2;
- while (counter < weight) {
- position = (uint64_t) distribution(generator);
- if ((prototype & (one << position)) == 0) { /*have fun and fill with '1' randomly :D */
- counter++;
- prototype |= (one << position);
- }
- }
- for(int j = 0; j < i; j++){
- if(prototype == tmp[j]){
- flag = true;
- }
- }
- if(!flag){
- tmp.push_back(prototype);
- }
- }
- }
- for(int i = 0; i < size; i++){
- pattern_set.push_back(tmp[i]);
- string_pat.push_back(BitString(tmp[i]));
- }
- tmp.clear();
-}
-
-
-/**
- * Scans if there is another pattern in the same format
- *
- * @return returns boolean if there is another same pattern
- */
-bool patternset::UniqPattern(int number) {
- bool uniq = true;
- for (int i = 0; i < size; i++) {
- if (number != i) {
- if (pattern_set[i] == pattern_set[number]) {
- uniq = false;
- }
- }
- }
- return uniq;
-}
-
-/*--Func-Change--------------------------------------------------------------*/
-/**
- * Changes two different positions ('1' and '0') in a specific bit-pattern
- * Start and end are excluded
- *
- * @param number
- * The pattern which has to be modified
- */
-void patternset::ChangeBits(int number) {
- int pos1, pos0;
- bool flag = true;
-
- if(lengths[number] == weight){ /*only possible, if lenghts[size-1] == weight*/
- number--;
- }
-
- while (flag && improve) {
- flag = false;
- pos1 = SymbolRandPos(number, '1');
- pos0 = SymbolRandPos(number, '0');
- pattern_set[number] = ChangeBitPos(number, pos1, pos0);
- if (!UniqPattern(number)) {
- flag = true;
- }
- }
-}
-
-/**
- * Exchange of two positions; changing match to don't care and vice versa.
- *
- * @param pos The pattern index of the modifying pattern
- *
- * @param pos_one Matchposition which has to be a don't care
- *
- * @param pos_zero Don'tCare position which has to be a match
- *
- * @return The modified pattern as an 64-Bit integer
- */
-uint64_t patternset::ChangeBitPos(int pos, int pos_one, int pos_zero) {
- uint64_t diff, one_mask, mask, p;
-
- p = pattern_set[pos];
- diff = 1;
- diff = diff << pos_one;
- one_mask = 1;
-
- one_mask = one_mask << ((int)log2(p)+1);
- one_mask--;
-
- mask = one_mask ^ diff;
- p = p & mask;
-
- diff = 1;
- diff = diff << pos_zero;
- p = p | diff;
-
- return p;
-}
-
-/**
- * Returns for a specific symbol ('1', '0') a random chosen position
- *
- * @param number The pattern index of the modifying pattern
- *
- * @param symb The specific symbol, either '1' or '0'
- *
- * @return A random chosen position in the pattern of the specific symbol
- */
-int patternset::SymbolRandPos(int number, char symb) {
- std::vector<int> positions;
- int pos;
-
- positions = GetSymbol(number, symb);
-
- std::uniform_int_distribution<int> distribution(0, (int) positions.size() - 1);
-
- pos = distribution(generator);
- pos = positions[pos];
-
- positions.clear();
- return pos;
-}
-
-/**
- * Investigates for a specific symbol all positions of a pattern.
- * For a match, the first and last positions are excluded!
- *
- * @param number The pattern index of the modifying pattern
- *
- * @param symb The specific symbol, either '1' or '0'
- *
- * @return All positions of the pattern of the specific symbol
- */
-std::vector<int> patternset::GetSymbol(int number, char symb){
- std::vector<int> positions;
- int lgth;
- uint64_t tmp, chr;
-
- tmp = pattern_set[number];
- lgth = (int)(log2(tmp));
-
- chr = 0;
- if (symb == '1') {
- chr = 1;
- }
-
- for (int i = 1; i < lgth; i++) {
- if (((tmp >> i) & 1) == chr) {
- positions.push_back(i);
- }
- }
- return positions;
-}
-
-
-/*---stuff-------------------------------------------------------------------*/
-/**
-* Method to print the current pattern
-*/
-void patternset::Print() {
- ToString();
- for (int i = 0; i < size; i++) {
- std::cout << string_pat[i] << std::endl;;
- }
-}
-
-/**
- * Saves the current 64-Bit integer in the string pat as printable strings
- */
-void patternset::ToString() {
- for (int i = 0; i < size; i++) {
- string_pat[i] = BitString(pattern_set[i]);
- }
-}
-
-/**
- * Turns a string into a 64-Bit Integer as binary pattern
- *
- * @param p The number of the pattern in the string pattern set
- *
- * @return The binary representation of the input string
- */
-uint64_t patternset::ToBit(std::string p) {
- std::string tmp;
- uint64_t val, d;
- val = 0;
- d = 1;
-
- for (unsigned int i = 0; i < p.length(); i++) {
- if (p[i] == '1') {
- val += d << (p.length() - (i + 1));
- }
- }
- return val;
-}
-
-/**
- * Changes a pattern in 64-Bit integer format into a printable string
- *
- * @param p The pattern in 64-bit integer format
- *
- * @return The printable string of the input pattern
- */
-std::string patternset::BitString(uint64_t p) {
- std::string bit_string;
- int lgth;
- lgth = (int) log2(p);
-
- bit_string = "";
-
- for (int i = lgth; i >= 0; i--) {
- if (((p >> i) & 1) == 1) {
- bit_string += "1";
- }
- else {
- bit_string += "0";
- }
- }
- bit_string += "\0";
- return bit_string;
-}
-
-
-/**
- * Determines the maximum number of patterns with weight and length
- *
- *@param weight Complete pattern weight-2, start and end have to be match and do not change
- *
- *@param length Complete pattern lengt-2, start and end have to be match and do not change
- *
- *@return max number of possible pattern
- */
-double patternset::MaxNumberPattern(int p_weight, int p_length) {
- double tmpa = Faculty(p_length);
- double tmpb = Faculty(p_length - p_weight);
- double tmpc = Faculty(p_weight);
- double tmp = tmpa / (tmpb*tmpc);
- return tmp;
-}
-
-/**
- * Calculates the faculty
- *
- *@param value Calculating the faculty for value
- *
- *@return faculty(value) = 'value!'
- */
-double patternset::Faculty(int value) {
- double tmp;
- tmp = 1;
- for (int i = 1; i <= value; i++) {
- tmp *= i;
- }
- return tmp;
-}
-
-/**
- * A Method to collect all errormessages. Just easier for programmer to change
- * the text or extend.
- *
- * @param errmsg Due to a few possible errormessages, this is the option, which has to be printed.
- *
- * @param pos The position of the incorrect patterns.
- */
-void patternset::SecureMessage(std::string errmsg, int pos) {
- /*if (errmsg == "wrongindex") {
- std::cerr << "ERROR! Pattern " << pos << " does not exist... do nothing\n" << std::endl;
- return;
- }
- if (errmsg == "file") {
- std::cerr << "ERROR! Pattern file \'" << pattern_file << "\' could not be found!" << std::endl;
- std::cerr << "Return to submitted or default values\n" << std::endl;
- return;
- }
- if (errmsg == "empty") {
- std::cerr << "ERROR! File \'" << pattern_file << "\' is an empty file!" << std::endl;
- std::cerr << "Return to submitted or default values\n" << std::endl;
- return;
- }
- if (errmsg == "pattern") {
- std::cerr << "ERROR! Patternconditions from pattern file were not correct (different weight or length)!" << std::endl;
- std::cerr << "Return to submitted or default values\n" << std::endl;
- return;
- }
- if (errmsg == "nkl") {
- std::cerr << "ERROR! Wrong values for weight, pattern number or pattern length!" << std::endl;
- std::cerr << "Return to default values\n" << std::endl;
- return;
- }
- if (errmsg == "weight_pat") {
- std::cerr << "ERROR! Weight of a pattern cannot under 2!" << std::endl;
- std::cerr << "Return to submitted or default values\n" << std::endl;
- return;
- }
- if (errmsg == "format") {
- std::cerr << "FORMAT-ERROR: Pattern containes illegal characters!" << std::endl;
- std::cerr << "Allowed characters: '0','1' for pattern; ','|'.'|';'|' ' to seperate patterns" << std::endl;
- std::cerr << "Go on to next pattern.\n" << std::endl;
- return;
- }
- if (errmsg == "max_number_pattern") {
- std::cerr << "The number of patterns is too high for your configuration and will be reset!" << std::endl;
- return;
- }
- if (errmsg == "size") {
- std::cerr << "ERROR! The number of patterns has to be greater than 0!\n" << std::endl;
- return;
- }
- if (errmsg == "weight") {
- std::cerr << "ERROR! By comparing with the first pattern, the " << pos + 1 << ". pattern has a different weight!\n" << std::endl;
- return;
- }
- if(errmsg == "length"){
- std::cerr << "ERROR! The patternlengths has to be greater or equal the weight, greater than 2 and below 63!" << std::endl;
- return;
- }
- if (errmsg == "startend") {
- std::cerr << "FORMAT-ERROR: Pattern has to start and end with a match position '1' !\n" << std::endl;
- return;
- }
- if (errmsg == "noimprove") {
- std::cerr << "Using your pattern conditions it is not sensible to improve your pattern, sorry!" << std::endl;
- std::cerr << "Deactivating improve mode\n" << std::endl;
- return;
- }
- if (errmsg == "update"){
- std::cerr << "\n####IMPROTANT####\nDue to some configuration errors, your submitted parameters have been updated!" << std::endl;
- return;
- }*/
-}
-
-
-/*---Set-&-GetFunc-----------------------------------------------------------*/
-/**
- * Changes to silent output, nothing will be printed automatically!
- * Errors are going to be printed!
- */
-void patternset::Silent(){
- silent = true;
-}
-
-/**
- * Changes the patternlengths from evenly distributed to random pattern lengths.
- * One quarter has maximum length, the rest randomly distributed between
- * maximum and minimum length.
- */
-void patternset::RandPatLength(){
- if(!randpatleng){
- randpatleng = true;
- ReInitPattern();
- }
-}
-
-/**
- * Returns the weight of each pattern, the match positions
- *
- * @return returns weight
- */
-int patternset::GetWeight() {
- return weight;
-}
-
-/**
- * Returns the amount of patters; number of patterns
- *
- * @return returns size
- */
-int patternset::GetSize() {
- return size;
-}
-
-/**
- * Returns the length of each pattern
- *
- * @return returns length
- */
-int* patternset::GetLength() {
- return length;
-}
-
-/**
- * Returns the mean of all pattern lengths
- *
- * @return The mean of all lengths;
- */
-int patternset::GetLengthMean(){
- return length_mean;
-}
-
-/**
- * Returns a boolean, if an improvement is possible
- * Imagine, a improvment for pattern '101' is not possible!
- *
- * @return The boolean, if this improvement is possible
- */
-bool patternset::GetImprove() {
- return improve;
-}
-
-/**
- * Returns a boolean, if due to some configuration errors
- * values set by the user had to be updated to better configuration.
- * Imagine, the weight was above the pattern length.
- *
- * @return The boolean, if something had been updated.
- */
-bool patternset::GetUpdate() {
- return update;
-}
-
-
-/**
- * Returns a specific pattern of the pattern set in 64-Bit integer format
- *
- * @return specific string in 64-Bit integer format
- */
-uint64_t patternset::GetPattern(int number) {
- if (number >= size) {
- SecureMessage("wrongindex", number);
- return 0;
- }
- else {
- return pattern_set[number];
- }
-}
-
-/**
- * Returns complete pattern set
- *
- * @return complete pattern set
- */
-std::vector<std::string> patternset::GetStringPattern() {
- ToString();
- return string_pat;
-}
-
-/**
- * Returns only one pattern as a string from the current patternset.
- *
- * @param number The index of the chosen pattern
- *
- * @return The pattern as a printable string
- */
-std::string patternset::GetString(int number){
- if (number >= size) {
- SecureMessage("wrongindex", number);
- return NULL;
- }
- else {
- ToString();
- return string_pat[number];
- }
-}
-
-/**
- * Changes a pattern in 64-Bit integer format
- *
- * @param number The index of the pattern which will be changed
- *
- * @param patt The new pattern
- */
-void patternset::SetPattern(int number, uint64_t patt){
- if (number >= size) {
- SecureMessage("wrongindex", number);
- }
- else {
- pattern_set[number] = patt;
- }
-}
=====================================
patternset.h deleted
=====================================
--- a/patternset.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/**
- * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
- * It is possible to improve your patternset and read patterns from a file.
- *
- * patternset object header
- *
- * For theory please have a look at:
- *
- * B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
- * Estimating evolutionary distances between genomic sequences from spaced-word matches
- * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
- *
- *
- * @author: Lars Hahn - 26.10.2015, Georg-August-Universitaet Goettingen
- * @version: 1.0.2 11/2015
- */
-#ifndef PATTERNSET_H_
-#define PATTERNSET_H_
-
-
-#include <iostream>
-#include <fstream>
-#include <random>
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include "extkey.h"
-
-class patternset {
-public:
- patternset();
- patternset(char* pattern_file);
- patternset(int size, int *length, int weight);
- patternset(char* pattern_file, int size, int *length, int weight);
- ~patternset();
-
- void ReInitPattern();
-
- void ChangeBits(int number);
- uint64_t ChangeBitPos(int pos, int pos_one, int pos_zero);
- bool UniqPattern(int number);
-
- std::vector<std::string> GetStringPattern();
- std::string GetString(int number);
- uint64_t GetPattern(int number);
- int GetWeight();
- int GetSize();
- int* GetLength();
- int GetLengthMean();
- bool GetImprove();
- bool GetUpdate();
-
- double GetValue(int number);
- void SetPattern(int number, uint64_t patt);
- void Print();
- void Silent();
- void RandPatLength();
-
-protected:
- void Clear();
- void TestPattern();
- void VerifyConditions();
-
- std::vector<std::string> SplitString(std::string pattern, char* tokens);
- bool ValidatePatternsFormat(std::string pattern_form);
- bool ValidatePatternConditions();
- int PatternWeight(std::string pattern_wght);
- int* PatternLength(std::vector<std::string> pattern_length);
-
- void CreateLengths();
- void CreateRandomPattern();
- bool IsSetScore(int pattern);
- int SymbolRandPos(int number, char symb);
- int SymbolCalcPos(int number, char symb);
- std::vector<int> GetSymbol(int number, char symb);
-
- void ToString();
- uint64_t ToBit(std::string p);
- std::string BitString(uint64_t p);
-
- double MaxNumberPattern(int p_weight, int p_length);
- double Faculty(int value);
- void SecureMessage(std::string errmsg, int pos);
-
-private:
- std::vector<uint64_t> pattern_set;
- std::vector<std::string> string_pat;
- std::vector<int> lengths;
- int size;
- int *length;
- int length_mean;
- int weight;
- char* pattern_file = NULL;
- bool improve;
- bool update;
- bool silent;
- bool randpatleng;
-};
-#endif
=====================================
src/patternset.cpp
=====================================
--- /dev/null
+++ b/src/patternset.cpp
@@ -0,0 +1,1101 @@
+/**
+ * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
+ * It is possible to improve your patternset and read patterns from a file.
+ *
+ * patternset object file
+ *
+ * For theory please have a look at:
+ *
+ * - L. Hahn, C.-A. Leimeister, B. Morgenstern (2015)
+ * RasBhari: optimizing spaced seeds for database searching, read mapping and alignment-free sequence comparison
+ * arXiv:1511.04001[q-bio.GN]
+ *
+ * - B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
+ * Estimating evolutionary distances between genomic sequences from spaced-word matches
+ * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
+ *
+ *
+ * @author: Lars Hahn - 12.05.2016, Georg-August-Universitaet Goettingen
+ * @version: 1.2.0 05/2016
+ */
+#include "patternset.h"
+
+
+/*---Variables---------------------------------------------------------------*/
+std::default_random_engine generator(time(0)); //time(0)
+
+
+/*===Main-Part===============================================================*/
+/*---Constructor-------------------------------------------------------------*/
+/**
+ * Default constructor, sets the default vaulues.
+ */
+patternset::patternset(){
+ size = 10;
+ weight = 8;
+ max_dontcare = 6;
+ min_dontcare = 6;
+ silent = true;
+ isInFile = false;
+ bitmode = true;
+ random_length = false;
+ improve = true;
+ update = false;
+ initialized = false;
+}
+
+
+/**
+ * Constructor, sets some default vaulues, just pattern dimension is set.
+ *
+ * @param size Number of patterns for the patternset.
+ *
+ * @param weigth Number of match positions ('1') in each pattern.
+ *
+ * @param min_dontcare Number of minimum don't care positions ('0') in each pattern.
+ *
+ * @param max_dontcare Number of maximum don't care positions ('0') in each pattern.
+ *
+ * @param silent Boolean which (de)activates std::output on commandline (exclusive errors)
+ */
+patternset::patternset(uint32_t size, uint32_t weight, uint32_t min_dontcare, uint32_t max_dontcare, bool silent){
+ this->size = size;
+ this->weight = weight;
+ this->max_dontcare = max_dontcare;
+ this->min_dontcare = min_dontcare;
+ this->silent = silent;
+ isInFile = false;
+ bitmode = true;
+ random_length = false;
+ improve = true;
+ update = false;
+ size = 10;
+ weight = 8;
+ max_dontcare = 6;
+ min_dontcare = 6;
+ initialized = false;
+}
+
+
+/**
+ * File constructor, sets values only from files.
+ *
+ * @param inFile The file which contains a patternset.
+ *
+ * @param silent Boolean which (de)activates std::output on commandline (exclusive errors)
+ */
+patternset::patternset(const char *inFile, bool silent){
+ this->silent = silent;
+ this->inFile = inFile;
+ isInFile = true;
+ improve = true;
+ update = false;
+ bitmode = true;
+ size = 10;
+ weight = 8;
+ max_dontcare = 6;
+ min_dontcare = 6;
+ initialized = false;
+}
+
+
+/**
+ * Default destructor, deletes all vectors and matrices in the object.
+ */
+patternset::~patternset(){
+ pattern_set.clear();
+ bit_pattern_set.clear();
+ pattern_dontcare.clear();
+ pattern_improve.clear();
+}
+
+
+/*---Initialization----------------------------------------------------------*/
+/**
+ * Creates for the submitted or default values a set of pattern; if errors occur, these will be fixed.
+ * It is possible to get all patterns for a specific combination.
+ * Size and lenght can be reduced; each pattern is unique.
+ *
+ * @param random_length Boolean which (de)activates random generated pattern lengths.
+ */
+void patternset::Init(bool random_length){
+ bool read_error;
+
+ read_error = false;
+ this->random_length = random_length;
+
+ if(isInFile){
+ read_error = ReadPattern();
+ if(read_error){
+ SecureMessage("nkl", -1);
+ pattern_set.clear();
+ update = true;
+ }
+ else{
+ CheckParams();
+ improve = SetImprove();
+ if(bitmode){
+ PatternToBit();
+ }
+ }
+ }
+
+ if(!isInFile || read_error){
+ if(weight != 1){
+ CheckParams();
+ pattern_dontcare = CreateLengths();
+ pattern_set = CreatePattern();
+ improve = SetImprove();
+ if(bitmode){
+ PatternToBit();
+ }
+ }
+ else{
+ pattern_dontcare.push_back(0);
+ pattern_set = CreatePattern();
+ improve = false;
+ pattern_improve.push_back(improve);
+ PatternToBit();
+ }
+
+ }
+ initialized = true;
+}
+
+
+/**
+ * Standard Reinitialization of variance options, creates a new patternset.
+ */
+void patternset::ReInit(){
+ pattern_set.clear();
+ bit_pattern_set.clear();
+ pattern_dontcare.clear();
+ pattern_improve.clear();
+ bitmode = true;
+ improve = true;
+ update = false;
+ initialized = false;
+ Init(random_length);
+}
+
+
+/**
+ * Checks the first conditions of the patternset, in special cases resets them.
+ * Checks, if bitmode is possible.
+ */
+void patternset::CheckParams(){
+ if(size < 0){
+ size = std::abs(size);
+ if(size <= 0){
+ size = 10;
+ update = true;
+ }
+ }
+ if(max_dontcare < 0 || min_dontcare < 0){
+ max_dontcare = std::abs(max_dontcare);
+ min_dontcare = std::abs(min_dontcare);
+ }
+ if(max_dontcare < min_dontcare){
+ std::swap(max_dontcare,min_dontcare);
+ }
+ if(weight < 2){
+ if(weight < 0){
+ weight = std::abs(weight);
+ }
+ else{
+ if(size != 1){
+ size = 1;
+ update = true;
+ }
+ if(max_dontcare != 0 || max_dontcare != 0){
+ min_dontcare = 0;
+ max_dontcare = 0;
+ update = true;
+ }
+ }
+ }
+ if(weight == 2){
+ improve = false;
+ }
+ if(max_dontcare + weight > 63 || min_dontcare + weight > 63){
+ bitmode = false;
+ }
+ if(update && !silent){
+ SecureMessage("update",-1);
+ }
+}
+
+/*---Functions---------------------------------------------------------------*/
+/*---Func-Create-------------------------------------------------------------*/
+/**
+ * Generates automatically patternlengths; depends on min_dontcare and max_dontcare.
+ * Returns a vector which contains in descending order the pattern lenghts.
+ * Two options: 1) nearly linear distributed lenghts
+ * 2) random distributed lenghts
+ *
+ * @return All lenghts for the patternset (type: std::vector<uint32_t>)
+ */
+std::vector<uint32_t> patternset::CreateLengths(){
+ std::vector<uint32_t> leng_vec;
+ double steps, leng, leng_tmp, max_size;
+ uint32_t intervals;
+
+ if(random_length){
+ std::uniform_int_distribution<uint32_t> lengths(min_dontcare, max_dontcare);
+ for(uint32_t i = 0; i < size; i++){
+ leng_vec.push_back(lengths(generator));
+ }
+ }
+ else{
+ intervals = max_dontcare - min_dontcare + 1;
+ steps = ((double)intervals /(double) size);
+ leng = (double) min_dontcare;
+
+ for(uint32_t i = 0; i < size; i++){
+ leng_vec.push_back((uint32_t)leng);
+ leng += steps;
+ }
+ }
+
+ std::sort(leng_vec.begin(), leng_vec.end());
+
+ leng_tmp = leng_vec[0];
+ leng = 0;
+
+
+ max_size = MaxNumberPattern(weight -2 , leng_tmp + weight - 2);
+
+ for(uint32_t i = 0; i < size; i++){
+ if(leng_vec[i] <= leng_tmp){
+ if(leng_vec[i] < leng_tmp){
+ leng_vec[i] = leng_tmp;
+ }
+ if((double)leng >= max_size){
+ if(leng_tmp != leng_vec[leng_vec.size()-1]){
+ leng = 0;
+ leng_vec[i]++;
+ leng_tmp = leng_vec[i];
+ max_size = MaxNumberPattern(weight-2, leng_tmp + weight - 2);
+ }
+ else{
+ update = true;
+ leng_vec.erase(leng_vec.begin()+i,leng_vec.end());
+ size = leng_vec.size();
+ SecureMessage("max_number_pattern", i+1);
+ }
+ }
+ }
+ else{
+ leng_tmp = leng_vec[i];
+ leng = 0;
+ max_size = MaxNumberPattern(weight-2, leng_tmp + weight -2);
+ }
+ leng++;
+ }
+ std::sort(leng_vec.rbegin(), leng_vec.rend());
+
+ return leng_vec;
+}
+
+
+/**
+ * Generates for submitted (and corrected) values a patternset as std::vector< std::vector<char> >.
+ *
+ * @return Patternset (type: std::vector< std::vector<char> >)
+ */
+std::vector<std::vector<char> > patternset::CreatePattern(){
+ std::vector<std::vector<char> > patterns;
+ std::vector<char> pat;
+ uint32_t tmp_weight, pos;
+ if(!silent){
+ std::cout << "Generating pattern automatically ...\n" << std::endl;
+ }
+
+ if(weight != 1){
+ for(uint32_t i = 0; i < size; i++){
+ for(uint32_t j = 0; j < pattern_dontcare[i] + weight; j++){
+ pat.push_back('0');
+ }
+ pat[0] = '1';
+ pat[pat.size()-1] = '1';
+ tmp_weight = weight-2;
+ std::uniform_int_distribution<uint32_t> positions(0, pattern_dontcare[i] + weight-1);
+ while(tmp_weight > 0){
+ pos = positions(generator);
+ if(pat[pos] == '0'){
+ tmp_weight--;
+ pat[pos] = '1';
+ }
+ }
+ if(UniqPattern(patterns, pat)){
+ patterns.push_back(pat);
+ if(!silent){
+ for(uint32_t i = 0; i < pat.size(); i++){
+ std::cout << pat[i];
+ }
+ std::cout << std::endl;
+ }
+ }
+ else{
+ i--;
+ }
+ pat.clear();
+ }
+ }
+ else{
+ pat.push_back('1');
+ patterns.push_back(pat);
+ pat.clear();
+ }
+
+ if(!silent){
+ std::cout << "\nDone." << std::endl << std::endl;
+ }
+
+ return patterns;
+}
+
+
+/**
+ * Reads pattern from a submitted file which (may) contains binary strings.
+ * Each pattern will be verified, if its really a binary pattern with the same weight.
+ * If an error occurs (file not found, etc.), it returns false, otherwise true.
+ *
+ * @return Boolean, if an error occured or not while pattern processing.
+ */
+bool patternset::ReadPattern(){
+ std::ifstream fasta;
+ std::vector<char> pattern;
+ uint32_t p_size;
+ char tokens[6] = {'.', ' ', ',', ';', '\n', '\t'};
+ char c;
+ bool token, read_error, bad;
+
+ read_error = false;
+ bad = false;
+ fasta.open(inFile);
+ if(fasta.fail()){
+ read_error = true;
+ SecureMessage("file", -1);
+ }
+ else if(fasta.peek() == std::ifstream::traits_type::eof()){
+ read_error = true;
+ SecureMessage("empty", -1);
+ }
+ else{
+ while(!fasta.eof()){
+ c = fasta.get();
+ token = false;
+ for(uint32_t i = 0; i < 6; i++){
+ if(c == tokens[i]){
+ token = true;
+ }
+ }
+ if(token || -1 ==(int)c){
+ if(!bad && pattern.size () != 0){
+ pattern_set.push_back(pattern);
+ }
+ bad = false;
+ pattern.clear();
+ }
+ else if(c == '1' || c== '0'){
+ pattern.push_back(c);
+ }
+ else{
+ bad = true;
+ SecureMessage("format", -1);
+ }
+ }
+ }
+ fasta.close();
+ size = (uint32_t)pattern_set.size();
+ if(size == 0){
+ if(!read_error){
+ SecureMessage("empty", -1);
+ }
+ size = 10;
+ read_error = true;
+ }
+
+ for(uint32_t i = 0; i < pattern_set.size(); i++){
+ p_size = pattern_set[i].size();
+ if(p_size != 0){
+ p_size--;
+ }
+ if(pattern_set[i][0] != '1' || pattern_set[i][p_size] != '1'){
+ SecureMessage("startend", i);
+ read_error = true;
+ }
+ }
+ if(!read_error){
+ read_error = VerifyPatternCondition();
+ if(!read_error){
+ for(uint32_t i = 0; i < size; i++){
+ pattern_dontcare.push_back(pattern_set[i].size()-weight);
+ }
+ }
+ }
+ return read_error;
+}
+
+
+/**
+* Checks, if the submitted pattern file is in correct format; if it
+* contains only binary pattern with the same weight.
+* Sets the wight and max_ and min_dontcare number, if no error occured.
+*/
+bool patternset::VerifyPatternCondition(){
+ uint32_t first_weight, p_size;
+ bool pattern_error;
+
+ pattern_error = false;
+
+ min_dontcare = pattern_set[0].size();
+ max_dontcare = min_dontcare;
+
+ first_weight = GetWeight(pattern_set[0]);
+
+ for(uint32_t i = 0; i < size; i++){
+ p_size = pattern_set[i].size();
+ min_dontcare = std::min(min_dontcare, p_size);
+ max_dontcare = std::max(max_dontcare, p_size);
+ if(first_weight != GetWeight(pattern_set[i])){
+ pattern_error = true;
+ SecureMessage("weight", i);
+ }
+ if(!IsBinary(pattern_set[i])){
+ pattern_error = true;
+ SecureMessage("pattern", i);
+ }
+ }
+
+ weight = first_weight;
+
+ min_dontcare -= weight;
+ max_dontcare -= weight;
+
+ if(min_dontcare < 0 || max_dontcare < 0){
+ pattern_error = true;
+ }
+
+ return pattern_error;
+}
+
+
+/**
+ * Checks, if there is at least one pattern, which can be improved.
+ * Returns true, if there is at least one, otherwise false.
+ *
+ * Checks and saves for each pattern individually, if it can be improved.
+ *
+ * @return Boolean, if at least one pattern can be changed.
+ */
+bool patternset::SetImprove(){
+ uint32_t leng, p_size, offset;
+ bool isImprovable, tmp_impro;
+
+ isImprovable = false;
+
+ leng = pattern_dontcare[0];
+ p_size = 0;
+ offset = 0;
+
+ for(uint32_t i = 0; i < size; i++){
+ if(leng <= pattern_dontcare[i]){
+ p_size++;
+ }
+ else{
+ if(p_size < MaxNumberPattern(weight -2, leng + weight - 2)){
+ isImprovable = true;
+ tmp_impro = true;
+ }
+ else{
+ tmp_impro = false;
+ }
+ for(uint32_t j = offset; j < i; j++){
+ pattern_improve.push_back(tmp_impro);
+ }
+ p_size = 1;
+ offset = i;
+ leng = pattern_dontcare[i];
+ }
+ }
+ if(p_size < MaxNumberPattern(weight -2, leng + weight - 2)){
+ isImprovable = true;
+ tmp_impro = true;
+ }
+ else{
+ tmp_impro = false;
+ }
+ for(uint32_t j = offset; j < size; j++){
+ pattern_improve.push_back(tmp_impro);
+ }
+
+ return isImprovable;
+}
+
+
+/*--Func-Change--------------------------------------------------------------*/
+/**
+ * Changes two different positions ('1' and '0') in a specific bit-pattern
+ * Start and end are excluded
+ *
+ * @param number The pattern which has to be modified
+ */
+void patternset::ChangeBitsRandom(uint32_t number){
+ uint64_t zero, one;
+ bool change = false;
+ while(!change){
+ zero = GetSymbolRandPos(number, '0');
+ one = GetSymbolRandPos(number, '1');
+ change = ChangeBitPos(number, zero, one);
+ }
+}
+
+
+/**
+ * Exchange of two positions; changing match to don't care and vice versa.
+ *
+ * @param pos The pattern index of the modifying pattern
+ *
+ * @param pos_one Matchposition which has to be a don't care
+ *
+ * @param pos_zero Don'tCare position which has to be a match
+ *
+ * @return Boolean, if two positions could be changed or not.
+ */
+bool patternset::ChangeBitPos(uint32_t number, uint64_t zero, uint64_t one){
+ bool change;
+ change = false;
+ if(initialized){
+ if(pattern_improve[number%size]){
+ if(bitmode){
+ uint64_t bit_tmp, shift;
+ bit_tmp = bit_pattern_set[number%size];
+ shift = 1;
+ if(bit_tmp > zero && bit_tmp > one ){
+ bit_tmp -= (shift << one);
+ bit_tmp += (shift << zero);
+ if(UniqBit(bit_tmp)){
+ bit_pattern_set[number] = bit_tmp;
+ change = true;
+ }
+ }
+ }
+ else{
+ std::vector<char> pattern_tmp(pattern_set[number]);
+ if(pattern_tmp.size() > zero && pattern_tmp.size() > one){
+ std::swap(pattern_tmp[zero], pattern_tmp[one]);
+ if(UniqPattern(pattern_set, pattern_tmp)){
+ std::swap(pattern_set[number], pattern_tmp);
+ change = true;
+ }
+ pattern_tmp.clear();
+ }
+ }
+ }
+ }
+ return change;
+}
+
+
+/**
+ * Investigates for a specific symbol all positions of a pattern.
+ * For a match, the first and last positions are excluded!
+ * Chooses a position randomly and returns the position
+ *
+ * @param number The pattern index of the modifying pattern
+ *
+ * @param symb The specific symbol, either '1' or '0'
+ *
+ * @return A random chosen position of all positions matching the symbol.
+ */
+uint64_t patternset::GetSymbolRandPos(uint32_t number, char symb){
+ std::vector<uint32_t> positions;
+ int pos = -1;
+ if(pattern_improve[number%size]){
+ pos = GetLength(number)-1;
+ if(bitmode){
+ uint64_t tmp_pat, one;
+ one = 1;
+ for(int i = 1; i < pos; i++){
+ tmp_pat = bit_pattern_set[number%size] & ( one << (i));
+ if(tmp_pat != 0 && symb == '1'){
+ positions.push_back(i);
+ }
+ if(tmp_pat == 0 && symb == '0'){
+ positions.push_back(i);
+ }
+ }
+ }
+ else{
+ for(int i = 1; i < pos; i++){
+ if(pattern_set[number][i] == symb){
+ positions.push_back(i);
+ }
+ }
+ }
+
+ std::uniform_int_distribution<uint32_t> posits(0, positions.size()-1);
+ pos = posits(generator);
+ pos = positions[pos];
+ positions.clear();
+ }
+ return pos;
+}
+
+
+/*---stuff-------------------------------------------------------------------*/
+/**
+ * A Method to collect all errormessages. Just easier for programmer to change
+ * the text or extend.
+ *
+ * @param errmsg Due to a few possible errormessages, this is the option, which has to be printed.
+ *
+ * @param pos The position of the incorrect patterns.
+ */
+void patternset::SecureMessage(std::string errmsg, int pos){
+ if (errmsg == "file") {
+ printf("%c[1;31mERROR! ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "Pattern file \'" << inFile << "\' could not be found!" << std::endl;
+ std::cerr << "Return to submitted or default values" << std::endl;
+ return;
+ }
+ if (errmsg == "empty") {
+ printf("%c[1;31mERROR! ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "File \'" << inFile << "\' is an empty file!" << std::endl;
+ std::cerr << "Return to submitted or default values" << std::endl;
+ return;
+ }
+ if (errmsg == "pattern") {
+ printf("%c[1;31mERROR! ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "Patternconditions from pattern " << pos + 1 << " file were not correct (different weight, etc.)!" << std::endl;
+ std::cerr << "Go on to next pattern." << std::endl;
+ return;
+ }
+ if (errmsg == "startend") {
+ printf("%c[1;31mFORMAT-ERROR: ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "Pattern " << pos + 1 <<" has to start and end with a match position '1' !" << std::endl;
+ return;
+ }
+ if (errmsg == "format") {
+ printf("%c[1;31mFORMAT-ERROR: ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "Pattern containes illegal characters!" << std::endl;
+ std::cerr << "Allowed characters: '0','1' for pattern; ','|'.'|';'|' ' to seperate patterns" << std::endl;
+ std::cerr << "Go on to next pattern." << std::endl;
+ return;
+ }
+ //if(!silent){
+ if (errmsg == "nkl") {
+ printf("%c[1;31mERROR! ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "Wrong values for weight, pattern number or pattern length!" << std::endl;
+ std::cerr << "Return to default values" << std::endl;
+ return;
+ }
+ if (errmsg == "max_number_pattern") {
+ printf("%c[1;33m", 27);
+ std::cerr << "The number of patterns is too high for your configuration and will be reset!" << std::endl;
+ printf("%c[0m", 27);
+ return;
+ }
+ if (errmsg == "weight") {
+ printf("%c[1;31mERROR! ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "By comparing with the first pattern, the " << pos + 1 << ". pattern has a different weight!" << std::endl;
+ return;
+ }
+ if (errmsg == "update"){
+ printf("%c[1;31m\n####IMPROTANT####\n", 27);
+ printf("%c[0m", 27);
+ std::cerr << "Due to some configuration errors, your submitted parameters have been updated!" << std::endl;
+ return;
+ // }
+ }
+}
+
+
+/**
+ * Estimates the number of matches ('1') in a std::vector<char>.
+ *
+ * @param pattern Pattern as std::vector<char>
+ *
+ * @return Number of matches ('1').
+ */
+uint32_t patternset::GetWeight(std::vector<char> pattern){
+ uint32_t weight_p = 0;
+ for(uint32_t i = 0; i < pattern.size(); i++){
+ if(pattern[i] == '1'){
+ weight_p++;
+ }
+ }
+ return weight_p;
+}
+
+
+/**
+ * Estimates if a std::vector<char> pattern is in binary format ('1' or '0').
+ *
+ * @param pattern Pattern as std::vector<char>
+ *
+ * @return Boolean, if the submitted std::vector<char> pattern is binary.
+ */
+bool patternset::IsBinary(std::vector<char> pattern){
+ bool binary = true;
+ for(uint32_t i = 0; i < pattern.size(); i++){
+ if(pattern[i] != '0' && pattern[i] != '1'){
+ binary = false;
+ }
+ }
+ return binary;
+}
+
+
+/**
+ * Creates a bitpattern (64-Bit Integer) from a char vector pattern.
+ *
+ *@param pattern Pattern as std::vector<char>
+ *
+ *@return Pattern as uint64_t (64-Bit Integer)
+ */
+uint64_t patternset::StringToBit(std::vector<char> pattern){
+ uint64_t bitpat, tmp, tmp_size;
+
+ bitpat = 0;
+ tmp = 1;
+ tmp_size = (uint64_t) pattern.size()-1;
+
+ for(uint32_t i = 0; i < pattern.size(); i++){
+ if(pattern[i] == '1'){
+ bitpat += (tmp << (tmp_size - i));
+ }
+ }
+ return bitpat;
+}
+
+
+/**
+ * Creates a char vector pattern. from a bitpattern (64-Bit Integer).
+ *
+ *@param pattern Pattern as uint64_t (64-Bit Integer)
+ *
+ *@return Pattern as std::vector<char>
+ */
+std::vector<char> patternset::BitToString(uint64_t bitpat){
+ std::vector<char> pattern;
+ uint64_t one, tmp;
+
+ one = 1;
+
+ while(bitpat > 0){
+ tmp = bitpat & one;
+ if(tmp == 0){
+ pattern.push_back('0');
+ }
+ else{
+ pattern.push_back('1');
+ }
+ bitpat >>= one;
+ }
+ std::reverse(pattern.begin(), pattern.end());
+ return pattern;
+}
+
+
+/**
+ * Changes the current patternset into a bitpattern set.
+ */
+void patternset::PatternToBit(){
+ uint64_t tmp;
+ for(uint32_t i = 0; i < size; i++){
+ tmp = StringToBit(pattern_set[i]);
+ bit_pattern_set.push_back(tmp);
+ }
+}
+
+
+/**
+ * Changes the current bitpattern set into a patternset (vector of char vectors).
+ */
+void patternset::BitToPattern(){
+ std::vector<char> tmp;
+ for(uint32_t i = 0; i < size; i++){
+ pattern_set[i] = BitToString(bit_pattern_set[i]);
+ }
+}
+
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Determines the maximum number of patterns with weight and length
+ *
+ *@param weight Complete pattern weight-2, start and end have to be match and do not change
+ *
+ *@param length Complete pattern lengt-2, start and end have to be match and do not change
+ *
+ *@return max number of possible pattern
+ */
+double patternset::MaxNumberPattern(uint32_t k, uint32_t l){
+ double max_number;
+
+ max_number = 1;
+
+ for(uint32_t i = 1; i <= k; i++){
+ max_number *= ((double)(l-i+1)) / (double) i;
+ }
+
+ return max_number;
+}
+
+
+/**
+ * Scans if there is another pattern in the same format in a patternset
+ *
+ * @param pats The patternset, in which we are looking for a similar pattern.
+ *
+ * @param pat The pattern, the query, we are looking for to be unique.
+ *
+ * @return Returns a boolean, if the submitted pattern is not existing in the patternset.
+ */
+bool patternset::UniqPattern(std::vector<std::vector<char> > pats, std::vector<char> pat){
+ bool uniq, no_common;
+ uniq = true;
+
+ for(uint32_t i = 0; i < pats.size(); i++){
+ if(pats[i].size() == pat.size()){
+ no_common = false;
+ for(uint32_t j = 0; j < pat.size(); j++){
+ if(pats[i][j] != pat[j]){
+ no_common = true;
+ j = pat.size();
+ }
+ }
+ if(!no_common){
+ uniq = false;
+ }
+ }
+ }
+
+ return uniq;
+}
+
+
+/**
+ * Scans if there is another bitpattern in the same format in a bitpattern set
+ *
+ * @param pats The bitpattern set, in which we are looking for a similar bitpattern.
+ *
+ * @param pat The bitpattern, the query, we are looking for to be unique.
+ *
+ * @return Returns a boolean, if the submitted bitpattern is not existing in the bitpattern set.
+ */
+bool patternset::UniqBit(uint64_t bit_pat){
+ return std::find(bit_pattern_set.begin(), bit_pattern_set.end(), bit_pat) == bit_pattern_set.end();
+}
+
+
+/**
+ * Prints the current patternset to the commandline.
+ */
+void patternset::Print(){
+ if(bitmode){
+ BitToPattern();
+ }
+ for(uint32_t i = 0; i < size; i++){
+ for(uint32_t j = 0; j < pattern_set[i].size(); j++){
+ std::cout << pattern_set[i][j];
+ }
+ std::cout << std::endl;
+ }
+}
+
+/*---------------------------------------------------------------------------*/
+/**
+ * Returns the patternset as std::vector< std::vector<char> >
+ *
+ * @return Copy of the current patternset vector (type: std::vector< std::vector<char> >)
+ */
+std::vector<std::vector<char> > patternset::GetPattern(){
+ std::vector<std::vector<char> > tmp;
+ if(bitmode){
+ BitToPattern();
+ }
+ return pattern_set;
+}
+
+
+/**
+ * Returns the patternset as bitpattern as std::vector<uint64_t>
+ *
+ * @return Copy of the current bit-patternset vector (type: std::vector<uint64_t>)
+ */
+std::vector<uint64_t> patternset::GetBitPattern(){
+ std::vector<uint64_t> tmp;
+ if(initialized){
+ if(bitmode){
+ tmp = bit_pattern_set;
+ }
+ }
+ return tmp;
+}
+
+
+/**
+ * Returns a pattern of the patternset as std::vector<char>
+ *
+ * @return Copy of the 'number'th pattern of the current patternset vector (type: std::vector<char>)
+ */
+std::vector<char> patternset::GetPattern(uint32_t number){
+ std::vector<char> tmp;
+ if(bitmode){
+ tmp = BitToString(bit_pattern_set[number%size]);
+ }
+ else{
+ tmp = pattern_set[number%size];
+ }
+ return tmp;
+}
+
+
+/**
+ * Returns a pattern of the patternset as uint64_t.
+ *
+ * @return Copy of the 'number'th pattern of the current patternset vector (type: uint64_t)
+ */
+uint64_t patternset::GetBitPattern(uint32_t number){
+ if(!bitmode){
+ return 0;
+ }
+ return bit_pattern_set[number%size];
+}
+
+
+/**
+ * Returns a vector in which (in pattern order) for each pattern there is a boolean, if the pattern
+ * can be changed or not.
+ *
+ * @return Vector<bool> which contains for each pattern the boolean of improvement.
+ */
+std::vector<uint32_t> patternset::GetDontCareVec(){
+ std::vector<uint32_t> tmp;
+ if(initialized){
+ tmp = pattern_dontcare;
+ }
+ return tmp;
+}
+
+
+/**
+ * Returns a boolean, if the 'number'th patterncan be improved/changed, or not.
+ *
+ * @return Boolean, if the 'number'th patterncan be improved/changed.
+ */
+bool patternset::GetImprove(uint32_t number){
+ bool tmp = false;
+ if(initialized){
+ tmp = pattern_improve[number%size];
+ }
+ return tmp;
+}
+
+
+/**
+ * Returns a boolean, if there is at least one pattern, which can be improved/changed, or not.
+ *
+ * @return Boolean, if at least one pattern can be changed.
+ */
+bool patternset::GetImprove(){
+ return improve;
+}
+
+
+/**
+ * Returns the current patternset weight.
+ *
+ * @return Current pattern weight.
+ */
+uint32_t patternset::GetSize(){
+ return size;
+}
+
+
+/**
+ * Returns the current patternset weight.
+ *
+ * @return Current pattern weight.
+ */
+uint32_t patternset::GetWeight(){
+ return weight;
+}
+
+
+/**
+ * Returns the current lenght of the 'number'th pattern.
+ * Weight + don't care positions
+ *
+ *@return Current lenght of the 'number'th pattern.
+ */
+uint32_t patternset::GetLength(uint32_t number){
+ uint32_t leng = 0;
+ if(initialized){
+ leng = pattern_dontcare[number%size] + weight;
+ }
+ return leng;
+}
+
+
+/**
+ * Returns boolean if the current patternset modus is bitmode oder not.
+ *
+ * @return Current bitmode boolean (true, if bitmode).
+ */
+bool patternset::GetBitMode(){
+ return bitmode;
+}
+
+
+/**
+ * Returns boolean if the current patternset modus has been updated
+ *
+ * @return Current patternmode has been updated.
+ */
+bool patternset::GetUpdate(){
+ return update;
+}
+
+/**
+ * Sets a bitpattern at the 'number'th positions of the bitpattern set.
+ *
+ * @param number The position where the bitpattern should be saved.
+ *
+ * @param pat The bitpattern, that should be saved.
+ */
+void patternset::SetPattern(uint32_t number, uint64_t pat){
+ if((uint32_t)std::log2(pat) == (uint32_t)std::log2(bit_pattern_set[number%size])){
+ bit_pattern_set[number%size] = pat;
+ }
+}
+
+
+/**
+ * Sets a pattern at the 'number'th positions of the patternset.
+ *
+ * @param number The position where the pattern should be saved.
+ *
+ * @param pat The pattern, that should be saved.
+ */
+void patternset::SetPattern(uint32_t number, std::vector<char> pat){
+ uint64_t tmp;
+ if(bitmode && pat.size() == (uint32_t)(std::log2(bit_pattern_set[number%size])+1)){
+ tmp = StringToBit(pat);
+ bit_pattern_set[number%size] = tmp;
+ }
+ else if(pat.size() == pattern_set[number%size].size()){
+ pattern_set[number%size] = pat;
+ }
+}
=====================================
src/patternset.h
=====================================
--- /dev/null
+++ b/src/patternset.h
@@ -0,0 +1,112 @@
+/**
+ * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
+ * It is possible to improve your patternset and read patterns from a file.
+ *
+ * patternset object header
+ *
+ * For theory please have a look at:
+ *
+ * - L. Hahn, C.-A. Leimeister, B. Morgenstern (2015)
+ * RasBhari: optimizing spaced seeds for database searching, read mapping and alignment-free sequence comparison
+ * arXiv:1511.04001[q-bio.GN]
+ *
+ * - B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
+ * Estimating evolutionary distances between genomic sequences from spaced-word matches
+ * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
+ *
+ *
+ * @author: Lars Hahn - 12.05.2016, Georg-August-Universitaet Goettingen
+ * @version: 1.2.0 05/2016
+ */
+#ifndef PATTERNSET_H_
+#define PATTERNSET_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+class patternset{
+public:
+ patternset();
+ patternset(uint32_t size, uint32_t weight, uint32_t min_dontcare, uint32_t max_dontcare, bool silent);
+ patternset(const char *patternfile, bool silent);
+ ~patternset();
+
+ void Init(bool random_length);
+ void ReInit();
+
+ void ChangeBitsRandom(uint32_t number);
+ bool ChangeBitPos(uint32_t number, uint64_t zero, uint64_t one);
+
+ void Print();
+
+ static uint64_t StringToBit(std::vector<char> pattern);
+ static std::vector<char> BitToString(uint64_t bitpat);
+
+ /*-----------------------------------------------------------------------*/
+ std::vector<std::vector<char> > GetPattern();
+ std::vector<uint64_t> GetBitPattern();
+ std::vector<char> GetPattern(uint32_t number);
+ uint64_t GetBitPattern(uint32_t number);
+
+ std::vector<uint32_t> GetDontCareVec();
+ bool GetImprove(uint32_t number);
+ bool GetImprove();
+ uint32_t GetSize();
+ uint32_t GetWeight();
+ uint32_t GetLength(uint32_t number);
+ bool GetBitMode();
+ bool GetUpdate();
+
+ void SetPattern(uint32_t number, uint64_t pat);
+ void SetPattern(uint32_t number, std::vector<char> pat);
+
+protected:
+ void CheckParams();
+ std::vector<uint32_t> CreateLengths();
+ std::vector<std::vector<char> > CreatePattern();
+ bool ReadPattern();
+ bool VerifyPatternCondition();
+ bool SetImprove();
+ void PatternToBit();
+ void BitToPattern();
+
+ uint64_t GetSymbolRandPos(uint32_t number, char symb);
+
+ bool UniqPattern(std::vector<std::vector<char> > pats, std::vector<char> pat);
+ bool UniqBit(uint64_t bit_pat);
+
+ static uint32_t GetWeight(std::vector<char> pattern);
+ static bool IsBinary(std::vector<char> pattern);
+
+ static double MaxNumberPattern(uint32_t k, uint32_t l);
+ static double Faculty(uint32_t value);
+
+ void SecureMessage(std::string errmsg, int pos);
+
+private:
+ std::vector<std::vector<char> > pattern_set;
+ std::vector<uint64_t> bit_pattern_set;
+ std::vector<uint32_t> pattern_dontcare;
+ std::vector<bool> pattern_improve;
+ std::vector<double> pattern_uniq;
+
+ uint32_t size;
+ uint32_t weight;
+ uint32_t min_dontcare;
+ uint32_t max_dontcare;
+
+ const char *inFile;
+
+ bool bitmode;
+ bool random_length;
+ bool isInFile;
+ bool improve;
+ bool update;
+ bool silent;
+ bool initialized;
+};
+#endif
\ No newline at end of file
=====================================
sort.h → src/sort.h
=====================================
=====================================
spaced.cc → src/spaced.cc
=====================================
--- a/spaced.cc
+++ b/src/spaced.cc
@@ -16,7 +16,7 @@
#include "variance.h"
bool revComp=true;
-int number = 5, dontcare = 15, weight = 14, threads = 15, distanceType = EV;
+int number = 5, dontcare = 15, weight = 14, threads = 15, distanceType = JS;
string patternFile="", output="DMat";
void printHelp(){
@@ -31,20 +31,11 @@ void printHelp(){
"\n\t -f <file>: use patterns in <file> instead of random patterns"
"\n\t -t <integer>: numer of threads (default: 25 threads)"
"\n\t -r: don't consider the reverse complement"
- "\n\t -d EU | JS | EV: change distance type to Euclidean, Jensen-Shannon, evolutionary distance (default: EV)"
+ "\n\t -d EU | JS | EV: change distance type to Euclidean, Jensen-Shannon, evolutionary distance (default: JS)"
"\n";
cout << help << endl;
}
-void parsePattern(string filePath, vector<string>& patternSet){
- ifstream file(filePath.c_str());
- string str;
- while (getline(file, str)){
- patternSet.push_back(str);
- }
- file.close();
-}
-
void parseParameters(int argc, char** argv){
// too few?
if (argc < 2) {
@@ -166,18 +157,28 @@ int main(int argc, char *argv[]){
unsigned char *str = readData(inputFile, n);
if (str == NULL)
return 0;
+
vector<string> patternSet;
+ vector< vector<char> > tmp_pat_set;
+ variance variance_obj;
if(patternFile.length()==0){
- variance* variance_obj;
- int* laenge = new int[2];
- laenge[0]=laenge[1]=dontcare+weight;
- variance_obj = new variance(number,laenge, weight);
- variance_obj->Silent();
- variance_obj->Improve(500); //5000 mal versuchen ein Pattern zu verbessern
- patternSet = variance_obj->GetPattern();
+ variance_obj = variance(number,weight,dontcare,dontcare);
+ variance_obj.Init(true, true, true, true, false, NULL);
+ variance_obj.Improve(5000); //5000 mal versuchen ein Pattern zu verbessern
}
- else
- parsePattern(patternFile, patternSet);
+ else{
+ variance_obj = variance(patternFile.c_str(),0.75,0.25,16000);
+ variance_obj.Init(true, true, true, true, false, NULL);
+ }
+ number = variance_obj.GetSize();
+ weight = variance_obj.GetWeight();
+ dontcare = variance_obj.GetMinDontcare();
+ tmp_pat_set = variance_obj.GetPattern();
+ for(size_t i = 0; i < tmp_pat_set.size(); i++){
+ patternSet.push_back(string(tmp_pat_set[i].begin(),tmp_pat_set[i].end()));
+ }
+ tmp_pat_set.clear();
+
double seq=0;
int length=0;
double dna=0;
=====================================
src/variance.cpp
=====================================
--- /dev/null
+++ b/src/variance.cpp
@@ -0,0 +1,875 @@
+/**
+ * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
+ * It is possible to improve your patternset and read patterns from a file.
+ *
+ * variance object file
+ *
+ * For theory please have a look at:
+ *
+ * - L. Hahn, C.-A. Leimeister, B. Morgenstern (2015)
+ * RasBhari: optimizing spaced seeds for database searching, read mapping and alignment-free sequence comparison
+ * arXiv:1511.04001[q-bio.GN]
+ *
+ * - B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
+ * Estimating evolutionary distances between genomic sequences from spaced-word matches
+ * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
+ *
+ *
+ * @author: Lars Hahn - 12.05.2016, Georg-August-Universitaet Goettingen
+ * @version: 1.2.0 05/2016
+ */
+#include "variance.h"
+
+/*---Variables---------------------------------------------------------------*/
+
+
+
+/*===Main-Part===============================================================*/
+/*---Constructor-------------------------------------------------------------*/
+/**
+ * Default constructor, sets the default vaulues, pattern will be generated automatically.
+ */
+variance::variance(){
+ size = 10;
+ weight = 8;
+ min_dontcare = 6;
+ max_dontcare = 6;
+ seq_leng = 16000;
+ p = 0.75;
+ q = 0.25;
+ isInFile = false;
+ initialized = false;
+}
+
+
+/**
+ * Short constructor, sets some default vaulues, just pattern dimension is set.
+ *
+ * @param size Number of patterns for the patternset.
+ *
+ * @param weigth Number of match positions ('1') in each pattern.
+ *
+ * @param min_dontcare Number of minimum don't care positions ('0') in each pattern.
+ *
+ * @param max_dontcare Number of maximum don't care positions ('0') in each pattern.
+ */
+variance::variance(uint32_t size, uint32_t weight, uint32_t min_dontcare, uint32_t max_dontcare){
+ this->size = size;
+ this->weight = weight;
+ this->min_dontcare = min_dontcare;
+ this->max_dontcare = max_dontcare;
+ seq_leng = 16000;
+ p = 0.75;
+ q = 0.25;
+ isInFile = false;
+ initialized = false;
+}
+
+
+/**
+ * Long constructor, sets the values; resets automatically, if there are problems.
+ *
+ * @param size Number of patterns for the patternset.
+ *
+ * @param weight Number of match positions ('1') in each pattern.
+ *
+ * @param min_dontcare Number of minimum don't care positions ('0') in each pattern.
+ *
+ * @param max_dontcare Number of maximum don't care positions ('0') in each pattern.
+ *
+ * @param seq_leng Length of sequences in database, used for variance.
+ *
+ * @param p Match probability ( = #matches / #positions), used for sensitivity and variance.
+ *
+ * @param q Background probability, used for variance.
+ */
+variance::variance(uint32_t size, uint32_t weight, uint32_t min_dontcare, uint32_t max_dontcare, uint32_t seq_leng, double p, double q){
+ this->size = size;
+ this->weight = weight;
+ this->min_dontcare = min_dontcare;
+ this->max_dontcare = max_dontcare;
+ this->seq_leng = seq_leng;
+ this->p = p;
+ this->q = q;
+ isInFile = false;
+ initialized = false;
+}
+
+
+/**
+ * File constructor, sets values only from files; resets automatically if there are problems.
+ *
+ * @param inFile The file which contains a patternset.
+ */
+variance::variance(const char *inFile, double p, double q, uint32_t seq_leng){
+ this->inFile = inFile;
+ this->p = p;
+ this->q = q;
+ this->seq_leng = seq_leng;
+ size = 10;
+ weight = 8;
+ min_dontcare = 6;
+ max_dontcare = 6;
+ isInFile = true;
+ initialized = false;
+}
+
+
+/**
+ * Default destructor, deletes all vectors and matrices in the object.
+ */
+variance::~variance(){
+ Clear();
+}
+
+
+/*---Initialization----------------------------------------------------------*/
+/**
+ * Standard Initialization of variance options.
+ */
+void variance::Init(){
+ if(!initialized){
+ std::string str = "outfile.pat";
+ oc = true;
+ improve = true;
+ quiet = true;
+ silent = false;
+ isOutFile = false;
+ random_leng = false;
+ outFile = str.c_str();
+ Init(oc, improve, quiet, silent, random_leng, outFile);
+ }
+}
+
+/**
+ * Initialization of variance options with specific parameter.
+ *
+ * @param oc Boolean which (de)activates the overlap complexity/variance calculation
+ *
+ * @param improve Boolean which (de)activates the improvement of the patternset.
+ *
+ * @param quiet Boolean which (de)activates just a short output of the improvement
+ *
+ * @param silent Boolean which (de)activates std::output on commandline (exclusive errors)
+ *
+ * @param randpatleng Boolean which (de)activates random generated pattern lengths
+ *
+ * @param outFile Char array which contains name and path to outputfile (commandline parameter)
+ */
+void variance::Init(bool oc, bool improve, bool quiet, bool silent, bool random_leng, const char *outFile){
+ if(!initialized){
+ this->oc = oc;
+ this->improve = improve;
+ this->quiet = quiet;
+ this->silent = silent;
+ this->outFile = outFile;
+ this->random_leng = random_leng;
+ best_silent = false;
+ impro_silent = false;
+ loops = 1;
+ if(outFile != NULL && outFile[0] != '\0'){
+ isOutFile = true;
+ }
+ else{
+ isOutFile = false;
+ }
+ if(oc){
+ outvar = "overlap-complexity\t\t= ";
+ }
+ else{
+ outvar = "variance\t\t\t= ";
+ }
+ if(silent){
+ quiet = true;
+ }
+ ReInit();
+ }
+}
+
+
+/**
+ * Standard Reinitialization of variance options, with a new patternset.
+ */
+void variance::ReInit(){
+ if(isInFile){
+ pattern_set = patternset(inFile, true);
+ }
+ else{
+ pattern_set = patternset(size, weight, min_dontcare, max_dontcare, true);
+ }
+ pattern_set.Init(random_leng);
+ ReInit(pattern_set);
+}
+
+
+/**
+ * Standard Reinitialization of variance options, with a given patternset.
+ */
+void variance::ReInit(patternset &pat){
+ if(initialized){
+ Clear();
+ }
+ initialized = false;
+ pattern_set = pat;
+
+ size = pattern_set.GetSize();
+ weight = pattern_set.GetWeight();
+ min_dontcare = pattern_set.GetLength(size-1)- weight;
+ max_dontcare = pattern_set.GetLength(0) - weight;
+ bitmode = pattern_set.GetBitMode();
+ if(improve){
+ improve = pattern_set.GetImprove();
+ if(!improve){
+ SecureMessage("noimprove",-1);
+ }
+ }
+ norm_size = (size*(size+1))/2;
+ p = std::abs(p);
+ q = std::abs(q);
+ if(p < q || p > 1 || q > 1){
+ p = 0.75;
+ q = 0.25;
+ SecureMessage("pq",-1);
+ }
+ InitVariance();
+ initialized = true;
+}
+
+
+/**
+ * First Initializing of the variance/oc where each value of the matrix has
+ * to be created, instead of updated variance, where it is not necessary to
+ * recalculate each value/entry.
+ */
+void variance::InitVariance(){
+ std::pair<double,uint32_t> tmp_pair;
+ double var_hom, var_bac, hom, back;
+ uint64_t oc_hom;
+ uint32_t lower_length, upper_length, length_mean;
+ int shift;
+ oc_hom = 1;
+ variance_val = 0;
+
+ variance_mat = std::vector< std::vector<double> >(size,std::vector<double>(size,0));
+ pattern_variance = std::vector<double>(size,0);
+ pattern_current_contribute = std::vector<double>(size,0);
+ pattern_last_contribute = std::vector<double>(size,0);
+ for(uint32_t i = 0; i < size; i++){
+ lower_length = pattern_set.GetLength(i)-1;
+ for(uint32_t j = i; j < size; j++){
+ upper_length = pattern_set.GetLength(j);
+
+ if(i == j && !oc){
+ upper_length = 1;
+ }
+ var_hom = 0.0;
+ var_bac = 0.0;
+ for (int s = -1 * lower_length; s < (int)upper_length; s++) {
+ shift = ShiftPos(i, j, s);
+ if (!oc) {
+ var_hom += (pow(p, shift) - pow(p, 2 * weight));
+ var_bac += (pow(q, shift) - pow(q, 2 * weight));
+ }
+ else{
+ var_hom += (double) (oc_hom << (shift));
+ }
+ }
+ if(!oc){
+ length_mean = (lower_length + 1 + upper_length) / 2;
+ hom = (seq_leng - length_mean + 1);
+ hom *= var_hom;
+
+ back = (seq_leng - length_mean + 1);
+ back *= (seq_leng - length_mean);
+ back *= var_bac;
+ }
+ else{
+ hom = var_hom;
+ back = 0.0;
+ }
+ variance_mat[i][j] = hom + back;
+ variance_mat[j][i] = hom + back;
+ variance_val += variance_mat[i][j];
+ pattern_variance[i] += variance_mat[i][j];
+ pattern_variance[j] += variance_mat[i][j];
+ }
+ }
+
+ for(uint32_t i = 0; i < size; i++){
+ if(pattern_set.GetImprove(i)){
+ tmp_pair = std::make_pair(pattern_variance[i],i);
+ variance_set.insert(tmp_pair);
+ }
+ }
+}
+
+/**
+ * Part of the destructor; could be used in other cases for
+ * reinitializing the variance/oc new.
+ */
+void variance::Clear(){
+ variance_mat.clear();
+ variance_set.clear();
+ pattern_variance.clear();
+ pattern_current_contribute.clear();
+ pattern_last_contribute.clear();
+}
+
+
+/*---Variance-----------------------------------------------------------------*/
+/**
+ * Instead of recalculating each value, only the matrix entries which
+ * belong to the modified pattern have to be recalculated.
+ *
+ * @param number Patternindex of the modified pattern
+ */
+void variance::UpdateVariance(uint32_t number){
+ double var_hom, var_bac, hom, back;
+ uint64_t oc_hom;
+ uint32_t lower_length, upper_length, length_mean;
+ int shift;
+
+ if(initialized){
+ oc_hom = 1;
+ lower_length = pattern_set.GetLength(number)-1;
+ for(uint32_t i = 0; i < size; i++){
+ pattern_last_contribute[i] = variance_mat[number][i];
+ variance_val -= pattern_last_contribute[i];
+ pattern_variance[number] -= pattern_last_contribute[i];
+ pattern_variance[i] -= pattern_last_contribute[i];
+ upper_length = pattern_set.GetLength(i);
+
+ if(number == i && !oc){
+ upper_length = 1;
+ }
+ var_hom = 0.0;
+ var_bac = 0.0;
+ for (int s = -1 * lower_length; s < (int)upper_length; s++) {
+ shift = ShiftPos(number, i, s);
+ if (!oc) {
+ var_hom += (pow(p, shift) - pow(p, 2 * weight));
+ var_bac += (pow(q, shift) - pow(q, 2 * weight));
+ }
+ else{
+ var_hom += (double) (oc_hom << (shift));
+ }
+ }
+ if(!oc){
+ length_mean = (lower_length + 1 + upper_length) / 2;
+ hom = (seq_leng - length_mean + 1);
+ hom *= var_hom;
+
+ back = (seq_leng - length_mean + 1);
+ back *= (seq_leng - length_mean);
+ back *= var_bac;
+ }
+ else{
+ hom = var_hom;
+ back = 0.0;
+ }
+ variance_mat[number][i] = hom + back;
+ variance_mat[i][number] = hom + back;
+ pattern_current_contribute[i] = variance_mat[number][i];
+ variance_val += pattern_current_contribute[i];
+ pattern_variance[number] += pattern_current_contribute[i];
+ pattern_variance[i] += pattern_current_contribute[i];
+
+ }
+ }
+}
+
+/**
+ * Shifts pattern and counts the number of common match positions.
+ *
+ * @param p1 Position of the first used pattern of the pattern set.
+ *
+ * @param p2 Position of the second used pattern of the pattern set.
+ * NOTE: possible is p1 = p2
+ *
+ * @param s The shift of the second pattern, s < 0 := shift left
+ * pattern 2, s > := shift right pattern 2.
+ *
+ * @return Calculates and returns current variance.
+ */
+uint32_t variance::ShiftPos(uint32_t number, uint32_t number2, int shift){
+ std::vector<char> stra, strb;
+ uint64_t pata, patb, pos, tmp;
+ uint32_t counter, maxi, maxa, maxb;
+ counter = 0;
+ pos = 1;
+ if(bitmode){
+ pata = pattern_set.GetBitPattern(number);
+ patb = pattern_set.GetBitPattern(number2);
+ if(shift < 0){
+ shift= std::abs(shift);
+ pata = pata >> shift;
+ }
+ else{
+ patb = patb >> shift;
+ }
+ maxa = (uint32_t) std::log2(pata)+1;
+ maxb = (uint32_t) std::log2(patb)+1;
+ maxi = std::min(maxa,maxb);
+ tmp = pata & patb;
+ for(uint32_t i = 0; i < maxi; i++){
+ if((tmp & (pos << i)) != 0){
+ counter ++;
+ }
+ }
+ if(!oc){
+ counter = 2*weight-counter;
+ }
+ }
+ else{
+ stra = pattern_set.GetPattern(number);
+ strb = pattern_set.GetPattern(number2);
+
+ maxa = pattern_set.GetLength(number);
+ maxb = pattern_set.GetLength(number2);
+ if(shift < 0){
+ shift = std::abs(shift);
+ maxa -= shift;
+ }
+ else{
+ maxb -= shift;
+ std::swap(stra,strb);
+ }
+ maxi = std::min(maxa,maxb);
+ for(uint32_t i = 0; i < maxi; i++){
+ if(stra[i] == '1' && strb[strb.size() - maxi + i] == '1'){
+ counter++;
+ }
+ }
+ }
+ return counter;
+}
+
+
+/**
+ * If the random swap leads to a worse oc/variance, the old values have
+ * to be restored.
+ */
+void variance::ResetContribute(uint32_t number){
+ for(uint32_t i = 0; i < size; i++){
+ variance_val -= pattern_current_contribute[i];
+ pattern_variance[number] -= pattern_current_contribute[i];
+ pattern_variance[i] -= pattern_current_contribute[i];
+
+ variance_val += pattern_last_contribute[i];
+ pattern_variance[number] += pattern_last_contribute[i];
+ pattern_variance[i] += pattern_last_contribute[i];
+ variance_mat[number][i] = pattern_last_contribute[i];
+ variance_mat[i][number] = pattern_last_contribute[i];
+ }
+}
+
+
+/*---Improvment--------------------------------------------------------------*/
+/**
+ * The improvement method.
+ *
+ * @param limits The number of patterns which have to be selected and mayby modified.
+ */
+void variance::Improve(uint32_t limits){
+ std::set<std::pair<double,uint32_t> >::iterator var_iterator;
+ std::vector<char> last_pat;
+ std::pair<double,uint32_t> tmp_pair;
+ uint64_t last_bit_pat;
+ double best_variance_val;
+ uint32_t worst_pat, better_pattern;
+
+ if(initialized && improve){
+ best_variance_val = variance_val;
+ var_iterator = variance_set.end();
+ var_iterator--;
+ better_pattern = 0;
+
+ if(bitmode){
+ last_bit_pat = 0;
+ }
+
+ for(uint32_t i = 1; i <= limits; i++){
+ worst_pat = var_iterator->second;
+ if(bitmode){
+ last_bit_pat = pattern_set.GetBitPattern(worst_pat);
+ }
+ else{
+ last_pat = pattern_set.GetPattern(worst_pat);
+ }
+
+ pattern_set.ChangeBitsRandom(worst_pat);
+ UpdateVariance(worst_pat);
+
+ if(variance_val <= best_variance_val){
+ if(best_variance_val == variance_val){
+ if(var_iterator == variance_set.begin()){
+ var_iterator = variance_set.end();
+ }
+ }
+ else{
+ best_variance_val = variance_val;
+ variance_set.erase(var_iterator);
+ tmp_pair = std::make_pair(pattern_variance[worst_pat],worst_pat);
+ variance_set.insert(tmp_pair);
+ var_iterator = variance_set.end();
+ better_pattern++;
+
+ if(loops == 1 && (!silent || impro_silent)){
+ if(!quiet){
+ std::cout << "*** BETTER PATTERN " << better_pattern << " *** \t(random permutation)" << std::endl;
+ std::cout << "Step " << i << " / " << limits << std::endl << "Patternset: \n";
+ pattern_set.Print();
+ std::cout << outvar << GetVariance() << std::endl;
+ std::cout << "norm_" << outvar << GetNormVariance() << std::endl << std::endl;
+ }
+ else{
+ std::cout << "\r*** BETTER PATTERN " << better_pattern << " *** \t(random permutation)";
+ std::cout.flush();
+ }
+ }
+ }
+ if(!bitmode){
+ last_pat.clear();
+ }
+ }
+ else{
+ if(var_iterator == variance_set.begin()){
+ var_iterator = variance_set.end();
+ }
+ if(bitmode){
+ pattern_set.SetPattern(worst_pat, last_bit_pat);
+ }
+ else{
+ pattern_set.SetPattern(worst_pat, last_pat);
+ }
+ ResetContribute(worst_pat);
+ }
+ var_iterator--;
+ }
+ if(!silent && quiet){
+ std::cout << std::endl;
+ }
+ }
+ else if(!improve && !silent){
+ SecureMessage("noimprove",-1);
+ }
+}
+
+/**
+ * Improvement with reinitialization. Improves a patternset, resets it 'loops' times and
+ * and sets the best patternset with the best detected oc/variance.
+ *
+ * @param limits The number of patterns which have to be selected and mayby modified.
+ *
+ * @param loops The number of reinitializations to find better oc/variance patternsets.
+ */
+void variance::Improve(uint32_t limits, uint32_t loops){
+ if(initialized && improve){
+ patternset best_pattern_set;
+ std::ofstream pattern_out;
+ std::string out_str;
+ double best_variance_val;
+ uint32_t better_pattern;
+
+ this->loops = loops;
+
+ if(!silent || best_silent){
+ std::cout << "\n===== First patternset =====" << std::endl;
+ pattern_set.Print();
+ std::cout << outvar << GetVariance() << std::endl;
+ std::cout << "norm_" << outvar << GetNormVariance() << std::endl << std::endl;
+ }
+
+ best_variance_val = variance_val;
+ better_pattern = 0;
+ best_pattern_set = pattern_set;
+ for(uint32_t i = 1; i <= loops; i++){
+ Improve(limits);
+ if(best_variance_val > variance_val){
+ better_pattern++;
+ if(loops != 1 && (!silent || impro_silent)){
+ if(!quiet){
+ std::cout << "\n*** BETTER PATTERN " << better_pattern << " *** \t(oc/variance optimization)" << std::endl;
+ std::cout << "Step " << i << " / " << loops << std::endl << "Patternset: \n";
+ pattern_set.Print();
+ std::cout << outvar << GetVariance() << std::endl;
+ std::cout << "norm_" << outvar << GetNormVariance() << std::endl << std::endl;
+ }
+ else{
+ std::cout << "\r*** BETTER PATTERN " << better_pattern << " *** \t(oc/variance optimization)";
+ std::cout.flush();
+ }
+ }
+ best_variance_val = variance_val;
+ best_pattern_set = pattern_set;
+ }
+ ReInit();
+ }
+ if(!silent || best_silent){
+ std::cout << std::endl;
+ }
+ pattern_set = best_pattern_set;
+ ReInit(pattern_set);
+
+ if(!silent || best_silent){
+ std::cout << "\n===== Best patternset ======" << std::endl;
+ pattern_set.Print();
+ std::cout << outvar << GetVariance() << std::endl;
+ std::cout << "norm_" << outvar << GetNormVariance() << std::endl << std::endl;
+ }
+ if(isOutFile && best_silent){
+ pattern_out.open(outFile);
+ for(uint32_t i = 0; i < size; i++){
+ out_str = std::string(pattern_set.GetPattern(i).begin(),pattern_set.GetPattern(i).end());
+ pattern_out << out_str << std::endl;
+ }
+ pattern_out << GetFormat() << GetVariance() << std::endl;
+ pattern_out << "norm_" << GetFormat() << GetNormVariance() << std::endl;
+ }
+ }
+ else if(!improve && !silent){
+ SecureMessage("noimprove",-1);
+ }
+}
+
+
+/*---stuff-------------------------------------------------------------------*/
+/**
+ * Prints the current patternset to the commandline.
+ */
+void variance::PrintPatternSet(){
+ pattern_set.Print();
+}
+
+
+/**
+ * A Method to collect all errormessages. Just easier for programmer to change
+ * the text or extend.
+ *
+ * @param errmsg Due to a few possible errormessages, this is the option, which has to be printed.
+ *
+ * @param pos The position of the incorrect patterns.
+ */
+void variance::SecureMessage(std::string errmsg, int pos){
+ if (errmsg == "noimprove" && !silent) {
+ printf("%c[1;33m", 27);
+ std::cerr << "Using your pattern conditions it is not sensible to improve your pattern, sorry!" << std::endl;
+ std::cerr << "Deactivating improve mode\n" << std::endl;
+ printf("%c[0m", 27);
+ return;
+ }
+ if (errmsg == "pq") {
+ printf("%c[1;32m\n\n--> IMPORTANT <--\n", 27);
+ printf("%c[0m", 27);
+ printf("%c[1;31mError ", 27);
+ printf("%c[0m", 27);
+ std::cerr << "while parsing your p and/or q value: \t0 < q <= p <= 1!" << std::endl;
+ std::cerr << "Return to default values:\tp = 0.75 \tq=0.25\n" << std::endl;
+ return;
+ }
+}
+
+
+/*---Set&Get-----------------------------------------------------------------*/
+/**
+ * Returns the patternset object
+ * @return Copy of the current patternset object (type: patternset)
+ */
+patternset variance::GetPatternSet(){
+ return pattern_set;
+}
+
+
+/**
+ * Returns the patternset as std::vector< std::vector<char> >
+ *
+ * @return Copy of the current patternset vector (type: std::vector< std::vector<char> >)
+ */
+std::vector< std::vector<char> > variance::GetPattern(){
+ return pattern_set.GetPattern();
+}
+
+/**
+ * Returns a pattern of the patternset as std::vector<char>
+ *
+ * @return Copy of the 'number'th pattern of the current patternset vector (type: std::vector<char>)
+ */
+std::vector<char> variance::GetPattern(uint32_t number){
+ return pattern_set.GetPattern(number);
+}
+
+
+/**
+ * Returns the patternset as bitpattern as std::vector<uint64_t>
+ *
+ * @return Copy of the current bit-patternset vector (type: std::vector<uint64_t>)
+ */
+std::vector<uint64_t> variance::GetBitPattern(){
+ return pattern_set.GetBitPattern();
+}
+
+
+/**
+ * Returns a pattern of the patternset as uint64_t.
+ *
+ * @return Copy of the 'number'th pattern of the current patternset vector (type: uint64_t)
+ */
+uint64_t variance::GetBitPattern(uint32_t number){
+ return pattern_set.GetBitPattern(number);
+}
+
+
+/**
+ * Returns the current variance/oc value.
+ *
+ * @return Current pattern variance/oc.
+ */
+double variance::GetVariance(){
+ return variance_val;
+}
+
+
+/**
+ * Returns the current norm variance/oc value.
+ *
+ * @return Current pattern norm variance/oc.
+ */
+double variance::GetNormVariance(){
+ return variance_val/norm_size;
+}
+
+
+/**
+ * Returns the current patternset size.
+ *
+ * @return Current pattern size.
+ */
+uint32_t variance::GetSize(){
+ return size;
+}
+
+
+/**
+ * Returns the current patternset weight.
+ *
+ * @return Current pattern weight.
+ */
+uint32_t variance::GetWeight(){
+ return weight;
+}
+
+
+/**
+ * Returns the current patternset minimum don't care number.
+ *
+ * @return Current pattern minimum don't care number.
+ */
+uint32_t variance::GetMaxDontcare(){
+ return max_dontcare;
+}
+
+
+/**
+ * Returns the current patternset maximum don't care number.
+ *
+ * @return Current pattern maximum don't care number.
+ */
+uint32_t variance::GetMinDontcare(){
+ return min_dontcare;
+}
+
+
+/**
+ * Returns the current dataset sequence length for variance calculation.
+ *
+ * @return Current dataset sequence length for variance calculation.
+ */
+uint32_t variance::GetSequenceLength(){
+ return seq_leng;
+}
+
+
+/**
+ * Returns the current match probability for variance calculation.
+ *
+ * @return Current match probability for variance calculation.
+ */
+double variance::GetP(){
+ return p;
+}
+
+
+/**
+ * Returns the current background probability for variance calculation.
+ *
+ * @return Current background probability for variance calculation.
+ */
+double variance::GetQ(){
+ return q;
+}
+
+
+/**
+ * Returns boolean if the current patternset modus is bitmode oder not.
+ *
+ * @return Current bitmode boolean (true, if bitmode).
+ */
+bool variance::GetBitMode(){
+ return pattern_set.GetBitMode();
+}
+
+
+/**
+ * Returns boolean if the current patternset modus has been updated
+ *
+ * @return Current patternmode has been updated.
+ */
+bool variance::GetUpdate(){
+ return pattern_set.GetUpdate();
+}
+
+/**
+ * A special option, if silent == true but, first and best pattern should be printed.
+ *
+ * @param sil Boolean, if output first and best pattern, although silent == true.
+ */
+void variance::BestSilent(bool sil){
+ best_silent = sil;
+}
+
+
+/**
+ * A special option, if silent == true but the number of improved pattern should be printed.
+ *
+ * @param sil Boolean, if print number of improvements, although silent == true.
+ */
+void variance::ImproSilent(bool sil){
+ impro_silent = sil;
+}
+
+/**
+ * Returns a string containing a speacial output format, if its overlap complexity or not.
+ *
+ * @return String which contains current pattern mode overlap complexity or variance calculation.
+ */
+std::string variance::GetFormat(){
+ return outvar;
+}
+
+
+/**
+ * Returns a boolean, if there is at least one pattern, which can be improved/changed, or not.
+ *
+ * @return Boolean, if at least one pattern can be changed.
+ */
+bool variance::GetImprove(){
+ return improve;
+}
+
+
+/**
+ * Prints the current patternset to the commandline.
+ */
+void variance::Print(){
+ pattern_set.Print();
+}
\ No newline at end of file
=====================================
src/variance.h
=====================================
--- /dev/null
+++ b/src/variance.h
@@ -0,0 +1,119 @@
+/**
+ * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
+ * It is possible to improve your patternset and read patterns from a file.
+ *
+ * variance object header
+ *
+ * For theory please have a look at:
+ *
+ * - L. Hahn, C.-A. Leimeister, B. Morgenstern (2015)
+ * RasBhari: optimizing spaced seeds for database searching, read mapping and alignment-free sequence comparison
+ * arXiv:1511.04001[q-bio.GN]
+ *
+ * - B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
+ * Estimating evolutionary distances between genomic sequences from spaced-word matches
+ * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
+ *
+ *
+ * @author: Lars Hahn - 12.05.2016, Georg-August-Universitaet Goettingen
+ * @version: 1.2.0 05/2016
+ */
+#ifndef VARIANCE_H_
+#define VARIANCE_H_
+
+#include <cmath>
+#include <cstdint>
+#include <set>
+#include <utility>
+#include <vector>
+#include "patternset.h"
+
+class variance{
+public:
+ variance();
+ variance(uint32_t size, uint32_t weight, uint32_t min_dontcare, uint32_t max_dontcare);
+ variance(uint32_t size, uint32_t weight, uint32_t min_dontcare, uint32_t max_dontcare, uint32_t seq_leng, double p, double q);
+ variance(const char *inFile, double p, double q, uint32_t seq_leng);
+ ~variance();
+
+ void Init();
+ void Init(bool oc, bool improve, bool quiet, bool silent, bool random_leng, const char *outFile);
+ void ReInit();
+
+ void Improve(uint32_t limits);
+ void Improve(uint32_t limits, uint32_t loops);
+ void PrintPatternSet();
+
+ /*-----------------------------------------------------------------------*/
+ patternset GetPatternSet();
+ std::vector< std::vector<char> > GetPattern();
+ std::vector<char> GetPattern(uint32_t number);
+ std::vector<uint64_t> GetBitPattern();
+ uint64_t GetBitPattern(uint32_t number);
+
+ double GetVariance();
+ double GetNormVariance();
+
+ uint32_t GetSize();
+ uint32_t GetWeight();
+ uint32_t GetMaxDontcare();
+ uint32_t GetMinDontcare();
+ uint32_t GetSequenceLength();
+ double GetP();
+ double GetQ();
+ bool GetBitMode();
+ bool GetUpdate();
+ void BestSilent(bool sil);
+ void ImproSilent(bool sil);
+ std::string GetFormat();
+ bool GetImprove();
+ void Print();
+
+protected:
+ void Clear();
+ void ReInit(patternset &pat);
+ void InitVariance();
+ void SetVariance();
+ void UpdateVariance(uint32_t number);
+ uint32_t ShiftPos(uint32_t number, uint32_t number2, int shift);
+ void ResetContribute(uint32_t number);
+ void SecureMessage(std::string errmsg, int pos);
+
+
+private:
+ patternset pattern_set;
+ std::vector< std::vector<double> > variance_mat;
+ std::set< std::pair<double, uint32_t> > variance_set;
+ std::vector<double> pattern_variance;
+ std::vector<double> pattern_current_contribute;
+ std::vector<double> pattern_last_contribute;
+
+ std::string outvar;
+ double variance_val;
+ double p;
+ double q;
+
+ uint32_t size;
+ uint32_t weight;
+ uint32_t min_dontcare;
+ uint32_t max_dontcare;
+ uint32_t seq_leng;
+ uint32_t norm_size;
+ uint32_t loops;
+
+ const char *inFile;
+ const char *outFile;
+
+ bool isInFile;
+ bool isOutFile;
+ bool bitmode;
+ bool initialized;
+ bool random_leng;
+ bool improve;
+ bool quiet;
+ bool silent;
+ bool best_silent;
+ bool impro_silent;
+ bool oc;
+};
+#endif
\ No newline at end of file
=====================================
variance.cpp deleted
=====================================
--- a/variance.cpp
+++ /dev/null
@@ -1,788 +0,0 @@
-/**
- * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
- * It is possible to improve your patternset and read patterns from a file.
- *
- * variance object file
- *
- * For theory please have a look at:
- *
- * B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
- * Estimating evolutionary distances between genomic sequences from spaced-word matches
- * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
- *
- *
- * @author: Lars Hahn - 26.10.2015, Georg-August-Universitaet Goettingen
- * @version: 1.0.2 11/2015
- */
-#include "variance.h"
-
-
-/*---Variables---------------------------------------------------------------*/
-
-
-
-/*===Main-Part===============================================================*/
-/*---Constructor-------------------------------------------------------------*/
-/**
- * Default constructor, sets the default vaulues, pattern will be generated automatically.
- */
-variance::variance() {
- int *length = new int[2];
- length[0] = 14;
- length[1] = 14;
- Ctor(NULL, 10, length, 8, 10000, 10000, 10000, 0.75, 0.25, 64);
-}
-
-
-/**
- * File constructor, sets values only from files; resets automatically if there are problems.
- */
-variance::variance(char* pattern_file) {
- int *length = new int[2];
- length[0] = 14;
- length[1] = 14;
- Ctor(pattern_file, 10, length, 8, 10000, 10000, 10000, 0.75, 0.25, 64);
-}
-
-
-/**
- * Short constructor, sets some default vaulues, just pattern dimension is set.
- *
- * @param size The amount of patterns; pattern number.
- *
- * @param length The pattern length for each pattern of the pattern set.
- *
- * @param weigth The weight (match positions; '1') for each pattern of the pattern set.
- */
-variance::variance(int size, int *length, int weight) {
- Ctor(NULL, size, length, weight, 10000, 10000, 10000, 0.75, 0.25, 64);
-}
-
-
-/**
- * Long constructor, sets the values; resets automatically, if there are problems.
- *
- * @param pattern_file File, that may contains submitted pattern
- *
- * @param align_file File, that may contains an alignment file to estimate p, q, l_hom, li and lj
- *
- * @param size The amount of patterns; pattern number.
- *
- * @param length The pattern length for each pattern of the pattern set.
- *
- * @param weigth The weight (match positions; '1') for each pattern of the pattern set.
- *
- * @param l_hom In theory, the amount of homologous positions of two sequences in an multiple alignment.
- *
- * @param l1 In theory, the first(represents each sequence i) sequence length of two observed sequences.
- *
- * @param l2 In theory, the second(represents each sequence j) sequence length of two observed sequences.
- *
- * @param p The match probability ( = #matches / #l_hom)
- *
- * @param q The background probability for each nucleotide A,C,G,T
- */
-variance::variance(char* pattern_file, int size, int *length, int weight, int l_hom, int l1, int l2, double p, double q, int H) {
- Ctor(pattern_file, size, length, weight, l_hom, l1, l2, p, q, H);
-}
-
-
-/**
- * Default destructor, deletes all vectors and matrices in the object.
- */
-variance::~variance() {
- Clear();
-}
-
-void variance::Ctor(char* pattern_file, int size, int *length, int weight, int l_hom, int l1, int l2, double p, double q, int H){
- this->l_hom = l_hom;
- this->l1 = l1;
- this->l2 = l2;
- this->p = p;
- this->q = q;
- this->H = H;
- this->length = length;
- variance_val = 0;
- loop_opt = 1;
- quiet = false;
- silent = false;
- oc = false;
- update = false;
- outvar = "variance: ";
- InitVariance(pattern_file, size, length, weight);
-}
-
-/*---Init--------------------------------------------------------------------*/
-/**
- * Complete Initializing of the Variance, which also creates the patternsets
- * calculates the first variance/oc and checks if each parameter is in its
- * own correct size and dimension.
- */
-void variance::InitVariance(char* pattern_file, int size, int *length, int weight){
- Clear();
- pattern_set = new patternset(pattern_file, size, length, weight);
- this->size = pattern_set->GetSize();
- this->weight = pattern_set->GetWeight();
- this->length = pattern_set->GetLength();
- length_mean = pattern_set->GetLengthMean();
- improve = pattern_set->GetImprove();
- update = pattern_set->GetUpdate();
- if (q > p || q >= 1 || p > 1 || q < 0 || p < 0) { /*Incorrect values for variance have to be corrected*/
- SecureMessage("pq");
- p = 0.9;
- q = 0.25;
- update = true;
- }
-
- if (l_hom <= 0 || l1 <= 0 || l2 <= 0) {
- SecureMessage("length");
- l_hom = 10000;
- l1 = 10000;
- l2 = 10000;
- update = true;
- }
- InitVarMatrix();
- InitVar();
- if(update){
- Quiet();
- }
-}
-
-/**
- * Part Initialzinig of the Variance, it will not reset complete.
- * Only for ReInitializin a new pattternset with most values in
- * common.
- * Public Access, so other programs can 'reload' it
- */
-void variance::ReInitVariance(){
- for(int i = 0; i < size; i++){
- var_sum[i].clear();
- }
- var_sum.clear();
- pattern_order.clear();
- pattern_set->ReInitPattern();
- length_mean = pattern_set->GetLengthMean();
- InitVarMatrix();
- InitVar();
-}
-
-
-/**
- * Part of the destructor; could be used in other cases for
- * reinitializing the variance/oc new.
- */
-void variance::Clear(){
- for(int i = 0; i < size; i++){
- var_sum[i].clear();
- }
- var_sum.clear();
- pattern_order.clear();
- delete pattern_set;
-}
-
-/**
- * Up to different patternsizes a specific matrix will be created.
- * Necessary for reinitliazing.
- */
-void variance::InitVarMatrix() {
- std::vector<double> tmp;
- for (int i = 0; i < size; i++) {
- tmp.push_back(0.0);
- }
- for (int i = 0; i < size; i++) {
- var_sum.push_back(tmp);
- }
-}
-
-/**
- * First Initializing of the variance/oc where each value of the matrix has
- * to be created, instead of updated variance, where it is not necessary to
- * recalculate each value/entry. Only of those, which alter due to pattern
- * modification.
- */
-void variance::InitVar() {
- double var_hom, var_bac, hom, back, tmp;// ,hom_val, back_val;
- int shift, lower_length, upper_length;
- uint64_t oc_hom;
-
- oc_hom = 1;
- //hom_val = 0;
- //back_val = 0;
-
- for (int i = 0; i < size; i++) { /*i and j represents Pi and Pj of the set of pattern*/
- lower_length = (int) log2(pattern_set->GetPattern(i));
- for (int j = i; j < size; j++) {
- upper_length = (int) log2(pattern_set->GetPattern(j)) + 1;
-
- if(i == j && !oc){
- upper_length = 1;
- }
-
- var_hom = 0.0;
- var_bac = 0.0;
- for (int s = -1 * lower_length; s < upper_length; s++) { /*As in the formula, the shift goes from max shift left to max shift right*/
- shift = ShiftPos(i, j, s); /*At least one position has to overlap*/
- if (!oc) {
- var_hom += (pow(p, shift) - pow(p, 2 * weight)); /*summation of the homologue first part*/
- var_bac += (pow(q, shift) - pow(q, 2 * weight));
- }
- else{
- var_hom += (double) (oc_hom << (shift));
- }
- }
- if(!oc){
- hom = (l_hom - length_mean + 1);
- hom *= var_hom;
-
- back = (l1 - length_mean + 1);
- back *= (l2 - length_mean);
- back *= var_bac;
- }
- else{
- hom = var_hom;
- back = 0.0;
- }
- // hom_val += hom;
- // back_val += back;
- var_sum[i][j] = hom + back; /*For each pair Pi and Pj this is the direct contribute to the complete variance*/
- var_sum[j][i] = var_sum[i][j];
- }
- }
- variance_val = Variance();
-
- pattern_order.clear();
-
- for(int i = 0; i < size; i++){
- tmp = 0.0;
- for(int j = 0; j < size; j++){
- tmp += var_sum[i][j]; /*Sets the contribute of each pattern in the beginning*/
- }
- pattern_order.push_back(tmp);
- }
- ResetPatternOrder();
-
- //std::cout << "homologue contribute: " << hom_val/variance_val << std::endl;
- //std::cout << "background contribute: " << back_val/variance_val << std::endl;
-}
-
-
-/*---Variance-----------------------------------------------------------------*/
-/**
- * IF neccessary, it calculates the current variance/oc from the matrix.
- *
- * @return returns current variance/oc
- */
-double variance::Variance() {
- double var;
-
- var = 0.0;
-
- for (int i = 0; i < size; i++) {
- for(int j = i; j < size; j++){
- var += var_sum[i][j];
- }
- }
- variance_val = var;
- return var;
-}
-
-/**
- * Instead of recalculating each value, only the matrix entries which
- * belong to the modified pattern have to be recalculated.
- *
- * @param pat Patternindex of the modified pattern
- */
-void variance::UpdateVariance(int pat) {
- double var_hom, var_bac, hom, back;
- int shift, lower_length, upper_length;
- uint64_t oc_hom;
-
- oc_hom = 1;
-
- for (int j = 0; j < size; j++) {
- var_hom = 0.0;
- var_bac = 0.0;
-
- lower_length = (int) log2(pattern_set->GetPattern(pat));
- upper_length = (int) log2(pattern_set->GetPattern(j)) + 1;
-
- if(pat == j && !oc){
- upper_length = 1;
- }
-
- for (int s = -1 * lower_length; s < upper_length; s++) { /*As in the formula, the shift goes from max shift left to max shift right*/
- shift = ShiftPos(pat, j, s); /*At least one position has to overlap*/
- if (!oc) {
- var_hom += (pow(p, shift) - pow(p, 2 * weight)); /*summation of the homologue first part*/
- var_bac += (pow(q, shift) - pow(q, 2 * weight));
- }
- else{
- var_hom += (double) (oc_hom << (shift));
- }
- }
-
- variance_val -= var_sum[pat][j];
- pattern_order[pat] -= var_sum[pat][j];
- pattern_order[j] -= var_sum[pat][j];
-
- if(!oc){
- hom = (l_hom - length_mean + 1);
- hom *= var_hom;
- back = (l1 - length_mean + 1);
- back *= (l2 - length_mean);
- back *= var_bac;
- }
- else{
- hom = var_hom;
- back = 0.0;
- }
-
- var_sum[pat][j] = hom + back; /*For each pair Pi and Pj this is the direct share of the complete variance...*/
- var_sum[j][pat] = var_sum[pat][j];
-
- variance_val += var_sum[pat][j];
- pattern_order[pat] += var_sum[pat][j];
- pattern_order[j] += var_sum[pat][j];
- }
-}
-
-/**
- * Shifts pattern and counts the number of common match positions
- *
- * @param p1 Position of the first used pattern of the pattern set
- *
- * @param p2 Position of the second used pattern of the pattern set
- * NOTE: possible is p1 = p2
- *
- * @param s The shift of the second pattern, s < 0 := shift left pattern 2, s > := shift right pattern 2
- *
- * @return Calculates and returns current variance
- */
-int variance::ShiftPos(int number, int number2, int s) {
- int counter, maxi, maxa, maxb;
- uint64_t pata, patb, c;
-
- pata = pattern_set->GetPattern(number);
- patb = pattern_set->GetPattern(number2);
-
- counter = 0;
- c = 1; /*LookUp Counter*/
-
- if (s < 0) { /*Negative shift means, the upper pattern is shifted right*/
- s = 0 - s;
-
- pata = pata >> s;
- }
- else{ /*otherweise the lower pattern is shifted right*/
- patb = patb >> s;
- }
-
- maxa = (int) log2(pata)+1;
- maxb = (int) log2(patb)+1;
- maxi = std::min(maxa,maxb);
- for (int i = 0; i < maxi; i++) {
- if(((pata & patb) & c << i) != 0){
- counter++;
- }
- }
-
- if(!oc){
- counter = 2*weight - counter; /*Allows us to calculate th n(p,p',s) value in a way...*/
- } /*...with not so much space. We do not need to look for...*/
- return counter; /*...each position containing at least one match*/
-}
-
-/**
- * Returns the position of the worst pattern, estimated by the maximum variancepart
- * for each pattern pair
- *
- * @param n The pattern index of the chosen pattern.
- *
- * @return returns position worst matrix by max_value
- */
-int variance::WorstPattern(int n){
- return pattern_order_sort[n]->GetPos();
-}
-
-void variance::ResetPatternOrder(){
- extkey* tmp;
- int end;
-
- if(pattern_order_sort.size() != 0){
- pattern_order_sort.clear();
- }
-
- if(length[0]==weight){
- end = size-1;
- }
- else{
- end = size;
- }
-
- for(int i = 0; i < end; i++){
- tmp = new extkey(i,pattern_order[i]); /*Using an extended key to store each contribute and the ...*/
- pattern_order_sort.push_back(tmp); /*pattern number.*/
- }
-
- std::sort(pattern_order_sort.rbegin(), pattern_order_sort.rend());
-}
-
-/**
- * The improvement method
- *
- * @param limit The number of patterns which have to be selected and mayby modified.
- */
-void variance::Improve(int limit) {
- std::vector<uint64_t> vec;
- uint64_t pat_save;
- double var_save, tmp_var, best_by;
- int worst_pat, better_pattern, counter;
- //std::string str_out = "Output_variance";
- //std::ofstream fout;
- //fout.open(str_out);
-
- best_by = variance_val;
-
- if (!silent) {
- std::cout << "First " << outvar << "\t" << GetVariance() << std::endl;
- std::cout << "First norm_" << outvar << "\t" << GetNormVariance() << std::endl << std::endl;
- }
- //fout << 0 << "\t" << GetNormVariance() << std::endl;
-
- if(loop_opt > 1){
- pattern_set->Silent();
- for(int i = 0; i < size; i++){
- vec.push_back(pattern_set->GetPattern(i));
- }
- }
-
-
- for(int d = 0; d < loop_opt; d++){
-
- counter = 0;
- better_pattern = 0;
- worst_pat = 0;
- if(d > 0){
- if(!silent && !quiet){
- std::cout << "\n\n=================\nAnd do it again!" << std::endl;
- }
- ReInitVariance();
- }
- if (improve) {
- for (int i = 1; i <= limit; i++) {
- var_save = variance_val;
- worst_pat = GetWorstPat(counter%((int)pattern_order_sort.size()));
- //std::cout << counter << "\t" << worst_pat << std::endl;
- pat_save = pattern_set->GetPattern(worst_pat);
- pattern_set->ChangeBits(worst_pat);
-
- UpdateVariance(worst_pat);
- tmp_var = variance_val;
-
- if (tmp_var < var_save && pattern_set->UniqPattern(worst_pat)) {
- better_pattern++;
- var_save = variance_val;
- if (quiet && !silent) {
- std::cout << "\r*** BETTER PATTERN " << better_pattern << " ***";
- std::cout.flush();
- }
- else if (!quiet && !silent) {
- std::cout << "*** BETTER PATTERN " << better_pattern << " ***" << std::endl;
- std::cout << "Step " << i << " / " << limit << std::endl << "Patternset: \n";
- pattern_set->Print();
- std::cout << outvar << GetVariance() << std::endl;
- std::cout << "norm_" << outvar << GetNormVariance() << std::endl << std::endl;
- }
- else{
- //Do-Nothing
- }
- counter = 0;
- ResetPatternOrder();
- //fout << i << "\t" << GetNormVariance() << std::endl;
- }
- else{
- pattern_set->SetPattern(worst_pat, pat_save);
- UpdateVariance(worst_pat);
- counter++;
- }
- }
- if(best_by > variance_val && loop_opt > 2){
- best_by = variance_val;
- for(int k = 0; k < size; k++){
- vec[k] = pattern_set->GetPattern(k);
- }
- }
- if (!silent && loop_opt < 2) {
- std::cout << "\n\nBest pattern:\n\n";
- pattern_set->Print();
- std::cout << "\nBest " << outvar << GetVariance() << std::endl;
- std::cout << "Best norm_"<< outvar << GetNormVariance() << std::endl;
- }
- //fout << limit << "\t" << GetNormVariance() << std::endl;
- }
- }
- //if(update){
- //SecureMessage("update");
- //}
- if(loop_opt > 1){
- for(int i = 0; i < size; i++){
- pattern_set->SetPattern(i, vec[i]);
- }
- variance_val = best_by;
- if(!silent){
- std::cout << "\n\nBest pattern:\n\n";
- pattern_set->Print();
- std::cout << "\nBest " << outvar << GetVariance() << std::endl;
- std::cout << "Best norm_"<< outvar << GetNormVariance() << std::endl;
- }
- }
- vec.clear();
- //fout.close();
-}
-
-
-/**
- * Turns the current calculation from variance to overlap complexity.
- * The matrix has to be recalculated complete.
- */
-void variance::ImproveOC() {
- this->oc = true;
- outvar = "oc: ";
- InitVar();
-}
-
-/*---stuff-------------------------------------------------------------------*/
-/**
- * Method calculate the number of all pattern combinations
- * Actually the gauss summation (n*(n+1)/2)
- *
- * @return the number of maximum pattern combinations
- */
-double variance::Gauss() {
- return 0.5*size*size + 0.5*size;
-}
-
-/**
- * Changes to quiet output, only the number of better patterns will be printed
- * automatically, and the best patternset by the variance/oc minimization.
- * Errors arr going to be printed!
- */
-void variance::Quiet() {
- this->quiet = true;
-}
-
-/**
- * Changes to silent output, nothing will be printed automatically!
- * Errors are going to be printed!
- */
-void variance::Silent() {
- this->quiet = true;
- this->silent = true;
- pattern_set->Silent();
-}
-
-/**
- * Resets the pattern lengths from evenly distributed lengths between maximum
- * and minimum to random created pattern lengths.
- * One quarter of the set will have maximum length, the rest will have the
- * length between maximum and minimum randomly distributed.
- */
-void variance::RandPatLength(){
- pattern_set->RandPatLength();
- InitVar();
-}
-
-/**
- * Sets the number of times the improvement method should be executed again.
- * It will save the best variance/oc and its patternset.
- *
- * @param n The number of iterations for the improvement method
- */
-void variance::LoopOpt(int n){
- if(n == 0){
- n = 1;
- }
- loop_opt = n;
-
-}
-
-/**
- * Prints complete the current patternset
- */
-void variance::PrintPattern(){
- pattern_set->Print();
-}
-
-/**
- * A Method to collect all errormessages. Just easier for programmer to change
- * the text or extend.
- *
- * @param errmsg
- * Due to a few possible errormessages, this is the option, which has to be printed.
- *
- * @param pos
- * The position of the incorrect patterns.
- *
- */
-void variance::SecureMessage(std::string errmsg) {
- /*if (errmsg == "noimprove") {
- std::cerr << "Using your pattern conditions it is not sensible to improve your pattern, sorry!" << std::endl;
- std::cerr << "Deactivating improve mode\n" << std::endl;
- return;
- }
- if (errmsg == "update"){
- std::cerr << "\n\n--> IMPORTANT <--\nDue to some configuration errors, your submitted parameters have been updated!\n" << std::endl;
- return;
- }
- if (errmsg == "pq") {
- std::cerr << "Error while parsing your p and/or q value: \t0 < q <= p <= 1!" << std::endl;
- std::cerr << "Return to default values:\tp = 0.9 \tq=0.25\n" << std::endl;
- return;
- }
- if (errmsg == "length") {
- std::cerr << "Error while parsing your sequence length S: \t 0 < S!" << std::endl;
- std::cerr << "Return to default value: \tS = 10000\n" << std::endl;
- return;
- }
- if (errmsg == "wrongindex") {
- std::cerr << "ERROR! Pattern does not exist... do nothing\n" << std::endl;
- return;
- }
- if (errmsg == "speed"){
- std::cerr << "\n\n\nUPDATE! SpEED lenghts optimization is used, creating a new patternset!!!\n\n\n\n" << std::endl;
- return;
- }*/
-}
-
-/*---Set-&-GetFunc-----------------------------------------------------------*/
-
-std::vector<std::string> variance::GetPattern(){
- return pattern_set->GetStringPattern();
-}
-
-std::string variance::GetPattern(int number){
- return pattern_set->GetString(number);
-}
-
-/**
- * Returns the current variance
- *
- * @return returns variance
- */
-double variance::GetVariance() {
- return variance_val;
-}
-
-/**
- * Returns the current variance, normalized
- *
- * @return returns norm_variance
- */
-double variance::GetNormVariance() {
- return variance_val / Gauss();
-}
-
-
-/**
- * Returns the position of the worst pattern, estimated by the maximum variancepart
- * for each pattern pair
- *
- * @return returns position worst matrix by max_value
- */
-int variance::GetWorstPat(int number) {
- if(number >= size){
- return 0;
- }
- return WorstPattern(number);
-}
-
-/**
- * Returns the weight of each pattern, the match positions
- *
- * @return returns weight
- */
-int variance::GetWeight() {
- return pattern_set->GetWeight();
-}
-
-/**
- * Returns the amount of patters; number of patterns
- *
- * @return returns size
- */
-int variance::GetSize() {
- return size;
-}
-
-/**
- * Returns the length of each pattern
- *
- * @return returns length
- */
-int* variance::GetLength() {
- length = pattern_set->GetLength();
- return length;
-}
-
-/**
- * Returns the match probability
- *
- * @return returns p value
- */
-double variance::GetP() {
- return p;
-}
-
-/**
- * Returns the background probabillity, summation over all nucleotids
- *
- * @return returns q value
- */
-double variance::GetQ() {
- return q;
-}
-
-/**
- * Returns the length of the homologous sequence pair
- *
- * @return returns homologous positions
- */
-int variance::GetLHom() {
- return l_hom;
-}
-
-/**
- * Returns the length of the first observed sequence
- *
- * @return returns length sequence 1
- */
-int variance::GetL1() {
- return l1;
-}
-
-/**
- * Returns the length of the second observed sequence
- *
- * @return returns length sequence 2
- */
-int variance::GetL2() {
- return l2;
-}
-
-/**
- * Returns the current format, variance or overlap complexity
- *
- * @return the format string, either 'variance' or 'oc'
- */
-std::string variance::GetFormat() {
- return outvar;
-}
-
-/**
- * Returns the length of a random homolgue region on a dataset
- *
- * @return The length of random homolgue region
- */
-int variance::GetH(){
- return H;
-}
=====================================
variance.h deleted
=====================================
--- a/variance.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/**
- * This programm calculates the variance/OC and/or the sensitivity of a set of pattern with the same weight.
- * It is possible to improve your patternset and read patterns from a file.
- *
- * variance object header
- *
- * For theory please have a look at:
- *
- * B. Morgenstern, B. Zhu, S. Horwege, C.-A Leimeister (2015)
- * Estimating evolutionary distances between genomic sequences from spaced-word matches
- * Algorithms for Molecular Biology 10, 5. (http://www.almob.org/content/10/1/5/abstract)
- *
- *
- * @author: Lars Hahn - 26.10.2015, Georg-August-Universitaet Goettingen
- * @version: 1.0.2 11/2015
- */
-#ifndef VARIANCE_H_
-#define VARIANCE_H_
-
-
-#include <iostream>
-#include <fstream>
-#include <random>
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include "patternset.h"
-#include "extkey.h"
-
-class variance {
-public:
- variance();
- variance(char* pattern_file);
- variance(int size, int *length, int weight);
- variance(char* pattern_file, int size, int *length, int weight, int l_hom, int l1, int l2, double p, double q, int H);
- ~variance();
-
- void ReInitVariance();
- void RandPatLength();
-
- void Improve(int limit);
- void ImproveOC();
-
- std::vector<std::string> GetPattern();
- std::string GetPattern(int number);
- int GetWorstPat(int number);
- int GetWeight();
- int GetSize();
- int* GetLength();
- void PrintPattern();
-
- std::string GetFormat();
- double GetVariance();
- double GetNormVariance();
- double GetP();
- double GetQ();
- int GetLHom();
- int GetL1();
- int GetL2();
- int GetH();
-
- void LoopOpt(int n);
- void Quiet();
- void Silent();
-
-protected:
- void Ctor(char* pattern_file, int size, int *length, int weight, int l_hom, int l1, int l2, double p, double q, int H);
- void InitVariance(char* pattern_file, int size, int *length, int weight);
- void InitVarMatrix();
- void InitVar();
- void SetPatOrder();
- void Clear();
-
- double Variance();
- void UpdateVariance(int pat);
-
- int ShiftPos(int number, int number2, int s);
- int WorstPattern(int number);
- void ResetPatternOrder();
-
- double Gauss();
- void SecureMessage(std::string errmsg);
-
-private:
- std::vector<std::vector<double> > var_sum;
- patternset* pattern_set;
- std::vector<double> pattern_order;
- std::vector<extkey*> pattern_order_sort;
- std::string outvar;
- double variance_val;
- double p;
- double q;
- int size;
- int weight;
- int length_mean;
- int *length;
- int l_hom;
- int l1;
- int l2;
- int H;
- int loop_opt;
- bool oc;
- bool improve;
- bool quiet;
- bool silent;
- bool update;
-};
-#endif
View it on GitLab: https://salsa.debian.org/med-team/spaced/commit/3f1306ad27fb01b15f16c1945979230b25c99725
--
View it on GitLab: https://salsa.debian.org/med-team/spaced/commit/3f1306ad27fb01b15f16c1945979230b25c99725
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180709/2ffe28a4/attachment-0001.html>
More information about the debian-med-commit
mailing list