[med-svn] [bedtools] 04/14: Refactored KeyListOps, Context, mapFile for KeyListOps re-usability by other tools.
Charles Plessy
plessy at moszumanska.debian.org
Thu Mar 6 22:56:44 UTC 2014
This is an automated email from the git hooks/post-receive script.
plessy pushed a commit to branch master
in repository bedtools.
commit 2268ee4fd792e109f2af5477c5e1b6208b5cba20
Author: nkindlon <nek3d at virginia.edu>
Date: Thu Feb 27 23:45:48 2014 -0500
Refactored KeyListOps, Context, mapFile for KeyListOps re-usability by other tools.
---
src/intersectFile/Makefile | 1 +
src/mapFile/Makefile | 1 +
src/mapFile/mapFile.cpp | 67 +---
src/mapFile/mapFile.h | 85 +----
src/mapFile/mapMain.cpp | 138 --------
src/nekSandbox1/Makefile | 1 +
src/regressTest/Makefile | 1 +
src/sampleFile/Makefile | 1 +
src/utils/BinTree/Makefile | 1 +
src/utils/Contexts/ContextBase.cpp | 120 ++++++-
src/utils/Contexts/ContextBase.h | 21 +-
src/utils/Contexts/ContextIntersect.h | 2 +
src/utils/Contexts/ContextMap.cpp | 93 +-----
src/utils/Contexts/ContextMap.h | 20 +-
src/utils/Contexts/Makefile | 1 +
src/utils/FileRecordTools/Records/BamRecord.cpp | 5 +
src/utils/FileRecordTools/Records/BamRecord.h | 1 +
.../FileRecordTools/Records/Bed12Interval.cpp | 26 ++
src/utils/FileRecordTools/Records/Bed12Interval.h | 1 +
src/utils/FileRecordTools/Records/Bed3Interval.cpp | 20 ++
src/utils/FileRecordTools/Records/Bed3Interval.h | 2 +
src/utils/FileRecordTools/Records/Bed4Interval.cpp | 5 +
src/utils/FileRecordTools/Records/Bed4Interval.h | 1 +
src/utils/FileRecordTools/Records/Bed5Interval.cpp | 13 +
src/utils/FileRecordTools/Records/Bed5Interval.h | 1 +
src/utils/FileRecordTools/Records/Bed6Interval.cpp | 17 +
src/utils/FileRecordTools/Records/Bed6Interval.h | 1 +
.../FileRecordTools/Records/BedGraphInterval.cpp | 11 +
.../FileRecordTools/Records/BedGraphInterval.h | 1 +
.../FileRecordTools/Records/BedPlusInterval.cpp | 15 +
.../FileRecordTools/Records/BedPlusInterval.h | 2 +
src/utils/FileRecordTools/Records/GffRecord.cpp | 36 ++
src/utils/FileRecordTools/Records/GffRecord.h | 1 +
src/utils/FileRecordTools/Records/Record.cpp | 6 +-
src/utils/FileRecordTools/Records/Record.h | 2 +
src/utils/GenomeFile/Makefile | 1 +
src/utils/KeyListOps/KeyListOps.cpp | 364 ++++++++++++++++++++
src/utils/KeyListOps/KeyListOps.h | 54 +++
src/utils/KeyListOps/KeyListOpsMethods.cpp | 368 +++++++++++++++++++++
src/utils/KeyListOps/KeyListOpsMethods.h | 113 +++++++
src/utils/{NewChromsweep => KeyListOps}/Makefile | 12 +-
src/utils/NewChromsweep/Makefile | 1 +
src/utils/RecordOutputMgr/Makefile | 1 +
src/utils/general/Makefile | 2 +-
src/utils/general/QuickString.cpp | 65 ++++
src/utils/general/QuickString.h | 19 ++
test/map/test-map.sh | 97 ++++--
47 files changed, 1399 insertions(+), 418 deletions(-)
diff --git a/src/intersectFile/Makefile b/src/intersectFile/Makefile
index e265b33..8c81049 100644
--- a/src/intersectFile/Makefile
+++ b/src/intersectFile/Makefile
@@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/BinTree \
diff --git a/src/mapFile/Makefile b/src/mapFile/Makefile
index 17bb42d..8628242 100644
--- a/src/mapFile/Makefile
+++ b/src/mapFile/Makefile
@@ -29,6 +29,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I$(UTILITIES_DIR)/BinTree \
diff --git a/src/mapFile/mapFile.cpp b/src/mapFile/mapFile.cpp
index 88dcc26..8dbf24a 100644
--- a/src/mapFile/mapFile.cpp
+++ b/src/mapFile/mapFile.cpp
@@ -47,74 +47,11 @@ bool FileMap::mapFiles()
RecordKeyList keySet(hitSet.getKey());
RecordKeyList resultSet(hitSet.getKey());
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
- SummarizeHits(resultSet);
- _recordOutputMgr->printRecord(resultSet.getKey(), _output);
+ _recordOutputMgr->printRecord(resultSet.getKey(), _context->getColumnOpsVal(resultSet));
} else {
- SummarizeHits(hitSet);
- _recordOutputMgr->printRecord(hitSet.getKey(), _output);
+ _recordOutputMgr->printRecord(hitSet.getKey(), _context->getColumnOpsVal(hitSet));
}
}
return true;
}
-void FileMap::ExtractColumnFromHits(RecordKeyList &hits) {
- _column_vec.clear();
- RecordKeyList::const_iterator_type iter = hits.begin();
- for (; iter != hits.end(); iter = hits.next())
- {
- _column_vec.push_back(iter->value()->getField(_context->getColumn()).str());
- }
-}
-
-void FileMap::SummarizeHits(RecordKeyList &hits) {
-
- const QuickString & operation = _context->getColumnOperation();
- _output.clear();
-
- if (hits.size() == 0) {
- if (operation == "count" || operation == "count_distinct")
- _output.append("0");
- else
- _output.append(_context->getNullValue().str());
- return;
- }
-
- _tmp_output.str("");
- _tmp_output.clear();
-
- ExtractColumnFromHits(hits);
-
- VectorOps vo(_column_vec);
- if (operation == "sum")
- _tmp_output << setprecision (PRECISION) << vo.GetSum();
- else if (operation == "mean")
- _tmp_output << setprecision (PRECISION) << vo.GetMean();
- else if (operation == "median")
- _tmp_output << setprecision (PRECISION) << vo.GetMedian();
- else if (operation == "min")
- _tmp_output << setprecision (PRECISION) << vo.GetMin();
- else if (operation == "max")
- _tmp_output << setprecision (PRECISION) << vo.GetMax();
- else if (operation == "absmin")
- _tmp_output << setprecision (PRECISION) << vo.GetAbsMin();
- else if (operation == "absmax")
- _tmp_output << setprecision (PRECISION) << vo.GetAbsMax();
- else if (operation == "mode")
- _tmp_output << vo.GetMode();
- else if (operation == "antimode")
- _tmp_output << vo.GetAntiMode();
- else if (operation == "count")
- _tmp_output << setprecision (PRECISION) << vo.GetCount();
- else if (operation == "count_distinct")
- _tmp_output << setprecision (PRECISION) << vo.GetCountDistinct();
- else if (operation == "collapse")
- _tmp_output << vo.GetCollapse();
- else if (operation == "distinct")
- _tmp_output << vo.GetDistinct();
- else {
- cerr << "ERROR: " << operation << " is an unrecognized operation\n";
- exit(1);
- }
- _output.append(_tmp_output.str());
-
-}
diff --git a/src/mapFile/mapFile.h b/src/mapFile/mapFile.h
index cb1da08..fbb431a 100644
--- a/src/mapFile/mapFile.h
+++ b/src/mapFile/mapFile.h
@@ -18,10 +18,11 @@ using namespace std;
#include <iomanip>
#include "VectorOps.h"
#include "RecordKeyList.h"
+#include "KeyListOps.h"
+#include "ContextMap.h"
using namespace std;
-class ContextMap;
class BlockMgr;
class RecordOutputMgr;
@@ -35,90 +36,8 @@ public:
private:
ContextMap *_context;
- Record *_queryRec;
- Record *_databaseRec;
BlockMgr *_blockMgr;
RecordOutputMgr *_recordOutputMgr;
-
- vector<string> _column_vec; // vector to hold current column's worth of data
-
- ostringstream _tmp_output;
- QuickString _output; // placeholder for the results of mapping B to each a in A.
- //------------------------------------------------
- // private methods
- //------------------------------------------------
- void Map();
- void SummarizeHits(RecordKeyList &hits);
- void ExtractColumnFromHits(RecordKeyList &hits);
-
};
#endif /* MAPFILE_H */
-
-
-/*
-#include "bedFile.h"
-#include "chromsweep.h"
-#include "VectorOps.h"
-#include "api/BamReader.h"
-#include "api/BamWriter.h"
-#include "api/BamAux.h"
-#include "BamAncillary.h"
-using namespace BamTools;
-
-
-#include <vector>
-#include <iostream>
-#include <algorithm>
-#include <numeric>
-#include <fstream>
-#include <iomanip>
-#include <stdlib.h>
-using namespace std;
-
-
-
-class BedMap {
-
-public:
-
- // constructor
- BedMap(string bedAFile, string bedBFile, int column, string operation,
- float overlapFraction, bool sameStrand,
- bool diffStrand, bool reciprocal,
- bool choseNullValue, string nullValue,
- bool printHeader);
-
- // destructor
- ~BedMap(void);
-
-private:
-
- //------------------------------------------------
- // private attributes
- //------------------------------------------------
- string _bedAFile;
- string _bedBFile;
- int _column;
- string _operation;
- bool _sameStrand;
- bool _diffStrand;
- bool _reciprocal;
- float _overlapFraction;
- string _nullValue;
- bool _printHeader;
-
- // instance of a bed file class.
- BedFile *_bedA, *_bedB;
-
- vector<string> _column_vec; // vector to hold current column's worth of data
-
- //------------------------------------------------
- // private methods
- //------------------------------------------------
- void Map();
- string MapHits(const BED &a, const vector<BED> &hits);
- void ExtractColumnFromHits(const vector<BED> &hits);
-};
-*/
-//#endif /* MAPFILE_H */
diff --git a/src/mapFile/mapMain.cpp b/src/mapFile/mapMain.cpp
index a9eeb36..f08e56b 100644
--- a/src/mapFile/mapMain.cpp
+++ b/src/mapFile/mapMain.cpp
@@ -38,144 +38,6 @@ int map_main(int argc, char* argv[]) {
return retVal ? 0 : 1;
}
-
-/*
-int map_main(int argc, char* argv[]) {
-
- // our configuration variables
- bool showHelp = false;
-
- // input files
- string bedAFile;
- string bedBFile;
- int column = 5;
- string operation = "sum";
- string nullValue = ".";
-
- // input arguments
- float overlapFraction = 1E-9;
-
- bool haveBedA = false;
- bool haveBedB = false;
- bool haveColumn = false;
- bool haveOperation = false;
- bool haveFraction = false;
- bool reciprocalFraction = false;
- bool sameStrand = false;
- bool diffStrand = false;
- bool printHeader = false;
- bool choseNullValue = false;
-
- // check to see if we should print out some help
- if(argc <= 1) showHelp = true;
-
- for(int i = 1; i < argc; i++) {
- int parameterLength = (int)strlen(argv[i]);
-
- if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
- (PARAMETER_CHECK("--help", 5, parameterLength))) {
- showHelp = true;
- }
- }
-
- if(showHelp) map_help();
-
- // do some parsing (all of these parameters require 2 strings)
- for(int i = 1; i < argc; i++) {
-
- int parameterLength = (int)strlen(argv[i]);
-
- if(PARAMETER_CHECK("-a", 2, parameterLength)) {
- if ((i+1) < argc) {
- haveBedA = true;
- bedAFile = argv[i + 1];
- i++;
- }
- }
- else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
- if ((i+1) < argc) {
- haveBedB = true;
- bedBFile = argv[i + 1];
- i++;
- }
- }
- else if(PARAMETER_CHECK("-c", 2, parameterLength)) {
- if ((i+1) < argc) {
- haveColumn = true;
- column = atoi(argv[i + 1]);
- i++;
- }
- }
- else if(PARAMETER_CHECK("-o", 2, parameterLength)) {
- if ((i+1) < argc) {
- haveOperation = true;
- operation = argv[i + 1];
- i++;
- }
- }
- else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
- if ((i+1) < argc) {
- haveFraction = true;
- overlapFraction = atof(argv[i + 1]);
- i++;
- }
- }
- else if(PARAMETER_CHECK("-r", 2, parameterLength)) {
- reciprocalFraction = true;
- }
- else if (PARAMETER_CHECK("-s", 2, parameterLength)) {
- sameStrand = true;
- }
- else if (PARAMETER_CHECK("-S", 2, parameterLength)) {
- diffStrand = true;
- }
- else if (PARAMETER_CHECK("-null", 5, parameterLength)) {
- nullValue = argv[i + 1];
- choseNullValue = true;
- i++;
- }
- else if(PARAMETER_CHECK("-header", 7, parameterLength)) {
- printHeader = true;
- }
- else {
- cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
- showHelp = true;
- }
- }
-
- // make sure we have both input files
- if (!haveBedA || !haveBedB) {
- cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
- showHelp = true;
- }
-
- if (reciprocalFraction && !haveFraction) {
- cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl;
- showHelp = true;
- }
-
- if (sameStrand && diffStrand) {
- cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -S, not both." << endl << "*****" << endl;
- showHelp = true;
- }
-
- if (!showHelp) {
-
- BedMap *bm = new BedMap(bedAFile, bedBFile, column, operation,
- overlapFraction, sameStrand,
- diffStrand, reciprocalFraction,
- choseNullValue, nullValue,
- printHeader);
- delete bm;
- return 0;
- }
- else {
- map_help();
- return 0;
- }
-}
-*/
-
void map_help(void) {
cerr << "\nTool: bedtools map (aka mapBed)" << endl;
diff --git a/src/nekSandbox1/Makefile b/src/nekSandbox1/Makefile
index fbe6d86..df8aba7 100644
--- a/src/nekSandbox1/Makefile
+++ b/src/nekSandbox1/Makefile
@@ -10,6 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders \
-I$(UTILITIES_DIR)/FileRecordTools/Records \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/general \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/GenomeFile/ \
diff --git a/src/regressTest/Makefile b/src/regressTest/Makefile
index e9ceebf..8ffeeab 100644
--- a/src/regressTest/Makefile
+++ b/src/regressTest/Makefile
@@ -18,6 +18,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders \
-I$(UTILITIES_DIR)/FileRecordTools/Records \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/general
# ----------------------------------
diff --git a/src/sampleFile/Makefile b/src/sampleFile/Makefile
index 2042291..9ccbe5a 100644
--- a/src/sampleFile/Makefile
+++ b/src/sampleFile/Makefile
@@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/version/
diff --git a/src/utils/BinTree/Makefile b/src/utils/BinTree/Makefile
index de04c81..c29b5eb 100644
--- a/src/utils/BinTree/Makefile
+++ b/src/utils/BinTree/Makefile
@@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src/ \
-I$(UTILITIES_DIR)/version/
diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp
index cd30b20..adbc47a 100644
--- a/src/utils/Contexts/ContextBase.cpp
+++ b/src/utils/Contexts/ContextBase.cpp
@@ -52,20 +52,16 @@ ContextBase::ContextBase()
_hasConstantSeed(false),
_seed(0),
_forwardOnly(false),
- _reverseOnly(false)
+ _reverseOnly(false),
+ _hasColumnOpsMethods(false)
{
_programNames["intersect"] = INTERSECT;
_programNames["sample"] = SAMPLE;
_programNames["map"] = MAP;
- _validScoreOps.insert("sum");
- _validScoreOps.insert("max");
- _validScoreOps.insert("min");
- _validScoreOps.insert("mean");
- _validScoreOps.insert("mode");
- _validScoreOps.insert("median");
- _validScoreOps.insert("antimode");
- _validScoreOps.insert("collapse");
+ if (hasColumnOpsMethods()) {
+ _keyListOps = new KeyListOps();
+ }
}
ContextBase::~ContextBase()
@@ -79,6 +75,11 @@ ContextBase::~ContextBase()
delete _files[i];
_files[i] = NULL;
}
+ if (hasColumnOpsMethods()) {
+ delete _keyListOps;
+ _keyListOps = NULL;
+ }
+
}
bool ContextBase::determineOutputType() {
@@ -176,6 +177,19 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if (strcmp(_argv[_i], "-seed") == 0) {
if (!handle_seed()) return false;
}
+ else if (strcmp(_argv[_i], "-o") == 0) {
+ if (!handle_o()) return false;
+ }
+ else if (strcmp(_argv[_i], "-c") == 0) {
+ if (!handle_c()) return false;
+ }
+ else if (strcmp(_argv[_i], "-null") == 0) {
+ if (!handle_null()) return false;
+ }
+ else if (strcmp(_argv[_i], "-delim") == 0) {
+ if (!handle_delim()) return false;
+ }
+
}
return true;
}
@@ -191,6 +205,12 @@ bool ContextBase::isValidState()
if (!determineOutputType()) {
return false;
}
+ if (hasColumnOpsMethods()) {
+ FileRecordMgr *dbFile = getFile(hasIntersectMethods() ? _databaseFileIdx : 0);
+ if (!_keyListOps->isValidColumnOps(dbFile)) {
+ return false;
+ }
+ }
return true;
}
@@ -363,3 +383,85 @@ bool ContextBase::handle_ubam()
markUsed(_i - _skipFirstArgs);
return true;
}
+
+
+// Methods specific to column operations.
+// for col ops, -c is the string of columns upon which to operate
+bool ContextBase::handle_c()
+{
+ if (!hasColumnOpsMethods()) {
+ return false;
+ }
+ if ((_i+1) < _argc) {
+ _keyListOps->setColumns(_argv[_i + 1]);
+ markUsed(_i - _skipFirstArgs);
+ _i++;
+ markUsed(_i - _skipFirstArgs);
+ }
+ return true;
+}
+
+
+// for col ops, -o is the string of operations to apply to the columns (-c)
+bool ContextBase::handle_o()
+{
+ if (!hasColumnOpsMethods()) {
+ return false;
+ }
+ if ((_i+1) < _argc) {
+ _keyListOps->setOperations(_argv[_i + 1]);
+ markUsed(_i - _skipFirstArgs);
+ _i++;
+ markUsed(_i - _skipFirstArgs);
+ }
+ return true;
+}
+
+
+// for col ops, -null is a NULL value assigned
+// when no overlaps are detected.
+bool ContextBase::handle_null()
+{
+ if (!hasColumnOpsMethods()) {
+ return false;
+ }
+ if ((_i+1) < _argc) {
+ _keyListOps->setNullValue(_argv[_i + 1]);
+ markUsed(_i - _skipFirstArgs);
+ _i++;
+ markUsed(_i - _skipFirstArgs);
+ }
+ return true;
+}
+
+//for col ops, delimStr will appear between each item in
+//a collapsed but delimited list.
+bool ContextBase::handle_delim()
+{
+ if (!hasColumnOpsMethods()) {
+ return false;
+ }
+ if ((_i+1) < _argc) {
+ _keyListOps->setDelimStr(_argv[_i + 1]);
+ markUsed(_i - _skipFirstArgs);
+ _i++;
+ markUsed(_i - _skipFirstArgs);
+ }
+ return true;
+}
+
+void ContextBase::setColumnOpsMethods(bool val)
+{
+ _hasColumnOpsMethods = val;
+ if (val) {
+ _keyListOps = new KeyListOps();
+ }
+}
+
+const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const {
+ if (!hasColumnOpsMethods()) {
+ return _nullStr;
+ }
+ return _keyListOps->getOpVals(keyList);
+}
+
diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h
index 872193f..b4bf122 100644
--- a/src/utils/Contexts/ContextBase.h
+++ b/src/utils/Contexts/ContextBase.h
@@ -24,6 +24,7 @@
#include "NewGenomeFile.h"
#include "api/BamReader.h"
#include "api/BamAux.h"
+#include "KeyListOps.h"
class ContextBase {
@@ -144,6 +145,13 @@ public:
//methods.
virtual bool hasIntersectMethods() const { return false; }
+ // determine whether column operations like those used in map
+ // are available.
+ void setColumnOpsMethods(bool val);
+ virtual bool hasColumnOpsMethods() const { return _hasColumnOpsMethods; }
+ const QuickString &getColumnOpsVal(RecordKeyList &keyList) const;
+ //methods applicable only to column operations.
+
protected:
PROGRAM_TYPE _program;
@@ -191,15 +199,11 @@ protected:
int _bamHeaderAndRefIdx;
int _maxNumDatabaseFields;
bool _useFullBamTags;
- QuickString _columnOperation;
- int _column;
- QuickString _nullValue;
bool _reportCount;
int _maxDistance;
bool _reportNames;
bool _reportScores;
QuickString _scoreOp;
- set<QuickString> _validScoreOps;
int _numOutputRecords;
@@ -208,6 +212,10 @@ protected:
bool _forwardOnly;
bool _reverseOnly;
+ bool _hasColumnOpsMethods;
+ KeyListOps *_keyListOps;
+ QuickString _nullStr; //placeholder return value when col ops aren't valid.
+
void markUsed(int i) { _argsProcessed[i] = true; }
bool isUsed(int i) const { return _argsProcessed[i]; }
bool cmdArgsValid();
@@ -231,6 +239,11 @@ protected:
virtual bool handle_split();
virtual bool handle_sorted();
virtual bool handle_ubam();
+
+ virtual bool handle_c();
+ virtual bool handle_o();
+ virtual bool handle_null();
+ virtual bool handle_delim();
};
#endif /* CONTEXTBASE_H_ */
diff --git a/src/utils/Contexts/ContextIntersect.h b/src/utils/Contexts/ContextIntersect.h
index 0144a12..b066e94 100644
--- a/src/utils/Contexts/ContextIntersect.h
+++ b/src/utils/Contexts/ContextIntersect.h
@@ -21,6 +21,8 @@ public:
//NOTE: Query and database files will only be marked as such by either the
//parseCmdArgs method, or by explicitly setting them.
+ FileRecordMgr *getQueryFile() { return getFile(_queryFileIdx); }
+ FileRecordMgr *getDatabaseFile() { return getFile(_databaseFileIdx); }
int getQueryFileIdx() const { return _queryFileIdx; }
void setQueryFileIdx(int idx) { _queryFileIdx = idx; }
int getDatabaseFileIdx() const { return _databaseFileIdx; }
diff --git a/src/utils/Contexts/ContextMap.cpp b/src/utils/Contexts/ContextMap.cpp
index d94d088..e3f8241 100644
--- a/src/utils/Contexts/ContextMap.cpp
+++ b/src/utils/Contexts/ContextMap.cpp
@@ -12,13 +12,7 @@ ContextMap::ContextMap()
// map requires sorted input
setSortedInput(true);
setLeftJoin(true);
-
- // default to BED score column
- setColumn(5);
- // default to "sum"
- setColumnOperation("sum");
- // default to "." as a NULL value
- setNullValue('.');
+ setColumnOpsMethods(true);
}
ContextMap::~ContextMap()
@@ -44,75 +38,22 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
if (isUsed(_i - _skipFirstArgs)) {
continue;
}
- else if (strcmp(_argv[_i], "-o") == 0) {
- if (!handle_o()) return false;
- }
- else if (strcmp(_argv[_i], "-c") == 0) {
- if (!handle_c()) return false;
- }
- else if (strcmp(_argv[_i], "-null") == 0) {
- if (!handle_null()) return false;
- }
- }
- return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
-}
-
+ if (strcmp(_argv[_i], "-c") == 0) {
+ //bypass intersect's use of the -c option, because -c
+ //means writeCount for intersect, but means columns for map.
+ if (!ContextBase::handle_c()) return false;
+ }
-bool ContextMap::isValidState()
-{
- if (!ContextIntersect::isValidState()) {
- return false;
}
-
- if (getDatabaseFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
- //throw Error
- cerr << endl << "*****"
- << endl
- << "***** ERROR: BAM database file not currently supported for the map tool."
- << endl;
- exit(1);
- }
- // TODO
- // enforce any specific checks for Map.
- return true;
-}
-
-
-// for map, -c is the column upon which to operate
-bool ContextMap::handle_c()
-{
- if ((_i+1) < _argc) {
- setColumn(atoi(_argv[_i + 1]));
- markUsed(_i - _skipFirstArgs);
- _i++;
- markUsed(_i - _skipFirstArgs);
- }
- return true;
-}
-
-
-// for map, -o is the operation to apply to the column (-c)
-bool ContextMap::handle_o()
-{
- if ((_i+1) < _argc) {
- setColumnOperation(_argv[_i + 1]);
- markUsed(_i - _skipFirstArgs);
- _i++;
- markUsed(_i - _skipFirstArgs);
- }
- return true;
-}
-
-
-// for map, -null is a NULL vakue assigned
-// when no overlaps are detected.
-bool ContextMap::handle_null()
-{
- if ((_i+1) < _argc) {
- setNullValue(_argv[_i + 1]);
- markUsed(_i - _skipFirstArgs);
- _i++;
- markUsed(_i - _skipFirstArgs);
- }
- return true;
+ return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
}
+//
+//
+//bool ContextMap::isValidState()
+//{
+// if (!ContextIntersect::isValidState()) {
+// return false;
+// }
+//}
+//
+//
diff --git a/src/utils/Contexts/ContextMap.h b/src/utils/Contexts/ContextMap.h
index b8ee57f..9b7280e 100644
--- a/src/utils/Contexts/ContextMap.h
+++ b/src/utils/Contexts/ContextMap.h
@@ -9,30 +9,20 @@
#define CONTEXTMAP_H_
#include "ContextIntersect.h"
+#include "KeyListOps.h"
class ContextMap : public ContextIntersect {
public:
ContextMap();
virtual ~ContextMap();
- virtual bool isValidState();
-
+// virtual bool isValidState();
+//
virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs);
-
- int getColumn() const { return _column; }
- void setColumn(int column) { _column = column; }
-
- const QuickString & getColumnOperation() const { return _columnOperation; }
- void setColumnOperation(const QuickString & operation) { _columnOperation = operation; }
-
- const QuickString & getNullValue() const { return _nullValue; }
- void setNullValue(const QuickString & nullValue) { _nullValue = nullValue; }
-
+//
virtual bool hasIntersectMethods() const { return true; }
+//
private:
- virtual bool handle_c();
- virtual bool handle_o();
- virtual bool handle_null();
};
diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile
index 7ddc3c6..4b2ed42 100644
--- a/src/utils/Contexts/Makefile
+++ b/src/utils/Contexts/Makefile
@@ -9,6 +9,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src/ \
diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp
index 4c5cd8d..f939fef 100644
--- a/src/utils/FileRecordTools/Records/BamRecord.cpp
+++ b/src/utils/FileRecordTools/Records/BamRecord.cpp
@@ -172,5 +172,10 @@ const QuickString &BamRecord::getField(int fieldNum) const
return Bed6Interval::getField(fieldNum);
}
+bool BamRecord::isNumericField(int fieldNum) {
+
+ //TBD: As with getField, this isn't defined for BAM.
+ return (fieldNum > 6 ? false : Bed6Interval::isNumericField(fieldNum));
+}
diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h
index b74dbc2..022ecb4 100644
--- a/src/utils/FileRecordTools/Records/BamRecord.h
+++ b/src/utils/FileRecordTools/Records/BamRecord.h
@@ -40,6 +40,7 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 12; }
+ static bool isNumericField(int fieldNum);
protected:
BamTools::BamAlignment _bamAlignment;
diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.cpp b/src/utils/FileRecordTools/Records/Bed12Interval.cpp
index 867a69e..0a5a092 100644
--- a/src/utils/FileRecordTools/Records/Bed12Interval.cpp
+++ b/src/utils/FileRecordTools/Records/Bed12Interval.cpp
@@ -146,3 +146,29 @@ const QuickString &Bed12Interval::getField(int fieldNum) const
}
}
+bool Bed12Interval::isNumericField(int fieldNum) {
+ switch (fieldNum) {
+ case 7:
+ return true;
+ break;
+ case 8:
+ return true;
+ break;
+ case 9:
+ return false;
+ break;
+ case 10:
+ return true;
+ break;
+ case 11:
+ return false;
+ break;
+ case 12:
+ return false;
+ break;
+ default:
+ return Bed6Interval::isNumericField(fieldNum);
+ break;
+ }
+}
+
diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.h b/src/utils/FileRecordTools/Records/Bed12Interval.h
index 711800c..ffa89f9 100644
--- a/src/utils/FileRecordTools/Records/Bed12Interval.h
+++ b/src/utils/FileRecordTools/Records/Bed12Interval.h
@@ -54,6 +54,7 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 12; }
+ static bool isNumericField(int fieldNum);
protected:
diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp
index 3f896be..e31e43e 100644
--- a/src/utils/FileRecordTools/Records/Bed3Interval.cpp
+++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp
@@ -79,3 +79,23 @@ const QuickString &Bed3Interval::getField(int fieldNum) const
break;
}
}
+
+bool Bed3Interval::isNumericField(int fieldNum) {
+ switch (fieldNum) {
+ case 1:
+ return false; //chrom
+ break;
+ case 2:
+ return true; //startPos
+ break;
+ case 3:
+ return true; //endPos
+ break;
+ default:
+ cerr << endl << "*****" << endl
+ << "*****ERROR: requested invalid column " << fieldNum << ". Exiting." << endl
+ << endl << "*****" << endl;
+ exit(1);
+ break;
+ }
+}
diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.h b/src/utils/FileRecordTools/Records/Bed3Interval.h
index 9f1ff11..93377a0 100644
--- a/src/utils/FileRecordTools/Records/Bed3Interval.h
+++ b/src/utils/FileRecordTools/Records/Bed3Interval.h
@@ -32,6 +32,8 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 3; }
+ static bool isNumericField(int fieldNum);
+
protected:
virtual ~Bed3Interval();
diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.cpp b/src/utils/FileRecordTools/Records/Bed4Interval.cpp
index c1ef81a..27ca9f7 100644
--- a/src/utils/FileRecordTools/Records/Bed4Interval.cpp
+++ b/src/utils/FileRecordTools/Records/Bed4Interval.cpp
@@ -60,3 +60,8 @@ const QuickString &Bed4Interval::getField(int fieldNum) const
}
}
+bool Bed4Interval::isNumericField(int fieldNum) {
+ return (fieldNum == 4 ? false : Bed3Interval::isNumericField(fieldNum));
+}
+
+
diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.h b/src/utils/FileRecordTools/Records/Bed4Interval.h
index f42817c..b038446 100644
--- a/src/utils/FileRecordTools/Records/Bed4Interval.h
+++ b/src/utils/FileRecordTools/Records/Bed4Interval.h
@@ -28,6 +28,7 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 4; }
+ static bool isNumericField(int fieldNum);
protected:
diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.cpp b/src/utils/FileRecordTools/Records/Bed5Interval.cpp
index 7307fb6..130a788 100644
--- a/src/utils/FileRecordTools/Records/Bed5Interval.cpp
+++ b/src/utils/FileRecordTools/Records/Bed5Interval.cpp
@@ -70,3 +70,16 @@ const QuickString &Bed5Interval::getField(int fieldNum) const
break;
}
}
+
+bool Bed5Interval::isNumericField(int fieldNum) {
+ switch (fieldNum) {
+ case 4:
+ return false;
+ break;
+ case 5:
+ return true;
+ break;
+ default:
+ return Bed3Interval::isNumericField(fieldNum);
+ }
+}
diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.h b/src/utils/FileRecordTools/Records/Bed5Interval.h
index bc913d1..2064d35 100644
--- a/src/utils/FileRecordTools/Records/Bed5Interval.h
+++ b/src/utils/FileRecordTools/Records/Bed5Interval.h
@@ -27,6 +27,7 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 5; }
+ static bool isNumericField(int fieldNum);
protected:
diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.cpp b/src/utils/FileRecordTools/Records/Bed6Interval.cpp
index 8371553..5bc783c 100644
--- a/src/utils/FileRecordTools/Records/Bed6Interval.cpp
+++ b/src/utils/FileRecordTools/Records/Bed6Interval.cpp
@@ -81,3 +81,20 @@ const QuickString &Bed6Interval::getField(int fieldNum) const
break;
}
}
+
+bool Bed6Interval::isNumericField(int fieldNum) {
+ switch (fieldNum) {
+ case 4:
+ return false;
+ break;
+ case 5:
+ return true;
+ break;
+ case 6:
+ return false;
+ break;
+ default:
+ return Bed3Interval::isNumericField(fieldNum);
+ break;
+ }
+}
diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.h b/src/utils/FileRecordTools/Records/Bed6Interval.h
index 9ad9f80..023683f 100644
--- a/src/utils/FileRecordTools/Records/Bed6Interval.h
+++ b/src/utils/FileRecordTools/Records/Bed6Interval.h
@@ -27,6 +27,7 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 6; }
+ static bool isNumericField(int fieldNum);
protected:
diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp
index e080857..9cfda48 100644
--- a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp
+++ b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp
@@ -60,3 +60,14 @@ const QuickString &BedGraphInterval::getField(int fieldNum) const
}
}
+bool BedGraphInterval::isNumericField(int fieldNum) {
+ switch (fieldNum) {
+ case 4:
+ return true;
+ break;
+ default:
+ return Bed3Interval::isNumericField(fieldNum);
+ break;
+ }
+}
+
diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.h b/src/utils/FileRecordTools/Records/BedGraphInterval.h
index 1bdf619..5db6fea 100644
--- a/src/utils/FileRecordTools/Records/BedGraphInterval.h
+++ b/src/utils/FileRecordTools/Records/BedGraphInterval.h
@@ -28,6 +28,7 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 4; }
+ static bool isNumericField(int fieldNum);
protected:
virtual ~BedGraphInterval();
diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp
index fc8be36..5819b86 100644
--- a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp
+++ b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp
@@ -117,3 +117,18 @@ const QuickString &BedPlusInterval::getField(int fieldNum) const
}
return Bed6Interval::getField(fieldNum);
}
+
+// Reports whether a column of a BedPlus record may be used numerically.
+//
+// TBD: There is currently no good way to guarantee or enforce that the
+// columns past startOtherIdx are numeric, so for now the user gets the
+// benefit of the doubt on those. Columns up to and including startOtherIdx
+// are answered by Bed6Interval.
+bool BedPlusInterval::isNumericField(int fieldNum) {
+    if (fieldNum <= startOtherIdx) {
+        return Bed6Interval::isNumericField(fieldNum);
+    }
+    return true;
+}
+
diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.h b/src/utils/FileRecordTools/Records/BedPlusInterval.h
index 4b98b4f..077ed93 100644
--- a/src/utils/FileRecordTools/Records/BedPlusInterval.h
+++ b/src/utils/FileRecordTools/Records/BedPlusInterval.h
@@ -38,6 +38,8 @@ public:
virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; }
virtual void setNumPrintFields(int num) { _numPrintFields = num; }
virtual int getNumPrintFields() const { return _numPrintFields; }
+ static bool isNumericField(int fieldNum);
+
protected:
virtual ~BedPlusInterval();
diff --git a/src/utils/FileRecordTools/Records/GffRecord.cpp b/src/utils/FileRecordTools/Records/GffRecord.cpp
index a91ce15..21cea1d 100644
--- a/src/utils/FileRecordTools/Records/GffRecord.cpp
+++ b/src/utils/FileRecordTools/Records/GffRecord.cpp
@@ -156,4 +156,40 @@ const QuickString &GffRecord::getField(int fieldNum) const
}
}
+// Reports whether the given 1-based GFF column is numeric. Within columns
+// 1-9 only columns 4, 5 and 6 are numeric; column numbers outside 1-9 are
+// handed to Bed6Interval.
+bool GffRecord::isNumericField(int fieldNum) {
+    if (fieldNum < 1 || fieldNum > 9) {
+        return Bed6Interval::isNumericField(fieldNum);
+    }
+    return fieldNum >= 4 && fieldNum <= 6;
+}
+
diff --git a/src/utils/FileRecordTools/Records/GffRecord.h b/src/utils/FileRecordTools/Records/GffRecord.h
index b84d96a..e675542 100644
--- a/src/utils/FileRecordTools/Records/GffRecord.h
+++ b/src/utils/FileRecordTools/Records/GffRecord.h
@@ -34,6 +34,7 @@ public:
//Note: using the assignment operator in a GffRecord can potentially be a performance hit,
//if the number of fields frequently differ between this object and the one being copied.
const GffRecord &operator=(const GffRecord &other);
+ static bool isNumericField(int fieldNum);
protected:
virtual ~GffRecord();
diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp
index 2beb4dc..89544ed 100644
--- a/src/utils/FileRecordTools/Records/Record.cpp
+++ b/src/utils/FileRecordTools/Records/Record.cpp
@@ -187,9 +187,9 @@ void Record::undoZeroLength()
ostream &operator << (ostream &out, const Record &record)
{
- QuickString errBuf;
- record.print(errBuf);
- out << errBuf;
+ QuickString outBuf;
+ record.print(outBuf);
+ out << outBuf;
return out;
}
diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h
index 2c303d9..d8071c1 100644
--- a/src/utils/FileRecordTools/Records/Record.h
+++ b/src/utils/FileRecordTools/Records/Record.h
@@ -129,6 +129,8 @@ public:
virtual bool sameChromIntersects(const Record *otherRecord,
bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal) const;
+// virtual static bool isNumericField(int fieldNum) const = 0;
+
protected:
virtual ~Record(); //by making the destructor protected, only the friend class(es) can actually delete Record objects, or objects derived from Record.
diff --git a/src/utils/GenomeFile/Makefile b/src/utils/GenomeFile/Makefile
index afaeccd..fd17d29 100644
--- a/src/utils/GenomeFile/Makefile
+++ b/src/utils/GenomeFile/Makefile
@@ -6,6 +6,7 @@ UTILITIES_DIR = ../
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/BamTools/include/
# ----------------------------------
diff --git a/src/utils/KeyListOps/KeyListOps.cpp b/src/utils/KeyListOps/KeyListOps.cpp
new file mode 100644
index 0000000..6576350
--- /dev/null
+++ b/src/utils/KeyListOps/KeyListOps.cpp
@@ -0,0 +1,364 @@
+/*
+ * KeyListOps.cpp
+ *
+ * Created on: Feb 24, 2014
+ * Author: nek3d
+ */
+#include "KeyListOps.h"
+#include "FileRecordMgr.h"
+#include <cmath> //for isnan
+
+// Builds the lookup tables used to translate operation names into op codes
+// and to decide which operations require a numeric column, then installs the
+// defaults: "," as delimiter, "." as null value, column 5 and operation "sum".
+KeyListOps::KeyListOps() {
+    // Operation name (as typed by the user) -> operation code.
+    _opCodes["sum"] = SUM;
+    _opCodes["mean"] = MEAN;
+    _opCodes["stddev"] = STDDEV;
+    _opCodes["sample_stddev"] = SAMPLE_STDDEV;
+    _opCodes["median"] = MEDIAN;
+    _opCodes["mode"] = MODE;
+    _opCodes["antimode"] = ANTIMODE;
+    _opCodes["min"] = MIN;
+    _opCodes["max"] = MAX;
+    _opCodes["absmin"] = ABSMIN;
+    _opCodes["absmax"] = ABSMAX;
+    _opCodes["count"] = COUNT;
+    _opCodes["distinct"] = DISTINCT;
+    _opCodes["count_distinct"] = COUNT_DISTINCT;
+    _opCodes["distinct_only"] = DISTINCT_ONLY;
+    _opCodes["collapse"] = COLLAPSE;
+    _opCodes["concat"] = CONCAT;
+    _opCodes["freq_asc"] = FREQ_ASC;
+    _opCodes["freq_desc"] = FREQ_DESC;
+    _opCodes["first"] = FIRST;
+    _opCodes["last"] = LAST;
+
+    // Operation code -> does it need a numeric column? Every operation that
+    // getOpVals() computes as a double must be listed as true here, otherwise
+    // isValidColumnOps() skips the numeric-column check for it.
+    _isNumericOp[SUM] = true;
+    _isNumericOp[MEAN] = true;
+    _isNumericOp[STDDEV] = true;
+    _isNumericOp[SAMPLE_STDDEV] = true;
+    _isNumericOp[MEDIAN] = true;
+    _isNumericOp[MODE] = false;
+    _isNumericOp[ANTIMODE] = false;
+    _isNumericOp[MIN] = true;
+    _isNumericOp[MAX] = true;
+    _isNumericOp[ABSMIN] = true;
+    _isNumericOp[ABSMAX] = true;
+    _isNumericOp[COUNT] = false;
+    _isNumericOp[DISTINCT] = false;
+    _isNumericOp[COUNT_DISTINCT] = false;
+    _isNumericOp[DISTINCT_ONLY] = false;
+    _isNumericOp[COLLAPSE] = false;
+    _isNumericOp[CONCAT] = false;
+    _isNumericOp[FREQ_ASC] = false;
+    _isNumericOp[FREQ_DESC] = false;
+    _isNumericOp[FIRST] = false;
+    _isNumericOp[LAST] = false;
+
+    _methods.setDelimStr(",");
+    _methods.setNullValue(".");
+
+    // default to BED score column
+    _columns = "5";
+    // default to "sum"
+    _operations = "sum";
+
+}
+
+// Tells whether the given operation code needs a numeric column. Codes that
+// have no entry in _isNumericOp are reported as non-numeric.
+bool KeyListOps::isNumericOp(OP_TYPES op) const {
+    map<OP_TYPES, bool>::const_iterator entry = _isNumericOp.find(op);
+    if (entry == _isNumericOp.end()) {
+        return false;
+    }
+    return entry->second;
+}
+
+// Name-based convenience overload: translates the operation name to its code
+// and asks the code-based overload.
+bool KeyListOps::isNumericOp(const QuickString &op) const {
+    const OP_TYPES code = getOpCode(op);
+    return isNumericOp(code);
+}
+
+// Looks up the code for an operation name; names that are not registered in
+// _opCodes yield INVALID.
+KeyListOps::OP_TYPES KeyListOps::getOpCode(const QuickString &operation) const {
+    map<QuickString, OP_TYPES>::const_iterator entry = _opCodes.find(operation);
+    return entry != _opCodes.end() ? entry->second : INVALID;
+}
+
+
+// Validates the configured column list and operation list against the
+// database file and, on success, leaves one (column, operation code) pair per
+// requested column in _colOps.
+//
+// Returns false (after printing a message to cerr) when the lists are empty,
+// their lengths disagree, a column is out of range, an operation name is
+// unknown, or a numeric operation is requested on a non-numeric column.
+// A BAM database file terminates the program with exit(1).
+bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
+
+    if (dbFile->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
+        //throw Error
+        cerr << endl << "*****" << endl
+             << "***** ERROR: BAM database file not currently supported for column operations."
+             << endl;
+        exit(1);
+    }
+
+    // Start from an empty list so that validating more than once does not
+    // leave duplicate column/operation pairs behind.
+    _colOps.clear();
+
+    //get the strings from context containing the comma-delimited lists of columns
+    //and operations. Split both of these into vectors. Get the operation code
+    //for each operation string. Finally, make a vector of pairs, where the first
+    //member of each pair is a column number, and the second member is the code for the
+    //operation to perform on that column.
+
+    vector<QuickString> columnsVec;
+    vector<QuickString> opsVec;
+    int numCols = Tokenize(_columns, columnsVec, ',');
+    int numOps = Tokenize(_operations, opsVec, ',');
+
+    if (numOps < 1 || numCols < 1) {
+        cerr << endl << "*****" << endl
+             << "***** ERROR: There must be at least one column and at least one operation named." << endl;
+        return false;
+    }
+    // A single operation is applied to every column; otherwise the two lists
+    // must pair up one-to-one.
+    if (numOps > 1 && numCols != numOps) {
+        cerr << endl << "*****" << endl
+             << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl;
+        cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
+        cerr << "\tor an operation for each column." << endl;
+        return false;
+    }
+    for (int i=0; i < (int)columnsVec.size(); i++) {
+        int col = str2chrPos(columnsVec[i]);
+
+        //check that the column number is valid
+        if (col < 1 || col > dbFile->getNumFields()) {
+            cerr << endl << "*****" << endl << "***** ERROR: Requested column " << col << ", but database file "
+                 << dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl;
+            return false;
+        }
+        const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0];
+        OP_TYPES opCode = getOpCode(operation);
+        if (opCode == INVALID) {
+            cerr << endl << "*****" << endl
+                 << "***** ERROR: " << operation << " is not a valid operation. " << endl;
+            return false;
+        }
+        _colOps.push_back(pair<int, OP_TYPES>(col, opCode));
+    }
+
+
+    //The final step we need to do is check that for each column/operation pair,
+    //if the operation is numeric, see if the database's record type supports
+    //numeric operations for that column. For instance, we can allow the mean
+    //of column 4 for a BedGraph file, because that's numeric, but not for Bed4,
+    //because that isn't.
+
+    for (int i = 0; i < (int)_colOps.size(); i++) {
+        int col = _colOps[i].first;
+        OP_TYPES opCode = _colOps[i].second;
+        FileRecordTypeChecker::RECORD_TYPE recordType = dbFile->getRecordType();
+
+        if (isNumericOp(opCode)) {
+            // Unknown record types fall through the switch and are rejected.
+            bool isValidNumOp = false;
+            switch(recordType) {
+                case FileRecordTypeChecker::BED3_RECORD_TYPE:
+                    isValidNumOp = Bed3Interval::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::BED4_RECORD_TYPE:
+                    isValidNumOp = Bed4Interval::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::BED5_RECORD_TYPE:
+                    isValidNumOp = Bed5Interval::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE:
+                    isValidNumOp = BedGraphInterval::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::BED6_RECORD_TYPE:
+                    isValidNumOp = Bed6Interval::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE:
+                    isValidNumOp = BedPlusInterval::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::BED12_RECORD_TYPE:
+                    isValidNumOp = Bed12Interval::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::BAM_RECORD_TYPE:
+                    isValidNumOp = BamRecord::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::VCF_RECORD_TYPE:
+                    isValidNumOp = VcfRecord::isNumericField(col);
+                    break;
+
+                case FileRecordTypeChecker::GFF_RECORD_TYPE:
+                    isValidNumOp = GffRecord::isNumericField(col);
+                    break;
+
+                default:
+                    break;
+            }
+            if (!isValidNumOp) {
+                cerr << endl << "*****" << endl << "***** ERROR: Column " << col << " is not a numeric field for database file "
+                     << dbFile->getFileName() << "." << endl;
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+// Appends val to out, or nullVal in its place when val is NaN. The numeric
+// methods of KeyListOpsMethods return NaN when they have no value to report.
+static void appendNumOrNull(QuickString &out, double val, const QuickString &nullVal) {
+    if (isnan(val)) {
+        out.append(nullVal);
+    } else {
+        out.append(val);
+    }
+}
+
+// Computes every configured column operation over the records in hits and
+// returns the results as one tab-separated string, in the order of _colOps.
+// The returned reference points at a member buffer that is overwritten by the
+// next call.
+const QuickString & KeyListOps::getOpVals(RecordKeyList &hits)
+{
+    //loop through all requested columns, and for each one, call the method needed
+    //for the operation specified.
+    _methods.setKeyList(&hits);
+    _outVals.clear();
+    for (int i=0; i < (int)_colOps.size(); i++) {
+        int col = _colOps[i].first;
+        OP_TYPES opCode = _colOps[i].second;
+
+        _methods.setColumn(col);
+        switch (opCode) {
+            case SUM:
+                appendNumOrNull(_outVals, _methods.getSum(), _methods.getNullValue());
+                break;
+
+            case MEAN:
+                appendNumOrNull(_outVals, _methods.getMean(), _methods.getNullValue());
+                break;
+
+            case STDDEV:
+                appendNumOrNull(_outVals, _methods.getStddev(), _methods.getNullValue());
+                break;
+
+            case SAMPLE_STDDEV:
+                appendNumOrNull(_outVals, _methods.getSampleStddev(), _methods.getNullValue());
+                break;
+
+            case MEDIAN:
+                appendNumOrNull(_outVals, _methods.getMedian(), _methods.getNullValue());
+                break;
+
+            case MODE:
+                _outVals.append(_methods.getMode());
+                break;
+
+            case ANTIMODE:
+                _outVals.append(_methods.getAntiMode());
+                break;
+
+            case MIN:
+                appendNumOrNull(_outVals, _methods.getMin(), _methods.getNullValue());
+                break;
+
+            case MAX:
+                appendNumOrNull(_outVals, _methods.getMax(), _methods.getNullValue());
+                break;
+
+            case ABSMIN:
+                appendNumOrNull(_outVals, _methods.getAbsMin(), _methods.getNullValue());
+                break;
+
+            case ABSMAX:
+                appendNumOrNull(_outVals, _methods.getAbsMax(), _methods.getNullValue());
+                break;
+
+            case COUNT:
+                _outVals.append(_methods.getCount());
+                break;
+
+            case DISTINCT:
+                _outVals.append(_methods.getDistinct());
+                break;
+
+            case COUNT_DISTINCT:
+                _outVals.append(_methods.getCountDistinct());
+                break;
+
+            case DISTINCT_ONLY:
+                _outVals.append(_methods.getDistinctOnly());
+                break;
+
+            case COLLAPSE:
+                _outVals.append(_methods.getCollapse());
+                break;
+
+            case CONCAT:
+                _outVals.append(_methods.getConcat());
+                break;
+
+            case FREQ_ASC:
+                _outVals.append(_methods.getFreqAsc());
+                break;
+
+            case FREQ_DESC:
+                _outVals.append(_methods.getFreqDesc());
+                break;
+
+            case FIRST:
+                _outVals.append(_methods.getFirst());
+                break;
+
+            case LAST:
+                _outVals.append(_methods.getLast());
+                break;
+
+            case INVALID:
+            default:
+                // Any unrecognized operation should have been rejected already by
+                // isValidColumnOps(). Reaching this point means validation was
+                // bypassed, so stop here, as the message says, instead of
+                // emitting a row with a missing column.
+                cerr << "ERROR: Invalid operation given for column " << col << ". Exiting..." << endl;
+                exit(1);
+        }
+        //if this isn't the last column, add a tab.
+        if (i < (int)_colOps.size() -1) {
+            _outVals.append('\t');
+        }
+    }
+    return _outVals;
+}
+
+
diff --git a/src/utils/KeyListOps/KeyListOps.h b/src/utils/KeyListOps/KeyListOps.h
new file mode 100644
index 0000000..3c26d2c
--- /dev/null
+++ b/src/utils/KeyListOps/KeyListOps.h
@@ -0,0 +1,54 @@
+/*
+ * KeyListOps.h
+ *
+ * Created on: Feb 24, 2014
+ * Author: nek3d
+ */
+
+#ifndef KEYLISTOPS_H_
+#define KEYLISTOPS_H_
+
+#include "KeyListOpsMethods.h"
+
+class FileRecordMgr;
+
+// Applies per-column summary operations (sum, mean, collapse, ...) to the
+// records of a RecordKeyList. Callers configure the column and operation
+// lists, validate them once against the database file with
+// isValidColumnOps(), then call getOpVals() for each key list.
+class KeyListOps {
+public:
+
+ // Sets up the operation tables and the defaults (column "5", operation "sum").
+ KeyListOps();
+
+ // Comma-delimited list of column numbers to operate on.
+ void setColumns(const QuickString &columns) { _columns = columns; }
+ // Comma-delimited list of operation names: either one for all columns or one per column.
+ void setOperations(const QuickString & operation) { _operations = operation; }
+ // The null value and delimiter string are forwarded to the underlying KeyListOpsMethods object.
+ void setNullValue(const QuickString & nullValue) { _methods.setNullValue(nullValue); }
+ void setDelimStr(const QuickString & delimStr) { _methods.setDelimStr(delimStr); }
+
+ void setKeyList(RecordKeyList *keyList) { _methods.setKeyList(keyList); }
+
+ // Codes for the supported operations; INVALID marks an unrecognized operation name.
+ typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT,
+ DISTINCT_ONLY, COLLAPSE, CONCAT, FREQ_ASC, FREQ_DESC, FIRST, LAST, INVALID } OP_TYPES;
+
+ // Checks the configured columns/operations against dbFile and fills _colOps.
+ bool isValidColumnOps(FileRecordMgr *dbFile);
+
+ // Returns the tab-separated results of all configured operations over hits.
+ const QuickString &getOpVals(RecordKeyList &hits);
+
+private:
+ // NOTE(review): no definition of init() appears in KeyListOps.cpp in this change.
+ void init();
+
+ QuickString _operations;
+ QuickString _columns;
+
+ KeyListOpsMethods _methods;
+ // operation name -> operation code
+ map<QuickString, OP_TYPES> _opCodes;
+ // operation code -> whether the operation needs a numeric column
+ map<OP_TYPES, bool> _isNumericOp;
+
+ // (column number, operation code) pairs produced by isValidColumnOps().
+ typedef vector<pair<int, OP_TYPES> > colOpsType;
+ colOpsType _colOps;
+ // Buffer that getOpVals() fills and returns by reference.
+ QuickString _outVals;
+
+ OP_TYPES getOpCode(const QuickString &operation) const;
+ bool isNumericOp(OP_TYPES op) const;
+ bool isNumericOp(const QuickString &op) const;
+
+};
+
+#endif /* KEYLISTOPS_H_ */
diff --git a/src/utils/KeyListOps/KeyListOpsMethods.cpp b/src/utils/KeyListOps/KeyListOpsMethods.cpp
new file mode 100644
index 0000000..0b00135
--- /dev/null
+++ b/src/utils/KeyListOps/KeyListOpsMethods.cpp
@@ -0,0 +1,368 @@
+/*
+ * KeyListOpsMethods.cpp
+ *
+ * Created on: Feb 6, 2014
+ * Author: nek3d
+ */
+
+#include "KeyListOpsMethods.h"
+#include <cfloat>
+#include <cmath>
+#include <algorithm>
+
+// Default constructor: no key list is supplied yet, so the object points at
+// its own empty _nullKeyList until setKeyList() is called. Defaults are
+// column 1, "." as null value and "," as delimiter.
+KeyListOpsMethods::KeyListOpsMethods()
+: _keyList(&_nullKeyList),
+ _column(1),
+ _nullVal("."),
+ _delimStr(","),
+ _iter(_nullKeyList.begin())
+{
+}
+
+// Constructs the object for the given key list and column, with "." as null
+// value and "," as delimiter. keyList is dereferenced here, so it must not be
+// a null pointer.
+KeyListOpsMethods::KeyListOpsMethods(RecordKeyList *keyList, int column)
+: _keyList(keyList),
+ _column(column),
+ _nullVal("."),
+ _delimStr(","),
+ _iter(keyList->begin())
+{
+}
+
+
+// Empty destructor: performs no explicit cleanup.
+KeyListOpsMethods::~KeyListOpsMethods() {
+
+}
+
+// Adds up the numeric value of the current column over every record in the
+// key list. Returns NAN when the key list is empty.
+double KeyListOpsMethods::getSum() {
+    if (empty()) {
+        return NAN;
+    }
+
+    double total = 0.0;
+    begin();
+    while (!end()) {
+        total += getColValNum();
+        next();
+    }
+    return total;
+}
+
+// Arithmetic mean of the current column: the sum divided by the number of
+// records. Returns NAN when the key list is empty.
+double KeyListOpsMethods::getMean() {
+    if (empty()) {
+        return NAN;
+    }
+
+    const double total = getSum();
+    const float numRecords = (float)getCount();
+    return total / numRecords;
+}
+
+
+// Population standard deviation of the current column: the square root of
+// the mean squared deviation from the mean. Returns NAN when the key list is
+// empty.
+double KeyListOpsMethods::getStddev() {
+    if (empty()) return NAN;
+
+    double avg = getMean();
+    double squareDiffSum = 0.0;
+    for (begin(); !end(); next()) {
+        double val = getColValNum();
+        double diff = val - avg;
+        squareDiffSum += diff * diff;
+    }
+    // squareDiffSum / N is the variance; the standard deviation is its root.
+    return sqrt(squareDiffSum / (double)getCount());
+}
+// Sample standard deviation of the current column: like getStddev(), but the
+// squared deviations are divided by (N - 1). Returns NAN when the key list
+// holds fewer than two records, because the estimate is undefined then.
+double KeyListOpsMethods::getSampleStddev() {
+    if (empty()) return NAN;
+
+    // With a single record the divisor would be zero.
+    if (getCount() < 2) return NAN;
+
+    double avg = getMean();
+    double squareDiffSum = 0.0;
+    for (begin(); !end(); next()) {
+        double val = getColValNum();
+        double diff = val - avg;
+        squareDiffSum += diff * diff;
+    }
+    return sqrt(squareDiffSum / ((double)getCount() - 1.0));
+}
+
+// Median of the current column. Returns NAN when the key list is empty.
+double KeyListOpsMethods::getMedian() {
+    if (empty()) {
+        return NAN;
+    }
+
+    // Work on an ascending copy of the values: an odd count has one middle
+    // element, an even count uses the average of the two middle elements.
+    toArray(true, ASC);
+    const size_t count = getCount();
+    const size_t mid = count / 2;
+    if (count % 2 == 0) {
+        const double sum = _numArray[mid - 1] + _numArray[mid];
+        return sum / 2.0;
+    }
+    return _numArray[mid];
+}
+
+// Most frequent value of the current column, or the null value when the key
+// list is empty. When several values share the highest count, the one that
+// comes first in the frequency map's key order is reported.
+const QuickString &KeyListOpsMethods::getMode() {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    makeFreqMap();
+
+    // Walk the frequency map, remembering the entry with the largest count.
+    freqMapType::iterator best = _freqMap.begin();
+    int bestCount = 0;
+    while (_freqIter != _freqMap.end()) {
+        if (_freqIter->second > bestCount) {
+            bestCount = _freqIter->second;
+            best = _freqIter;
+        }
+        ++_freqIter;
+    }
+    _retStr = best->first;
+    return _retStr;
+}
+// Least frequent value of the current column, or the null value when the key
+// list is empty. When several values share the lowest count, the one that
+// comes first in the frequency map's key order is reported.
+const QuickString &KeyListOpsMethods::getAntiMode() {
+    if (empty()) return _nullVal;
+
+    makeFreqMap();
+
+    // Pass through the freq map and keep track of which key has the lowest
+    // occurrence. The map is not empty here (the key list is not), so the
+    // first entry can seed the search; no INT_MAX sentinel is needed.
+    freqMapType::iterator minIter = _freqMap.begin();
+    int minVal = minIter->second;
+    for (; _freqIter != _freqMap.end(); ++_freqIter) {
+        if (_freqIter->second < minVal) {
+            minIter = _freqIter;
+            minVal = _freqIter->second;
+        }
+    }
+    _retStr = minIter->first;
+    return _retStr;
+}
+// Smallest numeric value of the current column. Returns NAN when the key
+// list is empty.
+double KeyListOpsMethods::getMin() {
+    if (empty()) {
+        return NAN;
+    }
+
+    double smallest = DBL_MAX;
+    begin();
+    while (!end()) {
+        const double current = getColValNum();
+        if (current < smallest) {
+            smallest = current;
+        }
+        next();
+    }
+    return smallest;
+}
+
+// Largest numeric value of the current column. Returns NAN when the key list
+// is empty.
+double KeyListOpsMethods::getMax() {
+    if (empty()) return NAN;
+
+    // Seed with the most negative finite double. DBL_MIN is the smallest
+    // *positive* double, so using it as the seed would hide zero and negative
+    // maxima.
+    double maxVal = -DBL_MAX;
+    for (begin(); !end(); next()) {
+        double currVal = getColValNum();
+        maxVal = (currVal > maxVal) ? currVal : maxVal;
+    }
+    return maxVal;
+}
+
+// Smallest absolute value found in the current column. Returns NAN when the
+// key list is empty.
+double KeyListOpsMethods::getAbsMin() {
+    if (empty()) {
+        return NAN;
+    }
+
+    double smallest = DBL_MAX;
+    begin();
+    while (!end()) {
+        const double magnitude = abs(getColValNum());
+        if (magnitude < smallest) {
+            smallest = magnitude;
+        }
+        next();
+    }
+    return smallest;
+}
+// Largest absolute value found in the current column. Returns NAN when the
+// key list is empty.
+double KeyListOpsMethods::getAbsMax() {
+    if (empty()) return NAN;
+
+    // Absolute values are never negative, so zero is the correct seed.
+    // DBL_MIN (the smallest positive double) would be returned in place of 0
+    // when every value is zero.
+    double maxVal = 0.0;
+    for (begin(); !end(); next()) {
+        double currVal = abs(getColValNum());
+        maxVal = (currVal > maxVal) ? currVal : maxVal;
+    }
+    return maxVal;
+}
+// Number of records in the key list.
+uint32_t KeyListOpsMethods::getCount() {
+    const uint32_t numRecords = _keyList->size();
+    return numRecords;
+}
+// Delimiter-separated list of the different values of the current column,
+// each reported once, in the frequency map's key order. Returns the null
+// value when the key list is empty.
+const QuickString &KeyListOpsMethods::getDistinct() {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    makeFreqMap();
+    _retStr.clear();
+    while (_freqIter != _freqMap.end()) {
+        // Every entry except the first is preceded by the delimiter.
+        if (_freqIter != _freqMap.begin()) {
+            _retStr += _delimStr;
+        }
+        _retStr.append(_freqIter->first);
+        ++_freqIter;
+    }
+    return _retStr;
+}
+
+// Delimiter-separated list of the values that occur exactly once in the
+// current column; values that repeat are discarded. Returns the null value
+// when the key list is empty. When every value repeats, the result is an
+// empty string.
+const QuickString &KeyListOpsMethods::getDistinctOnly() {
+    if (empty()) return _nullVal;
+
+    makeFreqMap();
+    _retStr.clear();
+    // Track whether something has been written already. Comparing against
+    // the map's first entry is not enough: when that entry is a repeated
+    // value it is skipped, and the first reported value would then start with
+    // a delimiter.
+    bool wroteAny = false;
+    for (; _freqIter != _freqMap.end(); ++_freqIter) {
+        if (_freqIter->second != 1) continue;
+        if (wroteAny) _retStr += _delimStr;
+        _retStr.append(_freqIter->first);
+        wroteAny = true;
+    }
+    return _retStr;
+}
+
+// Number of different values in the current column; 0 for an empty key list.
+uint32_t KeyListOpsMethods::getCountDistinct() {
+    if (empty()) {
+        return 0;
+    }
+
+    // One frequency-map entry exists per distinct value.
+    makeFreqMap();
+    return _freqMap.size();
+}
+// Joins the values of the current column, in key-list order, into one string.
+// Returns the null value when the key list is empty.
+//
+// Note: the delimiter argument is not consulted; the separator is the member
+// _delimStr (see setDelimStr()). getConcat() depends on this.
+const QuickString &KeyListOpsMethods::getCollapse(const QuickString &delimiter) {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    _retStr.clear();
+    bool needDelim = false;
+    begin();
+    while (!end()) {
+        if (needDelim) {
+            _retStr += _delimStr;
+        }
+        _retStr.append(getColVal());
+        needDelim = true;
+        next();
+    }
+    return _retStr;
+
+}
+// Concatenates the values of the current column with nothing in between.
+// Returns the null value when the key list is empty.
+const QuickString &KeyListOpsMethods::getConcat() {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    // Same as collapse, but without a separator: blank out the delimiter
+    // string for the duration of the getCollapse() call, then put it back.
+    // getCollapse() leaves its result in _retStr.
+    const QuickString savedDelimStr(_delimStr);
+    _delimStr = "";
+    getCollapse();
+    _delimStr = savedDelimStr;
+    return _retStr;
+}
+
+// Histogram of the current column as delimiter-separated "value:count"
+// entries, highest count first. Returns the null value when the key list is
+// empty.
+const QuickString &KeyListOpsMethods::getFreqDesc() {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    makeFreqMap();
+
+    // Invert the frequency map: key the entries by count (descending order),
+    // with the value as payload.
+    histDescType byCount;
+    while (_freqIter != _freqMap.end()) {
+        byCount.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first));
+        ++_freqIter;
+    }
+
+    // Emit the inverted entries as value:count.
+    _retStr.clear();
+    bool needDelim = false;
+    for (histDescType::iterator entry = byCount.begin(); entry != byCount.end(); ++entry) {
+        if (needDelim) {
+            _retStr += _delimStr;
+        }
+        _retStr.append(entry->second);
+        _retStr += ":";
+        _retStr.append(entry->first);
+        needDelim = true;
+    }
+    return _retStr;
+}
+// Histogram of the current column as delimiter-separated "value:count"
+// entries, lowest count first. Returns the null value when the key list is
+// empty.
+const QuickString &KeyListOpsMethods::getFreqAsc() {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    makeFreqMap();
+
+    // Invert the frequency map: key the entries by count (ascending order),
+    // with the value as payload.
+    histAscType byCount;
+    while (_freqIter != _freqMap.end()) {
+        byCount.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first));
+        ++_freqIter;
+    }
+
+    // Emit the inverted entries as value:count.
+    _retStr.clear();
+    bool needDelim = false;
+    for (histAscType::iterator entry = byCount.begin(); entry != byCount.end(); ++entry) {
+        if (needDelim) {
+            _retStr += _delimStr;
+        }
+        _retStr.append(entry->second);
+        _retStr += ":";
+        _retStr.append(entry->first);
+        needDelim = true;
+    }
+    return _retStr;
+}
+// Value of the current column in the first record of the key list, or the
+// null value when the key list is empty.
+const QuickString &KeyListOpsMethods::getFirst() {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    // Position the iterator on the first record and read its column.
+    begin();
+    return getColVal();
+}
+// Value of the current column in the last record of the key list, or the
+// null value when the key list is empty.
+const QuickString &KeyListOpsMethods::getLast() {
+    if (empty()) {
+        return _nullVal;
+    }
+
+    // Start at the first record and step forward count - 1 times.
+    begin();
+    const size_t stepsToLast = getCount() - 1;
+    for (size_t step = 0; step < stepsToLast; ++step) {
+        next();
+    }
+    return getColVal();
+}
+
+// Returns the text of column _column from the record the iterator currently
+// points at. The iterator must be on a valid entry (not at end()).
+const QuickString &KeyListOpsMethods::getColVal() {
+ return _iter->value()->getField(_column);
+}
+
+// Numeric value of column _column in the record the iterator currently
+// points at, obtained by running atof() on the column's text.
+double KeyListOpsMethods::getColValNum() {
+    const QuickString &fieldText = _iter->value()->getField(_column);
+    return atof(fieldText.c_str());
+}
+
+// Copies the current column of every record into _numArray (useNum true,
+// values converted to double) or _qsArray (useNum false, values kept as
+// text), then sorts the filled array unless sortVal is UNSORTED.
+void KeyListOpsMethods::toArray(bool useNum, SORT_TYPE sortVal) {
+
+    //TBD: optimize performance with better memory management.
+    int idx = 0;
+    if (useNum) {
+        _numArray.resize(_keyList->size());
+        begin();
+        while (!end()) {
+            _numArray[idx] = getColValNum();
+            ++idx;
+            next();
+        }
+    } else {
+        _qsArray.resize(_keyList->size());
+        begin();
+        while (!end()) {
+            _qsArray[idx] = getColVal();
+            ++idx;
+            next();
+        }
+    }
+
+    if (sortVal == UNSORTED) {
+        return;
+    }
+    sortArray(useNum, sortVal == ASC);
+}
+
+// Sorts _numArray (useNum true) or _qsArray (useNum false) in place,
+// ascending when ascOrder is true and descending otherwise.
+void KeyListOpsMethods::sortArray(bool useNum, bool ascOrder)
+{
+    if (useNum && ascOrder) {
+        sort(_numArray.begin(), _numArray.end(), less<double>());
+    } else if (useNum) {
+        sort(_numArray.begin(), _numArray.end(), greater<double>());
+    } else if (ascOrder) {
+        sort(_qsArray.begin(), _qsArray.end(), less<QuickString>());
+    } else {
+        sort(_qsArray.begin(), _qsArray.end(), greater<QuickString>());
+    }
+}
+
+// Rebuilds _freqMap so that it maps each value of the current column to the
+// number of records holding it, and leaves _freqIter on the first entry.
+void KeyListOpsMethods::makeFreqMap() {
+    _freqMap.clear();
+
+    begin();
+    while (!end()) {
+        // operator[] creates a zero count for a value seen for the first time.
+        _freqMap[getColVal()]++;
+        next();
+    }
+    _freqIter = _freqMap.begin();
+}
diff --git a/src/utils/KeyListOps/KeyListOpsMethods.h b/src/utils/KeyListOps/KeyListOpsMethods.h
new file mode 100644
index 0000000..0cac9c8
--- /dev/null
+++ b/src/utils/KeyListOps/KeyListOpsMethods.h
@@ -0,0 +1,113 @@
+/*
+ * KeyListOpsMethods.h
+ *
+ * Created on: Feb 6, 2014
+ * Author: nek3d
+ */
+
+#ifndef KEYLISTOPSMETHODS_H_
+#define KEYLISTOPSMETHODS_H_
+
+using namespace std;
+
+#include <map>
+#include <utility> //for pair
+#include "QuickString.h"
+#include <stdint.h>
+#include "RecordKeyList.h"
+
+// Computes summary values (sum, mean, mode, collapse, ...) over one column of
+// the records held in a RecordKeyList. The key list and the column are chosen
+// with setKeyList() / setColumn(); the string-returning methods hand back
+// references to member buffers, so a result is only valid until the next call.
+class KeyListOpsMethods {
+public:
+ KeyListOpsMethods();
+ KeyListOpsMethods(RecordKeyList *keyList, int column = 1);
+ ~KeyListOpsMethods();
+
+
+ void setKeyList(RecordKeyList *keyList) { _keyList = keyList; }
+ void setColumn(int col) { _column = col; }
+ // String reported by the string-returning methods when the key list is empty.
+ void setNullValue(const QuickString & nullVal) { _nullVal = nullVal; }
+ const QuickString &getNullValue() const { return _nullVal; }
+ // Separator placed between entries of list-style results.
+ void setDelimStr(const QuickString &delimStr) { _delimStr = delimStr; }
+ const QuickString &getDelimStr() const { return _delimStr; }
+
+ // return the total of the values in the vector
+ double getSum();
+ // return the average value in the vector
+ double getMean();
+ // return the standard deviation
+ double getStddev();
+ // return the sample standard deviation
+ double getSampleStddev();
+ // return the median value in the vector
+ double getMedian();
+ // return the most common value in the vector
+ const QuickString &getMode();
+ // return the least common value in the vector
+ const QuickString &getAntiMode();
+ // return the minimum element of the vector
+ double getMin();
+ // return the maximum element of the vector
+ double getMax();
+ // return the minimum absolute value of the vector
+ double getAbsMin();
+ // return the maximum absolute value of the vector
+ double getAbsMax();
+ // return the count of elements in the vector
+ uint32_t getCount();
+ // return the count of _unique_ elements in the vector
+ uint32_t getCountDistinct();
+ // return only those elements that occur once
+ const QuickString &getDistinctOnly();
+ // return a delimiter-separated list of elements.
+ // NOTE: the implementation in KeyListOpsMethods.cpp separates entries with
+ // the string set via setDelimStr(); the delimiter argument is not consulted.
+ const QuickString & getCollapse(const QuickString & delimiter = ",");
+ // return a concatenation of all elements in the vector
+ const QuickString & getConcat();
+ // return a delimiter-separated list of the _unique_ elements
+ const QuickString & getDistinct();
+ // return a histogram of values and their freqs. in desc. order of frequency
+ const QuickString & getFreqDesc();
+ // return a histogram of values and their freqs. in asc. order of frequency
+ const QuickString & getFreqAsc();
+ // return the first value in the list
+ const QuickString & getFirst();
+ // return the last value in the list
+ const QuickString & getLast();
+
+private:
+ RecordKeyList *_keyList;
+ int _column;
+ QuickString _nullVal;
+ QuickString _delimStr;
+ // Buffer behind the references returned by the string-returning methods.
+ QuickString _retStr;
+
+ RecordKeyList _nullKeyList; //this has to exist just so we can initialize _iter, below.
+ RecordKeyList::const_iterator_type _iter;
+
+ // Some methods need to put values into a vector, mostly for sorting.
+ vector<double> _numArray;
+ vector<QuickString> _qsArray;
+
+ // value -> number of records holding that value; rebuilt by makeFreqMap().
+ typedef map<QuickString, int> freqMapType;
+ freqMapType _freqMap;
+ freqMapType::iterator _freqIter;
+
+ typedef enum { UNSORTED, ASC, DESC} SORT_TYPE;
+
+ // count -> value, ordered by ascending / descending count.
+ typedef multimap<int, QuickString, less<int> > histAscType;
+ typedef multimap<int, QuickString, greater<int> > histDescType;
+ // NOTE(review): no definition of init() appears in KeyListOpsMethods.cpp in this change.
+ void init();
+ const QuickString &getColVal();
+ double getColValNum();
+ // Thin wrappers around the key list's own iteration interface.
+ bool empty() { return _keyList->empty(); }
+ void begin() { _iter = _keyList->begin(); }
+ bool end() { return _iter == _keyList->end(); }
+ void next() { _iter = _keyList->next(); }
+ void toArray(bool useNum, SORT_TYPE sortVal = UNSORTED);
+ void sortArray(bool useNum, bool ascOrder);
+ void makeFreqMap();
+
+
+};
+
+
+#endif /* KEYLISTOPSMETHODS_H_ */
diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/KeyListOps/Makefile
similarity index 81%
copy from src/utils/NewChromsweep/Makefile
copy to src/utils/KeyListOps/Makefile
index 8f4d931..0b0ac99 100644
--- a/src/utils/NewChromsweep/Makefile
+++ b/src/utils/KeyListOps/Makefile
@@ -6,7 +6,6 @@ UTILITIES_DIR = ../../utils/
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
- -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
@@ -19,21 +18,26 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \
# ----------------------------------
# define our source and object files
# ----------------------------------
-SOURCES= NewChromsweep.cpp NewChromsweep.h
-OBJECTS= NewChromsweep.o
+SOURCES= KeyListOps.cpp KeyListOps.h KeyListOpsMethods.cpp KeyListOpsMethods.h
+OBJECTS= KeyListOps.o KeyListOpsMethods.o
_EXT_OBJECTS=
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+all: $(BUILT_OBJECTS)
+
+.PHONY: all
+
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(INCLUDES)
clean:
@echo "Cleaning up."
- @rm -f $(OBJ_DIR)/NewChromsweep.o $(BIN_DIR)/NewChromsweep.o
+ @rm -f $(OBJ_DIR)/KeyListOps.o $(OBJ_DIR)/KeyListOpsMethods.o
.PHONY: clean
\ No newline at end of file
diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/NewChromsweep/Makefile
index 8f4d931..34fc5d1 100644
--- a/src/utils/NewChromsweep/Makefile
+++ b/src/utils/NewChromsweep/Makefile
@@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src/ \
-I$(UTILITIES_DIR)/version/
diff --git a/src/utils/RecordOutputMgr/Makefile b/src/utils/RecordOutputMgr/Makefile
index 2d196ec..346a5c7 100644
--- a/src/utils/RecordOutputMgr/Makefile
+++ b/src/utils/RecordOutputMgr/Makefile
@@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+ -I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src/ \
-I$(UTILITIES_DIR)/version/
diff --git a/src/utils/general/Makefile b/src/utils/general/Makefile
index 43dcfba..0361fab 100644
--- a/src/utils/general/Makefile
+++ b/src/utils/general/Makefile
@@ -4,7 +4,7 @@ UTILITIES_DIR = ../../utils/
# -------------------
# define our includes
# -------------------
-INCLUDES =
+INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/
# ----------------------------------
# define our source and object files
diff --git a/src/utils/general/QuickString.cpp b/src/utils/general/QuickString.cpp
index 831f84a..9e06186 100644
--- a/src/utils/general/QuickString.cpp
+++ b/src/utils/general/QuickString.cpp
@@ -3,6 +3,7 @@
#include <cstdlib>
#include <cstdio>
#include "ParseTools.h"
+#include "lineFileUtilities.h"
QuickString::QuickString(size_t capacity)
: _buffer(NULL),
@@ -82,6 +83,35 @@ QuickString &QuickString::operator = (const QuickString & inBuf){
return *this;
}
+QuickString &QuickString::operator = (char val) {
+ clear();
+ append(val);
+ return *this;
+}
+QuickString &QuickString::operator = (int val) {
+ clear();
+ append(val);
+ return *this;
+}
+
+QuickString &QuickString::operator = (uint32_t val) {
+ clear();
+ append(val);
+ return *this;
+}
+
+QuickString &QuickString::operator = (float val) {
+ clear();
+ append(val);
+ return *this;
+}
+
+QuickString &QuickString::operator = (double val) {
+ clear();
+ append(val);
+ return *this;
+}
+
QuickString &QuickString::operator += (const QuickString & inBuf)
{
@@ -107,6 +137,26 @@ QuickString &QuickString::operator += (const char *inBuf)
return *this;
}
+QuickString &QuickString::operator += (int num) {
+ append(num);
+ return *this;
+}
+
+QuickString &QuickString::operator += (uint32_t num) {
+ append(num);
+ return *this;
+}
+
+QuickString &QuickString::operator += (float num) {
+ append(num);
+ return *this;
+}
+
+QuickString &QuickString::operator += (double num) {
+ append(num);
+ return *this;
+}
+
bool QuickString::operator == (const QuickString &qs) const {
if ( _currSize != qs._currSize) {
return false;
@@ -194,6 +244,21 @@ void QuickString::append(const char *inBuf, size_t inBufLen)
void QuickString::append(int num) {
int2str(num, *this, true);
}
+
+void QuickString::append(uint32_t num) {
+ if (num <= (uint32_t)INT_MAX) int2str((int)num, *this, true); else append(ToString(num)); // values above INT_MAX would turn negative if cast to int
+}
+
+void QuickString::append(float num) {
+ append(ToString(num));
+}
+
+void QuickString::append(double num) {
+ append(ToString(num));
+}
+
+
+
QuickString &QuickString::assign(const char *inBuf, size_t inBufLen)
{
clear();
diff --git a/src/utils/general/QuickString.h b/src/utils/general/QuickString.h
index 5fdc0fc..a76e5ff 100644
--- a/src/utils/general/QuickString.h
+++ b/src/utils/general/QuickString.h
@@ -10,6 +10,7 @@
using namespace std;
#include <string>
+#include <stdint.h>
#include <climits>
#include <ostream>
@@ -32,10 +33,19 @@ public:
QuickString &operator = (const string &);
QuickString &operator = (const char *);
QuickString &operator = (const QuickString &);
+ QuickString &operator = (char);
+ QuickString &operator = (int);
+ QuickString &operator = (uint32_t);
+ QuickString &operator = (float);
+ QuickString &operator = (double);
QuickString &operator += (const QuickString &);
QuickString &operator += (const string &);
QuickString &operator += (const char *);
QuickString &operator += (char);
+ QuickString &operator += (int);
+ QuickString &operator += (uint32_t);
+ QuickString &operator += (float);
+ QuickString &operator += (double);
friend ostream &operator << (ostream &out, const QuickString &str);
bool operator == (const QuickString &) const;
@@ -52,7 +62,16 @@ public:
void append(const QuickString &str) { append(str.c_str(), str.size()); }
void append(const char *buf, size_t bufLen);
void append(char c);
+
+ //These are not templated because float and double require a stringstream based
+ //implementation, while the integer append uses a much faster home-brewed algorithm
+ //for better performance.
void append(int num);
+ void append(uint32_t num);
+ void append(float num);
+ void append(double num);
+
+
QuickString &assign(const char *str, size_t n);
void resize(size_t n, char c = '\0');
diff --git a/test/map/test-map.sh b/test/map/test-map.sh
index 293d84e..a47b14e 100644
--- a/test/map/test-map.sh
+++ b/test/map/test-map.sh
@@ -499,10 +499,8 @@ echo " map.t33..\c"
echo \
"
*****
-*****ERROR: requested column 15 , but record only has fields 1 - 12. Exiting.
-
-*****" > exp
-$BT map -a ivls.bed -b test.vcf -c 15 -o collapse 2> obs
+***** ERROR: Requested column 15, but database file test.vcf only has fields 1 - 12." > exp
+$BT map -a ivls.bed -b test.vcf -c 15 -o collapse 2>&1 > /dev/null | head -3> obs
check obs exp
rm obs exp
@@ -624,12 +622,9 @@ echo " map.t41..\c"
echo \
"
*****
-*****ERROR: requested column 41 , but record only has fields 1 - 6. Exiting.
-
-*****" > exp
-$BT map -a ivls.bed -b values5.bed -c 41 -o collapse 2> obs
+***** ERROR: Requested column 41, but database file test.vcf only has fields 1 - 12." > exp
+$BT map -a ivls.bed -b test.vcf -c 41 -o collapse 2>&1 > /dev/null | head -3> obs
check obs exp
-
rm obs exp
###########################################################
@@ -639,12 +634,9 @@ echo " map.t42..\c"
echo \
"
*****
-*****ERROR: requested column -1 , but record only has fields 1 - 6. Exiting.
-
-*****" > exp
-$BT map -a ivls.bed -b values5.bed -c -1 -o collapse 2> obs
+***** ERROR: Requested column -1, but database file test.vcf only has fields 1 - 12." > exp
+$BT map -a ivls.bed -b test.vcf -c -1 -o collapse 2>&1 > /dev/null | head -3> obs
check obs exp
-
rm obs exp
###########################################################
@@ -654,12 +646,9 @@ echo " map.t43..\c"
echo \
"
*****
-*****ERROR: requested column 0 , but record only has fields 1 - 6. Exiting.
-
-*****" > exp
-$BT map -a ivls.bed -b values5.bed -c 0 -o collapse 2> obs
+***** ERROR: Requested column 0, but database file test.vcf only has fields 1 - 12." > exp
+$BT map -a ivls.bed -b test.vcf -c 0 -o collapse 2>&1 > /dev/null | head -3> obs
check obs exp
-
rm obs exp
@@ -667,7 +656,7 @@ rm obs exp
# Test that Bam database is not allowed
############################################################
echo " map.t44...\c"
-echo -e "\n*****\n***** ERROR: BAM database file not currently supported for the map tool." > exp
+echo -e "\n*****\n***** ERROR: BAM database file not currently supported for column operations." > exp
$BT map -a ivls.bed -b values.bam 2> obs
check obs exp
rm obs exp
@@ -682,3 +671,71 @@ echo "chr1 0 50 three_blocks_match 15 + 0 0 0 3 10,10,10, 0,20,40, ." > exp
$BT map -o sum -a three_blocks_match.bed -b three_blocks_nomatch.bed -split > obs
check obs exp
rm obs exp
+
+
+
+
+
+
+###########################################################
+#
+#
+# Tests for multiple columns and operations
+#
+#
+############################################################
+
+
+###########################################################
+# Test that error is given when ops outnumber columns
+############################################################
+echo " map.t46...\c"
+echo \
+"
+*****
+***** ERROR: There are 1 columns given, but there are 2 operations." > exp
+$BT map -a ivls.bed -b values.bed -o count,sum 2>&1 > /dev/null | head -3 > obs
+check obs exp
+rm obs exp
+
+
+###########################################################
+# Test that error is given when columns outnumber ops,
+# if there are two or more ops.
+############################################################
+echo " map.t47...\c"
+echo \
+"
+*****
+***** ERROR: There are 3 columns given, but there are 2 operations." > exp
+$BT map -a ivls.bed -b values.bed -c 5,1,2 -o count,sum 2>&1 > /dev/null | head -3 > obs
+check obs exp
+rm obs exp
+
+
+###########################################################
+# Test that numeric ops for non-numeric columns aren't allowed
+############################################################
+echo " map.t48...\c"
+echo \
+"
+*****
+***** ERROR: Column 1 is not a numeric field for database file values.bed." > exp
+$BT map -a ivls.bed -b values.bed -c 1 -o sum 2>&1 > /dev/null | head -3 > obs
+check obs exp
+rm obs exp
+
+
+###########################################################
+# Test that multiple columns are allowed with a
+# single operation
+############################################################
+#
+# TBD
+#
+#echo " map.t49...\c"
+#../../bin/bedtools map -a ivls.bed -b values.bed -c 2 -o sum 2>&1 > /dev/null | head -3 > obs
+#check obs exp
+#rm obs exp
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bedtools.git
More information about the debian-med-commit
mailing list