[med-svn] [filo] 02/04: Imported Upstream version 1.1.0
Kevin Murray
daube-guest at moszumanska.debian.org
Mon Dec 21 14:03:35 UTC 2015
This is an automated email from the git hooks/post-receive script.
daube-guest pushed a commit to branch master
in repository filo.
commit 4c2e733f51f96d42b6d9929a9130951cfd06d930
Author: Kevin Murray <spam at kdmurray.id.au>
Date: Tue Dec 22 00:57:53 2015 +1100
Imported Upstream version 1.1.0
---
LICENSE | 19 +++++++++++++++++++
README.rst | 2 ++
src/common/version/version.h | 2 +-
src/groupBy/groupBy.cpp | 45 ++++++++++++++++++++++++++++++++++++--------
src/groupBy/groupBy.h | 4 ++--
src/shuffle/shuffle.cpp | 13 ++++++++++++-
src/stats/stats.cpp | 12 ++++++++++++
7 files changed, 85 insertions(+), 12 deletions(-)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8f49d70
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (C) 2009,2010,2011 by Aaron Quinlan.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.rst b/README.rst
index a053fd8..bb6574b 100644
--- a/README.rst
+++ b/README.rst
@@ -32,6 +32,8 @@ Here is the current list of the available operations.
9. *antimode* - numeric or text collapse (i.e., print a comma separated list) - numeric or text
10. *freqasc* - print a comma separated list of values observed and the number of times they were observed. Reported in **ascending** order of frequency.
11. *freqdesc* - print a comma separated list of values observed and the number of times they were observed. Reported in descending order of frequency.
+12. *collapse* - print a comma separated list of each value in the grouped column.
+13. *concat* - concatenate each value in the grouped column into a single string.
And here are some usage examples. I hope you find this utility to be of use in your work. I have found it to be a huge time saver.
diff --git a/src/common/version/version.h b/src/common/version/version.h
index a3e541f..c4fce5d 100644
--- a/src/common/version/version.h
+++ b/src/common/version/version.h
@@ -5,4 +5,4 @@
// suite carry the same version number.
#define VERSION "1.1.0"
-#endif /* VERSION_H */
\ No newline at end of file
+#endif /* VERSION_H */
diff --git a/src/groupBy/groupBy.cpp b/src/groupBy/groupBy.cpp
index ce6d177..1babc0f 100644
--- a/src/groupBy/groupBy.cpp
+++ b/src/groupBy/groupBy.cpp
@@ -1,16 +1,17 @@
/*****************************************************************************
groupBy.cpp
-(c) 2009, 2010 - Aaron Quinlan
+(c) 2009, 2010, 2011 - Aaron Quinlan
Center for Public Health Genomics
University of Virginia
aaronquinlan at gmail.com
-Licenced under the GNU General Public License 2.0 license.
+Licenced under the MIT license.
******************************************************************************/
#include <vector>
#include <map>
#include <numeric>
+#include <algorithm>
#include <iterator>
#include <iostream>
#include <iomanip>
@@ -184,8 +185,9 @@ int main(int argc, char* argv[]) {
for( size_t i = 0; i < ops.size(); i++ ) {
if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") &&
(ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") &&
- (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "freqdesc") &&
- (ops[i] != "freqasc")) {
+ (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "distinct") &&
+ (ops[i] != "concat") && (ops[i] != "freqdesc") && (ops[i] != "freqasc"))
+ {
cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "\" *****" << endl;
showHelp = true;
}
@@ -261,7 +263,9 @@ void ShowHelp(void) {
cerr << "\t\t\t sum, count, min, max," << endl;
cerr << "\t\t\t mean, median, mode, antimode," << endl;
cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl;
- cerr << "\t\t\t collapse (i.e., print a comma separated list), " << endl;
+ cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
+ cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
+ cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl;
cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
cerr << "\t\t\t- Default: sum" << endl << endl;
@@ -283,8 +287,8 @@ void ShowHelp(void) {
cerr << "Examples: " << endl;
cerr << "\t$ cat ex1.out" << endl;
- cerr << "\tchr1 10 20 A chr1 15 25 B.1 1000" << endl;
- cerr << "\tchr1 10 20 A chr1 25 35 B.2 10000" << endl << endl;
+ cerr << "\tchr1 10 20 A chr1 15 25 B.1 1000 ATAT" << endl;
+ cerr << "\tchr1 10 20 A chr1 25 35 B.2 10000 CGCG" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum" << endl;
cerr << "\tchr1 10 20 A 11000" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max" << endl;
@@ -293,6 +297,8 @@ void ShowHelp(void) {
cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl;
cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl;
cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl;
+ cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat" << endl;
+ cerr << "\tchr1 10 20 A ATATCGCG" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The input file/stream should be sorted/grouped by the -grp. columns" << endl;
@@ -312,7 +318,7 @@ void GroupBy (const string &inFile,
const bool printOriginalLine,
const bool printHeaderLine,
const bool InputHaveHeaderLine,
-const bool ignoreCase) {
+ const bool ignoreCase) {
// current line number
int lineNum = 0;
@@ -427,6 +433,29 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d
}
result.push_back(collapse);
}
+ else if (op == "distinct") {
+ string distinct;
+ // get the current column's data
+ vector<string> col_data = data[i];
+ // remove duplicate entries from the vector
+ // http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector
+ sort( col_data.begin(), col_data.end() );
+ col_data.erase( unique( col_data.begin(), col_data.end() ), col_data.end() );
+
+ for( size_t j = 0; j < col_data.size(); j++ ) {//Ugly, but cannot use back_inserter
+ if (j>0)
+ distinct.append(",");
+ distinct.append(col_data[j]);
+ }
+ result.push_back(distinct);
+ }
+ else if (op == "concat") {
+ string concat;
+ for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
+ concat.append(data[i][j]);
+ }
+ result.push_back(concat);
+ }
else if (op == "min") {
buffer << setprecision (PRECISION) << *min_element( dataF.begin(), dataF.end() );
result.push_back(buffer.str());
diff --git a/src/groupBy/groupBy.h b/src/groupBy/groupBy.h
index 0503cf6..af8ab6a 100644
--- a/src/groupBy/groupBy.h
+++ b/src/groupBy/groupBy.h
@@ -1,12 +1,12 @@
/*****************************************************************************
groupBy.h
- (c) 2009, 2010 - Aaron Quinlan
+ (c) 2009, 2010, 2011 - Aaron Quinlan
Center for Public Health Genomics
University of Virginia
aaronquinlan at gmail.com
- Licenced under the GNU General Public License 2.0 license.
+ Licenced under the MIT license.
******************************************************************************/
#ifndef GROUPBY_H
diff --git a/src/shuffle/shuffle.cpp b/src/shuffle/shuffle.cpp
index b50b1b5..c1fc9c6 100755
--- a/src/shuffle/shuffle.cpp
+++ b/src/shuffle/shuffle.cpp
@@ -1,8 +1,19 @@
+/*****************************************************************************
+ shuffle.cpp
+
+ (c) 2009, 2010, 2011 - Aaron Quinlan
+ Center for Public Health Genomics
+ University of Virginia
+ aaronquinlan at gmail.com
+
+ Licenced under the MIT license.
+******************************************************************************/
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <math.h>
+#include <cstring>
#include <unistd.h> // for getpid()
#include "version.h"
@@ -66,7 +77,7 @@ int main(int argc, char* argv[]) {
long totalLines = 0;
vector<string> linesVector;
- linesVector.reserve(1E6); // allocate 1 mill lines of input.
+ linesVector.reserve((int) 1E6); // allocate 1 mill lines of input.
// 0. Are we dealing with a stream or a proper file? Default to a stream.
istream *in = &cin;
diff --git a/src/stats/stats.cpp b/src/stats/stats.cpp
index dc7e448..c394816 100755
--- a/src/stats/stats.cpp
+++ b/src/stats/stats.cpp
@@ -1,3 +1,13 @@
+/*****************************************************************************
+ stats.cpp
+
+ (c) 2009, 2010, 2011 - Aaron Quinlan
+ Center for Public Health Genomics
+ University of Virginia
+ aaronquinlan at gmail.com
+
+ Licenced under the MIT license.
+******************************************************************************/
#include <iostream>
#include <fstream>
#include <iomanip>
@@ -7,6 +17,8 @@
#include <math.h>
#include <exception>
#include <stdexcept> // out_of_range exception
+#include <cstring>
+#include <climits>
#include "version.h"
#include "lineFileUtilities.h"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/filo.git
More information about the debian-med-commit
mailing list