[med-svn] [filo] 02/04: Imported Upstream version 1.1.0

Kevin Murray daube-guest at moszumanska.debian.org
Mon Dec 21 14:03:35 UTC 2015


This is an automated email from the git hooks/post-receive script.

daube-guest pushed a commit to branch master
in repository filo.

commit 4c2e733f51f96d42b6d9929a9130951cfd06d930
Author: Kevin Murray <spam at kdmurray.id.au>
Date:   Tue Dec 22 00:57:53 2015 +1100

    Imported Upstream version 1.1.0
---
 LICENSE                      | 19 +++++++++++++++++++
 README.rst                   |  2 ++
 src/common/version/version.h |  2 +-
 src/groupBy/groupBy.cpp      | 45 ++++++++++++++++++++++++++++++++++++--------
 src/groupBy/groupBy.h        |  4 ++--
 src/shuffle/shuffle.cpp      | 13 ++++++++++++-
 src/stats/stats.cpp          | 12 ++++++++++++
 7 files changed, 85 insertions(+), 12 deletions(-)

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8f49d70
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (C) 2009,2010,2011 by Aaron Quinlan.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.rst b/README.rst
index a053fd8..bb6574b 100644
--- a/README.rst
+++ b/README.rst
@@ -32,6 +32,8 @@ Here is the current list of the available operations.
 9. *antimode* - numeric or text collapse (i.e., print a comma separated list) - numeric or text 
 10. *freqasc* - print a comma separated list of values observed and the number of times they were observed. Reported in **ascending** order of frequency.
 11. *freqdesc* - print a comma separated list of values observed and the number of times they were observed. Reported in descending order of frequency.
+12. *collapse* - print a comma separated list of each value in the grouped column.
+13. *concat* - concatenate each value in the grouped column into a single string.
 
 And here are some usage examples.  I hope you find this utility to be of use in your work.  I have found it to be a huge time saver.
 
diff --git a/src/common/version/version.h b/src/common/version/version.h
index a3e541f..c4fce5d 100644
--- a/src/common/version/version.h
+++ b/src/common/version/version.h
@@ -5,4 +5,4 @@
 // suite carry the same version number.
 #define VERSION "1.1.0"
 
-#endif /* VERSION_H */
\ No newline at end of file
+#endif /* VERSION_H */
diff --git a/src/groupBy/groupBy.cpp b/src/groupBy/groupBy.cpp
index ce6d177..1babc0f 100644
--- a/src/groupBy/groupBy.cpp
+++ b/src/groupBy/groupBy.cpp
@@ -1,16 +1,17 @@
 /*****************************************************************************
 groupBy.cpp
 
-(c) 2009, 2010 - Aaron Quinlan
+(c) 2009, 2010, 2011 - Aaron Quinlan
 Center for Public Health Genomics
 University of Virginia
 aaronquinlan at gmail.com
 
-Licenced under the GNU General Public License 2.0 license.
+Licenced under the MIT license.
 ******************************************************************************/
 #include <vector>
 #include <map>
 #include <numeric>
+#include <algorithm>
 #include <iterator>
 #include <iostream>
 #include <iomanip>
@@ -184,8 +185,9 @@ int main(int argc, char* argv[]) {
     for( size_t i = 0; i < ops.size(); i++ ) {
         if ((ops[i] != "sum")  && (ops[i] != "max")    && (ops[i] != "min") && (ops[i] != "mean") &&
             (ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") &&
-            (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "freqdesc") &&
-        (ops[i] != "freqasc")) {
+            (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "distinct") &&
+            (ops[i] != "concat") && (ops[i] != "freqdesc") && (ops[i] != "freqasc")) 
+        {
             cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "\"  *****" << endl;
             showHelp = true;
         }
@@ -261,7 +263,9 @@ void ShowHelp(void) {
     cerr                         << "\t\t\t    sum, count, min, max," << endl;
     cerr                         << "\t\t\t    mean, median, mode, antimode," << endl;
     cerr                         << "\t\t\t    stdev, sstdev (sample standard dev.)," << endl;
-    cerr                         << "\t\t\t    collapse (i.e., print a comma separated list), " << endl;
+    cerr                         << "\t\t\t    collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
+    cerr                         << "\t\t\t    distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
+    cerr                         << "\t\t\t    concat   (i.e., merge values into a single, non-delimited string), " << endl;
     cerr                         << "\t\t\t    freqdesc (i.e., print desc. list of values:freq)" << endl;
     cerr                         << "\t\t\t    freqasc (i.e., print asc. list of values:freq)" << endl;
     cerr                         << "\t\t\t- Default: sum" << endl << endl;
@@ -283,8 +287,8 @@ void ShowHelp(void) {
 
     cerr << "Examples: " << endl;
     cerr << "\t$ cat ex1.out" << endl;
-    cerr << "\tchr1 10  20  A   chr1    15  25  B.1 1000" << endl;
-    cerr << "\tchr1 10  20  A   chr1    25  35  B.2 10000" << endl << endl;
+    cerr << "\tchr1 10  20  A   chr1    15  25  B.1 1000    ATAT" << endl;
+    cerr << "\tchr1 10  20  A   chr1    25  35  B.2 10000   CGCG" << endl << endl;
     cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum" << endl;
     cerr << "\tchr1 10  20  A   11000" << endl << endl;
     cerr << "\t$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max" << endl;
@@ -293,6 +297,8 @@ void ShowHelp(void) {
     cerr << "\tchr1 10  20  A   B.1,B.2,    5500" << endl << endl;
     cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl;
     cerr << "\tchr1 10  20  A   B.1,B.2,    5500" << endl << endl;
+    cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat" << endl;
+    cerr << "\tchr1 10  20  A   ATATCGCG" << endl << endl;
 
     cerr << "Notes: " << endl;
     cerr << "\t(1)  The input file/stream should be sorted/grouped by the -grp. columns" << endl;
@@ -312,7 +318,7 @@ void GroupBy (const string &inFile,
     const bool printOriginalLine,
     const bool printHeaderLine,
     const bool InputHaveHeaderLine,
-const bool ignoreCase) {
+    const bool ignoreCase) {
 
     // current line number
     int lineNum = 0;
@@ -427,6 +433,29 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d
             }
             result.push_back(collapse);
         }
+        else if (op == "distinct") {
+            string distinct;
+            // get the current column's data
+            vector<string> col_data = data[i];
+            // remove duplicate entries from the vector
+            // http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector
+            sort( col_data.begin(), col_data.end() );
+            col_data.erase( unique( col_data.begin(), col_data.end() ), col_data.end() );
+            
+            for( size_t j = 0; j < col_data.size(); j++ ) {//Ugly, but cannot use back_inserter
+                if (j>0)
+                    distinct.append(",");
+                distinct.append(col_data[j]);
+            }
+            result.push_back(distinct);
+        }
+        else if (op == "concat") {
+            string concat;
+            for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
+                concat.append(data[i][j]);
+            }
+            result.push_back(concat);
+        }
         else if (op == "min") {
             buffer << setprecision (PRECISION) << *min_element( dataF.begin(), dataF.end() );
             result.push_back(buffer.str());
diff --git a/src/groupBy/groupBy.h b/src/groupBy/groupBy.h
index 0503cf6..af8ab6a 100644
--- a/src/groupBy/groupBy.h
+++ b/src/groupBy/groupBy.h
@@ -1,12 +1,12 @@
 /*****************************************************************************
   groupBy.h
 
-  (c) 2009, 2010 - Aaron Quinlan
+  (c) 2009, 2010, 2011 - Aaron Quinlan
   Center for Public Health Genomics
   University of Virginia
   aaronquinlan at gmail.com
 
-  Licenced under the GNU General Public License 2.0 license.
+  Licenced under the MIT license.
 ******************************************************************************/
 
 #ifndef GROUPBY_H
diff --git a/src/shuffle/shuffle.cpp b/src/shuffle/shuffle.cpp
index b50b1b5..c1fc9c6 100755
--- a/src/shuffle/shuffle.cpp
+++ b/src/shuffle/shuffle.cpp
@@ -1,8 +1,19 @@
+/*****************************************************************************
+  shuffle.cpp
+
+  (c) 2009, 2010, 2011 - Aaron Quinlan
+  Center for Public Health Genomics
+  University of Virginia
+  aaronquinlan at gmail.com
+
+  Licenced under the MIT license.
+******************************************************************************/
 #include <iostream>
 #include <fstream>
 #include <vector>
 #include <algorithm>
 #include <math.h>
+#include <cstring>
 #include <unistd.h> // for getpid()
 #include "version.h"
 
@@ -66,7 +77,7 @@ int main(int argc, char* argv[]) {
     long totalLines = 0;
 
     vector<string> linesVector;
-    linesVector.reserve(1E6);   // allocate 1 mill lines of input.
+    linesVector.reserve((int) 1E6);   // allocate 1 mill lines of input.
 
     // 0. Are we dealing with a stream or a proper file? Default to a stream.
     istream *in = &cin;
diff --git a/src/stats/stats.cpp b/src/stats/stats.cpp
index dc7e448..c394816 100755
--- a/src/stats/stats.cpp
+++ b/src/stats/stats.cpp
@@ -1,3 +1,13 @@
+/*****************************************************************************
+  stats.cpp
+
+  (c) 2009, 2010, 2011 - Aaron Quinlan
+  Center for Public Health Genomics
+  University of Virginia
+  aaronquinlan at gmail.com
+
+  Licenced under the MIT license.
+******************************************************************************/
 #include <iostream>
 #include <fstream>
 #include <iomanip>
@@ -7,6 +17,8 @@
 #include <math.h>
 #include <exception>
 #include <stdexcept> // out_of_range exception
+#include <cstring>
+#include <climits>
 
 #include "version.h"
 #include "lineFileUtilities.h"

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/filo.git



More information about the debian-med-commit mailing list