[med-svn] [gwama] 01/02: Imported Upstream version 2.2.2+dfsg
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Fri Aug 26 05:31:26 UTC 2016
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository gwama.
commit 3d879d0e12bce254992c0292873994a483271bd5
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Fri Aug 26 07:30:57 2016 +0200
Imported Upstream version 2.2.2+dfsg
---
Makefile | 2 +-
commandLine.cpp | 52 ++++-
global.cpp | 40 ++--
global.h | 10 +
readFile.cpp | 688 +++++++++++++++++++++++++++++++++++++++++++++++++-------
readFile.h | 2 +-
6 files changed, 698 insertions(+), 96 deletions(-)
diff --git a/Makefile b/Makefile
index 9be1847..310d954 100755
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ VERSION = 2.7
CC = g++
-DEBUGFLAGS = -Wno-deprecated -O3
+DEBUGFLAGS = -Wno-deprecated -O3 -lz
GWAMA: main.cpp
diff --git a/commandLine.cpp b/commandLine.cpp
index 7e0acd4..5697a2a 100755
--- a/commandLine.cpp
+++ b/commandLine.cpp
@@ -54,7 +54,7 @@ void printHelp(string version)
cout << "| GWAMA |" << v << "| May, 2010 |" << endl;
cout << "|----------------------------------------------------------|" << endl;
cout << "| (C) 2008 Reedik Magi & Andrew P Morris |" << endl;
- cout << "| GNU General Public License, v2 |" << endl;
+ cout << "| BSD 3-Clause License |" << endl;
cout << "|----------------------------------------------------------|" << endl;
cout << "| For documentation, citation & bug-report instructions: |" << endl;
cout << "| http://www.well.ox.ac.uk/GWAMA/ |" << endl;
@@ -93,6 +93,11 @@ void printHelp(string version)
cout << " must be provided in filelist file. "<<endl;
cout << " (second column after file names is "<<endl;
cout << " either M or F) More info in tutorial."<<endl<<endl;
+ cout << " --filter Set a filtering based on column "<<endl;
+ cout << " header. It needs 3 arguments: column "<<endl;
+ cout << " name, equation [>,<,>=,<=,==,!=], "<<endl;
+ cout << " filter value. Multiple filters can be"<<endl;
+ cout << " set. "<<endl<<endl;
cout << " --name_marker alternative header to marker name col"<<endl<<endl;
cout << " --name_strand alternative header to strand column "<<endl<<endl;
cout << " --name_n alternative header to sample size col"<<endl<<endl;
@@ -354,6 +359,51 @@ readCommandline (int argc, char *argv[], global & _g)
}
i++;
}
+ else if (string(argv[i]).compare("--filter")==0)
+ {
+ string a = "";
+ string b = "";
+ double c = -9999;
+ if (i != argc-1)
+ {
+ a = string(argv[i+1]);
+ }
+ if (i == argc-1 )
+ {
+ cout<<"ERROR: Missing a value for --filter\n";
+ return 0;
+ }
+ i++;
+ if (i != argc-1)
+ {
+ b = string(argv[i+1]);
+ }
+ if (i == argc-1 )
+ {
+ cout<<"ERROR: Missing a value for --filter\n";
+ return 0;
+ }
+ i++;
+ if (i != argc-1)
+ {
+ c = atof(argv[i+1]);
+ }
+ if (i == argc-1 )
+ {
+ cout<<"ERROR: Missing a value for --filter\n";
+ return 0;
+ }
+ i++;
+
+ filter _filter(a,b,c);
+ if (! _filter.valid())
+ {
+ cout<<"ERROR: Filter not valid. Please check the format\n";
+ return 0;
+ }
+ cout << "Setting filter for column: " << a << " to filter out samples with values " << b << " " << c << endl;
+ _g.filters.push_back(_filter);
+ }
else if (string(argv[i]).compare("--threshold")==0 || string(argv[i]).compare("-t")==0)
{
if (i != argc-1)
diff --git a/global.cpp b/global.cpp
index d725a9d..4de740b 100755
--- a/global.cpp
+++ b/global.cpp
@@ -24,6 +24,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdlib.h>
#include <fstream>
using namespace std;
+filter::filter(string columnName, string equation, double value)
+{
+ _columnName = columnName;
+ _equation = equation;
+ _value = value;
+}
+bool
+filter::valid()
+{
+ if (_columnName == "" || _value == -9999) return false;
+ if (_equation == "==" || _equation == "!=" || _equation == ">"
+ || _equation == ">=" || _equation == "<" || _equation == "<=") return true;
+ return false;
+}
global::global()
{
_errNR = 1; //count of errors and warnings found
@@ -40,7 +54,7 @@ global::global()
_fileList = "gwama.in"; //gwama.in file
_markerMap = "N"; //map file name
_outputRoot = "gwama"; //output file root (w/o file extension)
- _version = "2.1";
+ _version = "2.2.2";
outputLambda=1;
outputLambda_male=1;
@@ -48,25 +62,25 @@ global::global()
- _alternativeMarkerNames.push_back("SNP");
- _alternativeMarkerNames.push_back("RS");
- _alternativeMarkerNames.push_back("SNPID");
- _alternativeMarkerNames.push_back("ID");
- _alternativeMarkerNames.push_back("MARKER");
+// _alternativeMarkerNames.push_back("SNP");
+// _alternativeMarkerNames.push_back("RS");
+// _alternativeMarkerNames.push_back("SNPID");
+// _alternativeMarkerNames.push_back("ID");
+// _alternativeMarkerNames.push_back("MARKER");
_alternativeMarkerNames.push_back("MARKERNAME");
_alternativeStrands.push_back("STRAND");
_alternativeEffectAlleles.push_back("EA");
- _alternativeEffectAlleles.push_back("EFFECT_ALLELE");
- _alternativeOtherAlleles.push_back("NON_EFFECT_ALLELE");
- _alternativeOtherAlleles.push_back("OTHER_ALLELE");
+// _alternativeEffectAlleles.push_back("EFFECT_ALLELE");
+// _alternativeOtherAlleles.push_back("NON_EFFECT_ALLELE");
+// _alternativeOtherAlleles.push_back("OTHER_ALLELE");
_alternativeOtherAlleles.push_back("NEA");
- _alternativeEffectAlleleFreqs.push_back("EFFECT_ALLELE_FREQUENCY");
+// _alternativeEffectAlleleFreqs.push_back("EFFECT_ALLELE_FREQUENCY");
_alternativeEffectAlleleFreqs.push_back("EAF");
- _alternativeEffectAlleleFreqs.push_back("EFFECT_ALLELE_FREQ");
+// _alternativeEffectAlleleFreqs.push_back("EFFECT_ALLELE_FREQ");
_alternativeBetas.push_back("BETA");
- _alternativeBetas.push_back("EFFECT");
+// _alternativeBetas.push_back("EFFECT");
_alternativeSEs.push_back("SE");
- _alternativeSEs.push_back("STDERR");
+// _alternativeSEs.push_back("STDERR");
_alternativeORs.push_back("OR");
_alternativeOR_95Ls.push_back("OR_95L");
_alternativeOR_95Us.push_back("OR_95U");
diff --git a/global.h b/global.h
index 049ef6b..6e2d95c 100755
--- a/global.h
+++ b/global.h
@@ -27,6 +27,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <fstream>
using namespace std;
+class filter
+{
+public:
+ string _columnName; // Column name for the filter check
+ string _equation; // >, <, >=, <=, !=, ==
+ double _value; // numerical value for filtering
+ filter(string columnName, string equation, double value); // constructor
+ bool valid(); //consistency check
+};
class global
{
public:
@@ -60,6 +69,7 @@ public:
int _studyCount;
vector <string> studyList;
vector <string> studyGenders;
+ vector <filter> filters;
double outputLambda;
double outputLambda_male;;
diff --git a/readFile.cpp b/readFile.cpp
index ed15d9f..0ff812f 100755
--- a/readFile.cpp
+++ b/readFile.cpp
@@ -26,14 +26,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "marker.h"
#include "study.h"
#include <math.h>
+#include <zlib.h>
#include <cctype> // std::toupper
#include "global.h"
#include "tools.h"
#include "cohort.h"
#include "problem.h"
+#define LENS 1000000
using namespace std;
+bool isFiltered(double value, string equation, double threshold)
+{
+ if (equation=="==" && value==threshold) return true;
+ if (equation=="!=" && value!=threshold) return true;
+ if (equation==">=" && value>=threshold) return true;
+ if (equation=="<=" && value<=threshold) return true;
+ if (equation==">" && value>threshold) return true;
+ if (equation=="<" && value<threshold) return true;
+ return false;
+}
bool
readFilelist(global & _g, ofstream & ERR, ofstream & LOG)
@@ -84,100 +96,214 @@ readFilelist(global & _g, ofstream & ERR, ofstream & LOG)
bool
getLambda(string name, double & directLambda, double & imputedLambda,
int & directCount, int & imputedCount, int columnBeta,
- int columnSE, int columnImputed, int columnCount, global & g)
+ int columnSE, int columnImputed, int columnCount, vector <int> filterColumns, global & g)
{
int currentLine =0;
int _imputedCount = 0;
int _directCount = 0;
vector <double> _imputedChiStat;
vector <double> _directChiStat;
- ifstream F (name.c_str());
- if (F.is_open())
- {
- while (! F.eof() )
- {
- string line;
- vector<string> tokens;
- getline (F,line);
- int n = Tokenize(string(line), tokens, " ");
- if (n)
- {
- if (currentLine>0) // skip headers
- {
- double beta=0;
- double se=0;
- if (g._binaryTrait)
- {
- double oratio = atof(tokens[columnBeta].c_str());
- double oratio_95l = atof(tokens[columnSE].c_str());
- if (oratio> 0 && oratio > oratio_95l && oratio_95l>0)
- {
- beta=log(oratio);
- se = ((log(oratio)-log(oratio_95l))/1.96);
- }
- }
- else
- {
- beta = atof(tokens[columnBeta].c_str());
- se = atof(tokens[columnSE].c_str());
- }
- int imputed = 0;
- if (columnImputed!=-9)
- {
- if (string(tokens[columnImputed])=="1" || string(tokens[columnImputed])=="0")
- {
- if (atoi(tokens[columnImputed].c_str())==1)imputed=1;
- }
+
+ if (name.substr(name.length()-2)=="gz")
+ {
+ gzFile F =gzopen(name.c_str(),"r");
+
+
+ char *buffer = new char[LENS];
+ while(0!=gzgets(F,buffer,LENS))
+ {
+ string line;
+ vector<string> tokens;
- }
- if (se>0)
- {
- if (imputed)
- {
- _imputedCount++;
- _imputedChiStat.push_back((beta*beta)/(se*se));
- }
- else
- {
- _directCount++;
- _directChiStat.push_back((beta*beta)/(se*se));
- }
- }
- }
- }
- if (int(currentLine/10000)==currentLine/10000.0) cout << "Finding lambda. Line: " << currentLine << "\r";
+ int n = Tokenize(buffer, tokens, " ");
+ if (n)
+ {
+ if (currentLine>0) // skip headers
+ {
+ bool filtered=0;
+ double beta=0;
+ double se=0;
+ if (g._binaryTrait)
+ {
+ double oratio = atof(tokens[columnBeta].c_str());
+ double oratio_95l = atof(tokens[columnSE].c_str());
+ if (oratio> 0 && oratio > oratio_95l && oratio_95l>0)
+ {
+ beta=log(oratio);
+ se = ((log(oratio)-log(oratio_95l))/1.96);
+ }
+ }
+ else
+ {
+ beta = atof(tokens[columnBeta].c_str());
+ se = atof(tokens[columnSE].c_str());
+ }
+ for (int j; j<filterColumns.size();j++)
+ {
+ if (filterColumns[j] != -1)
+ {
+ double valueX = atof(tokens[filterColumns[j]].c_str());
+ if (isFiltered(valueX, g.filters[j]._equation, g.filters[j]._value))
+ filtered=1;
+ }
+ }
+ int imputed = 0;
+ if (columnImputed!=-9)
+ {
+ if (string(tokens[columnImputed])=="1" || string(tokens[columnImputed])=="0")
+ {
+ if (atoi(tokens[columnImputed].c_str())==1)imputed=1;
+ }
+
+ }
+ if (se>0 && filtered==0)
+ {
+ if (imputed)
+ {
+ _imputedCount++;
+ _imputedChiStat.push_back((beta*beta)/(se*se));
+ }
+ else
+ {
+ _directCount++;
+ _directChiStat.push_back((beta*beta)/(se*se));
+ }
+ }
+ }
+ }
+ if (int(currentLine/10000)==currentLine/10000.0) cout << "Finding lambda. Line: " << currentLine << "\r";
+
+ currentLine++;
+ }
+
+
+ if (_imputedCount>0)
+ {
+ double _medianI = 0;
+ sortVec( _imputedChiStat, _imputedCount);
+ if (_imputedCount%2!=0) _medianI = _imputedChiStat[((_imputedCount+1)/2)-1];
+ else _medianI = (_imputedChiStat[_imputedCount/2 -1] + _imputedChiStat[_imputedCount/2])/2;
+ imputedLambda = _medianI/0.4549364; //median of chi-sq from R ... qchisq(0.5, df= 1)
+
+ }
+
+ if (_directCount>0)
+ {
+ double _medianD = 0;
+ sortVec( _directChiStat, _directCount);
+ if (_directCount%2!=0) _medianD = _directChiStat[((_directCount+1)/2)-1];
+ else _medianD = (_directChiStat[_directCount/2 -1] + _directChiStat[_directCount/2])/2;
+ directLambda = _medianD/0.4549364; //median of chi-sq from R ... qchisq(0.5, df= 1)
+ }
+ directCount = _directCount;
+ imputedCount = _imputedCount;
+
+ }
+
+ else
+ {
+ ifstream F (name.c_str());
+ if (F.is_open())
+ {
+ while (! F.eof() )
+ {
+ string line;
+ vector<string> tokens;
+ getline (F,line);
+ int n = Tokenize(string(line), tokens, " ");
+ if (n)
+ {
+ if (currentLine>0) // skip headers
+ {
+ double beta=0;
+ double se=0;
+ bool filtered = 0;
+ if (g._binaryTrait)
+ {
+ double oratio = atof(tokens[columnBeta].c_str());
+ double oratio_95l = atof(tokens[columnSE].c_str());
+ if (oratio> 0 && oratio > oratio_95l && oratio_95l>0)
+ {
+ beta=log(oratio);
+ se = ((log(oratio)-log(oratio_95l))/1.96);
+ }
+ }
+ else
+ {
+ beta = atof(tokens[columnBeta].c_str());
+ se = atof(tokens[columnSE].c_str());
+ }
+ int imputed = 0;
+ if (columnImputed!=-9)
+ {
+ if (string(tokens[columnImputed])=="1" || string(tokens[columnImputed])=="0")
+ {
+ if (atoi(tokens[columnImputed].c_str())==1)imputed=1;
+ }
- currentLine++;
- }
- }
+ }
+ for (int j; j<filterColumns.size();j++)
+ {
+ if (filterColumns[j] != -1)
+ {
+ double valueX = atof(tokens[filterColumns[j]].c_str());
+ if (isFiltered(valueX, g.filters[j]._equation, g.filters[j]._value))
+ filtered=1;
+ }
+ }
- if (_imputedCount>0)
- {
- double _medianI = 0;
- sortVec( _imputedChiStat, _imputedCount);
- if (_imputedCount%2!=0) _medianI = _imputedChiStat[((_imputedCount+1)/2)-1];
- else _medianI = (_imputedChiStat[_imputedCount/2 -1] + _imputedChiStat[_imputedCount/2])/2;
- imputedLambda = _medianI/0.4549364; //median of chi-sq from R ... qchisq(0.5, df= 1)
+ if (se>0 && filtered==0)
+ {
+ if (imputed)
+ {
+ _imputedCount++;
+ _imputedChiStat.push_back((beta*beta)/(se*se));
+ }
+ else
+ {
+ _directCount++;
+ _directChiStat.push_back((beta*beta)/(se*se));
+ }
+ }
+ }
+ }
+ if (int(currentLine/10000)==currentLine/10000.0) cout << "Finding lambda. Line: " << currentLine << "\r";
- }
+ currentLine++;
+ }
+ }
- if (_directCount>0)
- {
- double _medianD = 0;
- sortVec( _directChiStat, _directCount);
- if (_directCount%2!=0) _medianD = _directChiStat[((_directCount+1)/2)-1];
- else _medianD = (_directChiStat[_directCount/2 -1] + _directChiStat[_directCount/2])/2;
- directLambda = _medianD/0.4549364; //median of chi-sq from R ... qchisq(0.5, df= 1)
- }
- directCount = _directCount;
- imputedCount = _imputedCount;
+ if (_imputedCount>0)
+ {
+ double _medianI = 0;
+ sortVec( _imputedChiStat, _imputedCount);
+ if (_imputedCount%2!=0) _medianI = _imputedChiStat[((_imputedCount+1)/2)-1];
+ else _medianI = (_imputedChiStat[_imputedCount/2 -1] + _imputedChiStat[_imputedCount/2])/2;
+ imputedLambda = _medianI/0.4549364; //median of chi-sq from R ... qchisq(0.5, df= 1)
+
+ }
+
+ if (_directCount>0)
+ {
+ double _medianD = 0;
+ sortVec( _directChiStat, _directCount);
+ if (_directCount%2!=0) _medianD = _directChiStat[((_directCount+1)/2)-1];
+ else _medianD = (_directChiStat[_directCount/2 -1] + _directChiStat[_directCount/2])/2;
+ directLambda = _medianD/0.4549364; //median of chi-sq from R ... qchisq(0.5, df= 1)
+ }
+ directCount = _directCount;
+ imputedCount = _imputedCount;
+
+ }
+
+
return true;
}
bool
readMapFile(string markermap, vector <marker> & markerlist,
- map<string, int> & markerPosition,
+ map<string, int> & markerPosition,
ofstream & ERR, int & errNR, ofstream & LOG)
{
char sb [11];
@@ -231,6 +357,14 @@ readCohort(int fileNr, global & g,map<string, int> & markerPosition,vector <mark
int currentLine = 0;
int _countGoodMarkers=0;
int _countBadMarkers=0;
+
+ vector <int> _filterColumnNr;
+ vector <unsigned int> _markerFilteredBy;
+ for (int i = 0; i < g.filters.size(); i++)
+ {
+ _filterColumnNr.push_back(-1);
+ _markerFilteredBy.push_back(0);
+ }
char sb [11];
cohort thisCohort;
problem _markerProblems;
@@ -241,6 +375,370 @@ readCohort(int fileNr, global & g,map<string, int> & markerPosition,vector <mark
thisCohort._name=g.studyList[fileNr];
// cout << "Reading file: " << g.studyList[fileNr] << endl;
LOG << " [" << fileNr+1 << "]" << " Reading file: " << g.studyList[fileNr] << endl;
+
+ if (g.studyList[fileNr].substr(g.studyList[fileNr].length()-2)=="gz")
+ {
+ // ifstream F (G.inputGenFile.c_str());
+ gzFile F =gzopen(g.studyList[fileNr].c_str(),"r");
+
+ char *buffer = new char[LENS];
+ while(0!=gzgets(F,buffer,LENS))
+ {
+ string line;
+ vector<string> tokens;
+ int n = Tokenize(buffer, tokens, " ");
+ if (n)
+ {
+ if (currentLine==0) // get headers
+ {
+ for (int i=0; i<n; i++)
+ {
+ //markername
+ for (unsigned int j=0; j<g._alternativeMarkerNames.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeMarkerNames[j])==0){thisCohort._columnMarkerName=i;}
+ }
+ //effect allele
+ for (unsigned int j=0; j<g._alternativeEffectAlleles.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeEffectAlleles[j])==0)thisCohort._columnEffectAllele=i;
+ }
+ //other allele
+ for (unsigned int j=0; j<g._alternativeOtherAlleles.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeOtherAlleles[j])==0)thisCohort._columnOtherAllele=i;
+ }
+ //effect allele freq
+ for (unsigned int j=0; j<g._alternativeEffectAlleleFreqs.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeEffectAlleleFreqs[j])==0)thisCohort._columnEffectAlleleFreq=i;
+ }
+ //strand
+ for (unsigned int j=0; j<g._alternativeStrands.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeStrands[j])==0)thisCohort._columnStrand=i;
+ }
+ //n
+ for (unsigned int j=0; j<g._alternativeNs.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeNs[j])==0)thisCohort._columnN=i;
+ }
+ //beta
+ for (unsigned int j=0; j<g._alternativeBetas.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeBetas[j])==0)thisCohort._columnBeta=i;
+ }
+ //se
+ for (unsigned int j=0; j<g._alternativeSEs.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeSEs[j])==0)thisCohort._columnSE=i;
+ }
+ //or
+ for (unsigned int j=0; j<g._alternativeORs.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeORs[j])==0)thisCohort._columnOR=i;
+ }
+ //or 95L
+ for (unsigned int j=0; j<g._alternativeOR_95Ls.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeOR_95Ls[j])==0)thisCohort._columnOR_95L=i;
+ }
+ //or 95U
+ for (unsigned int j=0; j<g._alternativeOR_95Us.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeOR_95Us[j])==0)thisCohort._columnOR_95U=i;
+ }
+ //imputed
+ for (unsigned int j=0; j<g._alternativeImputeds.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(g._alternativeImputeds[j])==0)thisCohort._columnImputed=i;
+ }
+ //filters
+ for (unsigned int j=0; j<g.filters.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(uc(g.filters[j]._columnName))==0)_filterColumnNr[j]=i;
+ }
+
+ }
+ thisCohort._columnCount = n;
+ if (g._noAlleles)
+ {
+ thisCohort._columnEffectAllele=-9;
+ thisCohort._columnOtherAllele=-9;
+ }
+ }//header lines
+ else //data lines
+ {
+ bool currentMarkerIsOK=true;
+ if (thisCohort._columnCount!=n) // lets check if current row has right number of tokens
+ {
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " row " << currentLine << " has different number of columns compared to header line ( " << n << ", " << thisCohort._columnCount << ") Skipping line!(" << sb << ")" << endl;
+ ERR << sb << " Error: "<< g.studyList[fileNr] << " row " << currentLine << " has different number of columns compared to header line ( " << n << ", " << thisCohort._columnCount << ") Skipping line!" << endl;
+
+ }
+ if (currentLine==1) //lets check if all necessary columns are present
+ {
+ if (thisCohort._columnMarkerName==-9 ||
+ (thisCohort._columnBeta==-9 && g._binaryTrait==false) ||
+ (thisCohort._columnSE==-9 && g._binaryTrait==false) ||
+ (thisCohort._columnOR==-9 && g._binaryTrait==true) ||
+ (thisCohort._columnOR_95L==-9 && g._binaryTrait==true) ||
+ (thisCohort._columnEffectAllele==-9 && g._noAlleles==false) ||
+ (thisCohort._columnOtherAllele==-9 && g._noAlleles==false))
+ {
+ cout << "\tmissing mandatory column! Skipping file." << endl;
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " misses some of the mandatory columns. Skipping file!(" << sb << ")" << endl;
+ ERR << sb << " " << g.studyList[fileNr] << " misses some of the mandatory columns. Skipping file!" << endl;
+ if (thisCohort._columnMarkerName==-9)ERR << sb << " Markername column is missing" << endl;
+ if (thisCohort._columnBeta==-9 && g._binaryTrait==false)ERR << sb << " Beta column is missing" << endl;
+ if (thisCohort._columnSE==-9 && g._binaryTrait==false)ERR << sb << " SE column is missing" << endl;
+ if (thisCohort._columnOR==-9 && g._binaryTrait==true)ERR << sb << " OR column is missing" << endl;
+ if (thisCohort._columnOR_95L==-9 && g._binaryTrait==true)ERR << sb << " OR _95L column is missing" << endl;
+ if (thisCohort._columnOR_95U==-9 && g._binaryTrait==true)ERR << sb << " OR _95U column is missing" << endl;
+ if (thisCohort._columnEffectAllele==-9 && g._noAlleles==false)ERR << sb << " Effect allele column is missing. If all effects are according to the same allele, then please use --no_alleles option" << endl;
+ if (thisCohort._columnOtherAllele==-9 && g._noAlleles==false)ERR << sb << " Other allele column is missing. If all effects are according to the same allele, then please use --no_alleles option" << endl;
+ return 0;
+ }
+ if (thisCohort._columnStrand==-9)
+ {
+ cout << "Strand column missing! Expecting always positive strand."<< endl;
+ LOG << "Strand column missing! Expecting always positive strand."<< endl;
+ }
+ for (int j=0;j<_filterColumnNr.size();j++)
+ {
+ if (_filterColumnNr[j]!=-1) cout << "Filtering by " << g.filters[j]._columnName << " column" << endl;
+ }
+ if (g._genomicControl) //calculating lambdas for the file
+ {
+ bool lambdaSuccess;
+ if (g._binaryTrait==false)lambdaSuccess = getLambda(thisCohort._name, thisCohort._directLambda,
+ thisCohort._imputedLambda, thisCohort._directCount, thisCohort._imputedCount, thisCohort._columnBeta, thisCohort._columnSE, thisCohort._columnImputed,
+ thisCohort._columnCount, _filterColumnNr, g);
+ else lambdaSuccess = getLambda(thisCohort._name, thisCohort._directLambda, thisCohort._imputedLambda,
+ thisCohort._directCount, thisCohort._imputedCount,thisCohort._columnOR, thisCohort._columnOR_95L, thisCohort._columnImputed,
+ thisCohort._columnCount, _filterColumnNr, g);
+
+ if (lambdaSuccess)
+ {
+ cout << "GC lambda genotyped: " << thisCohort._directLambda << " (" << thisCohort._directCount<< ") imputed: " << thisCohort._imputedLambda << " (" << thisCohort._imputedCount << ")"<< endl;
+
+ LAMBDA_FILE << thisCohort._name;
+ char sb [1024]; //temporary char buffer
+ std::string x=""; //starting to collect all information into this string
+
+ x.append("\t");
+ sprintf(sb, "%.4f" , thisCohort._directLambda);
+ x.append(sb);
+
+ x.append("\t");
+ sprintf(sb, "%d" , thisCohort._directCount);
+ x.append(sb);
+
+ x.append("\t");
+ sprintf(sb, "%.4f" , thisCohort._imputedLambda);
+ x.append(sb);
+
+ x.append("\t");
+ sprintf(sb, "%d" , thisCohort._imputedCount);
+ x.append(sb);
+ LAMBDA_FILE << x <<endl;
+
+ }
+ }
+
+ }
+ //everything seems to be OK..lets read marker data
+ //marker name
+ string myMarker = string(tokens[thisCohort._columnMarkerName]);
+ _markerProblems.markersAll++;
+ //alleles
+ string myEffectAllele, myOtherAllele;
+ if (thisCohort._columnEffectAllele!=-9 && thisCohort._columnOtherAllele!=-9)
+ {
+ myEffectAllele = uc(string(tokens[thisCohort._columnEffectAllele]));
+ myOtherAllele = uc(string(tokens[thisCohort._columnOtherAllele]));
+ if (!checkAlleles(myEffectAllele, myOtherAllele) && !g._indel) //problem with alleles - reporting
+ {
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " has problem with alleles for marker " << myMarker << "!(" << sb << ")" << endl;
+ ERR << sb << " " << g.studyList[fileNr] << " has problem with alleles for marker " << myMarker << "!"<< endl;
+ ERR << sb << " " << g.studyList[fileNr] << " Given alleles: "<< string(tokens[thisCohort._columnEffectAllele]) << "/" << string(tokens[thisCohort._columnOtherAllele]) << ". Skipping marker!" << endl;
+ currentMarkerIsOK=false;
+ _markerProblems.wrongAlleles++;
+ }
+ }
+ else {myEffectAllele = "N";myOtherAllele = "N";}
+ //strand
+ bool myStrand = true;
+ if (thisCohort._columnStrand!=-9){if (string(tokens[thisCohort._columnStrand]).compare("-")==0){myStrand = false;}}
+ if (!myStrand && !g._indel)
+ {
+ myEffectAllele = flip(myEffectAllele);
+ myOtherAllele = flip(myOtherAllele);
+ }
+ else if (!myStrand & checkAlleles(myEffectAllele, myOtherAllele))
+ {
+ myEffectAllele = flip(myEffectAllele);
+ myOtherAllele = flip(myOtherAllele);
+ }
+ //eaf
+ double myEaf=-9;
+ if (thisCohort._columnEffectAlleleFreq!=-9)
+ {
+ if (atof(tokens[thisCohort._columnEffectAlleleFreq].c_str())>0 &&
+ atof(tokens[thisCohort._columnEffectAlleleFreq].c_str())<1)
+ {
+ myEaf = atof(tokens[thisCohort._columnEffectAlleleFreq].c_str());
+ }
+ else //eaf out of range - report it
+ {
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " has problem with effect allele frequency for marker " << myMarker << "!(" << sb << ")" << endl;
+ ERR << sb << " " << g.studyList[fileNr] << " has problem with effect allele frequency " << myMarker << "!"<< endl;
+ ERR << sb << " " << g.studyList[fileNr] << " Given value: "<< string(tokens[thisCohort._columnEffectAlleleFreq]) << ". Value not used!" << endl;
+ }
+ }
+ //effect+se (or+ci)
+ double myBeta=-9;
+ double mySE=-9;
+ if (g._binaryTrait)
+ {
+ double oratio = atof(tokens[thisCohort._columnOR].c_str());
+ double oratio_95l = atof(tokens[thisCohort._columnOR_95L].c_str());
+ if (oratio>0 && oratio_95l< oratio && oratio_95l>0)
+ {
+ myBeta=log(oratio);
+ mySE = ((log(oratio)-log(oratio_95l))/1.96);
+ }
+ else // problem with or value
+ {
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " has problem with odds ratio and its 95_L for marker " << myMarker << "!(" << sb << ")" << endl;
+ ERR << sb << " " << g.studyList[fileNr] << " has problem with odds ratio or its CI of " << myMarker << "!"<< endl;
+ ERR << sb << " " << g.studyList[fileNr] << " Given values: OR="<< string(tokens[thisCohort._columnOR]) << " CI_95L="<<string(tokens[thisCohort._columnOR_95L]) << ". Marker not used!" << endl;
+ _markerProblems.problemEffect++;
+ currentMarkerIsOK=false;
+
+ }
+ }
+ else
+ {
+ myBeta = atof(tokens[thisCohort._columnBeta].c_str());
+ mySE = atof(tokens[thisCohort._columnSE].c_str());
+ if (mySE<=0)
+ {
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " has problem with beta and se for marker " << myMarker << "!(" << sb << ")" << endl;
+ ERR << sb << " " << g.studyList[fileNr] << " has problem with odds ratio or its CI of " << myMarker << "!"<< endl;
+ ERR << sb << " " << g.studyList[fileNr] << " Given values: BETA="<< string(tokens[thisCohort._columnBeta]) << " SE="<<string(tokens[thisCohort._columnSE]) << ". Marker not used!" << endl;
+ _markerProblems.problemEffect++;
+ currentMarkerIsOK=false;
+ }
+ }
+ //imputed
+ bool myImputed = false;
+ if (thisCohort._columnImputed!=-9)
+ {
+ if (string(tokens[thisCohort._columnImputed])=="1" || string(tokens[thisCohort._columnImputed])=="0")
+ {
+ if (atoi(tokens[thisCohort._columnImputed].c_str())==1){myImputed = true;}
+ }
+ else
+ {
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " has problem with imputation status for marker " << myMarker << "!(" << sb << ")" << endl;
+ ERR << sb << " " << g.studyList[fileNr] << " has problem with imputation status for " << myMarker << "!"<< endl;
+ ERR << sb << " " << g.studyList[fileNr] << " Given values: IMPUTED="<< string(tokens[thisCohort._columnBeta]) << "Value must be 1-imputed, 0-directly genotyped." << endl;
+ }
+ }
+ //n
+ int myN = -9;
+ if (thisCohort._columnN!=-9)
+ {
+ if (atoi(tokens[thisCohort._columnN].c_str())>0){myN = atoi(tokens[thisCohort._columnN].c_str());}
+ else
+ {
+ sprintf(sb, "E%.9d" , g._errNR); g._errNR++;
+ LOG << g.studyList[fileNr] << " has problem with sample size for marker " << myMarker << "!(" << sb << ")" << endl;
+ ERR << sb << " " << g.studyList[fileNr] << " has problem with sample size for " << myMarker << "!"<< endl;
+ ERR << sb << " " << g.studyList[fileNr] << " Given values: N="<< string(tokens[thisCohort._columnN]) << "Value must be larger than zero." << endl;
+ }
+ }
+ for (int j=0; j<_filterColumnNr.size();j++)
+ {
+ if (_filterColumnNr[j] != -1)
+ {
+ double valueX = atof(tokens[_filterColumnNr[j]].c_str());
+ if (isFiltered(valueX, g.filters[j]._equation, g.filters[j]._value))
+ {
+ _markerFilteredBy[j]++;
+ currentMarkerIsOK=0;
+ }
+ }
+ }
+ //all data read - will add marker to marker list
+ if (currentMarkerIsOK)
+ {
+ if (_markernames[myMarker]==0)
+ {
+ _markernames[myMarker]=_markerNamesCount;
+ _markerNamesCount++;
+
+ if (markerPosition[myMarker]==0)
+ {
+ markerlist.push_back(marker(myMarker, g._studyCount));
+ if (markerlist[g._markerCount-1].addCohort(fileNr, g,
+ thisCohort._directLambda, thisCohort._imputedLambda,
+ myMarker,myEffectAllele, myOtherAllele, myEaf,
+ myStrand, myBeta, mySE, myImputed, myN, ERR, LOG, _markerProblems)==0)
+ // markerlist[markerCount-1].addStudy(currentimputed,
+ // currentstrand, currenteffectallele, currentnoneffectallele, currenteffectallelefreq,
+ // currentbeta, currentse, i, studies,n, ERR, errNR, LOG) ;
+ {
+ markerPosition[myMarker]=g._markerCount;
+ g._markerCount++;
+ }
+ }
+ else //marker is already existing in database
+ {
+ int x = markerPosition[myMarker]; //this is the line number of current marker
+ markerlist[x-1].addCohort(fileNr, g,
+ thisCohort._directLambda, thisCohort._imputedLambda,
+ myMarker,myEffectAllele, myOtherAllele, myEaf,
+ myStrand, myBeta, mySE, myImputed, myN, ERR, LOG, _markerProblems);
+
+ // markerlist[x-1].addStudy(currentimputed,
+ // currentstrand, currenteffectallele, currentnoneffectallele, currenteffectallelefreq,
+ // currentbeta, currentse, i, studies,n, ERR, errNR, LOG);
+
+ }
+ }
+ else
+ {
+ _markerProblems.problemMulti++;
+ }
+ _countGoodMarkers++;
+ }
+ else
+ {
+ _countBadMarkers++;
+ }
+
+ } //data lines end here
+
+ }
+ if (int(currentLine/10000)==currentLine/10000.0) cout << "Line: " << currentLine << "\r";
+ currentLine++;
+ }
+ gzclose(F);
+
+ }
+ else
+ {
+
+
ifstream F (g.studyList[fileNr].c_str());
if (F.is_open())
{
@@ -316,6 +814,11 @@ readCohort(int fileNr, global & g,map<string, int> & markerPosition,vector <mark
{
if (uc(string(tokens[i])).compare(g._alternativeImputeds[j])==0)thisCohort._columnImputed=i;
}
+ //filters
+ for (unsigned int j=0; j<g.filters.size(); j++)
+ {
+ if (uc(string(tokens[i])).compare(uc(g.filters[j]._columnName))==0)_filterColumnNr[j]=i;
+ }
}
thisCohort._columnCount = n;
@@ -364,16 +867,20 @@ readCohort(int fileNr, global & g,map<string, int> & markerPosition,vector <mark
cout << "Strand column missing! Expecting always positive strand."<< endl;
LOG << "Strand column missing! Expecting always positive strand."<< endl;
}
+ for (int j=0;j<_filterColumnNr.size();j++)
+ {
+ if (_filterColumnNr[j]!=-1) cout << "Fltering by " << g.filters[j]._columnName << " column" << endl;
+ }
if (g._genomicControl) //calculating lambdas for the file
{
bool lambdaSuccess;
if (g._binaryTrait==false)lambdaSuccess = getLambda(thisCohort._name, thisCohort._directLambda,
thisCohort._imputedLambda, thisCohort._directCount, thisCohort._imputedCount,
thisCohort._columnBeta, thisCohort._columnSE, thisCohort._columnImputed,
- thisCohort._columnCount, g);
+ thisCohort._columnCount,_filterColumnNr, g);
else lambdaSuccess = getLambda(thisCohort._name, thisCohort._directLambda, thisCohort._imputedLambda,
thisCohort._directCount, thisCohort._imputedCount,thisCohort._columnOR, thisCohort._columnOR_95L, thisCohort._columnImputed,
- thisCohort._columnCount, g);
+ thisCohort._columnCount,_filterColumnNr, g);
if (lambdaSuccess)
{
@@ -516,6 +1023,18 @@ readCohort(int fileNr, global & g,map<string, int> & markerPosition,vector <mark
ERR << sb << " " << g.studyList[fileNr] << " Given values: N="<< string(tokens[thisCohort._columnN]) << "Value must be larger than zero." << endl;
}
}
+ for (int j=0; j<_filterColumnNr.size();j++)
+ {
+ if (_filterColumnNr[j] != -1)
+ {
+ double valueX = atof(tokens[_filterColumnNr[j]].c_str());
+ if (isFiltered(valueX, g.filters[j]._equation, g.filters[j]._value))
+ {
+ _markerFilteredBy[j]++;
+ currentMarkerIsOK=0;
+ }
+ }
+ }
//all data read - will add marker to marker list
if (currentMarkerIsOK)
{
@@ -577,9 +1096,18 @@ readCohort(int fileNr, global & g,map<string, int> & markerPosition,vector <mark
cerr << "Cannot open file. Exit program!"<< endl;
exit(1);
}
- cout << "Marker count: " << _markerProblems.markersAll << " Markers passing sanity check: " << _markerProblems.markersOK << endl;
+ }
+ cout << "Marker count: " << _markerProblems.markersAll << " Markers passing sanity check (and filters): " << _markerProblems.markersOK << endl;
cout << "Strand problems: " << _markerProblems.problemStrand << " Wrong alleles: " << _markerProblems.wrongAlleles << endl;
cout << "Effect problems: " << _markerProblems.problemEffect << " Multiple occurances: " << _markerProblems.problemMulti << endl;
+ for (int j=0; j<_filterColumnNr.size();j++)
+ {
+ if (_filterColumnNr[j] != -1)
+ {
+ cout << "FILTER " << g.filters[j]._columnName << " " << g.filters[j]._equation << " " << g.filters[j]._value << " :" << _markerFilteredBy[j] << endl;
+ }
+ }
+
return true;
}
diff --git a/readFile.h b/readFile.h
index fcdfa13..0d79037 100755
--- a/readFile.h
+++ b/readFile.h
@@ -38,7 +38,7 @@ using namespace std;
bool readFilelist(global & _g, ofstream & ERR, ofstream & LOG);
bool
-getLambda(string name, double & directLambda, double & imputedLambda, int & directCount, int & imputedCount, int columnBeta, int columnSE, int columnImputed, int columnCount, global & g);
+getLambda(string name, double & directLambda, double & imputedLambda, int & directCount, int & imputedCount, int columnBeta, int columnSE, int columnImputed, int columnCount, vector < int> filterColumns, global & g);
bool
readCohort(int fileNr, global & g,map<string, int> & markerPosition,vector <marker> & markerlist, ofstream & ERR, ofstream & LOG, ofstream & LAMBDA_FILE);
bool
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gwama.git
More information about the debian-med-commit
mailing list