[med-svn] [Git][med-team/fastqtl][upstream] 2 commits: New upstream version 2.184v7
Andreas Tille (@tille)
gitlab at salsa.debian.org
Wed Oct 5 10:46:59 BST 2022
Andreas Tille pushed to branch upstream at Debian Med / fastqtl
Commits:
1fb0a51c by Andreas Tille at 2022-10-05T10:32:03+02:00
New upstream version 2.184v7
- - - - -
6c34ffd0 by Andreas Tille at 2022-10-05T11:10:55+02:00
New upstream version 2.184v7+dfsg
- - - - -
18 changed files:
- Makefile
- + R/calculateSignificanceFastQTL.R
- − README
- + README.md
- + python/annotate_outputs.py
- + python/merge_chunks.py
- + python/run_FastQTL_threaded.py
- + python/run_chunk.py
- + python/sort_index_pairs.py
- − script/calulateNominalPvalueThresholds.R
- src/analysisNominal.cpp
- + src/analysisNominalBest.cpp
- src/analysisPermutation.cpp
- src/data.h
- src/fastQTL.cpp
- src/management.cpp
- src/readCovariates.cpp
- src/readGenotypes.cpp
Changes:
=====================================
Makefile
=====================================
@@ -1,8 +1,8 @@
#PLEASE SPECIFY THE R path here where you built the R math library standalone
-RMATH=~/Software/R-3.1.3/src
+RMATH=../../R-3.2.4/src
#compiler
-CXX=g++
+CXX=g++ -std=c++11
#internal paths
VPATH=$(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done)
@@ -27,7 +27,7 @@ INC_MACX=-I/usr/local/include/
#libraries
#LIB_BASE=-lm -lboost_iostreams -lboost_program_options -lz -lgsl -lblas
-LIB_BASE=-lm -lz -lboost_iostreams -lboost_program_options -lgsl -lblas
+LIB_BASE=-lm -lz -lbz2 -lboost_iostreams -lboost_program_options -lgslcblas -lgsl -lblas
LIB_MATH=$(RMATH)/nmath/standalone/libRmath.a
LIB_TABX=$(PATH_TABX)/libtabix.a
LIB_MACX=-L/usr/local/lib/
=====================================
R/calculateSignificanceFastQTL.R
=====================================
@@ -0,0 +1,54 @@
+# Author: Francois Aguet
+# Description: This is a post-processing function for FastQTL. It calculates
+# q-values (Storey) and p-value thresholds for all genes in the permutation results file.
+
+suppressMessages(library(qvalue))
+suppressMessages(library(tools))
+suppressMessages(library(argparser))
+
+# parse inputs
+p <- arg_parser("Annotates FastQTL permutation output and runs qvalue")
+p <- add_argument(p, "fastqtlOutput", help="")
+p <- add_argument(p, "fdr", type="numeric", help="")
+p <- add_argument(p, "outfile", help="")
+p <- add_argument(p, "--lambda", type="numeric", help="", default=NULL)
+args <- parse_args(p)
+
+cat("Processing FastQTL output (", args$fastqtlOutput, "), with FDR=", args$fdr, "\n", sep="")
+
+# input files have no headers
+D <- read.table(args$fastqtlOutput, header=FALSE, stringsAsFactors=FALSE)
+if (dim(D)[2]==17) {
+ colnames(D) <- c('gene_id', 'num_var', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df', 'variant_id', 'tss_distance',
+ 'minor_allele_samples', 'minor_allele_count', 'maf', 'ref_factor',
+ 'pval_nominal', 'slope', 'slope_se', 'pval_perm', 'pval_beta')
+} else {
+ stop("FastQTL output in unrecognized format (mismatched number of columns).")
+}
+
+# remove genes w/o variants
+nanrows <- is.na(D[, 'pval_beta'])
+D <- D[!nanrows, ]
+cat(" * Number of genes tested: ", nrow(D), " (excluding ", sum(nanrows), " genes w/o variants)\n", sep="")
+cat(" * Correlation between Beta-approximated and empirical p-values: ", round(cor(D[, 'pval_perm'], D[, 'pval_beta']), 4), "\n", sep="")
+
+# calculate q-values
+if (is.null(args$lambda) || is.na(args$lambda)) {
+ Q <- qvalue(D[, 'pval_beta'])
+} else {
+ cat(" * Calculating q-values with lambda = ", args$lambda, "\n", sep="")
+ Q <- qvalue(D[, 'pval_beta'], lambda=args$lambda)
+}
+
+D$qval <- signif(Q$qvalues, 6)
+cat(" * Proportion of significant phenotypes (1-pi0): " , round((1 - Q$pi0), 2), "\n", sep="")
+cat(" * eGenes @ FDR ", args$fdr, ": ", sum(D[, 'qval']<args$fdr), "\n", sep="")
+
+# determine global min(p) significance threshold and calculate nominal p-value threshold for each gene
+ub <- sort(D[D$qval > args$fdr, 'pval_beta'])[1] # smallest p-value above FDR
+lb <- -sort(-D[D$qval <= args$fdr, 'pval_beta'])[1] # largest p-value below FDR
+pthreshold <- (lb+ub)/2
+cat(" * min p-value threshold @ FDR ", args$fdr, ": ", pthreshold, "\n", sep="")
+D[, 'pval_nominal_threshold'] <- signif(qbeta(pthreshold, D[, 'beta_shape1'], D[, 'beta_shape2'], ncp=0, lower.tail=TRUE, log.p=FALSE), 6)
+
+write.table(D, gzfile(args$outfile), quote=FALSE, row.names=FALSE, col.names=TRUE, sep="\t")
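
For reference, the threshold logic above can be restated compactly. The following is a minimal Python sketch (not part of this release), assuming a pandas DataFrame `d` with the columns produced by the script, and using `scipy.stats.beta.ppf` as the equivalent of R's `qbeta`:

```python
# Illustrative sketch only: mirrors the p-value threshold computation in
# calculateSignificanceFastQTL.R; column names follow the script above.
import pandas as pd
from scipy import stats

def nominal_pval_thresholds(d: pd.DataFrame, fdr: float) -> pd.Series:
    # smallest beta-approximated p-value among genes that miss the FDR cutoff ...
    ub = d.loc[d['qval'] > fdr, 'pval_beta'].min()
    # ... and largest among genes that pass it; the midpoint is the global cutoff
    lb = d.loc[d['qval'] <= fdr, 'pval_beta'].max()
    pthreshold = (lb + ub) / 2.0
    # per-gene nominal threshold via the inverse CDF of each gene's fitted Beta
    # distribution (scipy's ppf corresponds to R's qbeta)
    return pd.Series(stats.beta.ppf(pthreshold, d['beta_shape1'], d['beta_shape2']),
                     index=d.index)
```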
=====================================
README deleted
=====================================
@@ -1 +0,0 @@
-Documentation can be found here: http://fastqtl.sourceforge.net/
=====================================
README.md
=====================================
@@ -0,0 +1,22 @@
+## FastQTL
+
+This repository contains a modified version of the [FastQTL](http://fastqtl.sourceforge.net/) QTL mapping software, with the following enhancements:
+
+1. Options for filtering by minor allele frequency and minor allele sample count
+2. Python wrapper for multi-threaded execution
+3. Calculation of q-values (Storey) for FDR estimation (requires R)
+4. Minor allele information reported in output
+
+For documentation and the original version, see: http://fastqtl.sourceforge.net/
+
+#### Running multi-threaded analyses
+
+Nominal pass:
+```bash
+run_FastQTL_threaded.py ${genotypes}.vcf.gz ${phenotypes}.bed.gz ${prefix} --covariates ${covariates}.txt.gz --window 1e6 --ma_sample_threshold 10 --maf_threshold 0.01 --chunks 100 --threads 10
+```
+Permutation pass:
+```bash
+run_FastQTL_threaded.py ${genotypes}.vcf.gz ${phenotypes}.bed.gz ${prefix} --covariates ${covariates}.txt.gz --permute 1000 10000 --window 1e6 --ma_sample_threshold 10 --maf_threshold 0.01 --chunks 100 --threads 10
+```
+With these filters, only genotypes with minor allele frequency ≥ 0.01 and with at least 10 samples carrying the minor allele are included.
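
The following is an illustrative Python sketch of how these two filters relate to per-variant genotype dosages; the actual filtering is implemented while parsing genotypes in `src/readGenotypes.cpp`, and the helper below is hypothetical:

```python
# Hypothetical helper, for illustration only: emulate --maf_threshold and
# --ma_sample_threshold on a vector of ALT-allele dosages (0/1/2) for one variant.
import numpy as np

def passes_filters(dosages, maf_threshold=0.01, ma_sample_threshold=10):
    dosages = np.asarray(dosages, dtype=float)
    af = dosages.sum() / (2 * len(dosages))   # ALT allele frequency
    maf = min(af, 1.0 - af)                   # minor allele frequency
    if af <= 0.5:                             # minor allele is ALT
        ma_samples = int(np.count_nonzero(dosages > 0))
    else:                                     # minor allele is REF
        ma_samples = int(np.count_nonzero(dosages < 2))
    return maf >= maf_threshold and ma_samples >= ma_sample_threshold
```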
=====================================
python/annotate_outputs.py
=====================================
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# Author: Francois Aguet
+
+import argparse
+import numpy as np
+import pandas as pd
+import os
+import gzip
+from datetime import datetime
+import subprocess
+import io
+
+parser = argparse.ArgumentParser(description='Filter significant SNP-gene pairs from FastQTL results using FDR cutoff')
+parser.add_argument('permutation_results', help='FastQTL output')
+parser.add_argument('fdr', type=np.double, help='False discovery rate (e.g., 0.05)')
+parser.add_argument('annotation_gtf', help='Annotation in GTF format')
+parser.add_argument('--snp_lookup', default='', help='Tab-delimited file with columns: chr, variant_pos, variant_id, ref, alt, num_alt_per_site, rs_id_dbSNP...')
+parser.add_argument('--nominal_results', default='', help='FastQTL output from nominal pass')
+parser.add_argument('--nominal_results_unnormalized', nargs=2, default='', help='FastQTL output file (nominal pass), and units')
+parser.add_argument('-o', '--output_dir', default='.', help='Output directory')
+args = parser.parse_args()
+
+#------------------------------------------------------------------------------
+# 1. eGenes (permutation output): add gene and variant information
+#------------------------------------------------------------------------------
+gene_dict = {}
+print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Parsing GTF', flush=True)
+# add: gene_name, gene_chr, gene_start, gene_end, strand
+with open(args.annotation_gtf) as gtf:
+ for row in gtf:
+ row = row.strip().split('\t')
+ if row[0][0]=='#' or row[2]!='gene': continue
+ # get gene_id and gene_name from attributes
+ attr = dict([i.split() for i in row[8].replace('"','').split(';') if i!=''])
+ gene_dict[attr['gene_id']] = [attr['gene_name'], row[0], row[3], row[4], row[6]]
+
+print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Annotating permutation results (eGenes)', flush=True)
+gene_df = pd.read_csv(args.permutation_results, sep='\t', index_col=0)
+gene_info = pd.DataFrame(data=[gene_dict[i] for i in gene_df.index], columns=['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand'], index=gene_df.index)
+gene_df = pd.concat([gene_info, gene_df], axis=1)
+assert np.all(gene_df.index==gene_info.index)
+
+col_order = ['gene_name', 'gene_chr', 'gene_start', 'gene_end', 'strand',
+ 'num_var', 'beta_shape1', 'beta_shape2', 'true_df', 'pval_true_df', 'variant_id', 'tss_distance']
+if args.snp_lookup:
+ print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Adding variant annotations from lookup table', flush=True)
+ # intersect lookup table with variant_ids (col 7 in permutation_results; col 3 in snp_lookup)
+ cmd = "awk 'NR==FNR {v[$7]; next} $3 in v' <(zcat "+args.permutation_results+") <(zcat "+args.snp_lookup+")"
+ s = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
+ snp_lookup_df = pd.read_csv(io.StringIO(s.decode()), index_col=2, sep='\t',
+ dtype={'chr':str, 'variant_pos':np.int64, 'variant_id':str, 'ref':str, 'alt':str, 'num_alt_per_site':np.int32})
+ gene_df = gene_df.join(snp_lookup_df, on='variant_id') # add variant information
+ col_order += list(snp_lookup_df.columns)
+col_order += ['minor_allele_samples', 'minor_allele_count', 'maf', 'ref_factor',
+ 'pval_nominal', 'slope', 'slope_se', 'pval_perm', 'pval_beta', 'qval', 'pval_nominal_threshold']
+gene_df = gene_df[col_order]
+
+outname = os.path.join(args.output_dir, os.path.split(args.permutation_results)[1].split('.txt.gz')[0]+'.annotated.txt.gz')
+with gzip.open(outname, 'wt') as f:
+ gene_df.to_csv(f, sep='\t', float_format='%.6g')
+
+#------------------------------------------------------------------------------
+# 2. variant-gene pairs: output new file with all significant pairs
+#------------------------------------------------------------------------------
+if args.nominal_results:
+ print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Filtering significant variant-gene pairs', flush=True)
+
+ # eGenes (apply FDR threshold)
+ egene_df = gene_df.loc[gene_df['qval']<=args.fdr, ['pval_nominal_threshold', 'pval_nominal', 'pval_beta']].copy()
+ egene_df.rename(columns={'pval_nominal': 'min_pval_nominal'}, inplace=True)
+ egene_ids = set(egene_df.index)
+ threshold_dict = egene_df['pval_nominal_threshold'].to_dict()
+
+ # process by chunks to reduce memory usage
+ signif_df = []
+ mask = []
+ for i,chunk in enumerate(pd.read_csv(args.nominal_results, sep='\t', iterator=True, chunksize=1000000, index_col=1,
+ dtype={'gene_id':str, 'variant_id':str, 'tss_distance':np.int32,
+ 'ma_samples':np.int32, 'ma_count':np.int32, 'maf':np.float32,
+ 'pval_nominal':np.float64, 'slope':np.float32, 'slope_se':np.float32})):
+ chunk = chunk[chunk['gene_id'].isin(egene_ids)]
+ m = chunk['pval_nominal']<chunk['gene_id'].apply(lambda x: threshold_dict[x])
+ signif_df.append(chunk[m])
+ mask.append(m)
+ print('Chunks processed: {0:d}'.format(i+1), end='\r', flush=True)
+ signif_df = pd.concat(signif_df, axis=0)
+
+ if args.nominal_results_unnormalized:
+ signif_raw_df = []
+ print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Filtering significant variant-gene pairs (unnormalized)', flush=True)
+ for i,chunk in enumerate(pd.read_csv(args.nominal_results_unnormalized[0], sep='\t', iterator=True, chunksize=1000000,
+ usecols=['gene_id', 'variant_id', 'slope', 'slope_se'], index_col=1, dtype={'gene_id':str, 'variant_id':str, 'slope':np.float32, 'slope_se':np.float32})):
+ chunk = chunk[chunk['gene_id'].isin(egene_ids)]
+ signif_raw_df.append(chunk[mask[i]])
+ print('Chunks processed: {0:d}'.format(i+1), end='\r', flush=True)
+ signif_raw_df = pd.concat(signif_raw_df, axis=0)
+ signif_raw_df.rename(columns={'slope':'slope_'+args.nominal_results_unnormalized[1], 'slope_se':'slope_'+args.nominal_results_unnormalized[1]+'_se'}, inplace=True)
+ assert np.all(signif_df.index==signif_raw_df.index)
+ signif_df = pd.concat([signif_df, signif_raw_df[signif_raw_df.columns[1:]]], axis=1)
+
+ signif_df = signif_df.merge(egene_df, left_on='gene_id', right_index=True)
+
+ outname = os.path.join(args.output_dir, os.path.split(args.nominal_results)[1].split('.allpairs.txt.gz')[0]+'.signifpairs.txt.gz')
+ with gzip.open(outname, 'wt') as f:
+ signif_df.to_csv(f, sep='\t', float_format='%.6g')
+
+print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Completed annotation', flush=True)
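
The core of the pair-filtering step above can be summarized as follows; this is an illustrative restatement, not a replacement for the script (which also carries the unnormalized slopes and eGene columns through):

```python
# Illustrative sketch: keep variant-gene pairs whose nominal p-value falls below
# the per-gene threshold derived from the permutation pass (threshold_dict maps
# gene_id -> pval_nominal_threshold).
import pandas as pd

def filter_significant_pairs(allpairs_path, threshold_dict, chunksize=1000000):
    kept = []
    for chunk in pd.read_csv(allpairs_path, sep='\t', chunksize=chunksize):
        chunk = chunk[chunk['gene_id'].isin(threshold_dict.keys())]
        thresholds = chunk['gene_id'].map(threshold_dict)
        kept.append(chunk[chunk['pval_nominal'] < thresholds])
    return pd.concat(kept, axis=0)
```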
=====================================
python/merge_chunks.py
=====================================
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# Author: Francois Aguet
+import argparse
+import os
+import numpy as np
+import subprocess
+import gzip
+import contextlib
+from datetime import datetime
+
+@contextlib.contextmanager
+def cd(cd_path):
+ saved_path = os.getcwd()
+ os.chdir(cd_path)
+ yield
+ os.chdir(saved_path)
+
+parser = argparse.ArgumentParser(description='Run FastQTL')
+parser.add_argument('chunk_list', help='List of chunks')
+parser.add_argument('log_list', help='List of chunk log files')
+parser.add_argument('prefix', help='Prefix for output file name')
+parser.add_argument('--permute', action='store_true')
+parser.add_argument('--fdr', default=0.05, type=np.double)
+parser.add_argument('--lambda_qvalue', default=None, help='lambda parameter for pi0est in qvalue.')
+parser.add_argument('-o', '--output_dir', default='.', help='Output directory')
+args = parser.parse_args()
+fastqtl_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
+
+if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+with cd(args.output_dir):
+ if args.permute:
+ # merge chunks
+ print('Merging chunks ... ', end='', flush=True)
+ cmd = 'xargs zcat < '+args.chunk_list+' | gzip -c -1 > '+args.prefix+'.txt.gz'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ cmd = 'xargs cat < '+args.log_list+' > '+args.prefix+'.egenes.log'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ print('done.', flush=True)
+
+ # calculate q-values (R script also adds header)
+ print('Calculating q-values', flush=True)
+ cmd = 'Rscript '+os.path.join(fastqtl_dir, 'R', 'calculateSignificanceFastQTL.R')\
+ +' '+args.prefix+'.txt.gz '+str(args.fdr)+' '+args.prefix+'.egenes.txt.gz'
+ if args.lambda_qvalue is not None:
+ cmd += ' --lambda '+args.lambda_qvalue
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ os.remove(args.prefix+'.txt.gz')
+ else:
+ # merge chunks
+ print('Merging chunks ... ', end='', flush=True)
+ with gzip.open('header_chunk.txt.gz', 'wb') as f: # order from analysisNominal.cpp
+ f.write(('\t'.join([
+ 'gene_id',
+ 'variant_id',
+ 'tss_distance',
+ 'ma_samples',
+ 'ma_count',
+ 'maf',
+ 'pval_nominal',
+ 'slope',
+ 'slope_se',
+ ])+'\n').encode('utf-8'))
+ cmd = 'zcat header_chunk.txt.gz <(xargs cat < '+args.chunk_list+') | gzip -c -1 > '+args.prefix+'.txt.gz'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ os.remove('header_chunk.txt.gz')
+ cmd = 'xargs cat < '+args.log_list+' > '+args.prefix+'.allpairs.log'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ print('done.', flush=True)
+
+ os.rename(args.prefix+'.txt.gz', args.prefix+'.allpairs.txt.gz')
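
The merge above shells out to `zcat`/`xargs`; as a sketch, the same nominal-pass merge could be done without a shell roughly as follows (the header column order is taken from the script, everything else is illustrative):

```python
# Illustrative, shell-free equivalent of the nominal-pass merge: concatenate
# gzipped chunk files behind a single header line.
import gzip
import shutil

HEADER = '\t'.join(['gene_id', 'variant_id', 'tss_distance', 'ma_samples',
                    'ma_count', 'maf', 'pval_nominal', 'slope', 'slope_se']) + '\n'

def merge_gzipped_chunks(chunk_paths, out_path):
    with gzip.open(out_path, 'wt') as out:
        out.write(HEADER)
        for path in chunk_paths:
            with gzip.open(path, 'rt') as chunk:
                shutil.copyfileobj(chunk, out)
```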
=====================================
python/run_FastQTL_threaded.py
=====================================
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# Author: Francois Aguet
+import argparse
+import os
+import numpy as np
+import subprocess
+import gzip
+import multiprocessing as mp
+import contextlib
+from datetime import datetime
+
+@contextlib.contextmanager
+def cd(cd_path):
+ saved_path = os.getcwd()
+ os.chdir(cd_path)
+ yield
+ os.chdir(saved_path)
+
+def get_cmd(args, chunk):
+ cmd = os.path.join(fastqtl_dir, 'bin', 'fastQTL')+' --vcf '+args.vcf+' --bed '+args.bed+' --window '+args.window\
+ +' --maf-threshold '+args.maf_threshold+' --ma-sample-threshold '+args.ma_sample_threshold
+ if args.covariates:
+ cmd += ' --cov '+args.covariates
+ if args.threshold:
+ cmd += ' --threshold '+args.threshold
+ if args.permute:
+ cmd += ' --permute '+' '.join([str(p) for p in args.permute])
+ if args.best_variant_only:
+ cmd += ' --report-best-only'
+ if args.seed:
+ cmd += ' --seed '+args.seed
+ if args.exclude_samples:
+ cmd += ' --exclude-samples '+args.exclude_samples
+ cmd += ' --chunk '+str(chunk)+' '+args.chunks\
+ + ' --out '+args.prefix+'_chunk{0:03d}.txt.gz'.format(chunk)\
+ + ' --log '+args.prefix+'_chunk{0:03d}.log'.format(chunk)
+ return cmd
+
+def perm_worker(inputs):
+ args = inputs[0]
+ chunk = inputs[1]
+ cmd = get_cmd(args, chunk)
+ print('Processing chunk '+str(chunk), flush=True)
+ s = subprocess.check_call(cmd, shell=True, executable='/bin/bash', stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+ print('Finished chunk '+str(chunk), flush=True)
+ return s
+
+
+parser = argparse.ArgumentParser(description='Run FastQTL')
+parser.add_argument('vcf', help='Genotypes in VCF 4.1 format')
+parser.add_argument('bed', help='Phenotypes in UCSC BED extended format')
+parser.add_argument('prefix', help='Prefix for output file name')
+parser.add_argument('--covariates', default='', help='Covariates')
+parser.add_argument('--chunks', default='100', help='Number of chunks, minimum: #chromosomes')
+parser.add_argument('--permute', default=None, type=str, nargs='+', help='Number of permutations, e.g. [1000, 10000] (adaptive). Default: None (run nominal pass)')
+parser.add_argument('--best_variant_only', action='store_true')
+parser.add_argument('--window', default='1e6', help='Cis-window size. Default value is 1Mb (1e6).')
+parser.add_argument('--threshold', default='', help='Output only significant phenotype-variant pairs with a p-value below threshold (default 1)')
+parser.add_argument('--maf_threshold', default='0.0', help='Include only genotypes with minor allele frequency >=maf_threshold (default 0)')
+parser.add_argument('--ma_sample_threshold', default='0', help='Include only genotypes with >=ma_sample_threshold samples carrying the minor allele (default 0)')
+parser.add_argument('--fdr', default=0.05, type=np.double)
+parser.add_argument('--seed', default=None, help='Random number generator seed')
+parser.add_argument('--exclude_samples', default=None, help='')
+parser.add_argument('--qvalue_lambda', default=None, help='lambda parameter for pi0est in qvalue.')
+parser.add_argument('-t', '--threads', default=8, type=int, help='Number of threads')
+parser.add_argument('-o', '--output_dir', default='.', help='Output directory')
+args = parser.parse_args()
+fastqtl_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
+
+if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+with cd(args.output_dir):
+ print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running FastQTL on {0:d} threads.'.format(args.threads), flush=True)
+ with mp.Pool(processes=args.threads) as pool:
+ pdata_res = [pool.map_async(perm_worker, ((args,k),)) for k in np.arange(1,int(args.chunks)+1)]
+ pool.close()
+ pool.join()
+ for res in pdata_res: # check exit status
+ assert res.get()[0]==0
+ print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Done.', flush=True)
+
+ if args.permute:
+ # merge chunks
+ print('Merging chunks ... ', end='', flush=True)
+ cmd = 'zcat '+args.prefix+'_chunk*.txt.gz | gzip -c -1 > '+args.prefix+'.txt.gz && rm '+args.prefix+'_chunk*.txt.gz'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ cmd = 'cat '+args.prefix+'_chunk*.log > '+args.prefix+'.egenes.log && rm '+args.prefix+'_chunk*.log'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ print('done.', flush=True)
+
+ # calculate q-values (R script also adds header)
+ print('Calculating q-values', flush=True)
+ cmd = 'Rscript '+os.path.join(fastqtl_dir, 'R', 'calculateSignificanceFastQTL.R')\
+ +' '+args.prefix+'.txt.gz '+str(args.fdr)+' '+args.prefix+'.egenes.txt.gz'
+ if args.qvalue_lambda is not None:
+ cmd += ' --lambda '+args.qvalue_lambda
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ os.remove(args.prefix+'.txt.gz')
+ else:
+ # merge chunks
+ print('Merging chunks ... ', end='', flush=True)
+ with gzip.open('header_chunk.txt.gz', 'wb') as f: # order from analysisNominal.cpp
+ f.write(('\t'.join([
+ 'gene_id',
+ 'variant_id',
+ 'tss_distance',
+ 'ma_samples',
+ 'ma_count',
+ 'maf',
+ 'pval_nominal',
+ 'slope',
+ 'slope_se',
+ ])+'\n').encode('utf-8'))
+ cmd = 'zcat header_chunk.txt.gz '+args.prefix+'_chunk*.txt.gz | gzip -c -1 > '+args.prefix+'.txt.gz && rm '+args.prefix+'_chunk*.txt.gz'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ os.remove('header_chunk.txt.gz')
+ cmd = 'cat '+args.prefix+'_chunk*.log > '+args.prefix+'.allpairs.log && rm '+args.prefix+'_chunk*.log'
+ subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+ print('done.', flush=True)
+
+ os.rename(args.prefix+'.txt.gz', args.prefix+'.allpairs.txt.gz')
=====================================
python/run_chunk.py
=====================================
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# Author: Francois Aguet
+import argparse
+import os
+import subprocess
+import contextlib
+from datetime import datetime
+
+@contextlib.contextmanager
+def cd(cd_path):
+ saved_path = os.getcwd()
+ os.chdir(cd_path)
+ yield
+ os.chdir(saved_path)
+
+def get_cmd(args):
+ cmd = os.path.join(fastqtl_dir, 'bin', 'fastQTL')+' --vcf '+args.vcf+' --bed '+args.bed+' --window '+args.window\
+ +' --maf-threshold '+args.maf_threshold+' --ma-sample-threshold '+args.ma_sample_threshold
+ if args.covariates:
+ cmd += ' --cov '+args.covariates
+ if args.threshold:
+ cmd += ' --threshold '+args.threshold
+ if args.permute:
+ cmd += ' --permute '+' '.join([str(p) for p in args.permute])
+ if args.best_variant_only:
+ cmd += ' --report-best-only'
+ if args.seed:
+ cmd += ' --seed '+args.seed
+ if args.exclude_samples:
+ cmd += ' --exclude-samples '+args.exclude_samples
+ cmd += ' --chunk {} {}'.format(*args.chunk)\
+ + ' --out '+args.prefix+'_chunk{0:03d}.txt.gz'.format(args.chunk[0])\
+ + ' --log '+args.prefix+'_chunk{0:03d}.log'.format(args.chunk[0])
+ return cmd
+
+
+parser = argparse.ArgumentParser(description='Run FastQTL')
+parser.add_argument('vcf', help='Genotypes in VCF 4.1 format')
+parser.add_argument('bed', help='Phenotypes in UCSC BED extended format')
+parser.add_argument('prefix', help='Prefix for output file name')
+parser.add_argument('chunk', type=int, nargs=2, help='Chunk and total number of chunks, e.g., 1 100')
+parser.add_argument('--covariates', default='', help='Covariates')
+parser.add_argument('--permute', default=None, type=str, nargs='+', help='Number of permutations, e.g. [1000, 10000] (adaptive). Default: None (run nominal pass)')
+parser.add_argument('--best_variant_only', action='store_true')
+parser.add_argument('--window', default='1e6', help='Cis-window size. Default value is 1Mb (1e6).')
+parser.add_argument('--threshold', default='', help='Output only significant phenotype-variant pairs with a p-value below threshold (default 1)')
+parser.add_argument('--maf_threshold', default='0.0', help='Include only genotypes with minor allele frequency >=maf_threshold (default 0)')
+parser.add_argument('--ma_sample_threshold', default='0', help='Include only genotypes with >=ma_sample_threshold samples carrying the minor allele (default 0)')
+parser.add_argument('--seed', default=None, help='Random number generator seed')
+parser.add_argument('--exclude_samples', default=None, help='')
+parser.add_argument('-o', '--output_dir', default='.', help='Output directory')
+args = parser.parse_args()
+fastqtl_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
+
+if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+
+with cd(args.output_dir):
+ print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Processing chunk {}/{}'.format(*args.chunk), flush=True)
+ s = subprocess.check_call(get_cmd(args), shell=True, executable='/bin/bash')
+ assert s==0
+ print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Done.', flush=True)
=====================================
python/sort_index_pairs.py
=====================================
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# Author: Francois Aguet
+
+import argparse
+import os
+import subprocess
+
+parser = argparse.ArgumentParser(description='')
+parser.add_argument('fastqtl_pairs_file', help='FastQTL output from nominal pass')
+parser.add_argument('--tmp_dir', default='', help='tmp directory for sorting')
+parser.add_argument('-o', '--output_dir', default='.', help='Output directory')
+args = parser.parse_args()
+
+print('Sorting and indexing '+args.fastqtl_pairs_file+' (tabix) ... ', end='', flush=True)
+file_base = os.path.split(args.fastqtl_pairs_file)[1].split('.txt.gz')[0]
+file_name = os.path.join(args.output_dir, file_base+'.sorted.txt.gz')
+
+cmd = 'cat <(echo -e "#chr\tpos\tvariant_id\tgene_id\ttss_distance\tpval_nominal\tslope\tslope_se") \
+ <(zcat '+args.fastqtl_pairs_file+' | tail -n+2 | awk \'{print $2,$1,$3,$4,$5,$6}\' OFS="\t" | awk -F"_" \'{print $1"\t"$2"\t"$0}\')'
+if args.tmp_dir:
+ cmd += ' | sort -T '+args.tmp_dir+' -n -k1,1 -k2,2 | bgzip -c > '+file_name
+else:
+ cmd += ' | sort -n -k1,1 -k2,2 | bgzip -c > '+file_name
+
+subprocess.check_call(cmd, shell=True, executable='/bin/bash')
+subprocess.check_call('tabix -f -b 2 -e 2 '+file_name, shell=True, executable='/bin/bash')
+print('done.')
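
The sort step above derives chromosome and position from the first two `_`-separated fields of `variant_id`; a rough pandas equivalent is sketched below (the bgzip/tabix indexing itself still has to be done as in the script, and numeric vs. lexical chromosome ordering may differ):

```python
# Illustrative sketch of the sort step only: split variant_id (chr_pos_...) into
# chromosome and position columns and sort by them.
import pandas as pd

def sort_pairs(pairs_path):
    df = pd.read_csv(pairs_path, sep='\t')
    parts = df['variant_id'].str.split('_', expand=True)
    df.insert(0, 'pos', parts[1].astype(int))
    df.insert(0, '#chr', parts[0])
    return df.sort_values(['#chr', 'pos'])
```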
=====================================
script/calulateNominalPvalueThresholds.R deleted
=====================================
@@ -1,33 +0,0 @@
-suppressMessages(library(qvalue))
-
-args <- commandArgs(trailingOnly = TRUE)
-
-ifile = args[1]
-fdr = as.numeric(args[2]);
-
-cat("Processing fastQTL concatenated output [", ifile, "] controlling for FDR =", fdr * 100, "%\n");
-
-#Read data
-D = read.table(ifile, hea=FALSE, stringsAsFactors=FALSE)
-D = D[which(!is.na(D[, 11])),]
-cat(" * Number of molecular phenotypes =" , nrow(D), "\n")
-cat(" * Correlation between Beta approx. and Empirical p-values =", round(cor(D[, 9], D[, 10]), 4), "\n")
-
-#Run qvalue on pvalues for best signals
-Q = qvalue(D[, 11])
-D$qval = Q$qvalue
-cat(" * Proportion of significant phenotypes =" , round((1 - Q$pi0) * 100, 2), "%\n")
-
-#Determine significance threshold
-set0 = D[which(D$qval <= fdr),]
-set1 = D[which(D$qval > fdr),]
-pthreshold = (sort(set1$V11)[1] - sort(-1.0 * set0$V11)[1]) / 2
-cat(" * Corrected p-value threshold = ", pthreshold, "\n")
-
-#Calculate nominal pvalue thresholds
-D$nthresholds = qbeta(pthreshold, D$V3, D$V4, ncp = 0, lower.tail = TRUE, log.p = FALSE)
-
-#Write output
-write.table(D[, c(1, 13, 12)], args[3], quote=FALSE, row.names=FALSE, col.names=FALSE)
-
-cat("Done\n")
=====================================
src/analysisNominal.cpp
=====================================
@@ -19,51 +19,59 @@
void data::runNominal(string fout, double threshold) {
- //0. Prepare genotypes
- vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
- vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
- if (covariate_count > 0) {
- LOG.println("\nCorrecting genotypes & phenotypes for covariates");
- covariate_engine->residualize(genotype_orig);
- covariate_engine->residualize(phenotype_orig);
- }
- for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
- for (int p = 0 ; p < phenotype_count ; p ++) phenotype_sd[p] = RunningStat(phenotype_orig[p]).StandardDeviation();
- normalize(genotype_orig);
- normalize(phenotype_orig);
+ //0. Prepare genotypes
+ vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
+ vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
+ if (covariate_count > 0) {
+ LOG.println("\nCorrecting genotypes & phenotypes for covariates");
+ covariate_engine->residualize(genotype_orig);
+ covariate_engine->residualize(phenotype_orig);
+ }
+ for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
+ for (int p = 0 ; p < phenotype_count ; p ++) phenotype_sd[p] = RunningStat(phenotype_orig[p]).StandardDeviation();
+ normalize(genotype_orig);
+ normalize(phenotype_orig);
- //1. Loop over phenotypes
- ofile fdo (fout);
- for (int p = 0 ; p < phenotype_count ; p ++) {
+ //1. Loop over phenotypes
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
- LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
- //1.1. Enumerate all genotype-phenotype pairs within cis-window
- vector < int > targetGenotypes, targetDistances;
- for (int g = 0 ; g < genotype_count ; g ++) {
- int cisdistance = genotype_pos[g] - phenotype_start[p];
- if (abs(cisdistance) <= cis_window) {
- targetGenotypes.push_back(g);
- targetDistances.push_back(cisdistance);
- }
- }
- LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
- //1.2. Nominal pass: scan cis-window & compute statistics
- for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
- double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_orig[p]);
- double pval = getPvalue(corr, sample_count - 2 - covariate_count);
- double slope = getSlope(corr, phenotype_sd[p], genotype_sd[targetGenotypes[g]]);
- if (pval <= threshold ) {
- fdo << phenotype_id[p];
- fdo << " " << genotype_id[targetGenotypes[g]];
- fdo << " " << targetDistances[g];
- fdo << " " << pval;
- fdo << " " << slope;
- fdo << endl;
- }
- }
- LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
- }
- fdo.close();
+ //1.2. Nominal pass: scan cis-window & compute statistics
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_orig[p]);
+ double df = sample_count - 2 - covariate_count;
+ double tstat2 = getTstat2(corr, df);
+ double pval = getPvalueFromTstat2(tstat2, df);
+ double slope = getSlope(corr, phenotype_sd[p], genotype_sd[targetGenotypes[g]]);
+ double slope_se = abs(slope) / sqrt(tstat2);
+ if (pval <= threshold ) {
+ fdo << phenotype_id[p];
+ int gi = targetGenotypes[g];
+ fdo << "\t" << genotype_id[gi];
+ fdo << "\t" << targetDistances[g]; // indexed by g
+ fdo << "\t" << genotype_ma_samples[gi];
+ fdo << "\t" << genotype_ma_count[gi];
+ fdo << "\t" << genotype_maf[gi];
+ fdo << "\t" << pval;
+ fdo << "\t" << slope;
+ fdo << "\t" << slope_se;
+ fdo << endl;
+ }
+ }
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
}
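
The new output columns follow from the correlation r via t^2 = df * r^2 / (1 - r^2), p = P(F(1, df) >= t^2), slope = r * sd_pheno / sd_geno (assuming that is what getSlope computes), and slope_se = |slope| / sqrt(t^2). A small Python check of these relations, for illustration only:

```python
# Illustrative numeric check of the nominal statistics written above; not part
# of the C++ code. scipy.stats.f.sf gives the upper tail of F(1, df), matching
# pf(tstat2, 1, df, 0, 0).
from scipy import stats

def nominal_stats(r, df, sd_pheno, sd_geno):
    tstat2 = df * r * r / (1.0 - r * r)
    pval = stats.f.sf(tstat2, 1, df)
    slope = r * sd_pheno / sd_geno        # assumed form of getSlope()
    slope_se = abs(slope) / tstat2 ** 0.5
    return pval, slope, slope_se
```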
=====================================
src/analysisNominalBest.cpp
=====================================
@@ -0,0 +1,84 @@
+//FastQTL: Fast and efficient QTL mapper for molecular phenotypes
+//Copyright (C) 2015 Olivier DELANEAU, Alfonso BUIL, Emmanouil DERMITZAKIS & Olivier DELANEAU
+//
+//This program is free software: you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation, either version 3 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "data.h"
+
+
+void data::runNominalBest(string fout) {
+
+ //0. Prepare genotypes
+ vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
+ vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
+ if (covariate_count > 0) {
+ LOG.println("\nCorrecting genotypes & phenotypes for covariates");
+ covariate_engine->residualize(genotype_orig);
+ covariate_engine->residualize(phenotype_orig);
+ }
+ for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
+ for (int p = 0 ; p < phenotype_count ; p ++) phenotype_sd[p] = RunningStat(phenotype_orig[p]).StandardDeviation();
+ normalize(genotype_orig);
+ normalize(phenotype_orig);
+
+ double df = sample_count - 2 - covariate_count;
+
+ //1. Loop over phenotypes
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
+
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+
+ //1.2. Nominal pass: scan cis-window for best variant & compute statistics
+ int min_idx = 0;
+ double min_pval = 1.0;
+ double opt_corr = 0.0;
+ double opt_tstat2 = 0.0;
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_orig[p]);
+ double tstat2 = getTstat2(corr, df);
+ double pval = getPvalueFromTstat2(tstat2, df);
+ if (pval<min_pval) {
+ min_idx = g;
+ min_pval = pval;
+ opt_corr = corr;
+ opt_tstat2 = tstat2;
+ }
+ }
+ if (targetGenotypes.size()>0) {
+ double slope = getSlope(opt_corr, phenotype_sd[p], genotype_sd[targetGenotypes[min_idx]]);
+ double slope_se = abs(slope) / sqrt(opt_tstat2);
+ fdo << phenotype_id[p];
+ fdo << "\t" << genotype_id[targetGenotypes[min_idx]];
+ fdo << "\t" << targetDistances[min_idx];
+ fdo << "\t" << min_pval;
+ fdo << "\t" << slope;
+ fdo << "\t" << slope_se;
+ fdo << endl;
+ }
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
+}
=====================================
src/analysisPermutation.cpp
=====================================
@@ -19,118 +19,127 @@
void data::runPermutation(string fout, vector < int > nPermutations) {
- //0. Prepare genotypes
- vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
- vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
- if (covariate_count > 0) {
- LOG.println("\nCorrecting genotypes for covariates");
- covariate_engine->residualize(genotype_orig);
- }
- for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
- normalize(genotype_orig);
+ //0. Prepare genotypes
+ vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
+ vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
+ if (covariate_count > 0) {
+ LOG.println("\nCorrecting genotypes for covariates");
+ covariate_engine->residualize(genotype_orig);
+ }
+ for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
+ normalize(genotype_orig);
- //1. Loop over phenotypes
- ofile fdo (fout);
- for (int p = 0 ; p < phenotype_count ; p ++) {
+ //1. Loop over phenotypes
+ ofile fdo (fout);
+ for (int p = 0 ; p < phenotype_count ; p ++) {
- LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
+ LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");
- //1.1. Enumerate all genotype-phenotype pairs within cis-window
- vector < int > targetGenotypes, targetDistances;
- for (int g = 0 ; g < genotype_count ; g ++) {
- int cisdistance = genotype_pos[g] - phenotype_start[p];
- if (abs(cisdistance) <= cis_window) {
- targetGenotypes.push_back(g);
- targetDistances.push_back(cisdistance);
- }
- }
- LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
+ //1.1. Enumerate all genotype-phenotype pairs within cis-window
+ vector < int > targetGenotypes, targetDistances;
+ for (int g = 0 ; g < genotype_count ; g ++) {
+ int cisdistance = genotype_pos[g] - phenotype_start[p];
+ if (abs(cisdistance) <= cis_window) {
+ targetGenotypes.push_back(g);
+ targetDistances.push_back(cisdistance);
+ }
+ }
+ LOG.println(" * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));
- //1.2. Copy original data
- vector < float > phenotype_curr = phenotype_orig[p];
- if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
- phenotype_sd[p] = RunningStat(phenotype_curr).StandardDeviation();
- normalize(phenotype_curr);
+ //1.2. Copy original data
+ vector < float > phenotype_curr = phenotype_orig[p];
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ phenotype_sd[p] = RunningStat(phenotype_curr).StandardDeviation();
+ normalize(phenotype_curr);
- //1.3. Nominal pass: scan cis-window & compute statistics
- double bestCorr = 0.0;
- int bestDistance = ___LI___, bestIndex = -1;
- for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
- double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
- if (abs(corr) > abs(bestCorr) || (abs(corr) == abs(bestCorr) && abs(targetDistances[g]) < bestDistance)) {
- bestCorr = corr;
- bestDistance = targetDistances[g];
- bestIndex = targetGenotypes[g];
- }
- }
- if (targetGenotypes.size() > 0) LOG.println(" * Best correlation = " + sutils::double2str(bestCorr, 4));
+ //1.3. Nominal pass: scan cis-window & compute statistics
+ double bestCorr = 0.0;
+ int bestDistance = ___LI___, bestIndex = -1;
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
+ if (abs(corr) > abs(bestCorr) || (abs(corr) == abs(bestCorr) && abs(targetDistances[g]) < bestDistance)) {
+ bestCorr = corr;
+ bestDistance = targetDistances[g];
+ bestIndex = targetGenotypes[g];
+ }
+ }
+ if (targetGenotypes.size() > 0) LOG.println(" * Best correlation = " + sutils::double2str(bestCorr, 4));
- //1.4. Permutation pass:
- bool done = false;
- int countPermutations = 0, nBetterCorrelation = 0;
- vector < double > permCorr;
- do {
- double bestCperm = 0.0;
- phenotype_curr = phenotype_orig[p];
- random_shuffle(phenotype_curr.begin(), phenotype_curr.end());
- if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
- normalize(phenotype_curr);
- for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
- double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
- if (abs(corr) > abs(bestCperm)) bestCperm = corr;
- }
- if (abs(bestCperm) >= abs(bestCorr)) nBetterCorrelation++;
- permCorr.push_back(bestCperm);
- countPermutations++;
+ //1.4. Permutation pass:
+ bool done = false;
+ int countPermutations = 0, nBetterCorrelation = 0;
+ vector < double > permCorr;
+ do {
+ double bestCperm = 0.0;
+ phenotype_curr = phenotype_orig[p];
+ random_shuffle(phenotype_curr.begin(), phenotype_curr.end());
+ if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
+ normalize(phenotype_curr);
+ for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
+ double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
+ if (abs(corr) > abs(bestCperm)) bestCperm = corr;
+ }
+ if (abs(bestCperm) >= abs(bestCorr)) nBetterCorrelation++;
+ permCorr.push_back(bestCperm);
+ countPermutations++;
- if (nPermutations.size() == 1 && countPermutations >= nPermutations[0]) done = true;
- if (nPermutations.size() == 2 && (nBetterCorrelation >= nPermutations[0] || countPermutations >= nPermutations[1])) done = true;
- if (nPermutations.size() == 3 && (countPermutations >= nPermutations[0]) && (nBetterCorrelation >= nPermutations[1] || countPermutations >= nPermutations[2])) done = true;
- } while (!done);
- if (targetGenotypes.size() > 0) LOG.println(" * Number of permutations = " + sutils::int2str(nBetterCorrelation) + " / " + sutils::int2str(countPermutations));
+ if (nPermutations.size() == 1 && countPermutations >= nPermutations[0]) done = true;
+ if (nPermutations.size() == 2 && (nBetterCorrelation >= nPermutations[0] || countPermutations >= nPermutations[1])) done = true;
+ if (nPermutations.size() == 3 && (countPermutations >= nPermutations[0]) && (nBetterCorrelation >= nPermutations[1] || countPermutations >= nPermutations[2])) done = true;
+ } while (!done);
+ if (targetGenotypes.size() > 0) LOG.println(" * Number of permutations = " + sutils::int2str(nBetterCorrelation) + " / " + sutils::int2str(countPermutations));
- //1.5. Calculate effective DFs & Beta distribution parameters
- vector < double > permPvalues;
- double true_df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
- double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
- fdo << phenotype_id[p] << " " << targetGenotypes.size();
- if (targetGenotypes.size() > 0) {
- //Estimate number of degrees of freedom
- if (putils::variance(permCorr, putils::mean(permCorr)) != 0.0) {
- learnDF(permCorr, true_df);
- //LOG.println(" * Effective degree of freedom = " + sutils::double2str(true_df, 4));
- }
- //Compute mean and variance of p-values
- for (int c = 0 ; c < permCorr.size() ; c ++) permPvalues.push_back(getPvalue(permCorr[c], true_df));
- for (int pv = 0 ; pv < permPvalues.size() ; pv++) mean += permPvalues[pv];
- mean /= permPvalues.size();
- for (int pv = 0 ; pv < permPvalues.size() ; pv++) variance += (permPvalues[pv] - mean) * (permPvalues[pv] - mean);
- variance /= (permPvalues.size() - 1);
- //Estimate shape1 & shape2
- if (targetGenotypes.size() > 1 && mean != 1.0) {
- beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
- beta_shape2 = beta_shape1 * (1 / mean - 1);
- if (targetGenotypes.size() > 10) mleBeta(permPvalues, beta_shape1, beta_shape2); //ML estimate if more than 10 variant in cis
- }
- LOG.println(" * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
- fdo << " " << beta_shape1 << " " << beta_shape2 << " " << true_df;
- } else fdo << " NA NA NA";
+ //1.5. Calculate effective DFs & Beta distribution parameters
+ vector < double > permPvalues;
+ double true_df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
+ double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
+ fdo << phenotype_id[p] << "\t" << targetGenotypes.size();
+ if (targetGenotypes.size() > 0) {
+ //Estimate number of degrees of freedom
+ if (putils::variance(permCorr, putils::mean(permCorr)) != 0.0) {
+ learnDF(permCorr, true_df);
+ //LOG.println(" * Effective degree of freedom = " + sutils::double2str(true_df, 4));
+ }
+ //Compute mean and variance of p-values
+ for (int c = 0 ; c < permCorr.size() ; c ++) permPvalues.push_back(getPvalue(permCorr[c], true_df));
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) mean += permPvalues[pv];
+ mean /= permPvalues.size();
+ for (int pv = 0 ; pv < permPvalues.size() ; pv++) variance += (permPvalues[pv] - mean) * (permPvalues[pv] - mean);
+ variance /= (permPvalues.size() - 1);
+ //Estimate shape1 & shape2
+ if (targetGenotypes.size() > 1 && mean != 1.0) {
+ beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
+ beta_shape2 = beta_shape1 * (1 / mean - 1);
+ if (targetGenotypes.size() > 10) mleBeta(permPvalues, beta_shape1, beta_shape2); //ML estimate if more than 10 variant in cis
+ }
+ LOG.println(" * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
+ fdo << "\t" << beta_shape1 << "\t" << beta_shape2 << "\t" << true_df;
+ } else fdo << "\tNA\tNA\tNA";
- //1.6. Writing results
- if (targetGenotypes.size() > 0 && bestIndex >= 0) {
- double pval_fdo = getPvalue(bestCorr, true_df);
- double pval_nom = getPvalue(bestCorr, sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0));
- double pval_slope = getSlope(bestCorr, phenotype_sd[p], genotype_sd[bestIndex]);
- fdo << " " << genotype_id[bestIndex];
- fdo << " " << bestDistance;
- fdo << " " << pval_nom;
- fdo << " " << pval_slope;
- fdo << " " << (nBetterCorrelation + 1) * 1.0 / (countPermutations + 1.0);
- fdo << " " << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
- } else fdo << " NA NA NA NA NA NA";
- fdo << endl;
+ //1.6. Writing results
+ if (targetGenotypes.size() > 0 && bestIndex >= 0) {
+ double pval_fdo = getPvalue(bestCorr, true_df);
+ double df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
+ double tstat2 = getTstat2(bestCorr, df);
+ double pval_nom = getPvalueFromTstat2(tstat2, df);
+ double pval_slope = getSlope(bestCorr, phenotype_sd[p], genotype_sd[bestIndex]);
+ double slope_se = abs(pval_slope) / sqrt(tstat2);
+ fdo << "\t" << pval_fdo;
+ fdo << "\t" << genotype_id[bestIndex];
+ fdo << "\t" << bestDistance;
+ fdo << "\t" << genotype_ma_samples[bestIndex];
+ fdo << "\t" << genotype_ma_count[bestIndex];
+ fdo << "\t" << genotype_maf[bestIndex];
+ fdo << "\t" << genotype_ref_factor[bestIndex];
+ fdo << "\t" << pval_nom;
+ fdo << "\t" << pval_slope;
+ fdo << "\t" << slope_se;
+ fdo << "\t" << (nBetterCorrelation + 1) * 1.0 / (countPermutations + 1.0);
+ fdo << "\t" << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
+ } else fdo << "\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA";
+ fdo << endl;
- LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
- }
- fdo.close();
+ LOG.println(" * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
+ }
+ fdo.close();
}
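
The Beta approximation above fits shape parameters to the permutation p-values by method of moments (refined by maximum likelihood via mleBeta when more than 10 variants are in cis), and the gene record reports both the adjusted permutation p-value (nBetterCorrelation + 1) / (countPermutations + 1) and the Beta-approximated p-value pbeta(p_obs, shape1, shape2). A hedged Python restatement of the moment fit, for illustration only:

```python
# Illustrative sketch of the method-of-moments Beta fit and the two gene-level
# p-values reported above; the C++ code may further refine shape1/shape2 by ML.
import numpy as np
from scipy import stats

def beta_approx(perm_pvalues, observed_pvalue):
    perm_pvalues = np.asarray(perm_pvalues, dtype=float)
    m = perm_pvalues.mean()
    v = perm_pvalues.var(ddof=1)
    shape1 = m * (m * (1.0 - m) / v - 1.0)
    shape2 = shape1 * (1.0 / m - 1.0)
    pval_beta = stats.beta.cdf(observed_pvalue, shape1, shape2)     # pbeta, lower tail
    pval_perm = (np.sum(perm_pvalues <= observed_pvalue) + 1.0) \
                / (len(perm_pvalues) + 1.0)
    return shape1, shape2, pval_perm, pval_beta
```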
=====================================
src/data.h
=====================================
@@ -29,7 +29,7 @@
class data {
public:
- //INCLUDE/EXCLUDE LISTS
+ //INCLUDE/EXCLUDE LISTS
set < string > sample_inclusion;
set < string > sample_exclusion;
set < string > genotype_inclusion;
@@ -43,6 +43,9 @@ public:
region regionPhenotype;
region regionGenotype;
float cis_window;
+ float maf_threshold; // minor allele frequency threshold
+ int ma_sample_threshold; // minor allele sample threshold
+ float global_af_threshold;
//SAMPLES
int sample_count; //sample number
@@ -55,12 +58,16 @@ public:
vector < string > genotype_chr; //variant site chromosome
vector < string > genotype_id; //variant site IDs
vector < int > genotype_pos; //variant site positions
+ vector < float > genotype_maf; //variant minor allele frequency
+ vector < int > genotype_ma_count; //variant minor allele count
+ vector < int > genotype_ma_samples; //variant minor allele samples
+ vector < int > genotype_ref_factor; //variant minor allele reference factor: 1 if ALT is MA; -1 if REF is MA
//PHENOTYPES
int phenotype_count; //phenotype number
vector < vector < float > > phenotype_orig; //original phenotype values
vector < string > phenotype_id; //phenotype ids
- vector < string > phenotype_grp; //phenotype groups
+ vector < string > phenotype_grp; //phenotype groups
vector < string > phenotype_chr; //phenotype chromosomes
vector < int > phenotype_start; //phenotype start positions
vector < int > phenotype_end; //phenotype end positions
@@ -112,7 +119,7 @@ public:
//GENERAL MANAGMENT
void clusterizePhenotypes(int);
void imputeGenotypes();
- void imputePhenotypes();
+ void imputePhenotypes();
void normalTranformPhenotypes();
void initResidualizer();
void rankTranformPhenotypes();
@@ -126,12 +133,15 @@ public:
//COMPUTATION METHODS [ALL INLINES FOR SPEED]
double getCorrelation(vector < float > &, vector < float > &);
double getCorrelation(vector < float > &, vector < float > &, vector < int > &);
+ double getTstat2(double, double);
+ double getPvalueFromTstat2(double, double);
double getPvalue(double, double);
double getPvalue(double, vector < double > &);
double getSlope(double, double, double);
//ANALYSIS
void runNominal(string, double);
+ void runNominalBest(string);
void runPermutation(string, vector < int >);
void runPermutation(string, string);
void runPermutationPerGroup(string, vector < int >);
@@ -192,6 +202,14 @@ inline double data::getCorrelation(vector < float > & vec1, vector < float > & v
return corr;
}
+inline double data::getTstat2(double corr, double df) {
+ return df * corr * corr / (1 - corr * corr);
+}
+
+inline double data::getPvalueFromTstat2(double tstat2, double df) {
+ return pf(tstat2, 1, df, 0, 0);
+}
+
inline double data::getPvalue(double corr, double df) {
return pf(df * corr * corr / (1 - corr * corr), 1, df,0,0);
}
=====================================
src/fastQTL.cpp
=====================================
@@ -53,7 +53,10 @@ int main(int argc, char ** argv) {
opt_parameters.add_options()
("normal", "Normal transform the phenotypes.")
("window,W", bpo::value< double >()->default_value(1e6), "Cis-window size.")
- ("threshold, T", bpo::value< double >()->default_value(1.0), "P-value threshold used in nominal pass of association");
+ ("threshold,T", bpo::value< double >()->default_value(1.0), "P-value threshold used in nominal pass of association")
+ ("maf-threshold", bpo::value< double >()->default_value(0.0), "Minor allele frequency threshold used when parsing genotypes")
+ ("ma-sample-threshold", bpo::value< int >()->default_value(0), "Minimum number of samples carrying the minor allele; used when parsing genotypes")
+ ("global-af-threshold", bpo::value< double >()->default_value(0.0), "AF threshold for all samples in VCF (used to filter AF in INFO field)");
bpo::options_description opt_modes ("\33[33mModes\33[0m");
opt_modes.add_options()
@@ -61,7 +64,8 @@ int main(int argc, char ** argv) {
("psequence", bpo::value< string >(), "Permutation sequence.")
("map", bpo::value< string >(), "Map best QTL candidates per molecular phenotype.")
("map-full", "Scan full cis-window to discover independent signals.")
- ("interaction", bpo::value< string >(), "Test for interactions with variable specified in file.");
+ ("interaction", bpo::value< string >(), "Test for interactions with variable specified in file.")
+ ("report-best-only", bpo::bool_switch()->default_value(false), "Report best variant only (nominal mode)");
bpo::options_description opt_parallel ("\33[33mParallelization\33[0m");
opt_parallel.add_options()
@@ -93,7 +97,7 @@ int main(int argc, char ** argv) {
cout << " * Authors : Olivier DELANEAU, Halit ONGEN, Alfonso BUIL & Manolis DERMITZAKIS" << endl;
cout << " * Contact : olivier.delaneau at gmail.com" << endl;
cout << " * Webpage : http://fastqtl.sourceforge.net/" << endl;
- cout << " * Version : v2.0" << endl;
+ cout << " * Version : v2.184_gtex" << endl;
if (options.count("help")) { cout << descriptions<< endl; exit(1); }
}
@@ -201,6 +205,18 @@ int main(int argc, char ** argv) {
if (options["window"].as < double > () > 1e9) LOG.error ("Cis-window cannot be larger than 1e9bp");
LOG.println(" * Considering variants within " + sutils::double2str(options["window"].as < double > ()) + " bp of the MPs");
D.cis_window = options["window"].as < double > ();
+
+ D.maf_threshold = options["maf-threshold"].as < double > ();
+ if (D.maf_threshold < 0.0 || D.maf_threshold >= 0.5) LOG.error("Incorrect --maf-threshold value : 0 <= X < 0.5");
+ LOG.println(" * Using minor allele frequency threshold = " + sutils::double2str(D.maf_threshold, 4));
+
+ D.ma_sample_threshold = options["ma-sample-threshold"].as < int > ();
+ if (D.ma_sample_threshold < 0) LOG.error("Incorrect --ma-sample-threshold : 0 <= X");
+ LOG.println(" * Using minor allele sample threshold = " + sutils::int2str(D.ma_sample_threshold));
+
+ D.global_af_threshold = options["global-af-threshold"].as < double > ();
+ if (D.global_af_threshold < 0.0 || D.global_af_threshold >= 0.5) LOG.error("Incorrect --global-af-threshold value : 0 <= X < 0.5");
+ LOG.println(" * Using INFO field AF threshold = " + sutils::double2str(D.global_af_threshold, 4));
if (options.count("chunk")) {
vector < int > nChunk = options["chunk"].as < vector < int > > ();
@@ -273,10 +289,13 @@ int main(int argc, char ** argv) {
else if (options.count("permute")) {
if (options.count("psequence")) D.runPermutation(options["out"].as < string > (), options["psequence"].as < string > ());
else D.runPermutation(options["out"].as < string > (), options["permute"].as < vector < int > > ());
- } else if (options.count("map"))
+ } else if (options.count("map")) {
D.runMapping(options["out"].as < string > (), options.count("map-full"));
- else
+ } else if (options["report-best-only"].as<bool>()) {
+ D.runNominalBest(options["out"].as < string > ());
+ } else {
D.runNominal(options["out"].as < string > (), options["threshold"].as < double > ());
+ }
}
//----------------
=====================================
src/management.cpp
=====================================
@@ -19,234 +19,253 @@
#include "data.h"
#include "utils/ranker.h"
-
+#include <map>
data::data() {
- sample_count = 0;
- genotype_count = 0;
- phenotype_count = 0;
- covariate_count = 0;
- covariate_engine = NULL;
+ sample_count = 0;
+ genotype_count = 0;
+ phenotype_count = 0;
+ covariate_count = 0;
+ covariate_engine = NULL;
}
void data::clear() {
- sample_count = 0;
- sample_id.clear();
- genotype_count = 0;
- genotype_orig.clear();
- genotype_curr.clear();
- genotype_chr.clear();
- genotype_id.clear();
- genotype_pos.clear();
- phenotype_count = 0;
- phenotype_orig.clear();
- phenotype_id.clear();
- phenotype_chr.clear();
- phenotype_start.clear();
- phenotype_end.clear();
- covariate_count = 0;
- covariate_val.clear();
- covariate_id.clear();
+ sample_count = 0;
+ sample_id.clear();
+ genotype_count = 0;
+ genotype_orig.clear();
+ genotype_curr.clear();
+ genotype_chr.clear();
+ genotype_id.clear();
+ genotype_pos.clear();
+ genotype_maf.clear();
+ genotype_ma_count.clear();
+ genotype_ma_samples.clear();
+ genotype_ref_factor.clear();
+ phenotype_count = 0;
+ phenotype_orig.clear();
+ phenotype_id.clear();
+ phenotype_chr.clear();
+ phenotype_start.clear();
+ phenotype_end.clear();
+ covariate_count = 0;
+ covariate_val.clear();
+ covariate_id.clear();
}
bool data::checkSample(string & id, bool checkDuplicate) {
- bool included = ((sample_inclusion.size() == 0)?true:sample_inclusion.count(id));
- bool excluded = ((sample_exclusion.size() == 0)?false:sample_exclusion.count(id));
- if (!included || excluded) return false;
- if (checkDuplicate) for (int s = 0 ; s < sample_id.size() ; s++) if (id == sample_id[s]) LOG.error("Duplicate sample id [" + id +"]");
- return true;
+ bool included = ((sample_inclusion.size() == 0)?true:sample_inclusion.count(id));
+ bool excluded = ((sample_exclusion.size() == 0)?false:sample_exclusion.count(id));
+ if (!included || excluded) return false;
+ if (checkDuplicate) for (int s = 0 ; s < sample_id.size() ; s++) if (id == sample_id[s]) LOG.error("Duplicate sample id [" + id +"]");
+ return true;
}
bool data::checkGenotype(string & id) {
- bool included = ((genotype_inclusion.size() == 0)?true:genotype_inclusion.count(id));
- bool excluded = ((genotype_exclusion.size() == 0)?false:genotype_exclusion.count(id));
- if (!included || excluded) return false;
- //for (int g = 0 ; g < genotype_id.size() ; g++) if (id == genotype_id[g]) LOG.error("Duplicate variant site id [" + id + "]");
- return true;
+ bool included = ((genotype_inclusion.size() == 0)?true:genotype_inclusion.count(id));
+ bool excluded = ((genotype_exclusion.size() == 0)?false:genotype_exclusion.count(id));
+ if (!included || excluded) return false;
+ //for (int g = 0 ; g < genotype_id.size() ; g++) if (id == genotype_id[g]) LOG.error("Duplicate variant site id [" + id + "]");
+ return true;
}
bool data::checkPhenotype(string & id, bool include) {
- if (include) {
- bool included = ((phenotype_inclusion.size() == 0)?true:phenotype_inclusion.count(id));
- bool excluded = ((phenotype_exclusion.size() == 0)?false:phenotype_exclusion.count(id));
- if (!included || excluded) return false;
- }
- for (int p = 0 ; p < phenotype_id.size() ; p++) if (id == phenotype_id[p]) LOG.error("Duplicate phenotype id [" + id + "]");
- return true;
+ if (include) {
+ bool included = ((phenotype_inclusion.size() == 0)?true:phenotype_inclusion.count(id));
+ bool excluded = ((phenotype_exclusion.size() == 0)?false:phenotype_exclusion.count(id));
+ if (!included || excluded) return false;
+ }
+ for (int p = 0 ; p < phenotype_id.size() ; p++) if (id == phenotype_id[p]) LOG.error("Duplicate phenotype id [" + id + "]");
+ return true;
}
bool data::checkCovariate(string & id) {
- bool included = ((covariate_inclusion.size() == 0)?true:covariate_inclusion.count(id));
- bool excluded = ((covariate_exclusion.size() == 0)?false:covariate_exclusion.count(id));
- if (!included || excluded) return false;
- for (int c = 0 ; c < covariate_id.size() ; c++) if (id == covariate_id[c]) LOG.error("Duplicate covariate id [" + id + "]");
- return true;
+ bool included = ((covariate_inclusion.size() == 0)?true:covariate_inclusion.count(id));
+ bool excluded = ((covariate_exclusion.size() == 0)?false:covariate_exclusion.count(id));
+ if (!included || excluded) return false;
+ for (int c = 0 ; c < covariate_id.size() ; c++) if (id == covariate_id[c]) LOG.error("Duplicate covariate id [" + id + "]");
+ return true;
}
void data::clusterizePhenotypes(int K) {
- //check K
- if (K < 1) LOG.error("Number of chunks needs to be > 0");
- if (K > phenotype_count) LOG.error("Number of chunks (" + sutils::int2str(K) + ") is greater than the number of phenotypes (" + sutils::int2str(phenotype_count) + ")");
-
- //initialise (1 cluster = 1 chromosome)
- phenotype_cluster = vector < vector < int > > (1, vector < int > (1, 0));
- for (int p = 1 ; p < phenotype_count ; p ++) {
- if (phenotype_chr[p] != phenotype_chr[p-1]) phenotype_cluster.push_back(vector < int > (1, p));
- else phenotype_cluster.back().push_back(p);
- }
-
- //iterate (split cluster in the middle until K clusters are built)
- if (phenotype_cluster.size() > K) LOG.error("Number of chunks needs to be > #chr");
- if (phenotype_cluster.size() < K) {
- int max_idx, max_val, max_mid;
- do {
- max_idx = -1;
- max_val = +1;
- max_mid = -1;
- for (int k = 0 ; k < phenotype_cluster.size() ; k ++) {
- if (phenotype_cluster[k].size() > max_val) {
- max_val = phenotype_cluster[k].size();
- max_idx = k;
- }
- }
- if (max_idx >= 0) {
- max_mid = max_val / 2;
- while (max_mid > 1 && phenotype_end[phenotype_cluster[max_idx][max_mid-1]] >= phenotype_start[phenotype_cluster[max_idx][max_mid]]) max_mid --;
- phenotype_cluster.push_back(vector < int > (phenotype_cluster[max_idx].begin() + max_mid, phenotype_cluster[max_idx].end()));
- phenotype_cluster[max_idx].erase(phenotype_cluster[max_idx].begin() + max_mid, phenotype_cluster[max_idx].end());
- }
- } while (phenotype_cluster.size() < K && max_idx >= 0);
- }
+ //check K
+ if (K < 1) LOG.error("Number of chunks needs to be > 0");
+ if (K > phenotype_count) LOG.error("Number of chunks (" + sutils::int2str(K) + ") is greater than the number of phenotypes (" + sutils::int2str(phenotype_count) + ")");
+
+ // count number of phenotypes per chromosome
+ std::map<std::string,int> num_pheno;
+ for (int p=0; p<phenotype_count; ++p) {
+ num_pheno[phenotype_chr[p]]++;
+ }
+ if (num_pheno.size() > K) LOG.error("Number of chunks needs to be > #chr");
+
+ // initialize chunk size to all phenotypes
+ std::map<std::string,int> chunk_size(num_pheno);
+
+ // each chr must have at least 1 chunk
+ std::map<std::string,int> num_chunks;
+ for(const auto &kv : num_pheno) {
+ num_chunks[kv.first] = 1;
+ }
+
+ // determine number of chunks for each chr
+ for (auto i=0;i<K-num_pheno.size(); ++i) {
+ // find largest chunk size
+ int max_chunk = 0;
+ std::string max_key;
+ for(const auto &kv : chunk_size) {
+ if (kv.second>max_chunk) {
+ max_chunk = kv.second;
+ max_key = kv.first;
+ }
+ }
+ num_chunks[max_key] += 1; // add chunk for this chr
+ chunk_size[max_key] = ceil((double)num_pheno[max_key]/(double)num_chunks[max_key]); // adjust chunk size
+ }
+
+ // now, split according to chr and chunk size
+ phenotype_cluster = vector < vector < int > > (1, vector < int > (1, 0)); // first phenotype already stored (index 0)
+ int chr_offset = 0;
+ for (int p=1; p<phenotype_count; ++p) {
+ if (phenotype_chr[p]!=phenotype_chr[p-1]) { // new chr --> new chunk
+ chr_offset += num_pheno[phenotype_chr[p-1]];
+ phenotype_cluster.push_back(vector < int > (1, p));
+ } else if (((p-chr_offset) % chunk_size[phenotype_chr[p]])==0) { // new chunk
+ phenotype_cluster.push_back(vector < int > (1, p));
+ } else { // add to current chunk
+ phenotype_cluster.back().push_back(p);
+ }
+ }
}
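
The rewritten clusterizePhenotypes() above swaps the old split-the-largest-cluster loop for a two-pass scheme: first decide how many chunks each chromosome gets, then cut the chromosome-sorted phenotype list into chunks of near-equal size. Below is a minimal, self-contained C++ sketch of that scheme; splitIntoChunks() and its arguments are illustrative stand-ins, not FastQTL's API.

// Sketch of the per-chromosome chunking used by clusterizePhenotypes() above.
// Every chromosome gets at least one chunk; the remaining chunks go to the
// chromosome whose current chunk size is largest.
#include <cmath>
#include <iostream>
#include <map>
#include <string>
#include <vector>

std::vector<std::vector<int>> splitIntoChunks(const std::vector<std::string>& chr, int K) {
    std::map<std::string, int> num_pheno;              // phenotypes per chromosome
    for (const auto& c : chr) num_pheno[c]++;

    std::map<std::string, int> chunk_size(num_pheno);  // current size of one chunk per chr
    std::map<std::string, int> num_chunks;
    for (const auto& kv : num_pheno) num_chunks[kv.first] = 1;

    // Hand out the K - #chr remaining chunks greedily (the real code errors out if K < #chr).
    for (int i = 0; i < K - (int)num_pheno.size(); ++i) {
        std::string max_key;
        int max_chunk = 0;
        for (const auto& kv : chunk_size)
            if (kv.second > max_chunk) { max_chunk = kv.second; max_key = kv.first; }
        num_chunks[max_key]++;
        chunk_size[max_key] = (int)std::ceil((double)num_pheno[max_key] / num_chunks[max_key]);
    }

    // Cut the (chromosome-sorted) phenotype list according to the chunk sizes.
    std::vector<std::vector<int>> clusters(1, std::vector<int>(1, 0));
    int chr_offset = 0;
    for (int p = 1; p < (int)chr.size(); ++p) {
        if (chr[p] != chr[p - 1]) {                               // new chromosome -> new chunk
            chr_offset += num_pheno[chr[p - 1]];
            clusters.push_back({p});
        } else if ((p - chr_offset) % chunk_size[chr[p]] == 0) {  // current chunk is full
            clusters.push_back({p});
        } else {
            clusters.back().push_back(p);
        }
    }
    return clusters;
}

int main() {
    // Toy example: 5 phenotypes on chr1, 3 on chr2, split into 4 chunks.
    std::vector<std::string> chr = {"1","1","1","1","1","2","2","2"};
    for (const auto& chunk : splitIntoChunks(chr, 4)) {
        for (int p : chunk) std::cout << p << " ";
        std::cout << "\n";
    }
    return 0;
}

Compared with the previous midpoint-splitting approach, this should keep chunk sizes within a chromosome close to equal, and therefore the per-chunk workload more even.
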
bool data::setPhenotypeRegion(string reg) {
- return regionPhenotype.set(reg);
+ return regionPhenotype.set(reg);
}
void data::setPhenotypeRegion(int k) {
- regionPhenotype.chr = phenotype_chr[phenotype_cluster[k][0]];
- regionPhenotype.start = phenotype_start[phenotype_cluster[k][0]];
- regionPhenotype.end = phenotype_end[phenotype_cluster[k].back()];
+ regionPhenotype.chr = phenotype_chr[phenotype_cluster[k][0]];
+ regionPhenotype.start = phenotype_start[phenotype_cluster[k][0]];
+ regionPhenotype.end = phenotype_end[phenotype_cluster[k].back()];
}
string data::getPhenotypeRegion(int k) {
- return string (phenotype_chr[phenotype_cluster[k][0]] + ":" + sutils::int2str(phenotype_start[phenotype_cluster[k][0]]) + "-" + sutils::int2str(phenotype_end[phenotype_cluster[k].back()]));
+ return string (phenotype_chr[phenotype_cluster[k][0]] + ":" + sutils::int2str(phenotype_start[phenotype_cluster[k][0]]) + "-" + sutils::int2str(phenotype_end[phenotype_cluster[k].back()]));
}
void data::deduceGenotypeRegion(int W) {
- regionGenotype.chr = regionPhenotype.chr;
- regionGenotype.start = regionPhenotype.start - W;
- if (regionGenotype.start < 0) regionGenotype.start = 0;
- regionGenotype.end = regionPhenotype.end + W;
+ regionGenotype.chr = regionPhenotype.chr;
+ regionGenotype.start = regionPhenotype.start - W;
+ if (regionGenotype.start < 0) regionGenotype.start = 0;
+ regionGenotype.end = regionPhenotype.end + W;
}
void data::imputeGenotypes() {
- LOG.println("\nImputing missing genotypes");
- for (int g = 0; g < genotype_count ; g ++) {
- double mean = 0.0;
- int c_mean = 0;
- for (int s = 0; s < sample_count ; s ++) {
- if (genotype_orig[g][s] >= 0) {
- mean += genotype_orig[g][s];
- c_mean ++;
- }
- }
- mean /= c_mean;
- for (int s = 0; s < sample_count ; s ++) if (genotype_orig[g][s] < 0) genotype_orig[g][s] = mean;
- }
+ LOG.println("\nImputing missing genotypes");
+ for (int g = 0; g < genotype_count ; g ++) {
+ double mean = 0.0;
+ int c_mean = 0;
+ for (int s = 0; s < sample_count ; s ++) {
+ if (genotype_orig[g][s] >= 0) {
+ mean += genotype_orig[g][s];
+ c_mean ++;
+ }
+ }
+ mean /= c_mean;
+ for (int s = 0; s < sample_count ; s ++) if (genotype_orig[g][s] < 0) genotype_orig[g][s] = mean;
+ }
}
void data::imputePhenotypes() {
- LOG.println("\nImputing missing phenotypes");
- for (int p = 0; p < phenotype_count ; p ++) {
- double mean = 0.0;
- int c_mean = 0;
- for (int s = 0; s < sample_count; s ++) {
- if (!isnan(phenotype_orig[p][s])) {
- mean += phenotype_orig [p][s];
- c_mean ++;
- }
- }
- mean /= c_mean;
- for (int s = 0; s < sample_count ; s ++) if (isnan(phenotype_orig[p][s])) phenotype_orig[p][s] = mean;
- }
+ LOG.println("\nImputing missing phenotypes");
+ for (int p = 0; p < phenotype_count ; p ++) {
+ double mean = 0.0;
+ int c_mean = 0;
+ for (int s = 0; s < sample_count; s ++) {
+ if (!isnan(phenotype_orig[p][s])) {
+ mean += phenotype_orig [p][s];
+ c_mean ++;
+ }
+ }
+ mean /= c_mean;
+ for (int s = 0; s < sample_count ; s ++) if (isnan(phenotype_orig[p][s])) phenotype_orig[p][s] = mean;
+ }
}
void data::normalTranformPhenotypes() {
- string method = "average";
- LOG.println("\nQuantile normalise phenotypes to Normal distribution");
- for (int p = 0; p < phenotype_count ; p ++) {
- vector < float > R;
+ string method = "average";
+ LOG.println("\nQuantile normalise phenotypes to Normal distribution");
+ for (int p = 0; p < phenotype_count ; p ++) {
+ vector < float > R;
myranker::rank(phenotype_orig[p], R, method);
- double max = 0;
- for (int s = 0 ; s < sample_count ; s ++) {
- R[s] = R[s] - 0.5;
- if (R[s] > max) max = R[s];
- }
- max = max + 0.5;
- for (int s = 0 ; s < sample_count ; s ++) {
- R[s] /= max;
- phenotype_orig[p][s] = qnorm(R[s], 0.0, 1.0, 1, 0);
- }
- }
+ double max = 0;
+ for (int s = 0 ; s < sample_count ; s ++) {
+ R[s] = R[s] - 0.5;
+ if (R[s] > max) max = R[s];
+ }
+ max = max + 0.5;
+ for (int s = 0 ; s < sample_count ; s ++) {
+ R[s] /= max;
+ phenotype_orig[p][s] = qnorm(R[s], 0.0, 1.0, 1, 0);
+ }
+ }
}
void data::initResidualizer() {
- LOG.println("\nInitialize covariate ");
- covariate_engine = new residualizer (sample_count);
- for (int c = 0 ; c < covariate_count ; c ++) covariate_engine->pushHard(covariate_val[c]);
- if (interaction_val.size() > 0) covariate_engine->pushHard(interaction_val);
+ LOG.println("\nInitialize covariate ");
+ covariate_engine = new residualizer (sample_count);
+ for (int c = 0 ; c < covariate_count ; c ++) covariate_engine->pushHard(covariate_val[c]);
+ if (interaction_val.size() > 0) covariate_engine->pushHard(interaction_val);
}
void data::rankTranformPhenotypes() {
- string method = "average";
- LOG.println("\nRanking phenotypes");
- for (int p = 0; p < phenotype_count ; p ++) {
- vector < float > R;
+ string method = "average";
+ LOG.println("\nRanking phenotypes");
+ for (int p = 0; p < phenotype_count ; p ++) {
+ vector < float > R;
myranker::rank(phenotype_orig[p], R, method);
- phenotype_orig[p] = R;
- }
+ phenotype_orig[p] = R;
+ }
}
void data::rankTranformGenotypes() {
- string method = "average";
- LOG.println("\nRanking genotypes");
- for (int g = 0; g < genotype_count ; g ++) {
- vector < float > R;
+ string method = "average";
+ LOG.println("\nRanking genotypes");
+ for (int g = 0; g < genotype_count ; g ++) {
+ vector < float > R;
myranker::rank(genotype_orig[g], R, method);
- genotype_orig[g] = R;
- }
+ genotype_orig[g] = R;
+ }
}
void data::normalize(vector < float > & X) {
- double mean = 0.0, sum = 0.0;
- for (int s = 0; s < sample_count ; s ++) mean += X[s];
- mean /= sample_count;
- for (int s = 0; s < sample_count ; s ++) {
- X[s] -= mean;
- sum += X[s] * X[s];
- }
- sum = sqrt(sum);
- if (sum == 0) sum = 1;
- for (int s = 0; s < sample_count ; s ++) X[s] /= sum;
+ double mean = 0.0, sum = 0.0;
+ for (int s = 0; s < sample_count ; s ++) mean += X[s];
+ mean /= sample_count;
+ for (int s = 0; s < sample_count ; s ++) {
+ X[s] -= mean;
+ sum += X[s] * X[s];
+ }
+ sum = sqrt(sum);
+ if (sum == 0) sum = 1;
+ for (int s = 0; s < sample_count ; s ++) X[s] /= sum;
}
void data::normalize(vector < vector < float > > & X) {
- for (int x = 0 ; x < X.size() ; x++) {
- double mean = 0.0, sum = 0.0;
- for (int s = 0; s < sample_count ; s ++) mean += X[x][s];
- mean /= sample_count;
- for (int s = 0; s < sample_count ; s ++) {
- X[x][s] -= mean;
- sum += X[x][s] * X[x][s];
- }
- sum = sqrt(sum);
- if (sum == 0) sum = 1;
- for (int s = 0; s < sample_count ; s ++) X[x][s] /= sum;
- }
+ for (int x = 0 ; x < X.size() ; x++) {
+ double mean = 0.0, sum = 0.0;
+ for (int s = 0; s < sample_count ; s ++) mean += X[x][s];
+ mean /= sample_count;
+ for (int s = 0; s < sample_count ; s ++) {
+ X[x][s] -= mean;
+ sum += X[x][s] * X[x][s];
+ }
+ sum = sqrt(sum);
+ if (sum == 0) sum = 1;
+ for (int s = 0; s < sample_count ; s ++) X[x][s] /= sum;
+ }
}
void data::correct(vector < float > & X, vector < float > & Y) {
- double corr = getCorrelation(X, Y);
- for (int s = 0 ; s < sample_count ; s ++) Y[s] = Y[s] - corr * X[s];
+ double corr = getCorrelation(X, Y);
+ for (int s = 0 ; s < sample_count ; s ++) Y[s] = Y[s] - corr * X[s];
}
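
The normalize() and correct() member functions above are only re-indented by this commit, but they carry the centering, scaling and projection arithmetic used throughout the code. A minimal standalone sketch of the same arithmetic follows (free functions instead of data:: members, names illustrative): vectors are mean-centered and scaled to unit Euclidean norm, after which the correlation used by correct() reduces to a dot product and the correction removes the component of Y along X.

// Sketch of normalize()/correct(): center, scale to unit norm, residualize.
#include <cmath>
#include <iostream>
#include <vector>

void normalizeVec(std::vector<float>& X) {
    double mean = 0.0, sum = 0.0;
    for (float x : X) mean += x;
    mean /= X.size();
    for (float& x : X) { x -= mean; sum += x * x; }
    sum = std::sqrt(sum);
    if (sum == 0) sum = 1;                       // guard against constant vectors
    for (float& x : X) x /= sum;
}

// For zero-mean, unit-norm vectors, the Pearson correlation is the dot product.
double dot(const std::vector<float>& X, const std::vector<float>& Y) {
    double d = 0.0;
    for (size_t s = 0; s < X.size(); ++s) d += X[s] * Y[s];
    return d;
}

// What correct() does with getCorrelation(): remove from Y its projection onto X.
void residualize(const std::vector<float>& X, std::vector<float>& Y) {
    double corr = dot(X, Y);
    for (size_t s = 0; s < Y.size(); ++s) Y[s] -= corr * X[s];
}

int main() {
    std::vector<float> X = {1, 2, 3, 4};
    std::vector<float> Y = {2, 4, 6, 9};
    normalizeVec(X);
    normalizeVec(Y);
    residualize(X, Y);
    std::cout << "correlation after residualizing: " << dot(X, Y) << "\n";  // ~0
    return 0;
}
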
=====================================
src/readCovariates.cpp
=====================================
@@ -35,33 +35,34 @@ void data::readCovariates(string fcov) {
if (buffer.size() == 0) LOG.error("No header line detected!");
sutils::tokenize(buffer, str, "\t");
for (int t = 1 ; t < str.size() ; t ++) {
- if (checkSample(str[t], false)) {
+ string istr = sutils::remove_spaces(str[t]);
+ if (checkSample(istr, false)) {
int idx_sample = -1;
- for (int i = 0 ; i < sample_count && idx_sample < 0 ; i++) if (sample_id[i] == sutils::remove_spaces(str[t])) idx_sample = i;
+ for (int i = 0 ; i < sample_count && idx_sample < 0 ; i++) if (sample_id[i] == istr) idx_sample = i;
mappingS.push_back(idx_sample);
if (idx_sample >= 0) n_includedS ++;
else n_missingS ++;
- } else {
- mappingS.push_back(-1);
- n_excludedS ++;
- }
+ } else {
+ mappingS.push_back(-1);
+ n_excludedS ++;
+ }
}
if (n_includedS != sample_count) LOG.error("Number of overlapping samples in covariate file is " + sutils::int2str(n_includedS) + " and should be " + sutils::int2str(sample_count));
//Read covariates
//int n_type0 = 0, n_type1 = 0;
while(getline(fd, buffer)) {
- if (buffer.size() == 0) continue;
+ if (buffer.size() == 0) continue;
sutils::tokenize(buffer, str, "\t");
if (str.size() < 2) LOG.error("Wrong genotype covariate file format");
if (checkCovariate(str[0])) {
covariate_val.push_back(vector < string > (sample_count, "0"));
for (int t = 1 ; t < str.size() ; t ++) {
if (mappingS[t-1] >= 0) {
- covariate_val.back()[mappingS[t-1]] = str[t];
+ covariate_val.back()[mappingS[t-1]] = sutils::remove_spaces(str[t]);
}
}
- n_includedC ++;
+ n_includedC ++;
} else n_excludedC ++;
}
=====================================
src/readGenotypes.cpp
=====================================
@@ -26,6 +26,8 @@ void data::readGenotypesVCF(string fvcf) {
int n_excludedF = 0;
int n_includedS = 0;
int n_excludedS = 0;
+ int n_excludedMAF = 0;
+ int n_excludedMAFglobal = 0;
int n_missingS = 0;
int n_parsed = 0;
vector < int > mappingS;
@@ -58,48 +60,116 @@ void data::readGenotypesVCF(string fvcf) {
if (!fd.setRegion(regionGenotype.str())) LOG.error("Failed to get region " + regionGenotype.str() + " in [" + fvcf + "]");
LOG.println(" * region = " + regionGenotype.str());
while (fd.getNextLine(buffer)) {
- if (buffer.size() == 0) continue;
+ if (buffer.size() == 0) continue;
sutils::tokenize(buffer, str);
if (checkGenotype(str[2])) {
//Check VCF format
bool gt_field = false;
int idx_field = -1;
- sutils::tokenize(str[8], field, ":");
- for (int f = 0 ; f < field.size() ; f ++) if (field[f] == "DS") idx_field = f;
+ sutils::tokenize(str[8], field, ":"); // FORMAT field
+ for (int f = 0 ; f < field.size() ; f ++) if (field[f] == "DS") idx_field = f; // check if DS present, else use GT
if (idx_field < 0) { for (int f = 0 ; f < field.size() ; f ++) if (field[f] == "GT") idx_field = f; gt_field = true; }
+
+ bool passed_AF = true;
+ if (global_af_threshold>0.0) {
+ // apply AF filter
+ sutils::tokenize(str[7], field, ";"); // INFO field
+ for (int f=0; f<field.size(); ++f) {
+ size_t p = field[f].find("=");
+ if (field[f].substr(0, p)=="AF") {
+ double val = std::stod(field[f].substr(p+1, string::npos));
+ if ((val<global_af_threshold) | (val>1.0-global_af_threshold)) {
+ passed_AF = false;
+ }
+ break;
+ }
+ }
+ }
+
//Read data is format is correct
if (idx_field >= 0) {
- genotype_id.push_back(str[2]);
- genotype_chr.push_back(str[0]);
- genotype_pos.push_back(atoi(str[1].c_str()));
- genotype_orig.push_back(vector < float > (sample_count, 0.0));
- genotype_curr.push_back(vector < float > (sample_count, 0.0));
- for (int t = 9 ; t < str.size() ; t ++) {
- if (mappingS[t-9] >= 0) {
- sutils::tokenize(str[t], field, ":");
- if (str[t] == "." || str[t] == "NN" || str[t] == "NA") genotype_orig.back()[mappingS[t-9]] = -1.0;
- else if (!gt_field) {
- if (field[idx_field][0] == '.') genotype_orig.back()[mappingS[t-9]] = -1.0;
- else {
- float dosage = atof(field[idx_field].c_str());
- //if (dosage < 0 || dosage > 2) LOG.error("Dosages must be between 0 and 2, check: " + field[idx_field]);
- genotype_orig.back()[mappingS[t-9]] = dosage;
+ if (passed_AF) {
+ vector < float > genotype_vec = vector < float > (sample_count, 0.0);
+
+ for (int t = 9 ; t < str.size() ; t ++) {
+ if (mappingS[t-9] >= 0) { // if sample is in include list
+ sutils::tokenize(str[t], field, ":");
+ if (str[t] == "." || str[t] == "NN" || str[t] == "NA") genotype_vec[mappingS[t-9]] = -1.0;
+ else if (!gt_field) {
+ if (field[idx_field][0] == '.') genotype_vec[mappingS[t-9]] = -1.0;
+ else {
+ float dosage = atof(field[idx_field].c_str());
+ //if (dosage < 0 || dosage > 2) LOG.error("Dosages must be between 0 and 2, check: " + field[idx_field]);
+ genotype_vec[mappingS[t-9]] = dosage;
+ }
+ } else {
+ if (field[idx_field][0] == '.' || field[idx_field][2] == '.') genotype_vec[mappingS[t-9]] = -1.0;
+ else {
+ int a0 = atoi(field[idx_field].substr(0, 1).c_str());
+ int a1 = atoi(field[idx_field].substr(2, 1).c_str());
+ int dosage = a0 + a1;
+ if (dosage < 0 || dosage > 2) LOG.error("Genotypes must be 00, 01, or 11, check: " + field[idx_field]);
+ genotype_vec[mappingS[t-9]] = dosage;
+ }
}
- } else {
- if (field[idx_field][0] == '.' || field[idx_field][2] == '.') genotype_orig.back()[mappingS[t-9]] = -1.0;
- else {
- int a0 = atoi(field[idx_field].substr(0, 1).c_str());
- int a1 = atoi(field[idx_field].substr(2, 1).c_str());
- int dosage = a0 + a1;
- if (dosage < 0 || dosage > 2) LOG.error("Genotypes must be 00, 01, or 11, check: " + field[idx_field]);
- genotype_orig.back()[mappingS[t-9]] = dosage;
+ }
+ }
+ // calculate minor allele frequency
+ // for now, replicate previous approach
+ // in the future, use alt_alleles = sum(dosages)
+ int c0 = 0;
+ int c1 = 0;
+ int c2 = 0;
+ int r;
+ for (int i=0; i<genotype_vec.size(); ++i) {
+ if (genotype_vec[i]!=-1.0) {
+ r = round(genotype_vec[i]);
+ if (r==0) {
+ c0++;
+ } else if (r==1) {
+ c1++;
+ } else if (r==2) {
+ c2++;
+ } else {
+ LOG.error("Dosage values must be between 0 and 2");
}
}
}
- }
- n_includedG ++;
- } else n_excludedF ++;
- } else n_excludedG ++;
+ float ref_alleles = 2*c0 + c1;
+ float alt_alleles = c1 + 2*c2;
+ float maf;
+ int ma_samples;
+ int ma_count;
+ int num_samples = c0+c1+c2;
+ int ref_factor;
+ if (ref_alleles >= alt_alleles) {
+ maf = alt_alleles / (float)(2*num_samples);
+ ma_samples = c1+c2;
+ ma_count = alt_alleles;
+ ref_factor = 1;
+ } else {
+ maf = ref_alleles / (float)(2*num_samples);
+ ma_samples = c0+c1;
+ ma_count = ref_alleles;
+ ref_factor = -1;
+ }
+ if (maf>=maf_threshold && ma_samples>=ma_sample_threshold) {
+ genotype_id.push_back(str[2]);
+ genotype_chr.push_back(str[0]);
+ genotype_pos.push_back(atoi(str[1].c_str()));
+ genotype_orig.push_back(genotype_vec);
+ genotype_curr.push_back(vector < float > (sample_count, 0.0));
+ genotype_maf.push_back(maf);
+ genotype_ma_count.push_back(ma_count);
+ genotype_ma_samples.push_back(ma_samples);
+ genotype_ref_factor.push_back(ref_factor);
+ n_includedG++;
+ } else {
+ n_excludedMAF++;
+ }
+ } else n_excludedMAFglobal++;
+ } else n_excludedF++;
+ } else n_excludedG++;
n_parsed++;
if (n_parsed % 100000 == 0) LOG.println(" * " + sutils::int2str(n_parsed) + " lines parsed");
}
@@ -113,5 +183,7 @@ void data::readGenotypesVCF(string fvcf) {
LOG.println(" * " + sutils::int2str(n_includedG) + " sites included");
if (n_excludedG > 0) LOG.println(" * " + sutils::int2str(n_excludedG) + " sites excluded");
if (n_excludedF > 0) LOG.println(" * " + sutils::int2str(n_excludedF) + " sites excluded because of missing GT/DS field");
- if (n_includedG <= 0) LOG.error("No genotypes in this region: " + regionPhenotype.str());
+ if (n_excludedMAF > 0) LOG.println(" * " + sutils::int2str(n_excludedMAF) + " sites excluded because below minor allele thresholds for selected samples");
+ if (n_excludedMAFglobal > 0) LOG.println(" * " + sutils::int2str(n_excludedMAFglobal) + " sites excluded because global minor allele frequency < " + sutils::double2str(global_af_threshold));
+ if (n_includedG <= 0) LOG.error("No genotypes in this region: " + regionPhenotype.str());
}
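
The readGenotypesVCF() changes above defer storing a site until its statistics are known: dosages are parsed into a temporary vector, an optional global AF filter is read from the INFO field, and the site is kept only if its in-sample minor allele frequency and carrier count pass the maf_threshold / ma_sample_threshold cut-offs. A minimal standalone sketch of that per-site computation, assuming the same -1.0 encoding for missing genotypes (struct and function names are illustrative, not FastQTL's API):

// Sketch of the per-site minor allele statistics computed above: round dosages
// to genotype classes, derive MAF, minor allele count and carrier count, and
// decide whether to keep the site.
#include <cmath>
#include <iostream>
#include <vector>

struct SiteStats {
    float maf;        // minor allele frequency among non-missing samples
    int ma_count;     // number of minor alleles observed
    int ma_samples;   // number of samples carrying at least one minor allele
    int ref_factor;   // +1 if ALT is the minor allele, -1 if REF is
};

SiteStats computeSiteStats(const std::vector<float>& dosages) {
    int c0 = 0, c1 = 0, c2 = 0;
    for (float d : dosages) {
        if (d == -1.0f) continue;                // missing genotype
        int r = (int)std::round(d);
        if (r == 0) c0++;
        else if (r == 1) c1++;
        else c2++;                               // r == 2
    }
    float ref_alleles = 2.0f * c0 + c1;
    float alt_alleles = c1 + 2.0f * c2;
    int num_samples = c0 + c1 + c2;
    SiteStats s;
    if (ref_alleles >= alt_alleles) {            // ALT is the minor allele
        s.maf = alt_alleles / (2.0f * num_samples);
        s.ma_count = (int)alt_alleles;
        s.ma_samples = c1 + c2;
        s.ref_factor = 1;
    } else {                                     // REF is the minor allele
        s.maf = ref_alleles / (2.0f * num_samples);
        s.ma_count = (int)ref_alleles;
        s.ma_samples = c0 + c1;
        s.ref_factor = -1;
    }
    return s;
}

int main() {
    // 6 samples: three hom-ref, one het, one hom-alt, one missing.
    std::vector<float> dosages = {0, 0, 0, 1, 2, -1};
    SiteStats s = computeSiteStats(dosages);
    bool keep = (s.maf >= 0.05f) && (s.ma_samples >= 1);   // example thresholds
    std::cout << "maf=" << s.maf << " ma_count=" << s.ma_count
              << " ma_samples=" << s.ma_samples << " keep=" << keep << "\n";
    return 0;
}

As the diff's own comment notes, the rounding-based counts replicate the previous behaviour; summing the dosages directly is flagged there as a possible future change.
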
View it on GitLab: https://salsa.debian.org/med-team/fastqtl/-/compare/8cf06313b080d920898507dc667b015c87877c61...6c34ffd0b461b505f013c715c35e3a1f804953c3