[med-svn] [varscan] 01/02: Imported Upstream version 2.3.6+dfsg
Andreas Tille
tille at debian.org
Tue Apr 15 19:56:00 UTC 2014
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository varscan.
commit 935de6c4b1d0e15170ae79a315edc06045032bd0
Author: Andreas Tille <tille at debian.org>
Date: Tue Apr 15 21:51:06 2014 +0200
Imported Upstream version 2.3.6+dfsg
---
net/sf/varscan/CallMpileup.java | 946 +++++++++++++++++
net/sf/varscan/CallPileup.java | 339 +++++++
net/sf/varscan/Comparison.java | 302 ++++++
net/sf/varscan/CopyCaller.java | 579 +++++++++++
net/sf/varscan/Copynumber.java | 1107 ++++++++++++++++++++
net/sf/varscan/Coverage.java | 423 ++++++++
net/sf/varscan/FilterSomatic.java | 575 +++++++++++
net/sf/varscan/FilterVariants.java | 583 +++++++++++
net/sf/varscan/FishersExact.java | 292 ++++++
net/sf/varscan/LimitVariants.java | 311 ++++++
net/sf/varscan/ProcessSomatic.java | 432 ++++++++
net/sf/varscan/ReadCounts.java | 412 ++++++++
net/sf/varscan/Somatic.java | 1966 ++++++++++++++++++++++++++++++++++++
net/sf/varscan/Trio.java | 1284 +++++++++++++++++++++++
net/sf/varscan/VarScan.java | 1757 ++++++++++++++++++++++++++++++++
15 files changed, 11308 insertions(+)
diff --git a/net/sf/varscan/CallMpileup.java b/net/sf/varscan/CallMpileup.java
new file mode 100644
index 0000000..28590ae
--- /dev/null
+++ b/net/sf/varscan/CallMpileup.java
@@ -0,0 +1,946 @@
+/**
+ * @(#)CallMpileup.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.text.DecimalFormat;
+import java.util.HashMap;
+import java.lang.Math;
+
+/**
+ * A class for calling variants or consensus bases from a SAMtools mpileup file
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class CallMpileup {
+
+ public CallMpileup(String[] args, String callType)
+ {
+ // Define the usage message //
+ String usage = "USAGE: java -jar VarScan.jar mpileup2cns [pileup file] OPTIONS\n" +
+ "\tmpileup file - The SAMtools mpileup file\n" +
+ "\n" +
+ "\tOPTIONS:\n" +
+ "\t--min-coverage\tMinimum read depth at a position to make a call [8]\n" +
+ "\t--min-reads2\tMinimum supporting reads at a position to call variants [2]\n" +
+ "\t--min-avg-qual\tMinimum base quality at a position to count a read [15]\n" +
+ "\t--min-var-freq\tMinimum variant allele frequency threshold [0.01]\n" +
+ "\t--min-freq-for-hom\tMinimum frequency to call homozygote [0.75]\n" +
+ "\t--p-value\tDefault p-value threshold for calling variants [99e-02]\n" +
+ "\t--strand-filter\tIgnore variants with >90% support on one strand [1]\n" +
+ "\t--output-vcf\tIf set to 1, outputs in VCF format\n" +
+ "\t--vcf-sample-list\tFor VCF output, a list of sample names in order, one per line\n" +
+ "\t--variants\tReport only variant (SNP/indel) positions [0]";
+
+ // Set parameter defaults //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+
+ // Set up formatting for p-values //
+ DecimalFormat pvalueFormat = new DecimalFormat("0.####E0");
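+ // With this pattern, a p-value such as 0.0012345 is rendered as "1.2345E-3".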
+
+ // If mpileup2snp or mpileup2indel was called, set the variants parameter //
+
+ if(args[0].equals("mpileup2snp"))
+ params.put("variants", "snp");
+
+ if(args[0].equals("mpileup2indel"))
+ params.put("variants", "indel");
+
+ if(args[0].equals("mpileup2vcf"))
+ params.put("output-vcf", "1");
+
+ // Set parameter defaults //
+
+ int minCoverage = 8;
+ int minReads2 = 2;
+ int minAvgQual = 15;
+ double minVarFreq = 0.01;
+ double minFreqForHom = 0.75;
+ double pValueThreshold = 0.99;
+ double strandPvalueThreshold = 0.01;
+ boolean variantsOnly = false;
+ boolean snpsOnly = false;
+ boolean indelsOnly = false;
+ boolean strandFilter = true;
+ String sampleList = "";
+
+ if(callType.equals("CNS"))
+ {
+ // Set more rigorous parameters for consensus calling
+ minVarFreq = 0.20;
+ pValueThreshold = 0.01;
+ }
+
+ // Adjust parameters based on user input //
+
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+
+ if(params.containsKey("min-reads2"))
+ minReads2 = Integer.parseInt(params.get("min-reads2"));
+
+ if(params.containsKey("min-var-freq"))
+ minVarFreq = Double.parseDouble(params.get("min-var-freq"));
+
+ if(params.containsKey("min-freq-for-hom"))
+ minFreqForHom = Double.parseDouble(params.get("min-freq-for-hom"));
+
+ if(params.containsKey("min-avg-qual"))
+ minAvgQual = Integer.parseInt(params.get("min-avg-qual"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+ if(params.containsKey("strand-filter"))
+ {
+ int filter = Integer.parseInt(params.get("strand-filter"));
+ if(filter > 0)
+ strandFilter = true;
+ else
+ strandFilter = false;
+ }
+
+ if(params.containsKey("vcf-sample-list"))
+ {
+ File samplefile = new File(params.get("vcf-sample-list"));
+ // Parse sample list //
+ if(samplefile.exists())
+ {
+ BufferedReader in = new BufferedReader(new FileReader(samplefile));
+ String line = "";
+ if(in.ready())
+ {
+ while ((line = in.readLine()) != null)
+ {
+ String sampleName = line;
+ if(sampleList.length() > 0)
+ sampleList += "\t";
+ sampleList += sampleName;
+ }
+ }
+ else
+ {
+ System.err.println("Unable to open sample list");
+ }
+
+ in.close();
+ }
+
+ System.err.println("Got the following sample list: ");
+ System.err.println(sampleList);
+ }
+
+
+ if(params.containsKey("variants"))
+ {
+ String variants = params.get("variants");
+
+ // Determine type of variant reporting: all (default), SNPs, or indels //
+ if(variants.equals("snp"))
+ {
+ snpsOnly = true;
+ System.err.println("Only SNPs will be reported");
+ }
+ else if(variants.equals("indel"))
+ {
+ indelsOnly = true;
+ System.err.println("Only indels will be reported");
+ }
+ else
+ {
+ variantsOnly = true;
+ System.err.println("Only variants will be reported");
+ }
+ }
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+ else
+ System.err.println("Warning: No p-value threshold provided, so p-values will not be calculated");
+
+ System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min reads2:\t" + minReads2);
+ System.err.println("Min var freq:\t" + minVarFreq);
+ System.err.println("Min avg qual:\t" + minAvgQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Define the statistics hash and reset counters //
+
+
+ long numBases = 0;
+ long numVariantPositions = 0;
+ long numSNPpositions = 0;
+ long numIndelPositions = 0;
+ long numFailStrandFilter = 0;
+ long numVariantsReported = 0;
+ long numSNPsReported = 0;
+ long numIndelsReported = 0;
+
+
+ int numParsingExceptions = 0;
+
+ // Parse piped input or user-provided pileup file //
+
+ try
+ {
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception while trying to get input" + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ // Proceed if input stream is ready //
+ String vcfHeader = "##fileformat=VCFv4.1";
+
+ if(in != null && in.ready())
+ {
+ // Print a file header //
+ if(!params.containsKey("no-headers"))
+ {
+ if(params.containsKey("output-vcf"))
+ {
+ // Output VCF Header //
+
+ vcfHeader += "\n" + "##source=VarScan2";
+ vcfHeader += "\n" + "##INFO=<ID=ADP,Number=1,Type=Integer,Description=\"Average per-sample depth of bases with Phred score >= " + minAvgQual + "\">";
+ vcfHeader += "\n" + "##INFO=<ID=WT,Number=1,Type=Integer,Description=\"Number of samples called reference (wild-type)\">";
+ vcfHeader += "\n" + "##INFO=<ID=HET,Number=1,Type=Integer,Description=\"Number of samples called heterozygous-variant\">";
+ vcfHeader += "\n" + "##INFO=<ID=HOM,Number=1,Type=Integer,Description=\"Number of samples called homozygous-variant\">";
+ vcfHeader += "\n" + "##INFO=<ID=NC,Number=1,Type=Integer,Description=\"Number of samples not called\">";
+ vcfHeader += "\n" + "##FILTER=<ID=str10,Description=\"Less than 10% or more than 90% of variant supporting reads on one strand\">";
+ vcfHeader += "\n" + "##FILTER=<ID=indelError,Description=\"Likely artifact due to indel reads at this position\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=SDP,Number=1,Type=Integer,Description=\"Raw Read Depth as reported by SAMtools\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Quality Read Depth of bases with Phred score >= " + minAvgQual + "\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RD,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases (reads1)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases (reads2)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=FREQ,Number=1,Type=String,Description=\"Variant allele frequency\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=PVAL,Number=1,Type=String,Description=\"P-value from Fisher's Exact Test\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RBQ,Number=1,Type=Integer,Description=\"Average quality of reference-supporting bases (qual1)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=ABQ,Number=1,Type=Integer,Description=\"Average quality of variant-supporting bases (qual2)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RDF,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases on forward strand (reads1plus)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RDR,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases on reverse strand (reads1minus)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=ADF,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases on forward strand (reads2plus)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=ADR,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases on reverse strand (reads2minus)\">";
+
+ }
+ else
+ {
+ // Output VarScan Header //
+ System.out.println("Chrom\tPosition\tRef\tVar\tCons:Cov:Reads1:Reads2:Freq:P-value\tStrandFilter:R1+:R1-:R2+:R2-:pval\tSamplesRef\tSamplesHet\tSamplesHom\tSamplesNC\tCons:Cov:Reads1:Reads2:Freq:P-value");
+ }
+
+ }
+
+
+
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+ numBases++;
+
+ // Output progress line //
+ if(params.containsKey("verbose") && (numBases % 100000) == 0)
+ System.err.println(numBases + " positions parsed...");
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected pileup format //
+
+ if(lineContents.length > 5 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ if(numBases == 1 && params.containsKey("output-vcf"))
+ {
+ vcfHeader += "\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
+ if(sampleList.length() > 0)
+ {
+ vcfHeader += "\t" + sampleList;
+ }
+ else
+ {
+ // print the VCF sample header //
+ int sampleCounter = 0;
+ for(int colCounter = 3; colCounter <= (lineContents.length - 3); colCounter += 3)
+ {
+ sampleCounter++;
+ vcfHeader += "\tSample" + sampleCounter;
+ }
+
+ }
+
+ System.out.println(vcfHeader);
+ }
+
+
+ String refName = lineContents[0];
+ String position = lineContents[1];
+ String refBase = lineContents[2].toUpperCase();
+ String callDepths = "";
+ String callResults = "";
+ String vcfResults = "";
+ HashMap<String, Integer> varAlleles = new HashMap<String, Integer>();
+ boolean variantFlag = false;
+ boolean snpFlag = false;
+ boolean indelFlag = false;
+ int samplesRef = 0;
+ int samplesHet = 0;
+ int samplesHom = 0;
+ int samplesUncalled = 0;
+
+ // Declare variables for cross-sample calling and strand filter //
+ int allReadDepth = 0;
+ int allReads1plus = 0;
+ int allReads1minus = 0;
+ int allReads2plus = 0;
+ int allReads2minus = 0;
+ double strandPvalue = 1.00;
+ String allReadBases = "";
+ String allReadQualities = "";
+
+ // Call Individual Genotypes for All Samples in Mpileup //
+
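+ // mpileup layout (SAMtools convention): columns 0-2 are chrom, pos, and ref base,
+ // followed by one (depth, bases, quals) triplet per sample, so per-sample data
+ // starts at column index 3 and advances in steps of three.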
+ for(int colCounter = 3; colCounter <= (lineContents.length - 3); colCounter += 3)
+ {
+ int readDepth = 0;
+ String readBases = "";
+ String readQualities = "";
+ String mapQualities = "";
+
+ readDepth = Integer.parseInt(lineContents[colCounter]);
+ readBases = lineContents[colCounter + 1];
+ readQualities = lineContents[colCounter + 2];
+
+ // Append to our long-running total //
+
+ allReadDepth += readDepth;
+ allReadBases = allReadBases + readBases;
+ allReadQualities = allReadQualities + readQualities;
+
+ // Determine if this sample's depth meets our minimum //
+ int qualityDepth = 0;
+ qualityDepth = VarScan.qualityDepth(readQualities, minAvgQual);
+
+ String thisCall = "N" + ":" + qualityDepth + ":-:-:-:-";
+ String thisVCF = "./.:.:" + qualityDepth;
+
+ if(readDepth >= minCoverage && qualityDepth >= minCoverage)
+ {
+ HashMap<String, String> readCounts = VarScan.getReadCounts(refBase, readBases, readQualities, minAvgQual, mapQualities);
+ String positionCall = VarScan.callPosition(refBase, readCounts, "CNS", minReads2, minVarFreq, minAvgQual, pValueThreshold, minFreqForHom);
+
+ if(positionCall.length() > 0)
+ {
+ String[] callLines = positionCall.split("\n");
+
+ // Go thru each line in resulting call list //
+ for(int lineCounter = 0; lineCounter < callLines.length; lineCounter++)
+ {
+ // Determine type of call that was made //
+ String[] callContents = callLines[lineCounter].split("\t");
+ String consBase = callContents[0];
+ int reads1 = Integer.parseInt(callContents[1]);
+ int reads2 = Integer.parseInt(callContents[2]);
+ String varFreq = callContents[3];
+ int strands1 = Integer.parseInt(callContents[4]);
+ int strands2 = Integer.parseInt(callContents[5]);
+ int qual1 = Integer.parseInt(callContents[6]);
+ int qual2 = Integer.parseInt(callContents[7]);
+ double pValue = Double.parseDouble(callContents[8]);
+ int reads1plus = Integer.parseInt(callContents[11]);
+ int reads1minus = Integer.parseInt(callContents[12]);
+ int reads2plus = Integer.parseInt(callContents[13]);
+ int reads2minus = Integer.parseInt(callContents[14]);
+ String varAllele = "";
+
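+ // Convert the call's p-value to a Phred-scaled genotype quality for the GQ field:
+ // GQ = -10 * log10(p), capped at 255. For example, p = 0.01 yields GQ = 20.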
+ double logP = 0;
+ try {
+ logP = 0 - (10 * java.lang.Math.log10(pValue));
+ if(logP > 255)
+ logP = 255;
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
+
+
+ // Capture the variant allele if there is one //
+
+ if(!consBase.equals(refBase) && !consBase.equals("N") && callContents.length > 15)
+ {
+ varAllele = callContents[15];
+
+ // Determine how many variant alleles have been seen //
+
+ int varAlleleNumber = 0;
+
+ // Determine if we've seen the variant and what its number is //
+
+ if(varAlleles.containsKey(varAllele))
+ {
+ varAlleleNumber = varAlleles.get(varAllele);
+ }
+ else
+ {
+ // Not seen yet, so assign the next available variant allele number //
+ varAlleleNumber = varAlleles.size() + 1;
+ varAlleles.put(varAllele, varAlleleNumber);
+ }
+
+ if(VarScan.isHomozygous(consBase))
+ {
+ samplesHom++;
+ thisVCF = varAlleleNumber + "/" + varAlleleNumber;
+ }
+ else
+ {
+ samplesHet++;
+ thisVCF = "0" + "/" + varAlleleNumber;
+ }
+
+ thisVCF += ":" + (int) logP + ":" + readDepth + ":" + qualityDepth;
+ thisVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ thisVCF += ":" + qual1 + ":" + qual2;
+ thisVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+ }
+ else if(consBase.equals(refBase))
+ {
+ // A reference call - recalculate p-value against a possible het //
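+ // That is, re-test the observed ref/var read counts against the 50/50 split
+ // expected for a heterozygote at the same total depth (Fisher's exact test).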
+ int expReads1 = (reads1 + reads2) / 2;
+ int expReads2 = (reads1 + reads2) - expReads1;
+ double newPvalue = VarScan.getSignificance(reads1, reads2, expReads1, expReads2);
+ double newLogP = 0;
+ try {
+ newLogP = 0 - (10 * java.lang.Math.log10(newPvalue));
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
+ thisVCF = "0" + "/" + "0";
+ thisVCF += ":" + (int) newLogP + ":" + readDepth + ":" + qualityDepth;
+ thisVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ thisVCF += ":" + qual1 + ":" + qual2;
+ thisVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+
+ }
+
+
+ thisCall = consBase + ":" + qualityDepth + ":" + reads1 + ":" + reads2 + ":" + varFreq;
+ thisCall += ":" + pvalueFormat.format(pValue);
+
+ if(!consBase.equals(refBase) && !consBase.equals("N"))
+ {
+ variantFlag = true;
+
+ // Flag what type of variant was observed //
+ if(consBase.length() > 1)
+ indelFlag = true;
+ else
+ snpFlag = true;
+
+ // Save reads1plus and reads1minus //
+
+ allReads1plus += reads1plus;
+ allReads1minus += reads1minus;
+ allReads2plus += reads2plus;
+ allReads2minus += reads2minus;
+
+
+ }
+ else
+ {
+ samplesRef++;
+ }
+ }
+
+ }
+ else
+ {
+ samplesUncalled++;
+ }
+
+
+ }
+ else
+ {
+ samplesUncalled++;
+ }
+
+
+ // Add this depth to the list //
+
+ if(callDepths.length() > 0)
+ callDepths += " ";
+
+ callDepths += readDepth;
+
+ // Add this call to the list //
+ if(callResults.length() > 0)
+ callResults = callResults + " ";
+
+ callResults = callResults + thisCall;
+
+ // Add this to the sample VCF string //
+
+ if(vcfResults.length() > 0)
+ vcfResults = vcfResults + "\t";
+
+ vcfResults = vcfResults + thisVCF;
+ }
+
+
+ // Call the cross-sample pileup //
+
+ int qualityDepth = 0;
+ qualityDepth = VarScan.qualityDepth(allReadQualities, minAvgQual);
+ String allMapQualities = "";
+ String allConsensusCall = "N:" + qualityDepth + ":-:-:-:-";
+
+
+
+ if(allReadDepth >= minCoverage && qualityDepth >= minCoverage)
+ {
+ HashMap<String, String> readCounts = VarScan.getReadCounts(refBase, allReadBases, allReadQualities, minAvgQual, allMapQualities);
+ String positionCall = VarScan.callPosition(refBase, readCounts, "CNS", minReads2, minVarFreq, minAvgQual, pValueThreshold, minFreqForHom);
+
+ if(positionCall.length() > 0)
+ {
+ String[] callLines = positionCall.split("\n");
+
+ // Go thru each line in resulting call list //
+ for(int lineCounter = 0; lineCounter < callLines.length; lineCounter++)
+ {
+ // Determine type of call that was made //
+ String[] callContents = callLines[lineCounter].split("\t");
+ String consBase = callContents[0];
+ int reads1 = Integer.parseInt(callContents[1]);
+ int reads2 = Integer.parseInt(callContents[2]);
+ String varFreq = callContents[3];
+ double pValue = Double.parseDouble(callContents[8]);
+ String varAllele = "";
+
+ // Capture the variant allele if there is one //
+
+ if(!consBase.equals(refBase) && callContents.length > 15)
+ {
+ varAllele = callContents[15];
+ if(varAlleles.containsKey(varAllele))
+ {
+ // Variant allele already recorded for this position; keep its existing number //
+ }
+ else
+ {
+ // Not seen yet, so assign the next available variant allele number //
+ int varAlleleNumber = varAlleles.size() + 1;
+ varAlleles.put(varAllele, varAlleleNumber);
+ }
+
+ }
+
+
+ allConsensusCall = consBase + ":" + qualityDepth + ":" + reads1 + ":" + reads2 + ":" + varFreq;
+ allConsensusCall += ":" + pvalueFormat.format(pValue);
+
+ if(!consBase.equals(refBase) && !consBase.equals("N"))
+ {
+ variantFlag = true;
+
+ // Flag what type of variant was observed //
+ if(consBase.length() > 1)
+ indelFlag = true;
+ else
+ snpFlag = true;
+
+ }
+ }
+
+ }
+ else
+ {
+ // NO call made from all-sample pileup //
+ }
+
+
+ }
+ else
+ {
+ // All-sample pileup failed to meet min depth //
+ }
+
+
+ // Get All Variant alleles observed //
+
+ String varBases = "";
+ // First, obtain their unique keys which are in alphanumeric order //
+ String[] sortedKeys = (String[]) varAlleles.keySet().toArray(new String[0]);
+
+ // Create an empty array to put these into sorted order //
+ String[] alleleKeys = new String[sortedKeys.length];
+
+ // Put alleles into this array in their order of occurrence in VCF line //
+ for(String allele : sortedKeys)
+ {
+ int arrayIndex = varAlleles.get(allele) - 1;
+ alleleKeys[arrayIndex] = allele;
+ }
+
+ // Export all variant alleles into a comma-separated string//
+ // This is what's provided in native output, or converted to VCF format //
+ for(String allele : alleleKeys)
+ {
+ if(varBases.length() > 0)
+ varBases += ",";
+
+ varBases += allele;
+ }
+
+ // It's possible that we see no variant here, so we need the proper empty character //
+ if(varBases.length() == 0)
+ varBases = ".";
+
+ // Count whether there was a variant //
+ if(variantFlag)
+ numVariantPositions++;
+ if(snpFlag)
+ numSNPpositions++;
+ if(indelFlag)
+ numIndelPositions++;
+
+ // Determine strand filter status if it's turned on //
+ String strandFilterStatus = "Pass:" + allReads1plus + ":" + allReads1minus + ":" + allReads2plus + ":" + allReads2minus + ":" + pvalueFormat.format(strandPvalue);
+ boolean failedStrandFilter = false;
+
+ if(strandFilter && variantFlag && (allReads1plus > 0 || allReads1minus > 0 || allReads2plus > 0 || allReads2plus > 0))
+ {
+ double refStrandPlus = 0.50;
+ double varStrandPlus = 0.50;
+
+ // Calculate strandedness for variant allele //
+
+ if((allReads2plus + allReads2minus) > 0)
+ varStrandPlus = (double) allReads2plus / (double) (allReads2plus + allReads2minus);
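+ // Illustrative example: 19 of 20 variant-supporting reads on the forward strand
+ // gives varStrandPlus = 0.95, which triggers the p-value calculation below.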
+
+ // To save time, only calculate p-value if var strandedness is biased //
+
+ if(varStrandPlus < 0.10 || varStrandPlus > 0.90)
+ {
+ // Calculate strandedness for reference allele if we have 2+ reads //
+
+ if((allReads1plus + allReads1minus) > 1)
+ {
+ refStrandPlus = (double) allReads1plus / (double) (allReads1plus + allReads1minus);
+ strandPvalue = VarScan.getSignificance(allReads1plus, allReads1minus, allReads2plus, allReads2minus);
+ }
+ // Otherwise, only homozygous-variant reads seen, so compare to a 50/50 distribution //
+ else
+ {
+ // Compare to expected 50/50 distribution //
+ int testReads1plus = (int) (allReads2plus + allReads2minus) / 2;
+ int testReads1minus = (allReads2plus + allReads2minus) - testReads1plus;
+ strandPvalue = VarScan.getSignificance(testReads1plus, testReads1minus, allReads2plus, allReads2minus);
+ }
+
+ strandFilterStatus = "Pass:" + varStrandPlus + ":" + allReads1plus + ":" + allReads1minus + ":" + allReads2plus + ":" + allReads2minus + ":" + pvalueFormat.format(strandPvalue);
+
+ // If ref allele had good strandedness, and var allele did not, this may be a failure //
+ if(refStrandPlus >= 0.10 && refStrandPlus <= 0.90 && !(varStrandPlus >= 0.10 && varStrandPlus <= 0.90))
+ {
+ if(strandPvalue < strandPvalueThreshold)
+ {
+ strandFilterStatus = "Fail:" + allReads1plus + ":" + allReads1minus + ":" + allReads2plus + ":" + allReads2minus + ":" + pvalueFormat.format(strandPvalue);
+ numFailStrandFilter++;
+ failedStrandFilter = true;
+ }
+ }
+ }
+
+ }
+
+
+
+ String outLine = refName + "\t" + position + "\t";
+
+ if(params.containsKey("output-vcf"))
+ {
+ // Calculate average sample depth //
+ int avgQualityDepth = qualityDepth / (samplesRef + samplesHet + samplesHom + samplesUncalled);
+ String refColumn = "";
+ String varColumn = "";
+
+ // Handle complex positions with multiple alleles including at least one indel //
+
+ if(varBases.contains(",") && (varBases.contains("-") || varBases.contains("+")))
+ {
+ // Multi-allele indel //
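+ // Illustrative example: ref base A with variant alleles "-CT" and "+G" becomes
+ // REF=ACT and ALT=A,AGCT, since the longest deletion anchors the REF column.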
+ int maxDelSize = 0;
+ String maxDelBases = "";
+ // Go through each varAllele to find longest deletion //
+ String[] varBaseContents = varBases.split(",");
+ for(String varAllele : varBaseContents)
+ {
+ if(varAllele.startsWith("-"))
+ {
+ varAllele = varAllele.replace("-", "");
+ if(varAllele.length() > maxDelSize)
+ {
+ maxDelBases = varAllele;
+ maxDelSize = varAllele.length();
+ }
+ }
+ }
+
+ // Set refBase to maximum del //
+ refColumn = refBase + maxDelBases;
+
+ // Establish each allele in var Column //
+ varColumn = "";
+
+ for(String varAllele : varBaseContents)
+ {
+ if(varColumn.length() > 0)
+ varColumn = varColumn + ",";
+
+ if(varAllele.startsWith("-"))
+ {
+ varAllele = varAllele.replace("-", "");
+
+ // For the smaller deletion, determine ref bases to add //
+ if(varAllele.length() < maxDelSize)
+ {
+ String varEntry = maxDelBases.replace(varAllele, "");
+ varColumn = varColumn + refBase + varEntry;
+ }
+ else
+ {
+ varColumn = varColumn + refBase;
+ }
+ }
+ else if(varAllele.startsWith("+"))
+ {
+ varAllele = varAllele.replace("+", "");
+ String varEntry = refBase + varAllele + maxDelBases;
+ varColumn = varColumn + varEntry;
+ }
+ else
+ {
+ String varEntry = varAllele + maxDelBases;
+ varColumn = varColumn + varEntry;
+ }
+ }
+
+
+ }
+
+ else if(varBases.startsWith("+"))
+ {
+ // INSERTION //
+ // Ref = ref base; Var = ref base followed by inserted bases //
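+ // Illustrative example: ref base C with variant "+AG" becomes REF=C, ALT=CAG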
+ refColumn = refBase;
+ varColumn = refBase + varBases.replace("+", "");
+ }
+ else if(varBases.startsWith("-"))
+ {
+ // DELETION //
+ // Ref = ref base followed by deleted bases; var = ref base //
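+ // Illustrative example: ref base C with variant "-AG" becomes REF=CAG, ALT=C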
+ refColumn = refBase + varBases.replace("-", "");
+ varColumn = refBase;
+ }
+ else
+ {
+ refColumn = refBase;
+ varColumn = varBases;
+ }
+
+ // Ensure that varColumn does not contain any +/- //
+ varColumn = varColumn.replace("+", "");
+ varColumn = varColumn.replace("-", "");
+
+
+ outLine += "." + "\t" + refColumn + "\t" + varColumn + "\t.\t";
+
+ if(strandFilterStatus.contains("Pass"))
+ outLine += "PASS\t";
+ else
+ outLine += "str10\t";
+ outLine += "ADP=" + avgQualityDepth + ";WT=" + samplesRef + ";HET=" + samplesHet + ";HOM=" + samplesHom + ";NC=" + samplesUncalled;
+ outLine += "\t" + "GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR" + "\t";
+ outLine += vcfResults;
+ }
+ else
+ {
+ outLine += refBase + "\t" + varBases + "\t";
+ outLine += allConsensusCall + "\t" + strandFilterStatus + "\t";
+ outLine += samplesRef + "\t" + samplesHet + "\t" + samplesHom + "\t" + samplesUncalled + "\t";
+ outLine += callResults;
+ }
+
+
+
+ // If there was a variant, but strand-filter failed, and only reporting variants:
+ boolean reportFlag = false;
+
+ if(variantFlag && strandFilter && failedStrandFilter)
+ {
+ // Do not print a variant that failed strand-filter unless in CNS mode //
+ if(!variantsOnly && !snpsOnly && !indelsOnly)
+ reportFlag = true;
+ }
+ else if((variantsOnly || snpsOnly || indelsOnly) && !variantFlag)
+ {
+ // Do not print if reporting variants, but no variant was seen //
+ }
+ else if(!variantsOnly && !snpsOnly && !indelsOnly)
+ {
+ // Print consensus if in consensus calling mode //
+ reportFlag = true;
+ }
+ else if(variantFlag && variantsOnly)
+ {
+ // Print any variant if variants flag set //
+ reportFlag = true;
+ }
+ else if(snpFlag && snpsOnly)
+ {
+ // Print SNP variant if SNPs-only flag set //
+ reportFlag = true;
+ }
+ else if(indelFlag && indelsOnly)
+ {
+ // Print indel variant if indels-only flag set //
+ reportFlag = true;
+ }
+ else
+ {
+ // Don't report a consensus call if limited to variant reporting //
+ }
+
+ if(reportFlag)
+ {
+ System.out.println(outLine);
+
+ if(variantFlag)
+ numVariantsReported++;
+ if(snpFlag)
+ numSNPsReported++;
+ if(indelFlag)
+ numIndelsReported++;
+ }
+
+ }
+ else
+ {
+ if(lineContents.length >= 4 && lineContents[3].equals("0"))
+ {
+ // A pileup line with 0x coverage, so ignore
+ }
+ else
+ {
+ System.err.println("Error: Invalid format for pileup at line " + numBases + "\n" + line + "\n");
+ return;
+ }
+
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+ numParsingExceptions++;
+ if(numParsingExceptions >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+
+
+ }
+
+ in.close();
+
+ System.err.println(numBases + " bases in pileup file");
+ System.err.println(numVariantPositions + " variant positions (" + numSNPpositions + " SNP, " + numIndelPositions + " indel)");
+ System.err.println(numFailStrandFilter + " failed the strand-filter");
+ System.err.println(numVariantsReported + " variant positions reported (" + numSNPsReported + " SNP, " + numIndelsReported + " indel)");
+ }
+ // Insufficient input was provided, so print usage //
+ else
+ {
+ System.err.println("Please provide an input file!\n" + usage);
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+ }
+}
diff --git a/net/sf/varscan/CallPileup.java b/net/sf/varscan/CallPileup.java
new file mode 100644
index 0000000..d289312
--- /dev/null
+++ b/net/sf/varscan/CallPileup.java
@@ -0,0 +1,339 @@
+/**
+ * @(#)CallPileup.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.util.HashMap;
+
+/**
+ * A class for calling variants or consensus bases from a pileup file
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class CallPileup {
+
+ public CallPileup(String[] args, String callType)
+ {
+ // Define the usage message //
+ String usage = "USAGE: java -jar VarScan.jar pileup2cns [pileup file] OPTIONS\n" +
+ "\tpileup file - The SAMtools pileup file\n" +
+ "\n" +
+ "\tOPTIONS:\n" +
+ "\t--min-coverage\tMinimum read depth at a position to make a call [8]\n" +
+ "\t--min-reads2\tMinimum supporting reads at a position to call variants [2]\n" +
+ "\t--min-avg-qual\tMinimum base quality at a position to count a read [15]\n" +
+ "\t--min-var-freq\tMinimum variant allele frequency threshold [0.01]\n" +
+ "\t--min-freq-for-hom\tMinimum frequency to call homozygote [0.75]\n" +
+ "\t--p-value\tDefault p-value threshold for calling variants [99e-02]\n" +
+ "\t--variants\tReport only variant (SNP/indel) positions [0]";
+
+ // Set parameter defaults //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ int minCoverage = 8;
+ int minReads2 = 2;
+ int minAvgQual = 15;
+ double minVarFreq = 0.01;
+ double minFreqForHom = 0.75;
+ double pValueThreshold = 0.99;
+
+ if(callType.equals("CNS"))
+ {
+ // Set more rigorous parameters for consensus calling
+ minVarFreq = 0.20;
+ pValueThreshold = 0.01;
+ }
+
+ // Adjust parameters based on user input //
+
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+
+ if(params.containsKey("min-reads2"))
+ minReads2 = Integer.parseInt(params.get("min-reads2"));
+
+ if(params.containsKey("min-var-freq"))
+ minVarFreq = Double.parseDouble(params.get("min-var-freq"));
+
+ if(params.containsKey("min-freq-for-hom"))
+ minFreqForHom = Double.parseDouble(params.get("min-freq-for-hom"));
+
+ if(params.containsKey("min-avg-qual"))
+ minAvgQual = Integer.parseInt(params.get("min-avg-qual"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+ else
+ System.err.println("Warning: No p-value threshold provided, so p-values will not be calculated");
+
+ System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min reads2:\t" + minReads2);
+ System.err.println("Min var freq:\t" + minVarFreq);
+ System.err.println("Min avg qual:\t" + minAvgQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Define the statistics hash and reset counters //
+
+
+ long numBases = 0;
+ long numCovered = 0;
+ long numCalled = 0;
+ long calledRef = 0;
+ long calledIndel = 0;
+ long calledSNP = 0;
+ int numParsingExceptions = 0;
+
+ // Parse piped input or user-provided pileup file //
+
+ try
+ {
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception while trying to get input" + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ // Proceed if input stream is ready //
+
+ if(in != null && in.ready())
+ {
+ // Print a file header //
+ if(!params.containsKey("no-headers"))
+ System.out.println("Chrom\tPosition\tRef\tCons\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\tMapQual1\tMapQual2\tReads1Plus\tReads1Minus\tReads2Plus\tReads2Minus\tVarAllele");
+
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+ numBases++;
+
+ // Output progress line //
+ if(params.containsKey("verbose") && (numBases % 100000) == 0)
+ System.err.println(numBases + " positions parsed...");
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected pileup format //
+
+ if(lineContents.length > 5 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ String refName = "";
+ String position = "";
+ String refBase = "";
+ int readDepth = 0;
+ String readBases = "";
+ String readQualities = "";
+ String mapQualities = "";
+
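+ // An illustrative six-column pileup line (chrom, pos, ref, depth, bases, quals):
+ //   chr1    10016   A       12      .,.....,,.,.    BBBBBBBBBBBB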
+ // Pileup Files have 6-7 columns //
+ if(lineContents.length <= 7)
+ {
+ refName = lineContents[0];
+ position = lineContents[1];
+ refBase = lineContents[2].toUpperCase();
+ readDepth = Integer.parseInt(lineContents[3]);
+ readBases = lineContents[4];
+ readQualities = lineContents[5];
+ mapQualities = "";
+ if(lineContents.length > 6) // Get Map Qualities if available //
+ mapQualities = lineContents[6];
+ }
+ // Pileup lines in CNS files have 10-11 columns
+ else if (lineContents.length >= 10 && lineContents.length <= 11)
+ {
+ refName = lineContents[0];
+ position = lineContents[1];
+ refBase = lineContents[2].toUpperCase();
+ readDepth = Integer.parseInt(lineContents[7]);
+ readBases = lineContents[8];
+ readQualities = lineContents[9];
+ mapQualities = "";
+ if(lineContents.length > 10) // Get Map Qualities if available //
+ mapQualities = lineContents[10];
+ }
+ // Indel calls in CNS files have 15-16 columns
+ else if(lineContents.length >= 15 && lineContents.length <= 16)
+ {
+ // Ignore these //
+ }
+
+
+ if(readDepth >= minCoverage && VarScan.qualityDepth(readQualities, minAvgQual) >= minCoverage)
+ {
+ numCovered++;
+
+ // Build brief pileup string //
+
+ HashMap<String, String> readCounts = VarScan.getReadCounts(refBase, readBases, readQualities, minAvgQual, mapQualities);
+
+ String positionCall = VarScan.callPosition(refBase, readCounts, callType, minReads2, minVarFreq, minAvgQual, pValueThreshold, minFreqForHom);
+
+ if(positionCall.length() > 0)
+ {
+ numCalled++;
+ String[] callLines = positionCall.split("\n");
+
+ // Go thru each line in resulting call list //
+ for(int lineCounter = 0; lineCounter < callLines.length; lineCounter++)
+ {
+ // Determine type of call that was made //
+ String[] callContents = callLines[lineCounter].split("\t");
+ String consBase = callContents[0];
+
+ if(consBase.equals(refBase))
+ {
+ calledRef++;
+ }
+ else if(consBase.length() > 1)
+ {
+ calledIndel++;
+ }
+ else
+ {
+ calledSNP++;
+ }
+
+ // Print some results //
+ if(params.containsKey("variants") && (consBase.equals(refBase) || consBase.equals("N")))
+ {
+ // Don't print ref base if only printing variants //
+ }
+ else
+ {
+ System.out.println(refName + "\t" + position + "\t" + refBase + "\t" + callLines[lineCounter]);
+ }
+ }
+ }
+
+
+ }
+ else
+ {
+ // Either raw depth or quality depth did not meet minimum //
+ if(readDepth >= minCoverage)
+ {
+ // Raw depth was enough, but quality depth was not //
+ //System.err.println("Raw depth = " + readDepth + " Qual depth = " + VarScan.qualityDepth(readQualities, minAvgQual) + " < " + minCoverage);
+ }
+ }
+ }
+ else
+ {
+ System.err.println("Error: Invalid format for pileup at line " + numBases + "\n" + line + "\n");
+ return;
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+ numParsingExceptions++;
+ if(numParsingExceptions >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+
+
+ }
+
+ in.close();
+
+ System.err.println(numBases + " bases in pileup file");
+ System.err.println(numCovered + " met minimum coverage of " + minCoverage + "x");
+
+ if(callType.equals("SNP"))
+ System.err.println(calledSNP + " SNPs predicted");
+ else if(callType.equals("INDEL"))
+ System.err.println(calledIndel + " indels predicted");
+ else //CNS //
+ {
+ System.err.println(numCalled + " positions were called");
+ System.err.println(calledRef + " called Reference");
+ System.err.println(calledSNP + " called SNP");
+ System.err.println(calledIndel + " called indel");
+ }
+ }
+ // Insufficient input was provided, so print usage //
+ else
+ {
+ System.err.println("Please provide an input file!\n" + usage);
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+ }
+}
diff --git a/net/sf/varscan/Comparison.java b/net/sf/varscan/Comparison.java
new file mode 100644
index 0000000..1ac4e4b
--- /dev/null
+++ b/net/sf/varscan/Comparison.java
@@ -0,0 +1,302 @@
+/**
+ * @(#)Comparison.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.util.HashMap;
+import java.util.BitSet;
+
+/**
+ * A class for comparing positions (variants) between two files
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class Comparison {
+
+ public Comparison(String[] args)
+ {
+ String usage = "USAGE: java -jar VarScan.jar compare [file1] [file2] [type] [output] OPTIONS\n" +
+ "\tfile1 - A file of chromosome-positions, tab-delimited\n" +
+ "\tfile2 - A file of chromosome-positions, tab-delimited\n" +
+ "\ttype - Type of comparison [intersect|merge|unique1|unique2]\n" +
+ "\toutput - Output file for the comparison result\n";
+
+ if(args.length < 5)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // Get the required arguments //
+ String fileName1 = args[1];
+ String fileName2 = args[2];
+ String comparisonType = args[3];
+ String outFileName = args[4];
+
+ System.err.println("File 1: " + fileName1);
+ System.err.println("File 2: " + fileName2);
+
+ try
+ {
+ // Declare output file //
+ PrintStream outFile = null;
+
+ outFile = new PrintStream( new FileOutputStream(outFileName) );
+
+ BufferedReader file1 = new BufferedReader(new FileReader(fileName1));
+ BufferedReader file2 = new BufferedReader(new FileReader(fileName2));
+
+ if(!(file1.ready() && file2.ready()))
+ {
+ System.err.println("ERROR: Invalid input file(s)");
+ return;
+ }
+
+ // Load the positions of both files into a hash //
+ System.err.println("Loading positions from file 1");
+ HashMap<String, BitSet> positionHash1 = loadPositions(fileName1);
+ System.err.println("Loading positions from file 2");
+ HashMap<String, BitSet> positionHash2 = loadPositions(fileName2);
+ System.err.println("Done");
+
+ // Reset counters ///
+
+ int numShared = 0;
+ int uniqueToFile1 = 0;
+ int uniqueToFile2 = 0;
+
+ // Parse the lines in file 1 //
+ String line = "";
+ int lineCounter = 0;
+
+ while ((line = file1.readLine()) != null)
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+ if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+
+ // Declare booleans //
+
+ boolean inFile1 = false;
+ boolean inFile2 = false;
+
+ // Declare a BitSet //
+ BitSet refPositions;
+
+ if(positionHash1.containsKey(refName))
+ {
+ refPositions = positionHash1.get(refName);
+ if(refPositions.get(position))
+ inFile1 = true;
+ }
+
+ if(positionHash2.containsKey(refName))
+ {
+ refPositions = positionHash2.get(refName);
+ if(refPositions.get(position))
+ inFile2 = true;
+ }
+
+ // Check to see if shared //
+ if(inFile1 && inFile2)
+ {
+ numShared++;
+ if(comparisonType.equals("intersect"))
+ {
+ outFile.println(line);
+ }
+ }
+ else if(inFile1)
+ {
+ if(comparisonType.equals("unique1"))
+ outFile.println(line);
+ uniqueToFile1++;
+ }
+
+ // Check to see if merging //
+ if(comparisonType.equals("merge"))
+ {
+ outFile.println(line);
+ }
+ }
+ catch(Exception e)
+ {
+ if(lineCounter == 1)
+ {
+ // Skip the header line //
+ }
+ else
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+ }
+ }
+
+
+ while ((line = file2.readLine()) != null)
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+ if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+
+ // Declare booleans //
+
+ boolean inFile1 = false;
+ boolean inFile2 = false;
+
+ // Declare a BitSet //
+ BitSet refPositions;
+
+ if(positionHash1.containsKey(refName))
+ {
+ refPositions = positionHash1.get(refName);
+ if(refPositions.get(position))
+ inFile1 = true;
+ }
+
+ if(positionHash2.containsKey(refName))
+ {
+ refPositions = positionHash2.get(refName);
+ if(refPositions.get(position))
+ inFile2 = true;
+ }
+
+ // Check to see if shared //
+ if(inFile1 && inFile2)
+ {
+ // Already counted and printed in file 1 //
+ }
+ else if(inFile2)
+ {
+ if(comparisonType.equals("merge") || comparisonType.equals("unique2"))
+ outFile.println(line);
+
+ uniqueToFile2++;
+ }
+
+ }
+ catch(Exception e)
+ {
+ if(lineCounter == 1)
+ outFile.println(line);
+ else
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+ }
+ }
+
+ file1.close();
+ file2.close();
+
+ int numTotal = numShared + uniqueToFile1 + uniqueToFile2;
+ System.err.println(numTotal + " total positions");
+ System.err.println(uniqueToFile1 + " positions unique to file 1");
+ System.err.println(uniqueToFile2 + " positions unique to file 2");
+ System.err.println(numShared + " positions shared");
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+ }
+
+
+ /**
+ * Loads positions from a file into one BitSet per chromosome
+ *
+ * @param fileName Name of a tab-delimited file of chromosome-positions
+ * @return HashMap of BitSets, keyed by chromosome, with observed positions set
+ */
+ static HashMap<String, BitSet> loadPositions(String fileName)
+ {
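+ // One BitSet per chromosome: setting bit N records that a variant at position N
+ // was observed, which keeps membership lookups compact and fast.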
+ HashMap<String, BitSet> positionsByChrom = new HashMap<String, BitSet>();
+
+ try
+ {
+ BufferedReader infile = new BufferedReader(new FileReader(fileName));
+
+ String line = "";
+ int lineCounter = 0;
+
+ while ((line = infile.readLine()) != null)
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+ if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+
+ // Get or create BitSet //
+ BitSet refPositions;
+
+ if(positionsByChrom.containsKey(refName))
+ {
+ refPositions = positionsByChrom.get(refName);
+ }
+ else
+ {
+ refPositions = new BitSet(position + 1);
+ }
+
+ // Set the position to true //
+ refPositions.set(position, true);
+
+ // Return it to the hash //
+ positionsByChrom.put(refName, refPositions);
+ }
+ catch(Exception e)
+ {
+ if(lineCounter > 1)
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+
+
+ }
+ }
+
+ infile.close();
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ }
+
+
+ return(positionsByChrom);
+ }
+
+}
diff --git a/net/sf/varscan/CopyCaller.java b/net/sf/varscan/CopyCaller.java
new file mode 100644
index 0000000..e9b5817
--- /dev/null
+++ b/net/sf/varscan/CopyCaller.java
@@ -0,0 +1,579 @@
+/**
+ * @(#)CopyCaller.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.text.DecimalFormat;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashMap;
+
+/**
+ * A class for calling/GC-adjusting copy number variants from raw somatic copynumber output
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class CopyCaller {
+ public CopyCaller(String[] args, HashMap<String, String> params)
+ {
+ String usage = "USAGE: java -jar VarScan.jar copyCaller [varScan.copynumber] OPTIONS\n" +
+ "This command will adjust VarScan copynumber output for GC content, apply amp/del thresholds,\n and (optionally) recenter the data\n" +
+ "\tINPUT:\n" +
+ "\tRaw output from the VarScan copynumber command (eg. varScan.output.copynumber)\n\n" +
+ "\tOPTIONS:\n" +
+ "\t--output-file\tOutput file to contain the calls\n" +
+ "\t--output-homdel-file\tOptional output file for candidate homozygous deletions\n" +
+ "\t--min-coverage\tMinimum normal read depth at a position to make a call [20]\n" +
+ "\t--min-tumor-coverage\tMinimum tumor read depth at a position to make a non-homdel call [10]\n" +
+ "\t--max-homdel-coverage\tMaximum depth in tumor for candidate homozygous deletions [5]\n" +
+ "\t--amp-threshold\tLower bound for log ratio to call amplification [0.25]\n" +
+ "\t--del-threshold\tUpper bound for log ratio to call deletion (provide as positive number) [0.25]\n" +
+ "\t--min-region-size\tMinimum size (in bases) for a region to be counted [10]\n" +
+ "\t--recenter-up\tRecenter data around an adjusted baseline > 0 [0]\n" +
+ "\t--recenter-down\tRecenter data around an adjusted baseline < 0 [0]\n";
+
+ // Set parameter defaults //
+
+ String regionsFile = "";
+ String outputFile = "";
+ String homdelFile = "";
+ int minCoverage = 20;
+ int minTumorCoverage = 10;
+ int maxHomdelCoverage = 5;
+ int minRegionSize = 10;
+ double ampThreshold = 0.25;
+ double delThreshold = -0.25;
+ double recenterBaseline = 0.00;
+ Float[] gcLogSum = new Float[101];
+ Integer[] gcLogNum = new Integer[101];
+
+ // Reset GC bin //
+
+ for(int i = 0; i <= 100; i++)
+ {
+ gcLogSum[i] = (float) 0;
+ gcLogNum[i] = 0;
+ }
+
+ // Get any user-provided parameters //
+
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+
+ if(params.containsKey("min-tumor-coverage"))
+ minTumorCoverage = Integer.parseInt(params.get("min-tumor-coverage"));
+
+ if(params.containsKey("max-homdel-coverage"))
+ maxHomdelCoverage = Integer.parseInt(params.get("max-homdel-coverage"));
+
+ if(params.containsKey("min-region-size"))
+ minRegionSize = Integer.parseInt(params.get("min-region-size"));
+
+ if(params.containsKey("amp-threshold"))
+ ampThreshold = Double.parseDouble(params.get("amp-threshold"));
+
+ if(params.containsKey("del-threshold"))
+ {
+ delThreshold = 0 - Double.parseDouble(params.get("del-threshold"));
+ }
+
+ if(params.containsKey("recenter-up"))
+ {
+ recenterBaseline = Double.parseDouble(params.get("recenter-up"));
+ }
+
+ if(params.containsKey("recenter-down"))
+ {
+ recenterBaseline = Double.parseDouble(params.get("recenter-down"));
+ recenterBaseline = 0.00 - recenterBaseline;
+ }
+
+ if(params.containsKey("output-file"))
+ outputFile = params.get("output-file");
+
+ if(params.containsKey("output-homdel-file"))
+ homdelFile = params.get("output-homdel-file");
+
+ System.err.println("Min coverage:\t" + minCoverage);
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ System.err.println("Parsing " + params.get("del-threshold"));
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Define the statistics hash and reset counters //
+
+ HashMap<String, Integer> stats = new HashMap<String, Integer>();
+ stats.put("numRegions", 0);
+ stats.put("metMinDepth", 0);
+ stats.put("metMinSize", 0);
+ stats.put("numAmp", 0);
+ stats.put("numDel", 0);
+ stats.put("numHomDel", 0);
+ stats.put("numNeutral", 0);
+
+ DecimalFormat threeDigits = new DecimalFormat("#0.000");
+
+ HashMap<String, Long> baseCounts = new HashMap<String, Long>();
+ baseCounts.put("numAmp", (long) 0);
+ baseCounts.put("numDel", (long) 0);
+ baseCounts.put("numHomDel", (long) 0);
+ baseCounts.put("numNeutral", (long) 0);
+
+ try
+ {
+ // If output file was provided, open it //
+ PrintStream out = null; // declare a print stream object
+ PrintStream outGC = null;
+ PrintStream outHomdel = null; // declare a print stream object
+
+ if(params.containsKey("output-file"))
+ {
+ out = new PrintStream( new FileOutputStream(outputFile) );
+ out.println("chrom\tchr_start\tchr_stop\tnum_positions\tnormal_depth\ttumor_depth\tadjusted_log_ratio\tgc_content\tregion_call\traw_ratio");
+
+ outGC = new PrintStream ( new FileOutputStream(outputFile + ".gc") );
+ outGC.println("gc\tregions\tavg_log2\tmean_sd_log2");
+ }
+
+ if(params.containsKey("output-homdel-file"))
+ {
+ outHomdel = new PrintStream( new FileOutputStream(homdelFile) );
+ outHomdel.println("chrom\tchr_start\tchr_stop\tnum_positions\tnormal_depth\ttumor_depth\tadjusted_log_ratio\tgc_content\tregion_call\traw_ratio");
+ }
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ if(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+
+ }
+ }
+
+
+ boolean gcWarned = false;
+ // If input was provided, begin parsing it //
+
+ if(in.ready())
+ {
+ while ((line = in.readLine()) != null)
+ {
+ // Output progress line //
+ if(params.containsKey("verbose") && (stats.get("numRegions") % 10000) == 0)
+ System.err.println(stats.get("numRegions") + " regions parsed...");
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected copynumber regions format //
+
+ if(lineContents.length > 4 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ String refName = lineContents[0];
+
+ // Print the header line //
+ if(refName.equals("chrom"))
+ {
+
+ }
+ else if(!refName.equals("chrom"))
+ {
+ stats.put("numRegions", (stats.get("numRegions") + 1));
+
+ long regionStart = Long.parseLong(lineContents[1]);
+ long regionStop = Long.parseLong(lineContents[2]);
+ long numPositions = Long.parseLong(lineContents[3]);
+
+ // Fix locale-parsing issues //
+ float normalDepth = Float.parseFloat(lineContents[4].replace(',', '.'));
+ float tumorDepth = Float.parseFloat(lineContents[5].replace(',', '.'));
+ double logratio = Double.parseDouble(lineContents[6].replace(',', '.'));
+
+ if(recenterBaseline != 0)
+ logratio = logratio - recenterBaseline;
+
+ if(lineContents.length >= 8)
+ {
+ // Apply coverage threshold //
+ if(normalDepth >= minCoverage && tumorDepth >= minTumorCoverage)
+ {
+ float gcContent = Float.parseFloat(lineContents[7].replace(',', '.'));
+ int gcBin = (int) gcContent;
+ if(gcBin >= 0 && gcBin <= 100)
+ {
+ gcLogSum[gcBin] += logratio;
+ gcLogNum[gcBin]++;
+ }
+ }
+
+ }
+ else
+ {
+ // No GC information.. warn if we haven't already //
+ if(!gcWarned)
+ {
+ System.err.println("Warning: Older VarScan copynumber output (without GC content column) detected, so no GC adjustment will be performed");
+ gcWarned = true;
+ }
+ }
+
+
+ }
+ else
+ {
+ // Don't process the header line //
+ }
+ }
+ else
+ {
+ System.err.println("Error: Invalid format for pileup at line " + stats.get("numBases") + "\n" + line + "\n");
+ return;
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+
+ stats.put("numParsingExceptions", (stats.get("numParsingExceptions") + 1));
+
+ if(stats.get("numParsingExceptions") >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ System.exit(11);
+ }
+
+ }
+
+
+ }
+
+ }
+ else
+ {
+ System.err.println("Input was not ready for parsing!");
+ System.exit(10);
+ }
+
+ in.close();
+
+ // Get overall mean copy number //
+ float totalAvgSum = (float) 0;
+ long totalAvgNum = 0;
+ // Print GC content by bin //
+ for(int i = 0; i <= 100; i++)
+ {
+ if(gcLogNum[i] > 0)
+ {
+ totalAvgSum += gcLogSum[i];
+ totalAvgNum += gcLogNum[i];
+ }
+ }
+
+ float totalAvgLog = totalAvgSum / (float) totalAvgNum;
+
+ if(!gcWarned)
+ {
+ System.err.println(totalAvgLog + " was the average log2 of copy number change");
+ System.err.println("Copy number change by GC content bin:");
+ System.err.println("bin\tregions\tavg_log2\tmean_sd");
+ }
+
+ Float[] gcLogMeanSD = new Float[101];
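+ // gcLogMeanSD[i] will hold GC bin i's deviation from the overall mean log2 ratio;
+ // the second pass below subtracts it from each region's ratio (mean-centering by GC).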
+ try
+ {
+ // Print GC content by bin //
+ for(int i = 0; i <= 100; i++)
+ {
+ if(gcLogNum[i] > 0)
+ {
+ float binAvgLog = gcLogSum[i] / (float) gcLogNum[i];
+ float binMeanSD = binAvgLog - totalAvgLog;
+ outGC.println(i + "\t" + gcLogNum[i] + "\t" + binAvgLog + "\t" + binMeanSD);
+ System.err.println(i + "\t" + gcLogNum[i] + "\t" + binAvgLog + "\t" + binMeanSD);
+ gcLogMeanSD[i] = binMeanSD;
+ }
+ else
+ {
+ gcLogMeanSD[i] = (float) 0;
+ }
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Insufficient data for GC adjustment, so adjustment will be skipped");
+ for(int i = 0; i <= 100; i++)
+ {
+ gcLogMeanSD[i] = (float) 0;
+ }
+ }
+
+
+
+ // Re-parse the input file //
+
+ in = VarScan.getInfile(args);
+ // If input was provided, begin parsing it //
+
+
+ if(in.ready())
+ {
+ while ((line = in.readLine()) != null)
+ {
+ // Output progress line //
+ if(params.containsKey("verbose") && (stats.get("numRegions") % 10000) == 0)
+ System.err.println(stats.get("numRegions") + " regions parsed...");
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected copynumber regions format //
+
+ if(lineContents.length > 4 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ String refName = lineContents[0];
+
+ // Skip the header line //
+ if(refName.equals("chrom"))
+ {
+
+ }
+ else if(!refName.equals("chrom"))
+ {
+ stats.put("numRegions", (stats.get("numRegions") + 1));
+
+ long regionStart = Long.parseLong(lineContents[1]);
+ long regionStop = Long.parseLong(lineContents[2]);
+ long numPositions = Long.parseLong(lineContents[3]);
+ float normalDepth = Float.parseFloat(lineContents[4].replace(',', '.'));
+ float tumorDepth = Float.parseFloat(lineContents[5].replace(',', '.'));
+ double logratio = Double.parseDouble(lineContents[6].replace(',', '.'));
+ double adjustedRatio = logratio;
+
+ // If recentering, adjust the adjusted log ratio //
+
+ if(recenterBaseline != 0)
+ adjustedRatio = adjustedRatio - recenterBaseline;
+
+ float gcContent = (float) -1;
+ if(lineContents.length >= 8)
+ {
+ gcContent = Float.parseFloat(lineContents[7].replace(',', '.'));
+ int gcBin = (int) gcContent;
+ // If there was an adjustment for this GC bin, make it so //
+ if(gcBin >= 0 && gcBin <= 100) // && normalDepth >= minCoverage && tumorDepth >= minTumorCoverage
+ {
+ if(gcLogMeanSD[gcBin] != (float) 0)
+ {
+ adjustedRatio = adjustedRatio - gcLogMeanSD[gcBin];
+ }
+ }
+ }
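+ // Worked example (hypothetical numbers): if this region's GC bin averaged
+ // 0.10 above the overall mean log2, a ratio of 0.35 is adjusted to 0.25.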
+
+
+ // Check to see if this position meets minimum depth //
+ long regionSize = regionStop - regionStart + 1;
+
+ if(normalDepth >= minCoverage && tumorDepth >= minTumorCoverage)
+ {
+ stats.put("metMinDepth", (stats.get("metMinDepth") + 1));
+
+ String regionCall = "neutral";
+
+ if(regionSize >= minRegionSize)
+ {
+ stats.put("metMinSize", (stats.get("metMinSize") + 1));
+
+ // Determine class based on user-specified thresholds //
+
+ if(adjustedRatio >= ampThreshold)
+ {
+ stats.put("numAmp", (stats.get("numAmp") + 1));
+ baseCounts.put("numAmp", (baseCounts.get("numAmp") + regionSize));
+
+ regionCall = "amp";
+ }
+ else if(adjustedRatio <= delThreshold)
+ {
+ stats.put("numDel", (stats.get("numDel") + 1));
+ baseCounts.put("numDel", (baseCounts.get("numDel") + regionSize));
+ regionCall = "del";
+ }
+ else
+ {
+ stats.put("numNeutral", (stats.get("numNeutral") + 1));
+ baseCounts.put("numNeutral", (baseCounts.get("numNeutral") + regionSize));
+ }
+
+ String outLine = refName + "\t" + regionStart + "\t" + regionStop + "\t" + numPositions + "\t";
+ outLine += normalDepth + "\t" + tumorDepth + "\t" + threeDigits.format(adjustedRatio) + "\t" + gcContent + "\t" + regionCall + "\t" + logratio;
+
+ // Print to the output file, or to standard error if none was given //
+
+ if(params.containsKey("output-file"))
+ {
+ out.println(outLine);
+ }
+ else
+ {
+ System.err.println(outLine);
+ }
+ }
+
+ }
+ else if(normalDepth >= minCoverage && tumorDepth <= maxHomdelCoverage && regionSize >= minRegionSize && adjustedRatio <= delThreshold)
+ {
+ // Output candidate homozygous deletion //
+ String outLine = refName + "\t" + regionStart + "\t" + regionStop + "\t" + numPositions + "\t";
+ outLine += normalDepth + "\t" + tumorDepth + "\t" + threeDigits.format(adjustedRatio) + "\t" + gcContent + "\thomozygous_deletion\t" + logratio;
+ stats.put("numHomDel", (stats.get("numHomDel") + 1));
+ baseCounts.put("numHomDel", (baseCounts.get("numHomDel") + regionSize));
+ if(params.containsKey("output-homdel-file"))
+ {
+ outHomdel.println(outLine);
+ }
+
+ }
+
+ }
+ else
+ {
+ // Don't process the header line //
+ }
+ }
+ else
+ {
+ System.err.println("Error: Invalid format for pileup at line " + stats.get("numBases") + "\n" + line + "\n");
+ return;
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getMessage() + "\n" + e.getLocalizedMessage());
+ System.err.println(e.toString());
+ stats.put("numParsingExceptions", (stats.get("numParsingExceptions") + 1));
+
+ if(stats.get("numParsingExceptions") >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ System.exit(11);
+ }
+
+ }
+
+
+ }
+
+ }
+ else
+ {
+ System.err.println("Input was not ready for parsing!");
+ System.exit(10);
+ }
+
+ if(params.containsKey("output-file"))
+ {
+ out.close();
+ outGC.close();
+ }
+
+
+
+ // Print summary statistics //
+ System.err.println(stats.get("numRegions") + " raw regions parsed");
+ System.err.println(stats.get("metMinDepth") + " met min depth");
+ System.err.println(stats.get("metMinSize") + " met min size");
+ System.err.println(stats.get("numAmp") + " regions (" + baseCounts.get("numAmp") + " bp)" + " were called amplification (log2 > " + ampThreshold + ")");
+ System.err.println(stats.get("numNeutral") + " regions (" + baseCounts.get("numNeutral") + " bp)" + " were called neutral");
+ System.err.println(stats.get("numDel") + " regions (" + baseCounts.get("numDel") + " bp)" + " were called deletion (log2 <" + delThreshold + ")");
+ System.err.println(stats.get("numHomDel") + " regions (" + baseCounts.get("numHomDel") + " bp)" + " were called homozygous deletion (normal cov >= " + minCoverage + " and tumor cov <= " + maxHomdelCoverage + ")");
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error parsing input: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+ }
+
+
+/**
+ * Processes a region of copy number calls into a tab-delimited result line
+ *
+ * @param regionRef Reference or chromosome name
+ * @param regionStart Start position on reference
+ * @param regionStop Stop position on reference
+ * @param regionCalls Number of positions with calls in the region
+ * @param regionSumNormal Sum of normal read depths across the region
+ * @param regionSumTumor Sum of tumor read depths across the region
+ * @return Tab-delimited region line with average depths and log2 ratio
+ */
+ static String processRegion(String regionRef, long regionStart, long regionStop, int regionCalls, long regionSumNormal, long regionSumTumor)
+ {
+ // Calculate average region depth //
+
+ float regionDepthNormal = (float) regionSumNormal / (float) regionCalls;
+ float regionDepthTumor = (float) regionSumTumor / (float) regionCalls;
+
+ double tumorNormalRatio = 100; // arbitrary high default used when normal depth is zero
+ double log2ratio = 0.00;
+ if(regionDepthNormal > 0)
+ {
+ tumorNormalRatio = regionDepthTumor / regionDepthNormal;
+ log2ratio = Math.log(tumorNormalRatio) / Math.log(2);
+ }
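+ // Worked example (hypothetical sums): 15000 normal and 30000 tumor reads over
+ // 500 calls give depths 30.0 and 60.0, ratio 2.0, and log2ratio 1.0.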
+
+ return(regionRef + "\t" + regionStart + "\t" + regionStop + "\t" + regionCalls + "\t" + regionDepthNormal + "\t" + regionDepthTumor + "\t" + log2ratio);
+
+ }
+}
\ No newline at end of file
diff --git a/net/sf/varscan/Copynumber.java b/net/sf/varscan/Copynumber.java
new file mode 100644
index 0000000..83c3186
--- /dev/null
+++ b/net/sf/varscan/Copynumber.java
@@ -0,0 +1,1107 @@
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.text.DecimalFormat;
+import java.util.Arrays;
+import java.util.HashMap;
+/**
+ * A class for calling copy number variants between a tumor and a matched normal sample
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class Copynumber {
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Constructor with two arguments (string[], boolean) expects mpileup input //
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ public Copynumber(String[] args, boolean isMpileup)
+ {
+ String usage = "USAGE: java -jar VarScan.jar copynumber [normal-tumor.mpileup] [Opt: output] OPTIONS\n" +
+ "\tnormal-tumor.mpileup - The SAMtools mpileup file for Normal and Tumor\n" +
+ "\toutput - Output base name for files\n" +
+ "\nOPTIONS:\n" +
+ "\t--min-base-qual - Minimum base quality to count for coverage [20]\n" +
+ "\t--min-map-qual - Minimum read mapping quality to count for coverage [20]\n" +
+ "\t--min-coverage - Minimum coverage threshold for copynumber segments [20]\n" +
+ "\t--min-segment-size - Minimum number of consecutive bases to report a segment [10]\n" +
+ "\t--max-segment-size - Max size before a new segment is made [100]\n" +
+ "\t--p-value - P-value threshold for significant copynumber change-point [0.01]\n" +
+ "\t--data-ratio - The normal/tumor input data ratio for copynumber adjustment [1.0]\n";
+
+ if(args.length < 2)
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Set parameter defaults //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ // Set up formatting for p-values //
+ DecimalFormat pvalueFormat = new DecimalFormat("0.####E0");
+
+ String outputName = "output";
+
+ if(args.length >= 3 && !args[2].startsWith("-"))
+ {
+ outputName = args[2];
+ }
+
+ // Set parameter defaults //
+
+ int minCoverage = 10;
+ int minBaseQual = 15;
+ int minSegmentSize = 10;
+ int maxSegmentSize = 100;
+ double dataRatio = 1.00;
+ double pValueThreshold = 0.01;
+
+ // Try adjusting any provided parameters based on user input //
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ {
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+ }
+
+ if(params.containsKey("min-base-qual"))
+ minBaseQual = Integer.parseInt(params.get("min-base-qual"));
+
+ if(params.containsKey("min-segment-size"))
+ minSegmentSize = Integer.parseInt(params.get("min-segment-size"));
+
+ if(params.containsKey("max-segment-size"))
+ maxSegmentSize = Integer.parseInt(params.get("max-segment-size"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+ if(params.containsKey("data-ratio"))
+ dataRatio = Double.parseDouble(params.get("data-ratio"));
+
+ System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min avg qual:\t" + minBaseQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Check for correct input //
+
+ if(args.length < 3)
+ {
+ System.err.println("Please provide an output file basename!");
+ System.err.println(usage);
+ System.exit(1);
+ }
+
+
+ // Parse piped input or user-provided pileup file //
+
+ try
+ {
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception while trying to get input" + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ // Proceed if input stream is ready //
+
+ if(in != null && in.ready())
+ {
+ // Declare output file //
+ PrintStream outCopySegments = null; // declare a print stream object for copynumber segments
+
+ outCopySegments = new PrintStream( new FileOutputStream(outputName + ".copynumber") );
+ outCopySegments.println("chrom\tchr_start\tchr_stop\tnum_positions\tnormal_depth\ttumor_depth\tlog2_ratio\tgc_content");
+
+
+ System.err.println("Reading mpileup input...");
+ int numParsingExceptions = 0;
+
+ // Statistics counters //
+ long sharedPositions = 0;
+ long comparedPositions = 0;
+ long rawCopySegments = 0;
+ long goodCopySegments = 0;
+
+ // Set some default parsing variables //
+ String chromNormal = "";
+ String chromTumor = "";
+ String refBase = "";
+ int posNormal = 0;
+ int posTumor = 0;
+
+ // Parameters for copy number calling //
+ String copyChrom = "";
+ int copyStart = 0;
+ int copyStop = 0;
+ int copyDepthNormal = 0;
+ int copyDepthTumor = 0;
+ long copySumNormal = 0;
+ long copySumTumor = 0;
+ long copyPositions = 0;
+ long copyPositionsGC = 0;
+
+ DecimalFormat oneDigit = new DecimalFormat("#0.0");
+ DecimalFormat threeDigits = new DecimalFormat("#0.000");
+
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected pileup format //
+
+ if(lineContents.length > 5 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ sharedPositions++;
+
+ // Parse common fields from line //
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+ refBase = lineContents[2].toUpperCase();
+
+ chromNormal = refName;
+ chromTumor = refName;
+ posNormal = position;
+ posTumor = position;
+
+ // Parse normal, which should be first sample //
+ int normalOffset = 3;
+ int pileupDepthNormal = Integer.parseInt(lineContents[normalOffset]);
+ //String normalBases = lineContents[normalOffset + 1];
+ String normalQualities = lineContents[normalOffset + 2];
+
+ // Parse tumor, which should be second sample //
+ int tumorOffset = 6;
+ int pileupDepthTumor = Integer.parseInt(lineContents[tumorOffset]);
+ //String tumorBases = lineContents[tumorOffset + 1];
+ String tumorQualities = lineContents[tumorOffset + 2];
+
+
+ // If either sample met the minimum coverage and both had at least one read //
+
+// if((pileupDepthNormal >= minCoverage || pileupDepthTumor >= minCoverage) && normalQualities.length() > 0)// && tumorQualities.length() > 0)
+
+ // We want the normal sample to meet the minimum coverage because that's the comparator //
+ if(pileupDepthNormal >= minCoverage && normalQualities.length() > 0)// && tumorQualities.length() > 0)
+ {
+ comparedPositions++;
+ // Get the depth of bases above minimum quality //
+
+ int normalDepth = VarScan.qualityDepth(normalQualities, minBaseQual);
+ int tumorDepth = 0;
+ if(tumorQualities.length() > 0)
+ tumorDepth = VarScan.qualityDepth(tumorQualities, minBaseQual);
+
+ // Determine if we have a copy changepoint //
+ // If this base is not contiguous with the copyRegion
+ // If the normal or tumor depth changes //
+
+ int diffNormal = Math.abs(copyDepthNormal - normalDepth);
+ int diffTumor = Math.abs(copyDepthTumor - tumorDepth);
+ int posDiff = posTumor - copyStop;
+
+ // DETERMINE IF WE CONTINUE THIS REGION OR PROCESS IT AND START A NEW ONE //
+
+ boolean continueFlag = false;
+
+ // If chromosomes differ or contiguity broken, process the region //
+
+ if(posDiff > 2 || !(copyChrom.equals(chromTumor)))
+ {
+ continueFlag = false;
+ }
+ else
+ {
+ if(copyPositions >= maxSegmentSize)
+ {
+ continueFlag = false;
+ }
+ else if(diffNormal <= 2 && diffTumor <= 2)
+ {
+ continueFlag = true;
+ }
+ else
+ {
+ // Do a Fisher's exact test on the copy number changes //
+
+ double changePvalue = VarScan.getSignificance(copyDepthNormal, copyDepthTumor, normalDepth, tumorDepth);
+
+ // If depth change not significant, continue with region //
+ if(changePvalue >= pValueThreshold)
+ {
+ continueFlag = true;
+ }
+ else
+ {
+ continueFlag = false;
+ }
+
+ }
+ }
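+ // Recap of the rule above: a contiguous position extends the region while the
+ // segment is under the maximum size and depths stay within 2 reads of the
+ // region's starting depths (or Fisher's exact test finds no significant
+ // change); otherwise the region is processed and a new one begins here.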
+
+
+ // If continuing, extend this region and don't process yet //
+
+ if(continueFlag)
+ {
+ copySumNormal += normalDepth;
+ copySumTumor += tumorDepth;
+ copyPositions++;
+ if(refBase.equals("G") || refBase.equals("C") || refBase.equals("g") || refBase.equals("c"))
+ copyPositionsGC++;
+ copyStop = posTumor;
+ }
+
+ // Otherwise, process this region (if it qualifies) and start a new one //
+
+ else
+ {
+ if(copyPositions >= minSegmentSize)
+ {
+ rawCopySegments++;
+ String regionResults = processCopyRegion(copyChrom, copyStart, copyStop, copyPositions, copyPositionsGC, copySumNormal, copySumTumor, minCoverage, dataRatio);
+
+ if(regionResults.length() > 0)
+ {
+ outCopySegments.println(regionResults);
+ goodCopySegments++;
+ }
+ }
+
+ // Start a new copyNumber region //
+ copyChrom = chromTumor;
+ copyStart = posTumor;
+ copyStop = posTumor;
+ copyDepthNormal = normalDepth;
+ copyDepthTumor = tumorDepth;
+ copySumNormal = normalDepth;
+ copySumTumor = tumorDepth;
+ copyPositions = 1;
+ if(refBase.equals("G") || refBase.equals("C") || refBase.equals("g") || refBase.equals("c"))
+ copyPositionsGC = 1;
+ else
+ copyPositionsGC = 0;
+ }
+
+
+ }
+ else
+ {
+ // Minimum normal coverage was not met at this position, so finalize //
+ // the current region if it meets the minimum segment size //
+ if(copyPositions >= minSegmentSize)
+ {
+ rawCopySegments++;
+ String regionResults = processCopyRegion(copyChrom, copyStart, copyStop, copyPositions, copyPositionsGC, copySumNormal, copySumTumor, minCoverage, dataRatio);
+
+ if(regionResults.length() > 0)
+ {
+ outCopySegments.println(regionResults);
+ goodCopySegments++;
+ }
+ }
+
+ // Reset the copyNumber region //
+ copyChrom = "";
+ copyStart = 0;
+ copyStop = 0;
+ copyDepthNormal = 0;
+ copyDepthTumor = 0;
+ copySumNormal = 0;
+ copySumTumor = 0;
+ copyPositions = 0;
+ copyPositionsGC = 0;
+ }
+
+ }
+ else
+ {
+ System.err.println("Error: Invalid format or not enough samples in mpileup: " + line + "\n");
+ return;
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+ numParsingExceptions++;
+ if(numParsingExceptions >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+
+
+ }
+
+ // Process the last region if it meets the minimum segment size //
+ if(copyPositions >= minSegmentSize)
+ {
+ rawCopySegments++;
+ String regionResults = processCopyRegion(copyChrom, copyStart, copyStop, copyPositions, copyPositionsGC, copySumNormal, copySumTumor, minCoverage, dataRatio);
+
+ if(regionResults.length() > 0)
+ {
+ outCopySegments.println(regionResults);
+ goodCopySegments++;
+ }
+ }
+
+ in.close();
+
+ System.err.println(sharedPositions + " positions in mpileup"); //stats.get("sharedPositions")
+ System.err.println(comparedPositions + " had sufficient coverage for comparison"); //stats.get("comparedPositions")
+ System.err.println(rawCopySegments + " raw copynumber segments with size >= " + minSegmentSize);
+ System.err.println(goodCopySegments + " good copynumber segments with depth >= " + minCoverage);
+
+ }
+ else
+ {
+ System.err.println("Input file never ready for parsing (maybe due to file I/O)...");
+ System.exit(10);
+ }
+ }
+ catch (IOException e)
+ {
+ System.err.println("File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+
+
+
+ }
+
+
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Constructor with one argument (string[]) expects independent normal and tumor pileups as input //
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ public Copynumber(String[] args)
+ {
+ String usage = "USAGE: VarScan copynumber [normal_pileup] [tumor_pileup] [Opt: output] OPTIONS\n" +
+ "\tnormal_pileup - The SAMtools pileup file for Normal\n" +
+ "\ttumor_pileup - The SAMtools pileup file for Tumor\n" +
+ "\toutput - Output base name for files\n" +
+ "***If you have a single mpileup, see VarScan copynumber -mpileup 1 -h ***\n" +
+ "\nOPTIONS:\n" +
+ "\t--min-base-qual - Minimum base quality to count for coverage [20]\n" +
+ "\t--min-map-qual - Minimum read mapping quality to count for coverage [20]\n" +
+ "\t--min-coverage - Minimum coverage threshold for copynumber segments [20]\n" +
+ "\t--min-segment-size - Minimum number of consecutive bases to report a segment [10]\n" +
+ "\t--max-segment-size - Max size before a new segment is made [100]\n" +
+ "\t--p-value - P-value threshold for significant copynumber change-point [0.01]\n" +
+ "\t--data-ratio - The normal/tumor input data ratio for copynumber adjustment [1.0]\n";
+
+ if(args.length < 3)
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Get the required arguments //
+ String normalPileupFile = args[1];
+ String tumorPileupFile = args[2];
+
+ String outputName = "output";
+
+ if(args.length >= 4 && !args[3].startsWith("-"))
+ {
+ outputName = args[3];
+ }
+
+ System.err.println("Normal Pileup: " + normalPileupFile);
+ System.err.println("Tumor Pileup: " + tumorPileupFile);
+
+ // Set parameter defaults //
+
+ int minCoverage = 10;
+ int minBaseQual = 15;
+ int minSegmentSize = 10;
+ int maxSegmentSize = 100;
+ double dataRatio = 1.00;
+ double pValueThreshold = 0.01;
+
+ // Parse command-line parameters //
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ // Try adjusting any provided parameters based on user input //
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ {
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+ }
+
+ if(params.containsKey("min-base-qual"))
+ minBaseQual = Integer.parseInt(params.get("min-base-qual"));
+
+ if(params.containsKey("min-segment-size"))
+ minSegmentSize = Integer.parseInt(params.get("min-segment-size"));
+
+ if(params.containsKey("max-segment-size"))
+ maxSegmentSize = Integer.parseInt(params.get("max-segment-size"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+ if(params.containsKey("data-ratio"))
+ dataRatio = Double.parseDouble(params.get("data-ratio"));
+
+ System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min avg qual:\t" + minBaseQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Check for correct input //
+
+ if(args.length < 3)
+ {
+ System.err.println("Please provide an output file basename!");
+ System.err.println(usage);
+ System.exit(1);
+ }
+
+
+ // Statistics counters //
+ long tumorPositions = 0;
+ long sharedPositions = 0;
+ long comparedPositions = 0;
+ long rawCopySegments = 0;
+ long goodCopySegments = 0;
+
+ try
+ {
+ // Declare output file //
+ PrintStream outCopySegments = null; // declare a print stream object for copynumber segments
+
+ outCopySegments = new PrintStream( new FileOutputStream(outputName + ".copynumber") );
+ outCopySegments.println("chrom\tchr_start\tchr_stop\tnum_positions\tnormal_depth\ttumor_depth\tlog2_ratio\tgc_content");
+
+ // Prepare file readers for normal and tumor pileups //
+
+ BufferedReader normal = new BufferedReader(new FileReader(normalPileupFile));
+ BufferedReader tumor = new BufferedReader(new FileReader(tumorPileupFile));
+
+ if(!(normal.ready() && tumor.ready()))
+ {
+ // Delay a few seconds to let SAMtools pileup start outputting //
+ try {
+ Thread.sleep(5000);
+
+ if(!(normal.ready() && tumor.ready()))
+ Thread.sleep(5000);
+
+ if(!(normal.ready() && tumor.ready()))
+ Thread.sleep(5000);
+
+ if(!(normal.ready() && tumor.ready()))
+ Thread.sleep(5000);
+ }
+ catch(Exception e)
+ {
+
+ }
+
+ }
+
+ // Exit if files not ready after waiting //
+
+ if(!(normal.ready() && tumor.ready()))
+ {
+ System.err.println("ERROR: Invalid input file(s)");
+ System.exit(10);
+ }
+
+ String lineNormal;
+ String lineTumor;
+ String chromNormal = "";
+ String chromTumor = "";
+ String prevChromNormal = "";
+ String prevChromTumor = "";
+ String refBase = "";
+ int posNormal = 0;
+ int posTumor = 0;
+ boolean normalWasReset = false; // persists across tumor lines so the normal file is reset at most once per chromosome
+
+ // Parameters for copy number calling //
+ String copyChrom = "";
+ int copyStart = 0;
+ int copyStop = 0;
+ int copyDepthNormal = 0;
+ int copyDepthTumor = 0;
+ long copySumNormal = 0;
+ long copySumTumor = 0;
+ long copyPositions = 0;
+ long copyPositionsGC = 0;
+
+ DecimalFormat oneDigit = new DecimalFormat("#0.0");
+ DecimalFormat threeDigits = new DecimalFormat("#0.000");
+
+
+ // Get first line of Normal //
+
+ if((lineNormal = normal.readLine()) != null)
+ {
+ String[] normalContents = lineNormal.split("\t");
+
+ if(normalContents.length > 1)
+ {
+ chromNormal = normalContents[0];
+ posNormal = Integer.parseInt(normalContents[1]);
+ }
+ }
+
+ // Loop through lines in tumor //
+
+ while ((lineTumor = tumor.readLine()) != null)
+ {
+ tumorPositions++;
+ String[] tumorContents = lineTumor.split("\t");
+
+ if(tumorContents.length > 1)
+ {
+ chromTumor = tumorContents[0];
+ posTumor = Integer.parseInt(tumorContents[1]);
+ }
+
+ // Parse normal lines until we get the same chromosome //
+ boolean flagEOF = false;
+
+ // Advance in normal file if tumor is changed but normal is not, or if tumor is higher //
+ while(!chromNormal.equals(chromTumor) && !chromTumor.equals(prevChromTumor) && !flagEOF && (chromNormal.equals(prevChromTumor) || inSortOrder(chromNormal, chromTumor)))
+ {
+ //System.err.println("Normal (" + chromNormal + ") catching up to " + chromTumor);
+ // Get next line from normal pileup //
+ if((lineNormal = normal.readLine()) != null)
+ {
+ String[] normalContents = lineNormal.split("\t");
+
+ if(normalContents.length > 1)
+ {
+ chromNormal = normalContents[0];
+ posNormal = Integer.parseInt(normalContents[1]);
+ }
+ }
+ else
+ {
+ flagEOF = true;
+ }
+
+
+ }
+
+ // If chromosomes match and are non-blank, attempt to get matching positions //
+ if(chromNormal.equals(chromTumor) && !chromNormal.equals(""))
+ {
+ normalWasReset = false;
+ // Seek to matching Normal Position //
+
+ while(chromNormal.equals(chromTumor) && posNormal < posTumor && ((lineNormal = normal.readLine()) != null))
+ {
+ String[] normalContents = lineNormal.split("\t");
+ if(normalContents.length > 1)
+ {
+ chromNormal = normalContents[0];
+ posNormal = Integer.parseInt(normalContents[1]);
+
+ // If still less than tumor position, look for homozygous del //
+ if(posNormal < posTumor)
+ {
+ int pileupDepthNormal = 0;
+ String normalQualities = "";
+
+ // Pileup Files have 6-7 columns //
+ if(normalContents.length <= 7)
+ {
+ pileupDepthNormal = Integer.parseInt(normalContents[3]);
+ normalQualities = normalContents[5];
+ }
+ // Pileup lines in CNS files have 10-11 columns
+ else if (normalContents.length >= 10 && normalContents.length <= 11)
+ {
+ pileupDepthNormal = Integer.parseInt(normalContents[7]);
+ normalQualities = normalContents[9];
+ }
+
+ }
+ else
+ {
+
+ }
+ }
+ }
+
+ // Seek to matching Tumor Position //
+
+ while(chromNormal.equals(chromTumor) && posTumor < posNormal && ((lineTumor = tumor.readLine()) != null))
+ {
+ tumorContents = lineTumor.split("\t");
+ if(tumorContents.length > 1)
+ {
+ chromTumor = tumorContents[0];
+ posTumor = Integer.parseInt(tumorContents[1]);
+ }
+ }
+
+ // Proceed if normal and tumor positions match //
+
+ if(chromNormal.equals(chromTumor) && posNormal == posTumor)
+ {
+ //stats.put("sharedPositions", (stats.get("sharedPositions") + 1));
+ sharedPositions++;
+ refBase = tumorContents[2];
+
+ // Parse out base qualities //
+ String[] normalContents = lineNormal.split("\t");
+ int pileupDepthNormal = 0;
+ int pileupDepthTumor = 0;
+ String normalQualities = "";
+ String tumorQualities = "";
+
+ // Pileup Files have 6-7 columns //
+ if(normalContents.length <= 7)
+ {
+ pileupDepthNormal = Integer.parseInt(normalContents[3]);
+ normalQualities = normalContents[5];
+ }
+ // Pileup lines in CNS files have 10-11 columns
+ else if (normalContents.length >= 10 && normalContents.length <= 11)
+ {
+ pileupDepthNormal = Integer.parseInt(normalContents[7]);
+ normalQualities = normalContents[9];
+ }
+
+ // Pileup Files have 6-7 columns //
+ if(tumorContents.length <= 7)
+ {
+ tumorQualities = tumorContents[5];
+ pileupDepthTumor = Integer.parseInt(tumorContents[3]);
+ }
+ // Pileup lines in CNS files have 10-11 columns
+ else if (tumorContents.length >= 10 && tumorContents.length <= 11)
+ {
+ tumorQualities = tumorContents[9];
+ pileupDepthTumor = Integer.parseInt(tumorContents[7]);
+ }
+
+ // If either sample met the minimum coverage and both had at least one read //
+
+// if((pileupDepthNormal >= minCoverage || pileupDepthTumor >= minCoverage) && normalQualities.length() > 0 && tumorQualities.length() > 0)
+
+ // We want the normal sample to meet the minimum coverage because that's the comparator //
+ if(pileupDepthNormal >= minCoverage && normalQualities.length() > 0) // && tumorQualities.length() > 0)
+ {
+ comparedPositions++;
+ // Get the depth of bases above minimum quality //
+
+ int normalDepth = VarScan.qualityDepth(normalQualities, minBaseQual);
+ int tumorDepth = VarScan.qualityDepth(tumorQualities, minBaseQual);
+
+ // Determine if we have a copy changepoint //
+ // If this base is not contiguous with the copyRegion
+ // If the normal or tumor depth changes //
+
+ int diffNormal = Math.abs(copyDepthNormal - normalDepth);
+ int diffTumor = Math.abs(copyDepthTumor - tumorDepth);
+ int posDiff = posTumor - copyStop;
+
+ // DETERMINE IF WE CONTINUE THIS REGION OR PROCESS IT AND START A NEW ONE //
+
+ boolean continueFlag = false;
+
+ // If chromosomes differ or contiguity broken, process the region //
+
+ if(posDiff > 2 || !(copyChrom.equals(chromTumor)))
+ {
+ continueFlag = false;
+ }
+ else
+ {
+ if(copyPositions >= maxSegmentSize)
+ {
+ continueFlag = false;
+ }
+ else if(diffNormal <= 2 && diffTumor <= 2)
+ {
+ continueFlag = true;
+ }
+ else
+ {
+ // Do a Fisher's exact test on the copy number changes //
+
+ double changePvalue = VarScan.getSignificance(copyDepthNormal, copyDepthTumor, normalDepth, tumorDepth);
+
+ // If depth change not significant, continue with region //
+ if(changePvalue >= pValueThreshold)
+ {
+ continueFlag = true;
+ }
+ else
+ {
+ continueFlag = false;
+ }
+
+ }
+ }
+
+
+ // If continuing, extend this region and don't process yet //
+
+ if(continueFlag)
+ {
+ copySumNormal += normalDepth;
+ copySumTumor += tumorDepth;
+ copyPositions++;
+ if(refBase.equals("G") || refBase.equals("C") || refBase.equals("g") || refBase.equals("c"))
+ copyPositionsGC++;
+ copyStop = posTumor;
+ }
+
+ // Otherwise, process this region (if it qualifies) and start a new one //
+
+ else
+ {
+ if(copyPositions >= minSegmentSize)
+ {
+ rawCopySegments++;
+ String regionResults = processCopyRegion(copyChrom, copyStart, copyStop, copyPositions, copyPositionsGC, copySumNormal, copySumTumor, minCoverage, dataRatio);
+
+ if(regionResults.length() > 0)
+ {
+ outCopySegments.println(regionResults);
+ goodCopySegments++;
+ }
+ }
+
+ // Start a new copyNumber region //
+ copyChrom = chromTumor;
+ copyStart = posTumor;
+ copyStop = posTumor;
+ copyDepthNormal = normalDepth;
+ copyDepthTumor = tumorDepth;
+ copySumNormal = normalDepth;
+ copySumTumor = tumorDepth;
+ copyPositions = 1;
+ if(refBase.equals("G") || refBase.equals("C") || refBase.equals("g") || refBase.equals("c"))
+ copyPositionsGC = 1;
+ else
+ copyPositionsGC = 0;
+ }
+
+
+ }
+ else
+ {
+ // Minimum normal coverage was not met at this position, so finalize //
+ // the current region if it meets the minimum segment size //
+ if(copyPositions >= minSegmentSize)
+ {
+ rawCopySegments++;
+ String regionResults = processCopyRegion(copyChrom, copyStart, copyStop, copyPositions, copyPositionsGC, copySumNormal, copySumTumor, minCoverage, dataRatio);
+
+ if(regionResults.length() > 0)
+ {
+ outCopySegments.println(regionResults);
+ goodCopySegments++;
+ }
+ }
+
+ // Reset the copyNumber region //
+ copyChrom = "";
+ copyStart = 0;
+ copyStop = 0;
+ copyDepthNormal = 0;
+ copyDepthTumor = 0;
+ copySumNormal = 0;
+ copySumTumor = 0;
+ copyPositions = 0;
+ copyPositionsGC = 0;
+ }
+
+ // Record this chromosome //
+
+ prevChromNormal = chromNormal;
+ prevChromTumor = chromTumor;
+ }
+ else
+ {
+ //System.err.println("Failed to match positions " + chromNormal + " " + posNormal + " to Tumor " + chromTumor + " " + posTumor);
+ }
+ }
+ // If they're in sort order, do nothing so that tumor can catch up //
+ else if(inSortOrder(chromNormal, chromTumor))
+ {
+ System.err.println("Not resetting normal file because " + chromNormal + " < " + chromTumor);
+ }
+ // If we reached the end of the normal file but never saw this chromosome, //
+ // fast-forward until tumor chromosome changes and reset normal file //
+ else if(flagEOF)
+ {
+ flagEOF = false;
+
+ while(prevChromTumor.equals(chromTumor) && !flagEOF)
+ {
+ if((lineTumor = tumor.readLine()) != null)
+ {
+ tumorContents = lineTumor.split("\t");
+
+ if(tumorContents.length > 1)
+ {
+ chromTumor = tumorContents[0];
+ posTumor = Integer.parseInt(tumorContents[1]);
+ }
+ }
+ else
+ {
+ flagEOF = true;
+ }
+ }
+
+ // Reset the normal file if we've already passed this chromosome in normal //
+
+ if(!flagEOF && !normalWasReset)
+ {
+ if(inSortOrder(chromNormal, chromTumor))
+ {
+ System.err.println("Not resetting normal file because " + chromNormal + " < " + chromTumor);
+ }
+ else
+ {
+ System.err.println("Resetting normal file because " + chromNormal + " > " + chromTumor);
+ normalWasReset = true;
+ normal.close();
+ normal = new BufferedReader(new FileReader(normalPileupFile));
+ }
+
+ }
+ }
+
+ }
+
+
+ normal.close();
+ tumor.close();
+
+ // Process the last region if it meets the minimum segment size //
+ if(copyPositions >= minSegmentSize)
+ {
+ rawCopySegments++;
+ String regionResults = processCopyRegion(copyChrom, copyStart, copyStop, copyPositions, copyPositionsGC, copySumNormal, copySumTumor, minCoverage, dataRatio);
+
+ if(regionResults.length() > 0)
+ {
+ outCopySegments.println(regionResults);
+ goodCopySegments++;
+ }
+ }
+
+
+ outCopySegments.close();
+
+ System.err.println(tumorPositions + " positions in tumor");
+ System.err.println(sharedPositions + " positions shared in normal"); //stats.get("sharedPositions")
+ System.err.println(comparedPositions + " had sufficient coverage for comparison"); //stats.get("comparedPositions")
+
+ System.err.println(rawCopySegments + " raw copynumber segments with size >= " + minSegmentSize);
+ System.err.println(goodCopySegments + " good copynumber segments with depth >= " + minCoverage);
+ }
+ catch (IOException e)
+ {
+ System.err.println("File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+ }
+
+
+ /**
+ * Calculates relative tumor copy number (log2 ratio) for a contiguous segment
+ *
+ * @param copyChrom Chromosome of the segment
+ * @param copyStart Segment start position
+ * @param copyStop Segment stop position
+ * @param copyPositions Number of positions in the segment
+ * @param copyPositionsGC Number of G/C reference bases (for GC content)
+ * @param copySumNormal Sum of normal depths; copySumTumor likewise for tumor
+ * @param minCoverage Minimum average depth; dataRatio adjusts tumor depth
+ * @return Tab-delimited segment line, or an empty string if not reportable
+ */
+ static String processCopyRegion(String copyChrom, int copyStart, int copyStop, long copyPositions, long copyPositionsGC, long copySumNormal, long copySumTumor, int minCoverage, double dataRatio)
+ {
+ DecimalFormat oneDigit = new DecimalFormat("#0.0");
+ DecimalFormat threeDigits = new DecimalFormat("#0.000");
+
+ try
+ {
+ // Calculate average depth //
+ float avgNormal = (float) copySumNormal / (float) copyPositions;
+ float avgTumor = (float) copySumTumor / (float) copyPositions;
+ // Adjust tumor depth for ratio
+ float adjustedTumorDepth = (float) dataRatio * (float) avgTumor;
+
+ float gcContent = (float) copyPositionsGC / (float) copyPositions * 100;
+
+ if(avgNormal >= minCoverage || avgTumor >= minCoverage)
+ {
+ // Determine ratio and diff //
+ if(avgNormal >= 0.01 && avgTumor >= 0.01)
+ {
+ float tumorNormalRatio = adjustedTumorDepth / avgNormal;
+ double log2ratio = Math.log(tumorNormalRatio) / Math.log(2);
+
+ return(copyChrom + "\t" + copyStart + "\t" + copyStop + "\t" + copyPositions + "\t" + oneDigit.format(avgNormal) + "\t" + oneDigit.format(avgTumor) + "\t" + threeDigits.format(log2ratio) + "\t" + oneDigit.format(gcContent));
+ }
+ else if (avgTumor >= 0.01)
+ {
+ // If only tumor has coverage, handle it //
+ double log2ratio = 2.00;
+ return(copyChrom + "\t" + copyStart + "\t" + copyStop + "\t" + copyPositions + "\t" + oneDigit.format(avgNormal) + "\t" + oneDigit.format(avgTumor) + "\t" + threeDigits.format(log2ratio) + "\t" + oneDigit.format(gcContent));
+ }
+ else
+ {
+ // If only normal has coverage, mark as homozygyous deletion //
+ double log2ratio = -2.00;
+ return(copyChrom + "\t" + copyStart + "\t" + copyStop + "\t" + copyPositions + "\t" + oneDigit.format(avgNormal) + "\t" + oneDigit.format(avgTumor) + "\t" + threeDigits.format(log2ratio) + "\t" + oneDigit.format(gcContent));
+ }
+
+ }
+ else
+ {
+// System.err.println("Warning: Not reporting region " + copyChrom + " " + copyStart + " " + copyStop + " " + copyPositions + " " + avgNormal + " " + avgTumor);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Warning: Error while processing copynumber segment:" + e.getMessage());
+ }
+
+
+ return("");
+ }
+
+
+ /**
+ * Determines whether two chromosome names are already in (lexicographic) sort order
+ *
+ * @param chrom1 First chromosome name
+ * @param chrom2 Second chromosome name
+ * @return True if chrom1 sorts before (or equal to) chrom2
+ */
+ static boolean inSortOrder(String chrom1, String chrom2)
+ {
+ String[] testArray = {chrom1, chrom2};
+ Arrays.sort(testArray);
+
+ if(testArray[0].equals(chrom1))
+ return true;
+
+ return false;
+ }
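+ // Note: the comparison above is lexicographic, so e.g. "chr10" sorts before
+ // "chr2"; it only works if both input pileups were sorted the same way.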
+
+
+
+ /**
+ * Determines whether two chromosome names are in sort order, mapping X/Y/M to 23/24/25
+ *
+ * @param chrom1 First chromosome name
+ * @param chrom2 Second chromosome name
+ * @return True if chrom1 sorts before (or equal to) chrom2
+ */
+ static Boolean chromSorted(String chrom1, String chrom2)
+ {
+ Boolean answer = false;
+
+ chrom1.replace("X", "23");
+ chrom1.replace("Y", "24");
+ chrom1.replace("M", "25");
+
+ chrom2.replace("X", "23");
+ chrom2.replace("Y", "24");
+ chrom2.replace("M", "25");
+
+ String[] unsorted = {chrom1, chrom2};
+ String[] sorted = {chrom1, chrom2};
+ Arrays.sort(sorted);
+ System.err.println("Sorted order is " + sorted[0] + " " + sorted[1]);
+ try{
+ if(sorted[0].equals(unsorted[0]))
+ {
+ answer = true;
+ }
+ }
+ catch(Exception e)
+ {
+
+ }
+
+ return(answer);
+ }
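+ // Caveat: even after the X/Y/M mapping the comparison is lexicographic, so
+ // "10" still sorts before "9"; strict genome order would need numeric parsing.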
+}
diff --git a/net/sf/varscan/Coverage.java b/net/sf/varscan/Coverage.java
new file mode 100644
index 0000000..bc83bf0
--- /dev/null
+++ b/net/sf/varscan/Coverage.java
@@ -0,0 +1,423 @@
+/**
+ * @(#)Coverage.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.util.BitSet;
+import java.util.HashMap;
+
+/**
+ * A class for assessing coverage of target regions (experimental)
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class Coverage {
+
+ public Coverage(String[] args)
+ {
+ // Define the usage message //
+ String usage = "USAGE: java -jar VarScan.jar coverage [pileup-file] OPTIONS\n" +
+ "\n" +
+ "\tpileup-file - A SAMtools pileup file or piped input\n" +
+ "\tOPTIONS:\n" +
+ "\t--regions-file\tTab-delimited file of regions of interest (required)\n" +
+ "\t--min-base-qual\tMinimum base quality [20]\n" +
+ "\t--output-file\tOutput file for coverage report";
+
+ // Set default parameters //
+
+ String outFileName = "";
+ String regionsFile = "";
+ int minBaseQual = 20;
+ int maxDepth = 50;
+ String targetFileType = "regions";
+
+ // Parse the input parameters //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ // Adjust parameters based on user input //
+
+ try
+ {
+ if(params.containsKey("output-file"))
+ outFileName = params.get("output-file");
+
+ if(params.containsKey("regions-file"))
+ regionsFile = params.get("regions-file");
+
+ if(params.containsKey("min-base-qual"))
+ minBaseQual = Integer.parseInt(params.get("min-base-qual"));
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Load target positions //
+ HashMap<String, BitSet> targetHash = null;
+
+ if(params.containsKey("regions-file"))
+ targetHash = loadTargets(regionsFile, targetFileType, 0);
+
+
+ // Read pileup input (the SAMtools system-call extraction below is disabled) //
+
+ try
+ {
+// Runtime r = Runtime.getRuntime();
+// Process p = r.exec("samtools view -q " + minMapQual + " " + bamFile);
+// InputStreamReader instream = new InputStreamReader(p.getInputStream());
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 10)
+ {
+ System.err.println("Input file was not ready after 10 5-second cycles!");
+ return;
+ }
+ }
+ catch(Exception e)
+ {
+
+ }
+ }
+
+ // Declare an array to count positions at each depth from 0 to max depth //
+
+ long[] positionsByDepth = new long[maxDepth + 1];
+ long basesOnTarget = 0;
+ long basesOffTarget = 0;
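+ // Note: positionsByDepth[d] accumulates the number of on-target positions
+ // covered at depth >= d (the loop below increments every bin from 0 up to a
+ // position's capped depth), so the report reads the array cumulatively.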
+
+ // Prepare to parse the SAM input //
+
+ String line;
+ int lineCounter = 0;
+ int numParsingExceptions = 0;
+
+ if(in != null && in.ready())
+ {
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+ lineCounter++;//stats.put("numBases", (stats.get("numBases") + 1));
+
+ // Output progress line //
+ if(params.containsKey("verbose") && (lineCounter % 100000) == 0)
+ System.err.println(lineCounter + " positions parsed...");
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected pileup format //
+
+ if(lineContents.length > 5 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+
+ // Declare a BitSet //
+ BitSet refPositions;
+
+ boolean inTarget = false;
+
+ if(!params.containsKey("regions-file"))
+ {
+ // If no regions file provided, report on all positions //
+ inTarget = true;
+ }
+ // Get the position BitSet for this chromosome//
+ else if(targetHash.containsKey(refName))
+ {
+ refPositions = targetHash.get(refName);
+
+ // Check to see if position set //
+ if(refPositions.get(position))
+ {
+ inTarget = true;
+ }
+ }
+
+ if(inTarget)
+ {
+ basesOnTarget++;
+
+ try {
+ // Parse out the depth and base qualities //
+ int readDepth = Integer.parseInt(lineContents[3]);
+ String readQualities = lineContents[5];
+
+ String mapQualities = "";
+ if(lineContents.length > 6) // Get Map Qualities if available //
+ mapQualities = lineContents[6];
+
+ int qualDepth = VarScan.qualityDepth(readQualities, minBaseQual);
+
+ // Cap at maxDepth so deep positions don't run past the array bounds //
+ int cappedDepth = Math.min(qualDepth, maxDepth);
+ for(int thisDepth = 0; thisDepth <= cappedDepth; thisDepth++)
+ {
+ positionsByDepth[thisDepth]++;
+ }
+ }
+ catch (Exception e)
+ {
+
+ }
+
+ }
+ else
+ {
+ basesOffTarget++;
+ }
+
+
+ }
+ else
+ {
+ System.err.println("Error: Invalid format for pileup at line " + lineCounter + ":" + line + "\n");
+ return;
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+ numParsingExceptions++;
+ if(numParsingExceptions >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+
+
+ }
+
+ in.close();
+
+
+ // Determine the number of bases targeted (if a regions file was given) //
+ long targetPositions = 0;
+ if(targetHash != null)
+ {
+ for (String refName : targetHash.keySet())
+ {
+ BitSet refPositions = targetHash.get(refName);
+ for(int thisPosition = 1; thisPosition <= refPositions.size(); thisPosition++)
+ {
+ if(refPositions.get(thisPosition))
+ targetPositions++;
+ }
+ }
+ }
+
+ String outputReport = "";
+
+ outputReport += targetPositions + " positions targeted\n";
+ outputReport += lineCounter + " positions in pileup file\n";
+
+ // Print a summary report //
+ outputReport += "TARGET SPECIFICITY\n";
+ outputReport += basesOffTarget + " bases off target\n";
+ outputReport += basesOnTarget + " bases on target\n";
+
+ // Print a breadth-by-depth report //
+
+ long cov50x = positionsByDepth[50];
+ long cov40x = positionsByDepth[40] - positionsByDepth[50];
+ long cov30x = positionsByDepth[30] - positionsByDepth[40];
+ long cov20x = positionsByDepth[20] - positionsByDepth[30];
+ long cov10x = positionsByDepth[10] - positionsByDepth[20];
+ long cov8x = positionsByDepth[8] - positionsByDepth[10];
+ long cov6x = positionsByDepth[6] - positionsByDepth[8];
+ long cov2x = positionsByDepth[2] - positionsByDepth[6];
+ long cov1x = positionsByDepth[1] - positionsByDepth[2];
+ long cov0x = targetPositions - positionsByDepth[1];
+
+ outputReport += "COVERAGE BREADTH-BY-DEPTH\n";
+ outputReport += "min_depth\t50x\t40x\t30x\t20x\t10x\t8x\t6x\t2x\t1x\t0x\n";
+ outputReport += "num_positions\t" + cov50x + "\t" + cov40x + "\t" + cov30x + "\t" + cov20x + "\t" + cov10x + "\t" + cov8x + "\t" + cov6x + "\t" + cov2x + "\t" + cov1x + "\t" + cov0x + "\n";
+
+ // Print the summary report to error output //
+ System.err.println(outputReport);
+
+ outputReport += "COVERED BASES BY DEPTH\n";
+ outputReport += "Depth\tBasesCovered\n";
+ for(int thisDepth = 0; thisDepth <= maxDepth; thisDepth++)
+ {
+ outputReport += thisDepth + "\t" + positionsByDepth[thisDepth] + "\n";
+ }
+
+ if(params.containsKey("output-file"))
+ {
+ PrintStream outFile = new PrintStream( new FileOutputStream(outFileName) );
+ outFile.println(outputReport);
+ outFile.close();
+ }
+ }
+
+
+
+ }
+
+ catch(Exception e)
+ {
+ System.err.println("Error extracting SAM from BAM: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+
+
+ }
+
+
+
+ /**
+ * Saves positions into a BitSet hash by chromosome
+ *
+ * @param fileName Name of file to be parsed
+ * @param fileType Type of file ("positions" or "regions")
+ * @param marginSize Number of flanking bases to mark around each position/region
+ * @return HashMap of BitSets of marked positions, keyed by chromosome
+ */
+ static HashMap<String, BitSet> loadTargets(String fileName, String fileType, int marginSize)
+ {
+ HashMap<String, BitSet> positionsByChrom = new HashMap<String, BitSet>();
+
+ int numRegions = 0;
+ int numBases = 0;
+
+ try
+ {
+ BufferedReader infile = new BufferedReader(new FileReader(fileName));
+
+ String line = "";
+ int lineCounter = 0;
+
+ while ((line = infile.readLine()) != null)
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+ if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+
+ // Get or create BitSet for this refName //
+ BitSet refPositions;
+
+ if(positionsByChrom.containsKey(refName))
+ {
+ refPositions = positionsByChrom.get(refName);
+ }
+ else
+ {
+ refPositions = new BitSet();
+ }
+
+ // Mark position or regions, depending on what was provided //
+ int chrStart = 0;
+ int chrStop = 0;
+
+ if(fileType.equals("positions") && lineContents.length > 1)
+ {
+ // Set the position to true //
+ int position = Integer.parseInt(lineContents[1]);
+ chrStart = position - marginSize;
+ chrStop = position + marginSize;
+ }
+ else if(fileType.equals("regions") && lineContents.length > 2)
+ {
+ chrStart = Integer.parseInt(lineContents[1]) - marginSize;
+ chrStop = Integer.parseInt(lineContents[2]) + marginSize;
+ }
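+ // Example: a tab-delimited regions line "chr1 100 200" with marginSize 0
+ // marks bits 100 through 200 on the chr1 BitSet (positions as given).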
+
+ // Require a valid interval before marking positions //
+ if(chrStart <= chrStop)
+ {
+ numRegions++;
+
+ // Mark every position //
+ for(int position = chrStart; position <= chrStop; position++)
+ {
+ if(!refPositions.get(position))
+ {
+ numBases++;
+ refPositions.set(position, true);
+ }
+ }
+ }
+
+ // Return it to the hash //
+ positionsByChrom.put(refName, refPositions);
+ }
+ catch(Exception e)
+ {
+ if(lineCounter > 1)
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+
+
+ }
+ }
+
+ infile.close();
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ }
+
+ System.err.println(numRegions + " regions parsed");
+ System.err.println(numBases + " unique positions targeted");
+ return(positionsByChrom);
+ }
+
+}
diff --git a/net/sf/varscan/FilterSomatic.java b/net/sf/varscan/FilterSomatic.java
new file mode 100644
index 0000000..02ff284
--- /dev/null
+++ b/net/sf/varscan/FilterSomatic.java
@@ -0,0 +1,575 @@
+/**
+ * @(#)FilterSomatic.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.HashMap;
+
+/**
+ * A class for filtering VarScan variant predictions
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class FilterSomatic {
+
+ public FilterSomatic(String[] args)
+ {
+ // Define the usage message //
+ String usage = "USAGE: java -jar VarScan.jar filter [variant file] OPTIONS\n" +
+ "\tvariant file - A file of SNPs or indels\n" +
+ "\n" +
+ "\tOPTIONS:\n" +
+ "\t--min-coverage\tMinimum read depth at a position to make a call [10]\n" +
+ "\t--min-reads2\tMinimum supporting reads at a position to call variants [4]\n" +
+ "\t--min-strands2\tMinimum # of strands on which variant observed (1 or 2) [1]\n" +
+ "\t--min-var-freq\tMinimum variant allele frequency threshold [0.20]\n" +
+ "\t--p-value\tDefault p-value threshold for calling variants [5e-02]\n" +
+ "\t--indel-file\tFile of indels for filtering nearby SNPs\n" +
+ "\t--output-file\tOptional output file for filtered variants";
+
+ // Set parameter defaults //
+
+ int minCoverage = 8;
+ int minReads2 = 4;
+ int minStrands2 = 1;
+ int minAvgQual = 20;
+ double minVarFreq = 0.20;
+ double pValueThreshold = 0.05; //1.0e-04;
+
+ int windowSize = 10;
+ int windowSNPs = 3;
+ int indelMargin = 3;
+ String outFileName = "";
+
+ HashMap<String, Boolean> indelPositions = new HashMap<String, Boolean>();
+
+ // Adjust parameters based on user input //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ try
+ {
+ if(params.containsKey("output-file"))
+ outFileName = params.get("output-file");
+
+ if(params.containsKey("window-size"))
+ windowSize = Integer.parseInt(params.get("window-size"));
+
+ if(params.containsKey("window-snps"))
+ windowSNPs = Integer.parseInt(params.get("window-snps"));
+
+ if(params.containsKey("indel-margin"))
+ indelMargin = Integer.parseInt(params.get("indel-margin"));
+
+ if(params.containsKey("indel-file"))
+ {
+ indelPositions = loadIndels(params.get("indel-file"));
+ }
+
+ if(params.containsKey("min-coverage"))
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+
+ if(params.containsKey("min-reads2"))
+ minReads2 = Integer.parseInt(params.get("min-reads2"));
+
+ if(params.containsKey("min-strands2"))
+ minStrands2 = Integer.parseInt(params.get("min-strands2"));
+
+ if(params.containsKey("min-var-freq"))
+ minVarFreq = Double.parseDouble(params.get("min-var-freq"));
+
+ if(params.containsKey("min-avg-qual"))
+ minAvgQual = Integer.parseInt(params.get("min-avg-qual"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+
+ System.err.println("Window size:\t" + windowSize);
+ System.err.println("Window SNPs:\t" + windowSNPs);
+ System.err.println("Indel margin:\t" + indelMargin);
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Identify SNP clusters //
+
+ HashMap<String, Boolean> clusterSNPs = findSNPclusters(windowSize, windowSNPs, args);
+
+
+ // Define two-decimal-place format and statistics hash //
+
+ HashMap<String, Integer> stats = new HashMap<String, Integer>();
+ stats.put("numVariants", 0);
+ stats.put("numFailCoverage", 0);
+ stats.put("numFailVarFreq", 0);
+ stats.put("numFailPvalue", 0);
+ stats.put("numFailReads2", 0);
+ stats.put("numNearIndel", 0);
+ stats.put("numSNPcluster", 0);
+ stats.put("numPassFilter", 0);
+ stats.put("numParsingExceptions", 0);
+
+ // Parse piped input or user-provided pileup file //
+
+ try
+ {
+ // Declare output file //
+ PrintStream outFile = null;
+ if(params.containsKey("output-file"))
+ outFile = new PrintStream( new FileOutputStream(outFileName) );
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception while trying to get input" + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ String line;
+ int lineCounter = 0;
+ boolean isVCF = false;
+
+ // Proceed if input stream is ready //
+
+ if(in != null && in.ready())
+ {
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+ lineCounter++;
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+ String chrom = lineContents[0];
+
+ if(line.startsWith("#"))
+ isVCF = true;
+
+ if(chrom.equals("Chrom") || chrom.equals("chrom") || line.startsWith("#"))
+ {
+
+ // Print header //
+ if(params.containsKey("output-file"))
+ outFile.println(line);
+ }
+ else
+ {
+ stats.put("numVariants", (stats.get("numVariants") + 1));
+ int position = Integer.parseInt(lineContents[1]);
+ String thisKey = chrom + "\t" + position;
+
+ int normalReads1 = 0;
+ int normalReads2 = 0;
+ int tumorReads1 = 0;
+ int tumorReads2 = 0;
+ String somaticStatus = "";
+ double somaticPvalue = 0.00;
+
+ if(isVCF)
+ {
+ String info = lineContents[7];
+ String normal = lineContents[9];
+ String tumor = lineContents[10];
+
+ String[] infoContents = info.split(";");
+ for(int colCounter = 0; colCounter < infoContents.length; colCounter++)
+ {
+ String element = infoContents[colCounter];
+ String[] elementContents = element.split("=");
+ if(elementContents[0].equals("SS"))
+ somaticStatus = elementContents[1];
+ else if(elementContents[0].equals("GPV") && somaticStatus.equals("1"))
+ somaticPvalue = Double.parseDouble(elementContents[1]);
+ else if(elementContents[0].equals("SPV") && !somaticStatus.equals("1"))
+ somaticPvalue = Double.parseDouble(elementContents[1]);
+ }
+
+ String[] normalContents = normal.split(":");
+ normalReads1 = Integer.parseInt(normalContents[3]);
+ normalReads2 = Integer.parseInt(normalContents[4]);
+
+ String[] tumorContents = tumor.split(":");
+ tumorReads1 = Integer.parseInt(tumorContents[3]);
+ tumorReads2 = Integer.parseInt(tumorContents[4]);
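+ // Example (hypothetical sample field "0/1:35:50:30:20"): colon-split fields
+ // 3 and 4 are reads1 (ref-supporting) and reads2 (variant-supporting), so
+ // reads1 = 30 and reads2 = 20 here.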
+ }
+ else
+ {
+ normalReads1 = Integer.parseInt(lineContents[4]);
+ normalReads2 = Integer.parseInt(lineContents[5]);
+ tumorReads1 = Integer.parseInt(lineContents[8]);
+ tumorReads2 = Integer.parseInt(lineContents[9]);
+
+ somaticStatus = lineContents[12];
+ somaticPvalue = Double.parseDouble(lineContents[14]);
+ }
+
+ // Proceed //
+
+ double normalFreq = 0;
+ double tumorFreq = 0;
+
+ int normalCoverage = normalReads1 + normalReads2;
+ int tumorCoverage = tumorReads1 + tumorReads2;
+
+ if(normalReads1 > 0 || normalReads2 > 0)
+ normalFreq = (double) normalReads2 / (double) (normalReads1 + normalReads2);
+
+ if(tumorReads1 > 0 || tumorReads2 > 0)
+ tumorFreq = (double) tumorReads2 / (double) (tumorReads1 + tumorReads2);
+
+
+ boolean filterFlag = false;
+ boolean indelFlag = false;
+ boolean clusterFlag = false;
+
+ if(normalCoverage < minCoverage || tumorCoverage < minCoverage)
+ {
+ // Fail due to coverage //
+ filterFlag = true;
+ stats.put("numFailCoverage", (stats.get("numFailCoverage") + 1));
+ }
+ else if(tumorReads2 < minReads2)
+ {
+ filterFlag = true;
+ stats.put("numFailReads2", (stats.get("numFailReads2") + 1));
+ }
+ else if(tumorFreq < minVarFreq)
+ {
+ filterFlag = true;
+ stats.put("numFailVarFreq", (stats.get("numFailVarFreq") + 1));
+ }
+ else if(somaticPvalue > pValueThreshold)
+ {
+ filterFlag = true;
+ stats.put("numFailPvalue", (stats.get("numFailPvalue") + 1));
+ }
+
+ if(clusterSNPs.containsKey(thisKey))
+ {
+ clusterFlag = true;
+ }
+ // If an indel file was provided, check positions within indelMargin bases //
+ else if(params.containsKey("indel-file"))
+ {
+ for(int thisPosition = position - indelMargin; thisPosition <= position + indelMargin; thisPosition++)
+ {
+ String key = chrom + "\t" + thisPosition;
+ if(indelPositions.containsKey(key))
+ {
+ indelFlag = true;
+ }
+ }
+ }
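+ // e.g. with the default indelMargin of 3, a SNP at chr1:1000 is flagged if
+ // any loaded indel falls between chr1:997 and chr1:1003.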
+
+ if(filterFlag)
+ {
+ // Already counted //
+ }
+ else if(indelFlag)
+ {
+ stats.put("numNearIndel", (stats.get("numNearIndel") + 1));
+ }
+ else if(clusterFlag)
+ {
+ // Remove cluster SNPs //
+ stats.put("numSNPcluster", (stats.get("numSNPcluster") + 1));
+ }
+ else
+ {
+ if(params.containsKey("output-file"))
+ outFile.println(line);
+
+ stats.put("numPassFilter", (stats.get("numPassFilter") + 1));
+ }
+
+
+ }
+ }
+ catch(Exception e)
+ {
+ if(lineCounter == 1)
+ {
+ if(params.containsKey("output-file"))
+ outFile.println(line);
+ }
+ else
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+ stats.put("numParsingExceptions", (stats.get("numParsingExceptions") + 1));
+ if(stats.get("numParsingExceptions") >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+
+ }
+
+ }
+
+ // Report summary of results //
+
+ System.err.println(stats.get("numVariants") + " variants in input stream");
+ System.err.println(stats.get("numFailCoverage") + " failed to meet coverage requirement");
+ System.err.println(stats.get("numFailReads2") + " failed to meet reads2 requirement");
+ System.err.println(stats.get("numFailVarFreq") + " failed to meet varfreq requirement");
+ System.err.println(stats.get("numFailPvalue") + " failed to meet p-value requirement");
+ System.err.println(stats.get("numSNPcluster") + " in SNP clusters were removed");
+ System.err.println(stats.get("numNearIndel") + " were removed near indels");
+ System.err.println(stats.get("numPassFilter") + " passed filters");
+
+ in.close();
+ }
+ else
+ {
+ System.err.println("Input file not found!");
+ System.err.println(usage);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error Parsing Input File: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+ }
+
+
+ /**
+ * Loads indels to be used for filtering
+ *
+ * @param filename Path to file of indels
+ * @return indels HashMap of indel positions (chrom\tposition)
+ */
+ static HashMap<String, Boolean> loadIndels(String filename)
+ {
+ HashMap<String, Boolean> indels = new HashMap<String, Boolean>();
+
+ try
+ {
+ // Declare file-parsing variables //
+
+ String line;
+
+ File infile = new File(filename);
+ if(infile.exists())
+ {
+ BufferedReader in = new BufferedReader(new FileReader(infile));
+
+ if(in.ready())
+ {
+ while ((line = in.readLine()) != null)
+ {
+ String[] lineContents = line.split("\t");
+ String chrom = lineContents[0];
+ if(chrom.equals("Chrom") || line.startsWith("#"))
+ {
+ // Ignore headers //
+ }
+ else
+ {
+ String position = lineContents[1];
+ String indelKey = chrom + "\t" + position;
+ indels.put(indelKey, Boolean.TRUE);
+ }
+
+ }
+ }
+ else
+ {
+ System.err.println("Unable to open indels file for reading");
+ }
+
+ in.close();
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error Parsing Indel File: " + e.getLocalizedMessage());
+ }
+
+ return(indels);
+ }
+
+
+	/**
+	 * Identifies clustered SNPs to be removed by the filter. For example, with
+	 * windowSize=10 and windowSNPs=3, any three SNPs within a 10-bp window are flagged.
+	 *
+	 * @param	windowSize	Size (in bp) of the window searched for nearby SNPs
+	 * @param	windowSNPs	Minimum number of SNPs in a window to declare a cluster
+	 * @param	args	Command-line arguments, used to re-open the input variant file
+	 * @return	clusterSNPs	HashMap of clustered SNP positions (chrom\tposition)
+	 */
+ static HashMap<String, Boolean> findSNPclusters(int windowSize, int windowSNPs, String[] args)
+ {
+ HashMap<String, Boolean> snps = new HashMap<String, Boolean>();
+ HashMap<String, Boolean> clusterSNPs = new HashMap<String, Boolean>();
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+ int lineCounter = 0;
+
+ // Proceed if input stream is ready //
+ try
+ {
+ if(in != null && in.ready())
+ {
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+ lineCounter++;
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+ String chrom = lineContents[0];
+
+
+ if(chrom.equals("Chrom") || line.startsWith("#"))
+ {
+ // Ignore headers //
+ }
+ else
+ {
+ int position = Integer.parseInt(lineContents[1]);
+ String snpKey = chrom + "\t" + position;
+ snps.put(snpKey, Boolean.TRUE);
+ }
+ }
+ catch(Exception e)
+ {
+ // Ignore parsing exceptions for now
+ }
+ }
+
+ in.close();
+ }
+ }
+ catch(Exception e)
+ {
+ // File parsing exception //
+		System.err.println("Error loading SNP positions: " + e.getMessage());
+ }
+
+ int numClusterSNPs = 0;
+ // Go through each position in SNP keys //
+
+ String[] snpKeys = (String[]) snps.keySet().toArray(new String[0]);
+ Arrays.sort(snpKeys);
+
+ for(String snpPosition : snpKeys)
+ {
+ try
+ {
+ String[] snpContents = snpPosition.split("\t");
+ String chrom = snpContents[0];
+ int position = Integer.parseInt(snpContents[1]);
+
+				// Count this SNP plus any others within windowSize downstream //
+				int numSNPsInWindow = 1;
+
+ // Check downstream window //
+ for(int thisPosition = position + 1; thisPosition <= position + windowSize; thisPosition++)
+ {
+ String thisKey = chrom + "\t" + thisPosition;
+ if(snps.containsKey(thisKey))
+ {
+ numSNPsInWindow++;
+ }
+ }
+
+ // If we have a cluster, mark this position as well as any downstream //
+
+ if(numSNPsInWindow >= windowSNPs)
+ {
+ clusterSNPs.put(snpPosition, true);
+
+ for(int thisPosition = position + 1; thisPosition <= position + windowSize; thisPosition++)
+ {
+ String thisKey = chrom + "\t" + thisPosition;
+ if(snps.containsKey(thisKey))
+ {
+ clusterSNPs.put(thisKey, true);
+ numClusterSNPs++;
+ }
+ }
+ }
+
+
+
+ }
+ catch(Exception e)
+ {
+
+ }
+ }
+
+
+ System.err.println(numClusterSNPs + " cluster SNPs identified");
+ return(clusterSNPs);
+ }
+}
\ No newline at end of file
diff --git a/net/sf/varscan/FilterVariants.java b/net/sf/varscan/FilterVariants.java
new file mode 100644
index 0000000..c684df0
--- /dev/null
+++ b/net/sf/varscan/FilterVariants.java
@@ -0,0 +1,583 @@
+/**
+ * @(#)FilterVariants.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.util.HashMap;
+
+/**
+ * A class for filtering VarScan variant predictions
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class FilterVariants {
+
+ public FilterVariants(String[] args)
+ {
+ // Define the usage message //
+ String usage = "USAGE: java -jar VarScan.jar filter [variant file] OPTIONS\n" +
+ "\tvariant file - A file of SNPs or indels\n" +
+ "\n" +
+ "\tOPTIONS:\n" +
+ "\t--min-coverage\tMinimum read depth at a position to make a call [10]\n" +
+ "\t--min-reads2\tMinimum supporting reads at a position to call variants [2]\n" +
+ "\t--min-strands2\tMinimum # of strands on which variant observed (1 or 2) [1]\n" +
+		"\t--min-avg-qual\tMinimum average base quality for variant-supporting reads [15]\n" +
+ "\t--min-var-freq\tMinimum variant allele frequency threshold [0.20]\n" +
+ "\t--p-value\tDefault p-value threshold for calling variants [1e-01]\n" +
+ "\t--indel-file\tFile of indels for filtering nearby SNPs\n" +
+ "\t--output-file\tFile to contain variants passing filters\n";
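+		// Example invocation (illustrative file names; options as documented above):
+		//   java -jar VarScan.jar filter variants.snp --min-coverage 10 --output-file variants.filtered.snp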
+
+ // Set parameter defaults //
+
+ int minCoverage = 10;
+ int minReads2 = 2;
+ int minStrands2 = 1;
+ int minAvgQual = 15;
+ double minVarFreq = 0.20;
+ double pValueThreshold = 0.10;
+ HashMap<String, Boolean> indelPositions = new HashMap<String, Boolean>();
+ String outFileName = "";
+ String notFileName = "";
+
+ // Adjust parameters based on user input //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+
+ if(params.containsKey("min-reads2"))
+ minReads2 = Integer.parseInt(params.get("min-reads2"));
+
+ if(params.containsKey("min-strands2"))
+ minStrands2 = Integer.parseInt(params.get("min-strands2"));
+
+ if(params.containsKey("min-var-freq"))
+ minVarFreq = Double.parseDouble(params.get("min-var-freq"));
+
+ if(params.containsKey("min-avg-qual"))
+ minAvgQual = Integer.parseInt(params.get("min-avg-qual"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+ if(params.containsKey("indel-file"))
+ {
+ indelPositions = loadIndels(params.get("indel-file"));
+ }
+
+ if(params.containsKey("output-file"))
+ outFileName = params.get("output-file");
+
+ if(params.containsKey("not-file"))
+ notFileName = params.get("not-file");
+
+ System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min reads2:\t" + minReads2);
+ System.err.println("Min strands2:\t" + minStrands2);
+ System.err.println("Min var freq:\t" + minVarFreq);
+ System.err.println("Min avg qual:\t" + minAvgQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+		// Define the statistics hash and reset counters //
+
+ HashMap<String, Integer> stats = new HashMap<String, Integer>();
+ stats.put("numVariants", 0);
+ stats.put("numNearIndel", 0);
+ stats.put("numPassFilter", 0);
+ stats.put("numFailCoverage", 0);
+ stats.put("numFailFreq", 0);
+ stats.put("numFailQual", 0);
+ stats.put("numFailStrands", 0);
+ stats.put("numFailReads2", 0);
+ stats.put("numFailPvalue", 0);
+ stats.put("numParsingExceptions", 0);
+ stats.put("numNoGenotype", 0);
+ stats.put("numCalledRef", 0);
+
+ // Parse piped input or user-provided pileup file //
+
+ try
+ {
+			// Declare output files //
+ PrintStream outFile = null;
+ if(params.containsKey("output-file"))
+ outFile = new PrintStream( new FileOutputStream(outFileName) );
+
+ PrintStream notFile = null;
+ if(params.containsKey("not-file"))
+ notFile = new PrintStream( new FileOutputStream(notFileName) );
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+					System.err.println("Exception while trying to get input: " + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ String line;
+ int lineCounter = 0;
+ boolean isVCF = false;
+
+ // Proceed if input stream is ready //
+
+ if(in != null && in.ready())
+ {
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+ try
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+ String chrom = lineContents[0];
+
+ if(line.startsWith("#"))
+ {
+ // VCF header line //
+ isVCF = true;
+ // Print header //
+ if(params.containsKey("output-file"))
+ outFile.println(line);
+ else
+ System.out.println(line);
+
+ if(params.containsKey("not-file"))
+ notFile.println(line);
+ }
+ else if(chrom.equals("Chrom"))
+ {
+ // Native output header line //
+ if(params.containsKey("output-file"))
+ outFile.println(line);
+ else
+ System.out.println(line);
+
+ if(params.containsKey("not-file"))
+ notFile.println(line);
+ }
+ else
+ {
+ int position = Integer.parseInt(lineContents[1]);
+
+ // If indel file was provided, check position-1 to position+1 //
+ boolean indelFilter = false;
+ if(params.containsKey("indel-file"))
+ {
+ String key1 = chrom + "\t" + position;
+ String key2 = chrom + "\t" + (position - 1);
+ String key3 = chrom + "\t" + (position + 1);
+
+ if(indelPositions.containsKey(key1) || indelPositions.containsKey(key2) || indelPositions.containsKey(key3))
+ {
+ indelFilter = true;
+ }
+ }
+
+ if(indelFilter)
+ {
+ stats.put("numNearIndel", (stats.get("numNearIndel") + 1));
+ stats.put("numVariants", (stats.get("numVariants") + 1));
+ if(params.containsKey("not-file"))
+ notFile.println(line);
+ }
+ else if(isVCF)
+ {
+ int maxCol = lineContents.length;
+ String vcfLine = "";
+ int numSamples = 0;
+ int numSamplesReferencePass = 0;
+ int numSamplesVariantPass = 0;
+ for(int colCounter = 0; colCounter < maxCol; colCounter++)
+ {
+ if(colCounter < 9)
+ {
+ if(colCounter > 0)
+ vcfLine += "\t";
+ vcfLine += lineContents[colCounter];
+ }
+ else
+ {
+ numSamples++;
+ vcfLine += "\t";
+ // Evaluate sample //
+ //GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR//
+ String[] sampleContents = lineContents[colCounter].split(":");
+ String gt = sampleContents[0];
+
+ stats.put("numVariants", (stats.get("numVariants") + 1));
+
+ if(gt.contains("."))
+ {
+ // Blank genotype, so ignore //
+ stats.put("numNoGenotype", (stats.get("numNoGenotype") + 1));
+ }
+ else
+ {
+ int qualityDepth = Integer.parseInt(sampleContents[3]);
+ int reads1 = Integer.parseInt(sampleContents[4]);
+ int reads2 = Integer.parseInt(sampleContents[5]);
+									double varFreq = (double) reads2 / (double) (reads1 + reads2);
+									double pValue = Double.parseDouble(sampleContents[7]);
+ int qual1 = Integer.parseInt(sampleContents[8]);
+ int qual2 = Integer.parseInt(sampleContents[9]);
+ int reads1plus = Integer.parseInt(sampleContents[10]);
+ int reads1minus = Integer.parseInt(sampleContents[11]);
+ int reads2plus = Integer.parseInt(sampleContents[12]);
+ int reads2minus = Integer.parseInt(sampleContents[13]);
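+									// Strand filter: if reference reads came from both strands, require variant reads on both strands too //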
+ boolean strandFail = false;
+ if(reads1plus > 0 && reads1minus > 0)
+ {
+ if(reads2plus == 0 || reads2minus == 0)
+ strandFail = true;
+ }
+
+									boolean isFiltered = false;	// Set to true below if this sample's call fails any filter
+
+ /// Begin checks for either reference or variant //
+ if(qualityDepth < minCoverage)
+ {
+ stats.put("numFailCoverage", (stats.get("numFailCoverage") + 1));
+ isFiltered = true;
+ }
+
+ if(gt.equals("0/0"))
+ {
+ stats.put("numCalledRef", (stats.get("numCalledRef") + 1));
+
+ if(!isFiltered)
+ numSamplesReferencePass++;
+ // Don't try to filter wild-type calls //
+ }
+ else if(reads2 < minReads2)
+ {
+ stats.put("numFailReads2", (stats.get("numFailReads2") + 1));
+ isFiltered = true;
+ }
+ else if (qual2 < minAvgQual)
+ {
+ stats.put("numFailQual", (stats.get("numFailQual") + 1));
+ isFiltered = true;
+ }
+ else if(varFreq < minVarFreq)
+ {
+ stats.put("numFailFreq", (stats.get("numFailFreq") + 1));
+ isFiltered = true;
+ }
+ else if (strandFail)
+ {
+ stats.put("numFailStrands", (stats.get("numFailStrands") + 1));
+ isFiltered = true;
+ }
+ else if (pValue > pValueThreshold)
+ {
+ stats.put("numFailPvalue", (stats.get("numFailPvalue") + 1));
+ isFiltered = true;
+ }
+ else
+ {
+ // Pass the variant //
+ stats.put("numPassFilter", (stats.get("numPassFilter") + 1));
+ numSamplesVariantPass++;
+ }
+
+									// If this sample's call failed a filter, blank its genotype //
+
+									if(isFiltered)
+									{
+										// String.replace returns a new string; reassign it so the genotype is actually blanked //
+										lineContents[colCounter] = lineContents[colCounter].replace(gt, "./.");
+									}
+ }
+
+
+ // Append to line //
+ vcfLine += lineContents[colCounter];
+ }
+ }
+
+ if(numSamplesVariantPass > 0 || numSamplesReferencePass > 0)
+ {
+ if(params.containsKey("output-file"))
+ outFile.println(vcfLine);
+ else
+ System.out.println(vcfLine);
+ }
+ else if(params.containsKey("not-file"))
+ {
+ notFile.println(vcfLine);
+ }
+
+ }
+ else
+ {
+ stats.put("numVariants", (stats.get("numVariants") + 1));
+
+ // Parse out relevant values //
+ String ref = lineContents[2];
+ String var = lineContents[3];
+ int reads1 = Integer.parseInt(lineContents[4]);
+ int reads2 = Integer.parseInt(lineContents[5]);
+ int strands1 = Integer.parseInt(lineContents[7]);
+ int strands2 = Integer.parseInt(lineContents[8]);
+ int qual1 = Integer.parseInt(lineContents[9]);
+ int qual2 = Integer.parseInt(lineContents[10]);
+								double pValue = Double.parseDouble(lineContents[11]);
+
+ int coverage = reads1 + reads2;
+ double varFreq = (double) reads2 / (double) (reads1 + reads2);
+
+ boolean isFiltered = true;
+
+
+ if(coverage >= minCoverage)
+ {
+								if(ref.equals(var))
+								{
+									stats.put("numCalledRef", (stats.get("numCalledRef") + 1));
+									isFiltered = false;	// Wild-type calls are reported, not sent to the not-file
+
+									if(params.containsKey("output-file"))
+										outFile.println(line);
+									else
+										System.out.println(line);
+								}
+ else if(reads2 >= minReads2)
+ {
+ if(strands2 >= minStrands2)
+ {
+ if(qual2 >= minAvgQual)
+ {
+ if(varFreq >= minVarFreq)
+ {
+											// A p-value of 0.98 or greater is a placeholder meaning "not calculated", so compute it now //
+
+ if(pValue >= 0.98)
+ {
+ pValue = VarScan.getSignificance(reads1, reads2);
+ }
+
+
+ if(pValue <= pValueThreshold)
+ {
+ stats.put("numPassFilter", (stats.get("numPassFilter") + 1));
+
+ if(params.containsKey("output-file"))
+ outFile.println(line);
+ else
+ System.out.println(line);
+
+ isFiltered = false;
+ }
+ else
+ {
+ stats.put("numFailPvalue", (stats.get("numFailPvalue") + 1));
+ }
+ }
+ else
+ {
+ stats.put("numFailFreq", (stats.get("numFailFreq") + 1));
+ }
+ }
+ else
+ {
+ stats.put("numFailQual", (stats.get("numFailQual") + 1));
+ }
+ }
+ else
+ {
+ stats.put("numFailStrands", (stats.get("numFailStrands") + 1));
+ }
+ }
+ else
+ {
+ stats.put("numFailReads2", (stats.get("numFailReads2") + 1));
+ }
+ }
+ else
+ {
+ stats.put("numFailCoverage", (stats.get("numFailCoverage") + 1));
+ }
+
+ // If not file provided, print this filtered variant to it //
+
+ if(isFiltered && params.containsKey("not-file"))
+ notFile.println(line);
+
+
+ }
+
+ }
+ }
+ catch(Exception e)
+ {
+ if(lineCounter == 1)
+ {
+ // Print header //
+ System.out.println(line);
+ if(params.containsKey("output-file"))
+ outFile.println(line);
+ }
+ else
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+ stats.put("numParsingExceptions", (stats.get("numParsingExceptions") + 1));
+ if(stats.get("numParsingExceptions") >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+ }
+
+ }
+
+ // Report summary of results //
+
+ System.err.println(stats.get("numVariants") + " entries in input stream");
+ System.err.println(stats.get("numNoGenotype") + " had no genotype");
+ System.err.println(stats.get("numCalledRef") + " were called wild-type");
+ System.err.println(stats.get("numFailCoverage") + " failed coverage");
+ System.err.println(stats.get("numFailReads2") + " failed reads2");
+ System.err.println(stats.get("numFailStrands") + " failed strands");
+ System.err.println(stats.get("numFailQual") + " failed quality");
+ System.err.println(stats.get("numFailFreq") + " failed variant frequency < " + minVarFreq);
+ System.err.println(stats.get("numFailPvalue") + " failed P-value > " + pValueThreshold);
+ System.err.println(stats.get("numNearIndel") + " were removed near indels");
+ System.err.println(stats.get("numPassFilter") + " passed filters");
+
+ in.close();
+ }
+ else
+ {
+ System.err.println("Input file not found!");
+ System.err.println(usage);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error Parsing Input File: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+ }
+
+
+ /**
+ * Loads indels to be used for filtering
+ *
+ * @param filename Path to file of indels
+ * @return indels HashMap of indel positions (chrom\tposition)
+ */
+ static HashMap<String, Boolean> loadIndels(String filename)
+ {
+ HashMap<String, Boolean> indels = new HashMap<String, Boolean>();
+
+ try
+ {
+ // Declare file-parsing variables //
+
+ String line;
+
+ File infile = new File(filename);
+ if(infile.exists())
+ {
+ BufferedReader in = new BufferedReader(new FileReader(infile));
+
+ if(in.ready())
+ {
+ while ((line = in.readLine()) != null)
+ {
+ if(line.startsWith("#"))
+ {
+ // Ignore //
+ }
+ else
+ {
+ String[] lineContents = line.split("\t");
+ String chrom = lineContents[0];
+ String position = lineContents[1];
+ String indelKey = chrom + "\t" + position;
+ indels.put(indelKey, Boolean.TRUE);
+ }
+
+ }
+ }
+ else
+ {
+ System.err.println("Unable to open indels file for reading");
+ }
+
+ in.close();
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error Parsing Indel File: " + e.getLocalizedMessage());
+ }
+
+ return(indels);
+ }
+}
diff --git a/net/sf/varscan/FishersExact.java b/net/sf/varscan/FishersExact.java
new file mode 100644
index 0000000..c9827ae
--- /dev/null
+++ b/net/sf/varscan/FishersExact.java
@@ -0,0 +1,292 @@
+/**
+ * @(#)FishersExact.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+/**
+ * A statistical class for performing Fisher's exact test.
+ *
+ * This performs Fisher's exact test, a statistical procedure that calculates an exact probability
+ * value for the relationship between two dichotomous variables, as found in a two-by-two crosstable.
+ * The program calculates the difference between the data observed and the data expected, considering
+ * the given marginals and the assumptions of the model of independence. It addresses the same question
+ * as the Chi-square test for independence; however, the Chi-square test gives only an estimate of the
+ * true probability, an estimate which might not be very accurate if the marginals are very uneven or
+ * if there is a small value (less than five) in one of the cells.
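+ *
+ * For reference, the exact probability of a given 2x2 table {{a,b},{c,d}} with
+ * n = a + b + c + d is the hypergeometric probability:
+ *
+ *   P = ( (a+b)! (c+d)! (a+c)! (b+d)! ) / ( a! b! c! d! n! )
+ *
+ * which getP() evaluates in log space using a precomputed log-factorial table.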
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class FishersExact {
+ private static final boolean DEBUG = false;
+ private double[] f;
+ int maxSize;
+
+
+ /**
+ * constructor for FisherExact table
+ *
+ * @param maxSize is the maximum sum that will be encountered by the table (a+b+c+d)
+ */
+ public FishersExact(int maxSize) {
+ this.maxSize = maxSize;
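+        // f[i] caches log(i!), so getP() can compute the hypergeometric probability in log space //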
+ f = new double[maxSize + 1];
+ f[0] = 0.0;
+ for (int i = 1; i <= this.maxSize; i++) {
+ f[i] = f[i - 1] + Math.log(i);
+ }
+ }
+
+ /**
+ * calculates the P-value for this specific state
+ *
+ * @param a a, b, c, d are the four cells in a 2x2 matrix
+ * @param b
+ * @param c
+ * @param d
+ * @return the P-value
+ */
+ public final double getP(int a, int b, int c, int d) {
+ try
+ {
+ int n = a + b + c + d;
+ if (n > maxSize) {
+ return Double.NaN;
+ }
+ double p;
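+            // log P = sum of the log marginal factorials minus the log cell factorials and log(n!) //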
+ p = (f[a + b] + f[c + d] + f[a + c] + f[b + d]) - (f[a] + f[b] + f[c] + f[d] + f[n]);
+ return Math.exp(p);
+ }
+ catch(Exception e)
+ {
+ return Double.NaN;
+ }
+
+ }
+
+ /**
+ * Calculates the one-tail P-value for the Fisher Exact test. Determines whether to calculate the right- or left-
+ * tail, thereby always returning the smallest p-value.
+ *
+ * @param a a, b, c, d are the four cells in a 2x2 matrix
+ * @param b
+ * @param c
+ * @param d
+ * @return one-tailed P-value (right or left, whichever is smallest)
+ */
+ public final double getCumlativeP(int a, int b, int c, int d) {
+ int min, i;
+ int n = a + b + c + d;
+ if (n > maxSize) {
+ return Double.NaN;
+ }
+ double p = 0;
+
+ p += getP(a, b, c, d);
+ if (DEBUG) {System.out.println("p = " + p);}
+ if ((a * d) >= (b * c)) {
+ if (DEBUG) {System.out.println("doing R-tail: a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ min = (c < b) ? c : b;
+ for (i = 0; i < min; i++) {
+ if (DEBUG) {System.out.print("doing round " + i);}
+ p += getP(++a, --b, --c, ++d);
+ if (DEBUG) {System.out.println("\ta=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ }
+            if (DEBUG) {System.out.println("");}
+ }
+ if ((a * d) < (b * c)) {
+ if (DEBUG) {System.out.println("doing L-tail: a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ min = (a < d) ? a : d;
+ for (i = 0; i < min; i++) {
+ if (DEBUG) {System.out.print("doing round " + i);}
+ double pTemp = getP(--a, ++b, ++c, --d);
+ if (DEBUG) {System.out.print("\tpTemp = " + pTemp);}
+ p += pTemp;
+ if (DEBUG) {System.out.println("\ta=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ }
+ }
+ return p;
+ }
+
+ /**
+ * Calculates the right-tail P-value for the Fisher Exact test.
+ *
+ * @param a a, b, c, d are the four cells in a 2x2 matrix
+ * @param b
+ * @param c
+ * @param d
+ * @return one-tailed P-value (right-tail)
+ */
+ public final double getRightTailedP(int a, int b, int c, int d) {
+ int min, i;
+ int n = a + b + c + d;
+ if (n > maxSize) {
+ return Double.NaN;
+ }
+ double p = 0;
+
+ p += getP(a, b, c, d);
+ if (DEBUG) {System.out.println("p = " + p);}
+ if (DEBUG) {System.out.println("doing R-tail: a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ min = (c < b) ? c : b;
+ for (i = 0; i < min; i++) {
+ p += getP(++a, --b, --c, ++d);
+
+ }
+ return p;
+ }
+
+ /**
+ * Calculates the left-tail P-value for the Fisher Exact test.
+ *
+ * @param a a, b, c, d are the four cells in a 2x2 matrix
+ * @param b
+ * @param c
+ * @param d
+ * @return one-tailed P-value (left-tail)
+ */
+ public final double getLeftTailedP(int a, int b, int c, int d) {
+ int min, i;
+ int n = a + b + c + d;
+ if (n > maxSize) {
+ return Double.NaN;
+ }
+ double p = 0;
+
+ p += getP(a, b, c, d);
+ if (DEBUG) {System.out.println("p = " + p);}
+ if (DEBUG) {System.out.println("doing L-tail: a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ min = (a < d) ? a : d;
+ for (i = 0; i < min; i++) {
+ if (DEBUG) {System.out.print("doing round " + i);}
+ double pTemp = getP(--a, ++b, ++c, --d);
+ if (DEBUG) {System.out.print("\tpTemp = " + pTemp);}
+ p += pTemp;
+ if (DEBUG) {System.out.println("\ta=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ }
+
+
+ return p;
+ }
+
+
+ /**
+ * Calculates the two-tailed P-value for the Fisher Exact test.
+ *
+ * In order for a table under consideration to have its p-value included
+ * in the final result, it must have a p-value less than the original table's P-value, i.e.
+ * Fisher's exact test computes the probability, given the observed marginal
+ * frequencies, of obtaining exactly the frequencies observed and any configuration more extreme.
+ * By "more extreme," we mean any configuration (given observed marginals) with a smaller probability of
+ * occurrence in the same direction (one-tailed) or in both directions (two-tailed).
+ *
+ * @param a a, b, c, d are the four cells in a 2x2 matrix
+ * @param b
+ * @param c
+ * @param d
+ * @return two-tailed P-value
+ */
+ public final double getTwoTailedP(int a, int b, int c, int d) {
+ int min, i;
+ int n = a + b + c + d;
+ if (n > maxSize) {
+ return Double.NaN;
+ }
+ double p = 0;
+
+ double baseP = getP(a, b, c, d);
+// in order for a table under consideration to have its p-value included
+// in the final result, it must have a p-value less than the baseP, i.e.
+// Fisher's exact test computes the probability, given the observed marginal
+// frequencies, of obtaining exactly the frequencies observed and any configuration more extreme.
+// By "more extreme," we mean any configuration (given observed marginals) with a smaller probability of
+// occurrence in the same direction (one-tailed) or in both directions (two-tailed).
+
+ if (DEBUG) {System.out.println("baseP = " + baseP);}
+ int initialA = a, initialB = b, initialC = c, initialD = d;
+ p += baseP;
+ if (DEBUG) {System.out.println("p = " + p);}
+ if (DEBUG) {System.out.println("Starting with R-tail: a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ min = (c < b) ? c : b;
+ for (i = 0; i < min; i++) {
+ if (DEBUG) {System.out.print("doing round " + i);}
+ double tempP = getP(++a, --b, --c, ++d);
+ if (tempP <= baseP) {
+ if (DEBUG) {System.out.print("\ttempP (" + tempP + ") is less than baseP (" + baseP + ")");}
+ p += tempP;
+ }
+ if (DEBUG) {System.out.println(" a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ }
+
+ // reset the values to their original so we can repeat this process for the other side
+ a = initialA;
+ b = initialB;
+ c = initialC;
+ d = initialD;
+
+ if (DEBUG) {System.out.println("Now doing L-tail: a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ min = (a < d) ? a : d;
+ if (DEBUG) {System.out.println("min = " + min);}
+ for (i = 0; i < min; i++) {
+ if (DEBUG) {System.out.print("doing round " + i);}
+ double pTemp = getP(--a, ++b, ++c, --d);
+ if (DEBUG) {System.out.println(" pTemp = " + pTemp);}
+ if (pTemp <= baseP) {
+ if (DEBUG) {System.out.print("\ttempP (" + pTemp + ") is less than baseP (" + baseP + ")");}
+ p += pTemp;
+ }
+ if (DEBUG) {System.out.println(" a=" + a + " b=" + b + " c=" + c + " d=" + d);}
+ }
+ return p;
+ }
+
+ public static void main(String[] args) {
+
+ int[][] argInts = new int[15][4];
+ argInts[0] = new int[]{2, 3, 6, 4};
+ argInts[1] = new int[]{2, 1, 3, 0};
+ argInts[2] = new int[]{3, 0, 2, 1};
+ argInts[3] = new int[]{1, 2, 0, 3};
+ argInts[4] = new int[]{3, 1, 1, 3};
+ argInts[5] = new int[]{1, 3, 3, 1};
+ argInts[6] = new int[]{0, 1, 1, 0};
+ argInts[7] = new int[]{1, 0, 0, 1};
+ argInts[8] = new int[]{11, 0, 0, 6};
+ argInts[9] = new int[]{10, 1, 1, 5};
+ argInts[10] = new int[]{5, 6, 6, 0};
+ argInts[11] = new int[]{9, 2, 2, 4};
+ argInts[12] = new int[]{6, 5, 5, 1};
+ argInts[13] = new int[]{8, 3, 3, 3};
+ argInts[14] = new int[]{7, 4, 4, 2};
+
+ FishersExact fe = new FishersExact(100);
+
+ for (int i = 0; i < argInts.length; i++) {
+ System.out.println("\na=" + argInts[i][0] + " b=" + argInts[i][1] + " c=" + argInts[i][2] + " d=" + argInts[i][3]);
+ System.out.print("*****Original algorithm: ");
+ double cumulativeP = fe.getCumlativeP(argInts[i][0], argInts[i][1], argInts[i][2], argInts[i][3]);
+ System.out.println("\tcumulativeP = " + cumulativeP);
+
+ System.out.print("*****Left Tailed: ");
+ double leftTailedP = fe.getLeftTailedP(argInts[i][0], argInts[i][1], argInts[i][2], argInts[i][3]);
+ System.out.println("\tleftTailedP = " + leftTailedP);
+
+ System.out.print("*****Right Tailed: ");
+ double rightTailedP = fe.getRightTailedP(argInts[i][0], argInts[i][1], argInts[i][2], argInts[i][3]);
+ System.out.println("\trightTailedP = " + rightTailedP);
+
+ System.out.print("*****Two Tailed: ");
+ double twoTailedP = fe.getTwoTailedP(argInts[i][0], argInts[i][1], argInts[i][2], argInts[i][3]);
+ System.out.println("\ttwoTailedP = " + twoTailedP);
+ }
+ }
+
+}
diff --git a/net/sf/varscan/LimitVariants.java b/net/sf/varscan/LimitVariants.java
new file mode 100644
index 0000000..75852ba
--- /dev/null
+++ b/net/sf/varscan/LimitVariants.java
@@ -0,0 +1,311 @@
+/**
+ * @(#)LimitVariants.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.util.HashMap;
+import java.util.BitSet;
+
+/**
+ * A class for restricting a list of variants to a given set of positions or regions
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class LimitVariants {
+
+ public LimitVariants(String[] args)
+ {
+ String usage = "USAGE: java -jar VarScan.jar limit [infile] OPTIONS\n" +
+			"\tinfile - A file of variants (chromosome and position in the first two columns), or VCF\n" +
+ "\tOPTIONS\n" +
+ "\t--positions-file - a file of chromosome-positions, tab delimited, or VCF\n" +
+ "\t--regions-file - a file of chromosome-start-stops, tab delimited\n" +
+ "\t--margin-size - shoulder bases to allow on either side of targets [0]\n" +
+ "\t--output-file - Output file for the matching variants\n" +
+ "\t--not-file - Output file for variants NOT matching regions/positions\n";
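+		// Example invocation (illustrative file names):
+		//   java -jar VarScan.jar limit variants.snp --regions-file targets.tsv --output-file variants.ontarget.snp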
+
+ // Declare argument variables //
+ String outFileName = "";
+ String notFileName = "";
+ String targetFileName = "";
+ String targetFileType = "";
+ int marginSize = 0;
+
+ // Parse command-line parameters //
+ HashMap<String, String> params = VarScan.getParams(args);
+
+		// Try adjusting any provided parameters based on user input //
+
+ try
+ {
+ if(params.containsKey("output-file"))
+ outFileName = params.get("output-file");
+
+ if(params.containsKey("not-file"))
+ notFileName = params.get("not-file");
+
+
+ if(params.containsKey("positions-file"))
+ {
+ targetFileName = params.get("positions-file");
+ targetFileType = "positions";
+ }
+ else if(params.containsKey("regions-file"))
+ {
+ targetFileName = params.get("regions-file");
+ targetFileType = "regions";
+ }
+ else
+ {
+ System.err.println("Please provide a regions file or a positions file");
+ System.err.println(usage);
+ return;
+ }
+
+ if(params.containsKey("margin-size"))
+ {
+ marginSize = Integer.parseInt(params.get("margin-size"));
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ try
+ {
+ // Obtain input file //
+ BufferedReader infile = VarScan.getInfile(args);
+
+ // Declare output file //
+ PrintStream outFile = null;
+ if(params.containsKey("output-file"))
+ outFile = new PrintStream( new FileOutputStream(outFileName) );
+
+ // Declare not output file //
+ PrintStream notFile = null;
+ if(params.containsKey("not-file"))
+ notFile = new PrintStream( new FileOutputStream(notFileName) );
+
+ // Load target positions //
+ HashMap<String, BitSet> targetHash = loadTargets(targetFileName, targetFileType, marginSize);
+
+ // Declare file-parsing variables //
+ String line = "";
+ int lineCounter = 0;
+ int numVariants = 0;
+ int numInTarget = 0;
+
+ // Actually parse the infile //
+
+ while ((line = infile.readLine()) != null)
+ {
+ lineCounter++;
+
+				String[] lineContents = line.split("\t");
+				if(line.startsWith("#"))
+				{
+					// Handle VCF headers; print only to output files that were opened //
+					if(params.containsKey("output-file"))
+						outFile.println(line);
+					if(params.containsKey("not-file"))
+						notFile.println(line);
+				}
+				else if(line.toLowerCase().startsWith("chrom"))
+				{
+					// Handle native file headers //
+					if(params.containsKey("output-file"))
+						outFile.println(line);
+					if(params.containsKey("not-file"))
+						notFile.println(line);
+				}
+ else if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+
+ numVariants++;
+
+ // Declare a BitSet //
+ BitSet refPositions;
+
+ boolean inTarget = false;
+
+ // Get the position BitSet for this chromosome//
+ if(targetHash.containsKey(refName))
+ {
+ refPositions = targetHash.get(refName);
+
+ // Check to see if position set //
+ if(refPositions.get(position))
+ {
+ inTarget = true;
+ numInTarget++;
+ if(params.containsKey("output-file"))
+ {
+ outFile.println(line);
+ }
+ }
+ }
+
+ // If no match and not file declared, print to it //
+ if(!inTarget && params.containsKey("not-file"))
+ {
+ notFile.println(line);
+ }
+
+ }
+ catch(Exception e)
+ {
+					if(lineCounter == 1)
+					{
+						// Skip an unparseable header line //
+					}
+ else
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+ }
+ }
+
+ float pctInTarget = (float) 0;
+
+ if(numVariants > 0 && numInTarget > 0)
+ {
+ pctInTarget = (float) numInTarget / (float) numVariants * (float) 100;
+ }
+
+ // Print summary statistics //
+
+ System.err.println(numVariants + " variants in input file");
+ System.err.println(numInTarget + " variants (" + pctInTarget + "%) matched target positions");
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+ }
+
+
+	/**
+	 * Saves target positions into a BitSet hash keyed by chromosome
+	 *
+	 * @param	fileName	Name of file to be parsed
+	 * @param	fileType	Type of file ("positions" or "regions")
+	 * @param	marginSize	Shoulder bases to allow on either side of each target
+	 * @return	HashMap of per-chromosome BitSets with target positions set
+	 */
+ static HashMap<String, BitSet> loadTargets(String fileName, String fileType, int marginSize)
+ {
+ HashMap<String, BitSet> positionsByChrom = new HashMap<String, BitSet>();
+
+ try
+ {
+ BufferedReader infile = new BufferedReader(new FileReader(fileName));
+
+ String line = "";
+ int lineCounter = 0;
+
+ while ((line = infile.readLine()) != null)
+ {
+ lineCounter++;
+ String[] lineContents = line.split("\t");
+
+				if(line.startsWith("#"))
+ {
+ // Ignore VCF headers //
+ }
+ else if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+
+ // Get or create BitSet for this refName //
+ BitSet refPositions;
+
+ if(positionsByChrom.containsKey(refName))
+ {
+ refPositions = positionsByChrom.get(refName);
+ }
+ else
+ {
+ refPositions = new BitSet();
+ }
+
+ // Mark position or regions, depending on what was provided //
+ int chrStart = 0;
+ int chrStop = 0;
+
+ if(fileType.equals("positions") && lineContents.length > 1)
+ {
+ // Set the position to true //
+ int position = Integer.parseInt(lineContents[1]);
+ chrStart = position - marginSize;
+ chrStop = position + marginSize;
+ }
+ else if(fileType.equals("regions") && lineContents.length > 2)
+ {
+ chrStart = Integer.parseInt(lineContents[1]) - marginSize;
+ chrStop = Integer.parseInt(lineContents[2]) + marginSize;
+ }
+
+ // Check that it won't be an infinite loop//
+ if(chrStart <= chrStop)
+ {
+ // Mark every position //
+ for(int position = chrStart; position <= chrStop; position++)
+ {
+ refPositions.set(position, true);
+ }
+ }
+
+ // Return it to the hash //
+ positionsByChrom.put(refName, refPositions);
+ }
+ catch(Exception e)
+ {
+ if(lineCounter > 1)
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+
+
+ }
+ }
+
+ infile.close();
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ }
+
+
+ return(positionsByChrom);
+ }
+
+}
diff --git a/net/sf/varscan/ProcessSomatic.java b/net/sf/varscan/ProcessSomatic.java
new file mode 100644
index 0000000..7af5bc3
--- /dev/null
+++ b/net/sf/varscan/ProcessSomatic.java
@@ -0,0 +1,432 @@
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.util.BitSet;
+import java.util.HashMap;
+
+/**
+ * A class for processing VarScan output by somatic status and confidence
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class ProcessSomatic {
+
+ public ProcessSomatic(String[] args)
+ {
+ String usage = "USAGE: java -jar VarScan.jar process [status-file] OPTIONS\n" +
+ "\tstatus-file - The VarScan output file for SNPs or Indels\n" +
+ "\tOPTIONS\n" +
+ "\t--min-tumor-freq - Minimum variant allele frequency in tumor [0.10]\n" +
+ "\t--max-normal-freq - Maximum variant allele frequency in normal [0.05]\n" +
+ "\t--p-value - P-value for high-confidence calling [0.07]";
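+		// Example invocation (illustrative file name):
+		//   java -jar VarScan.jar process varScan.output.snp --min-tumor-freq 0.10 --max-normal-freq 0.05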
+
+ // Parse command-line parameters //
+ HashMap<String, String> params = VarScan.getParams(args);
+
+		// Try adjusting any provided parameters based on user input //
+ String statusFile = "varScan.output";
+
+ double maxNormalFreq = 0.05;
+ double minTumorFreq = 0.10;
+ double pValueForHC = 0.07;
+
+ try
+ {
+ if(params.containsKey("min-tumor-freq"))
+ minTumorFreq = Double.parseDouble(params.get("min-tumor-freq"));
+ if(params.containsKey("max-normal-freq"))
+ maxNormalFreq = Double.parseDouble(params.get("max-normal-freq"));
+ if(params.containsKey("p-value"))
+ pValueForHC = Double.parseDouble(params.get("p-value"));
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ try
+ {
+ // Obtain input file //
+ BufferedReader infile = VarScan.getInfile(args);
+
+ // If no input, print usage //
+
+ if(infile == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!infile.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+					System.err.println("Exception while trying to get input: " + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ if(args.length > 1)
+ statusFile = args[1];
+
+
+ // Declare output files //
+ PrintStream outSomatic = null;
+ PrintStream outSomaticHC = null;
+ PrintStream outGermline = null;
+ PrintStream outGermlineHC = null;
+ PrintStream outLOH = null;
+ PrintStream outLOHHC = null;
+
+ boolean isVCF = false;
+
+ // Open output files for Somatic, Somatic HC, Germline, and LOH //
+
+ if(statusFile.endsWith(".vcf"))
+ {
+ isVCF = true;
+ String nameString = statusFile.replace(".vcf", "");
+ System.err.println("Opening output files: " + nameString + ".Somatic.vcf " + nameString + ".Germline.vcf " + nameString + ".LOH.vcf ");
+ outSomatic = new PrintStream( new FileOutputStream(nameString + ".Somatic.vcf") );
+ outSomaticHC = new PrintStream( new FileOutputStream(nameString + ".Somatic.hc.vcf") );
+ outGermline = new PrintStream( new FileOutputStream(nameString + ".Germline.vcf") );
+ outGermlineHC = new PrintStream( new FileOutputStream(nameString + ".Germline.hc.vcf") );
+ outLOH = new PrintStream( new FileOutputStream(nameString + ".LOH.vcf") );
+ outLOHHC = new PrintStream( new FileOutputStream(nameString + ".LOH.hc.vcf") );
+
+ }
+ else
+ {
+ System.err.println("Opening output files: " + statusFile + ".Somatic " + statusFile + ".Germline " + statusFile + ".LOH ");
+ outSomatic = new PrintStream( new FileOutputStream(statusFile + ".Somatic") );
+ outSomaticHC = new PrintStream( new FileOutputStream(statusFile + ".Somatic.hc") );
+ outGermline = new PrintStream( new FileOutputStream(statusFile + ".Germline") );
+ outGermlineHC = new PrintStream( new FileOutputStream(statusFile + ".Germline.hc") );
+ outLOH = new PrintStream( new FileOutputStream(statusFile + ".LOH") );
+ outLOHHC = new PrintStream( new FileOutputStream(statusFile + ".LOH.hc") );
+
+ }
+
+
+
+
+
+ // Reset counters //
+ int numProcessed = 0;
+ int numSomatic = 0;
+ int numSomaticHC = 0;
+ int numGermline = 0;
+ int numLOH = 0;
+ int numGermlineHC = 0;
+ int numLOHHC = 0;
+
+ // Declare file-parsing variables //
+ String line = "";
+ int lineCounter = 0;
+
+ // Actually parse the infile //
+
+ while ((line = infile.readLine()) != null)
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+
+ if(lineContents.length >= 1)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+
+ if(line.startsWith("#"))
+ {
+ // VCF header line //
+ isVCF = true;
+ outSomatic.println(line);
+ outSomaticHC.println(line);
+ outGermline.println(line);
+ outGermlineHC.println(line);
+ outLOH.println(line);
+ outLOHHC.println(line);
+ }
+ else if(refName.equals("chrom") || refName.equals("Chrom"))
+ {
+ outSomatic.println(line);
+ outSomaticHC.println(line);
+ outGermline.println(line);
+ outGermlineHC.println(line);
+ outLOH.println(line);
+ outLOHHC.println(line);
+ }
+ else
+ {
+
+ int position = Integer.parseInt(lineContents[1]);
+ int normalReads1 = 0;
+ int normalReads2 = 0;
+ int tumorReads1 = 0;
+ int tumorReads2 = 0;
+ String somaticStatus = "";
+ double somaticPvalue = 0.00;
+ if(isVCF)
+ {
+ String info = lineContents[7];
+ String normal = lineContents[9];
+ String tumor = lineContents[10];
+
+ String[] infoContents = info.split(";");
+ for(int colCounter = 0; colCounter < infoContents.length; colCounter++)
+ {
+ String element = infoContents[colCounter];
+ String[] elementContents = element.split("=");
+ if(elementContents[0].equals("SS"))
+ somaticStatus = elementContents[1];
+ else if(elementContents[0].equals("GPV") && somaticStatus.equals("1"))
+ somaticPvalue = Double.parseDouble(elementContents[1]);
+ else if(elementContents[0].equals("SPV") && !somaticStatus.equals("1"))
+ somaticPvalue = Double.parseDouble(elementContents[1]);
+ }
+
+ String[] normalContents = normal.split(":");
+ normalReads1 = Integer.parseInt(normalContents[3]);
+ normalReads2 = Integer.parseInt(normalContents[4]);
+
+ String[] tumorContents = tumor.split(":");
+ tumorReads1 = Integer.parseInt(tumorContents[3]);
+ tumorReads2 = Integer.parseInt(tumorContents[4]);
+ }
+ else
+ {
+ normalReads1 = Integer.parseInt(lineContents[4]);
+ normalReads2 = Integer.parseInt(lineContents[5]);
+ tumorReads1 = Integer.parseInt(lineContents[8]);
+ tumorReads2 = Integer.parseInt(lineContents[9]);
+
+ somaticStatus = lineContents[12];
+ somaticPvalue = Double.parseDouble(lineContents[14]);
+ }
+
+ // Proceed //
+
+ double normalFreq = 0;
+ double tumorFreq = 0;
+
+ if(normalReads1 > 0 || normalReads2 > 0)
+ normalFreq = (double) normalReads2 / (double) (normalReads1 + normalReads2);
+
+ if(tumorReads1 > 0 || tumorReads2 > 0)
+ tumorFreq = (double) tumorReads2 / (double) (tumorReads1 + tumorReads2);
+
+ numProcessed++;
+
+ if(somaticStatus.equals("Somatic") || somaticStatus.equals("2"))
+ {
+ numSomatic++;
+ outSomatic.println(line);
+ if(normalFreq <= maxNormalFreq && tumorFreq >= minTumorFreq && somaticPvalue <= pValueForHC)
+ {
+ numSomaticHC++;
+ outSomaticHC.println(line);
+ }
+ }
+ else if(somaticStatus.equals("Germline") || somaticStatus.equals("1"))
+ {
+ numGermline++;
+ outGermline.println(line);
+ if(normalFreq >= minTumorFreq && tumorFreq >= minTumorFreq && somaticPvalue <= pValueForHC)
+ {
+ numGermlineHC++;
+ outGermlineHC.println(line);
+ }
+ }
+ else if(somaticStatus.equals("LOH") || somaticStatus.equals("3"))
+ {
+ numLOH++;
+ outLOH.println(line);
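+							// High-confidence LOH: the normal shows the variant and the tumor frequency moves further from 0.50 (heterozygous) than the normal's //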
+ double normalHetDistance = Math.abs(0.50 - normalFreq);
+ double tumorHetDistance = Math.abs(0.50 - tumorFreq);
+ if(normalFreq >= minTumorFreq && tumorHetDistance > normalHetDistance && somaticPvalue <= pValueForHC)
+ {
+ numLOHHC++;
+ outLOHHC.println(line);
+ }
+ }
+ }
+
+
+
+
+ }
+ catch(Exception e)
+ {
+						if(lineCounter == 1)
+						{
+							// Print the header to all six output files //
+							outSomatic.println(line);
+							outSomaticHC.println(line);
+							outGermline.println(line);
+							outGermlineHC.println(line);
+							outLOH.println(line);
+							outLOHHC.println(line);
+						}
+ else
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+ }
+ }
+
+ // Close the files //
+ infile.close();
+ outSomatic.close();
+ outSomaticHC.close();
+ outGermline.close();
+ outLOH.close();
+ outGermlineHC.close();
+ outLOHHC.close();
+
+ // Print the status report //
+
+ System.out.println(numProcessed + " VarScan calls processed");
+ System.out.println(numSomatic + " were Somatic (" + numSomaticHC + " high confidence)");
+ System.out.println(numGermline + " were Germline (" + numGermlineHC + " high confidence)");
+ System.out.println(numLOH + " were LOH (" + numLOHHC + " high confidence)");
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+
+ }
+
+
+	/**
+	 * Saves target positions into a BitSet hash keyed by chromosome
+	 *
+	 * @param	fileName	Name of file to be parsed
+	 * @param	fileType	Type of file ("positions" or "regions")
+	 * @param	marginSize	Shoulder bases to allow on either side of each target
+	 * @return	HashMap of per-chromosome BitSets with target positions set
+	 */
+ static HashMap<String, BitSet> loadTargets(String fileName, String fileType, int marginSize)
+ {
+ HashMap<String, BitSet> positionsByChrom = new HashMap<String, BitSet>();
+
+ try
+ {
+ BufferedReader infile = new BufferedReader(new FileReader(fileName));
+
+ String line = "";
+ int lineCounter = 0;
+
+ while ((line = infile.readLine()) != null)
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+ if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+
+ // Get or create BitSet for this refName //
+ BitSet refPositions;
+
+ if(positionsByChrom.containsKey(refName))
+ {
+ refPositions = positionsByChrom.get(refName);
+ }
+ else
+ {
+ refPositions = new BitSet();
+ }
+
+ // Mark position or regions, depending on what was provided //
+ int chrStart = 0;
+ int chrStop = 0;
+
+ if(fileType.equals("positions") && lineContents.length > 1)
+ {
+ // Set the position to true //
+ int position = Integer.parseInt(lineContents[1]);
+ chrStart = position - marginSize;
+ chrStop = position + marginSize;
+ }
+ else if(fileType.equals("regions") && lineContents.length > 2)
+ {
+ chrStart = Integer.parseInt(lineContents[1]) - marginSize;
+ chrStop = Integer.parseInt(lineContents[2]) + marginSize;
+ }
+
+ // Check that it won't be an infinite loop//
+ if(chrStart <= chrStop)
+ {
+ // Mark every position //
+ for(int position = chrStart; position <= chrStop; position++)
+ {
+ refPositions.set(position, true);
+ }
+ }
+
+ // Return it to the hash //
+ positionsByChrom.put(refName, refPositions);
+ }
+ catch(Exception e)
+ {
+ if(lineCounter > 1)
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+
+
+ }
+ }
+
+ infile.close();
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ }
+
+
+ return(positionsByChrom);
+ }
+
+}
\ No newline at end of file
diff --git a/net/sf/varscan/ReadCounts.java b/net/sf/varscan/ReadCounts.java
new file mode 100644
index 0000000..f0edc36
--- /dev/null
+++ b/net/sf/varscan/ReadCounts.java
@@ -0,0 +1,412 @@
+/**
+ * @(#)ReadCounts.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashMap;
+
+/**
+ * A class for obtaining read counts for a list of variants
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class ReadCounts {
+
+ public ReadCounts(String[] args, HashMap<String, String> params)
+ {
+ String usage = "USAGE: java -jar VarScan.jar readcounts [pileup] OPTIONS\n" +
+ "\tOPTIONS:\n" +
+ "\t--variants-file\tA list of variants at which to report readcounts\n" +
+ "\t--output-file\tOutput file to contain the readcounts\n" +
+ "\t--min-coverage\tMinimum read depth at a position to make a call [1]\n" +
+ "\t--min-base-qual\tMinimum base quality at a position to count a read [20]\n";
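+		// Example invocation (illustrative file names):
+		//   samtools mpileup -f ref.fa sample.bam | java -jar VarScan.jar readcounts --variants-file variants.txt --output-file readcounts.txt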
+
+ // Set parameter defaults //
+
+ String variantsFile = "";
+ String outputFile = "";
+ int minCoverage = 1;
+ int minBaseQual = 20;
+
+ // Get any user-provided parameters //
+
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+
+ if(params.containsKey("min-base-qual"))
+ minBaseQual = Integer.parseInt(params.get("min-base-qual"));
+
+ if(params.containsKey("variants-file"))
+ variantsFile = params.get("variants-file");
+
+ if(params.containsKey("output-file"))
+ outputFile = params.get("output-file");
+
+ System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min base qual:\t" + minBaseQual);
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Define the statistics hash and reset counters //
+
+		HashMap<String, Integer> stats = new HashMap<String, Integer>();
+		stats.put("numPositions", 0);
+		stats.put("numIncluded", 0);
+		stats.put("numCovered", 0);
+		stats.put("numParsingExceptions", 0);
+
+ // If a list of variants was provided, save it //
+
+ HashMap<String, String> variantPositions = new HashMap<String, String>();
+ if(params.containsKey("variants-file"))
+ {
+ System.err.println("Loading variant positions from " + variantsFile);
+ variantPositions = loadVariants(variantsFile);
+ System.err.println(variantPositions.size() + " variant positions saved");
+ }
+
+ // Attempt to obtain and parse pileup input //
+
+ try
+ {
+ // If output file was provided, open it //
+ PrintStream out = null; // declare a print stream object
+
+ if(params.containsKey("output-file"))
+ {
+ out = new PrintStream( new FileOutputStream(outputFile) );
+ out.println("chrom\tposition\tref_base\tdepth\tq" + minBaseQual + "_depth\tbase:reads:strands:avg_qual:map_qual:plus_reads:minus_reads");
+ }
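+			// Illustrative output line (made-up numbers), following the header above:
+			//   chr1	1234567	A	64	60	A:55:2:36:58:30:25	G:5:2:33:59:3:2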
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ System.exit(10);
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ if(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+
+ }
+ }
+
+ // If pileup input was provided, begin parsing it //
+
+ if(in.ready())
+ {
+ while ((line = in.readLine()) != null)
+ {
+ stats.put("numPositions", (stats.get("numPositions") + 1));
+
+ // Output progress line //
+ if(params.containsKey("verbose") && (stats.get("numPositions") % 100000) == 0)
+ System.err.println(stats.get("numPositions") + " positions parsed...");
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected pileup format //
+
+ if(lineContents.length > 5 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ String refName = lineContents[0];
+ String position = lineContents[1];
+ String refBase = lineContents[2].toUpperCase();
+ int readDepth = Integer.parseInt(lineContents[3]);
+ String readBases = lineContents[4];
+ String readQualities = lineContents[5];
+ String mapQualities = "";
+ if(lineContents.length > 6) // Get Map Qualities if available //
+ mapQualities = lineContents[6];
+
+ // If variant file was provided, verify that this position matches one in list //
+
+ if(!params.containsKey("variants-file") || variantPositions.containsKey(refName + "\t" + position))
+ {
+ stats.put("numIncluded", (stats.get("numIncluded") + 1));
+
+ // Build output line //
+ String outputLine = refName + "\t" + position + "\t" + refBase + "\t" + readDepth + "\t";
+
+ if(readDepth >= minCoverage)
+ {
+ stats.put("numCovered", (stats.get("numCovered") + 1));
+
+ // Obtain the readcounts //
+
+ HashMap<String, String> readCounts = VarScan.getReadCounts(refBase, readBases, readQualities, minBaseQual, mapQualities);
+
+ // Build array of allele keys observed and sort it //
+ String[] alleleKeys = (String[]) readCounts.keySet().toArray(new String[0]);
+ Arrays.sort(alleleKeys);
+
+ // Calculate the # of reads that met quality threshold //
+ int readDepthQual = VarScan.qualityDepth(readQualities, minBaseQual);
+
+
+ // Add the quality-met read count to output //
+
+ outputLine += readDepthQual + "\t";
+
+ // First, get the ref result //
+ //reads2 strands2 qual2 map2 readsPlus readsMinus
+ String refResult = "0\t0\t0\t0\t0\t0";
+ if(readCounts.containsKey(refBase))
+ refResult = readCounts.get(refBase);
+
+ // Replace tabs with colons for ref base //
+ refResult = refBase + ":" + refResult.replace("\t", ":");
+
+ outputLine += refResult + "\t";
+
+ // If a variant file was provided, try to get the desired variant allele //
+
+ String desiredAllele = "";
+ if(variantPositions.containsKey(refName + "\t" + position))
+ {
+ try
+ {
+ String[] varContents = variantPositions.get(refName + "\t" + position).split("\t");
+ if(varContents.length > 1 && varContents[1].length() > 0)
+ {
+ desiredAllele = varContents[1];
+
+ String varResult = "0\t0\t0\t0\t0\t0";
+ if(readCounts.containsKey(desiredAllele))
+ varResult = readCounts.get(desiredAllele);
+
+ outputLine += desiredAllele + "\t" + varResult + "\t";
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Warning: Error parsing variant position entry: " + variantPositions.get(refName + "\t" + position) + "\n" + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ }
+ }
+
+ // Go through all bases observed //
+
+ for(String allele : alleleKeys)
+ {
+ String[] alleleContents = readCounts.get(allele).split("\t");
+ if(allele.equals(refBase) || allele.equals(desiredAllele))
+ {
+ // Skip the reference base and desired base //
+ }
+ else
+ {
+ try {
+ int thisReads2 = Integer.parseInt(alleleContents[0]);
+ int thisStrands2 = Integer.parseInt(alleleContents[1]);
+ int thisAvgQual2 = Integer.parseInt(alleleContents[2]);
+ int thisMapQual2 = Integer.parseInt(alleleContents[3]);
+ int thisReads2plus = Integer.parseInt(alleleContents[4]);
+ int thisReads2minus = Integer.parseInt(alleleContents[5]);
+
+ String varResult = allele + ":" + thisReads2 + ":" + thisStrands2 + ":" + thisAvgQual2 + ":" + thisMapQual2 + ":" + thisReads2plus + ":" + thisReads2minus;
+ outputLine += varResult + "\t";
+ }
+ catch(Exception e)
+ {
+
+ }
+
+ }
+ }
+
+ // Print the output line //
+
+ if(params.containsKey("output-file"))
+ {
+ out.println(outputLine);
+ }
+ else
+ {
+									System.out.println(outputLine);
+ }
+ }
+ else if(variantPositions.containsKey(refName + "\t" + position))
+ {
+ // Insufficient coverage - Print the output line //
+
+ if(params.containsKey("output-file"))
+ {
+ out.println(outputLine);
+ }
+ else
+ {
+									System.out.println(outputLine);
+ }
+ }
+ }
+
+ }
+ else
+ {
+							System.err.println("Error: Invalid format for pileup at line " + stats.get("numPositions") + "\n" + line + "\n");
+ return;
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+ stats.put("numParsingExceptions", (stats.get("numParsingExceptions") + 1));
+ if(stats.get("numParsingExceptions") >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+
+
+ }
+ }
+ else
+ {
+ System.err.println("Input was not ready for parsing!");
+ return;
+ }
+
+ in.close();
+
+ if(params.containsKey("output-file"))
+ {
+ out.close();
+ }
+
+ // Print summary statistics //
+
+ System.err.println(stats.get("numPositions") + " positions in pileup file");
+ System.err.println(stats.get("numIncluded") + " included in readcount analysis");
+ System.err.println(stats.get("numCovered") + " met minimum coverage");
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error parsing input: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+
+ }
+
+
+	/**
+	 * Loads variant positions from a file into a HashMap
+	 *
+	 * @param	fileName	Tab-delimited file of chrom, position, and (optionally) ref/var alleles
+	 * @return	variants	HashMap keyed by "chrom\tposition" with "allele1\tallele2" values
+	 */
+ static HashMap<String, String> loadVariants(String fileName)
+ {
+ HashMap<String, String> variants = new HashMap<String, String>();
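+		// Expected input (illustrative values): tab-delimited chrom, position, and optional alleles, //
+		// e.g. "chr1	12345	A	G"; lines without alleles are kept with empty allele fields //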
+
+ try
+ {
+ BufferedReader infile = new BufferedReader(new FileReader(fileName));
+
+ String line = "";
+ int lineCounter = 0;
+
+ while ((line = infile.readLine()) != null)
+ {
+ lineCounter++;
+
+ String[] lineContents = line.split("\t");
+ if(lineContents.length >= 2)
+ {
+ // Try to parse chrom and position //
+ try
+ {
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+
+ String allele1 = "";
+ String allele2 = "";
+
+ try{
+ allele1 = lineContents[2];
+ allele2 = lineContents[3];
+ }
+					catch(Exception e)
+					{
+						// Alleles are optional; keep the position without them //
+					}
+
+ String positionKey = refName + "\t" + position;
+ variants.put(positionKey, allele1 + "\t" + allele2);
+ }
+ catch(Exception e)
+ {
+ if(lineCounter > 1)
+ System.err.println("Warning: Unable to parse chrom/position from " + line);
+ }
+
+
+ }
+ }
+
+ infile.close();
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ }
+
+
+ return(variants);
+ }
+}
diff --git a/net/sf/varscan/Somatic.java b/net/sf/varscan/Somatic.java
new file mode 100644
index 0000000..25bb198
--- /dev/null
+++ b/net/sf/varscan/Somatic.java
@@ -0,0 +1,1966 @@
+/**
+ * @(#)Somatic.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+//Import required packages //
+
+import java.io.*;
+import java.text.*;
+import java.util.*;
+import java.lang.Math.*;
+
+/**
+ * A class for determining somatic status of variants from Normal/Tumor pileup files
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class Somatic {
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Constructor with two arguments (string[], boolean) expects mpileup input //
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ public Somatic(String[] args, boolean isMpileup)
+ {
+ String usage = "USAGE: java -jar VarScan.jar somatic [normal-tumor.mpileup] [Opt: output] OPTIONS\n" +
+			"\tnormal-tumor.mpileup - The SAMtools mpileup file for Normal and Tumor BAMs\n" +
+ "\toutput - Output base name for SNP and indel output\n" +
+ "\nOPTIONS:\n" +
+ "\t--output-snp - Output file for SNP calls [output.snp]\n" +
+ "\t--output-indel - Output file for indel calls [output.indel]\n" +
+ "\t--min-coverage - Minimum coverage in normal and tumor to call variant [8]\n" +
+ "\t--min-coverage-normal - Minimum coverage in normal to call somatic [8]\n" +
+ "\t--min-coverage-tumor - Minimum coverage in tumor to call somatic [6]\n" +
+			"\t--min-var-freq - Minimum variant frequency to call a heterozygote [0.20]\n" +
+ "\t--min-freq-for-hom\tMinimum frequency to call homozygote [0.75]\n" +
+ "\t--normal-purity - Estimated purity (non-tumor content) of normal sample [1.00]\n" +
+ "\t--tumor-purity - Estimated purity (tumor content) of tumor sample [1.00]\n" +
+ "\t--p-value - P-value threshold to call a heterozygote [0.99]\n" +
+ "\t--somatic-p-value - P-value threshold to call a somatic site [0.05]\n" +
+			"\t--strand-filter - If set to 1, removes variants with >90% strand bias [1]\n" +
+ "\t--validation - If set to 1, outputs all compared positions even if non-variant\n" +
+ "\t--output-vcf - If set to 1, output VCF instead of VarScan native format\n";
+
+ String vcfHeader = "##fileformat=VCFv4.1";
+ vcfHeader += "\n" + "##source=VarScan2";
+ vcfHeader += "\n" + "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total depth of quality bases\">";
+ vcfHeader += "\n" + "##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Indicates if record is a somatic mutation\">";
+ vcfHeader += "\n" + "##INFO=<ID=SS,Number=1,Type=String,Description=\"Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)\">";
+ vcfHeader += "\n" + "##INFO=<ID=SSC,Number=1,Type=String,Description=\"Somatic score in Phred scale (0-255) derived from somatic p-value\">";
+ vcfHeader += "\n" + "##INFO=<ID=GPV,Number=1,Type=Float,Description=\"Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls\">";
+ vcfHeader += "\n" + "##INFO=<ID=SPV,Number=1,Type=Float,Description=\"Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls\">";
+ vcfHeader += "\n" + "##FILTER=<ID=str10,Description=\"Less than 10% or more than 90% of variant supporting reads on one strand\">";
+ vcfHeader += "\n" + "##FILTER=<ID=indelError,Description=\"Likely artifact due to indel reads at this position\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RD,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases (reads1)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases (reads2)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=FREQ,Number=1,Type=String,Description=\"Variant allele frequency\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=DP4,Number=1,Type=String,Description=\"Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev\">";
+ vcfHeader += "\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNORMAL\tTUMOR";
+
+ // Set parameter defaults //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ // Set up formatting for p-values //
+ DecimalFormat pvalueFormat = new DecimalFormat("0.####E0");
+ DecimalFormat oneDigit = new DecimalFormat("#0.0");
+ DecimalFormat threeDigits = new DecimalFormat("#0.000");
+
+ // Establish output file names //
+ String outputName = "output";
+ String outputSnp = "";
+ String outputIndel = "";
+ String outputCopy = "";
+
+ if(args.length >= 3 && !args[2].startsWith("-"))
+ {
+ outputName = args[2];
+ outputSnp = outputName + ".snp";
+ outputIndel = outputName + ".indel";
+ }
+
+// Set parameter defaults //
+
+ int minCoverage = 8;
+ int minCoverageNormal = 8;
+ int minCoverageTumor = 6;
+ int minReads2 = 2;
+ int minStrands2 = 1;
+ int minAvgQual = 15;
+ double normalPurity = 1.00;
+ double tumorPurity = 1.00;
+ double dataRatio = 1.00;
+ double minVarFreq = 0.20;
+ double pValueThreshold = 0.99;
+ double somaticPvalue = 0.05; //1.0e-04;
+ double minFreqForHom = 0.75;
+ boolean doStrandFilter = true;
+
+		// Try adjusting any provided parameters based on user input //
+ try
+ {
+ if(params.containsKey("output-snp"))
+ outputSnp = params.get("output-snp");
+
+ if(params.containsKey("output-indel"))
+ outputIndel = params.get("output-indel");
+
+ if(params.containsKey("min-coverage"))
+ {
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+ minCoverageNormal = minCoverage;
+ minCoverageTumor = minCoverage;
+ }
+
+ if(params.containsKey("min-coverage-normal"))
+ minCoverageNormal = Integer.parseInt(params.get("min-coverage-normal"));
+
+ if(params.containsKey("min-coverage-tumor"))
+ minCoverageTumor = Integer.parseInt(params.get("min-coverage-tumor"));
+
+ if(params.containsKey("min-reads2"))
+ minReads2 = Integer.parseInt(params.get("min-reads2"));
+
+ if(params.containsKey("min-strands2"))
+ minStrands2 = Integer.parseInt(params.get("min-strands2"));
+
+ if(params.containsKey("min-var-freq"))
+ minVarFreq = Double.parseDouble(params.get("min-var-freq"));
+
+ if(params.containsKey("min-freq-for-hom"))
+ minFreqForHom = Double.parseDouble(params.get("min-freq-for-hom"));
+
+ if(params.containsKey("min-avg-qual"))
+ minAvgQual = Integer.parseInt(params.get("min-avg-qual"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+ if(params.containsKey("somatic-p-value"))
+ somaticPvalue = Double.parseDouble(params.get("somatic-p-value"));
+
+ if(params.containsKey("data-ratio"))
+ dataRatio = Double.parseDouble(params.get("data-ratio"));
+
+ if(params.containsKey("normal-purity"))
+ {
+ normalPurity = Double.parseDouble(params.get("normal-purity"));
+ if(normalPurity > 1)
+ normalPurity = normalPurity / 100.00;
+ }
+
+ if(params.containsKey("tumor-purity"))
+ {
+ tumorPurity = Double.parseDouble(params.get("tumor-purity"));
+ if(tumorPurity > 1)
+					tumorPurity = tumorPurity / 100.00;
+ }
+
+ if(params.containsKey("strand-filter"))
+ {
+ int filter = Integer.parseInt(params.get("strand-filter"));
+ if(filter > 0)
+ doStrandFilter = true;
+ else
+ doStrandFilter = false;
+ }
+
+// System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min coverage:\t" + minCoverageNormal + "x for Normal, " + minCoverageTumor + "x for Tumor");
+ System.err.println("Min reads2:\t" + minReads2);
+ System.err.println("Min strands2:\t" + minStrands2);
+ System.err.println("Min var freq:\t" + minVarFreq);
+ System.err.println("Min freq for hom:\t" + minFreqForHom);
+ System.err.println("Normal purity:\t" + normalPurity);
+ System.err.println("Tumor purity:\t" + tumorPurity);
+ System.err.println("Min avg qual:\t" + minAvgQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+ System.err.println("Somatic p-value:\t" + somaticPvalue);
+ if(params.containsKey("validation"))
+ System.err.println("Validation mode: on");
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Check for correct input //
+
+ if(outputSnp.length() == 0 || outputIndel.length() == 0)
+ {
+ System.err.println("Please provide an output basename or SNP/indel output files!");
+ System.err.println(usage);
+ System.exit(1);
+ }
+
+ // Statistics counters //
+ long tumorPositions = 0;
+ long sharedPositions = 0;
+ long comparedPositions = 0;
+ long calledReference = 0;
+ long indelFilter = 0;
+ long strandFilter = 0;
+ long calledGermline = 0;
+ long calledLOH = 0;
+ long calledSomatic = 0;
+ long calledUnknown = 0;
+ long calledVariant = 0;
+
+
+ try
+ {
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+			    	System.err.println("Exception while trying to get input: " + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ // Proceed if input stream is ready //
+
+ if(in != null && in.ready())
+ {
+ // Declare some file-parsing variables //
+ String lineNormal;
+ String lineTumor;
+ String chromNormal = "";
+ String chromTumor = "";
+ String refBase = "";
+ int posNormal = 0;
+ int posTumor = 0;
+
+ // Declare output file //
+ PrintStream outSnp = null; // declare a print stream object for SNPs
+ PrintStream outIndel = null; // declare a print stream object for Indels
+ PrintStream outValidation = null; // declare a print stream object for both for validation
+				PrintStream outCopyNumber = null; // declare a print stream object for copy number output
+
+ if(params.containsKey("output-vcf"))
+ {
+ if(!outputSnp.contains(".vcf"))
+ outputSnp += ".vcf";
+ if(!outputIndel.contains(".vcf"))
+ outputIndel += ".vcf";
+ }
+ outSnp = new PrintStream( new FileOutputStream(outputSnp) );
+ outIndel = new PrintStream( new FileOutputStream(outputIndel) );
+
+ if(!params.containsKey("no-headers") && !params.containsKey("output-vcf"))
+ {
+ outSnp.println("chrom\tposition\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\tnormal_gt\ttumor_reads1\ttumor_reads2\ttumor_var_freq\ttumor_gt\tsomatic_status\tvariant_p_value\tsomatic_p_value\ttumor_reads1_plus\ttumor_reads1_minus\ttumor_reads2_plus\ttumor_reads2_minus\tnormal_reads1_plus\tnormal_reads1_minus\tnormal_reads2_plus\tnormal_reads2_minus");
+ outIndel.println("chrom\tposition\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\tnormal_gt\ttumor_reads1\ttumor_reads2\ttumor_var_freq\ttumor_gt\tsomatic_status\tvariant_p_value\tsomatic_p_value\ttumor_reads1_plus\ttumor_reads1_minus\ttumor_reads2_plus\ttumor_reads2_minus\tnormal_reads1_plus\tnormal_reads1_minus\tnormal_reads2_plus\tnormal_reads2_minus");
+ }
+
+ if(params.containsKey("output-vcf"))
+ {
+ // Output VCF Header //
+ outSnp.println(vcfHeader);
+ outIndel.println(vcfHeader);
+ }
+
+ if(params.containsKey("validation"))
+ {
+ outValidation = new PrintStream( new FileOutputStream(outputName + ".validation") );
+ if(!params.containsKey("no-headers") && !params.containsKey("output-vcf"))
+ outValidation.println("chrom\tposition\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\tnormal_gt\ttumor_reads1\ttumor_reads2\ttumor_var_freq\ttumor_gt\tsomatic_status\tvariant_p_value\tsomatic_p_value\ttumor_reads1_plus\ttumor_reads1_minus\ttumor_reads2_plus\ttumor_reads2_minus");
+ if(params.containsKey("output-vcf"))
+ {
+ // Output VCF Header //
+ outValidation.println(vcfHeader);
+ }
+ }
+
+ // Parse the infile line by line //
+ System.err.println("Reading mpileup input...");
+ int numParsingExceptions = 0;
+
+ while ((line = in.readLine()) != null)
+ {
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected pileup format //
+
+ if(lineContents.length > 5 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ sharedPositions++;
+
+ // Parse common fields from line //
+ String refName = lineContents[0];
+ int position = Integer.parseInt(lineContents[1]);
+ refBase = lineContents[2].toUpperCase();
+
+ chromNormal = refName;
+ chromTumor = refName;
+ posNormal = position;
+ posTumor = position;
+
+ // Parse normal, which should be first sample //
+ int normalOffset = 3;
+ int pileupDepthNormal = Integer.parseInt(lineContents[normalOffset]);
+ String normalBases = lineContents[normalOffset + 1];
+ String normalQualities = lineContents[normalOffset + 2];
+
+ // Parse tumor, which should be second sample //
+ int tumorOffset = 6;
+ int pileupDepthTumor = Integer.parseInt(lineContents[tumorOffset]);
+ String tumorBases = lineContents[tumorOffset + 1];
+ String tumorQualities = lineContents[tumorOffset + 2];
+
+ lineNormal = refName + "\t" + position + "\t" + refBase + "\t" + pileupDepthNormal + "\t" + normalBases + "\t" + normalQualities;
+ lineTumor = refName + "\t" + position + "\t" + refBase + "\t" + pileupDepthTumor + "\t" + tumorBases + "\t" + tumorQualities;
+
+ String compareResult = comparePositions(lineNormal, lineTumor, minCoverage, minReads2, minVarFreq, minAvgQual, pValueThreshold, somaticPvalue, minFreqForHom, normalPurity, tumorPurity);
+
+ if(compareResult.length() > 0)
+ {
+ // Get the alleles to determine type //
+ String[] compareContents = compareResult.split("\t");
+ String allele1 = compareContents[0];
+ String allele2 = compareContents[1];
+
+ double strandedness1 = 0.50;
+ double strandedness2 = 0.50;
+ double strandednessDiff = 0.00;
+
+ if(compareContents.length >= 17)
+ {
+ try
+ {
+ int tumorReads1plus = Integer.parseInt(compareContents[13]);
+ int tumorReads1minus = Integer.parseInt(compareContents[14]);
+ int tumorReads2plus = Integer.parseInt(compareContents[15]);
+ int tumorReads2minus = Integer.parseInt(compareContents[16]);
+
+ if(tumorReads1plus > 0 || tumorReads1minus > 0)
+ {
+ strandedness1 = (double) tumorReads1plus / (double) (tumorReads1plus + tumorReads1minus);
+ }
+
+ if(tumorReads2plus > 0 || tumorReads2minus > 0)
+ {
+ strandedness2 = (double) tumorReads2plus / (double) (tumorReads2plus + tumorReads2minus);
+ if(tumorReads1plus > 0 || tumorReads1minus > 0)
+ {
+ strandednessDiff = java.lang.Math.abs(strandedness1 - strandedness2);
+ }
+ }
+ }
+ catch(Exception e)
+ {
+ // Exception parsing info from compareResult //
+ }
+ }
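+							// Worked example (hypothetical counts): tumorReads2plus=18, tumorReads2minus=2 //
+							// gives strandedness2 = 18/20 = 0.90; with strandedness1 = 0.50, //
+							// strandednessDiff = 0.40, enough to trip the str10 filter below //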
+
+ //stats.put("comparedPositions", (stats.get("comparedPositions") + 1));
+ comparedPositions++;
+
+ if(params.containsKey("verbose") && !compareResult.contains("Reference"))
+ System.err.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+
+ // If VCF format specified, supply it //
+
+ if(params.containsKey("output-vcf"))
+ {
+ int normalReads1 = Integer.parseInt(compareContents[2]);
+ int normalReads2 = Integer.parseInt(compareContents[3]);
+ String normalFreq = compareContents[4];
+ String normalCall = compareContents[5];
+ int tumorReads1 = Integer.parseInt(compareContents[6]);
+ int tumorReads2 = Integer.parseInt(compareContents[7]);
+ String tumorFreq = compareContents[8];
+ String tumorCall = compareContents[9];
+ String somStatus = compareContents[10];
+ Double germlineP = Double.parseDouble(compareContents[11]);
+ Double somaticP = Double.parseDouble(compareContents[12]);
+
+ int totalDepth = pileupDepthNormal + pileupDepthTumor;
+
+ if(allele2.startsWith("+"))
+ {
+ // INSERTION //
+ // Ref = ref base; Var = ref base followed by inserted bases //
+ String varColumn = allele1 + allele2.replace("+", "");
+ compareResult = "." + "\t" + allele1 + "\t" + varColumn + "\t" + ".";
+ }
+ else if(allele2.startsWith("-"))
+ {
+ // DELETION //
+ // Ref = ref base followed by deleted bases; var = ref base //
+ String refColumn = allele1 + allele2.replace("-", "");
+ compareResult = "." + "\t" + refColumn + "\t" + allele1 + "\t" + ".";
+ }
+ else
+ {
+ compareResult = "." + "\t" + allele1 + "\t" + allele2 + "\t" + ".";
+ }
+
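+							// Illustration of the VCF representation above (hypothetical ref base A): //
+							// insertion "+CT" becomes REF=A, ALT=ACT; deletion "-CT" becomes REF=ACT, ALT=A //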
+
+ // Decide on filter field //
+ if(doStrandFilter && strandednessDiff > 0.10 && (strandedness2 < 0.10 || strandedness2 > 0.90))
+ {
+ compareResult += "\t" + "str10";
+ }
+ else if(somStatus.equals("IndelFilter"))
+ {
+ compareResult += "\t" + "indelError";
+ }
+ else
+ {
+ compareResult += "\t" + "PASS";
+ }
+
+ // Determine somatic status id and score //
+ int ssCode = 0;
+ double somScore = 0;
+
+ if(somStatus.equals("Reference"))
+ {
+ // Wildtype //
+ ssCode = 0;
+ calledReference++;
+ }
+ else if(somStatus.equals("Germline"))
+ {
+ // Germline //
+ ssCode = 1;
+ calledGermline++;
+ if(somaticP == 0)
+ {
+ somScore = 0;
+ }
+ else
+ {
+ somScore = 0 - (10 * java.lang.Math.log10(somaticP));
+ }
+ }
+ else if(somStatus.equals("Somatic"))
+ {
+ // Somatic //
+ ssCode = 2;
+ calledSomatic++;
+ if(somaticP == 0)
+ {
+ somScore = 255;
+ }
+ else
+ {
+ somScore = 0 - (10 * java.lang.Math.log10(somaticP));
+ }
+ }
+ else if(somStatus.equals("LOH"))
+ {
+ // LOH //
+ ssCode = 3;
+ calledLOH++;
+ if(somaticP == 0)
+ {
+ somScore = 255;
+ }
+ else
+ {
+ somScore = 0 - (10 * java.lang.Math.log10(somaticP));
+ }
+ }
+ else
+ {
+ // Unknown //
+ calledUnknown++;
+ ssCode = 5;
+ }
+
+ // Adjust somatic score //
+ if(somScore > 255)
+ somScore = 255;
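+							// Example: somaticP = 1e-6 gives somScore = -10 * log10(1e-6) = 60; //
+							// any p-value below ~3.2e-26 would exceed 255 and is capped here //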
+
+ // Print the info field //
+
+ compareResult += "\t" + "DP=" + totalDepth;
+ if(somStatus.equals("Somatic"))
+ compareResult += ";SOMATIC";
+ compareResult += ";" + "SS=" + ssCode;
+ compareResult += ";" + "SSC=" + (int) somScore;
+ compareResult += ";" + "GPV=" + pvalueFormat.format(germlineP);
+ compareResult += ";" + "SPV=" + pvalueFormat.format(somaticP);
+
+ // Print the format field //
+
+ String tumorDP4 = "";
+ String normalDP4 = "";
+
+ if(compareContents.length >= 17)
+ {
+ try
+ {
+ tumorDP4 = compareContents[13] + "," + compareContents[14] + "," + compareContents[15] + "," + compareContents[16];
+ normalDP4 = compareContents[17] + "," + compareContents[18] + "," + compareContents[19] + "," + compareContents[20];
+ }
+ catch(Exception e)
+ {
+ // Exception parsing info from compareResult //
+ tumorDP4 = "";
+ normalDP4 = "";
+ }
+ }
+
+ if(tumorDP4.length() > 0)
+ compareResult += "\tGT:GQ:DP:RD:AD:FREQ:DP4";
+ else
+ compareResult += "\tGT:GQ:DP:RD:AD:FREQ";
+
+ // Determine normal genotype //
+ String normalGt = ".";
+ String tumorGt = ".";
+ if(normalCall.equals(refBase))
+ {
+ normalGt = "0/0";
+ }
+ else if(VarScan.isHeterozygous(normalCall))
+ {
+ normalGt = "0/1";
+ }
+ else
+ {
+ normalGt = "1/1";
+ }
+
+ if(tumorCall.equals(refBase))
+ {
+ tumorGt = "0/0";
+ }
+ else if(VarScan.isHeterozygous(tumorCall))
+ {
+ tumorGt = "0/1";
+ }
+ else
+ {
+ tumorGt = "1/1";
+ }
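+							// Genotype coding used above: calls matching the ref base become 0/0, //
+							// heterozygous calls (per VarScan.isHeterozygous) become 0/1, and any //
+							// other non-reference call is reported as homozygous 1/1 //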
+
+ if(tumorDP4.length() > 0)
+ {
+ compareResult += "\t" + normalGt + ":.:" + pileupDepthNormal + ":" + normalReads1 + ":" + normalReads2 + ":" + normalFreq + ":" + normalDP4;
+ compareResult += "\t" + tumorGt + ":.:" + pileupDepthTumor + ":" + tumorReads1 + ":" + tumorReads2 + ":" + tumorFreq + ":" + tumorDP4;
+ }
+ else
+ {
+ compareResult += "\t" + normalGt + ":.:" + pileupDepthNormal + ":" + normalReads1 + ":" + normalReads2 + ":" + normalFreq;
+ compareResult += "\t" + tumorGt + ":.:" + pileupDepthTumor + ":" + tumorReads1 + ":" + tumorReads2 + ":" + tumorFreq;
+ }
+ }
+
+ // Print to master file for validation //
+
+ if(params.containsKey("validation"))
+ {
+ outValidation.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+
+ if(!params.containsKey("validation") && (compareResult.contains("Reference") || compareResult.contains("SS=0") || compareResult.contains("Filter")))
+ {
+ // Don't print reference/indelfilter positions unless doing validation //
+ }
+ else if(doStrandFilter && strandednessDiff > 0.10 && (strandedness2 < 0.10 || strandedness2 > 0.90))
+ {
+ // If filter is set, ignore variants that are supported largely by one strand //
+ if(!params.containsKey("output-vcf"))
+ compareResult = "StrandFilter";
+ }
+ else if(allele1.contains("-") || allele1.contains("+") || allele2.contains("-") || allele2.contains("+"))//if(compareResult.contains("INS") || compareResult.contains("DEL"))
+ {
+ outIndel.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+ else
+ {
+ outSnp.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+ }
+ else
+ {
+// System.err.println("Uncalled" + chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+
+ if(compareResult.contains("Reference"))
+ calledReference++; //stats.put("calledReference", (stats.get("calledReference") + 1));
+ else if(compareResult.contains("IndelFilter"))
+ indelFilter++; //stats.put("indelFilter", (stats.get("indelFilter") + 1));
+ else if(compareResult.contains("StrandFilter"))
+ strandFilter++;
+ else if(compareResult.contains("Germline"))
+ calledGermline++; //stats.put("calledGermline", (stats.get("calledGermline") + 1));
+ else if(compareResult.contains("Somatic"))
+ calledSomatic++; //stats.put("calledSomatic", (stats.get("calledSomatic") + 1));
+ else if(compareResult.contains("LOH"))
+ calledLOH++; //stats.put("calledLOH", (stats.get("calledLOH") + 1));
+ else if(compareResult.contains("Unknown"))
+ calledUnknown++; //stats.put("calledUnknown", (stats.get("calledUnknown") + 1));
+ else if(compareResult.contains("Variant"))
+ calledVariant++; //stats.put("calledVariant", (stats.get("calledVariant") + 1));
+
+
+
+ }
+ else
+ {
+ System.err.println("Error: Invalid format or not enough samples in mpileup: " + line + "\n");
+ return;
+ }
+ }
+				catch(Exception e)
+				{
+			    	System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getLocalizedMessage());
+			    	numParsingExceptions++;
+			    	if(numParsingExceptions >= 5)
+			    	{
+			    		System.err.println("Too many parsing exceptions encountered; exiting");
+			    		return;
+			    	}
+			    	// Otherwise, skip this line and continue parsing //
+				}
+ }
+
+ // Close input/output files //
+ in.close();
+ outSnp.close();
+ outIndel.close();
+
+ System.err.println(sharedPositions + " positions in mpileup file"); //stats.get("sharedPositions")
+ System.err.println(comparedPositions + " had sufficient coverage for comparison"); //stats.get("comparedPositions")
+ System.err.println(calledReference + " were called Reference"); //stats.get("calledReference")
+ System.err.println(indelFilter + " were mixed SNP-indel calls and filtered");
+ if(doStrandFilter)
+ System.err.println(strandFilter + " were removed by the strand filter");
+ System.err.println(calledGermline + " were called Germline");
+ System.err.println(calledLOH + " were called LOH");
+ System.err.println(calledSomatic + " were called Somatic");
+ System.err.println(calledUnknown + " were called Unknown");
+ System.err.println(calledVariant + " were called Variant");
+ }
+ else
+ {
+				System.err.println("Input file was never ready for parsing (possibly a file I/O issue)");
+ System.exit(10);
+ }
+ }
+ catch (IOException e)
+ {
+ System.err.println("File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Constructor with one argument (string[]) expects two-pileup input //
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ public Somatic(String[] args)
+ {
+ String usage = "USAGE: VarScan somatic [normal_pileup] [tumor_pileup] [Opt: output] OPTIONS\n" +
+ "\tnormal_pileup - The SAMtools pileup file for Normal\n" +
+ "\ttumor_pileup - The SAMtools pileup file for Tumor\n" +
+ "\toutput - Output base name for SNP and indel output\n" +
+ "\nOPTIONS:\n" +
+ "\t--output-snp - Output file for SNP calls [output.snp]\n" +
+ "\t--output-indel - Output file for indel calls [output.indel]\n" +
+ "\t--min-coverage - Minimum coverage in normal and tumor to call variant [8]\n" +
+ "\t--min-coverage-normal - Minimum coverage in normal to call somatic [8]\n" +
+ "\t--min-coverage-tumor - Minimum coverage in tumor to call somatic [6]\n" +
+			"\t--min-var-freq - Minimum variant frequency to call a heterozygote [0.20]\n" +
+ "\t--min-freq-for-hom\tMinimum frequency to call homozygote [0.75]\n" +
+ "\t--normal-purity - Estimated purity (non-tumor content) of normal sample [1.00]\n" +
+ "\t--tumor-purity - Estimated purity (tumor content) of tumor sample [1.00]\n" +
+ "\t--p-value - P-value threshold to call a heterozygote [0.99]\n" +
+ "\t--somatic-p-value - P-value threshold to call a somatic site [0.05]\n" +
+ "\t--strand-filter - If set to 1, removes variants with >90% strand bias [0]\n" +
+ "\t--validation - If set to 1, outputs all compared positions even if non-variant\n" +
+ "\t--output-vcf - If set to 1, output VCF instead of VarScan native format\n";
+
+ String vcfHeader = "##fileformat=VCFv4.1";
+ vcfHeader += "\n" + "##source=VarScan2";
+ vcfHeader += "\n" + "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total depth of quality bases\">";
+ vcfHeader += "\n" + "##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Indicates if record is a somatic mutation\">";
+ vcfHeader += "\n" + "##INFO=<ID=SS,Number=1,Type=String,Description=\"Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)\">";
+ vcfHeader += "\n" + "##INFO=<ID=SSC,Number=1,Type=String,Description=\"Somatic score in Phred scale (0-255) derived from somatic p-value\">";
+ vcfHeader += "\n" + "##INFO=<ID=GPV,Number=1,Type=Float,Description=\"Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls\">";
+ vcfHeader += "\n" + "##INFO=<ID=SPV,Number=1,Type=Float,Description=\"Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls\">";
+ vcfHeader += "\n" + "##FILTER=<ID=str10,Description=\"Less than 10% or more than 90% of variant supporting reads on one strand\">";
+ vcfHeader += "\n" + "##FILTER=<ID=indelError,Description=\"Likely artifact due to indel reads at this position\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RD,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases (reads1)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases (reads2)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=FREQ,Number=1,Type=String,Description=\"Variant allele frequency\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=DP4,Number=1,Type=String,Description=\"Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev\">";
+ vcfHeader += "\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNORMAL\tTUMOR";
+
+ if(args.length < 3)
+ {
+ System.err.println(usage);
+ System.exit(1);
+ }
+
+ // Get the required arguments //
+ String normalPileupFile = args[1];
+ String tumorPileupFile = args[2];
+
+ String outputName = "output";
+ String outputSnp = "";
+ String outputIndel = "";
+ String outputCopy = "";
+
+ if(args.length >= 4 && !args[3].startsWith("-"))
+ {
+ outputName = args[3];
+ outputSnp = outputName + ".snp";
+ outputIndel = outputName + ".indel";
+ }
+
+
+ System.err.println("Normal Pileup: " + normalPileupFile);
+ System.err.println("Tumor Pileup: " + tumorPileupFile);
+
+ // Set parameter defaults //
+
+ int minCoverage = 8;
+ int minCoverageNormal = 8;
+ int minCoverageTumor = 6;
+ int minReads2 = 2;
+ int minStrands2 = 1;
+ int minAvgQual = 15;
+ double normalPurity = 1.00;
+ double tumorPurity = 1.00;
+ double dataRatio = 1.00;
+ double minVarFreq = 0.20;
+ double pValueThreshold = 0.99;
+ double somaticPvalue = 0.05; //1.0e-04;
+ double minFreqForHom = 0.75;
+
+ // Parse command-line parameters //
+ HashMap<String, String> params = VarScan.getParams(args);
+
+		// Try adjusting any provided parameters based on user input //
+ try
+ {
+ if(params.containsKey("output-snp"))
+ outputSnp = params.get("output-snp");
+
+ if(params.containsKey("output-indel"))
+ outputIndel = params.get("output-indel");
+
+ if(params.containsKey("min-coverage"))
+ {
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+ minCoverageNormal = minCoverage;
+ minCoverageTumor = minCoverage;
+ }
+
+ if(params.containsKey("min-coverage-normal"))
+ minCoverageNormal = Integer.parseInt(params.get("min-coverage-normal"));
+
+ if(params.containsKey("min-coverage-tumor"))
+ minCoverageTumor = Integer.parseInt(params.get("min-coverage-tumor"));
+
+ if(params.containsKey("min-reads2"))
+ minReads2 = Integer.parseInt(params.get("min-reads2"));
+
+ if(params.containsKey("min-strands2"))
+ minStrands2 = Integer.parseInt(params.get("min-strands2"));
+
+ if(params.containsKey("min-var-freq"))
+ minVarFreq = Double.parseDouble(params.get("min-var-freq"));
+
+ if(params.containsKey("min-freq-for-hom"))
+ minFreqForHom = Double.parseDouble(params.get("min-freq-for-hom"));
+
+ if(params.containsKey("min-avg-qual"))
+ minAvgQual = Integer.parseInt(params.get("min-avg-qual"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+ if(params.containsKey("somatic-p-value"))
+ somaticPvalue = Double.parseDouble(params.get("somatic-p-value"));
+
+ if(params.containsKey("data-ratio"))
+ dataRatio = Double.parseDouble(params.get("data-ratio"));
+
+ if(params.containsKey("normal-purity"))
+ {
+ normalPurity = Double.parseDouble(params.get("normal-purity"));
+ if(normalPurity > 1)
+ normalPurity = normalPurity / 100.00;
+ }
+
+ if(params.containsKey("tumor-purity"))
+ {
+ tumorPurity = Double.parseDouble(params.get("tumor-purity"));
+ if(tumorPurity > 1)
+					tumorPurity = tumorPurity / 100.00;
+ }
+
+// System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min coverage:\t" + minCoverageNormal + "x for Normal, " + minCoverageTumor + "x for Tumor");
+ System.err.println("Min reads2:\t" + minReads2);
+ System.err.println("Min strands2:\t" + minStrands2);
+ System.err.println("Min var freq:\t" + minVarFreq);
+ System.err.println("Min freq for hom:\t" + minFreqForHom);
+ System.err.println("Normal purity:\t" + normalPurity);
+ System.err.println("Tumor purity:\t" + tumorPurity);
+ System.err.println("Min avg qual:\t" + minAvgQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+ System.err.println("Somatic p-value:\t" + somaticPvalue);
+ if(params.containsKey("validation"))
+ System.err.println("Validation mode: on");
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(1);
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Check for correct input //
+
+		if(outputSnp.length() == 0 || outputIndel.length() == 0)
+ {
+ System.err.println("Please provide SNP and Indel output files!");
+ System.err.println(usage);
+ System.exit(1);
+ }
+
+
+ // Statistics counters //
+ long tumorPositions = 0;
+ long sharedPositions = 0;
+ long comparedPositions = 0;
+ long calledReference = 0;
+ long indelFilter = 0;
+ long strandFilter = 0;
+ long calledGermline = 0;
+ long calledLOH = 0;
+ long calledSomatic = 0;
+ long calledUnknown = 0;
+ long calledVariant = 0;
+ DecimalFormat pvalueFormat = new DecimalFormat("0.####E0");
+
+ try
+ {
+ // Declare output file //
+ PrintStream outSnp = null; // declare a print stream object for SNPs
+ PrintStream outIndel = null; // declare a print stream object for Indels
+ PrintStream outValidation = null; // declare a print stream object for both for validation
+			PrintStream outCopyNumber = null; // declare a print stream object for copy number output
+
+ if(params.containsKey("output-vcf"))
+ {
+ if(!outputSnp.contains(".vcf"))
+ outputSnp += ".vcf";
+ if(!outputIndel.contains(".vcf"))
+ outputIndel += ".vcf";
+ }
+
+ outSnp = new PrintStream( new FileOutputStream(outputSnp) );
+ outIndel = new PrintStream( new FileOutputStream(outputIndel) );
+ if(!params.containsKey("no-headers") && !params.containsKey("output-vcf"))
+ {
+ outSnp.println("chrom\tposition\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\tnormal_gt\ttumor_reads1\ttumor_reads2\ttumor_var_freq\ttumor_gt\tsomatic_status\tvariant_p_value\tsomatic_p_value\ttumor_reads1_plus\ttumor_reads1_minus\ttumor_reads2_plus\ttumor_reads2_minus\tnormal_reads1_plus\tnormal_reads1_minus\tnormal_reads2_plus\tnormal_reads2_minus");
+ outIndel.println("chrom\tposition\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\tnormal_gt\ttumor_reads1\ttumor_reads2\ttumor_var_freq\ttumor_gt\tsomatic_status\tvariant_p_value\tsomatic_p_value\ttumor_reads1_plus\ttumor_reads1_minus\ttumor_reads2_plus\ttumor_reads2_minus\tnormal_reads1_plus\tnormal_reads1_minus\tnormal_reads2_plus\tnormal_reads2_minus");
+ }
+ if(params.containsKey("output-vcf"))
+ {
+ // Output VCF Header //
+ outSnp.println(vcfHeader);
+ outIndel.println(vcfHeader);
+ }
+
+ if(params.containsKey("validation"))
+ {
+ outValidation = new PrintStream( new FileOutputStream(outputName + ".validation") );
+ if(!params.containsKey("no-headers") && !params.containsKey("output-vcf"))
+ outValidation.println("chrom\tposition\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\tnormal_gt\ttumor_reads1\ttumor_reads2\ttumor_var_freq\ttumor_gt\tsomatic_status\tvariant_p_value\tsomatic_p_value\ttumor_reads1_plus\ttumor_reads1_minus\ttumor_reads2_plus\ttumor_reads2_minus\tnormal_reads1_plus\tnormal_reads1_minus\tnormal_reads2_plus\tnormal_reads2_minus");
+ if(params.containsKey("output-vcf"))
+ {
+ outValidation.println(vcfHeader);
+ }
+ }
+
+
+ BufferedReader normal = new BufferedReader(new FileReader(normalPileupFile));
+ BufferedReader tumor = new BufferedReader(new FileReader(tumorPileupFile));
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!(normal.ready() && tumor.ready()))
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file(s) were not ready for parsing after 100 5-second cycles! Pileup output may be invalid or too slow");
+ System.exit(10);
+ }
+ }
+				catch(Exception e)
+				{
+					// Sleep interrupted; retry readiness check //
+				}
+ }
+
+
+ if(!(normal.ready() && tumor.ready()))
+ {
+ System.err.println("ERROR: Input file(s) not ready for parsing! Pileup output may be invalid or too slow.");
+ System.exit(10);
+ }
+
+ String lineNormal;
+ String lineTumor;
+ String chromNormal = "";
+ String chromTumor = "";
+ String prevChromNormal = "";
+ String prevChromTumor = "";
+ String refBase = "";
+ int posNormal = 0;
+ int posTumor = 0;
+
+
+ DecimalFormat oneDigit = new DecimalFormat("#0.0");
+ DecimalFormat threeDigits = new DecimalFormat("#0.000");
+
+
+ // Get first line of Normal //
+
+ if((lineNormal = normal.readLine()) != null)
+ {
+ String[] normalContents = lineNormal.split("\t");
+
+ if(normalContents.length > 1)
+ {
+ chromNormal = normalContents[0];
+ posNormal = Integer.parseInt(normalContents[1]);
+ }
+ }
+
+ // Loop through lines in tumor //
+
+ while ((lineTumor = tumor.readLine()) != null)
+ {
+ tumorPositions++;
+ String[] tumorContents = lineTumor.split("\t");
+
+ if(tumorContents.length > 1)
+ {
+ chromTumor = tumorContents[0];
+ posTumor = Integer.parseInt(tumorContents[1]);
+ }
+
+ // Parse normal lines until we get the same chromosome //
+ boolean flagEOF = false;
+ boolean normalWasReset = false;
+
+ // Advance in normal file if tumor is changed but normal is not, or if tumor is higher //
+ while(!chromNormal.equals(chromTumor) && !chromTumor.equals(prevChromTumor) && !flagEOF && (chromNormal.equals(prevChromTumor) || inSortOrder(chromNormal, chromTumor)))
+ {
+ //System.err.println("Normal (" + chromNormal + ") catching up to " + chromTumor);
+ // Get next line from normal pileup //
+ if((lineNormal = normal.readLine()) != null)
+ {
+ String[] normalContents = lineNormal.split("\t");
+
+ if(normalContents.length > 1)
+ {
+ chromNormal = normalContents[0];
+ posNormal = Integer.parseInt(normalContents[1]);
+ }
+ }
+ else
+ {
+ flagEOF = true;
+ }
+
+
+ }
+
+ // If chromosomes match and are non-blank, attempt to get matching positions //
+ if(chromNormal.equals(chromTumor) && !chromNormal.equals(""))
+ {
+ normalWasReset = false;
+ // Seek to matching Normal Position //
+
+ while(chromNormal.equals(chromTumor) && posNormal < posTumor && ((lineNormal = normal.readLine()) != null))
+ {
+ String[] normalContents = lineNormal.split("\t");
+ if(normalContents.length > 1)
+ {
+ chromNormal = normalContents[0];
+ posNormal = Integer.parseInt(normalContents[1]);
+ }
+ }
+
+ // Seek to matching Tumor Position //
+
+ while(chromNormal.equals(chromTumor) && posTumor < posNormal && ((lineTumor = tumor.readLine()) != null))
+ {
+ tumorContents = lineTumor.split("\t");
+ if(tumorContents.length > 1)
+ {
+ chromTumor = tumorContents[0];
+ posTumor = Integer.parseInt(tumorContents[1]);
+ }
+ }
+
+ // Proceed if normal and tumor positions match //
+
+					if(chromNormal.equals(chromTumor) && posNormal == posTumor)
+ {
+ //stats.put("sharedPositions", (stats.get("sharedPositions") + 1));
+ sharedPositions++;
+ if(params.containsKey("verbose"))
+ System.err.println("Comparing calls at " + chromTumor + ":" + posTumor);
+
+ refBase = tumorContents[2];
+ String compareResult = comparePositions(lineNormal, lineTumor, minCoverage, minReads2, minVarFreq, minAvgQual, pValueThreshold, somaticPvalue, minFreqForHom, normalPurity, tumorPurity);
+
+ if(compareResult.length() > 0)
+ {
+ // Get the alleles to determine type //
+ String[] compareContents = compareResult.split("\t");
+ String allele1 = compareContents[0];
+ String allele2 = compareContents[1];
+
+ double strandedness1 = 0.50;
+ double strandedness2 = 0.50;
+ double strandednessDiff = 0.00;
+
+ if(compareContents.length >= 17)
+ {
+ try
+ {
+ int tumorReads1plus = Integer.parseInt(compareContents[13]);
+ int tumorReads1minus = Integer.parseInt(compareContents[14]);
+ int tumorReads2plus = Integer.parseInt(compareContents[15]);
+ int tumorReads2minus = Integer.parseInt(compareContents[16]);
+
+ if(tumorReads1plus > 0 || tumorReads1minus > 0)
+ {
+ strandedness1 = (double) tumorReads1plus / (double) (tumorReads1plus + tumorReads1minus);
+ }
+
+ if(tumorReads2plus > 0 || tumorReads2minus > 0)
+ {
+ strandedness2 = (double) tumorReads2plus / (double) (tumorReads2plus + tumorReads2minus);
+ if(tumorReads1plus > 0 || tumorReads1minus > 0)
+ {
+ strandednessDiff = java.lang.Math.abs(strandedness1 - strandedness2);
+ }
+ }
+ }
+ catch(Exception e)
+ {
+ // Exception parsing info from compareResult //
+ }
+ }
+
+ //stats.put("comparedPositions", (stats.get("comparedPositions") + 1));
+ comparedPositions++;
+
+ if(params.containsKey("verbose") && !compareResult.contains("Reference"))
+ System.err.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+
+ // If VCF format specified, supply it //
+
+ if(params.containsKey("output-vcf"))
+ {
+ int normalReads1 = Integer.parseInt(compareContents[2]);
+ int normalReads2 = Integer.parseInt(compareContents[3]);
+ String normalFreq = compareContents[4];
+ String normalCall = compareContents[5];
+ int tumorReads1 = Integer.parseInt(compareContents[6]);
+ int tumorReads2 = Integer.parseInt(compareContents[7]);
+ String tumorFreq = compareContents[8];
+ String tumorCall = compareContents[9];
+ String somStatus = compareContents[10];
+ Double germlineP = Double.parseDouble(compareContents[11]);
+ Double somaticP = Double.parseDouble(compareContents[12]);
+
+ String[] normalContents = lineNormal.split("\t");
+ //tumorContents = lineTumor.split("\t");
+ int pileupDepthNormal = Integer.parseInt(normalContents[3]);
+ int pileupDepthTumor = Integer.parseInt(tumorContents[3]);
+
+ int totalDepth = pileupDepthNormal + pileupDepthTumor;
+
+ if(allele2.startsWith("+"))
+ {
+ // INSERTION //
+ // Ref = ref base; Var = ref base followed by inserted bases //
+ String varColumn = allele1 + allele2.replace("+", "");
+ compareResult = "." + "\t" + allele1 + "\t" + varColumn + "\t" + ".";
+ }
+ else if(allele2.startsWith("-"))
+ {
+ // DELETION //
+ // Ref = ref base followed by deleted bases; var = ref base //
+ String refColumn = allele1 + allele2.replace("-", "");
+ compareResult = "." + "\t" + refColumn + "\t" + allele1 + "\t" + ".";
+ }
+ else
+ {
+ compareResult = "." + "\t" + allele1 + "\t" + allele2 + "\t" + ".";
+ }
+
+
+ // Decide on filter field //
+ if(params.containsKey("strand-filter") && strandednessDiff > 0.10 && (strandedness2 < 0.10 || strandedness2 > 0.90))
+ {
+ compareResult += "\t" + "str10";
+ }
+ else if(somStatus.equals("IndelFilter"))
+ {
+ compareResult += "\t" + "indelError";
+ }
+ else
+ {
+ compareResult += "\t" + "PASS";
+ }
+
+ // Determine somatic status id and score //
+ int ssCode = 0;
+ double somScore = 0;
+
+ if(somStatus.equals("Reference"))
+ {
+ // Wildtype //
+ ssCode = 0;
+ calledReference++;
+ }
+ else if(somStatus.equals("Germline"))
+ {
+ // Germline //
+ ssCode = 1;
+ calledGermline++;
+ if(somaticP == 0)
+ {
+ somScore = 0;
+ }
+ else
+ {
+ somScore = 0 - (10 * java.lang.Math.log10(somaticP));
+ }
+ }
+ else if(somStatus.equals("Somatic"))
+ {
+ // Somatic //
+ ssCode = 2;
+ calledSomatic++;
+ if(somaticP == 0)
+ {
+ somScore = 255;
+ }
+ else
+ {
+ somScore = 0 - (10 * java.lang.Math.log10(somaticP));
+ }
+ }
+ else if(somStatus.equals("LOH"))
+ {
+ // LOH //
+ ssCode = 3;
+ calledLOH++;
+ if(somaticP == 0)
+ {
+ somScore = 255;
+ }
+ else
+ {
+ somScore = 0 - (10 * java.lang.Math.log10(somaticP));
+ }
+ }
+ else
+ {
+ // Unknown //
+ calledUnknown++;
+ ssCode = 5;
+ }
+
+ // Adjust somatic score //
+ if(somScore > 255)
+ somScore = 255;
+
+ // Print the info field //
+
+ compareResult += "\t" + "DP=" + totalDepth;
+ if(somStatus.equals("Somatic"))
+ compareResult += ";SOMATIC";
+ compareResult += ";" + "SS=" + ssCode;
+ compareResult += ";" + "SSC=" + (int) somScore;
+ compareResult += ";" + "GPV=" + pvalueFormat.format(germlineP);
+ compareResult += ";" + "SPV=" + pvalueFormat.format(somaticP);
+
+ // Print the format field //
+
+ String tumorDP4 = "";
+ String normalDP4 = "";
+
+ if(compareContents.length >= 17)
+ {
+ try
+ {
+ tumorDP4 = compareContents[13] + "," + compareContents[14] + "," + compareContents[15] + "," + compareContents[16];
+ normalDP4 = compareContents[17] + "," + compareContents[18] + "," + compareContents[19] + "," + compareContents[20];
+ }
+ catch(Exception e)
+ {
+ // Exception parsing info from compareResult //
+ tumorDP4 = "";
+ normalDP4 = "";
+ }
+ }
+
+ if(tumorDP4.length() > 0)
+ compareResult += "\tGT:GQ:DP:RD:AD:FREQ:DP4";
+ else
+ compareResult += "\tGT:GQ:DP:RD:AD:FREQ";
+
+ // Determine normal genotype //
+ String normalGt = ".";
+ String tumorGt = ".";
+ if(normalCall.equals(refBase))
+ {
+ normalGt = "0/0";
+ }
+ else if(VarScan.isHeterozygous(normalCall))
+ {
+ normalGt = "0/1";
+ }
+ else
+ {
+ normalGt = "1/1";
+ }
+
+ if(tumorCall.equals(refBase))
+ {
+ tumorGt = "0/0";
+ }
+ else if(VarScan.isHeterozygous(tumorCall))
+ {
+ tumorGt = "0/1";
+ }
+ else
+ {
+ tumorGt = "1/1";
+ }
+
+ if(tumorDP4.length() > 0)
+ {
+ compareResult += "\t" + normalGt + ":.:" + pileupDepthNormal + ":" + normalReads1 + ":" + normalReads2 + ":" + normalFreq + ":" + normalDP4;
+ compareResult += "\t" + tumorGt + ":.:" + pileupDepthTumor + ":" + tumorReads1 + ":" + tumorReads2 + ":" + tumorFreq + ":" + tumorDP4;
+ }
+ else
+ {
+ compareResult += "\t" + normalGt + ":.:" + pileupDepthNormal + ":" + normalReads1 + ":" + normalReads2 + ":" + normalFreq;
+ compareResult += "\t" + tumorGt + ":.:" + pileupDepthTumor + ":" + tumorReads1 + ":" + tumorReads2 + ":" + tumorFreq;
+ }
+ }
+ // Print to master file for validation //
+
+ if(params.containsKey("validation"))
+ {
+ outValidation.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+
+ if(!params.containsKey("validation") && (compareResult.contains("Reference") || compareResult.contains("SS=0") || compareResult.contains("Filter")))
+ {
+ // Don't print reference/indelfilter positions unless doing validation //
+ }
+ else if(params.containsKey("strand-filter") && strandednessDiff > 0.10 && (strandedness2 < 0.10 || strandedness2 > 0.90))
+ {
+ // If filter is set, ignore variants that are supported largely by one strand //
+ compareResult = "StrandFilter";
+ }
+ else if(allele1.contains("-") || allele1.contains("+") || allele2.contains("-") || allele2.contains("+"))//if(compareResult.contains("INS") || compareResult.contains("DEL"))
+ {
+ outIndel.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+ else
+ {
+ outSnp.println(chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+ }
+ else
+ {
+// System.err.println("Uncalled" + chromNormal + "\t" + posNormal + "\t" + compareResult);
+ }
+
+ if(compareResult.contains("Reference"))
+ calledReference++; //stats.put("calledReference", (stats.get("calledReference") + 1));
+ else if(compareResult.contains("IndelFilter"))
+ indelFilter++; //stats.put("indelFilter", (stats.get("indelFilter") + 1));
+ else if(compareResult.contains("StrandFilter"))
+ strandFilter++;
+ else if(compareResult.contains("Germline"))
+ calledGermline++; //stats.put("calledGermline", (stats.get("calledGermline") + 1));
+ else if(compareResult.contains("Somatic"))
+ calledSomatic++; //stats.put("calledSomatic", (stats.get("calledSomatic") + 1));
+ else if(compareResult.contains("LOH"))
+ calledLOH++; //stats.put("calledLOH", (stats.get("calledLOH") + 1));
+ else if(compareResult.contains("Unknown"))
+ calledUnknown++; //stats.put("calledUnknown", (stats.get("calledUnknown") + 1));
+ else if(compareResult.contains("Variant"))
+ calledVariant++; //stats.put("calledVariant", (stats.get("calledVariant") + 1));
+
+ prevChromNormal = chromNormal;
+ prevChromTumor = chromTumor;
+ }
+ else
+ {
+ //System.err.println("Failed to match positions " + chromNormal + " " + posNormal + " to Tumor " + chromTumor + " " + posTumor);
+ }
+ }
+ // If they're in sort order, do nothing so that tumor can catch up //
+ else if(inSortOrder(chromNormal, chromTumor))
+ {
+ System.err.println("Not resetting normal file because " + chromNormal + " < " + chromTumor);
+ }
+ // If we reached the end of the normal file but never saw this chromosome, //
+ // fast-forward until tumor chromosome changes and reset normal file //
+ else if(flagEOF)
+ {
+ flagEOF = false;
+
+ while(prevChromTumor.equals(chromTumor) && !flagEOF)
+ {
+ if((lineTumor = tumor.readLine()) != null)
+ {
+ tumorContents = lineTumor.split("\t");
+
+ if(tumorContents.length > 1)
+ {
+ chromTumor = tumorContents[0];
+ posTumor = Integer.parseInt(tumorContents[1]);
+ }
+ }
+ else
+ {
+ flagEOF = true;
+ }
+ }
+
+ // Reset the normal file if we've already passed this chromosome in normal //
+
+ if(!flagEOF && !normalWasReset)
+ {
+ if(inSortOrder(chromNormal, chromTumor))
+ {
+ System.err.println("Not resetting normal file because " + chromNormal + " < " + chromTumor);
+ }
+ else
+ {
+ System.err.println("Resetting normal file because " + chromNormal + " > " + chromTumor);
+ normalWasReset = true;
+ normal.close();
+ normal = new BufferedReader(new FileReader(normalPileupFile));
+ }
+
+ }
+ }
+
+ }
+
+
+ normal.close();
+ tumor.close();
+
+
+
+ outSnp.close();
+ outIndel.close();
+
+ System.err.println(tumorPositions + " positions in tumor");
+			System.err.println(sharedPositions + " positions shared with normal"); //stats.get("sharedPositions")
+ System.err.println(comparedPositions + " had sufficient coverage for comparison"); //stats.get("comparedPositions")
+ System.err.println(calledReference + " were called Reference"); //stats.get("calledReference")
+ System.err.println(indelFilter + " were mixed SNP-indel calls and filtered");
+ if(params.containsKey("strand-filter"))
+ System.err.println(strandFilter + " were removed by the strand filter");
+ System.err.println(calledGermline + " were called Germline");
+ System.err.println(calledLOH + " were called LOH");
+ System.err.println(calledSomatic + " were called Somatic");
+ System.err.println(calledUnknown + " were called Unknown");
+ System.err.println(calledVariant + " were called Variant");
+
+ }
+ catch (IOException e)
+ {
+ System.err.println("File Parsing Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+ }
+
+
+	/**
+	 * Determine if chrom1 precedes (or equals) chrom2 in lexicographic sort order
+	 *
+	 * @param	chrom1	First chromosome name
+	 * @param	chrom2	Second chromosome name
+	 * @return	boolean	True if chrom1 sorts before or ties with chrom2
+	 */
+ static boolean inSortOrder(String chrom1, String chrom2)
+ {
+ String[] testArray = {chrom1, chrom2};
+ Arrays.sort(testArray);
+
+ if(testArray[0].equals(chrom1))
+ return true;
+
+ return false;
+ }
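+	// Note: the comparison is lexicographic, so inSortOrder("chr1", "chr2") is true, //
+	// but so is inSortOrder("chr10", "chr2"), because "chr10" sorts before "chr2" as a string //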
+
+	/**
+	 * Compares normal and tumor pileup lines at one position to determine somatic status
+	 *
+	 * @param	lineNormal	Pileup line for the normal sample
+	 * @param	lineTumor	Pileup line for the tumor sample
+	 * @return	String	Tab-delimited call (alleles, read counts, frequencies, status, p-values), or "" if no call
+	 */
+ static String comparePositions(String lineNormal, String lineTumor, int minCoverage, int minReads2, double minVarFreq, int minAvgQual, double pValueThreshold, double somaticPvalue, double minFreqForHom, double normalPurity, double tumorPurity)
+ {
+ try
+ {
+ DecimalFormat df = new DecimalFormat("###.##");
+ // Set default parameters //
+
+ String refBase = "";
+ int normalDepth = 0;
+ String normalBases = "";
+ String normalQualities = "";
+ String normalMapQuals = "";
+
+ int tumorDepth = 0;
+ String tumorBases = "";
+ String tumorQualities = "";
+ String tumorMapQuals = "";
+
+ // Parse out normal info //
+ String[] normalContents = lineNormal.split("\t");
+ refBase = normalContents[2].toUpperCase();
+ // Parse out tumor info //
+ String[] tumorContents = lineTumor.split("\t");
+
+ // Parse out normal from pileup or CNS file //
+
+ if(normalContents.length >= 6 && normalContents.length <= 7)
+ {
+ normalDepth = Integer.parseInt(normalContents[3]);
+ normalBases = normalContents[4];
+ normalQualities = normalContents[5];
+ normalMapQuals = "";
+ if(normalContents.length == 7)
+ {
+ normalMapQuals = normalContents[6];
+ }
+
+ }
+ else if(normalContents.length >= 10 && normalContents.length <= 11)
+ {
+ normalDepth = Integer.parseInt(normalContents[7]);
+ normalBases = normalContents[8];
+ normalQualities = normalContents[9];
+ normalMapQuals = "";
+ if(normalContents.length == 11)
+ {
+ normalMapQuals = normalContents[10];
+ }
+ }
+
+ // Parse out tumor from pileup or CNS file //
+
+ if(tumorContents.length >= 6 && tumorContents.length <= 7)
+ {
+ tumorDepth = Integer.parseInt(tumorContents[3]);
+ tumorBases = tumorContents[4];
+ tumorQualities = tumorContents[5];
+ tumorMapQuals = "";
+ if(tumorContents.length == 7)
+ {
+ tumorMapQuals = tumorContents[6];
+ }
+
+ }
+ else if(tumorContents.length >= 10 && tumorContents.length <= 11)
+ {
+ tumorDepth = Integer.parseInt(tumorContents[7]);
+ tumorBases = tumorContents[8];
+ tumorQualities = tumorContents[9];
+ tumorMapQuals = "";
+ if(tumorContents.length == 11)
+ {
+ tumorMapQuals = tumorContents[10];
+ }
+ }
+
+
+ String somaticStatus = "";
+ String allele2 = "";
+ double pValue = 1;
+ double diffPvalue = 1;
+
+ if(tumorDepth >= minCoverage && normalDepth >= minCoverage)
+ {
+ // Adjust for tumor purity (i.e., tumor cellularity content of sample) //
+ double tumorMinVarFreq = minVarFreq;
+
+ // If tumor purity is less than 100%, reduce the minimum variant allele frequency accordingly //
+ if(tumorPurity < 1.00)
+ {
+ tumorMinVarFreq = (minVarFreq * tumorPurity);
+ }
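+			// Worked example (hypothetical): minVarFreq = 0.20 with tumorPurity = 0.40 //
+			// lowers the tumor threshold to 0.20 * 0.40 = 0.08 //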
+
+ HashMap<String, String> readCountsTumor = VarScan.getReadCounts(refBase, tumorBases, tumorQualities, minAvgQual, tumorMapQuals);
+ // String tumorConsensusLine = callConsensus(refBase, tumorPileup, min_reads2, min_var_freq, min_avg_qual, pValue, purityNormal);
+ String tumorConsensusLine = VarScan.callPosition(refBase, readCountsTumor, "CNS", minReads2, tumorMinVarFreq, minAvgQual, 0.99, minFreqForHom);
+ String[] tumorConsensusContents = tumorConsensusLine.split("\t");
+ String tumorConsensus = tumorConsensusContents[0];
+
+ if(tumorConsensus.equals("N"))
+ {
+ // No tumor call made, so make no call //
+ return("");
+ }
+ else if(normalDepth >= minCoverage)
+ {
+ // Adjust for normal purity (i.e., tumor contamination of normal in AML) //
+ double normalMinVarFreq = minVarFreq;
+
+ if(normalPurity < 1.00)
+ {
+ normalMinVarFreq = (minVarFreq / normalPurity);
+ }
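+				// Worked example (hypothetical): minVarFreq = 0.20 with normalPurity = 0.80 //
+				// raises the normal threshold to 0.20 / 0.80 = 0.25 //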
+
+ HashMap<String, String> readCountsNormal = VarScan.getReadCounts(refBase, normalBases, normalQualities, minAvgQual, normalMapQuals);
+ String normalConsensusLine = VarScan.callPosition(refBase, readCountsNormal, "CNS", minReads2, normalMinVarFreq, minAvgQual, 0.99, minFreqForHom); //pValueThreshold, minFreqForHom);
+
+ String[] normalConsensusContents = normalConsensusLine.split("\t");
+ String normalConsensus = normalConsensusContents[0];
+
+ if(normalConsensus.equals("N"))
+ {
+ // Make no call at this position //
+ return("");
+ }
+ else
+ {
+ // Parse out the read counts in tumor //
+ int tumorReads1 = Integer.parseInt(tumorConsensusContents[1]);
+ int tumorReads2 = Integer.parseInt(tumorConsensusContents[2]);
+ int tumorCoverage = tumorReads1 + tumorReads2;
+ String tumorAllele2 = VarScan.getVarAllele(refBase, tumorConsensusContents[0]);
+
+ // Parse out strand support in tumor //
+ int tumorReads1plus = 0;
+ int tumorReads1minus = 0;
+ int tumorReads2plus = 0;
+ int tumorReads2minus = 0;
+ if(tumorConsensusContents.length > 14)
+ {
+ tumorReads1plus = Integer.parseInt(tumorConsensusContents[11]);
+ tumorReads1minus = Integer.parseInt(tumorConsensusContents[12]);
+ tumorReads2plus = Integer.parseInt(tumorConsensusContents[13]);
+ tumorReads2minus = Integer.parseInt(tumorConsensusContents[14]);
+ }
+
+ // Parse out strand support in normal //
+ int normalReads1plus = 0;
+ int normalReads1minus = 0;
+ int normalReads2plus = 0;
+ int normalReads2minus = 0;
+ if(normalConsensusContents.length > 14)
+ {
+ normalReads1plus = Integer.parseInt(normalConsensusContents[11]);
+ normalReads1minus = Integer.parseInt(normalConsensusContents[12]);
+ normalReads2plus = Integer.parseInt(normalConsensusContents[13]);
+ normalReads2minus = Integer.parseInt(normalConsensusContents[14]);
+ }
+
+ // Parse out the read counts in normal //
+
+ int normalReads1 = Integer.parseInt(normalConsensusContents[1]);
+ int normalReads2 = Integer.parseInt(normalConsensusContents[2]);
+ int normalCoverage = normalReads1 + normalReads2;
+ String normalAllele2 = VarScan.getVarAllele(refBase, normalConsensusContents[0]);
+
+
+ // Get the Normal Read counts for the tumor variant allele //
+
+ if(!tumorAllele2.equals(refBase)) // normalAllele2.equals(refBase) &&
+ {
+ allele2 = tumorAllele2;
+ if(readCountsNormal.containsKey(tumorAllele2))
+ {
+ String[] alleleContents = readCountsNormal.get(tumorAllele2).split("\t");
+ normalReads2 = Integer.parseInt(alleleContents[0]);
+ normalCoverage = normalReads1 + normalReads2;
+ }
+ }
+ else if(!normalAllele2.equals(refBase))
+ {
+ allele2 = normalAllele2;
+ }
+ else
+ {
+ // Neither consensus contained a variant allele, so get most-observed tumor variant //
+ if(tumorConsensusContents.length > 15)
+ {
+ allele2 = tumorConsensusContents[15];
+ }
+ else if(tumorConsensusContents.length == 10)
+ {
+ allele2 = tumorConsensusContents[9];
+ }
+ else if(normalConsensusContents.length > 15)
+ {
+ allele2 = normalConsensusContents[15];
+ }
+ else if(normalConsensusContents.length == 10)
+ {
+ allele2 = normalConsensusContents[9];
+ }
+ }
+
+
+ double normalFreq = (double) normalReads2 / (double) normalCoverage;
+ double tumorFreq = (double) tumorReads2 / (double) tumorCoverage;
+
+ // Calculate the frequency difference //
+ double freqDiff = tumorFreq - normalFreq;
+
+ // P-value of significant difference //
+ diffPvalue = VarScan.getSignificance(normalReads1, normalReads2, tumorReads1, tumorReads2);
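+
+							// getSignificance applies Fisher's exact test (see FishersExact.java) to the //
+							// 2x2 table of reference/variant read counts in normal versus tumor //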
+
+ // Format allele frequencies for printing //
+ String normalFreqPrint = df.format(normalFreq * 100) + "%";
+ String tumorFreqPrint = df.format(tumorFreq * 100) + "%";
+
+ // If Normal matches Tumor it's either reference or Germline //
+
+ if(normalConsensus.equals(tumorConsensus) && (normalConsensus.equals(refBase) || diffPvalue > somaticPvalue))
+ {
+ // CASE 0: Normal and Tumor Match //
+
+ if(normalConsensus.equals(refBase))
+ {
+ somaticStatus = "Reference";
+ }
+ else
+ {
+ // Recalculate p-value //
+ int totalReads1 = normalReads1 + tumorReads1;
+ int totalReads2 = normalReads2 + tumorReads2;
+ int totalCoverage = totalReads1 + totalReads2;
+ // P-value of Germline variant //
+ pValue = VarScan.getSignificance(totalCoverage, 0, totalReads1, totalReads2);
+ if(pValue <= somaticPvalue) //Changed from if(pValue <= pValueThreshold) 11-jun-2012
+ {
+ somaticStatus = "Germline";
+ allele2 = tumorAllele2;
+ }
+ else
+ {
+ somaticStatus = "Reference";
+ allele2 = refBase;
+ }
+ }
+ }
+
+ // If Normal does NOT match Tumor it could be Somatic, LOH, or Unknown //
+
+ else
+ {
+ if(normalConsensus.equals(tumorConsensus))
+ {
+ // Genotype calls match, but the difference must have been significant. //
+ // Let's try harder to call a variant for tumor here //
+ // tumorConsensusLine = VarScan.callPosition(refBase, readCountsTumor, "CNS", 1, 0.00, 0, 1.00, minFreqForHom);
+ // tumorConsensusContents = tumorConsensusLine.split("\t");
+ // tumorConsensus = tumorConsensusContents[0];
+ // tumorAllele2 = VarScan.getVarAllele(refBase, tumorConsensusContents[0]);
+ // System.err.println("Got a new consensus: " + tumorConsensus + " from " + tumorAllele2);
+ }
+ // CASE 1: Indel-associated SNP Filter //
+
+ if(tumorConsensus.contains("/") && !normalConsensus.contains("/") && !normalConsensus.equals(refBase))
+ somaticStatus = "IndelFilter";
+ else if(normalConsensus.contains("/") && !tumorConsensus.contains("/") && !tumorConsensus.equals(refBase))
+ somaticStatus = "IndelFilter";
+ else
+ {
+ // CASE 2: Somatic indel or SNP events, where difference in read counts is significant or else coverage is low //
+
+ if(diffPvalue <= somaticPvalue || normalFreq == 0.00) // || tumorCoverage < 30 || normalCoverage < 15)
+ {
+ // CASE 2A: Perfect Somatic Het //
+ if(normalConsensus.equals(refBase) && VarScan.isHeterozygous(tumorConsensus) && tumorFreq > normalFreq)
+ {
+ somaticStatus = "Somatic";
+ allele2 = tumorAllele2;
+ }
+ // CASE 2B: Somatic Homozygous //
+ else if(normalConsensus.equals(refBase) && VarScan.isHomozygous(tumorConsensus))
+ {
+ somaticStatus = "Somatic";
+ allele2 = tumorAllele2;
+ }
+ // CASE 2C: LOH of variant allele //
+ else if(tumorConsensus.equals(refBase) && VarScan.isHeterozygous(normalConsensus))
+ {
+ somaticStatus = "LOH";
+ allele2 = normalAllele2;
+ }
+ // CASE 2D: LOH of reference allele //
+ else if(VarScan.isHeterozygous(normalConsensus) && VarScan.isHomozygous(tumorConsensus))
+ {
+ somaticStatus = "LOH";
+ allele2 = tumorAllele2;
+ }
+ // CASE 2E: Variant alleles match but difference significant //
+ else if(tumorAllele2.equals(normalAllele2))
+ {
+ if(normalFreq > minVarFreq)
+ {
+ somaticStatus = "Germline";
+ }
+ else if(freqDiff >= 0.30 && tumorFreq > normalFreq)
+ {
+ somaticStatus = "Somatic";
+ }
+ else if(freqDiff <= -0.30 && tumorFreq < normalFreq)
+ {
+ somaticStatus = "LOH";
+ }
+ else// if(freqDiff < 0.50)
+ {
+ somaticStatus = "Germline"; // Should this be GOH? //
+ // Recalculate p-value //
+ int totalReads1 = normalReads1 + tumorReads1;
+ int totalReads2 = normalReads2 + tumorReads2;
+ int totalCoverage = totalReads1 + totalReads2;
+ pValue = VarScan.getSignificance(totalCoverage, 0, totalReads1, totalReads2);
+ }
+ // else
+ // {
+ // somaticStatus = "LOH";
+ // }
+ allele2 = tumorAllele2;
+ }
+ // CASE 2F: Variant alleles don't match but tumor het and higher freq = normal FalsePos //
+ else if(tumorFreq > normalFreq && VarScan.isHeterozygous(normalConsensus) && VarScan.isHeterozygous(tumorConsensus))
+ {
+ normalConsensus = refBase;
+ somaticStatus = "Somatic";
+ allele2 = tumorAllele2;
+ }
+ // CASE 2G: Unknown Somatic Change, e.g. reverse-LOH (GOH) //
+ else
+ {
+ somaticStatus = "Unknown";
+ if(tumorAllele2.equals(refBase))
+ allele2 = normalAllele2;
+ else
+ allele2 = tumorAllele2;
+ }
+
+ }
+ else
+ {
+ // CASE 3: Difference not significant //
+
+ // CASE 3A: One sample het, one sample hom = Germline //
+ if(tumorAllele2.equals(normalAllele2))
+ {
+ // Recalculate p-value //
+ int totalReads1 = normalReads1 + tumorReads1;
+ int totalReads2 = normalReads2 + tumorReads2;
+ int totalCoverage = totalReads1 + totalReads2;
+ pValue = VarScan.getSignificance(totalCoverage, 0, totalReads1, totalReads2);
+ if(pValue <= pValueThreshold)
+ {
+ somaticStatus = "Germline";
+ allele2 = tumorAllele2;
+ }
+ else
+ {
+ somaticStatus = "Reference";
+ allele2 = refBase;
+ }
+ }
+
+							// CASE 3B: Probable false positive in tumor //
+ else if(normalConsensus.equals(refBase))
+ {
+ somaticStatus = "Reference";
+ allele2 = tumorAllele2;
+ }
+
+							// CASE 3C: Probable false positive in normal //
+ else if(tumorConsensus.equals(refBase))
+ {
+ somaticStatus = "Reference";
+ allele2 = normalAllele2;
+ }
+ else
+ {
+ somaticStatus = "Unknown";
+ allele2 = normalAllele2 + "/" + tumorAllele2;
+ }
+
+ }
+
+ }
+
+ }
+
+
+ // Compile the report //
+
+ String resultLine = refBase + "\t" + allele2 + "\t";
+ resultLine += normalReads1 + "\t" + normalReads2 + "\t" + normalFreqPrint + "\t" + normalConsensus + "\t";
+ resultLine += tumorReads1 + "\t" + tumorReads2 + "\t" + tumorFreqPrint + "\t" + tumorConsensus + "\t";
+ resultLine += somaticStatus + "\t" + pValue + "\t" + diffPvalue + "\t";
+ resultLine += tumorReads1plus + "\t" + tumorReads1minus + "\t";
+ resultLine += tumorReads2plus + "\t" + tumorReads2minus + "\t";
+ resultLine += normalReads1plus + "\t" + normalReads1minus + "\t";
+ resultLine += normalReads2plus + "\t" + normalReads2minus;
+ return(resultLine);
+
+ }
+
+
+ }
+ else
+ {
+ return(""); // Normal did not meet coverage
+ }
+
+ }
+ else
+ {
+ return(""); // Tumor did not meet coverage
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Warning:");
+ e.printStackTrace(System.err);
+ }
+ return(""); // No call
+ }
+
+
+
+ /**
+ * Determines the sort order for chromosomes
+ *
+	 * @param	chrom1	Name of the first chromosome
+	 * @param	chrom2	Name of the second chromosome
+	 * @return	Boolean	true if chrom1 sorts before (or ties with) chrom2
+ */
+ static Boolean chromSorted(String chrom1, String chrom2)
+ {
+ Boolean answer = false;
+
+		// String.replace() returns a new string, so the result must be assigned //
+		chrom1 = chrom1.replace("X", "23");
+		chrom1 = chrom1.replace("Y", "24");
+		chrom1 = chrom1.replace("M", "25");
+
+		chrom2 = chrom2.replace("X", "23");
+		chrom2 = chrom2.replace("Y", "24");
+		chrom2 = chrom2.replace("M", "25");
+
+ String[] unsorted = {chrom1, chrom2};
+ String[] sorted = {chrom1, chrom2};
+ Arrays.sort(sorted);
+ System.err.println("Sorted order is " + sorted[0] + " " + sorted[1]);
+ try{
+ if(sorted[0].equals(unsorted[0]))
+ {
+ answer = true;
+ }
+ }
+ catch(Exception e)
+ {
+
+ }
+
+ return(answer);
+ }
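+
+	// Illustrative behavior (after the X/Y/M substitutions above): //
+	// chromSorted("1", "2") -> true; chromSorted("X", "9") -> true ("X" becomes "23") //
+	// Note the comparison is lexical, so chromSorted("10", "9") is also true //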
+
+}
diff --git a/net/sf/varscan/Trio.java b/net/sf/varscan/Trio.java
new file mode 100644
index 0000000..85d46cb
--- /dev/null
+++ b/net/sf/varscan/Trio.java
@@ -0,0 +1,1284 @@
+/**
+ * @(#)Trio.java
+ *
+ * Copyright (c) 2009-2013 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.text.DecimalFormat;
+import java.util.*;
+import java.lang.Math;
+
+/**
+ * A class for calling variants in a mother-father-child trio
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ */
+public class Trio {
+ public Trio(String[] args, String callType)
+ {
+ // Define the usage message //
+ String usage = "USAGE: java -jar VarScan.jar trio [mpileup file] [output-basename] OPTIONS\n" +
+ "\tmpileup file - The SAMtools mpileup file for father, mother, child in that order\n" +
+ "\n" +
+ "\tOPTIONS:\n" +
+ "\t--output-name\tAn output base name for VCF files of results. Required for piped input\n" +
+ "\t--min-coverage\tMinimum read depth at a position to make a call [20]\n" +
+ "\t--min-reads2\tMinimum supporting reads at a position to call variants [2]\n" +
+ "\t--min-avg-qual\tMinimum base quality at a position to count a read [15]\n" +
+ "\t--min-var-freq\tMinimum variant allele frequency threshold [0.20]\n" +
+ "\t--min-freq-for-hom\tMinimum frequency to call homozygote [0.75]\n" +
+ "\t--p-value\tDefault p-value threshold for calling variants [0.05]\n" +
+ "\t--adj-var-freq\tAdjusted minimum VAF when recalling at variant site [0.05]\n" +
+ "\t--adj-p-value\tAdjusted p-value when recalling at variant site [0.10]\n" +
+ "\t--vcf-sample-list\tFor VCF output, a list of sample names in order, one per line\n" +
+ "\t--variants\tReport only variant (SNP/indel) positions [0]";
+
+ // Set parameter defaults //
+
+ HashMap<String, String> params = VarScan.getParams(args);
+
+ // Establish output file names //
+ String outputName = "output";
+ String outputSnp = "";
+ String outputIndel = "";
+
+ if(args.length >= 3 && !args[2].startsWith("-"))
+ {
+ outputName = args[2];
+ outputSnp = outputName + ".snp.vcf";
+ outputIndel = outputName + ".indel.vcf";
+ }
+
+ // Set up formatting for p-values //
+ DecimalFormat pvalueFormat = new DecimalFormat("0.####E0");
+
+ // Force VCF output //
+ params.put("output-vcf", "1");
+
+ // Set parameter defaults //
+
+ int minCoverage = 20;
+ int minReads2 = 4;
+ int minAvgQual = 15;
+ double minVarFreq = 0.20;
+ double minFreqForHom = 0.75;
+ double pValueThreshold = 0.01;
+ double strandPvalueThreshold = 0.01;
+ double adjustedMinVarFreq = 0.05;
+ double adjustedpValueThreshold = 0.10;
+ boolean strandFilter = true;
+ String sampleList = "";
+
+ if(callType.equals("CNS"))
+ {
+ // Set more rigorous parameters for consensus calling
+ minVarFreq = 0.20;
+ pValueThreshold = 0.01;
+ }
+
+ // Adjust parameters based on user input //
+
+ try
+ {
+ if(params.containsKey("min-coverage"))
+ minCoverage = Integer.parseInt(params.get("min-coverage"));
+
+ if(params.containsKey("min-reads2"))
+ minReads2 = Integer.parseInt(params.get("min-reads2"));
+
+ if(params.containsKey("min-var-freq"))
+ minVarFreq = Double.parseDouble(params.get("min-var-freq"));
+
+ if(params.containsKey("adj-var-freq"))
+ adjustedMinVarFreq = Double.parseDouble(params.get("adj-var-freq"));
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+
+ if(params.containsKey("adj-p-value"))
+ adjustedpValueThreshold = Double.parseDouble(params.get("adj-p-value"));
+
+ if(params.containsKey("min-freq-for-hom"))
+ minFreqForHom = Double.parseDouble(params.get("min-freq-for-hom"));
+
+ if(params.containsKey("min-avg-qual"))
+ minAvgQual = Integer.parseInt(params.get("min-avg-qual"));
+
+ if(params.containsKey("output-name"))
+ {
+ outputName = params.get("output-name");
+ outputSnp = outputName + ".snp.vcf";
+ outputIndel = outputName + ".indel.vcf";
+
+ }
+
+ if(params.containsKey("strand-filter"))
+ {
+ int filter = Integer.parseInt(params.get("strand-filter"));
+ if(filter > 0)
+ strandFilter = true;
+ else
+ strandFilter = false;
+ }
+
+ if(params.containsKey("vcf-sample-list"))
+ {
+ File samplefile = new File(params.get("vcf-sample-list"));
+ // Parse sample list //
+ if(samplefile.exists())
+ {
+ BufferedReader in = new BufferedReader(new FileReader(samplefile));
+ String line = "";
+ if(in.ready())
+ {
+ while ((line = in.readLine()) != null)
+ {
+ String sampleName = line;
+ if(sampleList.length() > 0)
+ sampleList += "\t";
+ sampleList += sampleName;
+ }
+ }
+ else
+ {
+ System.err.println("Unable to open sample list");
+ }
+
+ in.close();
+ }
+
+ System.err.println("Got the following sample list: ");
+ System.err.println(sampleList);
+ }
+
+ if(params.containsKey("p-value"))
+ pValueThreshold = Double.parseDouble(params.get("p-value"));
+ else
+ System.err.println("Warning: No p-value threshold provided, so p-values will not be calculated");
+
+
+ // Check for correct input //
+
+ if(outputSnp.length() == 0 || outputIndel.length() == 0)
+ {
+ System.err.println("Please provide an output basename or SNP/indel output files!");
+ System.err.println(usage);
+ System.exit(1);
+ }
+
+ System.err.println("SNPs will be output to " + outputSnp);
+ System.err.println("Indels will be output to " + outputIndel);
+ System.err.println("Min coverage:\t" + minCoverage);
+ System.err.println("Min reads2:\t" + minReads2);
+ System.err.println("Min var freq:\t" + minVarFreq);
+ System.err.println("Min avg qual:\t" + minAvgQual);
+ System.err.println("P-value thresh:\t" + pValueThreshold);
+ }
+ catch(Exception e)
+ {
+ System.err.println("Input Parameter Threw Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ return;
+ }
+
+ // Print usage if -h or --help invoked //
+ if(params.containsKey("help") || params.containsKey("h"))
+ {
+ System.err.println(usage);
+ return;
+ }
+
+ // Define the statistics hash and reset counters //
+
+
+ long numBases = 0;
+ long numBasesCovered = 0;
+ long numVariantPositions = 0;
+ long numSNPpositions = 0;
+ long numIndelPositions = 0;
+ long numFailStrandFilter = 0;
+ long numFailMendelFilter = 0;
+ long numVariantsReported = 0;
+ long numVariantsReportedDeNovo = 0;
+ long numSNPsReported = 0;
+ long numSNPsReportedDeNovo = 0;
+ long numIndelsReported = 0;
+ long numIndelsReportedDeNovo = 0;
+
+ int numParsingExceptions = 0;
+
+ HashMap<String, Integer> stats = new HashMap<String, Integer>();
+
+ // Parse piped input or user-provided pileup file //
+
+ try
+ {
+
+ // Declare file-parsing variables //
+
+ BufferedReader in = VarScan.getInfile(args);
+ String line;
+
+ // If no input, print usage //
+
+ if(in == null)
+ {
+ System.out.println(usage);
+ return;
+ }
+
+ // If input file not ready, give it a few seconds //
+ int numNaps = 0;
+
+ while(!in.ready())
+ {
+ try {
+ Thread.sleep(5000);
+ numNaps++;
+
+ if(numNaps > 100)
+ {
+ System.err.println("Input file was not ready after 100 5-second cycles!");
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception while trying to get input" + e.getMessage());
+ System.exit(1);
+ }
+ }
+
+ // Proceed if input stream is ready //
+ String vcfHeader = "##fileformat=VCFv4.1";
+
+ if(in != null && in.ready())
+ {
+ // Declare output file //
+ PrintStream outSnp = null; // declare a print stream object for SNPs
+ PrintStream outIndel = null; // declare a print stream object for Indels
+
+ outSnp = new PrintStream( new FileOutputStream(outputSnp) );
+ outIndel = new PrintStream( new FileOutputStream(outputIndel) );
+
+ // Print a file header //
+ if(!params.containsKey("no-headers"))
+ {
+ if(params.containsKey("output-vcf"))
+ {
+ // Output VCF Header //
+
+ vcfHeader += "\n" + "##source=VarScan2";
+ vcfHeader += "\n" + "##INFO=<ID=ADP,Number=1,Type=Integer,Description=\"Average per-sample depth of bases with Phred score >= " + minAvgQual + "\">";
+ vcfHeader += "\n" + "##INFO=<ID=STATUS,Number=1,Type=String,Description=\"Variant status in trio (1=untransmitted, 2=transmitted, 3=denovo, 4=MIE)\">";
+ vcfHeader += "\n" + "##INFO=<ID=DENOVO,Number=0,Type=Flag,Description=\"Indicates apparent de novo mutations unique to the child\">";
+ vcfHeader += "\n" + "##FILTER=<ID=str10,Description=\"Less than 10% or more than 90% of variant supporting reads on one strand\">";
+ vcfHeader += "\n" + "##FILTER=<ID=indelError,Description=\"Likely artifact due to indel reads at this position\">";
+ vcfHeader += "\n" + "##FILTER=<ID=mendelError,Description=\"Apparent Mendelian inheritance error (MIE) in trio\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=SDP,Number=1,Type=Integer,Description=\"Raw Read Depth as reported by SAMtools\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Quality Read Depth of bases with Phred score >= " + minAvgQual + "\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RD,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases (reads1)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases (reads2)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=FREQ,Number=1,Type=String,Description=\"Variant allele frequency\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=PVAL,Number=1,Type=String,Description=\"P-value from Fisher's Exact Test\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RBQ,Number=1,Type=Integer,Description=\"Average quality of reference-supporting bases (qual1)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=ABQ,Number=1,Type=Integer,Description=\"Average quality of variant-supporting bases (qual2)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RDF,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases on forward strand (reads1plus)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=RDR,Number=1,Type=Integer,Description=\"Depth of reference-supporting bases on reverse strand (reads1minus)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=ADF,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases on forward strand (reads2plus)\">";
+ vcfHeader += "\n" + "##FORMAT=<ID=ADR,Number=1,Type=Integer,Description=\"Depth of variant-supporting bases on reverse strand (reads2minus)\">";
+
+
+ }
+ else
+ {
+ // Output VarScan Header //
+ System.out.println("Chrom\tPosition\tRef\tVar\tStrandFilter:R1+:R1-:R2+:R2-:pval\tFather:Cov:Reads1:Reads2:Freq:P-value\tMother:Cov:Reads1:Reads2:Freq:P-value\tChild:Cov:Reads1:Reads2:Freq:P-value");
+ }
+
+ }
+
+
+
+ // Parse the infile line by line //
+
+ while ((line = in.readLine()) != null)
+ {
+ numBases++;//stats.put("numBases", (stats.get("numBases") + 1));
+
+ // Output progress line //
+ if(params.containsKey("verbose") && (numBases % 100000) == 0)
+ System.err.println(numBases + " positions parsed...");
+
+ // Begin try-catch for line parsing //
+
+ try
+ {
+ String[] lineContents = line.split("\t");
+
+ // Verify expected pileup format //
+
+ if(lineContents.length > 5 && lineContents[0].length() > 0 && lineContents[1].length() > 0 && lineContents[2].length() > 0 && lineContents[3].length() > 0)
+ {
+ if(numBases == 1 && params.containsKey("output-vcf"))
+ {
+ vcfHeader += "\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
+ if(sampleList.length() > 0)
+ {
+ vcfHeader += "\t" + sampleList;
+ }
+ else
+ {
+ // print the VCF sample header //
+ vcfHeader += "\tFather\tMother\tChild";
+ }
+
+ // Output VCF Header //
+ outSnp.println(vcfHeader);
+ outIndel.println(vcfHeader);
+
+ }
+
+
+ String refName = lineContents[0];
+ String position = lineContents[1];
+ String refBase = lineContents[2].toUpperCase();
+ HashMap<String, Integer> varAlleles = new HashMap<String, Integer>();
+ boolean variantFlag = false;
+
+ // Declare variables for cross-sample calling and strand filter //
+ double strandPvalue = 1.00;
+ String strandFilterStatus = "";
+
+ if(lineContents.length > 12)
+ {
+ if(numBases == 1)
+ System.err.println("Warning: More than 3 samples in pileup; but only first 3 will be used and they should be father, mother child");
+ }
+
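+						// Each sample contributes three mpileup columns after chrom, position, and ref base: //
+						// depth, read bases, and base qualities. For a trio the expected layout is: //
+						// chrom pos ref fDepth fBases fQuals mDepth mBases mQuals cDepth cBases cQuals //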
+ // Get Father Call //
+ int offset = 3;
+ int fatherDepth = Integer.parseInt(lineContents[offset]);
+ String fatherBases = lineContents[offset + 1];
+ String fatherQualities = lineContents[offset + 2];
+ int fatherQualityDepth = VarScan.qualityDepth(fatherQualities, minAvgQual);
+
+ // Get Mother Call //
+ offset = 6;
+ int motherDepth = Integer.parseInt(lineContents[offset]);
+ String motherBases = lineContents[offset + 1];
+ String motherQualities = lineContents[offset + 2];
+ int motherQualityDepth = VarScan.qualityDepth(motherQualities, minAvgQual);
+
+ // Get Child Call //
+ offset = 9;
+ int childDepth = Integer.parseInt(lineContents[offset]);
+ String childBases = lineContents[offset + 1];
+ String childQualities = lineContents[offset + 2];
+ int childQualityDepth = VarScan.qualityDepth(childQualities, minAvgQual);
+
+ if(fatherQualityDepth >= minCoverage && motherQualityDepth >= minCoverage && childQualityDepth >= minCoverage)
+ {
+ numBasesCovered++;
+
+ // Perform strand filter test //
+ String allBases = fatherBases + motherBases + childBases;
+ String allQualities = fatherQualities + motherQualities + childQualities;
+ HashMap<String, String> allCounts = VarScan.getReadCounts(refBase, allBases, allQualities, minAvgQual, "");
+ String positionCall = VarScan.callPosition(refBase, allCounts, "CNS", minReads2, 0.01, minAvgQual, 0.95, minFreqForHom);
+ String[] callContents = positionCall.split("\t");
+ if(callContents.length >= 15)
+ {
+ int reads1plus = Integer.parseInt(callContents[11]);
+ int reads1minus = Integer.parseInt(callContents[12]);
+ int reads2plus = Integer.parseInt(callContents[13]);
+ int reads2minus = Integer.parseInt(callContents[14]);
+ strandFilterStatus = VarScan.strandFilter(reads1plus, reads1minus, reads2plus, reads2minus, strandPvalueThreshold);
+ }
+// System.err.println(strandFilterStatus);
+
+
+ HashMap<String, String> fatherCounts = VarScan.getReadCounts(refBase, fatherBases, fatherQualities, minAvgQual, "");
+ HashMap<String, String> motherCounts = VarScan.getReadCounts(refBase, motherBases, motherQualities, minAvgQual, "");
+ HashMap<String, String> childCounts = VarScan.getReadCounts(refBase, childBases, childQualities, minAvgQual, "");
+
+ // Prepare Strings for Results //
+ String fatherCall = "";
+ String motherCall = "";
+ String childCall = "";
+ String trioStatus = "";
+
+ // Try trio calling //
+ String trioCall = callTrio(refBase, fatherCounts, motherCounts, childCounts, minReads2, minVarFreq, minAvgQual, pValueThreshold, minFreqForHom);
+ String[] trioCallContents = trioCall.split("\t");
+
+ if(trioCallContents.length >= 4)
+ {
+ fatherCall = trioCallContents[0];
+ motherCall = trioCallContents[1];
+ childCall = trioCallContents[2];
+
+ trioStatus = trioCallContents[3];
+ boolean recallTrio = false;
+
+ // Consider re-calling de novo mutations, MIEs, and untransmitted //
+ if(trioStatus.equals("DeNovo") || trioStatus.contains("MIE") || trioStatus.equals("Untransmitted"))
+ {
+ // Parse out the variant allele from each sample //
+ String[] fatherContents = fatherCall.split(":");
+ String[] motherContents = motherCall.split(":");
+ String[] childContents = childCall.split(":");
+
+ String fatherAllele = refBase;
+ String motherAllele = refBase;
+ String childAllele = refBase;
+
+ if(fatherContents.length >= 16)
+ fatherAllele = fatherContents[15];
+
+ if(motherContents.length >= 16)
+ motherAllele = motherContents[15];
+
+ if(childContents.length >= 16)
+ childAllele = childContents[15];
+
+ // Evaluate if we should re-call the trio with reduced thresholds //
+
+ if(trioStatus.equals("Untransmitted"))
+ {
+ // Re-call if child was Reference but has evidence of variant //
+ if(!childAllele.equals(refBase) && (childAllele.equals(motherAllele) || childAllele.equals(fatherAllele)))
+ recallTrio = true;
+ }
+ else if(trioStatus.equals("DeNovo"))
+ {
+ // Recall if child had de novo but either parent has evidence //
+ //if(fatherAllele.equals(childAllele) || motherAllele.equals(childAllele))
+ recallTrio = true;
+ }
+ else if(trioStatus.contains("MIE"))
+ {
+ // Recall if there was an apparent inheritance error //
+ recallTrio = true;
+ }
+
+ if(recallTrio)
+ {
+ // Adjust values and recall trio //
+// double adjustedMinVarFreq = minVarFreq / 2.00;
+// double adjustedpValueThreshold = 0.20;
+
+ trioCall = callTrio(refBase, fatherCounts, motherCounts, childCounts, minReads2, adjustedMinVarFreq, minAvgQual, adjustedpValueThreshold, minFreqForHom);
+ trioCallContents = trioCall.split("\t");
+
+ // Determine if something changed //
+ if(!trioStatus.equals(trioCallContents[3]))
+ {
+ String change = "initially " + trioStatus + " were re-called " + trioCallContents[3];
+ if(!stats.containsKey(change))
+ {
+// System.err.println("CHANGED FROM " + trioStatus + "\t" + trioCall);
+ stats.put(change, 1);
+ }
+ else
+ {
+ stats.put(change, (stats.get(change) + 1));
+ }
+ }
+
+ trioStatus = trioCallContents[3];
+
+ } // Otherwise don't re-call //
+
+ }
+ else
+ {
+ // Must have been Reference or Germline //
+ }
+
+ String variantType = "SNP";
+
+ fatherCall = trioCallContents[0];
+ motherCall = trioCallContents[1];
+ childCall = trioCallContents[2];
+
+ // Parse out the variant allele from each sample //
+ String[] fatherContents = fatherCall.split(":");
+ String[] motherContents = motherCall.split(":");
+ String[] childContents = childCall.split(":");
+
+ String fatherAllele = refBase;
+ String motherAllele = refBase;
+ String childAllele = refBase;
+
+ // BUILD FATHER VCF //
+
+ String fatherVCF = "./.:.:" + fatherQualityDepth;
+
+ if(fatherContents.length >= 15)
+ {
+ if(fatherContents.length >= 16)
+ fatherAllele = fatherContents[15];
+ String consBase = fatherContents[0];
+ int reads1 = Integer.parseInt(fatherContents[1]);
+ int reads2 = Integer.parseInt(fatherContents[2]);
+ String varFreq = fatherContents[3];
+ int qual1 = Integer.parseInt(fatherContents[6]);
+ int qual2 = Integer.parseInt(fatherContents[7]);
+ double pValue = Double.parseDouble(fatherContents[8]);
+ int reads1plus = Integer.parseInt(fatherContents[11]);
+ int reads1minus = Integer.parseInt(fatherContents[12]);
+ int reads2plus = Integer.parseInt(fatherContents[13]);
+ int reads2minus = Integer.parseInt(fatherContents[14]);
+
+ double logP = 0;
+ try {
+ logP = 0 - (10 * java.lang.Math.log10(pValue));
+ if(logP > 255)
+ logP = 255;
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
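+
+									// logP is a Phred-scaled genotype quality: GQ = -10 * log10(p), capped at 255 //
+									// e.g. p = 0.01 gives GQ = 20, and p = 1e-6 gives GQ = 60 //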
+
+ // Father is wildtype //
+ if(consBase.equals(refBase))
+ {
+ // A reference call - recalculate p-value against a possible het //
+ int expReads1 = (reads1 + reads2) / 2;
+ int expReads2 = (reads1 + reads2) - expReads1;
+ double newPvalue = VarScan.getSignificance(reads1, reads2, expReads1, expReads2);
+ double newLogP = 0;
+ try {
+ newLogP = 0 - (10 * java.lang.Math.log10(newPvalue));
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
+ fatherVCF = "0" + "/" + "0";
+ fatherVCF += ":" + (int) newLogP + ":" + fatherDepth + ":" + fatherQualityDepth;
+ fatherVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ fatherVCF += ":" + qual1 + ":" + qual2;
+ fatherVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+
+ }
+ // Father is variant //
+ else if(fatherAllele.length() > 0 && !fatherAllele.equals("N") && !fatherAllele.equals("."))
+ {
+ // Determine how many variant alleles have been seen //
+
+ int varAlleleNumber = 0;
+
+ // Determine if we've seen the variant and what its number is ##
+
+ if(varAlleles.containsKey(fatherAllele))
+ {
+ varAlleleNumber = varAlleles.get(fatherAllele);
+ }
+ else
+ {
+ // IF no variants yet seen, this is variant allele 1 //
+ varAlleleNumber = varAlleles.size() + 1;
+ varAlleles.put(fatherAllele, varAlleleNumber);
+ }
+
+ if(fatherContents.length >= 1)
+ {
+ if(VarScan.isHomozygous(consBase))
+ {
+ fatherVCF = varAlleleNumber + "/" + varAlleleNumber;
+ }
+ else
+ {
+ fatherVCF = "0" + "/" + varAlleleNumber;
+ }
+
+ fatherVCF += ":" + (int) logP + ":" + fatherDepth + ":" + fatherQualityDepth;
+ fatherVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ fatherVCF += ":" + qual1 + ":" + qual2;
+ fatherVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+ }
+
+ if(fatherAllele.length() > 1)
+ variantType = "Indel";
+ }
+
+ }
+
+ // BUILD MOTHER VCF //
+
+ String motherVCF = "./.:.:" + motherQualityDepth;
+
+ if(motherContents.length >= 15)
+ {
+ if(motherContents.length >= 16)
+ motherAllele = motherContents[15];
+ String consBase = motherContents[0];
+ int reads1 = Integer.parseInt(motherContents[1]);
+ int reads2 = Integer.parseInt(motherContents[2]);
+ String varFreq = motherContents[3];
+ int qual1 = Integer.parseInt(motherContents[6]);
+ int qual2 = Integer.parseInt(motherContents[7]);
+ double pValue = Double.parseDouble(motherContents[8]);
+ int reads1plus = Integer.parseInt(motherContents[11]);
+ int reads1minus = Integer.parseInt(motherContents[12]);
+ int reads2plus = Integer.parseInt(motherContents[13]);
+ int reads2minus = Integer.parseInt(motherContents[14]);
+
+ double logP = 0;
+ try {
+ logP = 0 - (10 * java.lang.Math.log10(pValue));
+ if(logP > 255)
+ logP = 255;
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
+
+ // mother is wildtype //
+ if(consBase.equals(refBase))
+ {
+ // A reference call - recalculate p-value against a possible het //
+ int expReads1 = (reads1 + reads2) / 2;
+ int expReads2 = (reads1 + reads2) - expReads1;
+ double newPvalue = VarScan.getSignificance(reads1, reads2, expReads1, expReads2);
+ double newLogP = 0;
+ try {
+ newLogP = 0 - (10 * java.lang.Math.log10(newPvalue));
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
+ motherVCF = "0" + "/" + "0";
+ motherVCF += ":" + (int) newLogP + ":" + motherDepth + ":" + motherQualityDepth;
+ motherVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ motherVCF += ":" + qual1 + ":" + qual2;
+ motherVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+
+ }
+ // mother is variant //
+ else if(motherAllele.length() > 0 && !motherAllele.equals("N") && !motherAllele.equals("."))
+ {
+ // Determine how many variant alleles have been seen //
+
+ int varAlleleNumber = 0;
+
+ // Determine if we've seen the variant and what its number is ##
+
+ if(varAlleles.containsKey(motherAllele))
+ {
+ varAlleleNumber = varAlleles.get(motherAllele);
+ }
+ else
+ {
+ // IF no variants yet seen, this is variant allele 1 //
+ varAlleleNumber = varAlleles.size() + 1;
+ varAlleles.put(motherAllele, varAlleleNumber);
+ }
+
+ if(motherContents.length >= 1)
+ {
+ if(VarScan.isHomozygous(consBase))
+ {
+ motherVCF = varAlleleNumber + "/" + varAlleleNumber;
+ }
+ else
+ {
+ motherVCF = "0" + "/" + varAlleleNumber;
+ }
+
+ motherVCF += ":" + (int) logP + ":" + motherDepth + ":" + motherQualityDepth;
+ motherVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ motherVCF += ":" + qual1 + ":" + qual2;
+ motherVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+ }
+
+ if(motherAllele.length() > 1)
+ variantType = "Indel";
+ }
+
+ }
+
+ // BUILD CHILD VCF //
+
+ String childVCF = "./.:.:" + childQualityDepth;
+
+ if(childContents.length >= 15)
+ {
+ if(childContents.length >= 16)
+ childAllele = childContents[15];
+ String consBase = childContents[0];
+ int reads1 = Integer.parseInt(childContents[1]);
+ int reads2 = Integer.parseInt(childContents[2]);
+ String varFreq = childContents[3];
+ int qual1 = Integer.parseInt(childContents[6]);
+ int qual2 = Integer.parseInt(childContents[7]);
+ double pValue = Double.parseDouble(childContents[8]);
+ int reads1plus = Integer.parseInt(childContents[11]);
+ int reads1minus = Integer.parseInt(childContents[12]);
+ int reads2plus = Integer.parseInt(childContents[13]);
+ int reads2minus = Integer.parseInt(childContents[14]);
+
+ double logP = 0;
+ try {
+ logP = 0 - (10 * java.lang.Math.log10(pValue));
+ if(logP > 255)
+ logP = 255;
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
+
+ // child is wildtype //
+ if(consBase.equals(refBase))
+ {
+ // A reference call - recalculate p-value against a possible het //
+ int expReads1 = (reads1 + reads2) / 2;
+ int expReads2 = (reads1 + reads2) - expReads1;
+ double newPvalue = VarScan.getSignificance(reads1, reads2, expReads1, expReads2);
+ double newLogP = 0;
+ try {
+ newLogP = 0 - (10 * java.lang.Math.log10(newPvalue));
+ }
+ catch(Exception e)
+ {
+ // Stick with default logP value
+ }
+ childVCF = "0" + "/" + "0";
+ childVCF += ":" + (int) newLogP + ":" + childDepth + ":" + childQualityDepth;
+ childVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ childVCF += ":" + qual1 + ":" + qual2;
+ childVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+
+ }
+ // child is variant //
+ else if(childAllele.length() > 0 && !childAllele.equals("N") && !childAllele.equals("."))
+ {
+ // Determine how many variant alleles have been seen //
+
+ int varAlleleNumber = 0;
+
+ // Determine if we've seen the variant and what its number is ##
+
+ if(varAlleles.containsKey(childAllele))
+ {
+ varAlleleNumber = varAlleles.get(childAllele);
+ }
+ else
+ {
+ // IF no variants yet seen, this is variant allele 1 //
+ varAlleleNumber = varAlleles.size() + 1;
+ varAlleles.put(childAllele, varAlleleNumber);
+ }
+
+ if(childContents.length >= 1)
+ {
+ if(VarScan.isHomozygous(consBase))
+ {
+ childVCF = varAlleleNumber + "/" + varAlleleNumber;
+ }
+ else
+ {
+ childVCF = "0" + "/" + varAlleleNumber;
+ }
+
+ childVCF += ":" + (int) logP + ":" + childDepth + ":" + childQualityDepth;
+ childVCF += ":" + reads1 + ":" + reads2 + ":" + varFreq + ":" + pvalueFormat.format(pValue);
+ childVCF += ":" + qual1 + ":" + qual2;
+ childVCF += ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus;
+ }
+
+ if(childAllele.length() > 1)
+ variantType = "Indel";
+ }
+
+ }
+
+
+ // BEGIN BUILDING OUTPUT //
+
+
+ // Build an output line //
+ String outLine = refName + "\t" + position + "\t";
+
+
+ // Get All Variant alleles observed //
+
+ String varBases = "";
+								// First, obtain the unique variant allele keys (HashMap iteration order is arbitrary) //
+								String[] sortedKeys = varAlleles.keySet().toArray(new String[0]);
+
+								// Create an empty array to hold the alleles in their order of first occurrence //
+								String[] alleleKeys = new String[sortedKeys.length];
+
+ // Put alleles into this array in their order of occurrence in VCF line //
+ for(String allele : sortedKeys)
+ {
+ int arrayIndex = varAlleles.get(allele) - 1;
+ alleleKeys[arrayIndex] = allele;
+ }
+
+ // Export all variant alleles into a comma-separated string//
+ // This is what's provided in native output, or converted to VCF format //
+ for(String allele : alleleKeys)
+ {
+ if(varBases.length() > 0)
+ varBases += ",";
+
+ varBases += allele;
+ }
+
+ // It's possible that we see no variant here, so we need the proper empty character //
+ if(varBases.length() == 0)
+ varBases = ".";
+
+
+ // Calculate average sample depth //
+ int avgQualityDepth = (fatherQualityDepth + motherQualityDepth + childQualityDepth) / 3;
+ String refColumn = "";
+ String varColumn = "";
+
+ // Handle complex positions with multiple alleles including at least one indel //
+
+ if(varBases.contains(",") && (varBases.contains("-") || varBases.contains("+")))
+ {
+ variantType = "INDEL";
+ // Multi-allele indel //
+ int maxDelSize = 0;
+ String maxDelBases = "";
+ // Go through each varAllele to find longest deletion //
+ String[] varBaseContents = varBases.split(",");
+ for(String varAllele : varBaseContents)
+ {
+ if(varAllele.startsWith("-"))
+ {
+ varAllele = varAllele.replace("-", "");
+ if(varAllele.length() > maxDelSize)
+ {
+ maxDelBases = varAllele;
+ maxDelSize = varAllele.length();
+ }
+ }
+ }
+
+ // Set refBase to maximum del //
+ refColumn = refBase + maxDelBases;
+
+ // Establish each allele in var Column //
+ varColumn = "";
+
+ for(String varAllele : varBaseContents)
+ {
+ if(varColumn.length() > 0)
+ varColumn = varColumn + ",";
+
+ if(varAllele.startsWith("-"))
+ {
+ varAllele = varAllele.replace("-", "");
+
+ // For the smaller deletion, determine ref bases to add //
+ if(varAllele.length() < maxDelSize)
+ {
+ String varEntry = maxDelBases.replace(varAllele, "");
+ varColumn = varColumn + refBase + varEntry;
+ }
+ else
+ {
+ varColumn = varColumn + refBase;
+ }
+ }
+ else if(varAllele.startsWith("+"))
+ {
+ varAllele = varAllele.replace("+", "");
+ String varEntry = refBase + varAllele + maxDelBases;
+ varColumn = varColumn + varEntry;
+ }
+ else
+ {
+ String varEntry = varAllele + maxDelBases;
+ varColumn = varColumn + varEntry;
+ }
+ }
+
+
+ }
+
+ else if(varBases.startsWith("+"))
+ {
+ variantType = "INDEL";
+ // INSERTION //
+ // Ref = ref base; Var = ref base followed by inserted bases //
+ refColumn = refBase;
+ varColumn = refBase + varBases.replace("+", "");
+ }
+ else if(varBases.startsWith("-"))
+ {
+ variantType = "INDEL";
+ // DELETION //
+ // Ref = ref base followed by deleted bases; var = ref base //
+ refColumn = refBase + varBases.replace("-", "");
+ varColumn = refBase;
+ }
+ else
+ {
+ // Variant type SNP //
+ refColumn = refBase;
+ varColumn = varBases;
+ }
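+
+								// Examples of pileup-allele to VCF REF/ALT conversion, assuming ref base "A": //
+								// insertion "+CT" -> REF=A, ALT=ACT; deletion "-CT" -> REF=ACT, ALT=A; SNP "G" -> REF=A, ALT=G //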
+
+
+ // Ensure that varColumn does not contain any +/- //
+ varColumn = varColumn.replace("+", "");
+ varColumn = varColumn.replace("-", "");
+
+ // ADD REF, ALT, FILTER, INFO, and FORMAT FIELDS TO OUTPUT //
+
+ outLine += "." + "\t" + refColumn + "\t" + varColumn + "\t.\t";
+
+ String filterColumn = "";
+ if(trioStatus.contains("MIE"))
+ {
+ filterColumn = "mendelError";
+ }
+ else if (strandFilterStatus.contains("Fail"))
+ {
+ filterColumn = "str10";
+ }
+ else
+ {
+ filterColumn = "PASS";
+ }
+
+ outLine += filterColumn + "\t";
+ outLine += "ADP=" + avgQualityDepth + ";STATUS="; // + trioStatus;
+
+ if(trioStatus.contains("Untransmitted"))
+ {
+ outLine += "1";
+ }
+ else if(trioStatus.contains("Germline"))
+ {
+ outLine += "2";
+ }
+ else if(trioStatus.contains("DeNovo"))
+ {
+ outLine += "3;DENOVO";
+ }
+ else if(trioStatus.contains("MIE"))
+ {
+ outLine += "4";
+ }
+
+ outLine += "\t" + "GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR" + "\t";
+
+ outLine += fatherVCF + "\t" + motherVCF + "\t" + childVCF;
+ // outLine += "\t" + fatherCall.replace("\t", ":") + "\t" + motherCall.replace("\t", ":") + "\t" + childCall.replace("\t", ":");
+
+ // Count this trio status //
+ String statKey = "";
+ if(trioStatus.equals("Reference"))
+ {
+ // No counting or printing these sites //
+ }
+ else
+ {
+ // A variant position... flag it and count the type //
+ numVariantPositions++;
+ variantFlag = true;
+
+ if(variantType.equals("INDEL"))
+ {
+ numIndelPositions++;
+ }
+ else
+ {
+ numSNPpositions++;
+ }
+
+ // Also count pass/fail filter statuses //
+
+ if(strandFilterStatus.contains("Fail"))
+ {
+ numFailStrandFilter++;
+ }
+ else if(trioStatus.equals("MIE"))
+ {
+ numFailMendelFilter++;
+ }
+ else
+ {
+ numVariantsReported++;
+ if(trioStatus.equals("DeNovo"))
+ numVariantsReportedDeNovo++;
+
+ if(variantType.equals("INDEL"))
+ {
+ numIndelsReported++;
+ if(trioStatus.equals("DeNovo"))
+ numIndelsReportedDeNovo++;
+ }
+ else
+ {
+ numSNPsReported++;
+ if(trioStatus.equals("DeNovo"))
+ numSNPsReportedDeNovo++;
+ }
+ }
+
+ }
+
+ // Determine if we should print the output line //
+ if(variantFlag)
+ {
+ if(variantType.equals("SNP"))
+ {
+ outSnp.println(outLine);
+ }
+ else if(variantType.equals("INDEL"))
+ {
+ outIndel.println(outLine);
+ }
+ }
+ }
+ else
+ {
+ // The trioCallContents was less than 4 fields, so that's a problem //
+ System.err.println("No status for " + numBases);
+ }
+
+ }
+
+
+
+ }
+ else
+ {
+ if(lineContents.length >= 4 && lineContents[3].equals("0"))
+ {
+ // A pileup line with 0x coverage, so ignore
+ }
+ else
+ {
+ System.err.println("Error: Invalid format for pileup at line " + numBases + "\n" + line + "\n");
+ return;
+ }
+
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Parsing Exception on line:\n" + line + "\n" + e.getMessage() + "\n" + e.getLocalizedMessage());
+ e.printStackTrace();
+ numParsingExceptions++;
+ if(numParsingExceptions >= 5)
+ {
+ System.err.println("Too many parsing exceptions encountered; exiting");
+ return;
+ }
+ }
+
+
+ }
+
+ in.close();
+
+ System.err.println(numBases + " bases in pileup file");
+ System.err.println(numBasesCovered + " met the coverage requirement of " + minCoverage);
+ System.err.println(numVariantPositions + " variant positions (" + numSNPpositions + " SNP, " + numIndelPositions + " indel)");
+ System.err.println(numFailStrandFilter + " were failed by the strand-filter");
+ System.err.println(numVariantsReported + " variant positions reported (" + numSNPsReported + " SNP, " + numIndelsReported + " indel)");
+ System.err.println(numVariantsReportedDeNovo + " de novo mutations reported (" + numSNPsReportedDeNovo + " SNP, " + numIndelsReportedDeNovo + " indel)");
+
+ // Print the status of each trio call //
+// System.err.println(stats.get("Reference") + " called Reference");
+			String[] statsKeys = stats.keySet().toArray(new String[0]);
+
+ Arrays.sort(statsKeys);
+
+			// Print each observed re-call status change and its count //
+ for(String statsKey : statsKeys)
+ {
+ System.err.println(stats.get(statsKey) + " " + statsKey);
+ }
+
+ }
+ // Insufficient input was provided, so print usage //
+ else
+ {
+ System.err.println("Please provide an input file!\n" + usage);
+ System.exit(10);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ System.exit(11);
+ }
+ }
+
+
+
+
+
+ public String callTrio(String refBase, HashMap<String, String> fatherCounts, HashMap<String, String> motherCounts, HashMap<String, String> childCounts, int minReads2, double minVarFreq, int minAvgQual, double pValueThreshold, double minFreqForHom)
+ {
+ String fatherCall = VarScan.callPosition(refBase, fatherCounts, "CNS", minReads2, minVarFreq, minAvgQual, pValueThreshold, minFreqForHom);
+ String motherCall = VarScan.callPosition(refBase, motherCounts, "CNS", minReads2, minVarFreq, minAvgQual, pValueThreshold, minFreqForHom);
+ String childCall = VarScan.callPosition(refBase, childCounts, "CNS", minReads2, minVarFreq, minAvgQual, pValueThreshold, minFreqForHom);
+
+ // Determine the father, mother, and child genotypes //
+ String trioStatus = "unknown";
+
+ try
+ {
+ String[] fatherContents = fatherCall.split("\t");
+ String[] motherContents = motherCall.split("\t");
+ String[] childContents = childCall.split("\t");
+
+ String father = fatherContents[0];
+ String mother = motherContents[0];
+ String child = childContents[0];
+ String fatherAllele = refBase;
+ String motherAllele = refBase;
+ String childAllele = refBase;
+
+ if(fatherContents.length >= 16)
+ fatherAllele = fatherContents[15];
+
+ if(motherContents.length >= 16)
+ motherAllele = motherContents[15];
+
+ if(childContents.length >= 16)
+ childAllele = childContents[15];
+
+ // Uninteresting case 1: Any Sample called N //
+ if(child.equals("N") || father.equals("N") || mother.equals("N"))
+ {
+ // Missing data, so not sure
+ trioStatus = "MissingData";
+ }
+ // CASE 1: ALL 3 SAMPLES WILDTYPE //
+ else if(father.equals(refBase) && mother.equals(refBase) && child.equals(refBase))
+ {
+ trioStatus = "Reference";
+ }
+ // CASE 2: DE NOVO MUTATION //
+ else if(father.equals(refBase) && mother.equals(refBase) && !child.equals(refBase))
+ {
+ trioStatus = "DeNovo";
+
+ }
+ else
+ {
+ // CHECK INDIVIDUAL GENOTYPE ALLELES //
+ String fatherGt = VarScan.codeToGenotype(father);
+ String motherGt = VarScan.codeToGenotype(mother);
+ String childGt = VarScan.codeToGenotype(child);
+
+ String father1 = "N";
+ String father2 = "N";
+ String[] fatherSplit = fatherGt.split("/");
+ if(fatherSplit.length == 2)
+ {
+ father1 = fatherSplit[0];
+ father2 = fatherSplit[1];
+ }
+
+ String mother1 = "N";
+ String mother2 = "N";
+ String[] motherSplit = motherGt.split("/");
+ if(motherSplit.length == 2)
+ {
+ mother1 = motherSplit[0];
+ mother2 = motherSplit[1];
+ }
+
+ String child1 = "N";
+ String child2 = "N";
+ String[] childSplit = childGt.split("/");
+ if(childSplit.length == 2)
+ {
+ child1 = childSplit[0];
+ child2 = childSplit[1];
+ }
+
+ if(father1.equals("*"))
+ father1 = refBase;
+ if(mother1.equals("*"))
+ mother1 = refBase;
+ if(child1.equals("*"))
+ child1 = refBase;
+
+ // CHILD IS VARIANT
+ if(!child.equals(refBase))
+ {
+ if((child1.equals(father1) || child1.equals(father2) || child2.equals(father1) || child2.equals(father2)) && (child1.equals(mother1) || child1.equals(mother2) || child2.equals(mother1) || child2.equals(mother2)))
+ {
+ trioStatus = "Germline";
+ }
+ else if(!child.equals(fatherAllele) && !child.equals(motherAllele))
+ {
+ trioStatus = "MultAlleles";
+ }
+ else
+ {
+ trioStatus = "MIE";
+ }
+ }
+ // CHILD IS WILDTYPE
+ else if(child.equals(refBase))
+ {
+ // Can one wildtype allele come from dad and one from mom ? //
+ if((father1.equals(refBase) || father2.equals(refBase)) && (mother1.equals(refBase) || mother2.equals(refBase)))
+ {
+ trioStatus = "Untransmitted";
+ }
+
+ else
+ {
+ trioStatus = "MIE";
+ }
+ }
+ }
+ // CASE 3B: GERMLINE VARIANT WITH CHILD HET AND ONE PARENT HOM
+ // CASE 3C: CHILD IS WILDTYPE, AND NEITHER PARENT HOMOZYGOUS-VARIANT
+ // CASE 4: IMPOSSIBLE HOMOZYGOTE
+ // CASE 5: SHOULD BE HET //
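+			//
+			// Illustrative genotype examples, assuming ref base A: //
+			// father A/G, mother A/A, child A/G -> Germline (child alleles traceable to both parents) //
+			// father A/A, mother A/A, child A/G -> DeNovo (variant unique to child, handled above) //
+			// father G/G, mother G/G, child A/A -> MIE (a reference allele cannot come from either parent) //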
+
+ return(fatherCall.replace("\t", ":") + "\t" + motherCall.replace("\t", ":") + "\t" + childCall.replace("\t", ":") + "\t" + trioStatus);
+
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error parsing genotypes: " + e.getMessage() + " local: " + e.getLocalizedMessage());
+ e.printStackTrace();
+
+ }
+
+ return("");
+ }
+
+}
diff --git a/net/sf/varscan/VarScan.java b/net/sf/varscan/VarScan.java
new file mode 100644
index 0000000..adf2283
--- /dev/null
+++ b/net/sf/varscan/VarScan.java
@@ -0,0 +1,1757 @@
+/**
+ * @(#)VarScan.java
+ *
+ * Copyright (c) 2009-2010 Daniel C. Koboldt and Washington University in St. Louis
+ *
+ * COPYRIGHT
+ */
+
+package net.sf.varscan;
+
+// Import required packages //
+
+import java.io.*;
+import java.util.*;
+import java.text.*;
+
+
+/**
+ * A set of tools for variant detection in next-generation sequence data.
+ *
+ * @version 2.3
+ *
+ * @author Daniel C. Koboldt <dkoboldt at genome.wustl.edu>
+ *
+ * <BR>
+ * <pre>
+ * COMMANDS
+ * pileup2snp [pileup file] OPTIONS
+ * Call SNPs from a pileup file that meet certain cutoffs
+ * Input: Pileup file and parameters
+ * Output: SNPs file with read counts and p-value
+ *
+ * pileup2indel [pileup file] OPTIONS
+ * Call indels from a pileup file that meet certain cutoffs
+ * Input: Pileup file and parameters
+ * Output: Indels file with read counts and p-value
+ *
+ * pileup2cns [pileup file] OPTIONS
+ * Call consensus genotypes (reference or variant) at sites with sufficient coverage
+ * Input: Pileup file and parameters
+ * Output: Consensus file with genotypes, read counts and p-values
+ *
+ * mpileup2cns [pileup file] OPTIONS
+ * Call consensus genotypes (reference or variant) across one or more samples
+ * Input: SAMtools mpileup file and parameters
+ * Output: Consensus file with genotypes, read counts and p-values, or VCF file
+ *
+ * somatic [normal_pileup] [tumor_pileup] [output] OPTIONS
+ * Determine somatic status of SNPs from normal/tumor pileup for positions
+ * Input: Normal pileup, tumor pileup, and positions file
+ * Output: SNPs file with read counts and somatic status
+ *
+ * readcounts [pileup] --variants-file [positions] --output-file [output]
+ * Obtain read counts for each allele of variants from a pileup file
+ * Input: 	Variants file and pileup file
+ * Output: Variants file with read counts for each allele
+ *
+ * filter [variant file] OPTIONS
+ * Filter a set of SNPs/indels based on coverage, reads, p-value, etc.
+ * Input: SNPs file with read counts and p-value
+ * Output: Filtered SNPs file with read counts and p-value
+ *
+ * somaticFilter [somatic-status file] OPTIONS
+ * Filter VarScan Somatic/Germline/LOH calls for clusters and proximal indels
+ * Input: VarScan output for SNPs or Indels (varscan.output.snp)
+ * Output: Variants passing all filters (varscan.output.snp.filter)
+ *
+ * processSomatic [somatic-status file] OPTIONS
+ * Process VarScan output by somatic status and confidence
+ * Input: VarScan output for SNPs or Indels (varscan.output.snp)
+ * Output: Variants by somatic status (varscan.output.snp.Somatic)
+ *
+ * copyCaller [copynumber file] OPTIONS
+ * Process VarScan copynumber output to adjust for GC and make preliminary calls
+ * Input: VarScan copynumber output (varscan.output.copynumber)
+ * Output: Normalized copy number with preliminary calls (varscan.output.copynumber.called)
+ *
+ * compare [file1] [file2] [type] [output] OPTIONS
+ * Compares chromosome-position entries in two tab-delimited files
+ * Input: File 1 and File 2
+ * Output: Merged, intersected, or unique entries
+ *
+ * limit [variants] --regions-file [regions] --output-file [output]
+ * Limit a tab-delimited file (SNPs, pileup, etc) to a set of positions or regions
+ * Input: tab-delimited input file with chromosome & position; positions-file or regions-file
+ * Output: Entries in input-file matching regions or positions
+ *
+ * coverage [pileup-file] --regions-file [regions] --output-file [output]
+ * **Experimental** Calculate Q>20 coverage depth/breadth for a set of target regions
+ * Input: Pileup file and tab-delimited regions-file
+ * Output: Coverage report at various Q>20 depths (1x,10x,20x...)
+
+ *
+ * </pre>
+ *
+ *
+ */
+public class VarScan {
+
+ final static double MIN_FREQ_FOR_HOM = 0.70;
+
+ /**
+ * Runs the main execution logic
+ * @param args Command-line arguments
+ */
+ public static void main(String[] args) {
+
+ String usage = "VarScan v2.3\n\nUSAGE: java -jar VarScan.jar [COMMAND] [OPTIONS] \n\n";
+ usage = usage + "COMMANDS:\n" +
+ "\tpileup2snp\t\tIdentify SNPs from a pileup file\n" +
+ "\tpileup2indel\t\tIdentify indels a pileup file\n" +
+ "\tpileup2cns\t\tCall consensus and variants from a pileup file\n" +
+
+ "\tmpileup2snp\t\tIdentify SNPs from an mpileup file\n" +
+ "\tmpileup2indel\t\tIdentify indels an mpileup file\n" +
+ "\tmpileup2cns\t\tCall consensus and variants from an mpileup file\n\n" +
+
+ "\tsomatic\t\t\tCall germline/somatic variants from tumor-normal pileups\n" +
+ "\tcopynumber\t\t\tDetermine relative tumor copy number from tumor-normal pileups\n" +
+ "\treadcounts\t\tObtain read counts for a list of variants from a pileup file\n\n" +
+
+ "\tfilter\t\t\tFilter SNPs by coverage, frequency, p-value, etc.\n" +
+ "\tsomaticFilter\t\tFilter somatic variants for clusters/indels\n" +
+ "\tprocessSomatic\t\tIsolate Germline/LOH/Somatic calls from output\n" +
+ "\tcopyCaller\t\tGC-adjust and process copy number changes from VarScan copynumber output\n" +
+
+ "\tcompare\t\t\tCompare two lists of positions/variants\n" +
+ "\tlimit\t\t\tRestrict pileup/snps/indels to ROI positions\n" +
+ "\n";
+
+ if(args.length > 0)
+ {
+ HashMap<String, String> params = getParams(args);
+
+ if(args[0].equals("pileup2snp"))
+ {
+ pileup2call(args, params, "SNP");
+ }
+
+ else if(args[0].equals("pileup2indel"))
+ {
+ pileup2call(args, params, "INDEL");
+ }
+
+ else if(args[0].equals("pileup2cns"))
+ {
+ pileup2call(args, params, "CNS");
+ }
+
+ else if(args[0].equals("mpileup2snp") || args[0].equals("mpileup2indel") || args[0].equals("mpileup2cns") || args[0].equals("mpileup2vcf"))
+ {
+ mpileup2call(args, params, "CNS");
+ }
+
+
+ else if(args[0].equals("filter"))
+ {
+ filter(args, params);
+ }
+
+ else if(args[0].equals("somaticFilter"))
+ {
+ somaticFilter(args, params);
+ }
+
+ else if(args[0].equals("processSomatic"))
+ {
+ processSomatic(args, params);
+ }
+
+ else if(args[0].equals("copyCaller"))
+ {
+ copyCaller(args, params);
+ }
+
+ else if(args[0].equals("compare"))
+ {
+ compare(args, params);
+ }
+
+ else if(args[0].equals("readcounts"))
+ {
+ readcounts(args, params);
+ }
+
+ else if(args[0].equals("somatic"))
+ {
+ somatic(args, params);
+ }
+
+ else if(args[0].equals("trio"))
+ {
+ trio(args, params, "CNS");
+ }
+
+ else if(args[0].equals("copynumber"))
+ {
+ copynumber(args, params);
+ }
+
+ else if(args[0].equals("limit"))
+ {
+ limit(args, params);
+ }
+ else if(args[0].equals("coverage"))
+ {
+ coverage(args, params);
+ }
+ else if(args[0].equals("test"))
+ {
+ System.err.println("Testing...");
+ try
+ {
+ RandomAccessFile ref = new RandomAccessFile("test.fasta", "r");
+ ref.seek(52);
+ byte[] buffer = new byte[5];
+ ref.read(buffer);
+ String thisBase = new String(buffer);
+ System.err.println("Got " + thisBase);
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error: Reference file: " + e.getLocalizedMessage());
+ }
+
+ }
+
+ else
+ {
+ System.err.println("Command not recognized\n" + usage);
+ }
+ }
+ else
+ {
+ System.err.println(usage);
+ }
+
+ }
+
+
+ /**
+ * Calls SNPs from a pileup file
+ *
+ * @param args Command-line arguments and parameters
+ * @param callType "SNP", "INDEL", or "CNS"
+ */
+ public static void pileup2call(String[] args, HashMap<String, String> params, String callType)
+ {
+ CallPileup pileupCall = new CallPileup(args, callType);
+ }
+
+ /**
+ * Calls SNPs from an mpileup file
+ *
+ * @param args Command-line arguments and parameters
+ * @param callType "SNP", "INDEL", or "CNS"
+ */
+ public static void mpileup2call(String[] args, HashMap<String, String> params, String callType)
+ {
+ CallMpileup mpileupCall = new CallMpileup(args, callType);
+ }
+
+ /**
+ * Obtains read counts for a list of variants
+ *
+ * @param args Command-line arguments
+ */
+ public static void readcounts(String[] args, HashMap<String, String> params)
+ {
+ ReadCounts myReadCounts = new ReadCounts(args, params);
+ }
+
+
+ /**
+ * Calls somatic/germline/LOH variants from normal and tumor pileup files
+ *
+ * @param args Command-line arguments
+ */
+ public static void somatic(String[] args, HashMap<String, String> params)
+ {
+ if(params.containsKey("mpileup"))
+ {
+ Somatic mySomatic = new Somatic(args, true);
+ }
+ else
+ {
+ Somatic mySomatic = new Somatic(args);
+ }
+
+ }
+
+ /**
+ * Calls SNPs in a father-mother-child trio from an mpileup file
+ *
+ * @param args Command-line arguments and parameters
+ * @param callType "SNP", "INDEL", or "CNS"
+ */
+ public static void trio(String[] args, HashMap<String, String> params, String callType)
+ {
+ Trio myTrio = new Trio(args, callType);
+ }
+
+
+ /**
+ * Determines tumor copy number from normal and tumor pileup files
+ *
+ * @param args Command-line arguments
+ */
+ public static void copynumber(String[] args, HashMap<String, String> params)
+ {
+ if(params.containsKey("mpileup"))
+ {
+ Copynumber myCopynumber = new Copynumber(args, true);
+ }
+ else
+ {
+ Copynumber myCopynumber = new Copynumber(args);
+ }
+
+ }
+
+
+ /**
+ * Filters variants by coverage, significance, frequency, etc.
+ *
+ * @param args Command-line arguments
+ */
+ public static void filter(String[] args, HashMap<String, String> params)
+ {
+ FilterVariants myFilter = new FilterVariants(args);
+ }
+
+ /**
+ * Filters variants by coverage, significance, frequency, etc.
+ *
+ * @param args Command-line arguments
+ */
+ public static void somaticFilter(String[] args, HashMap<String, String> params)
+ {
+ FilterSomatic myFilter = new FilterSomatic(args);
+ }
+
+ /**
+ * Splits VarScan output according to somatic status and confidence
+ *
+ * @param args Command-line arguments
+ */
+ public static void processSomatic(String[] args, HashMap<String, String> params)
+ {
+ ProcessSomatic myProcess = new ProcessSomatic(args);
+ }
+
+ /**
+ * Calls somatic copy number events from copynumber output
+ *
+ * @param args Command-line arguments
+ */
+ public static void copyCaller(String[] args, HashMap<String, String> params)
+ {
+ CopyCaller myCopy = new CopyCaller(args, params);
+ }
+
+ /**
+ * Compares two lists of positions/variants
+ *
+ * @param args Command-line arguments
+ */
+ public static void compare(String[] args, HashMap<String, String> params)
+ {
+ Comparison myComparison = new Comparison(args);
+ }
+
+
+ /**
+ * Limits pileup or variant files to a list of positions or regions
+ *
+ * @param args Command-line arguments
+ */
+ public static void limit(String[] args, HashMap<String, String> params)
+ {
+ LimitVariants myLimit = new LimitVariants(args);
+ }
+
+ /**
+ * Reports region coverage from a BAM file
+ *
+ * @param args Command-line arguments
+ */
+ public static void coverage(String[] args, HashMap<String, String> params)
+ {
+ Coverage myCoverage = new Coverage(args);
+ }
+
+
+
+ /**
+ * Parses and verifies any command-line parameters
+ *
+ * @param args Command-line arguments
+ * @return HashMap of parameter names and their values
+ */
+ static HashMap<String, String> getParams(String[] args)
+ {
+ HashMap<String, String> params = new HashMap<String, String>();
+
+ // Parse out command line arguments //
+
+ String arg = "";
+ String value = "";
+ int i = 0, j = 0;
+
+ // Go through each argument in the command line //
+
+ while (i < args.length)
+ {
+ j = i + 1;
+ arg = args[i];
+
+ // If the argument starts with a hyphen, make use of it //
+
+ if (arg.startsWith("-"))
+ {
+ // Remove leading hyphens //
+ while(arg.startsWith("-"))
+ {
+ arg = arg.replaceFirst("-", "");
+ }
+
+ // Parse out parameters followed by values //
+
+ if (i < args.length && j < args.length && !args[j].startsWith("-"))
+ {
+ value = args[j];
+ params.put(arg, value);
+ }
+
+ // Set other parameters to true //
+
+ else
+ {
+ params.put(arg, "true");
+ }
+ }
+
+ i++;
+ }
+
+ return(params);
+ }
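+
+ // Example: the argument list {"--min-coverage", "10", "--output-vcf"} yields
+ // params = {"min-coverage" -> "10", "output-vcf" -> "true"}; a parameter with
+ // no following value, or one whose value begins with "-", is set to "true". //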
+
+
+ /**
+ * Gets the input file from the command line or from piped input
+ *
+ * @param args Command-line arguments
+ * @return in BufferedReader for the input file, or for STDIN if input is piped
+ */
+ static BufferedReader getInfile(String[] args)
+ {
+ BufferedReader in = null;
+
+ try
+ {
+ // Declare file-parsing variables //
+
+ String line;
+
+ // Check for file on command line //
+
+ if(args.length > 1 && !args[1].startsWith("-"))
+ {
+ File infile = new File(args[1]);
+ if(infile.exists())
+ {
+ // Parse the infile //
+ System.err.println("Reading input from " + args[1]);
+ in = new BufferedReader(new FileReader(args[1]));
+ }
+ else
+ {
+ System.err.println("File not found: " + args[1] + "\n");
+ System.exit(10);
+ }
+ }
+
+ // If no file from command line was parsed, try for piped input //
+
+ if(in == null)
+ {
+ // Check the input stream //
+ InputStreamReader instream = new InputStreamReader(System.in);
+ Thread.sleep(1000);
+
+ int num_naps = 0;
+
+ while(!instream.ready())
+ {
+ System.err.println("Input stream not ready, waiting for 5 seconds...");
+ Thread.sleep(5000);
+ num_naps++;
+
+ if(num_naps >= 100)
+ {
+ System.err.println("ERROR: Gave up waiting after 500 seconds...\n");
+ System.exit(10);
+ }
+ }
+
+ // If we have piped input, proceed with it //
+
+ if(instream.ready())
+ {
+ System.err.println("Reading input from STDIN");
+ in = new BufferedReader(instream);
+ }
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("ERROR: Unable to open input stream\n");
+ System.exit(10);
+ }
+
+ return(in);
+ }
+
+
+
+ /**
+ * Counts the number, quality, and strands of each allele from a pileup
+ *
+ * @param refBase Reference base at this position
+ * @param readBases String of read bases from pileup
+ * @param readQuals String of read base qualities from pileup
+ * @param minAvgQual Minimum required base quality to count a base
+ * @param mapQuals String of read mapping qualities from pileup
+ * @return results HashMap<String, String> of results for each allele
+ */
+ static HashMap<String, String> getReadCounts(String refBase, String readBases, String readQuals, int minAvgQual, String mapQuals)
+ {
+ HashMap<String, Integer> readCounts = new HashMap<String, Integer>();
+ HashMap<String, Integer> readCountsPlus = new HashMap<String, Integer>();
+ HashMap<String, Integer> readCountsMinus = new HashMap<String, Integer>();
+ HashMap<String, Integer> qualitySum = new HashMap<String, Integer>();
+ HashMap<String, Integer> mapQualitySum = new HashMap<String, Integer>();
+ HashMap<String, String> strandsSeen = new HashMap<String, String>();
+
+ int reads1 = 0;
+ int reads1indel = 0;
+ String readBase = "";
+ String prevBase = "";
+ String nextBase = "";
+ int baseQuality = 0;
+ int prevBaseQuality = 0;
+ int mapQuality = 1;
+ String strand = "";
+
+ String[] arrBases = readBases.split("");
+// char[] arrBases = readBases.toCharArray();
+ char[] arrQualities = readQuals.toCharArray();
+ char[] mapQualities = mapQuals.toCharArray();
+
+ // Set booleans for read Start //
+
+ boolean readStart = false;
+
+ // Set quality position offset //
+ int j = 0;
+
+ // Go through each base //
+
+ for(int i = 0; i < arrBases.length; i++)
+ {
+ readBase = arrBases[i];
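+ // Note: under pre-Java-8 semantics, split("") yields a leading empty string //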
+ if(i == 0 && readBase.length() == 0)
+ {
+ i++;
+ readBase = arrBases[i];
+ }
+
+ // Record previous and next base //
+ prevBase = "";
+ if(i > 1 && i < (arrBases.length - 1))
+ prevBase = arrBases[i - 1];
+
+ if(j > 1 && j < (arrQualities.length - 1))
+ prevBaseQuality = arrQualities[j - 1] - 33;
+
+ nextBase = "";
+ if(i < (arrBases.length - 1))
+ nextBase = arrBases[i + 1];
+
+ // Get the quality score //
+ if(j < arrQualities.length)
+ baseQuality = arrQualities[j] - 33;
+
+ // Get the map quality score //
+ if(j < mapQualities.length)
+ mapQuality = mapQualities[j] - 33;
+
+// System.err.println("Got " + readBase + " with quality " + arrQualities[j] + "=" + baseQuality + " Next " + nextBase);
+ // A period or comma NOT followed by indel represents a reference base //
+ if((readBase.equals(".") || readBase.equals(",")) && !(nextBase.equals("-") || nextBase.equals("+")))
+ {
+ strand = "+";
+ if(readBase.equals(","))
+ strand = "-";
+
+ if(baseQuality >= minAvgQual)
+ {
+ reads1++;
+
+ // Count the strands seen //
+
+ if(strandsSeen.containsKey("ref"))
+ {
+ String alreadySeen = strandsSeen.get("ref");
+ if(!(alreadySeen.length() >= 2 || alreadySeen.equals(strand)))
+ {
+ strandsSeen.put("ref", (strandsSeen.get("ref") + strand));
+ }
+ }
+ else
+ {
+ strandsSeen.put("ref", strand);
+ }
+
+ // Count strand-based read count //
+ if(strand.equals("+"))
+ {
+ // Plus strand //
+ if(readCountsPlus.containsKey("ref"))
+ {
+ readCountsPlus.put("ref", (readCountsPlus.get("ref") + 1));
+ }
+ else
+ {
+ readCountsPlus.put("ref", 1);
+ }
+ }
+ else
+ {
+ // Minus Strand //
+ if(readCountsMinus.containsKey("ref"))
+ {
+ readCountsMinus.put("ref", (readCountsMinus.get("ref") + 1));
+ }
+ else
+ {
+ readCountsMinus.put("ref", 1);
+ }
+ }
+
+ // Add the quality to the sum //
+
+ if(qualitySum.containsKey("ref"))
+ {
+ qualitySum.put("ref", (qualitySum.get("ref") + baseQuality));
+ mapQualitySum.put("ref", (mapQualitySum.get("ref") + mapQuality));
+ }
+ else
+ {
+ qualitySum.put("ref", baseQuality);
+ mapQualitySum.put("ref", mapQuality);
+ }
+ }
+
+ j++;
+
+ readStart = false;
+ }
+ // SNP Processing //
+ else if(readBase.toUpperCase().equals("A") || readBase.toUpperCase().equals("C") || readBase.toUpperCase().equals("G") || readBase.toUpperCase().equals("T"))
+ {
+ strand = "+";
+
+ if(readBase.equals("a") || readBase.equals("c") || readBase.equals("g") || readBase.equals("t"))
+ strand = "-";
+
+ readBase = readBase.toUpperCase();
+
+ // Check that we're not at start or end of read //
+ if(baseQuality >= minAvgQual)// && !readStart && !nextBase.equals("$"))
+ {
+ // Count the read //
+ if(readCounts.containsKey(readBase))
+ {
+ readCounts.put(readBase, (readCounts.get(readBase) + 1));
+ }
+ else
+ {
+ readCounts.put(readBase, 1);
+ }
+
+ // Count strand-based read count //
+ if(strand.equals("+"))
+ {
+ // Plus strand //
+ if(readCountsPlus.containsKey(readBase))
+ {
+ readCountsPlus.put(readBase, (readCountsPlus.get(readBase) + 1));
+ }
+ else
+ {
+ readCountsPlus.put(readBase, 1);
+ }
+ }
+ else
+ {
+ // Minus Strand //
+ if(readCountsMinus.containsKey(readBase))
+ {
+ readCountsMinus.put(readBase, (readCountsMinus.get(readBase) + 1));
+ }
+ else
+ {
+ readCountsMinus.put(readBase, 1);
+ }
+ }
+
+ // Count the strands seen //
+
+ if(strandsSeen.containsKey(readBase))
+ {
+ String alreadySeen = strandsSeen.get(readBase);
+ if(!(alreadySeen.length() >= 2 || alreadySeen.equals(strand)))
+ {
+ strandsSeen.put(readBase, (strandsSeen.get(readBase) + strand));
+ }
+ }
+ else
+ {
+ strandsSeen.put(readBase, strand);
+ }
+
+ if(qualitySum.containsKey(readBase))
+ {
+ qualitySum.put(readBase, (qualitySum.get(readBase) + baseQuality));
+ mapQualitySum.put(readBase, (mapQualitySum.get(readBase) + mapQuality));
+ }
+ else
+ {
+ qualitySum.put(readBase, baseQuality);
+ mapQualitySum.put(readBase, mapQuality);
+ }
+ }
+ else
+ {
+ // Base did not meet quality //
+// System.err.println("Low quality base: " + readBase + " " + baseQuality);
+ }
+
+ j++;
+ readStart = false;
+ }
+ // INDEL Processing //
+ else if(readBase.equals("+") || readBase.equals("-"))
+ {
+ String indelType = "";
+
+ if(readBase.equals("+"))
+ {
+ indelType = "INS";
+ }
+ else
+ {
+ indelType = "DEL";
+ }
+
+ // If the previous base was a reference, count this read as reference but with indel //
+
+ if(prevBase.equals(".") || prevBase.equals(","))
+ {
+ if(prevBaseQuality >= minAvgQual)
+ reads1indel++;
+ }
+
+ // Get deletion size and bases //
+ int indel_size = 0;
+ int max_parse = 1;
+ String indelBases = "";
+ try {
+ String stringWithSize = arrBases[i + 1] + arrBases[i + 2] + arrBases[i + 3];
+ stringWithSize = stringWithSize.replaceAll("[^0-9]", "");
+ indel_size = Integer.parseInt(stringWithSize);
+ max_parse = indel_size + Integer.toString(indel_size).length();
+
+ for(int bases_parsed = 0; bases_parsed < max_parse; bases_parsed++)
+ {
+ String thisBase = arrBases[i + 1 + bases_parsed];
+ try {
+ Integer.parseInt(thisBase); // Try to parse an integer from this string, which would be part of indel size
+ }
+ catch (Exception e)
+ {
+ // If no integer, count it.
+ if(thisBase.equals(".") || thisBase.equals(","))
+ bases_parsed = max_parse;
+ else if (thisBase.toUpperCase().equals("A") || thisBase.toUpperCase().equals("C") || thisBase.toUpperCase().equals("G") || thisBase.toUpperCase().equals("T") || thisBase.toUpperCase().equals("N"))
+ indelBases += thisBase;
+ }
+ //indelBases += arrBases[i + 3 + bases_parsed];
+ }
+ // Adjust i to beyond this indel //
+ i = i + max_parse;
+ }
+ catch (Exception e)
+ {
+ indel_size = Integer.parseInt(arrBases[i + 1]);
+ for(int bases_parsed = 0; bases_parsed < indel_size; bases_parsed++)
+ {
+ indelBases += arrBases[i + 2 + bases_parsed];
+ }
+ // Adjust i to beyond this indel //
+ i = i + 1 + indel_size;
+ }
+
+ // Determine strand //
+ if(indelBases.equals(indelBases.toUpperCase()))
+ {
+ strand = "+";
+ }
+ else
+ {
+ strand = "-";
+ }
+
+ // Correct case of alleles //
+ indelBases = indelBases.toUpperCase();
+
+ // Build an indel key //
+
+ String indelKey = indelType + "-" + indel_size + "-" + indelBases;
+
+ // Count the read //
+ if(readCounts.containsKey(indelKey))
+ {
+ readCounts.put(indelKey, (readCounts.get(indelKey) + 1));
+ }
+ else
+ {
+ readCounts.put(indelKey, 1);
+ }
+
+ // Count strand-based read count //
+ if(strand.equals("+"))
+ {
+ // Plus strand //
+ if(readCountsPlus.containsKey(indelKey))
+ {
+ readCountsPlus.put(indelKey, (readCountsPlus.get(indelKey) + 1));
+ }
+ else
+ {
+ readCountsPlus.put(indelKey, 1);
+ }
+ }
+ else
+ {
+ // Minus Strand //
+ if(readCountsMinus.containsKey(indelKey))
+ {
+ readCountsMinus.put(indelKey, (readCountsMinus.get(indelKey) + 1));
+ }
+ else
+ {
+ readCountsMinus.put(indelKey, 1);
+ }
+ }
+
+ // Count the strands seen //
+
+ if(strandsSeen.containsKey(indelKey))
+ {
+ String alreadySeen = strandsSeen.get(indelKey);
+ if(!(alreadySeen.length() >= 2 || alreadySeen.equals(strand)))
+ {
+ strandsSeen.put(indelKey, (strandsSeen.get(indelKey) + strand));
+ }
+ }
+ else
+ {
+ strandsSeen.put(indelKey, strand);
+ }
+
+ if(j < arrQualities.length)
+ {
+ baseQuality = arrQualities[j] - 33;
+ j++;
+ }
+ if(j < mapQualities.length)
+ mapQuality = mapQualities[j] - 33;
+
+ if(qualitySum.containsKey(indelKey))
+ {
+ qualitySum.put(indelKey, (qualitySum.get(indelKey) + baseQuality));
+ mapQualitySum.put(indelKey, (mapQualitySum.get(indelKey) + mapQuality));
+ }
+ else
+ {
+ qualitySum.put(indelKey, baseQuality);
+ mapQualitySum.put(indelKey, mapQuality);
+ }
+
+ readStart = false;
+ }
+ else if(readBase.toUpperCase().equals("N"))
+ {
+ // Ignore the base, but keep moving forward for qualities //
+ j++;
+ }
+ else if(readBase.equals("^"))
+ {
+ // Read start - skip the next base, which is mapping quality //
+ i++;
+ readStart = true;
+ }
+ else if(readBase.equals("$"))
+ {
+ // End of read //
+// i++;
+ readStart = false;
+
+ }
+ else
+ {
+ if(readBase.equals(".") || readBase.equals(","))
+ {
+ // This is the reference base that precedes an indel. Don't advance quality //
+ }
+ else
+ {
+ // Ignore characters like * which indicates a pad //
+ j++;
+ }
+
+ }
+ }
+
+ // Declare results hash //
+ HashMap<String, String> results = new HashMap<String, String>();
+
+ // Get ref base read counts //
+
+ int strands1 = 0;
+ if(strandsSeen.containsKey("ref"))
+ strands1 = strandsSeen.get("ref").length();
+
+ // Get average quality //
+
+ int avgQual1 = 0;
+ if(reads1 > 0)
+ avgQual1 = qualitySum.get("ref") / reads1;
+
+ // Get average map quality //
+
+ int avgMapQual1 = 0;
+ if(reads1 > 0)
+ avgMapQual1 = mapQualitySum.get("ref") / reads1;
+
+ // Get strand-specific read counts //
+ int reads1plus = 0;
+ int reads1minus = 0;
+ if(readCountsPlus.containsKey("ref"))
+ reads1plus = readCountsPlus.get("ref");
+ if(readCountsMinus.containsKey("ref"))
+ reads1minus = readCountsMinus.get("ref");
+
+ // Append ref info to read counts //
+ if(reads1 < 0)
+ reads1 = 0;
+ results.put(refBase, reads1 + "\t" + strands1 + "\t" + avgQual1 + "\t" + avgMapQual1 + "\t" + reads1plus + "\t" + reads1minus + "\t" + reads1indel);
+
+ // Go through all possible variant keys //
+
+ String[] variantKeys = (String[]) readCounts.keySet().toArray(new String[0]);
+ Arrays.sort(variantKeys);
+ for(String key : variantKeys)
+ {
+ int reads2 = readCounts.get(key);
+
+ // Get strand-specific read counts //
+ int reads2plus = 0;
+ int reads2minus = 0;
+ if(readCountsPlus.containsKey(key))
+ reads2plus = readCountsPlus.get(key);
+ if(readCountsMinus.containsKey(key))
+ reads2minus = readCountsMinus.get(key);
+
+ // Count number of variant-supporting strands //
+
+ int strands2 = 0;
+ if(strandsSeen.containsKey(key))
+ strands2 = strandsSeen.get(key).length();
+
+ // Get average quality //
+
+ int avg_qual2 = qualitySum.get(key) / reads2;
+
+ // Get average mapping quality //
+
+ int avg_map_qual2 = mapQualitySum.get(key) / reads2;
+
+ if(reads2 > 0)
+ {
+// System.err.println("Saving " + key + ": " + reads2 + "\t" + strands2 + "\t" + avg_qual2 + "\t" + avg_map_qual2 + "\t" + reads2plus + "\t" + reads2minus);
+ results.put(key, reads2 + "\t" + strands2 + "\t" + avg_qual2 + "\t" + avg_map_qual2 + "\t" + reads2plus + "\t" + reads2minus);
+ }
+ }
+
+ return(results);
+ }
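+
+ // Example: with refBase "A", readBases ".,.TT", readQuals "IIIII" ('I' = Phred 40),
+ // mapQuals "]]]]]" (60), and minAvgQual 15, the results map returned above holds
+ // "A" -> "3\t2\t40\t60\t2\t1\t0" (reads, strands, avg base qual, avg map qual,
+ // plus-strand reads, minus-strand reads, reads with indel) and
+ // "T" -> "2\t1\t40\t60\t2\t0". //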
+
+
+
+ /**
+ * Counts the depth of read bases meeting a minimum quality
+ *
+ * @param readQuals String of read base qualities from pileup
+ * @param minAvgQual Minimum required base quality to count a base
+ * @return qualityDepth Number of bases meeting the minimum quality
+ */
+ static int qualityDepth(String readQuals, int minAvgQual)
+ {
+ int baseQuality = 0;
+ int qualityDepth = 0;
+
+ char[] arrQualities = readQuals.toCharArray();
+
+ // Set quality position offset //
+ int j = 0;
+
+ // Go through each base //
+
+ for(j = 0; j < arrQualities.length; j++)
+ {
+ baseQuality = arrQualities[j] - 33;
+ if(baseQuality >= minAvgQual)
+ {
+ qualityDepth++;
+ }
+ }
+
+ return(qualityDepth);
+ }
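+
+ // Example: qualityDepth("I5#", 15) returns 2, since 'I' (Phred 40) and
+ // '5' (Phred 20) pass the threshold but '#' (Phred 2) does not. //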
+
+ /**
+ * Makes the base call (SNP, indel, or consensus) based on read counts
+ *
+ * @param refBase Reference base at this position
+ * @param readCounts HashMap of read counts for each base observed
+ * @param callType Type of call to make (SNP, indel, or consensus)
+ * @param minReads2 Minimum number of supporting reads to call a variant
+ * @param minVarFreq Minimum observed variant frequency to call a variant
+ * @param minAvgQual Minimum required base quality to count a base
+ * @param pValueThreshold Significance threshold below which variants will be called
+ * @param minFreqForHom Minimum variant allele frequency to call a homozygote
+ * @return call The base call made at this position
+ */
+ static String callPosition(String refBase, HashMap<String, String> readCounts, String callType, int minReads2, double minVarFreq, int minAvgQual, double pValueThreshold, double minFreqForHom)
+ {
+ String callResult = "";
+ DecimalFormat df = new DecimalFormat("###.##");
+
+ int reads1 = 0;
+ int reads2 = 0;
+ int readsWithIndels = 0;
+ int strands1 = 0;
+ int strands2 = 0;
+ int avgQual1 = 0;
+ int avgQual2 = 0;
+ int avgMap1 = 0; // Average mapping quality of reference-supporting reads
+ int avgMap2 = 0; // Average mapping quality of variant-supporting reads
+ int reads1indel = 0; // Reference-supporting reads that contain indel at next base
+ int reads1plus = 0; // Reference-supporting reads on plus strand
+ int reads1minus = 0; // Reference-supporting reads on minus strand
+ int reads2plus = 0; // Variant-supporting reads on plus strand
+ int reads2minus = 0; // Variant-supporting reads on minus strand
+ double pValue = 1;
+ double varFreq = 0.00;
+ String varAllele = "";
+
+ try
+ {
+ if(readCounts.containsKey(refBase))
+ {
+ try
+ {
+ String[] refBaseContents = readCounts.get(refBase).split("\t");
+ reads1 = Integer.parseInt(refBaseContents[0]);
+ strands1 = Integer.parseInt(refBaseContents[1]);
+ avgQual1 = Integer.parseInt(refBaseContents[2]);
+ avgMap1 = Integer.parseInt(refBaseContents[3]);
+ reads1plus = Integer.parseInt(refBaseContents[4]);
+ reads1minus = Integer.parseInt(refBaseContents[5]);
+
+ if(refBaseContents.length > 6)
+ reads1indel = Integer.parseInt(refBaseContents[6]);
+ }
+ catch(Exception e)
+ {
+ System.err.println("Error parsing refBase readcounts from " + readCounts.get(refBase));
+ }
+ }
+
+ String[] alleleKeys = (String[]) readCounts.keySet().toArray(new String[0]);
+
+ Arrays.sort(alleleKeys);
+
+ // Get the total number of reads at this position //
+ int totalReadCounts = 0;
+ for(String allele : alleleKeys)
+ {
+ String[] alleleContents = readCounts.get(allele).split("\t");
+ try {
+ int thisReads = Integer.parseInt(alleleContents[0]);
+ totalReadCounts += thisReads;
+ }
+ catch(Exception e)
+ {
+ }
+ }
+
+ for(String allele : alleleKeys)
+ {
+ String[] alleleContents = readCounts.get(allele).split("\t");
+
+ if(allele.equals(refBase))
+ {
+ // Skip the reference base; we got that already //
+ }
+ else
+ {
+ // Reset variables //
+
+ int thisReads1 = reads1;
+ int thisReads2 = 0;
+ int thisStrands2 = 0;
+ int thisAvgQual2 = 0;
+ int thisAvgMap2 = 0;
+ int thisReads2plus = 0;
+ int thisReads2minus = 0;
+
+ // Parse the information //
+
+ try {
+ thisReads2 = Integer.parseInt(alleleContents[0]);
+ thisStrands2 = Integer.parseInt(alleleContents[1]);
+ thisAvgQual2 = Integer.parseInt(alleleContents[2]);
+ thisAvgMap2 = Integer.parseInt(alleleContents[3]);
+ thisReads2plus = Integer.parseInt(alleleContents[4]);
+ thisReads2minus = Integer.parseInt(alleleContents[5]);
+ // If this is an indel, make note of it //
+
+ if(allele.contains("INS") || allele.contains("DEL"))
+ {
+ readsWithIndels += thisReads2;
+ }
+ }
+ catch (Exception e)
+ {
+
+ }
+
+
+ if(!callType.equals("CNS") || thisReads2 > reads2)
+ {
+ //double thisVarFreq = (double) thisReads2 / (double) (reads1 + thisReads2);
+ double thisVarFreq = (double) thisReads2 / (double) totalReadCounts;
+ double thisPvalue = 1;
+ // For indels, adjust the read1 count //
+ if(allele.contains("INS") || allele.contains("DEL"))
+ {
+ //System.err.println(allele + " gets " + thisReads2 + " " + thisVarFreq);
+ // Adjust the reads1 counts which include reads supporting indels //
+// thisReads1 = reads1 - reads1indel;
+// if(thisReads1 < 0)
+// thisReads1 = 0;
+
+// thisVarFreq = (double) thisReads2 / (double) (thisReads1 + thisReads2);
+
+ // Correct for indel-containing reads, but ensure we don't overcorrect //
+ int thisTotalReadCounts = totalReadCounts - reads1indel;
+ if(thisTotalReadCounts < thisReads2)
+ thisTotalReadCounts = thisReads2;
+
+ // Compute new variant allele frequency from the adjusted denominator //
+ thisVarFreq = (double) thisReads2 / (double) thisTotalReadCounts;
+ }
+
+ // Calculate the p-value; at the default threshold (0.99), skip the
+ // Fisher's exact test and assign a p-value that always passes //
+ if(pValueThreshold == 0.99)
+ {
+ thisPvalue = 0.98;
+ }
+ else
+ {
+ thisPvalue = getSignificance(reads1, thisReads2);
+ }
+
+
+ // Save the most frequent variant allele, even if we won't use it //
+ if(thisReads2 > reads2 && thisAvgQual2 >= minAvgQual)
+ {
+// System.err.println(allele + " passed with " + thisReads2);
+ if(allele.contains("INS") || allele.contains("DEL"))
+ {
+ varAllele = getShortIndel(allele);
+ }
+ else
+ {
+ varAllele = allele;
+ }
+
+ reads2 = thisReads2;
+ strands2 = thisStrands2;
+ avgQual2 = thisAvgQual2;
+ avgMap2 = thisAvgMap2;
+ reads2plus = thisReads2plus;
+ reads2minus = thisReads2minus;
+ varFreq = thisVarFreq * 100;
+ pValue = thisPvalue;
+ }
+ else
+ {
+ //System.err.println(allele + " failed with " + thisReads2 + " " + thisAvgQual2);
+ }
+
+ // Call the variant if it meets calling criteria //
+
+ if(thisReads2 >= minReads2 && thisAvgQual2 >= minAvgQual && thisVarFreq >= minVarFreq)
+ {
+ thisReads1 = reads1;
+ thisVarFreq = thisVarFreq * 100;
+
+ // Determine type of variant //
+ String thisVarType = "SNP";
+ if(allele.contains("INS") || allele.contains("DEL"))
+ {
+ thisVarType = "INDEL";
+ thisReads1 = reads1;
+ if(thisReads1 < 0)
+ thisReads1 = 0;
+ // Change allele to short indel version //
+ allele = getShortIndel(allele);
+ }
+
+ if(thisPvalue <= pValueThreshold)
+ {
+ // Call the variant if we're variant calling //
+ if(callType.equals("SNP") || callType.equals("INDEL"))
+ {
+ reads2 = thisReads2;
+ strands2 = thisStrands2;
+ avgQual2 = thisAvgQual2;
+ avgMap2 = thisAvgMap2;
+ reads2plus = thisReads2plus;
+ reads2minus = thisReads2minus;
+ pValue = thisPvalue;
+
+ // Convert to consensus-like genotype //
+
+ String genotype = "";
+ if(thisVarFreq >= (minFreqForHom * 100))
+ {
+ genotype = allele + allele;
+ if(thisVarType.equals("INDEL"))
+ genotype = allele + "/" + allele;
+ }
+ else
+ {
+ genotype = refBase + allele;
+ if(thisVarType.equals("INDEL"))
+ genotype = "*/" + allele;
+ }
+
+ // Only report the desired variant type //
+
+ if(thisVarType.equals(callType))
+ {
+ // Report the variant regardless //
+ if(callResult.length() > 0)
+ callResult += "\n";
+
+ if(thisReads1 < 0)
+ thisReads1 = 0;
+
+ if(reads2 < 0)
+ reads2 = 0;
+
+ //callResult += allele + "\t" + reads1 + "\t" + reads2 + "\t" + df.format(thisVarFreq) + "%\t" + strands1 + "\t" + strands2 + "\t" + avgQual1 + "\t" + avgQual2 + "\t" + pValue;
+ callResult += genotypeToCode(genotype) + "\t" + thisReads1 + "\t" + reads2 + "\t" + df.format(thisVarFreq) + "%\t" + strands1 + "\t" + strands2 + "\t" + avgQual1 + "\t" + avgQual2 + "\t" + pValue;
+ callResult += "\t" + avgMap1 + "\t" + avgMap2;
+ callResult += "\t" + reads1plus + "\t" + reads1minus + "\t" + reads2plus + "\t" + reads2minus + "\t" + varAllele;
+ }
+
+ }
+ else if(callType.equals("CNS") && thisReads2 >= reads2)
+ {
+ reads2 = thisReads2;
+ strands2 = thisStrands2;
+ avgQual2 = thisAvgQual2;
+ avgMap2 = thisAvgMap2;
+ reads2plus = thisReads2plus;
+ reads2minus = thisReads2minus;
+ pValue = thisPvalue;
+
+ String genotype = "";
+ if(thisVarFreq >= (minFreqForHom * 100))
+ {
+ genotype = allele + allele;
+ if(thisVarType.equals("INDEL"))
+ genotype = allele + "/" + allele;
+ }
+ else
+ {
+ genotype = refBase + allele;
+ if(thisVarType.equals("INDEL"))
+ genotype = "*/" + allele;
+ }
+
+ callResult = genotypeToCode(genotype) + "\t" + thisReads1 + "\t" + reads2 + "\t" + df.format(thisVarFreq) + "%\t" + strands1 + "\t" + strands2 + "\t" + avgQual1 + "\t" + avgQual2 + "\t" + pValue;
+ callResult += "\t" + avgMap1 + "\t" + avgMap2;
+ callResult += "\t" + reads1plus + "\t" + reads1minus + "\t" + reads2plus + "\t" + reads2minus + "\t" + varAllele;
+ }
+
+ }
+ else
+ {
+ // P-value exceeded the calling threshold; no call made //
+
+ }
+ }
+ else
+ {
+ // Did not meet the reads2, variant allele frequency, or base quality thresholds //
+ }
+
+ }
+
+
+
+ }
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Read Counts Exception: " + e.getLocalizedMessage());
+ e.printStackTrace(System.err);
+ }
+
+ // If we must have a call result for CNS calling, decide on reference or NO call //
+ if(callResult.length() == 0 && callType.equals("CNS"))
+ {
+
+ if(reads1 > 0 && reads1 > minReads2)
+ {
+ // Call reference because enough reads supporting ref base were observed //
+ callResult = refBase + "\t" + reads1 + "\t" + reads2 + "\t" + df.format(varFreq) + "%\t" + strands1 + "\t" + strands2 + "\t" + avgQual1 + "\t" + avgQual2 + "\t" + pValue;
+ callResult += "\t" + avgMap1 + "\t" + avgMap2;
+ callResult += "\t" + reads1plus + "\t" + reads1minus + "\t" + reads2plus + "\t" + reads2minus + "\t" + varAllele;
+ }
+ else
+ {
+ callResult = "N" + "\t" + reads1 + "\t" + reads2 + "\t" + df.format(varFreq) + "%\t" + strands1 + "\t" + strands2 + "\t" + avgQual1 + "\t" + avgQual2 + "\t" + pValue;
+ callResult += "\t" + avgMap1 + "\t" + avgMap2;
+ callResult += "\t" + reads1plus + "\t" + reads1minus + "\t" + reads2plus + "\t" + reads2minus + "\t" + varAllele;
+ }
+
+ }
+
+ return(callResult);
+ }
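+
+ // The call string built above is tab-delimited: consensus code, reads1, reads2,
+ // variant frequency, strands1, strands2, avg qual1, avg qual2, p-value,
+ // map qual1, map qual2, reads1 plus, reads1 minus, reads2 plus, reads2 minus,
+ // variant allele. //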
+
+
+ /**
+ * Evaluates strand bias in variant allele read counts according to provided p-value threshold
+ *
+ * @param reads1plus Number of reference-supporting reads on + strand
+ * @param reads1minus Number of reference-supporting reads on - strand
+ * @param reads2plus Number of variant-supporting reads on + strand
+ * @param reads2minus Number of variant-supporting reads on - strand
+ * @param strandPvalueThreshold P-value threshold below which variant fails strand filter
+ * @return call A string with strand filter status, counts, p-value
+ */
+ static String strandFilter(int reads1plus, int reads1minus, int reads2plus, int reads2minus, double strandPvalueThreshold)
+ {
+ DecimalFormat pvalueFormat = new DecimalFormat("0.####E0");
+ String strandFilterStatus = "Pass:" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus + ":" + 1;
+ double refStrandPlus = 0.50;
+ double varStrandPlus = 0.50;
+ double strandPvalue = 1.00;
+
+ // Calculate strandedness for variant allele //
+
+ if((reads2plus + reads2minus) > 0)
+ varStrandPlus = (double) reads2plus / (double) (reads2plus + reads2minus);
+
+ // To save time, only calculate p-value if var strandedness is biased //
+
+ if(varStrandPlus < 0.10 || varStrandPlus > 0.90)
+ {
+ // Calculate strandedness for reference allele if we have 2+ reads //
+
+ if((reads1plus + reads1minus) > 1)
+ {
+ refStrandPlus = (double) reads1plus / (double) (reads1plus + reads1minus);
+ strandPvalue = VarScan.getSignificance(reads1plus, reads1minus, reads2plus, reads2minus);
+ }
+ // Otherwise, only homozygous-variant reads seen, so compare to a 50/50 distribution //
+ else
+ {
+ // Compare to expected 50/50 distribution //
+ int testReads1plus = (int) (reads2plus + reads2minus) / 2;
+ int testReads1minus = (reads2plus + reads2minus) - testReads1plus;
+ strandPvalue = VarScan.getSignificance(testReads1plus, testReads1minus, reads2plus, reads2minus);
+ }
+
+ strandFilterStatus = "Pass:" + varStrandPlus + ":" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus + ":" + pvalueFormat.format(strandPvalue);
+
+ // If ref allele had good strandedness, and var allele did not, this may be a failure //
+ if(refStrandPlus >= 0.10 && refStrandPlus <= 0.90 && !(varStrandPlus >= 0.10 && varStrandPlus <= 0.90))
+ {
+ if(strandPvalue < strandPvalueThreshold)
+ {
+ strandFilterStatus = "Fail:" + reads1plus + ":" + reads1minus + ":" + reads2plus + ":" + reads2minus + ":" + pvalueFormat.format(strandPvalue);
+ }
+ }
+ }
+
+ return(strandFilterStatus);
+ }
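+
+ // Example: reads2plus = 0, reads2minus = 10 gives varStrandPlus = 0.0, which
+ // triggers the Fisher's exact test; with reads1plus = 12 and reads1minus = 10
+ // (refStrandPlus ~ 0.55), the variant fails the filter if the resulting
+ // p-value falls below strandPvalueThreshold. //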
+
+
+ /**
+ * Calculates significance of read counts versus baseline error
+ *
+ * @param obsReads1 Reads supporting allele 1
+ * @param obsReads2 Reads supporting allele 2
+ * @return p-value P-value from Fisher's Exact Test
+ */
+ public static double getSignificance(int obsReads1, int obsReads2)
+ {
+ double pValue = 1;
+ double baseline_error = 0.001;
+
+ int coverage = obsReads1 + obsReads2;
+
+ int expReads2 = (int) (coverage * baseline_error);
+ int expReads1 = coverage - expReads2;
+
+ pValue = getSignificance(expReads1, expReads2, obsReads1, obsReads2);
+ return(pValue);
+ }
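+
+ // Example: getSignificance(95, 5) compares the observed 95/5 split to an
+ // expected 100/0 split, because (int) (100 * 0.001) truncates to zero
+ // expected variant reads at 100x coverage. //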
+
+
+ /**
+ * Calculates significance of read counts between two samples
+ *
+ * @param expReads1 Reads supporting allele 1 (expected)
+ * @param expReads2 Reads supporting allele 2 (expected)
+ * @param obsReads1 Reads supporting allele 1 (observed)
+ * @param obsReads2 Reads supporting allele 2 (observed)
+ * @return p-value P-value from Fisher's Exact Test
+ */
+ public static double getSignificance(int expReads1, int expReads2, int obsReads1, int obsReads2)
+ {
+ double pValue = 1;
+
+ if(expReads1 < 0)
+ expReads1 = 0;
+
+ if(expReads2 < 0)
+ expReads2 = 0;
+
+ if(obsReads1 < 0)
+ obsReads1 = 0;
+
+ if(obsReads2 < 0)
+ obsReads2 = 0;
+
+ // Set up fisher's exact test //
+
+ FishersExact fisher = new FishersExact(expReads1 + expReads2 + obsReads1 + obsReads2 + 100);
+
+ // Calculate a p-value //
+
+ pValue = fisher.getRightTailedP(expReads1, expReads2, obsReads1, obsReads2);
+ int fisher_max = 1000;
+ int num_tries = 0;
+
+ while(Double.isNaN(pValue) && num_tries < 10)
+ {
+ fisher = new FishersExact(expReads1 + expReads2 + obsReads1 + obsReads2 + fisher_max);
+ //pValue = fisher.getTwoTailedP(expReads1, expReads2, obsReads1, obsReads2);
+ pValue = fisher.getRightTailedP(expReads1, expReads2, obsReads1, obsReads2);
+ fisher_max = fisher_max + 1000;
+ num_tries++;
+ }
+
+ if(num_tries >= 10)
+ System.err.println("Warning: unable to calculate p-value failure: " + expReads1 + "," + expReads2 + "," + obsReads1 + "," + obsReads2);
+
+ // If p-value is 1, do left-sided test //
+
+ if(pValue >= 0.999)
+ {
+ pValue = fisher.getLeftTailedP(expReads1, expReads2, obsReads1, obsReads2);
+
+ num_tries = 0;
+ while(Double.isNaN(pValue) && num_tries < 10)
+ {
+ fisher = new FishersExact(expReads1 + expReads2 + obsReads1 + obsReads2 + fisher_max);
+ pValue = fisher.getLeftTailedP(expReads1, expReads2, obsReads1, obsReads2);
+ fisher_max = fisher_max + 1000;
+ num_tries++;
+ }
+ }
+
+ return(pValue);
+ }
+
+
+ /**
+ * Converts from two-allele genotype to IUPAC code
+ *
+ * @param gt A two-allele genotype, e.g. "AG"
+ * @return code The IUPAC ambiguity code for the genotype
+ */
+ static String genotypeToCode(String gt)
+ {
+ if(gt.equals("AA"))
+ return("A");
+
+ if(gt.equals("CC"))
+ return("C");
+
+ if(gt.equals("GG"))
+ return("G");
+
+ if(gt.equals("TT"))
+ return("T");
+
+ if(gt.equals("AC") || gt.equals("CA"))
+ return("M");
+
+ if(gt.equals("AG") || gt.equals("GA"))
+ return("R");
+
+ if(gt.equals("AT") || gt.equals("TA"))
+ return("W");
+
+ if(gt.equals("CG") || gt.equals("GC"))
+ return("S");
+
+ if(gt.equals("CT") || gt.equals("TC"))
+ return("Y");
+
+ if(gt.equals("GT") || gt.equals("TG"))
+ return("K");
+
+ if(gt.substring(0, 1).equals("N"))
+ return("N");
+
+ return(gt);
+ }
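+
+ // Example: genotypeToCode("AG") and genotypeToCode("GA") both return "R";
+ // genotypes with no matching code, such as "*/+CT", are returned unchanged. //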
+
+
+ /**
+ * Gets variant allele from ref base and consensus call
+ *
+ * @param refBase Reference base at this position
+ * @param consCode Consensus call (IUPAC code or a1/a2 genotype)
+ * @return varAllele The variant allele implied by the consensus call
+ */
+ static String getVarAllele(String refBase, String consCode)
+ {
+ String varAllele = consCode;
+
+ if(consCode.contains("/"))
+ {
+ String[] tempArray = consCode.split("/");
+ if(tempArray.length > 1)
+ varAllele = tempArray[1];
+ }
+ else if(consCode.equals("M") || consCode.equals("R") || consCode.equals("W") || consCode.equals("S") || consCode.equals("Y") || consCode.equals("K"))
+ {
+ if(consCode.equals("M"))
+ {
+ if(refBase.equals("A"))
+ varAllele = "C";
+ else
+ varAllele = "A";
+ }
+
+ else if(consCode.equals("R"))
+ {
+ if(refBase.equals("A"))
+ varAllele = "G";
+ else
+ varAllele = "A";
+ }
+
+ else if(consCode.equals("W"))
+ {
+ if(refBase.equals("A"))
+ varAllele = "T";
+ else
+ varAllele = "A";
+ }
+
+ else if(consCode.equals("S"))
+ {
+ if(refBase.equals("G"))
+ varAllele = "C";
+ else
+ varAllele = "G";
+ }
+
+ else if(consCode.equals("Y"))
+ {
+ if(refBase.equals("C"))
+ varAllele = "T";
+ else
+ varAllele = "C";
+ }
+
+ else if(consCode.equals("K"))
+ {
+ if(refBase.equals("G"))
+ varAllele = "T";
+ else
+ varAllele = "G";
+ }
+ }
+
+ return(varAllele);
+ }
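+
+ // Example: getVarAllele("A", "R") returns "G" and getVarAllele("G", "R")
+ // returns "A"; for an indel consensus like "*/+CT", the split on "/"
+ // returns "+CT". //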
+
+ /**
+ * Converts from long to short indel allele
+ *
+ * @param gt A long indel key, e.g. "INS-2-CT"
+ * @return code The short indel allele, e.g. "+CT"
+ */
+ static String getShortIndel(String gt)
+ {
+ if(gt.contains("INS") || gt.contains("DEL"))
+ {
+ try
+ {
+ String[] gtContents = gt.split("-");
+ String indel_type = gtContents[0];
+ String indel_size = gtContents[1];
+ String indel_bases = gtContents[2];
+
+ if(indel_type.contains("INS"))
+ {
+ return("+" + indel_bases);
+ }
+ else
+ {
+ return("-" + indel_bases);
+ }
+ }
+ catch(Exception e)
+ {
+ System.err.println("Warning: error generating consensus from " + gt);
+ }
+
+ }
+
+ return("N");
+ }
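+
+ // Example: getShortIndel("INS-2-CT") returns "+CT", getShortIndel("DEL-1-A")
+ // returns "-A", and any allele without "INS" or "DEL" returns "N". //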
+
+
+ /**
+ * Returns true if a variant is heterozygous
+ *
+ * @param genotype An ambiguity code or an a1/a2 genotype
+ * @return boolean True if heterozygous
+ */
+ static boolean isHeterozygous(String genotype)
+ {
+ if(genotype.contains("/"))
+ {
+ String[] alleles = genotype.split("/");
+ if(!alleles[0].equals(alleles[1]))
+ return(true);
+ }
+ else
+ {
+ if(genotype.equals("M") || genotype.equals("R") || genotype.equals("W") || genotype.equals("S") || genotype.equals("Y") || genotype.equals("K"))
+ return(true);
+ }
+
+ return(false);
+ }
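+
+ // Example: isHeterozygous("A/T") and isHeterozygous("R") return true;
+ // isHeterozygous("A/A") and isHeterozygous("A") return false. //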
+
+ /**
+ * Returns true if a variant is homozygous
+ *
+ * @param genotype An ambiguity code or an a1/a2 genotype
+ * @return boolean True if homozygous
+ */
+ static boolean isHomozygous(String genotype)
+ {
+ if(genotype.contains("/"))
+ {
+ String[] alleles = genotype.split("/");
+ if(alleles[0].equals(alleles[1]))
+ return(true);
+ }
+ else
+ {
+ if(genotype.equals("A") || genotype.equals("C") || genotype.equals("G") || genotype.equals("T"))
+ return(true);
+ }
+
+ return(false);
+ }
+
+
+ /**
+ * Converts IUPAC codes to two-allele genotypes
+ *
+ * @param code An IUPAC ambiguity code or an a1/a2 genotype
+ * @return genotype The a1/a2 genotype, e.g. "A/C" for "M"
+ */
+ static String codeToGenotype(String code)
+ {
+ if(code.equals("A") || code.equals("C") || code.equals("G") || code.equals("T"))
+ return(code + "/" + code);
+ else if(code.contains("/"))
+ {
+ return(code);
+ }
+ else if(code.equals("M"))
+ return("A/C");
+ else if(code.equals("R"))
+ return("A/G");
+ else if(code.equals("W"))
+ return("A/T");
+ else if(code.equals("S"))
+ return("C/G");
+ else if(code.equals("Y"))
+ return("C/T");
+ else if(code.equals("K"))
+ return("G/T");
+ else
+ return("N/N");
+ }
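+
+ // Example: codeToGenotype("M") returns "A/C", codeToGenotype("T") returns
+ // "T/T", and an unrecognized code such as "X" returns "N/N". //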
+
+
+
+}
+
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/varscan.git