[med-svn] [beagle] 01/06: Imported Upstream version 4.1~160616-7e4+dfsg
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Sat Jun 25 13:05:43 UTC 2016
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository beagle.
commit 5e95ccce1c9beceae9620d41a43cd60c9208b632
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Sat Jun 25 00:19:38 2016 +0200
Imported Upstream version 4.1~160616-7e4+dfsg
---
blbutil/BGZIPOutputStream.java | 237 ++++++++++++++++++++++++
blbutil/FileUtil.java | 161 +++++++++-------
blbutil/InputIt.java | 11 +-
main/CurrentData.java | 100 +++++-----
main/Main.java | 60 ++++--
main/MainHelper.java | 4 +-
main/WindowWriter.java | 227 ++++++++++++++++-------
sample/HaplotypeCoder.java | 15 +-
sample/ImputationData.java | 99 +++++-----
sample/LSHapBaum.java | 54 +++---
sample/RefHapSegs.java | 108 ++++-------
vcf/AllData.java | 4 +-
vcf/Data.java | 18 +-
vcf/R2Estimator.java | 153 ++++++++++++++++
vcf/VcfRecBuilder.java | 406 +++++++++++++++++++++++++++++++++++++++++
vcf/VcfWindow.java | 107 ++---------
vcf/VcfWriter.java | 319 ++++++++++----------------------
17 files changed, 1402 insertions(+), 681 deletions(-)
diff --git a/blbutil/BGZIPOutputStream.java b/blbutil/BGZIPOutputStream.java
new file mode 100644
index 0000000..7c80c91
--- /dev/null
+++ b/blbutil/BGZIPOutputStream.java
@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2014 Brian L. Browning
+ *
+ * This file is part of Beagle
+ *
+ * Beagle is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Beagle is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package blbutil;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.zip.CRC32;
+import java.util.zip.Deflater;
+
+/**
+ * <p>Class {@code BGZIPOutputStream} is an output stream filter that performs
+ * BGZIP compression.
+ * </p>
+ * <p>The GZIP file format specification is described
+ * <a href="https://www.ietf.org/rfc/rfc1952.txt">RFC 1952</a>
+ * and the BGZIP file format specification is described in the
+ * <a href="https://samtools.github.io/hts-specs/SAMv1.pdf">
+ * Sequence Alignment/Map Format Specification</a>
+ * </p>
+ * <p>Instances of class {@code BGZIPOutputStream} are not thread safe.
+ * </p>
+ *
+ * @author Brian L. Browning {@code <browning at uw.edu>}
+ */
+public final class BGZIPOutputStream extends OutputStream {
+
+ // overhead bytes: 26 w/compression, 31 w/o compression
+ private static final boolean USE_GZIP = true;
+ private static final int NOCOMPRESS_XTRA_BYTES = 5;
+ private static final int MAX_INPUT_BYTES = (1 << 16) - 31;
+ private static final int MAX_OUTPUT_BYTES =
+ (MAX_INPUT_BYTES + NOCOMPRESS_XTRA_BYTES);
+
+ private final boolean writeEmptyBlock;
+ private final Deflater gzipDef;
+ private final CRC32 crc;
+ private final OutputStream os;
+
+ private final byte[] input = new byte[MAX_INPUT_BYTES];
+ private final byte[] output = new byte[MAX_OUTPUT_BYTES + 1];
+
+ private int iSize = 0;
+
+ /**
+ * Applies BGZIP compression on the specified files. The filename of
+ * each compressed file will be the original filename followed by ".gz".
+ * The original files are not deleted or overwritten. The program
+ * exits with an error message if any input filename ends with ".gz".
+ *
+ * @param args a list of files that will be compressed
+ * @throws IOException if an I/O error occurs
+ */
+ public static void main(String[] args) throws IOException {
+ if (args.length == 0) {
+ Utilities.exit("usage: java -ea -jar bgzip.jar [file1] [file2] ...");
+ }
+ for (String name : args) {
+ if (name.endsWith(".gz")) {
+ Utilities.exit("ERROR: input filename ends in \".gz\"");
+ }
+ File inFile = new File(name);
+ File outFile = new File(name + ".gz");
+ try (InputStream is = new FileInputStream(inFile)) {
+ try (FileOutputStream fos = new FileOutputStream(outFile);
+ BufferedOutputStream bos = new BufferedOutputStream(fos);
+ BGZIPOutputStream os = new BGZIPOutputStream(bos, true)) {
+ byte[] ba = new byte[2000];
+ int len = is.read(ba);
+ while (len != -1) {
+ os.write(ba, 0, len);
+ len = is.read(ba);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Creates a new {@code BGZIPOutputStream} instance that writes
+ * to the specified output stream.
+ *
+ * @param os the output stream
+ * @param writeEmptyBlock {@code true} if the {@code close()} method will
+ * write an empty BGZIP block to the end of the stream
+ *
+ * @throws NullPointerException if {@code os == null}
+ */
+ public BGZIPOutputStream(OutputStream os, boolean writeEmptyBlock) {
+ if (os==null) {
+ throw new NullPointerException(OutputStream.class.toString());
+ }
+ this.writeEmptyBlock = writeEmptyBlock;
+ this.gzipDef = new Deflater(Deflater.DEFAULT_COMPRESSION, USE_GZIP);
+ this.os = os;
+ this.crc = new CRC32();
+ assert crc.getValue()==0;
+ }
+
+ private void compressAndFlushBuffer() throws IOException {
+ crc.update(input, 0, iSize);
+ int crc32 = (int) crc.getValue();
+
+ gzipDef.setInput(input, 0, iSize);
+ gzipDef.finish();
+ int len = gzipDef.deflate(output, 0, output.length, Deflater.SYNC_FLUSH);
+ if (len > MAX_OUTPUT_BYTES) {
+ len = setOutputNoCompression();
+ }
+ writeBgzipBlock(iSize, crc32, output, len, os);
+ gzipDef.reset();
+ crc.reset();
+ iSize = 0;
+ }
+
+ /* Returns the number of bytes written to the output array */
+ private int setOutputNoCompression() {
+ output[0] = 1;
+ output[1] = (byte) (iSize & 0xff);
+ output[2] = (byte) ((iSize >> 8) & 0xff);
+ output[3] = (byte) (~output[1]);
+ output[4] = (byte) (~output[2]);
+ System.arraycopy(input, 0, output, 5, iSize);
+ return iSize + 5;
+ }
+
+ private static void writeBgzipBlock(int iSize, int crc32,
+ byte[] out, int outLength, OutputStream os) throws IOException {
+ if (iSize > (1<<16)) {
+ throw new IllegalArgumentException(String.valueOf(iSize));
+ }
+ writeBGZIPHeader(outLength, os);
+ os.write(out, 0, outLength);
+ writeInt(crc32, os);
+ writeInt(iSize, os);
+ }
+
+ private static void writeBGZIPHeader(int nCompressedBytes, OutputStream os)
+ throws IOException {
+ int bsize = nCompressedBytes + 25;
+ if ((bsize >> 16) != 0) {
+ throw new IllegalArgumentException(String.valueOf(nCompressedBytes));
+ }
+ byte[] ba = new byte[]
+ { 31, (byte) 139, // GZIP_MAGIC
+ Deflater.DEFLATED, // CM
+ 4, // FLG
+ 0, 0, 0, 0, // MTIME
+ 0, // XFL
+ (byte) 255, // OS
+ 6, 0, // XLEN
+ 66, 67, // BGZIP_SUBFIELD_MAGIC
+ 2, 0, // SLEN (Subfield LENgth)
+ (byte)(bsize & 0xff),
+ (byte)((bsize >> 8) & 0xff) // BGZIP block size - 1
+ };
+ os.write(ba);
+ }
+
+ private static void writeInt(int i, OutputStream os) throws IOException {
+ os.write((byte)(i & 0xff));
+ os.write((byte)((i >> 8) & 0xff));
+ os.write((byte)((i >> 16) & 0xff));
+ os.write((byte)((i >> 24) & 0xff));
+ }
+
+ @Override
+ public void write(int b) throws IOException {
+ input[iSize++] = (byte) b;
+ if (iSize==MAX_INPUT_BYTES) {
+ compressAndFlushBuffer();
+ }
+ }
+
+ @Override
+ public void write(byte[] ba) throws IOException {
+ write(ba, 0, ba.length);
+ }
+
+ @Override
+ public void write(byte[] buf, int off, int len)
+ throws IOException {
+ int availSize = input.length - iSize;
+ while ((len - off) >= availSize) {
+ System.arraycopy(buf, off, this.input, iSize, availSize);
+ iSize += availSize;
+ off += availSize;
+ len -= availSize;
+ assert iSize==MAX_INPUT_BYTES;
+ compressAndFlushBuffer();
+ assert iSize==0;
+ availSize = this.input.length - iSize;
+ }
+ if (len>0) {
+ System.arraycopy(buf, off, this.input, iSize, len);
+ iSize += len;
+ }
+ }
+
+ @Override
+ public void flush() throws IOException {
+ compressAndFlushBuffer();
+ os.flush();
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (iSize > 0) {
+ compressAndFlushBuffer();
+ }
+ if (writeEmptyBlock) {
+ compressAndFlushBuffer();
+ }
+ os.close();
+ }
+
+}
diff --git a/blbutil/FileUtil.java b/blbutil/FileUtil.java
index 8457dfc..c9b5657 100644
--- a/blbutil/FileUtil.java
+++ b/blbutil/FileUtil.java
@@ -18,21 +18,16 @@
*/
package blbutil;
-import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
-import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.zip.GZIPOutputStream;
-import net.sf.samtools.util.BlockCompressedOutputStream;
/**
* Class {@code FileUtil} contains static methods for working with files.
@@ -46,45 +41,65 @@ public class FileUtil {
}
/**
- * Returns a buffered {@code java.io.DataInputStream} reading from the
+ * Returns an unbuffered {@code java.io.FileInputStream} reading from the
* specified file. If the input stream cannot be opened, an error message
* will be printed and the java interpreter will exit.
* @param file a file
- * @return a buffered {@code java.io.DataInputStream} reading from the
- * specified file
+ * @return an unbuffered {@code java.io.FileInputStream}
* @throws NullPointerException if {@code file == null}
*/
- public static DataInputStream dataInputStream(File file) {
- DataInputStream dis = null;
+ public static FileInputStream fileInputStream(File file) {
+ FileInputStream fis = null;
try {
- dis = new DataInputStream(new BufferedInputStream(
- new FileInputStream(file)));
+ fis = new FileInputStream(file);
} catch (FileNotFoundException e) {
Utilities.exit("Error opening " + file, e);
}
- return dis;
+ return fis;
}
/**
- * Returns a buffered {@code java.io.DataOutputStream} writing to
- * the specified file. Any existing file corresponding to the
- * {@code File} object will be deleted. If the file cannot be opened,
- * an error message will be printed and the java interpreter will exit.
+ * Returns an unbuffered {@code java.io.FileOutputStream}. If the file
+ * cannot be opened for writing, an error message will be printed and the
+ * Java Virtual Machine will exit. If the specified file exists, bytes
+ * written by the returned {@code FileOutputSream} will overwrite the
+ * previously existing file.
+ *
* @param file a file
- * @return a buffered {@code java.io.DataOutputStream} writing to
- * the specified file
+ * @return an unbuffered {@code java.io.PrintWriter}
* @throws NullPointerException if {@code file == null}
*/
- public static DataOutputStream dataOutputStream(File file) {
- OutputStream dos = null;
+ public static FileOutputStream fileOutputStream(File file) {
+ FileOutputStream fos = null;
try {
- dos = new FileOutputStream(file);
+ fos = new FileOutputStream(file);
} catch (FileNotFoundException e) {
Utilities.exit("Error opening " + file, e);
}
- DataOutputStream out = new DataOutputStream(
- new BufferedOutputStream(dos));
- return out;
+ return fos;
+ }
+
+ /**
+ * Returns an unbuffered {@code java.io.FileOutputStream}. If the file
+ * cannot be opened for writing, an error message will be printed and the
+ * Java Virtual Machine will exit. If the specified file exists and
+ * {@code append} is {@code false}, bytes written by the returned
+ * {@code PrintWriter} will overwrite the previously existing file.
+ *
+ * @param file a file
+ * @param append {@code true} if bytes will be appended to the end of
+ * the file
+ * @return an unbuffered {@code java.io.PrintWriter}
+ * @throws NullPointerException if {@code file == null}
+ */
+ public static FileOutputStream fileOutputStream(File file, boolean append) {
+ FileOutputStream fos = null;
+ try {
+ fos = new FileOutputStream(file, append);
+ } catch (FileNotFoundException e) {
+ Utilities.exit("Error opening " + file, e);
+ }
+ return fos;
}
/**
@@ -102,10 +117,10 @@ public class FileUtil {
/**
* Returns a buffered {@code java.io.PrintWriter} writing to
* the specified file. The resulting file will be compressed using
- * the GZIP compression algorithm. Any existing file corresponding
- * to the specified file will be deleted. If the file
- * cannot be opened, an error message will be printed and the
- * java interpreter will exit.
+ * the GZIP compression algorithm. If the file cannot be opened, an
+ * error message will be printed and the java interpreter will exit.
+ * If the specified file exists, bytes written by the returned
+ * {@code PrintWriter} will overwrite the previously existing file.
* @param file a file
* @return a {@code java.io.PrintWriter} writing to the specified file
* @throws NullPointerException if {@code file == null}
@@ -113,8 +128,10 @@ public class FileUtil {
public static PrintWriter gzipPrintWriter(File file) {
PrintWriter out = null;
try {
- out = new PrintWriter(
- new GZIPOutputStream(new FileOutputStream(file)));
+ FileOutputStream fos = new FileOutputStream(file);
+ BufferedOutputStream bos = new BufferedOutputStream(fos);
+ GZIPOutputStream gzos = new GZIPOutputStream(bos);
+ out = new PrintWriter(gzos);
} catch (IOException e) {
Utilities.exit("Error opening " + file, e);
}
@@ -122,36 +139,56 @@ public class FileUtil {
}
/**
- * Returns a buffered {@code java.io.PrintWriter} writing to
- * the specified file. The resulting file will be compressed using
- * the BGZIP compression algorithm. Any existing file corresponding
- * to the specified file will be deleted. If the file
- * cannot be opened, an error message will be printed and the
- * java interpreter will exit.
+ * Returns a buffered {@code java.io.PrintWriter} that compresses
+ * data using the BGZIP algorithm and writes the compressed data
+ * to the specified file. The {@code close()} method of the returned
+ * {@code PrintWriter} will write an empty BGZIP block to the end of the
+ * output stream. If the file cannot be opened for writing, an error
+ * message will be printed and the Java Virtual Machine will exit.
+ * If the specified file exists, bytes written by the returned
+ * {@code PrintWriter} will overwrite the previously existing file.
*
* @param file a file
- * @return a buffered {@code java.io.PrintWriter} writing to
- * the specified file
+ * @return a buffered {@code java.io.PrintWriter}
* @throws NullPointerException if {@code file == null}
*/
public static PrintWriter bgzipPrintWriter(File file) {
- PrintWriter out = null;
- try {
- OutputStream fout = new FileOutputStream(file);
- out = new PrintWriter(new BlockCompressedOutputStream(
- new BufferedOutputStream(fout), file));
- } catch (FileNotFoundException e) {
- Utilities.exit("Error opening " + file, e);
- }
- return out;
+ boolean writeBuffer = true;
+ FileOutputStream fos = fileOutputStream(file);
+ BufferedOutputStream bos = new BufferedOutputStream(fos);
+ return new PrintWriter(new BGZIPOutputStream(bos, writeBuffer));
}
/**
- * Returns a buffered {@code java.io.PrintWriter} writing to
- * the specified file. Any existing file corresponding
- * to the specified filename will be deleted. If the file
- * cannot be opened, an error message will be printed and the
- * java interpreter will exit.
+ * Returns a buffered {@code java.io.PrintWriter} that compresses
+ * data using the BGZIP algorithm and writes the compressed data to
+ * the specified file. The {@code close()} method of the returned
+ * {@code PrintWriter} will write an empty BGZIP block to the end of the
+ * output stream. If the file cannot be opened for writing, an error
+ * message will be printed and the Java Virtual Machine will exit.
+ * If the specified file exists and {@code append} is {@code false}, bytes
+ * written by the returned {@code PrintWriter} will overwrite the
+ * previously existing file.
+ *
+ * @param file a file
+ * @param append {@code true} if bytes will be appended to the end of
+ * the file
+ * @return a buffered {@code java.io.PrintWriter}
+ * @throws NullPointerException if {@code file == null}
+ */
+ public static PrintWriter bgzipPrintWriter(File file, boolean append) {
+ boolean writeBuffer = true;
+ FileOutputStream fos = fileOutputStream(file, append);
+ BufferedOutputStream bos = new BufferedOutputStream(fos);
+ return new PrintWriter(new BGZIPOutputStream(bos, writeBuffer));
+ }
+
+ /**
+ * Returns a buffered {@code java.io.PrintWriter} writing to the
+ * specified file. If the file cannot be opened, an error message
+ * will be printed and the Java Virtual Machine will exit. If the specified
+ * file exists, bytes written by the returned {@code PrintWriter} will
+ * overwrite the previously existing file.
* @param file a file
* @return a buffered {@code java.io.PrintWriter} writing to
* the specified file
@@ -163,10 +200,10 @@ public class FileUtil {
/**
* Returns a buffered {@code java.io.PrintWriter} writing to
- * the specified file. If {@code append == false}
- * any existing file corresponding to the specified file will be deleted.
- * If the file cannot be opened, an error message will be printed and the
- * java interpreter will exit.
+ * the specified file. If the file cannot be opened, an error message will
+ * be printed and the Java Virtual Machine will exit. If the specified
+ * file exists and {@code append} is {@code false}, bytes written by the
+ * returned {@code PrintWriter} will overwrite the previously existing file.
*
* @param file a file
* @param append {@code true} if the data will be appended
@@ -187,11 +224,11 @@ public class FileUtil {
}
/**
- * Returns a non-buffered {@code java.io.PrintWriter} writing to
- * the specified file.
- * If {@code append == false} any existing file corresponding
- * to the specified file will be deleted. If the file cannot be opened,
- * an error message will be printed and the java interpreter will exit.
+ * Returns an unbuffered {@code java.io.PrintWriter} writing to
+ * the specified file. If the file cannot be opened, an error message will
+ * be printed and the Java Virtual Machine will exit. If the specified
+ * file exists and {@code append} is {@code false}, bytes written by the
+ * returned {@code PrintWriter} will overwrite the previously existing file.
*
* @param file a file
* @param append {@code true} if the data will be appended
diff --git a/blbutil/InputIt.java b/blbutil/InputIt.java
index eb34f31..6792749 100644
--- a/blbutil/InputIt.java
+++ b/blbutil/InputIt.java
@@ -208,7 +208,7 @@ public class InputIt implements FileIt<String> {
if (file.getName().endsWith(".gz")) {
if (isBGZipFile(file)) {
return new InputIt(
- new BlockCompressedInputStream(is), file);
+ new BlockCompressedInputStream(is), file);
}
else {
return new InputIt(new GZIPInputStream(is), file);
@@ -248,11 +248,12 @@ public class InputIt implements FileIt<String> {
if (file.getName().endsWith(".gz")) {
if (isBGZipFile(file)) {
return new InputIt(
- new BlockCompressedInputStream(is), file, bufferSize);
+ new BlockCompressedInputStream(is), file, bufferSize);
}
else {
return new InputIt(new GZIPInputStream(is), file, bufferSize);
}
+
}
else {
return new InputIt(is, file);
@@ -270,9 +271,9 @@ public class InputIt implements FileIt<String> {
private static boolean isBGZipFile(File file) throws IOException {
try (InputStream is=new BufferedInputStream(new FileInputStream(file))) {
- return BlockCompressedInputStream.isValidFile(is);
- }
- }
+ return BlockCompressedInputStream.isValidFile(is);
+ }
+ }
/**
* Constructs and returns an {@code InputIt} instance with the default
diff --git a/main/CurrentData.java b/main/CurrentData.java
index d6fbf3f..f818aea 100644
--- a/main/CurrentData.java
+++ b/main/CurrentData.java
@@ -44,11 +44,11 @@ public class CurrentData {
private final int window;
private final SampleHapPairs initHaps;
- private final int prevSplice;
- private final int nextOverlap;
- private final int nextSplice;
- private final int nextTargetSplice;
- private final int nextTargetOverlap;
+ private final int prevSpliceStart;
+ private final int nextOverlapStart;
+ private final int nextSpliceStart;
+ private final int nextTargetSpliceStart;
+ private final int nextTargetOverlapStart;
private final GL targetGL;
private final NuclearFamilies families;
@@ -104,11 +104,11 @@ public class CurrentData {
}
this.window = data.window();
this.initHaps = overlapHaps;
- this.prevSplice = data.overlap()/2;
- this.nextOverlap = nextOverlap(data, par.overlap());
- this.nextSplice = nextSplice(data, par.overlap());
- this.nextTargetOverlap = targetIndex(data, nextOverlap);
- this.nextTargetSplice = targetIndex(data, nextSplice);
+ this.prevSpliceStart = data.overlap()/2;
+ this.nextOverlapStart = CurrentData.this.nextOverlapStart(data, par.overlap());
+ this.nextSpliceStart = (data.nMarkers() + nextOverlapStart)/2;
+ this.nextTargetOverlapStart = targetIndex(data, nextOverlapStart);
+ this.nextTargetSpliceStart = targetIndex(data, nextSpliceStart);
this.families = families;
this.weights = new Weights(families);
@@ -130,18 +130,26 @@ public class CurrentData {
this.recombRate = recombRate(targetMarkers, genMap, par.mapscale());
}
- /* returns the first index in the next overlap */
- private static int nextOverlap(Data data, int overlap) {
- if (data.canAdvanceWindow() && data.lastWindowOnChrom()==false) {
- return data.nMarkers() - overlap;
+ /* Returns the index of the first marker in the overlap */
+ private static int nextOverlapStart(Data data, int targetOverlap) {
+ if (targetOverlap < 0) {
+ throw new IllegalArgumentException(String.valueOf(targetOverlap));
}
- else {
+ if (targetOverlap==0 || data.lastWindowOnChrom()) {
return data.nMarkers();
}
+ Markers markers = data.markers();
+ int nextOverlap = Math.max(0, data.nMarkers() - targetOverlap);
+ while (nextOverlap>0
+ && markers.marker(nextOverlap).pos()
+ == markers.marker(nextOverlap - 1).pos()) {
+ --nextOverlap;
+ }
+ return nextOverlap;
}
- /* returns the first index after the next splice point */
- private static int nextSplice(Data data, int overlap) {
+ /* Returns the index of the first marker after the next splice point */
+ private static int nextSpliceStart(Data data, int overlap) {
if (data.canAdvanceWindow() && data.lastWindowOnChrom()==false) {
return data.nMarkers() - overlap + (overlap/2);
}
@@ -188,25 +196,37 @@ public class CurrentData {
}
/**
- * Returns the first marker index after the splice point with
- * the previous marker window. Returns 0 if the current marker window
- * is the first marker window.
- * @return the first marker index after the splice point with
- * the previous marker window
+ * Returns the first marker index in the overlap between this
+ * marker window and the next marker window, or
+ * returns {@code this.nMarkers()} there is no overlap.
+ * @return the first marker index in the overlap between this
+ * marker window and the next marker window
*/
- public int prevSplice() {
- return prevSplice;
+ public int nextOverlapStart() {
+ return nextOverlapStart;
}
/**
- * Returns the first marker index in the overlap between this
+ * Returns the first target marker index in the overlap between this
* marker window and the next marker window, or
- * returns {@code this.nMarkers()} there is no overlap.
- * @return the first marker index in the overlap between this
+ * returns {@code this.nMarkers()} if there is no overlap or if there are
+ * no target markers in the overlap.
+ * @return the first target marker index in the overlap between this
* marker window and the next marker window
*/
- public int nextOverlap() {
- return nextOverlap;
+ public int nextTargetOverlapStart() {
+ return nextTargetOverlapStart;
+ }
+
+ /**
+ * Returns the first marker index after the splice point with
+ * the previous marker window. Returns 0 if the current marker window
+ * is the first marker window.
+ * @return the first marker index after the splice point with
+ * the previous marker window
+ */
+ public int prevSpliceStart() {
+ return prevSpliceStart;
}
/**
@@ -216,8 +236,8 @@ public class CurrentData {
* no markers after the splice point.
* @return the first marker index after the next splice point
*/
- public int nextSplice() {
- return nextSplice;
+ public int nextSpliceStart() {
+ return nextSpliceStart;
}
/**
@@ -227,31 +247,19 @@ public class CurrentData {
* @return the first target marker index after the splice point with
* the previous marker window
*/
- public int prevTargetSplice() {
+ public int prevTargetSpliceStart() {
return initHaps==null ? 0 : initHaps.nMarkers();
}
/**
- * Returns the first target marker index in the overlap between this
- * marker window and the next marker window, or
- * returns {@code this.nMarkers()} if there is no overlap or if there are
- * no target markers in the overlap.
- * @return the first target marker index in the overlap between this
- * marker window and the next marker window
- */
- public int nextTargetOverlap() {
- return nextTargetOverlap;
- }
-
- /**
* Returns the first target marker index after the splice point between this
* marker window and the next marker window, or returns
* {@code this.nTargetMarkers()} if there is no overlap or if there are
* no target markers after the splice point
* @return the first target marker index after the next splice point
*/
- public int nextTargetSplice() {
- return nextTargetSplice;
+ public int nextTargetSpliceStart() {
+ return nextTargetSpliceStart;
}
/**
diff --git a/main/Main.java b/main/Main.java
index c072110..79b3dfc 100644
--- a/main/Main.java
+++ b/main/Main.java
@@ -65,8 +65,8 @@ public class Main {
/**
* The program name and version.
*/
- public static final String program = "beagle.03May16.862.jar (version 4.1)";
- public static final String command = "java -jar beagle.03May16.862.jar";
+ public static final String program = "beagle.16Jun16.7e4.jar (version 4.1)";
+ public static final String command = "java -jar beagle.16Jun16.7e4.jar";
/**
* The copyright string.
@@ -78,7 +78,7 @@ public class Main {
*/
public static final String shortHelp = Main.program
+ Const.nl + Main.copyright
- + Const.nl + "Enter \"java -jar beagle.03May16.862.jar\" for a "
+ + Const.nl + "Enter \"java -jar beagle.16Jun16.7e4.jar\" for a "
+ "summary of command line " + "arguments.";
private final Par par;
@@ -108,14 +108,13 @@ public class Main {
runStats.printStartInfo();
GeneticMap genMap = geneticMap(par);
- try (Data data = (par.ref()==null) ? nonRefData(par) : allData(par)) {
- try (WindowWriter winOut = new WindowWriter(data.targetSamples(),
- par.out())) {
- Main main = new Main(par, data, genMap, winOut, runStats);
- main.phaseData();
- runStats.printSummaryAndClose(data.nTargetMarkersSoFar(),
- data.nMarkersSoFar());
- }
+ try (Data data = (par.ref()==null) ? nonRefData(par) : allData(par);
+ WindowWriter winOut = new WindowWriter(
+ data.targetSamples(), par.out())) {
+ Main main = new Main(par, data, genMap, winOut, runStats);
+ main.phaseData();
+ runStats.printSummaryAndClose(data.nTargetMarkersSoFar(),
+ data.nMarkersSoFar());
}
}
@@ -141,8 +140,9 @@ public class Main {
runStats.printSampleSummary(fam, data);
MainHelper mh = new MainHelper(par, genMap, runStats);
SampleHapPairs overlapHaps = null;
+ int overlap = 0;
while (data.canAdvanceWindow()) {
- advanceWindow();
+ advanceWindow(overlap, par.window());
CurrentData cd = new CurrentData(par, genMap, data, overlapHaps, fam);
GenotypeValues gv = gv(par, cd);
SampleHapPairs targetHapPairs = mh.phase(cd, gv);
@@ -157,6 +157,7 @@ public class Main {
printOutput(cd, targetHapPairs, alProbs, ibd);
}
overlapHaps = overlapHaps(cd, targetHapPairs);
+ overlap = cd.nMarkers() - cd.nextOverlapStart();
}
}
@@ -194,17 +195,40 @@ public class Main {
private void printOutput(CurrentData cd, SampleHapPairs targetHapPairs,
AlleleProbs alProbs, Map<IntPair, List<IbdSegment>> ibd) {
assert par.gt()!=null;
- windowOut.print(cd, targetHapPairs, alProbs, par.gprobs());
+ boolean markersAreImputed = cd.nTargetMarkers() < cd.nMarkers();
+ boolean[] isImputed = isImputed(cd);
+ int start = cd.prevSpliceStart();
+ int end = cd.nextSpliceStart();
+ boolean dose = markersAreImputed;
+ boolean gprobs = markersAreImputed && par.gprobs();
+ int nThreads = par.nthreads();
+ if (markersAreImputed){
+ alProbs = new ConstrainedAlleleProbs(targetHapPairs, alProbs,
+ cd.targetMarkerIndices());
+ }
+ windowOut.print(alProbs, isImputed, start, end, dose, gprobs, nThreads);
if (par.ibd()) {
windowOut.printIbd(cd, ibd);
}
}
+ private static boolean[] isImputed(CurrentData cd) {
+ boolean[] ba = new boolean[cd.nMarkers()];
+ if (cd.nTargetMarkers()<ba.length) {
+ for (int j=0; j<ba.length; ++j) {
+ if (cd.targetMarkerIndex(j) == -1) {
+ ba[j] = true;
+ }
+ }
+ }
+ return ba;
+ }
+
private SampleHapPairs overlapHaps(CurrentData cd,
SampleHapPairs targetHapPairs) {
- int nextOverlap = cd.nextTargetOverlap();
- int nextSplice = cd.nextTargetSplice();
- if (cd.nextOverlap() == cd.nextSplice()) {
+ int nextOverlap = cd.nextTargetOverlapStart();
+ int nextSplice = cd.nextTargetSpliceStart();
+ if (cd.nextOverlapStart() == cd.nextSpliceStart()) {
return null;
}
int nSamples = targetHapPairs.nSamples();
@@ -434,9 +458,9 @@ public class Main {
}
}
- private void advanceWindow() {
+ private void advanceWindow(int overlap, int window) {
if (data.canAdvanceWindow()) {
- data.advanceWindow(par.overlap(), par.window());
+ data.advanceWindow(overlap, window);
}
runStats.printWindowUpdate(data);
}
diff --git a/main/MainHelper.java b/main/MainHelper.java
index 57e0095..dd68eb4 100644
--- a/main/MainHelper.java
+++ b/main/MainHelper.java
@@ -162,8 +162,8 @@ public class MainHelper {
}
private List<HapPair> correctGenotypes(CurrentData cd, List<HapPair> hapPairs) {
- int start = cd.prevTargetSplice();
- int end = cd.nextTargetSplice();
+ int start = cd.prevTargetSpliceStart();
+ int end = cd.nextTargetSpliceStart();
GL modGL = new MaskedEndsGL(cd.targetGL(), start, end);
// File outFile = new File(par.out() + ".gterr");
// boolean append = cd.window() > 1;
diff --git a/main/WindowWriter.java b/main/WindowWriter.java
index ee0d794..2a7d6a8 100644
--- a/main/WindowWriter.java
+++ b/main/WindowWriter.java
@@ -19,19 +19,30 @@
package main;
import beagleutil.Samples;
+import blbutil.BGZIPOutputStream;
import blbutil.Const;
import blbutil.FileUtil;
import blbutil.IntPair;
-import haplotype.SampleHapPairs;
+import blbutil.Utilities;
import ibd.IbdSegment;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
import vcf.VcfWriter;
/**
@@ -46,14 +57,13 @@ public class WindowWriter implements Closeable {
private static final DecimalFormat df2 = new DecimalFormat("#.##");
- private boolean isClosed = false;
private boolean appendIbd = false;
private final Samples samples;
+ private final String outPrefix;
private final File vcfOutFile;
private final File ibdOutFile;
private final File hbdOutFile;
- private final PrintWriter vcfOut;
private final Map<IntPair, IbdSegment> ibdBuffer = new HashMap<>();
/**
@@ -73,47 +83,56 @@ public class WindowWriter implements Closeable {
throw new IllegalArgumentException("outPrefix.length()==0");
}
this.samples = samples;
+ this.outPrefix = outPrefix;
this.vcfOutFile = new File(outPrefix + ".vcf.gz");
this.ibdOutFile = new File(outPrefix + ".ibd");
this.hbdOutFile = new File(outPrefix + ".hbd");
- this.vcfOut = FileUtil.bgzipPrintWriter(vcfOutFile);
- boolean printGT = true;
- boolean printGP = true;
- boolean printGL = false;
- VcfWriter.writeMetaLines(samples.ids(), Main.program,
- printGT, printGP, printGL, vcfOut);
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ try (PrintWriter vcfOut=new PrintWriter(
+ new BGZIPOutputStream(baos, false))) {
+ boolean printGT = true;
+ boolean printGP = true;
+ boolean printGL = false;
+ VcfWriter.writeMetaLines(samples.ids(), Main.program,
+ printGT, printGP, printGL, vcfOut);
+ }
+ boolean append = false;
+ writeBytesToFile(baos.toByteArray(), vcfOutFile, append);
}
/**
- * Returns the samples whose data is written by {@code this}.
- * @return the samples whose data is written by {@code this}
+ * Returns the output file prefix.
+ * @return the output file prefix
*/
- public Samples samples() {
- return samples;
+ public String outPrefix() {
+ return outPrefix;
}
+ private static void write(byte[] ba, OutputStream out) {
+ try {
+ out.write(ba);
+ } catch (IOException e) {
+ Utilities.exit("Error writing byte", e);
+ }
+ }
- /**
- * Returns {@code true} if {@code this.close()} method has
- * been previously invoked and returns {@code false} otherwise.
- *
- * @return {@code true} if {@code this.close()} method has
- * been previously invoked
- */
- public boolean isClosed() {
- return isClosed;
+ private static void writeBytesToFile(byte[] ba, File file, boolean append) {
+ try {
+ try (FileOutputStream fos=new FileOutputStream(file, append)) {
+ fos.write(ba);
+ }
+ } catch (IOException e) {
+ Utilities.exit("Error writing to file: " + file, e);
+ }
}
/**
- * Closes this {@code WindowWriter} for writing. Calling the
- * {@code print()} method after invoking {@code close()} will
- * throw an {@code IllegalStateException}.
+ * Returns the samples whose data is written by {@code this}.
+ * @return the samples whose data is written by {@code this}
*/
- @Override
- public void close() {
- vcfOut.close();
- isClosed = true;
+ public Samples samples() {
+ return samples;
}
/**
@@ -127,58 +146,116 @@ public class WindowWriter implements Closeable {
* @throws NullPointerException if {@code cd == null || gv == null}
*/
public void printGV(CurrentData cd, GenotypeValues gv) {
- if (isClosed) {
- throw new IllegalStateException("isClosed()==true");
+ boolean append = true;
+ try (PrintWriter vcfOut = FileUtil.bgzipPrintWriter(vcfOutFile, append)) {
+ VcfWriter.appendRecords(gv, cd.prevTargetSpliceStart(),
+ cd.nextTargetSpliceStart(), vcfOut);
}
- VcfWriter.appendRecords(gv, cd.prevTargetSplice(),
- cd.nextTargetSplice(), vcfOut);
- vcfOut.flush();
}
/**
* Prints the data in {@code alProbs} for markers
* with index between {@code cd.lastSplice()} (inclusive) and
- * {@code cd.nextSplice()} (exclusive).
+ * {@code cd.nextSplice()} (exclusive) to the output
+ * VCF file: {@code this.outPrefix() + ".vcf.gz"}.
*
- * @param cd the input data for the current marker window
- * @param targetHapPairs the target haplotype pairs
* @param alProbs the estimated haplotype allele probabilities
- * @param gprobs {@code true} if the GP field should be printed when
- * imputed markers are present, and {@code false} otherwise
+ * @param isImputed an array of length {@code alProbs.nMarkers()}
+ * whose {@code j}-th element is {@code true} if the corresponding
+ * marker is imputed, and {@code false} otherwise
+ * @param start the starting marker index (inclusive)
+ * @param end the ending marker index (exclusive)
+ * @param dose {@code true} if the output FORMAT fields should contain
+ * a DS subfield, and {@code false} otherwise
+ * @param gprobs {@code true} if the output FORMAT fields should contain
+ * a GP subfield, and {@code false} otherwise
+ * @param nThreads the number of parallel threads to use
*
- * @throws IllegalStateException if {@code this.isClosed() == true}
- * @throws IllegalArgumentException if
- * {@code this.samples().equals(cd.targetSamples()) == false}
- * @throws IllegalArgumentException if
- * {@code this.samples().equals(alProbs.samples()) == false}
* @throws IllegalArgumentException if
- * {@code cd.markers().equals(alProbs.markers()) == false}
+ * {@code isImputed.length != alProbs.nMarkers()}
+ * @throws IllegalArgumentException if {@code nThreads < 1}
+ * @throws IndexOutOfBoundsException if
+ * {@code start < 0 || end > alProbs.nMarkers() || start > end}
* @throws NullPointerException if
- * {@code cd == null || targetHapPairs == null || alProbs == null}
+ * {@code alProbs == null || isImputed == null}
*/
- public void print(CurrentData cd, SampleHapPairs targetHapPairs,
- AlleleProbs alProbs, boolean gprobs) {
- if (isClosed) {
- throw new IllegalStateException("isClosed()==true");
+ public void print(AlleleProbs alProbs, boolean[] isImputed,
+ int start, int end, boolean dose, boolean gprobs, int nThreads) {
+ int step = nMarkersPerStep(alProbs.nSamples(), dose, gprobs);
+ int nSteps = nSteps(end-start, step);
+ final AtomicInteger atomicInt = new AtomicInteger(0);
+ final ConcurrentHashMap<Integer, byte[]> map = new ConcurrentHashMap<>();
+ ExecutorService es = Executors.newFixedThreadPool(nThreads);
+ for (int j=0; j<nThreads; ++j) {
+ es.submit(
+ () -> {
+ try {
+ int index = atomicInt.getAndIncrement();
+ while (index < nSteps) {
+ int segStart = start + step*index;
+ int segEnd = Math.min(segStart + step, end);
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ try (PrintWriter vcfOut=new PrintWriter(
+ new BGZIPOutputStream(baos, false))) {
+ VcfWriter.appendRecords(alProbs, isImputed,
+ segStart, segEnd, dose, gprobs, vcfOut);
+ }
+ map.put(index, baos.toByteArray());
+ index = atomicInt.getAndIncrement();
+ }
+ }
+ catch (Exception ex) {
+ Utilities.exit("", ex);
+ }
+ }
+ ) ;
}
- if (cd.markers().equals(alProbs.markers())==false) {
- throw new IllegalArgumentException("inconsistent markers");
+ try {
+ es.shutdown();
+ es.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
}
- if (samples.equals(cd.targetSamples()) == false
- || samples.equals(alProbs.samples()) == false) {
- throw new IllegalArgumentException("inconsistent samples");
+ catch (Throwable e) {
+ Utilities.exit("ERROR", e);
+ }
+ print(map, vcfOutFile);
+ }
+
+ private static void print(ConcurrentHashMap<Integer, byte[]> map,
+ File outFile) {
+ boolean append = true;
+ int index = 0;
+ try {
+ try (OutputStream fos = new BufferedOutputStream(
+ new FileOutputStream(outFile, append))) {
+ byte[] bytes = map.get(index++);
+ while (bytes!=null) {
+ write(bytes, fos);
+ bytes = map.get(index++);
+ }
+ }
+ } catch (IOException e) {
+ Utilities.exit("Error writing to file: " + outFile, e);
+ }
+ }
+
+ private static int nMarkersPerStep(int nSamples, boolean dose, boolean gprobs) {
+ int nBytesPerStep = 65536*50;
+ int bytesPerSample = 4;
+ if (dose) {
+ bytesPerSample += 2;
}
- int start = cd.prevSplice();
- int end = cd.nextSplice();
- if (cd.nTargetMarkers() < cd.nMarkers()){
- ConstrainedAlleleProbs constAlProbs = new ConstrainedAlleleProbs(
- targetHapPairs, alProbs, cd.targetMarkerIndices());
- VcfWriter.appendRecords(constAlProbs, start, end, gprobs, vcfOut);
+ if (gprobs) {
+ bytesPerSample += 6;
}
- else {
- VcfWriter.appendRecords(alProbs, start, end, vcfOut);
+ return nBytesPerStep / (nSamples*bytesPerSample);
+ }
+
+ private static int nSteps(int n, int step) {
+ int nSteps = n / step;
+ if (nSteps * step != n) {
+ ++nSteps;
}
- vcfOut.flush();
+ return nSteps;
}
/**
@@ -198,20 +275,16 @@ public class WindowWriter implements Closeable {
* @param ibdMap a map whose keys are pairs of haplotype indices and whose
* values are lists of IBD segments involving the haplotype pair key
*
- * @throws IllegalStateException if {@code this.isClosed()==true}
* @throws IllegalArgumentException if
* {@code this.samples().equals(cd.targetSamples()) == false}
* @throws NullPointerException if {@code cd == null || ibdMap == null}
*/
public void printIbd(CurrentData cd, Map<IntPair, List<IbdSegment>> ibdMap) {
- if (isClosed) {
- throw new IllegalStateException("isClosed()==true");
- }
if (samples.equals(cd.targetSamples()) == false) {
throw new IllegalArgumentException("inconsistent samples");
}
- printIbd(ibdMap, cd.prevTargetSplice(), cd.nextTargetOverlap(),
- cd.nextTargetSplice(), cd.nTargetMarkers());
+ printIbd(ibdMap, cd.prevTargetSpliceStart(), cd.nextTargetOverlapStart(),
+ cd.nextTargetSpliceStart(), cd.nTargetMarkers());
if (appendIbd==false) {
appendIbd = true;
}
@@ -279,4 +352,18 @@ public class WindowWriter implements Closeable {
out.print(Const.tab);
out.println(df2.format(tract.score()));
}
+
+ @Override
+ public void close() {
+ boolean append = true;
+ try {
+ try (FileOutputStream fos = new FileOutputStream(vcfOutFile, append);
+ BufferedOutputStream bos = new BufferedOutputStream(fos);
+ BGZIPOutputStream bgzip = new BGZIPOutputStream(bos, true)) {
+ // write empty BGZIP block to bgzip by closing bgzip
+ }
+ } catch (IOException e) {
+ Utilities.exit("Error closing file: " + vcfOutFile, e);
+ }
+ }
}
diff --git a/sample/HaplotypeCoder.java b/sample/HaplotypeCoder.java
index 4ae3782..6a38f34 100644
--- a/sample/HaplotypeCoder.java
+++ b/sample/HaplotypeCoder.java
@@ -26,6 +26,7 @@ import blbutil.IntList;
import blbutil.WrappedIntArray;
import haplotype.SampleHapPairs;
import java.util.Arrays;
+import java.util.stream.IntStream;
/**
* <p>Class {@code HaplotypeCoder} indexes the observed allele sequences
@@ -101,7 +102,7 @@ public class HaplotypeCoder {
throw new IllegalArgumentException("start > end");
}
IntArray[] val = new IntArray[2];
- int[] haps = initialHaps();
+ int[] haps = IntStream.range(0, nHaps).toArray();
int[] alleles = new int[nHaps];
IntList lastEnds = new IntList(1);
lastEnds.add(nHaps);
@@ -129,14 +130,6 @@ public class HaplotypeCoder {
return val;
}
- private int[] initialHaps() {
- int[] ia = new int[nHaps];
- for (int j = 0; j < ia.length; ++j) {
- ia[j] = j;
- }
- return ia;
- }
-
private IntList partition(int marker, int[] alleles, int[] haps,
IntList lastEnds) {
IntList nextEnds = new IntList( (4*lastEnds.size())/3 + 1 );
@@ -163,10 +156,10 @@ public class HaplotypeCoder {
}
private void setAlleles(int marker, int[] alleles) {
- for (int j = 0; j < nRefHaps; ++j) {
+ for (int j=0; j<nRefHaps; ++j) {
alleles[j] = refHapPairs.allele(marker, j);
}
- for (int j = nRefHaps; j < nHaps; ++j) {
+ for (int j = nRefHaps; j<nHaps; ++j) {
alleles[j] = targetHapPairs.allele(marker, j - nRefHaps);
}
}
diff --git a/sample/ImputationData.java b/sample/ImputationData.java
index d44b1d3..42847cb 100644
--- a/sample/ImputationData.java
+++ b/sample/ImputationData.java
@@ -71,23 +71,21 @@ public class ImputationData {
if (cd.targetSamples().equals(targetHapPairs.samples())==false) {
throw new IllegalArgumentException("inconsistent samples");
}
- int[] gtEnd = gtEnd(targetHapPairs.markers(), map, par.cluster());
- int[] gtStart = gtStart(gtEnd);
- this.refAlleles = new IntArray[gtStart.length];
- this.targAlleles = new IntArray[gtStart.length];
+ int[] targClustEnd = targClustEnd(targetHapPairs.markers(), map, par.cluster());
+ this.refAlleles = new IntArray[targClustEnd.length];
+ this.targAlleles = new IntArray[targClustEnd.length];
setCodedAlleles(cd.restrictedRefSampleHapPairs(), targetHapPairs,
- gtStart, gtEnd, refAlleles, targAlleles);
- this.nClusters = gtStart.length;
+ targClustEnd, refAlleles, targAlleles);
+ this.nClusters = targClustEnd.length;
this.refHapPairs = cd.refSampleHapPairs();
- this.refHapSegs = refHapSegs(refHapPairs, gtStart, gtEnd,
- cd.markerIndices(), par.nthreads());
+ this.refHapSegs = refHapSegs(refHapPairs, targClustEnd, cd.markerIndices());
this.targHapPairs = targetHapPairs;
- this.errProb = err(par.err(), gtStart, gtEnd);
- this.pRecomb = ImputationData.this.pRecomb(refHapSegs, map, par.ne());
+ this.errProb = err(par.err(), targClustEnd);
+ this.pRecomb = ImputationData.pRecomb(refHapSegs, map, par.ne());
this.weight = wts(refHapSegs, map);
}
- private static int[] gtEnd(Markers targetMarkers, GeneticMap genMap,
+ private static int[] targClustEnd(Markers targetMarkers, GeneticMap genMap,
float clusterDist) {
int nMarkers = targetMarkers.nMarkers();
int[] ends = new int[nMarkers];
@@ -104,33 +102,29 @@ public class ImputationData {
return Arrays.copyOf(ends, index);
}
- private static int[] gtStart(int[] gtEnd) {
- int[] gtStart = new int[gtEnd.length];
- for (int j=1; j<gtStart.length; ++j) {
- gtStart[j] = gtEnd[j - 1];
- }
- return gtStart;
- }
-
private static void setCodedAlleles(SampleHapPairs refHapPairs,
- SampleHapPairs targetHapPairs, int[] gtStart,
- int[] gtEnd, IntArray[] refAlleles, IntArray[] targAlleles) {
+ SampleHapPairs targetHapPairs, int[] targEnd,
+ IntArray[] refAlleles, IntArray[] targAlleles) {
HaplotypeCoder coder = new HaplotypeCoder(refHapPairs, targetHapPairs);
- for (int j=0; j<gtStart.length; ++j) {
- IntArray[] ia = coder.run(gtStart[j], gtEnd[j]);
+ int start = 0;
+ for (int j=0; j<targEnd.length; ++j) {
+ IntArray[] ia = coder.run(start, targEnd[j]);
refAlleles[j] = ia[0];
targAlleles[j] = ia[1];
+ start = targEnd[j];
}
}
- private static float[] err(float errRate, int[] gtStart, int[] gtEnd) {
+ private static float[] err(float errRate, int[] targClustEnd) {
float maxErrProb = 0.5f;
- float[] err = new float[gtStart.length];
+ float[] err = new float[targClustEnd.length];
+ int start = 0;
for (int j=0; j<err.length; ++j) {
- err[j] = errRate * (gtEnd[j] - gtStart[j]);
+ err[j] = errRate * (targClustEnd[j] - start);
if (err[j] > maxErrProb) {
err[j] = maxErrProb;
}
+ start = targClustEnd[j];
}
return err;
}
@@ -146,10 +140,10 @@ public class ImputationData {
}
private static int[] midPos(Markers refMarkers, RefHapSegs refHapSegs) {
- int[] midPos = new int[refHapSegs.nClusters()];
+ int[] midPos = new int[refHapSegs.nSegs() - 1];
for (int j=0; j<midPos.length; ++j) {
- int startPos = refMarkers.marker(refHapSegs.clusterStart(j)).pos();
- int endPos = refMarkers.marker(refHapSegs.clusterEnd(j) - 1).pos();
+ int startPos = refMarkers.marker(refHapSegs.segStart(j+1)).pos();
+ int endPos = refMarkers.marker(refHapSegs.segEnd(j) - 1).pos();
midPos[j] = (startPos + endPos) / 2;
}
return midPos;
@@ -174,15 +168,15 @@ public class ImputationData {
Markers refMarkers = refHapSegs.refHapPairs().markers();
double[] cumPos = cumPos(refMarkers, map);
int nMarkers = refMarkers.nMarkers();
- int nClusters = refHapSegs.nClusters();
+ int nClusters = refHapSegs.nSegs() - 1;
float[] wts = new float[cumPos.length];
if (nClusters > 0) {
- Arrays.fill(wts, 0, refHapSegs.clusterStart(0), Float.NaN);
+ Arrays.fill(wts, 0, refHapSegs.segStart(1), Float.NaN);
}
- for (int j = 0, jj = (nClusters - 1); j < jj; ++j) {
- int start = refHapSegs.clusterStart(j);
- int end = refHapSegs.clusterEnd(j);
- int nextStart = refHapSegs.clusterStart(j+1);
+ for (int j=1; j<nClusters; ++j) {
+ int start = refHapSegs.segStart(j);
+ int end = refHapSegs.segEnd(j-1);
+ int nextStart = refHapSegs.segStart(j+1);
double nextStartPos = cumPos[nextStart];
double totalLength = nextStartPos - cumPos[end - 1];
Arrays.fill(wts, start, end, 1f);
@@ -190,7 +184,7 @@ public class ImputationData {
wts[m] = (float) ( (nextStartPos - cumPos[m]) / totalLength );
}
}
- Arrays.fill(wts, refHapSegs.clusterStart(nClusters - 1), nMarkers, Float.NaN);
+ Arrays.fill(wts, refHapSegs.segStart(nClusters), nMarkers, Float.NaN);
return wts;
}
@@ -208,21 +202,26 @@ public class ImputationData {
}
private static RefHapSegs refHapSegs(SampleHapPairs refHapPairs,
- int[] gtStart, int[] gtEnd, int[] gtIndices, int nThreads) {
- assert gtStart.length == gtEnd.length;
- int[] clusterStart = new int[gtStart.length];
- int[] clusterEnd = new int[gtEnd.length];
- for (int j=0; j<clusterStart.length; ++j) {
- clusterStart[j] = gtIndices[gtStart[j]];
- if (j < clusterStart.length - 1) {
- clusterEnd[j] = gtIndices[gtEnd[j]];
- }
- else {
- clusterEnd[j] = gtIndices[gtEnd[j] - 1] + 1;
- }
+ int[] targClustEnd, int[] targToRef) {
+ int n = targClustEnd.length;
+ int[] refClusterStart = new int[n+1];
+ int[] refClusterEnd = new int[n+1];
+
+ refClusterStart[0] = 0;
+ refClusterEnd[0] = targToRef[targClustEnd[0] - 1] + 1;
+
+ refClusterStart[1] = targToRef[0];
+ refClusterEnd[1] = targToRef[targClustEnd[1] - 1] + 1;
+
+ for (int j=2; j<n; ++j) {
+ refClusterStart[j] = targToRef[targClustEnd[j-2]];
+ refClusterEnd[j] = targToRef[targClustEnd[j] - 1] + 1;
}
- return new RefHapSegs(refHapPairs, clusterStart, clusterEnd,
- nThreads);
+
+ refClusterStart[n] = targToRef[targClustEnd[n-2]];
+ refClusterEnd[n] = refHapPairs.nMarkers();
+
+ return new RefHapSegs(refHapPairs, refClusterStart, refClusterEnd);
}
/**
diff --git a/sample/LSHapBaum.java b/sample/LSHapBaum.java
index 2454cb8..09dea47 100644
--- a/sample/LSHapBaum.java
+++ b/sample/LSHapBaum.java
@@ -209,64 +209,64 @@ public class LSHapBaum {
}
private void setAlleleProbs(float[] alleleProbs) {
- int nClusters = refHapSegs.nClusters();
setFirstAlleleProbs(alleleProbs);
- for (int cluster=1; cluster < nClusters; ++cluster) {
- setAlleleProbs(alleleProbs, cluster);
+ int nSegsM1 = refHapSegs.nSegs() - 1;
+ for (int j=1; j<nSegsM1; ++j) {
+ setAlleleProbs(alleleProbs, j);
}
setLastAlleleProbs(alleleProbs);
}
private void setFirstAlleleProbs(float[] alleleProbs) {
int segment = 0;
- int refMarker = refHapSegs.clusterStart(segment);
int nSeq = refHapSegs.nSeq(segment);
+ int endRefMarker = refHapSegs.segStart(segment + 1);
float threshold = threshold(nSeq);
- for (int h=0; h<nSeq; ++h) {
- if (bwdHapProbs[segment][h] >= threshold) {
- for (int m=0; m<refMarker; ++m) {
+ for (int seq=0; seq<nSeq; ++seq) {
+ if (bwdHapProbs[segment][seq] >= threshold) {
+ for (int m=0; m<endRefMarker; ++m) {
int start = refMarkers.sumAlleles(m);
- int allele = refHapSegs.allele(segment, m, h);
- alleleProbs[start + allele] += bwdHapProbs[segment][h];
+ int allele = refHapSegs.allele(segment, m, seq);
+ alleleProbs[start + allele] += bwdHapProbs[segment][seq];
}
}
}
}
- private void setAlleleProbs(float[] alleleProbs, int cluster) {
- assert cluster > 0;
- int startRefMarker = refHapSegs.clusterStart(cluster-1);
- int midRefMarker = refHapSegs.clusterEnd(cluster - 1);
- int endRefMarker = refHapSegs.clusterStart(cluster);
- int nSeq = refHapSegs.nSeq(cluster);
+ private void setAlleleProbs(float[] alleleProbs, int segment) {
+ assert segment > 0;
+ int clustStart = refHapSegs.segStart(segment);
+ int clustEnd = refHapSegs.segEnd(segment - 1);
+ int nextClustStart = refHapSegs.segStart(segment + 1);
+ int nSeq = refHapSegs.nSeq(segment);
float threshold = threshold(nSeq);
for (int seq=0; seq<nSeq; ++seq) {
- boolean useFwd = fwdHapProbs[cluster-1][seq] >= threshold;
- boolean useBwd = bwdHapProbs[cluster][seq] >= threshold;
+ boolean useFwd = fwdHapProbs[segment-1][seq] >= threshold;
+ boolean useBwd = bwdHapProbs[segment][seq] >= threshold;
if (useFwd) {
- for (int m=startRefMarker; m<midRefMarker; ++m) {
+ for (int m=clustStart; m<clustEnd; ++m) {
int start = refMarkers.sumAlleles(m);
- int allele = refHapSegs.allele(cluster, m - startRefMarker, seq);
- alleleProbs[start + allele] += fwdHapProbs[cluster-1][seq];
+ int allele = refHapSegs.allele(segment, m - clustStart, seq);
+ alleleProbs[start + allele] += fwdHapProbs[segment-1][seq];
}
}
if (useFwd || useBwd) {
- for (int m=midRefMarker; m<endRefMarker; ++m) {
+ for (int m=clustEnd; m<nextClustStart; ++m) {
int start = refMarkers.sumAlleles(m);
- int allele = refHapSegs.allele(cluster, m - startRefMarker, seq);
+ int allele = refHapSegs.allele(segment, m - clustStart, seq);
double wt = impData.weight(m);
- alleleProbs[start + allele] += wt*fwdHapProbs[cluster-1][seq];
- alleleProbs[start + allele] += (1-wt)*bwdHapProbs[cluster][seq];
+ alleleProbs[start + allele] += wt*fwdHapProbs[segment-1][seq];
+ alleleProbs[start + allele] += (1-wt)*bwdHapProbs[segment][seq];
}
}
}
}
private void setLastAlleleProbs(float[] alleleProbs) {
- int segment = refHapSegs.nClusters();
+ int segment = refHapSegs.nSegs() - 1;
int cluster = segment - 1;
- int refMarkerStart = refHapSegs.clusterStart(cluster);
- int refMarkerEnd = refHapSegs.refHapPairs().nMarkers();
+ int refMarkerStart = refHapSegs.segStart(segment);
+ int refMarkerEnd = refHapSegs.segEnd(segment);
int nSeq = refHapSegs.nSeq(segment);
float threshold = threshold(nSeq);
for (int seq=0; seq<nSeq; ++seq) {
diff --git a/sample/RefHapSegs.java b/sample/RefHapSegs.java
index 3c71c1e..1ca13b6 100644
--- a/sample/RefHapSegs.java
+++ b/sample/RefHapSegs.java
@@ -18,9 +18,7 @@
*/
package sample;
-import blbutil.IntPair;
import haplotype.SampleHapPairs;
-import java.util.function.Function;
import java.util.stream.IntStream;
/**
@@ -34,52 +32,38 @@ import java.util.stream.IntStream;
*/
public class RefHapSegs {
- private final int[] clusterStart;
- private final int[] clusterEnd;
+ private final int[] segStart;
+ private final int[] segEnd;
private final SampleHapPairs refHapPairs;
private final RefHapSeg[] refHapSegs;
/**
* Constructs a new {@code RefHapSegs} instance from the specified data.
* @param refHapPairs the reference haplotype pairs
- * @param clusterStart an array whose {@code j}-th element is the
+ * @param segStart an array whose {@code j}-th element is the
* starting reference marker index (inclusive) for the {@code j}-th
- * marker cluster
- * @param clusterEnd an array whose {@code j}-th element is the
+ * segment of markers
+ * @param segEnd an array whose {@code j}-th element is the
* ending reference marker index (exclusive) for the {@code j}-th
- * marker cluster
- * @param nThreads the number of threads to use during object construction
+ * segment of markers
* @throws IllegalArgumentException if
- * {@code clusterStart.length != clusterEnd.length}
+ * {@code segStart.length != segEnd.length}
* @throws IllegalArgumentException if
- * {@code clusterStart.length > 0 && clusterStart[0] < 0}
- * @throws IllegalArgumentException if
- * {@code clusterEnd.length > 0 && clusterEnd[clusterEnd.length - 1] > nMarkers}
- * @throws IllegalArgumentException if
- * {@code clusterStart[j] >= clusterEnd[j]} for some {@code j} satisfying
- * {@code 0 <= j && j < clusterStart.length}
- * @throws IllegalArgumentException if
- * {@code clusterStart[j] < clusterEnd[j-1]} for some {@code j} satisfying
- * {@code 1 <= j && j < clusterStart.length}
- * @throws IllegalArgumentException if {@code nThreads < 0}
+ * {@code (segStart[j] < 0 || segStart[j] >= segEnd[j]
+ * || segEnd[j] >= refHapPairs.nMarkers())} for some {@code j} satisfying
+ * {@code 0 <= j && j < segStart.length}
* @throws NullPointerException if
- * {@code refHapPairs == null || clusterStart == null || clusterEnd == null}
+ * {@code refHapPairs == null || segStart == null || segEnd == null}
*/
- public RefHapSegs(SampleHapPairs refHapPairs, int[] clusterStart,
- int[] clusterEnd, int nThreads) {
- if (nThreads <=0) {
- throw new IllegalArgumentException(String.valueOf(nThreads));
- }
+ public RefHapSegs(SampleHapPairs refHapPairs, int[] segStart, int[] segEnd) {
int nMarkers = refHapPairs.nMarkers();
- checkClusters(clusterStart, clusterEnd, nMarkers);
- this.clusterStart = clusterStart.clone();
- this.clusterEnd = clusterEnd.clone();
+ checkClusters(segStart, segEnd, nMarkers);
+ this.segStart = segStart.clone();
+ this.segEnd = segEnd.clone();
this.refHapPairs = refHapPairs;
- this.refHapSegs = IntStream.rangeClosed(0, this.clusterStart.length)
+ this.refHapSegs = IntStream.range(0, this.segStart.length)
.parallel()
- .mapToObj(j -> intPair(j, this.clusterStart, this.clusterEnd,
- nMarkers))
- .map(ip -> new RefHapSeg(refHapPairs, ip.first(), ip.second()))
+ .mapToObj(j -> new RefHapSeg(refHapPairs, segStart[j], segEnd[j]))
.toArray(RefHapSeg[]::new);
}
@@ -87,29 +71,13 @@ public class RefHapSegs {
if (starts.length != ends.length) {
throw new IllegalArgumentException("inconsistent data");
}
- if (starts.length > 0 && starts[0] < 0) {
- throw new IllegalArgumentException("inconsistent data");
- }
- if (ends.length > 0 && ends[ends.length - 1] > nMarkers) {
- throw new IllegalArgumentException("inconsistent data");
- }
for (int j=0; j<starts.length; ++j) {
- if (starts[j] >= ends[j]) {
- throw new IllegalArgumentException("inconsistent data");
- }
- if (j>0 && ends[j-1] > starts[j]) {
+ if (starts[j] < 0 || starts[j] >= ends[j] || ends[j] > nMarkers) {
throw new IllegalArgumentException("inconsistent data");
}
}
}
- private static IntPair intPair(int index, int[] starts, int[] ends,
- int nMarkers) {
- int start = (index == 0) ? 0 : starts[index - 1];
- int end = (index == ends.length) ? nMarkers : ends[index];
- return new IntPair(start, end);
- }
-
/**
* Returns the reference haplotype pairs.
* @return the reference haplotype pairs
@@ -186,36 +154,36 @@ public class RefHapSegs {
}
/**
+ * Returns the number of marker segments..
+ * @return the number of marker segments
+ */
+ public int nSegs() {
+ return segStart.length;
+ }
+
+ /**
* Returns the index of the first marker (inclusive) in the specified
- * marker cluster.
- * @param cluster an index of a marker cluster
+ * marker segment.
+ * @param segment an index of a marker segment
* @return the index of the first marker (inclusive) in the specified
- * marker cluster
+ * marker segment
* @throws IndexOutOfBoundsException if
- * {@code cluster < 0 || cluster >= this.nClusters}
+ * {@code segment < 0 || segment >= this.nSegs()}
*/
- public int clusterStart(int cluster) {
- return clusterStart[cluster];
+ public int segStart(int segment) {
+ return segStart[segment];
}
/**
* Returns the index of the last marker (exclusive) in the specified
- * marker cluster.
- * @param cluster an index of a marker cluster
+ * marker segment.
+ * @param segment an index of a marker segment
* @return the index of the last marker (exclusive) in the specified
- * marker cluster
+ * marker segment
* @throws IndexOutOfBoundsException if
- * {@code cluster < 0 || cluster >= this.nClusters()}
- */
- public int clusterEnd(int cluster) {
- return clusterEnd[cluster];
- }
-
- /**
- * Returns the number of marker clusters.
- * @return the number of marker clusters
+ * {@code segment 0 || segment >= this.nSegs()}
*/
- public int nClusters() {
- return clusterStart.length;
+ public int segEnd(int segment) {
+ return segEnd[segment];
}
}
diff --git a/vcf/AllData.java b/vcf/AllData.java
index bf190b2..b5f5129 100644
--- a/vcf/AllData.java
+++ b/vcf/AllData.java
@@ -146,9 +146,9 @@ public class AllData implements Data {
}
@Override
- public void advanceWindow(int requestedOverlap, int windowSize) {
+ public void advanceWindow(int overlap, int windowSize) {
Samples refSamples = refWindow.samples();
- refData = refWindow.advanceWindow(requestedOverlap, windowSize);
+ refData = refWindow.advanceWindow(overlap, windowSize);
refEmissions = new RefGL(refSamples, refData);
refSampleHapPairs = new RefHapPairs(refEmissions.markers(), refSamples, refData);
targetData = targetWindow.advanceWindow(refEmissions.markers());
diff --git a/vcf/Data.java b/vcf/Data.java
index 566103e..03e3299 100644
--- a/vcf/Data.java
+++ b/vcf/Data.java
@@ -49,16 +49,12 @@ public interface Data extends Closeable {
/**
* Advances the sliding window of VCF records, and returns the advanced
- * window. The size of the advanced window and the number of markers
- * of overlap between the marker window immediately before method
- * invocation and the marker window immediately after method invocation
- * may differ from the requested values. If the advanced window size or
- * overlap is less than the requested value, the actual value will be
- * as large as possible. If {@code this.lastWindowOnChrom() == true}
- * before method invocation, then there will be no overlap between the
- * windows.
+ * window. The size of the advanced window may differ from the requested
+ * value. If the advanced window size
+ * is less than the requested value, the actual value will be
+ * as large as possible.
*
- * @param overlap the requested number of markers of overlap
+ * @param overlap the number of markers of overlap
* @param windowSize the requested number of the markers in the window
* immediately after the method returns
*
@@ -66,6 +62,10 @@ public interface Data extends Closeable {
* is detected
* @throws IllegalArgumentException if
* {@code overlap < 0 || overlap >= windowSize}
+ * @throws IllegalArgumentException if
+ * {@code overlap > this.nMarkers()} at the time of method invocation
+ * @throws IllegalArgumentException if
+ * {@code overlap > 0 && this.lastWindowOnChromosome() == true}
* @throws IllegalStateException if
* {@code this.canAdvanceWindow() == false}
*/
diff --git a/vcf/R2Estimator.java b/vcf/R2Estimator.java
new file mode 100644
index 0000000..e52459c
--- /dev/null
+++ b/vcf/R2Estimator.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2016 Brian L. Browning
+ *
+ * This file is part of Beagle
+ *
+ * Beagle is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Beagle is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package vcf;
+
+/**
+ * <p>Class {@code R2Estimator} estimates the correlation between the
+ * estimated allele dose and true allele dose for a set of genotypes.
+ * </p>
+ * <p>Instances of class {@code R2Estimator} are not thread-safe.
+ * </p>
+ *
+ * @author Brian L. Browning {@code <browning at uw.edu>}
+ */
+public final class R2Estimator {
+
+ private static final double MIN_R2_DEN = 1e-8;
+
+ private int nGenotypes = 0;
+ private double sumCall = 0;
+ private double sumSquareCall = 0;
+ private double sumExpected = 0;
+ private double sumExpectedSquare = 0;
+ private double sumSquareExpected= 0;
+ private double sumCallExpected = 0;
+
+ /**
+ * Constructs a new {@code R2Estimator} instance.
+ */
+ public R2Estimator() {
+ }
+
+ /**
+ * Clears all genotype data and sets the number of genotype with
+ * allele dose data to 0.
+ */
+ public void clear() {
+ this.nGenotypes = 0;
+ this.sumCall = 0.0;
+ this.sumSquareCall = 0.0;
+ this.sumExpected = 0.0;
+ this.sumExpectedSquare = 0.0;
+ this.sumSquareExpected = 0.0;
+ this.sumCallExpected = 0.0;
+ }
+
+ /**
+ * Returns the current number of genotypes with allele dose data.
+ * @return the current number of genotypes with allele dose data
+ */
+ public int nGenotypes() {
+ return nGenotypes;
+ }
+
+ /**
+ * Adds the specified allele dose probabilities for a genotype to the stored
+ * allele dose data.
+ * @param doseProbs an array of length 3 whose {@code j}-th element
+ * is the probability that the genotype contains {@code j} non-reference
+ * alleles.
+ * @throws IllegalArgumentException if {@code doseProbs.length != 3}
+ * @throws IllegalArgumentException if any element of {@code doseProbs} is
+ * less than 0
+ * @throws IllegalArgumentException if the sum of the elements in
+ * {@code doseProbs} differs from 1.0 by more than {@code 1e-5}
+ * @throws NullPointerException if {@code doseProbs == null}
+ */
+ public void addSampleData(double[] doseProbs) {
+ if (doseProbs.length != 3) {
+ throw new IllegalArgumentException(String.valueOf(doseProbs));
+ }
+ ++nGenotypes;
+ int call = checkDataAndReturnMaxIndex(doseProbs);
+ double exp = (doseProbs[1] + 2*doseProbs[2]);
+ double expSquare = (doseProbs[1] + 4*doseProbs[2]);
+ sumCall += call;
+ sumSquareCall += call*call;
+ sumExpected += exp;
+ sumExpectedSquare += expSquare;
+ sumSquareExpected += (exp*exp);
+ sumCallExpected += (call*exp);
+ }
+
+ private static int checkDataAndReturnMaxIndex(double[] probs) {
+ int maxIndex = 0;
+ double sum = 0.0;
+ for (int j=0; j<probs.length; ++j) {
+ if (probs[j] < 0) {
+ throw new IllegalArgumentException(String.valueOf(probs[j]));
+ }
+ sum += probs[j];
+ if (probs[j] > probs[maxIndex]) {
+ maxIndex = j;
+ }
+ }
+ if (Math.abs(sum - 1.0) > 1e-5) {
+ throw new IllegalArgumentException(String.valueOf(sum));
+ }
+ return maxIndex;
+ }
+
+ /**
+ * Returns the estimated squared correlation between the most probable
+ * ALT allele dose and the true ALT allele dose for the current
+ * genotype data.
+ * Returns 0 if the marker is monomorphic or if the most probable ALT
+ * allele dose is monomorphic.
+ *
+ * @return the estimated squared correlation between the most likely
+ * allele dose and the true allele dose
+ */
+ public double allelicR2() {
+ double f = 1.0/nGenotypes;
+ double cov = sumCallExpected - (sumCall * sumExpected * f);
+ double varBest = sumSquareCall - (sumCall * sumCall * f);
+ double varExp = sumExpectedSquare - (sumExpected * sumExpected * f);
+ double den = varBest*varExp;
+ return (den < MIN_R2_DEN) ? 0.0f : (cov*cov/den);
+ }
+
+ /**
+ * Returns the estimated squared correlation between the expected
+ * ALT allele dose and the true ALT allele dose for the current
+ * genotype data. Returns 0 if the marker is monomorphic.
+ *
+ * @return the estimated squared correlation between the expected ALT
+ * allele dose and the true ALT allele dose
+ */
+ public double doseR2() {
+ double f = 1.0/nGenotypes;
+ double num = sumSquareExpected - (sumExpected * sumExpected * f);
+ double den = sumExpectedSquare - (sumExpected * sumExpected * f);
+ if (num < 0.0) {
+ num = 0.0;
+ }
+ return (den < MIN_R2_DEN) ? 0.0f : (num / den);
+ }
+}
diff --git a/vcf/VcfRecBuilder.java b/vcf/VcfRecBuilder.java
new file mode 100644
index 0000000..e2356cc
--- /dev/null
+++ b/vcf/VcfRecBuilder.java
@@ -0,0 +1,406 @@
+/*
+ * Copyright (C) 2016 Brian L. Browning
+ *
+ * This file is part of Beagle
+ *
+ * Beagle is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Beagle is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package vcf;
+
+import blbutil.Const;
+import java.io.PrintWriter;
+import java.math.BigDecimal;
+import java.math.MathContext;
+import java.text.DecimalFormat;
+import java.util.Arrays;
+
+/**
+ * <p>Class {@code VcfRecBuilder} contains methods for constructing
+ * and printing a VCF record in VCF 4.2 format. The FORMAT field data
+ * for each sample is added sequentially to the record via the
+ * {@code addSampleData()} method.
+ *
+ * </p>
+ * <p>Instances of class {@code VcfRecBuilder} are not thread-safe.
+ * </p>
+ *
+ * @author Brian L. Browning {@code <browning at uw.edu>}
+ */
+public final class VcfRecBuilder {
+
+ /**
+ * The default initial size for the string buffer, which is 50
+ * characters.
+ */
+ public static final int DEFAULT_INIT_SIZE = 50;
+
+ private static final MathContext mc2 = new MathContext(2);
+ private static final BigDecimal ONE = new BigDecimal(1.0);
+ private static final String[] dsProbs = dsProbs();
+ private static final String[] r2Probs = r2Probs(dsProbs);
+
+ private final StringBuilder sb;
+ private final R2Estimator r2Est;
+ private final double[] gt3Probs;
+
+ private Marker marker;
+ private boolean printDS;
+ private boolean printGP;
+ private int nAlleles;
+ private int nGenotypes;
+ private int allele1;
+ private int allele2;
+ private double[] gtProbs;
+ private double[] dose;
+ private double[] cumAlleleProbs;
+
+ private static String[] dsProbs() {
+ DecimalFormat df = new DecimalFormat("#.##");
+ String[] probs = new String[201];
+ for (int j=0;j<probs.length; ++j) {
+ probs[j] = df.format(j/100.0);
+ }
+ return probs;
+ }
+
+ private static String[] r2Probs(String[] dsProbs) {
+ DecimalFormat df = new DecimalFormat("0.00");
+ String[] probs = Arrays.copyOf(dsProbs, 101);
+ for (int j=0;j<probs.length; ++j) {
+ if (probs[j].length()!=4) {
+ probs[j] = df.format(j/100.0);
+ }
+ }
+ return probs;
+ }
+
+ /**
+ * Constructs a new {@code VcfRecBuilder} instance with initial buffer
+ * size equal to {@code VcfRecBuilder.DEFAULT_INIT_SIZE}.
+ */
+ public VcfRecBuilder() {
+ this(DEFAULT_INIT_SIZE);
+ }
+
+ /**
+ * Constructs a new {@code VcfRecBuilder} instance with the specified
+ * initial buffer size.
+ *
+ * @param initSize the initial buffer size
+ * @throws NegativeArraySizeException if {@code initCapacity < 0}
+ */
+ public VcfRecBuilder(int initSize) {
+ this.sb = new StringBuilder(initSize);
+ this.r2Est = new R2Estimator();
+ this.gt3Probs = new double[3];
+ }
+
+ /**
+ * Clears existing data, and sets the current marker to the specified
+ * marker. If the FORMAT field contains a DS or GP subfield,
+ * the INFO field will include the AR2 (allele r2), DR2 (dose r2), and
+ * AF (ALT allele frequency) subfields.
+ * @param marker the marker to which data will be added
+ * @param printDS {@code true} if the FORMAT field in the VCF record for
+ * this marker will include a DS subfield, and {@code false} otherwise
+ * @param printGP {@code true} if the FORMAT field in the VCF record for
+ * this marker will include a GP subfield, and {@code false} otherwise
+ * @throws NullPointerException if {@code marker == null}
+ */
+ public void reset(Marker marker, boolean printDS, boolean printGP) {
+ this.sb.setLength(0);
+ this.r2Est.clear();
+ this.marker = marker;
+ this.printDS = printDS;
+ this.printGP = printGP;
+ this.allele1 = -1;
+ this.allele2 = -1;
+ this.nAlleles = marker.nAlleles();
+ this.nGenotypes = marker.nGenotypes();
+ this.gtProbs = new double[marker.nGenotypes()];
+ this.cumAlleleProbs = new double[marker.nAlleles()];
+ this.dose = new double[marker.nAlleles()];
+ }
+
+ /**
+ * Returns the current marker. Returns {@code null} if
+ * {@code this.reset()} has not been previously invoked.
+ * @return the current marker.
+ */
+ public Marker marker() {
+ return marker;
+ }
+
+ /**
+ * Returns {@code true} if the FORMAT field in the VCF record for
+ * this marker includes a DS subfield, and {@code false} otherwise
+ * @return {@code true} if the FORMAT field in the VCF record for
+ * this marker includes a DS subfield
+ */
+ public boolean printDS() {
+ return printDS;
+ }
+
+ /**
+ * Returns {@code true} if the FORMAT field in the VCF record for
+ * this marker includes a GP subfield, and {@code false} otherwise
+ * @return {@code true} if the FORMAT field in the VCF record for
+ * this marker includes a GP subfield
+ */
+ public boolean printGP() {
+ return printGP;
+ }
+
+ /**
+ * Adds the FORMAT field for a sample to the VCF record for the current
+ * marker.
+ * @param gtProbs the posterior genotype probabilities
+ * @throws IllegalArgumentException if
+ * {@code gtProbs.length != this.marker().nGenotypes()}
+ * @throws IllegalStateException if {@code this.marker() == null}
+ * @throws NullPointerException if {@code gtProbs == null}
+ */
+ public void addSampleData(double[] gtProbs) {
+ initializeFieldsAndCheckParameters(gtProbs);
+ this.gtProbs[0] = gtProbs[0];
+ gt3Probs[0] = gtProbs[0];
+ double maxProb = gtProbs[0];
+ double sum = gtProbs[0];
+ int gt = 1;
+ for (int a2=1; a2<nAlleles; ++a2) {
+ for (int a1=0; a1<=a2; ++a1) {
+ double gtProb = gtProbs[gt];
+ if (gtProb < 0) {
+ throw new IllegalArgumentException(String.valueOf(gtProb));
+ }
+ this.gtProbs[gt] = gtProb;
+ sum += gtProb;
+ dose[a1] += gtProb;
+ dose[a2] += gtProb;
+ gt3Probs[(a1==0) ? 1 : 2] += gtProb;
+ if (gtProb > maxProb) {
+ allele1 = a1;
+ allele2 = a2;
+ maxProb = gtProb;
+ }
+ ++gt;
+ }
+ }
+ if (Math.abs(sum - 1.0) > 1e-6) {
+ throw new IllegalArgumentException(String.valueOf(sum));
+ }
+ addToCumAlleleProbs(dose);
+ r2Est.addSampleData(gt3Probs);
+ appendFormatData(false);
+ }
+
+ private void initializeFieldsAndCheckParameters(double[] gtProbs) {
+ if (marker==null) {
+ throw new IllegalStateException();
+ }
+ if (gtProbs.length != nGenotypes) {
+ throw new IllegalArgumentException(String.valueOf(gtProbs.length));
+ }
+ Arrays.fill(gt3Probs, 0.0);
+ Arrays.fill(dose, 0.0);
+ allele1 = 0;
+ allele2 = 0;
+ }
+
+ /**
+ * Adds the FORMAT field for a sample to the VCF record for the current
+ * marker.
+ * @param alProbs1 the posterior allele probabilities for the individual's
+ * first allele
+ * @param alProbs2 the posterior allele probabilities for the individual's
+ * second allele
+ * @throws IllegalArgumentException if
+ * {@code alProbs1.length != this.marker().nAlleles()}
+ * @throws IllegalArgumentException if
+ * {@code alProbs2.length != this.marker().nAlleles()}
+ * @throws IllegalStateException if {@code this.marker() == null}
+ * @throws NullPointerException if
+ * {@code alProbs1 == null || alProbs2 == null}
+ */
+ public void addSampleData(double[] alProbs1, double[] alProbs2) {
+ if (marker==null) {
+ throw new IllegalStateException();
+ }
+ allele1 = maxIndexAndParameterCheck(alProbs1);
+ allele2 = maxIndexAndParameterCheck(alProbs2);
+ Arrays.fill(gt3Probs, 0.0);
+ dose[0] = alProbs1[0] + alProbs2[0];
+ gtProbs[0] = alProbs1[0] * alProbs2[0];
+ gt3Probs[0] = gtProbs[0];
+ int gt = 1;
+ for (int a2=1; a2<alProbs1.length; ++a2) {
+ dose[a2] = alProbs1[a2] + alProbs2[a2];
+ for (int a1=0; a1<=a2; ++a1) {
+ double gtProb = alProbs1[a1]*alProbs2[a2];
+ if (a1!=a2) {
+ gtProb += alProbs1[a2]*alProbs2[a1];
+ }
+ gtProbs[gt++] = gtProb;
+ gt3Probs[(a1==0) ? 1 : 2] += gtProb;
+ }
+ }
+ addToCumAlleleProbs(dose);
+ r2Est.addSampleData(gt3Probs);
+ boolean isPhased = true;
+ appendFormatData(isPhased);
+ }
+
+ private int maxIndexAndParameterCheck(double[] da) {
+ if (da.length != nAlleles) {
+ throw new IllegalArgumentException(String.valueOf(da.length));
+ }
+ int maxIndex = 0;
+ double sum = 0;
+ for (int j=0; j<da.length; ++j) {
+ if (da[j] < 0) {
+ throw new IllegalArgumentException(String.valueOf(da[j]));
+ }
+ sum += da[j];
+ if (da[j] > da[maxIndex]) {
+ maxIndex = j;
+ }
+ }
+ if (Math.abs(sum - 1.0) > 1e-6) {
+ throw new IllegalArgumentException(String.valueOf(sum));
+ }
+ return maxIndex;
+ }
+
+ private void addToCumAlleleProbs(double[] dose) {
+ for (int j=0; j<dose.length; ++j) {
+ cumAlleleProbs[j] += dose[j];
+ }
+ }
+
+ private void appendFormatData(boolean isPhased) {
+ sb.append(Const.tab);
+ sb.append(allele1);
+ sb.append(isPhased ? Const.phasedSep : Const.unphasedSep);
+ sb.append(allele2);
+ if (printDS) {
+ for (int j=1; j<nAlleles; ++j) {
+ sb.append( (j==1) ? Const.colon : Const.comma );
+ sb.append(dsProbs[(int) Math.rint(100*dose[j])]);
+ }
+ }
+ if (printGP) {
+ for (int j=0; j<gtProbs.length; ++j) {
+ sb.append(j==0? Const.colon : Const.comma);
+ sb.append(dsProbs[(int) Math.rint(100*gtProbs[j])]);
+ }
+ }
+ }
+
+ /**
+ * Prints the current VCF record for the current marker to the specified
+ * {@code PrintWriter}. If the FORMAT field contains a DS or GP subfield,
+ * the INFO field will include the AR2 (allele r2), DR2 (dose r2), and
+ * AF (ALT allele frequency) subfields. Invocation of this method has
+ * no effect if {@code this.reset()} has not previously been invoked.
+ * @param out the {@code PrintWriter} to which the VCF record will be
+ * printed
+ * @param isImputed {@code true} if the printed VCF record will
+ * have an IMP flag in the INFO field and {@code false} otherwise
+ * @throws NullPointerException if {@code out == null}
+ */
+ public void writeRec(PrintWriter out, boolean isImputed) {
+ if (marker!=null) {
+ printMarker(marker, out);
+ out.print(Const.tab);
+ out.print(Const.MISSING_DATA_CHAR); // QUAL
+ out.print(Const.tab);
+ out.print("PASS"); // FILTER
+ out.print(Const.tab);
+ printInfo(out, isImputed); // INFO
+ out.print(Const.tab);
+ out.print(format(printDS, printGP)); // FORMAT
+ out.println(sb);
+ }
+ }
+
+ private void printInfo(PrintWriter out, boolean isImputed) {
+ if (printDS || printGP) {
+ out.print("AR2=");
+ out.print(r2Probs[(int) Math.rint(100*r2Est.allelicR2())]);
+ out.print(";DR2=");
+ out.print(r2Probs[(int) Math.rint(100*r2Est.doseR2())]);
+ for (int j=1; j<nAlleles; ++j) {
+ out.print( (j==1) ? ";AF=" : Const.comma);
+ out.print(formatProb(cumAlleleProbs[j]/(2*r2Est.nGenotypes())));
+ }
+ if (isImputed) {
+ out.print(";IMP");
+ }
+ }
+ else {
+ out.print(Const.MISSING_DATA_CHAR);
+ }
+ }
+
+ private static String format(boolean printDS, boolean printGP) {
+ if (printDS) {
+ return printGP ? "GT:DS:GP" : "GT:DS";
+ }
+ else {
+ return "GT";
+ }
+ }
+
+ private static void printMarker(Marker marker, PrintWriter out) {
+ out.print(marker.chrom());
+ out.print(Const.tab);
+ out.print(marker.pos());
+ int nIds = marker.nIds();
+ if (nIds==0) {
+ out.print(Const.tab);
+ out.print(Const.MISSING_DATA_CHAR);
+ }
+ else {
+ for (int j=0; j<nIds; ++j) {
+ out.print(j==0 ? Const.tab : Const.semicolon);
+ out.print(marker.id(j));
+ }
+ }
+ int nAlleles = marker.nAlleles();
+ if (nAlleles==1) {
+ out.print(Const.tab);
+ out.print(marker.allele(0));
+ out.print(Const.tab);
+ out.print(Const.MISSING_DATA_CHAR);
+ }
+ else {
+ for (int j=0; j<nAlleles; ++j) {
+ out.print(j<2 ? Const.tab : Const.comma);
+ out.print(marker.allele(j));
+ }
+ }
+ }
+
+ private static String formatProb(double p) {
+ if (p>=0 && p <= 0.5) {
+ return new BigDecimal(p).round(mc2).toString();
+ }
+ else if (p <= 1.0) {
+ return new BigDecimal(p-1.0).round(mc2).add(ONE).toString();
+ }
+ else {
+ throw new IllegalArgumentException(String.valueOf(p));
+ }
+ }
+}
diff --git a/vcf/VcfWindow.java b/vcf/VcfWindow.java
index 8fc44f5..7f61c7e 100644
--- a/vcf/VcfWindow.java
+++ b/vcf/VcfWindow.java
@@ -95,7 +95,7 @@ public class VcfWindow implements Closeable {
* there will be no overlap between the advanced window and the previous
* window.
*
- * @param overlap the requested number of markers of overlap
+ * @param overlap the number of markers of overlap
* @param windowSize the requested number of the markers in the window
* immediately after the method returns
* @return the advanced window of VCF records
@@ -104,6 +104,10 @@ public class VcfWindow implements Closeable {
* a VCF record
* @throws IllegalArgumentException if
* {@code overlap < 0 || overlap >= windowSize}
+ * @throws IllegalArgumentException if {@code overlap > this.size()}
+ * at the time of method invocation
+ * @throws IllegalArgumentException if
+ * {@code overlap > 0 && this.lastWindowOnChromosome() == true}
* @throws IllegalStateException if
* {@code this.canAdvanceWindow() == false}
*/
@@ -112,8 +116,7 @@ public class VcfWindow implements Closeable {
throw new IllegalStateException("canAdvanceWindow()==false");
}
checkParameters(overlap, windowSize);
- overlap = getActualOverlap(overlap);
- List<VcfEmission> newWindow = new ArrayList<>(windowSize);
+ List<VcfEmission> newWindow = new ArrayList<>(overlap + 1000);
newWindow.addAll(window.subList(window.size() - overlap, window.size()));
int currentChromIndex = currentChromIndex(newWindow);
@@ -136,86 +139,17 @@ public class VcfWindow implements Closeable {
return window.toArray(new VcfEmission[0]);
}
- /**
- * Advances the sliding window of VCF records, and returns the advanced
- * window as a {@code VcfEmission[]} object. The size of the advanced
- * window and the number of markers of overlap between the marker window
- * immediately before method invocation and the marker window immediately
- * after method invocation may differ from the requested values. If the
- * distance the window is advanced or the overlap is less than the requested
- * value, the actual distance or overlap will be as large as possible. If
- * {@code this.lastWindowOnChrom() == true}
- * before method invocation, then there will be no overlap between the
- * advanced window and the previous window
- *
- * @param overlap the requested number of markers of overlap
- * @param cM the requested distance in cM to advance the window
- * @param map the genetic map
- * @return the advanced window of VCF records
- *
- * @throws IllegalArgumentException if a format error is detected in
- * a VCF record
- * @throws IllegalArgumentException if {@code overlap < 0 || cM <= 0}
- * @throws IllegalStateException if
- * {@code this.canAdvanceWindow() == false}
- */
- public VcfEmission[] advanceWindow(int overlap, double cM, GeneticMap map) {
- if (canAdvanceWindow()==false) {
- throw new IllegalStateException("canAdvanceWindow()==false");
- }
- if (overlap < 0) {
- throw new IllegalArgumentException(String.valueOf(overlap));
- }
- if (cM < 0) {
- throw new IllegalArgumentException(String.valueOf(cM));
- }
-
- overlap = getActualOverlap(overlap);
- List<VcfEmission> newWindow = new ArrayList<>(overlap + 1000);
-
- newWindow.addAll(window.subList(window.size() - overlap, window.size()));
- int currentChromIndex = currentChromIndex(newWindow);
- double endMapPos = startMapPos(newWindow, map) + cM;
- while (next != null
- && next.marker().chromIndex()==currentChromIndex
- && map.genPos(next.marker()) < endMapPos) {
- newWindow.add(next);
- next = it.hasNext() ? it.next() : null;
- }
- // add all markers at the same marker position
- VcfEmission last = newWindow.get(newWindow.size()-1);
- while (next!=null && samePosition(last, next)) {
- newWindow.add(next);
- next = it.hasNext() ? it.next() : null;
- }
- this.overlap = overlap;
- this.window.clear();
- this.window.addAll(newWindow);
- this.cumMarkerCnt += (window.size() - overlap);
- return window.toArray(new VcfEmission[0]);
- }
-
private void checkParameters(int overlap, int windowSize) {
if (overlap < 0 || overlap >= windowSize) {
String s = "overlap=" + overlap + "windowSize=" + windowSize;
throw new IllegalArgumentException(s);
}
- }
-
- private int getActualOverlap(int overlap) {
- if (window.isEmpty() || lastWindowOnChrom()) {
- return 0;
- }
- int n = window.size();
- if (overlap > n) {
- overlap = n;
+ if (overlap > window.size()) {
+ throw new IllegalArgumentException(String.valueOf(overlap));
}
- while (overlap > 0 && overlap < n
- && window.get(n - overlap).marker().pos()
- == window.get(n - overlap - 1).marker().pos()) {
- ++overlap;
+ if (overlap > 0 && lastWindowOnChrom()) {
+ throw new IllegalArgumentException(String.valueOf(overlap));
}
- return overlap;
}
private int currentChromIndex(List<VcfEmission> currentWindow) {
@@ -230,19 +164,6 @@ public class VcfWindow implements Closeable {
}
}
- private double startMapPos(List<VcfEmission> currentWindow, GeneticMap map) {
- if (currentWindow.isEmpty()==false) {
- Marker m = currentWindow.get(currentWindow.size() - 1).marker();
- return map.genPos(m);
- }
- else if (next!=null) {
- return map.genPos(next.marker());
- }
- else {
- return 0;
- }
- }
-
private boolean samePosition(VcfEmission a, VcfEmission b) {
return a.marker().chromIndex()==b.marker().chromIndex()
&& a.marker().pos()==b.marker().pos();
@@ -275,6 +196,14 @@ public class VcfWindow implements Closeable {
}
/**
+ * Returns the number of VCF records in the current window.
+ * @return the number of VCF records in the current window
+ */
+ public int size() {
+ return window.size();
+ }
+
+ /**
* Returns the number of VCF records in the overlap between the current
* window and the previous window. Returns 0 if the current window
* is the first window.
diff --git a/vcf/VcfWriter.java b/vcf/VcfWriter.java
index 5c6fe53..31a1639 100644
--- a/vcf/VcfWriter.java
+++ b/vcf/VcfWriter.java
@@ -1,32 +1,11 @@
-/*
- * Copyright (C) 2014 Brian L. Browning
- *
- * This file is part of Beagle
- *
- * Beagle is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Beagle is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
+/*__LICENSE__*/
package vcf;
import blbutil.Const;
import java.io.PrintWriter;
-import java.math.BigDecimal;
-import java.math.MathContext;
-import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import main.AlleleProbs;
-import main.ConstrainedAlleleProbs;
import main.GenotypeValues;
/**
@@ -40,13 +19,6 @@ import main.GenotypeValues;
*/
public final class VcfWriter {
- private static final String PASS = "PASS";
- private static final DecimalFormat df2 = new DecimalFormat("#.##");
- private static final DecimalFormat df3 = new DecimalFormat("#.###");
- private static final DecimalFormat df2_fixed = new DecimalFormat("0.00");
- private static final MathContext mc2 = new MathContext(2);
- private static final BigDecimal ONE = new BigDecimal(1.0);
-
private static final String fileformat = "##fileformat=VCFv4.2";
private static final String afInfo = "##INFO=<ID=AF,Number=A,Type=Float,"
@@ -62,7 +34,7 @@ public final class VcfWriter {
private static final String gtFormat = "##FORMAT=<ID=GT,Number=1,Type=String,"
+ "Description=\"Genotype\">";
- private static final String dsFormat = "##FORMAT=<ID=DS,Number=1,Type=Float,"
+ private static final String dsFormat = "##FORMAT=<ID=DS,Number=A,Type=Float,"
+"Description=\"estimated ALT dose [P(RA) + P(AA)]\">";
private static final String glFormat = "##FORMAT=<ID=GL,Number=G,Type=Float,"
+ "Description=\"Log10-scaled Genotype Likelihood\">";
@@ -189,152 +161,137 @@ public final class VcfWriter {
if (start > end) {
throw new IllegalArgumentException("start=" + start + " end=" + end);
}
- for (int marker=start; marker<end; ++marker) {
- printFixedFields(gv, marker, out);
- for (int sample=0, n=gv.nSamples(); sample<n; ++sample) {
- print_GT_DS_GP(gv, marker, sample, out);
- }
- out.println();
- }
- }
-
-
- private static void print_GT_DS_GP(GenotypeValues gv, int marker, int sample,
- PrintWriter out) {
- int nAlleles = gv.marker(marker).nAlleles();
- int nGenotypes = gv.marker(marker).nGenotypes();
- float[] dose = new float[nAlleles];
- int bestA1 = -1;
- int bestA2 = -1;
- int gt = 0;
- float sum = 0f;
- float maxGP = 0f;
- for (int a2=0; a2<nAlleles; ++a2) {
- for (int a1=0; a1<=a2; ++a1) {
- float value = gv.value(marker, sample, gt++);
- if (value > maxGP) {
- bestA1 = a1;
- bestA2 = a2;
- maxGP = value;
+ boolean printDS = true;
+ boolean printGP = true;
+ boolean isImputed = false;
+ VcfRecBuilder vrb = new VcfRecBuilder(12*gv.nSamples());
+ for (int m=start; m<end; ++m) {
+ Marker marker = gv.marker(m);
+ vrb.reset(marker, printDS, printGP);
+ double[] gprobs = new double[marker.nGenotypes()];
+ for (int s=0, n=gv.nSamples(); s<n; ++s) {
+ double sum = 0.0;
+ for (int gt=0; gt<gprobs.length; ++gt) {
+ gprobs[gt] = gv.value(m, s, gt);
+ sum += gprobs[gt];
}
- dose[a1] += value;
- dose[a2] += value;
- sum += value;
+ for (int gt=0; gt<gprobs.length; ++gt) {
+ gprobs[gt] /= sum;
+ }
+ vrb.addSampleData(gprobs);
}
- }
- out.print(Const.tab);
- out.print(bestA1 == -1 ? Const.MISSING_DATA_STRING : bestA1);
- out.print(Const.unphasedSep);
- out.print(bestA2 == -1 ? Const.MISSING_DATA_STRING : bestA2);
- for (int al = 1; al < dose.length; ++al) {
- out.print( (al==1) ? Const.colon : Const.comma);
- out.print(df2.format(dose[al]/sum));
- }
- for (gt=0; gt<nGenotypes; ++gt) {
- out.print(gt==0 ? Const.colon : Const.comma);
- double v = gv.value(marker, sample, gt)/sum;
- out.print(df2.format(v));
+ vrb.writeRec(out, isImputed);
}
}
/**
- * Writes the specified genotype data as VCF records to the specified
+ * Writes the data in alProbs for markers with index between
+ * {@code start} (inclusive) and {@code end} (exclusive) to the specified
* {@code PrintWriter}.
- * @param alProbs the sample haplotype pairs
+ * @param alProbs the estimated haplotype allele probabilities
+ * @param isImputed an array of length {@code alProbs.nMarkers()}
+ * whose {@code j}-th element is {@code true} if the corresponding
+ * marker is imputed, and {@code false} otherwise
* @param start the starting marker index (inclusive)
* @param end the ending marker index (exclusive)
- * @param gprobs {@code true} if the GP field should be printed, and
- * {@code false} otherwise.
- * @param out the {@code PrintWriter} to which VCF records will
- * be written
- *
+ * @param printDS {@code true} if the DS field should be printed, and
+ * {@code false} otherwise
+ * @param printGP {@code true} if the GP field should be printed, and
+ * {@code false} otherwise
+ * @param out the {@code PrintWriter} to which VCF records will be written
+ * @throws IllegalArgumentException if
+ * {@code isImputed.length != alProbs.nMarkers()}
* @throws IndexOutOfBoundsException if
* {@code (start < 0 || start > end || end > alProbs.nMarkers())}
- * @throws NullPointerException if {@code haps == null || out == null}
+ * @throws NullPointerException if
+ * {@code alProbs == null || isImputed == null || out == null}
*/
- public static void appendRecords(ConstrainedAlleleProbs alProbs,
- int start, int end, boolean gprobs, PrintWriter out) {
+ public static void appendRecords(AlleleProbs alProbs, boolean[] isImputed,
+ int start, int end, boolean printDS, boolean printGP,
+ PrintWriter out) {
+ if (isImputed.length != alProbs.nMarkers()) {
+ throw new IllegalArgumentException("inconsistent data");
+ }
if (start > end) {
throw new IllegalArgumentException("start=" + start + " end=" + end);
}
- boolean ds = true;
- String format = format(ds, gprobs);
+ VcfRecBuilder vrb = new VcfRecBuilder(4*alProbs.nSamples());
for (int m=start; m<end; ++m) {
Marker marker = alProbs.marker(m);
- boolean isImputed = alProbs.isImputed(m);
- GprobsStatistics gps = new GprobsStatistics(alProbs, m);
- String info = info(gps, isImputed);
- printFixedFields(marker, info, format, out);
+ vrb.reset(marker, printDS, printGP);
+ double[] a1 = new double[marker.nAlleles()];
+ double[] a2 = new double[marker.nAlleles()];
for (int sample=0, n=alProbs.nSamples(); sample<n; ++sample) {
- printGTandDose(alProbs, m, sample, ds, out);
- if (gprobs) {
- printGP(alProbs, m, sample, out);
+ for (int j=0; j<a1.length; ++j) {
+ a1[j] = alProbs.alProb1(m, sample, j);
+ a2[j] = alProbs.alProb2(m, sample, j);
}
+ vrb.addSampleData(a1, a2);
}
- out.println();
+ vrb.writeRec(out, isImputed[m]);
}
}
/**
- * Writes the specified genotype data as VCF records to the specified
+ * Writes the data in alProbs for markers with index between
+ * {@code start} (inclusive) and {@code end} (exclusive) to the specified
* {@code PrintWriter}.
- * @param alProbs the sample haplotype pairs
+ * @param alProbs the estimated haplotype allele probabilities
+ * @param isImputed an array of length {@code alProbs.nMarkers()}
+ * whose {@code j}-th element is {@code true} if the corresponding
+ * marker is imputed, and {@code false} otherwise
* @param start the starting marker index (inclusive)
* @param end the ending marker index (exclusive)
- * @param out the {@code PrintWriter} to which VCF records will
- * be written
- *
+ * @param printDS {@code true} if the DS field should be printed, and
+ * {@code false} otherwise
+ * @param printGP {@code true} if the GP field should be printed, and
+ * {@code false} otherwise
+ * @param out the {@code PrintWriter} to which VCF records will be written
+ * @throws IllegalArgumentException if
+ * {@code isImputed.length != alProbs.nMarkers()}
* @throws IndexOutOfBoundsException if
* {@code (start < 0 || start > end || end > alProbs.nMarkers())}
- * @throws NullPointerException if {@code haps == null || out == null}
+ * @throws NullPointerException if
+ * {@code alProbs == null || isImputed == null || out == null}
*/
- public static void appendRecords(AlleleProbs alProbs, int start, int end,
- PrintWriter out) {
- boolean ds = false;
- boolean gp = false;
- String info = Const.MISSING_DATA_STRING;
- String format = format(ds, gp);
+ public static void blockedAppendRecords(AlleleProbs alProbs,
+ boolean[] isImputed, int start, int end,
+ boolean printDS, boolean printGP, PrintWriter out) {
+ if (isImputed.length != alProbs.nMarkers()) {
+ throw new IllegalArgumentException("inconsistent data");
+ }
if (start > end) {
throw new IllegalArgumentException("start=" + start + " end=" + end);
}
- for (int m=start; m<end; ++m) {
- Marker marker = alProbs.marker(m);
- printFixedFields(marker, info, format, out);
- for (int sample=0, n=alProbs.nSamples(); sample<n; ++sample) {
- printGTandDose(alProbs, m, sample, ds, out);
- }
- out.println();
- }
- }
-
- private static void printGTandDose(AlleleProbs alProbs, int marker, int
- sample, boolean ds, PrintWriter out) {
- out.print(Const.tab);
- out.print(alProbs.allele1(marker, sample));
- out.append(Const.phasedSep);
- out.print(alProbs.allele2(marker, sample));
- if (ds) {
- int nAlleles = alProbs.marker(marker).nAlleles();
- for (int j = 1; j < nAlleles; ++j) {
- float p1 = alProbs.alProb1(marker, sample, j);
- float p2 = alProbs.alProb2(marker, sample, j);
- out.print( (j==1) ? Const.colon : Const.comma );
- out.print(df2.format(p1 + p2));
- }
+ int nSamples = alProbs.nSamples();
+ int size = 8;
+ int initCapacity = 4*alProbs.nSamples();
+ VcfRecBuilder[] recBuilders = new VcfRecBuilder[size];
+ double[][] a1 = new double[size][];
+ double[][] a2 = new double[size][];
+ for (int j=0; j<recBuilders.length; ++j) {
+ recBuilders[j] = new VcfRecBuilder(initCapacity);
}
- }
-
- private static void printGP(AlleleProbs alProbs, int marker, int sample,
- PrintWriter out) {
- int nAlleles = alProbs.marker(marker).nAlleles();
- for (int a2=0; a2<nAlleles; ++a2) {
- for (int a1=0; a1<=a2; ++a1) {
- out.print((a2 == 0 && a1 == 0) ? Const.colon : Const.comma);
- float gtProb = alProbs.gtProb(marker, sample, a1, a2);
- if (a1 != a2) {
- gtProb += alProbs.gtProb(marker, sample, a2, a1);
+ for (int ii=start; ii<end; ii+=size) {
+ int ni = Math.min(end, ii+size);
+ for (int i=ii; i<ni; ++i) {
+ Marker marker = alProbs.marker(i);
+ recBuilders[i-ii].reset(marker, printDS, printGP);
+ a1[i-ii] = new double[marker.nAlleles()];
+ a2[i-ii] = new double[marker.nAlleles()];
+ for (int jj=0; jj<nSamples; jj+=size) {
+ int nj = Math.min(jj+size, nSamples);
+ for (int j=jj; j<nj; ++j) { // j = sample
+ for (int k=0; k<a1[i-ii].length; ++k) {
+ a1[i-ii][k] = alProbs.alProb1(i, j, k);
+ a2[i-ii][k] = alProbs.alProb2(i, j, k);
+ }
+ recBuilders[i-ii].addSampleData(a1[i-ii], a2[i-ii]);
+ }
}
- out.print(df2.format(gtProb));
+ }
+ for (int i=ii; i<ni; ++i) {
+ recBuilders[i-ii].writeRec(out, isImputed[i]);
}
}
}
@@ -355,88 +312,10 @@ public final class VcfWriter {
out.print(Const.tab);
out.print(Const.MISSING_DATA_CHAR); // QUAL
out.print(Const.tab);
- out.print(PASS); // FILTER
+ out.print("PASS"); // FILTER
out.print(Const.tab);
out.print(Const.MISSING_DATA_CHAR); // INFO
out.print(Const.tab);
out.print("GT");
}
-
- private static void printFixedFields(GenotypeValues gv, int marker,
- PrintWriter out) {
- GprobsStatistics gpm = new GprobsStatistics(gv, marker);
- double[] alleleFreq = gpm.alleleFreq();
- out.print(gv.marker(marker));
- out.print(Const.tab);
- out.print(Const.MISSING_DATA_CHAR); // QUAL
- out.print(Const.tab);
- out.print(PASS); // FILTER
- out.print(Const.tab);
- out.print("AR2="); // INFO
- out.print(df2_fixed.format(gpm.allelicR2()));
- out.print(";DR2=");
- out.print(df2_fixed.format(gpm.doseR2()));
- for (int j=1; j<alleleFreq.length; ++j) {
- out.print( (j==1) ? ";AF=" : Const.comma);
- out.print(formatProb(alleleFreq[j]));
- }
- out.print(Const.tab);
- out.print("GT:DS:GP");
- }
-
- private static String info(GprobsStatistics gps, boolean isImputed) {
- double[] alleleFreq = gps.alleleFreq();
- StringBuilder sb = new StringBuilder(20);
- sb.append("AR2="); // INFO
- sb.append(df2_fixed.format(gps.allelicR2()));
- sb.append(";DR2=");
- sb.append(df2_fixed.format(gps.doseR2()));
- for (int j=1; j<alleleFreq.length; ++j) {
- sb.append( (j==1) ? ";AF=" : Const.comma);
- sb.append(formatProb(alleleFreq[j]));
- }
- if (isImputed) {
- sb.append(";IMP");
- }
- return sb.toString();
- }
-
- private static String format(boolean ds, boolean gp) {
- if (ds) {
- if (gp) {
- return "GT:DS:GP";
- }
- else {
- return "GT:DS";
- }
- }
- else {
- return "GT";
- }
- }
-
- private static void printFixedFields(Marker marker, String info,
- String format, PrintWriter out) {
- out.print(marker);
- out.print(Const.tab);
- out.print(Const.MISSING_DATA_CHAR); // QUAL
- out.print(Const.tab);
- out.print(PASS); // FILTER
- out.print(Const.tab);
- out.print(info);
- out.print(Const.tab);
- out.print(format);
- }
-
- private static String formatProb(double d) {
- if (d>=0 && d <= 0.5) {
- return new BigDecimal(d).round(mc2).toPlainString();
- }
- else if (d <= 1.0) {
- return new BigDecimal(d-1.0).round(mc2).add(ONE).toString();
- }
- else {
- throw new IllegalArgumentException(String.valueOf(d));
- }
- }
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/beagle.git
More information about the debian-med-commit
mailing list