[med-svn] [mach-haplotyper] 06/09: New upstream version 1.0.18
Andreas Tille
tille at debian.org
Sun Dec 10 16:50:29 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository mach-haplotyper.
commit 367f8eaa97d5e4d668a85cfeeb3bae1d5e9ff10b
Author: Andreas Tille <tille at debian.org>
Date: Sun Dec 10 17:48:02 2017 +0100
New upstream version 1.0.18
---
Makefile | 175 +++
debian/README.Debian | 6 -
debian/README.source | 8 -
debian/changelog | 6 -
debian/compat | 1 -
debian/control | 19 -
debian/copyright | 28 -
debian/mach-haplotyper.dirs | 1 -
debian/mach-haplotyper.links | 1 -
debian/mach1.1 | 12 -
debian/manpages | 1 -
debian/patches/Makefile.patch | 43 -
debian/patches/series | 1 -
debian/rules | 17 -
debian/source/format | 1 -
debian/upstream/metadata | 11 -
debian/watch | 2 -
libsrc/BasicHash.cpp | 164 +++
libsrc/BasicHash.h | 85 ++
libsrc/Constant.h | 61 +
libsrc/Error.cpp | 69 +
libsrc/Error.h | 35 +
libsrc/FortranFormat.cpp | 373 ++++++
libsrc/FortranFormat.h | 100 ++
libsrc/GenotypeLists.cpp | 486 +++++++
libsrc/GenotypeLists.h | 61 +
libsrc/Hash.cpp | 130 ++
libsrc/Hash.h | 27 +
libsrc/InputFile.cpp | 63 +
libsrc/InputFile.h | 158 +++
libsrc/IntArray.cpp | 389 ++++++
libsrc/IntArray.h | 153 +++
libsrc/Kinship.cpp | 92 ++
libsrc/Kinship.h | 43 +
libsrc/KinshipX.cpp | 63 +
libsrc/KinshipX.h | 41 +
libsrc/LongArray.cpp | 148 +++
libsrc/LongArray.h | 78 ++
libsrc/LongHash.cpp | 20 +
libsrc/LongHash.h | 247 ++++
libsrc/LongInt.h | 204 +++
libsrc/LongLongCounter.cpp | 60 +
libsrc/LongLongCounter.h | 36 +
libsrc/MapFunction.cpp | 44 +
libsrc/MapFunction.h | 25 +
libsrc/MathConstant.h | 52 +
libsrc/MathMatrix.cpp | 711 ++++++++++
libsrc/MathMatrix.h | 194 +++
libsrc/MathStats.cpp | 494 +++++++
libsrc/MathStats.h | 77 ++
libsrc/MathVector.cpp | 652 +++++++++
libsrc/MathVector.h | 207 +++
libsrc/MemoryAllocators.cpp | 260 ++++
libsrc/MemoryAllocators.h | 103 ++
libsrc/MemoryInfo.cpp | 38 +
libsrc/MemoryInfo.h | 26 +
libsrc/MiniDeflate.cpp | 349 +++++
libsrc/MiniDeflate.h | 103 ++
libsrc/Parameters.cpp | 735 +++++++++++
libsrc/Parameters.h | 292 +++++
libsrc/Pedigree.cpp | 916 +++++++++++++
libsrc/Pedigree.h | 155 +++
libsrc/PedigreeAlleleFreq.cpp | 260 ++++
libsrc/PedigreeAlleleFreq.h | 36 +
libsrc/PedigreeAlleles.h | 143 ++
libsrc/PedigreeDescription.cpp | 826 ++++++++++++
libsrc/PedigreeDescription.h | 82 ++
libsrc/PedigreeFamily.cpp | 294 +++++
libsrc/PedigreeFamily.h | 63 +
libsrc/PedigreeGlobals.cpp | 856 ++++++++++++
libsrc/PedigreeGlobals.h | 175 +++
libsrc/PedigreeLoader.cpp | 605 +++++++++
libsrc/PedigreePerson.cpp | 234 ++++
libsrc/PedigreePerson.h | 133 ++
libsrc/PedigreeTrim.cpp | 188 +++
libsrc/PedigreeTwin.cpp | 182 +++
libsrc/QuickIndex.cpp | 232 ++++
libsrc/QuickIndex.h | 52 +
libsrc/Random.cpp | 407 ++++++
libsrc/Random.h | 133 ++
libsrc/Sort.cpp | 369 ++++++
libsrc/Sort.h | 36 +
libsrc/StringArray.cpp | 325 +++++
libsrc/StringArray.h | 118 ++
libsrc/StringBasics.cpp | 1255 ++++++++++++++++++
libsrc/StringBasics.h | 274 ++++
libsrc/StringHash.cpp | 647 +++++++++
libsrc/StringHash.h | 276 ++++
libsrc/StringMap.cpp | 541 ++++++++
libsrc/StringMap.h | 122 ++
libsrc/TraitTransformations.cpp | 121 ++
libsrc/TraitTransformations.h | 30 +
mach1/AssociationAnalysis.cpp | 201 +++
mach1/AssociationAnalysis.h | 38 +
mach1/CostCalculator.cpp | 136 ++
mach1/CostCalculator.h | 45 +
mach1/DosageCalculator.cpp | 347 +++++
mach1/DosageCalculator.h | 71 +
mach1/ErrorRate.cpp | 65 +
mach1/ErrorRate.h | 40 +
mach1/HaplotypeKey.cpp | 305 +++++
mach1/HaplotypeKey.h | 98 ++
mach1/HaplotypeLoader.cpp | 882 +++++++++++++
mach1/HaplotypeLoader.h | 80 ++
mach1/Haplotyper.cpp | 2755 +++++++++++++++++++++++++++++++++++++++
mach1/Haplotyper.h | 323 +++++
mach1/Main.cpp | 635 +++++++++
mach1/Manners.cpp | 81 ++
mach1/Manners.h | 36 +
mach1/MergeHaplotypes.cpp | 205 +++
mach1/MergeHaplotypes.h | 56 +
mach1/OutputHandlers.cpp | 364 ++++++
mach1/OutputHandlers.h | 56 +
thunder/Main.cpp | 470 +++++++
thunder/ShotgunHaplotyper.cpp | 250 ++++
thunder/ShotgunHaplotyper.h | 44 +
thunder/ShotgunManners.cpp | 68 +
thunder/ShotgunManners.h | 36 +
118 files changed, 25696 insertions(+), 158 deletions(-)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1e404a8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,175 @@
+#
+# MaCH Makefile -- Compiles and installs MACH and accessory applications
+# (c) 2006-2007 Goncalo Abecasis
+#
+
+# current version of the software
+VERSION=1.0.18.c
+
+# default installation directory
+INSTALLDIR=/usr/local/bin
+
+# default C++ compiler
+CXX=g++
+
+# default compilation flags are
+#
+# CFLAGS=-O2 -I./libsrc/ -I./mach1
+#
+CFLAGS=-O2 -static -I./libsrc -I./mach1 -D__ZLIB_AVAILABLE__ -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
+
+# executable file names and locations
+BINDIR = executables
+MACH = $(BINDIR)/mach1
+THUNDER = $(BINDIR)/thunder
+EXECUTABLES = $(MACH) $(THUNDER)
+
+# MACH File Set
+MACHBASE = mach1/AssociationAnalysis mach1/CostCalculator \
+ mach1/DosageCalculator mach1/ErrorRate mach1/Manners \
+ mach1/Haplotyper mach1/HaplotypeKey mach1/HaplotypeLoader \
+ mach1/MergeHaplotypes mach1/OutputHandlers
+MACHHDR = $(MACHBASE:=.h)
+MACHSRC = $(MACHBASE:=.cpp) mach1/Main.cpp
+MACHOBJ = $(MACHSRC:.cpp=.o)
+
+# THUNDER File Set
+THUNDERBASE = mach1/AssociationAnalysis mach1/CostCalculator \
+ mach1/DosageCalculator mach1/ErrorRate \
+ mach1/Haplotyper mach1/HaplotypeKey mach1/HaplotypeLoader \
+ mach1/MergeHaplotypes mach1/OutputHandlers \
+ thunder/ShotgunHaplotyper thunder/ShotgunManners
+THUNDERHDR = $(THUNDERBASE:=.h)
+THUNDERSRC = $(THUNDERBASE:=.cpp) thunder/Main.cpp
+THUNDEROBJ = $(THUNDERSRC:.cpp=.o)
+
+# Utility Library File Set
+LIBFILE = libsrc/lib-goncalo.a
+LIBMAIN = libsrc/BasicHash libsrc/Error libsrc/FortranFormat \
+ libsrc/IntArray libsrc/InputFile \
+ libsrc/GenotypeLists libsrc/Hash \
+ libsrc/LongArray libsrc/LongHash libsrc/LongLongCounter \
+ libsrc/Kinship libsrc/KinshipX libsrc/MapFunction \
+ libsrc/MemoryAllocators libsrc/MemoryInfo \
+ libsrc/MathMatrix libsrc/MathStats \
+ libsrc/MathVector libsrc/MiniDeflate \
+ libsrc/Parameters libsrc/Pedigree libsrc/PedigreeAlleleFreq \
+ libsrc/PedigreeDescription libsrc/PedigreeFamily libsrc/PedigreeGlobals \
+ libsrc/PedigreePerson libsrc/QuickIndex libsrc/Random libsrc/Sort \
+ libsrc/StringArray libsrc/StringBasics libsrc/StringMap \
+ libsrc/StringHash libsrc/TraitTransformations
+LIBPED = libsrc/PedigreeLoader libsrc/PedigreeTwin libsrc/PedigreeTrim
+LIBSRC = $(LIBMAIN:=.cpp) $(LIBPED:=.cpp)
+LIBHDR = $(LIBMAIN:=.h) libsrc/Constant.h \
+ libsrc/MathConstant.h libsrc/PedigreeAlleles.h libsrc/LongInt.h
+LIBOBJ = $(LIBSRC:.cpp=.o)
+
+# private parameters
+FETCHDIR=$(HOME)/code
+DISTRIBDIR=$(HOME)/code/distrib/mach-$(VERSION)
+
+# helpful screen listing available options
+help :
+ @echo "MACH Source Distribution"
+ @echo " "
+ @echo "This Makefile will compile and install MaCH 1.0 on your system"
+ @echo " "
+ @echo " Type... To..."
+ @echo " make help Display this help screen"
+ @echo " make all Compile mach1 and related tools"
+ @echo " make install Install binaries in $(INSTALLDIR)"
+ @echo " make install INSTALLDIR=directory_for_binaries"
+ @echo " Install binaries in directory_for_binaries"
+ @echo " make clean Delete temporary files"
+
+# make everything
+all : $(EXECUTABLES)
+
+$(EXECUTABLES) : $(BINDIR)
+
+# dependencies for executables
+$(MACH) : $(LIBFILE) $(MACHOBJ) $(BINDIR)
+ $(CXX) $(CFLAGS) -o $@ $(MACHOBJ) $(LIBFILE) -lm -lz
+
+$(THUNDER) : $(LIBFILE) $(THUNDEROBJ) $(BINDIR)
+ $(CXX) $(CFLAGS) -o $@ $(THUNDEROBJ) $(LIBFILE) -lm -lz
+
+$(BINDIR) :
+ mkdir $(BINDIR)
+
+$(LIBFILE) : $(LIBOBJ) $(LIBHDR)
+ ar -cr $@ $(LIBOBJ)
+ ranlib $@
+
+# header dependencies: the object files of each set list that set's headers
+# and all library headers as prerequisites, so a change to any of them
+# triggers recompilation
+$(MACHOBJ) : $(MACHHDR) $(LIBHDR)
+
+$(THUNDEROBJ) : $(THUNDERHDR) $(LIBHDR)
+
+# NOTE(review): CLUSTEROBJ and CLUSTERHDR are not defined anywhere in this
+# Makefile, so this rule expands to an empty target list -- presumably a
+# leftover from a tool that is not part of this distribution; confirm
+# before relying on it or removing it
+$(CLUSTEROBJ) : $(CLUSTERHDR) $(LIBHDR)
+
+$(LIBOBJ) : $(LIBHDR)
+
+clean :
+ -rm -f */*.a */*.o $(EXECUTABLES)
+
+install : all $(INSTALLDIR)
+ @echo " "
+ @echo Installing to directory $(INSTALLDIR)
+ @echo To select a different directory, run
+ @echo " "
+ @echo make install INSTALLDIR=your_preferred_dir
+ @echo " "
+ cp $(EXECUTABLES) $(INSTALLDIR)
+
+$(INSTALLDIR) :
+ @echo " "
+ @echo Creating directory $(INSTALLDIR)
+ @echo " "
+ @mkdir $(INSTALLDIR)
+
+new-version :
+ mkdir -p $(DISTRIBDIR) $(DISTRIBDIR)/mach1
+ mkdir -p $(DISTRIBDIR)/libsrc $(DISTRIBDIR)/thunder
+ cp ChangeLog README $(DISTRIBDIR)
+ cp Makefile $(DISTRIBDIR)
+ cp -R examples $(DISTRIBDIR)
+
+fetch :
+ cd $(FETCHDIR) ; cp $(MACHSRC) $(MACHHDR) $(DISTRIBDIR)/mach1
+ cd $(FETCHDIR) ; cp thunder/ShotgunHaplotyper.cpp thunder/ShotgunManners.cpp thunder/Main.cpp $(DISTRIBDIR)/thunder
+ cd $(FETCHDIR) ; cp thunder/ShotgunHaplotyper.h thunder/ShotgunManners.h $(DISTRIBDIR)/thunder
+ cd $(FETCHDIR) ; cp $(LIBSRC) $(LIBHDR) $(DISTRIBDIR)/libsrc
+ cd $(DISTRIBDIR); csh ../stamp MaCH
+
+# suffix rules describing how a single source file is compiled; the
+# suffixes they use are registered by the .SUFFIXES line at the end of
+# this file
+
+# .c files are compiled with the C++ compiler as well
+.c.o :
+ $(CXX) $(CFLAGS) -o $@ -c $*.c
+
+# NOTE(review): appears intended to build name.X.o from name.cpp with
+# __CHROMOSOME_X__ defined; none of the object lists in this Makefile
+# names a .X.o file -- confirm whether this rule is still needed
+.cpp.X.o :
+ $(CXX) $(CFLAGS) -o $@ -c $*.cpp -DVERSION=\"$(VERSION)\" -D__CHROMOSOME_X__
+
+# C++ sources; VERSION is passed to the code as a quoted string macro
+.cpp.o :
+ $(CXX) $(CFLAGS) -o $@ -c $*.cpp -DVERSION=\"$(VERSION)\"
+
+archive : clean
+ mkdir -p mach-$(VERSION)
+ cp -R README Makefile ChangeLog mach-$(VERSION)
+ cp -R mach1 libsrc examples thunder mach-$(VERSION)
+ tar -cvf mach-$(VERSION).tar mach-$(VERSION)
+ gzip -f --best mach-$(VERSION).tar
+ rm -rf mach-$(VERSION)
+
+distrib : $(EXECUTABLES)
+ mkdir -p mach-$(VERSION)
+ cp -R README ChangeLog $(EXECUTABLES) examples mach-$(VERSION)
+ tar -cvf `uname`-mach.tar mach-$(VERSION)
+ gzip -f `uname`-mach.tar
+ rm -rf mach-$(VERSION)
+
+windowszip : $(EXECUTABLES)
+ mkdir -p mach-$(VERSION)
+ cp -R README ChangeLog $(EXECUTABLES) examples mach-$(VERSION)
+ zip -r Windows-mach.zip mach-$(VERSION)
+ rm -rf mach-$(VERSION)
+
+.SUFFIXES : .cpp .c .o .X.o $(SUFFIXES)
+
diff --git a/debian/README.Debian b/debian/README.Debian
deleted file mode 100644
index 3998ffc..0000000
--- a/debian/README.Debian
+++ /dev/null
@@ -1,6 +0,0 @@
-mach-haplotyper for Debian
---------------------------
-
-This package cannot be redistributed.
-
- -- Steffen Moeller <moeller at debian.org> Tue, 19 Feb 2013 14:16:48 +0100
diff --git a/debian/README.source b/debian/README.source
deleted file mode 100644
index ccef36a..0000000
--- a/debian/README.source
+++ /dev/null
@@ -1,8 +0,0 @@
-MACH for Debian
-===============
-
-The package was renamed to mach-haplotyper because of a name conflict. It is what the "H" in "MACH" stands for.
-
-The source tree is not rooted and needs to be repackaged. This needs to be done manually until some good soul adds this to the download instructions.
-
-The Makefile was made understood LDFLAGS, additions to CFLAGS and CPPFLAGS to allow the hardening routine of Debian to kick in.
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 0f21f30..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,6 +0,0 @@
-mach-haplotyper (1.0.18-1) UNRELEASED; urgency=low
-
- * This package is not released. Packaging instructions are only for
- internal use.
-
- -- Steffen Moeller <moeller at debian.org> Tue, 19 Feb 2013 14:16:48 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
deleted file mode 100644
index d4021c3..0000000
--- a/debian/control
+++ /dev/null
@@ -1,19 +0,0 @@
-Source: mach-haplotyper
-Maintainer: Steffen Moeller <moeller at debian.org>
-Section: non-free/science
-XS-Autobuild: no
-Priority: optional
-Build-Depends: debhelper (>= 100),
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/mach-haplotyper/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/mach-haplotyper/trunk/
-Homepage: http://www.sph.umich.edu/csg/abecasis/MACH/
-
-Package: mach-haplotyper
-Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends}
-Description: Markov Chain based SNP haplotyper
- Recent advancements in chip-based DNA genotyping allow to infer DNA
- variants that are not part of the chip but known to be associated
- with a combination of SNPs that are measured.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index acf6bb2..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,28 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: mach-haplotyper
-Source: http://www.sph.umich.edu/csg/abecasis/MACH/download/
-
-Files: *
-Copyright: <years> <put author's name and email here>
- <years> <likewise for another author>
-License:
- not redistributable
-
-Files: debian/*
-Copyright: 2013 Steffen Moeller <moeller at debian.org>
-License: GPL-2+
- This package is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- .
- This package is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- .
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>
- .
- On Debian systems, the complete text of the GNU General
- Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
diff --git a/debian/mach-haplotyper.dirs b/debian/mach-haplotyper.dirs
deleted file mode 100644
index e772481..0000000
--- a/debian/mach-haplotyper.dirs
+++ /dev/null
@@ -1 +0,0 @@
-usr/bin
diff --git a/debian/mach-haplotyper.links b/debian/mach-haplotyper.links
deleted file mode 100644
index 2693165..0000000
--- a/debian/mach-haplotyper.links
+++ /dev/null
@@ -1 +0,0 @@
-usr/share/man/man1/mach1.1 usr/share/man/man1/thunder.1
diff --git a/debian/mach1.1 b/debian/mach1.1
deleted file mode 100644
index db42e3d..0000000
--- a/debian/mach1.1
+++ /dev/null
@@ -1,12 +0,0 @@
-.\" Hey, EMACS: -*- nroff -*-
-.TH MACH-HAPLOTYPER 1 "February 19, 2013"
-.SH NAME
-mach \- haplotyping and imputation of SNPs
-.SH SYNOPSIS
-.B mach1
-.br
-.B thunder
-.SH DESCRIPTION
-Some good soul please provide a man page for MACH.
-.SH SEE ALSO
-.BR http://www.sph.umich.edu/csg/abecasis/MACH
diff --git a/debian/manpages b/debian/manpages
deleted file mode 100644
index a5a7625..0000000
--- a/debian/manpages
+++ /dev/null
@@ -1 +0,0 @@
-debian/mach1.1
diff --git a/debian/patches/Makefile.patch b/debian/patches/Makefile.patch
deleted file mode 100644
index ace1fb6..0000000
--- a/debian/patches/Makefile.patch
+++ /dev/null
@@ -1,43 +0,0 @@
-Index: mach-haplotyper-1.0.18/Makefile
-===================================================================
---- mach-haplotyper-1.0.18.orig/Makefile
-+++ mach-haplotyper-1.0.18/Makefile
-@@ -16,7 +16,7 @@
- #
- # CFLAGS=-O2 -I./libsrc/ -I./mach1
- #
--CFLAGS=-O2 -static -I./libsrc -I./mach1 -D__ZLIB_AVAILABLE__ -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
-+CFLAGS +=-O2 -I./libsrc -I./mach1 -D__ZLIB_AVAILABLE__ -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
-
- # executable file names and locations
- BINDIR = executables
-@@ -89,10 +89,10 @@
-
- # dependencies for executables
- $(MACH) : $(LIBFILE) $(MACHOBJ) $(BINDIR)
-- $(CXX) $(CFLAGS) -o $@ $(MACHOBJ) $(LIBFILE) -lm -lz
-+ $(CXX) $(LDFLAGS) $(CFLAGS) -o $@ $(MACHOBJ) $(LIBFILE) -lm -lz
-
- $(THUNDER) : $(LIBFILE) $(THUNDEROBJ) $(BINDIR)
-- $(CXX) $(CFLAGS) -o $@ $(THUNDEROBJ) $(LIBFILE) -lm -lz
-+ $(CXX) $(LDFLAGS) $(CFLAGS) -o $@ $(THUNDEROBJ) $(LIBFILE) -lm -lz
-
- $(BINDIR) :
- mkdir $(BINDIR)
-@@ -142,13 +142,13 @@
- cd $(DISTRIBDIR); csh ../stamp MaCH
-
- .c.o :
-- $(CXX) $(CFLAGS) -o $@ -c $*.c
-+ $(CXX) $(CPPFLAGS) $(CFLAGS) -o $@ -c $*.c
-
- .cpp.X.o :
-- $(CXX) $(CFLAGS) -o $@ -c $*.cpp -DVERSION=\"$(VERSION)\" -D__CHROMOSOME_X__
-+ $(CXX) $(CPPFLAGS) $(CFLAGS) -o $@ -c $*.cpp -DVERSION=\"$(VERSION)\" -D__CHROMOSOME_X__
-
- .cpp.o :
-- $(CXX) $(CFLAGS) -o $@ -c $*.cpp -DVERSION=\"$(VERSION)\"
-+ $(CXX) $(CPPFLAGS) $(CFLAGS) -o $@ -c $*.cpp -DVERSION=\"$(VERSION)\"
-
- archive : clean
- mkdir -p mach-$(VERSION)
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 5b1c0a4..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1 +0,0 @@
-Makefile.patch
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 3978d2c..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/make -f
-# -*- makefile -*-
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-
-DPKG_EXPORT_BUILDFLAGS = 1
-include /usr/share/dpkg/buildflags.mk
-
-%:
- dh $@
-
-override_dh_auto_build:
- $(MAKE) all
-
-override_dh_auto_install:
- $(MAKE) install INSTALLDIR=$(CURDIR)/debian/mach-haplotyper/usr/bin/
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
deleted file mode 100644
index 85575ad..0000000
--- a/debian/upstream/metadata
+++ /dev/null
@@ -1,11 +0,0 @@
-Reference:
- Author: "Yun Li and Cristen J. Willer and Jun Ding and Paul Scheet and Gonçalo R. Abecasis"
- Title: "MaCH: using sequence and genotype data to estimate haplotypes and unobserved genotypes"
- Journal: Genetic Epidemiology
- Year: 2010
- Volume: 34
- Number: 8
- Pages: 816-34
- DOI: 10.1002/gepi.20533
- PMID: 21058334
- URL: http://onlinelibrary.wiley.com/doi/10.1002/gepi.20533/abstract
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 84e9c2d..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=4
-http://www.sph.umich.edu/csg/abecasis/MACH/download/mach\.([0-9.]*)\.source.tgz
diff --git a/libsrc/BasicHash.cpp b/libsrc/BasicHash.cpp
new file mode 100644
index 0000000..3b9beba
--- /dev/null
+++ b/libsrc/BasicHash.cpp
@@ -0,0 +1,164 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/BasicHash.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "BasicHash.h"
+#include "Error.h"
+
+#include <stdio.h>
+
+// Creates an empty hash table with `startsize` slots.
+//
+// startsize must be a positive power of two, because slot indices are
+// computed by masking keys with (size - 1).  Any other value is reported
+// through error().
+BasicHash::BasicHash(int startsize)
+   {
+   count = 0;
+   size = startsize;
+   mask = startsize - 1;
+
+   // In this implementation, the size of hash tables must be a power of two.
+   // Zero is rejected explicitly: it would pass the bit test below and leave
+   // the table without any slot to probe.
+   if (startsize <= 0 || (startsize & mask))
+      error("BasicHash: Hash table size must be a power of two.\n");
+
+   objects = new void * [size];
+   keys = new unsigned int [size];
+
+   // A NULL object marks a free slot; keys[] is only read for occupied
+   // slots, so it does not need to be initialized
+   for (unsigned int i = 0; i < size; i++)
+      objects[i] = NULL;
+   }
+
+// Releases the key and slot arrays owned by the table.  The objects that
+// the slots point to are not freed here.
+BasicHash::~BasicHash()
+   {
+   delete [] keys;
+   delete [] objects;
+   }
+
+// Empties the table.  A table with more than 16 slots is first reallocated
+// with 16 slots; afterwards every slot is marked as free.
+void BasicHash::Clear()
+   {
+   count = 0;
+
+   if (size > 16)
+      SetSize(16);
+
+   void ** slot = objects;
+   void ** last = objects + size;
+
+   while (slot != last)
+      *slot++ = NULL;
+   }
+
+// Reallocates the table with `newsize` slots and re-inserts every entry.
+//
+// newsize is expected to be a power of two (slot indices are computed by
+// masking keys with newsize - 1) and larger than the number of stored
+// entries; neither condition is checked here.
+void BasicHash::SetSize(int newsize)
+   {
+   int newmask = newsize - 1;
+
+   void ** newobjects = new void * [newsize];
+   unsigned int * newkeys = new unsigned int [newsize];
+
+   for (int i = 0; i < newsize; i++)
+      newobjects[i] = NULL;
+
+   if (count)
+      for (unsigned int i = 0; i < size; i++)
+         if (objects[i] != NULL)
+            {
+            unsigned int key = keys[i];
+            unsigned int h = key & newmask;
+
+            // Probe linearly for the first free slot.  Each entry moved here
+            // is a separate entry of the old table, so an occupied slot must
+            // never be reused.  (The earlier test `newkeys[h] != h` compared
+            // the stored key with the slot index, stopped on any occupied
+            // slot whose key equals its own index, and overwrote that entry.)
+            while (newobjects[h] != NULL)
+               h = (h + 1) & newmask;
+
+            newkeys[h] = key;
+            newobjects[h] = objects[i];
+            }
+
+   delete [] objects;
+   delete [] keys;
+
+   objects = newobjects;
+   keys = newkeys;
+   size = newsize;
+   mask = newmask;
+   }
+
+// Stores `object` under `key` and returns the index of the slot used.
+//
+// If the same object pointer is already stored under this key its slot is
+// reused; otherwise the entry goes into the first free slot of the probe
+// sequence, so one key may be stored several times with different objects.
+// Note that a NULL object leaves the slot looking free even though the
+// entry count is incremented.
+int BasicHash::Add(int key, void * object)
+   {
+   // Double the table once more than half of its slots are in use
+   if (count * 2 > size)
+      Grow();
+
+   unsigned int slot = Iterate(key);
+
+   for ( ; objects[slot] != NULL && objects[slot] != object;
+         slot = ReIterate(key, slot))
+      ;
+
+   bool isNewEntry = (objects[slot] == NULL);
+
+   if (isNewEntry)
+      {
+      keys[slot] = key;
+      count++;
+      }
+
+   objects[slot] = object;
+
+   return slot;
+   }
+
+// Returns the index of the first slot holding `key`, or -1 when the probe
+// sequence reaches a free slot first.
+int BasicHash::Find(int key)
+   {
+   int slot = Iterate(key);
+
+   if (objects[slot] == NULL)
+      return -1;
+
+   return slot;
+   }
+
+// Continues a search for `key` past slot `h`: returns the index of the next
+// slot holding `key`, or -1 when a free slot is reached first.
+int BasicHash::Rehash(int key, int h)
+   {
+   int next = ReIterate(key, h);
+
+   if (objects[next] == NULL)
+      return -1;
+
+   return next;
+   }
+
+// Removes the entry stored in slot `index`.  Out-of-range indices and free
+// slots are ignored.
+//
+// Freeing a slot can cut a probe sequence in two, so the entries that
+// follow it are either re-inserted wholesale (when the table shrinks) or
+// moved back one by one so that Iterate() can still reach them.
+void BasicHash::Delete(unsigned int index)
+ {
+ if (index >= size || objects[index] == NULL)
+ return;
+
+ objects[index] = NULL;
+ count--;
+
+ // Shrink() goes through SetSize(), which re-inserts every remaining entry,
+ // so no separate repair of the probe sequence is needed on that path
+ if (count * 8 < size && size > 32)
+ Shrink();
+ else
+ {
+ // rehash the next entries until we find empty slot
+ index = (index + 1) & mask;
+
+ while (objects[index] != NULL)
+ {
+ // An entry sitting in the slot its key maps to directly is always
+ // reachable and can stay where it is
+ if ((keys[index] & mask) != index)
+ {
+ // Find where this entry would be placed now: the first slot of
+ // its probe sequence that is free or already holds this object
+ unsigned int h = Iterate(keys[index]);
+
+ while ((objects[h] != NULL) && (objects[h] != objects[index]))
+ h = ReIterate(keys[index], h);
+
+ // h == index means the entry is still reachable; otherwise move
+ // it into the earlier free slot and release its current one
+ if (h != (unsigned int) index)
+ {
+ keys[h] = keys[index];
+ objects[h] = objects[index];
+ objects[index] = NULL;
+ }
+ }
+
+ index = (index + 1) & mask;
+ }
+ }
+ }
+
diff --git a/libsrc/BasicHash.h b/libsrc/BasicHash.h
new file mode 100644
index 0000000..3f6203b
--- /dev/null
+++ b/libsrc/BasicHash.h
@@ -0,0 +1,85 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/BasicHash.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __BASICHASH_H__
+#define __BASICHASH_H__
+
+#include <stdlib.h>
+
+// Hash table that associates unsigned integer keys with untyped (void *)
+// object pointers.  Collisions are resolved by stepping to the next slot
+// (with wrap-around), and a slot counts as in use when its object pointer
+// is not NULL.  The table only stores the pointers it is given.
+class BasicHash
+ {
+ protected:
+ // Parallel arrays of `size` slots: keys[i] is only meaningful while
+ // objects[i] is not NULL
+ void ** objects;
+ unsigned int * keys;
+ // count is the number of stored entries, size the number of slots
+ unsigned int count, size;
+ // size - 1; used to reduce a key to a slot index
+ unsigned int mask;
+
+ public:
+ BasicHash(int startsize = 32);
+ virtual ~BasicHash();
+
+ // Double or halve the number of slots, re-inserting all entries
+ void Grow() { SetSize(size * 2); }
+ void Shrink() { SetSize(size / 2); }
+
+ void SetSize(int newsize);
+
+ void Clear();
+
+ // Number of slots and number of stored entries, respectively
+ int Capacity() const { return size; }
+ int Entries() const { return count; }
+
+ // Direct, unchecked access to the object pointer in slot i
+ void * Object(int i) const { return objects[i]; }
+
+ void SetObject(int i, void * object)
+ { objects[i] = object; }
+
+ // Add and Find return a slot index; Find and Rehash return -1 when no
+ // (further) slot holds the key
+ int Add (int key, void * object = NULL);
+ int Find (int key);
+ int Rehash (int key, int h);
+
+ // NOTE(review): declared here, but BasicHash.cpp in this patch contains no
+ // definition -- confirm whether assignment is meant to be usable
+ BasicHash & operator = (const BasicHash & rhs);
+
+ void * operator [] (int i) const { return objects[i]; }
+
+ void Delete(unsigned int index);
+
+ bool SlotInUse(int index) { return objects[index] != NULL; }
+
+ private:
+ // Starting at the slot selected by key & mask, returns the first slot
+ // that is free or holds `key`
+ unsigned int Iterate(unsigned int key) const
+ {
+ unsigned int h = key & mask;
+
+ while (objects[h] != NULL && keys[h] != key)
+ h = (h + 1) & mask;
+
+ return h;
+ }
+
+ // Same search as Iterate(), but starting at the slot after h
+ unsigned int ReIterate(unsigned int key, unsigned int h) const
+ {
+ h = (h + 1) & mask;
+
+ while (objects[h] != NULL && keys[h] != key)
+ h = (h + 1) & mask;
+
+ return h;
+ }
+ };
+
+#endif
+
diff --git a/libsrc/Constant.h b/libsrc/Constant.h
new file mode 100644
index 0000000..944df49
--- /dev/null
+++ b/libsrc/Constant.h
@@ -0,0 +1,61 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Constant.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef _CONSTANT_H_
+#define _CONSTANT_H_
+
+#define COMPAREFUNC (int (*)(const void *, const void *))
+
+#define BUFSIZE 1024
+#define FILENAMELEN 100
+#define IDLEN 20
+
+#define SEPARATORS " \t\n\r\f/"
+#define WHITESPACE " \t\n\r\f"
+
+#define SWTABLESKIP 9
+#define SWTABLEMAX 10000
+
+#define _NAN_ ((double) (6.66666e-66))
+
+#define QTDTDATA "qtdt.dat"
+#define QTDTPED "qtdt.ped"
+#define QTDTIBD "qtdt.ibd"
+#define QTDTRAW "regress.tbl"
+#define GENIHDATAIN "genih.dat"
+
+#ifndef __WIN32__
+#define stricmp strcasecmp
+#endif
+
+// Constants for older haplotype handling programs
+// Constants for HAPLOXT
+#define XT_MAX_ALLELES 50 // Maximum alleles for crosstabulation
+#define XT_VECTORSIZE 10000 // Total haplotypes in population
+#define XT_POOLTRESH 7 // Threshold for pooling rare alleles
+// Simwalk Haplotype Vectors
+#define HV_MAXSIZE 100 // Haplotypes in single SimWalk pedigree
+#define HV_INFOTRESH 75 // Percentage of loci typed
+#define HV_STATELENGTH 100 // Markers per haplotype
+#define HV_SKIPLINES 4 // lines to skip at bottom of family tree
+// Simwalk Summary Files
+#define HT_TABLE_SIZE 1000
+#define HT_SKIP_LINES 9
+
+#endif
+
+
diff --git a/libsrc/Error.cpp b/libsrc/Error.cpp
new file mode 100644
index 0000000..fab6080
--- /dev/null
+++ b/libsrc/Error.cpp
@@ -0,0 +1,69 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Error.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Error.h"
+
+#include "stdlib.h"
+#include "stdarg.h"
+#include "stdio.h"
+
+// Declare a dummy class to ensure that compilers recognize this as C++ code
+class String;
+
+// Prints a "FATAL ERROR" banner followed by the printf-style message to
+// standard output, then ends the program with EXIT_FAILURE.
+void error ( const char * msg, ... )
+   {
+   va_list args;
+
+   fputs("\nFATAL ERROR - \n", stdout);
+
+   va_start(args, msg);
+   vprintf(msg, args);
+   va_end(args);
+
+   fputs("\n\n", stdout);
+
+   exit(EXIT_FAILURE);
+   }
+
+// Prints a "WARNING" banner (preceded by an alert character) followed by
+// the printf-style message to standard output, then returns to the caller.
+void warning ( const char * msg, ... )
+   {
+   va_list args;
+
+   fputs("\n\aWARNING - \n", stdout);
+
+   va_start(args, msg);
+   vprintf(msg, args);
+   va_end(args);
+
+   fputs("\n", stdout);
+   }
+
+// Prints a "FATAL NUMERIC ERROR" banner followed by the printf-style
+// message to standard output, then ends the program with EXIT_FAILURE.
+void numerror ( const char * msg , ... )
+   {
+   va_list args;
+
+   fputs("\nFATAL NUMERIC ERROR - ", stdout);
+
+   va_start(args, msg);
+   vprintf(msg, args);
+   va_end(args);
+
+   fputs("\n\n", stdout);
+
+   exit(EXIT_FAILURE);
+   }
+
diff --git a/libsrc/Error.h b/libsrc/Error.h
new file mode 100644
index 0000000..e5eb425
--- /dev/null
+++ b/libsrc/Error.h
@@ -0,0 +1,35 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Error.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef _ERROR_H_
+#define _ERROR_H_
+
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+// All three functions take a printf-style format string plus arguments and
+// write to standard output.
+
+// Prints the message under a "FATAL ERROR" banner and exits the program.
+void error(const char * msg, ...);
+// Prints the message under a "WARNING" banner and returns.
+void warning(const char * msg, ...);
+// Prints the message after a "FATAL NUMERIC ERROR" banner and exits the program.
+void numerror(const char * msg, ...);
+
+// #ifdef __cplusplus
+// };
+// #endif
+
+
+#endif
+
diff --git a/libsrc/FortranFormat.cpp b/libsrc/FortranFormat.cpp
new file mode 100644
index 0000000..7c74381
--- /dev/null
+++ b/libsrc/FortranFormat.cpp
@@ -0,0 +1,373 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/FortranFormat.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "FortranFormat.h"
+#include "Error.h"
+
+// Construct a reader with no input line loaded and no format set.
+//
+// Every scalar member is given a defined value. The format cursor,
+// repeat count and last-group bookkeeping use the same values that
+// SetFormat() and Flush() reset them to, so that calling ProcessToken()
+// or Flush() before SetFormat() does not read indeterminate integers.
+FortranFormat::FortranFormat()
+   {
+   // -1 means "no input line loaded yet"; ProcessToken() reads one on demand
+   inputPos = -1;
+   endOfPattern = false;
+
+   // Position 1 is the first character after the opening bracket
+   formatPos = 1;
+   repeatCount = 0;
+
+   lastBracket = 1;
+   lastCount = 0;
+   }
+
+// Attach a new input file and forget any partially consumed input line.
+void FortranFormat::SetInputFile(IFILE & file)
+   {
+   // Force the next ProcessToken() call to read a fresh line
+   endOfPattern = false;
+   inputPos = -1;
+
+   input = file;
+   }
+
+// Install a new FORTRAN format statement.
+//
+// White space is stripped from formatString and only the first complete
+// bracketed expression is retained. If what remains does not start with
+// '(' and end with ')', error() is called and the program terminates.
+// All parsing state (input position, repeat count, bracket stacks) is
+// reset so that processing starts just after the opening bracket.
+void FortranFormat::SetFormat(const String & formatString)
+   {
+   format = formatString;
+
+   inputPos = -1;
+   endOfPattern = false;
+   repeatCount = 0;
+
+   // The stored format is rebuilt from scratch below
+   format.Clear();
+
+   // Copy non-blank characters until the outermost bracket closes
+   int depth = 0;
+
+   for (int pos = 0; pos < formatString.Length(); pos++)
+      {
+      char ch = formatString[pos];
+
+      bool blank = ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
+
+      if (blank)
+         continue;
+
+      if (ch == '(')
+         depth++;
+
+      if (ch == ')')
+         depth--;
+
+      format += ch;
+
+      // Also stops after the first character when it is not a bracket
+      if (depth == 0)
+         break;
+      }
+
+   if (format[0] != '(' || format[format.Length() - 1] != ')')
+      error("Invalid FORTRAN format statement\n\n"
+            "The statement \"%s\" is not bracketed correctly.\n",
+            (const char *) formatString);
+
+   // Start reading tokens right after the opening bracket
+   formatPos = 1;
+   repeatCount = 0;
+
+   // Until an inner group closes, the pattern restarts from the beginning
+   lastBracket = 1;
+   lastCount = 0;
+
+   bracketStack.Clear();
+   bracketCounter.Clear();
+   bracketCount.Clear();
+   }
+
+// Read the next data field and return it converted with String::AsInteger().
+int FortranFormat::GetNextInteger()
+   {
+   GetNextField(buffer);
+
+   int value = buffer.AsInteger();
+
+   return value;
+   }
+
+// Read the next data field and return its first character.
+char FortranFormat::GetNextCharacter()
+   {
+   GetNextField(buffer);
+
+   char first = buffer[0];
+
+   return first;
+   }
+
+// Keep processing format tokens until one of them yields a data field,
+// which is left in 'field'. Positioning and grouping tokens are consumed
+// silently along the way.
+void FortranFormat::GetNextField(String & field)
+   {
+   bool haveField;
+
+   do
+      haveField = ProcessToken(field);
+   while (!haveField);
+   }
+
+// Process a single token of the format statement.
+//
+// Returns true when the token was a data field, in which case the
+// trimmed field text has been copied into 'field'. Returns false when
+// the token only opened a bracketed group or repositioned the input
+// ('X', '/', ':', 'T', 'TL', 'TR'); 'field' is then left untouched.
+//
+// A new input line is read first whenever inputPos is -1. Unsupported
+// edit descriptors ('Q', 'P', 'B') and data fields without a width are
+// reported through error(), which terminates the program.
+//
+// Two fixes relative to the previous version:
+//  * after '/' and ':' the code tested "format[formatPos] || ')'", which
+//    is always true, so FinishField() ran unconditionally and skipped the
+//    field that follows when the separator is omitted (e.g. "I2/I3");
+//  * the '/' handler left repeatCount at -1 after counting down, which
+//    stopped the next token from reading its own repeat count.
+bool FortranFormat::ProcessToken(String & field)
+   {
+   // This flag only gets set if we encounter the final bracket or a ':'
+   endOfPattern = false;
+
+   // Read input from file, if appropriate
+   if (inputPos == -1)
+      {
+      inputLine.ReadLine(input);
+      inputPos = 0;
+      }
+
+   // First read repeat count specifier (only when not in the middle of
+   // repeating a previous field)
+   if (repeatCount == 0)
+      repeatCount = GetIntegerFromFormat();
+
+   // By default, the repeat count should be 1
+   if (repeatCount == 0)
+      repeatCount = 1;
+
+   // Remember where the descriptor starts so that it can be replayed
+   int repeatPos = formatPos;
+
+   // Check if this is a new bracketed grouping
+   if (format[formatPos] == '(')
+      {
+      formatPos++;
+
+      // Record where the group body starts and how often to run it
+      bracketStack.Push(formatPos);
+      bracketCounter.Push(repeatCount);
+      bracketCount.Push(repeatCount);
+
+      repeatCount = 0;
+
+      return false;
+      }
+
+   // Check if this an 'X' field
+   if (format[formatPos] == 'X')
+      {
+      formatPos++;
+
+      // No width specifier allowed for these fields
+      RejectWidth('X');
+
+      // Skip appropriate number of characters
+      inputPos += repeatCount;
+
+      // Reset repeat count
+      repeatCount = 0;
+
+      FinishField();
+
+      return false;
+      }
+
+   // Check if this is a '/' (vertical tab field)
+   if (format[formatPos] == '/')
+      {
+      formatPos++;
+
+      // No width specifier allowed for these fields
+      RejectWidth('/');
+
+      // Skip the appropriate number of lines; the count ends at exactly
+      // zero so the next token reads its own repeat count
+      for ( ; repeatCount > 0; repeatCount--)
+         inputLine.ReadLine(input);
+
+      inputPos = 0;
+
+      // Separators are optional, so we might already be at the next field;
+      // only consume a separator when one is actually present
+      if (format[formatPos] == ',' || format[formatPos] == ')')
+         FinishField();
+
+      return false;
+      }
+
+   // Check that we haven't encountered a rare, but unsupported input type
+   if (format[formatPos] == 'Q' || format[formatPos] == 'P' || format[formatPos] == 'B')
+      {
+      formatPos++;
+
+      int problemStart = formatPos;
+
+      // Find the end of the offending descriptor for the error message
+      while (format[formatPos] != ',' && format[formatPos] != ')' && format[formatPos] != '/')
+         formatPos++;
+
+      error("Unsupported pattern in FORMAT statement\n\n"
+            "Statement \"%s\" includes unsupporterd pattern '%s'\n",
+            (const char *) format,
+            (const char *) format.SubStr(problemStart, formatPos - problemStart));
+      }
+
+   // A ':' marks a point at which the pattern may end
+   if (format[formatPos] == ':')
+      {
+      formatPos++;
+
+      // As for '/', the separator after ':' is optional
+      if (format[formatPos] == ',' || format[formatPos] == ')')
+         FinishField();
+
+      repeatCount = 0;
+
+      endOfPattern = true;
+
+      return false;
+      }
+
+   // All the other types we recognize include a width specifier
+
+   // Identify the location of the type specifier
+   int typeStart = formatPos;
+
+   while (CharacterFollows())
+      formatPos++;
+
+   int typeLen = formatPos - typeStart;
+
+   // Retrieve the field width
+   int width = GetIntegerFromFormat();
+
+   if (width == 0)
+      error("Unrecognized FORMAT statement\n\n"
+            "Statement \"%s\" is missing a width specifier for a field of type '%s'\n",
+            (const char *) format, (const char *) format.SubStr(typeStart, typeLen));
+
+   // Check for horizontal tab character
+   if (format[typeStart] == 'T')
+      {
+      // Move left by a specified number of characters (never before column 0)
+      if (format[typeStart + 1] == 'L')
+         inputPos = width > inputPos ? 0 : inputPos - width;
+      // Move right by a specified number of characters
+      else if (format[typeStart + 1] == 'R')
+         inputPos += width;
+      // Or simply set the appropriate horizontal position
+      else
+         inputPos = width;
+
+      repeatCount--;
+
+      // Replay the descriptor while repetitions remain
+      if (repeatCount)
+         formatPos = repeatPos;
+      else
+         FinishField();
+
+      return false;
+      }
+
+   // Assume that if we got here, we are looking at a data field!
+   field.Copy(inputLine, inputPos, width);
+   field.Trim();
+
+   inputPos += width;
+
+   repeatCount--;
+
+   // Replay the descriptor while repetitions remain
+   if (repeatCount)
+      formatPos = repeatPos;
+   else
+      FinishField();
+
+   return true;
+   }
+
+// Parse the run of decimal digits at the current format position and
+// advance past it. Returns 0 when no digit is present.
+int FortranFormat::GetIntegerFromFormat()
+   {
+   int value = 0;
+
+   for ( ; DigitFollows(); formatPos++)
+      value = value * 10 + (int) (format[formatPos] - '0');
+
+   return value;
+   }
+
+// True when the character at the current format position is '0'..'9'.
+bool FortranFormat::DigitFollows()
+   {
+   char next = format[formatPos];
+
+   return (next >= '0') && (next <= '9');
+   }
+
+// True when the character at the current format position is an
+// upper-case letter 'A'..'Z' (lower-case letters are not recognized).
+bool FortranFormat::CharacterFollows()
+   {
+   char next = format[formatPos];
+
+   return (next >= 'A') && (next <= 'Z');
+   }
+
+// Field types 'X' and '/' take no width: if a digit follows at the
+// current format position, stop with an error naming the field type ch.
+void FortranFormat::RejectWidth(char ch)
+   {
+   if (!DigitFollows())
+      return;
+
+   error("Unrecognized FORTRAN format statement\n\n"
+         "The statement \"%s\" includes width specifier for field of type '%c'.\n",
+         (const char *) format, ch);
+   }
+
+// Advance formatPos from the end of the current descriptor to the start
+// of the next one, handling group repetition and end-of-format.
+//
+//  * Stops (without consuming it) at a '/', which is a token in itself.
+//  * Steps over a ',' separator.
+//  * At a ')' that closes an inner group: either jumps back to the start
+//    of the group (iterations remaining) or moves past the bracket and
+//    keeps looking for the next separator.
+//  * At the final ')': flags end-of-pattern, requests a new input line
+//    (inputPos = -1) and rewinds to the most recently closed group, or
+//    to the start of the format when no group has been closed.
+//
+// The unnamed bool parameter is not used.
+void FortranFormat::FinishField(bool )
+ {
+ // Find the next field separator
+ while (format[formatPos] != ',' && format[formatPos] != ')')
+ {
+ if (format[formatPos] == '/')
+ return;
+
+ formatPos++;
+ }
+
+ // Skip commas
+ if (format[formatPos] == ',')
+ {
+ formatPos++;
+ return;
+ }
+
+ // If we found a bracket, then it is either the end of the statement
+ // (if bracketStack is empty) or we finish an internal grouping
+ if (bracketStack.Length())
+ {
+ // Retrieve information about this grouping; lastBracket and lastCount
+ // also remember it for the end-of-format rewind below
+ lastBracket = bracketStack.Pop();
+ lastCount = bracketCount.Pop();
+ int lastCounter = bracketCounter.Pop() - 1;
+
+ // Loop if required
+ if (lastCounter)
+ {
+ // Put the group back with one fewer iteration remaining
+ bracketStack.Push(lastBracket);
+ bracketCount.Push(lastCount);
+ bracketCounter.Push(lastCounter);
+
+ formatPos = lastBracket;
+ }
+ else
+ // Otherwise find the next separator
+ {
+ // Step over the closing bracket before searching again
+ formatPos++;
+ FinishField();
+ return;
+ }
+ }
+ else
+ {
+ // If we finished the input line, then activate reset input counter
+ inputPos = -1;
+ endOfPattern = true;
+
+ // And re-use input tokens starting at the last bracket
+ formatPos = lastBracket;
+
+ // Position 1 means no inner group was closed: restart the whole format
+ if (lastBracket == 1)
+ return;
+
+ // With appropriate repeat counts
+ bracketStack.Push(lastBracket);
+ bracketCounter.Push(lastCount);
+ bracketCount.Push(lastCount);
+ }
+ }
+
+// Run the current pattern to its end (discarding any remaining fields
+// into the internal buffer), then reset the parser so that the next read
+// starts at the beginning of the format with a fresh input line.
+void FortranFormat::Flush()
+   {
+   while (!endOfPattern)
+      {
+      ProcessToken(buffer);
+      }
+
+   // Back to the first descriptor, with no repetition pending
+   formatPos = 1;
+   repeatCount = 0;
+
+   // No inner group is remembered any more
+   lastBracket = 1;
+   lastCount = 0;
+
+   bracketStack.Clear();
+   bracketCounter.Clear();
+   bracketCount.Clear();
+
+   // Request a new input line on the next read
+   inputPos = -1;
+   }
+
diff --git a/libsrc/FortranFormat.h b/libsrc/FortranFormat.h
new file mode 100644
index 0000000..13c63b9
--- /dev/null
+++ b/libsrc/FortranFormat.h
@@ -0,0 +1,100 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/FortranFormat.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __FORTRAN_FORMAT__
+#define __FORTRAN_FORMAT__
+
+#include "StringBasics.h"
+#include "IntArray.h"
+
+// Reads an input file one line at a time and splits each line into
+// fields as directed by a FORTRAN-style format statement such as
+// "(2I3,1X,A4)".
+class FortranFormat
+ {
+ public:
+ // This class reads a user specified input file, one line at a time,
+ // and returns individual fields according to a user specified format
+ // statement
+ FortranFormat();
+
+ // Set the fortran format statement; terminates via error() if the
+ // statement is not enclosed in brackets
+ void SetFormat(const String & formatString);
+
+ // Set the input file
+ void SetInputFile(IFILE & file);
+
+ // Read one field from input file
+ void GetNextField(String & field);
+ int GetNextInteger();
+ char GetNextCharacter();
+
+ // Process a token in format statement and return true
+ // if token corresponds to input field. Return false if
+ // token led to processing of white-space or input line
+ // positioning
+ bool ProcessToken(String & field);
+
+ // Flush the pattern -- this finishes processing the current
+ // pattern and ensures that all trailing new-lines, etc. are
+ // handled correctly
+ void Flush();
+
+ private:
+ // The input line and current position along it
+ // (inputPos == -1 means a new line must be read first)
+ String inputLine;
+ int inputPos;
+
+ // The Fortran format statement and current position along it
+ String format;
+ int formatPos;
+
+ // Repetitions still to be performed for the current descriptor;
+ // zero means the next token should read its own repeat count
+ int repeatCount;
+
+ // Returns an integer from the current format statement, if any
+ // (zero when no digits are present)
+ int GetIntegerFromFormat();
+
+ // These functions check the next character in format string
+ bool DigitFollows();
+ bool CharacterFollows();
+
+ // This function finishes the current field and moves to the next
+ // descriptor; the haveSlash argument is ignored by the implementation
+ void FinishField(bool haveSlash = false);
+
+ // Reject a width specifier where none is allowed
+ void RejectWidth(char type);
+
+ // The input file
+ IFILE input;
+
+ // Stacks to keep track of nested parenthesis:
+ // bracketStack - format position where each open group starts
+ // bracketCount - repeat count originally given for each group
+ // bracketCounter - iterations still remaining for each group
+ IntArray bracketStack;
+ IntArray bracketCount;
+ IntArray bracketCounter;
+
+ // Start position and repeat count of the most recently closed group;
+ // the pattern restarts from here once the end of the format is reached
+ int lastBracket;
+ int lastCount;
+
+ // Buffer for reading fields
+ String buffer;
+
+ // Flag that indicates whether we have reached end-of-pattern
+ bool endOfPattern;
+ };
+
+#endif
+
+
+
diff --git a/libsrc/GenotypeLists.cpp b/libsrc/GenotypeLists.cpp
new file mode 100644
index 0000000..22069d8
--- /dev/null
+++ b/libsrc/GenotypeLists.cpp
@@ -0,0 +1,486 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/GenotypeLists.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "GenotypeLists.h"
+
+// When the next line is uncommented, the genotype elimination routines
+// produce a lot of output useful for debugging
+// #define DEBUG_ELIMINATOR
+
+// Create an empty genotype list.
+//
+// Both scalar members are initialized: 'checked' was previously left
+// indeterminate, so SaveGenotype() could read garbage if it were called
+// on a list that FamilyCheck() had not reset first.
+GenotypeList::GenotypeList()
+   {
+   ignore = false;
+   checked = 0;
+   }
+
+// Run genotype elimination for one marker in one family.
+//
+// Returns true when every family member that is not ignored keeps at
+// least one possible genotype. Otherwise prints a message naming the
+// marker and the family and returns false.
+bool GenotypeList::EliminateGenotypes(Pedigree & ped, Family * family, int marker)
+   {
+   // Working storage: one genotype list per family member
+   GenotypeList * list = new GenotypeList [family->count];
+
+   // Fill in the candidate alleles and genotypes for each individual
+   InitializeList(list, ped, family, marker);
+
+#ifdef DEBUG_ELIMINATOR
+   Print(list, ped, family, marker);
+#endif
+
+   // Repeat while either check still removes genotypes; the family check
+   // only runs in a round where the pairwise check changed nothing
+   while (PairwiseCheck(list, ped, family) || FamilyCheck(list, ped, family))
+      {
+#ifdef DEBUG_ELIMINATOR
+      Print(list, ped, family, marker);
+#endif
+      }
+
+   // An individual left without any genotype signals an inconsistency
+   bool consistent = true;
+
+   for (int member = 0; consistent && member < family->count; member++)
+      {
+      if (list[member].ignore || list[member].allele1.Length() != 0)
+         continue;
+
+      printf("%s - Family %s has a subtle genotype inconsistency\n",
+             (const char *) ped.markerNames[marker], (const char *) family->famid);
+
+      consistent = false;
+      }
+
+   delete [] list;
+
+   return consistent;
+   }
+
+// Build the starting genotype list for every member of the family.
+//
+// Genotyped individuals start with their single observed genotype.
+// Ungenotyped individuals start with every genotype that can be formed
+// from the alleles collected from their descendants plus the wildcard
+// allele -1; if no alleles were collected they are flagged 'ignore'.
+//
+// Individuals are visited from the end of family->path to the start, and
+// each non-founder adds its alleles to its parents' allele lists.
+// NOTE(review): this relies on parents appearing before their children
+// in family->path -- confirm against the Pedigree code.
+void GenotypeList::InitializeList(GenotypeList * list, Pedigree & ped, Family * family, int marker)
+ {
+ for (int i = family->count - 1; i >= 0; i--)
+ {
+ Person & person = ped[family->path[i]];
+ // Lists are indexed by the person's 'traverse' value, not by i
+ int id = person.traverse;
+ bool maleX = person.sex == SEX_MALE && ped.chromosomeX;
+
+#ifdef DEBUG_ELIMINATOR
+ printf("Initializing genotype list for %s ...\n", (const char *) person.pid);
+#endif
+
+ // If an individual is genotyped ...
+ if (person.markers[marker].isKnown())
+ {
+ // Their genotype list starts with just one entry!
+ list[id].Dimension(1);
+ list[id].SetGenotype(0, person.markers[marker][0], person.markers[marker][1]);
+ // The observed alleles replace anything collected from descendants
+ list[id].alleles.Clear();
+ list[id].alleles.Push(person.markers[marker][0]);
+ list[id].alleles.PushIfNew(person.markers[marker][1]);
+ list[id].ignore = false;
+
+ // "Heterozygous" males have no possible genotypes
+ if (maleX && person.markers[marker].isHeterozygous())
+ list[id].Dimension(0);
+ }
+ else
+ if (list[id].alleles.Length())
+ if (person.sex == SEX_MALE && ped.chromosomeX)
+ {
+ // Males only carry one X chromosome
+ // One "homozygous" entry per allele, plus one all-wildcard entry
+ list[id].Dimension(list[id].alleles.Length() + 1);
+
+ for (int i = 0, out = 0; i < list[id].alleles.Length(); i++)
+ list[id].SetGenotype(out++, list[id].alleles[i], list[id].alleles[i]);
+ list[id].SetGenotype(list[id].alleles.Length(), -1, -1);
+
+ list[id].ignore = false;
+ }
+ else
+ {
+ // Build the genotype list based on the available allele lists
+ // With n alleles: n(n+1)/2 pairs + n allele/wildcard + 1 wildcard pair
+ int count = list[id].alleles.Length() * (list[id].alleles.Length() + 3) / 2 + 1;
+
+ list[id].Dimension(count);
+
+ for (int i = 0, out = 0; i < list[id].alleles.Length(); i++)
+ {
+ // Allow for all pairs of "transmitted" alleles
+ for (int j = 0; j <= i; j++)
+ list[id].SetGenotype(out++, list[id].alleles[i], list[id].alleles[j]);
+
+ // Allow for an untransmitted allele
+ list[id].SetGenotype(out++, list[id].alleles[i], -1);
+ }
+
+ // Allow for a pair of untransmitted alleles
+ list[id].SetGenotype(count - 1, -1, -1);
+
+ list[id].ignore = false;
+ }
+ else
+ // No genotype and no alleles from descendants: nothing to check
+ list[id].ignore = true;
+
+ // If the individual is a founder this is all there is to it
+ if (i < family->founders) continue;
+
+ // If the individual is not a founder, update the parental genotype lists...
+ int fatid = person.father->traverse;
+ int motid = person.mother->traverse;
+
+ for (int i = 0; i < list[id].alleles.Length(); i++)
+ {
+ list[motid].alleles.PushIfNew(list[id].alleles[i]);
+ // A male's X alleles are not added to his father's list
+ if (!maleX) list[fatid].alleles.PushIfNew(list[id].alleles[i]);
+ }
+ }
+ }
+
+// Compare every non-founder with each of its parents, one pair at a time.
+//
+// First removes offspring genotypes that cannot be formed from alleles in
+// the parents' lists, then (unless the offspring list still contains a
+// wildcard allele) removes parental genotypes that share no allele with
+// any offspring genotype. For males on the X chromosome only the mother
+// is considered.
+//
+// Returns true if any genotype was removed.
+bool GenotypeList::PairwiseCheck(GenotypeList * list, Pedigree & ped, Family * family)
+ {
+#ifdef DEBUG_ELIMINATOR
+ printf("Checking Relative Pairs ...\n");
+#endif
+
+ bool changed = false;
+
+ // Non-founders occupy positions family->founders .. family->count - 1
+ for (int i = family->count - 1; i >= family->founders; i--)
+ {
+ Person & person = ped[family->path[i]];
+
+ int id = person.traverse;
+ int fatid = person.father->traverse;
+ int motid = person.mother->traverse;
+
+ bool maleX = person.sex == SEX_MALE && ped.chromosomeX;
+
+ if (list[id].ignore) continue;
+
+ // Check if genotypes are consistent with parental genotypes
+ for (int i = 0; i < list[id].allele1.Length(); i++)
+ {
+ int al1 = list[id].allele1[i];
+ int al2 = list[id].allele2[i];
+
+ // Remove offspring genotypes incompatible with parental genotypes
+ // (a male on X needs his allele in the mother's list; otherwise one
+ // allele must come from each parent, with -1 matching anything)
+ if ((maleX && !list[motid].Matches(al1) && al1 != -1) ||
+ (!maleX && !(al1 == -1 && al2 == -1) &&
+ ! (list[fatid].Matches(al1) && (al2 == -1 || list[motid].Matches(al2))) &&
+ !((al2 == -1 || list[fatid].Matches(al2)) && list[motid].Matches(al1))))
+ {
+ // Step the index back so the entry that moves into slot i is examined
+ list[id].Delete(i--);
+ changed = true;
+ }
+ }
+
+ // The offspring genotype list allows for a wild-card untransmitted allele
+ // so any single parental genotype is possible
+ if (list[id].Matches(-1))
+ continue;
+
+ // Check if maternal genotypes are consistent with offspring genotypes
+ for (int i = 0; i < list[motid].allele1.Length(); i++)
+ {
+ int al1 = list[motid].allele1[i];
+ int al2 = list[motid].allele2[i];
+
+ // Remove genotypes incompatible with offspring genotypes
+ if (!list[id].Matches(al1) &&
+ !list[id].Matches(al2))
+ {
+ list[motid].Delete(i--);
+ changed = true;
+ }
+ }
+
+ // Males don't affect genotype lists for their fathers
+ if (maleX) continue;
+
+ // Check if paternal genotypes are consistent with offspring genotypes
+ for (int i = 0; i < list[fatid].allele1.Length(); i++)
+ {
+ int al1 = list[fatid].allele1[i];
+ int al2 = list[fatid].allele2[i];
+
+ // Remove genotypes incompatible with offspring genotypes
+ if (!list[id].Matches(al1) &&
+ !list[id].Matches(al2))
+ {
+ list[fatid].Delete(i--);
+ changed = true;
+ }
+ }
+
+#ifdef DEBUG_ELIMINATOR
+ printf("Done checking individual %s\n", (const char *) person.pid);
+ Print(list, ped, family, 0);
+#endif
+ }
+
+ return changed;
+ }
+
+
+// Check each sibship jointly with both of its parents.
+//
+// For every sibship whose parents are both not ignored, parental
+// genotypes that cannot be paired with a genotype of the other parent
+// and one genotype for every child are removed (TrimParent), then
+// offspring genotypes not yet marked as checked are examined and removed
+// if no compatible parental pair exists (Cleanup).
+//
+// Returns true if any genotype was removed.
+bool GenotypeList::FamilyCheck(GenotypeList * list, Pedigree & ped, Family * family)
+ {
+#ifdef DEBUG_ELIMINATOR
+ printf("Checking Nuclear Families ...\n");
+#endif
+
+ bool changed = false;
+
+ for (int i = family->count - 1; i >= family->founders; i--)
+ {
+ Person & person = ped[family->path[i]];
+
+ int fatid = person.father->traverse;
+ int motid = person.mother->traverse;
+
+ // Only go through the loop once per sibship
+ // (the sibship is handled when its first listed sib is reached)
+ if (person.sibs[0] != &person || list[fatid].ignore || list[motid].ignore)
+ continue;
+
+#ifdef DEBUG_ELIMINATOR
+ printf("Checking Sibship with %s ...\n", (const char *) person.pid);
+#endif
+
+ // Reset checked genotypes for the mother, father and child
+ list[fatid].checked = 0;
+ list[motid].checked = 0;
+
+ for (int i = 0; i < person.sibCount; i++)
+ list[person.sibs[i]->traverse].checked = 0;
+
+ // Go through each of the paternal genotypes
+ changed |= TrimParent(list, person, fatid, motid);
+
+ // Go through each of maternal genotypes
+ changed |= TrimParent(list, person, motid, fatid);
+
+ // Sort out the unchecked offspring genotypes ...
+ for (int i = 0; i < person.sibCount; i++)
+ {
+ int sibid = person.sibs[i]->traverse;
+ bool maleX = person.sibs[i]->sex == SEX_MALE && ped.chromosomeX;
+
+ // For dealing with male X chromosomes, the pairwise check is all we need
+ if (maleX) continue;
+
+ // Entries before 'checked' were already saved as compatible
+ for (int j = list[sibid].checked; j < list[sibid].allele1.Length(); j++)
+ changed |= Cleanup(list, person, motid, fatid, sibid, j);
+ }
+
+#ifdef DEBUG_ELIMINATOR
+// Print(list, ped, family, 0);
+#endif
+ }
+
+ return changed;
+ }
+
+// True when the genotype stored at index 'genotype' carries 'allele'
+// in either position.
+bool GenotypeList::Matches(int genotype, int allele)
+   {
+   if (allele1[genotype] == allele)
+      return true;
+
+   return allele2[genotype] == allele;
+   }
+
+// True when any genotype in the list carries 'allele' in either position.
+bool GenotypeList::Matches(int allele)
+   {
+   if (allele1.Find(allele) != -1)
+      return true;
+
+   return allele2.Find(allele) != -1;
+   }
+
+// Mark a genotype as checked by moving it into the block of checked
+// entries at the front of the list (indices 0 .. checked - 1).
+//
+// Returns the index at which the genotype now resides. A genotype that
+// already lies inside the checked block is left where it is.
+int GenotypeList::SaveGenotype(int genotype)
+   {
+   // Already inside the checked block
+   if (genotype < checked)
+      return genotype;
+
+   // Exchange with the first unchecked entry, unless it is that entry
+   if (genotype > checked)
+      {
+      allele1.Swap(genotype, checked);
+      allele2.Swap(genotype, checked);
+      }
+
+   int slot = checked;
+
+   checked++;
+
+   return slot;
+   }
+
+// Decide whether child genotype k can arise from genotype i of the first
+// parent (fatid) and genotype j of the second parent (motid).
+//
+// One child allele must be carried by each parent, in either assignment;
+// a second child allele of -1 is a wildcard that matches any parent, and
+// a child genotype of two wildcards is always compatible.
+bool GenotypeList::CheckTrio(GenotypeList * list, int fatid, int motid, int child,
+                             int i, int j, int k)
+   {
+   int first  = list[child].allele1[k];
+   int second = list[child].allele2[k];
+
+   // First allele from parent 'fatid', second (or wildcard) from 'motid'
+   if (list[fatid].Matches(i, first) &&
+      (list[motid].Matches(j, second) || second == -1))
+      return true;
+
+   // Second allele (or wildcard) from 'fatid', first from 'motid'
+   if ((list[fatid].Matches(i, second) || second == -1) &&
+       list[motid].Matches(j, first))
+      return true;
+
+   // Two wildcards match any pair of parental genotypes
+   return first == -1 && second == -1;
+   }
+
+// Resize the list to hold 'genotypes' entries; the two allele arrays
+// are always kept at the same length.
+void GenotypeList::Dimension(int genotypes)
+   {
+   allele2.Dimension(genotypes);
+   allele1.Dimension(genotypes);
+   }
+
+// Store the allele pair (al1, al2) as the genotype at index 'genotype'.
+void GenotypeList::SetGenotype(int genotype, int al1, int al2)
+   {
+   allele2[genotype] = al2;
+   allele1[genotype] = al1;
+   }
+
+// Remove the genotype at index 'genotype' from both allele arrays.
+void GenotypeList::Delete(int genotype)
+   {
+   allele2.Delete(genotype);
+   allele1.Delete(genotype);
+   }
+
+// Remove genotypes of one parent that cannot be combined with any
+// genotype of the other parent to explain the whole sibship.
+//
+// The parent being trimmed is the one passed in the 'motid' position and
+// the partner is the one passed as 'fatid'; FamilyCheck calls this
+// function once with each parent in the first position, so the names do
+// not necessarily reflect the parent's sex.
+//
+// Unchecked genotypes are taken from the end of the list. When a
+// compatible partner genotype is found, both parental genotypes and all
+// compatible sib genotypes are saved into their checked blocks;
+// otherwise the genotype is deleted.
+//
+// Returns true if at least one genotype was deleted.
+bool GenotypeList::TrimParent(GenotypeList * list, Person & person, int motid, int fatid)
+ {
+ bool trimmed = false;
+
+ // Each pass either saves the last entry (checked grows) or deletes it
+ // (the list shrinks), so the loop terminates
+ while (list[motid].checked < list[motid].allele1.Length())
+ {
+ int current = list[motid].allele1.Length() - 1;
+ bool saved = false;
+
+ // Pair it with each possible genotype of the other parent
+ for (int i = list[fatid].allele1.Length() - 1; i >= 0; i--)
+ {
+ int matches = 0;
+
+ // Find out if the pairing is compatible with at least one genotype for each child
+ for (int j = 0; j < person.sibCount; j++)
+ {
+ int sibid = person.sibs[j]->traverse;
+ int maleX = person.sibs[j]->sex == SEX_MALE && person.chromosomeX;
+
+ // Since we have done the pairwise check, there is nothing more
+ // to do for males ...
+ if (list[sibid].ignore || maleX)
+ {
+ matches++;
+ continue;
+ }
+
+ for (int k = list[sibid].allele1.Length() - 1; k >= 0; k--)
+ if (CheckTrio(list, motid, fatid, sibid, current, i, k))
+ {
+ matches++;
+ break;
+ }
+
+ // This sib has no compatible genotype: give up on this pairing
+ if (matches != j + 1)
+ break;
+ }
+
+ // Save maternal and paternal genotypes, mark all compatible sibling genotypes
+ if (matches == person.sibCount)
+ {
+ for (int j = 0; j < person.sibCount; j++)
+ {
+ int sibid = person.sibs[j]->traverse;
+
+ for (int k = list[sibid].checked; k < list[sibid].allele1.Length(); k++)
+ if (CheckTrio(list, motid, fatid, sibid, current, i, k))
+ list[sibid].SaveGenotype(k);
+ }
+
+ list[motid].SaveGenotype(current);
+ list[fatid].SaveGenotype(i);
+
+ saved = true;
+
+ break;
+ }
+ }
+
+ // No partner genotype works: this genotype is impossible
+ if (!saved)
+ {
+ list[motid].Delete(current);
+ trimmed = true;
+ }
+ }
+
+ return trimmed;
+ }
+
+// Decide whether genotype 'geno' of 'child' is supported by some pair of
+// parental genotypes that is also compatible with every sib.
+//
+// Returns false (nothing removed) as soon as such a pair is found;
+// otherwise deletes the genotype from the child's list and returns true.
+bool GenotypeList::Cleanup(GenotypeList * list, Person & person, int motid, int fatid, int child, int geno)
+ {
+ // Try every pair of parental genotypes that is compatible with 'geno'
+ for (int current = 0; current < list[motid].allele1.Length(); current++)
+ for (int i = list[fatid].allele1.Length() - 1; i >= 0; i--)
+ if (CheckTrio(list, motid, fatid, child, current, i, geno))
+ {
+ int matches = 0;
+
+ // Find out if the pairing is compatible with at least one genotype for each child
+ for (int j = 0; j < person.sibCount; j++)
+ {
+ int sibid = person.sibs[j]->traverse;
+ int maleX = person.sibs[j]->sex == SEX_MALE && person.chromosomeX;
+
+ // After completing the pairwise check, all males are guaranteed
+ // to be compatible with their mothers
+ if (list[sibid].ignore || maleX)
+ {
+ matches++;
+ continue;
+ }
+
+ for (int k = list[sibid].allele1.Length() - 1; k >= 0; k--)
+ if (CheckTrio(list, motid, fatid, sibid, current, i, k))
+ {
+ matches++;
+ break;
+ }
+
+ // This sib has no compatible genotype: try the next parental pair
+ if (matches != j + 1)
+ break;
+ }
+
+ // Update list of compatible sibling genotypes
+ // NOTE(review): the return below is inside the loop body, so only the
+ // first sib (j == 0) has its genotypes saved before returning. The
+ // return value is the same either way; confirm whether saving for the
+ // remaining sibs was intended.
+ if (matches == person.sibCount)
+ for (int j = 0; j < person.sibCount; j++)
+ {
+ int sibid = person.sibs[j]->traverse;
+
+ for (int k = list[sibid].checked; k < list[sibid].allele1.Length(); k++)
+ if (CheckTrio(list, motid, fatid, sibid, current, i, k))
+ list[sibid].SaveGenotype(k);
+
+ return false;
+ }
+ }
+
+ // No supporting parental pair exists
+ list[child].Delete(geno);
+
+ return true;
+ }
+
+// Debugging aid: print one line per family member listing every genotype
+// still present in the corresponding list entry, using allele labels for
+// the given marker and '*' for the wildcard allele (-1).
+void GenotypeList::Print(GenotypeList * list, Pedigree & ped, Family * family, int marker)
+   {
+   MarkerInfo * info = ped.GetMarkerInfo(marker);
+
+   for (int member = 0; member < family->count; member++)
+      {
+      GenotypeList & entry = list[member];
+
+      printf("%s - ", (const char *) ped[family->path[member]].pid);
+
+      for (int g = 0; g < entry.allele1.Length(); g++)
+         {
+         // First allele, followed by a slash
+         if (entry.allele1[g] != -1)
+            printf("%s/", (const char *) info->GetAlleleLabel(entry.allele1[g]));
+         else
+            printf("*/");
+
+         // Second allele, followed by a space
+         if (entry.allele2[g] != -1)
+            printf("%s ", (const char *) info->GetAlleleLabel(entry.allele2[g]));
+         else
+            printf("* ");
+         }
+
+      printf("\n");
+      }
+
+   printf("\n");
+   }
+
+
diff --git a/libsrc/GenotypeLists.h b/libsrc/GenotypeLists.h
new file mode 100644
index 0000000..ea9ba3d
--- /dev/null
+++ b/libsrc/GenotypeLists.h
@@ -0,0 +1,61 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/GenotypeLists.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __GENOTYPE_ELIMINATION__
+#define __GENOTYPE_ELIMINATION__
+
+#include "Pedigree.h"
+
+// List of the genotypes that remain possible for one individual at one
+// marker, together with the static routines that prune these lists
+// across a family until they are mutually consistent.
+class GenotypeList
+ {
+ public:
+
+ // Parallel arrays: genotype g is the pair (allele1[g], allele2[g]).
+ // The value -1 acts as a wildcard allele.
+ IntArray allele1, allele2;
+ // Distinct alleles collected for this individual while initializing
+ IntArray alleles;
+
+ // Set when there is no information on which to check this individual
+ bool ignore;
+ // Number of genotypes at the front of the list already saved as
+ // compatible during the current family check
+ int checked;
+
+ GenotypeList();
+
+ // Returns false (after printing a message) if some individual in the
+ // family is left without any possible genotype at this marker
+ static bool EliminateGenotypes(Pedigree & ped, Family * family, int marker);
+
+ // Resize the list / remove one genotype from it
+ void Dimension(int genotypes);
+ void Delete(int genotype);
+
+ // Does one genotype (first form) or any genotype (second form)
+ // include the given allele?
+ bool Matches(int genotype, int allele);
+ bool Matches(int allele);
+
+ // SaveGenotype moves a genotype into the checked block and returns
+ // its index there; SetGenotype overwrites the genotype at an index
+ int SaveGenotype(int genotype);
+ void SetGenotype(int genotype, int al1, int al2);
+
+ private:
+ // The three stages of elimination; the two checks return true when
+ // they removed at least one genotype
+ static void InitializeList(GenotypeList * list, Pedigree & p, Family * f, int marker);
+ static bool PairwiseCheck(GenotypeList * list, Pedigree & p, Family * f);
+ static bool FamilyCheck(GenotypeList * list, Pedigree & p, Family * f);
+
+ // Helpers for the family check. Note that the definitions of
+ // TrimParent and Cleanup name their parent arguments in the opposite
+ // order (motid, fatid) to the declarations here.
+ static bool CheckTrio(GenotypeList * list, int fatid, int motid, int child, int i, int j, int k);
+ static bool TrimParent(GenotypeList * list, Person & person, int fatid, int motid);
+ static bool Cleanup(GenotypeList * list, Person & person, int fatid, int motid, int child, int geno);
+
+ // Debugging output
+ static void Print(GenotypeList * List, Pedigree & p, Family * f, int marker);
+ };
+
+
+
+#endif
+
diff --git a/libsrc/Hash.cpp b/libsrc/Hash.cpp
new file mode 100644
index 0000000..037f589
--- /dev/null
+++ b/libsrc/Hash.cpp
@@ -0,0 +1,130 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Hash.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Hash.h"
+
+#include <ctype.h>
+
+// ********************************************************
+//
+// This code is based on the original by Robert Jenkins.
+//
+// http://burtleburtle.net/bob/hash/doobs.html
+//
+// ********************************************************
+
+// Scramble three unsigned integers in place using nine rounds of
+// subtract / xor / shift. Each argument is read and assigned many times,
+// so pass plain variables only (no expressions with side effects).
+#define MIX_INTEGERS(a,b,c) \
+ { \
+ a -= b; a -= c; a ^= (c>>13); \
+ b -= c; b -= a; b ^= (a<<8); \
+ c -= a; c -= b; c ^= (b>>13); \
+ a -= b; a -= c; a ^= (c>>12); \
+ b -= c; b -= a; b ^= (a<<16); \
+ c -= a; c -= b; c ^= (b>>5); \
+ a -= b; a -= c; a ^= (c>>3); \
+ b -= c; b -= a; b ^= (a<<10); \
+ c -= a; c -= b; c ^= (b>>15); \
+ }
+
+// Expands to a cast, so that "ui(x)" reads like a function call but
+// means "(unsigned int)(x)"
+#define ui (unsigned int)
+
+// Hash 'length' bytes starting at 'key' into an unsigned int.
+//
+// initval seeds the hash; it can be any value, for example the result of
+// a previous call when chaining several keys together. Bytes are
+// combined in little-endian order, so the result does not depend on the
+// byte order of the machine.
+unsigned int hash ( const unsigned char * key, unsigned int length, unsigned int initval)
+   {
+   // Fixed starting values for the internal state
+   unsigned int a = 0x9e3779b9;
+   unsigned int b = 0x9e3779b9;
+   unsigned int c = initval;
+
+   unsigned int remaining = length;
+
+   // Consume the key twelve bytes at a time: four bytes go into each of
+   // a, b and c, least significant byte first
+   for ( ; remaining >= 12; key += 12, remaining -= 12)
+      {
+      for (int byte = 0; byte < 4; byte++)
+         {
+         a += (unsigned int) key[byte]     << (8 * byte);
+         b += (unsigned int) key[byte + 4] << (8 * byte);
+         c += (unsigned int) key[byte + 8] << (8 * byte);
+         }
+
+      MIX_INTEGERS(a,b,c);
+      }
+
+   // Fold in the total length and the last 0..11 bytes
+   c += length;
+
+   // Bytes 8..10 fill the upper three bytes of c; the lowest byte of c
+   // is reserved for the length
+   for (unsigned int pos = 8; pos < remaining; pos++)
+      c += (unsigned int) key[pos] << (8 * (pos - 7));
+
+   // Bytes 4..7 fill b
+   for (unsigned int pos = 4; pos < remaining && pos < 8; pos++)
+      b += (unsigned int) key[pos] << (8 * (pos - 4));
+
+   // Bytes 0..3 fill a
+   for (unsigned int pos = 0; pos < remaining && pos < 4; pos++)
+      a += (unsigned int) key[pos] << (8 * pos);
+
+   MIX_INTEGERS(a,b,c);
+
+   // The result is the final value of c
+   return c;
+   }
+
+// Case-insensitive variant of hash(): every key byte is passed through
+// toupper() before being combined, so keys that differ only in letter
+// case produce the same value.
+unsigned int hash_no_case ( const unsigned char * key, unsigned int length, unsigned int initval)
+   {
+   // Fixed starting values for the internal state
+   unsigned int a = 0x9e3779b9;
+   unsigned int b = 0x9e3779b9;
+   unsigned int c = initval;
+
+   unsigned int remaining = length;
+
+   // Consume the key twelve bytes at a time: four bytes go into each of
+   // a, b and c, least significant byte first
+   for ( ; remaining >= 12; key += 12, remaining -= 12)
+      {
+      for (int byte = 0; byte < 4; byte++)
+         {
+         a += (unsigned int) toupper(key[byte])     << (8 * byte);
+         b += (unsigned int) toupper(key[byte + 4]) << (8 * byte);
+         c += (unsigned int) toupper(key[byte + 8]) << (8 * byte);
+         }
+
+      MIX_INTEGERS(a,b,c);
+      }
+
+   // Fold in the total length and the last 0..11 bytes
+   c += length;
+
+   // Bytes 8..10 fill the upper three bytes of c; the lowest byte of c
+   // is reserved for the length
+   for (unsigned int pos = 8; pos < remaining; pos++)
+      c += (unsigned int) toupper(key[pos]) << (8 * (pos - 7));
+
+   // Bytes 4..7 fill b
+   for (unsigned int pos = 4; pos < remaining && pos < 8; pos++)
+      b += (unsigned int) toupper(key[pos]) << (8 * (pos - 4));
+
+   // Bytes 0..3 fill a
+   for (unsigned int pos = 0; pos < remaining && pos < 4; pos++)
+      a += (unsigned int) toupper(key[pos]) << (8 * pos);
+
+   MIX_INTEGERS(a,b,c);
+
+   // The result is the final value of c
+   return c;
+   }
+
diff --git a/libsrc/Hash.h b/libsrc/Hash.h
new file mode 100644
index 0000000..f6a518b
--- /dev/null
+++ b/libsrc/Hash.h
@@ -0,0 +1,27 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Hash.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __HASH_H__
+#define __HASH_H__
+
+unsigned int hash ( const unsigned char * key, unsigned int length, unsigned int initval);
+
+unsigned int hash_no_case ( const unsigned char * key, unsigned int length, unsigned int initval);
+
+#endif
+
+
diff --git a/libsrc/InputFile.cpp b/libsrc/InputFile.cpp
new file mode 100644
index 0000000..d215eca
--- /dev/null
+++ b/libsrc/InputFile.cpp
@@ -0,0 +1,63 @@
+#include "InputFile.h"
+#include "StringBasics.h"
+
+#include <stdarg.h>
+
+#ifdef __ZLIB_AVAILABLE__
+
+// Opens filename through zlib. Some zlib builds refuse files larger
+// than 2Gb, so a failed gzopen is retried with plain fopen -- unless the
+// name ends in ".gz", in which case the object is left in gz mode with
+// a NULL gzHandle.
+IFILE::IFILE(const char * filename, const char * mode)
+   {
+   gzMode = true;
+   gzHandle = gzopen(filename, mode);
+
+   if (gzHandle != NULL)
+      return;
+
+   // Walk to the terminating NUL to learn the length of the name
+   int length = 0;
+   for ( ; filename[length] != 0; length++)
+      ;
+
+   bool gzSuffix = length >= 3 &&
+                   filename[length - 3] == '.' &&
+                   filename[length - 2] == 'g' &&
+                   filename[length - 1] == 'z';
+
+   if (gzSuffix)
+      return;
+
+   // Not a .gz name: fall back on the C library
+   gzMode = false;
+   handle = fopen(filename, mode);
+   }
+
+#endif
+
+// printf-style formatted output to an IFILE. In gz mode the text is
+// formatted into a String and handed to gzwrite; otherwise vfprintf is
+// used. Returns whatever gzwrite / vfprintf returns.
+int ifprintf(IFILE output, char * format, ...)
+   {
+   va_list args;
+
+#ifdef __ZLIB_AVAILABLE__
+   if (output.gzMode)
+      {
+      String text;
+
+      va_start(args, format);
+      text.vprintf(format, args);
+      va_end(args);
+
+      return gzwrite(output.gzHandle, (const char *) text, text.Length());
+      }
+#endif
+
+   va_start(args, format);
+   int written = vfprintf(output.handle, format, args);
+   va_end(args);
+
+   return written;
+   }
+
diff --git a/libsrc/InputFile.h b/libsrc/InputFile.h
new file mode 100644
index 0000000..0b908f8
--- /dev/null
+++ b/libsrc/InputFile.h
@@ -0,0 +1,158 @@
+#ifndef __INPUTFILE_H__
+#define __INPUTFILE_H__
+
+#ifdef __gnu_linux__
+#ifndef __ZLIB_AVAILABLE__
+#define __ZLIB_AVAILABLE__
+#endif
+#endif
+
+#ifdef __ZLIB_AVAILABLE__
+
+#include <zlib.h>
+#include <stdio.h>
+
+// File handle that wraps either a zlib gzFile or a stdio FILE *.
+// gzMode records which member of the anonymous union is in use.
+class IFILE
+   {
+   public:
+      bool gzMode;          // true when gzHandle is the active member
+      union
+         {
+         gzFile gzHandle;
+         FILE * handle;
+         };
+
+      // Starts out in stdio mode with no file attached
+      IFILE()
+         {
+         gzMode = false;
+         handle = NULL;
+         }
+
+      // Opens filename with the given mode (defined in InputFile.cpp)
+      IFILE(const char * filename, const char * mode);
+
+      // Exposes the active handle as an untyped pointer
+      operator void * ()
+         { return gzMode ? (void *) gzHandle : (void *) handle; }
+
+      // The assignment operators return a reference to *this (rather
+      // than a copy), matching the IFILE class used when zlib is not
+      // available and avoiding a needless temporary per assignment.
+      IFILE & operator = (const IFILE & rhs)
+         {
+         if ((gzMode = rhs.gzMode) == true)
+            gzHandle = rhs.gzHandle;
+         else
+            handle = rhs.handle;
+
+         return *this;
+         }
+
+      // Adopts a stdio stream
+      IFILE & operator = (FILE * rhs)
+         {
+         gzMode = false;
+         handle = rhs;
+         return *this;
+         }
+
+      // Adopts a zlib stream
+      IFILE & operator = (gzFile & rhs)
+         {
+         gzMode = true;
+         gzHandle = rhs;
+         return *this;
+         }
+
+      // Only a comparison against NULL can succeed: any non-NULL rhs
+      // yields false, NULL yields true when no handle is attached.
+      bool operator == (void * rhs)
+         {
+         if (rhs != NULL)
+            return false;
+         return gzMode ? gzHandle == rhs : handle == rhs;
+         }
+   };
+
+// Opens filename with the given mode and returns the IFILE by value
+inline IFILE ifopen(const char * filename, const char * mode)
+   {
+   return IFILE(filename, mode);
+   }
+
+// Closes the active handle, clears it and returns the result of
+// gzclose / fclose
+inline int ifclose(IFILE & file)
+   {
+   int status;
+
+   if (file.gzMode)
+      status = gzclose(file.gzHandle);
+   else
+      status = fclose(file.handle);
+
+   file.gzHandle = NULL;
+
+   return status;
+   }
+
+// Reads one character through gzgetc or fgetc, depending on the mode
+inline int ifgetc(IFILE & file)
+   {
+   if (file.gzMode)
+      return gzgetc(file.gzHandle);
+
+   return fgetc(file.handle);
+   }
+
+// Rewinds the file through gzrewind or rewind, depending on the mode
+inline void ifrewind(IFILE & file)
+   {
+   if (!file.gzMode)
+      {
+      rewind(file.handle);
+      return;
+      }
+
+   gzrewind(file.gzHandle);
+   }
+
+// Returns the end-of-file indicator reported by gzeof or feof
+inline int ifeof(IFILE & file)
+   {
+   if (file.gzMode)
+      return gzeof(file.gzHandle);
+
+   return feof(file.handle);
+   }
+
+// Reads up to size bytes into buffer; the value returned by gzread or
+// fread is converted to unsigned int
+inline unsigned int ifread(IFILE & file, void * buffer, unsigned int size)
+   {
+   if (file.gzMode)
+      return gzread(file.gzHandle, buffer, size);
+
+   return fread(buffer, 1, size, file.handle);
+   }
+
+// Writes size bytes from buffer; the value returned by gzwrite or
+// fwrite is converted to unsigned int
+inline unsigned int ifwrite(IFILE & file, void * buffer, unsigned int size)
+   {
+   if (file.gzMode)
+      return gzwrite(file.gzHandle, buffer, size);
+
+   return fwrite(buffer, 1, size, file.handle);
+   }
+
+#else
+
+#include <stdio.h>
+
+// Thin wrapper around a stdio FILE *, used when zlib is not available
+class IFILE
+   {
+   public:
+      FILE * handle;
+
+      // No file attached
+      IFILE() : handle(NULL)
+         { }
+
+      // Opens filename with fopen; handle is NULL when that fails
+      IFILE(const char * filename, const char * mode)
+         : handle(fopen(filename, mode))
+         { }
+
+      // The destructor does not close the file
+      ~IFILE()
+         { }
+
+      operator FILE *()
+         { return handle; }
+
+      IFILE & operator = (FILE * rhs)
+         {
+         handle = rhs;
+         return *this;
+         }
+
+      IFILE & operator = (const IFILE & rhs)
+         {
+         handle = rhs.handle;
+         return *this;
+         }
+
+      // Only a comparison against NULL can succeed
+      bool operator == (void * rhs)
+         {
+         return rhs == NULL && handle == rhs;
+         }
+   };
+
+// Opens filename with the given mode and returns the IFILE by value
+inline IFILE ifopen(const char * filename, const char * mode)
+   {
+   return IFILE(filename, mode);
+   }
+
+// Closes the stream, clears the handle and returns fclose's result
+inline int ifclose(IFILE & file)
+   {
+   FILE * stream = file.handle;
+
+   file.handle = NULL;
+
+   return fclose(stream);
+   }
+
+// Reads one character from the wrapped stream (see fgetc)
+inline int ifgetc(IFILE & file)
+   {
+   FILE * stream = file.handle;
+   return fgetc(stream);
+   }
+
+// Moves the wrapped stream back to its beginning (see rewind)
+inline void ifrewind(IFILE & file)
+   {
+   FILE * stream = file.handle;
+   rewind(stream);
+   }
+
+// Returns the end-of-file indicator of the wrapped stream (see feof)
+inline int ifeof(IFILE & file)
+   {
+   FILE * stream = file.handle;
+   return feof(stream);
+   }
+
+// Reads up to size bytes into buffer and returns the count reported
+// by fread, converted to unsigned int
+inline unsigned int ifread(IFILE & file, void * buffer, unsigned int size)
+   {
+   FILE * stream = file.handle;
+   return fread(buffer, 1, size, stream);
+   }
+
+// Writes size bytes from buffer and returns the count reported by
+// fwrite, converted to unsigned int
+inline unsigned int ifwrite(IFILE & file, void * buffer, unsigned int size)
+   {
+   FILE * stream = file.handle;
+   return fwrite(buffer, 1, size, stream);
+   }
+
+#endif
+
+// printf-style formatted output to an IFILE (implemented in
+// InputFile.cpp). Note that the IFILE is passed by value.
+int ifprintf(IFILE output, char * format, ...);
+
+#endif
+
diff --git a/libsrc/IntArray.cpp b/libsrc/IntArray.cpp
new file mode 100644
index 0000000..3309933
--- /dev/null
+++ b/libsrc/IntArray.cpp
@@ -0,0 +1,389 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/IntArray.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "IntArray.h"
+#include "Error.h"
+#include "Hash.h"
+#include "Sort.h"
+
+#include <string.h>
+
+// Allocation granularity used by the constructor and by Grow() when
+// they choose the capacity of the items buffer
+int IntArray::alloc = 4;
+
+// Creates an array of start_size elements. The capacity is rounded up
+// to a multiple of alloc; element values are left uninitialized.
+IntArray::IntArray(int start_size)
+   {
+   size  = (start_size + alloc) / alloc * alloc;
+   count = start_size;
+   items = new int [size];
+   }
+
+// Copy constructor: same capacity as source, first count elements copied
+IntArray::IntArray(const IntArray & source)
+   {
+   size  = source.size;
+   count = source.count;
+   items = new int [size];
+
+   int i = count;
+   while (i-- > 0)
+      items[i] = source.items[i];
+   }
+
+// Releases the heap buffer that holds the elements
+IntArray::~IntArray()
+ {
+ delete [] items;
+ }
+
+// Ensures capacity for more than the current size when new_size
+// exceeds it; existing elements are preserved. Requests of at least
+// twice the current capacity are rounded to a multiple of alloc,
+// smaller ones pick the first alloc * 2^k above new_size.
+void IntArray::Grow(int new_size)
+   {
+   if (new_size <= size)
+      return;
+
+   if ((new_size >> 1) >= size)
+      size = (new_size + alloc) / alloc * alloc;
+   else
+      for (size = alloc; size <= new_size; size *= 2)
+         ;
+
+   int * bigger = new int [size];
+
+   for (int i = count - 1; i >= 0; i--)
+      bigger[i] = items[i];
+
+   delete [] items;
+   items = bigger;
+   }
+
+// Adds value at the end of the array and returns the new length
+int IntArray::Append(int value)
+   {
+   Grow(count + 1);
+   items[count] = value;
+   return ++count;
+   }
+
+// Adds all elements of rhs at the end of the array and returns the
+// new length
+int IntArray::Append(const IntArray & rhs)
+   {
+   Grow(count + rhs.count);
+
+   int * tail = items + count;
+   for (int i = 0; i < rhs.count; i++)
+      tail[i] = rhs.items[i];
+
+   return count += rhs.count;
+   }
+
+// Assigns value to every element
+void IntArray::Set(int value)
+   {
+   int i = count;
+   while (i-- > 0)
+      items[i] = value;
+   }
+
+// Fills the array with start, start + increment, start + 2 * increment ...
+void IntArray::SetSequence(int start, int increment)
+   {
+   int value = start;
+
+   for (int i = 0; i < count; i++)
+      {
+      items[i] = value;
+      value += increment;
+      }
+   }
+
+// Removes the element at index, shifting later elements down by one,
+// and returns the new length. The index is not range checked.
+int IntArray::Delete(int index)
+   {
+   int trailing = --count - index;
+
+   if (trailing != 0)
+      memmove(items + index, items + index + 1, sizeof(int) * trailing);
+
+   return count;
+   }
+
+// Inserts value at index, shifting the elements from index onwards up
+// by one. The index is not range checked.
+void IntArray::InsertAt(int index, int value)
+   {
+   Grow(count + 1);
+
+   int shifted = count - index;
+   if (shifted != 0)
+      memmove(items + index + 1, items + index, sizeof(int) * shifted);
+
+   items[index] = value;
+   count++;
+   }
+
+// Copies the length and elements of rhs; capacity only ever grows
+IntArray & IntArray::operator = (const IntArray & rhs)
+   {
+   Grow(rhs.count);
+   count = rhs.count;
+
+   int i = count;
+   while (i-- > 0)
+      items[i] = rhs.items[i];
+
+   return *this;
+   }
+
+// Sum of items[start] .. items[end], both inclusive (0 when end < start)
+int IntArray::Sum(int start, int end) const
+   {
+   int total = 0;
+   int i = start;
+
+   while (i <= end)
+      total += items[i++];
+
+   return total;
+   }
+
+// Largest of items[start] .. items[end], both inclusive; returns 0
+// when start lies at or beyond the end of the array
+int IntArray::Max(int start, int end) const
+   {
+   if (start >= count)
+      return 0;
+
+   int best = items[start];
+
+   for (int i = start + 1; i <= end; i++)
+      best = (best < items[i]) ? items[i] : best;
+
+   return best;
+   }
+
+// Smallest of items[start] .. items[end], both inclusive; returns 0
+// when start lies at or beyond the end of the array
+int IntArray::Min(int start, int end) const
+   {
+   if (start >= count)
+      return 0;
+
+   int best = items[start];
+
+   for (int i = start + 1; i <= end; i++)
+      best = (best > items[i]) ? items[i] : best;
+
+   return best;
+   }
+
+// Linear search: index of the first element equal to value, or -1
+int IntArray::Find(int value) const
+   {
+   int i = 0;
+
+   while (i < count && items[i] != value)
+      i++;
+
+   return i < count ? i : -1;
+   }
+
+// Binary search for value; the comparisons assume the elements are in
+// ascending order. Returns the index of a matching element, or -1
+// when there is none.
+int IntArray::BinarySearch(int value) const
+   {
+   int start = 0;
+   int stop = count - 1;
+
+   while (start <= stop)
+      {
+      // start + stop could overflow an int for very large arrays, so
+      // the midpoint is derived from the width of the interval instead
+      int mid = start + (stop - start) / 2;
+
+      if (items[mid] == value)
+         return mid;
+
+      if (items[mid] > value)
+         stop = mid - 1;
+      else
+         start = mid + 1;
+      }
+
+   return -1;
+   }
+
+// Sets every element to zero
+void IntArray::Zero()
+   {
+   Set(0);
+   }
+
+// Three-way comparison used when sorting: negative, zero or positive
+// when *a is smaller than, equal to or larger than *b. Written with
+// comparisons because the subtraction *a - *b overflows (and reports
+// the wrong sign) when the operands are far apart, e.g. INT_MIN and 1.
+int IntArray::Compare(int * a, int * b)
+   { return (*a > *b) - (*a < *b); }
+
+// Sorts the elements with QuickSort (declared in Sort.h), ordering them
+// with IntArray::Compare
+void IntArray::Sort()
+ {
+ QuickSort(items, count, sizeof(int), COMPAREFUNC Compare);
+ }
+
+// Sorts this array and passes the items of freeRider to QuickSort2
+// alongside it (the header describes this as sorting two arrays
+// simultaneously).
+// NOTE(review): presumably freeRider must hold at least count elements - confirm in Sort.cpp
+void IntArray::Sort(IntArray & freeRider)
+ {
+ QuickSort2(items, freeRider.items, count, sizeof(int), COMPAREFUNC Compare);
+ }
+
+
+// Reverses the order of the elements in place
+void IntArray::Reverse()
+   {
+   int lo = 0;
+   int hi = count - 1;
+
+   while (lo < hi)
+      {
+      Swap(lo, hi);
+      lo++;
+      hi--;
+      }
+   }
+
+// Number of elements strictly greater than threshold
+int IntArray::CountIfGreater(int threshold) const
+   {
+   int matches = 0;
+
+   for (int i = 0; i < count; i++)
+      matches += (items[i] > threshold) ? 1 : 0;
+
+   return matches;
+   }
+
+// Number of elements greater than or equal to treshold
+int IntArray::CountIfGreaterOrEqual(int treshold) const
+   {
+   int matches = 0;
+
+   for (int i = 0; i < count; i++)
+      matches += (items[i] >= treshold) ? 1 : 0;
+
+   return matches;
+   }
+
+// Adds term to every element
+void IntArray::Add(int term)
+   {
+   int i = 0;
+
+   while (i < count)
+      items[i++] += term;
+   }
+
+// Multiplies every element by factor
+void IntArray::Multiply(int factor)
+   {
+   int i = 0;
+
+   while (i < count)
+      items[i++] *= factor;
+   }
+
+// Divides every element by denominator (integer division)
+void IntArray::Divide(int denominator)
+   {
+   int i = 0;
+
+   while (i < count)
+      items[i++] /= denominator;
+   }
+
+// Appends the elements of a to the end of this array
+void IntArray::Stack(const IntArray & a)
+   {
+   int end = count;
+
+   // Snapshot the number of elements to copy: when a is this very
+   // array, Dimension() below changes a.count and a loop bounded by
+   // a.count would run past the end of the buffer.
+   int extra = a.count;
+
+   Dimension(end + extra);
+
+   for (int i = 0; i < extra; i++)
+      items[i + end] = a[i];
+   }
+
+// Arrays are equal when they have the same length and the same
+// elements in the same order
+bool IntArray::operator == (const IntArray & rhs) const
+   {
+   if (count != rhs.count)
+      return false;
+
+   int i = 0;
+   while (i < count && items[i] == rhs.items[i])
+      i++;
+
+   return i >= count;
+   }
+
+// Negation of operator ==
+bool IntArray::operator != (const IntArray & rhs) const
+ {
+ return !(*this == rhs);
+ }
+
+// Check if all values are in ascending or descending order
+//
+
+// True when no element is smaller than its predecessor (ties allowed)
+bool IntArray::isAscending()
+   {
+   for (int i = 0; i + 1 < count; i++)
+      if (items[i] > items[i + 1])
+         return false;
+
+   return true;
+   }
+
+// True when no element is larger than its predecessor (ties allowed)
+bool IntArray::isDescending()
+   {
+   for (int i = 0; i + 1 < count; i++)
+      if (items[i] < items[i + 1])
+         return false;
+
+   return true;
+   }
+
+void IntArray::Add(const IntArray & v)
+ {
+ if (Length() != v.Length())
+ error("IntArray::Add - vectors have different lengths\n"
+ "IntArrays - Left[%d] += Right[%d] ",
+ Length(), v.Length());
+
+ for (int i = 0; i < Length(); i++)
+ items[i] += v[i];
+ }
+
+int IntArray::InnerProduct(IntArray & v)
+ {
+ if (Length() != v.Length())
+ error("IntArray::InnerProduct - vectors have different dimensions\n"
+ "IntArrays - Left[%d] * Right[%d] ",
+ Length(), v.Length());
+
+ int sum = 0;
+ for (int i = 0; i < Length(); i++)
+ sum += items[i] * v[i];
+
+ return sum;
+ }
+
+// Exchanges the contents of the two arrays by swapping their buffer
+// pointers, lengths and capacities; no elements are copied
+void IntArray::Swap(IntArray & rhs)
+   {
+   int * rhsItems = rhs.items;
+   int   rhsCount = rhs.count;
+   int   rhsSize  = rhs.size;
+
+   rhs.items = items;
+   rhs.count = count;
+   rhs.size  = size;
+
+   items = rhsItems;
+   count = rhsCount;
+   size  = rhsSize;
+   }
+
+// Prints the array to output under the default label
+void IntArray::Print(FILE * output)
+ {
+ Print(output, "Array of Integers");
+ }
+
+// Prints label, the element count and all elements on a single line
+void IntArray::Print(FILE * output, const char * label)
+   {
+   fprintf(output, "%s [%d elements]: ", label, count);
+
+   int i = 0;
+   while (i < count)
+      fprintf(output, "%d ", items[i++]);
+
+   fprintf(output, "\n");
+   }
+
+// Appends value unless an equal element is already present
+void IntArray::PushIfNew(int value)
+   {
+   if (Find(value) == -1)
+      Push(value);
+   }
+
+// Product of all elements in int arithmetic (1 for an empty array)
+int IntArray::Product()
+   {
+   int result = 1;
+   int i = 0;
+
+   while (i < count)
+      result *= items[i++];
+
+   return result;
+   }
+
+// Product of all elements accumulated as a double (1.0 for an empty
+// array)
+double IntArray::DoubleProduct()
+   {
+   double result = 1.0;
+   int i = 0;
+
+   while (i < count)
+      result *= items[i++];
+
+   return result;
+   }
+
+// Hashes the raw bytes of the first count elements with hash() from
+// Hash.h, passing initval through
+int IntArray::Hash(int initval)
+ {
+ return hash((unsigned char *) items, sizeof(int) * count, initval);
+ }
+
diff --git a/libsrc/IntArray.h b/libsrc/IntArray.h
new file mode 100644
index 0000000..d6bf88d
--- /dev/null
+++ b/libsrc/IntArray.h
@@ -0,0 +1,153 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/IntArray.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __INTARRAY_H__
+#define __INTARRAY_H__
+
+#include <stdio.h>
+
+// Growable array of ints with stack-style helpers, searching, sorting
+// and element-wise arithmetic. Members without an inline body are
+// defined in IntArray.cpp. None of the element accessors check that
+// the index is in range.
+class IntArray
+ {
+ private:
+ // items: heap buffer, size: allocated capacity, count: elements in use
+ int * items;
+ int size, count;
+
+ void Grow(int new_size);
+ static int Compare(int * a, int * b);
+
+ public:
+ // Allocation granularity shared by all IntArrays
+ static int alloc;
+
+ IntArray(int start_size = 0);
+ IntArray(const IntArray & source);
+ ~IntArray();
+
+ IntArray & operator = (const IntArray & rhs);
+
+ int & operator [] (int index) { return items[index]; }
+ int operator [] (int index) const { return items[index]; }
+
+ // Suggested by Anthony Berno, 12/28/06, to avoid "ambiguities" that
+ // Visual Studio encountered when handling implicit conversions ...
+ int & operator [] (char index) { return items[int(index)]; }
+ int operator [] (char index) const { return items[int(index)]; }
+ // ... who knows whether Visual Studio makes C++ annoying to encourage C#?
+
+ // Indexing by a fraction of the length: element (int) (count * fraction)
+ int & operator [] (double fraction)
+ { return items[(int) (count * fraction)]; }
+ int operator [] (double fraction) const
+ { return items[(int) (count * fraction)]; }
+
+ int Append(int value);
+ int Append(const IntArray & rhs);
+
+ // Stack interface; Pop, Peek and Last do not check for an empty array
+ void Push(int value) { Append(value); }
+ int Pop() { return items[--count]; }
+ int Peek() const { return items[count - 1]; }
+ int &Last() const { return items[count - 1]; }
+
+ void PushIfNew(int value); // used for maintaining list without duplicates
+
+ int Delete(int index);
+ void InsertAt(int index, int value);
+
+ int Find(int value) const;
+ int FastFind(int value) const { return BinarySearch(value); }
+ int BinarySearch(int value) const;
+ void Sort();
+ void Sort(IntArray & freeRider); // Sorts two arrays simultaneously
+
+ void Zero();
+ void Set(int value);
+ void SetSequence(int start = 0, int increment = 1);
+
+ int Length() const { return count; }
+ // Dimension changes the length (growing the buffer when needed);
+ // Clear only resets the length and keeps the buffer
+ void Dimension(int new_count) { Grow(new_count); count = new_count; }
+ void Clear() { count = 0; }
+
+ // Sum, Max and Min take an inclusive [start, end] index range
+ int Sum() const { return Sum(0, count - 1); }
+ int Sum(int start) const { return Sum(start, count - 1); }
+ int Sum(int start, int end) const;
+
+ int Max() const { return Max(0, count - 1); }
+ int Max(int start) const { return Max(start, count - 1); }
+ int Max(int start, int end) const;
+
+ int Min() const { return Min(0, count - 1); }
+ int Min(int start) const { return Min(start, count - 1); }
+ int Min(int start, int end) const;
+
+ int Count() const {return count; }
+ int CountIfGreater(int treshold) const;
+ int CountIfGreaterOrEqual(int treshold) const;
+
+ // Exchanges the elements at positions i and j
+ void Swap(int i, int j)
+ { int tmp = items[i]; items[i] = items[j]; items[j] = tmp; }
+
+ void Reverse();
+
+ // Direct access to the underlying buffer
+ operator int * () { return items; }
+
+ void Add(int term);
+ void Subtract(int term) { Add(-term); }
+ void Multiply(int factor);
+ void Divide(int denominator);
+
+ void Add(const IntArray & rhs);
+
+ IntArray & operator += (int rhs)
+ { Add(rhs); return *this; }
+
+ IntArray & operator += (const IntArray & rhs)
+ { Add(rhs); return *this; }
+
+ IntArray & operator *= (int rhs)
+ { Multiply(rhs); return *this; }
+
+ IntArray & operator -= (int rhs)
+ { Add(-rhs); return *this; }
+
+ IntArray & operator /= (int rhs)
+ { Divide(rhs); return *this; }
+
+ int InnerProduct(IntArray & v);
+
+ bool operator == (const IntArray & rhs) const;
+ bool operator != (const IntArray & rhs) const;
+
+ bool isAscending();
+ bool isDescending();
+
+ // Appends the elements of rhs to this array
+ void Stack(const IntArray & rhs);
+
+ // Exchanges the contents of the two arrays
+ void Swap(IntArray & rhs);
+
+ void Print() { Print(stdout); }
+ void Print(const char * label) { Print(stdout, label); }
+ void Print(FILE * output);
+ void Print(FILE * output, const char * label);
+
+ int Product();
+ double DoubleProduct();
+
+ int Hash(int initval = 0);
+ };
+
+#endif
+
+
+
diff --git a/libsrc/Kinship.cpp b/libsrc/Kinship.cpp
new file mode 100644
index 0000000..fa36d10
--- /dev/null
+++ b/libsrc/Kinship.cpp
@@ -0,0 +1,92 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Kinship.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Kinship.h"
+
+#define MAX_TABLE 500
+
+// Fills allPairs with pairwise kinship values for family f and stores
+// a pointer to f. At most MAX_TABLE individuals are tabulated; indices
+// follow the order of f.path.
+// NOTE(review): the code relies on founders occupying the first
+// f.founders positions of f.path and on parents preceding their
+// children in that order - confirm against the Pedigree classes.
+void Kinship::Setup(Family & f)
+ {
+ int count = f.count > MAX_TABLE ? MAX_TABLE : f.count;
+ int founders = f.founders > MAX_TABLE ? MAX_TABLE : f.founders;
+
+ allPairs.Dimension(count, count);
+
+ // Founders: 0.5 with themselves, 0.0 with every other founder
+ for (int i = 0; i < founders; i++)
+ {
+ for (int j = 0; j < founders; j++)
+ allPairs[i][j] = 0.0;
+ allPairs[i][i] = 0.5;
+ }
+
+ // Everyone else: derived from the entries of father (k) and mother (l)
+ for (int i = founders; i < count; i++)
+ {
+ Person * p = &(f.ped[f.path[i]]);
+ int k = p->father->traverse;
+ int l = p->mother->traverse;
+
+ // Value with an earlier individual j is the average of the parents'
+ // values with j; an MZ twin gets the same value as the self entry
+ for (int j = 0; j < i; j++)
+ if (!p->isMzTwin(f.ped[f.path[j]]))
+ allPairs[i][j] = allPairs[j][i] =
+ (allPairs[k][j] + allPairs[l][j]) * 0.5;
+ else
+ allPairs[j][i] = allPairs[i][j] = 0.5 + allPairs[k][l] * 0.5;
+
+ // Self entry: 0.5 plus half the value between the two parents
+ allPairs[i][i] = 0.5 + allPairs[k][l] * 0.5;
+ }
+
+ fam = &f;
+ }
+
+// Returns the kinship value for p1 and p2. Pairs inside the table
+// built by Setup() are looked up; when either traverse index is at or
+// beyond MAX_TABLE the value is computed recursively from the parents.
+double Kinship::operator() (Person & p1, Person & p2)
+ {
+ int i = p1.traverse;
+ int j = p2.traverse;
+
+ if (i >= MAX_TABLE || j >= MAX_TABLE)
+ {
+ // Two founders: 0.5 for the same person, 0.0 otherwise
+ if (p1.isFounder() && p2.isFounder())
+ return (&p1 == &p2) ? 0.5 : 0.0;
+
+ // Same individual or MZ twins: 0.5 plus half the parents' value
+ if (i == j || p1.isMzTwin(p2))
+ return 0.5 + (*this)(*p1.father, *p1.mother) * 0.5;
+
+ // Otherwise recurse through the parents of the individual with
+ // the larger traverse index
+ if (i < j)
+ return 0.5 * ((*this)(*p2.father, p1) + (*this)(*p2.mother, p1));
+ else
+ return 0.5 * ((*this)(*p1.father, p2) + (*this)(*p1.mother, p2));
+ }
+
+ return allPairs[i][j];
+ }
+
+// True when any individual's value with itself differs from 0.5
+bool Kinship::isInbred()
+   {
+   int i = 0;
+
+   // Individuals covered by the table
+   for ( ; i < allPairs.rows; i++)
+      if (allPairs[i][i] != 0.5)
+         return true;
+
+   // Individuals beyond the table are evaluated through operator ()
+   for ( ; i < fam->count; i++)
+      {
+      Person & person = fam->ped[fam->path[i]];
+
+      if ((*this)(person, person) != 0.5)
+         return true;
+      }
+
+   return false;
+   }
+
+
+
+
diff --git a/libsrc/Kinship.h b/libsrc/Kinship.h
new file mode 100644
index 0000000..89436b4
--- /dev/null
+++ b/libsrc/Kinship.h
@@ -0,0 +1,43 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Kinship.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __KINSHIP_H__
+#define __KINSHIP_H__
+
+#include "Pedigree.h"
+#include "MathMatrix.h"
+
+// Pairwise kinship values for the members of one family. Setup()
+// fills the allPairs table and records the family; operator ()
+// returns the value for two individuals.
+class Kinship
+   {
+   public:
+      Matrix   allPairs;   // table filled by Setup()
+      Family * fam;        // family passed to the last Setup() call
+
+      Kinship() : allPairs(), fam(NULL)
+         { }
+
+      void Setup(Family & f);
+
+      bool isInbred();
+
+      double operator () (Person & p1, Person & p2);
+   };
+
+#endif
+
+
diff --git a/libsrc/KinshipX.cpp b/libsrc/KinshipX.cpp
new file mode 100644
index 0000000..d230068
--- /dev/null
+++ b/libsrc/KinshipX.cpp
@@ -0,0 +1,63 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/KinshipX.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "KinshipX.h"
+
+// Fills allPairs with sex-dependent pairwise kinship values for family
+// f and stores a pointer to f. Indices follow the order of f.path.
+// NOTE(review): presumably these are X chromosome kinship coefficients
+// (males take their value from the mother only) - confirm with callers.
+void KinshipX::Setup(Family & f)
+ {
+ allPairs.Dimension(f.count, f.count);
+
+ // Founders: 0.0 with other founders; self value 1.0 for males and
+ // 0.5 otherwise
+ for (int i = 0; i < f.founders; i++)
+ {
+ bool isMale = f.ped[f.path[i]].sex == SEX_MALE;
+ for (int j = 0; j < f.founders; j++)
+ allPairs[i][j] = 0.0;
+ allPairs[i][i] = isMale ? 1.0 : 0.5;
+ }
+
+ // Everyone else: derived from the entries of father (k) and mother (l)
+ for (int i = f.founders; i < f.count; i++)
+ {
+ Person * p = &(f.ped[f.path[i]]);
+ int k = p->father->traverse;
+ int l = p->mother->traverse;
+
+ bool isMale = f.ped[f.path[i]].sex == SEX_MALE;
+ // Parsed as isMale ? 1.0 : (0.5 + allPairs[k][l] * 0.5)
+ allPairs[i][i] = isMale ? 1.0 : 0.5 + allPairs[k][l] * 0.5;
+
+ // Males use the mother's value with j; others average both
+ // parents. MZ twins get the self value computed above.
+ for (int j = 0; j < i; j++)
+ if (!p->isMzTwin(f.ped[f.path[j]]))
+ allPairs[i][j] = allPairs[j][i] = isMale ?
+ allPairs[l][j] : (allPairs[k][j] + allPairs[l][j]) * 0.5;
+ else
+ allPairs[j][i] = allPairs[i][j] = allPairs[i][i];
+ }
+
+ fam = &f;
+ }
+
+// Looks up the value for p1 and p2 in the table built by Setup(),
+// indexed by the traverse members of the two individuals
+double KinshipX::operator() (Person & p1, Person & p2)
+   {
+   return allPairs[p1.traverse][p2.traverse];
+   }
+
+
+
+
+
diff --git a/libsrc/KinshipX.h b/libsrc/KinshipX.h
new file mode 100644
index 0000000..82853b6
--- /dev/null
+++ b/libsrc/KinshipX.h
@@ -0,0 +1,41 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/KinshipX.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __KINSHIPX_H__
+#define __KINSHIPX_H__
+
+#include "Pedigree.h"
+#include "MathMatrix.h"
+
+// Pairwise, sex-dependent kinship values for the members of one
+// family. Setup() fills the allPairs table and records the family;
+// operator () looks up the value for two individuals.
+class KinshipX
+   {
+   public:
+      Matrix   allPairs;   // table filled by Setup()
+      Family * fam;        // family passed to the last Setup() call
+
+      KinshipX() : allPairs(), fam(NULL)
+         { }
+
+      void Setup(Family & f);
+
+      double operator () (Person & p1, Person & p2);
+   };
+
+#endif
+
+
diff --git a/libsrc/LongArray.cpp b/libsrc/LongArray.cpp
new file mode 100644
index 0000000..baead30
--- /dev/null
+++ b/libsrc/LongArray.cpp
@@ -0,0 +1,148 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/LongArray.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "LongArray.h"
+#include "Hash.h"
+#include "Sort.h"
+
+#include <string.h>
+
+// Allocation granularity used by the constructor and by Grow() when
+// they choose the capacity of the items buffer
+int LongArray::alloc = 4;
+
+// Creates an array of start_size elements, with the capacity rounded
+// up to a multiple of alloc
+LongArray::LongArray(int start_size)
+   {
+   size  = (start_size + alloc) / alloc * alloc;
+   count = start_size;
+   items = new longint [size];
+   }
+
+// Copy constructor: same capacity as source, first count elements copied
+LongArray::LongArray(LongArray & source)
+   {
+   size  = source.size;
+   count = source.count;
+   items = new longint [size];
+
+   int i = count;
+   while (i-- > 0)
+      items[i] = source.items[i];
+   }
+
+// Releases the heap buffer that holds the elements
+LongArray::~LongArray()
+ {
+ delete [] items;
+ }
+
+// Enlarges the buffer when new_size exceeds the current capacity;
+// existing elements are preserved. Requests of at least twice the
+// current capacity are rounded to a multiple of alloc, smaller ones
+// pick the first alloc * 2^k above new_size.
+void LongArray::Grow(int new_size)
+   {
+   if (new_size <= size)
+      return;
+
+   if ((new_size >> 1) >= size)
+      size = (new_size + alloc) / alloc * alloc;
+   else
+      for (size = alloc; size <= new_size; size *= 2)
+         ;
+
+   longint * bigger = new longint [size];
+
+   for (int i = count - 1; i >= 0; i--)
+      bigger[i] = items[i];
+
+   delete [] items;
+   items = bigger;
+   }
+
+// Adds value at the end of the array and returns the new length
+int LongArray::Append(longint value)
+   {
+   Grow(count + 1);
+   items[count] = value;
+   return ++count;
+   }
+
+// Assigns value to every element
+void LongArray::Set(longint value)
+   {
+   int i = count;
+   while (i-- > 0)
+      items[i] = value;
+   }
+
+// Removes the element at index, shifting later elements down by one,
+// and returns the new length. The index is not range checked.
+int LongArray::Delete(int index)
+   {
+   int trailing = --count - index;
+
+   if (trailing != 0)
+      memmove(items + index, items + index + 1, sizeof(longint) * trailing);
+
+   return count;
+   }
+
+// Inserts value at index, shifting the elements from index onwards up
+// by one. The index is not range checked.
+void LongArray::InsertAt(int index, longint value)
+   {
+   Grow(count + 1);
+
+   longint * slot = items + index;
+   memmove(slot + 1, slot, sizeof(longint) * (count - index));
+
+   *slot = value;
+   count++;
+   }
+
+// Copies the length and elements of rhs; capacity only ever grows
+LongArray & LongArray::operator = (const LongArray & rhs)
+   {
+   Grow(rhs.count);
+   count = rhs.count;
+
+   int i = count;
+   while (i-- > 0)
+      items[i] = rhs.items[i];
+
+   return *this;
+   }
+
+// Linear search: index of the first element equal to value, or -1
+int LongArray::Find(longint value) const
+   {
+   int i = 0;
+
+   while (i < count)
+      {
+      if (value == items[i])
+         return i;
+      i++;
+      }
+
+   return -1;
+   }
+
+// Sets every element to zero
+void LongArray::Zero()
+   {
+   int i = count;
+   while (i-- > 0)
+      items[i] = 0;
+   }
+
+// Reverses the order of the elements in place
+void LongArray::Reverse()
+   {
+   int lo = 0;
+   int hi = count - 1;
+
+   while (lo < hi)
+      {
+      Swap(lo, hi);
+      lo++;
+      hi--;
+      }
+   }
+
+// Arrays are equal when they have the same length and the same
+// elements in the same order
+bool LongArray::operator == (const LongArray & rhs) const
+   {
+   if (count != rhs.count)
+      return false;
+
+   int i = 0;
+   while (i < rhs.count)
+      {
+      if (items[i] != rhs.items[i])
+         return false;
+      i++;
+      }
+
+   return true;
+   }
+
+// Negation of operator ==
+bool LongArray::operator != (const LongArray & rhs) const
+ {
+ return !(*this == rhs);
+ }
+
+// Hashes the raw bytes of the first count elements with hash() from
+// Hash.h, passing initval through
+int LongArray::Hash(int initval)
+ {
+ return hash((unsigned char *) items, sizeof(longint) * count, initval);
+ }
+
diff --git a/libsrc/LongArray.h b/libsrc/LongArray.h
new file mode 100644
index 0000000..0010305
--- /dev/null
+++ b/libsrc/LongArray.h
@@ -0,0 +1,78 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/LongArray.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __LONGINTARRAY_H__
+#define __LONGINTARRAY_H__
+
+#include "LongInt.h"
+
+// Growable array of longint values (see LongInt.h) with stack-style
+// helpers and linear search. Members without an inline body are
+// declared here and expected to be defined in LongArray.cpp. None of
+// the element accessors check that the index is in range.
+class LongArray
+   {
+   private:
+      // items: heap buffer, size: allocated capacity, count: elements in use
+      longint * items;
+      int size, count;
+
+      void Grow(int new_size);
+      static int Compare(int * a, int * b);
+
+   public:
+      // Allocation granularity shared by all LongArrays
+      static int alloc;
+
+      LongArray(int start_size = 0);
+      LongArray(LongArray & source);
+      ~LongArray();
+
+      LongArray & operator = (const LongArray & rhs);
+
+      longint & operator [] (int index) { return items[index]; }
+
+      // Stack interface; Pop, Peek and Last do not check for an empty array
+      int Append(longint value);
+      void Push(longint value) { Append(value); }
+      longint Pop() { return items[--count]; }
+      longint Peek() const { return items[count - 1]; }
+      longint &Last() const { return items[count - 1]; }
+
+      int Delete(int index);
+      void InsertAt(int index, longint value);
+
+      int Find(longint value) const;
+      void Sort();
+
+      void Zero();
+      void Set(longint value);
+
+      // const-qualified (as in IntArray) so that the length can be
+      // queried through a const LongArray &
+      int Length() const { return count; }
+
+      // Dimension changes the length (growing the buffer when needed);
+      // Clear only resets the length and keeps the buffer
+      void Dimension(int new_count) { Grow(new_count); count = new_count; }
+      void Clear() { count = 0; }
+
+      // Exchanges the elements at positions i and j
+      void Swap(int i, int j)
+         { longint tmp = items[i]; items[i] = items[j]; items[j] = tmp; }
+
+      void Reverse();
+
+      // Direct access to the underlying buffer
+      operator longint * () { return items; }
+
+      bool operator == (const LongArray & rhs) const;
+      bool operator != (const LongArray & rhs) const;
+
+      int Hash(int initval);
+   };
+
+#endif /* __LONGINTARRAY_H__ */
+
+
+
diff --git a/libsrc/LongHash.cpp b/libsrc/LongHash.cpp
new file mode 100644
index 0000000..14a15b0
--- /dev/null
+++ b/libsrc/LongHash.cpp
@@ -0,0 +1,20 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/LongHash.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+
+
+
diff --git a/libsrc/LongHash.h b/libsrc/LongHash.h
new file mode 100644
index 0000000..1a45555
--- /dev/null
+++ b/libsrc/LongHash.h
@@ -0,0 +1,247 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/LongHash.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __LONGHASH_H__
+#define __LONGHASH_H__
+
+#include "Error.h"
+
+#include <limits.h>
+
+#ifdef UINT_MAX
+#define LH_NOTFOUND (UINT_MAX)
+#else
+#define LH_NOTFOUND 0xFFFFFFFF
+#endif
+
+// Open-addressing hash table with linear probing that maps long long
+// keys to ObjectT values. The number of slots must be a power of two,
+// so that (key & mask) selects the first slot to probe. Functions that
+// return a slot report a missing key as LH_NOTFOUND.
+//
+// NOTE(review): the class owns three heap arrays but defines no copy
+// constructor, and operator = is declared without a definition in this
+// header - copying a LongHash looks unsafe; confirm before relying on it.
+template <class ObjectT> class LongHash
+   {
+   protected:
+      ObjectT   * objects;       // values, parallel to keys
+      long long * keys;
+      bool      * occupancy;     // occupancy[i] is true when slot i holds an entry
+      unsigned int count, size;  // entries stored / slots allocated
+      unsigned int mask;         // size - 1
+      bool allowDuplicates;      // several entries may share one key
+
+   public:
+      // Creates an empty table with startsize slots; calls error()
+      // unless startsize is a power of two
+      LongHash(int startsize = 32)
+         {
+         count = 0;
+         size  = startsize;
+         mask  = startsize - 1;
+
+         // In this implementation, the size of hash tables must be a power of two
+         if (startsize & mask)
+            error("LongHash: Hash table size must be a power of two.\n");
+
+         occupancy = new bool [size];
+         objects   = new ObjectT [size];
+         keys      = new long long [size];
+
+         allowDuplicates = false;
+
+         for (unsigned int i = 0; i < size; i++)
+            { occupancy[i] = false; }
+         };
+
+      ~LongHash()
+         {
+         delete [] occupancy;
+         delete [] objects;
+         delete [] keys;
+         }
+
+      void Grow()   { SetSize(size * 2); }
+      void Shrink() { SetSize(size / 2); }
+
+      // Rebuilds the table with newsize slots, re-inserting every
+      // entry. When duplicate keys are not allowed, entries that land
+      // on an already stored key overwrite it and count is reduced.
+      void SetSize(int newsize)
+         {
+         int newmask = newsize - 1;
+
+         bool      * newoccupancy = new bool [newsize];
+         ObjectT   * newobjects   = new ObjectT [newsize];
+         long long * newkeys      = new long long [newsize];
+
+         for (int i = 0; i < newsize; i++)
+            newoccupancy[i] = false;
+
+         if (count)
+            for (unsigned int i = 0; i < size; i++)
+               if (occupancy[i] != false)
+                  {
+                  long long    key = keys[i];
+                  unsigned int h   = newmask & (unsigned int) key;
+
+                  while ( newoccupancy[h] == true && ( newkeys[h] != key || allowDuplicates ))
+                     h = (h + 1) & newmask;
+
+                  if ( newoccupancy[h] )
+                     count--;
+
+                  newkeys[h] = key;
+                  newobjects[h] = objects[i];
+                  newoccupancy[h] = true;
+                  }
+
+         delete [] occupancy;
+         delete [] objects;
+         delete [] keys;
+
+         occupancy = newoccupancy;
+         objects   = newobjects;
+         keys      = newkeys;
+         size      = newsize;
+         mask      = newmask;
+         }
+
+      // Removes all entries and shrinks large tables back to 32 slots
+      void Clear()
+         {
+         count = 0;
+
+         if (size > 32)
+            SetSize(32);
+
+         for (unsigned int i = 0; i < size; i++)
+            occupancy[i] = false;
+         }
+
+      int Capacity() const { return size; }
+      int Entries() const  { return count; }
+
+      // Access to the object stored in slot i (no occupancy check)
+      ObjectT   Object(int i) const { return objects[i]; }
+      ObjectT & Object(int i)       { return objects[i]; }
+
+      void SetObject(int i, ObjectT object)
+         { objects[i] = object; }
+
+      // Stores object under key and returns the slot used. The table
+      // is doubled first when it is more than half full. With
+      // duplicates enabled, probing continues past entries with the
+      // same key that hold a different object.
+      unsigned int Add (long long key, ObjectT object)
+         {
+         if (count * 2 > size)
+            Grow();
+
+         unsigned int h = Iterate(key);
+
+         while (allowDuplicates && occupancy[h] && objects[h] != object)
+            h = ReIterate(key, h);
+
+         if (!occupancy[h])
+            {
+            occupancy[h] = true;
+            keys[h] = key;
+            count++;
+            }
+
+         objects[h] = object;
+
+         return h;
+         }
+
+      // Slot of the first entry stored under key, or LH_NOTFOUND
+      unsigned int Find(long long key)
+         {
+         unsigned int h = Iterate(key);
+
+         return occupancy[h] ? h : LH_NOTFOUND;
+         }
+
+      // Slot of the next entry stored under key after slot h, or
+      // LH_NOTFOUND
+      unsigned int Rehash(long long key, unsigned int h)
+         {
+         h = ReIterate(key, h);
+
+         return occupancy[h] ? h : LH_NOTFOUND;
+         }
+
+      LongHash & operator = (const LongHash & rhs);
+
+      ObjectT operator [] (int i) const          { return objects[i]; }
+      ObjectT operator [] (unsigned int i) const { return objects[i]; }
+
+      // Removes the entry in slot index, if any. The table is halved
+      // when it becomes sparse; otherwise the entries that follow the
+      // freed slot are re-placed so that probing still reaches them.
+      void Delete(unsigned int index)
+         {
+         if (index >= size || !occupancy[index])
+            return;
+
+         occupancy[index] = false;
+         count--;
+
+         if (count * 8 < size && size > 32)
+            Shrink();
+         else
+            {
+            // rehash the next entries until we find empty slot
+            index = (index + 1) & mask;
+
+            while (occupancy[index])
+               {
+               if ((keys[index] & mask) != index)
+                  {
+                  unsigned int h = Iterate(keys[index]);
+
+                  while (occupancy[h] && objects[h] != objects[index])
+                     h = ReIterate(keys[index], h);
+
+                  if (h != (unsigned int) index)
+                     {
+                     keys[h] = keys[index];
+                     occupancy[h] = true;
+                     objects[h] = objects[index];
+
+                     occupancy[index] = false;
+                     }
+                  }
+
+               index = (index + 1) & mask;
+               }
+            }
+         }
+
+      // True when the slot holds an entry. (These used to return
+      // occupancy[index] == false, i.e. the opposite of what the name
+      // promises.)
+      bool SlotInUse(int index)          { return occupancy[index]; }
+      bool SlotInUse(unsigned int index) { return occupancy[index]; }
+
+      // Switches duplicate keys on or off; switching them off rebuilds
+      // a non-empty table so that duplicates collapse
+      void SetAllowDuplicateKeys(bool toggle)
+         {
+         allowDuplicates = toggle;
+
+         if (count && !allowDuplicates)
+            SetSize(size);
+         }
+
+   private:
+      // First slot, probing from key's home slot, that is either empty
+      // or holds key
+      unsigned int Iterate(long long key) const
+         {
+         unsigned int h = mask & (unsigned int) key;
+
+         while (occupancy[h] == true && keys[h] != key)
+            h = (h + 1) & mask;
+
+         return h;
+         }
+
+      // Same as Iterate, but probing starts just after slot h
+      unsigned int ReIterate(long long key, unsigned int h) const
+         {
+         h = (h + 1) & mask;
+
+         while (occupancy[h] == true && keys[h] != key)
+            h = (h + 1) & mask;
+
+         return h;
+         }
+   };
+
+#endif
+
diff --git a/libsrc/LongInt.h b/libsrc/LongInt.h
new file mode 100644
index 0000000..b20f783
--- /dev/null
+++ b/libsrc/LongInt.h
@@ -0,0 +1,204 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/LongInt.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __LONGINT_H__
+#define __LONGINT_H__
+
+#ifdef __USE_LONGINT
+#ifndef __USE_LONG_INT
+#define __USE_LONG_INT
+#endif
+#endif
+
+#ifndef __USE_LONG_INT /* longints not enabled */
+
+#define NOTZERO ~0
+#define NOTONE ~1
+typedef int longint;
+
+#else /* longints enabled */
+
+/* GNU C supports long long ... */
+
+#ifdef __GNUC__
+#define __USE_LONG_LONG__
+#endif
+
+/* And so does the Intel Compiler ... */
+
+#ifdef __INTEL_COMPILER
+#define __USE_LONG_LONG__
+#endif
+
+/* And the SUN Pro Compiler ... */
+
+#ifdef __SUNPRO_CC
+#define __USE_LONG_LONG__
+#endif
+
+/* And the Digital Mars Compiler ... */
+
+#ifdef __DMC__
+#ifdef _INTEGRAL_MAX_BITS
+#if (_INTEGRAL_MAX_BITS >= 64)
+#define __USE_LONG_LONG__
+#endif
+#endif
+#endif
+
+/* Check for other compilers that support the C99 standard */
+
+#include "limits.h"
+#ifdef __LLONG_MAX
+#define __USE_LONG_LONG__
+#endif
+
+#ifdef __USE_LONG_LONG__
+
+/* If the long long type is supported natively */
+
+#define NOTZERO ~(0ULL)
+#define NOTONE ~(1ULL)
+typedef long long longint;
+
+#else
+
+/* Define a home brew long integer type */
+
+#define NOTZERO longint (~0,~0)
+#define NOTONE longint (~0,~1)
+
+// Home brew unsigned integer built from two unsigned int halves
+// (hi = upper half, lo = lower half). The shift code assumes each half
+// holds 32 bits. Plain int operands are reinterpreted as unsigned, so
+// negative values are not supported.
+class longint
+   {
+   public:
+      // Leaves both halves uninitialized
+      longint() {}
+
+      longint(unsigned int low)
+         { lo = low; hi = 0; }
+
+      longint(unsigned int high, unsigned int low)
+         { hi = high; lo = low; }
+
+      longint(const longint & source)
+         { hi = source.hi; lo = source.lo; }
+
+      // Conversions: int keeps only the lower half, bool tests for non-zero
+      operator int() const { return lo; }
+      operator bool() const { return lo != 0 || hi != 0; }
+
+      // Bitwise operators (read-only, hence const)
+      longint operator ~ () const
+         { return longint(~hi, ~lo); }
+
+      longint operator ^ (const longint & rhs) const
+         { return longint(hi ^ rhs.hi, lo ^ rhs.lo); }
+
+      longint operator & (const longint & rhs) const
+         { return longint(hi & rhs.hi, lo & rhs.lo); }
+
+      longint operator | (const longint & rhs) const
+         { return longint(hi | rhs.hi, lo | rhs.lo); }
+
+      // Comparisons (const, so they also work on const operands)
+      bool operator != (const longint & rhs) const
+         { return lo != rhs.lo || hi != rhs.hi; }
+
+      bool operator != (unsigned int rhs) const
+         { return lo != rhs || hi != 0; }
+
+      bool operator != (int rhs) const
+         { return lo != (unsigned int) rhs || hi != 0; }
+
+      bool operator == (const longint & rhs) const
+         { return lo == rhs.lo && hi == rhs.hi; }
+
+      bool operator == (const unsigned int rhs) const
+         { return lo == rhs && hi == 0; }
+
+      bool operator == (const int rhs) const
+         { return lo == (unsigned int) rhs && hi == 0; }
+
+      // Assignments
+      longint & operator = (const longint & rhs)
+         { lo = rhs.lo; hi = rhs.hi; return *this; }
+
+      longint & operator = (unsigned int rhs)
+         { lo = rhs; hi = 0; return *this; }
+
+      longint & operator = (int rhs)
+         { lo = rhs; hi = 0; return *this; }
+
+      longint & operator ^= (const longint & rhs)
+         { hi ^= rhs.hi; lo ^= rhs.lo; return *this; }
+
+      longint & operator |= (const longint & rhs)
+         { hi |= rhs.hi; lo |= rhs.lo; return *this; }
+
+      // Returns a reference, like ^= and |=, so that chained compound
+      // assignments act on this object rather than on a temporary copy
+      longint & operator &= (const longint & rhs)
+         { hi &= rhs.hi; lo &= rhs.lo; return *this; }
+
+      longint operator << (int bits) const
+         { longint result(*this); result <<= bits; return result; }
+
+      // Shifts left one bit at a time, carrying the top bit of lo into hi.
+      // Non-positive shift counts leave the value unchanged.
+      longint & operator <<= (int bits)
+         {
+         for (int i = 0; i < bits; i++)
+            {
+            hi = (hi << 1) + ((lo & 0x80000000) != 0);
+            lo <<= 1;
+            }
+         return *this;
+         }
+
+      longint operator >> (int bits) const
+         { longint result(*this); result >>= bits; return result; }
+
+      // Shifts right one bit at a time, carrying the low bit of hi into lo.
+      // Non-positive shift counts leave the value unchanged.
+      longint & operator >>= (int bits)
+         {
+         for (int i = 0; i < bits; i++)
+            {
+            lo = (lo >> 1) + (hi & 1 ? 0x80000000 : 0);
+            hi >>= 1;
+            }
+         return *this;
+         }
+
+      // Subtraction with borrow from the upper half
+      longint operator - (unsigned int rhs) const
+         {
+         unsigned int high = (rhs > lo) ? hi - 1 : hi;
+         return longint(high, lo - rhs);
+         }
+
+      // rhs is reinterpreted as unsigned before subtracting
+      longint operator - (int rhs) const
+         {
+         unsigned int high = ((unsigned int) rhs > lo) ? hi - 1 : hi;
+         return longint(high, lo - rhs);
+         }
+
+   private:
+      unsigned int hi, lo;
+   };
+
+#endif /* __USE_LONG_LONG__ */
+
+#endif /* __USE_LONG_INT */
+
+#endif /* __LONGINT_H__ */
+
+
+
+
+
+
diff --git a/libsrc/LongLongCounter.cpp b/libsrc/LongLongCounter.cpp
new file mode 100644
index 0000000..fa401e2
--- /dev/null
+++ b/libsrc/LongLongCounter.cpp
@@ -0,0 +1,60 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/LongLongCounter.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "LongLongCounter.h"
+
+// Builds an empty counter on top of LongHash<int> and turns duplicate
+// keys off, so each key has at most one tally.
+LongCounter::LongCounter() : LongHash<int>()
+ {
+ SetAllowDuplicateKeys(false);
+ }
+
+// Adds one to the tally kept for key. A key seen for the first time is
+// stored with a tally of 1; a tally of -1 is removed from the table
+// rather than being stored as zero.
+void LongCounter::IncrementCount(long long key)
+   {
+   int position = Find(key);
+
+   if (position == LH_NOTFOUND)
+      {
+      Add(key, 1);
+      return;
+      }
+
+   if (Object(position) == -1)
+      {
+      Delete(position);
+      return;
+      }
+
+   Object(position)++;
+   }
+
+// Subtracts one from the tally kept for key. A key seen for the first
+// time is stored with a tally of -1; a tally of 1 is removed from the
+// table rather than being stored as zero.
+void LongCounter::DecrementCount(long long key)
+   {
+   int position = Find(key);
+
+   if (position == LH_NOTFOUND)
+      {
+      Add(key, -1);
+      return;
+      }
+
+   if (Object(position) == 1)
+      {
+      Delete(position);
+      return;
+      }
+
+   Object(position)--;
+   }
+
+// Returns the tally currently kept for key, or 0 when the key is not in
+// the table. This is a pure query: the stored tally is left untouched
+// (the previous version post-decremented it on every call).
+int LongCounter::GetCount(long long key)
+   {
+   int slot = Find(key);
+
+   if (slot == LH_NOTFOUND)
+      return 0;
+
+   return Object(slot);
+   }
+
+
+
diff --git a/libsrc/LongLongCounter.h b/libsrc/LongLongCounter.h
new file mode 100644
index 0000000..e8a8c32
--- /dev/null
+++ b/libsrc/LongLongCounter.h
@@ -0,0 +1,36 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/LongLongCounter.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __LONGLONGCOUNTER_H_
+#define __LONGLONGCOUNTER_H_
+
+#include "LongHash.h"
+
+// Keeps a signed tally per long long key, stored in a LongHash<int>.
+class LongCounter : public LongHash<int>
+ {
+ public:
+ LongCounter();
+
+ // Adjust the tally for key up or down by one
+ void IncrementCount(long long key);
+ void DecrementCount(long long key);
+ // Tally for key (0 when key is not stored)
+ int GetCount(long long key);
+ };
+
+#endif
+
+
+
diff --git a/libsrc/MapFunction.cpp b/libsrc/MapFunction.cpp
new file mode 100644
index 0000000..7d5ca9e
--- /dev/null
+++ b/libsrc/MapFunction.cpp
@@ -0,0 +1,44 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MapFunction.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MapFunction.h"
+#include "MathConstant.h"
+
+#include <math.h>
+
+// Converts a map distance into a recombination fraction using
+// (1 - exp(-2 * distance)) / 2, i.e. Haldane's map function.
+// NOTE(review): distance is presumably in Morgans - confirm with callers.
+double DistanceToRecombination(double distance)
+   {
+   const double decay = exp(-2.0 * distance);
+
+   return (1.0 - decay) * 0.5;
+   }
+
+// Inverse of DistanceToRecombination: -log(1 - 2 * recombination) / 2.
+// The argument of the logarithm is floored at 1e-7 so that fractions
+// at or above 0.5 still give a finite distance.
+double RecombinationToDistance(double recombination)
+   {
+   const double linkage = max(1.0 - 2 * recombination, 1e-7);
+
+   return log(linkage) * -0.5;
+   }
+
+// Converts a map distance into a recombination fraction with Kosambi's
+// map function, 0.5 * (e^(4d) - 1) / (e^(4d) + 1), which is 0.5 * tanh(2d).
+// The tanh form is used because exp(4 * distance) overflows to infinity
+// for large distances and the quotient then evaluates to NaN; tanh
+// saturates cleanly at +/-1 instead.
+double KosambiDistanceToRecombination(double distance)
+   {
+   return 0.5 * tanh(2.0 * distance);
+   }
+
+// Inverse of KosambiDistanceToRecombination:
+// 0.25 * log((1 + 2 * theta) / (1 - 2 * theta)), with the denominator
+// floored at 1e-7 so that theta at or above 0.5 stays finite.
+double RecombinationToKosambiDistance(double theta)
+   {
+   const double numerator = 1.0 + 2*theta;
+   const double denominator = max(1.0 - 2.0*theta, 1e-7);
+
+   return 0.25 * log(numerator / denominator);
+   }
+
diff --git a/libsrc/MapFunction.h b/libsrc/MapFunction.h
new file mode 100644
index 0000000..0a91ced
--- /dev/null
+++ b/libsrc/MapFunction.h
@@ -0,0 +1,25 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MapFunction.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MAPFUNCTION_H__
+#define __MAPFUNCTION_H__
+
+// Haldane map function: map distance <-> recombination fraction
+double DistanceToRecombination(double distance);
+double RecombinationToDistance(double recombination);
+
+// Kosambi map function: map distance <-> recombination fraction
+// (defined in MapFunction.cpp but previously not declared here)
+double KosambiDistanceToRecombination(double distance);
+double RecombinationToKosambiDistance(double theta);
+
+#endif
+
diff --git a/libsrc/MathConstant.h b/libsrc/MathConstant.h
new file mode 100644
index 0000000..56a8ffc
--- /dev/null
+++ b/libsrc/MathConstant.h
@@ -0,0 +1,52 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MathConstant.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MATHCONSTANT_H__
+#define __MATHCONSTANT_H__
+
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+
+#include "math.h"
+#include "stdlib.h"
+
+// Constants for numerical routines
+//
+
+#define TINY 1.0e-30 // A small number
+#define ITMAX 200 // Maximum number of iterations
+#define EPS 3.0e-7 // Relative accuracy
+#define ZEPS 3.0e-10 // Precision around zero
+#define FPMIN 1.0e-30 // Number near the smallest representable number
+#define FPMAX 1.0e+100 // Number near the largest representable number
+#define TOL 1.0e-6 // Zero SVD values below this
+#define GOLD 0.61803399 // Golden ratio
+#define CGOLD 0.38196601 // Complement of golden ratio
+
+// Small numeric helpers for doubles.
+// square: a * a
+inline double square(double a)
+   {
+   return a * a;
+   }
+
+// sign: magnitude of a carrying the sign of b (b >= 0 counts as positive)
+inline double sign(double a, double b)
+   {
+   if (b >= 0)
+      return fabs(a);
+
+   return -fabs(a);
+   }
+
+// min: a when a < b, otherwise b
+inline double min(double a, double b)
+   {
+   if (a < b)
+      return a;
+
+   return b;
+   }
+
+// max: a when a > b, otherwise b
+inline double max(double a, double b)
+   {
+   if (a > b)
+      return a;
+
+   return b;
+   }
+
+// Small numeric helpers for ints.
+// square: a * a
+inline int square(int a)
+   {
+   return a * a;
+   }
+
+// sign: magnitude of a carrying the sign of b (b >= 0 counts as positive)
+inline int sign(int a, int b)
+   {
+   if (b >= 0)
+      return abs(a);
+
+   return -abs(a);
+   }
+
+// min: a when a < b, otherwise b
+inline int min(int a, int b)
+   {
+   if (a < b)
+      return a;
+
+   return b;
+   }
+
+// max: a when a > b, otherwise b
+inline int max(int a, int b)
+   {
+   if (a > b)
+      return a;
+
+   return b;
+   }
+
+#endif
+
diff --git a/libsrc/MathMatrix.cpp b/libsrc/MathMatrix.cpp
new file mode 100644
index 0000000..4f8288c
--- /dev/null
+++ b/libsrc/MathMatrix.cpp
@@ -0,0 +1,711 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MathMatrix.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MathMatrix.h"
+#include "MathVector.h"
+#include "MathConstant.h"
+#include "Sort.h"
+#include "Error.h"
+
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+
+int Matrix::alloc = 2;
+
+// Releases every allocated row Vector, then the row-pointer table and
+// the per-column extras array.
+Matrix::~Matrix()
+   {
+   int row = 0;
+
+   while (row < size)
+      {
+      delete data[row];
+      row++;
+      }
+
+   if (size != 0)
+      delete [] data;
+
+   if (extraSize != 0)
+      delete [] extras;
+   }
+
+// Puts the matrix into its empty state: no storage, zero dimensions
+// and the default label.
+void Matrix::Init()
+   {
+   data = NULL;
+   extras = NULL;
+
+   size = 0;
+   extraSize = 0;
+   cols = 0;
+   rows = 0;
+
+   label = "[Matrix]";
+   }
+
+// Sets the matrix label to name wrapped in square brackets, "[name]".
+void Matrix::SetLabel(const char * name)
+ {
+ label = '[';
+ label += name;
+ label += ']';
+ }
+
+// Resizes the matrix to m rows by n columns. Storage only ever grows:
+// the extras array and the row-pointer table are reallocated in
+// multiples of alloc when n or m exceed their current capacity, and
+// existing entries are carried over.
+void Matrix::Dimension(int m, int n)
+ {
+ if (n == cols && m == rows)
+ return;
+
+ // grow the per-column extras array if it cannot hold n columns
+ if (n > extraSize)
+ {
+ int newSize = (n + alloc) / alloc * alloc;
+ ColumnExtras * newExtras = new ColumnExtras [newSize];
+
+ if (extras != NULL)
+ for (int i = 0; i < extraSize; i++)
+ newExtras[i] = extras[i];
+
+ if (extraSize)
+ delete [] extras;
+
+ extraSize = newSize;
+ extras = newExtras;
+ }
+
+ // grow the row-pointer table if it cannot hold m rows; every new
+ // slot receives a freshly allocated Vector of n elements
+ if (m > size)
+ {
+ int newSize = (m + alloc) / alloc * alloc;
+ Vector ** newData = new Vector * [newSize];
+
+ if (data != NULL)
+ for (int i = 0; i < size; i++)
+ newData[i] = data[i];
+
+ for (int i = size; i < newSize; i++)
+ newData[i] = new Vector(n);
+
+ if (size)
+ delete [] data;
+
+ size = newSize;
+ data = newData;
+ }
+
+ // rows that were already in use are re-dimensioned when the column count changes
+ if (cols != n)
+ for (int i = 0; i < rows; i++)
+ data[i]->Dimension(n);
+
+ // rows that come into use are always brought to n columns
+ if (rows != m)
+ for (int i = rows; i < m; i++)
+ data[i]->Dimension(n);
+
+ rows = m;
+ cols = n;
+ }
+
+// Sets every element of the matrix to 0.0.
+void Matrix::Zero()
+   {
+   for (int r = 0; r < rows; r++)
+      {
+      Vector & row = *(data[r]);
+
+      for (int c = 0; c < cols; c++)
+         row[c] = 0.0;
+      }
+   }
+
+void Matrix::Identity()
+ {
+ if (rows != cols)
+ error("Matrix.Identity - Identity matrices must be square");
+
+ for (int i = 0; i < rows; i++)
+ for (int j = 0; j < cols; j++)
+ if (i == j)
+ (*(data[i]))[j] = 1.0;
+ else
+ (*(data[i]))[j] = 0.0;
+ }
+
+// Sets every element of the matrix to k.
+void Matrix::Set(double k)
+   {
+   for (int r = 0; r < rows; r++)
+      {
+      Vector & row = *(data[r]);
+
+      for (int c = 0; c < cols; c++)
+         row[c] = k;
+      }
+   }
+
+// Flips the sign of every element of the matrix.
+void Matrix::Negate()
+   {
+   for (int r = 0; r < rows; r++)
+      {
+      Vector & row = *(data[r]);
+
+      for (int c = 0; c < cols; c++)
+         row[c] = -row[c];
+      }
+   }
+
+// Resizes this matrix to the dimensions of m and copies m's elements.
+// Labels and column extras are not copied here.
+void Matrix::Copy(const Matrix & m)
+   {
+   Dimension(m.rows, m.cols);
+
+   // a source without storage has nothing to copy
+   if (m.data == NULL)
+      return;
+
+   for (int r = 0; r < rows; r++)
+      for (int c = 0; c < cols; c++)
+         (*this)[r][c] = m[r][c];
+   }
+
+// Makes this matrix the transpose of m: it is resized to
+// m.cols x m.rows and element [r][c] receives m[c][r].
+void Matrix::Transpose(const Matrix & m)
+   {
+   Dimension(m.cols, m.rows);
+
+   for (int r = 0; r < rows; r++)
+      {
+      Vector & row = *(data[r]);
+
+      for (int c = 0; c < cols; c++)
+         row[c] = m[c][r];
+      }
+   }
+
+// Adds the constant k to every element of the matrix.
+void Matrix::Add(double k)
+   {
+   for (int r = 0; r < rows; r++)
+      {
+      Vector & row = *(data[r]);
+
+      for (int c = 0; c < cols; c++)
+         row[c] += k;
+      }
+   }
+
+// Multiplies every element of the matrix by the constant k.
+void Matrix::Multiply(double k)
+   {
+   for (int r = 0; r < rows; r++)
+      {
+      Vector & row = *(data[r]);
+
+      for (int c = 0; c < cols; c++)
+         row[c] *= k;
+      }
+   }
+
+// Adds matrix m to this matrix element by element. Both dimensions must
+// match; a mismatch in EITHER rows or columns is reported through
+// error(). (The previous check used && and therefore only fired when
+// both dimensions differed, letting e.g. 2x3 + 2x4 read out of range.)
+void Matrix::Add(const Matrix & m)
+   {
+   if ( (rows != m.rows) || (cols != m.cols) )
+      error("Matrix.Add - Attempted to add incompatible matrices\n"
+            "Matrices - %s [%d, %d] + %s [%d, %d]\n",
+            (const char *) label, rows, cols,
+            (const char *) m.label, m.rows, m.cols);
+
+   for (int i = 0; i < rows; i++)
+      for (int j = 0; j < cols; j++)
+         (*(data[i]))[j] += m[i][j];
+   }
+
+// Adds k * m to this matrix element by element. Both dimensions must
+// match; a mismatch in EITHER rows or columns is reported through
+// error(). (The previous check used && and therefore only fired when
+// both dimensions differed.)
+void Matrix::AddMultiple(double k, const Matrix & m)
+   {
+   if ( (rows != m.rows) || (cols != m.cols) )
+      error("Matrix.AddMultiple - Attempted to add incompatible matrices\n"
+            "Matrices - %s [%d, %d] + k * %s [%d, %d]\n",
+            (const char *) label, rows, cols,
+            (const char *) m.label, m.rows, m.cols);
+
+   for (int i = 0; i < rows; i++)
+      for (int j = 0; j < cols; j++)
+         (*(data[i]))[j] += k * m[i][j];
+   }
+
+
+// Stores the matrix product l * r in this matrix, which is resized to
+// l.rows x r.cols. The inner dimensions (l.cols and r.rows) must agree;
+// otherwise error() is called. The diagnostic now names this method and
+// shows the operation as a product (it previously read "Matrix.Multiply"
+// and "+").
+// NOTE: this matrix is resized and zeroed before l and r are read, so
+// it must not be the same object as l or r.
+void Matrix::Product(const Matrix & l, const Matrix & r)
+   {
+   if (l.cols != r.rows)
+      error("Matrix.Product - Attempted to multiply incompatible matrices\n"
+            "Matrices - %s [%d, %d] * %s [%d, %d]\n",
+            (const char *) l.label, l.rows, l.cols,
+            (const char *) r.label, r.rows, r.cols);
+
+   Dimension(l.rows, r.cols);
+   Zero();
+
+   for (int k = 0; k < l.cols; k++)
+      for (int i = 0; i < rows; i++)
+         for (int j = 0; j < cols; j++)
+            (*(data[i]))[j] += l[i][k] * r[k][j];
+   }
+
+// Adds k times row r1 to row r2. Row r1 itself is left unchanged
+// because the scaling is done on a temporary copy.
+void Matrix::AddRows(double k, int r1, int r2)
+ {
+ Vector v(*(data[r1]));
+
+ v.Multiply(k);
+ data[r2]->Add(v);
+ }
+
+// Multiplies every element of row r1 by k.
+void Matrix::MultiplyRow(int r1, double k)
+ { data[r1]->Multiply(k); }
+
+// Adds row r1 to row r2 (row r2 is modified, row r1 is not).
+void Matrix::AddRows(int r1, int r2)
+ { data[r2]->Add(*(data[r1])); }
+
+// Reduces the matrix to row echelon form in place, using the largest
+// absolute value in each column as pivot. Pivots whose magnitude does
+// not exceed tol are treated as zero and the rest of that column is
+// cleared.
+//
+// Fixes relative to the previous version:
+//  * the elimination loop tested r < rows instead of i < rows, so it
+//    never terminated and walked past the last row;
+//  * the early exit used r > rows, allowing one pass with r == rows.
+void Matrix::Reduce(double tol)
+   {
+   double pivot;
+   int pivotr = 0;  // Initializing pivotr is not necessary, but avoids warnings
+   int r = 0;       // the row we are currently reducing
+
+   for (int j = 0; j < cols; j++)
+      {
+      // every row already holds a pivot: nothing left to reduce
+      if (r >= rows)
+         return;
+
+      // locate the largest entry of column j at or below row r
+      pivot = 0.0;
+      for (int i = r; i < rows; i++)
+         if (fabs((*this)[i][j]) > pivot)
+            {
+            pivot = fabs((*this)[i][j]);
+            pivotr = i;
+            }
+
+      // no usable pivot: clear the column remainder and move on
+      if (pivot <= tol)
+         {
+         for (int i = r; i < rows; i++)
+            (*this)[i][j] = 0.0;
+         continue;
+         }
+
+      SwapRows(pivotr, r);
+
+      // scale the pivot row so that the pivot becomes 1
+      double scale = (*this)[r][j];
+
+      (*this)[r][j] = 1.0;
+      for (int k = j+1; k < cols; k++)
+         (*this)[r][k] /= scale;
+
+      // eliminate column j from every row below the pivot row
+      for (int i = r + 1; i < rows; i++)
+         {
+         scale = (*this)[i][j];
+         (*this)[i][j] = 0.0;
+         for (int k = j+1; k < cols; k++)
+            (*this)[i][k] -= (*this)[r][k] * scale;
+         }
+
+      r++;
+      }
+   }
+
+// Removes row r by shifting the following rows up one place. The
+// removed Vector is parked in the slot just past the new last row, so
+// its storage stays available, and the row count drops by one.
+void Matrix::DeleteRow(int r)
+   {
+   Vector * removed = data[r];
+   const int last = rows - 1;
+
+   for (int i = r; i < last; i++)
+      data[i] = data[i + 1];
+
+   data[last] = removed;
+   rows = last;
+   }
+
+// Removes column c: each row drops that dimension, the column extras
+// after c move down one place, and the column count drops by one.
+void Matrix::DeleteColumn(int c)
+   {
+   for (int r = 0; r < rows; r++)
+      data[r] -> DeleteDimension(c);
+
+   for (int k = c; k < cols - 1; k++)
+      extras[k] = extras[k + 1];
+
+   cols--;
+   }
+
+// Exchanges columns c1 and c2 in every row, together with their
+// column extras.
+void Matrix::SwapColumns(int c1, int c2)
+   {
+   for (int r = 0; r < rows; r++)
+      {
+      Vector & row = *data[r];
+      double held = row[c1];
+
+      row[c1] = row[c2];
+      row[c2] = held;
+      }
+
+   extras[c1].Swap(extras[c2]);
+   }
+
+// Reads a matrix from f: a label token followed by "=", the dimensions
+// as "[ rows x cols ]", one label per column, and finally
+// rows * cols values in row-major order.
+//
+// Every fscanf result is now checked and string conversions are limited
+// to the buffer size. Previously a long token overflowed the 100 byte
+// buffer and a failed read left the buffer and dimensions
+// uninitialized. Malformed input is reported through error().
+void Matrix::Read(FILE * f)
+   {
+   int r = 0, c = 0;
+   char buffer[100] = "";
+
+   if (fscanf(f, " %99s =", buffer) != 1)
+      {
+      error("Matrix.Read - Unable to read matrix label\n");
+      return;
+      }
+
+   // the last character of the label token is dropped, as before
+   size_t length = strlen(buffer);
+   if (length > 0)
+      buffer[length - 1] = 0;
+   SetLabel(buffer);
+
+   if (fscanf(f, " [ %d x %d ]", &r, &c) != 2 || r < 0 || c < 0)
+      {
+      error("Matrix.Read - Unable to read matrix dimensions\n");
+      return;
+      }
+   Dimension(r, c);
+
+   for (int col = 0; col < cols; col++)
+      {
+      if (fscanf(f, " %99s", buffer) != 1)
+         {
+         error("Matrix.Read - Unable to read label for column %d\n", col);
+         return;
+         }
+      SetColumnLabel(col, buffer);
+      }
+
+   for (int row = 0; row < rows; row++)
+      for (int col = 0; col < cols; col++)
+         if (fscanf(f, " %lf", &((*this)[row][col])) != 1)
+            {
+            error("Matrix.Read - Unable to read element [%d, %d]\n", row, col);
+            return;
+            }
+   }
+
+
+// Writes the matrix to f: the label, the printed dimensions, a line of
+// column labels and then one line per row. At most r rows and c columns
+// are printed; -1 (or a value beyond the matrix size) means "all".
+void Matrix::Print(FILE * f, int r, int c)
+   {
+   if (r == -1 || r > rows) r = rows;
+   if (c == -1 || c > cols) c = cols;
+
+   char dimensions[30];
+   sprintf(dimensions, "[%d x %d]", r, c);
+
+   // the first output column is at least 15 characters wide
+   int columnZero = label.Length() > 15 ? label.Length() : 15;
+
+   fprintf(f, "\n%*s =\n", columnZero, (const char *) label);
+   fprintf(f, "%*s ", columnZero, dimensions);
+
+   // cache the formatting of each printed column while writing the header
+   int * precision = new int [c + 1];
+   int * width = new int [c + 1];
+
+   for (int col = 0; col < c; col++)
+      {
+      precision[col] = extras[col].GetPrecision();
+      width[col] = extras[col].GetWidth();
+      fprintf(f, "%*s ", width[col], (const char *) extras[col].label);
+      }
+
+   for (int row = 0; row < r; row++)
+      {
+      fprintf(f, "\n%*s ", columnZero, (const char *) data[row]->label);
+
+      for (int col = 0; col < c; col++)
+         fprintf(f, "%*.*f ", width[col], precision[col], (*this)[row][col]);
+      }
+
+   fprintf(f, "\n");
+
+   delete [] precision;
+   delete [] width;
+   }
+
+// Copies row and column labels from M for the rows and columns that
+// exist in both matrices.
+void Matrix::CopyLabels(Matrix & M)
+   {
+   const int sharedRows = rows < M.rows ? rows : M.rows;
+
+   for (int r = 0; r < sharedRows; r++)
+      data[r]->SetLabel(M[r].label);
+
+   const int sharedCols = cols < M.cols ? cols : M.cols;
+
+   for (int c = 0; c < sharedCols; c++)
+      SetColumnLabel(c, M.GetColumnLabel(c));
+   }
+
+// ColumnExtras class
+//
+
+// Default column formatting: label "column", width 7, precision 3,
+// with the cached width flagged for recalculation.
+void ColumnExtras::Init()
+   {
+   width = 7;
+   precision = 3;
+   dirty = true;
+   label = "column";
+   }
+
+// Nothing to release explicitly; members clean up after themselves.
+ColumnExtras::~ColumnExtras()
+ { }
+
+// Replaces the column label with name. The dirty flag is not touched
+// here, so a cached width is not recalculated for the new label.
+void ColumnExtras::SetLabel(const char * name)
+ {
+ label = name;
+ }
+
+// Returns the print width of the column. When the settings have changed
+// since the last call (dirty), the width is first widened to hold at
+// least precision + 2 characters and the full label.
+int ColumnExtras::GetWidth()
+   {
+   if (!dirty)
+      return width;
+
+   if (width < precision + 2)
+      width = precision + 2;
+
+   if (width < label.Length())
+      width = label.Length();
+
+   dirty = false;
+
+   return width;
+   }
+
+// Takes over label, width, precision and dirty flag from c.
+void ColumnExtras::Copy(ColumnExtras & c)
+   {
+   label = c.label;
+   dirty = c.dirty;
+   precision = c.precision;
+   width = c.width;
+   }
+
+#define SWAP(a,b) {int swap=(a); (a)=(b); (b)=swap;}
+#define SWAPBOOL(a,b) {bool swap=(a); (a)=(b); (b)=swap;}
+
+// Exchanges width, precision, dirty flag and label with c.
+void ColumnExtras::Swap(ColumnExtras & c)
+   {
+   int heldWidth = c.width;
+   c.width = width;
+   width = heldWidth;
+
+   int heldPrecision = c.precision;
+   c.precision = precision;
+   precision = heldPrecision;
+
+   bool heldDirty = c.dirty;
+   c.dirty = dirty;
+   dirty = heldDirty;
+
+   c.label.Swap(label);
+   }
+
+// Ordering callback used by Sort(): compares two rows by their first
+// element and returns -1, 0 or 1.
+int Matrix::CompareRows(Vector ** row1, Vector ** row2)
+   {
+   const double first = (**row1)[0];
+   const double second = (**row2)[0];
+
+   if (first < second)
+      return -1;
+
+   if (first > second)
+      return 1;
+
+   return 0;
+   }
+
+// Sorts the rows in use by the value in their first column, by passing
+// the row pointers and CompareRows to QuickSort.
+void Matrix::Sort()
+ {
+ QuickSort(data, rows, sizeof(Vector *), COMPAREFUNC CompareRows);
+ }
+
+// Two matrices are equal when they have the same dimensions and every
+// pair of corresponding rows compares equal.
+bool Matrix::operator == (const Matrix & rhs) const
+   {
+   if (rhs.rows != rows)
+      return false;
+
+   if (rhs.cols != cols)
+      return false;
+
+   int r = 0;
+
+   while (r < rows)
+      {
+      if ((*this)[r] != rhs[r])
+         return false;
+      r++;
+      }
+
+   return true;
+   }
+
+// Appends the rows of m below the rows of this matrix. Both matrices
+// must have the same number of columns.
+void Matrix::StackBottom(const Matrix & m)
+   {
+   if (m.cols != cols)
+      error("Attempted to stack matrices with different number of columns");
+
+   // remember where the appended rows start before growing the matrix
+   const int firstNew = rows;
+
+   Dimension(rows + m.rows, cols);
+
+   for (int r = 0; r < m.rows; r++)
+      *(data[firstNew + r]) = m[r];
+   }
+
+// Combines the columns of m with this matrix row by row: each row
+// Vector calls Stack() with the matching row of m, then the matrix is
+// re-dimensioned to cols + m.cols columns. Both matrices must have the
+// same number of rows.
+// NOTE(review): whether Vector::Stack places m's values before or after
+// the existing ones is not visible here - confirm against Vector.
+void Matrix::StackLeft(const Matrix & m)
+ {
+ if (m.rows != rows)
+ error("Attempted to stack matrics with different numbers of rows");
+
+ for (int i = 0; i < rows; i++)
+ data[i]->Stack(m[i]);
+
+ Dimension(rows, cols + m.cols);
+ }
+
+// Exchanges the complete contents of this matrix and m (label, storage
+// pointers, dimensions and capacities) without copying any elements.
+void Matrix::Swap(Matrix & m)
+   {
+   Vector ** heldData = data;
+   data = m.data;
+   m.data = heldData;
+
+   ColumnExtras * heldExtras = extras;
+   extras = m.extras;
+   m.extras = heldExtras;
+
+   int held;
+
+   held = extraSize; extraSize = m.extraSize; m.extraSize = held;
+   held = size;      size = m.size;           m.size = held;
+   held = cols;      cols = m.cols;           m.cols = held;
+   held = rows;      rows = m.rows;           m.rows = held;
+
+   label.Swap(m.label);
+   }
+
+// Smallest element of the matrix, or 0.0 for an empty matrix.
+double Matrix::Min() const
+   {
+   if (rows == 0 || cols == 0)
+      return 0.0;
+
+   double minimum = data[0]->Min();
+
+   for (int r = 1; r < rows; r++)
+      {
+      const double candidate = data[r]->Min();
+
+      if (candidate < minimum)
+         minimum = candidate;
+      }
+
+   return minimum;
+   }
+
+// Largest element of the matrix, or 0.0 for an empty matrix.
+double Matrix::Max() const
+   {
+   if (rows == 0 || cols == 0)
+      return 0.0;
+
+   double maximum = data[0]->Max();
+
+   for (int r = 1; r < rows; r++)
+      {
+      const double candidate = data[r]->Max();
+
+      if (candidate > maximum)
+         maximum = candidate;
+      }
+
+   return maximum;
+   }
+
+// Arithmetic mean of all elements, or 0.0 for an empty matrix.
+double Matrix::Mean() const
+   {
+   if (!(rows != 0 && cols != 0))
+      return 0.0;
+
+   // accumulate the row sums in row order, starting from row 0
+   double total = data[0]->Sum();
+   int r = 1;
+
+   while (r < rows)
+      {
+      total += data[r]->Sum();
+      r++;
+      }
+
+   return total / (rows * cols);
+   }
+
+// Smallest element that is not equal to _NAN_ (the marker compared
+// against below). Returns 0.0 for an empty matrix and _NAN_ when every
+// element equals _NAN_.
+double Matrix::SafeMin() const
+ {
+ double lo = (rows > 0 && cols > 0) ? _NAN_ : 0.0;
+
+ int i, j;
+
+ // first pass: find the first element that is not _NAN_
+ for (i = 0; i < rows; i++)
+ {
+ for (j = 0; j < cols; j++)
+ if (data[i]->data[j] != _NAN_)
+ {
+ lo = data[i]->data[j];
+ break;
+ }
+ if (j != cols) break;
+ }
+
+ // second pass: continue from that position (i and j carry over) and
+ // keep the smallest element that is not _NAN_
+ for ( ; i < rows; i++, j = 0)
+ for ( ; j < cols; j++)
+ if (data[i]->data[j] < lo && data[i]->data[j] != _NAN_)
+ lo = data[i]->data[j];
+
+ return lo;
+ }
+
+// Largest element that is not equal to _NAN_ (the marker compared
+// against below). Returns 0.0 for an empty matrix and _NAN_ when every
+// element equals _NAN_.
+double Matrix::SafeMax() const
+ {
+ double hi = (rows > 0 && cols > 0) ? _NAN_ : 0.0;
+
+ int i, j;
+
+ // first pass: find the first element that is not _NAN_
+ for (i = 0; i < rows; i++)
+ {
+ for (j = 0; j < cols; j++)
+ if (data[i]->data[j] != _NAN_)
+ {
+ hi = data[i]->data[j];
+ break;
+ }
+ if (j != cols) break;
+ }
+
+ // second pass: continue from that position (i and j carry over) and
+ // keep the largest element that is not _NAN_
+ for ( ; i < rows; i++, j = 0)
+ for ( ; j < cols; j++)
+ if (data[i]->data[j] > hi && data[i]->data[j] != _NAN_)
+ hi = data[i]->data[j];
+
+ return hi;
+ }
+
+// Mean of all elements that are not equal to _NAN_; 0.0 when there are
+// no such elements.
+double Matrix::SafeMean() const
+   {
+   double total = 0.0;
+   int observed = 0;
+
+   for (int r = 0; r < rows; r++)
+      for (int c = 0; c < cols; c++)
+         {
+         if ((*this)[r][c] == _NAN_)
+            continue;
+
+         total += (*this)[r][c];
+         observed++;
+         }
+
+   if (observed)
+      return total / observed;
+
+   return 0.0;
+   }
+
+// Sum of Vector::SafeCount() over all rows in use.
+int Matrix::SafeCount() const
+   {
+   int total = 0;
+   int r = 0;
+
+   while (r < rows)
+      {
+      total += data[r]->SafeCount();
+      r++;
+      }
+
+   return total;
+   }
+
+// Prints the upper triangle of the matrix to f: label, dimensions and
+// column labels first, then one line per row with the entries left of
+// the triangle replaced by blanks. The diagonal is included when
+// print_diag is true. At most r rows and c columns are printed; a
+// negative value (or one beyond the matrix size) means "all".
+//
+// The header and the width/precision tables are built here. The
+// previous version relied on SetupPrint(), which receives r, c and the
+// two array pointers by value and so could neither clamp the limits nor
+// hand the arrays back, leaving width and precision NULL.
+void Matrix::PrintUpper(FILE * f, int r, int c, bool print_diag)
+   {
+   if (r < 0 || r > rows) r = rows;
+   if (c < 0 || c > cols) c = cols;
+
+   String dimensions;
+   dimensions.printf("[%d x %d]", r, c);
+
+   int columnZero = label.Length() > 15 ? label.Length() : 15;
+
+   fprintf(f, "\n%*s =\n%*s ", columnZero, (const char *) label,
+           columnZero, (const char *) dimensions);
+
+   int * precision = new int [c + 1];
+   int * width = new int [c + 1];
+
+   for (int j = 0; j < c; j++)
+      {
+      precision[j] = extras[j].GetPrecision();
+      width[j] = extras[j].GetWidth();
+      fprintf(f, "%*s ", width[j], (const char *) extras[j].label);
+      }
+
+   // index of the first column printed as a number in the current row
+   int upper = print_diag ? 0 : 1;
+
+   for (int i = 0; i < r ; i++)
+      {
+      fprintf(f, "\n%*s ", columnZero, (const char *) data[i]->label);
+
+      // never step past the last printed column when r exceeds c
+      int blanks = upper < c ? upper : c;
+
+      for (int j = 0; j < blanks; j++)
+         fprintf(f, "%*.*s ", width[j], precision[j], " ");
+      for (int j = upper; j < c; j++)
+         fprintf(f, "%*.*f ", width[j], precision[j], (*this)[i][j]);
+
+      upper++;
+      }
+
+   fprintf(f, "\n");
+
+   delete [] precision;
+   delete [] width;
+   }
+
+// Prints the lower triangle of the matrix to f: label, dimensions and
+// column labels first, then one line per row with the entries right of
+// the triangle replaced by blanks. The diagonal is included when
+// print_diag is true. At most r rows and c columns are printed; a
+// negative value (or one beyond the matrix size) means "all".
+//
+// The number of values printed per row is capped at c. Previously, with
+// more rows than columns, the loop ran past the end of the
+// width/precision tables and of the row itself.
+void Matrix::PrintLower(FILE * f, int r, int c, bool print_diag)
+   {
+   if (r < 0 || r > rows) r = rows;
+   if (c < 0 || c > cols) c = cols;
+
+   String dimensions;
+   dimensions.printf("[%d x %d]", r, c);
+
+   int columnZero = label.Length() > 15 ? label.Length() : 15;
+
+   fprintf(f, "\n%*s =\n%*s ", columnZero, (const char *) label,
+           columnZero, (const char *) dimensions);
+
+   int * precision = new int [c + 1];
+   int * width = new int [c + 1];
+
+   for (int j = 0; j < c; j++)
+      {
+      precision[j] = extras[j].GetPrecision();
+      width[j] = extras[j].GetWidth();
+      fprintf(f, "%*s ", width[j], (const char *) extras[j].label);
+      }
+
+   // number of leading columns printed as numbers in the current row
+   int upper = print_diag ? 1 : 0;
+
+   for (int i = 0; i < r ; i++)
+      {
+      fprintf(f, "\n%*s ", columnZero, (const char *) data[i]->label);
+
+      // never step past the last printed column when r exceeds c
+      int shown = upper < c ? upper : c;
+
+      for (int j = 0; j < shown; j++)
+         fprintf(f, "%*.*f ", width[j], precision[j],(*this)[i][j]);
+      for (int j = shown; j < c; j++)
+         fprintf(f, "%*.*s ", width[j], precision[j]," " );
+
+      upper++;
+      }
+
+   fprintf(f, "\n");
+
+   delete [] precision;
+   delete [] width;
+   }
+
+
+// Writes the print header to f: the matrix label, the printed
+// dimensions and one label per printed column. column_zero receives the
+// width used for the first output column (at least 15).
+//
+// r, c, precision and width are passed by value, so nothing assigned to
+// them here can reach the caller. The previous version allocated two
+// arrays into the local pointer copies and never released them, leaking
+// on every call. No arrays are allocated any more; callers that need
+// per-column width/precision tables must build them themselves.
+void Matrix::SetupPrint(FILE *f, int r, int c, int & column_zero, int * precision, int * width)
+   {
+   // unused: see the note above
+   (void) precision;
+   (void) width;
+
+   if (r == -1 || r > rows) r = rows;
+   if (c == -1 || c > cols) c = cols;
+
+   String dimensions;
+   dimensions.printf("[%d x %d]", r, c);
+
+   column_zero = label.Length() > 15 ? label.Length() : 15;
+
+   fprintf(f, "\n%*s =\n%*s ", column_zero, (const char *) label,
+           column_zero, (const char *) dimensions);
+
+   for (int j = 0; j < c; j++)
+      fprintf(f, "%*s ", extras[j].GetWidth(), (const char *) extras[j].label);
+   }
+
diff --git a/libsrc/MathMatrix.h b/libsrc/MathMatrix.h
new file mode 100644
index 0000000..927421f
--- /dev/null
+++ b/libsrc/MathMatrix.h
@@ -0,0 +1,194 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MathMatrix.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MATHMATRIX_H__
+#define __MATHMATRIX_H__
+
+#include "MathVector.h"
+#include "Error.h"
+
+#include <stdio.h>
+
+// Per-column print settings of a Matrix: a label plus the width and
+// precision used when the column is printed.
+class ColumnExtras
+ {
+ private:
+ // set whenever width or precision change; cleared by GetWidth()
+ bool dirty;
+ int precision, width;
+
+ void Init();
+ void Copy(ColumnExtras & c);
+
+ public:
+ String label;
+
+ ColumnExtras()
+ { Init(); }
+ ColumnExtras(ColumnExtras & original)
+ { Init(); Copy(original); }
+ ~ColumnExtras();
+
+ void SetLabel(const char * name);
+ void SetPrecision(int p)
+ {
+ precision = p;
+ dirty = true;
+ }
+ void SetWidth(int w)
+ {
+ width = w;
+ dirty = true;
+ }
+
+ // GetWidth may adjust the stored width before returning it
+ int GetWidth();
+ int GetPrecision()
+ { return precision; }
+
+ ColumnExtras & operator = (ColumnExtras & rhs)
+ { Copy(rhs); return (*this); }
+
+ // Exchange all settings with rhs
+ void Swap(ColumnExtras & rhs);
+ };
+
+// Matrix of doubles stored as an array of pointers to row Vectors,
+// with a label for the matrix and print settings for each column.
+class Matrix
+ {
+ public:
+ String label;
+ ColumnExtras * extras;
+ // rows/cols: dimensions in use; size/extraSize: allocated capacity
+ // of the data and extras arrays respectively
+ int rows, cols, size, extraSize;
+ Vector ** data;
+
+ Matrix()
+ { Init(); }
+ Matrix(Matrix & m)
+ { Init(); Copy(m); }
+ Matrix(Matrix & m, const char * name)
+ { Init(); Copy(m); SetLabel(name); }
+ Matrix(int n, int m)
+ { Init(); Dimension(n, m); }
+ Matrix(const char * name)
+ { Init(); SetLabel(name); }
+ Matrix(const char * name, int n, int m)
+ { Init(); Dimension(n, m); SetLabel(name); }
+ ~Matrix();
+
+ // Resize to m rows by n columns
+ void Dimension(int m, int n);
+ void SetLabel(const char * name);
+ // Column accessors below index extras[] directly, without range checks
+ void SetColumnLabel(int n, const char * name)
+ { extras[n].SetLabel(name); }
+ const char * GetColumnLabel(int n)
+ { return extras[n].label; }
+ void SetColWidth(int n, int w)
+ { extras[n].SetWidth(w); }
+ void SetColPrecision(int n, int p)
+ { extras[n].SetPrecision(p); }
+ void CopyLabels(Matrix & m);
+
+ void Negate();
+ void Identity();
+ void Zero();
+ void Set(double k);
+
+ void Copy(const Matrix & m);
+ void Transpose(const Matrix & m);
+ void Add(const Matrix & m);
+ void AddMultiple(double k, const Matrix & m);
+ void Product(const Matrix & left, const Matrix & right);
+
+ void Add(double k);
+ void Multiply(double k);
+
+ // Reduces a matrix to row echelon form, assuming
+ // values smaller than tol are zero
+ void Reduce(double tol = 0.0);
+
+ // Row access; only the upper bound of i is asserted
+ Vector & operator [] (int i)
+ { assert(i < rows); return *(data[i]); }
+
+ const Vector & operator [] (int i) const
+ { assert(i < rows); return *(data[i]); }
+
+ void DeleteRow(int r);
+ void DeleteColumn(int c);
+
+ // Exchanges two rows by swapping their pointers
+ void SwapRows(int r1, int r2)
+ { Vector * temp = data[r1];
+ data[r1] = data[r2];
+ data[r2] = temp;
+ };
+
+ void SwapColumns(int c1, int c2);
+
+ void MultiplyRow(int r1, double k);
+ void AddRows(int r1, int r2);
+ void AddRows(double k, int r1, int r2);
+
+ // Sort according to numeric values in the first column
+ void Sort();
+
+ // Printing; maxRows / maxCols of -1 print every row / column
+ void Print(FILE * f, int maxRows = -1, int maxCols = -1);
+ void PrintUpper(FILE * f, int maxRows = -1, int maxCols = -1, bool print_diag = false);
+ void PrintLower(FILE * f, int maxRows = -1, int maxCols = -1, bool print_diag = false);
+ // NOTE(review): precision and width are pointers passed by value, so
+ // this function cannot return arrays through them - confirm intent
+ void SetupPrint(FILE *f, int r, int c, int & column_zero, int * precision, int * width);
+
+ void Read(FILE * f);
+
+ Matrix & operator = (const Matrix & rhs)
+ { Copy(rhs); return *this; }
+
+ bool operator == (const Matrix & rhs) const;
+ bool operator != (const Matrix & rhs) const { return !(*this == rhs); }
+
+ Matrix & operator *= (double rhs)
+ { Multiply(rhs); return *this; }
+ Matrix & operator /= (double rhs)
+ { Multiply(1.0/rhs); return *this; }
+
+ // Stack a matrix to the bottom of the current matrix
+ void StackBottom(const Matrix & m);
+
+ // Stack a matrix to the left of the current matrix
+ void StackLeft(const Matrix & m);
+
+ // Swap dynamic allocation for two matrices
+ void Swap(Matrix & m);
+
+ // Functions that calculate basic summary statistics
+ double Min() const;
+ double Max() const;
+ double Mean() const;
+
+ // Functions that calculate summary statistics in the presence of missing data
+ double SafeMin() const;
+ double SafeMax() const;
+ double SafeMean() const;
+ int SafeCount() const;
+
+ // Return the last row in matrix
+ Vector & Last() { return *(data[rows - 1]); }
+
+ private:
+ // allocation granularity used when growing the data / extras arrays
+ static int alloc;
+ static int CompareRows(Vector ** row1, Vector ** row2);
+
+ void Init();
+ };
+
+#endif
+
+
+
+
diff --git a/libsrc/MathStats.cpp b/libsrc/MathStats.cpp
new file mode 100644
index 0000000..bfcac1d
--- /dev/null
+++ b/libsrc/MathStats.cpp
@@ -0,0 +1,494 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MathStats.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MathConstant.h"
+#include "MathStats.h"
+#include "Error.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+// Approximates the square root of an integer
+// (should be within +/- 1 for inputs < 1,000,000).
+// Returns 0 for n <= 0, where the refinement loop would otherwise
+// reach a zero estimate and then divide by it.
+
+int introot(int n)
+   {
+   if (n <= 0)
+      return 0;
+
+   // Initial guess: 3 is close to sqrt(10), so root tracks sqrt(scale).
+   // scale is wider than int so that scale *= 10 cannot overflow when
+   // n is larger than 1,000,000,000.
+   int root = 1;
+   long long scale = 1;
+
+   while (scale < n)
+      {
+      root *= 3;
+      scale *= 10;
+      }
+
+   // Three refinement steps of root <- (root + n / root) / 2
+   for (int i = 1; i < 4; i++)
+      root = (root + n / root) / 2;
+
+   return root;
+   }
+
+// The normal distribution function
+//
+
+#define LOWER_TAIL_ONE 7.5
+#define UPPER_TAIL_ZERO 20
+
+// Tail probability of the standard normal distribution at z.
+// Returns the upper tail when `upper` is true and one minus it otherwise.
+// A negative z is folded onto the positive axis by flipping `upper`.
+double ndist(double z, bool upper)
+ {
+ // C version of ID Hill, "The Normal Integral"
+ // Applied Statistics, Vol 22, pp. 424-427
+
+ // If 7 digit accuracy is enough, alternative is
+ // return erfcc(x / M_SQRT2) * 0.5;
+
+ if (z < 0)
+ {
+ upper = !upper;
+ z = -z;
+ }
+
+ // Beyond the cutoffs the answer is returned as exactly 0 or 1.
+ // && binds tighter than ||, so this reads
+ // (z > LOWER_TAIL_ONE && !upper) || (z > UPPER_TAIL_ZERO)
+ if (z > LOWER_TAIL_ONE && !upper || z > UPPER_TAIL_ZERO)
+ return (upper) ? 0.0 : 1.0;
+
+ // p accumulates the upper tail area; y is the exponent term z^2 / 2
+ double p, y = 0.5 * z * z;
+
+ // Two different continued-fraction style approximations are used,
+ // switching at z = 1.28
+ if (z < 1.28)
+ {
+ p = 0.5 - z * (0.398942280444 - 0.399903438504 * y /
+ (y + 5.75885480458 - 29.8213557808 /
+ (y + 2.62433121679 + 48.6959930692 /
+ (y + 5.92885724438))));
+ }
+ else
+ {
+ p = 0.398942270385 * exp (-y) /
+ (z - 2.8052e-8 + 1.00000615302 /
+ (z + 3.98064794e-4 + 1.98615381364 /
+ (z - 0.151679116635 + 5.29330324926 /
+ (z + 4.8385912808 - 15.1508972451 /
+ (z + 0.742380924027 + 30.789933034 /
+ (z + 3.99019417011))))));
+ }
+
+ return (upper) ? p : 1 - p;
+ }
+
+// The inverse of the standard normal distribution (quantile function)
+//
+
+// Returns the standard normal deviate x for probability p.
+// Values of p above 0.5 yield a positive x and values below 0.5 a
+// negative x, i.e. p is treated as a lower-tail area.
+double ninv( double p )
+/****************************************************
+ C Equivalent of Wichura's PPND16, Algorithm AS241
+ Applied Statistics Vol 37 1988 pp 477 - 484
+*****************************************************/
+{
+ const double SPLIT1 = 0.425,
+ SPLIT2 = 5.0,
+ CONST1 = 0.180625,
+ CONST2 = 1.6;
+
+ // a / b: numerator and denominator coefficients for the central region
+ static const double a[8] = {
+ 3.3871328727963666080E0,
+ 1.3314166789178437745E2,
+ 1.9715909503065514427E3,
+ 1.3731693765509461125E4,
+ 4.5921953931549871457E4,
+ 6.7265770927008700853E4,
+ 3.3430575583588128105E4,
+ 2.5090809287301226727E3
+ } ;
+
+ static const double b[7] = {
+ 4.2313330701600911252E1,
+ 6.8718700749205790830E2,
+ 5.3941960214247511077E3,
+ 2.1213794301586595867E4,
+ 3.9307895800092710610E4,
+ 2.8729085735721942674E4,
+ 5.2264952788528545610E3
+ } ;
+
+ // c / d: coefficients used when sqrt(-log(r)) <= SPLIT2
+ static const double c[8] = {
+ 1.42343711074968357734E0,
+ 4.63033784615654529590E0,
+ 5.76949722146069140550E0,
+ 3.64784832476320460504E0,
+ 1.27045825245236838258E0,
+ 2.41780725177450611770E-1,
+ 2.27238449892691845833E-2,
+ 7.74545014278341407640E-4
+ } ;
+
+ static const double d[7] = {
+ 2.05319162663775882187E0,
+ 1.67638483018380384940E0,
+ 6.89767334985100004550E-1,
+ 1.48103976427480074590E-1,
+ 1.51986665636164571966E-2,
+ 5.47593808499534494600E-4,
+ 1.05075007164441684324E-9
+ } ;
+
+ // e / f: coefficients used when sqrt(-log(r)) > SPLIT2
+ static const double e[8] = {
+ 6.65790464350110377720E0,
+ 5.46378491116411436990E0,
+ 1.78482653991729133580E0,
+ 2.96560571828504891230E-1,
+ 2.65321895265761230930E-2,
+ 1.24266094738807843860E-3,
+ 2.71155556874348757815E-5,
+ 2.01033439929228813265E-7
+ } ;
+
+ static const double f[7] = {
+ 5.99832206555887937690E-1,
+ 1.36929880922735805310E-1,
+ 1.48753612908506148525E-2,
+ 7.86869131145613259100E-4,
+ 1.84631831751005468180E-5,
+ 1.42151175831644588870E-7,
+ 2.04426310338993978564E-15
+ } ;
+
+ // q is the signed distance of p from the median
+ double q = p - 0.5;
+ double r, x ;
+
+ if ( fabs( q ) < SPLIT1 ) {
+ // Central region: rational function in r = CONST1 - q^2, scaled by q
+ r = CONST1 - q * q ;
+ return q * ((((((( a[7] * r + a[6] ) * r + a[5] ) * r + a[4] ) * r
+ + a[3] ) * r + a[2] ) * r + a[1] ) * r + a[0] ) /
+ ((((((( b[6] * r + b[5] ) * r + b[4] ) * r + b[3] ) * r
+ + b[2] ) * r + b[1] ) * r + b[0] ) * r + 1.0 ) ;
+ } else {
+ // Tail region: r is the smaller of p and 1 - p
+ if ( q < 0 )
+ r = p ;
+ else
+ r = 1.0 - p ;
+
+ // NOTE(review): presumably error() does not return; if it does,
+ // execution continues with the checks below -- confirm in Error.h
+ if ( r < 1e-200)
+ error("p-value [%.2g] outside range in ninv()", r );
+
+ if ( r > 0.0 )
+ {
+ r = sqrt( -log( r ) ) ;
+ if ( r <= SPLIT2 )
+ {
+ r -= CONST2 ;
+ x = ((((((( c[7] * r + c[6] ) * r + c[5] ) * r + c[4] ) * r
+ + c[3] ) * r + c[2] ) * r + c[1] ) * r + c[0] ) /
+ ((((((( d[6] * r + d[5] ) * r + d[4] ) * r + d[3] ) * r
+ + d[2] ) * r + d[1] ) * r + d[0] ) * r + 1.0 ) ;
+ }
+ else
+ {
+ r -= SPLIT2 ;
+ x = ((((((( e[7] * r + e[6] ) * r + e[5] ) * r + e[4] ) * r
+ + e[3] ) * r + e[2] ) * r + e[1] ) * r + e[0] ) /
+ ((((((( f[6] * r + f[5] ) * r + f[4] ) * r + f[3] ) * r
+ + f[2] ) * r + f[1] ) * r + f[0] ) * r + 1.0 ) ;
+ }
+ }
+ else
+ // Reached when r is not positive: a NaN p fails both comparisons
+ // above, and r <= 0 lands here only if error() returned
+ x = HUGE_VAL;
+
+ // The tail formulas give a magnitude; restore the sign for p < 0.5
+ if ( q < 0 )
+ x = -x ;
+ return x ;
+ }
+}
+
+// The chi-squared distribution
+//
+double chidist(double x, double v)
+   {
+   // Delegates to the incomplete gamma function Q at (v/2, x/2)
+   const double halfDf = 0.5 * v;
+   const double halfX = 0.5 * x;
+
+   return gammq(halfDf, halfX);
+   }
+
+// The non-central chi-squared distribution
+//
+double chidist(double x, double f, double theta)
+   {
+   // Evaluates one minus a series in which the n-th term is v * t:
+   //   u - exp(-lambda) * lambda^n / n!, with lambda = theta / 2
+   //   v - running sum of u
+   //   t - starts at (x/2)^(f/2) * exp(-x/2) / gamma(f/2 + 1) and is
+   //       multiplied by x / (f + 2n) at every step
+   // Negative arguments are reported through error(); x == 0 returns 1.
+   if (x < 0.0 || f < 0.0 || theta < 0.0)
+      error("Invalid arguments in chidist function");
+
+   if (x == 0.0)
+      return 1.0;
+
+   // Evaluate the first term in series
+   int n = 1;
+
+   double lambda = theta * 0.5;
+   double u = exp(-lambda);
+   double v = u;
+   double x2 = x * 0.5, f2 = f * 0.5;
+   double t = pow(x2, f2) * exp(-x2) / exp(gammln(f2 + 1.0));
+
+   double result = v * t;
+
+   // Initial approximation: sum terms until f + 2n reaches x
+   while ( f + 2.0 * n < x )
+      {
+      u *= lambda / n;
+      v += u;
+      t *= x / (f + 2.0 * n);
+      result += v * t;
+      n++;
+      }
+
+   // Loop until we have accurate result or exceed ITMAX
+   while (t * x / (f + 2.0 * n - x) > 1e-10)
+      {
+      // The format string expects the iteration limit; it was previously
+      // called without the matching argument
+      if (n > ITMAX)
+         error("chidist function did not converge within %d iterations\n",
+               (int) ITMAX);
+
+      u *= lambda / n;
+      v += u;
+      t *= x / (f + 2.0 * n);
+      result += v * t;
+      n++;
+      }
+
+   return 1.0 - result;
+   }
+
+// The error function
+//
+double erff(double x)
+   {
+   // Odd function: evaluate at x^2 and restore the sign of x afterwards
+   const double magnitude = gammp(0.5, x * x);
+
+   if (x < 0.0)
+      return -magnitude;
+
+   return magnitude;
+   }
+
+double erffc(double x)
+   {
+   // Both branches evaluate the incomplete gamma functions at (1/2, x^2)
+   const double squared = x * x;
+
+   if (x < 0.0)
+      return 1.0 + gammp(0.5, squared);
+
+   return gammq(0.5, squared);
+   }
+
+double erfcc(double x)
+// Returns the complementary error function erfc(x),
+// with a fractional error everywhere of less than
+// 1.2e-7 (as stated by the original author).
+ {
+ double t, z, ans;
+
+ // The approximation is evaluated for |x|; negative x is handled
+ // at the end by returning 2 - ans
+ z = fabs(x);
+ t = 1.0 / (1.0 + 0.5 * z);
+ // Nested polynomial in t inside the exponential
+ ans = t*exp(-z*z -1.26551223 +t*(1.00002368 +t*(0.37409196 +t*(0.09678418
+ +t*(-0.18628806 +t*(0.27886807 +t*(-1.13520398 +t*(1.48851587
+ +t*(-0.82215223 +t*0.17087277)))))))));
+ return (x >= 0.0 ? ans : 2.0 - ans);
+ }
+
+// The f-distribution
+//
+
+double fdist(double x, double v1, double v2)
+   {
+   // Incomplete beta function with halved degrees of freedom,
+   // evaluated at v2 / (v2 + v1 * x)
+   const double halfV2 = v2 / 2;
+   const double halfV1 = v1 / 2;
+   const double point = v2 / (v2 + v1 * x);
+
+   return betai(halfV2, halfV1, point);
+   }
+
+// The student's T-distribution
+double tdist(double x, double df)
+   {
+   // Incomplete beta function at (df/2, 1/2), evaluated at df / (df + x^2)
+   const double halfDf = df * 0.5;
+   const double point = df / (df + x * x);
+
+   return betai(halfDf, 0.5, point);
+   }
+
+// Gamma distribution functions
+//
+
+// Returns the natural logarithm of the gamma function at xx.
+// Intended for xx > 0; the argument is not validated here.
+double gammln ( double xx )
+ {
+ double x, y, tmp, ser;
+ // Coefficients of the six-term series summed below
+ static double cof[6] = { 76.18009172947146, -86.50532032941677, 24.01409824083091,
+ -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5 };
+ int j;
+
+ y = x = xx;
+
+ tmp = x + 5.5;
+
+ tmp -= ( x + 0.5 ) * log ( tmp );
+
+ ser = 1.000000000190015;
+
+ // y is pre-incremented, so the terms are cof[j] / (xx + j + 1)
+ for ( j=0; j<=5; j++) ser += cof[j]/++y;
+
+ // 2.5066282746310005 is sqrt(2 * pi)
+ return - tmp + log ( 2.5066282746310005 * ser/x);
+ }
+
+double gammp ( double a, double x )
+   {
+   // Incomplete gamma function P(a,x): the series is used for x < a + 1,
+   // otherwise the complement of the continued fraction
+   if ( x < 0.0 || a <= 0.0)
+      error("Invalid arguments in routine gammp");
+
+   double estimate, gln;
+   const bool useSeries = x < (a + 1.0);
+
+   if (useSeries)
+      {
+      gser(&estimate, a, x, &gln);
+      return estimate;
+      }
+
+   gcf(&estimate, a, x, &gln);
+   return 1.0 - estimate;
+   }
+
+double gammq ( double a, double x )
+   {
+   // Incomplete gamma function Q(a,x): the complement of the series is
+   // used for x < a + 1, otherwise the continued fraction directly
+   if (x < 0.0 || a <= 0.0)
+      error("Invalid arguments in routine gammq");
+
+   double estimate, gln;
+   const bool useSeries = x < (a + 1.0);
+
+   if (useSeries)
+      {
+      gser(&estimate, a, x, &gln);
+      return (1.0 - estimate);
+      }
+
+   gcf(&estimate, a, x, &gln);
+   return estimate;
+   }
+
+// Series evaluation of the incomplete gamma function.
+//   gamser - receives the series estimate
+//   a, x   - arguments of the function
+//   gln    - receives gammln(a)
+// Calls error() for x < 0 or when ITMAX terms do not reach the
+// relative tolerance EPS.
+void gser ( double * gamser, double a, double x, double * gln)
+ {
+ int n;
+ double sum, del, ap;
+
+ *gln=gammln(a);
+ if (x <= 0.0)
+ {
+ if (x < 0.0) error("x less than 0 in gamma series routine (gser)");
+ // x == 0: the result is exactly zero
+ *gamser = 0.0;
+ return;
+ }
+ else
+ {
+ ap = a;
+ del = sum = 1.0 / a;
+ for (n = 1; n <= ITMAX; n++)
+ {
+ // Each term is the previous one times x / (a + n)
+ ++ ap;
+ del *= x / ap;
+ sum += del;
+ if ( fabs(del) < fabs(sum) * EPS )
+ {
+ // Converged: apply the common factor exp(-x) * x^a / gamma(a)
+ *gamser = sum * exp ( -x + a * log (x) - (*gln));
+ return;
+ }
+ }
+ // NOTE(review): if error() returns, *gamser is left unassigned
+ error("a too large, ITMAX too small in gamma series routine (gser)");
+ return;
+ }
+ }
+
+// Continued-fraction evaluation of the incomplete gamma function.
+//   gammcf - receives the continued-fraction estimate
+//   a, x   - arguments of the function
+//   gln    - receives gammln(a)
+// Calls error() when ITMAX iterations do not reach the tolerance EPS.
+void gcf ( double * gammcf, double a, double x, double * gln)
+ {
+ int i;
+ double an, b, c, d, del, h;
+
+ *gln = gammln(a);
+
+ b = x + 1.0 - a; // Setup for evaluating continued fraction by
+ c = 1.0 / FPMIN; // Lentz method (cf NRC 5.2)
+ d = 1.0 / b;
+ h = d;
+
+ for ( i = 1; i <= ITMAX; i++ ) // Iterate to convergence
+ {
+ an = -i * ( i - a );
+ b += 2.0;
+ d = an * d + b;
+ // Clamp near-zero values so the divisions below stay finite
+ if ( fabs(d) < FPMIN ) d = FPMIN;
+ c = b + an / c;
+ if ( fabs(c) < FPMIN ) c = FPMIN;
+ d = 1.0 / d;
+ del = d * c;
+ h *= del;
+ if ( fabs(del-1.0) < EPS ) break;
+ }
+ // i only exceeds ITMAX when the loop ended without hitting the break
+ if ( i > ITMAX ) error ("a too large, ITMAX too small in gamma countinued fraction (gcf)");
+ *gammcf = exp (-x + a*log(x) - (*gln)) * h;
+ }
+
+// Beta functions
+//
+
+// Returns the incomplete beta function Ix(a,b) for x in [0, 1].
+// Calls error() when x lies outside that interval.
+double betai(double a, double b, double x)
+ {
+ double bt;
+
+ if ( x < 0.0 || x > 1.0) error("betai: Bad x");
+ // bt is the factor in front of the continued fraction; it is set to
+ // zero at the end points, which also avoids taking log(0)
+ if ( x == 0.0 || x == 1.0)
+ bt = 0.0;
+ else
+ bt = exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x));
+ if (x < (a + 1.0)/(a + b + 2.0))
+ // use continued fraction directly
+ return bt*betacf(a,b,x)/a;
+ else
+ // use continued fraction after making the symmetry transformation
+ return 1.0-bt*betacf(b,a,1.0-x)/b;
+ }
+
+// Evaluates the continued fraction used by betai().
+// Iterates at most ITMAX times and calls error() if the relative change
+// of the last step never drops below EPS.
+double betacf(double a, double b, double x)
+ {
+ int m, m2;
+ double aa, c, d, del, h, qab, qam, qap;
+
+ // these q's will be used in factors that appear in coefficients
+ qab = a + b;
+ qap = a + 1.0;
+ qam = a - 1.0;
+
+ // First step of Lentz's method
+ c = 1.0;
+ d = 1.0 - qab*x/qap;
+ // Values smaller than FPMIN are clamped before being inverted
+ if (fabs(d) < FPMIN) d=FPMIN;
+ d = 1.0 / d;
+ h = d;
+ for (m=1; m<=ITMAX; m++)
+ {
+ m2 = 2*m;
+
+ // The even step of the recurrence
+ aa = m * (b-m)*x/((qam+m2)*(a+m2));
+ d = 1.0 + aa*d;
+ if (fabs(d) < FPMIN) d=FPMIN;
+ c = 1.0 + aa/c;
+ if (fabs(c) < FPMIN) c=FPMIN;
+ d = 1.0/d;
+ h *= d*c;
+
+ // The odd step of the recurrence
+ aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2));
+ d = 1.0+aa*d;
+ if (fabs(d) < FPMIN) d = FPMIN;
+ c = 1.0+aa/c;
+ if (fabs(c) < FPMIN) c = FPMIN;
+ d = 1.0/d;
+ del = d*c;
+ h *= del;
+
+ // Are we done?
+ if (fabs(del - 1.0) < EPS) break;
+ }
+ // m only exceeds ITMAX when the loop ended without hitting the break
+ if (m > ITMAX)
+ error("betacf: a or b too big or ITMAX too small");
+ return h;
+ }
+
+
+
+
+
diff --git a/libsrc/MathStats.h b/libsrc/MathStats.h
new file mode 100644
index 0000000..9dd03cc
--- /dev/null
+++ b/libsrc/MathStats.h
@@ -0,0 +1,77 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MathStats.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef _MATHSTATS_H_
+#define _MATHSTATS_H_
+
+#include "MathVector.h"
+#include "MathMatrix.h"
+
+// Normal distribution functions
+//
+double ndist (double x, bool upper = true);
+
+// ninv(p) calculates X such that p = P(x <= X) for std normal dist
+//
+double ninv ( double p );
+
+// Chi-Sq distribution function
+// P(Chi>=X) for v degrees of freedom
+//
+double chidist(double x, double v);
+double chidist(double x, double v, double ncp);
+
+// F distribution function
+// P(F>=x) for v1 and v2 degrees freedom
+//
+double fdist(double x, double v1, double v2);
+
+// Two-sided probability P(|T| >= |x|) for v degrees freedom
+double tdist(double x, double v);
+
+// Gamma distribution utility functions
+// (required for the chi-sq distribution)
+//
+
+double erff (double x); // the error function
+double erffc(double x); // the complementary error function
+double erfcc(double x); // heuristic version of erffc
+double gammln ( double xx ); // return the value of ln ( gamma ( xx ) ) | xx > 0
+double gammp ( double a, double x); // return the incomplete gamma function P(a,x)
+double gammq ( double a, double x); // return the incomplete gamma function Q(a,x) = 1 - P(a,x)
+
+// Estimates P(a,x) by its series representation and gammln(a)
+void gser ( double * gamser, double a, double x, double * gln);
+// Estimates Q(a,x) by its continued fraction representation and gammln(a)
+void gcf ( double * gammcf, double a, double x, double * gln);
+
+// Beta distribution utility functions
+//
+double betai(double a, double b, double x); // Returns the incomplete
+ // beta function Ix(a,b)
+double betacf(double a, double b, double x); // Evaluates continued fraction
+ // for incomplete beta function
+ // by modified Lentz's method
+
+// Rapid approximation to the sqrt for integers
+//
+
+int introot(int n);
+
+#endif
+
+
diff --git a/libsrc/MathVector.cpp b/libsrc/MathVector.cpp
new file mode 100644
index 0000000..c4d7aeb
--- /dev/null
+++ b/libsrc/MathVector.cpp
@@ -0,0 +1,652 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MathVector.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MathVector.h"
+#include "MathMatrix.h"
+#include "MathConstant.h"
+#include "Sort.h"
+#include "Error.h"
+
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+
+#include <string.h>
+#include <math.h>
+
+int Vector::alloc = 32;
+
+void Vector::Init()
+   {
+   // Start out empty: no storage, zero length and a placeholder label
+   data = NULL;
+   size = 0;
+   dim = 0;
+   label = "Unknown";
+   }
+
+Vector::~Vector()
+   {
+   // delete [] on a null pointer is a no-op, so no guard is needed
+   delete [] data;
+   }
+
+void Vector::Dimension(int d)
+   {
+   // Sets the logical length to d, enlarging the buffer when d exceeds
+   // the current capacity. Existing elements are carried over.
+   if (d > size)
+      {
+      // Small buffers are rounded up to a multiple of alloc, larger
+      // ones keep doubling until they can hold d elements
+      if (size < 1024)
+         size = (d + alloc) / alloc * alloc;
+      else
+         {
+         while (size <= d)
+            size *= 2;
+         }
+
+      double * grown = new double [size];
+
+      if (data != NULL)
+         {
+         for (int pos = 0; pos < dim; pos++)
+            grown[pos] = data[pos];
+         delete [] data;
+         }
+
+      data = grown;
+      }
+
+   dim = d;
+   }
+
+void Vector::Negate()
+   {
+   // Flip the sign of every element in place
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      data[pos] = -data[pos];
+      pos++;
+      }
+   }
+
+void Vector::Add(double n)
+   {
+   // Shift every element by the constant n
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      data[pos] += n;
+      pos++;
+      }
+   }
+
+void Vector::Multiply(double k)
+   {
+   // Scale every element by the constant k
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      data[pos] *= k;
+      pos++;
+      }
+   }
+
+void Vector::Copy(const Vector & v)
+   {
+   // Resize first; this also sets dim to v.dim
+   Dimension(v.dim);
+
+   // Nothing to transfer when the source never allocated storage
+   if (v.data == NULL)
+      return;
+
+   for (int pos = 0; pos < dim; pos++)
+      data[pos] = v.data[pos];
+   }
+
+Vector & Vector::operator = (const Vector & rhs)
+   {
+   // Assignment copies the length and elements through Copy()
+   Vector & self = *this;
+
+   self.Copy(rhs);
+
+   return self;
+   }
+
+void Vector::Add(Vector & v)
+   {
+   // Element-wise sum; both vectors must have the same length
+   const bool mismatch = (dim != v.dim);
+
+   if (mismatch)
+      error("Vector::Add - vectors have different dimensions\n"
+            "Vectors - %s [%d] + %s [%d] ",
+            (const char *) label, dim, (const char *) v.label, v.dim);
+
+   for (int pos = 0; pos < dim; pos++)
+      data[pos] += v.data[pos];
+   }
+
+void Vector::AddMultiple(double k, Vector & v)
+   {
+   // Adds k times each element of v; both vectors must have the same length
+   const bool mismatch = (dim != v.dim);
+
+   if (mismatch)
+      error("Vector::AddMultiple - vectors are incompatible\n"
+            "Vectors - %s [%d] + %s [%d] ",
+            (const char *) label, dim, (const char *) v.label, v.dim);
+
+   for (int pos = 0; pos < dim; pos++)
+      data[pos] += k * v.data[pos];
+   }
+
+
+void Vector::Subtract(Vector & v)
+   {
+   // Element-wise difference; both vectors must have the same length
+   const bool mismatch = (dim != v.dim);
+
+   if (mismatch)
+      error("Vector::Subtract - vectors have different dimensions\n"
+            "Vectors - %s [%d] + %s [%d] ",
+            (const char *) label, dim, (const char *) v.label, v.dim);
+
+   for (int pos = 0; pos < dim; pos++)
+      data[pos] -= v.data[pos];
+   }
+
+
+void Vector::Zero()
+   {
+   // Reset every element to 0.0 without changing the length
+   int pos = 0;
+
+   while (pos < dim)
+      data[pos++] = 0.0;
+   }
+
+void Vector::Set(double k)
+   {
+   // Fill the vector with the constant k
+   int pos = 0;
+
+   while (pos < dim)
+      data[pos++] = k;
+   }
+
+void Vector::SetMultiple(double k, Vector & v)
+   {
+   // Becomes k * v, taking on the length of v
+   Dimension(v.dim);
+
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      data[pos] = k * v[pos];
+      pos++;
+      }
+   }
+
+double Vector::InnerProduct(Vector & v)
+   {
+   // Sum of element-wise products, accumulated from the first element
+   const bool mismatch = (dim != v.dim);
+
+   if (mismatch)
+      error("Vector::InnerProduct - vectors have different dimensions\n"
+            "Vectors - %s[%d] * %s[%d] ",
+            (const char *) label, dim, (const char *) v.label, v.dim);
+
+   double total = 0.0;
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      total += data[pos] * v.data[pos];
+      pos++;
+      }
+
+   return total;
+   }
+
+void Vector::Insert(int n, double value)
+   {
+   // Grow by one, shift the elements from position n one slot to the
+   // right, then store the new value at position n
+   Dimension(dim + 1);
+
+   int slot = dim - 1;
+
+   while (slot > n)
+      {
+      data[slot] = data[slot - 1];
+      slot--;
+      }
+
+   data[n] = value;
+   }
+
+void Vector::DeleteDimension(int n)
+   {
+   // Close the gap at position n by shifting later elements one slot
+   // to the left, then shorten the vector by one
+   int slot = n;
+
+   while (slot < dim - 1)
+      {
+      data[slot] = data[slot + 1];
+      slot++;
+      }
+
+   dim --;
+   }
+
+void Vector::Product(Matrix & m, Vector & v)
+   {
+   // Stores the matrix-vector product m * v in this vector.
+   // This vector is resized and zeroed before v is read, so v is
+   // expected to be a different object.
+   const bool mismatch = (m.cols != v.dim);
+
+   if (mismatch)
+      error ("Vector::Product - Cannot Multiply Matrix by Vector\n"
+             "Vectors - %s [%d, %d] * %s [%d]\n",
+             (const char *) m.label, m.rows, m.cols,
+             (const char *) v.label, v.dim);
+
+   Dimension(m.rows);
+   Zero();
+
+   for (int row = 0; row < m.rows; row++)
+      {
+      for (int col = 0; col < m.cols; col++)
+         data[row] += m[row][col] * v[col];
+      }
+   }
+
+double Vector::Average() const
+   {
+   // Arithmetic mean; an empty vector is reported through error()
+   const bool empty = (dim == 0);
+
+   if (empty)
+      error("Average undefined for null vector %s",
+            (const char *) label);
+
+   return Sum() / dim;
+   }
+
+double Vector::Product() const
+   {
+   // Product of all elements, 1.0 for an empty vector
+   double result = 1.0;
+   int pos = 0;
+
+   while (pos < dim)
+      result *= data[pos++];
+
+   return result;
+   }
+
+double Vector::Sum() const
+   {
+   // Sum of all elements, accumulated from the first element
+   double total = 0.0;
+   int pos = 0;
+
+   while (pos < dim)
+      total += data[pos++];
+
+   return total;
+   }
+
+double Vector::SumSquares() const
+   {
+   // Sum of squared elements, accumulated from the first element
+   double total = 0.0;
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      total += data[pos] * data[pos];
+      pos++;
+      }
+
+   return total;
+   }
+
+void Vector::AveVar(double & ave, double & var) const
+   {
+   // Two pass computation: the mean first, then the squared deviations
+   // together with the sum of deviations, which corrects for round-off
+   if (dim == 0)
+      error("Average and Variance undefined for null vector %s",
+            (const char *) label);
+
+   double deviation, deviationSum;
+
+   ave = var = deviationSum = 0.0;
+
+   // First pass - mean
+   for (int pos = 0; pos < dim; pos++)
+      ave += data[pos];
+
+   ave /= dim;
+
+   // Second pass - deviations from the mean
+   for (int pos = 0; pos < dim; pos++)
+      {
+      deviation = data[pos] - ave;
+      deviationSum += deviation;
+      var += deviation * deviation;
+      }
+
+   // With a single element var keeps the raw sum of squares
+   if (dim > 1)
+      var = (var - deviationSum * deviationSum / dim) / (dim - 1);
+   }
+
+double Vector::Var() const
+   {
+   // Only the variance is wanted; the mean is computed and discarded
+   double unusedMean, variance;
+
+   AveVar(unusedMean, variance);
+
+   return variance;
+   }
+
+void Vector::Print(FILE * f, int d)
+   {
+   // Writes the label followed by at most d elements on a single line;
+   // d == -1 or d beyond the length prints every element
+   const bool printAll = (d == -1 || d > dim);
+
+   if (printAll)
+      d = dim;
+
+   fprintf(f, "%.15s : ", (const char *) label);
+
+   int pos = 0;
+
+   while (pos < d)
+      {
+      fprintf(f, "%7.3f ", data[pos]);
+      pos++;
+      }
+
+   fprintf(f, "\n");
+   }
+
+int Vector::CompareDouble(const double * a, const double * b)
+   {
+   // Three-way comparison: -1, 1, or 0 when neither value is smaller
+   return (*a < *b) ? -1 : ((*a > *b) ? 1 : 0);
+   }
+
+void Vector::Sort()
+   {
+   // Sort the elements in place using CompareDouble for ordering
+   QuickSort(data, dim, sizeof(double),
+             COMPAREFUNC CompareDouble);
+   }
+
+void Vector::Sort(Vector & freeRider)
+   {
+   // Sort this vector, passing the storage of freeRider to QuickSort2
+   // as the companion array
+   double * companion = freeRider.data;
+
+   QuickSort2(data, companion, dim, sizeof(double),
+              COMPAREFUNC CompareDouble);
+   }
+
+int Vector::BinarySearch(double element)
+   {
+   // Returns the index of element, or -1 when the search finds nothing
+   double * match = (double *) ::BinarySearch
+      (&element, data, dim, sizeof(double), COMPAREFUNC CompareDouble);
+
+   if (match != NULL)
+      return match - data;
+
+   return -1;
+   }
+
+void Vector::RemoveDuplicates()
+   {
+   // Collapses runs of equal consecutive elements into a single element.
+   // An empty vector is left untouched; without this check the final
+   // Dimension(out + 1) call would grow it to one unset element.
+   if (Length() == 0)
+      return;
+
+   // out is the index of the last element kept so far
+   int out = 0;
+
+   for (int in = 1; in < Length(); in++)
+      if (data[in] != data[out])
+         data[++out] = data[in];
+
+   Dimension(out + 1);
+   }
+
+bool Vector::operator == (const Vector & rhs) const
+   {
+   // Equal when the lengths match and every element compares equal
+   if (dim != rhs.dim)
+      return false;
+
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      if (data[pos] != rhs[pos])
+         return false;
+      pos++;
+      }
+
+   return true;
+   }
+
+// These functions are useful for simulation
+//
+
+int Vector::CountIfGreater(double threshold) const
+   {
+   // Number of elements strictly above the threshold
+   int matches = 0;
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      if (data[pos] > threshold)
+         matches++;
+      pos++;
+      }
+
+   return matches;
+   }
+
+int Vector::CountIfGreaterOrEqual(double treshold) const
+   {
+   // Number of elements at or above the threshold
+   int matches = 0;
+   int pos = 0;
+
+   while (pos < dim)
+      {
+      if (data[pos] >= treshold)
+         matches++;
+      pos++;
+      }
+
+   return matches;
+   }
+
+// Min and max functions
+//
+
+double Vector::Min() const
+   {
+   // Smallest element; an empty vector yields 0.0
+   if (dim == 0)
+      return 0.0;
+
+   double smallest = data[0];
+   int pos = 1;
+
+   while (pos < dim)
+      {
+      if (data[pos] < smallest)
+         smallest = data[pos];
+      pos++;
+      }
+
+   return smallest;
+   }
+
+double Vector::Max() const
+   {
+   // Largest element; an empty vector yields 0.0
+   if (dim == 0)
+      return 0.0;
+
+   double largest = data[0];
+   int pos = 1;
+
+   while (pos < dim)
+      {
+      if (data[pos] > largest)
+         largest = data[pos];
+      pos++;
+      }
+
+   return largest;
+   }
+
+// Push and Pop functions for using vector as a stack
+//
+
+void Vector::Push(double value)
+   {
+   // Append value at the end, growing the vector by one element
+   const int slot = dim;
+
+   Dimension(slot + 1);
+   data[slot] = value;
+   }
+
+void Vector::Stack(const Vector & v)
+   {
+   // Appends the elements of v to the end of this vector.
+   // The length of v is captured before resizing: when v is this very
+   // vector, Dimension() changes v.dim as well, and looping up to the
+   // new value would write past the end of the buffer.
+   const int end = dim;
+   const int extra = v.dim;
+
+   Dimension(end + extra);
+
+   for (int i = 0; i < extra; i++)
+      data[end + i] = v[i];
+   }
+
+// Check if all values are in ascending or descending order
+//
+
+bool Vector::isAscending()
+   {
+   // True unless some element is smaller than its predecessor
+   int pos = 1;
+
+   while (pos < dim)
+      {
+      if (data[pos] < data[pos - 1])
+         return false;
+      pos++;
+      }
+
+   return true;
+   }
+
+bool Vector::isDescending()
+   {
+   // True unless some element is larger than its predecessor
+   int pos = 1;
+
+   while (pos < dim)
+      {
+      if (data[pos] > data[pos - 1])
+         return false;
+      pos++;
+      }
+
+   return true;
+   }
+
+// VectorFunc class
+//
+
+// Default construction leaves the wrapper without a function to call
+VectorFunc::VectorFunc()
+   : f(NULL)
+   {
+   }
+
+// Wraps an existing function taking a Vector and returning a double
+VectorFunc::VectorFunc(double (*func)(Vector &))
+   : f(func)
+   {
+   }
+
+double VectorFunc::Evaluate(Vector & v)
+   {
+   // Forward the call to the wrapped function pointer
+   return (*f)(v);
+   }
+
+#ifndef M_SQRT2
+#define M_SQRT2 1.41421356
+#endif
+
+#define MAXROUNDS 10
+#define SQRT_HALF (1.0/M_SQRT2)
+#define TWO (M_SQRT2 * M_SQRT2)
+
+void VectorFunc::Derivative(Vector & x, Vector & d, double h_start)
+   {
+   // Numerically estimates the partial derivative of Evaluate() along
+   // each coordinate of x, storing the estimate for direction k in d[k].
+   //   x       - point of evaluation; perturbed during the call and
+   //             restored before returning
+   //   d       - receives the derivative estimates; it is indexed up to
+   //             x.dim - 1 and is not resized here
+   //   h_start - initial step size
+   // As a side effect dfmin / dpmin record the smallest function value
+   // seen during the evaluations and the point where it occurred.
+   double a[MAXROUNDS][MAXROUNDS];
+
+   // Calculate derivatives along each direction ...
+   for (int k = 0; k < x.dim; k++)
+      {
+      double left, right;
+      double save_x = x[k];
+      double h = h_start;
+
+      // Evaluate function to the left of x along direction k
+      x[k] = save_x - h;
+      left = Evaluate(x);
+
+      // Initialize or update dfmin if appropriate...
+      if (k == 0 || left < dfmin)
+         dfmin = left, dpmin = x;
+
+      // Evaluate function to the right of x along direction k
+      x[k] = save_x + h;
+      right = Evaluate(x);
+
+      // Update dfmin. The value stored must be the one just compared
+      // (right) so that dfmin stays paired with the point in dpmin.
+      if (right < dfmin)
+         dfmin = right, dpmin = x;
+
+      // Initial crude estimate
+      a[0][0] = (right - left) / (2.0 * h);
+
+      // Initial guess of error is large
+      double err = 1e30;
+
+      // At each round, update Neville tableau with smaller stepsize and higher
+      // order extrapolation ...
+      for (int i = 1; i < MAXROUNDS; i++)
+         {
+         // Decrease h
+         h *= SQRT_HALF;
+
+         // Re-evaluate function and update dfmin as required
+         x[k] = save_x - h;
+         left = Evaluate(x);
+         if (left < dfmin) dfmin = left, dpmin = x;
+         x[k] = save_x + h;
+         right = Evaluate(x);
+         if (right < dfmin) dfmin = right, dpmin = x;
+
+         // Improved estimate of derivative
+         a[0][i] = (right - left) / (2.0 * h);
+
+         // Calculate extrapolations of various orders ...
+         double factor = TWO;
+
+         for (int j = 1; j <= i; j++)
+            {
+            a[j][i] = (a[j-1][i] * factor - a[j-1][i-1])/(factor - 1.0);
+
+            factor *= TWO;
+
+            double error = max(fabs(a[j][i] - a[j-1][i]), fabs(a[j][i] - a[j-1][i-1]));
+
+            // Did we improve solution?
+            if (error < err)
+               {
+               err = error;
+               d[k] = a[j][i];
+               }
+            }
+
+         // Stop if solution is deteriorating ...
+         if (fabs(a[i][i] - a[i-1][i-1]) >= 2.0 * err)
+            {
+            x[k] = save_x;
+            break;
+            }
+         }
+
+      // Restore the coordinate before moving to the next direction
+      x[k] = save_x;
+      }
+   }
+
+int Vector::SafeCount() const
+   {
+   // Number of elements that do not compare equal to _NAN_
+   int present = 0;
+
+   for (int pos = 0; pos < dim; pos++)
+      if (data[pos] != _NAN_)
+         present++;
+
+   return present;
+   }
+
+double Vector::SafeMin() const
+   {
+   // Smallest element among those that differ from _NAN_;
+   // _NAN_ itself is returned when there is no such element
+   double smallest = _NAN_;
+   bool found = false;
+
+   for (int pos = 0; pos < dim; pos++)
+      {
+      if (data[pos] != _NAN_)
+         {
+         if (!found || data[pos] < smallest)
+            smallest = data[pos];
+         found = true;
+         }
+      }
+
+   return smallest;
+   }
+
+double Vector::SafeMax() const
+   {
+   // Largest element among those that differ from _NAN_;
+   // _NAN_ itself is returned when there is no such element
+   double largest = _NAN_;
+   bool found = false;
+
+   for (int pos = 0; pos < dim; pos++)
+      {
+      if (data[pos] != _NAN_)
+         {
+         if (!found || data[pos] > largest)
+            largest = data[pos];
+         found = true;
+         }
+      }
+
+   return largest;
+   }
+
+void Vector::Reverse()
+   {
+   // Exchange elements from both ends, moving towards the middle
+   int front = 0;
+   int back = dim - 1;
+
+   while (front < back)
+      {
+      Swap(front, back);
+      front++;
+      back--;
+      }
+   }
+
+void Vector::InsertInSortedList(int value)
+   {
+   // Scan from the end for the last element that is not larger than value
+   int last = dim - 1;
+
+   for ( ; last >= 0; last--)
+      if (!(data[last] > value))
+         break;
+
+   // Values already present are not inserted a second time
+   if (last >= 0 && data[last] == value)
+      return;
+
+   // Make room for one more element ...
+   Dimension(dim + 1);
+
+   // ... move the larger elements one slot to the right ...
+   const int target = last + 1;
+   int slot = dim - 1;
+
+   while (slot > target)
+      {
+      data[slot] = data[slot - 1];
+      slot--;
+      }
+
+   // ... and drop the new value into the gap
+   data[target] = value;
+   }
+
+void Vector::Swap(Vector & rhs)
+   {
+   // Exchange storage, length and capacity; labels are left as they are
+   const int otherDim = rhs.dim;
+   const int otherSize = rhs.size;
+   double * const otherData = rhs.data;
+
+   rhs.dim = dim;
+   rhs.size = size;
+   rhs.data = data;
+
+   dim = otherDim;
+   size = otherSize;
+   data = otherData;
+   }
+
+double Vector::Average(double returnIfNull)
+   {
+   // Mean of the elements, or the caller's fallback for an empty vector
+   return (Length() == 0) ? returnIfNull : Average();
+   }
+
+double Vector::Var(double returnIfNull)
+   {
+   // Variance of the elements, or the caller's fallback for an empty vector
+   return (Length() == 0) ? returnIfNull : Var();
+   }
+
+
diff --git a/libsrc/MathVector.h b/libsrc/MathVector.h
new file mode 100644
index 0000000..e35182a
--- /dev/null
+++ b/libsrc/MathVector.h
@@ -0,0 +1,207 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MathVector.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MATHVECTOR_H__
+#define __MATHVECTOR_H__
+
+#include "StringBasics.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+class Matrix;
+
+// Resizable array of doubles with a text label, basic arithmetic,
+// descriptive statistics, sorting helpers and stack-style access.
+class Vector
+ {
+ public:
+ // dim is the number of elements in use, size the allocated capacity
+ int dim, size;
+ double * data;
+ String label;
+
+ // Every constructor starts from Init() and then optionally sets the
+ // label, the length, or copies another vector
+ Vector()
+ { Init(); }
+ Vector(Vector & v)
+ { Init(); Copy(v); }
+ Vector(int d)
+ { Init(); Dimension(d); }
+ Vector(const char * text)
+ { Init(); label = text; }
+ Vector(const char * text, int d)
+ { Init(); label = text; Dimension(d); }
+ Vector(const char * text, Vector & v)
+ { Init(); label = text; Copy(v); }
+
+ ~Vector();
+
+ // Set / query the number of elements
+ void Dimension(int d);
+ int Length() const { return dim; }
+
+ void SetLabel(const char * text) { label = text; }
+
+ // Whole-vector assignment helpers
+ void Zero();
+ void Set(double k);
+ void Set(Vector & v) { Copy(v); };
+ void SetMultiple(double k, Vector & v);
+
+ // In-place arithmetic with a scalar
+ void Negate();
+ void Add(double n);
+ void Multiply(double k);
+
+ // Arithmetic with another vector
+ double InnerProduct(Vector & v);
+ void Copy(const Vector & v);
+ void Add(Vector & v);
+ void AddMultiple(double k, Vector & v);
+ void Subtract(Vector & v);
+
+ // Stores the matrix-vector product m * v in this vector
+ void Product(Matrix & m, Vector & v);
+
+ // Element access; the assert only checks the upper bound
+ double & operator [] (int n)
+ { assert(n < dim); return data[n]; }
+ double operator [] (int n) const
+ { assert(n < dim); return data[n]; }
+
+ // Access by position expressed as a fraction of the length.
+ // NOTE(review): a fraction of 1.0 indexes data[dim], one past the last
+ // element, and the non-const overload returns by value while the const
+ // overload returns a reference -- confirm this is intended
+ double operator [] (double fraction)
+ { return data[(int) (dim * fraction)]; }
+ double & operator [] (double fraction) const
+ { return data[(int) (dim * fraction)]; }
+
+ Vector & operator = (const Vector & v);
+ bool operator == (const Vector & v) const;
+ bool operator != (const Vector & v) const { return !(*this == v); }
+
+ // Exchange two elements, or the contents of two vectors
+ void Swap(int i, int j)
+ { double swap = data[i]; data[i] = data[j]; data[j] = swap; }
+ void Swap(Vector & rhs);
+
+ Vector & operator *= (double rhs) { Multiply(rhs); return *this; }
+ Vector & operator += (double rhs) { Add(rhs); return *this; }
+ Vector & operator -= (double rhs) { return *this += -rhs; }
+ Vector & operator /= (double rhs) { return *this *= 1/rhs; }
+
+ // Remove or insert a single element at position n
+ void DeleteDimension (int n);
+ void Delete(int n) { DeleteDimension(n); }
+ void Insert(int n, double value);
+
+ // Calculates average and variance
+ void AveVar(double & ave, double & var) const;
+ double Average() const;
+ double Var() const;
+
+ // Variants that return the supplied value for an empty vector
+ double Average(double returnIfNull);
+ double Var(double returnIfNull);
+
+ // Common descriptive functions
+ double Sum() const;
+ double SumSquares() const;
+ double Product() const;
+
+ // Find extreme values
+ double Min() const;
+ double Max() const;
+
+ // Return the number of elements in a subset
+ int CountIfGreater(double treshold) const;
+ int CountIfGreaterOrEqual(double treshold) const;
+
+ // Append another vector to the end
+ void Stack(const Vector & v);
+
+ void Print(int maxDim = -1) { Print(stdout, maxDim); }
+ void Print(FILE * output, int maxDim = -1);
+
+ // Routines for creating and searching through sorted vectors
+ void Sort();
+ void Reverse();
+ void Sort(Vector & freeRider);
+ int BinarySearch(double value);
+ int FastFind(double value) { return BinarySearch(value); }
+
+ // Remove consecutive duplicate elements from vector
+ void RemoveDuplicates();
+
+ // Query first and last elements
+ // (no check is made that the vector is non-empty)
+
+ double & First() { return data[0]; }
+ double & Last() { return data[dim - 1]; }
+
+ // Routines for using a vector as a stack of doubles
+ // (Pop and Peek do not check for an empty vector)
+
+ void Clear() { dim = 0; }
+ void Push(double value);
+ double Pop() { return data[--dim]; }
+ double Peek() const { return data[dim-1]; }
+
+ // This routine adds items to a sorted list
+ //
+
+ void InsertInSortedList(int item);
+
+ // Allocation granularity used by Dimension() for small vectors
+ static int alloc;
+
+ bool isAscending();
+ bool isDescending();
+
+ // Routines for dealing with vectors that include missing data
+ //
+
+ int SafeCount() const;
+ double SafeMin() const;
+ double SafeMax() const;
+
+ private:
+ // Comparison callback handed to the sort and search routines
+ static int CompareDouble(const double * a, const double * b);
+ void Init();
+ };
+
+
+
+class VectorFunc
+// Wrapper for multi-dimensional functions
+// so that they can be used as parameters
+// and keep private data
+ {
+ private:
+ // Plain function invoked by the default Evaluate(); it is NULL after
+ // default construction
+ double (*f)(Vector &);
+
+ public:
+ // Constructors
+ VectorFunc();
+ VectorFunc(double (*func)(Vector &));
+
+ // Virtual destructor ensures that dynamic objects are
+ // handled correctly
+ virtual ~VectorFunc() { }
+
+ // Function value at v; virtual, so derived classes can override it
+ virtual double Evaluate(Vector & v);
+
+ // Calculate derivatives along each direction. Delta is a guess value
+ // for the initial stepsize in numerical derivation
+ virtual void Derivative(Vector & point, Vector & d, double delta = 1.0);
+
+ // Minimum function value found while evaluating derivative
+ // and its location...
+ double dfmin;
+ Vector dpmin;
+ };
+
+#endif
+
+
+
+
diff --git a/libsrc/MemoryAllocators.cpp b/libsrc/MemoryAllocators.cpp
new file mode 100644
index 0000000..3f69f66
--- /dev/null
+++ b/libsrc/MemoryAllocators.cpp
@@ -0,0 +1,260 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MemoryAllocators.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MemoryAllocators.h"
+
+#include <stdlib.h>
+
+// Allocates an n x rows x cols cube of chars as n separately allocated
+// char matrices. Returns NULL if any allocation fails; slices that were
+// already allocated are released before returning.
+char *** AllocateCharCube(int n, int rows, int cols)
+   {
+   // Plain new reports failure by throwing std::bad_alloc, which would
+   // make the NULL test below unreachable; ask for the non-throwing form
+   char *** cube = new (std::nothrow) char ** [n];
+
+   // Stop early if we are out of memory
+   if (cube == NULL)
+      return NULL;
+
+   for (int i = 0; i < n; i++)
+      {
+      cube[i] = AllocateCharMatrix(rows, cols);
+
+      if (cube[i] == NULL)
+         {
+         // Safely unravel allocation: free slices i-1 .. 0, then the spine
+         while (i--)
+            FreeCharMatrix(cube[i], rows);
+
+         delete [] cube;
+
+         return NULL;
+         }
+      }
+
+   return cube;
+   }
+
+// Allocates a rows x cols matrix of int; returns NULL on failure.
+// Thin wrapper over the AllocateMatrix<T> template declared in
+// MemoryAllocators.h, so the allocate-and-unwind logic lives in one place.
+int ** AllocateIntMatrix(int rows, int cols)
+   {
+   return AllocateMatrix<int>(rows, cols);
+   }
+
+// Allocates a rows x cols matrix of char; returns NULL on failure.
+// Thin wrapper over the AllocateMatrix<T> template declared in
+// MemoryAllocators.h, so the allocate-and-unwind logic lives in one place.
+char ** AllocateCharMatrix(int rows, int cols)
+   {
+   return AllocateMatrix<char>(rows, cols);
+   }
+
+// Allocates a rows x cols matrix of float; returns NULL on failure.
+// Thin wrapper over the AllocateMatrix<T> template declared in
+// MemoryAllocators.h, so the allocate-and-unwind logic lives in one place.
+float ** AllocateFloatMatrix(int rows, int cols)
+   {
+   return AllocateMatrix<float>(rows, cols);
+   }
+
+// Releases a cube obtained from AllocateCharCube: each of the n slices is
+// freed as a char matrix with the given row count, then the spine itself.
+// The caller's pointer is reset to NULL; a NULL cube is ignored.
+void FreeCharCube(char *** & cube, int n, int rows)
+   {
+   if (cube != NULL)
+      {
+      int slice = 0;
+
+      while (slice < n)
+         {
+         FreeCharMatrix(cube[slice], rows);
+         slice++;
+         }
+
+      delete [] cube;
+      cube = NULL;
+      }
+   }
+
+// Frees a matrix from AllocateCharMatrix and resets the caller's pointer
+// to NULL; a NULL matrix is ignored. Delegates to the FreeMatrix<T>
+// template declared in MemoryAllocators.h.
+void FreeCharMatrix(char ** & matrix, int rows)
+   {
+   FreeMatrix<char>(matrix, rows);
+   }
+
+// Frees a matrix from AllocateFloatMatrix and resets the caller's pointer
+// to NULL; a NULL matrix is ignored. Delegates to the FreeMatrix<T>
+// template declared in MemoryAllocators.h.
+void FreeFloatMatrix(float ** & matrix, int rows)
+   {
+   FreeMatrix<float>(matrix, rows);
+   }
+
+// Frees a matrix from AllocateIntMatrix and resets the caller's pointer
+// to NULL; a NULL matrix is ignored. Delegates to the FreeMatrix<T>
+// template declared in MemoryAllocators.h.
+void FreeIntMatrix(int ** & matrix, int rows)
+   {
+   FreeMatrix<int>(matrix, rows);
+   }
+
+// Allocates a rows x cols matrix of short; returns NULL on failure.
+// Thin wrapper over the AllocateMatrix<T> template declared in
+// MemoryAllocators.h, so the allocate-and-unwind logic lives in one place.
+short ** AllocateShortMatrix(int rows, int cols)
+   {
+   return AllocateMatrix<short>(rows, cols);
+   }
+
+// Frees a matrix from AllocateShortMatrix and resets the caller's pointer
+// to NULL; a NULL matrix is ignored. Delegates to the FreeMatrix<T>
+// template declared in MemoryAllocators.h.
+void FreeShortMatrix(short ** & matrix, int rows)
+   {
+   FreeMatrix<short>(matrix, rows);
+   }
+
+// Allocates a rows x cols matrix of double; returns NULL on failure.
+// Thin wrapper over the AllocateMatrix<T> template declared in
+// MemoryAllocators.h, so the allocate-and-unwind logic lives in one place.
+double ** AllocateDoubleMatrix(int rows, int cols)
+   {
+   return AllocateMatrix<double>(rows, cols);
+   }
+
+// Frees a matrix from AllocateDoubleMatrix and resets the caller's pointer
+// to NULL. A NULL matrix is ignored, as in the other Free*Matrix routines;
+// without that guard, matrix[i] would be read through a NULL pointer
+// whenever rows > 0.
+void FreeDoubleMatrix(double ** & matrix, int rows)
+   {
+   if (matrix == NULL)
+      return;
+
+   for (int i = 0; i < rows; i++)
+      delete [] matrix[i];
+
+   delete [] matrix;
+
+   matrix = NULL;
+   }
+
+
+
diff --git a/libsrc/MemoryAllocators.h b/libsrc/MemoryAllocators.h
new file mode 100644
index 0000000..4291c22
--- /dev/null
+++ b/libsrc/MemoryAllocators.h
@@ -0,0 +1,103 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MemoryAllocators.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MEMORY_ALLOCATORS_H__
+#define __MEMORY_ALLOCATORS_H__
+
+#include <stdlib.h>
+#include <new>
+
+template <class T> T** AllocateMatrix(int rows, int cols);
+template <class T> T** AllocateMatrix(int rows, int cols, T value);
+template <class T> void FreeMatrix(T ** & matrix, int rows);
+
+char ** AllocateCharMatrix(int rows, int cols);
+void FreeCharMatrix(char ** & matrix, int rows);
+
+float ** AllocateFloatMatrix(int rows, int cols);
+void FreeFloatMatrix(float ** & matrix, int rows);
+
+double ** AllocateDoubleMatrix(int rows, int cols);
+void FreeDoubleMatrix(double ** & matrix, int rows);
+
+int ** AllocateIntMatrix(int rows, int cols);
+void FreeIntMatrix(int ** & matrix, int rows);
+
+short ** AllocateShortMatrix(int rows, int cols);
+void FreeShortMatrix(short ** & matrix, int rows);
+
+char *** AllocateCharCube(int n, int rows, int cols);
+void FreeCharCube(char *** & matrix, int n, int rows);
+
+
+// Template definitions follow ...
+//
+
+// Allocates a rows x cols matrix of T as an array of row pointers, each
+// row being its own array of cols elements. Elements are not explicitly
+// initialized. Returns NULL if any allocation fails, after releasing
+// whatever had been allocated up to that point.
+template <class T> T** AllocateMatrix(int rows, int cols)
+   {
+   // Plain new reports failure by throwing std::bad_alloc, which would
+   // make the NULL checks below unreachable; use the non-throwing form
+   T ** matrix = new (std::nothrow) T * [rows];
+
+   // Stop early if we are out of memory
+   if (matrix == NULL)
+      return NULL;
+
+   for (int i = 0; i < rows; i++)
+      {
+      matrix[i] = new (std::nothrow) T [cols];
+
+      if (matrix[i] == NULL)
+         {
+         // Safely unravel allocation: free rows i-1 .. 0, then the spine
+         while (i--)
+            delete [] matrix[i];
+
+         delete [] matrix;
+
+         return NULL;
+         }
+      }
+
+   return matrix;
+   }
+
+// Allocates a rows x cols matrix of T and sets every element to value.
+// Returns NULL, without touching anything, if the allocation failed.
+template <class T> T** AllocateMatrix(int rows, int cols, T value)
+   {
+   T ** matrix = AllocateMatrix<T>(rows, cols);
+
+   if (matrix == NULL)
+      return NULL;
+
+   for (int r = 0; r < rows; r++)
+      {
+      T * row = matrix[r];
+
+      for (int c = 0; c < cols; c++)
+         row[c] = value;
+      }
+
+   return matrix;
+   }
+
+// Releases a matrix built by AllocateMatrix<T>: every one of the rows row
+// arrays, then the array of row pointers. The caller's pointer is reset
+// to NULL afterwards; a NULL matrix is ignored.
+template <class T> void FreeMatrix(T ** & matrix, int rows)
+   {
+   if (matrix != NULL)
+      {
+      for (int r = 0; r < rows; r++)
+         delete [] matrix[r];
+
+      delete [] matrix;
+      matrix = NULL;
+      }
+   }
+
+#endif
+
+
diff --git a/libsrc/MemoryInfo.cpp b/libsrc/MemoryInfo.cpp
new file mode 100644
index 0000000..1151147
--- /dev/null
+++ b/libsrc/MemoryInfo.cpp
@@ -0,0 +1,38 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MemoryInfo.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MemoryInfo.h"
+
+// Formats a byte count as a short human-readable size ("kb", "mb", "gb"
+// or "tb", one decimal). The result lives in a function-local static
+// String, so each call overwrites the text returned by the previous one.
+String & MemoryInfo(double bytes)
+   {
+   static String info;
+
+   const double kb = 1024.;
+   const double mb = kb * 1024.;
+   const double gb = mb * 1024.;
+   const double tb = gb * 1024.;
+
+   // Anything below one kilobyte is reported with a fixed label
+   if (bytes < kb)
+      {
+      info = "<1.0 kb";
+      return info;
+      }
+
+   if (bytes >= tb)
+      info.printf("%.1f tb", bytes / tb);
+   else if (bytes >= gb)
+      info.printf("%.1f gb", bytes / gb);
+   else if (bytes >= mb)
+      // kb and mb figures are biased upwards by (unit - 1) bytes
+      info.printf("%.1f mb", (bytes + mb - 1) / mb);
+   else
+      info.printf("%.1f kb", (bytes + 1023) / kb);
+
+   return info;
+   }
+
diff --git a/libsrc/MemoryInfo.h b/libsrc/MemoryInfo.h
new file mode 100644
index 0000000..27e2da1
--- /dev/null
+++ b/libsrc/MemoryInfo.h
@@ -0,0 +1,26 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MemoryInfo.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MEMORYINFO_H__
+#define __MEMORYINFO_H__
+
+#include "StringBasics.h"
+
+String & MemoryInfo(double bytes);
+
+#endif
+
diff --git a/libsrc/MiniDeflate.cpp b/libsrc/MiniDeflate.cpp
new file mode 100644
index 0000000..8058209
--- /dev/null
+++ b/libsrc/MiniDeflate.cpp
@@ -0,0 +1,349 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MiniDeflate.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MiniDeflate.h"
+
+// Convenient constants and macros
+//
+#define EMPTY_KEY 123
+#define uchar unsigned char
+
+#ifndef min
+#define min(a,b) (((a)<(b))?(a):(b))
+#endif
+
+// Allocates the I/O staging buffer and the match hash table.
+// The buffer carries 5 bytes beyond BUFFER_SIZE: Inflate reads file data
+// at buffer + 5, and Deflate may write a few bytes past BUFFER_SIZE
+// before it flushes.
+MiniDeflate::MiniDeflate()
+   : buffer(new unsigned char [BUFFER_SIZE + 5]),
+     hash_keys(new unsigned char [HASH_SIZE]),
+     hash_values(new unsigned char * [HASH_SIZE * HASH_DEPTH])
+   {
+   }
+
+// Releases the three arrays owned by the object, in reverse order of
+// their allocation in the constructor.
+MiniDeflate::~MiniDeflate()
+   {
+   delete [] hash_values;
+   delete [] hash_keys;
+   delete [] buffer;
+   }
+
+// Looks through the hash bucket for earlier occurrences of the string at
+// in and keeps the longest one.
+//
+//   in         - current position in the input
+//   len        - number of input bytes available from in
+//   hash       - bucket index for the three bytes at in
+//   best_pos   - updated to the start of the best earlier match, if any
+//   best_match - in: length to beat; out: best match length found
+//
+// If no match of at least OKAY_MATCH bytes is known afterwards, the
+// current position is recorded in the bucket, replacing its oldest entry.
+void MiniDeflate::EvaluateMatch(unsigned char * in, int len, int hash,
+                                unsigned char * & best_pos, int & best_match)
+   {
+   // A match length is stored in 16 bits with a bias of 66
+   int max = min(len, 0xFFFF + 66);
+
+   // Check each possible match (up to HASH_DEPTH), newest entry first
+   for (int i = HASH_DEPTH; i > 0; i--)
+      {
+      uchar * pos = hash_values[hash * HASH_DEPTH + ((hash_keys[hash] + i) % HASH_DEPTH)];
+
+      // Unused slot, or further back than a 14 bit offset can express
+      if (pos == NULL || in - pos >= 0x4001) break;
+
+      int match = 0;
+
+      while (match < max && pos[match] == in[match])
+         match++;
+
+      if (match > best_match)
+         {
+         best_match = match;
+         best_pos = pos;
+         }
+      }
+
+   // If string seems pretty unique, add to hash table. The slot index is
+   // derived from HASH_DEPTH (a power of 2) rather than a literal 7 / 8,
+   // so that it stays in step with the lookup above
+   if (best_match < OKAY_MATCH)
+      {
+      int delta = hash_keys[hash] = (uchar) ((hash_keys[hash] + 1) % HASH_DEPTH);
+      hash_values[hash * HASH_DEPTH + delta] = in;
+      }
+   }
+
+void MiniDeflate::QuoteLiterals(unsigned char * & in, int literal, // Copies `literal` raw bytes from in to out,
+ unsigned char * & out, int & buffer_len, // writing the staging buffer to `output` whenever it fills up;
+ FILE * output) // in, out and buffer_len are all advanced in place
+ {
+ if (buffer_len < 0) // token bytes written by the caller already ran past BUFFER_SIZE: flush them first
+ {
+ fwrite(buffer, out - buffer, 1, output); // NOTE(review): fwrite results are never checked in this function
+ buffer_len = BUFFER_SIZE;
+ out = buffer;
+ }
+ // Run does not fit in the space left: fill the buffer, write it out, start again at the front
+ while (buffer_len < literal)
+ {
+ literal -= buffer_len;
+ while (buffer_len--)
+ { *out = *in; in++; out++; }
+ fwrite(buffer, BUFFER_SIZE, 1, output);
+ buffer_len = BUFFER_SIZE;
+ out = buffer;
+ }
+ // What is left fits: copy it and account for the space used
+ while (literal--)
+ { *out = *in; in++; out++; buffer_len--; }
+ }
+
+void MiniDeflate::OutputLiterals(unsigned char * & in, int literal, // Emits `literal` pending bytes starting at in as one or more
+ unsigned char * & out, int & buffer_len, // "literal quote" tokens, each followed by the quoted bytes;
+ FILE * output) // does nothing when literal is 0
+ {
+ while (literal > 0)
+ if (literal < 16) // 1-15 bytes: one token byte holding the length
+ {
+ *out = (char) literal; out++; buffer_len--;
+ QuoteLiterals(in, literal, out, buffer_len, output);
+ break;
+ }
+ else if (literal < 31) // 16-30 bytes: a quote of 15 followed by a quote of the rest
+ {
+ *out = 15; out++; buffer_len--;
+ QuoteLiterals(in, 15, out, buffer_len, output);
+ *out = (uchar) (literal - 15); out++; buffer_len--;
+ QuoteLiterals(in, literal - 15, out, buffer_len, output);
+ break;
+ }
+ else // 31 bytes or more: token 0, then (length - 31) as a 16 bit big-endian value
+ {
+ int length = min(literal, 0xFFFF + 31); // largest run one token can describe; the loop handles any remainder
+ literal -= length;
+ length -= 31;
+
+ *out = 0; out++;
+ *out = (uchar) (length >> 8); out++;
+ *out = (uchar) (length & 0xFF); out++;
+ buffer_len -= 3;
+
+ QuoteLiterals(in, length + 31, out, buffer_len, output);
+ }
+ }
+
+
+void MiniDeflate::Deflate(FILE * output, void * void_input, size_t len) // Compresses len bytes at void_input and writes the token stream to output
+ {
+ uchar * in = (uchar *) void_input;
+ uchar * out = (uchar *) buffer;
+ int buffer_len = BUFFER_SIZE; // space still free in the staging buffer
+ // Start with every hash bucket marked as unused
+ for (int i = 0; i < HASH_SIZE; i++) hash_keys[i] = EMPTY_KEY;
+ // in2 trails in: the bytes in [in2, in) are literals that have not been written yet
+ uchar * in2 = in;
+ // Hashing needs three bytes, so matching stops when fewer than three remain
+ while (len > 2)
+ {
+ // Hash the current input value
+ int hash = ((in[0] << 16) | (in[1] << 8) | in[2]) % HASH_SIZE;
+
+ if (hash_keys[hash] != EMPTY_KEY)
+ // Possible matches in hash table
+ {
+ int best_match = 0;
+ uchar * best_pos; // left unset here; only read once best_match >= 3, by which time EvaluateMatch has assigned it
+
+ EvaluateMatch(in, len, hash, best_pos, best_match); // NOTE(review): len is a size_t but the parameter is an int
+
+ // If there are no decent matches
+ if (best_match < 3)
+ {
+ in++;
+ len--;
+ continue;
+ }
+
+ // Try look ahead if match isn't great
+ while (best_match < OKAY_MATCH && len > 3)
+ {
+ // Peek to see if we could get a better match
+ int next_hash = ((in[1] << 16) | (in[2] << 8) | in[3]) % HASH_SIZE;
+
+ if (hash_keys[next_hash] == EMPTY_KEY) break;
+
+ int next_match = 0;
+ uchar * next_pos;
+
+ EvaluateMatch(in + 1, len - 1, next_hash, next_pos, next_match);
+
+ // Didn't find a better match
+ if (next_match <= best_match + 1) break;
+
+ // Found a better match, so try again
+ in++;
+ len--;
+ best_match = next_match;
+ best_pos = next_pos;
+ }
+ // Offsets are stored minus one; Inflate adds the one back
+ int best_offset = in - best_pos - 1;
+
+ // This is where we output stuff
+ // Check if we have some literals to output first
+ OutputLiterals(in2, in - in2, out, buffer_len, output);
+ // Skip over the matched bytes; nothing is pending as a literal any more
+ in2 = in += best_match;
+ len -= best_match;
+
+ if (best_match < 17 && best_offset < 0x1000) // short match: (length - 1) in the high nibble, 12 bit offset
+ {
+ *out = (uchar)(((best_match - 1) << 4) | (best_offset >> 8)); out++;
+ *out = (uchar)(best_offset & 0xFF); out++;
+ buffer_len -= 2;
+ }
+ else if (best_match < 66) // high nibble 1, 14 bit offset, (length - 2) in the low 6 bits of byte 2
+ {
+ *out = (uchar) (16 | (best_offset >> 10)); out++;
+ *out = (uchar) ((best_offset >> 2) & 0xFF); out++;
+ *out = (uchar) ((best_offset << 6) | (best_match - 2)); out++;
+ buffer_len -= 3;
+ }
+ else // as above with a zero length field, followed by (length - 66) in 16 bits
+ {
+ *out = (uchar) (16 | (best_offset >> 10)); out++;
+ *out = (uchar) ((best_offset >> 2) & 0xFF); out++;
+ *out = (uchar) (best_offset << 6); out++;
+ best_match -= 66;
+ *out = (uchar) (best_match >> 8); out++;
+ *out = (uchar) (best_match & 0xFF); out++;
+ buffer_len -= 5;
+ }
+ // Token bytes may have run past BUFFER_SIZE (the buffer has 5 spare bytes); flush if so
+ if (buffer_len <= 0)
+ {
+ fwrite(buffer, out - buffer, 1, output); // NOTE(review): fwrite results are not checked anywhere in Deflate
+ buffer_len = BUFFER_SIZE;
+ out = buffer;
+ }
+ }
+ // Never seen this sequence before
+ else
+ {
+ hash_keys[hash] = 0;
+ for (int i = 1; i < HASH_DEPTH; i++) hash_values[hash * 8 + i] = NULL; // NOTE(review): literal 8 matches HASH_DEPTH only while HASH_DEPTH is 8
+ hash_values[hash * 8] = in;
+ in++;
+ len--;
+ }
+ }
+
+ // Check if we have some trailing literals to output
+ in += len;
+ OutputLiterals(in2, in - in2, out, buffer_len, output);
+
+ // Flush output
+ if (out != buffer) fwrite(buffer, out - buffer, 1, output);
+ }
+
+void MiniDeflate::CiteLiteral(unsigned char * & out, int literal, // Copies `literal` quoted bytes from the input buffer to out,
+ unsigned char * & in, int & buffer_len, // refilling the buffer from `input` whenever it runs dry;
+ FILE * input) // out, in and buffer_len are all advanced in place
+ {
+ while (buffer_len < literal) // not enough buffered input: use up what is there, then read more
+ {
+ literal -= buffer_len;
+ while (buffer_len--)
+ { *out = *in; in++; out++; }
+ buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input); // NOTE(review): if this reads 0 bytes while literal > 0, the outer loop never ends
+ in = buffer + 5;
+ }
+ // The rest is already in the buffer
+ while (literal--)
+ { *out = *in; in++; out++; buffer_len--; }
+ }
+
+
+void MiniDeflate::Inflate(FILE * input, void * void_output, size_t len) // Decodes tokens read from input until len bytes have been produced at void_output
+ {
+ uchar * out = (uchar *) void_output;
+ uchar * in = (uchar *) buffer + 5; // file data is always read at buffer + 5; the first 5 bytes hold carried-over input
+ int buffer_len = BUFFER_SIZE; // overwritten by the fread just below
+
+ buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input);
+ // NOTE(review): len is unsigned; a token describing more than len bytes would wrap it around instead of ending the loop
+ while (len)
+ {
+ int match_len = *in >> 4; // high nibble of the token selects its kind
+
+ // Matching a literal
+ if (match_len == 0)
+ {
+ match_len = *in & 0x0F;
+ in++, buffer_len--;
+
+ // If match_len == 0 then string is longer than 30 characters
+ // Strings of 16 - 30 characters are encoded as two short strings
+ if (match_len == 0)
+ {
+ match_len = (in[0] << 8) + in[1] + 31;
+ in += 2;
+ buffer_len -= 2;
+ }
+
+ CiteLiteral(out, match_len, in, buffer_len, input);
+ len -= match_len;
+ }
+ // Long match, 14 bit offset
+ else if (match_len == 1)
+ {
+ int offset = (((in[0] & 0x0F) << 10) | (in[1] << 2) | (in[2] >> 6)) + 1;
+ match_len = (in[2] & 0x3F) + 2;
+ in += 3; buffer_len -= 3;
+ // A zero length field means the real length follows as 16 bits, biased by 66
+ if (match_len == 2)
+ {
+ match_len = ((in[0] << 8) | in[1]) + 66;
+ in += 2;
+ buffer_len -= 2;
+ }
+ // Copy byte by byte from earlier output; source and destination may overlap
+ uchar * match_pos = out - offset;
+ len -= match_len;
+ while (match_len--)
+ {
+ *out = *match_pos;
+ out++, match_pos++;
+ }
+ }
+ // Typical short match
+ else
+ {
+ int offset = (((in[0] & 0x0F) << 8) | in[1]) + 1;
+ in += 2; buffer_len -= 2;
+
+ uchar * match_pos = out - offset;
+ len -= ++match_len; // the nibble holds length - 1
+ while (match_len--)
+ {
+ *out = *match_pos;
+ out++, match_pos++;
+ }
+ }
+ // Fewer than 5 unread bytes (the longest token): move them in front of buffer + 5 and refill behind them
+ if (buffer_len < 5)
+ {
+ uchar * in2 = (uchar *) buffer + 5 - buffer_len;
+ while (in2 != buffer + 5)
+ { *in2 = *in; in2++; in++; }
+
+ in = buffer + 5 - buffer_len;
+ buffer_len += fread(buffer + 5, 1, BUFFER_SIZE, input);
+ }
+ }
+ // Give back to the stream whatever was read ahead but not consumed
+ if (buffer_len) fseek(input, -buffer_len, SEEK_CUR);
+ }
+
+
+
+
diff --git a/libsrc/MiniDeflate.h b/libsrc/MiniDeflate.h
new file mode 100644
index 0000000..fc12f58
--- /dev/null
+++ b/libsrc/MiniDeflate.h
@@ -0,0 +1,103 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/MiniDeflate.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MINIDEFLATE_H__
+#define __MINIDEFLATE_H__
+
+#include <stdio.h>
+
+// MiniDeflate reads and writes files in a simple Deflate like format
+// A quick overview of this format follows, at the bottom of this file
+//
+
+// Performance tuning constants
+//
+
+// Hash table size is HASH_SIZE (a prime)
+#define HASH_SIZE 4093
+// Hash table depth is HASH_DEPTH (a power of 2)
+#define HASH_DEPTH 8
+// Matches that are not at least OKAY_MATCH chars are added to hash table
+#define OKAY_MATCH 32
+// Buffer size for FILE I/O
+#define BUFFER_SIZE (32 * 1024)
+
+class MiniDeflate
+ {
+ public:
+ MiniDeflate();
+ ~MiniDeflate();
+ // Deflate compresses `bytes` bytes of input to a file; Inflate decodes until `bytes` bytes of output exist
+ void Deflate(FILE * output, void * input, size_t bytes);
+ void Inflate(FILE * input, void * ouput, size_t bytes);
+
+ private:
+ unsigned char * buffer; // file I/O staging area, BUFFER_SIZE + 5 bytes
+ unsigned char * hash_keys; // per bucket: index of the most recently written slot, HASH_SIZE entries
+ unsigned char ** hash_values; // HASH_DEPTH remembered input positions per bucket
+
+ // Inline functions used during file compression
+ inline void EvaluateMatch(unsigned char * in, int len, int hash,
+ unsigned char * & best_pos, int & best_match);
+ inline void QuoteLiterals(unsigned char * & in, int literal,
+ unsigned char * & out, int & buffer_len,
+ FILE * output);
+ inline void OutputLiterals(unsigned char * & in, int literal,
+ unsigned char * & out, int & buffer_len,
+ FILE * output);
+ inline void CiteLiteral(unsigned char * & out, int literal, // used by Inflate, i.e. during decompression
+ unsigned char * & in, int & buffer_len,
+ FILE * input);
+ };
+
+// Format specification for deflate files
+//
+// A compressed file is a sequence of bytes {0 .. N}.
+// Each byte is a sequence of bits [0 .. 7] with 0 as the Most Significant Bit.
+//
+// The following tokens are recognized:
+//
+// Literal quotes -- refer to unique strings
+//
+// BYTE0 BYTE1 BYTE2 Description
+// 0 HI LO Quote of 31 bytes or more
+// Followed by ((HI << 8) + LO + 31) quoted chars
+// 0:4|LEN Quote of 1-15 bytes
+// Followed by LEN quoted chars
+//
+// String matches -- refer to previous strings in the input stream
+//
+// BYTE0 BYTE1 BYTE2 BYTE3 BYTE4 Description
+// 1:4|OFF OFF1 OFF2:2|0 HI LO Long match of >= 66 bytes
+// Offset of OFF|OFF1|OFF2 + 1
+// Length of HI|LO + 66
+// 1:4|OFF OFF1 OFF2:2|LEN Distant match of < 66 bytes
+// Offset of OFF|OFF1|OFF2 + 1
+// Length of LEN + 2
+// LEN|OFF OFF1 Nearby short match
+// Offset OFF|OFF1 + 1
+// Length LEN + 1
+//
+
+// NOTE: When partitioning bytes, I use the notation X:n|Y so that
+// X takes the n MSB bits of byte and Y takes the remaining bits.
+
+
+#endif
+
+
+
diff --git a/libsrc/Parameters.cpp b/libsrc/Parameters.cpp
new file mode 100644
index 0000000..31e23af
--- /dev/null
+++ b/libsrc/Parameters.cpp
@@ -0,0 +1,735 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Parameters.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Parameters.h"
+#include "Constant.h"
+#include "MathConstant.h"
+#include "Error.h"
+
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+int Parameter::nameCol = 30;
+int Parameter::statusCol = 15;
+
+// Sets up a single-letter option.
+//   c    - option letter (stored in lower case)
+//   desc - description text; a private copy is made
+//   v    - untyped pointer to the variable the option controls
+Parameter::Parameter(char c, const char * desc, void * v)
+   {
+   ch = (char) tolower(c);
+
+   // Copy the text including its terminating zero
+   size_t bytes = strlen(desc) + 1;
+   description = new char [bytes];
+   memcpy(description, desc, bytes);
+
+   var = v;
+   warnings = NULL;
+   }
+
+// Tries to interpret argv[argn] as this option. One leading '-' or '/'
+// is skipped; if the next character matches the option letter (ignoring
+// case) the rest of the argument is handed to Translate().
+// Returns true when the argument was recognized.
+bool Parameter::Read(int , char ** argv, int argn)
+   {
+   char * arg = argv[argn];
+   int p = 0;
+
+   if (arg[0] == '-' || arg[0] == '/')
+      p = 1;
+
+   if ((char) tolower(arg[p]) != ch)
+      return false;
+
+   Translate(arg + p + 1);
+   return true;
+   }
+
+bool Parameter::TranslateExtras(const char * , const char * ) // Default: never takes the following argument as its value
+ { return false; }
+
+// Formats a printf-style message. The text is appended to the warning
+// buffer if one has been attached, and passed to the global ::warning()
+// otherwise.
+void Parameter::warning(const char * format, ...)
+   {
+   String message;
+
+   va_list args;
+   va_start(args, format);
+   message.vprintf(format, args);
+   va_end(args);
+
+   if (warnings != NULL)
+      {
+      (*warnings) += message;
+      return;
+      }
+
+   ::warning(message);
+   }
+
+// Stores atoi(value) in the bound int variable.
+void IntParameter::Translate(const char * value)
+   {
+   int * target = (int *) var;
+
+   *target = atoi(value);
+   }
+
+// Accepts extras as the option's value when nothing was attached to the
+// option itself (value is empty) and extras passes CheckInteger().
+// Returns true if extras was consumed.
+bool IntParameter::TranslateExtras(const char * value, const char * extras)
+   {
+   if (value[0] == 0 && CheckInteger(extras))
+      {
+      Translate(extras);
+      return true;
+      }
+
+   return false;
+   }
+
+// Prints one status line: description, current value and a usage hint.
+void IntParameter::Status()
+   {
+   int current = * (int *) var;
+
+   printf("%*s : %*d (-%c9999)\n", nameCol, description,
+          statusCol, current, ch);
+   }
+
+// Updates the bound bool from the text following the option letter:
+// '+' switches it on, '-' switches it off, nothing at all toggles it.
+// Anything else only produces a warning.
+void SwitchParameter::Translate(const char * value)
+   {
+   bool * flag = (bool *) var;
+   char option = *value;
+
+   if (option == '+')
+      *flag = true;
+   else if (option == '-')
+      *flag = false;
+   else if (option == 0)
+      *flag = ! *flag;
+   else
+      warning("Command line parameter -%c%s: the option '%c' has no meaning\n",
+              ch, value, value[0]);
+   }
+
+// Prints one status line showing the switch as ON or OFF.
+void SwitchParameter::Status()
+   {
+   const char * state = * (bool *) var ? "ON" : "OFF";
+
+   printf("%*s : %*s (-%c[+|-])\n", nameCol, description,
+          statusCol, state, ch);
+   }
+
+DoubleParameter::DoubleParameter(char c, const char * desc, double & v) // Binds option letter c to the caller's double v
+ : Parameter(c, desc, &v) // the base class keeps v's address as its untyped var pointer
+ {}
+
+// Stores atof(value) in the bound double; an empty value stores _NAN_.
+void DoubleParameter::Translate(const char * value)
+   {
+   double * target = (double *) var;
+
+   if (value[0] == 0)
+      {
+      *target = _NAN_;
+      return;
+      }
+
+   *target = atof(value);
+   }
+
+// Accepts extras as the option's value when nothing was attached to the
+// option itself (value is empty) and extras passes CheckDouble().
+// Returns true if extras was consumed.
+bool DoubleParameter::TranslateExtras(const char * value, const char * extras)
+   {
+   if (value[0] == 0 && CheckDouble(extras))
+      {
+      Translate(extras);
+      return true;
+      }
+
+   return false;
+   }
+
+// Prints one status line. The number format depends on the magnitude of
+// the value: three decimals from 0.00095 upwards, a bare 0 at or below
+// 1e-15, exponent notation in between, and "NAN" for the _NAN_ marker.
+void DoubleParameter::Status()
+   {
+   double current = * (double *) var;
+   double magnitude = fabs(current);
+
+   if (current == _NAN_)
+      {
+      printf("%*s : %*s (-%c99.999)\n", nameCol, description,
+             statusCol, "NAN", ch);
+      return;
+      }
+
+   if (magnitude >= 0.00095)
+      printf("%*s : % *.3f (-%c99.999)\n", nameCol, description,
+             statusCol, current, ch);
+   else if (magnitude <= 1e-15)
+      printf("%*s : % *.0f (-%c99.999)\n", nameCol, description,
+             statusCol, current, ch);
+   else
+      printf("%*s : %*.0e (-%c99.999)\n", nameCol, description,
+             statusCol, current, ch);
+   }
+
+// Copies value into the bound String.
+void StringParameter::Translate(const char * value)
+   {
+   * (String *) var = value;
+   }
+
+// Accepts extras as the option's value when nothing was attached to the
+// option itself. Unless the parameter is flagged as required, an extras
+// string starting with '-' is left alone. Returns true if extras was
+// consumed.
+bool StringParameter::TranslateExtras(const char * value, const char * extras)
+   {
+   if (value[0] != 0)
+      return false;
+
+   if (!required && extras[0] == '-')
+      return false;
+
+   * (String *) var = extras;
+
+   return true;
+   }
+
+// Prints one status line showing the current text of the bound String.
+void StringParameter::Status()
+   {
+   String * text = (String *) var;
+
+   printf("%*s : %*s (-%cname)\n", nameCol, description,
+          statusCol, (const char *) (*text), ch);
+   }
+
+// Prints one status line with the description of the option whose code
+// equals the bound int. If none matches, the search stops on the table's
+// terminating entry (ch == 0) and that entry's description is printed.
+void ListParameter::Status()
+   {
+   int current = *((int *)var);
+   OptionList * l = options;
+
+   while (l->ch != 0 && l->code != current)
+      l++;
+
+   printf("%*s : %*s (-%c[%s])\n", nameCol, description,
+          statusCol, l->description, ch, (const char *) key);
+   }
+
+// Selects the option whose letter matches the first character of value
+// (ignoring case) and stores its code in the bound int.
+//
+// If no option matches, the code of the table's terminating entry
+// (ch == 0) is stored, and a warning is issued unless value is empty.
+void ListParameter::Translate(const char * value)
+   {
+   // <ctype.h> functions need an unsigned char value; a plain char may be
+   // negative
+   int wanted = tolower((unsigned char) value[0]);
+
+   OptionList * l;
+
+   for(l = options; l->ch != 0; l++)
+      if (tolower(l->ch) == wanted)
+         break;
+
+   // The format has three conversions, so exactly three arguments follow
+   if (l->ch == 0 && value[0] != 0)
+      warning("Command line parameter -%c%s: the option '%c' has no meaning\n",
+              ch, value, value[0]);
+
+   * ((int*) var) = l->code;
+   }
+
+// Binds option letter c to the int v, which takes one code out of the
+// table opt (terminated by an entry with ch == 0). Also builds the
+// "a|b|c" key of option letters that Status() prints as a usage hint.
+ListParameter::ListParameter(char c, const char * desc, int & v, OptionList * opt)
+   : Parameter(c, desc, &v)
+   {
+   options = opt;
+
+   for (OptionList * l = options; l->ch != 0; l++)
+      {
+      key += l->ch;
+      key += '|';
+      }
+
+   // Drop the trailing '|'. With an empty table there is nothing to drop,
+   // and the length must not be set to -1
+   if (key.Length() > 0)
+      key.SetLength(key.Length() - 1);
+   }
+
+// Binds option letter c to the int v, which holds a combination of the
+// codes in the table opt (terminated by an entry with ch == 0). Also
+// builds the "a|b|c" key of option letters that Status() prints as a
+// usage hint.
+SetParameter::SetParameter(char c, const char * desc, int & v, OptionList * opt)
+   : Parameter(c, desc, &v)
+   {
+   options = opt;
+
+   for (OptionList * l = options; l->ch != 0; l++)
+      {
+      key += l->ch;
+      key += '|';
+      }
+
+   // Drop the trailing '|'. With an empty table there is nothing to drop,
+   // and the length must not be set to -1
+   if (key.Length() > 0)
+      key.SetLength(key.Length() - 1);
+   }
+
+// Prints one line per selected option. The first line carries the
+// description and usage hint, later ones are continuation lines joined
+// with '&'. Bits that have been reported are cleared from a working copy
+// so that they are not matched again by a later table entry.
+void SetParameter::Status()
+   {
+   bool printed = false;
+   int remaining = * (int *) var;
+
+   for(OptionList * l = options; l->ch != 0; l++)
+      {
+      bool selected = (l->code & remaining) || (l->code == *(int *) var);
+
+      if (!selected)
+         continue;
+
+      if (printed)
+         printf("%*s & %*s\n", nameCol, "",
+                statusCol, l->description);
+      else
+         printf("%*s : %*s (-%c{%s})\n", nameCol, description,
+                statusCol, l->description, ch, (const char *) key);
+
+      printed = true;
+      remaining &= ~l->code;
+      }
+   }
+
+// Rebuilds the bound int from scratch: it is cleared, then the code of
+// every option whose letter occurs in value (ignoring case) is or-ed in.
+// Each character that matches no option produces a warning.
+void SetParameter::Translate(const char * value)
+   {
+   int * flags = (int *) var;
+
+   *flags = 0;
+
+   for(const char * chr = value; *chr != 0; chr++)
+      {
+      int wanted = tolower(*chr);
+      bool valid = false;
+
+      for(OptionList * l = options; l->ch != 0; l++)
+         {
+         if (tolower(l->ch) != wanted)
+            continue;
+
+         *flags |= l->code;
+         valid = true;
+         }
+
+      if (!valid)
+         warning("Command line parameter -%c%s: the option '%c' has no meaning\n",
+                 ch, value, *chr);
+      }
+   }
+
+// Indexes a table of long (--name) options. The table is scanned from
+// its second entry up to the entry with a NULL description.
+//
+// Entries before the LP_LEGACY_PARAMETERS marker go into the main index
+// when they have a value pointer; entries without one are group
+// headings, and the longest heading sets group_len. From the marker
+// onwards, entries with a value pointer go into the legacy index.
+LongParameters::LongParameters(const char * desc, LongParameterList * lst)
+   : Parameter('-', desc, NULL)
+   {
+   list = lst;
+
+   index.Clear();
+   legacyIndex.Clear();
+   group_len = 0;
+
+   LongParameterList * ptr = list + 1;
+
+   // Current options and group headings
+   for ( ; ptr->description != NULL; ptr++)
+      {
+      if (ptr->type == LP_LEGACY_PARAMETERS)
+         break;
+
+      if (ptr->value == NULL)
+         group_len = max(strlen(ptr->description), group_len);
+      else
+         index.Add(ptr->description, ptr);
+      }
+
+   // Legacy options, continuing from where the first scan stopped
+   for ( ; ptr->description != NULL; ptr++)
+      if (ptr->value != NULL)
+         legacyIndex.Add(ptr->description, ptr);
+   }
+
+void LongParameters::Translate(const char * cstr) // Applies one long option given as "name" or "name:value"
+ {
+ String value(cstr);
+ // Only the part before the first ':' (if any) is looked up as the option name
+ int p = value.FastFindChar(':');
+ int option = p == -1 ? index.FindStem(value) : index.FindStem(value.Left(p));
+ // A result of -2 is reported as an ambiguous name
+ if (option == -2)
+ {
+ warning("Command line parameter --%s is ambiguous\n", (const char *) value);
+ return;
+ }
+
+ LongParameterList * ptr;
+
+ if (option >= 0)
+ ptr = (LongParameterList *) index.Object(option);
+ else // not a current option: fall back to the legacy index
+ {
+ int alternate = p == -1 ? legacyIndex.FindStem(value) :
+ legacyIndex.FindStem(value.Left(p));
+
+ if (alternate < 0)
+ {
+ warning("Command line parameter --%s is undefined\n", (const char *) value);
+ return;
+ }
+
+ ptr = (LongParameterList *) legacyIndex.Object(alternate);
+ ptr->touched = true; // Status() lists a legacy option only once it has been touched
+ }
+
+ if (ptr->type == LP_BOOL_PARAMETER)
+ {
+ if (p == -1) // no value given: toggle
+ * (bool *) ptr->value ^= true;
+ else // true only if the text after ':' compares equal to "ON" under SlowCompare
+ * (bool *) ptr->value = value.SubStr(p + 1).SlowCompare("ON") == 0;
+
+ // In exclusive groups, only one option may be selected
+ if (ptr->exclusive)
+ {
+ for (int i = -1; ptr[i].exclusive; i--) * (bool *)ptr[i].value = false; // relies on a non-exclusive entry bounding the group on each side
+ for (int i = 1; ptr[i].exclusive; i++) * (bool *)ptr[i].value = false;
+ }
+ }
+ else if (ptr->type == LP_INT_PARAMETER)
+ if (p == -1) // no value given: flip between 0 and 1
+ * (int *) ptr->value = * (int *) ptr->value ? 0 : 1;
+ else // "ON" stands for 1, anything else is read as an integer
+ * (int *) ptr->value = value.SubStr(p + 1).SlowCompare("ON") == 0 ?
+ 1 : value.SubStr(p + 1).AsInteger();
+ else if (ptr->type == LP_DOUBLE_PARAMETER)
+ {
+ if (p != -1) // without a value the variable is left unchanged
+ * (double *) ptr->value = value.SubStr(p + 1).AsDouble();
+ }
+ else if (ptr->type == LP_STRING_PARAMETER)
+ {
+ if (p != -1) // without a value the variable is left unchanged
+ * (String *) ptr->value = value.SubStr(p + 1);
+ }
+ }
+
+// Tries to use extras (the next command line argument) as the value of
+// the long option named cstr. This applies only when cstr has no ':' of
+// its own and names an int, double or String option; int and double
+// options also require extras to pass CheckInteger() / CheckDouble().
+// Returns true if extras was consumed.
+//
+// NOTE(review): unlike Translate(), a legacy option matched here is not
+// marked as touched - confirm whether that is intended.
+bool LongParameters::TranslateExtras(const char * cstr, const char * extras)
+   {
+   if (strchr(cstr, ':') != NULL)
+      return false;
+
+   LongParameterList * ptr;
+
+   int option = index.FindStem(cstr);
+
+   if (option < 0)
+      {
+      // Not a current option, so try the legacy names
+      option = legacyIndex.FindStem(cstr);
+
+      if (option < 0)
+         return false;
+
+      ptr = (LongParameterList *) legacyIndex.Object(option);
+      }
+   else
+      ptr = (LongParameterList *) index.Object(option);
+
+   bool accepted = true;
+
+   if (ptr->type == LP_INT_PARAMETER && CheckInteger(extras))
+      * (int *) ptr->value = atoi(extras);
+   else if (ptr->type == LP_DOUBLE_PARAMETER && CheckDouble(extras))
+      * (double *) ptr->value = atof(extras);
+   else if (ptr->type == LP_STRING_PARAMETER)
+      * (String *) ptr->value = extras;
+   else
+      accepted = false;
+
+   return accepted;
+   }
+
+void LongParameters::Status(LongParameterList * ptr, int & line_len, bool & need_a_comma) // Prints one table entry; line_len and need_a_comma carry the layout state between calls
+ {
+ String state;
+ int line_start = group_len ? group_len + 5 : 0; // column at which option names start
+ // An entry without a value pointer is a group heading: it starts a new line
+ if (ptr->value == NULL)
+ {
+ printf("%s %*s :", need_a_comma ? "\n" : "", group_len + 2, ptr->description);
+ need_a_comma = false;
+ line_len = line_start;
+ }
+ else // a real option: build its " [state]" suffix, then print "--name[state]"
+ {
+ if (ptr->type == LP_BOOL_PARAMETER)
+ state = * (bool *) ptr->value ? " [ON]" : "";
+ else if (ptr->type == LP_INT_PARAMETER)
+ if (* (int *) ptr->value == 1 && ptr->exclusive || * (int *) ptr->value == 0) // 0, or 1 in an exclusive group, is shown like a switch
+ state = * (int *) ptr->value ? " [ON]" : "";
+ else // any other int is shown as a number
+ state = " [", state += * (int *) ptr->value, state += ']';
+ else if (ptr->type == LP_DOUBLE_PARAMETER)
+ if (* (double *) ptr->value != _NAN_)
+ {
+ double value = * (double *) ptr->value;
+
+ state = " [";
+ if (value == 0.0 || value >= 0.01) // two decimals for 0 and for values from 0.01 up, exponent form otherwise
+ state.catprintf("%.2f", value);
+ else
+ state.catprintf("%.1e", value);
+ state += ']';
+ }
+ else
+ state = "";
+ else if (ptr->type == LP_STRING_PARAMETER)
+ state = " [" + * (String *) ptr->value + "]";
+ // 3 covers the leading " --"; need_a_comma adds one more column for the separator
+ int item_len = 3 + strlen(ptr->description) + need_a_comma + state.Length();
+ // Wrap before exceeding 78 columns, unless the line holds no option yet
+ if (item_len + line_len > 78 && line_len > line_start)
+ {
+ line_len = line_start;
+ printf("%s\n%*s", need_a_comma ? "," : "", line_len, "");
+ need_a_comma = 0;
+ item_len -= 1;
+ }
+
+ printf("%s --%s%s", need_a_comma ? "," : (need_a_comma = true, ""),
+ ptr->description, (const char *) state);
+
+ need_a_comma = true;
+ line_len += item_len;
+ }
+ }
+
+// Prints the state of every long option, preceded by the description of
+// the whole set if there is one. Entries before the LP_LEGACY_PARAMETERS
+// marker are always listed; entries after it are listed under
+// "Additional Options", and only if they have been touched.
+void LongParameters::Status()
+   {
+   if (description != NULL && description[0] != 0)
+      printf("\n%s\n", description);
+
+   bool need_a_comma = false;
+   int line_len = 0;
+
+   bool legacy_parameters = false;
+
+   // An int rather than a bool: applying ++ to a bool is deprecated and
+   // ill-formed since C++17
+   int legacy_count = 0;
+
+   for (LongParameterList * ptr = list + 1; ptr->description != NULL; ptr++)
+      if (ptr->type == LP_LEGACY_PARAMETERS)
+         legacy_parameters = true;
+      else if (legacy_parameters == false)
+         Status(ptr, line_len, need_a_comma);
+      else if (ptr->touched)
+         {
+         // The heading is printed once, before the first legacy option
+         if (legacy_count == 0)
+            {
+            printf("\n\nAdditional Options:\n %*s ", group_len + 3, "");
+            line_len = group_len + 5;
+            need_a_comma = false;
+            }
+
+         Status(ptr, line_len, need_a_comma);
+         legacy_count++;
+         }
+
+   printf("\n");
+   }
+
+// Appends p to the list and points it at this list's warning buffer.
+// error() is called when the list would hold 'size' entries after the
+// addition; the array itself is never enlarged here.
+void ParameterList::Add(Parameter * p)
+   {
+   const int entries_after_add = count + 1;
+
+   if (entries_after_add >= size)
+      error("Parameter list size should be increased");
+
+   p->SetWarningBuffer(warnings);
+
+   pl[count] = p;
+   count++;
+   }
+
+// Parses argv[start] .. argv[argc - 1], handing every "-x..." argument to
+// the first parameter whose option letter equals the lower-cased x.
+//
+// For the matching parameter the value is taken from, in this order:
+//   1. TranslateExtras(text after the letter, next argument), if a next
+//      argument exists and the call returns true (the next argument is
+//      then consumed as well);
+//   2. the next argument, when nothing follows the letter and the next
+//      argument does not start with '-';
+//   3. the text following the letter inside the same argument.
+// Arguments that match no parameter are reported through 'warnings'.
+void ParameterList::Read(int argc, char ** argv, int start)
+   {
+   MakeString(argc, argv, start);
+
+   for (int i = start; i < argc; i++)
+      {
+      bool success = false;
+
+      if (argv[i][0] == '-' && argv[i][1])
+         {
+         // The <ctype.h> functions need a value representable as unsigned
+         // char; passing a negative plain char is undefined behaviour
+         int letter = tolower((unsigned char) argv[i][1]);
+
+         for (int j = 0; j < count; j++)
+            {
+            success = letter == (unsigned char) pl[j]->ch;
+
+            if (!success)
+               continue;
+
+            if ((i + 1 < argc) && pl[j]->TranslateExtras(argv[i] + 2, argv[i + 1]))
+               i++;
+            else if (argv[i][2] == 0 && (i + 1 < argc) && (argv[i + 1][0] != '-'))
+               pl[j]->Translate(argv[++i]);
+            else
+               pl[j]->Translate(argv[i] + 2);
+
+            break;
+            }
+         }
+
+      if (!success)
+         {
+         String warning;
+
+         warning.printf("Command line parameter %s (#%d) ignored\n", argv[i], i);
+         warnings += warning;
+         }
+      }
+   }
+
+// Parses the command line like Read(), but tolerates trailing arguments
+// (for example file names) that belong to no parameter.
+//
+// Returns the index of the last argv[] item consumed by a recognised
+// option, or start - 1 when no option was recognised. Unrecognised
+// arguments located before a recognised option are reported through
+// 'warnings'; unrecognised arguments after the last recognised option are
+// left for the caller.
+int ParameterList::ReadWithTrailer(int argc, char ** argv, int start)
+   {
+   MakeString(argc, argv, start);
+
+   int last_success = start - 1;
+   bool split = false;
+
+   for (int i = start; i < argc; i++)
+      {
+      bool success = false;
+
+      if (argv[i][0] == '-' && argv[i][1])
+         {
+         // The <ctype.h> functions need a value representable as unsigned
+         // char; passing a negative plain char is undefined behaviour
+         int letter = tolower((unsigned char) argv[i][1]);
+
+         for (int j = 0; j < count; j++)
+            {
+            success = letter == (unsigned char) pl[j]->ch;
+
+            if (success)
+               {
+               if (argv[i][2] == 0 && (i + 1 < argc) && (argv[i + 1][0] != '-'))
+                  {
+                  // The value is the following argument
+                  pl[j]->Translate(argv[i + 1]);
+                  split = true;
+                  }
+               else
+                  pl[j]->Translate(argv[i] + 2);
+               break;
+               }
+            }
+         }
+
+      // Everything skipped since the previous recognised option is ignored.
+      // catprintf appends, so earlier warnings are kept, matching the way
+      // Read() accumulates its warnings.
+      if (success)
+         for (last_success++; last_success < i; last_success++)
+            warnings.catprintf("Command line parameter %s (#%d) ignored\n",
+                               argv[last_success], last_success);
+
+      // Step over the argument that supplied the value
+      if (split) { split = false; last_success++; i++; }
+      }
+
+   return last_success;
+   }
+
+
+// Prints the status of every registered parameter, then any warnings
+// collected while parsing (the warning buffer is cleared afterwards) and
+// finally any notes stored in 'messages'.
+void ParameterList::Status()
+   {
+   printf("\nThe following parameters are in effect:\n");
+
+   int next = 0;
+   while (next < count)
+      {
+      pl[next]->Status();
+      next++;
+      }
+
+   printf("\n");
+
+   if (warnings.Length() != 0)
+      {
+      const char * text = (const char *) warnings;
+
+      ::warning("Problems encountered parsing command line:\n\n%s", text);
+      warnings.Clear();
+      }
+
+   if (messages.Length() != 0)
+      {
+      const char * notes = (const char *) messages;
+
+      printf("NOTES:\n%s\n", notes);
+      }
+   }
+
+// Stores in 'string' a copy of argv[start] .. argv[argc - 1], each argument
+// followed by a single space.
+//
+// The buffer built by a previous call is released, so calling Read() or
+// ReadWithTrailer() more than once no longer leaks it. Arguments are
+// copied at a running write position rather than with strcat(), which
+// rescans the whole buffer on every call.
+void ParameterList::MakeString(int argc, char ** argv, int start)
+   {
+   size_t len = 0;
+
+   for (int i = start; i < argc; i++)
+      len += strlen(argv[i]) + 1;
+
+   // Allocate before releasing the old buffer so that 'string' stays valid
+   // if the allocation fails
+   char * buffer = new char [len + 1];
+   char * out = buffer;
+
+   for (int i = start; i < argc; i++)
+      {
+      size_t arg_len = strlen(argv[i]);
+
+      memcpy(out, argv[i], arg_len);
+      out += arg_len;
+      *out++ = ' ';
+      }
+
+   *out = 0;
+
+   delete [] string;
+   string = buffer;
+   }
+
+// Deletes every registered parameter, then the pointer array and the
+// command line copy.
+ParameterList::~ParameterList()
+   {
+   int next = 0;
+
+   while (next < count)
+      {
+      delete pl[next];
+      next++;
+      }
+
+   delete [] pl;
+   delete [] string;
+   }
+
+// Returns true when value is an optional sign followed by one or more
+// decimal digits and nothing else.
+//
+// A sign with no digits after it ("+" or "-") is rejected; it used to be
+// accepted and was then converted to 0. The empty string is rejected too.
+bool Parameter::CheckInteger(const char * value)
+   {
+   int pos = 0;
+
+   if (value[pos] == '+' || value[pos] == '-')
+      pos++;
+
+   // At least one digit is required
+   if (value[pos] < '0' || value[pos] > '9')
+      return false;
+
+   while (value[pos] >= '0' && value[pos] <= '9')
+      pos++;
+
+   // Valid only if the digits run to the end of the string
+   return value[pos] == 0;
+   }
+
+// Returns true when value is a decimal floating point literal: an optional
+// sign, digits with at most one decimal point (at least one digit in
+// total), optionally followed by 'e' or 'E' and an exponent accepted by
+// CheckInteger().
+//
+// Any other character makes the check fail. Stray characters and a second
+// decimal point used to be skipped silently, so "1abc" and "1.2.3" were
+// accepted, as were "." and a sign on its own.
+bool Parameter::CheckDouble(const char * value)
+   {
+   int pos = 0;
+
+   if (value[pos] == '+' || value[pos] == '-')
+      pos++;
+
+   bool digits  = false;
+   bool decimal = false;
+
+   for ( ; value[pos] != 0; pos++)
+      if (value[pos] >= '0' && value[pos] <= '9')
+         digits = true;
+      else if (value[pos] == '.' && !decimal)
+         decimal = true;
+      else if ((value[pos] == 'e' || value[pos] == 'E') && digits)
+         return CheckInteger(value + pos + 1);
+      else
+         return false;
+
+   return digits;
+   }
+
+// Forces a bool setting to 'value'. When the setting actually changes, the
+// printf-style explanation is formatted and appended to 'messages'.
+void ParameterList::Enforce(bool & var, bool value, const char * format, ...)
+   {
+   if (var != value)
+      {
+      var = value;
+
+      String note;
+      va_list args;
+
+      va_start(args, format);
+      note.vprintf(format, args);
+      va_end(args);
+
+      messages += note;
+      }
+   }
+
+// Forces an int setting to 'value'. When the setting actually changes, the
+// printf-style explanation is formatted and appended to 'messages'.
+void ParameterList::Enforce(int & var, int value, const char * format, ...)
+   {
+   if (var != value)
+      {
+      var = value;
+
+      String note;
+      va_list args;
+
+      va_start(args, format);
+      note.vprintf(format, args);
+      va_end(args);
+
+      messages += note;
+      }
+   }
+
+// Forces a double setting to 'value' (compared with ==). When the setting
+// actually changes, the printf-style explanation is formatted and appended
+// to 'messages'.
+void ParameterList::Enforce(double & var, double value, const char * format, ...)
+   {
+   if (var != value)
+      {
+      var = value;
+
+      String note;
+      va_list args;
+
+      va_start(args, format);
+      note.vprintf(format, args);
+      va_end(args);
+
+      messages += note;
+      }
+   }
+
+// Forces a String setting to 'value'. Nothing happens when SlowCompare()
+// reports the two as equal; otherwise the setting is replaced and the
+// printf-style explanation is formatted and appended to 'messages'.
+void ParameterList::Enforce(String & var, const char * value, const char * format, ...)
+   {
+   if (var.SlowCompare(value) != 0)
+      {
+      var = value;
+
+      String note;
+      va_list args;
+
+      va_start(args, format);
+      note.vprintf(format, args);
+      va_end(args);
+
+      messages += note;
+      }
+   }
+
diff --git a/libsrc/Parameters.h b/libsrc/Parameters.h
new file mode 100644
index 0000000..94bd299
--- /dev/null
+++ b/libsrc/Parameters.h
@@ -0,0 +1,292 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Parameters.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __PARAMETERS_H__
+#define __PARAMETERS_H__
+
+#include "StringMap.h"
+
+#include <ctype.h>
+#include <stddef.h>
+
+class ParameterList;
+
+// Abstract base class for one single-letter command line option.
+//
+// ParameterList (a friend) compares 'ch' with the lower-cased character
+// that follows '-' on the command line and then calls TranslateExtras()
+// and/or Translate() to parse the option text.
+class Parameter
+ {
+ protected:
+ // Option letter matched by ParameterList
+ char ch;
+ // Released with delete [] in the destructor
+ char * description;
+ // Derived classes pass the address of the bound variable to the
+ // constructor; presumably stored here (constructor not shown) - confirm
+ void * var;
+
+ // Class-wide column settings, changed through SetNameLen / SetStatusLen
+ static int nameCol;
+ static int statusCol;
+
+ // Parses the text that belongs to this option
+ virtual void Translate(const char * value) = 0;
+ // Called with the text after the option letter and the next argument;
+ // ParameterList::Read consumes the next argument when this returns true
+ virtual bool TranslateExtras(const char * value, const char * extras);
+
+ // Syntax checks for numeric option values
+ static bool CheckInteger(const char * value);
+ static bool CheckDouble(const char * value);
+
+ // Buffer installed with SetWarningBuffer()
+ String * warnings;
+
+ public:
+
+ Parameter(char c, const char * desc, void * v);
+
+ virtual ~Parameter()
+ {
+ delete [] description;
+ }
+
+ virtual bool Read(int argc, char ** argv, int argn);
+ // Prints the current setting
+ virtual void Status() = 0;
+
+ static void SetNameLen(int len) { nameCol = len; }
+ static void SetStatusLen(int len) { statusCol = len; }
+
+ // Stores the address of 'buffer'; the buffer must outlive this object
+ void SetWarningBuffer(String & buffer) { warnings = &buffer; }
+ void warning(const char * format, ...);
+
+ friend class ParameterList;
+ };
+
+// Command line option bound to an int variable.
+class IntParameter : public Parameter
+ {
+ public:
+ // Passes the address of v to the base class
+ IntParameter(char c, const char * desc, int & v)
+ : Parameter(c, desc, &v)
+ {}
+
+ virtual void Status();
+
+ protected:
+ virtual void Translate(const char * value);
+ virtual bool TranslateExtras(const char * value, const char * extras);
+ };
+
+// IntParameter whose Status() prints nothing, so the option does not appear
+// in the parameter summary.
+class HiddenInteger : public IntParameter
+ {
+ public:
+ HiddenInteger(char c, const char * desc, int & v)
+ : IntParameter(c, desc, v)
+ {}
+
+ virtual void Status() { }
+ };
+
+
+// Command line option bound to a bool variable.
+class SwitchParameter : public Parameter
+ {
+ public:
+ // Passes the address of v to the base class
+ SwitchParameter(char c, const char * desc, bool & v)
+ : Parameter(c, desc, &v)
+ {}
+
+ virtual void Status();
+
+ protected:
+ virtual void Translate(const char * value);
+ };
+
+// SwitchParameter whose Status() prints nothing, so the option does not
+// appear in the parameter summary.
+class HiddenSwitch : public SwitchParameter
+ {
+ public:
+ HiddenSwitch(char c, const char * desc, bool & v)
+ : SwitchParameter(c, desc, v)
+ {}
+
+ virtual void Status() { }
+ };
+
+// Command line option bound to a double variable. Unlike the other simple
+// parameter classes, its constructor is defined out of line.
+class DoubleParameter : public Parameter
+ {
+ public:
+ DoubleParameter(char c, const char * desc, double & v);
+
+ virtual void Status();
+
+ protected:
+ virtual void Translate(const char * value);
+ virtual bool TranslateExtras(const char * value, const char * extras);
+ };
+
+// DoubleParameter whose Status() prints nothing, so the option does not
+// appear in the parameter summary.
+class HiddenDouble : public DoubleParameter
+ {
+ public:
+ HiddenDouble(char c, const char * desc, double &v)
+ : DoubleParameter(c, desc, v)
+ {}
+
+ virtual void Status() { }
+ };
+
+// Command line option bound to a String variable.
+class StringParameter : public Parameter
+ {
+ public:
+ // allowBlank = false marks the parameter as required
+ StringParameter(char c, const char * desc, String & v, bool allowBlank = true)
+ : Parameter(c, desc, &v)
+ { required = !allowBlank; }
+
+ virtual void Status();
+
+ protected:
+ // Set from the constructor's allowBlank argument; its use is in code not
+ // shown here
+ bool required;
+
+ virtual void Translate(const char * value);
+ virtual bool TranslateExtras(const char * value, const char * extras);
+ };
+
+// StringParameter (blank values allowed) whose Status() prints nothing, so
+// the option does not appear in the parameter summary.
+class HiddenString : public StringParameter
+ {
+ public:
+ HiddenString(char c, const char * desc, String & v)
+ : StringParameter(c, desc, v)
+ {}
+
+ virtual void Status() { }
+ };
+
+// One choice offered by a ListParameter or SetParameter.
+// NOTE(review): 'description' is a non-const char * - initialising it from
+// a string literal is rejected by C++11; confirm how option tables are
+// written before changing the type.
+struct OptionList
+ {
+ char ch;
+ char * description;
+ int code;
+ };
+
+// Helpers for declaring an OptionList array. BEGIN_OPTION_LIST opens the
+// array initialiser (the leading ';' ends any preceding statement) and
+// END_OPTION_LIST appends a final entry {0, none, 0} and closes the array.
+#define BEGIN_OPTION_LIST(name) ; OptionList name[] = {
+#define END_OPTION_LIST(none) , {0, none, 0} };
+
+// Command line option bound to an int variable and configured with an
+// OptionList table.
+// NOTE(review): presumably selects exactly one option from the table - the
+// implementation is not shown here; confirm.
+class ListParameter : public Parameter
+ {
+ public:
+ ListParameter(char c, const char * desc, int & v, OptionList * opt);
+
+ virtual void Status();
+
+ protected:
+ String key;
+ OptionList * options;
+ virtual void Translate(const char * value);
+ };
+
+// Command line option bound to an int variable and configured with an
+// OptionList table.
+// NOTE(review): presumably allows several options from the table to be
+// combined - the implementation is not shown here; confirm.
+class SetParameter : public Parameter
+ {
+ public:
+ SetParameter(char c, const char * desc, int & v, OptionList * opt);
+
+ virtual void Status();
+
+ protected:
+ String key;
+ OptionList * options;
+ virtual void Translate(const char * value);
+ };
+
+// One row of the table that drives LongParameters.
+struct LongParameterList
+ {
+ // Option name or group label; NULL terminates the table
+ const char * description;
+ // Address of the bound variable; NULL for group headings
+ void * value;
+ // For integer entries: a value of 1 is displayed as [ON]
+ bool exclusive;
+ // One of the LP_* type codes, or 0 for headings and terminators
+ int type;
+ // Legacy entries are listed by Status() only when this is set
+ bool touched;
+ };
+
+// Type codes stored in LongParameterList::type
+#define LP_BOOL_PARAMETER 1
+#define LP_INT_PARAMETER 2
+#define LP_DOUBLE_PARAMETER 3
+#define LP_STRING_PARAMETER 4
+#define LP_LEGACY_PARAMETERS 99
+
+// Table building macros. Each one expands to a row in the order
+// { description, value, exclusive, type, touched }. The table starts with
+// an empty row and finishes with a row whose description is NULL.
+#define BEGIN_LONG_PARAMETERS(array) LongParameterList array[] = {\
+ { NULL, NULL, false, 0, false },
+#define LONG_PARAMETER_GROUP(label) { label, NULL, false, 0, false },
+#define LONG_PARAMETER(label,boolptr) { label, boolptr, false, LP_BOOL_PARAMETER, false },
+#define EXCLUSIVE_PARAMETER(label,boolptr) { label, boolptr, true, LP_BOOL_PARAMETER, false },
+#define LONG_INTPARAMETER(label,intptr) { label, intptr, false, LP_INT_PARAMETER, false },
+#define LONG_SMARTINTPARAMETER(label,intptr) { label, intptr, true, LP_INT_PARAMETER, false },
+#define LONG_DOUBLEPARAMETER(label,doubleptr) { label, doubleptr, false, LP_DOUBLE_PARAMETER, false },
+#define LONG_STRINGPARAMETER(label,stringptr) { label, stringptr, false, LP_STRING_PARAMETER, false },
+#define BEGIN_LEGACY_PARAMETERS() { "$$$", NULL, false, LP_LEGACY_PARAMETERS, false },
+#define END_LONG_PARAMETERS() { NULL, NULL, false, 0, false }};
+
+// Parameter that handles a whole table of named ("--name") options
+// described by a LongParameterList array.
+class LongParameters : public Parameter
+ {
+ public:
+ LongParameters(const char * desc, LongParameterList * list);
+
+ // Prints every entry of the table
+ virtual void Status();
+
+ protected:
+ // NOTE(review): presumably map option names to table rows - filled in by
+ // code not shown here; confirm
+ StringMap index;
+ StringMap legacyIndex;
+
+ // Table supplied to the constructor
+ LongParameterList * list;
+ // Width used when printing group headings and continuation lines
+ int group_len;
+
+ virtual void Translate(const char * value);
+ virtual bool TranslateExtras(const char * value, const char * extras);
+
+ // Prints one entry; line_len and need_a_comma carry the layout state
+ void Status(LongParameterList * ptr, int & line_len, bool & need_a_comma);
+ };
+
+// Collection of Parameter objects that parses a command line and reports
+// the resulting settings. The list deletes its parameters in its destructor.
+// NOTE(review): the class owns raw arrays but declares no copy constructor
+// or assignment operator; copying an instance would double-delete them.
+class ParameterList
+ {
+ protected:
+ // Array of 'size' slots, of which the first 'count' are in use
+ Parameter ** pl;
+ int count;
+ int size;
+
+ // Rebuilds 'string' from the command line
+ void MakeString(int argc, char ** argv, int start = 1);
+
+ public:
+ // Space separated copy of the parsed arguments; NULL until a command
+ // line has been read
+ char * string;
+
+ // s is the capacity of the parameter array
+ ParameterList(int s = 36)
+ {
+ size = s;
+ count = 0;
+ pl = new Parameter * [size];
+ string = NULL;
+ }
+
+ virtual ~ParameterList();
+
+ // Registers p; the list takes ownership of it
+ void Add(Parameter * p);
+
+ // Tries to process all command line arguments
+ virtual void Read(int argc, char ** argv, int start = 1);
+
+ // Allows for trailing, unprocessed, filenames in the command line
+ // Returns the index of the last argv[] item consumed by a recognised
+ // option (start - 1 if none was recognised)
+ virtual int ReadWithTrailer(int argc, char ** argv, int start = 1);
+
+ // Outputs summary of parameter switches and settings
+ virtual void Status();
+
+ // Keeps track of warnings generated during parameter processing
+ String warnings;
+ String messages;
+
+ // Functions that gracefully enforce parameter settings
+ // Each sets var to value and, if that changed var, appends the formatted
+ // reason to 'messages'
+ void Enforce(bool & var, bool value, const char * reason, ...);
+ void Enforce(int & var, int value, const char * reason, ...);
+ void Enforce(double & var, double value, const char * reason, ...);
+ void Enforce(String & var, const char * value, const char * reason, ...);
+ };
+
+#endif
+
diff --git a/libsrc/Pedigree.cpp b/libsrc/Pedigree.cpp
new file mode 100644
index 0000000..82aafa9
--- /dev/null
+++ b/libsrc/Pedigree.cpp
@@ -0,0 +1,916 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Pedigree.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Pedigree.h"
+#include "GenotypeLists.h"
+#include "MemoryInfo.h"
+#include "Constant.h"
+#include "Error.h"
+#include "Sort.h"
+
+#include <stdlib.h>
+
+// Definitions and initial values of the class-wide Pedigree settings
+bool Pedigree::sexAsCovariate = false;
+String Pedigree::missing("-99.999");
+
+// Creates an empty pedigree with room for 10000 persons and a one-slot
+// family array (replaced when families are built).
+Pedigree::Pedigree() : pd ()
+   {
+   // Person table
+   size = 10000;
+   count = 0;
+   persons = new Person * [size];
+
+   // Family table
+   familyCount = 0;
+   families = new Family * [1];
+
+   haveTwins = 0;
+
+   // No per-file descriptions yet
+   multiPd = NULL;
+   multiFileCount = 0;
+   }
+
+// Deletes every person and family, then the arrays that held them and the
+// array of per-file descriptions.
+Pedigree::~Pedigree()
+   {
+   int next = 0;
+   while (next < count)
+      delete persons[next++];
+
+   next = 0;
+   while (next < familyCount)
+      delete families[next++];
+
+   delete [] families;
+   delete [] persons;
+
+   // delete [] is a no-op for a NULL pointer
+   delete [] multiPd;
+   }
+
+// Sorts the person table by family and person id, links every person to
+// its parents and rebuilds the sibship and family tables.
+//
+// Duplicated persons are reported on standard output. If a duplicate is
+// found, or CheckParents() fails for any person, error() is called before
+// sibships and families are rebuilt.
+void Pedigree::Sort()
+ {
+ QuickSort(persons, count, sizeof (Person *),
+ COMPAREFUNC Pedigree::ComparePersons);
+
+ haveTwins = 0;
+
+ // Check for structural problems in input pedigree
+ bool problem = false;
+
+ // Check that we have no duplicates...
+ // (after sorting, duplicates are adjacent)
+ for (int i = 1; i < count; i++)
+ if (ComparePersons( (const Person **) &persons[i-1],
+ (const Person **) &persons[i]) == 0)
+ {
+ printf("Family %s: Person %s is duplicated\n",
+ (const char *) persons[i]->famid,
+ (const char *) persons[i]->pid);
+ problem = true;
+
+ // Skip the rest of this run so each duplicated id is reported once
+ do { i++; }
+ while (i < count &&
+ ComparePersons((const Person **) &persons[i-1],
+ (const Person **) &persons[i]) == 0);
+ }
+
+ // Assign parents...
+ for (int i = 0; i < count; i++)
+ {
+ persons[i]->serial = i;
+ // FindPerson does a binary search, so it needs the sorted table
+ persons[i]->father = FindPerson(persons[i]->famid, persons[i]->fatid);
+ persons[i]->mother = FindPerson(persons[i]->famid, persons[i]->motid);
+
+ problem |= !persons[i]->CheckParents();
+
+ persons[i]->AssessStatus();
+
+ // Check if we have any twins...
+ haveTwins |= persons[i]->zygosity;
+ }
+
+ if (problem)
+ error("Please correct problems with pedigree structure\n");
+
+ MakeSibships();
+ MakeFamilies();
+ }
+
+// Rebuilds the sibs / sibCount fields of every non-founder.
+//
+// A copy of the person table is ordered with CompareParents so that
+// persons with the same parents become adjacent. Every run of non-founders
+// sharing both parent pointers then receives its own freshly allocated
+// array listing the whole run.
+void Pedigree::MakeSibships()
+   {
+   Person ** byParents = new Person * [count];
+   for (int i = 0; i < count; i++)
+      byParents[i] = persons[i];
+
+   QuickSort(byParents, count, sizeof (Person *),
+             COMPAREFUNC Pedigree::CompareParents);
+
+   int start = 0;
+   while (start < count)
+      {
+      if (byParents[start]->isFounder())
+         {
+         start++;
+         continue;
+         }
+
+      // [start, stop) is the run that shares both parents with 'start'
+      int stop = start + 1;
+      while (stop < count &&
+             byParents[start]->mother == byParents[stop]->mother &&
+             byParents[start]->father == byParents[stop]->father)
+         stop++;
+
+      int members = stop - start;
+
+      for (int j = start; j < stop; j++)
+         {
+         Person * sib = byParents[j];
+
+         // Drop the list left over from an earlier call
+         if (sib->sibCount) delete [] sib->sibs;
+
+         sib->sibCount = members;
+         sib->sibs = new Person * [sib->sibCount];
+
+         for (int k = 0; k < members; k++)
+            sib->sibs[k] = byParents[start + k];
+         }
+
+      start = stop;
+      }
+
+   delete [] byParents;
+   }
+
+// Discards the current family table and builds a new one containing one
+// Family for each run of consecutive persons with the same family id.
+void Pedigree::MakeFamilies()
+   {
+   int old = 0;
+   while (old < familyCount)
+      delete families[old++];
+   delete [] families;
+
+   familyCount = 0;
+   families = new Family * [count];
+
+   int first = 0;
+   while (first < count)
+      {
+      // Advance 'next' past every person whose family id matches 'first'
+      int next = first;
+      while (next < count &&
+             SlowCompare(persons[first]->famid, persons[next]->famid) == 0)
+         next++;
+
+      int last = next - 1;
+
+      families[familyCount] = new Family(*this, first, last, familyCount);
+      familyCount++;
+
+      first = next;
+      }
+   }
+
+// Utility functions for finding a person in a pedigree
+
+// Search key handed to BinarySearch: a family id and a person id. The
+// family lookup reads only famid.
+struct PedigreeKey
+ {
+ const char * famid;
+ const char * pid;
+ };
+
+// BinarySearch callback: orders a key against a person, by family id first
+// and by person id when the family ids are equal.
+int CompareKeyToPerson(PedigreeKey * key, Person ** p)
+   {
+   Person * candidate = *p;
+
+   int byFamily = SlowCompare(key->famid, candidate->famid);
+
+   if (byFamily == 0)
+      return SlowCompare(key->pid, candidate->pid);
+
+   return byFamily;
+   }
+
+// BinarySearch callback: orders a key against a family by family id only
+// (key->pid is not read).
+int CompareKeyToFamily(PedigreeKey * key, Family ** f)
+ {
+ return SlowCompare(key->famid, (**f).famid);
+ }
+
+// Looks a person up by family and person id among all persons in the
+// pedigree. Returns NULL when there is no match. The person table must be
+// sorted, since the lookup is a binary search.
+Person * Pedigree::FindPerson(const char * famid, const char * pid)
+   {
+   // Same search as the three argument overload over the whole table
+   return FindPerson(famid, pid, count);
+   }
+
+// Looks a person up by family and person id among the first 'universe'
+// entries of the person table, using a binary search. Returns NULL when
+// there is no match.
+Person * Pedigree::FindPerson(const char *famid, const char *pid, int universe)
+   {
+   PedigreeKey wanted;
+   wanted.famid = famid;
+   wanted.pid = pid;
+
+   Person ** slot = (Person **) BinarySearch
+      (&wanted, persons, universe, sizeof(Person *),
+       COMPAREFUNC CompareKeyToPerson);
+
+   if (slot == NULL)
+      return (Person *) NULL;
+
+   return *slot;
+   }
+
+// Looks a family up by id with a binary search over the family table.
+// Returns NULL when there is no match.
+Family * Pedigree::FindFamily(const char * famid)
+   {
+   // Only the family id takes part in the comparison
+   PedigreeKey wanted;
+   wanted.famid = famid;
+
+   Family ** slot = (Family **) BinarySearch
+      (&wanted, families, familyCount, sizeof(Family *),
+       COMPAREFUNC CompareKeyToFamily);
+
+   if (slot == NULL)
+      return (Family *) NULL;
+
+   return *slot;
+   }
+
+// Member wrapper around the global ::CountAlleles for this pedigree.
+int Pedigree::CountAlleles(int marker)
+ { return ::CountAlleles(*this, marker); }
+
+// Applies the global ::LumpAlleles to every marker. A progress message is
+// printed first when the frequency threshold is positive.
+void Pedigree::LumpAlleles(double min, bool reorder )
+   {
+   bool announce = min > 0.0;
+
+   if (announce)
+      printf("Lumping alleles with frequencies of %.2f or less...\n\n", min);
+
+   int marker = 0;
+   while (marker < markerCount)
+      {
+      ::LumpAlleles(*this, marker, min, reorder);
+      marker++;
+      }
+   }
+
+// Runs the global ::EstimateFrequencies for every marker.
+//
+// Unless 'quiet' is set, progress is reported for each marker for which
+// that call returns true: as marker names wrapped at 79 columns, or, when
+// there are more than 100 markers, as a row of dots followed by a count.
+// 'estimator' also indexes the table of method descriptions used in the
+// heading.
+void Pedigree::EstimateFrequencies(int estimator, bool quiet)
+   {
+   const char * estimators[] =
+      { "using all genotypes", "using founder genotypes", "assumed equal" };
+
+   bool condensed = markerCount > 100;
+   int  grain = markerCount / 50;
+
+   bool estimated = false;
+   int  estimates = 0;
+   int  line = 3;
+
+   for (int m = 0; m < markerCount; m++)
+      {
+      bool report = ::EstimateFrequencies(*this, m, estimator);
+
+      if (!report || quiet)
+         continue;
+
+      // Heading, printed before the first reported marker
+      if (!estimated)
+         {
+         printf("Estimating allele frequencies... [%s]\n ",
+                estimators[estimator]);
+         estimated = true;
+         }
+
+      if (condensed)
+         {
+         // One dot for every 'grain' reported markers
+         bool tick = (estimates % grain == 0);
+         estimates++;
+
+         if (tick)
+            {
+            printf(".");
+            fflush(stdout);
+            }
+         }
+      else
+         {
+         if ( line + markerNames[m].Length() + 1 > 79)
+            {
+            printf("\n ");
+            line = 3;
+            }
+
+         printf("%s ", (const char *) markerNames[m]);
+         line += markerNames[m].Length() + 1;
+         }
+      }
+
+   if (estimated)
+      {
+      if (condensed)
+         printf("\nDone estimating frequencies for %d markers\n\n", estimates);
+      else
+         printf("\n\n");
+      }
+   }
+
+// Sort callback: orders persons by family id, then by person id.
+int Pedigree::ComparePersons(const Person ** p1, const Person ** p2)
+   {
+   const Person * left = *p1;
+   const Person * right = *p2;
+
+   int byFamily = SlowCompare(left->famid, right->famid);
+
+   if (byFamily == 0)
+      return SlowCompare(left->pid, right->pid);
+
+   return byFamily;
+   }
+
+// Sort callback: orders persons by family id, then father id, then mother
+// id, which places persons with the same parents next to each other.
+int Pedigree::CompareParents(const Person ** p1, const Person ** p2)
+   {
+   const Person * left = *p1;
+   const Person * right = *p2;
+
+   int order = SlowCompare(left->famid, right->famid);
+
+   if (order == 0)
+      order = SlowCompare(left->fatid, right->fatid);
+
+   if (order == 0)
+      order = SlowCompare(left->motid, right->motid);
+
+   return order;
+   }
+
+// Doubles the capacity of the person table, keeping the existing entries.
+//
+// 'size' is updated only once the larger array is in place, so a failed
+// allocation leaves the pedigree consistent. A capacity of zero grows to
+// one, because doubling zero would never make room for Add().
+void Pedigree::Grow()
+   {
+   int newSize = size > 0 ? size * 2 : 1;
+
+   Person ** temp = new Person * [newSize];
+   // Only reachable with compilers whose operator new returns NULL
+   if (temp == NULL) error("Out of memory");
+
+   for (int i = 0; i < count; i++)
+      temp[i] = persons[i];
+
+   delete [] persons;
+   persons = temp;
+   size = newSize;
+   }
+
+// Appends a new person to the table and fills it with Person::Copy(rhs).
+// The table is enlarged first when it is full. The pedigree is not
+// re-sorted.
+void Pedigree::Add(Person & rhs)
+   {
+   if (count == size)
+      Grow();
+
+   Person * added = new Person();
+
+   persons[count] = added;
+   count++;
+
+   added->Copy(rhs);
+   }
+
+// Writes the data file that describes the pedigree file columns: an
+// optional zygosity line, then one line per marker, trait, affection and
+// covariate, then the end-of-data line.
+void Pedigree::WriteDataFile(FILE * output)
+   {
+   // write in the following order:
+   // markers, traits, affections, covariates
+
+   if (haveTwins)
+      fprintf(output, " Z Zygosity \n");
+
+   int i;
+
+   for (i = 0; i < markerCount; i++)
+      fprintf(output, " M %s \n", (const char *) markerNames[i]);
+
+   for (i = 0; i < traitCount; i++)
+      fprintf(output, " T %s \n", (const char *) traitNames[i]);
+
+   for (i = 0; i < affectionCount; i++)
+      fprintf(output, " A %s \n", (const char *) affectionNames[i]);
+
+   for (i = 0; i < covariateCount; i++)
+      fprintf(output, " C %s \n", (const char *) covariateNames[i]);
+
+   fprintf(output, " E END-OF-DATA \n");
+   }
+
+// Writes every person with allele labels taken from each marker's
+// MarkerInfo, followed by a closing "end" line.
+void Pedigree::WritePedigreeFile(FILE * output)
+   {
+   // Fetch the marker descriptions once for all persons
+   MarkerInfo ** labels = new MarkerInfo * [markerCount];
+
+   int marker = 0;
+   while (marker < markerCount)
+      {
+      labels[marker] = GetMarkerInfo(marker);
+      marker++;
+      }
+
+   int person = 0;
+   while (person < count)
+      {
+      WriteRecodedPerson(output, person, labels);
+      person++;
+      }
+
+   fprintf(output, "end\n");
+
+   delete [] labels;
+   }
+
+// Writes one person with numeric allele codes: forwards to
+// WriteRecodedPerson() with no marker information (NULL).
+void Pedigree::WritePerson(FILE * output, int person, const char * famid,
+ const char * pid, const char * fatid, const char * motid)
+ {
+ WriteRecodedPerson(output, person, NULL, famid, pid, fatid, motid);
+ }
+
+// Writes one tab separated pedigree file line for persons[person].
+//
+// output     - destination stream
+// person     - index into the person table
+// markerInfo - per-marker allele labels, or NULL to write numeric alleles
+// famid, pid, fatid, motid - replacement ids; NULL keeps the person's own
+//
+// Columns: the four ids, sex, the zygosity code (only when the pedigree
+// has twins), then markers, traits, affections and covariates. Missing
+// phenotypes are written as "x".
+void Pedigree::WriteRecodedPerson(
+   FILE * output, int person, MarkerInfo ** markerInfo,
+   const char * famid, const char * pid, const char * fatid,
+   const char * motid)
+   {
+   Person * p = persons[person];
+
+   if (famid == NULL) famid = p->famid;
+   if (pid == NULL) pid = p->pid;
+   if (fatid == NULL) fatid = p->fatid;
+   if (motid == NULL) motid = p->motid;
+
+   // write in the following order:
+   // markers, traits, affections, covariates
+
+   fprintf(output, "%s\t%s\t%s\t%s\t%d\t",
+           famid, pid, fatid, motid, p->sex);
+
+   // String literals are const; a plain char * array of them is rejected
+   // by C++11
+   const char * twinCodes[] = {"0", "MZ", "DZ"};
+
+   if (haveTwins)
+      {
+      // Codes 0..2 have names; anything else, including a negative value
+      // that would index outside the table, is written as a number
+      if (p->zygosity >= 0 && p->zygosity <= 2)
+         fprintf(output, "%s\t", twinCodes[p->zygosity]);
+      else
+         fprintf(output, "%d\t", p->zygosity);
+      }
+
+   // Fixed-width genotype columns are used when there are few markers
+   for (int m = 0; m < markerCount; m++)
+      if (markerInfo == NULL)
+         fprintf(output, markerCount < 20 ? "%3d/%3d\t" : "%d/%d\t",
+                 p->markers[m][0], p->markers[m][1]);
+      else
+         fprintf(output, markerCount < 20 ? "%3s/%3s\t" : "%s/%s\t",
+                 (const char *) markerInfo[m]->GetAlleleLabel(p->markers[m][0]),
+                 (const char *) markerInfo[m]->GetAlleleLabel(p->markers[m][1]));
+
+   for (int t = 0; t < traitCount; t++)
+      if (p->isPhenotyped(t))
+         fprintf(output, "%.3f\t", p->traits[t]);
+      else
+         fprintf(output, "x\t");
+
+   for (int a = 0; a < affectionCount; a++)
+      if (p->isDiagnosed(a))
+         fprintf(output, "%d\t", p->affections[a]);
+      else
+         fprintf(output, "x\t");
+
+   for (int c = 0; c < covariateCount; c++)
+      if (p->isControlled(c))
+         fprintf(output, "%.3f\t", p->covariates[c]);
+      else
+         fprintf(output, "x\t");
+
+   fprintf(output, "\n");
+   }
+
+void Pedigree::WriteDataFile(const char * output)
+ {
+ FILE * f = fopen(output, "wt");
+ if (f == NULL) error("Couldn't open data file %s", output);
+ WriteDataFile(f);
+ fclose(f);
+ }
+
+void Pedigree::WritePedigreeFile(const char * output)
+ {
+ FILE * f = fopen(output, "wt");
+ if (f == NULL) error("Couldn't open pedigree file %s", output);
+ WritePedigreeFile(f);
+ fclose(f);
+ }
+
+// Requests an affection id for every trait, using the trait name followed
+// by '*' as the affection name.
+void Pedigree::PrepareDichotomization()
+   {
+   int trait = 0;
+
+   while (trait < traitCount)
+      {
+      String label = traitNames[trait] + "*";
+
+      GetAffectionID(label);
+      trait++;
+      }
+   }
+
+// Converts trait t into the affection named "<trait>*" and returns the id
+// of that affection.
+//
+// Phenotyped non-founders are coded 2 when their trait value is above the
+// threshold and 1 otherwise; everybody else is coded 0. When 'mean' equals
+// _NAN_ the threshold is the average over phenotyped non-founders, and if
+// there are none the function returns without changing any affection. The
+// pedigree is re-sorted afterwards.
+int Pedigree::Dichotomize(int t, double mean)
+   {
+   String label = traitNames[t] + "*";
+
+   int af = GetAffectionID(label);
+
+   if (mean == _NAN_)
+      {
+      double total = 0.0;
+      double scored = 0;
+
+      for (int i = 0; i < count; i++)
+         {
+         Person * person = persons[i];
+
+         if (person->isPhenotyped(t) && !person->isFounder())
+            {
+            total += person->traits[t];
+            scored++;
+            }
+         }
+
+      if (!scored) return af;
+
+      mean = total / scored;
+      }
+
+   printf("Dichotomizing %s around mean of %.3f ...\n",
+          (const char *) traitNames[t], mean);
+
+   for (int i = 0; i < count; i++)
+      {
+      Person * person = persons[i];
+
+      bool usable = person->isPhenotyped(t) && !person->isFounder();
+
+      if (!usable)
+         person->affections[af] = 0;
+      else if (person->traits[t] > mean)
+         person->affections[af] = 2;
+      else
+         person->affections[af] = 1;
+      }
+
+   Sort();
+
+   return af;
+   }
+
+// Calls Dichotomize() for every trait with the same threshold argument.
+void Pedigree::DichotomizeAll(double mean)
+ {
+ for (int t = 0; t < traitCount; t++)
+ Dichotomize(t, mean);
+ }
+
+// Runs the twin check (only when the pedigree has twins) followed by the
+// sex-linked or autosomal inheritance check, depending on chromosomeX.
+//
+// Returns true when no problem was found. When a problem was found and
+// abortIfInconsistent is set, error() is called.
+bool Pedigree::InheritanceCheck(bool abortIfInconsistent)
+   {
+   bool twinsFailed = false;
+
+   if (haveTwins)
+      twinsFailed = TwinCheck();
+
+   bool markersFailed = chromosomeX ? SexLinkedCheck() : AutosomalCheck();
+
+   bool fail = twinsFailed || markersFailed;
+
+   if (fail && abortIfInconsistent)
+      error("Mendelian inheritance errors detected\n");
+
+   return !fail;
+   }
+
+// Checks every marker for Mendelian inconsistencies, treating it as
+// autosomal, and prints a message for each problem found.
+//
+// For each full sibship, every child genotype is compared with the
+// genotyped parents, and the alleles and genotypes seen in the whole
+// sibship are counted. Families that pass and are neither nuclear nor
+// limited to one non-founder are then passed to
+// GenotypeList::EliminateGenotypes.
+//
+// Returns true when an inconsistency was found (true means failure).
+bool Pedigree::AutosomalCheck()
+ {
+ // Arrays indicating which alleles and homozygotes occur
+ // haplos and genos hold the index of the sibship that last counted an
+ // allele or genotype, so they need not be cleared for every sibship
+ IntArray haplos, genos, counts, failedFamilies;
+
+ bool fail = false;
+
+ // For each marker ...
+ for (int m = 0; m < markerCount; m++)
+ {
+ MarkerInfo * info = GetMarkerInfo(m);
+
+ // Summary for marker
+ int alleleCount = CountAlleles(m);
+ int genoCount = alleleCount * (alleleCount + 1) / 2;
+
+ // Initialize arrays
+ haplos.Dimension(alleleCount + 1);
+ haplos.Set(-1);
+
+ genos.Dimension(genoCount + 1);
+ genos.Set(-1);
+
+ failedFamilies.Dimension(familyCount);
+ failedFamilies.Zero();
+
+ counts.Dimension(alleleCount + 1);
+
+ for (int f = 0; f < familyCount; f++)
+ for (int i = families[f]->first; i <= families[f]->last; i++)
+ if (!persons[i]->isFounder() && persons[i]->sibs[0] == persons[i])
+ {
+ // This loop runs once per sibship
+ Alleles fat = persons[i]->father->markers[m];
+ Alleles mot = persons[i]->mother->markers[m];
+ bool fgeno = fat.isKnown();
+ bool mgeno = mot.isKnown();
+
+ // Number of alleles, homozygotes and genotypes in this sibship
+ int haplo = 0, homo = 0, diplo = 0;
+
+ // No. of different genotypes per allele
+ counts.Zero();
+
+ // In general, there should be no more than 3 genotypes per allele
+ // NOTE(review): the test below flags counts above 2 - confirm which
+ // limit is intended
+ bool too_many_genos = false;
+
+ for (int j = 0; j < persons[i]->sibCount; j++)
+ if (persons[i]->sibs[j]->isGenotyped(m))
+ {
+ Alleles geno = persons[i]->sibs[j]->markers[m];
+
+ int fat1 = fat.hasAllele(geno.one);
+ int fat2 = fat.hasAllele(geno.two);
+ int mot1 = mot.hasAllele(geno.one);
+ int mot2 = mot.hasAllele(geno.two);
+
+ // The child is inconsistent if no assignment of its two alleles
+ // to the genotyped parent(s) is possible
+ if (fgeno && mgeno && !(fat1 && mot2 || fat2 && mot1) ||
+ fgeno && !(fat1 || fat2) || mgeno && !(mot1 || mot2))
+ {
+ printf("%s - Fam %s: Child %s [%s/%s] has ",
+ (const char *) markerNames[m],
+ (const char *) persons[i]->sibs[j]->famid,
+ (const char *) persons[i]->sibs[j]->pid,
+ (const char *) info->GetAlleleLabel(geno.one),
+ (const char *) info->GetAlleleLabel(geno.two));
+
+ if (!fgeno || !mgeno)
+ printf("%s [%s/%s]\n",
+ fgeno ? "father" : "mother",
+ (const char *) info->GetAlleleLabel(fgeno ? fat.one : mot.one),
+ (const char *) info->GetAlleleLabel(fgeno ? fat.two : mot.two));
+ else
+ printf("parents [%s/%s]*[%s/%s]\n",
+ (const char *) info->GetAlleleLabel(fat.one),
+ (const char *) info->GetAlleleLabel(fat.two),
+ (const char *) info->GetAlleleLabel(mot.one),
+ (const char *) info->GetAlleleLabel(mot.two));
+
+ fail = true;
+ failedFamilies[f] = true;
+ }
+ else
+ {
+ // Count each allele once per sibship
+ if (haplos[geno.one] != i) { haplo++; haplos[geno.one] = i;};
+ if (haplos[geno.two] != i) { haplo++; haplos[geno.two] = i;};
+
+ int index = geno.SequenceCoded();
+
+ // Count each distinct genotype once per sibship
+ if (genos[index] != i)
+ {
+ genos[index] = i;
+ diplo++;
+ counts[geno.one]++;
+ if (geno.isHomozygous())
+ homo++;
+ else
+ counts[geno.two]++;
+ if (counts[geno.one] > 2) too_many_genos = true;
+ if (counts[geno.two] > 2) too_many_genos = true;
+ }
+ }
+ }
+
+ // Parental genotypes add to the allele and homozygote counts
+ if (fgeno)
+ {
+ if (haplos[fat.one] != i) { haplo++; haplos[fat.one] = i; }
+ if (haplos[fat.two] != i) { haplo++; haplos[fat.two] = i; }
+ homo += fat.isHomozygous();
+ }
+
+ if (mgeno)
+ {
+ if (haplos[mot.one] != i) { haplo++; haplos[mot.one] = i; }
+ if (haplos[mot.two] != i) { haplo++; haplos[mot.two] = i; }
+ homo += mot.isHomozygous();
+ }
+
+ // Sibship level test on the totals gathered above
+ if (diplo > 4 || haplo + homo > 4 || haplo == 4 && too_many_genos )
+ {
+ printf("%s - Fam %s: ",
+ (const char *) markerNames[m],
+ (const char *) persons[i]->famid);
+ if (persons[i]->father->markers[m].isKnown())
+ printf("Father %s [%s/%s] has children [",
+ (const char *) persons[i]->father->pid,
+ (const char *) info->GetAlleleLabel(fat.one),
+ (const char *) info->GetAlleleLabel(fat.two));
+ else if (persons[i]->mother->markers[m].isKnown())
+ printf("Mother %s [%s/%s] has children [",
+ (const char *) persons[i]->mother->pid,
+ (const char *) info->GetAlleleLabel(mot.one),
+ (const char *) info->GetAlleleLabel(mot.two));
+ else
+ printf("Couple %s * %s has children [",
+ (const char *) persons[i]->mother->pid,
+ (const char *) persons[i]->father->pid);
+
+ for (int j = 0; j < persons[i]->sibCount; j++)
+ printf("%s%s/%s", j == 0 ? "" : " ",
+ (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].one),
+ (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].two));
+ printf("]\n");
+
+ fail = true;
+ failedFamilies[f] = true;
+ }
+ }
+
+ // Genotype elimination for the remaining, more complex families
+ for (int f = 0; f < familyCount; f++)
+ if (!failedFamilies[f] &&
+ (families[f]->count > families[f]->founders + 1) &&
+ !families[f]->isNuclear())
+ fail |= !GenotypeList::EliminateGenotypes(*this, families[f], m);
+ }
+
+ if (fail)
+ printf("\nMendelian inheritance errors detected\n");
+
+ return fail;
+ }
+
+// Checks every marker for Mendelian inconsistencies, treating it as
+// X-linked, and prints a message for each problem found.
+//
+// Males with two different alleles are reported and their genotype is
+// set to 0/0. Each full sibship is then checked against its parents, with
+// sons compared to the mother only. Families that pass and are neither
+// nuclear nor limited to one non-founder are passed to
+// GenotypeList::EliminateGenotypes.
+//
+// Returns true when an inconsistency was found (true means failure).
+bool Pedigree::SexLinkedCheck()
+ {
+ bool fail = false;
+
+ // Keep track of what families fail the basic inheritance check,
+ // so that we can later run the genotype elimination check on the remainder
+ IntArray failedFamilies(familyCount);
+
+ // For each marker ...
+ for (int m = 0; m < markerCount; m++)
+ {
+ MarkerInfo * info = GetMarkerInfo(m);
+
+ failedFamilies.Zero();
+
+ // Check for homozygous males
+ // (a male with a known, non-homozygous genotype is an error)
+ for (int f = 0; f < familyCount; f++)
+ for (int i = families[f]->first; i <= families[f]->last; i++)
+ if (persons[i]->sex == SEX_MALE && persons[i]->markers[m].isKnown() &&
+ !persons[i]->markers[m].isHomozygous())
+ {
+ printf("%s - Fam %s: Male %s has two X alleles [%s/%s]\n",
+ (const char *) markerNames[m],
+ (const char *) persons[i]->famid, (const char *) persons[i]->pid,
+ (const char *) info->GetAlleleLabel(persons[i]->markers[m].one),
+ (const char *) info->GetAlleleLabel(persons[i]->markers[m].two));
+
+ // Wipe this genotype so we don't get cascading errors below
+ persons[i]->markers[m][0] = persons[i]->markers[m][1] = 0;
+
+ fail = true;
+ failedFamilies[f] = true;
+ }
+
+ // Check full sibships for errors
+ // TODO -- We could do better by grouping male half-sibs
+ for (int f = 0; f < familyCount; f++)
+ for (int i = families[f]->first; i <= families[f]->last; i++)
+ if (!persons[i]->isFounder() && persons[i]->sibs[0] == persons[i])
+ {
+ // This loop runs once per sibship
+ Alleles fat = persons[i]->father->markers[m];
+ Alleles mot = persons[i]->mother->markers[m];
+
+ bool fgeno = fat.isKnown();
+ bool mgeno = mot.isKnown();
+
+ // Parental genotypes implied by the children; mother_ok is cleared
+ // when AddAllele() reports that an allele cannot be added
+ Alleles inferred_mother = mot;
+ Alleles first_sister;
+ Alleles inferred_father;
+
+ bool mother_ok = true;
+
+ int sisters = 0;
+
+ for (int j = 0; j < persons[i]->sibCount; j++)
+ if (persons[i]->sibs[j]->isGenotyped(m))
+ {
+ Alleles geno = persons[i]->sibs[j]->markers[m];
+
+ bool fat1 = fat.hasAllele(geno.one);
+ bool fat2 = fat.hasAllele(geno.two);
+ bool mot1 = mot.hasAllele(geno.one);
+ bool mot2 = mot.hasAllele(geno.two);
+
+ int sex = persons[i]->sibs[j]->sex;
+
+ if (sex == SEX_MALE)
+ {
+ // A son's allele is compared with the mother's genotype only
+ if (mgeno && !mot1)
+ {
+ printf("%s - Fam %s: Child %s [%s/Y] has mother [%s/%s]\n",
+ (const char *) markerNames[m],
+ (const char *) persons[i]->famid,
+ (const char *) persons[i]->sibs[j]->pid,
+ (const char *) info->GetAlleleLabel(geno.one),
+ (const char *) info->GetAlleleLabel(mot.one),
+ (const char *) info->GetAlleleLabel(mot.two));
+ fail = true;
+ failedFamilies[f] = true;
+ }
+ else
+ mother_ok &= inferred_mother.AddAllele(geno.one);
+ }
+ if (sex == SEX_FEMALE)
+ {
+ // Same parent/child compatibility test as for autosomal markers
+ if (fgeno && mgeno && !(fat1 && mot2 || fat2 && mot1) ||
+ fgeno && !(fat1 || fat2) || mgeno && !(mot1 || mot2))
+ {
+ printf("%s - Fam %s: Child %s [%s/%s] has ",
+ (const char *) markerNames[m],
+ (const char *) persons[i]->famid,
+ (const char *) persons[i]->sibs[j]->pid,
+ (const char *) info->GetAlleleLabel(geno.one),
+ (const char *) info->GetAlleleLabel(geno.two));
+
+ if (!fgeno)
+ printf("mother [%s/%s]\n",
+ (const char *) info->GetAlleleLabel(mot.one),
+ (const char *) info->GetAlleleLabel(mot.two));
+ else if (!mgeno)
+ printf("father [%s/Y]\n",
+ (const char *) info->GetAlleleLabel(fat.one));
+ else
+ printf("parents [%s/Y]*[%s/%s]\n",
+ (const char *) info->GetAlleleLabel(fat.one),
+ (const char *) info->GetAlleleLabel(mot.one),
+ (const char *) info->GetAlleleLabel(mot.two));
+
+ fail = true;
+ failedFamilies[f] = true;
+ }
+ else
+ {
+ // The first sister seeds the inferred paternal genotype; later
+ // sisters with a different genotype narrow it down
+ if (!sisters++)
+ inferred_father = first_sister = geno;
+ else if (first_sister != geno)
+ {
+ inferred_father.Intersect(geno);
+
+ mother_ok &= inferred_mother.AddAllele(
+ geno.otherAllele(inferred_father.one));
+ mother_ok &= inferred_mother.AddAllele(
+ first_sister.otherAllele(inferred_father.one));
+ }
+
+ // When exactly one allele matches a parent, the other allele is
+ // attributed to the ungenotyped parent
+ if (!fgeno && (mot1 ^ mot2))
+ inferred_father.Intersect(mot1 ? geno.two : geno.one);
+
+ if (!mgeno && (fat1 ^ fat2))
+ mother_ok &= inferred_mother.AddAllele(fat1 ? geno.two : geno.one);
+ }
+ }
+ }
+
+ // Sibship level test on the inferred parental genotypes
+ if (!mother_ok || sisters && !inferred_father.isKnown())
+ {
+ printf("%s - Fam %s: ",
+ (const char *) markerNames[m],
+ (const char *) persons[i]->famid);
+ if (fgeno)
+ printf("Father %s [%s/Y] has children [",
+ (const char *) persons[i]->father->pid,
+ (const char *) info->GetAlleleLabel(fat.one));
+ else if (mgeno)
+ printf("Mother %s [%s/%s] has children [",
+ (const char *) persons[i]->mother->pid,
+ (const char *) info->GetAlleleLabel(mot.one),
+ (const char *) info->GetAlleleLabel(mot.two));
+ else
+ printf("Couple %s * %s has children [",
+ (const char *) persons[i]->mother->pid,
+ (const char *) persons[i]->father->pid);
+
+ // The format used for sons ignores the last argument
+ for (int j = 0; j < persons[i]->sibCount; j++)
+ printf(
+ persons[i]->sibs[j]->sex == SEX_MALE ? "%s%s/Y" : "%s%s/%s",
+ j == 0 ? "" : " ",
+ (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].one),
+ (const char *) info->GetAlleleLabel(persons[i]->sibs[j]->markers[m].two));
+ printf("]\n");
+ fail = true;
+ failedFamilies[f] = true;
+ }
+ }
+
+ // Genotype elimination for the remaining, more complex families
+ for (int f = 0; f < familyCount; f++)
+ if (!failedFamilies[f] &&
+ (families[f]->count > families[f]->founders + 1) &&
+ !families[f]->isNuclear())
+ fail |= !GenotypeList::EliminateGenotypes(*this, families[f], m);
+ }
+
+ if (fail)
+ printf("\nMendelian inheritance errors detected\n");
+
+ return fail;
+ }
+
+// Adds every member of family 'id' to single_fam_ped and then sorts that
+// pedigree.
+void Pedigree::ExtractFamily(int id, Pedigree & single_fam_ped)
+   {
+   Family * family = families[id];
+
+   int member = family->first;
+   while (member <= family->last)
+      {
+      single_fam_ped.Add(*persons[member]);
+      member++;
+      }
+
+   single_fam_ped.Sort();
+   }
+
+// Builds new_ped from this pedigree. Persons whose affection 'a' equals
+// target_status are added in full; for all others a blank Person filled
+// through CopyIDs() is added instead. new_ped is sorted afterwards.
+void Pedigree::ExtractOnAffection(int a, Pedigree & new_ped, int target_status)
+   {
+   for (int i = 0; i < count; i++)
+      {
+      Person * source = persons[i];
+
+      if (source->affections[a] == target_status)
+         {
+         new_ped.Add(*source);
+         continue;
+         }
+
+      Person placeholder;
+      placeholder.CopyIDs(*source);
+      new_ped.Add(placeholder);
+      }
+
+   new_ped.Sort();
+   }
+
+// Calls WipePhenotypes() on every person whose filter entry equals 1 and
+// marks that person as filtered. error() is called when the filter length
+// differs from the number of persons.
+void Pedigree::Filter(IntArray & filter)
+   {
+   if (filter.Length() != count)
+      error("Pedigree:Size of pedigree filter doesn't match number of persons in pedigree");
+
+   for (int i = 0; i < count; i++)
+      {
+      if (filter[i] != 1)
+         continue;
+
+      Person * selected = persons[i];
+
+      selected->WipePhenotypes();
+      selected->filter = true;
+      }
+   }
+
+// Appends a new person with the given ids and sex, enlarging the table
+// first when it is full. The pedigree is re-sorted immediately unless
+// delay_sort is set.
+void Pedigree::AddPerson(const char * famid, const char * pid,
+                         const char * fatid, const char * motid,
+                         int sex, bool delay_sort)
+   {
+   if (count == size)
+      Grow();
+
+   Person * added = new Person;
+   persons[count] = added;
+
+   added->famid = famid;
+   added->pid = pid;
+   added->fatid = fatid;
+   added->motid = motid;
+   added->sex = sex;
+
+   count++;
+
+   if (delay_sort)
+      return;
+
+   Sort();
+   }
+
+// Prints an estimate of the memory used by the pedigree: the id string
+// buffers of every person plus, for each person, the marker, trait,
+// covariate and affection storage and the Person object itself.
+void Pedigree::ShowMemoryInfo()
+   {
+   unsigned int bytes = 0;
+
+   for (int i = 0; i < count; i++)
+      {
+      Person * person = persons[i];
+
+      bytes += person->famid.BufferSize() + person->pid.BufferSize() +
+               person->fatid.BufferSize() + person->motid.BufferSize();
+      }
+
+   bytes += count * (markerCount * sizeof(Alleles) + traitCount * sizeof(double) +
+            covariateCount * sizeof(double) + affectionCount * sizeof(char) +
+            sizeof(Person));
+
+   printf(" %40s %s\n", "Pedigree file ...", (const char *) MemoryInfo(bytes));
+   }
+
+
+
diff --git a/libsrc/Pedigree.h b/libsrc/Pedigree.h
new file mode 100644
index 0000000..8098dd7
--- /dev/null
+++ b/libsrc/Pedigree.h
@@ -0,0 +1,155 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Pedigree.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef _PEDIGREE_H_
+#define _PEDIGREE_H_
+
+#include "Constant.h"
+
+#include <stdio.h>
+
+#include "PedigreeAlleles.h"
+#include "PedigreePerson.h"
+#include "PedigreeGlobals.h"
+#include "PedigreeFamily.h"
+#include "PedigreeDescription.h"
+#include "PedigreeAlleleFreq.h"
+
+class Pedigree : public PedigreeGlobals
+ {
+ public:
+ static bool sexAsCovariate;
+ static String missing;
+
+ int size; // capacity of persons[]; Grow() is called when count reaches it
+ int count; // number of entries of persons[] in use
+ Person ** persons;
+ int familyCount;
+ Family ** families; // each Family gives first/last indices into persons[]
+ int haveTwins;
+
+ PedigreeDescription pd;
+ PedigreeDescription *multiPd;
+ int multiFileCount;
+
+ Pedigree();
+ ~Pedigree();
+
+ void Prepare(IFILE & input); // Read pedigree parameters from data file
+ void Load(IFILE & input); // Read pedigree from pedigree file
+ void LoadMendel(IFILE & input); // Read pedigree in Mendel format
+ void Prepare(const char * input); // Read pedigree parameters from named file
+
+ // Read pedigree parameters from named file, stop program on failure
+ // depending on setting of allow failures
+ void Load(const char * input, bool allowFailures = false);
+
+ // I/O related utility functions
+ int TranslateSexCode(const char * code, bool & failure);
+
+ void PrepareDichotomization(); // Register dummy affections for each trait
+ int Dichotomize(int trait, double mean = _NAN_);
+ void DichotomizeAll(double mean = _NAN_);
+
+ void WriteDataFile(FILE * output); // Write data file
+ void WritePedigreeFile(FILE * output); // Write pedigree file
+ void WriteDataFile(const char * output); // Write named data file
+ void WritePedigreeFile(const char * output); // Write named pedigree file
+ void WritePerson(FILE * output, int who, // Write a single person
+ const char * famid = NULL, // if supplied, famid, pid,
+ const char * pid = NULL, // fatid and motid allow a
+ const char * fatid = NULL, // pedigree or person to
+ const char * motid = NULL); // be renamed / restructured
+ void WriteRecodedPerson( // Like write person, but uses
+ FILE * output, int who, // user supplied markerInfo
+ MarkerInfo ** markerInfo, // array to recode marker
+ const char * famid = NULL, // alleles as they are written
+ const char * pid = NULL,
+ const char * fatid = NULL,
+ const char * motid = NULL);
+
+ void Sort(); // Sorts the pedigree items
+ Family * FindFamily(const char * famid); // Find a family
+ Person * FindPerson(const char * famid, // Find an individual
+ const char * pid);
+
+ // functions dealing with genetic markers
+ // Counts the alleles at a marker
+ int CountAlleles(int marker);
+
+ // Lumps together rare alleles and, depending on reorder flag,
+ // sorts alleles so the most common allele has the lowest index
+ void LumpAlleles(double treshold, bool reorder = true);
+
+ // Calculate allele frequencies
+ void EstimateFrequencies(int estimator, bool quiet = false);
+
+ // shorthand operators
+ Person & operator [] (int i)
+ {
+ return *(persons[i]); // no bounds check: i must be a valid index into persons[]
+ }
+
+ // Perform a basic inheritance check
+ bool InheritanceCheck(bool abortIfInconsistent = true);
+ bool AutosomalCheck();
+ bool SexLinkedCheck();
+ bool TwinCheck();
+
+ // Merge twins into a single individual
+ void MergeTwins();
+
+ // Remove individuals with no data from pedigree
+ void Trim(bool quiet = false, int * informative = NULL);
+
+ // Add a single individual to a pedigree
+ void AddPerson(const char * famid, const char * pid,
+ const char * fatid, const char * motid,
+ int sex, bool delay_sort = false);
+
+ // Add all individuals in family number id (an index into families[]) to new_ped
+ void ExtractFamily(int id, Pedigree & new_ped);
+ // Add individuals with status target_status for affection a to new_ped; everyone else is added as a blank Person with copied IDs
+ void ExtractOnAffection(int a, Pedigree & new_ped, int target_status = 2);
+
+ // Wipe phenotypes of, and flag as filtered, every person for which filter[i] = 1
+ void Filter(IntArray & filter);
+
+ // Reports memory usage for storing the pedigree
+ void ShowMemoryInfo();
+
+ private:
+ void Grow();
+ void Add(Person & rhs);
+
+ static int ComparePersons(const Person ** p1, const Person ** p2);
+ static int CompareParents(const Person ** p1, const Person ** p2);
+
+ void MakeSibships();
+ void MakeFamilies();
+
+ Person * FindPerson(const char * famid, const char * pid, int universe);
+
+ void ShowTrimHeader(bool & flag);
+ };
+
+#endif
+
+
+
+
+
diff --git a/libsrc/PedigreeAlleleFreq.cpp b/libsrc/PedigreeAlleleFreq.cpp
new file mode 100644
index 0000000..4b0c193
--- /dev/null
+++ b/libsrc/PedigreeAlleleFreq.cpp
@@ -0,0 +1,260 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeAlleleFreq.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "PedigreeAlleleFreq.h"
+#include "QuickIndex.h"
+#include "Error.h"
+
+#include <math.h>
+
+int CountAlleles(Pedigree & /* ped */, int marker)
+ {
+ // Alleles are recoded automatically while the pedigree is loaded,
+ // so the per-marker record can answer directly and the pedigree
+ // itself never has to be scanned; that is why the first argument
+ // is left unnamed and unused.
+ return Pedigree::GetMarkerInfo(marker)->CountAlleles();
+ }
+
+void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder)
+ { // Recodes alleles at one marker: common alleles get indices 1..N-1, rare ones share index N ("OTHER")
+ // find out how many alleles there are
+ int alleles = ped.CountAlleles(marker);
+
+ if (alleles < 2) return;
+
+ MarkerInfo * info = PedigreeGlobals::GetMarkerInfo(marker);
+
+ if (alleles < info->freq.Length())
+ alleles = info->freq.Length() - 1;
+
+ IntArray counts(alleles + 1);
+ counts.Zero();
+
+ // Count number of occurrences for each allele
+ for (int i = 0; i < ped.count; i++)
+ {
+ counts[int(ped[i].markers[marker][0])]++;
+ counts[int(ped[i].markers[marker][1])]++;
+ }
+
+ // Calculate threshold for lumping alleles
+ int total = 0;
+ for (int i = 1; i <= alleles; i++)
+ total += counts[i];
+ int thresh = int(total * threshold);
+
+ // If threshold is set at zero, we artificially increase
+ // counts for alleles that do not appear in the pedigree
+ // but whose frequencies are set > 0.0. This ensures that
+ // allele frequency data does not get discarded when simply
+ // recoding alleles (vs. lumping)
+ if (thresh == 0)
+ for (int i = 1; i < info->freq.Length(); i++)
+ if (counts[i] == 0 && info->freq[i] > 0.0)
+ counts[i] = 1, total++;
+
+ // If allele reordering is disabled, put in dummy allele
+ // counts so as to ensure that allele have desired ordering
+ if (!reorder)
+ {
+ QuickIndex index(info->alleleLabels);
+ index.Reverse();
+
+ for (int i = 0; i < index.Length(); i++)
+ counts[index[i]] = i + 1;
+
+ total = counts.Sum(1, counts.Length() - 1);
+ }
+
+ // Order all alleles according to their frequency
+ // Zero, which corresponds to missing values, stays put!
+ counts[0] = total + 1;
+ QuickIndex index(counts);
+ index.Reverse();
+
+ // recode alleles
+ // all alleles whose count is <= thresh are labelled N
+ // use counts array to keep track of labels
+ int N = 0;
+ bool rare = false;
+ for (int i = 0; i <= alleles; i++)
+ if (counts[index[i]] > thresh)
+ {
+ counts[index[i]] = i; // kept allele: new code is its rank (the missing code 0 keeps rank 0)
+ N++;
+ }
+ else
+ {
+ if (counts[index[i]] > 0)
+ rare = true; // at least one lumped allele was actually observed
+ counts[index[i]] = N;
+ }
+
+ // This loop does the recoding
+ for (int i = 0; i < ped.count; i++)
+ {
+ Alleles & current = ped[i].markers[marker];
+ current[0] = counts[current[0]];
+ current[1] = counts[current[1]];
+ }
+
+ StringArray oldLabels(info->alleleLabels);
+ String label;
+
+ info->alleleLabels.Clear();
+ info->alleleNumbers.Clear();
+
+ for (int i = 0; i < N; i++)
+ {
+ if (oldLabels.Length() <= index[i])
+ info->alleleLabels.Push(label = index[i]);
+ else
+ info->alleleLabels.Push(oldLabels[index[i]]);
+
+ if (i) info->alleleNumbers.SetInteger(info->alleleLabels.Last(), i); // label -> its own index
+ }
+
+ // Reorder allele frequencies if necessary
+ if (info->freq.Length())
+ {
+ Vector freq(info->freq);
+
+ info->freq.Dimension(N);
+ info->freq[0] = 0.0;
+
+ for (int i = 1; i < N; i++)
+ {
+ info->freq[i] = freq[index[i]];
+ freq[index[i]] = 0;
+ }
+
+ if ((1.0 - info->freq.Sum()) > 1e-10)
+ rare = true; // kept frequencies do not sum to one: remainder goes to the lumped class
+
+ if (rare)
+ {
+ info->freq.Dimension(N + 1);
+ info->freq[N] = 1.0 - info->freq.Sum();
+ }
+ }
+
+ if (rare)
+ {
+ info->alleleLabels.Push("OTHER");
+ info->alleleNumbers.SetInteger("OTHER", info->alleleLabels.Length() - 1); // index N of the label just pushed, matching the code given to lumped alleles
+ }
+ }
+
+bool EstimateFrequencies(Pedigree & ped, int marker, int estimator)
+ { // Returns false when the marker already has frequencies (they are only validated), true when new ones are stored
+ int alleleCount = CountAlleles(ped, marker);
+
+ IntArray founder(alleleCount + 1);
+ IntArray all(alleleCount + 1);
+
+ founder.Zero();
+ all.Zero();
+
+ for (int i = 0; i < ped.count; i++)
+ {
+ // When counting alleles, note that males only carry one X chromosome
+ // and are arbitrarily scored as homozygous.
+ all[ped[i].markers[marker][0]]++;
+ if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
+ all[ped[i].markers[marker][1]]++;
+ if (!ped[i].isFounder()) continue;
+ founder[ped[i].markers[marker][0]]++;
+ if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
+ founder[ped[i].markers[marker][1]]++;
+ }
+
+ MarkerInfo * info = ped.GetMarkerInfo(marker);
+
+ if (info->freq.dim > 0)
+ {
+ // previous allele frequency information is available
+ if (alleleCount >= info->freq.dim)
+ error("For marker %s, input files define %d alleles, but at least\n"
+ "one other allele (named '%s') occurs in the pedigree\n",
+ (const char *) info->name, info->freq.dim - 1,
+ (const char *) info->GetAlleleLabel(alleleCount));
+
+ for (int i = 1; i <= alleleCount; i++)
+ if (all[i] > 0 && info->freq[i] <= 0.0)
+ error("Although allele %s for marker %s has frequency zero,\n"
+ "it occurs %d times in the pedigree",
+ (const char *) info->GetAlleleLabel(i), (const char *) info->name, all[i]);
+
+ return false;
+ }
+ else
+ {
+ if (alleleCount < 1)
+ {
+ // If the marker has no alleles at all, default to two alleles with
+ // frequencies 0.99999 and 0.00001, since some programs do not like monomorphic markers
+ info->freq.Dimension(3);
+ info->freq[0] = 0.0;
+ info->freq[1] = 0.99999;
+ info->freq[2] = 0.00001;
+ return true;
+ }
+
+ info->freq.Dimension(alleleCount + 1);
+ info->freq.Zero();
+
+ if (estimator == FREQ_FOUNDERS && founder.Sum() > founder[0]) // founders carry at least one known allele
+ {
+ // Make sure the frequency of alleles occuring in the pedigree
+ // is never zero
+ for (int i = 1; i <= alleleCount; i++)
+ if (founder[i] == 0 && all[i] > 0)
+ founder[i] = 1;
+
+ // To get frequencies, just multiply counts by 1 / total_counts
+ double factor = 1.0 / (founder.Sum() - founder[0]);
+
+ for (int i = 1; i <= alleleCount; i++)
+ info->freq[i] = founder[i] * factor;
+ }
+ else if (estimator == FREQ_ALL || estimator == FREQ_FOUNDERS) // also the fallback when no founder allele is known
+ {
+ // To get frequencies, just multiply counts by 1 / total_counts
+ double factor = 1.0 / (all.Sum() - all[0]); // NOTE(review): denominator is zero if nobody has a known allele here - confirm callers exclude that
+
+ for (int i = 1; i <= alleleCount; i++)
+ info->freq[i] = all[i] * factor;
+ }
+ else if (estimator == FREQ_EQUAL)
+ // Assume all alleles have equal frequency
+ {
+ // Count the number of observed alleles
+ all[0] = 0;
+ int alleles = all.CountIfGreater(0);
+ double freq = 1.0 / alleles;
+
+ // Set equal frequencies for all occuring alleles
+ for (int i = 0; i <= alleleCount; i++)
+ info->freq[i] = all[i] ? freq : 0.0;
+ }
+ }
+
+ return true;
+ }
+
+
diff --git a/libsrc/PedigreeAlleleFreq.h b/libsrc/PedigreeAlleleFreq.h
new file mode 100644
index 0000000..7c0083d
--- /dev/null
+++ b/libsrc/PedigreeAlleleFreq.h
@@ -0,0 +1,36 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeAlleleFreq.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __ALLELEFREQUENCIES_H__
+#define __ALLELEFREQUENCIES_H__
+
+#include "Pedigree.h"
+
+int CountAlleles(Pedigree & ped, int marker);
+void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder);
+
+#define FREQ_ALL 0 // estimator: allele counts from every individual
+#define FREQ_FOUNDERS 1 // estimator: founder counts; all individuals are used if no founder allele is known
+#define FREQ_EQUAL 2 // estimator: the same frequency for every allele that occurs
+
+// Returns true if frequencies estimated, false if previous information okay
+bool EstimateFrequencies(Pedigree & ped, int marker, int estimator);
+
+#endif
+
+
+
diff --git a/libsrc/PedigreeAlleles.h b/libsrc/PedigreeAlleles.h
new file mode 100644
index 0000000..6ffed9b
--- /dev/null
+++ b/libsrc/PedigreeAlleles.h
@@ -0,0 +1,143 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeAlleles.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __PEDALLELES_H__
+#define __PEDALLELES_H__
+
+#include "LongInt.h"
+
+class Alleles
+ {
+ public:
+ char one; // the two allele codes of an unordered genotype;
+ char two; // zero marks an unknown allele (see isKnown / Wipe)
+
+ Alleles()
+ { one = two = 0; }
+
+ char & operator [] (int i)
+ { return (i == 1) ? one : two; } // index 1 -> one; any other index, including 0, -> two
+
+ // is the genotype fully defined?
+ bool isKnown()
+ { return (one * two) != 0; }
+ bool isHeterozygous()
+ { return isKnown() && (one != two); }
+ bool isHomozygous()
+ { return isKnown() && (one == two); }
+ bool hasAllele(int a)
+ { return (one == a) || (two == a); }
+
+ // in a bi-allelic system (a, NOT a)
+ bool isHeterozygousFor(int a)
+ { return isHeterozygous() && hasAllele(a); }
+ bool isHomozygousFor(int a)
+ { return !(isHeterozygousFor(a)); } // plain negation, so this is also true for unknown genotypes
+
+ // how many alleles a in this genotype?
+ int countAlleles(int a)
+ { return ((one == a) ? 1 : 0) + ((two == a) ? 1 : 0); }
+
+ // what is the other allele, assuming genotype is (a, X)
+ int otherAllele(int a)
+ { return ((one == a) ? two : one); }
+
+ // are two unordered genotypes identical?
+ int identicalTo(Alleles & al)
+ { return ((al.one == one) && (al.two == two)) ||
+ ((al.two == one) && (al.one == two)); }
+
+ // how many alleles are identical by state
+ int countIBS(Alleles & al)
+ { return (one == al.one) ?
+ ((two == al.two) ? 2 : 1) :
+ ( (one == al.two) ?
+ ((two == al.one) ? 2 : 1) :
+ (((two == al.one) || (two == al.two)) ? 1 : 0));
+ }
+
+ int operator == (Alleles & rhs)
+ { return identicalTo(rhs); }
+ int operator != (Alleles & rhs)
+ { return !identicalTo(rhs); }
+
+ char Hi()
+ { return one > two ? one : two; }
+ char Lo()
+ { return one > two ? two : one; }
+
+ int SequenceCoded() // Hi*(Hi-1)/2 + Lo for known genotypes, 0 otherwise
+ { return isKnown() ? Hi() * (Hi() - 1) / 2 + Lo() : 0; }
+
+ longint BinaryCoded() // bit mask with bit (allele - 1) set for each allele; NOTZERO when unknown
+ {
+ if (isKnown())
+ {
+ longint allele1(1);
+ longint allele2(1);
+
+ allele1 <<= one - 1;
+ allele2 <<= two - 1;
+
+ return allele1 | allele2;
+ }
+ else
+ return NOTZERO;
+ }
+
+ void Intersect(Alleles & geno) // keep only what this genotype shares with geno
+ {
+ char a1 = Lo(), a2 = Hi();
+ char b1 = geno.Lo(), b2 = geno.Hi();
+
+ if (a1 == b1 && a2 == b2)
+ return; // same unordered genotype: nothing to change
+ if (a1 == b1 || a1 == b2)
+ one = two = a1; // lower allele is shared: both slots become that allele
+ else if (a2 == b1 || a2 == b2)
+ one = two = a2;
+ else
+ one = two = 0; // nothing shared: genotype becomes unknown
+ }
+
+ void Intersect(char allele) // both slots become allele if it is present, otherwise the genotype is cleared
+ {
+ if (one != allele && two != allele)
+ one = two = 0;
+ else
+ one = two = allele;
+ }
+
+ bool AddAllele(char allele) // returns false only when both slots already hold other alleles
+ {
+ if (one == allele || two == allele)
+ return true;
+
+ if (one != 0 && two != 0)
+ return false;
+
+ if (one == 0) one = allele; else two = allele; // fill the first empty slot
+ return true;
+ }
+
+ void Wipe()
+ { one = two = 0; }
+ };
+
+#endif
+
+
diff --git a/libsrc/PedigreeDescription.cpp b/libsrc/PedigreeDescription.cpp
new file mode 100644
index 0000000..e806d76
--- /dev/null
+++ b/libsrc/PedigreeDescription.cpp
@@ -0,0 +1,826 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeDescription.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "PedigreeDescription.h"
+#include "MapFunction.h"
+#include "MathVector.h"
+#include "Constant.h"
+#include "FortranFormat.h"
+#include "Error.h"
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <math.h>
+
+PedigreeDescription::PedigreeDescription()
+ { // Starts as an empty description
+ columnCount = 0; // no columns described yet
+ mendelFormat = false; // Load() also resets this before sniffing the file format
+ }
+
+PedigreeDescription::~PedigreeDescription()
+ { }; // empty body: no explicit clean-up is performed here
+
+PedigreeDescription & PedigreeDescription::operator = (PedigreeDescription & rhs)
+ { // Copies the column layout (count, types and ids) from rhs
+ columnCount = rhs.columnCount;
+ // NOTE(review): mendelFormat and filename are not copied - confirm this is intended
+ columns = rhs.columns;
+ columnHash = rhs.columnHash;
+
+ return *this;
+ };
+
+void PedigreeDescription::Load(IFILE & input, bool warnIfLinkage)
+ { // Sniffs the file format from the first line, then delegates or parses the keyword-per-line format
+ // Check if we are dealing with a linkage format data file
+ String buffer;
+ StringArray tokens;
+
+ mendelFormat = false;
+
+ ReadLineHelper(input, buffer, tokens);
+ ifrewind(input); // the first line was only peeked at; the chosen loader restarts from the top
+
+ if (tokens.Length() == 4 && isdigit(tokens[0][0])) // four tokens, first starting with a digit
+ {
+ if (warnIfLinkage) printf("Data file looks like a LINKAGE format file...\n\n");
+ LoadLinkageDataFile(input);
+ return;
+ }
+
+ if (buffer.Length() > 18
+ && (buffer.SubStr(8,8).SlowCompare("AUTOSOME") == 0 ||
+ buffer.SubStr(8,8).SlowCompare("X-LINKED") == 0)
+ && (isdigit(buffer[16]) || isdigit(buffer[17]))
+ && (isdigit(buffer[18]) || isdigit(buffer[19]) ||
+ buffer.Length() > 19 && isdigit(buffer[20]))) // NOTE(review): indices 19 and 20 are read with only Length() > 18 / > 19 guaranteed
+ {
+ printf("Data file looks like a MENDEL format file...\n"
+ " Activating EXPERIMENTAL support for this format\n\n");
+ LoadMendelDataFile(input);
+ return;
+ }
+
+ // Reset things
+ ifrewind(input);
+ int done = 0;
+ int line = 0;
+
+ columns.Clear();
+ columnHash.Clear();
+ columnCount = 0;
+
+ while (!ifeof(input) && !done)
+ {
+ int i;
+
+ buffer.ReadLine(input);
+ line++;
+
+ tokens.Clear();
+ tokens.AddTokens(buffer, WHITESPACE);
+
+ if (tokens.Length() < 1) continue; // blank line
+
+ if (tokens.Length() == 1)
+ error("Problem reading data file:\n"
+ "Item #%d (of type %s) has no name.",
+ columnCount+1, (const char *) tokens[0]);
+
+ switch (toupper(tokens[0][0])) // item type is decided by the first letter of the keyword
+ {
+ case 'A' :
+ columnHash.Push(GetAffectionID(tokens[1]));
+ columns.Push(pcAffection);
+ columnCount++;
+ break;
+ case 'M' :
+ columnHash.Push(GetMarkerID(tokens[1]));
+ columns.Push(pcMarker);
+ columnCount++;
+ break;
+ case 'T' :
+ columnHash.Push(GetTraitID(tokens[1]));
+ columns.Push(pcTrait);
+ columnCount++;
+ break;
+ case 'C' :
+ columnHash.Push(GetCovariateID(tokens[1]));
+ columns.Push(pcCovariate);
+ columnCount++;
+ break;
+ case 'S' :
+ i = (int) tokens[0].SubStr(1); // optional repeat count following the S
+ i = i > 0 ? i : 1; // no (or non-positive) count means skip one column
+ while (i--)
+ {
+ columns.Push(pcSkip);
+ columnHash.Push(0);
+ columnCount++;
+ }
+ break;
+ case 'Z' :
+ columnHash.Push(0);
+ columns.Push(pcZygosity);
+ columnCount++;
+ break;
+ case 'V' :
+ GetMarkerID(tokens[1]); // looks the marker up but adds no column
+ break;
+ case 'E' :
+ done = 1; // stop reading items
+ break;
+ case 'U' :
+ if (toupper(tokens[0][1]) == 'T' && toupper(tokens[0][2]) == 'C')
+ {
+ int c = GetCovariateID(tokens[1]);
+ int t = GetTraitID(tokens[1]);
+
+ if (c >= 32767 || t >= 32767)
+ error("Internal error processing data file\n");
+
+ columnHash.Push(t * 32768 + c); // trait and covariate ids packed into one integer
+ columns.Push(pcUndocumentedTraitCovariate);
+ columnCount++;
+ break;
+ } // any other keyword starting with U falls through to the error below
+ default :
+ error ("Problem in data file (line %d):\n%s\n",
+ line, (const char *) buffer);
+ }
+ }
+
+ columns.Push(pcEnd); // terminator entry; not included in columnCount
+ columnHash.Push(0);
+ };
+
+void PedigreeDescription::Load(const char * iFilename, bool warnIfLinkage)
+ { // Opens the named data file, loads it through the IFILE overload and records the file name
+ IFILE f = ifopen(iFilename, "rb");
+
+ if (f == NULL) // NOTE(review): the code below assumes error() does not return - confirm
+ error(
+ "The datafile %s cannot be opened\n\n"
+ "Common causes for this problem are:\n"
+ " * You might not have used the correct options to specify input file names,\n"
+ " please check the program documentation for information on how to do this\n\n"
+ " * The file doesn't exist or the filename might have been misspelt\n\n"
+ " * The file exists but it is being used by another program which you will need\n"
+ " to close before continuing\n\n"
+ " * The file is larger than 2GB and you haven't compiled this application with\n"
+ " large file support.\n\n",
+ iFilename);
+
+ Load(f, warnIfLinkage);
+ ifclose(f);
+
+ filename = iFilename; // remembered only after loading has finished
+ };
+
+void PedigreeDescription::LoadMap(const char * iFilename)
+ { // Opens the named map file and loads it through the IFILE overload; filename is not updated here
+ IFILE f = ifopen(iFilename, "rb");
+
+ if (f == NULL) // NOTE(review): the code below assumes error() does not return - confirm
+ error(
+ "The mapfile %s cannot be opened\n\n"
+ "Please check that the file exists and is not being used by another program\n"
+ "To find out how to set input filenames, check the documentation\n",
+ iFilename);
+
+ LoadMap(f);
+ ifclose(f);
+ };
+
+void PedigreeDescription::LoadMap(IFILE & input)
+ { // Rebuilds the description from a three-column map file: one marker column per data line
+ columns.Clear();
+ columnHash.Clear();
+ columnCount = 0;
+
+ int lastposition = 0;
+ String buffer;
+ StringArray tokens;
+
+ buffer.ReadLine(input);
+ tokens.AddTokens(buffer, WHITESPACE);
+
+ while (tokens.Length() == 0 && !ifeof(input)) // skip blank lines before the header
+ {
+ buffer.ReadLine(input);
+ tokens.AddTokens(buffer, WHITESPACE);
+ }
+
+ if (tokens.Length() != 3)
+ error("Error reading map file header, which has %d columns.\n"
+ "Three columns were expected, corresponding to\n"
+ "MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION\n"
+ "The offending header is transcribed below:\n\n"
+ "%s", tokens.Length(), (const char *) buffer);
+ else
+ printf("Map file column labels\n"
+ " -- COLUMN 1, Expecting MARKER_ID, Read %s\n"
+ " -- COLUMN 2, Expecting MARKER_NAME, Read %s\n"
+ " -- COLUMN 3, Expection BASE_PAIR_POSITION, Read %s\n\n",
+ (const char *) (tokens[0]), (const char *) (tokens[1]),
+ (const char *) (tokens[2]));
+
+ int line = 1; // the header counts as line 1
+ while (!ifeof(input))
+ {
+ int serial;
+ long position;
+
+ buffer.ReadLine(input);
+ line++;
+
+ tokens.Clear();
+ tokens.AddTokens(buffer, WHITESPACE);
+
+ if (tokens.Length() < 1) continue; // blank line
+ if (tokens.Length() != 3)
+ error("Each line in the map file should have 3 tokens, corresponding\n"
+ "to MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION respectively\n"
+ "However, there are %d tokens in line %d, transcribed below:\n\n"
+ "%s", tokens.Length(), line, (const char *) buffer);
+
+ serial = (int) tokens[0];
+ if (serial != columnCount + 1) // ids must run 1, 2, 3, ... in file order
+ error("Reading Marker Index from Map File...\n"
+ "Markers should be indexed consecutively starting at 1\n"
+ "Marker %d does not fit this pattern\n", columnCount + 1);
+
+ position = (int) tokens[2]; // converted through int although position is declared long
+ if (position < lastposition) // equal positions are accepted
+ error("Reading Marker Position from Map File...\n"
+ "Marker position should be in base-pairs\n"
+ "and markers should be in map order\n");
+
+ // TODO -- store marker locations somewhere!
+ lastposition = position;
+
+ columnHash.Push(GetMarkerID(tokens[1]));
+ columns.Push(pcMarker);
+ columnCount++;
+
+ GetMarkerInfo(tokens[1])->position = position * 1e-8; // base-pair value scaled by 1e-8 before storing
+ }
+
+ columns.Push(pcEnd); // terminator entry; not included in columnCount
+ columnHash.Push(0);
+ };
+
+int PedigreeDescription::CountTextColumns()
+ {
+ // A marker is written as two text columns (one per allele);
+ // every other described column is written as exactly one.
+ int textColumns = 0;
+ for (int c = 0; c < columnCount; c++)
+ textColumns += (columns[c] == pcMarker) ? 2 : 1;
+
+ return textColumns;
+ }
+
+void PedigreeDescription::LoadLinkageDataFile(const char * iFilename)
+ { // Opens the named LINKAGE data file, loads it through the IFILE overload and records the file name
+ IFILE f = ifopen(iFilename, "rb");
+
+ if (f == NULL) // NOTE(review): the code below assumes error() does not return - confirm
+ error(
+ "The linkage format datafile %s cannot be opened\n\n"
+ "Please check that the file exists and is not being used by another program\n"
+ "To find out how to set input filenames, check the documentation\n",
+ iFilename);
+
+ LoadLinkageDataFile(f);
+ ifclose(f);
+
+ filename = iFilename; // remembered only after loading has finished
+ };
+
+void PedigreeDescription::LoadLinkageDataFile(IFILE & input)
+ { // Parses a LINKAGE data file: header lines, one record per locus, then recombination information
+ columns.Clear();
+ columnHash.Clear();
+ columnCount = 0;
+
+ String buffer, label;
+ StringArray tokens;
+
+ ReadLineHelper(input, buffer, tokens);
+
+ if (tokens.Length() != 4 || tokens[2].AsInteger() != (int) chromosomeX ||
+ tokens[0].AsInteger() < 0 )
+ error("Cannot handle first line of data file\n\n"
+ "Expecting four (4) numeric values, which correspond to:\n"
+ " num-loci -- number of loci in the pedigree\n"
+ " this value must be positive\n"
+ " risk-locus -- locus for which risks should be calculated\n"
+ " this value will be ignored\n"
+ " sex-link -- are the loci sex linked [0 - No, 1 - Yes]\n"
+ " %s\n"
+ " program -- which LINKAGE program do you want to use?\n"
+ " this value will also be ignored\n\n"
+ "The actual input read:\n%s\n",
+ chromosomeX ? "expecting X-linked data, so this value must be ONE (1)"
+ : "expecting autosomal data, so this must be ZERO (0)",
+ (const char *) buffer);
+
+ int numloci = tokens[0];
+
+ ReadLineHelper(input, buffer, tokens);
+
+ if (tokens.Length() != 4 ||
+ tokens[0].AsInteger() != 0 ||
+ tokens[3].AsInteger() != 0)
+ error("Cannot handle second line of data file\n\n"
+ "Expecting four (4) numeric values, which correspond to:\n"
+ " mutation-model -- must be zero, corresponding to no mutation\n"
+ " male-mutation-rate -- ignored\n"
+ " female-mutation-rate -- ignored\n"
+ " linkage-disequilibrium -- must be zero, may be used in the future to\n"
+ " read haplotype frequencies\n\n"
+ "The actual input read:\n%s\n", (const char *) buffer);
+
+ StringArray markerOrder;
+ int unknown = 0; // running number used to build default LOCUSn names
+
+ ReadLineHelper(input, buffer, markerOrder);
+
+ if (markerOrder.Length() > numloci)
+ error("The third line of the data file lists marker order\n\n"
+ "Although %d loci are defined [in the first line],\n"
+ "this line includes %d values:\n%s\n",
+ numloci, markerOrder.Length(), (const char *) buffer);
+
+ IntArray locus; // one entry per locus record, in file order: marker id, or -1 when there is no map position
+ bool need_blank_line = false;
+
+ while (!ifeof(input) && numloci--)
+ {
+ if (ReadLineHelper(input, buffer, tokens) == 0)
+ error("Linkage data file ends unexpectedly");
+
+ if (tokens.Length() < 2)
+ error("Incomplete locus information in data file\n"
+ "Information for each locus should include 2 or more fiels\n"
+ "The expected fields are:\n"
+ " field_type -- indicator of locus type (trait, marker,...)\n"
+ " alleles -- number of alleles\n"
+ " name -- locus name, preceded by hash (#) sign\n\n"
+ "The actual input read:\n%s\n", (const char *) buffer);
+
+ int locus_type = (int) tokens[0];
+ int alleles = (int) tokens[1];
+
+ String locus_name("LOCUS");
+ locus_name += ++unknown;
+
+ if (tokens.Length() > 2 && tokens[2][0] == '#')
+ if (tokens[2][1] != 0)
+ locus_name = tokens[2].SubStr(1); // name written directly after the hash
+ else if (tokens.Length() > 3) // this else pairs with the inner if: a lone hash, name in the next token
+ locus_name = tokens[3];
+
+ if (locus_type == 4 && alleles == 0 ||
+ locus_type == 4 && alleles == 1)
+ {
+ columnHash.Push(GetCovariateID(locus_name));
+ columns.Push(pcCovariate);
+ columnCount++;
+ locus.Push(-1); continue; // no map position, but locus[] must stay aligned with the 1-based locus numbers in the marker order line
+ }
+
+ if (locus_type == 0 && alleles == 0)
+ {
+ columnHash.Push(GetTraitID(locus_name));
+ columns.Push(pcTrait);
+ columnCount++;
+ locus.Push(-1); continue; // no map position, but locus[] must stay aligned with the 1-based locus numbers in the marker order line
+ }
+
+ if ( ReadLineHelper(input, buffer, tokens) != alleles)
+ error("Expecting %d allele frequencies, but input has %d columns:\n"
+ "%s\n", alleles, tokens.Length(), (const char *) buffer);
+
+ Vector frequencies(alleles + 1);
+
+ frequencies[0] = 0.0; // slot 0 is kept at zero; real alleles start at index 1
+ for (int i = 1; i <= alleles; i++)
+ frequencies[i] = (double) tokens[i - 1];
+
+ double sum = frequencies.Sum();
+
+ if (sum <= 0.0)
+ error("Allele frequencies at %s sum to %f, which doesn't make sense\n",
+ (const char *) locus_name, sum);
+
+ if ( fabs(sum - 1.0) > 1.2e-5 ) // only deviations beyond this tolerance are reported
+ {
+ printf("Allele frequencies at %s sum to %f, adjusted to 1.0\n",
+ (const char *) locus_name, sum);
+ need_blank_line = true;
+ }
+
+ if ( sum != 1.0) // but any deviation at all is rescaled
+ frequencies *= 1.0 / sum;
+
+ switch (locus_type)
+ {
+ case 1 : {
+ // Affection
+ columnHash.Push(GetAffectionID(locus_name));
+ columns.Push(pcAffection);
+ columnCount++;
+
+ // Read number of liability classes
+ if (ReadLineHelper(input, buffer, tokens) == 0)
+ error("Linkage data file ends unexpectedly\n");
+
+ // Skip liability class data
+ int classes = tokens[0];
+ if (classes > 1)
+ { columnHash.Push(0); columns.Push(pcSkip); columnCount++; }
+
+ // Separate liability class rows for males and females for X-linked data
+ if (chromosomeX) classes *= 2;
+
+ while (classes--)
+ if (ReadLineHelper(input, buffer, tokens) == 0)
+ error("Linkage data file ends unexpectedly\n");
+
+ // Ignore map location for affection loci
+ locus.Push(-1);
+ } break;
+ case 3 : {
+ columnHash.Push(GetMarkerID(locus_name));
+ columns.Push(pcMarker);
+ columnCount++;
+
+ // Store allele frequencies
+ MarkerInfo * info = GetMarkerInfo(locus_name);
+
+ info->freq = frequencies;
+
+ // Initialize allele labels
+ info->alleleLabels.Clear();
+ for (int i = 0; i < frequencies.Length(); i++)
+ info->alleleLabels.Push(label = i);
+ info->IndexAlleles();
+
+ // Store marker id, so that we can track map location
+ locus.Push(GetMarkerID(locus_name));
+ } break;
+ case 0 : {
+ // Read number of quantitative variables
+ if (ReadLineHelper(input, buffer, tokens) == 0)
+ error("Linkage data file ends unexpectedly\n");
+
+ // Add each quantitative variable to pedigree
+ // Discard information on means
+ for (int vars = tokens[0], i = 0; i < vars; i++)
+ {
+ if (ReadLineHelper(input, buffer, tokens) == 0)
+ error("Linkage data file ends unexpectedly\n");
+
+ String trait_name(locus_name);
+
+ if (i) { trait_name += "."; trait_name += i + 1; } // second and later variables are named name.2, name.3, ...
+
+ columnHash.Push(GetTraitID(trait_name));
+ columns.Push(pcTrait);
+ columnCount++;
+ }
+
+ // Skip var-covar matrix
+ if (ReadLineHelper(input, buffer, tokens) == 0)
+ error("Linkage data file ends unexpectedly\n");
+
+ // Skip heterozygote scaling factor for var-covar matrix
+ if (ReadLineHelper(input, buffer, tokens) == 0)
+ error("Linkage data file ends unexpectedly\n");
+
+ // Ignore map location for quantitative variables
+ locus.Push(-1);
+ } break;
+ case 2 :
+ error ("The data file includes binary factors\n"
+ "Regretably, loci of this type are not supported\n\n");
+ break;
+ default :
+ error ("Unsupported locus type [%d] in data file", locus_type);
+ break;
+ }
+ }
+
+ if (need_blank_line) printf("\n");
+
+ columns.Push(pcEnd); // terminator entry; not included in columnCount
+ columnHash.Push(0);
+
+ ReadLineHelper(input, buffer, tokens);
+ int sexDifference = tokens.Length() ? tokens[0].AsInteger() : -1;
+ if (tokens.Length() != 2 ||
+ (sexDifference != 0 && sexDifference != 2) ||
+ tokens[1].AsInteger() != 0)
+ error("Error retrieving recombination information\n\n"
+ "Expecting two (2) numeric values, which correspond to:\n"
+ " sex-difference -- must be zero (no difference) or two (sex specific recombination)\n"
+ " map-function -- must be zero, that is, no interference\n"
+ "The actual input read:\n%s\n", (const char *) buffer);
+
+ Vector distances[2]; // [0] is read first and feeds the female / sex-averaged position, [1] the male position
+ bool distance_in_centimorgans = false;
+
+ for (int r = 0; r <= sexDifference; r += 2) // one pass when sexDifference is 0, two passes when it is 2
+ {
+ ReadLineHelper(input, buffer, tokens);
+ if (tokens.Length() != markerOrder.Length() - 1)
+ error("Error retrieving recombination information\n\n"
+ "Expecting %d recombination fractions (current map includes %d loci)\n"
+ "Instead the following line was input:\n%s\n",
+ markerOrder.Length() - 1, markerOrder.Length(), (const char *) buffer);
+
+ distances[r >> 1].Dimension(tokens.Length());
+ for (int i = 0; i < tokens.Length(); i++)
+ distances[r >> 1][i] = (double) tokens[i];
+
+ if (distances[r >> 1].Min() < 0.0)
+ error("Linkage datafile specifies negative recombination fractions");
+
+ bool centimorgans = distances[r >> 1].Max() > 0.5;
+
+ if (centimorgans && !distance_in_centimorgans)
+ printf(" Some recombination fractions in datafile are greater than 0.5,\n"
+ " so recombination fractions will be interpreted as cM distances\n\n");
+
+ distance_in_centimorgans |= centimorgans;
+ }
+
+ double position = 0.0, positionMale = 0.0;
+
+ for (int i = 0, moving = false; i < markerOrder.Length(); i++) // moving: set once the first real marker has been placed
+ {
+ int m = markerOrder[i].AsInteger() - 1; // marker order line is 1-based
+
+ if (m < 0 || m >= locus.Length())
+ error("The marker order in the linkage datafile is invalid\n");
+
+ m = locus[m];
+
+ if (m != -1)
+ {
+ MarkerInfo * info = GetMarkerInfo(m);
+ info->chromosome = chromosomeX ? 9999 : 0;
+
+ if (sexDifference == 2)
+ info->position = (position + positionMale) * 0.5,
+ info->positionFemale = position,
+ info->positionMale = positionMale;
+ else
+ info->position = info->positionMale = info->positionFemale = position;
+
+ moving = true;
+ }
+
+ if (i < markerOrder.Length() - 1 && moving) // intervals before the first marker do not advance the map
+ position += distance_in_centimorgans ?
+ 0.01 * distances[0][i] : RecombinationToDistance(distances[0][i]);
+
+ if (sexDifference == 2 && i < markerOrder.Length() - 1 && moving)
+ positionMale += distance_in_centimorgans ?
+ 0.01 * distances[1][i] : RecombinationToDistance(distances[1][i]);
+ }
+ }
+
+// Reads lines from input until one of them yields at least one token or
+// the end of the input is reached.
+//
+// Each line is trimmed and then cut at the first ">>" marker or, when
+// there is none, at the first "<<" marker. A line whose first character
+// is '#' is cut at its very beginning. What is left is split into
+// tokens on WHITESPACE.
+//
+// On return, buffer holds the last (trimmed) line read and tokens holds
+// its tokens. The result is the number of tokens, which can only be
+// zero when the input ran out.
+//
+// NOTE(review): when a marker is found, the text kept stops one
+// character before the marker -- presumably a separating blank is
+// expected there; confirm against String::Left()
+int PedigreeDescription::ReadLineHelper(IFILE & input,
+                                        String & buffer,
+                                        StringArray & tokens)
+   {
+   for (;;)
+      {
+      // Fetch the next line
+      buffer.ReadLine(input);
+      buffer.Trim();
+
+      // Locate the start of a trailing comment, if any
+      int marker = buffer.FastFind(">>");
+      if (marker == -1)
+         marker = buffer.FastFind("<<");
+
+      // Number of leading characters handed to the tokenizer
+      int keep = (marker == -1) ? buffer.Length() : marker - 1;
+      if (buffer[0] == '#')
+         keep = -1;
+
+      // Split what is left into space/tab delimited tokens
+      tokens.Clear();
+      tokens.AddTokens(buffer.Left(keep), WHITESPACE);
+
+      if (tokens.Length() != 0 || ifeof(input))
+         break;
+      }
+
+   return tokens.Length();
+   }
+
+// Opens the named Mendel format data file, parses it with the stream
+// overload of LoadMendelDataFile() and closes it again. A file that
+// cannot be opened is reported through error().
+void PedigreeDescription::LoadMendelDataFile(const char * iFilename)
+   {
+   IFILE mendelFile = ifopen(iFilename, "rb");
+
+   if (mendelFile == NULL)
+      error("The MENDEL format datafile %s cannot be opened\n\n"
+            "Please check that the file exists and is not being used by another program\n"
+            "To find out how to set input filenames, check the documentation\n",
+            iFilename);
+
+   LoadMendelDataFile(mendelFile);
+
+   ifclose(mendelFile);
+   }
+
+// Parses a Mendel format locus description from an open input stream and
+// rebuilds the column layout (columns, columnHash and columnCount).
+//
+// Every locus starts with a "(2A8,I2,I3)" header holding the locus name,
+// the locus type ("AUTOSOME" or "X-LINKED"), the number of alleles and
+// the number of phenotypes, followed by one "(2A8)" record per allele.
+//
+//  * A locus with zero phenotypes is a codominant marker: it becomes one
+//    pcMarker column and its allele labels (and frequencies, when given)
+//    are stored in the corresponding MarkerInfo.
+//  * Any other locus is decomposed into one pcAffection column per
+//    phenotype, named "<locus>-><phenotype>". Its allele and genotype
+//    records are read and discarded.
+//
+// The column list is terminated with a pcEnd entry. Malformed input is
+// reported through error().
+void PedigreeDescription::LoadMendelDataFile(IFILE & file)
+   {
+   // Processes mendel format file
+   mendelFormat = true;
+
+   // Codominant markers are mapped to markers
+   // Non-codominant markers are mapped into multiple "affection status"
+   // (Y/N) variables
+   columns.Clear();
+   columnHash.Clear();
+   columnCount = 0;
+
+   FortranFormat parser;
+
+   // Variables for storing parsed input
+   String locusName;
+   String locusType;
+   String alleleLabel;
+   String alleleFreq;
+   String phenotype;
+   String genotype;
+   int phenoCount;
+   int alleleCount;
+
+   while (!ifeof(file))
+      {
+      // Cycle through headers for each locus
+      parser.SetInputFile(file);
+      parser.SetFormat("(2A8,I2,I3)");
+
+      // After retrieving locus name, check that we haven't tried to
+      // read past the end-of-file
+      parser.GetNextField(locusName);
+      parser.GetNextField(locusType);
+      alleleCount = parser.GetNextInteger();
+      phenoCount = parser.GetNextInteger();
+
+      if (locusName.IsEmpty() && locusType.IsEmpty() && alleleCount == 0 &&
+          phenoCount == 0 && ifeof(file))
+         break;
+
+      // Only recognize autosomal and x-linked loci
+      if (locusType.Compare("AUTOSOME") != 0 && locusType.Compare("X-LINKED"))
+         error("Unrecognized locus type '%s' in Mendel data file\n\n"
+               "Recognized locus types are \"AUTOSOME\" and \"X-LINKED\".",
+               (const char *) locusType);
+
+      if (locusType.Compare("AUTOSOME") == 0 && chromosomeX)
+         error("The data file indicates that locus %s is AUTOSOMAL, but\n"
+               "X-LINKED loci were expected as input\n",
+               (const char *) locusName);
+
+      if (locusType.Compare("X-LINKED") == 0 && !chromosomeX)
+         error("The data file indicates that locus %s is X-LINKED, but\n"
+               "AUTOSOMAL loci were expected as input\n",
+               (const char *) locusName);
+
+      if (locusName.IsEmpty())
+         error("Blank locus name encountered in data file\n");
+
+      if (phenoCount == 0)
+         {
+         // Co-dominant marker
+         columns.Push(pcMarker);
+         columnHash.Push(GetMarkerID(locusName));
+         columnCount++;
+
+         // Update marker info with allele labels and frequencies
+         MarkerInfo * info = GetMarkerInfo(locusName);
+
+         // Slot zero of the label list is a blank placeholder, so the
+         // first allele read ends up at index one
+         info->alleleLabels.Clear();
+         info->alleleLabels.Push("");
+         info->freq.Clear();
+
+         parser.SetFormat("(2A8)");
+
+         // Mendel allows allele names to be specified with frequencies
+         // left blank
+         for (int i = 0; i < alleleCount; i++)
+            {
+            parser.GetNextField(alleleLabel);
+            parser.GetNextField(alleleFreq);
+
+            if (alleleLabel.IsEmpty())
+               error("Locus %s is missing allele label for allele #%d\n",
+                     (const char *) locusName, i+1);
+
+            info->alleleLabels.Push(alleleLabel);
+
+            if (!alleleFreq.IsEmpty())
+               {
+               // Mirror the placeholder slot used for the labels
+               if (info->freq.Length() == 0)
+                  info->freq.Push(0.0);
+
+               info->freq.Push(alleleFreq.AsDouble());
+               }
+            }
+         info->IndexAlleles();
+
+         // Frequencies must be given for all alleles or for none
+         if (info->alleleLabels.Length() != info->freq.Length() &&
+             info->freq.Length() != 0)
+            error("Locus %s is missing allele frequency information for %d alleles\n",
+                  (const char *) locusName,
+                  info->alleleLabels.Length() - info->freq.Length());
+         }
+      else
+         {
+         // Non-codominant marker, which we decompose into multiple traits...
+         parser.SetFormat("(2A8)");
+
+         // First skip allele frequency information
+         for (int i = 0; i < alleleCount; i++)
+            {
+            parser.GetNextField(alleleLabel);
+            parser.GetNextField(alleleFreq);
+            }
+
+         // Then read in each phenotype. The number of phenotype records
+         // is the phenotype count from the locus header, not the allele
+         // count (the two generally differ)
+         for (int i = 0; i < phenoCount; i++)
+            {
+            parser.SetFormat("(A8,I3)");
+            parser.GetNextField(phenotype);
+            int genoCount = parser.GetNextInteger();
+
+            // The genotypes listed for this phenotype are not used
+            parser.SetFormat("(A17)");
+            for (int j = 0; j < genoCount; j++)
+               parser.GetNextField(genotype);
+
+            columns.Push(pcAffection);
+            columnHash.Push(GetAffectionID(locusName + "->" + phenotype));
+            columnCount++;
+            }
+         }
+      }
+
+   // Terminate the column list
+   columns.Push(pcEnd);
+   columnHash.Push(0);
+   }
+
+// Returns how many entries of the columns array carry the given column
+// type code (one of the pc* constants).
+int PedigreeDescription::CountColumns(int type)
+   {
+   int matches = 0;
+   int index = columns.Length();
+
+   // Walk the array from the back; the order does not affect the tally
+   while (index-- > 0)
+      if (columns[index] == type)
+         matches++;
+
+   return matches;
+   }
+
+// Overwrites string with a comma separated tally of the column types in
+// use (markers, traits, discrete traits, covariates, zygosity, skipped)
+// and returns it. Column types that do not occur are left out.
+const char * PedigreeDescription::ColumnSummary(String & string)
+   {
+   // Column types in the order they are reported, with their labels
+   static const int kinds[] =
+      { pcMarker, pcTrait, pcAffection, pcCovariate, pcZygosity, pcSkip };
+   static const char * const labels[] =
+      { " markers [x2 cols]", " traits", " discrete traits",
+        " covariates", " zygosity", " skipped" };
+
+   string.Clear();
+
+   for (unsigned int k = 0; k < sizeof(kinds) / sizeof(kinds[0]); k++)
+      UpdateSummary(string, kinds[k], labels[k]);
+
+   return string;
+   }
+
+// Appends "<count><label>" to string when at least one column of the
+// given type exists, preceded by ", " if string already has content.
+// Leaves string untouched when there are no such columns.
+void PedigreeDescription::UpdateSummary(String & string, int type, const char * label)
+   {
+   int tally = CountColumns(type);
+
+   if (tally == 0)
+      return;
+
+   if (string.Length() != 0)
+      string += ", ";
+
+   string += tally;
+   string += label;
+   }
+
+
diff --git a/libsrc/PedigreeDescription.h b/libsrc/PedigreeDescription.h
new file mode 100644
index 0000000..ce27bff
--- /dev/null
+++ b/libsrc/PedigreeDescription.h
@@ -0,0 +1,82 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeDescription.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __PEDDESCRIBE_H__
+#define __PEDDESCRIBE_H__
+
+#include "PedigreeGlobals.h"
+#include "PedigreePerson.h"
+#include "StringArray.h"
+#include "IntArray.h"
+
+#include <stdio.h>
+
+// Possible pedigree columns
+// These codes are the values stored in PedigreeDescription::columns;
+// the wording below follows the labels printed by ColumnSummary()
+// Column that is skipped
+#define pcSkip 0
+// Marker column (summarised as taking two text columns each)
+#define pcMarker 1
+// Trait column (used for quantitative variables)
+#define pcTrait 2
+// Discrete trait (affection status) column
+#define pcAffection 3
+// Covariate column
+#define pcCovariate 4
+// Zygosity column
+#define pcZygosity 5
+// Terminator pushed after the last real column
+#define pcEnd 6
+
+// Undocumented pedigree column types -- not recommended
+#define pcUndocumentedTraitCovariate 1001
+
+// Describes the column layout of a pedigree file: which kind of item each
+// column holds and which marker / trait / affection it refers to. The
+// layout can be loaded from linkage or Mendel format data files.
+class PedigreeDescription : public PedigreeGlobals
+ {
+ public:
+ // Number of columns described (the pcEnd terminator is not counted)
+ int columnCount;
+ // columns holds one pc* type code per column, terminated by pcEnd;
+ // columnHash holds the matching marker / trait / affection ID for each
+ // entry (zero for the terminator)
+ IntArray columns, columnHash;
+
+ PedigreeDescription();
+ ~PedigreeDescription();
+
+ void Load(IFILE & Input, bool warnIfLinkage = false);
+ void Load(const char * filename, bool warnIfLinkage = false);
+
+ // Load the layout from a linkage format data file
+ void LoadLinkageDataFile(IFILE & input);
+ void LoadLinkageDataFile(const char * filename);
+
+ // Load the layout from a Mendel format data file (sets mendelFormat)
+ void LoadMendelDataFile(IFILE & input);
+ void LoadMendelDataFile(const char * filename);
+
+ void LoadMap(IFILE & Input);
+ void LoadMap(const char * filename);
+
+ PedigreeDescription & operator = (PedigreeDescription & rhs);
+
+ int CountTextColumns();
+
+ // returns a string summarizing column contents
+ const char * ColumnSummary(String & string);
+
+ // Flag specifying Mendel format
+ bool mendelFormat;
+
+ String filename;
+
+ private:
+ // Reads the next line that contains tokens (comments are stripped)
+ // and returns the number of tokens found
+ int ReadLineHelper(IFILE & input, String & buffer, StringArray & tokens);
+
+ // Number of entries in columns with the given pc* type code
+ int CountColumns(int type);
+ // Appends the tally for one column type to a summary string
+ void UpdateSummary(String & string, int type, const char * label);
+ };
+
+#endif
+
+
diff --git a/libsrc/PedigreeFamily.cpp b/libsrc/PedigreeFamily.cpp
new file mode 100644
index 0000000..89009d6
--- /dev/null
+++ b/libsrc/PedigreeFamily.cpp
@@ -0,0 +1,294 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeFamily.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Pedigree.h"
+#include "Constant.h"
+#include "MathConstant.h"
+#include "Error.h"
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <limits.h>
+
+// Builds the description of one family, covering individuals first..last
+// (inclusive) of the pedigree.
+//
+// Besides counting founders, non-founders and MZ twins, the constructor
+// fills path[] with a traversal order in which founders come first and
+// every other individual appears after both of its parents. Each person's
+// traverse field is set to that person's position in the order.
+//
+// If no such order exists, ShowInvalidCycles() is called to report the
+// problem.
+//
+// NOTE(review): path[] receives ped[i].serial for founders and co-twins
+// but the plain index i for other non-founders -- presumably a person's
+// serial equals its index in the pedigree; confirm.
+Family::Family(Pedigree & pedigree, int _first, int _last, int _serial) :
+ ped(pedigree)
+ {
+ serial = _serial;
+ first = _first;
+ last = _last;
+ count = last - first + 1;
+ path = new int [count];
+ famid = ped[first].famid;
+
+ founders = mzTwins = 0;
+
+ // First pass: founders take traversal positions 0 .. founders - 1,
+ // everyone else is marked as not yet placed (-1)
+ for (int i=first; i<=last; i++)
+ if (ped[i].isFounder())
+ {
+ ped[i].traverse = founders;
+ path[founders++] = ped[i].serial;
+ }
+ else
+ {
+ ped[i].traverse = -1;
+ // Count this person as an MZ twin only when an earlier family
+ // member is a co-twin, so the first twin of a set is not counted
+ if ( ped[i].isMzTwin(ped[i]))
+ for (int j = first; j < i; j++)
+ if (ped[i].isMzTwin( ped[j] ))
+ {
+ mzTwins++;
+ break;
+ }
+ }
+
+ nonFounders = count - founders;
+ // Upgraded to 3 below as soon as a parent turns out to be a non-founder
+ generations = nonFounders == 0 ? 1 : 2;
+
+ // Repeated passes: place every individual whose parents are both placed
+ int next = founders;
+ while (next < count)
+ {
+ // Set when the current pass manages to place somebody
+ bool check = false;
+
+ // Create traversal where path ancestors precede their offspring
+ for (int i=first; i<=last; i++)
+ if (ped[i].traverse == -1)
+ {
+ int fatherSerial = ped[i].father->traverse;
+ int motherSerial = ped[i].mother->traverse;
+
+ if (fatherSerial >= 0 && motherSerial >= 0)
+ {
+ check = true;
+
+ ped[i].traverse = next;
+ path[next++] = i;
+
+ // A parent placed after the founders is itself a non-founder
+ if (fatherSerial >= founders || motherSerial >= founders)
+ generations = 3;
+
+ // If this individual is part of a set of MZ twins
+ if (ped[i].zygosity & 1)
+ for (int j = 0; j < ped[i].sibCount; j++)
+ {
+ Person & sib = *ped[i].sibs[j];
+
+ // Insert all co-twins at the same position in traversal
+ // order
+ if (sib.traverse == -1 && ped[i].zygosity == sib.zygosity)
+ {
+ sib.traverse = next;
+ path[next++] = sib.serial;
+ }
+ }
+ }
+ }
+
+ // A pass without progress means the remaining individuals cannot be
+ // ordered
+ if (!check) ShowInvalidCycles();
+ }
+ }
+
+// Releases the traversal order array allocated by the constructor.
+Family::~Family()
+ {
+ delete [] path;
+ }
+
+// Reports a family whose members cannot be ordered so that parents come
+// before their children, lists the individuals most likely responsible
+// and finishes by calling error().
+//
+// NOTE(review): descendants[] is indexed both by a parent's serial and by
+// the loop index i -- presumably the two numbering schemes coincide;
+// confirm.
+void Family::ShowInvalidCycles()
+ {
+ // Try and identify key individuals responsible for
+ // pedigree mess-up ... when this function is called
+ // pedigree has been traversed top-down and individuals
+ // that are correctly specified have IDs of >= 0.
+
+ // This routine traverses the pedigree bottom up to
+ // identify a subset of individuals likely to be causing
+ // the problem
+ IntArray descendants(ped.count);
+ descendants.Zero();
+
+ // For each parent, count the children that could not be placed
+ for (int i = first; i <= last; i++)
+ if (ped[i].traverse == -1)
+ {
+ descendants[ped[i].father->serial]++;
+ descendants[ped[i].mother->serial]++;
+ }
+
+ IntArray stack;
+
+ // Starting from unplaced individuals without unplaced children, clear
+ // people from the bottom up: each cleared person is marked 9999 and
+ // removed from the tallies of both parents, and a parent whose tally
+ // drops to zero is examined next
+ for (int i = first; i <= last; i++)
+ if (ped[i].traverse == -1 && descendants[i] == 0)
+ {
+ stack.Push(i);
+
+ do {
+ int j = stack.Pop();
+
+ // Skip anyone already placed or already cleared
+ if (ped[j].traverse != -1) continue;
+
+ ped[j].traverse = 9999;
+
+ if (--descendants[ped[j].father->serial] == 0)
+ stack.Push(ped[j].father->serial);
+ if (--descendants[ped[j].mother->serial] == 0)
+ stack.Push(ped[j].mother->serial);
+ } while (stack.Length());
+ }
+
+ printf("The structure of family %s requires\n"
+ "an individual to be his own ancestor.\n\n"
+ "To identify the problem(s), examine the\n"
+ "following key individuals:\n\n",
+ (const char *) famid);
+
+ // Whoever is still marked -1 was not cleared above
+ for (int i = first; i <= last; i++)
+ if (ped[i].traverse == -1)
+ printf("Problem Person: %s\n", (const char *) ped[i].pid);
+
+ error("Invalid pedigree structure.");
+ }
+
+// Counts the groups of family members that are connected to each other
+// through parent / child links and returns that count.
+//
+// When groupMembership is not NULL it is resized to count entries and
+// entry i receives the group number (starting at 1) of the individual at
+// traversal position i.
+int Family::ConnectedGroups(IntArray * groupMembership)
+ {
+ IntArray groups(count);
+
+ // Use the quick union algorithm to identify connected groups
+ // (presumably SetSequence(0, 1) starts every position as its own root)
+ groups.SetSequence(0, 1);
+ // Non-founders occupy traversal positions founders .. count - 1
+ for (int i = count - 1; i >= founders; i--)
+ {
+ // Lookup parents
+ int group0 = i;
+ int group1 = ped[path[i]].father->traverse;
+ int group2 = ped[path[i]].mother->traverse;
+
+ // Identify their corresponding groupings
+ while (groups[group0] != group0) group0 = groups[group0];
+ while (groups[group1] != group1) group1 = groups[group1];
+ while (groups[group2] != group2) group2 = groups[group2];
+
+ // Merge the three roots under the smallest of them
+ int group = group1 < group2 ? group1 : group2;
+ if (group0 < group) group = group0;
+
+ groups[group0] = groups[group1] = groups[group2] = group;
+ }
+
+ // Count groupings
+ // Only founder positions are inspected when looking for roots
+ int groupCount = 0;
+ for (int i = 0; i < founders; i++)
+ if (groups[i] == i)
+ groupCount++;
+
+ if (groupMembership == NULL)
+ return groupCount;
+
+ // Flatten tree so all items point to root
+ // (merging always links to the smallest root, so a single ascending
+ // pass is used here)
+ for (int i = 1; i < count; i++)
+ groups[i] = groups[groups[i]];
+
+ // Update group membership info
+ // Roots receive the next group number, other entries copy their root's
+ int group = 0;
+ groupMembership->Dimension(count);
+ for (int i = 0; i < count; i++)
+ if (groups[i] == i)
+ (*groupMembership)[i] = ++group;
+ else
+ (*groupMembership)[i] = (*groupMembership)[groups[i]];
+
+#if 0
+ // This stretch of code outputs family structure and group membership
+ // And should usually be commented out!
+ for (int j = first; j <= last; j++)
+ printf("%s %s %s %s %d %d\n",
+ (const char *) famid, (const char *) ped[j].pid,
+ (const char *) ped[j].fatid, (const char *) ped[j].motid,
+ ped[j].sex, groups[ped[j].traverse]);
+#endif
+
+ return groupCount;
+ }
+
+/*
+int Family::ConnectedGroups(IntArray * groupMembership)
+ {
+ IntArray * stack = new IntArray[count];
+ IntArray groups(count);
+
+ groups.Zero();
+
+ int group = 0;
+ int seed = count - 1;
+
+ // Search for connected sets of individuals until everyone is accounted for
+ while (true)
+ {
+ while ((seed >= 0) && (groups[seed] != 0))
+ seed--;
+
+ if (seed == -1)
+ break;
+
+ Mark(seed, ++group, stack, groups);
+
+ for (int j = seed; j >= founders; j--)
+ if (groups[j] == 0)
+ {
+ int fat_j = ped[path[j]].father->traverse;
+ int mot_j = ped[path[j]].mother->traverse;
+
+ if (groups[fat_j] == group || groups[mot_j] == group)
+ Mark(j, group, stack, groups);
+ else
+ stack[mot_j].Push(j),
+ stack[fat_j].Push(j);
+ }
+
+ for (int j = 0; j < count; j++)
+ stack[j].Clear();
+ }
+
+ if (groupMembership != NULL)
+ (*groupMembership) = groups;
+
+ // This stretch of code outputs family structure and group membership
+ // And should usually be commented out!
+#if 0
+ for (int j = first; j <= last; j++)
+ printf("%s %s %s %s %d %d\n",
+ (const char *) famid, (const char *) ped[j].pid,
+ (const char *) ped[j].fatid, (const char *) ped[j].motid,
+ ped[j].sex, groups[ped[j].traverse]);
+#endif
+
+ delete [] stack;
+
+ return group;
+ }
+
+void Family::Mark(int j, int group, IntArray * stack, IntArray & groups)
+ {
+ if (groups[j] == group) return;
+
+ groups[j] = group;
+
+ while (stack[j].Length())
+ Mark(stack[j].Pop(), group, stack, groups);
+
+ if (j < founders) return;
+
+ Mark(ped[path[j]].father->traverse, group, stack, groups);
+ Mark(ped[path[j]].mother->traverse, group, stack, groups);
+ }
+*/
+
diff --git a/libsrc/PedigreeFamily.h b/libsrc/PedigreeFamily.h
new file mode 100644
index 0000000..8c52de1
--- /dev/null
+++ b/libsrc/PedigreeFamily.h
@@ -0,0 +1,63 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeFamily.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __PEDFAMILY_H__
+#define __PEDFAMILY_H__
+
+#include "PedigreeAlleles.h"
+#include "PedigreePerson.h"
+#include "StringBasics.h"
+
+class Pedigree;
+
+// Summary of one family within a Pedigree: the range of individuals it
+// covers, simple counts and an ancestors-first traversal order.
+class Family
+ {
+ public:
+ Pedigree & ped; // pedigree this family belongs to
+ String famid; // family identifier, taken from the first member
+ int serial;
+ int first, last; // sentinel family members
+ int count; // number of individuals in pedigree
+ int founders; // number of founders in pedigree
+ int nonFounders; // number of non-founders in pedigree
+ int mzTwins; // number of MZ twins, excluding 1st twin in set
+ int * path; // traverses the pedigree so that ancestors
+ // precede their descendants
+
+ int generations; // Rough classification as:
+ // 1 -- all individuals are unrelated
+ // 2 -- two generations (inc. multiple couples)
+ // 3 -- three or more generations
+
+ // True for a two generation family with exactly two founders
+ bool isNuclear()
+ { return (generations == 2) && (founders == 2); }
+
+ Family(Pedigree & ped, int top, int bottom, int serial = 0);
+ ~Family();
+
+ // Returns the number of connected groups in the family and, when
+ // groupMembership is supplied, the group number of each individual
+ int ConnectedGroups(IntArray * groupMembership = NULL);
+
+ private:
+ // Reports a pedigree in which somebody would be their own ancestor
+ void ShowInvalidCycles();
+
+ // Declared private, which keeps outside code from assigning families
+ Family & operator = (Family & rhs);
+// void Mark(int who, int group, IntArray * stack, IntArray & group_id );
+ };
+
+#endif
+
+
diff --git a/libsrc/PedigreeGlobals.cpp b/libsrc/PedigreeGlobals.cpp
new file mode 100644
index 0000000..d59a5b0
--- /dev/null
+++ b/libsrc/PedigreeGlobals.cpp
@@ -0,0 +1,856 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeGlobals.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "PedigreeGlobals.h"
+#include "Sort.h"
+#include "Error.h"
+
+#include <math.h>
+#include <string.h>
+#include <ctype.h>
+
+// Number of traits, affections, covariates and markers registered so far;
+// each counter also supplies the ID given to the next new name
+int PedigreeGlobals::traitCount = 0;
+int PedigreeGlobals::affectionCount = 0;
+int PedigreeGlobals::covariateCount = 0;
+int PedigreeGlobals::markerCount = 0;
+
+// If this value isn't set, all X chromosome data will be rejected
+bool PedigreeGlobals::chromosomeX = false;
+// Set by LoadMarkerMap() when the first map line has five fields
+bool PedigreeGlobals::sexSpecificMap = false;
+
+// Names indexed by ID, and the reverse name -> ID lookups
+StringArray PedigreeGlobals::traitNames;
+StringArray PedigreeGlobals::markerNames;
+StringArray PedigreeGlobals::covariateNames;
+StringArray PedigreeGlobals::affectionNames;
+StringIntHash PedigreeGlobals::markerLookup;
+StringIntHash PedigreeGlobals::traitLookup;
+StringIntHash PedigreeGlobals::affectionLookup;
+StringIntHash PedigreeGlobals::covariateLookup;
+
+// Entries in use and allocated capacity of the markerInfo array
+int PedigreeGlobals::markerInfoCount = 0;
+int PedigreeGlobals::markerInfoSize = 0;
+
+// MarkerInfo records in creation order, by marker ID, and by name
+MarkerInfo ** PedigreeGlobals::markerInfo = NULL;
+MarkerInfo ** PedigreeGlobals::markerInfoByInteger = NULL;
+StringHash PedigreeGlobals::markerInfoByName;
+
+int MarkerInfo::count = 0;
+
+// Sort comparison for arrays of MarkerInfo pointers: orders markers by
+// chromosome first, then by position, and finally by serial number when
+// the positions are equal. Returns a negative, zero or positive value in
+// the usual comparison function manner.
+int MarkerInfo::ComparePosition(MarkerInfo ** left, MarkerInfo ** right)
+   {
+   MarkerInfo * a = *left;
+   MarkerInfo * b = *right;
+
+   // Different chromosomes decide the order on their own
+   int chromosomeGap = a->chromosome - b->chromosome;
+   if (chromosomeGap != 0)
+      return chromosomeGap;
+
+   double positionGap = a->position - b->position;
+
+   // Ties on position are broken by serial number
+   if (positionGap == 0.0)
+      return a->serial - b->serial;
+
+   return positionGap > 0.0 ? 1 : -1;
+   }
+
+// Returns the label stored for the given allele number. When no label
+// (or only an empty one) is stored, the allele number itself is stored
+// as the label -- growing the label list if necessary -- and returned.
+String MarkerInfo::GetAlleleLabel(int allele)
+   {
+   bool inRange = allele < alleleLabels.Length();
+
+   if (inRange && alleleLabels[allele].Length() != 0)
+      return alleleLabels[allele];
+
+   // Make room for the new entry
+   if (!inRange)
+      alleleLabels.Dimension(allele + 1);
+
+   return alleleLabels[allele] = allele;
+   }
+
+// Rescales the allele frequencies of this marker so that they sum to one.
+//
+// A frequency list with at most one entry is cleared instead. Negative
+// frequencies and non-positive sums are reported through error().
+//
+// Returns true when the original sum was off by more than 1.2e-5, in
+// which case a message is also printed; returns false otherwise.
+bool MarkerInfo::AdjustFrequencies()
+   {
+   // Nothing usable is stored
+   if (freq.Length() <= 1)
+      {
+      freq.Clear();
+      return false;
+      }
+
+   if (freq.Min() < 0.0)
+      error("Locus %s has negative allele frequencies\n", (const char *) name);
+
+   double total = freq.Sum();
+
+   if (total <= 0.0)
+      error("Locus %s frequencies sum to %f, which doesn't make sense\n",
+            (const char *) name, total);
+
+   if (total != 1.0)
+      freq *= 1.0 / total;
+
+   // Tiny deviations are corrected silently
+   bool noticeable = fabs(total - 1.0) > 1.2e-5;
+
+   if (noticeable)
+      printf("Locus %s frequencies sum to %f, adjusted to 1.0\n",
+             (const char *) name, total);
+
+   return noticeable;
+   }
+
+// Rebuilds the label -> allele number lookup from the label list. The
+// entry at index zero of the label list is not indexed. A label list
+// with 255 or more entries is reported through error().
+void MarkerInfo::IndexAlleles()
+   {
+   const int labelCount = alleleLabels.Length();
+
+   if (labelCount >= 255)
+      error("Marker %s has more than 254 distinct alleles\n",
+            (const char *) name);
+
+   alleleNumbers.Clear();
+
+   int allele = 1;
+   while (allele < labelCount)
+      {
+      alleleNumbers.SetInteger(alleleLabels[allele], allele);
+      allele++;
+      }
+   }
+
+// Registers a new allele label for this marker and returns the allele
+// number assigned to it. An empty label list first receives a blank
+// entry, so that the first allele registered gets number one. A label
+// list that already has 255 or more entries is reported through error().
+int MarkerInfo::NewAllele(const String & label)
+   {
+   // Index zero is kept for a blank placeholder
+   if (alleleLabels.Length() == 0)
+      alleleLabels.Push("");
+
+   if (alleleLabels.Length() >= 255)
+      error("Marker %s has more than 254 distinct alleles\n",
+            (const char *) name);
+
+   // The new label goes to the end of the list
+   int assigned = alleleLabels.Length();
+
+   alleleNumbers.SetInteger(label, assigned);
+   alleleLabels.Push(label);
+
+   return assigned;
+   }
+
+// Returns the ID of the named trait, registering the name under the next
+// free ID when it has not been seen before.
+int PedigreeGlobals::GetTraitID(const char * name)
+   {
+   int id = traitLookup.Integer(name);
+
+   // -1 marks a name that is not registered yet
+   if (id == -1)
+      {
+      traitNames.Add(name);
+      traitLookup.SetInteger(name, traitCount);
+      id = traitCount++;
+      }
+
+   return id;
+   }
+
+// Returns the ID of the named affection, registering the name under the
+// next free ID when it has not been seen before.
+int PedigreeGlobals::GetAffectionID(const char * name)
+   {
+   int id = affectionLookup.Integer(name);
+
+   // -1 marks a name that is not registered yet
+   if (id == -1)
+      {
+      affectionNames.Add(name);
+      affectionLookup.SetInteger(name, affectionCount);
+      id = affectionCount++;
+      }
+
+   return id;
+   }
+
+// Returns the ID of the named covariate, registering the name under the
+// next free ID when it has not been seen before.
+int PedigreeGlobals::GetCovariateID(const char * name)
+   {
+   int id = covariateLookup.Integer(name);
+
+   // -1 marks a name that is not registered yet
+   if (id == -1)
+      {
+      covariateNames.Add(name);
+      covariateLookup.SetInteger(name, covariateCount);
+      id = covariateCount++;
+      }
+
+   return id;
+   }
+
+// Returns the ID of the named marker, registering the name under the
+// next free ID when it has not been seen before.
+//
+// Registering a marker also keeps markerInfoByInteger large enough to be
+// indexed by the new ID: the array starts with 16 slots and doubles each
+// time the marker count reaches a power of two of 16 or more. Fresh
+// slots are set to NULL.
+int PedigreeGlobals::GetMarkerID(const char * name)
+   {
+   int known = markerLookup.Integer(name);
+
+   if (known != -1)
+      return known;
+
+   markerNames.Add(name);
+   markerLookup.SetInteger(name, markerCount);
+
+   // Work out whether the marker info key must be (re)allocated
+   int capacity = 0;
+
+   if (markerCount == 0)
+      capacity = 16;
+   else if (markerCount > 15 && (markerCount & (markerCount - 1)) == 0)
+      capacity = markerCount * 2;
+
+   if (capacity != 0)
+      {
+      MarkerInfo ** grown = new MarkerInfo * [capacity];
+
+      // Carry over the existing entries and blank out the rest
+      for (int i = 0; i < capacity; i++)
+         grown[i] = (i < markerCount) ? markerInfoByInteger[i] : NULL;
+
+      // The very first allocation has no predecessor to release
+      if (markerCount != 0)
+         delete [] markerInfoByInteger;
+
+      markerInfoByInteger = grown;
+      }
+
+   return markerCount++;
+   }
+
+// Returns the MarkerInfo record for the named marker, creating it on
+// first use. A new record is entered into the by-name hash and the
+// markerInfo array, and also into markerInfoByInteger when LookupMarker()
+// reports a non-negative ID for the name.
+MarkerInfo * PedigreeGlobals::GetMarkerInfo(String & name)
+   {
+   MarkerInfo * info = (MarkerInfo *) markerInfoByName.Object(name);
+
+   if (info == NULL)
+      {
+      info = new MarkerInfo(name);
+      markerInfoByName.Add(name, info);
+
+      // Make room in the creation-order array if it is full
+      if (markerInfoCount >= markerInfoSize)
+         GrowMarkerInfo();
+
+      markerInfo[markerInfoCount++] = info;
+
+      int markerId = LookupMarker(name);
+      if (markerId >= 0)
+         markerInfoByInteger[markerId] = info;
+      }
+
+   return info;
+   }
+
+// Returns the MarkerInfo record for the marker with the given ID,
+// creating the record on first use.
+//
+// IDs outside 0 .. markerCount - 1 are reported through error(). This
+// includes negative IDs, which would otherwise index before the start
+// of the lookup array.
+MarkerInfo * PedigreeGlobals::GetMarkerInfo(int markerId)
+   {
+   if (markerId < 0 || markerId >= markerCount)
+      error("Attempted to retrieve MarkerInfo using out-of-bounds index\n");
+
+   MarkerInfo * info = markerInfoByInteger[markerId];
+
+   // Records are created lazily, so fall back on the lookup by name
+   if (info == NULL)
+      info = GetMarkerInfo(markerNames[markerId]);
+
+   return info;
+   }
+
+// Enlarges the markerInfo array: 32 slots on first use, twice the
+// previous capacity afterwards. Existing entries are carried over; the
+// added slots are left uninitialised.
+void PedigreeGlobals::GrowMarkerInfo()
+   {
+   int capacity = (markerInfoSize == 0) ? 32 : markerInfoSize * 2;
+
+   MarkerInfo ** grown = new MarkerInfo * [capacity];
+
+   if (markerInfoSize != 0)
+      {
+      for (int i = 0; i < markerInfoSize; i++)
+         grown[i] = markerInfo[i];
+
+      delete [] markerInfo;
+      }
+
+   markerInfo = grown;
+   markerInfoSize = capacity;
+   }
+
+// Prints the names of the markers whose IDs are listed in missingMarkers,
+// sorted and wrapped into lines. Names stop being printed once a fifth
+// line break has been made; the remaining markers are summarised by a
+// count instead. Prints nothing when the list is empty.
+void PedigreeGlobals::FlagMissingMarkers(IntArray & missingMarkers)
+   {
+   const int total = missingMarkers.Length();
+
+   // Nothing to report
+   if (total == 0)
+      return;
+
+   printf("These markers couldn't be placed and won't be analysed:");
+
+   StringArray names;
+   for (int i = 0; i < total; i++)
+      names.Push(GetMarkerInfo(missingMarkers[i])->name);
+   names.Sort();
+
+   int column = 80;     // starting past the limit forces an initial break
+   int rows = 0;        // line breaks made so far
+   int unlisted = 0;    // markers left out of the listing
+
+   for (int i = 0; i < total; i++)
+      {
+      const int width = names[i].Length() + 1;
+
+      // Start a new line when this name would not fit
+      if (column + width > 79)
+         {
+         printf("\n ");
+         column = 3;
+         rows++;
+         }
+
+      if (rows >= 5)
+         {
+         unlisted++;
+         continue;
+         }
+
+      printf("%s ", (const char *) names[i]);
+      column += width;
+      }
+
+   if (unlisted != 0)
+      printf("as well as %d other unlisted markers...", unlisted);
+
+   printf("\n\n");
+   }
+
+// Replaces the contents of markers with the IDs of those listed markers
+// whose chromosome is not -1, sorted with MarkerInfo::ComparePosition.
+//
+// An empty list on entry stands for all markerCount markers. Markers
+// whose chromosome is -1 are dropped from the result and reported
+// through FlagMissingMarkers().
+void PedigreeGlobals::GetOrderedMarkers(IntArray & markers)
+   {
+   if (markers.Length() == 0)
+      {
+      markers.Dimension(markerCount);
+      markers.SetSequence(0, 1);
+      }
+
+   // Scratch array holding the markers that can be sorted
+   MarkerInfo ** subset = new MarkerInfo * [markers.Length()];
+
+   int count = 0;
+   IntArray missingMarkers;
+
+   for (int i = 0; i < markers.Length(); i++)
+      {
+      MarkerInfo * info = GetMarkerInfo(markers[i]);
+
+      if (info->chromosome != -1)
+         subset[count++] = info;
+      else
+         // FlagMissingMarkers() looks entries up as marker IDs, so
+         // record the ID rather than the position within the list
+         missingMarkers.Push(markers[i]);
+      }
+
+   FlagMissingMarkers(missingMarkers);
+
+   QuickSort(subset, count, sizeof(MarkerInfo *),
+             COMPAREFUNC MarkerInfo::ComparePosition);
+
+   markers.Clear();
+   for (int i = 0; i < count; i++)
+      markers.Push(GetMarkerID(subset[i]->name));
+
+   // The scratch array is no longer needed
+   delete [] subset;
+   }
+
+// Replaces the contents of markers with the IDs of the listed markers
+// that lie on one chromosome, in map order.
+//
+// The chromosome selected is the first one, in sorted order, whose
+// number is not below the chromosome argument. An empty list on entry
+// stands for all markerCount markers. Markers whose chromosome is -1 are
+// never selected; they are reported through FlagMissingMarkers() only
+// when chromosome is -1.
+//
+// Returns the number of the chromosome that follows the selected one
+// among the listed markers, or zero when there is none.
+int PedigreeGlobals::SortMarkersInMapOrder(IntArray & markers, int chromosome)
+   {
+   if (markers.Length() == 0)
+      {
+      markers.Dimension(markerCount);
+      markers.SetSequence(0, 1);
+      }
+
+   // Scratch array holding the markers that can be sorted
+   MarkerInfo ** subset = new MarkerInfo * [markers.Length()];
+
+   int count = 0;
+   IntArray missingMarkers;
+
+   for (int i = 0; i < markers.Length(); i++)
+      {
+      MarkerInfo * info = GetMarkerInfo(markers[i]);
+
+      if (info->chromosome != -1)
+         subset[count++] = info;
+      else if (chromosome == -1)
+         // FlagMissingMarkers() looks entries up as marker IDs, so
+         // record the ID rather than the position within the list
+         missingMarkers.Push(markers[i]);
+      }
+
+   if (chromosome == -1)
+      FlagMissingMarkers(missingMarkers);
+
+   QuickSort(subset, count, sizeof(MarkerInfo *),
+             COMPAREFUNC MarkerInfo::ComparePosition);
+
+   markers.Clear();
+
+   // current_chromosome stays -1 until the first eligible marker is seen
+   int current_chromosome = -1, next_chromosome = 0;
+
+   for (int i = 0; i < count; i++)
+      if (subset[i]->chromosome < chromosome)
+         continue;
+      else if (current_chromosome == -1 ||
+               subset[i]->chromosome == current_chromosome)
+         {
+         markers.Push(GetMarkerID(subset[i]->name));
+         current_chromosome = subset[i]->chromosome;
+         }
+      else if (!next_chromosome)
+         {
+         // First marker beyond the selected chromosome: remember where
+         // the next call should pick up and stop
+         next_chromosome = subset[i]->chromosome;
+         break;
+         }
+
+   delete [] subset;
+
+   return next_chromosome;
+   }
+
+// Checks that the female and male maps order the markers the same way as
+// the sex-averaged map does.
+//
+// All markers are sorted with MarkerInfo::ComparePosition; for every pair
+// of neighbours on the same chromosome, neither the female nor the male
+// position may decrease. A violation is reported through error(). Does
+// nothing when there is at most one marker.
+void PedigreeGlobals::VerifySexSpecificOrder()
+   {
+   if (markerCount <= 1)
+      return;
+
+   MarkerInfo ** sorted = new MarkerInfo * [markerCount];
+
+   for (int i = 0; i < markerCount; i++)
+      sorted[i] = GetMarkerInfo(i);
+
+   QuickSort(sorted, markerCount, sizeof(MarkerInfo *),
+             COMPAREFUNC MarkerInfo::ComparePosition);
+
+   for (int i = 1; i < markerCount; i++)
+      {
+      MarkerInfo * before = sorted[i - 1];
+      MarkerInfo * after = sorted[i];
+
+      // Only neighbours on the same chromosome are compared
+      if (after->chromosome != before->chromosome)
+         continue;
+
+      double prev_female = before->positionFemale;
+      double prev_male = before->positionMale;
+      double curr_female = after->positionFemale;
+      double curr_male = after->positionMale;
+
+      bool femaleReversed = curr_female < prev_female;
+      bool maleReversed = curr_male < prev_male;
+
+      // The female map is the one named when both maps disagree
+      if (femaleReversed || maleReversed)
+         error("Sex-specific and sex-averaged maps are inconsistent.\n\n"
+               "In the sex-averaged map, marker %s (%.2f cM) follows marker %s (%.2f cM).\n"
+               "In the %smale map, marker %s (%.2f cM) PRECEDES marker %s (%.2f cM).\n",
+               (const char *) after->name,
+               after->position * 100,
+               (const char *) before->name,
+               before->position * 100,
+               femaleReversed ? "fe" : "",
+               (const char *) after->name,
+               (femaleReversed ? curr_female : curr_male) * 100,
+               (const char *) before->name,
+               (femaleReversed ? prev_female : prev_male) * 100);
+      }
+
+   delete [] sorted;
+   }
+
+// Opens the named allele frequency file and loads it with the stream
+// overload of LoadAlleleFrequencies().
+//
+// An empty file name, or a file that cannot be opened, is reported
+// through error() when required is true and silently ignored otherwise.
+void PedigreeGlobals::LoadAlleleFrequencies(const char * filename, bool required)
+   {
+   // This function is often called with an empty string, and not
+   // all implementations of the C library like that ...
+   if (filename[0] == 0)
+      {
+      if (required)
+         error("No name provided for required allele frequency file\n");
+      else
+         return;
+      }
+
+   // If we get here, the filename is not empty and things should
+   // work as planned
+   IFILE f = ifopen(filename, "rb");
+
+   if (f == NULL)
+      {
+      if (required)
+         error("Failed to open required allele frequency file '%s'",
+               filename);
+      else
+         return;
+      }
+
+   LoadAlleleFrequencies(f);
+   ifclose(f);
+   }
+
+// Loads allele frequencies from an open input stream.
+//
+// Blank lines are skipped; every other line is dispatched on the first
+// letter of its first token (case is ignored):
+//   M -- names the marker that the following lines apply to
+//   F -- lists frequencies for consecutively numbered alleles
+//   A -- gives one allele label followed by its frequency
+//   E -- ends the input
+// Any other line is reported through error(). The frequencies of each
+// marker are passed through AdjustFrequencies() once the marker is
+// complete.
+void PedigreeGlobals::LoadAlleleFrequencies(IFILE & input)
+ {
+ int done = 0;
+ String buffer;
+ StringArray tokens;
+ // Marker currently being filled in; NULL until the first M line
+ MarkerInfo *info = NULL;
+
+ // Set when AdjustFrequencies() reports an adjustment for some marker
+ bool need_blank_line = false;
+ int allele_size, old_max, next_allele = 0; // Initialization avoids compiler warning
+
+ while (!ifeof(input) && !done)
+ {
+ int i, j;
+
+ buffer.ReadLine(input);
+
+ tokens.Clear();
+ tokens.AddTokens(buffer, WHITESPACE);
+
+ if (tokens.Length() < 1) continue;
+
+ switch (toupper(tokens[0][0]))
+ {
+ case 'M' :
+ if (tokens.Length() == 1)
+ error("Unnamed marker in allele frequency file");
+ // Finish the previous marker before switching to the new one
+ if (info != NULL)
+ need_blank_line |= info->AdjustFrequencies();
+ info = GetMarkerInfo(tokens[1]);
+ // Slot zero of the frequency list is a placeholder
+ info->freq.Clear();
+ info->freq.Push(0.0);
+ next_allele = 1;
+ break;
+ case 'F' :
+ // Frequency lines that precede any marker line are ignored
+ if (info != NULL)
+ for ( i = 1; i < tokens.Length(); i++)
+ {
+ // The allele label is the running allele number
+ buffer = next_allele++;
+
+ int allele = LoadAllele(info, buffer);
+
+ // Grow the list as needed, zero filling any gap
+ if (allele >= info->freq.Length())
+ {
+ old_max = info->freq.Length();
+ info->freq.Dimension(allele + 1);
+ for (j = old_max; j < allele; j++)
+ info->freq[j] = 0.0;
+ }
+
+ info->freq[allele] = tokens[i].AsDouble();
+ }
+ break;
+ case 'A' :
+ // Named allele lines that precede any marker line are ignored
+ if (info == NULL) continue;
+
+ if (tokens.Length() != 3)
+ error("Error reading named allele frequencies for locus %s\n"
+ "Lines with named alleles should have the format\n"
+ " A allele_label allele_frequency\n\n"
+ "But the following line was read:\n%s\n",
+ (const char *) info->name, (const char *) buffer);
+
+ allele_size = LoadAllele(info, tokens[1]);
+ // A later F line continues numbering after this label's value
+ next_allele = atoi(tokens[1]) + 1;
+
+ if (allele_size < 1)
+ error("Error reading named allele frequencies for locus %s\n"
+ "An invalid allele label was encountered\n",
+ (const char *) info->name);
+
+ // Grow the list as needed, zero filling any gap
+ if (allele_size >= info->freq.Length())
+ {
+ old_max = info->freq.Length();
+ info->freq.Dimension(allele_size + 1);
+ for (i = old_max; i < allele_size; i++)
+ info->freq[i] = 0.0;
+ }
+
+ info->freq[allele_size] = tokens[2];
+ break;
+ case 'E' :
+ done = 1;
+ break;
+ default :
+ error ("Problem in allele frequency file.\n"
+ "Lines in this file should be of two types:\n"
+ " -- Marker name lines begin with an M\n"
+ " -- Frequency lines begin with an F\n\n"
+ "However the following line is different:\n%s\n",
+ (const char *) buffer);
+ }
+ }
+
+ // Finish the last marker read
+ if (info != NULL)
+ need_blank_line |= info->AdjustFrequencies();
+
+ if (need_blank_line) printf("\n");
+ }
+// Loads a marker map from the named file; a file that cannot be opened is skipped without any message.
+void PedigreeGlobals::LoadMarkerMap(const char * filename, bool filter)
+   {
+   IFILE mapFile = ifopen(filename, "rb");
+   if (mapFile == NULL) return;   // nothing to load
+   LoadMarkerMap(mapFile, filter);
+   ifclose(mapFile);
+   }
+// Reads a genetic map from input: each line holds CHROMOSOME, MARKER_NAME and POSITION (cM), or five columns when female and male positions are included.
+void PedigreeGlobals::LoadMarkerMap(IFILE & input, bool filter)
+   {
+   String buffer;
+   StringArray tokens;
+   bool first_pass = true;
+
+   while (!ifeof(input))
+      {
+      buffer.ReadLine(input);
+
+      tokens.Clear();
+      tokens.AddTokens(buffer, WHITESPACE);
+
+      if (tokens.Length() < 1) continue;
+
+      // The first non-blank line fixes the layout for the whole file:
+      // five columns mean that sex-specific positions are present
+      if (first_pass)
+         {
+         sexSpecificMap = (tokens.Length() == 5);
+         first_pass = false;
+         }
+
+      if (tokens.Length() != 3 && !sexSpecificMap)
+         error("Error reading map file\n"
+               "Each line in this file should include 3 fields:\n"
+               "CHROMOSOME, MARKER_NAME, and POSITION\n"
+               "However the following line has %d fields\n%s\n",
+               tokens.Length(), (const char *) buffer);
+
+      if (tokens.Length() != 5 && sexSpecificMap)
+         error("Error reading map file\n"
+               "Each line in this file should include 5 fields:\n\n"
+               "CHROMOSOME, MARKER_NAME, SEX_AVERAGED_POS, FEMALE_POS AND MALE_POS\n\n"
+               "However the following line has %d fields\n%s\n",
+               tokens.Length(), (const char *) buffer);
+
+      // Column headers are matched case-insensitively. The global flag is restored
+      // before a header line is skipped, so it cannot be left switched off
+      bool previous_state = String::caseSensitive;
+      String::caseSensitive = false;
+
+      bool header =
+         (tokens[0] == "CHR" || tokens[0] == "CHROMOSOME") &&
+         (tokens[1] == "MARKER" || tokens[1] == "MARKER_NAME" || tokens[1] == "MRK") &&
+         (tokens[2] == "KOSAMBI" || tokens[2] == "POS" || tokens[2] == "POSITION" ||
+          tokens[2] == "SEX_AVERAGED_POS" || tokens[2] == "CM" || tokens[2] == "HALDANE");
+      String::caseSensitive = previous_state;
+      if (header) continue;
+
+      if (filter)
+         if (LookupMarker(tokens[1]) < 0)
+            continue;
+
+      MarkerInfo * info = GetMarkerInfo(tokens[1]);
+      // Chromosome X is stored as 999
+      int chr = (tokens[0][0] == 'x' || tokens[0][0] == 'X') ? 999 : (int) tokens[0];
+
+      info->chromosome = chr;
+      info->position = (double) tokens[2] * 0.01;   // file positions are in cM, stored scaled by 0.01
+
+      if (sexSpecificMap)
+         {
+         char * flag;
+
+         double female = strtod(tokens[3], &flag);
+         if (*flag)
+            error("In the map file, the female cM position for marker\n"
+                  "%s is %s. This is not a valid number.",
+                  (const char *) tokens[1], (const char *) tokens[3]);
+
+         double male = strtod(tokens[4], &flag);
+         if (*flag)
+            error("In the map file, the male cM position for marker\n"
+                  "%s is %s. This is not a valid number.",
+                  (const char *) tokens[1], (const char *) tokens[4]);
+
+         info->positionFemale = (double) female * 0.01;
+         info->positionMale = (double) male * 0.01;
+         }
+      else
+         info->positionFemale = info->positionMale = info->position;
+      }
+
+   if (sexSpecificMap) VerifySexSpecificOrder();
+   }
+
+void PedigreeGlobals::LoadBasepairMap(const char * filename)
+ {
+ IFILE f = ifopen(filename, "rb");
+ if (f == NULL)
+ error("The map file [%s] could not be opened\n\n"
+ "Please check that the filename is correct and that the file is\n"
+ "not being used by another program", filename);
+ LoadBasepairMap(f);
+ ifclose(f);
+ }
+// Reads a physical map from input: every line holds CHROMOSOME, MARKER_NAME and POSITION, and the position is stored unscaled.
+void PedigreeGlobals::LoadBasepairMap(IFILE & input)
+   {
+   String buffer;
+   StringArray tokens;
+
+   sexSpecificMap = false;
+
+   while (!ifeof(input))
+      {
+      buffer.ReadLine(input);
+
+      tokens.Clear();
+      tokens.AddTokens(buffer, WHITESPACE);
+
+      if (tokens.Length() < 1) continue;
+
+      if (tokens.Length() != 3)
+         error("Error reading map file\n"
+               "Each line in this file should include 3 fields:\n"
+               "CHROMOSOME, MARKER_NAME, and POSITION\n"
+               "However the following line has %d fields\n%s\n",
+               tokens.Length(), (const char *) buffer);
+
+      // Column headers are matched case-insensitively. The global flag is restored
+      // before a header line is skipped, so it cannot be left switched off
+      bool previous_state = String::caseSensitive;
+      String::caseSensitive = false;
+      bool header = (tokens[0] == "CHR" || tokens[0] == "CHROMOSOME") &&
+         (tokens[1] == "MARKER" || tokens[1] == "MARKER_NAME" || tokens[1] == "MRK") &&
+         (tokens[2] == "BASEPAIR" || tokens[2] == "POS" || tokens[2] == "POSITION");
+      String::caseSensitive = previous_state;
+      if (header) continue;
+
+      MarkerInfo * info = GetMarkerInfo(tokens[1]);
+      // Chromosome X is stored as 999
+      int chr = (tokens[0][0] == 'x' || tokens[0][0] == 'X') ? 999 : (int) tokens[0];
+
+      info->chromosome = chr;
+      info->position = (double) tokens[2];
+      }
+   }
+// Number of live PedigreeGlobals objects, maintained by the constructor and the destructor below
+int PedigreeGlobals::instanceCount = 0;
+
+PedigreeGlobals::~PedigreeGlobals()
+   {
+   // Only the last instance releases the shared tables, and only when markerInfoSize is non-zero
+   if (--instanceCount != 0 || !markerInfoSize) return;
+
+   for (int m = 0; m < markerInfoCount; m++)
+      delete markerInfo[m];
+   delete [] markerInfo;
+   delete [] markerInfoByInteger;
+   }
+// Writes the marker map to the named file; nothing is created when no marker has a position.
+void PedigreeGlobals::WriteMapFile(const char * filename)
+   {
+   // Checked first so that no empty file is created
+   if (!MarkerPositionsAvailable()) return;
+
+   FILE * mapFile = fopen(filename, "wt");
+   if (mapFile == NULL)
+      error("Creating map file \"%s\"", filename);
+
+   // The FILE * overload does the formatting
+   WriteMapFile(mapFile);
+   fclose(mapFile);
+   }
+// Writes a header line followed by one line for every marker whose chromosome is not -1.
+void PedigreeGlobals::WriteMapFile(FILE * output)
+   {
+   // The header and the number of columns depend on sexSpecificMap
+   if (sexSpecificMap)
+      fprintf(output, "CHR MARKER POS POSF POSM\n");
+   else
+      fprintf(output, "CHR MARKER POS\n");
+
+   for (int i = 0; i < markerInfoCount; i++)
+      {
+      MarkerInfo * marker = markerInfo[i];
+      if (marker->chromosome == -1) continue;
+      // Stored positions are multiplied by 100 on output
+      if (sexSpecificMap)
+         fprintf(output, "%3d %-10s %g %g %g\n", marker->chromosome,
+                 (const char *) marker->name, marker->position * 100.0,
+                 marker->positionFemale * 100.0, marker->positionMale * 100.0);
+      else
+         fprintf(output, "%3d %-10s %g\n", marker->chromosome,
+                 (const char *) marker->name, marker->position * 100.0);
+      }
+   }
+// Writes the allele frequencies of all markers to the named file in the requested format.
+void PedigreeGlobals::WriteFreqFile(const char * filename, bool old_format)
+   {
+   FILE * freqFile = fopen(filename, "wt");
+   if (freqFile == NULL)
+      error("Creating allele frequency file \"%s\"", filename);
+
+   // The FILE * overload does the formatting
+   WriteFreqFile(freqFile, old_format);
+   fclose(freqFile);
+   }
+// For each marker with a non-empty freq vector, writes an "M name" line followed by packed "F" lines or one "A label frequency" line per allele.
+void PedigreeGlobals::WriteFreqFile(FILE * output, bool old_format)
+   {
+   for (int i = 0; i < markerInfoCount; i++)
+      {
+      MarkerInfo * info = markerInfo[i];
+      if (info->freq.Length() == 0) continue;
+
+      fprintf(output, "M %s\n", (const char *) info->name);
+
+      // The packed "F" layout is used only when old_format is set and the marker has no allele labels
+      bool packed = old_format && info->alleleLabels.Length() == 0;
+      for (int j = 1; j < info->freq.Length(); j++)
+         if (packed)
+            {
+            // A line starts when j % 7 == 1 and ends at j % 7 == 0 or at the last allele
+            bool lineEnd = (j == info->freq.Length() - 1) || (j % 7 == 0);
+            fprintf(output, "%s%.5f%s", j % 7 == 1 ? "F " : "", info->freq[j], lineEnd ? "\n" : " ");
+            }
+         else if (info->freq[j] > 1e-7)
+            fprintf(output, "A %5s %.5f\n", (const char *) info->GetAlleleLabel(j), info->freq[j]);
+      }
+   }
+// Returns true when at least one marker has a chromosome other than -1.
+bool PedigreeGlobals::MarkerPositionsAvailable()
+   {
+   bool found = false;
+
+   for (int m = 0; m < markerInfoCount && !found; m++)
+      found = (markerInfo[m]->chromosome != -1);
+   return found;
+   }
+// Returns true when at least one marker has more than one entry in its freq vector.
+bool PedigreeGlobals::AlleleFrequenciesAvailable()
+   {
+   int m = 0;
+
+   while (m < markerInfoCount && markerInfo[m]->freq.Length() <= 1)
+      m++;
+   return m < markerInfoCount;
+   }
+// Convenience overload: resolves the marker index with GetMarkerInfo and delegates to the MarkerInfo * overload.
+int PedigreeGlobals::LoadAllele(int marker, String & token)
+   {
+   return LoadAllele(GetMarkerInfo(marker), token);
+   }
+// Maps an allele label to its allele number for this marker, registering acceptable new labels; returns 0 for labels that are not accepted.
+int PedigreeGlobals::LoadAllele(MarkerInfo * info, String & token)
+   {
+   // Labels the marker already knows need no further checks
+   int allele = info->GetAlleleNumber(token);
+   if (allele >= 0) return allele;
+
+   // Class of a label's first character: 1 = digit 1-9, 2 = A/C/G/T in either case, 0 = anything else
+   static unsigned char lookup[128];
+   static bool init = false;
+
+   if (!init)
+      {
+      init = true;
+      for (int i = 0; i < 128; i++)
+         lookup[i] = 0;
+      for (int i = '1'; i <= '9'; i++)
+         lookup[i] = 1;
+      lookup[int('a')] = lookup[int('A')] = lookup[int('c')] = lookup[int('C')] = 2;
+      lookup[int('g')] = lookup[int('G')] = lookup[int('t')] = lookup[int('T')] = 2;
+      }
+
+   int first = token[0];
+   bool goodstart = first > 0 && first < 128;
+
+   // A single recognised character is registered as a label as it stands
+   if (token.Length() == 1 && goodstart && lookup[first])
+      return info->NewAllele(token);
+
+   // Anything longer has to start with a digit 1-9
+   if (!goodstart || lookup[first] != 1)
+      return 0;
+
+   // Replace the label by its integer value; note that this rewrites the caller's token
+   int integer = token.AsInteger();
+   token = integer;
+
+   allele = info->GetAlleleNumber(token);
+   if (allele > 0) return allele;
+
+   if (integer <= 0) return 0;
+
+   if (integer > 1000000)
+      {
+      static bool warn_user = true;
+      if (warn_user)   // warn once only
+         {
+         warn_user = false;
+         printf("Some allele numbers for marker %s are > 1000000\n"
+                "All allele numbers >1000000 will be treated as missing\n\n",
+                (const char *) info->name);
+         }
+      return 0;
+      }
+
+   return info->NewAllele(token);
+   }
+
+
+
diff --git a/libsrc/PedigreeGlobals.h b/libsrc/PedigreeGlobals.h
new file mode 100644
index 0000000..707caa7
--- /dev/null
+++ b/libsrc/PedigreeGlobals.h
@@ -0,0 +1,175 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeGlobals.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __PEDGLOBALS_H__
+#define __PEDGLOBALS_H__
+
+#include "Constant.h"
+#include "StringArray.h"
+#include "StringHash.h"
+#include "IntArray.h"
+#include "MathVector.h"
+// Describes one marker: its map location, its allele frequencies and the mapping between allele labels and allele numbers.
+class MarkerInfo
+ {
+ public:
+ // Chromosome number (-1 until assigned; the map loaders store 999 for chromosome X)
+ int chromosome;
+
+ // Position along chromosome in morgans
+ double position;
+ double positionMale;
+ double positionFemale;
+ // Allele frequencies, marker name, and the tables relating allele labels to allele numbers
+ Vector freq;
+ String name;
+ StringArray alleleLabels;
+ StringIntHash alleleNumbers;
+ // Creates a marker without a map location (chromosome -1, positions 0) and gives it the next serial number
+ MarkerInfo(String & string)
+ {
+ serial = count++;
+ name = string;
+ chromosome = -1;
+ position = 0.0;
+ positionMale = 0.0;
+ positionFemale = 0.0;
+ }
+
+ bool AdjustFrequencies();
+
+ static int ComparePosition(MarkerInfo ** left, MarkerInfo ** right);
+ // GetAlleleNumber maps the label "0" to 0 and looks every other label up in alleleNumbers
+ String GetAlleleLabel(int allele);
+ int GetAlleleNumber(const String & label) const
+ { return label == "0" ? 0 : alleleNumbers.Integer(label); }
+
+ int NewAllele(const String & label);
+
+ // Calling update serial for a series of markers ensures they are
+ // clustered in a particular order
+ void UpdateSerial()
+ { serial = count++; }
+
+ void IndexAlleles();
+ // Length of alleleLabels minus one, or 0 when alleleLabels is empty
+ int CountAlleles()
+ { return alleleLabels.Length() ? alleleLabels.Length() - 1 : 0; }
+
+ private:
+ // How many marker info structures have we created?
+ static int count;
+ static String label;
+
+ // When sorting markers, use serial to break ties, so
+ // markers we saw first in the map file / datafile come
+ // first
+ int serial;
+ };
+// Static registry of trait, covariate, affection and marker names and per-marker information, with loaders and writers for map and frequency files.
+class PedigreeGlobals
+ {
+ public:
+ static int traitCount;
+ static int markerCount;
+ static int affectionCount;
+ static int covariateCount;
+
+ // Should be set to true if handling X-linked data
+ static bool chromosomeX;
+ // Set to true when map file includes position info
+ // based on sex-specific recombination fractions
+ static bool sexSpecificMap;
+ // Item names, and the hashes used by the Lookup* functions below to map a name to its ID
+ static StringArray traitNames;
+ static StringArray covariateNames;
+ static StringArray affectionNames;
+ static StringArray markerNames;
+ static StringIntHash markerLookup;
+ static StringIntHash traitLookup;
+ static StringIntHash affectionLookup;
+ static StringIntHash covariateLookup;
+
+ // These functions are guaranteed to return a valid ID
+ // If no matching attribute exists, one is created
+ //
+
+ static int GetTraitID(const char * name);
+ static int GetMarkerID(const char * name);
+ static int GetCovariateID(const char * name);
+ static int GetAffectionID(const char * name);
+
+ // These functions return a matching ID or -1 if none is found
+ //
+
+ static int LookupTrait(const char * name)
+ { return traitLookup.Integer(name); }
+ static int LookupMarker(const char * name)
+ { return markerLookup.Integer(name); }
+ static int LookupCovariate(const char * name)
+ { return covariateLookup.Integer(name); }
+ static int LookupAffection(const char * name)
+ { return affectionLookup.Integer(name); }
+ // MarkerInfo records; markerInfoCount entries of markerInfo are in use (presumably markerInfoSize is the allocated capacity)
+ static int markerInfoCount;
+ static int markerInfoSize;
+ static MarkerInfo ** markerInfo;
+ static StringHash markerInfoByName;
+ static MarkerInfo ** markerInfoByInteger;
+
+ static void GrowMarkerInfo();
+ static MarkerInfo * GetMarkerInfo(String & name);
+ static MarkerInfo * GetMarkerInfo(int marker);
+
+ static int SortMarkersInMapOrder(IntArray & markers, int chromosome = -1);
+ static void GetOrderedMarkers(IntArray & markers);
+ static void FlagMissingMarkers(IntArray & missingMarkers);
+
+ static bool MarkerPositionsAvailable();
+ static bool AlleleFrequenciesAvailable();
+
+ static void VerifySexSpecificOrder();
+ // Loaders and writers come in pairs: one overload takes a filename, the other an already open file
+ static void LoadAlleleFrequencies(const char * filename, bool required = false);
+ static void LoadAlleleFrequencies(IFILE & file);
+
+ static void LoadMarkerMap(const char * filename, bool filter = false);
+ static void LoadMarkerMap(IFILE & file, bool filter = false);
+
+ static void LoadBasepairMap(const char * filename);
+ static void LoadBasepairMap(IFILE & file);
+
+ static void WriteMapFile(const char * filename);
+ static void WriteMapFile(FILE * file);
+
+ static void WriteFreqFile(const char * filename, bool old_format = false);
+ static void WriteFreqFile(FILE * file, bool old_format = false);
+ // LoadAllele returns the allele number for a label and may rewrite the label it is given
+ static int LoadAllele(int marker, String & label); // Read an allele
+ static int LoadAllele(MarkerInfo * info, String & label);
+ // Instances are counted so that the destructor of the last one can release the shared marker tables
+ PedigreeGlobals() { instanceCount++; }
+ ~PedigreeGlobals();
+
+ private:
+ static int instanceCount;
+
+ };
+
+#endif
+
+
diff --git a/libsrc/PedigreeLoader.cpp b/libsrc/PedigreeLoader.cpp
new file mode 100644
index 0000000..37b52be
--- /dev/null
+++ b/libsrc/PedigreeLoader.cpp
@@ -0,0 +1,605 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeLoader.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Pedigree.h"
+#include "FortranFormat.h"
+#include "Error.h"
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+// Loads the pedigree description (pd) from an already open stream.
+void Pedigree::Prepare(IFILE & input)
+   {
+   pd.Load(input);
+   }
+// Reads pedigree records from input, one person per line, interpreting the data columns as described by pd; Mendel-format input is handed to LoadMendel.
+void Pedigree::Load(IFILE & input)
+ {
+ if (pd.mendelFormat)
+ {
+ LoadMendel(input);
+ return;
+ }
+ // sexCovariate is the covariate index used for sex, or -1 when sex is not used as a covariate
+ int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
+ // Every line starts with five fixed columns (family, person, father, mother, sex) ahead of the data columns
+ int textCols = pd.CountTextColumns() + 5;
+ int oldCount = count;
+ bool warn = true;
+ int line = 0;
+
+ String buffer;
+ StringArray tokens;
+
+ while (!ifeof(input))
+ {
+ int field = 0;
+
+ buffer.ReadLine(input);
+
+ tokens.Clear();
+ tokens.AddTokens(buffer, SEPARATORS);
+
+ if (tokens.Length() == 0) continue;
+ if (tokens[0].SlowCompare("end") == 0) break;
+ // line counts only the non-empty lines and is used in diagnostics
+ line++;
+
+ if (tokens.Length() < textCols)
+ {
+ if (buffer.Length() > 79) // shorten long lines before quoting them in the message
+ {
+ buffer.SetLength(75);
+ buffer += " ...";
+ }
+
+ String description;
+
+ pd.ColumnSummary(description);
+ error("Loading Pedigree...\n\n"
+ "Expecting %d columns (%s),\n"
+ "but read only %d columns in line %d.\n\n"
+ "The problem line is transcribed below:\n%s\n",
+ textCols, (const char *) description,
+ tokens.Length(), line, (const char *) buffer);
+ }
+ // Extra columns are reported once only (warn is cleared below); buffer is reused for the column summary
+ if (tokens.Length() > textCols && warn && textCols > 5)
+ {
+ pd.ColumnSummary(buffer);
+ printf("WARNING -- Trailing columns in pedigree file will be ignored\n"
+ " Expecting %d data columns (%s)\n"
+ " However line %d, for example, has %d data columns\n\n",
+ textCols - 5, (const char *) buffer, line, tokens.Length() - 5);
+ warn = false;
+ }
+
+ Person * p;
+
+ // create a new person if necessary; FindPerson is consulted only when people existed before this call (oldCount != 0)
+ if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
+ {
+ if (count == size) Grow();
+
+ p = persons[count++] = new Person;
+ }
+
+ p->famid = tokens[field++]; // famid
+ p->pid = tokens[field++]; // pid
+ p->fatid = tokens[field++]; // fatid
+ p->motid = tokens[field++]; // motid
+
+ bool failure = false;
+ p->sex = TranslateSexCode(tokens[field++], failure);
+ if (failure)
+ error("Can't interpret the sex of individual #%d\n"
+ "Family: %s Individual: %s Sex Code: %s", count,
+ (const char *) p->famid, (const char *) p->pid,
+ (const char *) tokens[field-1]);
+ // A sex of 0 is stored as _NAN_ in the sex covariate
+ if (sexAsCovariate)
+ if (p->sex)
+ p->covariates[sexCovariate] = p->sex;
+ else
+ p->covariates[sexCovariate] = _NAN_;
+ // Walk the data columns; field is the index of the next unread token
+ for (int col = 0; col < pd.columnCount; col++)
+ switch ( pd.columns[col] )
+ {
+ case pcAffection :
+ {
+ int a = pd.columnHash[col];
+ int new_status;
+
+ const char * affection = tokens[field++];
+ // Letter codes map onto the numeric ones: N/U -> 1 and D/A/Y -> 2, case-insensitively
+ switch (toupper(affection[0]))
+ {
+ case '1' : case 'N' : case 'U' :
+ new_status = 1;
+ break;
+ case '2' : case 'D' : case 'A' : case 'Y' :
+ new_status = 2;
+ break;
+ default :
+ new_status = atoi(affection);
+ if (new_status < 0 || new_status > 2)
+ error("Incorrect formating for affection status "
+ "Col %d, Affection %s\n"
+ "Family: %s Individual: %s Status: %s",
+ col, (const char *) affectionNames[a],
+ (const char *) p->famid, (const char *) p->pid,
+ affection);
+ }
+ if (new_status != 0 && p->affections[a] != 0 &&
+ new_status != p->affections[a])
+ error("Conflict with previous affection status - "
+ "Col %d, Affection %s\n"
+ "Family: %s Individual: %s Old: %d New: %d",
+ col, (const char *) affectionNames[a],
+ (const char *) p->famid, (const char *) p->pid,
+ p->affections[a], new_status);
+ if (new_status) p->affections[a] = new_status; // a status of 0 never overwrites stored data
+ break;
+ }
+ case pcMarker :
+ {
+ int m = pd.columnHash[col];
+
+ Alleles new_genotype;
+ // Each marker column consumes two tokens, one per allele
+ new_genotype[0] = LoadAllele(m, tokens[field++]);
+ new_genotype[1] = LoadAllele(m, tokens[field++]);
+
+ if (p->markers[m].isKnown() && new_genotype.isKnown() &&
+ new_genotype != p->markers[m])
+ {
+ MarkerInfo * info = GetMarkerInfo(m);
+
+ error("Conflict with previous genotype - Col %d, Marker %s\n"
+ "Family: %s Individual: %s Old: %s/%s New: %s/%s",
+ col, (const char *) markerNames[m],
+ (const char *) p->famid, (const char *) p->pid,
+ (const char *) info->GetAlleleLabel(p->markers[m][0]),
+ (const char *) info->GetAlleleLabel(p->markers[m][1]),
+ (const char *) info->GetAlleleLabel(new_genotype[0]),
+ (const char *) info->GetAlleleLabel(new_genotype[1]));
+ }
+ // The genotype is stored only when new_genotype.isKnown()
+ if (new_genotype.isKnown()) p->markers[m] = new_genotype;
+ break;
+ }
+ case pcTrait :
+ case pcUndocumentedTraitCovariate :
+ {
+ int t = pd.columnHash[col];
+ double new_pheno = _NAN_;
+ // For a combined trait/covariate column the hash value packs both indices: trait * 32768 + covariate
+ if (pd.columns[col] == pcUndocumentedTraitCovariate)
+ t = t / 32768;
+
+ const char * value = tokens[field++];
+ char * flag = NULL;
+ // Values equal to the missing-value code, or not entirely numeric, remain _NAN_
+ if ( missing == (const char *) NULL || strcmp(value, missing) != 0)
+ new_pheno = strtod(value, &flag);
+ if ( flag != NULL && *flag ) new_pheno = _NAN_;
+
+ if ( p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
+ new_pheno != p->traits[t])
+ error("Conflict with previous phenotype - Col %d, Trait %s\n"
+ "Family: %s Individual: %s Old: %f New: %f",
+ col, (const char *) traitNames[t],
+ (const char *) p->famid, (const char *) p->pid,
+ p->traits[t], new_pheno);
+
+ if ( new_pheno != _NAN_) p->traits[t] = new_pheno;
+ if (pd.columns[col] == pcTrait) break; // otherwise fall through and store the value as a covariate too
+ }
+ case pcCovariate :
+ {
+ int c = pd.columnHash[col];
+ double new_covar = _NAN_;
+ // When reached by fall-through, step field back so the same token is read again as a covariate
+ if (pd.columns[col] == pcUndocumentedTraitCovariate)
+ {
+ c = c % 32768;
+ field--;
+ }
+
+ const char * value = tokens[field++];
+ char * flag = NULL;
+
+ if ( missing == (const char *) NULL || strcmp(value, missing) != 0)
+ new_covar = strtod(value, &flag);
+ if ( flag != NULL && *flag ) new_covar = _NAN_;
+
+ if ( p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
+ new_covar != p->covariates[c])
+ error("Conflict with previous value - Col %d, Covariate %s\n"
+ "Family: %s Individual: %s Old: %f New: %f",
+ col, (const char *) covariateNames[c],
+ (const char *) p->famid, (const char *) p->pid,
+ p->covariates[c], new_covar);
+
+ if ( new_covar != _NAN_) p->covariates[c] = new_covar;
+ break;
+ }
+ case pcSkip :
+ field++;
+ break;
+ case pcZygosity :
+ {
+ int new_zygosity;
+
+ const char * zygosity = tokens[field++];
+ // D/d -> 2, M/m -> 1, anything else is read as a number with atoi
+ switch (zygosity[0])
+ {
+ case 'D' : case 'd' :
+ new_zygosity = 2;
+ break;
+ case 'M' : case 'm' :
+ new_zygosity = 1;
+ break;
+ default :
+ new_zygosity = atoi(zygosity);
+ }
+ if (p->zygosity != 0 && new_zygosity != p->zygosity)
+ error("Conflict with previous zygosity - "
+ "Column %d in pedigree\n"
+ "Family: %s Individual: %s Old: %d New: %d\n",
+ col, (const char *) p->famid, (const char *) p->pid,
+ p->zygosity, new_zygosity);
+ p->zygosity = new_zygosity;
+ break;
+ }
+ case pcEnd :
+ break;
+ default :
+ error ("Inconsistent Pedigree Description -- Internal Error");
+ }
+ }
+
+ Sort();
+ }
+// Reads a Mendel-format pedigree: two FORTRAN format lines, then for each family a header record followed by one record per person.
+void Pedigree::LoadMendel(IFILE & input)
+   {
+   // First, retrieve the two format statements from file
+   String familyHeader;
+   String individualRecord;
+
+   familyHeader.ReadLine(input);
+   individualRecord.ReadLine(input);
+
+   // Then create two FORTRAN input streams...
+   // One will be used for retrieving family labels and sizes, the other
+   // will be used for individual information
+   FortranFormat headers, records;
+
+   headers.SetInputFile(input);
+   headers.SetFormat(familyHeader);
+
+   records.SetInputFile(input);
+   records.SetFormat(individualRecord);
+
+   // Storage for key pieces of information
+   String famid;
+   String phenotype;
+   String affectionCode;
+   String affectionStem;
+   int familySize;
+
+   String allele1, allele2;
+
+   int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
+
+   while (!ifeof(input))
+      {
+      // Retrieve header for next family
+      familySize = headers.GetNextInteger();
+      headers.GetNextField(famid);
+      headers.Flush();
+
+      if (famid.IsEmpty())
+         if (ifeof(input) && familySize == 0)
+            break;
+         else
+            error("Blank family id encountered\n");
+
+      // Retrieve each individual in the family
+      for (int i = 0; i < familySize; i++)
+         {
+         // Make room before every person rather than once per family: a family
+         // can hold more people than the table has free slots
+         if (count == size) Grow();
+         Person * p = persons[count++] = new Person;
+
+         // Retrieve basic pedigree structure
+         p->famid = famid;
+         records.GetNextField(p->pid);
+         records.GetNextField(p->fatid);
+         records.GetNextField(p->motid);
+
+         if (p->pid.IsEmpty())
+            error("No unique identifier for individual #%d in family %s\n",
+                  i + 1, (const char *) famid);
+
+         if (p->pid.Compare(".") == 0)
+            error("Family %s has an individual named '.', but this code is\n"
+                  "reserved to indicate missing parents\n", (const char *) famid);
+
+         if (p->fatid.IsEmpty()) p->fatid = ".";
+         if (p->motid.IsEmpty()) p->motid = ".";
+
+         // Retrieve and decode sex code
+         char sex = records.GetNextCharacter();
+
+         switch (sex)
+            {
+            case '0' : case 'x' : case 'X' : case '?' : case 0 :
+               p->sex = 0; break;
+            case '1' : case 'm' : case 'M' :
+               p->sex = 1; break;
+            case '2' : case 'f' : case 'F' :
+               p->sex = 2; break;
+            default :   // sex is a single character, so it is printed with %c
+               error("Can't interpret the sex of individual #%d\n"
+                     "Family: %s Individual: %s Sex Code: %c", count,
+                     (const char *) p->famid, (const char *) p->pid, sex);
+            };
+
+         if (sexAsCovariate)
+            if (p->sex)
+               p->covariates[sexCovariate] = p->sex;
+            else
+               p->covariates[sexCovariate] = _NAN_;
+
+         // Retrieve and decode zygosity
+         char zygosity = records.GetNextCharacter();
+
+         // Mendel uses a unique character to indicate each MZ pair,
+         // we use a unique odd number...
+         if (zygosity)
+            p->zygosity = (zygosity - ' ') * 2 - 1;
+
+         affectionStem.Clear();
+         for (int col = 0; col < pd.columnCount; col++)
+            switch ( pd.columns[col] )
+               {
+               case pcAffection :
+                  {
+                  int a = pd.columnHash[col];
+
+                  // We expand each Mendel non-codominant trait into multiple
+                  // affection status column... First, if this is not a
+                  // continuation of a previous expansion we first retrieve
+                  // and encode the affection status.
+                  if (affectionStem.Length() == 0 ||
+                      affectionNames[a].CompareToStem(affectionStem) != 0)
+                     {
+                     affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1);
+                     records.GetNextField(phenotype);
+                     affectionCode = affectionStem + phenotype;
+                     }
+
+                  // Then encode each phenotype appropriately
+                  if (phenotype.IsEmpty())
+                     p->affections[a] = 0;
+                  else
+                     p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
+
+                  break;
+                  }
+               case pcMarker :
+                  {
+                  int m = pd.columnHash[col];
+
+                  records.GetNextField(phenotype);
+
+                  if (phenotype.IsEmpty())
+                     {
+                     p->markers[m].one = p->markers[m].two = 0;
+                     continue;   // applies to the enclosing column loop
+                     }
+
+                  int separator = phenotype.FindChar('/');
+                  if (separator == -1) separator = phenotype.FindChar('|');
+
+                  if (separator == -1)
+                     error("At marker %s, person %s in family %s has genotype %s.\n"
+                           "This genotype is not in the 'al1/al2' format.\n",
+                           (const char *) markerNames[m],
+                           (const char *) p->pid,
+                           (const char *) p->famid,
+                           (const char *) phenotype);
+
+                  allele1.Copy(phenotype, 0, separator);
+                  allele1.Trim();
+
+                  allele2.Copy(phenotype, separator + 1, 8);
+                  allele2.Trim();
+
+                  MarkerInfo * info = GetMarkerInfo(m);
+
+                  int one = info->alleleNumbers.Integer(allele1);
+
+                  if (one < 0)
+                     if (info->freq.Length() == 0)
+                        one = info->NewAllele(allele1);
+                     else
+                        error("At marker %s, person %s in family %s has genotype %s.\n"
+                              "However, '%s' is not a valid allele for this marker.\n",
+                              (const char *) markerNames[m],
+                              (const char *) p->pid,
+                              (const char *) p->famid,
+                              (const char *) phenotype,
+                              (const char *) allele1);
+
+                  int two = info->alleleNumbers.Integer(allele2);
+
+                  if (two < 0)
+                     if (info->freq.Length() == 0)
+                        two = info->NewAllele(allele2);
+                     else
+                        error("At marker %s, person %s in family %s has genotype %s.\n"
+                              "However, '%s' is not a valid allele for this marker.\n",
+                              (const char *) markerNames[m],
+                              (const char *) p->pid,
+                              (const char *) p->famid,
+                              (const char *) phenotype,
+                              (const char *) allele2);
+
+                  p->markers[m].one = one;
+                  p->markers[m].two = two;
+                  break;
+                  }
+               case pcEnd :
+                  break;
+               case pcTrait :
+               case pcCovariate :
+               case pcSkip :
+               case pcZygosity :
+               default:
+                  error ("Inconsistent Pedigree Description -- Internal Error");
+               }
+
+         records.Flush();
+         }
+      }
+
+   Sort();
+   }
+// Loads the data file description(s) named by filename; a comma-separated list of names switches on multi-file merging.
+void Pedigree::Prepare(const char * filename)
+   {
+   // Clear any previously loaded pedigree description and reset the pointer, so that
+   // the single-file path below cannot leave it dangling for a later call to delete again
+   delete [] multiPd;
+   multiPd = NULL;
+   multiFileCount = 1;
+
+   // Enable multifile support
+   StringArray filenames;
+
+   filenames.AddColumns(filename, ',');
+
+   if (filenames.Length() <= 1)
+      pd.Load(filename);
+   else
+      {
+      printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
+
+      multiPd = new PedigreeDescription[filenames.Length()];
+
+      for (int i = 0; i < filenames.Length(); i++)
+         {
+         printf(" AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]);
+         multiPd[i].Load(filenames[i], false);
+         }
+
+      multiFileCount = filenames.Length();
+      }
+   }
+// Loads pedigree data from filename, or from every file of a comma-separated list when multiFileCount is greater than one.
+void Pedigree::Load(const char * filename, bool allowFailures)
+   {
+   if (multiFileCount <= 1)
+      {
+      IFILE pedFile = ifopen(filename, "rb");
+
+      if (pedFile == NULL)
+         {
+         // A file that cannot be opened is tolerated only when the caller allows it
+         if (allowFailures) return;
+         error(
+            "The pedigree file %s cannot be opened\n\n"
+            "Common causes for this problem are:\n"
+            " * You might not have used the correct options to specify input file names,\n"
+            " please check the program documentation for information on how to do this\n\n"
+            " * The file doesn't exist or the filename might have been misspelt\n\n"
+            " * The file exists but it is being used by another program which you will need\n"
+            " to close\n\n"
+            " * The file is larger than 2GB and you haven't compiled this application with\n"
+            " large file support.\n\n",
+            filename);
+         }
+
+      Load(pedFile);
+      ifclose(pedFile);
+      return;
+      }
+
+   // Multi-file mode: the i-th pedigree file is read with the i-th description in multiPd
+   StringArray filenames;
+   filenames.AddColumns(filename, ',');
+
+   if (filenames.Length() != multiFileCount)
+      error("Different numbers of comma separated data and pedigree file names provided\n");
+
+   for (int i = 0; i < filenames.Length(); i++)
+      {
+      printf(" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
+             (const char *) multiPd[i].filename, (const char *) filenames[i]);
+
+      // Make the matching description current before reading the file
+      pd = multiPd[i];
+      IFILE pedFile = ifopen(filenames[i], "rb");
+
+      if (pedFile == NULL)
+         error("The pedigree file '%s' cannot be opened\n\n",
+               (const char *) filenames[i]);
+
+      Load(pedFile);
+      ifclose(pedFile);
+      }
+
+   printf("\n");
+   }
+// Translates a sex code into 0 (x, X, ? or numeric 0), 1 (m/M) or 2 (f/F); failure is set when a numeric code lies outside 0-2.
+int Pedigree::TranslateSexCode(const char * code, bool & failure)
+   {
+   failure = false;
+
+   switch (code[0])
+      {
+      case 'x' : case 'X' : case '?' :
+         return 0;
+      case '1' : case 'm' : case 'M' :
+         return 1;
+      case '2' : case 'f' : case 'F' :
+         return 2;
+      default :
+         {
+         // This has to be an int: a bool collapses every non-zero code to 1,
+         // which hides invalid codes and misreads values such as "02"
+         int result = atoi(code);
+         if (result < 0 || result > 2)
+            {
+            failure = true;
+            result = 0;
+            }
+         return result;
+         }
+      }
+   }
+
+
+
+
diff --git a/libsrc/PedigreePerson.cpp b/libsrc/PedigreePerson.cpp
new file mode 100644
index 0000000..3a3461d
--- /dev/null
+++ b/libsrc/PedigreePerson.cpp
@@ -0,0 +1,234 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreePerson.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "PedigreePerson.h"
+#include "Constant.h"
+#include "StringArray.h"
+#include "Error.h"
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <limits.h>
+
+// Creates an unlinked person with no data: data arrays are allocated using
+// the current markerCount, traitCount, covariateCount and affectionCount,
+// traits and covariates start as _NAN_ and affections as 0.
+Person::Person()
+   {
+   // Identity and ordering information is not known yet
+   sex = 0;
+   zygosity = 0;
+   traverse = -1;
+   serial = -1;
+
+   // Storage for genotypes and phenotypes
+   markers = new Alleles [markerCount];
+   traits = new double [traitCount];
+   covariates = new double [covariateCount];
+   affections = new char [affectionCount];
+
+   for (int t = 0; t < traitCount; t++)
+      traits[t] = _NAN_;
+
+   for (int c = 0; c < covariateCount; c++)
+      covariates[c] = _NAN_;
+
+   for (int a = 0; a < affectionCount; a++)
+      affections[a] = 0;
+
+   filter = false;
+
+   // No relatives are linked yet
+   mother = NULL;
+   father = NULL;
+   sibs = NULL;
+   sibCount = 0;
+
+   // Cached status flags, refreshed by AssessStatus()
+   ngeno = 0;
+   hasAllCovariates = false;
+   hasAllAffections = false;
+   hasAllTraits = false;
+   hasBothParents = false;
+   }
+
+// Releases the data arrays; the sibling list is only freed when at least
+// one sibling was recorded.
+Person::~Person()
+   {
+   if (sibCount != 0)
+      delete [] sibs;
+
+   delete [] covariates;
+   delete [] affections;
+   delete [] traits;
+   delete [] markers;
+   }
+
+// Copies the identifiers and all phenotype / genotype data of rhs into
+// this person (see CopyIDs and CopyPhenotypes).
+void Person::Copy(Person & rhs)
+   {
+   // Identifiers first, data arrays second
+   this->CopyIDs(rhs);
+   this->CopyPhenotypes(rhs);
+   }
+
+// Copies every trait, affection, covariate and marker value of rhs,
+// together with its cached genotype count, into this person.
+void Person::CopyPhenotypes(Person & rhs)
+   {
+   int t = 0;
+   while (t < Person::traitCount)
+      {
+      traits[t] = rhs.traits[t];
+      t++;
+      }
+
+   int a = 0;
+   while (a < Person::affectionCount)
+      {
+      affections[a] = rhs.affections[a];
+      a++;
+      }
+
+   int c = 0;
+   while (c < Person::covariateCount)
+      {
+      covariates[c] = rhs.covariates[c];
+      c++;
+      }
+
+   int m = 0;
+   while (m < Person::markerCount)
+      {
+      markers[m] = rhs.markers[m];
+      m++;
+      }
+
+   ngeno = rhs.ngeno;
+   }
+
+// Resets traits and covariates to _NAN_ and affections to 0. When
+// remove_genotypes is set, both alleles of every marker are zeroed as well
+// and the cached genotype count is cleared.
+void Person::WipePhenotypes(bool remove_genotypes)
+   {
+   for (int t = 0; t < traitCount; t++)
+      traits[t] = _NAN_;
+
+   for (int c = 0; c < covariateCount; c++)
+      covariates[c] = _NAN_;
+
+   for (int a = 0; a < affectionCount; a++)
+      affections[a] = 0;
+
+   // Genotypes are kept unless the caller asked for them to go too
+   if (!remove_genotypes)
+      return;
+
+   for (int m = 0; m < markerCount; m++)
+      {
+      markers[m][1] = 0;
+      markers[m][0] = 0;
+      }
+
+   ngeno = 0;
+   }
+
+// Copies family, person, father and mother identifiers as well as the sex
+// and zygosity codes from rhs. Pointers to relatives are not touched.
+void Person::CopyIDs(Person & rhs)
+   {
+   // Codes
+   zygosity = rhs.zygosity;
+   sex = rhs.sex;
+
+   // Identifiers
+   motid = rhs.motid;
+   fatid = rhs.fatid;
+   pid = rhs.pid;
+   famid = rhs.famid;
+   }
+
+// Validates the parental links of this person and refreshes hasBothParents.
+//
+// Returns true when no parent is linked, or when both are linked with
+// sensible sex codes (a father/mother pair that is merely swapped is
+// corrected in place, ids included). Returns false, after printing a
+// message, when exactly one parent is linked or when the parental sex codes
+// remain inconsistent after the swap.
+bool Person::CheckParents()
+   {
+   hasBothParents = father != NULL && mother != NULL;
+
+   if (!hasBothParents)
+      {
+      // Neither parent linked: nothing to verify
+      if (father == NULL && mother == NULL)
+         return true;
+
+      // Exactly one parent is linked; name the one that could not be found
+      printf("Parent named %s for Person %s in Family %s is missing\n",
+             (father == NULL) ? (const char *) fatid : (const char *) motid,
+             (const char *) pid, (const char *) famid);
+      return false;
+      }
+
+   // If parents are switched around, we can fix it...
+   if (father->sex == SEX_FEMALE || mother->sex == SEX_MALE)
+      {
+      Person * formerFather = father;
+      father = mother;
+      mother = formerFather;
+
+      String formerFatid = fatid;
+      fatid = motid;
+      motid = formerFatid;
+      }
+
+   // If things still don't make sense then the problem is more serious ...
+   if (father->sex == SEX_FEMALE || mother->sex == SEX_MALE)
+      {
+      printf("Parental sex codes don't make sense for Person %s in Family %s\n",
+             (const char *) pid, (const char *) famid);
+      return false;
+      }
+
+   return true;
+   }
+
+// Recomputes the cached status used by the quick diagnostic functions:
+// hasBothParents, the number of genotyped markers (ngeno) and whether all
+// traits, covariates and affections are available.
+void Person::AssessStatus()
+   {
+   hasBothParents = father != NULL && mother != NULL;
+
+   // Count genotyped markers
+   ngeno = 0;
+   for (int m = 0; m < markerCount; m++)
+      ngeno += isGenotyped(m) ? 1 : 0;
+
+   // Each flag stays true only while no missing value has been met;
+   // scanning stops at the first missing value
+   hasAllTraits = true;
+   for (int t = 0; hasAllTraits && t < traitCount; t++)
+      hasAllTraits = isPhenotyped(t);
+
+   hasAllCovariates = true;
+   for (int c = 0; hasAllCovariates && c < covariateCount; c++)
+      hasAllCovariates = isControlled(c);
+
+   hasAllAffections = true;
+   for (int a = 0; hasAllAffections && a < affectionCount; a++)
+      hasAllAffections = isDiagnosed(a);
+   }
+
+// Arranges the two pointers so that p1 refers to the person with the
+// smaller (or equal) traverse value.
+void Person::Order(Person * & p1, Person * & p2)
+   {
+   // Already in order
+   if (p1->traverse <= p2->traverse)
+      return;
+
+   Person * later = p1;
+   p1 = p2;
+   p2 = later;
+   }
+
+// Counts the markers whose genotype is known for this person (computed on
+// demand, the cached ngeno is not consulted).
+int Person::GenotypedMarkers()
+   {
+   int known = 0;
+   int m = 0;
+
+   while (m < Person::markerCount)
+      {
+      if (markers[m].isKnown())
+         known++;
+      m++;
+      }
+
+   return known;
+   }
+
+// Reports whether the person carries any data: a non-zero cached genotype
+// count, any non-zero affection status or any trait different from _NAN_.
+// Covariates are not considered.
+bool Person::haveData()
+   {
+   if (ngeno != 0)
+      return true;
+
+   int a = 0;
+   while (a < affectionCount)
+      {
+      if (affections[a] != 0)
+         return true;
+      a++;
+      }
+
+   int t = 0;
+   while (t < traitCount)
+      {
+      if (traits[t] != _NAN_)
+         return true;
+      t++;
+      }
+
+   return false;
+   }
+
+// Returns true when this person is descendant itself (same serial) or is
+// reached by walking up descendant's maternal or paternal lines. The search
+// is cut short as soon as a candidate has a smaller traverse value than
+// this person or is a founder.
+bool Person::isAncestor(Person * descendant)
+   {
+   if (descendant->traverse < traverse)
+      return false;
+
+   if (descendant->serial == serial)
+      return true;
+
+   if (descendant->isFounder())
+      return false;
+
+   // Maternal line first, paternal line only if that fails
+   if (isAncestor(descendant->mother))
+      return true;
+
+   return isAncestor(descendant->father);
+   }
+
+
+
+
+
+
+
+
diff --git a/libsrc/PedigreePerson.h b/libsrc/PedigreePerson.h
new file mode 100644
index 0000000..9130198
--- /dev/null
+++ b/libsrc/PedigreePerson.h
@@ -0,0 +1,133 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreePerson.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __PEDPERSON_H__
+#define __PEDPERSON_H__
+
+#include "Constant.h"
+#include "PedigreeAlleles.h"
+#include "PedigreeGlobals.h"
+#include "StringArray.h"
+#include "IntArray.h"
+
+#define SEX_MALE 1  // Person::sex value for males
+#define SEX_FEMALE 2  // Person::sex value for females
+#define SEX_UNKNOWN 0  // Person::sex value when the sex is not known (isSexed() is false)
+
+class Person : public PedigreeGlobals  // One individual of a pedigree: identifiers, links to relatives and per-person data arrays
+ {
+ public:
+ String famid;  // family identifier
+ String pid;  // identifier of this person
+ String motid;  // identifier of the mother
+ String fatid;  // identifier of the father
+ int sex;  // SEX_MALE, SEX_FEMALE or SEX_UNKNOWN
+ int zygosity;  // 0 means not a twin; odd values are treated as MZ twins by isMzTwin()
+ int serial, traverse;  // both start at -1; traverse is what Order() and isAncestor() compare
+
+ Alleles * markers;  // markerCount entries, allocated by the constructor
+ double * traits;  // traitCount entries, _NAN_ when missing
+ char * affections;  // affectionCount entries, 0 when missing
+ double * covariates;  // covariateCount entries, _NAN_ when missing
+
+ Person * father;  // NULL while not linked
+ Person * mother;  // NULL while not linked
+
+ int sibCount;  // sibs is only freed by the destructor when this is non-zero
+ Person ** sibs;
+
+ int ngeno;  // cached number of genotyped markers, see AssessStatus()
+
+ bool filter;  // starts out false; NOTE(review): meaning is not visible in this file
+
+ Person();
+ ~Person();
+
+ bool isHalfSib(Person & sib)  // true when exactly one of the two parents is shared
+ {
+ return hasBothParents &&
+ ((sib.father == father) ^ (sib.mother == mother));
+ }
+
+ bool isSib(Person & sib)  // true when both parents are known and shared (also true for the person itself)
+ {
+ return hasBothParents &&
+ (sib.father == father) && (sib.mother == mother);
+ }
+
+ bool isTwin(Person & twin)  // full sibs carrying the same non-zero zygosity code
+ {
+ return (zygosity != 0) && (zygosity == twin.zygosity) && isSib(twin);
+ }
+
+ bool isMzTwin(Person & mzTwin)  // full sibs carrying the same odd zygosity code
+ {
+ return (zygosity & 1) && (zygosity == mzTwin.zygosity) && isSib(mzTwin);
+ }
+
+ // Check that both parents or none are available
+ // Verify that fathers are male and mothers are female (swapped parents are fixed in place)
+ bool CheckParents();
+
+ // Assess status before using quick diagnostics functions
+ void AssessStatus();
+
+ // Quick diagnostics (the "Fully" variants and isFounder() rely on cached flags)
+ bool isFounder()
+ { return !hasBothParents; }
+ bool isSexed()
+ { return sex != 0; }
+ bool isGenotyped(int m)
+ { return markers[m].isKnown(); }
+ bool isFullyGenotyped()
+ { return ngeno == markerCount; }
+ bool isControlled(int c)
+ { return covariates[c] != _NAN_; }
+ bool isFullyControlled()
+ { return hasAllCovariates; }
+ bool isPhenotyped(int t)
+ { return traits[t] != _NAN_; }
+ bool isFullyPhenotyped()
+ { return hasAllTraits; }
+ bool isDiagnosed(int a)
+ { return affections[a] != 0; }
+ bool isFullyDiagnosed()
+ { return hasAllAffections; }
+ bool haveData();  // any genotype, affection or trait available
+ bool isAncestor(Person * descendant);
+
+ int GenotypedMarkers();  // counts known genotypes on demand
+
+ static void Order(Person * & p1, Person * & p2);  // puts the person with the smaller traverse value in p1
+
+ void Copy(Person & rhs);  // CopyIDs followed by CopyPhenotypes
+ void CopyIDs(Person & rhs);
+ void CopyPhenotypes(Person & rhs);
+ void WipePhenotypes(bool remove_genotypes = true);
+
+ private:
+
+ bool hasAllCovariates, hasAllTraits,  // cached by AssessStatus()
+ hasAllAffections, hasBothParents;  // hasBothParents is also refreshed by CheckParents()
+ };
+
+#endif
+
+
+
+
+
diff --git a/libsrc/PedigreeTrim.cpp b/libsrc/PedigreeTrim.cpp
new file mode 100644
index 0000000..0cc0404
--- /dev/null
+++ b/libsrc/PedigreeTrim.cpp
@@ -0,0 +1,188 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeTrim.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Pedigree.h"
+
+// Prints the trimming banner the first time it is called with flag set and
+// clears flag so that later calls stay silent.
+void Pedigree::ShowTrimHeader(bool & flag)
+   {
+   // Banner was already shown
+   if (!flag)
+      return;
+
+   flag = false;
+   printf("Trimming uninformative individuals...\n");
+   }
+
+void Pedigree::Trim(bool quiet, int * informative)  // Deletes uninformative families and individuals; quiet suppresses the per-removal messages
+ {
+ int newCount = 0;
+ Person ** newPersons = new Person * [count];  // sized for the untrimmed pedigree, receives the survivors
+
+ // This function applies the following filters to reduce complexity
+ // of pedigree
+ //
+ // RULE 1: Remove all pedigrees with no genotype or phenotype data
+ // RULE 2: Remove leaf individuals with no data
+ // RULE 3: Remove founder couples with <2 offspring and no data
+
+ bool showHeader = true;
+ IntArray discardable, offspring, mates, haveData;
+
+ for (int f = 0; f < familyCount; f++)
+ {
+ Family * fam = families[f];
+
+ // Cache for storing indicators about whether each family member is
+ // informative (indexed by the person's traverse value)
+ haveData.Dimension(fam->count);
+
+ // Check that some data is available in the family
+ int hasData = false;
+ for (int i = fam->first; i <= fam->last; i++)
+ if (informative == NULL)  // no caller-supplied flags: ask each person
+ hasData |= haveData[persons[i]->traverse] = persons[i]->haveData();
+ else
+ hasData |= haveData[persons[i]->traverse] = informative[i];  // informative is indexed like persons
+
+ if (!hasData)
+ {
+ if (!quiet)
+ {
+ ShowTrimHeader(showHeader);
+ printf(" Removing family %s: No data\n", (const char *) fam->famid);
+ }
+
+ for (int i = fam->first; i <= fam->last; i++)
+ delete persons[i];  // the whole family is dropped, nobody is copied to newPersons
+
+ continue;
+ }
+
+ // Assume that we need everyone in the family
+ discardable.Dimension(fam->count);
+ discardable.Set(0);
+
+ bool trimming = true;
+
+ while (trimming)  // repeat until a full pass removes nobody
+ {
+ trimming = false;
+
+ // Tally the number of offspring for each individual
+ offspring.Dimension(fam->count);
+ offspring.Zero();
+
+ // Tally the number of mates for each individual
+ mates.Dimension(fam->count);
+ mates.Set(-1);  // -1: no mate seen yet, -2: several mates, >= 0: index of the only mate
+
+ // In the first round, we count the number of offspring
+ // for each individual in the current trimmed version of the
+ // pedigree
+ for (int i = fam->count - 1; i >= fam->founders; i--)  // non-founders only, last to first
+ {
+ if (discardable[i]) continue;
+
+ Person & p = *(persons[fam->path[i]]);
+
+ if (discardable[p.father->traverse])  // parents already trimmed: handled like a founder in the second pass
+ continue;
+
+ if (offspring[i] == 0 && !haveData[p.traverse])  // no offspring counted and no data: discard
+ {
+ trimming = true;
+ discardable[i] = true;
+ continue;
+ }
+
+ int father = p.father->traverse;
+ int mother = p.mother->traverse;
+
+ if (mates[father] == -1 && mates[mother] == -1)  // first mating seen for both parents
+ {
+ mates[father] = mother,
+ mates[mother] = father;
+ }
+ else if (mates[father] != mother)  // a different partner was recorded before: flag everyone involved
+ {
+ if (mates[father] >= 0)
+ mates[mates[father]] = -2;
+
+ if (mates[mother] >= 0)
+ mates[mates[mother]] = -2;
+
+ mates[mother] = -2;
+ mates[father] = -2;
+ }
+
+ offspring[father]++;
+ offspring[mother]++;
+ }
+
+ // In the second pass, we remove individuals with no
+ // data who are founders with a single offspring (and
+ // no multiple matings) or who have no descendants
+ for (int i = fam->count - 1; i >= 0; i--)
+ {
+ if (discardable[i]) continue;
+
+ Person & p = *(persons[fam->path[i]]);
+
+ if (p.isFounder() || discardable[p.father->traverse])
+ {
+ if (mates[i] == -2 ||  // keep: several mates
+ offspring[i] > 1 ||  // keep: more than one offspring
+ mates[i] >= fam->founders &&  // keep: the mate is a non-founder ...
+ !discardable[persons[fam->path[mates[i]]]->father->traverse] ||  // ... whose own parents are still in the pedigree
+ haveData[p.traverse] ||  // keep: has data
+ mates[i] != -1 && haveData[mates[i]])  // keep: the mate has data
+ continue;
+
+ trimming = true;
+ discardable[i] = true;
+ continue;
+ }
+ }
+ }
+
+ for (int i = fam->count - 1; i >= 0; i--)  // free discarded members, keep the rest
+ if (discardable[i])
+ {
+ if (!quiet)
+ {
+ ShowTrimHeader(showHeader);
+ printf(" Removing person %s->%s: No data\n",
+ (const char *) fam->famid,
+ (const char *) persons[fam->path[i]]->pid);
+ }
+ delete persons[fam->path[i]];
+ }
+ else
+ newPersons[newCount++] = persons[fam->path[i]];
+ }
+
+ if (!showHeader)  // the banner was printed, so finish the report with a blank line
+ printf("\n");
+
+ delete [] persons;
+
+ persons = newPersons;  // adopt the trimmed list and rebuild the pedigree structure
+ count = newCount;
+ Sort();
+ }
+
+
+
diff --git a/libsrc/PedigreeTwin.cpp b/libsrc/PedigreeTwin.cpp
new file mode 100644
index 0000000..cb7d192
--- /dev/null
+++ b/libsrc/PedigreeTwin.cpp
@@ -0,0 +1,182 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/PedigreeTwin.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Pedigree.h"
+#include "Error.h"
+
+#include <stdio.h>
+
+// Verifies that identical (MZ) twins carry compatible data and pools their
+// genotypes.
+//
+// Within each family the first member met of every MZ twin set acts as the
+// "original": genotypes missing in the original are filled in from later
+// co-twins, conflicting genotypes and differing sexes are reported, and in
+// a second pass the merged genotypes are copied to every co-twin.
+//
+// Returns true if any inconsistency was reported, false otherwise.
+bool Pedigree::TwinCheck()
+   {
+   bool fail = false;
+   IntArray mzTwins;
+
+   for (int f = 0; f < familyCount; f++)
+      {
+      mzTwins.Clear();
+
+      for (int i = families[f]->first, j; i <= families[f]->last; i++)
+         // Is this person an identical twin?
+         if (persons[i]->isMzTwin( *persons[i] ))
+            {
+            // Have we got another identical sib yet?
+            for ( j = 0; j < mzTwins.Length(); j++)
+               if ( persons[i]->isMzTwin( *persons[mzTwins[j]] ) )
+                  break;
+
+            // If not, add to list of twins
+            if (j == mzTwins.Length())
+               {
+               mzTwins.Push(i);
+               continue;
+               }
+
+            // Check that their genotypes are compatible and
+            // merge new twin's genotypes into original twin...
+            Person * original = persons[mzTwins[j]];
+            Person * twin = persons[i];
+
+            for (int m = 0; m < Person::markerCount; m++)
+               {
+               if (!original->markers[m].isKnown())
+                  original->markers[m] = twin->markers[m];
+               else if (twin->markers[m].isKnown() &&
+                        twin->markers[m] != original->markers[m])
+                  {
+                  printf("MZ Twins %s and %s in family %s have "
+                         "different %s genotypes\n",
+                         (const char *) original->pid,
+                         (const char *) twin->pid,
+                         (const char *) original->famid,
+                         (const char *) Person::markerNames[m]);
+                  fail = true;
+                  }
+               }
+
+            // Sex does not depend on the marker, so it is compared once
+            // per twin pair: this avoids repeating the message for every
+            // marker and still catches the mismatch when there are no
+            // markers at all
+            if (twin->sex != original->sex)
+               {
+               printf("MZ Twins %s and %s in family %s have "
+                      "different sexes\n",
+                      (const char *) original->pid,
+                      (const char *) twin->pid,
+                      (const char *) original->famid);
+               fail = true;
+               }
+            }
+
+      if (mzTwins.Length() == 0) continue;
+
+      // In the second pass we copy merged twin genotypes
+      // from original twin to other twins
+      for (int i = families[f]->first, j; i <= families[f]->last; i++)
+         if (persons[i]->isMzTwin( *persons[i] ))
+            {
+            // Every MZ twin matches an entry collected in the first pass
+            for ( j = 0; j < mzTwins.Length(); j++)
+               if ( persons[i]->isMzTwin( *persons[mzTwins[j]] ) )
+                  break;
+
+            // The original already holds the merged genotypes
+            if (mzTwins[j] == i) continue;
+
+            Person * original = persons[mzTwins[j]];
+            Person * twin = persons[i];
+
+            for (int m = 0; m < Person::markerCount; m++)
+               twin->markers[m] = original->markers[m];
+            }
+      }
+   return fail;
+   }
+
+// Collapses every set of identical (MZ) twins into a single individual.
+//
+// For each family the first member met of a twin set is kept: the ids of
+// its co-twins are appended to its pid (separated by '$'), it is marked
+// affected (code 2) for every affection for which a co-twin is affected,
+// the parent ids of offspring of any twin are redirected to the kept twin,
+// and the surplus co-twins are deleted before the pedigree is re-sorted.
+// Does nothing unless haveTwins is set.
+void Pedigree::MergeTwins()
+   {
+   if (!haveTwins) return;
+
+   IntArray mzTwins, surplus;
+
+   printf("Merging MZ twins into a single individual...\n");
+
+   for (int f = 0; f < familyCount; f++)
+      {
+      mzTwins.Clear();
+
+      for (int i = families[f]->first, j; i <= families[f]->last; i++)
+         if (persons[i]->isMzTwin( *persons[i] ))
+            {
+            // Have we got another identical sib yet?
+            for ( j = 0; j < mzTwins.Length(); j++)
+               if ( persons[i]->isMzTwin( *persons[mzTwins[j]] ) )
+                  break;
+
+            // If not, add to list of twins
+            if (j == mzTwins.Length())
+               {
+               mzTwins.Push(i);
+               continue;
+               }
+
+            // Pin down the kept twin before any further loop runs, so
+            // that the affection loop below cannot pick another person
+            Person * first = persons[mzTwins[j]];
+            Person * cotwin = persons[i];
+
+            // Append name to first twins name
+            first->pid += ((char) '$') + cotwin->pid;
+
+            // Set the first twin to affected if at least one of the cotwins is affected
+            for (int a = 0; a < affectionCount; a++)
+               if (cotwin->affections[a] == 2)
+                  first->affections[a] = 2;
+
+            surplus.Push(i);
+            }
+
+      // More than one representative of each twin-pair...
+      if (surplus.Length())
+         {
+         // Reassign parent names for each offspring; the last family
+         // member is included, like in every other loop over a family
+         for (int i = families[f]->first, j; i <= families[f]->last; i++)
+            if (!persons[i]->isFounder())
+               {
+               if (persons[i]->father->isMzTwin(*persons[i]->father))
+                  {
+                  for ( j = 0; j < mzTwins.Length(); j++)
+                     if (persons[i]->father->isMzTwin(*persons[mzTwins[j]]))
+                        break;
+                  persons[i]->fatid = persons[mzTwins[j]]->pid;
+                  }
+               if (persons[i]->mother->isMzTwin(*persons[i]->mother))
+                  {
+                  for ( j = 0; j < mzTwins.Length(); j++)
+                     if (persons[i]->mother->isMzTwin(*persons[mzTwins[j]]))
+                        break;
+                  persons[i]->motid = persons[mzTwins[j]]->pid;
+                  }
+               }
+
+         // Delete surplus individuals; Pop() hands back the most recently
+         // pushed (highest) index first, so earlier indices stay valid
+         while (surplus.Length())
+            {
+            int serial = surplus.Pop();
+
+            delete persons[serial];
+
+            // Close the gap left in the persons array
+            for ( count--; serial < count; serial++)
+               persons[serial] = persons[serial + 1];
+            }
+
+         // Resort pedigree
+         Sort();
+         }
+      }
+   }
+
+
+
+
+
diff --git a/libsrc/QuickIndex.cpp b/libsrc/QuickIndex.cpp
new file mode 100644
index 0000000..b3558b5
--- /dev/null
+++ b/libsrc/QuickIndex.cpp
@@ -0,0 +1,232 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/QuickIndex.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "QuickIndex.h"
+#include "Error.h"
+
+#define __QI_INVALID 0  // QuickIndex::datatype value: no source attached (set by the default constructor)
+#define __QI_VECTOR 1  // QuickIndex::datatype value: source points to a Vector
+#define __QI_INTARRAY 2  // QuickIndex::datatype value: source points to an IntArray
+#define __QI_STRINGARRAY 3  // QuickIndex::datatype value: source points to a StringArray
+
+// Creates an index that is not attached to any data yet
+QuickIndex::QuickIndex()
+   {
+   datatype = __QI_INVALID;
+   source = NULL;
+   }
+
+// Builds a sorted index into an IntArray. Only the address of source_data
+// is kept, the values themselves are not copied.
+void QuickIndex::Index(const IntArray & source_data)
+   {
+   datatype = __QI_INTARRAY;
+   source = static_cast<const void *>(&source_data);
+
+   // One index entry per source element, initialised with SetSequence()
+   // and then ordered by Sort()
+   Dimension(source_data.Length());
+   SetSequence();
+   Sort();
+   }
+
+// Builds a sorted index into a Vector. Only the address of source_data is
+// kept, the values themselves are not copied.
+void QuickIndex::Index(const Vector & source_data)
+   {
+   datatype = __QI_VECTOR;
+   source = static_cast<const void *>(&source_data);
+
+   // One index entry per source element, initialised with SetSequence()
+   // and then ordered by Sort()
+   Dimension(source_data.Length());
+   SetSequence();
+   Sort();
+   }
+
+// Builds a sorted index into a StringArray (strings are compared with
+// SlowCompare). Only the address of source_data is kept, the strings
+// themselves are not copied.
+void QuickIndex::Index(const StringArray & source_data)
+   {
+   datatype = __QI_STRINGARRAY;
+   source = static_cast<const void *>(&source_data);
+
+   // One index entry per source element, initialised with SetSequence()
+   // and then ordered by Sort()
+   Dimension(source_data.Length());
+   SetSequence();
+   Sort();
+   }
+
+// Builds a sorted index over the counts stored in a StringIntMap: entry i
+// of the map is ranked by source_data.GetCount(i).
+void QuickIndex::IndexCounts(const StringIntMap & source_data)
+   {
+   IntArray counts(source_data.Length());
+
+   for (int i = 0; i < source_data.Length(); i++)
+      counts[i] = source_data.GetCount(i);
+
+   Index(counts);
+
+   // Index() remembered the address of the local counts array, which is
+   // destroyed on return. The sorted index is complete at this point, so
+   // detach from the temporary instead of keeping a dangling pointer.
+   source = NULL;
+   datatype = __QI_INVALID;
+   }
+
+// Compares the source elements referenced by index positions i and j and
+// returns true when the first one sorts strictly before the second one.
+// Returns false when no source is attached.
+bool QuickIndex::IsBefore(int i, int j)
+   {
+   // Translate index positions into positions within the source
+   const int first = (*this)[i];
+   const int second = (*this)[j];
+
+   if (datatype == __QI_VECTOR)
+      {
+      const Vector * values = (const Vector *) source;
+      return (*values)[first] < (*values)[second];
+      }
+
+   if (datatype == __QI_INTARRAY)
+      {
+      const IntArray * values = (const IntArray *) source;
+      return (*values)[first] < (*values)[second];
+      }
+
+   if (datatype == __QI_STRINGARRAY)
+      {
+      const StringArray * values = (const StringArray *) source;
+      return (*values)[first].SlowCompare((*values)[second]) < 0;
+      }
+
+   return false;
+   }
+
+void QuickIndex::Sort()  // Orders the stored index entries according to IsBefore() with a non-recursive quicksort
+ {
+ struct __QuickIndexStack { int left, right; };  // a range of positions that still has to be sorted
+
+ if (Length() <= 1)  // nothing to order
+ return;
+
+ // Create a pseudo-stack to avoid recursion
+ __QuickIndexStack stack[32];
+
+ int stackIdx = 0;  // slot 0 is never used: the index is incremented before every store
+
+ // Ranges wider than this use median-of-three pivot selection
+ const int Threshold = 7;
+
+ // current partitions
+ int lsize, rsize;
+ int l, mid, r;
+ int scanl, scanr, pivot;
+
+ l = 0;
+ r = Length() - 1;
+
+ while (1)  // one iteration per range taken over from the start or from the stack
+ {
+ while (r > l)  // keep partitioning until the current range has at most one element
+ {
+ if (r - l > Threshold)
+ // QuickSort : median of three partitioning
+ {
+ mid = (r + l) / 2;
+
+ // sort l, mid, and r
+ if (IsBefore(mid, l))
+ Swap(mid, l);
+
+ if (IsBefore(r, l))
+ Swap(r, l);
+
+ if (IsBefore(r, mid))
+ Swap(r, mid);
+
+ // set up for partitioning: the median is parked at r - 1 and used as pivot
+ pivot = r - 1;
+
+ Swap(mid, pivot);
+
+ scanl = l + 1;
+ scanr = r - 2;
+ }
+ else
+ {
+ // small range: simply use the rightmost element as pivot
+ pivot = r;
+ scanl = l;
+ scanr = r - 1;
+ }
+
+ while (1)
+ {
+ // scan from left for element >= pivot
+ while ((scanl < r) && IsBefore(scanl, pivot))
+ ++scanl;
+
+ while ((scanr > l) && IsBefore(pivot, scanr))  // scan from right for element <= pivot
+ --scanr;
+
+ // if scans have met, we are done
+ if (scanl >= scanr)
+ break;
+
+ Swap(scanl, scanr);
+
+ if (scanl < r)
+ ++scanl;
+
+ if (scanr > l)
+ --scanr;
+ }
+
+ // Exchange final element: the pivot moves to its final position scanl
+ Swap(pivot, scanl);
+
+ // Defer the larger partition on the stack and continue with the smaller one
+ lsize = scanl - l;
+ rsize = r - scanl;
+
+ if (lsize > rsize)
+ {
+ // push the left part; stop this pass if the right part is empty
+ ++ stackIdx;
+
+ stack[stackIdx].left = l;
+ stack[stackIdx].right = scanl - 1;
+
+ if ( rsize != 0 )
+ l = scanl + 1;
+ else
+ break;
+ }
+ else
+ {
+ // push the right part; stop this pass if the left part is empty
+ ++ stackIdx;
+
+ stack[stackIdx].left = scanl + 1;
+ stack[stackIdx].right = r;
+
+ if ( lsize != 0 )
+ r = scanl - 1;
+ else
+ break;
+ }
+ }
+
+ // iterate with values from stack; an empty stack means everything is sorted
+ if (stackIdx)
+ {
+ l = stack[stackIdx].left;
+ r = stack[stackIdx].right;
+
+ --stackIdx;
+ }
+ else
+ break;
+ }
+ }
+
+
+
+
+
+
+
+
diff --git a/libsrc/QuickIndex.h b/libsrc/QuickIndex.h
new file mode 100644
index 0000000..998b465
--- /dev/null
+++ b/libsrc/QuickIndex.h
@@ -0,0 +1,52 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/QuickIndex.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __QUICKINDEX_H__
+#define __QUICKINDEX_H__
+
+#include "MathVector.h"
+#include "StringArray.h"
+#include "IntArray.h"
+#include "StringMap.h"
+
+class QuickIndex : public IntArray  // An IntArray holding positions into a source container, ordered by the source values
+ {
+ public:
+ QuickIndex();  // not attached to any data
+ QuickIndex(const IntArray & source_data)  // the converting constructors index their argument immediately
+ { Index(source_data); }
+ QuickIndex(const StringArray & source_data)
+ { Index(source_data); }
+ QuickIndex(const Vector & source_data)
+ { Index(source_data); }
+
+ void Index(const IntArray & source_data);  // Index() keeps the address of source_data, it does not copy it
+ void Index(const StringArray & source_data);
+ void Index(const Vector & source_data);
+ void IndexCounts(const StringIntMap & source_data);  // ranks map entries by their GetCount() value
+
+ private:
+ const void * source;  // container being indexed, interpreted according to datatype
+ int datatype;  // one of the __QI_* codes defined in QuickIndex.cpp
+
+ bool IsBefore(int i, int j);  // compares the source elements behind index positions i and j
+ void Sort();
+ };
+
+#endif
+
+
diff --git a/libsrc/Random.cpp b/libsrc/Random.cpp
new file mode 100644
index 0000000..6103150
--- /dev/null
+++ b/libsrc/Random.cpp
@@ -0,0 +1,407 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Random.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+
+//////////////////////////////////////////////////////////////////////////////
+// This file includes code derived from the original Mersenne Twister Code
+// by Makoto Matsumoto and Takuji Nishimura
+// and is subject to their original copyright notice copied below:
+//////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////
+// COPYRIGHT NOTICE FOR MERSENNE TWISTER CODE
+// Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The names of its contributors may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "Random.h"
+#include "MathConstant.h"
+#include "Error.h"
+
+#include <math.h>
+
+// Constants used internally by the Mersenne random number generator
+#define MERSENNE_N 624  // number of words in the state array mt
+#define MERSENNE_M 397  // offset of the second state word combined in the recurrence
+
+// constant vector a (XORed in when the lowest bit of the combined word is set)
+#define MATRIX_A 0x9908b0dfUL
+
+// most significant w-r bits (here: the top bit of a 32-bit word)
+#define UPPER_MASK 0x80000000UL
+
+// least significant r bits (here: the lower 31 bits of a 32-bit word)
+#define LOWER_MASK 0x7fffffffUL
+
+
+// Constants used internally by the Park-Miller random generator
+#define IA 16807  // multiplier
+#define IM 2147483647  // modulus
+#define AM (1.0 / IM)  // scales a value to the unit interval
+#define IQ 127773  // IM / IA, used by Schrage's method
+#define IR 2836  // IM % IA, used by Schrage's method
+#define NTAB 32  // size of the shuffle table
+#define NDIV (1+(IM-1)/NTAB)  // divisor mapping the last output to a shuffle table slot
+#define RNMX (1.0-EPS)  // largest value Next() may return; EPS is not defined in this file
+
+// Allocates the generator state for the configured algorithm and seeds it
+// with s through Reset().
+Random::Random(long s)
+   {
+#ifdef __NO_MERSENNE
+   // Park-Miller generator: only the shuffle table is needed
+   shuffler = new long [NTAB];
+#else
+   // Mersenne Twister: MERSENNE_N + 1 in mti marks the state as unseeded
+   mersenneMult = 1.0/4294967296.0;
+   mti = MERSENNE_N + 1;
+   mt = new unsigned long [MERSENNE_N];
+#endif
+   Reset(s);
+   }
+
+// Frees the state array that belongs to the configured algorithm
+Random::~Random()
+   {
+#ifdef __NO_MERSENNE
+   delete [] shuffler;
+#else
+   delete [] mt;
+#endif
+   }
+
+// Restarts the generator from seed s and discards any saved normal deviate
+void Random::Reset(long s)
+   {
+   normSaved = 0;
+
+#ifdef __NO_MERSENNE
+   // 'Continuous' Random Generator
+   seed = s;
+   if (seed < 1)
+      seed = (s == 0) ? 1 : -s;      // seed == 0 would be disastrous
+
+   // Warm up and set shuffle table: the first 8 values are thrown away,
+   // the remaining NTAB values fill the table from the top down
+   int j = NTAB + 7;
+   while (j >= 0)
+      {
+      long k = seed / IQ;
+      seed = IA * (seed - k * IQ) - IR * k;
+      if (seed < 0) seed += IM;
+      if (j < NTAB) shuffler[j] = seed;
+      j--;
+      }
+   last = shuffler[0];
+#else
+   InitMersenne(s);
+#endif
+   }
+
+// Fills the state array mt[MERSENNE_N] from seed s (only its lower 32 bits
+// are used) and leaves mti at MERSENNE_N, so that the next draw regenerates
+// the state.
+void Random::InitMersenne(unsigned long s)
+   {
+   mt[0] = s & 0xffffffffUL;
+
+   for (mti = 1; mti < MERSENNE_N; mti++)
+      {
+      unsigned long previous = mt[mti - 1];
+
+      /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+      /* In the previous versions, MSBs of the seed affect */
+      /* only MSBs of the array mt[]. */
+      /* 2002/01/09 modified by Makoto Matsumoto */
+      unsigned long next = 1812433253UL * (previous ^ (previous >> 30)) + mti;
+
+      // Keep 32 bits even where unsigned long is wider
+      mt[mti] = next & 0xffffffffUL;
+      }
+   }
+
+// Returns 1 when the next uniform deviate exceeds 0.5 and 0 otherwise
+int Random::Binary()
+   {
+   if (Next() > 0.5)
+      return 1;
+
+   return 0;
+   }
+
+#ifndef __NO_MERSENNE
+
+// Returns the next Mersenne Twister deviate as a double: the tempered
+// 32-bit output y is mapped to (y + 0.5) / 2^32.
+double Random::Next()
+   {
+   if (mti >= MERSENNE_N)
+      {
+      /* generate MERSENNE_N words at one time */
+
+      // If InitMersenne() has not been called, a default initial seed is used
+      if (mti == MERSENNE_N+1)
+         InitMersenne(5489UL);
+
+      unsigned long mixed;
+      int k = 0;
+
+      // Words whose partner mt[k + MERSENNE_M] lies further up the array
+      while (k < MERSENNE_N - MERSENNE_M)
+         {
+         mixed = (mt[k] & UPPER_MASK) | (mt[k+1] & LOWER_MASK);
+         mt[k] = mt[k+MERSENNE_M] ^ (mixed >> 1) ^ ((mixed & 0x1UL) ? MATRIX_A : 0x0UL);
+         k++;
+         }
+
+      // Words whose partner wraps around to the start of the array
+      while (k < MERSENNE_N - 1)
+         {
+         mixed = (mt[k] & UPPER_MASK) | (mt[k+1] & LOWER_MASK);
+         mt[k] = mt[k+(MERSENNE_M - MERSENNE_N)] ^ (mixed >> 1) ^ ((mixed & 0x1UL) ? MATRIX_A : 0x0UL);
+         k++;
+         }
+
+      // Last word: its successor is mt[0]
+      mixed = (mt[MERSENNE_N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
+      mt[MERSENNE_N-1] = mt[MERSENNE_M-1] ^ (mixed >> 1) ^ ((mixed & 0x1UL) ? MATRIX_A : 0x0UL);
+
+      mti = 0;
+      }
+
+   unsigned long y = mt[mti++];
+
+   // Tempering
+   y ^= (y >> 11);
+   y ^= (y << 7) & 0x9d2c5680UL;
+   y ^= (y << 15) & 0xefc60000UL;
+   y ^= (y >> 18);
+
+   return (mersenneMult * ((double) y + 0.5));
+   }
+
+// Generates a random number on [0,0xffffffff]-interval: the tempered
+// 32-bit Mersenne Twister output.
+
+unsigned long Random::NextInt()
+   {
+   if (mti >= MERSENNE_N)
+      {
+      /* generate MERSENNE_N words at one time */
+
+      // If InitMersenne() has not been called, a default initial seed is used
+      if (mti == MERSENNE_N + 1)
+         InitMersenne(5489UL);
+
+      unsigned long mixed;
+      int k = 0;
+
+      // Words whose partner mt[k + MERSENNE_M] lies further up the array
+      while (k < MERSENNE_N - MERSENNE_M)
+         {
+         mixed = (mt[k] & UPPER_MASK) | (mt[k+1] & LOWER_MASK);
+         mt[k] = mt[k+MERSENNE_M] ^ (mixed >> 1) ^ ((mixed & 0x1UL) ? MATRIX_A : 0x0UL);
+         k++;
+         }
+
+      // Words whose partner wraps around to the start of the array
+      while (k < MERSENNE_N - 1)
+         {
+         mixed = (mt[k] & UPPER_MASK) | (mt[k+1] & LOWER_MASK);
+         mt[k] = mt[k+(MERSENNE_M - MERSENNE_N)] ^ (mixed >> 1) ^ ((mixed & 0x1UL) ? MATRIX_A : 0x0UL);
+         k++;
+         }
+
+      // Last word: its successor is mt[0]
+      mixed = (mt[MERSENNE_N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
+      mt[MERSENNE_N-1] = mt[MERSENNE_M-1] ^ (mixed >> 1) ^ ((mixed & 0x1UL) ? MATRIX_A : 0x0UL);
+
+      mti = 0;
+      }
+
+   unsigned long y = mt[mti++];
+
+   // Tempering
+   y ^= (y >> 11);
+   y ^= (y << 7) & 0x9d2c5680UL;
+   y ^= (y << 15) & 0xefc60000UL;
+   y ^= (y >> 18);
+
+   return y;
+   }
+
+#else
+
+// Park-Miller generator with a shuffle table: advances the seed, swaps it
+// into the table slot selected by the previous output and returns the
+// value taken out of that slot, scaled by AM and capped at RNMX.
+double Random::Next()
+   {
+   // Compute seed = (IA * seed) % IM without overflows
+   // by Schrage's method
+   long quotient = seed / IQ;
+   seed = IA * (seed - quotient * IQ) - IR * quotient;
+   if (seed < 0)
+      seed += IM;
+
+   // Map to 0..NTAB-1
+   int slot = last/NDIV;
+
+   // Output value is shuffler[slot], which is in turn replaced by seed
+   last = shuffler[slot];
+   shuffler[slot] = seed;
+
+   // Map to 0.0 .. 1.0 excluding endpoints
+   double scaled = AM * last;
+   return (scaled > RNMX) ? RNMX : scaled;
+   }
+
+// Integer variant of the Park-Miller generator: same state update as
+// Next(), but the value taken from the shuffle table is returned unscaled.
+unsigned long Random::NextInt()
+   {
+   // Compute seed = (IA * seed) % IM without overflows
+   // by Schrage's method
+   long quotient = seed / IQ;
+   seed = IA * (seed - quotient * IQ) - IR * quotient;
+   if (seed < 0)
+      seed += IM;
+
+   // Map to 0..NTAB-1
+   int slot = last/NDIV;
+
+   // Output value is shuffler[slot], which is in turn replaced by seed
+   last = shuffler[slot];
+   shuffler[slot] = seed;
+
+   return last;
+   }
+
+#endif
+
+// Returns a normal deviate built from uniform deviates with the polar form
+// of the Box-Muller transformation. Deviates are produced in pairs: one is
+// returned immediately, the other is stored and handed out by the next
+// call.
+double Random::Normal()
+   {
+   // A deviate saved by the previous call is used up first
+   if (normSaved)
+      {
+      normSaved = 0;
+      return normStore;
+      }
+
+   double x, y, radiusSq;
+
+   // Pick two coordinates from -1 to +1 until they fall inside the unit
+   // circle (the origin is rejected as well)
+   do {
+      x = 2.0 * Next() - 1.0;
+      y = 2.0 * Next() - 1.0;
+      radiusSq = x*x + y*y;
+      } while (radiusSq >= 1.0 || radiusSq == 0.0);
+
+   double scale = sqrt(-2.0 * log(radiusSq)/radiusSq);
+
+   // Save one deviate for next time, return the other
+   normStore = x * scale;
+   normSaved = 1;
+   return y * scale;
+   }
+
+// Marks k of the n entries of array with 1 and all others with 0, every
+// entry being equally likely. When k exceeds n / 2 the n - k entries to be
+// left out are drawn instead (they get 0, everything else 1), which keeps
+// the number of draws small.
+void Random::Choose(int * array, int n, int k)
+   {
+   const bool drawExcluded = k > n / 2;
+
+   const int choices = drawExcluded ? 0 : 1;
+   const int others = drawExcluded ? 1 : 0;
+
+   if (drawExcluded)
+      k = n - k;
+
+   for (int slot = 0; slot < n; slot++)
+      array[slot] = others;
+
+   while (k > 0)
+      {
+      int pick = NextInt() % n;
+
+      // Only a slot that was not drawn before counts
+      if (array[pick] != choices)
+         {
+         array[pick] = choices;
+         k--;
+         }
+      }
+   }
+
+// Weighted variant of Choose(): marks k of the n entries of array with 1
+// and all others with 0, drawing entries with probability proportional to
+// weights[] and rejecting entries that were drawn before. When k exceeds
+// n / 2 the roles are swapped: n - k entries are drawn and marked 0 while
+// all others are set to 1.
+//
+// array and weights must both hold n elements.
+void Random::Choose(int * array, float * weights, int n, int k)
+   {
+   int choices = 1, others = 0;
+
+   if (k > n / 2)
+      {
+      choices = 0;
+      others = 1;
+      k = n - k;
+      }
+
+   // First calculate cumulative sums of weights ...
+   // cumulative[i] is the total weight of entries 0 .. i - 1, so entry i
+   // owns the interval from cumulative[i] to cumulative[i + 1]
+   float * cumulative = new float [n + 1];
+
+   cumulative[0] = 0;
+   for (int i = 1; i <= n; i++)
+      cumulative[i] = cumulative[i - 1] + weights[i - 1];
+
+   // sum tracks cumulative[n], also after the table is rebuilt below
+   float & sum = cumulative[n];
+   float reject = 0.0;
+
+   for (int i = 0; i < n; i++)
+      array[i] = others;
+
+   while (k > 0)
+      {
+      float weight = Next() * sum;
+
+      // Binary search for the entry whose interval contains weight. The
+      // search is limited to entries 0 .. n - 1 so that neither
+      // cumulative[i + 1] nor array[i] can be accessed out of bounds
+      int hi = n - 1, lo = 0, i = 0;
+
+      while (hi >= lo)
+         {
+         i = (hi + lo) / 2;
+
+         if (cumulative[i + 1] <= weight)
+            lo = i + 1;
+         else if (cumulative[i] >= weight)
+            hi = i - 1;
+         else break;
+         }
+
+      // Entry was drawn before: try again
+      if (array[i] == choices) continue;
+
+      array[i] = choices;
+      reject += weights[i];
+
+      // After selecting a substantial number of elements, update the cumulative
+      // distribution -- to ensure that at least half of our samples produce a hit
+      if (reject > sum * 0.50)
+         {
+         cumulative[0] = 0;
+
+         // cumulative[c] closes the interval of entry c - 1, so it is the
+         // state of array[c - 1] that decides whether the weight counts
+         for (int c = 1; c <= n; c++)
+            if (array[c - 1] != choices)
+               cumulative[c] = cumulative[c - 1] + weights[c - 1];
+            else
+               cumulative[c] = cumulative[c - 1];
+
+         reject = 0.0;
+         }
+
+      k--;
+      }
+
+   delete [] cumulative;
+   }
+
+Random globalRandom;  // generator instance defined at file scope, default-constructed
+
+
+
diff --git a/libsrc/Random.h b/libsrc/Random.h
new file mode 100644
index 0000000..e753a6c
--- /dev/null
+++ b/libsrc/Random.h
@@ -0,0 +1,133 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Random.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+
+//////////////////////////////////////////////////////////////////////////////
+// This file includes code derived from the original Mersenne Twister Code
+// by Makoto Matsumoto and Takuji Nishimura
+// and is subject to their original copyright notice copied below:
+//////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////
+// COPYRIGHT NOTICE FOR MERSENNE TWISTER CODE
+//
+// Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The names of its contributors may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __RANDOM_H__
+#define __RANDOM_H__
+
+// Define a quick and dirty generator
+// RAND(seed) updates seed in place (seed = seed * RANDMUL + RANDADD) and
+// evaluates to the new value masked with 0xFFFFFFFF. The argument must be a
+// modifiable lvalue; it appears twice in the expansion.
+#define RANDMUL 1664525L
+#define RANDADD 1013904223L
+
+#define RAND(seed) ((seed = seed * RANDMUL + RANDADD) & 0xFFFFFFFF)
+
+class Random
+// Implements the Mersenne Twister as default random number generator.
+// Compilation flag __NO_MERSENNE sets default generator to
+// a minimal Park-Miller with Bays-Durham shuffle and added safe guards.
+ {
+ protected:
+ // values for "minimal random values"
+ long seed;
+ long last;
+ long * shuffler;
+
+ // and for normal deviates: Normal() produces deviates in pairs and keeps
+ // the second one in normStore (normSaved != 0) for the following call
+ int normSaved;
+ double normStore;
+
+ double mersenneMult;
+
+ // Array for Mersenne state vector
+ unsigned long * mt;
+
+ // Used to signal that Mersenne state vector is not initialized
+ int mti;
+
+
+ public:
+
+ // s is the seed; every default-constructed generator uses the same one
+ Random(long s = 0x7654321);
+ ~Random();
+
+ // Next bit in series of 0s and 1s
+ int Binary();
+
+ // Next value in series, between 0 and 1
+ double Next();
+
+ // Next integer
+ unsigned long NextInt();
+
+ // Random number from N(0,1)
+ double Normal();
+
+ void Reset(long s);
+ void InitMersenne(unsigned long s);
+
+ // Random number between 0 and 1
+ operator double()
+ { return Next(); }
+
+ // Random number between arbitrary bounds
+ double Uniform(double lo = 0.0, double hi = 1.0)
+ {
+ return lo + (hi - lo) * Next();
+ }
+
+ // Flag k out of n entries of array with 1 (others 0); the second form
+ // draws entries in proportion to weights
+ void Choose(int * array, int n, int k);
+ void Choose(int * array, float * weights, int n, int k);
+
+ };
+
+extern Random globalRandom;
+
+#endif
+
+
diff --git a/libsrc/Sort.cpp b/libsrc/Sort.cpp
new file mode 100644
index 0000000..072a345
--- /dev/null
+++ b/libsrc/Sort.cpp
@@ -0,0 +1,369 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Sort.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Sort.h"
+#include "Error.h"
+
+#include <stddef.h>
+#include <string.h>
+
+
+// Helper macros for the sort and search routines below. They expand to code
+// that uses the locals base_char, width, cmp and tmp of the enclosing function.
+// Item(b)       address of element b
+// IsBefore(x,y) true when cmp orders element x strictly before element y
+// Exchange(x,y) swaps elements x and y through the scratch buffer tmp
+#define Item(b) (base_char+(b)*width)
+#define IsBefore(x,y) ((cmp(Item(x),Item(y)))<0)
+#define Exchange(x,y) {\
+ memcpy(tmp,Item(x),width);\
+ memcpy(Item(x),Item(y),width);\
+ memcpy(Item(y),tmp,width);\
+ }
+#define TRUE 1
+
+// Sorts, in place, the nelem elements of width bytes each that start at base,
+// using cmp to order them (negative result: first argument sorts first).
+// The routine is iterative: pending partitions are kept on a fixed array of
+// 128 entries and error() is called if that array fills up or if one of the
+// work buffers cannot be allocated.
+void QuickSort(void *base, size_t nelem, size_t width,
+ int (*cmp)(const void *, const void *))
+ {
+ struct __QuickSortStack { size_t left, right; };
+
+ if (nelem <= 1)
+ return;
+
+ // Create a pseudo-stack to avoid recursion
+
+ char * base_char = (char *) base;
+ const size_t stackSize = 128;
+
+ __QuickSortStack * stack = new __QuickSortStack[stackSize];
+ // scratch element used by Exchange()
+ char * tmp = new char [width];
+
+ if ((stack == NULL) || (tmp == NULL))
+ error("Out of memory in QuickSort routine");
+
+ // slot 0 is never used: entries are pushed after incrementing stackIdx
+ size_t stackIdx = 0;
+
+ // Size of minimum partition to median of three
+ const size_t Threshold = 7;
+
+ // current partitions
+
+ size_t lsize, rsize;
+ size_t l, mid, r;
+ size_t scanl, scanr, pivot;
+
+ l = 0;
+ r = nelem - 1;
+
+ while (TRUE)
+ {
+ while (r > l)
+ {
+ if (r - l > Threshold)
+ // QuickSort : median of three partitioning
+ {
+ mid = (r + l) / 2;
+
+ // sort l, mid, and r
+ if (IsBefore(mid, l))
+ Exchange(mid, l);
+
+ if (IsBefore(r, l))
+ Exchange(r, l);
+
+ if (IsBefore(r, mid))
+ Exchange(r, mid);
+
+ // set up for partitioning...
+ pivot = r - 1;
+
+ Exchange(mid, pivot);
+
+ scanl = l + 1;
+ scanr = r - 2;
+ }
+ else
+ {
+ // set up random partition -- faster
+ pivot = r;
+ scanl = l;
+ scanr = r - 1;
+ }
+
+ while (TRUE)
+ {
+ // scan from left for element >= pivot
+ while ((scanl < r) && IsBefore(scanl, pivot))
+ ++scanl;
+
+ // scan from right for element <= pivot
+ while ((scanr > l) && IsBefore(pivot, scanr))
+ --scanr;
+
+ // if scans have met, we are done
+ if (scanl >= scanr)
+ break;
+
+ Exchange(scanl, scanr);
+
+ if (scanl < r)
+ ++scanl;
+
+ if (scanr > l)
+ --scanr;
+ }
+
+ // Exchange final element
+ Exchange(pivot, scanl);
+
+ // Place largest partition on stack
+ lsize = scanl - l;
+ rsize = r - scanl;
+
+ if (lsize > rsize)
+ {
+ // if size is one we are done
+ ++ stackIdx;
+
+ if (stackIdx == stackSize)
+ error("Out of Stack in QuickSort routine");
+
+ stack[stackIdx].left = l;
+ stack[stackIdx].right = scanl - 1;
+
+ // carry on with the smaller, right-hand partition
+ if ( rsize != 0 )
+ l = scanl + 1;
+ else
+ break;
+ }
+ else
+ {
+ // if size is one we are done
+ ++ stackIdx;
+
+ if (stackIdx == stackSize)
+ error("Out of Stack in QuickSort routine");
+
+ stack[stackIdx].left = scanl + 1;
+ stack[stackIdx].right = r;
+
+ // carry on with the smaller, left-hand partition
+ if ( lsize != 0 )
+ r = scanl - 1;
+ else
+ break;
+ }
+ }
+
+ // iterate with values from stack
+ if (stackIdx)
+ {
+ l = stack[stackIdx].left;
+ r = stack[stackIdx].right;
+
+ --stackIdx;
+ }
+ else
+ break;
+ }
+
+ delete [] stack;
+ delete [] tmp;
+ }
+
+// Item2(b)       address of element b in the companion array
+// Exchange2(x,y) swaps elements x and y in both arrays; the same width is
+//                used for either array
+#define Item2(b) (base_char2+(b)*width)
+#define Exchange2(x,y) {\
+ memcpy(tmp,Item(x),width);\
+ memcpy(Item(x),Item(y),width);\
+ memcpy(Item(y),tmp,width);\
+ memcpy(tmp,Item2(x),width);\
+ memcpy(Item2(x),Item2(y),width);\
+ memcpy(Item2(y),tmp,width);\
+ }
+
+
+// Same procedure as QuickSort, except that every exchange is applied to base2
+// as well: elements are ordered by applying cmp to the elements of base, and
+// base2 is permuted alongside. Both arrays hold nelem elements of width bytes.
+void QuickSort2(void *base, void *base2, size_t nelem, size_t width,
+ int (*cmp)(const void *, const void *))
+ {
+ struct __QuickSortStack { size_t left, right; };
+
+ if (nelem <= 1)
+ return;
+
+ // Create a pseudo-stack to avoid recursion
+
+ char * base_char = (char *) base;
+ char * base_char2 = (char *) base2;
+ const size_t stackSize = 128;
+
+ __QuickSortStack * stack = new __QuickSortStack[stackSize];
+ // scratch element shared by both halves of Exchange2()
+ char * tmp = new char [width];
+
+ if ((stack == NULL) || (tmp == NULL))
+ error("Out of memory in QuickSort routine");
+
+ // slot 0 is never used: entries are pushed after incrementing stackIdx
+ size_t stackIdx = 0;
+
+ // Size of minimum partition to median of three
+ const size_t Threshold = 7;
+
+ // current partitions
+
+ size_t lsize, rsize;
+ size_t l, mid, r;
+ size_t scanl, scanr, pivot;
+
+ l = 0;
+ r = nelem - 1;
+
+ while (TRUE)
+ {
+ while (r > l)
+ {
+ if (r - l > Threshold)
+ // QuickSort : median of three partitioning
+ {
+ mid = (r + l) / 2;
+
+ // sort l, mid, and r
+ if (IsBefore(mid, l))
+ Exchange2(mid, l);
+
+ if (IsBefore(r, l))
+ Exchange2(r, l);
+
+ if (IsBefore(r, mid))
+ Exchange2(r, mid);
+
+ // set up for partitioning...
+ pivot = r - 1;
+
+ Exchange2(mid, pivot);
+
+ scanl = l + 1;
+ scanr = r - 2;
+ }
+ else
+ {
+ // set up random partition -- faster
+ pivot = r;
+ scanl = l;
+ scanr = r - 1;
+ }
+
+ while (TRUE)
+ {
+ // scan from left for element >= pivot
+ while ((scanl < r) && IsBefore(scanl, pivot))
+ ++scanl;
+
+ // scan from right for element <= pivot
+ while ((scanr > l) && IsBefore(pivot, scanr))
+ --scanr;
+
+ // if scans have met, we are done
+ if (scanl >= scanr)
+ break;
+
+ Exchange2(scanl, scanr);
+
+ if (scanl < r)
+ ++scanl;
+
+ if (scanr > l)
+ --scanr;
+ }
+
+ // Exchange final element
+ Exchange2(pivot, scanl);
+
+ // Place largest partition on stack
+ lsize = scanl - l;
+ rsize = r - scanl;
+
+ if (lsize > rsize)
+ {
+ // if size is one we are done
+ ++ stackIdx;
+
+ if (stackIdx == stackSize)
+ error("Out of Stack in QuickSort routine");
+
+ stack[stackIdx].left = l;
+ stack[stackIdx].right = scanl - 1;
+
+ // carry on with the smaller, right-hand partition
+ if ( rsize != 0 )
+ l = scanl + 1;
+ else
+ break;
+ }
+ else
+ {
+ // if size is one we are done
+ ++ stackIdx;
+
+ if (stackIdx == stackSize)
+ error("Out of Stack in QuickSort routine");
+
+ stack[stackIdx].left = scanl + 1;
+ stack[stackIdx].right = r;
+
+ // carry on with the smaller, left-hand partition
+ if ( lsize != 0 )
+ r = scanl - 1;
+ else
+ break;
+ }
+ }
+
+ // iterate with values from stack
+ if (stackIdx)
+ {
+ l = stack[stackIdx].left;
+ r = stack[stackIdx].right;
+
+ --stackIdx;
+ }
+ else
+ break;
+ }
+
+ delete [] stack;
+ delete [] tmp;
+ }
+
+// Looks up key among the nelem elements of width bytes each stored at base,
+// which must already be ordered consistently with cmp. cmp is called as
+// cmp(key, element). Returns the address of an element that compares equal
+// to key, or NULL when there is none (including when nelem is 0).
+void * BinarySearch(const void *key, const void *base,
+                    size_t nelem, size_t width,
+                    int (*cmp)(const void *, const void *))
+{
+    char * base_char = (char *) base;   // referenced by Item()
+
+    // Candidates are the indices in [lo, hi). Keeping the bounds in size_t
+    // avoids truncating nelem to int and overflowing the midpoint sum.
+    size_t lo = 0;
+    size_t hi = nelem;
+
+    while (lo < hi)
+    {
+        // Midpoint of the closed range [lo, hi - 1], rounded down
+        size_t probe = lo + (hi - lo - 1) / 2;
+        int test = cmp(key, Item(probe));
+
+        if (test == 0)
+            return (void *) Item(probe);
+
+        if (test < 0)
+            hi = probe;
+        else
+            lo = probe + 1;
+    }
+
+    return NULL;
+}
+
+
diff --git a/libsrc/Sort.h b/libsrc/Sort.h
new file mode 100644
index 0000000..fce7871
--- /dev/null
+++ b/libsrc/Sort.h
@@ -0,0 +1,36 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/Sort.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __SORT_H__
+#define __SORT_H__
+
+#include "Constant.h"
+
+#include <stddef.h>
+
+// Sorts nelem elements of width bytes each, in place, ordered by cmp
+void QuickSort(void *base, size_t nelem, size_t width,
+ int (*cmp)(const void *, const void *));
+
+// As QuickSort, but applies every exchange to base2 as well; both arrays
+// use the same element width
+void QuickSort2(void *base, void * base2, size_t nelem, size_t width,
+ int (*cmp)(const void *, const void *));
+
+// Searches a sorted array for key; returns the matching element or NULL
+void * BinarySearch(const void *key, const void *base,
+ size_t nelem, size_t width,
+ int (*cmp)(const void *, const void *));
+
+#endif
+
diff --git a/libsrc/StringArray.cpp b/libsrc/StringArray.cpp
new file mode 100644
index 0000000..aa413c9
--- /dev/null
+++ b/libsrc/StringArray.cpp
@@ -0,0 +1,325 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringArray.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "StringArray.h"
+#include "Sort.h"
+#include "Error.h"
+
+// Granularity (in pointers) used when sizing the strings array
+int StringArray::alloc = 32;
+
+// Creates an array of startsize empty strings. The pointer table is sized to
+// the next multiple of alloc above startsize.
+StringArray::StringArray(int startsize)
+{
+    size    = (startsize + alloc) / alloc * alloc;
+    strings = new String * [size];
+    count   = startsize;
+
+    int filled = 0;
+    while (filled < count)
+        strings[filled++] = new String;
+}
+
+// Copy constructor: duplicates every string held by rhs
+StringArray::StringArray(StringArray & rhs)
+{
+    size    = (rhs.count + alloc) / alloc * alloc;
+    strings = new String * [size];
+    count   = rhs.count;
+
+    int copied = 0;
+    while (copied < count)
+    {
+        strings[copied] = new String(rhs[copied]);
+        ++copied;
+    }
+}
+
+// Releases every owned string, then the pointer table itself
+StringArray::~StringArray()
+{
+    int index = 0;
+    while (index < count)
+    {
+        delete strings[index];
+        ++index;
+    }
+
+    delete [] strings;
+}
+
+// Total number of characters over all strings in the array
+int StringArray::CharLength()
+{
+    int total = 0;
+    int index = count;
+
+    while (index-- > 0)
+        total += strings[index]->Length();
+
+    return total;
+}
+
+// Appends the lines of the named file to the array; does nothing when the
+// file cannot be opened
+void StringArray::Read(const char * filename)
+{
+    IFILE input = ifopen(filename, "rb");
+
+    if (input != NULL)
+    {
+        Read(input);
+        ifclose(input);
+    }
+}
+
+// Writes the array, one string per line, to the named file; does nothing
+// when the file cannot be opened
+void StringArray::Write(const char * filename)
+{
+    FILE * output = fopen(filename, "wt");
+
+    if (output != NULL)
+    {
+        Write(output);
+        fclose(output);
+    }
+}
+
+// Writes the array as a single tab-separated line to the named file; does
+// nothing when the file cannot be opened
+void StringArray::WriteLine(const char * filename)
+{
+    FILE * output = fopen(filename, "wt");
+
+    if (output != NULL)
+    {
+        WriteLine(output);
+        fclose(output);
+    }
+}
+
+// Appends one string per ReadLine() call until feof(f) reports end of file.
+// The end-of-file test happens before each read, so a line is always
+// attempted when the stream is not yet flagged.
+void StringArray::Read(FILE * f)
+{
+    for ( ; !feof(f); ++count)
+    {
+        Grow(count + 1);
+        strings[count] = new String;
+        strings[count]->ReadLine(f);
+    }
+}
+
+// Writes each string on a line of its own
+void StringArray::Write(FILE * f)
+{
+    int index = 0;
+
+    while (index < count)
+    {
+        strings[index]->WriteLine(f);
+        ++index;
+    }
+}
+
+// Writes all strings on one line: tabs between them, newline after the last
+void StringArray::WriteLine(FILE * f)
+{
+    const int lastIndex = count - 1;
+
+    for (int index = 0; index <= lastIndex; ++index)
+    {
+        char separator = (index == lastIndex) ? '\n' : '\t';
+        fprintf(f, "%s%c", (const char *) (*strings[index]), separator);
+    }
+}
+
+#ifdef __ZLIB_AVAILABLE__
+// IFILE counterpart of Read(FILE *): appends one string per line until
+// ifeof(f) reports end of file
+void StringArray::Read(IFILE & f)
+{
+    for ( ; !ifeof(f); ++count)
+    {
+        Grow(count + 1);
+        strings[count] = new String;
+        strings[count]->ReadLine(f);
+    }
+}
+#endif
+
+// Ensures that the pointer table has more than newsize slots. Requests of at
+// least twice the current capacity are rounded up to a multiple of alloc;
+// smaller ones pick the first alloc * 2^k that exceeds newsize.
+void StringArray::Grow(int newsize)
+{
+    if (newsize < size)
+        return;
+
+    if ((newsize >> 1) >= size)
+        size = (newsize + alloc) / alloc * alloc;
+    else
+        for (size = alloc; size <= newsize; size *= 2)
+            ;
+
+    String ** enlarged = new String * [size];
+
+    for (int index = 0; index < count; ++index)
+        enlarged[index] = strings[index];
+
+    delete [] strings;
+    strings = enlarged;
+}
+
+// Deletes every string and resets the element count; the pointer table
+// keeps its capacity
+void StringArray::Clear()
+{
+    int index = 0;
+
+    while (index < count)
+        delete strings[index++];
+
+    count = 0;
+}
+
+// Splits s at every occurrence of ch and appends each field, empty ones
+// included, to the array. Returns the new array length.
+int StringArray::AddColumns(const String & s, char ch)
+{
+    int cursor = 0;
+
+    while (cursor <= s.Length())
+    {
+        const int fieldStart = cursor;
+
+        // Field ends at the next delimiter or, failing that, at the end of s
+        cursor = s.FindChar(ch, cursor);
+        if (cursor == -1)
+            cursor = s.Length();
+
+        Grow(count + 1);
+        strings[count++] = new String(s.Mid(fieldStart, cursor - 1));
+
+        ++cursor;   // step over the delimiter
+    }
+
+    return count;
+}
+
+// Appends the tokens of s that are separated by runs of ch; empty tokens are
+// never produced. Returns the new array length.
+int StringArray::AddTokens(const String & s, char ch)
+{
+    int cursor = 0;
+
+    while (cursor < s.Length())
+    {
+        // Skip the separators in front of the token
+        while (cursor < s.Length() && s[cursor] == ch)
+            ++cursor;
+
+        const int tokenStart = cursor;
+
+        // Advance to the first separator after the token
+        while (cursor < s.Length() && s[cursor] != ch)
+            ++cursor;
+
+        if (tokenStart < s.Length())
+        {
+            Grow(count + 1);
+            strings[count++] = new String(s.Mid(tokenStart, cursor - 1));
+        }
+
+        ++cursor;
+    }
+
+    return count;
+}
+
+// Appends the tokens of s that are delimited by runs of any character found
+// in separators; empty tokens are never produced. Returns the new array length.
+int StringArray::AddTokens(const String & s, const String & separators)
+{
+    int cursor = 0;
+
+    while (cursor < s.Length())
+    {
+        // Skip the separators in front of the token
+        while (cursor < s.Length() && separators.FindChar(s[cursor]) != -1)
+            ++cursor;
+
+        const int tokenStart = cursor;
+
+        // Advance to the first separator after the token
+        while (cursor < s.Length() && separators.FindChar(s[cursor]) == -1)
+            ++cursor;
+
+        if (tokenStart < s.Length())
+        {
+            Grow(count + 1);
+            strings[count++] = new String(s.Mid(tokenStart, cursor - 1));
+        }
+
+        ++cursor;
+    }
+
+    return count;
+}
+
+// Resizes the array to newcount elements: new slots receive empty strings,
+// surplus strings are deleted. Returns the resulting length.
+int StringArray::Dimension(int newcount)
+{
+    if (newcount == count)
+        return count;
+
+    if (newcount > count)
+    {
+        Grow(newcount);
+
+        int slot = count;
+        while (slot < newcount)
+            strings[slot++] = new String;
+    }
+    else
+    {
+        int slot = newcount;
+        while (slot < count)
+            delete strings[slot++];
+    }
+
+    count = newcount;
+    return count;
+}
+
+// Index of the first element equal to s (operator ==), or -1
+int StringArray::Find(const String & s) const
+{
+    int index = 0;
+
+    while (index < count)
+    {
+        if (*(strings[index]) == s)
+            return index;
+        ++index;
+    }
+
+    return -1;
+}
+
+// Index of the first element for which FastCompare(s) is 0, or -1
+int StringArray::FastFind(const String & s) const
+{
+    int index = 0;
+
+    while (index < count)
+    {
+        if (strings[index]->FastCompare(s) == 0)
+            return index;
+        ++index;
+    }
+
+    return -1;
+}
+
+// Index of the first element for which SlowCompare(s) is 0, or -1
+int StringArray::SlowFind(const String & s) const
+{
+    int index = 0;
+
+    while (index < count)
+    {
+        if (strings[index]->SlowCompare(s) == 0)
+            return index;
+        ++index;
+    }
+
+    return -1;
+}
+
+// Appends a copy of s and returns the new array length
+int StringArray::Add(const String & s)
+{
+    Grow(count + 1);
+    strings[count++] = new String(s);
+    return count;
+}
+
+// Inserts a copy of s at position, moving later elements up by one slot
+void StringArray::InsertAt(int position, const String & s)
+{
+    Grow(count + 1);
+
+    int slot = count;
+    while (slot > position)
+    {
+        strings[slot] = strings[slot - 1];
+        --slot;
+    }
+
+    strings[position] = new String(s);
+    ++count;
+}
+
+String & StringArray::Last() const
+ {
+ if (!count) error("StringArray: Null String Access");
+ return *(strings[count - 1]);
+ }
+
+// Removes the element at index, moving later elements down by one slot
+void StringArray::Delete(int index)
+{
+    delete strings[index];
+    --count;
+
+    while (index < count)
+    {
+        strings[index] = strings[index + 1];
+        ++index;
+    }
+}
+
+// Replaces the contents of this array with copies of the strings in rhs
+StringArray & StringArray::operator = (const StringArray & rhs)
+{
+    // Clear() would destroy the strings we are about to copy, so assigning
+    // an array to itself must leave it untouched
+    if (this == &rhs)
+        return *this;
+
+    Clear();
+
+    for (int i = 0; i < rhs.count; i++)
+        Push(*rhs.strings[i]);
+
+    return *this;
+}
+
+// Two arrays are equal when they have the same length and all corresponding
+// strings compare equal
+bool StringArray::operator == (const StringArray & rhs)
+{
+    if (count != rhs.count)
+        return false;
+
+    int index = 0;
+    while (index < rhs.count)
+    {
+        if (*strings[index] != *rhs.strings[index])
+            return false;
+        ++index;
+    }
+
+    return true;
+}
+
+// Orders the strings with QuickSort; only the pointers are rearranged
+void StringArray::Sort()
+{
+    QuickSort(strings, count, sizeof(*strings), ComparisonForSort);
+}
+
+// QuickSort callback: a and b point at String * slots of the pointer table
+int StringArray::ComparisonForSort(const void * a, const void * b)
+{
+    String ** lhsSlot = (String **) a;
+    String ** rhsSlot = (String **) b;
+
+    return Compare(**lhsSlot, **rhsSlot);
+}
+
+// Removes the final element and returns a copy of it. As with Last(), calling
+// this on an empty array is reported through error() instead of reading the
+// slot in front of the table.
+String StringArray::Pop()
+{
+    if (!count) error("StringArray: Null String Access");
+
+    String result = *(strings[count - 1]);
+
+    Dimension(count - 1);
+
+    return result;
+}
+
+// Applies String::Trim() to every element
+void StringArray::Trim()
+{
+    int index = 0;
+
+    while (index < count)
+    {
+        strings[index]->Trim();
+        ++index;
+    }
+}
+
+// Prints every string on a line of its own to standard output
+void StringArray::Print()
+{
+    int index = 0;
+
+    while (index < count)
+    {
+        printf("%s\n", (const char *) (*strings[index]));
+        ++index;
+    }
+}
+
+// Prints all strings on one line of standard output: tabs between them,
+// newline after the last
+void StringArray::PrintLine()
+{
+    const int lastIndex = count - 1;
+
+    for (int index = 0; index <= lastIndex; ++index)
+    {
+        char separator = (index == lastIndex) ? '\n' : '\t';
+        printf("%s%c", (const char *) (*strings[index]), separator);
+    }
+}
+
diff --git a/libsrc/StringArray.h b/libsrc/StringArray.h
new file mode 100644
index 0000000..9c2698e
--- /dev/null
+++ b/libsrc/StringArray.h
@@ -0,0 +1,118 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringArray.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __STRING_ARRAY_H__
+#define __STRING_ARRAY_H__
+
+#include "StringBasics.h"
+
+// Growable array of heap-allocated String objects. The array owns its
+// strings: they are created by the members that add elements and deleted
+// when elements are removed or the array is destroyed.
+class StringArray
+ {
+ protected:
+ // strings: table of owned String pointers
+ // size:    number of slots in the table
+ // count:   number of slots in use
+ String ** strings;
+ int size, count;
+
+ public:
+ // Allocation granularity for the pointer table
+ static int alloc;
+
+ StringArray(int startsize = 0);
+ StringArray(StringArray & original);
+ virtual ~StringArray();
+
+ // Each line in a file is parsed into a separate array element
+ //
+
+ void Read(FILE * f);
+ void Write(FILE * f);
+ void WriteLine(FILE * f);
+ void Read(const char * filename);
+ void Write(const char * filename);
+ void WriteLine(const char * filename);
+
+#ifdef __ZLIB_AVAILABLE__
+ void Read(IFILE & f);
+#endif
+
+ // Write all strings to the screen
+ void Print();
+ void PrintLine();
+
+ // Grow enlarges the pointer table; Clear deletes all strings
+ void Grow(int newsize);
+ void Clear();
+
+ int Length() const { return count; }
+ int Dimension(int newcount);
+ int CharLength();
+
+ // Element access; the index is not range checked
+ String & operator [] (int i) { return *(strings[i]); }
+ const String & operator [] (int i) const { return *(strings[i]); }
+
+ // These functions divide a string into tokens and append these to the
+ // array. Return value is the new array length
+ //
+
+ int AddColumns(const String & s, char ch = '\t');
+ int AddTokens(const String & s, char ch);
+ int AddTokens(const String & s, const String & separators = " \t\r\n");
+
+ int ReplaceColumns(const String & s, char ch = '\t')
+ { Clear(); return AddColumns(s, ch); }
+ int ReplaceTokens(const String & s, const String & separators = " \t\r\n")
+ { Clear(); return AddTokens(s, separators); }
+
+ // These functions add, insert or remove a single array element
+ //
+
+ int Add(const String & s);
+ void InsertAt(int position, const String & s);
+ void Delete(int position);
+
+ // These functions manipulate a string as a stack
+ //
+
+ String & Last() const;
+ int Push(const String & s) { return Add(s); }
+ String Pop();
+
+ // Linear search (N/2 comparisons on average) for a single element
+ // If searching is required, StringMaps are a better option
+ //
+
+ int Find(const String & s) const;
+ int FastFind(const String & s) const;
+ int SlowFind(const String & s) const;
+
+ // Alphabetically orders strings
+ //
+ void Sort();
+
+ // Trims strings to remove whitespace
+ void Trim();
+
+ StringArray & operator = (const StringArray & rhs);
+
+ bool operator == (const StringArray & rhs);
+ bool operator != (const StringArray & rhs)
+ { return !(*this == rhs); }
+
+ private:
+ // Comparison callback handed to QuickSort by Sort()
+ static int ComparisonForSort(const void * a, const void * b);
+ };
+
+#endif
+
+
diff --git a/libsrc/StringBasics.cpp b/libsrc/StringBasics.cpp
new file mode 100644
index 0000000..0929e17
--- /dev/null
+++ b/libsrc/StringBasics.cpp
@@ -0,0 +1,1255 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringBasics.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "StringBasics.h"
+#include "Error.h"
+#include "Constant.h"
+#include "MathConstant.h"
+
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+
+// Swaps two int lvalues. The body now uses the macro parameters; it used to
+// name a and b directly and so only worked for variables with those names.
+#define SWP(A,B) {int tmp=(A); (A)=(B); (B)=tmp;}
+
+#ifdef _MSC_VER
+#ifndef snprintf
+#define vsnprintf _vsnprintf
+#define snprintf _snprintf
+#endif
+#endif
+
+// If natural ordering is defined, comparisons will
+// order strings including numbers correctly
+// (eg, ... "8", "9", "10" ...) rather than using
+// ASCII ordering (... "10", "8", "9", ...)
+#define NATURAL_ORDERING 1
+
+// alloc: granularity (in bytes) used when sizing string buffers
+// caseSensitive: when true Compare() uses FastCompare, otherwise SlowCompare
+int String::alloc = 8;
+bool String::caseSensitive = true;
+
+// Allocates a buffer with room for more than startsize characters (rounded to
+// a multiple of alloc) and initializes it to the empty string
+void String::NewString(int startsize)
+{
+    size   = (startsize + alloc) / alloc * alloc;
+    buffer = new char [size];
+
+    buffer[0] = 0;
+    len       = 0;
+}
+
+// Builds a string from a C string; NULL yields the empty string
+String::String(const char * s)
+{
+    int clen = 0;
+    if (s != NULL)
+        clen = strlen(s);
+
+    NewString(clen);
+
+    if (clen == 0)
+        return;
+
+    len = clen;
+    memcpy(buffer, s, len + 1);
+}
+
+// Builds a string made of count copies of ch
+String::String(char ch, int count)
+{
+    NewString(count);
+
+    memset(buffer, ch, count);
+    len = count;
+    buffer[len] = 0;
+}
+
+// Copy constructor: allocates a buffer of its own and copies the characters
+// of s together with the terminator
+String::String(const String & s)
+{
+    size   = (s.len + alloc) / alloc * alloc;
+    buffer = new char [size];
+    len    = s.len;
+
+    memcpy(buffer, s.buffer, len + 1);
+}
+
+// Ensures that the buffer holds more than newSize bytes, preserving the
+// current contents. Requests of at least twice the current size are rounded
+// up to a multiple of alloc; smaller ones pick the first alloc * 2^k that
+// exceeds newSize.
+void String::Grow(int newSize)
+{
+    if (newSize < size)
+        return;
+
+    if ((newSize >> 1) >= size)
+        size = (newSize + alloc) / alloc * alloc;
+    else
+        for (size = alloc; size <= newSize; size *= 2)
+            ;
+
+    char * enlarged = new char [size];
+    memcpy(enlarged, buffer, len + 1);
+
+    delete [] buffer;
+    buffer = enlarged;
+}
+
+// Exchanges buffer, capacity and length with s; no characters are copied
+void String::Swap(String & s)
+{
+    char * const heldBuffer = buffer;
+    const int    heldSize   = size;
+    const int    heldLen    = len;
+
+    buffer = s.buffer;
+    size   = s.size;
+    len    = s.len;
+
+    s.buffer = heldBuffer;
+    s.size   = heldSize;
+    s.len    = heldLen;
+}
+
+// Makes this string a copy of s
+String & String::Copy(const String & s)
+{
+    const int incoming = s.len;
+
+    Grow(incoming);
+    len = incoming;
+    memcpy(buffer, s.buffer, len + 1);
+
+    return *this;
+}
+
+// Makes this string a copy of at most n characters of s, starting at index
+// start. The result is empty when start lies beyond the end of s, and n is
+// shortened when fewer than n characters remain.
+String & String::Copy(const String & s, int start, int n)
+{
+    if (s.len <= start) return Clear();
+    if (s.len < start + n) n = s.len - start;
+    Grow(n);
+    // s may be this very string, in which case source and destination
+    // overlap -- memmove is defined for that case, memcpy is not
+    memmove(buffer, s.buffer + start, n);
+    buffer[len = n] = 0;
+    return *this;
+}
+
+// Makes this string a copy of the C string s; NULL yields the empty string
+String & String::Copy(const char * s)
+{
+    if (s != NULL)
+    {
+        const int clen = strlen(s);
+
+        Grow(clen);
+        len = clen;
+        memcpy(buffer, s, len + 1);
+        return *this;
+    }
+
+    buffer[0] = 0;
+    len = 0;
+    return *this;
+}
+
+// Converts the string to upper case in place and returns it
+String & String::ToUpper()
+{
+    // <ctype.h> functions require a value representable as unsigned char
+    // (or EOF); a plain char may be negative, hence the cast
+    for (int i = 0; i < len; i++)
+        buffer[i] = (char) toupper((unsigned char) buffer[i]);
+    return *this;
+}
+
+// Converts the string to lower case in place and returns it
+String & String::ToLower()
+{
+    // <ctype.h> functions require a value representable as unsigned char
+    // (or EOF); a plain char may be negative, hence the cast
+    for (int i = 0; i < len; i++)
+        buffer[i] = (char) tolower((unsigned char) buffer[i]);
+    return *this;
+}
+
+// Returns an upper-case copy; this string is left unchanged
+String String::AsUpper()
+{
+    String upper;
+
+    upper = *this;
+    upper.ToUpper();
+
+    return upper;
+}
+
+// Returns a lower-case copy; this string is left unchanged
+String String::AsLower()
+{
+    String lower;
+
+    lower = *this;
+    lower.ToLower();
+
+    return lower;
+}
+
+// Returns a copy whose first character is converted to upper case
+String String::Capitalize()
+{
+    String temp;
+    temp = *this;
+    // cast first: toupper() is only defined for unsigned char values and EOF
+    temp.buffer[0] = (char) toupper((unsigned char) temp.buffer[0]);
+    return temp;
+}
+
+// Assignment from another String; delegates to Copy()
+String & String::operator = (const String & rhs)
+{
+    return Copy(rhs);
+}
+
+// Assignment from a C string (NULL gives the empty string); delegates to Copy()
+String & String::operator = (const char * rhs)
+{
+    return Copy(rhs);
+}
+
+// Appends rhs to this string. rhs may be this very string (s += s).
+String & String::operator += (const String & rhs)
+{
+    // Read the length first: when rhs aliases *this it changes below
+    const int extra = rhs.len;
+
+    Grow(len + extra);
+
+    // Copy the characters only and terminate explicitly. For a self-append
+    // the source terminator sits where the first copied byte goes, so copying
+    // extra + 1 bytes would overlap and could leave the result unterminated.
+    memcpy(buffer + len, rhs.buffer, extra);
+    len += extra;
+    buffer[len] = 0;
+
+    return *this;
+}
+
+// Appends a C string; NULL leaves the string unchanged
+String & String::operator += (const char * rhs)
+{
+    if (rhs == NULL)
+        return *this;
+
+    const int clen = strlen(rhs);
+
+    Grow(len + clen);
+    memcpy(buffer + len, rhs, clen + 1);
+    len += clen;
+
+    return *this;
+}
+
+// Returns the concatenation of this string and rhs
+String String::operator + (const String & rhs) const
+{
+    String result(len + rhs.len);
+
+    // second part first: the two target ranges do not overlap
+    memcpy(result.buffer + len, rhs.buffer, rhs.len + 1);
+    memcpy(result.buffer, buffer, len);
+    result.len = len + rhs.len;
+
+    return result;
+}
+
+// Returns the concatenation of this string and a C string; with NULL the
+// result is a copy of this string
+String String::operator + (const char * rhs) const
+{
+    if (rhs == NULL)
+        return *this;
+
+    int clen = strlen(rhs);
+    String result(len + clen);
+
+    memcpy(result.buffer + len, rhs, clen + 1);
+    memcpy(result.buffer, buffer, len);
+    result.len = len + clen;
+
+    return result;
+}
+
+// Assignment from a single character; the NUL character gives the empty string
+String & String::operator = (char ch)
+{
+    if (!ch)
+    {
+        buffer[0] = 0;
+        len = 0;
+        return *this;
+    }
+
+    Grow(1);
+    buffer[0] = ch;
+    buffer[1] = 0;
+    len = 1;
+
+    return *this;
+}
+
+// Appends a single character; the NUL character is ignored
+String & String::operator += (char ch)
+{
+    if (!ch)
+        return *this;
+
+    Grow(len + 1);
+    buffer[len] = ch;
+    len++;
+    buffer[len] = 0;
+
+    return *this;
+}
+
+// Returns a copy of this string with ch appended
+String String::operator + (char ch) const
+{
+    String joined(*this);
+
+    joined += ch;
+
+    return joined;
+}
+
+// Assigns the decimal representation of rhs
+String & String::operator = (int rhs)
+{
+    Clear();
+
+    if (rhs < 0)
+    {
+        Add('-');
+        // Negate in unsigned arithmetic: -rhs overflows when rhs is INT_MIN,
+        // whereas 0u - (unsigned) rhs yields the magnitude for every value
+        *this += 0u - (unsigned int) rhs;
+    }
+    else
+        *this = (unsigned int) rhs;
+    return *this;
+}
+
+// Assigns the decimal representation of rhs
+String & String::operator = (unsigned int rhs)
+{
+    Clear();
+
+    // Collect the digits least significant first. This avoids counting up a
+    // power of ten, which overflows where unsigned long is no wider than
+    // unsigned int. Three characters per byte always suffice.
+    char reversed[3 * sizeof(unsigned int)];
+    int digits = 0;
+
+    do
+    {
+        reversed[digits++] = char ('0' + rhs % 10);
+        rhs /= 10;
+    }
+    while (rhs != 0);
+
+    Grow(digits);
+
+    while (digits > 0)
+        buffer[len++] = reversed[--digits];
+    buffer[len] = 0;
+
+    return *this;
+}
+
+// Returns a copy of this string with the text of rhs appended
+String String::operator + (int rhs) const
+{
+    String joined(*this);
+
+    joined += rhs;
+
+    return joined;
+}
+
+// Returns a copy of this string with the text of rhs appended
+String String::operator + (unsigned int rhs) const
+{
+    String joined(*this);
+
+    joined += rhs;
+
+    return joined;
+}
+
+// Appends the text of rhs, formatted through a temporary String
+String & String::operator += (int rhs)
+{
+    String formatted;
+
+    formatted = rhs;
+    *this += formatted;
+
+    return *this;
+}
+
+// Appends the text of rhs, formatted through a temporary String
+String & String::operator += (unsigned int rhs)
+{
+    String formatted;
+
+    formatted = rhs;
+    *this += formatted;
+
+    return *this;
+}
+
+// Repeats the string so that it ends up holding rhs copies of its original
+// contents; a factor of 0 empties it
+String & String::operator *= (unsigned int rhs)
+{
+    if (rhs == 0)
+    {
+        Clear();
+        return *this;
+    }
+
+    String original(*this);
+
+    Grow(len * rhs);
+
+    // one copy is already in place
+    unsigned int remaining = rhs - 1;
+    while (remaining-- > 0)
+        *this += original;
+
+    return *this;
+}
+
+// Assigns rhs formatted with three decimals ("%.3f")
+String & String::operator = (double rhs)
+{
+    // "%.3f" prints every integer digit, so large magnitudes need far more
+    // than 32 characters. Format with a bound and retry with a larger buffer
+    // until the text fits; the result is negative or >= capacity on truncation.
+    for (int capacity = 32; ; capacity *= 2)
+    {
+        // Grow() makes the buffer larger than the size asked for, so
+        // capacity bytes are always writable here
+        char * target = LockBuffer(capacity);
+        int written = snprintf(target, capacity, "%.3f", rhs);
+
+        if (written >= 0 && written < capacity)
+            break;
+
+        // keep the buffer terminated for the copy made by the next Grow()
+        target[0] = 0;
+    }
+
+    UnlockBuffer();
+    return *this;
+}
+
+// Returns a copy of this string with the text of rhs appended
+String String::operator + (double rhs) const
+{
+    String joined(*this);
+
+    joined += rhs;
+
+    return joined;
+}
+
+// Appends the text of rhs, formatted through a temporary String
+String & String::operator += (double rhs)
+{
+    String formatted;
+
+    formatted = rhs;
+    *this += formatted;
+
+    return *this;
+}
+
+// Gives direct access to the character buffer, first enlarging it through
+// Grow(min) when min is positive. Pair with UnlockBuffer().
+char * String::LockBuffer(int min)
+{
+    if (min > 0)
+        Grow(min);
+
+    return buffer;
+}
+
+// Recomputes len after direct buffer access by locating the terminator;
+// reports an error when none is found within the buffer
+String & String::UnlockBuffer()
+{
+    len = 0;
+
+    while (len < size)
+    {
+        if (buffer[len] == 0)
+            return *this;
+        ++len;
+    }
+
+    error("BasicString - direct access overflowed buffer");
+    return *this;
+}
+
+// Compares with s, honoring the class-wide caseSensitive setting
+int String::Compare(const String & s) const
+{
+    return caseSensitive ? String::FastCompare(s) : String::SlowCompare(s);
+}
+
+// Compares with a C string, honoring the class-wide caseSensitive setting
+int String::Compare(const char * s) const
+{
+    if (caseSensitive)
+        return FastCompare(s);
+
+    return SlowCompare(s);
+}
+
+// Case-sensitive comparison. Returns 0 when the strings are equal, otherwise
+// a negative or positive value according to the first difference. With
+// NATURAL_ORDERING, when the two strings continue with digit runs of unequal
+// length from that position, the string with the longer run sorts last.
+int String::FastCompare(const String & s) const
+{
+    for (int i = 0; i <= len; i++)
+        if (buffer[i] - s.buffer[i])
+        {
+#ifdef NATURAL_ORDERING
+            // isdigit() is only defined for unsigned char values and EOF
+            int d = i;
+            while (isdigit((unsigned char) buffer[d]) &&
+                   isdigit((unsigned char) s.buffer[d]))
+                d++;
+            if (isdigit((unsigned char) buffer[d]))
+                return 1;
+            if (isdigit((unsigned char) s.buffer[d]))
+                return -1;
+#endif
+            return buffer[i] - s.buffer[i];
+        }
+    return 0;
+}
+
+// Case-sensitive comparison with a C string; see FastCompare(const String &).
+// A NULL argument yields -len, which is 0 only for the empty string.
+int String::FastCompare(const char * s) const
+{
+    if (s == NULL)
+        return -len;
+
+    for (int i = 0; i <= len; i++)
+        if (buffer[i] - s[i])
+        {
+#ifdef NATURAL_ORDERING
+            // isdigit() is only defined for unsigned char values and EOF
+            int d = i;
+            while (isdigit((unsigned char) buffer[d]) &&
+                   isdigit((unsigned char) s[d]))
+                d++;
+            if (isdigit((unsigned char) buffer[d]))
+                return 1;
+            if (isdigit((unsigned char) s[d]))
+                return -1;
+#endif
+            return buffer[i] - s[i];
+        }
+    return 0;
+}
+
+// Case-insensitive comparison: characters are compared after toupper().
+// Returns 0 when the strings match, otherwise a negative or positive value
+// according to the first difference. With NATURAL_ORDERING, when the two
+// strings continue with digit runs of unequal length from that position, the
+// string with the longer run sorts last.
+int String::SlowCompare(const String & s) const
+{
+    for (int i = 0; i <= len; i++)
+    {
+        // <ctype.h> functions need unsigned char values; converting the
+        // result back to char keeps the sign convention of FastCompare
+        int mine   = (char) toupper((unsigned char) buffer[i]);
+        int theirs = (char) toupper((unsigned char) s.buffer[i]);
+
+        if (mine - theirs)
+        {
+#ifdef NATURAL_ORDERING
+            int d = i;
+            while (isdigit((unsigned char) buffer[d]) &&
+                   isdigit((unsigned char) s.buffer[d]))
+                d++;
+            if (isdigit((unsigned char) buffer[d]))
+                return 1;
+            if (isdigit((unsigned char) s.buffer[d]))
+                return -1;
+#endif
+            return mine - theirs;
+        }
+    }
+    return 0;
+}
+
+// Case-insensitive comparison with a C string; see
+// SlowCompare(const String &). A NULL argument yields -len, which is 0 only
+// for the empty string.
+int String::SlowCompare(const char * s) const
+{
+    if (s == NULL)
+        return -len;
+
+    for (int i = 0; i <= len; i++)
+    {
+        // <ctype.h> functions need unsigned char values; converting the
+        // result back to char keeps the sign convention of FastCompare
+        int mine   = (char) toupper((unsigned char) buffer[i]);
+        int theirs = (char) toupper((unsigned char) s[i]);
+
+        if (mine - theirs)
+        {
+#ifdef NATURAL_ORDERING
+            int d = i;
+            while (isdigit((unsigned char) buffer[d]) &&
+                   isdigit((unsigned char) s[d]))
+                d++;
+            if (isdigit((unsigned char) buffer[d]))
+                return 1;
+            if (isdigit((unsigned char) s[d]))
+                return -1;
+#endif
+            return mine - theirs;
+        }
+    }
+    return 0;
+}
+
+// Replaces the string with the next line of f, without its line terminator.
+// The line is read with fscanf in pieces: the first piece takes up to 128
+// characters and each following piece doubles that width. A NULL stream
+// leaves the string empty.
+String & String::ReadLine(FILE * f)
+ {
+ len = 0;
+ buffer[len] = 0;
+
+ if (f == NULL) return *this;
+
+ int clen = 0;
+ // receives the line terminator once fscanf matches one
+ char check[2] = {0, 0};
+
+ int step = 128;
+ String format("%128[^\n\r]%1[\n\r]");
+
+ while (check[0] != '\n' && check[0] != '\r')
+ {
+ if (clen)
+ {
+ // rebuild the format with the doubled field width
+ step *= 2;
+ format.printf("%%%d%s", step, "[^\n\r]%1[\n\r]");
+ }
+ clen += step;
+
+ // the new piece is written behind the characters read so far;
+ // UnlockBuffer() then recomputes len
+ int io = fscanf(f, format, LockBuffer(clen) + len, check);
+ UnlockBuffer();
+ // Avoid getting stuck on zero length lines (system specific!)
+ if (io == 0 && check[0] != '\n' && check[0] != '\r')
+ fscanf(f, "%1[\n\r]", check);
+ if (io == 0 || io == EOF) break;
+ }
+
+ // consume the second character of a two-character line ending
+ if (check[0] == '\n') fscanf(f, "%*1[\r]");
+ if (check[0] == '\r') fscanf(f, "%*1[\n]");
+
+ return *this;
+ }
+
+// Reads the next whitespace-delimited token from f into this string,
+// replacing the previous contents, and returns *this. Leading whitespace is
+// skipped by the format string. A NULL stream leaves the string empty.
+String & String::Read(FILE * f)
+   {
+   len = 0;
+   buffer[len] = 0;
+
+   if (f == NULL) return *this;
+
+   int clen = 0;
+   // Any non-whitespace sentinel works; it only has to let the loop start
+   char check[2] = {'G', 0};
+
+   // Tokens longer than READBUF characters are gathered over several passes;
+   // the loop ends once a whitespace character is captured in check
+   while (strchr(WHITESPACE, check[0]) == NULL)
+      {
+      clen += READBUF;
+      int io = fscanf(f, " %" READBUFSTR "[^" WHITESPACE "]"
+                         "%1[" WHITESPACE "]", LockBuffer(clen) + len, check);
+
+      // Pair every LockBuffer() with UnlockBuffer(), including on the
+      // terminating pass, exactly as ReadLine(FILE *) does
+      UnlockBuffer();
+
+      if (io == 0 || io == EOF) break;
+      }
+
+   return *this;
+   }
+
+// Reads the next whitespace-delimited token from standard input.
+String & String::Read()
+ {
+ return Read(stdin);
+ }
+
+#ifdef __ZLIB_AVAILABLE__
+// Reads the next whitespace-delimited token from f one character at a time,
+// replacing the previous contents, and returns *this.
+String & String::Read(IFILE & f)
+   {
+   len = 0;
+   buffer[len] = 0;
+
+   if (f == NULL) return *this;
+
+   // Becomes true once the first token character has been stored
+   bool started = false;
+
+   for (int ch = ifgetc(f); ch != -1; ch = ifgetc(f))
+      {
+      if (strchr(WHITESPACE, ch) != NULL)
+         {
+         // Whitespace ends the token, unless the token has not begun yet
+         if (started)
+            break;
+         continue;
+         }
+
+      // Keep room for the new character and its terminator
+      if (len + 1 == size)
+         Grow(len + 1);
+
+      buffer[len++] = (char) ch;
+      buffer[len] = 0;
+
+      started = true;
+      }
+
+   return *this;
+   }
+#endif
+
+// Reads one line from standard input, replacing the previous contents, and
+// returns *this. LF (10) and CR (13) both end a line; when one directly
+// follows the other, the pair counts as a single terminator.
+String & String::ReadLine()
+   {
+   // Last character seen, kept across calls so that the second half of a
+   // two-character line ending can be skipped at the start of the next call
+   static int last = 0;
+   int ch;
+
+   len = 0;
+   buffer[len] = 0;
+
+   for (;;)
+      {
+      ch = getchar();
+
+      if (ch == EOF)
+         break;
+
+      if (ch == 10 || ch == 13)
+         {
+         const int partner = (ch == 10) ? 13 : 10;
+
+         // Second half of a CR/LF pair: drop it and keep reading
+         if (last == partner)
+            {
+            last = 0;
+            continue;
+            }
+
+         last = ch;
+         break;
+         }
+
+      if (len + 1 == size)
+         Grow(len + 1);
+
+      last = ch;
+      buffer[len++] = (char) last;
+      buffer[len] = 0;
+      }
+
+   return *this;
+   }
+
+#ifdef __ZLIB_AVAILABLE__
+// Reads one line from f, replacing the previous contents, and returns *this.
+// LF (10) and CR (13) both end a line; when one directly follows the other,
+// the pair counts as a single terminator.
+String & String::ReadLine(IFILE & f)
+   {
+   // Last character seen, kept across calls so that the second half of a
+   // two-character line ending can be skipped at the start of the next call.
+   // The state is shared by every stream read through this function.
+   static int last = 0;
+   int ch;
+
+   len = 0;
+   buffer[len] = 0;
+
+   while (true)
+      {
+      ch = ifgetc(f);
+
+      if (ch == EOF)
+         {
+         // Forget the pending terminator at end of input; otherwise it could
+         // swallow a leading CR or LF of the next stream that is read
+         last = 0;
+         break;
+         }
+
+      if (ch == 10)
+         {
+         if (last == 13)
+            { last = 0; continue; }
+         else
+            { last = 10; break; }
+         }
+
+      if (ch == 13)
+         {
+         if (last == 10)
+            { last = 0; continue; }
+         else
+            { last = 13; break; }
+         }
+
+      if (len + 1 == size) Grow(len + 1);
+
+      last = ch;
+      buffer[len++] = (char) last;
+      buffer[len] = 0;
+      }
+
+   return *this;
+   }
+#endif
+
+// Writes the string to f without a trailing newline. A NULL stream is
+// ignored, matching WriteLine(FILE *).
+void String::Write(FILE * f)
+   {
+   if (f == NULL) return;
+   fprintf(f, "%s", buffer);
+   }
+
+// Writes the string to standard output without a trailing newline.
+void String::Write()
+ {
+ Write(stdout);
+ }
+
+// Writes the string followed by a newline to standard output.
+void String::WriteLine()
+ {
+ WriteLine(stdout);
+ }
+
+// Writes the string followed by a newline to f; a NULL stream is ignored.
+void String::WriteLine(FILE * f)
+   {
+   if (f != NULL)
+      fprintf(f, "%s\n", buffer);
+   }
+
+// Returns a copy of the first n characters; n is clamped to [0, len].
+String String::Left(int n) const
+   {
+   int count = (n < 0) ? 0 : n;
+   if (count > len)
+      count = len;
+
+   String head(count);
+   memcpy(head.buffer, buffer, count);
+   head.len = count;
+   head.buffer[count] = 0;
+   return head;
+   }
+
+// Returns a copy of the last n characters; n is clamped to [0, len].
+String String::Right(int n) const
+   {
+   int count = (n < 0) ? 0 : n;
+   if (count > len)
+      count = len;
+
+   String tail(count);
+   memcpy(tail.buffer, buffer + len - count, count);
+   tail.len = count;
+   tail.buffer[count] = 0;
+   return tail;
+   }
+
+// Returns up to n characters beginning at start. A negative start shortens
+// the request and is treated as zero; ranges past the end yield fewer
+// characters, possibly none.
+String String::SubStr(int start, int n) const
+   {
+   if (start < 0)
+      {
+      n += start;
+      start = 0;
+      }
+
+   // Clamp the count to what is available after start, never below zero
+   int count = n;
+   if (len - start < count)
+      count = len - start;
+   if (count < 0)
+      count = 0;
+
+   String piece(count);
+   if (start > len)
+      return piece;
+
+   memcpy(piece.buffer, buffer + start, count);
+   piece.len = count;
+   piece.buffer[count] = 0;
+   return piece;
+   }
+
+// Returns everything from position start to the end of the string.
+String String::SubStr(int start) const
+ {
+ return SubStr(start, len - start);
+ }
+
+// Returns the characters from position start through position end, inclusive.
+String String::Mid(int start, int end) const
+ {
+ return SubStr(start, end - start + 1);
+ }
+
+// Finds ch at or after start, honouring the class-wide caseSensitive flag.
+// Returns its index, or -1 when absent.
+int String::FindChar(char ch, int start) const
+   {
+   if (caseSensitive)
+      return FastFindChar(ch, start);
+
+   return SlowFindChar(ch, start);
+   }
+
+// Case-sensitive search for ch at or after start; returns its index or -1.
+int String::FastFindChar(char ch, int start) const
+   {
+   int pos = start;
+
+   while (pos < len)
+      {
+      if (buffer[pos] == ch)
+         return pos;
+      pos++;
+      }
+
+   return -1;
+   }
+
+// Case-insensitive search for ch at or after start; returns its index or -1.
+int String::SlowFindChar(char ch, int start) const
+   {
+   // Upper-case the target once, then compare upper-cased characters
+   const char target = (char) toupper(ch);
+   int pos = start;
+
+   while (pos < len)
+      {
+      if (toupper(buffer[pos]) == target)
+         return pos;
+      pos++;
+      }
+
+   return -1;
+   }
+
+// Finds pattern at or after start, honouring the class-wide caseSensitive
+// flag. Returns the index of the match, or -1 when absent.
+int String::Find(const String & pattern, int start) const
+   {
+   if (caseSensitive)
+      return FastFind(pattern, start);
+
+   return SlowFind(pattern, start);
+   }
+
+// TODO -- We should have a better string search algorithm
+
+// Case-sensitive substring search starting at position start. Returns the
+// index of the first match or -1. An empty pattern never matches.
+int String::FastFind(const String & pattern, int start) const
+   {
+   const int plen = pattern.Length();
+
+   for ( ; start <= len - plen; start++)
+      {
+      if (buffer[start] != pattern[0])
+         continue;
+
+      // First character agrees; count how far the rest of the pattern agrees
+      int matched = 1;
+      while (matched < plen && pattern[matched] == buffer[start + matched])
+         matched++;
+
+      if (matched == plen)
+         return start;
+      }
+
+   return -1;
+   }
+
+// Case-insensitive substring search starting at position start. Returns the
+// index of the first match or -1. An empty pattern never matches.
+int String::SlowFind(const String & pattern, int start) const
+   {
+   const int plen = pattern.Length();
+   const int lead = toupper(pattern[0]);
+
+   for ( ; start <= len - plen; start++)
+      {
+      if (toupper(buffer[start]) != lead)
+         continue;
+
+      // First character agrees; count how far the rest of the pattern agrees
+      int matched = 1;
+      while (matched < plen &&
+             toupper(pattern[matched]) == toupper(buffer[start + matched]))
+         matched++;
+
+      if (matched == plen)
+         return start;
+      }
+
+   return -1;
+   }
+
+// Sets the string length to newlen and returns it. Growing pads the new
+// tail with spaces; shrinking truncates.
+int String::SetLength(int newlen)
+   {
+   const bool growing = newlen > len;
+
+   if (growing)
+      {
+      Grow(newlen);
+      memset(buffer + len, ' ', newlen - len);
+      }
+
+   buffer[newlen] = 0;
+   len = newlen;
+   return len;
+   }
+
+// Keeps only the characters that occur in s, compacting the string in place.
+// Membership is tested with FindChar.
+String & String::Filter(const String & s)
+   {
+   int kept = 0;
+
+   for (int i = 0; i < len; i++)
+      {
+      if (s.FindChar(buffer[i]) == -1)
+         continue;
+
+      buffer[kept++] = buffer[i];
+      }
+
+   len = kept;
+   buffer[len] = 0;
+   return *this;
+   }
+
+// Keeps only the characters that occur in the C string s.
+String & String::Filter(const char * s)
+ {
+ String filter(s);
+ return Filter(filter);
+ }
+
+// Removes every character that occurs in s, compacting the string in place.
+// Membership is tested with FindChar.
+String & String::ExcludeCharacters(const String & s)
+   {
+   int kept = 0;
+
+   for (int i = 0; i < len; i++)
+      {
+      if (s.FindChar(buffer[i]) != -1)
+         continue;
+
+      buffer[kept++] = buffer[i];
+      }
+
+   len = kept;
+   buffer[len] = 0;
+   return *this;
+   }
+
+// Removes every character that occurs in the C string s.
+String & String::ExcludeCharacters(const char * s)
+ {
+ String excluded(s);
+ return ExcludeCharacters(excluded);
+ }
+
+// Concatenation with a C string on the left-hand side.
+String operator + (const char * lhs, const String & rhs)
+   {
+   String joined(lhs);
+   joined.Add(rhs);
+   return joined;
+   }
+
+// Concatenation with a single character on the left-hand side.
+String operator + (char lhs, const String & rhs)
+   {
+   String joined(lhs);
+   joined.Add(rhs);
+   return joined;
+   }
+
+// Concatenation with an int on the left-hand side; the number is converted
+// through String's int assignment operator.
+String operator + (int lhs, const String & rhs)
+   {
+   String joined;
+   joined = lhs;
+   joined.Add(rhs);
+   return joined;
+   }
+
+// Concatenation with an unsigned int on the left-hand side; the number is
+// converted through String's unsigned int assignment operator.
+String operator + (unsigned int lhs, const String & rhs)
+   {
+   String joined;
+   joined = lhs;
+   joined.Add(rhs);
+   return joined;
+   }
+
+// Parses a leading integer: an optional '-', an optional 0x/0X prefix that
+// selects base 16, then digits. Parsing stops at the first character that
+// is not a digit of the active base; whatever was accumulated is returned.
+long String::AsInteger() const
+   {
+   long value = 0;
+   int pos = 0;
+
+   const bool negative = (buffer[pos] == '-');
+   if (negative)
+      pos++;
+
+   // The hex prefix only counts when at least one character follows it
+   int base = 10;
+   if (len > pos + 2 && buffer[pos] == '0' &&
+       (buffer[pos + 1] == 'x' || buffer[pos + 1] == 'X'))
+      {
+      base = 16;
+      pos += 2;
+      }
+
+   while (pos < len)
+      {
+      const char digit = (char) toupper(buffer[pos]);
+      int digitValue;
+
+      if (digit >= '0' && digit <= '9')
+         digitValue = digit - '0';
+      else if (base == 16 && digit >= 'A' && digit <= 'F')
+         digitValue = digit - 'A' + 10;
+      else
+         break;
+
+      value = value * base + digitValue;
+      pos++;
+      }
+
+   return negative ? -value : value;
+   }
+
+// Reverses the characters of the string in place.
+String & String::Invert()
+   {
+   int lo = 0;
+   int hi = len - 1;
+
+   // Swap from both ends toward the middle
+   while (lo < hi)
+      {
+      const char held = buffer[lo];
+      buffer[lo] = buffer[hi];
+      buffer[hi] = held;
+      lo++;
+      hi--;
+      }
+
+   return *this;
+   }
+
+// Returns a reversed copy of the string; the original is left unchanged.
+String String::RightToLeft()
+ {
+ String result(*this);
+ result.Invert();
+ return result;
+ }
+
+// Replaces this string with a reversed copy of s.
+String & String::Invert(const String & s)
+ {
+ Copy(s);
+ return Invert();
+ }
+
+// Compares the start of this string with stem, honouring caseSensitive.
+// Returns zero when this string begins with stem.
+int String::CompareToStem(const String & stem) const
+   {
+   return caseSensitive ? String::FastCompareToStem(stem)
+                        : String::SlowCompareToStem(stem);
+   }
+
+// Case-sensitive: zero when this string begins with stem, otherwise the
+// difference of the first mismatching characters.
+int String::FastCompareToStem(const String & stem) const
+   {
+   for (int i = 0; i < stem.len; i++)
+      {
+      const int diff = buffer[i] - stem.buffer[i];
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Case-insensitive: zero when this string begins with stem, otherwise the
+// difference of the first mismatching upper-cased characters.
+int String::SlowCompareToStem(const String & stem) const
+   {
+   for (int i = 0; i < stem.len; i++)
+      {
+      const int diff = toupper(buffer[i]) - toupper(stem.buffer[i]);
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Compares the start of this string with the C string stem, honouring
+// caseSensitive. Returns zero when this string begins with stem.
+int String::CompareToStem(const char * stem) const
+   {
+   return caseSensitive ? String::FastCompareToStem(stem)
+                        : String::SlowCompareToStem(stem);
+   }
+
+// Case-sensitive: zero when this string begins with the C string stem,
+// otherwise the difference of the first mismatching characters.
+int String::FastCompareToStem(const char * stem) const
+   {
+   for (int i = 0; stem[i] != 0; i++)
+      {
+      const int diff = buffer[i] - stem[i];
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Case-insensitive: zero when this string begins with the C string stem,
+// otherwise the difference of the first mismatching upper-cased characters.
+int String::SlowCompareToStem(const char * stem) const
+   {
+   for (int i = 0; stem[i] != 0; i++)
+      {
+      const int diff = toupper(buffer[i]) - toupper(stem[i]);
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Compares this whole string with the start of stem, honouring
+// caseSensitive. Returns zero when stem begins with this string.
+int String::MatchesBeginningOf(const String & stem) const
+   {
+   return caseSensitive ? String::FastMatchesBeginningOf(stem)
+                        : String::SlowMatchesBeginningOf(stem);
+   }
+
+// Case-sensitive: zero when stem begins with this string, otherwise the
+// difference of the first mismatching characters.
+int String::FastMatchesBeginningOf(const String & stem) const
+   {
+   for (int i = 0; i < len; i++)
+      {
+      const int diff = buffer[i] - stem.buffer[i];
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Case-insensitive: zero when stem begins with this string, otherwise the
+// difference of the first mismatching upper-cased characters.
+int String::SlowMatchesBeginningOf(const String & stem) const
+   {
+   for (int i = 0; i < len; i++)
+      {
+      const int diff = toupper(buffer[i]) - toupper(stem.buffer[i]);
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Compares this whole string with the start of the C string stem, honouring
+// caseSensitive. Returns zero when stem begins with this string.
+int String::MatchesBeginningOf(const char * stem) const
+   {
+   return caseSensitive ? String::FastMatchesBeginningOf(stem)
+                        : String::SlowMatchesBeginningOf(stem);
+   }
+
+// Case-sensitive: zero when the C string stem begins with this string,
+// otherwise the difference of the first mismatching characters.
+int String::FastMatchesBeginningOf(const char * stem) const
+   {
+   for (int i = 0; i < len; i++)
+      {
+      const int diff = buffer[i] - stem[i];
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Case-insensitive: zero when the C string stem begins with this string,
+// otherwise the difference of the first mismatching upper-cased characters.
+int String::SlowMatchesBeginningOf(const char * stem) const
+   {
+   for (int i = 0; i < len; i++)
+      {
+      const int diff = toupper(buffer[i]) - toupper(stem[i]);
+      if (diff != 0)
+         return diff;
+      }
+
+   return 0;
+   }
+
+// Strips leading and trailing occurrences of character, in place.
+String & String::Trim(char character)
+   {
+   // First position that survives at the front
+   int first = 0;
+   while (buffer[first] && buffer[first] == character)
+      first++;
+
+   // Last position that survives at the back (-1 when nothing survives)
+   int last = len - 1;
+   while (last >= 0 && buffer[last] == character)
+      last--;
+
+   // Slide the surviving range down to the start of the buffer
+   int kept = 0;
+   for (int from = first; from <= last; from++)
+      buffer[kept++] = buffer[from];
+
+   len = kept;
+   buffer[len] = 0;
+
+   return *this;
+   }
+
+// Strips leading and trailing whitespace (as classified by isspace), in
+// place, and returns *this.
+String & String::Trim()
+   {
+   // isspace() is only defined for EOF and values representable as
+   // unsigned char, so characters are converted before classification
+   int first = 0;
+   while (buffer[first] && isspace((unsigned char) buffer[first]))
+      first++;
+
+   int last = len - 1;
+   while (last >= 0 && isspace((unsigned char) buffer[last]))
+      last--;
+
+   // Slide the surviving range down to the start of the buffer
+   int out = 0;
+   while (first <= last)
+      buffer[out++] = buffer[first++];
+
+   buffer[len = out] = 0;
+
+   return *this;
+   }
+
+#define VSNPRINTF_NOT_CHECKED 0
+#define VSNPRINTF_IS_OK 1
+#define VSNPRINTF_NOT_OK 2
+
+int String::vsnprintfChecked = 0;
+
+// Replaces the string contents with printf-style formatted output and
+// returns the resulting length. Formatting is delegated to vprintf().
+int String::printf(const char * format, ...)
+ {
+ va_list ap;
+ va_start(ap, format);
+
+ vprintf(format, ap);
+
+ va_end(ap);
+ return len;
+ }
+
+// Appends printf-style formatted output to the string and returns the
+// resulting length. Formatting is delegated to vcatprintf().
+int String::catprintf(const char * format, ...)
+ {
+ va_list ap;
+ va_start(ap, format);
+
+ vcatprintf(format, ap);
+
+ va_end(ap);
+ return len;
+ }
+
+// Formats into the string's own buffer, replacing its contents, and returns
+// the new length. The buffer is grown and the formatting retried until the
+// output fits.
+int String::vprintf(const char * format, va_list ap)
+ {
+ check_vsnprintf();
+
+ while (true)
+ {
+ int bytes_needed;
+ // Work on a copy of the argument list when va_copy exists, so that a
+ // retry after growing the buffer starts from the first argument again
+ #ifdef va_copy
+ va_list arguments;
+ va_copy (arguments, ap);
+ #else
+ va_list & arguments = ap;
+ #endif
+
+ // Use the library vsnprintf only when check_vsnprintf() accepted it
+ if (vsnprintfChecked == VSNPRINTF_IS_OK)
+ bytes_needed = vsnprintf(buffer, size, format, arguments);
+ else
+ bytes_needed = my_vsnprintf(buffer, size, format, arguments);
+
+ #ifdef va_copy
+ va_end(arguments);
+ #endif
+
+ // Output was truncated: grow to the reported size and retry
+ if (bytes_needed >= size)
+ Grow(bytes_needed);
+ // A return of -1 carries no size information: double and retry
+ else if (bytes_needed == -1)
+ Grow(size * 2);
+ else
+ {
+ return len = bytes_needed;
+ }
+ }
+ }
+
+// Probes the library snprintf once and records in vsnprintfChecked whether
+// it can be trusted. Later calls return immediately.
+void String::check_vsnprintf()
+   {
+   if (vsnprintfChecked != VSNPRINTF_NOT_CHECKED)
+      return;
+
+   char temp[100];
+   memset(temp, 0, 100);
+
+   // Nine characters are formatted into a five-byte window
+   int check = snprintf(temp, 5, "%5s", "VSNPRINTF");
+
+   // Anything written at offsets 6 or 7 means the size limit was ignored;
+   // the only accepted return values are 9 and -1
+   const bool overran = (temp[6] != 0 || temp[7] != 0);
+   const bool badCount = (check != 9 && check != -1);
+
+   // A failed probe is not reported as an error; it selects the replacement
+   // my_vsnprintf() in vprintf() and vcatprintf()
+   if (overran || badCount)
+      vsnprintfChecked = VSNPRINTF_NOT_OK;
+   else
+      vsnprintfChecked = VSNPRINTF_IS_OK;
+   }
+
+// Formats into the space after the current contents, appending to the
+// string, and returns the new total length. The buffer is grown and the
+// formatting retried until the output fits.
+int String::vcatprintf(const char * format, va_list ap)
+ {
+ check_vsnprintf();
+
+ // Make sure at least some room exists after the current contents
+ if (len == size)
+ Grow(size * 2);
+
+ while (true)
+ {
+ int bytes_needed;
+ // Work on a copy of the argument list when va_copy exists, so that a
+ // retry after growing the buffer starts from the first argument again
+ #ifdef va_copy
+ va_list arguments;
+ va_copy (arguments, ap);
+ #else
+ va_list & arguments = ap;
+ #endif
+
+ // bytes_needed is the total length: existing text plus formatted text
+ if (vsnprintfChecked == VSNPRINTF_IS_OK)
+ bytes_needed = len + vsnprintf(buffer + len, size - len, format, arguments);
+ else
+ bytes_needed = len + my_vsnprintf(buffer + len, size - len, format, arguments);
+
+ #ifdef va_copy
+ va_end(arguments);
+ #endif
+
+ // Output was truncated: grow to the reported size and retry
+ if (bytes_needed >= size)
+ Grow(bytes_needed);
+ // The formatter returned a negative value: double and retry
+ else if (bytes_needed < len)
+ Grow(size * 2);
+ else
+ {
+ return len = bytes_needed;
+ }
+ }
+ }
+
+FILE * String::my_vsnprintf_file = NULL;
+
+// Replacement for vsnprintf on platforms where the library version is not
+// trusted. The output is formatted into a shared temporary file and then
+// read back into buffer.
+//
+// Returns the number of characters the full output requires. When that is
+// less than bufsize the result is NUL-terminated; otherwise the first
+// bufsize characters are copied without a terminator. If the temporary file
+// cannot be created or written, buffer is set to the empty string and 0 is
+// returned. A negative value is deliberately not returned, because
+// vprintf() and vcatprintf() treat it as "grow and retry" and would loop.
+int String::my_vsnprintf(char * buffer, int bufsize, const char * format, va_list args)
+   {
+   if (my_vsnprintf_file == NULL)
+      {
+      my_vsnprintf_file = tmpfile();
+
+      if (my_vsnprintf_file == NULL)
+         {
+         if (bufsize > 0)
+            buffer[0] = 0;
+         return 0;
+         }
+
+      // Register the cleanup hook only once a file actually exists
+      atexit(my_vsnprintf_close_file);
+      }
+
+   rewind(my_vsnprintf_file);
+
+   int len = vfprintf(my_vsnprintf_file, format, args);
+
+   rewind(my_vsnprintf_file);
+
+   // vfprintf reports failure with a negative value; never use it as an index
+   if (len < 0)
+      {
+      if (bufsize > 0)
+         buffer[0] = 0;
+      return 0;
+      }
+
+   // When everything fits, terminate and read back exactly len characters
+   if (len < bufsize)
+      buffer[bufsize = len] = 0;
+   fread(buffer, 1, bufsize, my_vsnprintf_file);
+
+   return len;
+   }
+
+// Variadic front end for my_vsnprintf(); returns whatever that call returns.
+int String::my_snprintf(char * buffer, int bufsize, const char * format, ...)
+ {
+ va_list ap;
+ va_start(ap, format);
+
+ int bytes = my_vsnprintf(buffer, bufsize, format, ap);
+
+ va_end(ap);
+
+ return bytes;
+ }
+
+// Closes the temporary file used by my_vsnprintf(). Safe to call when no
+// file is open, and safe to call more than once.
+void String::my_vsnprintf_close_file()
+   {
+   if (my_vsnprintf_file == NULL)
+      return;
+
+   fclose(my_vsnprintf_file);
+   my_vsnprintf_file = NULL;
+   }
+
+// Returns true when the whole string is a decimal number: an optional sign,
+// digits with an optional decimal point (at least one digit overall), and
+// an optional exponent made of 'e' or 'E', an optional sign and at least
+// one digit.
+bool String::IsNumber()
+   {
+   const char * p = buffer;
+   bool digits = false;
+
+   // Optional leading sign
+   if (*p == '-' || *p == '+')
+      p++;
+
+   // Integer portion
+   for ( ; *p >= '0' && *p <= '9'; p++)
+      digits = true;
+
+   // Optional fractional portion
+   if (*p == '.')
+      {
+      p++;
+
+      for ( ; *p >= '0' && *p <= '9'; p++)
+         digits = true;
+      }
+
+   if (!digits)
+      return false;
+
+   // Optional exponent, which must carry digits of its own
+   if (*p == 'E' || *p == 'e')
+      {
+      p++;
+
+      if (*p == '-' || *p == '+')
+         p++;
+
+      digits = false;
+
+      for ( ; *p >= '0' && *p <= '9'; p++)
+         digits = true;
+      }
+
+   // The scan must have consumed the entire string
+   return (p - buffer == len) && digits;
+   }
+
+// Overwrites every character with ch. A non-negative length first resizes
+// the string to that length; a negative length keeps the current length.
+void String::Fill(char ch, int length)
+   {
+   if (length >= 0)
+      SetLength(length);
+
+   if (len > 0)
+      memset(buffer, ch, len);
+   }
+
diff --git a/libsrc/StringBasics.h b/libsrc/StringBasics.h
new file mode 100644
index 0000000..26ac265
--- /dev/null
+++ b/libsrc/StringBasics.h
@@ -0,0 +1,274 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringBasics.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __BASICSTRING_H__
+#define __BASICSTRING_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#define READBUF 128
+#define READBUFSTR "128"
+
+#ifdef __PREFIX_STRING__
+#define String BasicString
+#endif
+
+#include "InputFile.h"
+
+// Growable, NUL-terminated character string with printf-style formatting,
+// searching, trimming and simple file I/O helpers.
+//
+// Most comparison and search operations come in three flavours: a Fast
+// (case-sensitive) variant, a Slow (case-insensitive) variant, and an
+// unprefixed variant.
+class String
+ {
+ private:
+ void NewString(int startsize);
+
+ protected:
+ // buffer holds the characters followed by a NUL terminator;
+ // len is the string length and size the capacity of buffer
+ char * buffer;
+ int len, size;
+
+ public:
+ static int alloc;
+ // Class-wide switch consulted by the unprefixed search functions
+ static bool caseSensitive;
+
+ String(int startsize = 0) { NewString(startsize); }
+ String(const char * s);
+ String(const String & s);
+ String(char ch, int count = 1);
+
+ ~String()
+ {
+ delete [] buffer;
+ }
+
+ // Empties the string
+ String & Clear()
+ {
+ len = buffer[0] = 0;
+ return *this;
+ }
+
+ String & Copy(const String & s);
+ String & Copy(const String & s, int start, int count);
+ String & Copy(const char * s);
+
+ bool IsEmpty() { return len == 0; }
+ String & ToUpper();
+ String & ToLower();
+ String AsUpper();
+ String AsLower();
+ String Capitalize();
+
+ // Assignment, concatenation and append for each supported operand type
+ String & operator = (char ch);
+ String operator + (char ch) const;
+ String & operator += (char ch);
+
+ String & operator = (const String & rhs);
+ String operator + (const String & rhs) const;
+ String & operator += (const String & rhs);
+
+ String & operator = (const char * rhs);
+ String operator + (const char * rhs) const;
+ String & operator += (const char * rhs);
+
+ String & operator = (int rhs);
+ String operator + (int rhs) const;
+ String & operator += (int rhs);
+
+ String & operator = (double rhs);
+ String operator + (double rhs) const;
+ String & operator += (double rhs);
+
+ String & operator = (unsigned int rhs);
+ String operator + (unsigned int rhs) const;
+ String & operator += (unsigned int rhs);
+ String operator * (unsigned int rhs) const;
+ String & operator *= (unsigned int rhs);
+
+ // Three-way comparisons: zero means equal
+ int Compare(const String & rhs) const;
+ int FastCompare(const String & rhs) const;
+ int SlowCompare(const String & rhs) const;
+
+ int Compare(const char * rhs) const;
+ int FastCompare(const char * rhs) const;
+ int SlowCompare(const char * rhs) const;
+
+ // Zero when this string begins with stem
+ int CompareToStem(const String & stem) const;
+ int FastCompareToStem(const String & stem) const;
+ int SlowCompareToStem(const String & stem) const;
+
+ int CompareToStem(const char * stem) const;
+ int FastCompareToStem(const char * stem) const;
+ int SlowCompareToStem(const char * stem) const;
+
+ // Zero when stem begins with this string
+ int MatchesBeginningOf(const String & stem) const;
+ int FastMatchesBeginningOf(const String & stem) const;
+ int SlowMatchesBeginningOf(const String & stem) const;
+
+ int MatchesBeginningOf(const char * stem) const;
+ int FastMatchesBeginningOf(const char * stem) const;
+ int SlowMatchesBeginningOf(const char * stem) const;
+
+ // Relational operators, all expressed through Compare()
+ int operator == (const String & rhs) const { return Compare(rhs) == 0; }
+ int operator != (const String & rhs) const { return Compare(rhs) != 0; }
+ int operator < (const String & rhs) const { return Compare(rhs) < 0; }
+ int operator > (const String & rhs) const { return Compare(rhs) > 0; }
+ int operator >= (const String & rhs) const { return Compare(rhs) >= 0; }
+ int operator <= (const String & rhs) const { return Compare(rhs) <= 0; }
+
+ int operator == (const char * rhs) const { return Compare(rhs) == 0; }
+ int operator != (const char * rhs) const { return Compare(rhs) != 0; }
+ int operator < (const char * rhs) const { return Compare(rhs) < 0; }
+ int operator > (const char * rhs) const { return Compare(rhs) > 0; }
+ int operator <= (const char * rhs) const { return Compare(rhs) <= 0; }
+ int operator >= (const char * rhs) const { return Compare(rhs) >= 0; }
+
+ // Implicit conversions; the numeric ones parse with atoi / atof
+ operator const char * () const { return buffer; }
+ operator int () const { return atoi(buffer); }
+ operator double () const { return atof(buffer); }
+ // Unchecked character access
+ char operator [] (int i) const { return buffer[i]; }
+ char & operator [] (int i) { return buffer[i]; }
+
+ // Neither accessor checks for an empty string
+ char & Last() { return buffer[len - 1]; }
+ char & First() { return buffer[0]; }
+
+ void Grow(int newSize);
+ void Swap(String & s);
+
+ // Direct access to the character buffer; the callers in StringBasics.cpp
+ // follow each LockBuffer() with UnlockBuffer()
+ char * LockBuffer(int size = -1);
+ String & UnlockBuffer();
+
+ // I/O on standard input and standard output
+ String & Read();
+ String & ReadLine();
+ void WriteLine();
+ void Write();
+
+ // I/O on an explicit stream
+ String & Read(FILE * f);
+ String & ReadLine(FILE * f);
+ void WriteLine(FILE * f);
+ void Write(FILE * f);
+
+#ifdef __ZLIB_AVAILABLE__
+ String & Read(IFILE & f);
+ String & ReadLine(IFILE & f);
+#endif
+
+ // Substring extraction; Mid() takes an inclusive end position
+ String Left(int count) const;
+ String Right(int count) const;
+ String Mid(int start, int end) const;
+ String SubStr(int start, int count) const;
+ String SubStr(int start) const;
+
+ // Searches return the index of the match, or -1 when there is none
+ int FindChar(char ch, int start = 0) const;
+ int FastFindChar(char ch, int start = 0) const;
+ int SlowFindChar(char ch, int start = 0) const;
+
+ int Find(const String & str, int start = 0) const;
+ int FastFind(const String & str, int start = 0) const;
+ int SlowFind(const String & str, int start = 0) const;
+
+ // Keep only (Filter) or remove (ExcludeCharacters) the characters in s
+ String & Filter(const String & s);
+ String & Filter(const char * s);
+
+ String & ExcludeCharacters(const String & s);
+ String & ExcludeCharacters(const char * s);
+
+ int Length() const { return len; }
+ int BufferSize() const { return size; }
+
+ int SetLength(int newlen);
+ int Dimension(int newlen) { return SetLength(newlen); }
+
+ String & Add(const String & s) { return *this += s; }
+ String & Add(char ch) { return *this += ch; }
+
+ String RightToLeft();
+ String & Invert();
+ String & Invert(const String & s);
+
+ String & Trim();
+ String & Trim(char character);
+
+ long AsInteger() const;
+ double AsDouble() const { return (double) *this; }
+
+ // printf-style formatting: printf replaces the contents, catprintf appends
+ int printf(const char * format, ...);
+ int vprintf(const char * format, va_list arglist);
+
+ int catprintf(const char * format, ...);
+ int vcatprintf(const char * format, va_list arglist);
+
+ // Replacement vsnprintf and snprint functions for
+ // problematic architectures...
+
+ static int my_snprintf(char * buffer, int bufsize, const char * format, ...);
+ static int my_vsnprintf(char * buffer, int bufsize, const char * format, va_list args);
+ static void my_vsnprintf_close_file();
+ static void check_vsnprintf();
+
+ // Check string contents
+ bool IsNumber();
+
+ // Explicit conversions
+ const unsigned char * uchar() const { return (unsigned char *) buffer; }
+ const signed char * schar() const { return (signed char *) buffer; }
+
+ // Temporary file shared by all my_vsnprintf() calls
+ static FILE * my_vsnprintf_file;
+
+ // Utility functions
+ void Fill(char ch, int length = -1);
+
+ private:
+
+ // Result of the one-time probe made by check_vsnprintf()
+ static int vsnprintfChecked;
+ };
+
+// Free-function forms of String::Compare, FastCompare and SlowCompare.
+// When the C string is the left operand, the member is called on the String
+// operand and its result negated.
+inline int Compare(const String & s1, const String & s2)
+ { return s1.Compare(s2); }
+
+inline int Compare(const String & s1, const char * s2)
+ { return s1.Compare(s2); }
+
+inline int Compare(const char * s1, const String & s2)
+ { return -s2.Compare(s1); }
+
+// Case-sensitive variants
+inline int FastCompare(const String & s1, const String & s2)
+ { return s1.FastCompare(s2); }
+
+inline int FastCompare(const String & s1, const char * s2)
+ { return s1.FastCompare(s2); }
+
+inline int FastCompare(const char * s1, const String & s2)
+ { return -s2.FastCompare(s1); }
+
+// Case-insensitive variants
+inline int SlowCompare(const String & s1, const String & s2)
+ { return s1.SlowCompare(s2); }
+
+inline int SlowCompare(const String & s1, const char * s2)
+ { return s1.SlowCompare(s2); }
+
+inline int SlowCompare(const char * s1, const String & s2)
+ { return -s2.SlowCompare(s1); }
+
+String operator + (char lhs, const String & rhs);
+String operator + (const char * lhs, const String & rhs);
+String operator + (int lhs, const String & rhs);
+String operator + (unsigned int lhs, const String & rhs);
+
+#endif
+
+
+
+
diff --git a/libsrc/StringHash.cpp b/libsrc/StringHash.cpp
new file mode 100644
index 0000000..003d9fd
--- /dev/null
+++ b/libsrc/StringHash.cpp
@@ -0,0 +1,647 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringHash.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "StringHash.h"
+#include "Error.h"
+
+// Creates an empty table with startsize slots. The size must be a power of
+// two because slot indices are computed by masking.
+StringHash::StringHash(int startsize)
+   {
+   count = 0;
+   size = startsize;
+   mask = startsize - 1;
+
+   // A power of two shares no bits with itself minus one
+   if (startsize & mask)
+      error("StringHash: Hash table size must be a power of two.\n");
+
+   strings = new String * [size];
+   objects = new void * [size];
+   keys = new unsigned int [size];
+
+   // Mark every slot as unused
+   unsigned int slot = 0;
+   while (slot < size)
+      {
+      strings[slot] = NULL;
+      objects[slot] = NULL;
+      slot++;
+      }
+   }
+
+// Releases the stored key strings and the three parallel arrays. The
+// objects the table points to are not deleted.
+StringHash::~StringHash()
+   {
+   // Deleting a NULL pointer is a no-op, so unused slots need no test
+   for (unsigned int slot = 0; slot < size; slot++)
+      delete strings[slot];
+
+   delete [] strings;
+   delete [] objects;
+   delete [] keys;
+   }
+
+// Removes every key. Tables larger than 256 slots are resized to 256.
+void StringHash::Clear()
+   {
+   // Deleting a NULL pointer is a no-op, so unused slots need no test
+   for (unsigned int slot = 0; slot < size; slot++)
+      {
+      delete strings[slot];
+      strings[slot] = NULL;
+      }
+
+   count = 0;
+
+   if (size > 256)
+      SetSize(256);
+   }
+
+// Rebuilds the table with newsize slots, re-inserting every stored entry at
+// the position given by its cached hash key and linear probing.
+void StringHash::SetSize(int newsize)
+   {
+   int newmask = newsize - 1;
+
+   String ** newstrings = new String * [newsize];
+   void ** newobjects = new void * [newsize];
+   unsigned int * newkeys = new unsigned int [newsize];
+
+   for (int slot = 0; slot < newsize; slot++)
+      {
+      newstrings[slot] = NULL;
+      newobjects[slot] = NULL;
+      }
+
+   // Nothing to carry over when the table is empty
+   if (count)
+      {
+      for (unsigned int old = 0; old < size; old++)
+         {
+         if (strings[old] == NULL)
+            continue;
+
+         unsigned int key = keys[old];
+         unsigned int h = key & newmask;
+
+         // Probe forward until a free slot or an equal key is found
+         while (newstrings[h] != NULL &&
+                (newkeys[h] != key || newstrings[h]->SlowCompare(*strings[old]) != 0))
+            h = (h + 1) & newmask;
+
+         newkeys[h] = key;
+         newstrings[h] = strings[old];
+         newobjects[h] = objects[old];
+         }
+      }
+
+   delete [] strings;
+   delete [] objects;
+   delete [] keys;
+
+   strings = newstrings;
+   objects = newobjects;
+   keys = newkeys;
+   size = newsize;
+   mask = newmask;
+   }
+
+// Associates object with string, inserting the key when it is new, and
+// returns the slot index. The table doubles once it is more than half full.
+int StringHash::Add(const String & string, void * object)
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   if (strings[h] == NULL)
+      Insert(h, key, string);
+
+   objects[h] = object;
+
+   if (count * 2 <= size)
+      return h;
+
+   // Growing moves entries, so the slot has to be looked up again
+   Grow();
+   return Iterate(key, string);
+   }
+
+// Looks up string and returns its slot index. When the key is missing, it
+// is inserted with the result of create_object() if a factory was given;
+// otherwise -1 is returned.
+int StringHash::Find(const String & string, void * (*create_object)())
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   // Key already present
+   if (strings[h] != NULL)
+      return h;
+
+   if (create_object == NULL)
+      return -1;
+
+   Insert(h, key, string);
+   objects[h] = create_object();
+
+   if (count * 2 > size)
+      {
+      // Growing moves entries, so the slot has to be looked up again
+      Grow();
+      return Iterate(key, string);
+      }
+
+   return h;
+   }
+
+// Read-only lookup: returns the slot index of string, or -1 when absent.
+int StringHash::Find(const String & string) const
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   if (strings[h] != NULL)
+      return h;
+
+   return -1;
+   }
+// Allocates a default-constructed StringHash and returns it as a void
+// pointer; the signature matches the create_object callback taken by Find().
+void * StringHash::CreateHash()
+ {
+ return (void *) new StringHash();
+ }
+
+// Removes the entry stored in slot index; out-of-range or unused slots are
+// ignored. The table shrinks when it becomes sparse; otherwise the entries
+// that follow the freed slot are re-placed so probing still finds them.
+void StringHash::Delete(unsigned int index)
+ {
+ if (index >= size || strings[index] == NULL)
+ return;
+
+ delete strings[index];
+ strings[index] = NULL;
+ count--;
+
+ // Fewer than one slot in eight used: shrink, which rehashes everything
+ if (count * 8 < size && size > 32)
+ Shrink();
+ else
+ {
+ // rehash the next strings until we find empty slot
+ index = (index + 1) & mask;
+
+ while (strings[index] != NULL)
+ {
+ // Only entries away from their home slot may need to move
+ if ((keys[index] & mask) != index)
+ {
+ unsigned int h = Iterate(keys[index], *strings[index]);
+
+ if (h != (unsigned int) index)
+ {
+ keys[h] = keys[index];
+ strings[h] = strings[index];
+ objects[h] = objects[index];
+
+ strings[index] = NULL;
+ objects[index] = NULL;
+ }
+ }
+
+ index = (index + 1) & mask;
+ }
+ }
+ }
+
+// Opens filename and adds each of its lines as a key. A file that cannot be
+// opened is ignored.
+void StringHash::ReadLinesFromFile(const char * filename)
+ {
+ IFILE f = ifopen(filename, "rb");
+ if (f == NULL) return;
+ ReadLinesFromFile(f);
+ ifclose(f);
+ }
+
+// Adds every line of f, trimmed of surrounding whitespace, as a key.
+// NOTE(review): feof() only turns true after a read has hit end of file, so
+// a file ending in a newline presumably also adds one empty key -- confirm.
+void StringHash::ReadLinesFromFile(FILE * f)
+ {
+ String buffer;
+
+ while (!feof(f))
+ Add(buffer.ReadLine(f).Trim());
+ }
+
+#ifdef __ZLIB_AVAILABLE__
+// Adds every line of f, trimmed of surrounding whitespace, as a key.
+void StringHash::ReadLinesFromFile(IFILE & f)
+ {
+ String buffer;
+
+ while (!ifeof(f))
+ Add(buffer.ReadLine(f).Trim());
+ }
+#endif
+
+// StringIntHash implementation
+
+// Creates an empty table with startsize slots. The size must be a power of
+// two because slot indices are computed by masking.
+StringIntHash::StringIntHash(int startsize)
+   {
+   count = 0;
+   size = startsize;
+   mask = startsize - 1;
+
+   // A power of two shares no bits with itself minus one
+   if (startsize & mask)
+      error("StringIntHash: Hash table size must be a power of two.\n");
+
+   strings = new String * [size];
+   integers = new int [size];
+   keys = new unsigned int [size];
+
+   // Mark every slot as unused
+   unsigned int slot = 0;
+   while (slot < size)
+      strings[slot++] = NULL;
+   }
+
+// Releases the stored key strings and the three parallel arrays.
+StringIntHash::~StringIntHash()
+   {
+   // Deleting a NULL pointer is a no-op, so unused slots need no test
+   for (unsigned int slot = 0; slot < size; slot++)
+      delete strings[slot];
+
+   delete [] strings;
+   delete [] integers;
+   delete [] keys;
+   }
+
+// Rebuilds the table with newsize slots, re-inserting every stored entry at
+// the position given by its cached hash key and linear probing.
+void StringIntHash::SetSize(int newsize)
+   {
+   int newmask = newsize - 1;
+
+   String ** newstrings = new String * [newsize];
+   int * newintegers = new int [newsize];
+   unsigned int * newkeys = new unsigned int [newsize];
+
+   for (int slot = 0; slot < newsize; slot++)
+      newstrings[slot] = NULL;
+
+   for (unsigned int old = 0; old < size; old++)
+      {
+      if (strings[old] == NULL)
+         continue;
+
+      unsigned int key = keys[old];
+      unsigned int h = key & newmask;
+
+      // Probe forward until a free slot or an equal key is found
+      while (newstrings[h] != NULL &&
+             (newkeys[h] != key || newstrings[h]->SlowCompare(*strings[old]) != 0))
+         h = (h + 1) & newmask;
+
+      newkeys[h] = key;
+      newstrings[h] = strings[old];
+      newintegers[h] = integers[old];
+      }
+
+   delete [] strings;
+   delete [] integers;
+   delete [] keys;
+
+   strings = newstrings;
+   integers = newintegers;
+   keys = newkeys;
+   size = newsize;
+   mask = newmask;
+   }
+
+// Removes every key. Tables larger than 256 slots are resized to 256.
+void StringIntHash::Clear()
+   {
+   // Deleting a NULL pointer is a no-op, so unused slots need no test
+   for (unsigned int slot = 0; slot < size; slot++)
+      {
+      delete strings[slot];
+      strings[slot] = NULL;
+      }
+
+   count = 0;
+
+   if (size > 256)
+      SetSize(256);
+   }
+
+// Associates value with string, inserting the key when it is new, and
+// returns the slot index. The table doubles once it is more than half full.
+int StringIntHash::Add(const String & string, int value)
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   if (strings[h] == NULL)
+      Insert(h, key, string);
+
+   integers[h] = value;
+
+   if (count * 2 <= size)
+      return h;
+
+   // Growing moves entries, so the slot has to be looked up again
+   Grow();
+   return Iterate(key, string);
+   }
+
+// Looks up string and returns its slot index; a missing key is inserted
+// with defaultValue first.
+int StringIntHash::Find(const String & string, int defaultValue)
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   // Key already present
+   if (strings[h] != NULL)
+      return h;
+
+   Insert(h, key, string);
+   integers[h] = defaultValue;
+
+   if (count * 2 > size)
+      {
+      // Growing moves entries, so the slot has to be looked up again
+      Grow();
+      return Iterate(key, string);
+      }
+
+   return h;
+   }
+
+// Read-only lookup: returns the slot index of string, or -1 when absent.
+int StringIntHash::Find(const String & string) const
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   if (strings[h] != NULL)
+      return h;
+
+   return -1;
+   }
+
+// Removes the entry stored in slot index; out-of-range or unused slots are
+// ignored. The table shrinks when it becomes sparse; otherwise the entries
+// that follow the freed slot are re-placed so probing still finds them.
+void StringIntHash::Delete(unsigned int index)
+ {
+ if (index >= size || strings[index] == NULL)
+ return;
+
+ delete strings[index];
+ strings[index] = NULL;
+ count--;
+
+ // Fewer than one slot in eight used: shrink, which rehashes everything
+ if (count * 8 < size && size > 32)
+ Shrink();
+ else
+ {
+ // rehash the next strings until we find empty slot
+ index = (index + 1) & mask;
+
+ while (strings[index] != NULL)
+ {
+ // Only entries away from their home slot may need to move
+ if ((keys[index] & mask) != index)
+ {
+ unsigned int h = Iterate(keys[index], *strings[index]);
+
+ if (h != (unsigned int) index)
+ {
+ keys[h] = keys[index];
+ strings[h] = strings[index];
+ integers[h] = integers[index];
+
+ strings[index] = NULL;
+ }
+ }
+
+ index = (index + 1) & mask;
+ }
+ }
+ }
+
+// StringDoubleHash implementation
+
+// Creates an empty table with startsize slots. The size must be a power of
+// two because slot indices are computed by masking.
+StringDoubleHash::StringDoubleHash(int startsize)
+   {
+   count = 0;
+   size = startsize;
+   mask = startsize - 1;
+
+   // A power of two shares no bits with itself minus one
+   if (startsize & mask)
+      error("StringDoubleHash: Hash table size must be a power of two.\n");
+
+   strings = new String * [size];
+   doubles = new double [size];
+   keys = new unsigned int [size];
+
+   // Mark every slot as unused
+   unsigned int slot = 0;
+   while (slot < size)
+      strings[slot++] = NULL;
+   }
+
+// Releases the stored key strings and the three parallel arrays.
+StringDoubleHash::~StringDoubleHash()
+   {
+   // Deleting a NULL pointer is a no-op, so unused slots need no test
+   for (unsigned int slot = 0; slot < size; slot++)
+      delete strings[slot];
+
+   delete [] strings;
+   delete [] doubles;
+   delete [] keys;
+   }
+
+// Rebuilds the table with newsize slots, re-inserting every stored entry at
+// the position given by its cached hash key and linear probing.
+void StringDoubleHash::SetSize(int newsize)
+   {
+   int newmask = newsize - 1;
+
+   String ** newstrings = new String * [newsize];
+   double * newdoubles = new double [newsize];
+   unsigned int * newkeys = new unsigned int [newsize];
+
+   for (int slot = 0; slot < newsize; slot++)
+      newstrings[slot] = NULL;
+
+   for (unsigned int old = 0; old < size; old++)
+      {
+      if (strings[old] == NULL)
+         continue;
+
+      unsigned int key = keys[old];
+      unsigned int h = key & newmask;
+
+      // Probe forward until a free slot or an equal key is found
+      while (newstrings[h] != NULL &&
+             (newkeys[h] != key || newstrings[h]->SlowCompare(*strings[old]) != 0))
+         h = (h + 1) & newmask;
+
+      newkeys[h] = key;
+      newstrings[h] = strings[old];
+      newdoubles[h] = doubles[old];
+      }
+
+   delete [] strings;
+   delete [] doubles;
+   delete [] keys;
+
+   strings = newstrings;
+   doubles = newdoubles;
+   keys = newkeys;
+   size = newsize;
+   mask = newmask;
+   }
+
+// Associates value with string, inserting the key when it is new, and
+// returns the slot index. The table doubles once it is more than half full.
+int StringDoubleHash::Add(const String & string, double value)
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   if (strings[h] == NULL)
+      Insert(h, key, string);
+
+   doubles[h] = value;
+
+   if (count * 2 <= size)
+      return h;
+
+   // Growing moves entries, so the slot has to be looked up again
+   Grow();
+   return Iterate(key, string);
+   }
+
+// Looks up string and returns its slot index; a missing key is inserted
+// with defaultValue first.
+int StringDoubleHash::Find(const String & string, double defaultValue)
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   // Key already present
+   if (strings[h] != NULL)
+      return h;
+
+   Insert(h, key, string);
+   doubles[h] = defaultValue;
+
+   if (count * 2 > size)
+      {
+      // Growing moves entries, so the slot has to be looked up again
+      Grow();
+      return Iterate(key, string);
+      }
+
+   return h;
+   }
+
+// Read-only lookup: returns the slot index of string, or -1 when absent.
+int StringDoubleHash::Find(const String & string) const
+   {
+   unsigned int key = hash_no_case(string.uchar(), string.Length(), 0);
+   unsigned int h = Iterate(key, string);
+
+   if (strings[h] != NULL)
+      return h;
+
+   return -1;
+   }
+
+// Removes the entry stored in slot index; out-of-range or unused slots are
+// ignored. The table shrinks when it becomes sparse; otherwise the entries
+// that follow the freed slot are re-placed so probing still finds them.
+void StringDoubleHash::Delete(unsigned int index)
+ {
+ if (index >= size || strings[index] == NULL)
+ return;
+
+ delete strings[index];
+ strings[index] = NULL;
+ count--;
+
+ // Fewer than one slot in eight used: shrink, which rehashes everything
+ if (count * 8 < size && size > 32)
+ Shrink();
+ else
+ {
+ // rehash the next strings until we find empty slot
+ index = (index + 1) & mask;
+
+ while (strings[index] != NULL)
+ {
+ // Only entries away from their home slot may need to move
+ if ((keys[index] & mask) != index)
+ {
+ unsigned int h = Iterate(keys[index], *strings[index]);
+
+ if (h != (unsigned int) index)
+ {
+ keys[h] = keys[index];
+ strings[h] = strings[index];
+ doubles[h] = doubles[index];
+
+ strings[index] = NULL;
+ }
+ }
+
+ index = (index + 1) & mask;
+ }
+ }
+ }
+
+// Writes every stored key to standard output, one per line.
+void StringHash::Print()
+ {
+ Print(stdout);
+ }
+
+// Writes every stored key, one per line, to a newly created text file.
+// Nothing happens when the file cannot be opened.
+void StringHash::Print(const char * filename)
+   {
+   FILE * output = fopen(filename, "wt");
+
+   if (output != NULL)
+      {
+      Print(output);
+      fclose(output);
+      }
+   }
+
+// Writes every stored key to output, one per line, in slot order.
+void StringHash::Print(FILE * output)
+   {
+   for (unsigned int slot = 0; slot < size; slot++)
+      {
+      if (!SlotInUse(slot))
+         continue;
+
+      strings[slot]->WriteLine(output);
+      }
+   }
+
+// Returns the integer stored for key, or 0 when key is not in the table.
+int StringIntHash::GetCount(const String & key) const
+   {
+   int index = Find(key);
+
+   if (index == -1)
+      return 0;
+
+   return integers[index];
+   }
+
+// Adds one to the integer stored for key and returns the new value. A key
+// that is not yet in the table is stored with the value 1.
+int StringIntHash::IncrementCount(const String & key)
+   {
+   int index = Find(key);
+
+   if (index == -1)
+      {
+      SetInteger(key, 1);
+      return 1;
+      }
+
+   return ++(integers[index]);
+   }
+
+// Subtracts one from the integer stored for key and returns the new value.
+// A key that is not yet in the table is stored with the value -1.
+int StringIntHash::DecrementCount(const String & key)
+   {
+   int index = Find(key);
+
+   if (index == -1)
+      {
+      SetInteger(key, -1);
+      return -1;
+      }
+
+   return --(integers[index]);
+   }
+
+// Removes every key. Tables larger than 256 slots are resized to 256.
+void StringDoubleHash::Clear()
+   {
+   // Deleting a NULL pointer is a no-op, so unused slots need no test
+   for (unsigned int slot = 0; slot < size; slot++)
+      {
+      delete strings[slot];
+      strings[slot] = NULL;
+      }
+
+   count = 0;
+
+   if (size > 256)
+      SetSize(256);
+   }
+
+// Replaces the contents of this table with the keys of rhs. Object pointers
+// are copied as they are; the objects themselves are not duplicated.
+StringHash & StringHash::operator = (const StringHash & rhs)
+   {
+   // Self-assignment must be a no-op: Clear() would otherwise destroy the
+   // very entries the loop below is about to copy
+   if (this == &rhs)
+      return *this;
+
+   Clear();
+
+   for (int i = 0; i < rhs.Capacity(); i++)
+      if (rhs.SlotInUse(i))
+         Add(*(rhs.strings[i]), rhs.objects[i]);
+
+   return *this;
+   }
+
+// Replaces the contents of this table with the keys and integers of rhs.
+StringIntHash & StringIntHash::operator = (const StringIntHash & rhs)
+   {
+   // Self-assignment must be a no-op: Clear() would otherwise destroy the
+   // very entries the loop below is about to copy
+   if (this == &rhs)
+      return *this;
+
+   Clear();
+
+   for (int i = 0; i < rhs.Capacity(); i++)
+      if (rhs.SlotInUse(i))
+         Add(*(rhs.strings[i]), rhs.integers[i]);
+
+   return *this;
+   }
+
+// Replaces the contents of this table with the keys and doubles of rhs.
+StringDoubleHash & StringDoubleHash::operator = (const StringDoubleHash & rhs)
+   {
+   // Self-assignment must be a no-op: Clear() would otherwise destroy the
+   // very entries the loop below is about to copy
+   if (this == &rhs)
+      return *this;
+
+   Clear();
+
+   for (int i = 0; i < rhs.Capacity(); i++)
+      if (rhs.SlotInUse(i))
+         Add(*(rhs.strings[i]), rhs.doubles[i]);
+
+   return *this;
+   }
+
+
+
diff --git a/libsrc/StringHash.h b/libsrc/StringHash.h
new file mode 100644
index 0000000..6179342
--- /dev/null
+++ b/libsrc/StringHash.h
@@ -0,0 +1,276 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringHash.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __STRINGHASH_H__
+#define __STRINGHASH_H__
+
+#include "StringBasics.h"
+#include "Constant.h"
+#include "Hash.h"
+
+// Open-addressing hash table mapping String keys to untyped (void *) object
+// pointers. Collisions are resolved by linear probing (see Iterate below).
+class StringHash
+ {
+ protected:
+ // Parallel arrays indexed by slot: key string (NULL marks an empty slot),
+ // associated object pointer, and the hash value stored by Insert()
+ String ** strings;
+ void ** objects;
+ unsigned int * keys;
+ // count = number of stored keys (incremented by Insert), size = number of slots
+ unsigned int count, size;
+ // Bit mask used to wrap a hash or probe position into the table;
+ // presumably size - 1 with size a power of two -- confirm in SetSize()
+ unsigned int mask;
+
+ public:
+ StringHash(int startsize = 32);
+ virtual ~StringHash();
+
+ // Double / halve the number of slots
+ void Grow() { SetSize(size * 2); }
+ void Shrink() { SetSize(size / 2); }
+
+ void SetSize(int newsize);
+
+ void Clear();
+
+ // Number of slots and number of stored keys (both converted from unsigned to int)
+ int Capacity() const { return size; }
+ int Entries() const { return count; }
+
+ // Object stored at slot i (no bounds or occupancy check)
+ void * Object(int i) const { return objects[i]; }
+ // Object stored for key, or NULL when Find() reports a negative index
+ void * Object(const String & key) const
+ {
+ int index = Find(key);
+
+ return index >= 0 ? objects[index] : NULL;
+ }
+ // Looks the key up through Find(key, create_object) and indexes objects[]
+ // with the result without checking it
+ void * Object(const String & key, void * (*create_object)())
+ {
+ int index = Find(key, create_object);
+
+ return objects[index];
+ }
+
+ void SetObject(int i, void * object)
+ { objects[i] = object; }
+ void SetObject(const String & key, void * object)
+ { Add(key, object); }
+
+ int Add(const String & s, void * object = NULL);
+ int Find(const String & s, void * (*create_object)() = NULL);
+ int Find(const String & s) const;
+
+ StringHash & operator = (const StringHash & rhs);
+
+ // Key stored at slot i; dereferences strings[i], so the slot must be in use
+ const String & operator [] (int i) const { return *(strings[i]); }
+ String & operator [] (int i) { return *(strings[i]); }
+// String & String(int i) { return *(strings[i]); }
+
+ static void * CreateHash();
+
+ void Delete(unsigned int index);
+ // NOTE(review): the int result of Find() is converted to unsigned int here, so
+ // a negative "not found" result becomes a very large index -- confirm that
+ // Delete(unsigned int) rejects out-of-range indices
+ void Delete(const String & key) { Delete(Find(key)); }
+
+ // True when slot index holds a key
+ bool SlotInUse(int index) const { return strings[index] != NULL; }
+
+ void Print();
+ void Print(FILE * file);
+ void Print(const char * filename);
+
+ // Initialize hash with the contents of a file
+ void ReadLinesFromFile(FILE * file);
+ void ReadLinesFromFile(const char * filename);
+
+#ifdef __ZLIB_AVAILABLE__
+ void ReadLinesFromFile(IFILE & file);
+#endif
+
+ private:
+
+ // Linear probe starting at (key & mask): returns the first slot that is
+ // either empty or holds the same hash value and an equal string
+ unsigned int Iterate(unsigned int key, const String & string) const
+ {
+ unsigned int h = key & mask;
+
+ while ( strings[h] != NULL &&
+ ( keys[h] != key ||
+ strings[h]->SlowCompare(string) != 0) )
+ h = (h + 1) & mask;
+
+ return h;
+ }
+
+ // Stores a heap-allocated copy of string and its hash in slot `where`
+ // and bumps the entry count; does not touch objects[where]
+ void Insert(unsigned int where, unsigned int key, const String & string)
+ {
+ strings[where] = new String;
+ *(strings[where]) = string;
+ keys[where] = key;
+
+ count++;
+ }
+ };
+
+// Open-addressing hash table mapping String keys to int values.
+// Collisions are resolved by linear probing (see Iterate below).
+class StringIntHash
+ {
+ protected:
+ // Parallel arrays indexed by slot: key string (NULL marks an empty slot),
+ // associated integer, and the hash value stored by Insert()
+ String ** strings;
+ int * integers;
+ unsigned int * keys;
+ // count = number of stored keys (incremented by Insert), size = number of slots
+ unsigned int count, size;
+ // Bit mask used to wrap a hash or probe position into the table;
+ // presumably size - 1 with size a power of two -- confirm in SetSize()
+ unsigned int mask;
+
+ public:
+ StringIntHash(int startsize = 32);
+ virtual ~StringIntHash();
+
+ // Double / halve the number of slots
+ void Grow() { SetSize(size * 2); }
+ void Shrink() { SetSize(size / 2); }
+
+ void SetSize(int newsize);
+
+ void Clear();
+
+ // Number of slots and number of stored keys (both converted from unsigned to int)
+ int Capacity() const { return size; }
+ int Entries() const { return count; }
+
+ // Integer stored at slot i (no bounds or occupancy check)
+ int Integer(int i) const { return integers[i]; }
+ // Integer stored for key, or -1 when Find() reports a negative index
+ // (contrast with GetCount(key), which returns 0 for a missing key)
+ int Integer(const String & key) const
+ {
+ int index = Find(key);
+
+ return index >= 0 ? integers[index] : -1;
+ }
+
+ void SetInteger(int i, int value)
+ { integers[i] = value; }
+ void SetInteger(const String & key, int value)
+ { Add(key, value); }
+
+ // Counter-style helpers: each returns the counter value for key
+ int IncrementCount(const String & key);
+ int DecrementCount(const String & key);
+ int GetCount(const String & key) const;
+ int GetCount(int index) const { return integers[index]; }
+
+ int Add(const String & s, int integer);
+ int Find(const String & s, int defaultValue);
+ int Find(const String & s) const;
+
+ StringIntHash & operator = (const StringIntHash & rhs);
+
+ // Key stored at slot i; dereferences strings[i], so the slot must be in use
+ const String & operator [] (int i) const { return *(strings[i]); }
+ String & operator [] (int i) { return *(strings[i]); }
+// String & String(int i) { return *(strings[i]); }
+
+ void Delete(unsigned int index);
+ // NOTE(review): the int result of Find() is converted to unsigned int here, so
+ // a negative "not found" result becomes a very large index -- confirm that
+ // Delete(unsigned int) rejects out-of-range indices
+ void Delete(const String & key) { Delete(Find(key)); }
+
+ // True when slot index holds a key
+ bool SlotInUse(int index) const { return strings[index] != NULL; }
+
+ private:
+
+ // Linear probe starting at (key & mask): returns the first slot that is
+ // either empty or holds the same hash value and an equal string
+ unsigned int Iterate(unsigned int key, const String & string) const
+ {
+ unsigned int h = key & mask;
+
+ while ( strings[h] != NULL &&
+ ( keys[h] != key ||
+ strings[h]->SlowCompare(string) != 0) )
+ h = (h + 1) & mask;
+
+ return h;
+ }
+
+ // Stores a heap-allocated copy of string and its hash in slot `where`
+ // and bumps the entry count; does not touch integers[where]
+ void Insert(unsigned int where, unsigned int key, const String & string)
+ {
+ strings[where] = new String;
+ *(strings[where]) = string;
+ keys[where] = key;
+
+ count++;
+ }
+ };
+
+// Open-addressing hash table mapping String keys to double values.
+// Collisions are resolved by linear probing (see Iterate below).
+class StringDoubleHash
+ {
+ protected:
+ // Parallel arrays indexed by slot: key string (NULL marks an empty slot),
+ // associated double, and the hash value stored by Insert()
+ String ** strings;
+ double * doubles;
+ unsigned int * keys;
+ // count = number of stored keys (incremented by Insert), size = number of slots
+ unsigned int count, size;
+ // Bit mask used to wrap a hash or probe position into the table;
+ // presumably size - 1 with size a power of two -- confirm in SetSize()
+ unsigned int mask;
+
+ public:
+ StringDoubleHash(int startsize = 32);
+ virtual ~StringDoubleHash();
+
+ // Double / halve the number of slots
+ void Grow() { SetSize(size * 2); }
+ void Shrink() { SetSize(size / 2); }
+
+ void SetSize(int newsize);
+
+ void Clear();
+
+ // Number of slots and number of stored keys (both converted from unsigned to int)
+ int Capacity() const { return size; }
+ int Entries() const { return count; }
+
+ // Value stored at slot i (no bounds or occupancy check)
+ double Double(int i) const { return doubles[i]; }
+ // Value stored for key, or the _NAN_ constant when Find() reports a negative index
+ double Double(const String & key) const
+ {
+ int index = Find(key);
+
+ return index >= 0 ? doubles[index] : _NAN_;
+ }
+
+ void SetDouble(int i, double value)
+ { doubles[i] = value; }
+ void SetDouble(const String & key, double value)
+ { Add(key, value); }
+
+ int Add(const String & s, double value);
+ int Find(const String & s, double defaultValue);
+ int Find(const String & s) const;
+
+ StringDoubleHash & operator = (const StringDoubleHash & rhs);
+
+ // Key stored at slot i; dereferences strings[i], so the slot must be in use
+ const String & operator [] (int i) const { return *(strings[i]); }
+ String & operator [] (int i) { return *(strings[i]); }
+// String & String(int i) { return *(strings[i]); }
+
+ void Delete(unsigned int index);
+ // NOTE(review): the int result of Find() is converted to unsigned int here, so
+ // a negative "not found" result becomes a very large index -- confirm that
+ // Delete(unsigned int) rejects out-of-range indices
+ void Delete(const String & key) { Delete(Find(key)); }
+
+ // True when slot index holds a key
+ bool SlotInUse(int index) const { return strings[index] != NULL; }
+
+ private:
+
+ // Linear probe starting at (key & mask): returns the first slot that is
+ // either empty or holds the same hash value and an equal string
+ unsigned int Iterate(unsigned int key, const String & string) const
+ {
+ unsigned int h = key & mask;
+
+ while ( strings[h] != NULL &&
+ ( keys[h] != key ||
+ strings[h]->SlowCompare(string) != 0) )
+ h = (h + 1) & mask;
+
+ return h;
+ }
+
+ // Stores a heap-allocated copy of string and its hash in slot `where`
+ // and bumps the entry count; does not touch doubles[where]
+ void Insert(unsigned int where, unsigned int key, const String & string)
+ {
+ strings[where] = new String;
+ *(strings[where]) = string;
+ keys[where] = key;
+
+ count++;
+ }
+ };
+
+#endif
+
diff --git a/libsrc/StringMap.cpp b/libsrc/StringMap.cpp
new file mode 100644
index 0000000..4b67f2a
--- /dev/null
+++ b/libsrc/StringMap.cpp
@@ -0,0 +1,541 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringMap.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "StringMap.h"
+
+// Allocation granularity: capacities are rounded to multiples of this value
+int StringMap::alloc = 8;
+
+StringMap::StringMap(int startsize)
+   {
+   // Round the requested capacity up to the next multiple of alloc
+   // (always at least one full allocation unit)
+   int units = (startsize + alloc) / alloc;
+
+   size = units * alloc;
+   count = 0;
+
+   strings = new ::String * [size];
+   objects = new void * [size];
+   }
+
+StringMap::~StringMap()
+   {
+   // The map owns its key strings but not the objects they point to
+   int i = 0;
+
+   while (i < count)
+      delete strings[i++];
+
+   delete [] strings;
+   delete [] objects;
+   }
+
+void StringMap::Grow(int newsize)
+   {
+   // Ensures the key and object arrays can hold more than newsize entries,
+   // reallocating and copying the existing entries when they cannot.
+
+   // Nothing to do while the requested size still fits
+   if (newsize < size)
+      return;
+
+   // Large jumps (at least twice the current capacity) round up to a multiple
+   // of alloc; smaller requests grow geometrically so that repeated single
+   // insertions do not trigger a reallocation every alloc entries.
+   //
+   // The previous version unconditionally overwrote size with the rounded
+   // value after this branch, which made the doubling path dead code;
+   // this now matches StringIntMap::Grow.
+   if ((newsize >> 1) >= size)
+      size = (newsize + alloc) / alloc * alloc;
+   else
+      {
+      size = alloc;
+      while (size <= newsize)
+         size *= 2;
+      }
+
+   ::String ** newStrings = new ::String * [size];
+   void ** newObjects = new void * [size];
+
+   // Pointers are moved, not copied: ownership of the key strings carries over
+   for (int i = 0; i < count; i++)
+      {
+      newStrings[i] = strings[i];
+      newObjects[i] = objects[i];
+      }
+
+   delete [] strings;
+   delete [] objects;
+
+   strings = newStrings;
+   objects = newObjects;
+   }
+
+int StringMap::Add(const ::String & key, void * object)
+   {
+   // Stores object under key, keeping the keys sorted. An existing key has
+   // its object replaced. Returns the index of the entry.
+
+   // First entry: nothing to search, store it in slot zero
+   if (count == 0)
+      {
+      Grow(1);
+      strings[0] = new ::String(key);
+      objects[0] = object;
+      return count++;
+      }
+
+   // Binary search over the sorted keys
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = key.SlowCompare(*(strings[mid]));
+
+      if (order == 0)
+         {
+         objects[mid] = object;
+         return mid;
+         }
+
+      if (order < 0)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   int slot = lo;
+   int order = key.SlowCompare(*(strings[slot]));
+
+   if (order == 0)
+      {
+      objects[slot] = object;
+      return slot;
+      }
+
+   // The new key belongs after the candidate when it compares greater
+   if (order > 0)
+      slot++;
+
+   Grow(count + 1);
+
+   // Shift the tail up by one to open a gap at slot
+   for (int i = count; i > slot; i--)
+      {
+      strings[i] = strings[i - 1];
+      objects[i] = objects[i - 1];
+      }
+
+   strings[slot] = new ::String(key);
+   objects[slot] = object;
+   count++;
+
+   return slot;
+   }
+
+int StringMap::Find(const ::String & s, void * (*create_object)())
+   {
+   // Returns the index of key s. When the key is missing, returns -1 if
+   // create_object is NULL; otherwise inserts the key with the object
+   // returned by create_object() and returns the new index.
+
+   if (count == 0)
+      {
+      if (create_object == NULL)
+         return -1;
+
+      return Add(s, create_object());
+      }
+
+   // Binary search over the sorted keys
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = s.SlowCompare(*(strings[mid]));
+
+      if (order == 0)
+         return mid;
+
+      if (order < 0)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   int slot = lo;
+   int order = s.SlowCompare(*(strings[lo]));
+
+   if (order == 0)
+      return slot;
+
+   if (create_object == NULL)
+      return -1;
+
+   // The new key belongs after the candidate when it compares greater
+   if (order > 0)
+      slot++;
+
+   Grow(count + 1);
+
+   // Shift the tail up by one to open a gap at slot
+   for (int i = count; i > slot; i--)
+      {
+      strings[i] = strings[i - 1];
+      objects[i] = objects[i - 1];
+      }
+
+   strings[slot] = new ::String (s);
+   objects[slot] = create_object();
+   count++;
+
+   return slot;
+   }
+
+int StringMap::Find(const ::String & s) const
+   {
+   // Read-only lookup: index of key s, or -1 when it is not present
+   if (count == 0)
+      return -1;
+
+   // Binary search over the sorted keys
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = s.SlowCompare(*(strings[mid]));
+
+      if (order == 0)
+         return mid;
+
+      if (order < 0)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   if (s.SlowCompare(*(strings[lo])) == 0)
+      return lo;
+
+   return -1;
+   }
+
+int StringMap::FindStem(const ::String & stem) const
+   {
+   // Looks for the key matching stem (per SlowCompareToStem). Returns its
+   // index, -1 when no key matches, or -2 when a neighbouring key within
+   // the current search range matches as well.
+   if (count == 0)
+      return -1;
+
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = strings[mid]->SlowCompareToStem(stem);
+
+      if (order > 0)
+         {
+         hi = mid - 1;
+         continue;
+         }
+
+      if (order != 0)
+         {
+         lo = mid + 1;
+         continue;
+         }
+
+      // Match found: check the neighbours inside the range for a second match
+      if (lo < mid && strings[mid-1]->SlowCompareToStem(stem) == 0)
+         return -2;
+
+      if (hi > mid && strings[mid+1]->SlowCompareToStem(stem) == 0)
+         return -2;
+
+      return mid;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   return (strings[lo]->SlowCompareToStem(stem) == 0) ? lo : -1;
+   }
+
+void * StringMap::CreateMap()
+   {
+   // Factory returning a new, empty map as an untyped pointer; the signature
+   // matches the create_object callbacks accepted by Find() and Object()
+   StringMap * created = new StringMap();
+
+   return (void *) created;
+   }
+
+void StringMap::Clear()
+   {
+   // Frees every key string; the object pointers are left untouched and the
+   // arrays keep their current capacity
+   int i = 0;
+
+   while (i < count)
+      delete strings[i++];
+
+   count = 0;
+   }
+
+void StringMap::Delete(int index)
+   {
+   // Removes the entry at index (no bounds check) and closes the gap;
+   // the associated object is not freed
+   delete strings[index];
+
+   count--;
+
+   // Slide every later entry down by one position
+   for (int from = index + 1; from <= count; from++)
+      {
+      strings[from - 1] = strings[from];
+      objects[from - 1] = objects[from];
+      }
+   }
+
+// StringIntMap class
+//
+
+// Allocation granularity: capacities are rounded to multiples of this value
+int StringIntMap::alloc = 8;
+
+StringIntMap::StringIntMap(int startsize)
+   {
+   // Round the requested capacity up to the next multiple of alloc
+   // (always at least one full allocation unit)
+   int units = (startsize + alloc) / alloc;
+
+   size = units * alloc;
+   count = 0;
+
+   strings = new ::String * [size];
+   integers = new int[size];
+   }
+
+StringIntMap::~StringIntMap()
+   {
+   // Release the key strings first, then the two parallel arrays
+   int i = 0;
+
+   while (i < count)
+      delete strings[i++];
+
+   delete [] strings;
+   delete [] integers;
+   }
+
+void StringIntMap::Grow(int newsize)
+   {
+   // Ensures the key and integer arrays can hold more than newsize entries,
+   // reallocating and copying the existing entries when they cannot.
+   if (newsize < size)
+      return;
+
+   // Large jumps round up to a multiple of alloc; otherwise grow geometrically
+   if ((newsize >> 1) >= size)
+      size = (newsize + alloc) / alloc * alloc;
+   else
+      for (size = alloc; size <= newsize; size *= 2)
+         ;
+
+   ::String ** biggerStrings = new ::String * [size];
+   int * biggerIntegers = new int [size];
+
+   // Pointers are moved, not copied: ownership of the key strings carries over
+   for (int entry = 0; entry < count; entry++)
+      {
+      biggerStrings[entry] = strings[entry];
+      biggerIntegers[entry] = integers[entry];
+      }
+
+   delete [] strings;
+   delete [] integers;
+
+   strings = biggerStrings;
+   integers = biggerIntegers;
+   }
+
+int StringIntMap::Add(const ::String & key, int integer)
+   {
+   // Stores integer under key, keeping the keys sorted. An existing key has
+   // its value replaced. Returns the index of the entry.
+
+   // First entry: nothing to search, store it in slot zero
+   if (count == 0)
+      {
+      Grow(1);
+      strings[0] = new ::String(key);
+      integers[0] = integer;
+      return count++;
+      }
+
+   // Binary search over the sorted keys
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = key.SlowCompare(*(strings[mid]));
+
+      if (order == 0)
+         {
+         integers[mid] = integer;
+         return mid;
+         }
+
+      if (order < 0)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   int slot = lo;
+   int order = key.SlowCompare(*(strings[slot]));
+
+   if (order == 0)
+      {
+      integers[slot] = integer;
+      return slot;
+      }
+
+   // The new key belongs after the candidate when it compares greater
+   if (order > 0)
+      slot++;
+
+   Grow(count + 1);
+
+   // Shift the tail up by one to open a gap at slot
+   for (int i = count; i > slot; i--)
+      {
+      strings[i] = strings[i - 1];
+      integers[i] = integers[i - 1];
+      }
+
+   strings[slot] = new ::String(key);
+   integers[slot] = integer;
+   count++;
+
+   return slot;
+   }
+
+int StringIntMap::Find(const ::String & s, int defaultValue)
+   {
+   // Returns the index of key s, inserting it with defaultValue when it is
+   // not already present (so this overload never reports "not found")
+   if (count == 0)
+      return Add(s, defaultValue);
+
+   // Binary search over the sorted keys
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = s.SlowCompare(*(strings[mid]));
+
+      if (order == 0)
+         return mid;
+
+      if (order < 0)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   int slot = lo;
+   int order = s.SlowCompare(*(strings[lo]));
+
+   if (order == 0)
+      return slot;
+
+   // The new key belongs after the candidate when it compares greater
+   if (order > 0)
+      slot++;
+
+   Grow(count + 1);
+
+   // Shift the tail up by one to open a gap at slot
+   for (int i = count; i > slot; i--)
+      {
+      strings[i] = strings[i - 1];
+      integers[i] = integers[i - 1];
+      }
+
+   strings[slot] = new ::String (s);
+   integers[slot] = defaultValue;
+   count++;
+
+   return slot;
+   }
+
+int StringIntMap::Find(const ::String & s) const
+   {
+   // Read-only lookup: index of key s, or -1 when it is not present
+   if (count == 0)
+      return -1;
+
+   // Binary search over the sorted keys
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = s.SlowCompare(*(strings[mid]));
+
+      if (order == 0)
+         return mid;
+
+      if (order < 0)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   if (s.SlowCompare(*(strings[lo])) == 0)
+      return lo;
+
+   return -1;
+   }
+
+int StringIntMap::FindStem(const ::String & stem) const
+   {
+   // Looks for the key matching stem (per SlowCompareToStem). Returns its
+   // index, -1 when no key matches, or -2 when a neighbouring key within
+   // the current search range matches as well.
+   if (count == 0)
+      return -1;
+
+   int lo = 0;
+   int hi = count - 1;
+
+   while (hi > lo)
+      {
+      int mid = (lo + hi) / 2;
+      int order = strings[mid]->SlowCompareToStem(stem);
+
+      if (order > 0)
+         {
+         hi = mid - 1;
+         continue;
+         }
+
+      if (order != 0)
+         {
+         lo = mid + 1;
+         continue;
+         }
+
+      // Match found: check the neighbours inside the range for a second match
+      if (lo < mid && strings[mid-1]->SlowCompareToStem(stem) == 0)
+         return -2;
+
+      if (hi > mid && strings[mid+1]->SlowCompareToStem(stem) == 0)
+         return -2;
+
+      return mid;
+      }
+
+   // The search narrowed to a single candidate; compare against it
+   return (strings[lo]->SlowCompareToStem(stem) == 0) ? lo : -1;
+   }
+
+void StringIntMap::Clear()
+   {
+   // Frees every key string; the arrays keep their current capacity
+   int i = 0;
+
+   while (i < count)
+      delete strings[i++];
+
+   count = 0;
+   }
+
+int StringIntMap::GetCount(const ::String & key) const
+   {
+   // A key that is not in the map has an implicit count of zero
+   int slot = Find(key);
+
+   if (slot == -1)
+      return 0;
+
+   return integers[slot];
+   }
+
+int StringIntMap::IncrementCount(const ::String & key)
+   {
+   // Returns the counter value after the increment; a key seen for the
+   // first time is inserted with a count of one
+   int slot = Find(key);
+
+   if (slot == -1)
+      {
+      SetInteger(key, 1);
+      return 1;
+      }
+
+   integers[slot] += 1;
+   return integers[slot];
+   }
+
+int StringIntMap::DecrementCount(const ::String & key)
+   {
+   // Returns the counter value after the decrement; a key seen for the
+   // first time is inserted with a count of minus one
+   int slot = Find(key);
+
+   if (slot == -1)
+      {
+      SetInteger(key, -1);
+      return -1;
+      }
+
+   integers[slot] -= 1;
+   return integers[slot];
+   }
+
+void StringIntMap::Delete(int index)
+   {
+   // Removes the entry at index (no bounds check) and closes the gap
+   delete strings[index];
+
+   count--;
+
+   // Slide every later entry down by one position
+   for (int from = index + 1; from <= count; from++)
+      {
+      strings[from - 1] = strings[from];
+      integers[from - 1] = integers[from];
+      }
+   }
+
+
+
+
diff --git a/libsrc/StringMap.h b/libsrc/StringMap.h
new file mode 100644
index 0000000..fd1a95a
--- /dev/null
+++ b/libsrc/StringMap.h
@@ -0,0 +1,122 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/StringMap.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __STRINGMAP_H__
+#define __STRINGMAP_H__
+
+#include "StringBasics.h"
+
+// Sorted associative array mapping ::String keys to untyped (void *) object
+// pointers. Keys are kept in SlowCompare order and located by binary search
+// (see StringMap.cpp); the map owns its key strings but never frees the objects.
+class StringMap
+ {
+ protected:
+ // Parallel arrays: strings[i] is the i-th key in sorted order and
+ // objects[i] its associated pointer
+ ::String ** strings;
+ void ** objects;
+ // count = number of entries in use, size = allocated capacity
+ int count, size;
+
+ public:
+ // Allocation granularity shared by all instances (defined as 8 in StringMap.cpp)
+ static int alloc;
+
+ StringMap(int startsize = 0);
+ virtual ~StringMap();
+
+ void Grow(int newsize);
+ void Clear();
+ int Length() const { return count; }
+
+ // Object at position i (no bounds check)
+ void * Object(int i) const { return objects[i]; }
+ // Object stored for key, or NULL when the key is absent
+ void * Object(const ::String & key) const
+ {
+ int index = Find(key);
+ return (index >= 0) ? objects[index] : NULL;
+ }
+ // Object stored for key, created through create_object() when the key is absent.
+ // Find() returns -1 for a missing key when create_object is NULL, and that
+ // result is used as an array index here without a check.
+ void * Object(const ::String & key, void * (*create_object)())
+ { return objects[Find(key, create_object)]; }
+
+ void SetObject(int i, void * object)
+ { objects[i] = object; }
+ void SetObject(const ::String & key, void * object)
+ { Add(key, object); }
+
+ // Add inserts or overwrites and returns the entry index; Find returns the
+ // index or -1; FindStem additionally returns -2 for an ambiguous stem
+ int Add(const ::String & s, void * object = NULL);
+ int Find(const ::String & s, void * (*create_object)() = NULL);
+ int Find(const ::String & s) const;
+ int FindStem(const ::String & stem) const;
+
+ // NOTE(review): no definition of this operator appears in the StringMap.cpp
+ // shown alongside this header -- confirm it is defined before relying on it
+ StringMap & operator = (const StringMap & rhs);
+
+ // Key at position i (no bounds check)
+ const ::String & operator [] (int i) const { return *(strings[i]); }
+ ::String & operator [] (int i) { return *(strings[i]); }
+ ::String & String(int i) { return *(strings[i]); }
+
+ // Factory returning a new StringMap as void *
+ static void * CreateMap();
+
+ void Delete(int index);
+ };
+
+// Sorted associative array mapping ::String keys to int values. Keys are kept
+// in SlowCompare order and located by binary search (see StringMap.cpp).
+class StringIntMap
+ {
+ protected:
+ // Parallel arrays: strings[i] is the i-th key in sorted order and
+ // integers[i] its associated value
+ ::String ** strings;
+ int * integers;
+ // count = number of entries in use, size = allocated capacity
+ int count, size;
+
+ public:
+ // Allocation granularity shared by all instances (defined as 8 in StringMap.cpp)
+ static int alloc;
+
+ StringIntMap(int startsize = 0);
+ virtual ~StringIntMap();
+
+ void Grow(int newsize);
+ void Clear();
+ int Length() const { return count; }
+
+ // Value at position i (no bounds check)
+ int Integer(int i) const { return integers[i]; }
+ // Value stored for key, or -1 when the key is absent
+ // (contrast with GetCount(key), which returns 0 for a missing key)
+ int Integer(const ::String & key) const
+ {
+ int index = Find(key);
+ return (index >= 0) ? (int) integers[index] : -1;
+ }
+
+ void SetInteger(int i, int value)
+ { integers[i] = value; }
+ void SetInteger(const ::String & key, int value)
+ { Add(key, value); }
+
+ // Add inserts or overwrites and returns the entry index; the two-argument
+ // Find inserts defaultValue for a missing key, the const Find returns -1;
+ // FindStem additionally returns -2 for an ambiguous stem
+ int Add(const ::String & s, int i);
+ int Find(const ::String & s, int defaultValue);
+ int Find(const ::String & s) const;
+ int FindStem(const ::String & stem) const;
+
+ // NOTE(review): no definition of this operator appears in the StringMap.cpp
+ // shown alongside this header -- confirm it is defined before relying on it
+ StringIntMap & operator = (const StringIntMap & rhs);
+
+ // Key at position i (no bounds check)
+ const ::String & operator [] (int i) const { return *(strings[i]); }
+ ::String & operator [] (int i) { return *(strings[i]); }
+ ::String & String(int i) { return *(strings[i]); }
+
+ // NOTE(review): StringMap.cpp as shown defines StringMap::CreateMap only;
+ // confirm a definition of StringIntMap::CreateMap exists elsewhere
+ static void * CreateMap();
+
+ // Counter-style helpers: each returns the counter value for key
+ int IncrementCount(const ::String & key);
+ int DecrementCount(const ::String & key);
+ int GetCount(const ::String & key) const;
+ int GetCount(int index) const { return integers[index]; }
+
+ void Delete(int index);
+ };
+
+#endif
+
+
diff --git a/libsrc/TraitTransformations.cpp b/libsrc/TraitTransformations.cpp
new file mode 100644
index 0000000..dc4c1a5
--- /dev/null
+++ b/libsrc/TraitTransformations.cpp
@@ -0,0 +1,121 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/TraitTransformations.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "TraitTransformations.h"
+#include "QuickIndex.h"
+#include "MathStats.h"
+
+void InverseNormalTransform(Pedigree & ped)
+   {
+   // Replaces every non-missing value of every trait in ped by a normal
+   // quantile (via ninv) derived from its rank; tied values share the
+   // quantile of their average rank.
+   Vector     values;
+   IntArray   owners;
+   QuickIndex order;
+
+   // Size both containers for the whole pedigree once, presumably so that
+   // the per-trait pushes below do not need to reallocate
+   values.Dimension(ped.count);
+   owners.Dimension(ped.count);
+
+   for (int trait = 0; trait < ped.traitCount; trait++)
+      {
+      values.Dimension(0);
+      owners.Dimension(0);
+
+      // Collect the observed (non-missing) phenotypes and who they belong to
+      for (int person = 0; person < ped.count; person++)
+         {
+         if (ped[person].traits[trait] == _NAN_)
+            continue;
+
+         values.Push(ped[person].traits[trait]);
+         owners.Push(person);
+         }
+
+      int observed = owners.Length();
+
+      if (observed == 0) continue;
+
+      order.Index(values);
+
+      double scale = 1.0 / observed;
+
+      // Walk the ranked values one run of ties at a time
+      int first = 0;
+
+      while (first < order.Length())
+         {
+         int last = first;
+
+         // Advance while the run continues, stopping at the final index
+         while (last + 1 < order.Length() &&
+                ped[owners[order[first]]].traits[trait] ==
+                ped[owners[order[last]]].traits[trait])
+            last++;
+
+         // Step back if we stopped on the first value that differs
+         if (ped[owners[order[first]]].traits[trait] !=
+             ped[owners[order[last]]].traits[trait])
+            last--;
+
+         double z = ninv(((first + last) * 0.5 + 0.5) * scale);
+
+         for (int k = first; k <= last; k++)
+            ped[owners[order[k]]].traits[trait] = z;
+
+         first = last + 1;
+         }
+      }
+   }
+
+void InverseNormalTransform(Pedigree & ped, int trait)
+   {
+   // Replaces every non-missing value of one trait in ped by a normal
+   // quantile (via ninv) derived from its rank; tied values share the
+   // quantile of their average rank.
+   Vector     values;
+   IntArray   owners;
+   QuickIndex order;
+
+   // Size for the whole pedigree, then reset to empty before pushing;
+   // presumably this pre-allocates storage
+   values.Dimension(ped.count);
+   values.Dimension(0);
+
+   owners.Dimension(ped.count);
+   owners.Dimension(0);
+
+   // Collect the observed (non-missing) phenotypes and who they belong to
+   for (int person = 0; person < ped.count; person++)
+      {
+      if (ped[person].traits[trait] == _NAN_)
+         continue;
+
+      values.Push(ped[person].traits[trait]);
+      owners.Push(person);
+      }
+
+   int observed = owners.Length();
+
+   if (observed == 0) return;
+
+   order.Index(values);
+
+   double scale = 1.0 / observed;
+
+   // Walk the ranked values one run of ties at a time
+   int first = 0;
+
+   while (first < order.Length())
+      {
+      int last = first;
+
+      // Advance while the run continues, stopping at the final index
+      while (last + 1 < order.Length() &&
+             ped[owners[order[first]]].traits[trait] ==
+             ped[owners[order[last]]].traits[trait])
+         last++;
+
+      // Step back if we stopped on the first value that differs
+      if (ped[owners[order[first]]].traits[trait] !=
+          ped[owners[order[last]]].traits[trait])
+         last--;
+
+      double z = ninv(((first + last) * 0.5 + 0.5) * scale);
+
+      for (int k = first; k <= last; k++)
+         ped[owners[order[k]]].traits[trait] = z;
+
+      first = last + 1;
+      }
+   }
+
+
+
+
diff --git a/libsrc/TraitTransformations.h b/libsrc/TraitTransformations.h
new file mode 100644
index 0000000..888986a
--- /dev/null
+++ b/libsrc/TraitTransformations.h
@@ -0,0 +1,30 @@
+//////////////////////////////////////////////////////////////////////
+// libsrc/TraitTransformations.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __TRAIT_TRANSFORMS__
+#define __TRAIT_TRANSFORMS__
+
+#include "Pedigree.h"
+
+// Rank-based inverse normal transformation of trait values, applied in place.
+// Missing values (equal to _NAN_) are left untouched; tied values receive the
+// same transformed score. The first overload processes every trait in the
+// pedigree, the second only the trait with the given index.
+void InverseNormalTransform(Pedigree & ped);
+void InverseNormalTransform(Pedigree & ped, int trait);
+
+#endif
+
+
+
+
diff --git a/mach1/AssociationAnalysis.cpp b/mach1/AssociationAnalysis.cpp
new file mode 100644
index 0000000..1f9a930
--- /dev/null
+++ b/mach1/AssociationAnalysis.cpp
@@ -0,0 +1,201 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/AssociationAnalysis.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "AssociationAnalysis.h"
+#include "MathStats.h"
+
+#include <math.h>
+
+void AssociationAnalysis::ScoreNPL(String & prefix, Pedigree & ped, Haplotyper & engine, int rounds)
+   {
+   // Writes <prefix>.npl: one row per marker and, for every affection status,
+   // thirteen tab-separated columns -- mean and s.d. of the scaled disease
+   // score among individuals with status 1 (U) and status 2 (A), a Welch
+   // t statistic comparing the two groups with its degrees of freedom and
+   // p-value, and one-sample t / df / p-value columns for each group.
+   // Individuals with status 0 are ignored.
+   FILE * file = fopen(prefix + ".npl", "wt");
+
+   if (file == NULL)
+      {
+      // Bail out here so that we do not go on to claim the file was written
+      printf("Error opening output file [%s.npl]\n", (const char *) prefix);
+      return;
+      }
+
+   fprintf(file, "Marker");
+   for (int j = 0; j < ped.affectionCount; j++)
+      fprintf(file,
+              "\tstat(U)\ts.d.\tstat(A)\ts.d.\tt(%s)\tdf\tp-val"
+              "\tt(%s=1)\tdf\tp-val\tt(%s=2)\tdf\tp-val",
+              (const char *) ped.affectionNames[j],
+              (const char *) ped.affectionNames[j],
+              (const char *) ped.affectionNames[j]);
+   fprintf(file, "\n");
+
+   // Raw scores are divided by the number of rounds times engine.states
+   double scale = 1.0 / (rounds * engine.states);
+
+   for (int i = 0; i < ped.markerCount; i++)
+      {
+      fprintf(file, "%s", (const char *) ped.markerNames[i]);
+
+      for (int j = 0; j < ped.affectionCount; j++)
+         {
+         // Index 0 collects status 1, index 1 collects status 2
+         double sum[2] = {0.0, 0.0}, sumsq[2] = {0.0, 0.0};
+         int counts[2] = {0, 0};
+
+         for (int k = 0; k < ped.count; k++)
+            if (ped[k].affections[j] != 0)
+               {
+               double score = engine.diseaseScores[k][i * ped.affectionCount + j] * scale;
+
+               sum[ped[k].affections[j] - 1] += score;
+               sumsq[ped[k].affections[j] - 1] += score * score;
+               counts[ped[k].affections[j] - 1] ++;
+
+#if _DEBUG
+               printf("Person %s->%s: %.3f\n",
+                      (const char *) ped[k].famid, (const char *) ped[k].pid, score);
+#endif
+               }
+
+         if (counts[0] <= 1 || counts[1] <= 1)
+            {
+            // A variance needs at least two observations per group. Emit one
+            // placeholder for each of the thirteen columns so that the columns
+            // of any following affection status stay aligned with the header.
+            fprintf(file, "\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-");
+            continue;
+            }
+
+         // Means and mean squares ...
+         sum[0] /= counts[0]; sumsq[0] /= counts[0];
+         sum[1] /= counts[1]; sumsq[1] /= counts[1];
+
+         // ... turned into unbiased sample variances
+         sumsq[0] = (sumsq[0] - (sum[0] * sum[0])) * counts[0] / (counts[0] - 1);
+         sumsq[1] = (sumsq[1] - (sum[1] * sum[1])) * counts[1] / (counts[1] - 1);
+
+         // Squared standard errors of the two means
+         double s0n = sumsq[0] / counts[0];
+         double s1n = sumsq[1] / counts[1];
+
+         // Welch t statistic and Welch-Satterthwaite degrees of freedom; the
+         // small constants keep both divisions defined when variances are zero
+         double t = (sum[1] - sum[0]) / sqrt(s0n + s1n + 1e-10);
+         double df = (s0n + s1n) * (s0n + s1n) /
+                     (s0n * s0n / (counts[0] - 1) + s1n * s1n / (counts[1] - 1) + 1e-30);
+
+         fprintf(file, "\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t",
+                 sum[0], sqrt(sumsq[0]), sum[1], sqrt(sumsq[1]), t, df);
+
+         OutputPValue(file, t, df);
+
+         // One-sample statistic for the status-1 group; the sign is flipped
+         // when the p-value is computed
+         double tunaff = sum[0] / sqrt(s0n + 1e-10);
+         double dfunaff = counts[0] - 1;
+
+         fprintf(file, "\t%.3f\t%.0f\t", tunaff, dfunaff);
+
+         OutputPValue(file, -tunaff, dfunaff);
+
+         // One-sample statistic for the status-2 group
+         double taff = sum[1] / sqrt(s1n + 1e-10);
+         double dfaff = counts[1] - 1;
+
+         fprintf(file, "\t%.3f\t%.0f\t", taff, dfaff);
+
+         OutputPValue(file, taff, dfaff);
+         }
+
+      fprintf(file, "\n");
+      }
+
+   fclose(file);
+
+   printf("Wrote out file [%s.npl] with NPL statistics ...\n\n", (const char *) prefix);
+   }
+
+void AssociationAnalysis::ScoreMarkers(String & prefix, Pedigree & ped, DosageCalculator & doses)
+   {
+   // Writes <prefix>.assoc: one row per marker and, for every affection
+   // status, seven tab-separated columns -- mean and s.d. of the dosage among
+   // individuals with status 1 (U) and status 2 (A), a Welch t statistic
+   // comparing the two groups, its degrees of freedom and the p-value from
+   // tdist. Individuals with status 0 are ignored.
+   FILE * file = fopen(prefix + ".assoc", "wt");
+
+   if (file == NULL)
+      {
+      // Bail out here so that we do not go on to claim the file was written
+      printf("Error opening output file [%s.assoc]\n", (const char *) prefix);
+      return;
+      }
+
+   fprintf(file, "Marker");
+   for (int j = 0; j < ped.affectionCount; j++)
+      fprintf(file, "\tdose(U)\ts.d.\tdose(A)\ts.d.\tt(%s)\tdf\tp-val", (const char *) ped.affectionNames[j]);
+   fprintf(file, "\n");
+
+   for (int i = 0; i < ped.markerCount; i++)
+      {
+      fprintf(file, "%s", (const char *) ped.markerNames[i]);
+
+      for (int j = 0; j < ped.affectionCount; j++)
+         {
+         // Index 0 collects status 1, index 1 collects status 2
+         double sum[2] = {0.0, 0.0}, sumsq[2] = {0.0, 0.0};
+         int counts[2] = {0, 0};
+
+         for (int k = 0; k < ped.count; k++)
+            if (ped[k].affections[j] != 0)
+               {
+               double score = doses.GetDosage(k, i);
+
+               sum[ped[k].affections[j] - 1] += score;
+               sumsq[ped[k].affections[j] - 1] += score * score;
+               counts[ped[k].affections[j] - 1] ++;
+               }
+
+         if (counts[0] <= 1 || counts[1] <= 1)
+            {
+            // A variance needs at least two observations per group
+            fprintf(file, "\t-\t-\t-\t-\t-\t-\t-");
+            continue;
+            }
+
+         // Means and mean squares ...
+         sum[0] /= counts[0]; sumsq[0] /= counts[0];
+         sum[1] /= counts[1]; sumsq[1] /= counts[1];
+
+         // ... turned into unbiased sample variances
+         sumsq[0] = (sumsq[0] - (sum[0] * sum[0])) * counts[0] / (counts[0] - 1);
+         sumsq[1] = (sumsq[1] - (sum[1] * sum[1])) * counts[1] / (counts[1] - 1);
+
+         // Squared standard errors of the two means
+         double s0n = sumsq[0] / counts[0];
+         double s1n = sumsq[1] / counts[1];
+
+         // Welch t statistic and Welch-Satterthwaite degrees of freedom. The
+         // 1e-30 term (as in ScoreNPL) keeps df from becoming 0/0 = NaN when
+         // both groups have zero variance; df is then 0 and "-" is printed.
+         double t = (sum[1] - sum[0]) / sqrt(s0n + s1n + 1e-10);
+         double df = (s0n + s1n) * (s0n + s1n) /
+                     (s0n * s0n / (counts[0] - 1) + s1n * s1n / (counts[1] - 1) + 1e-30);
+
+         fprintf(file, "\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t",
+                 sum[0], sqrt(sumsq[0]), sum[1], sqrt(sumsq[1]), t,
+                 df);
+
+         if (df < 1.0)
+            fprintf(file, "-");
+         else
+            {
+            double pvalue = tdist(t, df);
+
+            fprintf(file, "%#.2g", pvalue);
+            }
+         }
+
+      fprintf(file, "\n");
+      }
+
+   fclose(file);
+
+   printf("Wrote out file [%s.assoc] with marker-by-marker t statistics ...\n\n",
+          (const char *) prefix);
+   }
+
+void AssociationAnalysis::OutputPValue(FILE * file, double t, double df)
+   {
+   // Prints "-" when fewer than one degree of freedom is available; otherwise
+   // prints half of tdist(t, df) for non-negative t and one minus that half
+   // for negative t
+   if (df < 1.0)
+      {
+      fprintf(file, "-");
+      return;
+      }
+
+   double half = tdist(t, df) * 0.5;
+   double pvalue = (t >= 0.0) ? half : 1.0 - half;
+
+   fprintf(file, "%#.2g", pvalue);
+   }
+
diff --git a/mach1/AssociationAnalysis.h b/mach1/AssociationAnalysis.h
new file mode 100644
index 0000000..3a59b98
--- /dev/null
+++ b/mach1/AssociationAnalysis.h
@@ -0,0 +1,38 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/AssociationAnalysis.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __ASSOCIATIONANALYSIS_H__
+#define __ASSOCIATIONANALYSIS_H__
+
+#include "Pedigree.h"
+#include "Haplotyper.h"
+#include "DosageCalculator.h"
+
+// Collection of static routines that write per-marker association summaries
+// to text files named after a caller-supplied prefix.
+class AssociationAnalysis
+ {
+ public:
+ // Writes <prefix>.npl using the disease scores held by engine, scaled by rounds
+ static void ScoreNPL(String & prefix, Pedigree & ped, Haplotyper & engine, int rounds);
+ // Writes <prefix>.assoc with t statistics computed from marker dosages
+ static void ScoreMarkers(String & prefix, Pedigree & ped, DosageCalculator & doses);
+
+ private:
+ // Prints a p-value for t with df degrees of freedom, or "-" when df < 1
+ static void OutputPValue(FILE * output, double t, double df);
+ };
+
+#endif
+
+
+
diff --git a/mach1/CostCalculator.cpp b/mach1/CostCalculator.cpp
new file mode 100644
index 0000000..1eb6bfc
--- /dev/null
+++ b/mach1/CostCalculator.cpp
@@ -0,0 +1,136 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/CostCalculator.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "CostCalculator.h"
+#include "MemoryAllocators.h"
+#include "MathStats.h"
+#include "LongLongCounter.h"
+
+#include <math.h>
+
+// Starts without cost or path tables; OptimizeCost() allocates them.
+CostCalculator::CostCalculator()
+   {
+   path = NULL;
+   cost = NULL;
+   }
+
+// Releases the tables built by OptimizeCost(), if any.
+CostCalculator::~CostCalculator()
+   {
+   // delete [] on a null pointer does nothing, so no test is needed
+   delete [] path;
+   delete [] cost;
+   }
+
+// Decides, marker by marker, whether it is cheaper to advance one marker at
+// a time or to jump over a run of markers using the set of distinct
+// haplotypes observed along that run.
+//
+//   haplotypes  count rows, each holding at least markers entries; every
+//               entry is folded into a code as code * 2 + entry
+//               (assumes entries are 0/1 -- TODO confirm with callers)
+//   count       number of haplotypes
+//   markers     number of markers
+//
+// On return cost[i] is the cheapest cost found for reaching marker i and
+// path[i] is the marker that step starts from (path[i] == i marks a plain
+// single marker step). Tables left by an earlier call are released. When
+// markers is not positive both tables are reset to NULL.
+void CostCalculator::OptimizeCost(char ** haplotypes, int count, int markers)
+   {
+   // With no markers the tables would be empty, yet cost[0] and path[0] are
+   // written below; drop any earlier result and stop here instead
+   if (markers <= 0)
+      {
+      delete [] cost;
+      delete [] path;
+
+      cost = NULL;
+      path = NULL;
+
+      return;
+      }
+
+   LongCounter uniqueHaplotypes;
+
+   // Longest run of markers whose code still fits a non-negative long long
+   int bits = sizeof(long long) * 8 - 1;
+   int limit = count / 2;
+
+   long long * individualHaplotypes = new long long [count];
+   int ** uniqueCounts = AllocateIntMatrix(markers, bits);
+
+   // First we construct a matrix with the number of unique haplotypes
+   // along different portions of the current solution;
+   // uniqueCounts[i][j] describes markers i .. i + j
+   for (int i = 0; i < markers; i++)
+      {
+      uniqueHaplotypes.Clear();
+
+      // Retrieve one marker haplotypes
+      for (int j = 0; j < count; j++)
+         {
+         individualHaplotypes[j] = haplotypes[j][i];
+         uniqueHaplotypes.IncrementCount(individualHaplotypes[j]);
+         }
+
+      // Count the number of unique haplotypes
+      uniqueCounts[i][0] = uniqueHaplotypes.Entries();
+
+      for (int j = 1; j < bits; j++)
+         {
+         // Runs that are already too diverse, or that would extend past the
+         // last marker, are given the worst possible count
+         if (uniqueHaplotypes.Entries() > limit || i + j >= markers)
+            {
+            uniqueCounts[i][j] = count;
+            continue;
+            }
+
+         uniqueHaplotypes.Clear();
+
+         // Append marker i + j to every code and recount
+         for (int k = 0; k < count; k++)
+            {
+            individualHaplotypes[k] = individualHaplotypes[k] * 2 + haplotypes[k][i+j];
+            uniqueHaplotypes.IncrementCount(individualHaplotypes[k]);
+            }
+
+         uniqueCounts[i][j] = uniqueHaplotypes.Entries();
+         }
+      }
+
+   // Finally, we use dynamic programming to find the best cost path
+   delete [] cost;
+   delete [] path;
+
+   cost = new double [markers];
+   path = new int [markers];
+
+   cost[0] = BasicCost(count);
+   path[0] = 0;
+
+   for (int i = 1; i < markers; i++)
+      {
+      // Default: take a plain step from the previous marker
+      cost[i] = cost[i - 1] + BasicCost(count);
+      path[i] = i;
+
+      // Alternatives: condense the j markers that end at marker i
+      for (int j = 1; j < bits && j <= i; j++)
+         {
+         double alternate_cost = cost[i - j]
+                               + ReducedCost(uniqueCounts[i-j][j], count) * j
+                               + TranslationCost(count);
+
+         if (alternate_cost < cost[i])
+            {
+            cost[i] = alternate_cost;
+            path[i] = i - j;
+            }
+         }
+      }
+
+#ifdef _DEBUG
+   // Estimate savings
+   double naiveCost = markers * BasicCost(count);
+
+   printf("Reduced state space optimization would result in a speedup of about %.1f-fold\n",
+          naiveCost / cost[markers-1]);
+
+   int position = markers - 1;
+   printf(" Optimal Path: ");
+   while (position != 0)
+      if (path[position] == position)
+         printf(" Step(%d) ", position--);
+      else
+         {
+         printf(" Condense(%d -> %d)", position, path[position]);
+         position = path[position];
+         }
+   printf("\n");
+#endif
+
+   // We are all done, free memory
+   FreeIntMatrix(uniqueCounts, markers);
+   delete [] individualHaplotypes;
+   }
+
diff --git a/mach1/CostCalculator.h b/mach1/CostCalculator.h
new file mode 100644
index 0000000..5fec7f6
--- /dev/null
+++ b/mach1/CostCalculator.h
@@ -0,0 +1,45 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/CostCalculator.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __COSTCALCULATOR_H__
+#define __COSTCALCULATOR_H__
+
+// Holds the result of the state space optimization performed by
+// OptimizeCost(): one cost and one path entry per marker.
+class CostCalculator
+   {
+   public:
+      CostCalculator();
+      ~CostCalculator();
+
+      // Rebuilds cost[] and path[] for the given set of haplotypes
+      void OptimizeCost(char ** haplotypes, int count, int markers);
+
+      // Tables allocated by OptimizeCost(); NULL until it has been called
+      double * cost;
+      int * path;
+
+   protected:
+      // The products below are formed in double precision: multiplying two
+      // ints first overflows once count exceeds about 46,000 haplotypes
+
+      // Cost of a plain step between two markers
+      double BasicCost(int count)
+         { return (double) count * count; }
+
+      // Cost per marker when only unique distinct haplotypes are tracked
+      double ReducedCost(int unique, int count)
+         { return 4.0 * unique * unique + (double) unique * count; }
+
+      // Cost charged once per condensed run, in addition to ReducedCost()
+      double TranslationCost(int count)
+         { return 2.0 * count * count; }
+   };
+
+#endif
+
+
diff --git a/mach1/DosageCalculator.cpp b/mach1/DosageCalculator.cpp
new file mode 100644
index 0000000..c0e6ef8
--- /dev/null
+++ b/mach1/DosageCalculator.cpp
@@ -0,0 +1,347 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/DosageCalculator.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "DosageCalculator.h"
+#include "MemoryAllocators.h"
+#include "Pedigree.h"
+#include "MemoryInfo.h"
+
+#include <math.h>
+#include <limits.h>
+
+#ifndef UCHAR_MAX
+#define UCHAR_MAX 255
+#endif
+
+#ifndef USHRT_MAX
+#define USHRT_MAX 65535
+#endif
+
+#ifndef UINT_MAX
+#define UINT_MAX 4294967295U
+#endif
+
+// Both options are off by default; the constructor switches storeDosage on
+// whenever storeDistribution is set
+bool DosageCalculator::storeDosage = false;
+bool DosageCalculator::storeDistribution = false;
+
+// Prepares the counters used to accumulate sampled haplotypes.
+//
+//   N  number of rounds expected to be stored (selects the counter width)
+//   G  number of rows in each matrix (individuals)
+//   M  number of columns in each matrix (markers)
+//
+// Nothing is allocated unless storeDosage or storeDistribution is set.
+// readyForUse reports whether every matrix that was requested could be
+// allocated.
+DosageCalculator::DosageCalculator(int N, int G, int M)
+   {
+   // Tracking the genotype distribution requires the dosages as well
+   storeDosage |= storeDistribution;
+
+   stored = 0;
+   samples = N;
+   genotypes = G;
+   markers = M;
+
+   // Every matrix pointer must start out as NULL: the destructor and the
+   // readyForUse checks below inspect them. The original statements assigned
+   // sDosage and iDosage twice and never initialized sTwo and iTwo.
+   cTwo = cDosage = NULL;
+   sTwo = sDosage = NULL;
+   iTwo = iDosage = NULL;
+
+   if (!storeDosage)
+      {
+      readyForUse = true;
+      wordSize = 0;
+      return;
+      }
+
+   // Each round adds at most 2 to a counter, so pick the narrowest type
+   // that can hold that total for N rounds
+   wordSize = N <= (UCHAR_MAX / 2) ? 1 : N < (USHRT_MAX / 2) ? 2 : 4;
+
+   switch (wordSize)
+      {
+      case 1 : cDosage = AllocateMatrix<unsigned char>(G, M, 0); break;
+      case 2 : sDosage = AllocateMatrix<unsigned short>(G, M, 0); break;
+      case 4 : iDosage = AllocateMatrix<unsigned int>(G, M, 0); break;
+      }
+
+   readyForUse = cDosage != NULL || sDosage != NULL || iDosage != NULL;
+
+   if (!storeDistribution || !readyForUse) return;
+
+   // Second set of counters: number of rounds with two copies of the allele
+   switch (wordSize)
+      {
+      case 1 : cTwo = AllocateMatrix<unsigned char>(G, M, 0); break;
+      case 2 : sTwo = AllocateMatrix<unsigned short>(G, M, 0); break;
+      case 4 : iTwo = AllocateMatrix<unsigned int>(G, M, 0); break;
+      }
+
+   readyForUse = cTwo != NULL || sTwo != NULL || iTwo != NULL;
+   }
+
+// Frees the matrices that belong to the counter width in use.
+DosageCalculator::~DosageCalculator()
+   {
+   if (wordSize == 1)
+      {
+      if (cDosage != NULL) FreeMatrix(cDosage, genotypes);
+      if (cTwo != NULL) FreeMatrix(cTwo, genotypes);
+      }
+   else if (wordSize == 2)
+      {
+      if (sDosage != NULL) FreeMatrix(sDosage, genotypes);
+      if (sTwo != NULL) FreeMatrix(sTwo, genotypes);
+      }
+   else if (wordSize == 4)
+      {
+      if (iDosage != NULL) FreeMatrix(iDosage, genotypes);
+      if (iTwo != NULL) FreeMatrix(iTwo, genotypes);
+      }
+   }
+
+// Returns (2 * stored - accumulated sum) / stored for one individual and
+// marker, or 0.0 when no rounds have been stored.
+double DosageCalculator::GetDosage(int individual, int genotype)
+   {
+   if (stored == 0) return 0.0;
+
+   double rounds = (double) stored;
+
+   if (wordSize == 1)
+      return (2 * stored - cDosage[individual][genotype]) / rounds;
+
+   if (wordSize == 2)
+      return (2 * stored - sDosage[individual][genotype]) / rounds;
+
+   if (wordSize == 4)
+      return (2 * stored - iDosage[individual][genotype]) / rounds;
+
+   return 0.0;
+   }
+
+// Splits the stored rounds for one individual and marker into the number of
+// rounds in which the two haplotype values summed to 0 (n0), 1 (n1) and
+// 2 (n2). All three are zero when nothing has been stored. Reads the *Two
+// matrices, which the constructor only allocates when storeDistribution is
+// set.
+void DosageCalculator::GetCounts(int individual, int genotype,
+                                 unsigned int & n0, unsigned int & n1, unsigned int & n2)
+   {
+   n2 = 0;
+   n1 = 0;
+   n0 = 0;
+
+   if (stored == 0)
+      return;
+
+   // The accumulated sum equals n1 + 2 * n2, which yields n1
+   if (wordSize == 1)
+      {
+      n2 = cTwo[individual][genotype];
+      n1 = cDosage[individual][genotype] - 2 * n2;
+      }
+   else if (wordSize == 2)
+      {
+      n2 = sTwo[individual][genotype];
+      n1 = sDosage[individual][genotype] - 2 * n2;
+      }
+   else if (wordSize == 4)
+      {
+      n2 = iTwo[individual][genotype];
+      n1 = iDosage[individual][genotype] - 2 * n2;
+      }
+
+   // Whatever remains are the rounds with a sum of zero
+   n0 = stored - n2 - n1;
+   }
+
+// Returns the fraction of stored rounds that agree with the most frequently
+// sampled genotype, or 0.0 when nothing has been stored.
+double DosageCalculator::GetQuality(int individual, int genotype)
+   {
+   if (stored == 0) return 0.0;
+
+   unsigned int n0, n1, n2;
+
+   GetCounts(individual, genotype, n0, n1, n2);
+
+   // Largest of the three counts
+   unsigned int top = n2;
+
+   if (n1 >= n2)
+      top = n1;
+
+   if (n0 >= n2 && n0 >= n1)
+      top = n0;
+
+   return top / (double) stored;
+   }
+
+// Returns which of the counts n0, n1, n2 is largest (as 0, 1 or 2); ties go
+// to the lower index. Returns 1 when nothing has been stored.
+int DosageCalculator::GetBestGenotype(int individual, int genotype)
+   {
+   if (stored == 0) return 1;
+
+   unsigned int n0, n1, n2;
+
+   GetCounts(individual, genotype, n0, n1, n2);
+
+   int best = 2;
+
+   if (n1 >= n2)
+      best = 1;
+
+   if (n0 >= n2 && n0 >= n1)
+      best = 0;
+
+   return best;
+   }
+
+// Prints the amount of memory a DosageCalculator built with these
+// dimensions would allocate. Prints nothing when neither dosages nor
+// distributions are stored.
+void DosageCalculator::EstimateMemoryInfo(int Samples, int Genotypes, int Markers)
+   {
+   if (!storeDosage && !storeDistribution)
+      return;
+
+   // Must mirror the counter width chosen by the constructor; the first
+   // test used < there but <= here, so the estimate was off for
+   // Samples == UCHAR_MAX / 2
+   int bytesPerItem = Samples <= (UCHAR_MAX / 2) ? sizeof(unsigned char) :
+                      Samples < (USHRT_MAX / 2) ? sizeof(unsigned short) :
+                                                   sizeof(unsigned int);
+
+   double bytes = bytesPerItem * (double) Genotypes * Markers;
+
+   // The distribution needs a second matrix of the same size
+   if (storeDistribution) bytes *= 2;
+
+   printf(" %40s %s\n", "Dosage Calculator ...", (const char *) MemoryInfo(bytes));
+   }
+
+// Prints the memory estimate for the dimensions this object was built with
+void DosageCalculator::ShowMemoryInfo()
+ {
+ EstimateMemoryInfo(samples, genotypes, markers);
+ }
+
+// Folds one more sampled set of haplotypes into the counters. Rows 2g and
+// 2g + 1 of newHaplotypes are the two haplotypes of individual g. Does
+// nothing when dosages are not being stored.
+void DosageCalculator::Update(char ** newHaplotypes)
+   {
+   if (!storeDosage)
+      return;
+
+   // Accumulate the sum of both haplotypes
+   switch (wordSize)
+      {
+      case 1 :
+         for (int g = 0; g < genotypes; g++)
+            for (int m = 0; m < markers; m++)
+               cDosage[g][m] += newHaplotypes[2 * g][m] + newHaplotypes[2 * g + 1][m];
+         break;
+      case 2 :
+         for (int g = 0; g < genotypes; g++)
+            for (int m = 0; m < markers; m++)
+               sDosage[g][m] += newHaplotypes[2 * g][m] + newHaplotypes[2 * g + 1][m];
+         break;
+      case 4 :
+         for (int g = 0; g < genotypes; g++)
+            for (int m = 0; m < markers; m++)
+               iDosage[g][m] += newHaplotypes[2 * g][m] + newHaplotypes[2 * g + 1][m];
+         break;
+      }
+
+   // Count the rounds in which that sum was exactly two
+   if (storeDistribution)
+      switch (wordSize)
+         {
+         case 1 :
+            for (int g = 0; g < genotypes; g++)
+               for (int m = 0; m < markers; m++)
+                  cTwo[g][m] += (newHaplotypes[2 * g][m] + newHaplotypes[2 * g + 1][m]) == 2;
+            break;
+         case 2 :
+            for (int g = 0; g < genotypes; g++)
+               for (int m = 0; m < markers; m++)
+                  sTwo[g][m] += (newHaplotypes[2 * g][m] + newHaplotypes[2 * g + 1][m]) == 2;
+            break;
+         case 4 :
+            for (int g = 0; g < genotypes; g++)
+               for (int m = 0; m < markers; m++)
+                  iTwo[g][m] += (newHaplotypes[2 * g][m] + newHaplotypes[2 * g + 1][m]) == 2;
+            break;
+         }
+
+   stored++;
+   }
+
+// Helper macros, defined only if not already available. Both expand their
+// arguments more than once, so arguments must be free of side effects.
+#ifndef square
+#define square(x) ((x)*(x))
+#endif
+
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+// Writes the marker summary to the named file and reports the outcome on
+// standard output.
+void DosageCalculator::OutputMarkerInfo(const char * filename)
+   {
+   FILE * f = fopen(filename, "wt");
+
+   if (f != NULL)
+      {
+      OutputMarkerInfo(f);
+      fclose(f);
+
+      printf("Wrote out file [%s] with marker information\n", filename);
+      return;
+      }
+
+   printf("Failed to open output file [%s]\n", filename);
+   }
+
+// Writes one tab separated line per marker with allele labels, frequency,
+// minor allele frequency, the average share of rounds agreeing with the most
+// sampled genotype, and the ratio of the two variance estimates computed
+// below. Falls back to OutputBasicMarkerInfo() when the genotype
+// distribution is not stored, and writes nothing when no rounds are stored.
+void DosageCalculator::OutputMarkerInfo(FILE * output)
+   {
+   if (stored == 0) return;
+
+   if (!storeDistribution)
+      {
+      OutputBasicMarkerInfo(output);
+      return;
+      }
+
+   fprintf(output, "SNP\tAl1\tAl2\tFreq1\tMAF\tQuality\tRsq\n");
+
+   // The tiny constant keeps each division defined for empty input
+   double perCell = 1.0 / (samples * genotypes + 1e-30);
+   double perPerson = 1.0 / (genotypes + 1e-30);
+   double perPair = 1.0 / (samples * samples + 1e-30);
+
+   for (int m = 0; m < markers; m++)
+      {
+      double p0 = 0.0, p1 = 0.0;
+      double qc = 0.0, sumsq = 0.0;
+
+      for (int person = 0; person < genotypes; person++)
+         {
+         unsigned int n0, n1, n2;
+
+         GetCounts(person, m, n0, n1, n2);
+
+         // Largest of the three counts
+         unsigned int top = n2;
+
+         if (n1 > n2)
+            top = n1;
+
+         if (n0 > n1 && n0 > n2)
+            top = n0;
+
+         p0 += n0;
+         p1 += n1;
+         qc += top;
+         sumsq += square(n0 + n1 * 0.5) * perPair;
+         }
+
+      p0 *= perCell;
+      p1 *= perCell;
+      qc *= perCell;
+      sumsq *= perPerson;
+
+      double freq = p0 + p1 * 0.50;
+      double maf = freq > 0.50 ? 1.0 - freq : freq;
+
+      // Both variances are clamped at zero
+      double var1 = max(p0 + p1 * 0.25 - square(freq), 0);
+      double var2 = max(sumsq - square(freq), 0);
+
+      MarkerInfo * info = Pedigree::GetMarkerInfo(m);
+
+      fprintf(output, "%s\t%s\t%s\t%.4f\t%.4f\t%.4f\t%.4f\n",
+              (const char *) info->name,
+              (const char *) info->GetAlleleLabel(1),
+              info->CountAlleles() > 1 ? (const char *) info->GetAlleleLabel(2) : "-",
+              freq, maf, qc, var2 / (var1 + 1e-30));
+      }
+   }
+
+// Writes one tab separated line per marker using the dosages alone: allele
+// labels, the smaller of freq and 1 - freq, and the ratio of the observed
+// dosage variance to 2 * freq * (1 - freq), capped at 1.0.
+void DosageCalculator::OutputBasicMarkerInfo(FILE * output)
+   {
+   fprintf(output, "SNP\tAl1\tAl2\tFreq\tRsq_hat\n");
+
+   // The tiny constant keeps the division defined when there is no one
+   double perPerson = 1.0 / (genotypes + 1e-30);
+
+   for (int m = 0; m < markers; m++)
+      {
+      double sum = 0.0, sumsq = 0.0;
+
+      for (int person = 0; person < genotypes; person++)
+         {
+         double dose = GetDosage(person, m);
+
+         sum += dose;
+         sumsq += dose * dose;
+         }
+
+      sum *= perPerson;
+      sumsq *= perPerson;
+
+      double freq = sum * 0.50;
+      double maf = freq > 0.50 ? 1.0 - freq : freq;
+
+      double var1 = 2 * freq * (1.0 - freq);
+      double var2 = max(sumsq - sum * sum, 0.0);
+      double rsq = var2 > var1 ? 1.0 : var2 / (var1 + 1e-30);
+
+      MarkerInfo * info = Pedigree::GetMarkerInfo(m);
+
+      fprintf(output, "%s\t%s\t%s\t%.4f\t%.4f\n",
+              (const char *) info->name,
+              (const char *) info->GetAlleleLabel(1),
+              info->CountAlleles() > 1 ? (const char *) info->GetAlleleLabel(2) : "-",
+              maf, rsq);
+      }
+   }
+
+
+
diff --git a/mach1/DosageCalculator.h b/mach1/DosageCalculator.h
new file mode 100644
index 0000000..f15c527
--- /dev/null
+++ b/mach1/DosageCalculator.h
@@ -0,0 +1,71 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/DosageCalculator.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __DOSAGECALCULATOR_H__
+#define __DOSAGECALCULATOR_H__
+
+#include <stdio.h>
+
+// Accumulates, over repeated calls to Update(), per individual and per
+// marker counters from which dosages and genotype counts are derived.
+class DosageCalculator
+ {
+ public:
+ // Set by the constructor: true when storage is disabled or when every
+ // requested matrix could be allocated
+ bool readyForUse;
+
+ // Options shared by all instances; storing the distribution implies
+ // storing the dosage
+ static bool storeDosage;
+ static bool storeDistribution;
+
+ // samples selects the counter width; genotypes and markers are the
+ // dimensions of each matrix
+ DosageCalculator(int samples, int genotypes, int markers);
+ ~DosageCalculator();
+
+ // Adds one set of haplotypes (two consecutive rows per individual)
+ void Update(char ** newHaplotypes);
+
+ // The second argument is used as the marker index, as in the accessors
+ // below; requires the distribution matrices
+ void GetCounts(int individual, int genotype,
+ unsigned int & n0, unsigned int & n1, unsigned int & n2);
+
+ double GetDosage(int individual, int marker);
+ double GetQuality(int individual, int marker);
+ int GetBestGenotype(int individual, int marker);
+
+ void OutputMarkerInfo(const char * filename);
+ void OutputMarkerInfo(FILE * output);
+ void OutputBasicMarkerInfo(FILE * output);
+
+ static void EstimateMemoryInfo(int samples, int genotypes, int markers);
+ void ShowMemoryInfo();
+
+ private:
+ // Bytes per counter (1, 2 or 4); 0 when nothing is stored
+ int wordSize;
+
+ // Running sum of the two haplotype values; only the matrix matching
+ // wordSize is allocated
+ unsigned char ** cDosage;
+ unsigned short ** sDosage;
+ unsigned int ** iDosage;
+
+ // Number of Update() calls in which that sum was exactly 2
+ unsigned char ** cTwo;
+ unsigned short ** sTwo;
+ unsigned int ** iTwo;
+
+ int samples;
+ int genotypes;
+ int markers;
+
+ // Number of Update() calls accumulated so far
+ int stored;
+ };
+
+
+#endif
+
+
diff --git a/mach1/ErrorRate.cpp b/mach1/ErrorRate.cpp
new file mode 100644
index 0000000..d85db3c
--- /dev/null
+++ b/mach1/ErrorRate.cpp
@@ -0,0 +1,65 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/ErrorRate.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "ErrorRate.h"
+
+#include <math.h>
+
+Errors::Errors()
+ {
+ rate = 0.0;
+
+ Reset();
+ }
+
+// Recomputes rate from the current counters and returns it.
+//
+// With no matches or mismatches the rate is set to zero if there are
+// uncertain pairs and left untouched otherwise. With observations and no
+// uncertain pairs the rate is the plain mismatch fraction. Otherwise the
+// rate is refined iteratively until it changes by less than one part in
+// ten thousand or drops to 1e-10 or below.
+float Errors::Update()
+   {
+   int observed = matches + mismatches;
+
+   if (observed <= 0)
+      {
+      if (uncertain_pairs)
+         rate = 0.0;
+
+      return rate;
+      }
+
+   rate = mismatches / (double) observed;
+
+   if (!uncertain_pairs)
+      return rate;
+
+   float previous = 0.0;
+
+   while ((rate > 1e-10) && (fabs(rate - previous) > rate * 1e-4))
+      {
+      // Share of each uncertain pair counted as mismatching at the current rate
+      double ratio = rate * rate / (rate * rate + (1.0 - rate) * (1.0 - rate));
+
+      previous = rate;
+      rate = (mismatches + ratio * uncertain_pairs * 2.0) / (observed + uncertain_pairs * 2.0);
+      }
+
+   return rate;
+   }
+
+// Clears the three counters; rate is left as it is.
+void Errors::Reset()
+   {
+   uncertain_pairs = 0;
+   mismatches = 0;
+   matches = 0;
+   }
+
+// Adds the counters of rhs to this object; rate is not recomputed.
+Errors & Errors::operator += (const Errors & rhs)
+   {
+   uncertain_pairs += rhs.uncertain_pairs;
+   mismatches += rhs.mismatches;
+   matches += rhs.matches;
+
+   return *this;
+   }
+
+
diff --git a/mach1/ErrorRate.h b/mach1/ErrorRate.h
new file mode 100644
index 0000000..1638a31
--- /dev/null
+++ b/mach1/ErrorRate.h
@@ -0,0 +1,40 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/ErrorRate.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __ERRORRATE_H__
+#define __ERRORRATE_H__
+
+// Counters of matching, mismatching and uncertain observations together
+// with the error rate estimated from them.
+class Errors
+ {
+ public:
+ int mismatches;
+ int matches;
+ int uncertain_pairs;
+
+ // Last value computed by Update(); zero after construction
+ float rate;
+
+ Errors();
+
+ // Re-estimates rate from the counters and returns it
+ float Update();
+ // Zeroes the counters, leaving rate unchanged
+ void Reset();
+
+ // Adds the counters of rhs; rate is not recomputed
+ Errors & operator += (const Errors & rhs);
+ };
+
+#endif
+
+
diff --git a/mach1/HaplotypeKey.cpp b/mach1/HaplotypeKey.cpp
new file mode 100644
index 0000000..71c60c8
--- /dev/null
+++ b/mach1/HaplotypeKey.cpp
@@ -0,0 +1,305 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/HaplotypeKey.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "HaplotypeKey.h"
+#include "Error.h"
+
+#include <stdio.h>
+
+// Creates an empty table with SIZE slots.
+HaplotypeHash::HaplotypeHash(int SIZE)
+   {
+   size = SIZE;
+   count = 0;
+
+   extras = new int [size];
+   counts = new int [size];
+   codes = new int [size];
+
+   // counts[] marks which slots are empty, so it must not be left
+   // uninitialized; the other arrays are zeroed to keep the table
+   // deterministic before the first Clear()
+   for (int i = 0; i < size; i++)
+      {
+      extras[i] = 0;
+      counts[i] = 0;
+      codes[i] = 0;
+      }
+   }
+
+// Releases the three slot arrays.
+HaplotypeHash::~HaplotypeHash()
+   {
+   delete [] codes;
+   delete [] counts;
+   delete [] extras;
+   }
+
+// Empties the table by zeroing every slot count; codes and extras keep
+// their old contents.
+void HaplotypeHash::Clear()
+   {
+   count = 0;
+
+   for (int slot = size - 1; slot >= 0; slot--)
+      counts[slot] = 0;
+   }
+
+// Returns the slot whose stored code equals code, probing linearly from
+// code % size. The search only ends on a match, so the code is expected to
+// be present.
+int HaplotypeHash::GetPosition(int code)
+   {
+   int slot = code % size;
+
+   // Step to the next slot, wrapping at the end of the table
+   while (codes[slot] != code)
+      if (++slot == size)
+         slot = 0;
+
+   return slot;
+   }
+
+// Counts one more occurrence of code. Returns true when the code was not in
+// the table and a new entry was created, false when an existing entry was
+// incremented. The probe only ends at an empty or matching slot.
+bool HaplotypeHash::IncrementCount(int code)
+   {
+   int slot = code % size;
+
+   // Walk the occupied slots looking for the code
+   while (counts[slot] != 0)
+      {
+      if (codes[slot] == code)
+         {
+         counts[slot]++;
+         return false;
+         }
+
+      slot = (slot + 1 == size) ? 0 : slot + 1;
+      }
+
+   // First empty slot along the probe sequence
+   InsertAtSlot(slot, code);
+   return true;
+   }
+
+// Removes one occurrence of code, which is expected to be in the table.
+// Returns true when that was the last occurrence and the entry was deleted,
+// false when the entry remains with a lower count.
+//
+// After a deletion the entries that follow in the same run of occupied
+// slots are re-probed, and moved towards their home slot when possible, so
+// that later lookups do not stop early at the slot that was just emptied.
+bool HaplotypeHash::DecrementCount(int code)
+   {
+   int h = code % size;
+
+   // Locate the entry (rehashing as required)
+   while (codes[h] != code)
+      if (++h == size)
+         h = 0;
+
+   counts[h]--;
+
+   if (counts[h] != 0)
+      return false;
+
+   // Entry was deleted ...
+   count--;
+
+   // Rehash subsequent entries, as necessary. The cursor starts at the slot
+   // after h and wraps at the end of the table; the original loop never
+   // advanced it and could start one past the last slot.
+   for (int rehash = (h + 1 == size) ? 0 : h + 1;
+        rehash != h;
+        rehash = (rehash + 1 == size) ? 0 : rehash + 1)
+      {
+      // Done rehashing when an empty slot is reached
+      if (counts[rehash] == 0) break;
+
+      // Probe from the home slot of this code to the first empty slot,
+      // stopping if the entry's current position is reached first
+      int target = codes[rehash] % size;
+
+      while (counts[target] != 0 && target != rehash)
+         if (++target == size)
+            target = 0;
+
+      // Move entry if required; extras must travel with it, because callers
+      // look it up through the slot returned by GetPosition()
+      if (target != rehash)
+         {
+         counts[target] = counts[rehash];
+         codes[target] = codes[rehash];
+         extras[target] = extras[rehash];
+
+         counts[rehash] = 0;
+         }
+      }
+
+   return true;
+   }
+
+// Allocates storage for haplotypeCount codes and for up to
+// 2 * maximumUnique distinct haplotypes.
+HaplotypeKey::HaplotypeKey(int haplotypeCount, int maximumUnique)
+   {
+   count = haplotypeCount;
+   maxUnique = maximumUnique;
+
+   // Start with an empty marker range and no distinct haplotypes, instead
+   // of leaving these members uninitialized until Initialize() is called
+   from = to = 0;
+   unique = 0;
+
+   codes = new int [count];
+   map = new int [count];
+
+   haplotypes = new int [2 * maxUnique];
+   counts = new int [2 * maxUnique];
+   }
+
+// Releases the four arrays allocated by the constructor.
+HaplotypeKey::~HaplotypeKey()
+   {
+   delete [] counts;
+   delete [] haplotypes;
+   delete [] map;
+   delete [] codes;
+   }
+
+// Resets the code of every haplotype to zero.
+void HaplotypeKey::Clear()
+   {
+   int i = 0;
+
+   while (i < count)
+      codes[i++] = 0;
+   }
+
+// Appends the marker at position to to every code, then advances to so that
+// it again refers to the first marker not yet encoded.
+void HaplotypeKey::ExtendCodes(char ** haplotypes)
+   {
+   int marker = to++;
+
+   for (int row = 0; row < count; row++)
+      codes[row] = codes[row] * 2 + haplotypes[row][marker];
+   }
+
+// Drops the most recently appended marker from every code and steps to
+// back by one.
+void HaplotypeKey::TrimCodes()
+   {
+   to--;
+
+   for (int row = 0; row < count; row++)
+      codes[row] /= 2;
+   }
+
+// Rebuilds the hash table from scratch using the current codes.
+void HaplotypeKey::HashCodes()
+   {
+   hash.Clear();
+
+   int row = 0;
+
+   while (row < count)
+      hash.IncrementCount(codes[row++]);
+   }
+
+// Numbers the distinct codes held in the hash table in slot order, records
+// each one with its count, and maps every haplotype to the number of its
+// code.
+void HaplotypeKey::BuildMap()
+   {
+   unique = 0;
+
+   for (int slot = 0; slot < hash.size; slot++)
+      {
+      if (hash.SlotIsEmpty(slot))
+         continue;
+
+      // extras remembers the number given to the code in this slot
+      hash.extras[slot] = unique;
+      haplotypes[unique] = hash.codes[slot];
+      counts[unique] = hash.counts[slot];
+
+      unique++;
+      }
+
+   for (int row = 0; row < count; row++)
+      {
+      int slot = hash.GetPosition(codes[row]);
+
+      map[row] = hash.extras[slot];
+      }
+   }
+
+// Packs the entries of haplotype at markers from .. to - 1 into a single
+// code, with the first marker ending up in the most significant position.
+int HaplotypeKey::EncodeHaplotype(char * haplotype)
+   {
+   int code = 0;
+   int marker = from;
+
+   while (marker < to)
+      {
+      code = code * 2 + haplotype[marker];
+      marker++;
+      }
+
+   return code;
+   }
+
+// Replaces the haplotype stored at slot and keeps the hash table, map[],
+// haplotypes[] and counts[] consistent with the layout produced by
+// BuildMap(): haplotypes[map[i]] is the code of haplotype i and
+// counts[map[i]] is the number of haplotypes sharing that code.
+//
+// Calls error() when the number of distinct haplotypes reaches the capacity
+// of haplotypes[] and counts[] (2 * maxUnique).
+void HaplotypeKey::ReplaceHaplotype(int slot, char * haplotype)
+   {
+   int new_code = EncodeHaplotype(haplotype);
+
+   if (new_code == codes[slot])
+      return;
+
+   int old_index = map[slot];
+
+   if (hash.DecrementCount(codes[slot]))
+      {
+      // The old code is gone: close the gap it leaves, so that numbering
+      // stays dense. haplotypes[] and counts[] have to shift together with
+      // the indices held in map[] and hash.extras[].
+      unique--;
+
+      for (int i = old_index; i < unique; i++)
+         {
+         haplotypes[i] = haplotypes[i + 1];
+         counts[i] = counts[i + 1];
+         }
+
+      for (int i = 0; i < hash.size; i++)
+         if (hash.extras[i] > old_index)
+            hash.extras[i]--;
+
+      for (int i = 0; i < count; i++)
+         if (map[i] > old_index)
+            map[i]--;
+      }
+   else
+      // Other haplotypes still carry the old code; only its tally changes
+      counts[old_index]--;
+
+   codes[slot] = new_code;
+
+   bool is_new = hash.IncrementCount(new_code);
+   int position = hash.GetPosition(new_code);
+
+   if (is_new)
+      {
+      // First haplotype with this code: append it to the list. Its count is
+      // one (the original stored the code itself in counts[]).
+      map[slot] = hash.extras[position] = unique;
+      haplotypes[unique] = new_code;
+      counts[unique] = 1;
+
+      unique++;
+
+      // to is one past the last encoded marker, so as a 1-based label the
+      // last marker is simply to
+      if (unique >= maxUnique * 2)
+         error("Haplotype hashing failed -- too many distinct haplotypes between "
+               "markers %d and %d\n", from + 1, to);
+      }
+   else
+      {
+      map[slot] = hash.extras[position];
+      counts[map[slot]]++;
+      }
+   }
+
+// Encodes as many markers as possible, starting at start and never reaching
+// stop, while keeping the number of distinct haplotypes at or below
+// maxUnique. On return from is start, to is one past the last encoded
+// marker, and the map built by BuildMap() describes that range.
+void HaplotypeKey::Initialize(char ** haplotypes, int start, int stop)
+   {
+   from = to = start;
+
+   int max = 1;
+
+   // The first few markers are guaranteed to keep us under the complexity
+   // limit
+   Clear();
+   while (max < maxUnique && to != stop)
+      {
+      ExtendCodes(haplotypes);
+      max *= 2;
+      }
+
+   // After enough markers have been added, we must monitor further
+   // extensions. A code holds one bit per marker, so at most 31 markers fit
+   // a non-negative int; the original test (from - to < 31) was always true
+   // and never enforced that limit.
+   HashCodes();
+   while (hash.count <= maxUnique && to - from < 31 && to != stop)
+      {
+      ExtendCodes(haplotypes);
+      HashCodes();
+      }
+
+   // The last extension went over the limit: undo it
+   if (hash.count > maxUnique)
+      {
+      TrimCodes();
+      HashCodes();
+      }
+
+   BuildMap();
+   }
+
+// Splits the region into consecutive blocks of markers, each holding at most
+// threshold distinct haplotypes, and prints one line per block followed by
+// the number of blocks.
+void ParseHaplotypes(char ** haplotypes, int count, int markers, int threshold)
+   {
+   HaplotypeKey key(count, threshold);
+
+   int blocks = 0;
+   int next = 0;
+
+   // Initialize() leaves key.to one past the last marker of the block, so
+   // the next block starts exactly there. Starting at key.to + 1, as the
+   // original did, skipped one marker between blocks and could leave the
+   // last marker out altogether.
+   while (next < markers)
+      {
+      key.Initialize(haplotypes, next, markers);
+
+      // Not even one marker fits under the threshold: stop rather than
+      // retry the same position forever
+      if (key.to <= key.from)
+         break;
+
+      printf("Block from %5d - %5d : %3d haplotypes\n", key.from, key.to - 1, key.unique);
+
+      next = key.to;
+      blocks++;
+      }
+
+   printf("Region can be parsed into %d blocks\n\n", blocks);
+   }
+
diff --git a/mach1/HaplotypeKey.h b/mach1/HaplotypeKey.h
new file mode 100644
index 0000000..06c10b2
--- /dev/null
+++ b/mach1/HaplotypeKey.h
@@ -0,0 +1,98 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/HaplotypeKey.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __HAPLOTYPE_KEY__
+#define __HAPLOTYPE_KEY__
+
+// Open addressing hash table (linear probing) that counts how many times
+// each integer code has been inserted.
+class HaplotypeHash
+ {
+ public:
+ // Per slot: the stored code, its number of occurrences (0 marks an
+ // empty slot) and one extra integer for use by the owner of the table
+ int * codes;
+ int * counts;
+ int * extras;
+
+ // Number of slots, and number of slots currently in use
+ int size;
+ int count;
+
+ HaplotypeHash(int size = 257);
+ ~HaplotypeHash();
+
+ // Returns true when a new entry was created
+ bool IncrementCount(int code);
+ // Returns true when the entry was removed
+ bool DecrementCount(int code);
+ // Slot holding code; the code must be present
+ int GetPosition(int code);
+
+ bool SlotIsEmpty(int slot)
+ { return counts[slot] == 0; }
+
+ void Clear();
+
+ private:
+ // Claims a slot for a new code; the count is incremented rather than
+ // set, so the slot is expected to be empty
+ void InsertAtSlot(int slot, int code)
+ {
+ codes[slot] = code;
+ counts[slot]++;
+ count++;
+ }
+ };
+
+// Encodes a run of markers for a set of haplotypes as integer codes and
+// keeps track of the distinct codes that occur.
+class HaplotypeKey
+ {
+ public:
+ // Per haplotype: its code and the number assigned to that code
+ int * codes;
+ int * map;
+ int count;
+ // Markers from .. to - 1 are encoded; to is one past the last marker
+ int from, to;
+
+ // Number of distinct codes, with the code and tally of each
+ int unique;
+ int * haplotypes;
+ int * counts;
+
+ HaplotypeKey(int haplotypeCount, int maxUnique);
+ ~HaplotypeKey();
+
+ void Initialize(char ** haplotypes, int start, int stop);
+
+ void Clear();
+
+ // Extend the current encoding by adding one marker
+ void ExtendCodes(char ** haplotypes);
+
+ // Or trim it by removing the last marker
+ void TrimCodes();
+
+ // List all unique haplotypes and map each haplotype to an id
+ void BuildMap();
+ void HashCodes();
+
+ // Transform haplotype into a numeric code
+ int EncodeHaplotype(char * haplotype);
+
+ // Replace haplotype at a specific slot
+ void ReplaceHaplotype(int slot, char * haplotype);
+
+ private:
+ // NOTE(review): default constructed, i.e. 257 slots whatever maxUnique
+ // is, and HaplotypeHash::IncrementCount never checks for a full table --
+ // confirm that callers keep the number of distinct codes below that
+ HaplotypeHash hash;
+
+ int maxUnique;
+ };
+
+void ParseHaplotypes(char ** haplotypes, int count, int markers, int threshold);
+
+#endif
+
+
+
diff --git a/mach1/HaplotypeLoader.cpp b/mach1/HaplotypeLoader.cpp
new file mode 100644
index 0000000..168c631
--- /dev/null
+++ b/mach1/HaplotypeLoader.cpp
@@ -0,0 +1,882 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/HaplotypeLoader.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "HaplotypeLoader.h"
+#include "MemoryAllocators.h"
+#include "MemoryInfo.h"
+#include "Error.h"
+
+#include <ctype.h>
+
+#ifndef ZEPS
+#define ZEPS 1e-30
+#endif
+
+bool HaplotypeLoader::hapmapFormat = false; // when set, marker lists and haplotypes go through the HapMap-style readers
+bool HaplotypeLoader::vcfReference = false; // when set, LoadHaplotypes(IFILE) hands the file to LoadVcf
+bool HaplotypeLoader::autoFlip = false; // lets ConsistencyCheck try RenameAlleles / FixStrand on mismatched markers
+bool HaplotypeLoader::loadPositions = false; // when set, LoadHapMapLegendFile stores each marker's position
+double HaplotypeLoader::startposition = 0.0; // LoadVcf skips records whose position is below this value
+double HaplotypeLoader::endposition = 300000000.0; //300Mb -- LoadVcf skips records whose position is above this value
+bool HaplotypeLoader::forceImputation = false; // downgrades the >90% lost-markers abort in ConsistencyCheck to a warning
+
+// A freshly built loader holds no haplotype matrix, no markers and no
+// haplotypes; the Load* methods fill these in.
+HaplotypeLoader::HaplotypeLoader()
+   : markerCount(0), count(0), haplotypes(NULL)
+   {
+   }
+
+// Releases the haplotype matrix, including the extra summary row that
+// the loaders allocate after the last haplotype.
+HaplotypeLoader::~HaplotypeLoader()
+   {
+   if (haplotypes == NULL)
+      return;
+
+   FreeCharMatrix(haplotypes, count + 1);
+   }
+
+// Opens the named marker list and feeds it to the IFILE overload.
+// A file that cannot be opened is silently ignored.
+void HaplotypeLoader::LoadMarkerList(const char * filename)
+   {
+   IFILE input = ifopen(filename, "rb");
+
+   if (input != NULL)
+      {
+      LoadMarkerList(input);
+      ifclose(input);
+      }
+   }
+
+// Opens the named haplotype file and feeds it to the IFILE overload.
+// Failing to open the file is only reported as an error when markers
+// have already been registered in the pedigree.
+void HaplotypeLoader::LoadHaplotypes(const char * filename)
+   {
+   IFILE input = ifopen(filename, "rb");
+
+   if (input != NULL)
+      {
+      LoadHaplotypes(input);
+      ifclose(input);
+      return;
+      }
+
+   if (Pedigree::markerCount)
+      error("File [%s] with phased haplotypes could not be opened\n", filename);
+   }
+
+// Reads a marker list with one marker name per non-blank line and
+// registers each name with the pedigree. HapMap-style legends are
+// handed to LoadHapMapLegendFile instead.
+void HaplotypeLoader::LoadMarkerList(IFILE file)
+   {
+   if (hapmapFormat)
+      {
+      LoadHapMapLegendFile(file);
+      return;
+      }
+
+   String line;
+   StringArray fields;
+
+   printf("Loading list of markers in phased haplotype ...\n");
+
+   while (!ifeof(file))
+      {
+      line.ReadLine(file);
+      fields.ReplaceTokens(line);
+
+      int fieldCount = fields.Length();
+
+      // Blank lines carry no marker
+      if (fieldCount == 0) continue;
+
+      if (fieldCount > 1)
+         {
+         // Four columns is what a HapMap legend looks like, so offer a hint
+         const char * legendHint = "";
+
+         if (fieldCount == 4)
+            legendHint = "\n"
+                         "If you are using a HapMap-style legend file, remember to\n"
+                         "use the --hapmapFormat command line option.\n\n";
+
+         error("Each line should list exactly one marker name, but the\n"
+               "following line appears to include extra information:\n\n"
+               "%s\n%s",
+               (const char *) line, legendHint);
+         }
+
+      // A marker seen for the first time gets the next free id; any
+      // other id means the name was registered before
+      int markerId = Pedigree::GetMarkerID(fields[0]);
+      int expectedId = markerCount++;
+
+      if (expectedId != markerId)
+         error("Marker %s is duplicated.\n\n"
+               "Every marker should have a unique name.\n",
+               (const char *) fields[0]);
+      }
+   }
+
+// Reads a HapMap-style legend: the first line is skipped and every other
+// non-blank line must have four columns (name, position, two alleles).
+// Positions are only stored when loadPositions is set.
+void HaplotypeLoader::LoadHapMapLegendFile(IFILE file)
+   {
+   String line;
+   StringArray fields;
+
+   printf("Loading HapMap-style legend file ...\n");
+
+   // The first line is read and discarded
+   line.ReadLine(file);
+
+   while (!ifeof(file))
+      {
+      line.ReadLine(file);
+      fields.ReplaceTokens(line);
+
+      int fieldCount = fields.Length();
+
+      if (fieldCount == 0) continue;
+
+      if (fieldCount != 4)
+         error("Each line should list the marker name, position and alleles,\n"
+               "but the following line includes %d items (instead of 4):\n\n"
+               "%s\n", fields.Length(), (const char *) line);
+
+      int markerId = Pedigree::GetMarkerID(fields[0]);
+      int expectedId = markerCount++;
+
+      if (expectedId != markerId)
+         error("Marker %s is duplicated.\n\n"
+               "Every marker should have a unique name.\n",
+               (const char *) fields[0]);
+
+      // Register the two alleles in legend order
+      MarkerInfo * info = Pedigree::GetMarkerInfo(markerId);
+
+      info->NewAllele(fields[2]);
+      info->NewAllele(fields[3]);
+
+      if (loadPositions)
+         info->position = fields[1].AsDouble();
+      }
+   }
+
+// Loads phased haplotypes from an open file.
+//
+// VCF and HapMap inputs are handed to their dedicated readers. Otherwise
+// each non-blank line holds one haplotype: the allele characters are taken
+// from the trailing token(s) of the line, whose combined length must equal
+// markerCount; any leading tokens are ignored. The file is read twice --
+// once to count haplotypes, once to fill the matrix. Row [count] of the
+// matrix receives the number of alleles known for each marker.
+//
+// Reports through error() when a line has the wrong number of alleles,
+// when an allele maps to 0 (missing) or when a third allele shows up.
+void HaplotypeLoader::LoadHaplotypes(IFILE file)
+   {
+   if (vcfReference)
+      {
+      LoadVcf(file);
+      return;
+      }
+
+   if (hapmapFormat)
+      {
+      LoadHapMapHaplotypes(file);
+      return;
+      }
+
+   // Don't load haplotypes unless we have a marker list
+   if (markerCount == 0)
+      {
+      printf(" WARNING -- Since no marker list was provided, haplotype file will be ignored\n\n");
+      return;
+      }
+
+   printf("Loading phased haplotypes ...\n");
+
+   String buffer;
+   StringArray tokens;
+
+   // In the first pass, we simply count the number of non-blank lines
+   while (!ifeof(file))
+      {
+      buffer.ReadLine(file);
+      tokens.ReplaceTokens(buffer);
+
+      if (!tokens.Length()) continue;
+
+      count++;
+      }
+
+   // Check if we got some valid input
+   if (count == 0 || markerCount == 0)
+      return;
+
+   // Then, we allocate memory for storing the phased haplotypes
+   haplotypes = AllocateCharMatrix(count + 1, Pedigree::markerCount);
+
+   // And finally, we load the data in a second pass
+   ifrewind(file);
+
+   int line = 0, index = 0;
+   while (!ifeof(file))
+      {
+      line++;
+      buffer.ReadLine(file);
+      tokens.ReplaceTokens(buffer);
+
+      if (tokens.Length() == 0) continue;
+
+      // Walk backwards from the last token until the tokens seen so far
+      // add up to markerCount characters; offset ends at 0 on an exact fit
+      int hapstart = tokens.Length() - 1;
+      int offset = markerCount;
+
+      while ((offset -= tokens[hapstart].Length()) > 0 && hapstart > 0)
+         hapstart--;
+
+      // The message expects the haplotype number first and the line number
+      // second; both are reported 1-based without modifying the counters
+      if (offset != 0)
+         error("The haplotype file format was not recognized\n"
+               "(Problem occured reading haplotype #%d in line #%d)\n\n"
+               "Check that the number of markers matches the SNPs list\n",
+               index + 1, line);
+
+      for (int i = 0; i < markerCount; i++)
+         {
+         MarkerInfo * info = Pedigree::GetMarkerInfo(i);
+
+         // Move on to the next token once the current one is used up
+         if (offset == tokens[hapstart].Length())
+            {
+            offset = 0;
+            hapstart++;
+            }
+
+         int al = info->GetAlleleNumber(tokens[hapstart][offset++]);
+
+         if (al == -1)
+            al = info->NewAllele(tokens[hapstart][offset-1]);
+
+         // The marker name must be looked up with the unmodified index, so
+         // the 1-based values are computed instead of incrementing i in the
+         // middle of the argument list
+         if (al == 0)
+            error("Missing data in haplotype %d at position %d (marker %s)\n",
+                  index + 1, i + 1, (const char *) Pedigree::markerNames[i]);
+
+         if (al > 2)
+            error("More than 2 alleles at position %d (marker %s)\n",
+                  i + 1, (const char *) Pedigree::markerNames[i]);
+
+         haplotypes[index][i] = al;
+         }
+      index++;
+      }
+
+   for (int i = 0; i < markerCount; i++)
+      {
+      MarkerInfo * info = Pedigree::GetMarkerInfo(i);
+
+      // The last row in the haplotypes file stores the number of distinct
+      // alleles observed in the input files
+      haplotypes[count][i] = info->CountAlleles();
+      }
+   }
+
+// Loads reference markers and haplotypes from a VCF-format file.
+//
+// Pass 1 reads the #CHROM header to learn the number of individuals
+// (two haplotypes each) and registers one marker per record whose position
+// lies in [startposition, endposition]. Markers are named "$chrom:$pos";
+// records whose REF or ALT is longer than one character get an extra
+// ":" suffix built from the first (at most) five characters of "REF_ALT",
+// because several such records can share a position.
+//
+// Pass 2 stores REF/ALT as the allele labels, records the position and
+// fills the haplotype matrix: '0' is stored as 1 and '1' as 2. Genotypes
+// are treated as phased whatever delimiter they use. Row [count] of the
+// matrix is set to 2 for every marker.
+//
+// Nothing is loaded (a warning is printed) when no marker or no individual
+// is found; a header with fewer columns than VCF_HEADING_FIELDS is treated
+// the same way as a header without individuals.
+void HaplotypeLoader::LoadVcf(IFILE file)
+   {
+   String buffer;
+   StringArray tokens;
+
+   // in the first pass, load markers, and count missingness or unphased
+   printf("Loading markers from VCF-format input file\n");
+   int vcfFields = 0;
+   int numDiploid = 0;
+   int linenum = 0;
+   while (!ifeof(file))
+      {
+      linenum++;
+      buffer.ReadLine(file);
+      tokens.ReplaceTokens(buffer);
+
+      if (tokens.Length() == 0) continue;
+
+      // skip comment lines
+      if (tokens[0].SubStr(0,2) == "##") continue;
+
+      // count number of individuals from the header line
+      if (tokens[0].SubStr(0,6) == "#CHROM")
+         {
+         vcfFields = tokens.Length();
+         numDiploid = vcfFields - VCF_HEADING_FIELDS;
+         count = numDiploid * 2;
+         continue;
+         }
+
+      if (tokens.Length() != vcfFields)
+         error("Each line in vcf input file should contain exact %d fields (%d header fields + %d individuals)\n"
+               "while line %d does not:\n\n", vcfFields, VCF_HEADING_FIELDS, numDiploid, linenum);
+
+      // Records outside the requested window are ignored in both passes
+      double position = tokens[1].AsDouble();
+      if (position < startposition || position > endposition) continue;
+
+      String MarkerPosition = tokens[0]+":"+tokens[1];
+      if (tokens[3].Length() > 1 || tokens[4].Length() > 1)
+         {
+         String extraIndelAllele = tokens[3]+"_"+tokens[4];
+         int extraLength = extraIndelAllele.Length();
+         if (extraLength > 5) extraLength = 5;
+         MarkerPosition = MarkerPosition+":"+extraIndelAllele.SubStr(0,extraLength);
+         }
+
+      int markerId = Pedigree::GetMarkerID(MarkerPosition);
+
+      if (markerCount++ != markerId)
+         error("Marker %s is duplicated. Every marker should have a unique name.\n\n", (const char *) MarkerPosition);
+      }
+
+   // Check if we got some valid input
+   if (markerCount == 0)
+      {
+      printf(" WARNING -- Since we read ZERO marker from vcf input file, the vcf input will be ignored\n\n");
+      return;
+      }
+
+   // A header with fewer than VCF_HEADING_FIELDS columns yields a negative
+   // count; it must not reach the allocation or the matrix writes below
+   if (count <= 0)
+      {
+      printf(" WARNING -- Since we read ZERO individuals from vcf input file, the vcf input will be ignored\n\n");
+      return;
+      }
+
+   // Then, we allocate memory for storing the phased haplotypes
+   haplotypes = AllocateCharMatrix(count + 1, Pedigree::markerCount);
+   if (haplotypes == NULL) error("Failed to allocate memory for reference haplotypes\n\n");
+
+   // And finally, we load the data in a second pass
+   // load haplotypes as well as reset allele labels
+   printf("Loading haplotypes from VCF-format input file (WARNING: genotypes would be treated as phased regardless of delimiter) ...\n");
+   ifrewind(file);
+
+   linenum = 0;
+   int markerIndex = 0;
+   while (!ifeof(file))
+      {
+      linenum++;
+      buffer.ReadLine(file);
+      tokens.ReplaceTokens(buffer);
+
+      if (tokens.Length() == 0) continue;
+
+      // skip comment lines and header line
+      if (tokens[0].SubStr(0,1) == "#") continue;
+
+      // the number of fields was already checked in the first pass
+
+      double position = tokens[1].AsDouble();
+      if (position < startposition || position > endposition) continue;
+
+      // update allele label (instead of using 0 and 1)
+      MarkerInfo * info = Pedigree::GetMarkerInfo(markerIndex);
+      info->NewAllele(tokens[3]);
+      info->NewAllele(tokens[4]);
+      info->position = position;
+
+      for (int i = VCF_HEADING_FIELDS; i < vcfFields; i++)
+         {
+         int haplotypeIndex = (i - VCF_HEADING_FIELDS) * 2;
+
+         // The genotype is the first ':'-separated item of the sample column
+         StringArray ThisPersonInformation;
+         ThisPersonInformation.ReplaceTokens(tokens[i], ":");
+
+         if (ThisPersonInformation[0].Length() != 3)
+            error("Each genotype has to be a string of three characters: allele1, delimiter, and allele2. \n"
+                  "The following genotype in line %d does not: %s\n\n", linenum, (const char *) ThisPersonInformation[0]);
+
+         // Characters 0 and 2 are the two alleles; only '0' and '1' are allowed
+         for (int strand = 0; strand < 2; strand++)
+            {
+            char allele = ThisPersonInformation[0][strand * 2];
+
+            if (allele == '0')
+               haplotypes[haplotypeIndex + strand][markerIndex] = 1;
+            else if (allele == '1')
+               haplotypes[haplotypeIndex + strand][markerIndex] = 2;
+            else
+               error("Alleles in vcf can only be '0' or '1', separated by any one-character delimiter.\n"
+                     "The following Allele(s) in line %d does not: %s\n\n", linenum, (const char *) ThisPersonInformation[0]);
+            }
+         }
+
+      markerIndex++;
+      }
+
+   // add the fake last haplotype
+   for (int i = 0; i < markerCount; i++) haplotypes[count][i] = 2;
+
+   printf(" Loaded %d reference haplotypes at %d markers\n", count, markerCount);
+   }
+
+
+// Reads HapMap-style haplotypes: every line is a run of '0'/'1' alleles
+// separated by single spaces, one allele per marker in the legend. The
+// file is scanned once to count haplotypes and a second time to fill the
+// matrix ('0' is stored as 1, '1' as 2). Row [count] is set to 2.
+void HaplotypeLoader::LoadHapMapHaplotypes(IFILE file)
+   {
+   // Without a legend the columns cannot be interpreted
+   if (markerCount == 0)
+      {
+      printf(" WARNING -- Since no legend file was provided, haplotype file will be ignored\n\n");
+      return;
+      }
+
+   printf("Loading HapMap-style phased haplotypes ...\n");
+
+   String buffer;
+
+   // Counting pass: a well-formed line has exactly this many characters
+   while (!ifeof(file))
+      {
+      buffer.ReadLine(file);
+      buffer.Trim();
+
+      if (buffer.Length() == 2 * markerCount - 1)
+         {
+         count++;
+         continue;
+         }
+
+      // Empty lines are skipped quietly, anything else is a format problem
+      if (buffer.Length())
+         error("According to the legend file, there should be %d alleles per haplotype.\n"
+               "However, some lines have an unexpected character count\n", markerCount);
+      }
+
+   if (count == 0 || markerCount == 0)
+      return;
+
+   // One extra row is reserved for the per-marker allele count
+   haplotypes = AllocateCharMatrix(count + 1, Pedigree::markerCount);
+
+   // Loading pass
+   ifrewind(file);
+
+   int line = 0, index = 0;
+   while (!ifeof(file))
+      {
+      line++;
+      buffer.ReadLine(file);
+      buffer.Trim();
+
+      if (buffer.Length() != 2 * markerCount - 1) continue;
+
+      bool badchar = false;
+      for (int i = 0; i < markerCount; i++)
+         {
+         switch (buffer[i * 2])
+            {
+            case '0':
+               haplotypes[index][i] = 1;
+               break;
+            case '1':
+               haplotypes[index][i] = 2;
+               break;
+            default:
+               badchar = true;
+               break;
+            }
+
+         // The character after an allele must be a space or the end of the line
+         bool badSeparator = !badchar &&
+                             (buffer[i * 2 + 1] != ' ' && buffer[i * 2 + 1] != 0);
+
+         if (badchar || badSeparator)
+            error("Haplotype file should include a series of '0's and '1's,\n"
+                  "separated by spaces. However, an unexpected character was\n"
+                  "encountered in line %d.\n", line);
+         }
+      index++;
+      }
+
+   for (int i = 0; i < markerCount; i++)
+      haplotypes[count][i] = 2;
+   }
+
+// Cross-checks the pedigree against the loaded reference haplotypes.
+//
+//  * Markers that only appear in the pedigree are listed (at most ten
+//    lines of names) and dropped by shrinking Pedigree::markerCount. If
+//    they make up more than 90% of pedMarkerCount the run is aborted,
+//    unless forceImputation is set, in which case a warning is printed.
+//  * For every shared marker, allele labels are compared. With autoFlip
+//    the loader first tries RenameAlleles and FixStrand; genotypes of
+//    markers that still mismatch are discarded. Without autoFlip any
+//    mismatch is reported through error() after all markers were checked.
+//  * Allele frequencies in the pedigree and in the haplotypes are compared
+//    with a chi-square statistic and large differences are printed.
+//
+// ped            pedigree whose genotypes may be recoded or cleared
+// pedMarkerCount number of markers in the sample marker file, used as the
+//                denominator of the lost-marker percentage
+void HaplotypeLoader::ConsistencyCheck(Pedigree & ped, int pedMarkerCount)
+   {
+   if (count == 0 || markerCount == 0)
+      return;
+
+   if (markerCount != Pedigree::markerCount)
+      {
+      printf("The following markers appear in the pedigree, but not in the phased haplotypes:");
+
+      // Names are wrapped at 79 columns; once ten lines have been used the
+      // remaining markers are only counted
+      int skipped_markers = 0;
+      for (int i = markerCount, line = 80, lines = 0; i < Pedigree::markerCount; i++)
+         {
+         if (lines >= 10)
+            {
+            skipped_markers++;
+            continue;
+            }
+
+         if (line + Pedigree::markerNames[i].Length() + 1 > 79)
+            {
+            if (lines == 9)
+               {
+               printf("\n");
+               lines++;
+               skipped_markers++;
+               continue;
+               }
+
+            printf("\n ");
+            line = 3;
+            lines++;
+            }
+
+         printf("%s ", (const char *) Pedigree::markerNames[i]);
+         line += Pedigree::markerNames[i].Length() + 1;
+         }
+
+      // Every pedigree-only marker is lost, whether or not the listing above
+      // was truncated
+      int numLostPedMarkers = Pedigree::markerCount - markerCount;
+
+      if (skipped_markers)
+         printf("These %d markers and %d other%s (%d total) will be ignored\n\n",
+                numLostPedMarkers - skipped_markers,
+                skipped_markers, skipped_markers == 1 ? "" : "s",
+                numLostPedMarkers);
+      else
+         printf("These %d markers will be ignored\n", numLostPedMarkers);
+
+      Pedigree::markerCount = markerCount;
+
+      // Guard the denominator so an empty sample marker file cannot produce
+      // a division by zero
+      double percentLostPedMarker =
+         pedMarkerCount > 0 ? 100.0 * numLostPedMarkers / pedMarkerCount : 0.0;
+
+      // Literal percent signs must be written as %% in a format string
+      if (percentLostPedMarker > 90.0)
+         {
+         if (!forceImputation)
+            error(">90%% [%3.2f%%] of SNPs in sample marker information file are not in reference. Imputation is\n"
+                  "aborted. If this number looks correct to you, turn on --forceImputation.\n\n", percentLostPedMarker);
+         else
+            printf("WARNING: >90%% [%3.2f%%] of SNPs in sample marker information file are not in reference.\n", percentLostPedMarker);
+         }
+      }
+
+   bool warnings = false;
+   bool errors = false;
+   int numbersToBases = 0;
+
+   for (int i = 0; i < markerCount; i++)
+      {
+      bool bad_marker = false;
+      MarkerInfo * info = Pedigree::GetMarkerInfo(i);
+
+      if (autoFlip && info->CountAlleles() > haplotypes[count][i])
+         if (RenameAlleles(ped, i))
+            numbersToBases++;
+
+      // The last row in the haplotypes table stores the number of distinct alleles
+      // observed in the input files
+      if (info->CountAlleles() > haplotypes[count][i] && info->CountAlleles() > 2)
+         {
+         if (autoFlip && FixStrand(ped, i))
+            printf("Fixed alleles for marker %s ... ", (const char *) info->name);
+         else
+            {
+            // This branch is only reached with more than two alleles
+            printf("Mismatched alleles for marker %s ... ", (const char *) info->name);
+            errors = true;
+            bad_marker = true;
+            }
+
+         printf("Phased Haps: [%s", (const char *) info->GetAlleleLabel(1));
+         for (int j = 2; j < haplotypes[count][i] + 1; j++)
+            printf(",%s", (const char *) info->GetAlleleLabel(j));
+         printf("] Pedigree: [%s", (const char *) info->GetAlleleLabel(haplotypes[count][i]+1));
+         for (int j = haplotypes[count][i] + 2; j < info->CountAlleles() + 1; j++)
+            printf(",%s", (const char *) info->GetAlleleLabel(j));
+         printf("]\n");
+
+         warnings = true;
+         }
+
+      if (bad_marker && autoFlip)
+         {
+         printf(" Genotypes for marker %s will be discarded\n", (const char *) info->name);
+
+         for (int j = 0; j < ped.count; j++)
+            ped[j].markers[i][0] = ped[j].markers[i][1] = 0;
+         }
+
+      // We do a final sanity check to see if the allele frequencies are
+      // similar in the pedigree to be phased in the haplotype file.
+      // Only alleles 1 and 2 are tallied: a mismatched marker can still
+      // carry higher allele numbers, which must not index past the arrays.
+      int hapCount[2] = {0, 0};
+
+      for (int j = 0; j < count; j++)
+         {
+         int allele = haplotypes[j][i];
+
+         if (allele == 1 || allele == 2)
+            hapCount[allele - 1]++;
+         }
+
+      int pedCount[2] = {0, 0};
+
+      for (int j = 0; j < ped.count; j++)
+         if (ped[j].markers[i].isKnown())
+            for (int k = 0; k < 2; k++)
+               {
+               int allele = ped[j].markers[i][k];
+
+               if (allele == 1 || allele == 2)
+                  pedCount[allele - 1]++;
+               }
+
+      double pedTotal = pedCount[0] + pedCount[1];
+
+      if (pedCount[0] + pedCount[1] <= 1) continue;
+
+      double freq = (pedCount[0] + hapCount[0]) / (count + pedTotal);
+      double offset = square(hapCount[0] - freq * count);
+      double chisq = offset / (count * freq + ZEPS) +
+                     offset / (count * (1.0 - freq) + ZEPS) +
+                     offset / (pedTotal * (1.0 - freq) + ZEPS) +
+                     offset / (pedTotal * freq + ZEPS);
+
+      // Should only be exceed about 1/10,000 tries
+      if (chisq > 15.13)
+         {
+         printf("Warning: Allele %s (at %s) has frequency %f in phased haplos, but %f in the sample\n",
+                (const char *) info->GetAlleleLabel(1),
+                (const char *) Pedigree::markerNames[i],
+                hapCount[0] / (count + ZEPS), pedCount[0] / (pedTotal + ZEPS));
+         warnings = true;
+         }
+
+      // TODO -- we should probably calculate Fst to compare the two
+      // sets of haplotypes ...
+      }
+
+   if (numbersToBases)
+      printf("Numeric labels converted to bases at %d markers...\n", numbersToBases);
+
+   if (errors && !autoFlip)
+      error("Please ensure that allele labels in pedigree are consistent with haplotype file\n");
+
+   if (warnings)
+      printf("\n");
+   }
+
+// Prints the memory taken by the loaded haplotypes (one char per marker
+// per haplotype). Nothing is printed when no haplotypes are loaded.
+void HaplotypeLoader::ShowMemoryInfo()
+   {
+   if (count == 0) return;
+
+   // The product is formed in floating point: markers x haplotypes can
+   // exceed the range of int for large reference panels.
+   // NOTE(review): assumes MemoryInfo() accepts a double -- confirm in MemoryInfo.h
+   double bytes = (double) markerCount * count * sizeof(char);
+
+   printf(" %40s %s\n", "Phase known haplotypes ...", (const char *) MemoryInfo(bytes));
+   }
+
+// The function below converts numeric allele labels to base-pairs
+// (assuming 1,2,3,4 match A,C,G,T)
+
+// Converts numeric allele labels found in the pedigree into base labels
+// (1,2,3,4 -> A,C,G,T) for one marker, so that they can be matched with
+// the labels used in the reference haplotypes.
+//
+// The first haplotypes[count][marker] alleles are the ones known from the
+// haplotype file; the remaining ones were introduced by the pedigree.
+// Returns false, changing nothing, unless every haplotype allele is a
+// single base letter, every pedigree-only allele is a single digit 1-4,
+// at most two alleles were added and at most four exist in total.
+// On success genotypes in ped are renumbered, the marker's label tables
+// are replaced and true is returned.
+bool HaplotypeLoader::RenameAlleles(Pedigree & ped, int marker)
+   {
+   int base_alleles = haplotypes[count][marker];
+   int total_alleles = ped.CountAlleles(marker);
+
+   MarkerInfo * info = ped.GetMarkerInfo(marker);
+
+   // Only apply fix to two alleles at a time
+   if (total_alleles - base_alleles > 2 || total_alleles > 4)
+      return false;
+
+   // Abort unless all old alleles are labeled as bases
+   for (int i = 1; i <= base_alleles; i++)
+      {
+      if (info->alleleLabels[i].Length() != 1)
+         return false;
+
+      // <ctype.h> functions need an unsigned char value
+      int label = toupper((unsigned char) info->alleleLabels[i][0]);
+
+      if (label != 'A' && label != 'C' && label != 'G' && label != 'T')
+         return false;
+      }
+
+   // Abort unless all new alleles are named as numbers
+   for (int i = base_alleles + 1; i <= total_alleles; i++)
+      if (info->alleleLabels[i].Length() != 1 ||
+          info->alleleLabels[i][0] < '1' ||
+          info->alleleLabels[i][0] > '4')
+         return false;
+
+   StringArray newLabels;
+   StringIntHash newNumbers;
+
+   // Slot 0 is kept empty so that allele numbers start at 1
+   newLabels.Push("");
+   for (int i = 1; i <= base_alleles; i++)
+      {
+      newLabels.Push(info->alleleLabels[i]);
+      newNumbers.Add(info->alleleLabels[i], i);
+      }
+
+   int nextAllele = base_alleles + 1;
+   int rename[5] = {0, 0, 0, 0, 0};
+   const char * bases[4] = {"A", "C", "G", "T"};
+
+   // Next, generate a new set of allele labels and an appropriate index
+   for (int i = base_alleles + 1; i <= total_alleles; i++)
+      {
+      int base = info->alleleLabels[i][0] - '1';
+
+      int newNumber = newNumbers.Integer(bases[base]);
+
+      // The base is already known: reuse its number
+      if (newNumber > 0)
+         {
+         rename[i] = newNumber;
+         continue;
+         }
+
+      newLabels.Push(bases[base]);
+      newNumbers.Add(newLabels.Last(), nextAllele);
+      rename[i] = nextAllele++;
+      }
+
+   // Finally, apply the renaming filter to the rest of the pedigree
+   for (int i = 0; i < ped.count; i++)
+      {
+      if (rename[ped[i].markers[marker][0]])
+         ped[i].markers[marker][0] = rename[ped[i].markers[marker][0]];
+      if (rename[ped[i].markers[marker][1]])
+         ped[i].markers[marker][1] = rename[ped[i].markers[marker][1]];
+      }
+
+   info->alleleLabels = newLabels;
+   info->alleleNumbers = newNumbers;
+
+   return true;
+   }
+
+// The code below tries to automatically fix allele flips, but is not
+// enabled by default -- I am not convinced this is a good idea??
+
+// Tries to repair a strand flip at one marker. The attempt is abandoned
+// when any known genotype uses an allele from the haplotype file or an
+// allele beyond the first two pedigree-only ones. Otherwise the one or two
+// pedigree-only alleles are handed to FlipAllele / FlipAlleles.
+bool HaplotypeLoader::FixStrand(Pedigree & ped, int marker)
+   {
+   int base_alleles = haplotypes[count][marker];
+   int firstExtra = base_alleles + 1;
+   int secondExtra = base_alleles + 2;
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      if (!ped[person].markers[marker].isKnown())
+         continue;
+
+      if (ped[person].markers[marker].Lo() <= base_alleles ||
+          ped[person].markers[marker].Hi() > secondExtra)
+         return false;
+      }
+
+   MarkerInfo * info = ped.GetMarkerInfo(marker);
+
+   if (info->CountAlleles() == firstExtra)
+      return FlipAllele(ped, marker, firstExtra);
+
+   if (info->CountAlleles() == secondExtra)
+      return FlipAlleles(ped, marker, firstExtra, secondExtra);
+
+   return false;
+   }
+
+// Replaces allele al1 in the pedigree by its complementary base, provided
+// the label is a single character and the complement is an allele with a
+// lower number than al1. Returns whether the replacement was made.
+bool HaplotypeLoader::FlipAllele(Pedigree & ped, int marker, int al1)
+   {
+   MarkerInfo * info = Pedigree::GetMarkerInfo(marker);
+   String original = info->GetAlleleLabel(al1);
+
+   if (original.Length() != 1)
+      return false;
+
+   String complement = FlipAllele(original);
+   int target = info->GetAlleleNumber(complement);
+
+   // The complement has to be a valid allele that precedes al1
+   bool usable = target >= 1 && target < al1;
+
+   if (!usable)
+      return false;
+
+   for (int person = 0; person < ped.count; person++)
+      for (int strand = 0; strand < 2; strand++)
+         if (ped[person].markers[marker][strand] == al1)
+            ped[person].markers[marker][strand] = target;
+
+   return true;
+   }
+
+// Replaces alleles al1 and al2 in the pedigree by their complementary
+// bases. Both labels must be single characters, al1 must not complement
+// to al2, and both complements must be allele 1 or 2. Returns whether the
+// replacement was made.
+bool HaplotypeLoader::FlipAlleles(Pedigree & ped, int marker, int al1, int al2)
+   {
+   MarkerInfo * info = Pedigree::GetMarkerInfo(marker);
+   String firstLabel = info->GetAlleleLabel(al1);
+   String secondLabel = info->GetAlleleLabel(al2);
+
+   if (firstLabel.Length() != 1 || secondLabel.Length() != 1)
+      return false;
+
+   String firstComplement = FlipAllele(firstLabel);
+   String secondComplement = FlipAllele(secondLabel);
+
+   int firstTarget = info->GetAlleleNumber(firstComplement);
+   int secondTarget = info->GetAlleleNumber(secondComplement);
+
+   // The two new alleles are each other's complement: nothing to fix
+   if (firstTarget == al2)
+      return false;
+
+   bool firstOk = firstTarget >= 1 && firstTarget <= 2;
+   bool secondOk = secondTarget >= 1 && secondTarget <= 2;
+
+   if (!firstOk || !secondOk)
+      return false;
+
+   for (int person = 0; person < ped.count; person++)
+      for (int strand = 0; strand < 2; strand++)
+         {
+         int current = ped[person].markers[marker][strand];
+
+         if (current == al1)
+            ped[person].markers[marker][strand] = firstTarget;
+         else if (current == al2)
+            ped[person].markers[marker][strand] = secondTarget;
+         }
+
+   return true;
+   }
+
+// Returns the complementary base (A<->T, C<->G, always upper case) for a
+// one-character allele label, or an empty string for anything else.
+const char * HaplotypeLoader::FlipAllele(String & allele)
+   {
+   if (allele.Length() != 1)
+      return "";
+
+   char base = allele[0];
+
+   if (base == 'A' || base == 'a') return "T";
+   if (base == 'C' || base == 'c') return "G";
+   if (base == 'G' || base == 'g') return "C";
+   if (base == 'T' || base == 't') return "A";
+
+   return "";
+   }
+
+// Creates the named file and writes the marker list to it through the
+// FILE * overload. Nothing happens when the file cannot be created.
+void HaplotypeLoader::WriteMarkerList(const char * filename, int from, int to)
+   {
+   FILE * output = fopen(filename, "wb");
+
+   if (output != NULL)
+      {
+      WriteMarkerList(output, from, to);
+      fclose(output);
+      }
+   }
+
+// Creates the named file and writes the haplotypes to it through the
+// FILE * overload. Nothing happens when the file cannot be created.
+void HaplotypeLoader::WriteHaplotypes(const char * filename, int from, int to)
+   {
+   FILE * output = fopen(filename, "wb");
+
+   if (output != NULL)
+      {
+      WriteHaplotypes(output, from, to);
+      fclose(output);
+      }
+   }
+
+// Writes the markers with index in [from, to) to output.
+//
+// from == -1 starts at the first marker; to == -1 stops after the last
+// marker known to the loader. In HapMap mode a legend is written (header
+// line, then name, position and the two allele labels per marker; markers
+// without a positive position get their 1-based index instead). Otherwise
+// one marker name is written per line.
+void HaplotypeLoader::WriteMarkerList(FILE * output, int from, int to)
+   {
+   if (from == -1) from = 0;
+
+   // The loops below index markers, so the default upper bound is the
+   // number of markers, not the number of haplotypes
+   if (to == -1) to = markerCount;
+
+   if (hapmapFormat)
+      {
+      fprintf(output, "rs\tposition\t0\t1\n");
+      for (int i = from; i < to; i++)
+         {
+         MarkerInfo * info = Pedigree::GetMarkerInfo(i);
+
+         fprintf(output, "%s\t%d\t%s\t%s\n",
+                 (const char *) info->name, info->position > 0 ? (int) info->position : i+1,
+                 (const char *) info->GetAlleleLabel(1),
+                 (const char *) info->GetAlleleLabel(2));
+         }
+      }
+   else
+      {
+      for (int i = from; i < to; i++)
+         fprintf(output, "%s\n", (const char *) Pedigree::markerNames[i]);
+      }
+   }
+
+// Writes every loaded haplotype, restricted to markers [from, to), to
+// output -- one haplotype per line.
+//
+// from == -1 starts at the first marker; to == -1 stops after the last
+// marker known to the loader. In HapMap mode alleles are written as 0/1.
+// Otherwise each line starts with a "HAP<n>" label (n is the 1-based
+// haplotype number) followed by the allele labels of each marker.
+void HaplotypeLoader::WriteHaplotypes(FILE * output, int from, int to)
+   {
+   if (from == -1) from = 0;
+
+   // Columns are markers, so the default upper bound is the number of
+   // markers, not the number of haplotypes
+   if (to == -1) to = markerCount;
+
+   if (hapmapFormat)
+      for (int i = 0; i < count; i++)
+         {
+         for (int j = from; j < to; j++)
+            fprintf(output, "%d ", haplotypes[i][j] - 1);
+         fprintf(output, "\n");
+         }
+   else
+      for (int i = 0; i < count; i++)
+         {
+         // The label belongs in the output file, and identifies this row
+         fprintf(output, "HAP%d ", i + 1);
+
+         // Allele labels are a property of the marker (column j)
+         for (int j = from; j < to; j++)
+            fprintf(output, "%s ", (const char *) Pedigree::GetMarkerInfo(j)->alleleLabels[haplotypes[i][j]]);
+         fprintf(output, "\n");
+         }
+   }
+
diff --git a/mach1/HaplotypeLoader.h b/mach1/HaplotypeLoader.h
new file mode 100644
index 0000000..3b7c75a
--- /dev/null
+++ b/mach1/HaplotypeLoader.h
@@ -0,0 +1,80 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/HaplotypeLoader.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __HAPLOTYPE_LOADER_H__
+#define __HAPLOTYPE_LOADER_H__
+
+#include "Pedigree.h"
+
+#include <stdio.h>
+
+class HaplotypeLoader // Loads phased reference haplotypes (plain, HapMap or VCF input) and checks them against a pedigree
+ {
+ public:
+ HaplotypeLoader();
+ ~HaplotypeLoader();
+
+ static const int VCF_HEADING_FIELDS = 9; // columns that precede the per-individual columns on a VCF line
+ static bool hapmapFormat; // route marker lists and haplotypes through the HapMap-style readers
+ static bool vcfReference; // route LoadHaplotypes(IFILE) to LoadVcf
+ static bool autoFlip; // let ConsistencyCheck try RenameAlleles / FixStrand on mismatched markers
+ static bool loadPositions; // store positions while reading a HapMap legend
+
+ static double startposition; // LoadVcf ignores records positioned below this value
+ static double endposition; // LoadVcf ignores records positioned above this value
+
+ static bool forceImputation; // warn instead of aborting when >90% of sample markers are missing from the reference
+
+ void LoadMarkerList(const char * filename); // an unopenable file is silently ignored
+ void LoadMarkerList(IFILE file); // one marker name per line, unless hapmapFormat is set
+ void LoadHapMapLegendFile(IFILE file); // first line skipped; then four columns per marker
+
+ void LoadHaplotypes(const char * filename);
+ void LoadHaplotypes(IFILE file); // dispatches on vcfReference / hapmapFormat
+ void LoadHapMapHaplotypes(IFILE file); // lines of '0'/'1' separated by single spaces
+
+ //load both positions ($chrom:$pos marker names for all records, ignoring rsIDs) and haplotypes
+ void LoadVcf(IFILE file);
+
+ void WriteMarkerList(const char * filename, int from = -1, int to = -1); // from/to of -1 select the default range
+ void WriteMarkerList(FILE * file, int from = -1, int to = -1);
+
+ void WriteHaplotypes(const char * filename, int from = -1, int to = -1); // from/to of -1 select the default range
+ void WriteHaplotypes(FILE * file, int from = -1, int to = -1);
+
+ void ConsistencyCheck(Pedigree & ped, int pedMarkerCount); // pedMarkerCount is the denominator of the lost-marker percentage
+ bool RenameAlleles(Pedigree & ped, int marker); // numeric labels 1-4 to A,C,G,T; true when applied
+ bool FixStrand(Pedigree & ped, int marker); // complementary-strand repair; true when applied
+
+ int markerCount; // markers registered by the loaders
+ int count; // haplotypes loaded
+ char ** haplotypes; // count + 1 rows; row [count] holds a per-marker allele count rather than a haplotype
+
+ void ShowMemoryInfo();
+
+ private:
+ double square(double x) // helper for the chi-square statistic in ConsistencyCheck
+ { return x * x; }
+
+ bool FlipAllele(Pedigree & ped, int marker, int allele); // recode one pedigree allele to its complement
+ bool FlipAlleles(Pedigree & ped, int marker, int allele1, int allele2); // recode two pedigree alleles to their complements
+
+ const char * FlipAllele(String & allele1); // complement of a single base label, or "" when there is none
+ };
+
+#endif
+
diff --git a/mach1/Haplotyper.cpp b/mach1/Haplotyper.cpp
new file mode 100644
index 0000000..9aef1df
--- /dev/null
+++ b/mach1/Haplotyper.cpp
@@ -0,0 +1,2755 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/Haplotyper.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Haplotyper.h"
+#include "StringArray.h"
+#include "Pedigree.h"
+#include "MemoryAllocators.h"
+#include "MemoryInfo.h"
+#include "Error.h"
+
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+#ifdef __DOUBLE_HAPLOTYPING__
+#define float double
+#endif
+
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+// Coding for individual genotypes used here is:
+//
+// 0 -- missing!
+// 1 -- homozygous for allele 1
+// 2 -- heterozygous
+// 3 -- homozygous for allele 2
+//
+// 16+ -- ordered genotype
+// 1 -- first allele is 1
+// 2 -- first allele is 2
+// 4 -- second allele is 1
+// 8 -- second allele is 2
+//
+// 32+ -- allele choice depends on another individual
+//
+
+// Build an empty haplotyper: zero dimensions, every pointer member listed
+// below cleared, and every mode flag switched off. No storage is acquired
+// here; see AllocateMemory() and the other Allocate*() methods.
+Haplotyper::Haplotyper()
+   {
+   // Problem dimensions
+   individuals = 0;
+   phased = 0;
+   markers = 0;
+   states = 0;
+
+   // Mode flags
+   readyForUse = false;
+   greedy = false;
+   economyMode = false;
+   orderedGenotypes = false;
+   updateDiseaseScores = false;
+   skipSanityCheck = false;
+
+   // Genotype / haplotype storage and per-individual weights
+   genotypes = NULL;
+   haplotypes = NULL;
+   weights = NULL;
+   orderedGenotypeFlags = NULL;
+
+   // Genetic map
+   thetas = NULL;
+   distances = NULL;
+
+   // Memory pool bookkeeping
+   memoryBlock = NULL;
+   smallMemoryBlock = NULL;
+   stack = NULL;
+   stackPtr = 0;
+
+   // Disease scoring
+   diseaseCount = 0;
+   diseaseStatus = NULL;
+   diseaseScores = NULL;
+
+   // Imputation work areas
+   posterior = NULL;
+   rightMatrices = NULL;
+
+   // Entry [x][y] in the penetrance matrix gives the probability of
+   // observing genotype y when the true genotype is x. By default,
+   // we set up the matrix to assume no errors.
+   // SetErrorRate(0.0);
+   }
+
+Haplotyper::~Haplotyper() // Releases everything the Allocate*() methods may have acquired.
+ {
+ if (individuals != 0 && markers != 0) // non-zero dimensions are taken as the sign that AllocateMemory() has run
+ {
+ // In an efficient version, we should not allocate memory for phased haplotypes
+ // for (int i = 0; i < individuals - phased; i++)
+ // delete [] genotypes[i];
+
+ for (int i = 0; i < individuals; i++) // one genotype row and two haplotype rows per individual
+ {
+ delete [] genotypes[i];
+ delete [] haplotypes[i * 2];
+ delete [] haplotypes[i * 2 + 1];
+ }
+
+ delete [] genotypes;
+ delete [] haplotypes;
+ delete [] marginals;
+
+ delete [] leftMatrices;
+ delete [] leftProbabilities;
+
+ delete [] thetas;
+ delete [] crossovers;
+
+ delete [] error_models;
+ delete [] penetrances;
+
+ for (int i = 0; i < markers; i++) // NOTE(review): only memoryBlock[i] is freed per marker; smallMemoryBlock[i] entries are not -- confirm who owns them
+ if (memoryBlock[i] != NULL)
+ delete [] memoryBlock[i];
+
+ delete [] stack;
+ delete [] memoryBlock;
+ delete [] smallMemoryBlock;
+
+ if (distances != NULL)
+ delete [] distances;
+
+ if (orderedGenotypeFlags != NULL)
+ delete [] orderedGenotypeFlags;
+ }
+
+ if (weights != NULL) // weights are allocated separately (AllocateWeights), so they are released outside the guard above
+ delete [] weights;
+
+ if (diseaseCount != 0) // disease matrices exist only after a successful AllocateDiseaseStatus()
+ {
+ FreeCharMatrix(diseaseStatus, individuals);
+ FreeFloatMatrix(diseaseScores, individuals);
+ FreeFloatMatrix(nplScores, 3);
+ }
+
+ if (rightMatrices != NULL)
+ FreeFloatMatrix(rightMatrices, 2);
+
+ if (posterior != NULL) // posterior and mlinfo are allocated together in AllocateMLEMemory()
+ {
+ FreeFloatMatrix(posterior, 3);
+ FreeFloatMatrix(mlinfo, 4);
+ }
+ }
+
+// (Re)allocate the vector holding one weight per individual. Any previous
+// vector is discarded first; error() is called if the new pointer is null.
+void Haplotyper::AllocateWeights()
+   {
+   // delete [] on a null pointer is a no-op, so no guard is needed
+   delete [] weights;
+
+   weights = new float [individuals];
+
+   if (weights != NULL)
+      return;
+
+   // Trap out of memory conditions
+   error("Out of memory allocating weights for each individual\n");
+   }
+
+// Release the weight vector and record that no weights are in use.
+void Haplotyper::FreeWeights()
+   {
+   float * released = weights;
+
+   weights = NULL;
+   delete [] released;
+   }
+
+// Give each individual a weight equal to its number of non-missing genotypes
+// (phased individuals count as fully typed). If the weights sum to zero the
+// weight vector is released again.
+void Haplotyper::CalculateWeights()
+   {
+   AllocateWeights();
+
+   const int unphased = individuals - phased;
+   float total = 0.0;
+
+   // Unphased individuals: count the markers with an observed genotype
+   for (int person = 0; person < unphased; person++)
+      {
+      weights[person] = 0.0;
+
+      for (int m = 0; m < markers; m++)
+         {
+         if (genotypes[person][m] == GENOTYPE_MISSING)
+            continue;
+
+         weights[person]++;
+         }
+
+      // An untyped individual still gets a tiny positive weight
+      if (weights[person] == 0.0)
+         weights[person] = 1e-30;
+
+      total += weights[person];
+      }
+
+   // Phase known haplotypes get the maximum weight
+   for (int person = unphased; person < individuals; person++)
+      {
+      weights[person] = markers;
+      total += weights[person];
+      }
+
+   // Give up if there are no genotyped individuals
+   if (total == 0.0)
+      FreeWeights();
+   }
+
+// Fill the haplotype matrix with a random starting configuration that is
+// compatible with the observed genotypes.
+//
+// rand -- random number source; NULL selects globalRandom.
+//
+// For every marker the frequency of allele 2 (haplotype code 1) is estimated
+// from the unphased genotypes and from the phased haplotypes stored at the
+// end of the haplotypes array. Alleles that the data leave undetermined are
+// drawn with that frequency; heterozygotes receive a random phase.
+void Haplotyper::RandomSetup(Random * rand)
+   {
+   if (rand == NULL)
+      rand = &globalRandom;
+
+   for (int j = 0; j < markers; j++)
+      {
+      // alleles counts observed allele copies, mac counts copies of allele 2
+      int alleles = 0, mac = 0;
+
+      for (int i = 0; i < individuals - phased; i++)
+         if (genotypes[i][j] != GENOTYPE_MISSING)
+            {
+            if ((genotypes[i][j] & GENOTYPE_ORDERED) == 0)
+               alleles += 2, mac += genotypes[i][j] - 1;
+            else
+               {
+               // Ordered genotypes may have only one of the two alleles observed
+               alleles += ((genotypes[i][j] & FIRST_ALLELE) != 0) +
+                          ((genotypes[i][j] & SECOND_ALLELE) != 0);
+               mac += ((genotypes[i][j] & FIRST_ALLELE_TWO) != 0) +
+                      ((genotypes[i][j] & SECOND_ALLELE_TWO) != 0);
+               }
+            }
+
+      // Phased individuals contribute both of their known haplotypes
+      for (int i = individuals - phased; i < individuals; i++)
+         {
+         mac += haplotypes[i * 2][j] == 1;
+         mac += haplotypes[i * 2 + 1][j] == 1;
+         }
+      alleles += phased * 2;
+
+      // Nothing observed at this marker: set every haplotype to allele code 0
+      if (alleles == 0)
+         {
+         for (int i = 0; i < individuals; i++)
+            haplotypes[i * 2][j] = haplotypes[i * 2 + 1][j] = 0;
+         continue;
+         }
+
+      float freq = mac / (float) alleles;
+
+      for (int i = 0; i < individuals - phased; i++)
+         switch (genotypes[i][j] & ~GENOTYPE_LINKED)
+            {
+            case 0:
+               // Missing: draw both alleles from the estimated frequency
+               haplotypes[i * 2][j] = rand->Next() < freq;
+               haplotypes[i * 2 + 1][j] = rand->Next() < freq;
+               break;
+            case 1:
+               haplotypes[i * 2][j] = 0;
+               haplotypes[i * 2 + 1][j] = 0;
+               break;
+            case 2:
+               {
+               // Heterozygote: pick the phase at random
+               int bit = rand->Binary();
+
+               haplotypes[i * 2][j] = bit;
+               haplotypes[i * 2 + 1][j] = bit ^ 1;
+               }
+               break;
+            case 3:
+               haplotypes[i * 2][j] = 1;
+               haplotypes[i * 2 + 1][j] = 1;
+               break;
+            default:
+               // Ordered genotype: keep each observed allele and draw any
+               // unobserved one with probability freq of being allele 2,
+               // the same rule used for missing genotypes above (the
+               // comparison used to be inverted here).
+               haplotypes[i * 2][j] = genotypes[i][j] & FIRST_ALLELE ?
+                  (genotypes[i][j] & FIRST_ALLELE_TWO) > 0 : rand->Next() < freq;
+               haplotypes[i * 2 + 1][j] = genotypes[i][j] & SECOND_ALLELE ?
+                  (genotypes[i][j] & SECOND_ALLELE_TWO) > 0 : rand->Next() < freq;
+            }
+      }
+   }
+
+// Allocate the core working storage for a data set.
+//
+// persons   -- number of individuals
+// maxStates -- requested number of hidden states; used (rounded down to an
+//              even number) only when it lies strictly between 1 and
+//              2 * persons, otherwise 2 * persons - 2 states are used
+// m         -- number of markers
+//
+// Returns the new value of readyForUse: false when one of the checked
+// allocations came back null, true otherwise.
+bool Haplotyper::AllocateMemory(int persons, int maxStates, int m)
+   {
+   individuals = persons;
+   states = maxStates > 1 && maxStates < individuals * 2 ? maxStates & ~1: individuals * 2 - 2;
+   markers = m;
+
+   genotypes = AllocateCharMatrix(individuals, markers);
+   haplotypes = AllocateCharMatrix(individuals * 2, markers);
+
+   // Twice the number of states, as TransposeOrdered() keeps row and column sums
+   marginals = new float [states * 2];
+
+   leftMatrices = new float * [markers];
+   leftProbabilities = new float [markers];
+
+   memoryBlock = new float * [markers];
+   smallMemoryBlock = new float * [markers];
+   smallFree = 0;
+
+   stack = new int [markers];
+   stackPtr = -1;
+
+   // One entry per interval between adjacent markers
+   thetas = new float [markers - 1];
+   crossovers = new int [markers - 1];
+
+   // Nine penetrance entries per marker
+   error_models = new Errors [markers];
+   penetrances = new float [markers * 9];
+
+   if (genotypes == NULL || haplotypes == NULL || marginals == NULL ||
+       leftMatrices == NULL || leftProbabilities == NULL || thetas == NULL ||
+       crossovers == NULL || error_models == NULL || penetrances == NULL)
+      return readyForUse = false;
+
+   for (int i = 0; i < markers; i++)
+      memoryBlock[i] = smallMemoryBlock[i] = NULL;
+
+   for (int i = 0; i < markers - 1; i++)
+      thetas[i] = 0.01;
+
+   gridSize = economyMode ? (int) sqrt((double)markers) : markers;
+
+   orderedGenotypeFlags = new int [individuals];
+
+   // Clear each flag. The loop used to assign 0 to the pointer itself,
+   // which leaked the array and left the member null.
+   for (int i = 0; i < individuals; i++)
+      orderedGenotypeFlags[i] = 0;
+
+   return readyForUse = true;
+   }
+
+// Allocate and zero the storage needed to score nDiseases disease traits.
+//
+// nDiseases -- number of traits; 0 means nothing to allocate.
+//
+// Returns true on success (or when nDiseases is 0). On failure every matrix
+// obtained so far is released, diseaseCount is reset to 0 and false is
+// returned. The right-hand matrices are allocated as a side effect.
+bool Haplotyper::AllocateDiseaseStatus(int nDiseases)
+   {
+   diseaseCount = nDiseases;
+
+   if (diseaseCount == 0)
+      return true;
+
+   readyForUse = AllocateRightMatrices();
+
+   if (!readyForUse)
+      return false;
+
+   diseaseStatus = AllocateCharMatrix(individuals, nDiseases);
+   diseaseScores = AllocateFloatMatrix(individuals, nDiseases * markers);
+   nplScores = AllocateFloatMatrix(3, nDiseases);
+
+   if (diseaseScores == NULL || diseaseStatus == NULL || nplScores == NULL)
+      {
+      diseaseCount = 0;
+      if (diseaseStatus != NULL) FreeCharMatrix(diseaseStatus, individuals);
+      if (diseaseScores != NULL) FreeFloatMatrix(diseaseScores, individuals);
+      // nplScores has 3 rows (see the allocation above and the destructor);
+      // the row count passed here used to be 2.
+      if (nplScores != NULL) FreeFloatMatrix(nplScores, 3);
+      return false;
+      }
+
+   for (int i = 0; i < individuals; i++)
+      for (int j = 0; j < nDiseases * markers; j++)
+         diseaseScores[i][j] = 0.0;
+
+   for (int i = 0; i < individuals; i++)
+      for (int j = 0; j < nDiseases; j++)
+         diseaseStatus[i][j] = 0;
+
+   return true;
+   }
+
+// Allocate the work areas used when imputing genotypes: the right-hand
+// matrices, a 3 x markers posterior table and a 4 x markers mlinfo table.
+// Returns (and stores in readyForUse) whether every allocation succeeded;
+// on failure whatever was obtained is released again.
+bool Haplotyper::AllocateMLEMemory()
+   {
+   readyForUse = AllocateRightMatrices();
+   if (!readyForUse)
+      return false;
+
+   posterior = AllocateFloatMatrix(3, markers);
+   mlinfo = AllocateFloatMatrix(4, markers);
+
+   if (posterior != NULL && mlinfo != NULL)
+      {
+      readyForUse = true;
+      return true;
+      }
+
+   readyForUse = false;
+
+   // Roll back the partial allocation
+   if (rightMatrices != NULL)
+      FreeFloatMatrix(rightMatrices, 2);
+   if (posterior != NULL)
+      FreeFloatMatrix(posterior, 3);
+   if (mlinfo != NULL)
+      FreeFloatMatrix(mlinfo, 4);
+
+   return false;
+   }
+
+// Allocate the two right-hand state matrices. Each holds states * states
+// cells for ordered genotypes, or states * (states + 1) / 2 cells otherwise.
+// Returns true when the allocation succeeded.
+bool Haplotyper::AllocateRightMatrices()
+   {
+   int cells;
+
+   if (orderedGenotypes)
+      cells = states * states;
+   else
+      cells = states * (states + 1) / 2;
+
+   rightMatrices = AllocateFloatMatrix(2, cells);
+
+   if (rightMatrices == NULL)
+      return false;
+
+   return true;
+   }
+
+// Allocate one distance per interval between adjacent markers.
+// Returns true when the resulting pointer is non-null.
+bool Haplotyper::AllocateDistances()
+   {
+   const int intervals = markers - 1;
+
+   distances = new float [intervals];
+
+   const bool allocated = (distances != NULL);
+   return allocated;
+   }
+
+bool Haplotyper::ForceMemoryAllocation() // Requests every memory block up front; returns false as soon as one request leaves leftMatrices[] null.
+ {
+ // Cycle through individuals, with the exact same steps as the actual
+ // haplotyper and request memory ... by requesting all memory upfront,
+ // we force crashes to happen early.
+ for (int i = 0; i < individuals - phased; i++) // unphased individuals only
+ {
+ ResetMemoryPool();
+ GetMemoryBlock(0); // NOTE(review): presumably stores the block in leftMatrices[0], which is checked next -- confirm
+
+ if (leftMatrices[0] == NULL)
+ return false;
+
+ int skipped = 0; // markers for which no block was requested
+ for (int j = 1; j < markers; j++)
+ if (genotypes[i][j] != GENOTYPE_MISSING || j == markers - 1) // same test ScoreLeftConditional() uses to decide which markers get a block
+ {
+ GetMemoryBlock(j);
+
+ if (leftMatrices[j] == NULL)
+ return false;
+ }
+ else
+ skipped++;
+
+ if (skipped == 0) break; // a block was requested for every marker, so later individuals cannot ask for more
+ }
+
+ if (!phased) // the small blocks below are requested only when phased haplotypes are present
+ return true;
+
+ ResetMemoryPool();
+ for (int j = 0; j < markers; j++) // one small block per marker, as in ScoreLeftConditionalForHaplotype()
+ {
+ GetSmallMemoryBlock(j);
+
+ if (leftMatrices[j] == NULL)
+ return false;
+ }
+
+ return true;
+ }
+
+void Haplotyper::Transpose(float * source, float * dest, float theta) // Carries a triangular state-pair matrix across an interval with recombination fraction theta.
+ { // source and dest are walked row by row, row i holding columns 0..i
+ if (theta == 0.0) // no recombination: dest is a straight copy of source
+ {
+ for (int i = 0; i < states; i++)
+ for (int j = 0; j <= i; j++, dest++, source++)
+ *dest = *source;
+
+ return;
+ }
+
+ float sum = 0.0; // total of all cells in source
+ float * probability = source;
+ float * output = dest;
+
+ for (int i = 0; i < states; i++)
+ marginals[i] = 0.0;
+
+ for (int i = 0; i < states; i++) // accumulate per-state marginals and the grand total
+ {
+ for (int j = 0; j < i; j++) // off-diagonal cell (i, j) counts towards both states
+ {
+ sum += *probability;
+ marginals[i] += *probability;
+ marginals[j] += *probability;
+ probability++;
+ }
+
+ sum += *probability; // diagonal cell (i, i)
+ marginals[i] += (*probability) * 2.0; // both haplotypes are in state i, so it is added twice
+ probability++;
+ }
+
+ probability = source; // rewind for the output pass
+
+ float no_change = (1.0 - theta) * (1.0 - theta); // neither haplotype switches state
+ float one_change = (1.0 - theta) * theta / states; // exactly one haplotype switches, landing on a given state
+ float two_changes = sum * theta * theta / (states * states); // both switch; already multiplied by the total mass
+
+ // Automatically rescale likelihoods when they get too small
+ if (sum < 1e-15)
+ {
+ no_change *= 1e30;
+ one_change *= 1e30;
+ two_changes *= 1e30;
+ }
+
+ // This final loop actually transposes the probabilities for each state
+ if (weights == NULL) // unweighted: every state is an equally likely target
+ for (int i = 0; i < states; i++)
+ {
+ for (int j = 0; j < i; j++)
+ {
+ *output = *probability * no_change +
+ marginals[i] * one_change +
+ marginals[j] * one_change +
+ 2 * two_changes; // doubled on off-diagonal cells only; the diagonal below uses it once
+
+ probability++;
+ output++;
+ }
+
+ *output = *probability * no_change + // diagonal cell
+ marginals[i] * one_change +
+ two_changes;
+
+ probability++;
+ output++;
+ }
+ else // weighted: switching into a state is scaled by the weight of the individual (state / 2) it belongs to
+ for (int i = 0; i < states; i++)
+ {
+ for (int j = 0; j < i; j++)
+ {
+ *output = *probability * no_change +
+ marginals[i] * one_change * weights[j / 2] +
+ marginals[j] * one_change * weights[i / 2]+
+ 2 * two_changes * weights[i / 2] * weights[j / 2];
+
+ probability++;
+ output++;
+ }
+
+ *output = *probability * no_change + // diagonal cell
+ marginals[i] * one_change * weights[i / 2] +
+ two_changes * weights[i / 2] * weights[i / 2];
+
+ probability++;
+ output++;
+ }
+ }
+
+// Carry a full states x states (ordered) state-pair matrix across an
+// interval with recombination fraction theta.
+//
+// source -- input matrix, states * states cells in row-major order
+// dest   -- output matrix of the same layout
+// theta  -- recombination fraction; 0 makes dest a plain copy of source
+//
+// marginals[0 .. states-1] receives the row sums (first haplotype) and
+// marginals[states .. 2*states-1] the column sums (second haplotype).
+void Haplotyper::TransposeOrdered(float * source, float * dest, float theta)
+   {
+   if (theta == 0.0)
+      {
+      for (int i = 0; i < states; i++)
+         for (int j = 0; j < states; j++, dest++, source++)
+            *dest = *source;
+
+      return;
+      }
+
+   float sum = 0.0;
+   float * probability = source;
+   float * output = dest;
+
+   for (int i = 0; i < 2 * states; i++)
+      marginals[i] = 0.0;
+
+   for (int i = 0; i < states; i++)
+      for (int j = 0; j < states; j++)
+         {
+         sum += *probability;
+         marginals[i] += *probability;
+         marginals[states + j] += *probability;
+         probability++;
+         }
+
+   probability = source;
+
+   float no_change = (1.0 - theta) * (1.0 - theta);
+   float one_change = (1.0 - theta) * theta / states;
+   float two_changes = sum * theta * theta / (states * states);
+
+   // Automatically rescale likelihoods when they get too small
+   if (sum < 1e-15)
+      {
+      no_change *= 1e30;
+      one_change *= 1e30;
+      two_changes *= 1e30;
+      }
+
+   // This final loop actually transposes the probabilities for each state.
+   // NOTE(review): the factor 2 on two_changes mirrors Transpose(), where
+   // it applies to off-diagonal cells only; confirm it is intended for
+   // every cell of the full square matrix.
+   if (weights == NULL)
+      for (int i = 0; i < states; i++)
+         // The inner bound used to be j < i, which is the triangular layout
+         // of Transpose(); the ordered matrix is square, so every column
+         // must be visited, as in the weighted branch below.
+         for (int j = 0; j < states; j++, probability++, output++)
+            *output = *probability * no_change +
+                      marginals[i] * one_change +
+                      marginals[states + j] * one_change +
+                      2 * two_changes;
+   else
+      for (int i = 0; i < states; i++)
+         for (int j = 0; j < states; j++, probability++, output++)
+            *output = *probability * no_change +
+                      marginals[i] * one_change * weights[j / 2] +
+                      marginals[states + j] * one_change * weights[i / 2]+
+                      2 * two_changes * weights[i / 2] * weights[j / 2];
+   }
+
+void Haplotyper::TransposeHaplotype(float * source, float * dest, float theta) // Single-haplotype counterpart of Transpose(): source and dest are vectors with one cell per state.
+ {
+ if (theta == 0.0) // no recombination: plain copy
+ {
+ for (int i = 0; i < states; i++)
+ dest[i] = source[i];
+
+ return;
+ }
+
+ float sum = 0.0; // total mass over all states
+ for (int i = 0; i < states; i++)
+ sum += source[i];
+
+ float no_change = 1.0 - theta; // the haplotype stays in its state
+ float one_change = sum * theta / states; // it switches; already multiplied by the total mass
+
+ // Automatically rescale likelihoods when they get too small
+ if (sum < 1e-15)
+ {
+ no_change *= 1e30;
+ one_change *= 1e30;
+ }
+
+ // This final loop actually transposes the probabilities for each state
+ if (weights == NULL)
+ for (int i = 0; i < states; i++)
+ dest[i] = source[i] * no_change + one_change;
+ else // switching into state i is scaled by the weight of individual i / 2
+ for (int i = 0; i < states; i++)
+ dest[i] = source[i] * no_change + one_change * weights[i / 2];
+ }
+
+void Haplotyper::ConditionOnData(float * matrix, int marker, char genotype) // Multiplies every cell of a triangular state-pair matrix by the penetrance of the observed genotype.
+ {
+ // We treat missing genotypes as uninformative about the mosaic's
+ // underlying state. If we were to allow for deletions and the like,
+ // that may no longer be true.
+ if (genotype == GENOTYPE_MISSING)
+ return;
+
+ for (int i = 0; i < states; i++)
+ {
+ double factors[2]; // indexed by the allele carried by the second state j
+
+ factors[0] = Penetrance(marker, haplotypes[i][marker], genotype - 1); // genotype - 1 maps the 1..3 genotype codes to 0..2; NOTE(review): presumably Penetrance(marker, true genotype, observed genotype) -- confirm
+ factors[1] = Penetrance(marker, haplotypes[i][marker] + 1, genotype - 1); // second argument is the allele sum of states i and j
+
+ for (int j = 0; j <= i; j++, matrix++) // row i of the triangular matrix holds columns 0..i
+ *matrix *= factors[haplotypes[j][marker]];
+ }
+ }
+
+// Multiply every cell of a full states x states matrix by the likelihood of
+// the observed genotype at one marker.
+//
+// matrix   -- states * states cells in row-major order, updated in place
+// marker   -- marker index
+// genotype -- observed genotype code; GENOTYPE_MISSING leaves matrix as is
+//
+// Unordered genotypes (codes 1..3) are scored through the penetrance table;
+// ordered genotypes are scored allele by allele using the marker's error
+// rate, with an unobserved allele contributing a factor of 1.
+void Haplotyper::ConditionOnOrderedData(float * matrix, int marker, char genotype)
+   {
+   if (genotype == GENOTYPE_MISSING)
+      return;
+
+   if ((genotype & GENOTYPE_ORDERED) == 0)
+      {
+      for (int i = 0; i < states; i++)
+         {
+         double factors[2];
+
+         // Unordered genotypes are coded 1..3, so subtract one to obtain
+         // the observed-genotype index, exactly as ConditionOnData() does
+         // (the raw code used to be passed here).
+         factors[0] = Penetrance(marker, haplotypes[i][marker], genotype - 1);
+         factors[1] = Penetrance(marker, haplotypes[i][marker] + 1, genotype - 1);
+
+         for (int j = 0; j < states; j++, matrix++)
+            *matrix *= factors[haplotypes[j][marker]];
+         }
+
+      return;
+      }
+
+   double erate = GetErrorRate(marker);
+   double complement = 1.0 - erate;
+
+   // first_factor[a] / second_factor[a]: likelihood of the observed first /
+   // second allele when the state carries allele code a
+   double first_factor[2], second_factor[2];
+
+   first_factor[0] = genotype & FIRST_ALLELE ?
+      (genotype & FIRST_ALLELE_ONE ? complement : erate) : 1.0;
+   first_factor[1] = genotype & FIRST_ALLELE ?
+      (genotype & FIRST_ALLELE_ONE ? erate : complement) : 1.0;
+
+   second_factor[0] = genotype & SECOND_ALLELE ?
+      (genotype & SECOND_ALLELE_ONE ? complement : erate) : 1.0;
+   second_factor[1] = genotype & SECOND_ALLELE ?
+      (genotype & SECOND_ALLELE_ONE ? erate : complement) : 1.0;
+
+   for (int i = 0; i < states; i++)
+      {
+      double factors[2];
+
+      factors[0] = first_factor[haplotypes[i][marker]] * second_factor[0];
+      factors[1] = first_factor[haplotypes[i][marker]] * second_factor[1];
+
+      for (int j = 0; j < states; j++, matrix++)
+         *matrix *= factors[haplotypes[j][marker]];
+      }
+   }
+
+// Scale each state's probability by the likelihood of observing allele at
+// this marker: 1 - error rate when the state carries the same allele, the
+// error rate otherwise.
+void Haplotyper::ConditionHaplotypeOnData(float * matrix, int marker, char allele)
+   {
+   const double mismatch = GetErrorRate(marker);
+   const double match = 1.0 - mismatch;
+
+   for (int state = 0; state < states; state++)
+      {
+      if (haplotypes[state][marker] == allele)
+         matrix[state] *= match;
+      else
+         matrix[state] *= mismatch;
+      }
+   }
+
+// Debug-only check that the two haplotypes stored at rows states and
+// states + 1 add up to the genotype of individual states / 2 at every
+// non-missing marker. Always returns true unless _DEBUG is defined.
+bool Haplotyper::SanityCheck()
+   {
+   bool okay = true;
+
+#ifdef _DEBUG
+
+   // Current implementation ignores ordered genotype data
+   if (skipSanityCheck)
+      return true;
+
+   for (int m = 0; m < markers; m++)
+      {
+      if (genotypes[states / 2][m] == GENOTYPE_MISSING)
+         continue;
+
+      if (haplotypes[states][m] + haplotypes[states + 1][m] + 1 ==
+          genotypes[states / 2][m])
+         continue;
+
+      okay = false;
+      printf("Mismatch at marker %d\n", m + 1);
+      }
+#endif
+
+   return okay;
+   }
+
+// Fill a triangular state-pair matrix with the prior probability of each
+// pair: 1 / states^2 on the diagonal and twice that off the diagonal,
+// multiplied by the weights of the two individuals involved when weights
+// are in use.
+void Haplotyper::SetupPrior(float * matrix)
+   {
+   float prior = 1.0 / (states * states);
+
+   if (weights == NULL)
+      {
+      for (int row = 0; row < states; row++)
+         for (int col = 0; col <= row; col++, matrix++)
+            if (col == row)
+               *matrix = prior;
+            else
+               *matrix = 2.0 * prior;
+
+      return;
+      }
+
+   for (int row = 0; row < states; row++)
+      for (int col = 0; col <= row; col++, matrix++)
+         if (col == row)
+            *matrix = prior * weights[row / 2] * weights[row / 2];
+         else
+            *matrix = 2.0 * prior * weights[row / 2] * weights[col / 2];
+   }
+
+// Fill a full states x states matrix with the prior probability of each
+// ordered state pair: 1 / states^2, multiplied by the weights of the two
+// individuals involved when weights are in use.
+void Haplotyper::SetupOrderedPrior(float * matrix)
+   {
+   float prior = 1.0 / (states * states);
+
+   for (int row = 0; row < states; row++)
+      for (int col = 0; col < states; col++, matrix++)
+         {
+         if (weights == NULL)
+            *matrix = prior;
+         else
+            *matrix = prior * weights[row / 2] * weights[col / 2];
+         }
+   }
+
+void Haplotyper::ScoreLeftConditional() // Left-to-right pass for individual states / 2: fills leftMatrices[] at marker 0, at every typed marker and at the last marker.
+ {
+ ResetMemoryPool();
+ GetMemoryBlock(0);
+
+ SetupPrior(leftMatrices[0]);
+ ConditionOnData(leftMatrices[0], 0, genotypes[states / 2][0]);
+
+ double theta = 0.0; // recombination fraction accumulated since the last stored marker
+ float *from = leftMatrices[0]; // most recently stored matrix
+ for (int i = 1; i < markers; i++)
+ {
+ // Cumulative recombination fraction allows us to skip uninformative positions
+ theta = theta + thetas[i - 1] - theta * thetas[i - 1];
+
+ // Skip over uninformative positions to save time
+ if (genotypes[states / 2][i] != GENOTYPE_MISSING || i == markers - 1) // the last marker is always stored
+ {
+ GetMemoryBlock(i);
+
+ Transpose(from, leftMatrices[i], theta);
+ ConditionOnData(leftMatrices[i], i, genotypes[states / 2][i]);
+
+ theta = 0; // restart accumulation from this marker
+ from = leftMatrices[i];
+ }
+ }
+
+ MarkMemoryPool();
+ }
+
+void Haplotyper::ImputeGenotypes() // Right-to-left pass that combines stored left matrices with a running right matrix to fill posterior[] at every marker.
+ {
+ RewindMemoryPool();
+
+ // Process the last position
+ RetrieveMemoryBlock(markers - 1);
+ ImputeGenotypes(leftMatrices[markers - 1], markers - 1); // the left matrix alone suffices at the last marker
+
+ SetupPrior(rightMatrices[0]);
+ ConditionOnData(rightMatrices[0], markers - 1, genotypes[states / 2][markers - 1]); // right matrix starts from the data at the last marker
+
+ float *temp;
+ float *from = rightMatrices[0]; // the two right matrices are used alternately
+ float *to = rightMatrices[1];
+
+ for (int i = markers - 2; i >= 0; i--)
+ {
+ // Move things along
+ Transpose(from, to, thetas[i]);
+
+ // Find nearest informative marker
+ double theta = 0.0;
+ int left = i;
+
+ while (left > 0 && genotypes[states / 2][left] == GENOTYPE_MISSING)
+ {
+ // Cumulative recombination fraction to nearest marker
+ theta = theta + thetas[left - 1] - theta * thetas[left - 1];
+ left--;
+ }
+
+ RetrieveMemoryBlock(left);
+ float * leftMatrix = leftMatrices[left];
+
+ if (left != i) // no left matrix was stored at i: derive one from the nearest stored marker
+ {
+ Transpose(leftMatrix, from, theta); // from is reused as scratch; its contents were already carried into to
+ leftMatrix = from;
+ }
+
+ ImputeGenotypes(leftMatrix, to, i);
+ ConditionOnData(to, i, genotypes[states / 2][i]); // fold marker i's data into the right matrix only after imputing it
+
+ temp = from; // swap the two right matrices for the next step
+ from = to;
+ to = temp;
+ }
+ }
+
+void Haplotyper::ImputeGenotypes(float * matrix, int marker) // Sums a triangular state-pair matrix into the three genotype classes of posterior[][marker].
+ {
+ posterior[0][marker] = posterior[1][marker] = posterior[2][marker] = 0.0;
+
+ for (int i = 0; i < states; i++)
+ if (haplotypes[i][marker]) // state i carries allele code 1, so the genotype class is 1 + allele of state j
+ for (int j = 0; j <= i; j++, matrix++)
+ posterior[haplotypes[j][marker] + 1][marker] += *matrix;
+ else // state i carries allele code 0, so the genotype class is the allele of state j
+ for (int j = 0; j <= i; j++, matrix++)
+ posterior[haplotypes[j][marker]][marker] += *matrix;
+
+ NormalizePosterior(marker);
+ }
+
+void Haplotyper::ImputeGenotypes(float * matrix1, float * matrix2, int marker) // Sums the cell-wise product of two triangular matrices into the three genotype classes of posterior[][marker].
+ {
+ posterior[0][marker] = posterior[1][marker] = posterior[2][marker] = 0.0;
+
+ if (weights == NULL)
+ for (int i = 0; i < states; i++, matrix1++, matrix2++) // the increment here steps past the diagonal cell handled at the end of the body
+ {
+ if (haplotypes[i][marker])
+ for (int j = 0; j < i; j++, matrix1++, matrix2++) // off-diagonal cells enter with a factor 0.5
+ posterior[haplotypes[j][marker] + 1][marker] += *matrix1 * *matrix2 * 0.5;
+ else
+ for (int j = 0; j < i; j++, matrix1++, matrix2++)
+ posterior[haplotypes[j][marker]][marker] += *matrix1 * *matrix2 * 0.5;
+
+ posterior[haplotypes[i][marker] * 2][marker] += *matrix1 * *matrix2; // diagonal cell: both states are i, so the class is 0 or 2
+ }
+ else // weighted case: each product is divided by the pair's weights; 1e-30 guards against division by zero
+ for (int i = 0; i < states; i++, matrix1++, matrix2++)
+ {
+ if (haplotypes[i][marker])
+ for (int j = 0; j < i; j++, matrix1++, matrix2++)
+ posterior[haplotypes[j][marker] + 1][marker] +=
+ *matrix1 * *matrix2 * 0.5 / (weights[j / 2] * weights[i / 2] + 1e-30);
+ else
+ for (int j = 0; j < i; j++, matrix1++, matrix2++)
+ posterior[haplotypes[j][marker]][marker] +=
+ *matrix1 * *matrix2 * 0.5 / (weights[j / 2] * weights[i / 2] + 1e-30);
+
+ posterior[haplotypes[i][marker] * 2][marker] +=
+ *matrix1 * *matrix2 / (weights[i / 2] * weights[i / 2] + 1e-30);
+ }
+
+
+ NormalizePosterior(marker);
+ }
+
+void Haplotyper::ScoreLeftConditionalForOrderedGenotypes() // Ordered-genotype counterpart of ScoreLeftConditional(), using the Ordered prior, transpose and conditioning routines.
+ {
+ ResetMemoryPool();
+ GetMemoryBlock(0);
+
+ SetupOrderedPrior(leftMatrices[0]);
+ ConditionOnOrderedData(leftMatrices[0], 0, genotypes[states / 2][0]);
+
+ double theta = 0.0; // recombination fraction accumulated since the last stored marker
+ float *from = leftMatrices[0]; // most recently stored matrix
+ for (int i = 1; i < markers; i++)
+ {
+ // Cumulative recombination fraction allows us to skip uninformative positions
+ theta = theta + thetas[i - 1] - theta * thetas[i - 1];
+
+ // Skip over uninformative positions to save time
+ if (genotypes[states / 2][i] != GENOTYPE_MISSING || i == markers - 1) // the last marker is always stored
+ {
+ GetMemoryBlock(i);
+
+ TransposeOrdered(from, leftMatrices[i], theta);
+ ConditionOnOrderedData(leftMatrices[i], i, genotypes[states / 2][i]);
+
+ theta = 0; // restart accumulation from this marker
+ from = leftMatrices[i];
+ }
+ }
+
+ MarkMemoryPool();
+ }
+
+// Right-to-left pass for ordered genotype data: combines the stored left
+// matrices with a running right matrix to fill posterior[] at every marker.
+// Ordered-data counterpart of ImputeGenotypes().
+void Haplotyper::ImputeGenotypesFromOrderedData()
+   {
+   RewindMemoryPool();
+
+   // Process the last position
+   RetrieveMemoryBlock(markers - 1);
+   ImputeGenotypesFromOrderedData(leftMatrices[markers - 1], markers - 1);
+
+   // The right matrix starts at the last marker, so it must be conditioned
+   // on the data observed there, as ImputeGenotypes() does (this call used
+   // to condition on marker 0 instead).
+   SetupOrderedPrior(rightMatrices[0]);
+   ConditionOnOrderedData(rightMatrices[0], markers - 1, genotypes[states / 2][markers - 1]);
+
+   float *temp;
+   float *from = rightMatrices[0];
+   float *to = rightMatrices[1];
+
+   for (int i = markers - 2; i >= 0; i--)
+      {
+      // Move things along
+      TransposeOrdered(from, to, thetas[i]);
+
+      // Find nearest informative marker
+      double theta = 0.0;
+      int left = i;
+
+      while (left > 0 && genotypes[states / 2][left] == GENOTYPE_MISSING)
+         {
+         // Cumulative recombination fraction to nearest marker
+         theta = theta + thetas[left - 1] - theta * thetas[left - 1];
+         left--;
+         }
+
+      RetrieveMemoryBlock(left);
+      float * leftMatrix = leftMatrices[left];
+
+      // No left matrix was stored at i: derive one from the nearest stored
+      // marker, reusing from as scratch space
+      if (left != i)
+         {
+         TransposeOrdered(leftMatrix, from, theta);
+         leftMatrix = from;
+         }
+
+      ImputeGenotypesFromOrderedData(leftMatrix, to, i);
+      ConditionOnOrderedData(to, i, genotypes[states / 2][i]);
+
+      // Swap the two right matrices for the next step
+      temp = from;
+      from = to;
+      to = temp;
+      }
+   }
+
+void Haplotyper::ImputeGenotypesFromOrderedData(float * matrix, int marker) // Sums a full states x states matrix into the three genotype classes of posterior[][marker].
+ {
+ posterior[0][marker] = posterior[1][marker] = posterior[2][marker] = 0.0;
+
+ for (int i = 0; i < states; i++)
+ if (haplotypes[i][marker]) // state i carries allele code 1, so the genotype class is 1 + allele of state j
+ for (int j = 0; j < states; j++, matrix++)
+ posterior[haplotypes[j][marker] + 1][marker] += *matrix;
+ else // state i carries allele code 0, so the genotype class is the allele of state j
+ for (int j = 0; j < states; j++, matrix++)
+ posterior[haplotypes[j][marker]][marker] += *matrix;
+
+ NormalizePosterior(marker);
+ }
+
+void Haplotyper::ImputeGenotypesFromOrderedData(float * matrix1, float * matrix2, int marker) // Sums the cell-wise product of two full states x states matrices into posterior[][marker].
+ {
+ posterior[0][marker] = posterior[1][marker] = posterior[2][marker] = 0.0;
+
+ if (weights == NULL)
+ for (int i = 0; i < states; i++)
+ if (haplotypes[i][marker]) // genotype class is the allele sum of states i and j
+ for (int j = 0; j < states; j++, matrix1++, matrix2++)
+ posterior[haplotypes[j][marker] + 1][marker] += *matrix1 * *matrix2;
+ else
+ for (int j = 0; j < states; j++, matrix1++, matrix2++)
+ posterior[haplotypes[j][marker]][marker] += *matrix1 * *matrix2;
+ else // weighted case: each product is divided by the pair's weights; 1e-30 guards against division by zero
+ for (int i = 0; i < states; i++)
+ if (haplotypes[i][marker])
+ for (int j = 0; j < states; j++, matrix1++, matrix2++)
+ posterior[haplotypes[j][marker] + 1][marker] +=
+ *matrix1 * *matrix2 / (weights[j / 2] * weights[i / 2] + 1e-30);
+ else
+ for (int j = 0; j < states; j++, matrix1++, matrix2++)
+ posterior[haplotypes[j][marker]][marker] +=
+ *matrix1 * *matrix2 / (weights[j / 2] * weights[i / 2] + 1e-30);
+
+ NormalizePosterior(marker);
+ }
+
+void Haplotyper::ScoreLeftConditionalForHaplotype() // Left-to-right pass for the single haplotype stored at row states; leftMatrices[] hold one cell per state.
+ {
+ ResetMemoryPool();
+ GetSmallMemoryBlock(0);
+
+ float * matrix = leftMatrices[0];
+
+ float prior = 1.0 / states; // prior mass per state before weighting
+
+ if (weights == NULL)
+ for (int i = 0; i < states; i++)
+ matrix[i] = prior;
+ else // state i is scaled by the weight of individual i / 2
+ for (int i = 0; i < states; i++)
+ matrix[i] = prior * weights[i / 2];
+
+ ConditionHaplotypeOnData(leftMatrices[0], 0, haplotypes[states][0]);
+
+ for (int i = 1; i < markers; i++) // every marker gets a block here; none are skipped
+ {
+ GetSmallMemoryBlock(i);
+
+ TransposeHaplotype(leftMatrices[i - 1], leftMatrices[i], thetas[i - 1]);
+ ConditionHaplotypeOnData(leftMatrices[i], i, haplotypes[states][i]);
+ }
+
+ MarkMemoryPool();
+ }
+
+void Haplotyper::SampleChromosomes(Random * rand) // Samples a pair of state paths right to left from the stored left matrices and imputes alleles along them.
+ {
+ // Print(markers - 1);
+ RewindMemoryPool();
+ RetrieveMemoryBlock(markers - 1);
+
+ float * probability = leftMatrices[markers - 1];
+ float sum = 0.0;
+
+ // Calculate sum over all states
+ for (int i = 0; i < states; i++)
+ for (int j = 0; j <= i; j++)
+ {
+ sum += *probability;
+ probability++;
+ }
+
+ // Sample number and select state
+ float choice = rand->Uniform(0, sum);
+
+ sum = 0.0;
+
+ int first = 0, second = 0; // sampled states for the two haplotypes
+ for (probability = leftMatrices[markers - 1]; first < states; first++) // walk the triangle until the running total reaches choice
+ {
+ for (second = 0; second <= first; second++)
+ {
+ sum += *probability;
+ probability++;
+
+ if (sum >= choice) break;
+ }
+
+ if (second <= first) break; // the inner loop ended through break, so a cell was selected
+ }
+
+ // printf("Cumulative probability: %g\n", sum);
+ // printf(" Random draw: %g\n", choice);
+ // printf(" Selected state: %g\n", *(probability - 1));
+
+ for (int j = markers - 2; j >= 0; j--)
+ {
+ // printf("Sum: %f, Chose (%d,%d)\n", sum, first, second);
+
+ ImputeAlleles(j + 1, first, second, rand);
+
+ // Starting marker for this iteration
+ int j0 = j;
+
+ // Cumulative recombination fraction, skipping over uninformative
+ // positions
+ float theta = thetas[j];
+ while (genotypes[states / 2][j] == GENOTYPE_MISSING && j > 0) // j itself moves left, so the outer loop resumes from the marker reached here
+ {
+ --j;
+ theta = theta + thetas[j] - theta * thetas[j];
+ }
+
+ // When examining the previous location we consider three alternatives:
+ // states that could be reached when both haplotypes recombine (11),
+ // states that can be reached when the first (10) or second (01) haplotype recombines,
+ // and the states that can be reached without recombination.
+
+ float sum00 = 0.0, sum01 = 0.0, sum10 = 0.0, sum11 = 0.0;
+
+ RetrieveMemoryBlock(j);
+ probability = leftMatrices[j];
+
+ for (int k = 0; k < states; k++)
+ for (int l = 0; l <= k; l++, probability++)
+ {
+ sum11 += *probability; // every cell
+ if (first == k || first == l) sum01 += *probability; // cells that contain first
+ if (second == k || second == l) sum10 += *probability; // cells that contain second
+ if (first == k && second == l || first == l && second == k) sum00 += *probability; // the single cell for the pair (first, second)
+ }
+
+ if (weights != NULL) // scale each sum by the weight of the state(s) switched into
+ {
+ sum01 *= weights[second / 2];
+ sum10 *= weights[first / 2];
+ sum11 *= weights[second / 2] * weights[first / 2];
+ }
+
+ sum = sum11 * theta * theta / (states * states) +
+ (sum10 + sum01) * theta * (1.0 - theta) / states +
+ sum00 * (1.0 - theta) * (1.0 - theta);
+
+ // Sample number and decide how many state changes occurred between the
+ // two positions
+ choice = rand->Uniform(0, sum);
+
+ // The most likely outcome is that no changes occur ...
+ choice -= sum00 * (1.0 - theta) * (1.0 - theta);
+ if (choice <= 0.0)
+ {
+ // Record outcomes for intermediate, uninformative, positions
+ FillPath(states, j, j0 + 1, first);
+ FillPath(states + 1, j, j0 + 1, second);
+
+ continue;
+ }
+
+ // But perhaps the first or second haplotype recombined
+ probability = leftMatrices[j]; // rewind to the start of the matrix for the indexed lookups below
+
+ choice -= sum10 * theta * (1.0 - theta) / states;
+ if (choice <= 0.0)
+ {
+ // The first haplotype changed ...
+ choice = choice * states / (theta * (1.0 - theta)); // rescale the (negative) remainder to the scale of the matrix cells
+
+ // Record the original state
+ int first0 = first;
+
+ if (weights != NULL) choice /= weights[first / 2];
+
+ for (first = 0; first < states; first++) // add cells (first, second) until the remainder is used up
+ {
+ if (first >= second) // triangular storage: the larger index selects the row
+ choice += probability[first * (first + 1) / 2 + second];
+ else
+ choice += probability[second * (second + 1) / 2 + first];
+
+ if (choice >= 0.0) break;
+ }
+
+ // Record outcomes for intermediate, uninformative, positions
+ SamplePath(states, j, j0 + 1, first, first0, rand);
+ FillPath(states + 1, j, j0 + 1, second);
+
+ continue;
+ }
+
+ choice -= sum01 * theta * (1.0 - theta) / states;
+ if (choice <= 0.0)
+ {
+ // The second haplotype changed ...
+ choice = choice * states / (theta * (1.0 - theta)); // rescale the (negative) remainder to the scale of the matrix cells
+
+ // Save the original state
+ int second0 = second;
+
+ if (weights != NULL) choice /= weights[second / 2];
+
+ for (second = 0; second < states; second++) // add cells (first, second) until the remainder is used up
+ {
+ if (first >= second) // triangular storage: the larger index selects the row
+ choice += probability[first * (first + 1) / 2 + second];
+ else
+ choice += probability[second * (second + 1) / 2 + first];
+
+ if (choice >= 0.0) break;
+ }
+
+ // Record outcomes for intermediate, uninformative, positions
+ FillPath(states, j, j0 + 1, first);
+ SamplePath(states + 1, j, j0 + 1, second, second0, rand);
+
+ continue;
+ }
+
+ // Try to select any other state
+ choice *= states * states / (theta * theta); // rescale what is left of the draw to the scale of the matrix cells
+ sum = 0.0;
+
+ // Save the original states
+ int first0 = first;
+ int second0 = second;
+
+ if (weights != NULL) choice /= weights[first / 2] * weights[second / 2];
+
+ for (first = 0; first < states; first++) // walk the whole triangle until the running total exceeds choice
+ {
+ for (second = 0; second <= first; second++, probability++)
+ {
+ sum += *probability;
+
+ if (sum > choice) break;
+ }
+
+ if (second <= first) break; // the inner loop ended through break, so a cell was selected
+ }
+
+ if (rand->Binary()) // a triangular cell does not say which haplotype is which, so the assignment is randomised
+ {
+ int temp = first;
+ first = second;
+ second = temp;
+ }
+
+ // Record outcomes for intermediate, uninformative, positions
+ SamplePath(states, j, j0 + 1, first, first0, rand);
+ SamplePath(states + 1, j, j0 + 1, second, second0, rand);
+ }
+
+ ImputeAlleles(0, first, second, rand); // the loop imputes marker j + 1, so marker 0 is handled here
+ }
+
+void Haplotyper::SampleChromosomesFromOrderedData(Random * rand)
+ { /* Draws a pair of reference states (first, second) at the last marker from
+      leftMatrices[markers - 1], then walks back towards marker 0. For every
+      informative interval it chooses between four outcomes -- no state change,
+      a change of the first state, a change of the second state, or a change of
+      both -- and records alleles through ImputeAlleles, FillPath and SamplePath.
+      NOTE(review): the initial draw scans a full states x states layout, while
+      the backward pass reads leftMatrices[j] as a packed lower triangle
+      (index k * (k + 1) / 2 + l). Together with the TODO and the
+      "CURRENT EDITING SPOT" marker below this suggests the routine is
+      unfinished -- confirm before relying on it. */
+ // Print(markers - 1);
+ RewindMemoryPool();
+ RetrieveMemoryBlock(markers - 1);
+
+ float * probability = leftMatrices[markers - 1];
+ float sum = 0.0;
+
+ // Calculate sum over all states
+ for (int i = 0; i < states; i++)
+ for (int j = 0; j < states; j++, probability++)
+ sum += *probability;
+
+ // Sample number and select state
+ float choice = rand->Uniform(0, sum);
+
+ sum = 0.0;
+
+ int first = 0, second = 0;
+ for (probability = leftMatrices[markers - 1]; first < states; first++)
+ {
+ for (second = 0; second < states; second++, probability++)
+ {
+ sum += *probability;
+
+ if (sum >= choice) break;
+ }
+
+ if (second < states) break; // inner loop stopped early, so a state pair was selected
+ }
+
+ for (int j = markers - 2; j >= 0; j--)
+ {
+ // TODO -- Impute alleles should take ordered data into account!
+ ImputeAlleles(j + 1, first, second, rand);
+
+ // Starting marker for this iteration
+ int j0 = j;
+
+ // Cumulative recombination fraction, skipping over uninformative
+ // positions
+ float theta = thetas[j];
+ while (genotypes[states / 2][j] == GENOTYPE_MISSING && j > 0)
+ {
+ --j;
+ theta = theta + thetas[j] - theta * thetas[j];
+ }
+
+ // When examining the previous location we consider three alternatives:
+ // states that could be reached when both haplotypes recombine (11),
+ // states that can be reached when the first (10) or second (01) haplotype recombines,
+ // and the states that can be reached without recombination.
+
+ float sum00 = 0.0, sum01 = 0.0, sum10 = 0.0, sum11 = 0.0;
+
+ RetrieveMemoryBlock(j);
+ probability = leftMatrices[j];
+
+ // CURRENT EDITING SPOT !! (upstream marker; the loop below walks pairs with l <= k only)
+ for (int k = 0; k < states; k++)
+ for (int l = 0; l <= k; l++, probability++)
+ {
+ sum11 += *probability;
+ if (first == k || first == l) sum01 += *probability;
+ if (second == k || second == l) sum10 += *probability;
+ if (first == k && second == l || first == l && second == k) sum00 += *probability;
+ }
+
+ if (weights != NULL)
+ {
+ sum01 *= weights[second / 2];
+ sum10 *= weights[first / 2];
+ sum11 *= weights[second / 2] * weights[first / 2];
+ }
+
+ sum = sum11 * theta * theta / (states * states) +
+ (sum10 + sum01) * theta * (1.0 - theta) / states +
+ sum00 * (1.0 - theta) * (1.0 - theta);
+
+ // Sample number and decide how many state changes occurred between the
+ // two positions
+ choice = rand->Uniform(0, sum);
+
+ // The most likely outcome is that no changes occur ...
+ choice -= sum00 * (1.0 - theta) * (1.0 - theta);
+ if (choice <= 0.0)
+ {
+ // Record outcomes for intermediate, uninformative, positions
+ FillPath(states, j, j0 + 1, first);
+ FillPath(states + 1, j, j0 + 1, second);
+
+ continue;
+ }
+
+ // But perhaps the first or second haplotype recombined
+ probability = leftMatrices[j];
+
+ choice -= sum10 * theta * (1.0 - theta) / states;
+ if (choice <= 0.0)
+ {
+ // The first haplotype changed ...
+ choice = choice * states / (theta * (1.0 - theta)); // undo the scaling applied to sum10; choice is <= 0 here
+
+ // Record the original state
+ int first0 = first;
+
+ if (weights != NULL) choice /= weights[first / 2];
+
+ for (first = 0; first < states; first++)
+ {
+ if (first >= second)
+ choice += probability[first * (first + 1) / 2 + second];
+ else
+ choice += probability[second * (second + 1) / 2 + first];
+
+ if (choice >= 0.0) break; // accumulated mass has used up the deficit
+ }
+
+ // Record outcomes for intermediate, uninformative, positions
+ SamplePath(states, j, j0 + 1, first, first0, rand);
+ FillPath(states + 1, j, j0 + 1, second);
+
+ continue;
+ }
+
+ choice -= sum01 * theta * (1.0 - theta) / states;
+ if (choice <= 0.0)
+ {
+ // The second haplotype changed ...
+ choice = choice * states / (theta * (1.0 - theta));
+
+ // Save the original state
+ int second0 = second;
+
+ if (weights != NULL) choice /= weights[second / 2];
+
+ for (second = 0; second < states; second++)
+ {
+ if (first >= second)
+ choice += probability[first * (first + 1) / 2 + second];
+ else
+ choice += probability[second * (second + 1) / 2 + first];
+
+ if (choice >= 0.0) break;
+ }
+
+ // Record outcomes for intermediate, uninformative, positions
+ FillPath(states, j, j0 + 1, first);
+ SamplePath(states + 1, j, j0 + 1, second, second0, rand);
+
+ continue;
+ }
+
+ // Try to select any other state
+ choice *= states * states / (theta * theta);
+ sum = 0.0;
+
+ // Save the original states
+ int first0 = first;
+ int second0 = second;
+
+ if (weights != NULL) choice /= weights[first / 2] * weights[second / 2];
+
+ for (first = 0; first < states; first++)
+ {
+ for (second = 0; second <= first; second++, probability++)
+ {
+ sum += *probability;
+
+ if (sum > choice) break;
+ }
+
+ if (second <= first) break; // inner loop stopped early, so a pair was selected
+ }
+
+ if (rand->Binary()) // the pair was drawn with second <= first; swap the two roles on a coin flip
+ {
+ int temp = first;
+ first = second;
+ second = temp;
+ }
+
+ // Record outcomes for intermediate, uninformative, positions
+ SamplePath(states, j, j0 + 1, first, first0, rand);
+ SamplePath(states + 1, j, j0 + 1, second, second0, rand);
+ }
+
+ ImputeAlleles(0, first, second, rand);
+ }
+
+// Samples, from the last marker back to the first, which reference haplotype
+// the haplotype stored in row `states` copies from. Along the way it tallies
+// per-marker allele matches / mismatches in error_models[] and per-interval
+// state switches in crossovers[].
+void Haplotyper::SampleHaplotypeSource(Random * rand)
+   {
+   RewindMemoryPool();
+
+   float * probability = leftMatrices[markers - 1];
+
+   // Total mass over all states at the last marker
+   float sum = 0.0;
+   for (int i = 0; i < states; i++)
+      sum += probability[i];
+
+   float choice = rand->Uniform(0, sum);
+
+   // Walk the cumulative distribution until it reaches the sampled value
+   int haplotype = 0;
+   sum = 0.0;
+   while (haplotype < states)
+      {
+      sum += probability[haplotype];
+
+      if (sum >= choice) break;
+
+      haplotype++;
+      }
+
+   for (int j = markers - 2; j >= 0; j--)
+      {
+      // Score the state chosen for marker j + 1 against the observed allele
+      if (haplotypes[haplotype][j + 1] != haplotypes[states][j + 1])
+         error_models[j + 1].mismatches++;
+      else
+         error_models[j + 1].matches++;
+
+      float theta = thetas[j];
+
+      probability = leftMatrices[j];
+
+      // Mass for staying in the current state across the interval
+      float nocross = probability[haplotype] * (1.0 - theta);
+
+      // Mass for switching, spread evenly over all states
+      float total = 0.0;
+      for (int k = 0; k < states; k++)
+         total += probability[k];
+
+      float cross = total * theta / states;
+
+      if (weights != NULL)
+         cross *= weights[haplotype / 2];
+
+      choice = rand->Uniform(0, nocross + cross);
+
+      // Staying put is the common case
+      if (choice <= nocross)
+         continue;
+
+      crossovers[j]++;
+
+      // A switch happened, so draw the new state in proportion to probability[]
+      choice = rand->Uniform(0, total);
+
+      total = 0.0;
+      haplotype = 0;
+      while (haplotype < states)
+         {
+         total += probability[haplotype];
+
+         if (total >= choice) break;
+
+         haplotype++;
+         }
+      }
+
+   // Score the state chosen for the first marker
+   if (haplotypes[haplotype][0] != haplotypes[states][0])
+      error_models[0].mismatches++;
+   else
+      error_models[0].matches++;
+   }
+
+// Re-estimates thetas[] from crossovers[] when inter-marker distances are
+// available: intervals with more than one observed crossover get their own
+// rate, all others share a pooled per-unit-distance rate.
+void Haplotyper::UpdateThetasWithDistances()
+   {
+   const int intervals = markers - 1;
+   double scale = 1.0 / (individuals * 2);
+
+   // Pool every interval with 0 or 1 observed "crossovers"; the crossover
+   // tally starts at 1
+   double pooledCrossovers = 1;
+   double pooledLength = 0;
+
+   for (int k = 0; k < intervals; k++)
+      {
+      if (crossovers[k] > 1) continue;
+
+      pooledCrossovers += crossovers[k];
+      pooledLength += distances[k];
+      }
+
+   double denominator = pooledLength ? pooledLength : 1;
+   double perUnitRate = pooledCrossovers * scale / denominator;
+
+   // Assign each interval its own rate or the distance-scaled pooled rate
+   for (int k = 0; k < intervals; k++)
+      thetas[k] = (crossovers[k] > 1) ? crossovers[k] * scale
+                                      : perUnitRate * distances[k];
+   }
+
+// Re-estimates thetas[] from crossovers[]. Delegates to
+// UpdateThetasWithDistances() when distances are set; otherwise intervals with
+// more than one observed crossover get their own rate and the rest share one
+// pooled rate.
+void Haplotyper::UpdateThetas()
+   {
+   if (distances != NULL)
+      {
+      UpdateThetasWithDistances();
+      return;
+      }
+
+   const int intervals = markers - 1;
+   double scale = 1.0 / (individuals * 2);
+
+   // Pool every interval with 0 or 1 observed "crossovers"; the crossover
+   // tally starts at 1
+   int pooledCount = 1;
+   int pooledIntervals = 0;
+
+   for (int k = 0; k < intervals; k++)
+      {
+      if (crossovers[k] > 1) continue;
+
+      pooledCount += crossovers[k];
+      pooledIntervals++;
+      }
+
+   int divisor = pooledIntervals ? pooledIntervals : 1;
+   double pooledRate = pooledCount * scale / divisor;
+
+   // Assign each interval its own rate or the pooled rate
+   for (int k = 0; k < intervals; k++)
+      thetas[k] = (crossovers[k] > 1) ? crossovers[k] * scale : pooledRate;
+   }
+
+double Haplotyper::UpdateErrorRate()
+ {
+ // Group markers into those with low error rates, which are estimated as a
+ // group, and those with high error rates, which are estimated individually
+ Errors baseModel;
+
+ for (int i = 0; i < markers; i++)
+ if (error_models[i].mismatches <= 2)
+ baseModel += error_models[i];
+ else
+ SetErrorRate(i, error_models[i].Update());
+
+ baseModel.Update();
+
+ for (int i = 0; i < markers; i++)
+ if (error_models[i].mismatches <= 2)
+ SetErrorRate(i, baseModel.rate);
+
+ return baseModel.rate;
+ }
+
+double Haplotyper::GetErrorRate()
+ {
+ double average = 0.0;
+
+ for (int i = 0; i < markers; i++)
+ average += GetErrorRate(i);
+
+ return average / (markers + 1e-30);
+ }
+
+// Returns the sum of the crossover counts over all markers - 1 intervals.
+int Haplotyper::TotalCrossovers()
+   {
+   const int intervals = markers - 1;
+   int count = 0;
+   int k = 0;
+
+   while (k < intervals)
+      count += crossovers[k++];
+
+   return count;
+   }
+
+void Haplotyper::ResetCrossovers()
+ {
+ for (int i = 0; i < markers - 1; i++)
+ crossovers[i] = 0;
+
+ for (int i = 0; i < markers; i++)
+ error_models[i].Reset();
+ }
+
+// Debugging aid: dumps to stdout the reference haplotypes (rows 0 .. states - 1),
+// the genotypes being phased (row states / 2 of genotypes, the same row that
+// ImputeAlleles reads) and the two haplotypes selected for them (rows states
+// and states + 1). For small data sets (fewer than 10 individuals) the
+// per-marker state matrices are printed as well.
+void Haplotyper::Print()
+   {
+   printf("Reference Haplotypes\n");
+
+   for (int i = 0; i < states; i++)
+      {
+      printf("%3d ", i);
+
+      for (int j = 0; j < markers; j++)
+         printf("%d", haplotypes[i][j]);
+      printf("\n");
+      }
+
+   printf("\nGenotypes to be phased\n%3s ", "");
+
+   // Test and print the SAME row. The original tested row states / 2 for
+   // missingness but printed row individuals - 1, which is a different
+   // individual whenever states != 2 * individuals - 2. Literal format strings
+   // also avoid passing a surplus argument to the "." format.
+   for (int j = 0; j < markers; j++)
+      if (genotypes[states / 2][j] == GENOTYPE_MISSING)
+         printf(".");
+      else
+         printf("%d", genotypes[states / 2][j]);
+
+   printf("\nSelected Haplotypes\n");
+
+   for (int i = states; i < states + 2; i++)
+      {
+      printf("%3d ", i);
+
+      for (int j = 0; j < markers; j++)
+         printf("%d", haplotypes[i][j]);
+      printf("\n");
+      }
+
+   printf("\n");
+
+   if (individuals < 10)
+      for (int i = 0; i < markers; i++)
+         Print(i);
+   }
+
+// Debugging aid: prints every entry of the packed lower-triangular state
+// matrix stored for `marker`, together with the alleles carried by the two
+// reference haplotypes of each state pair.
+void Haplotyper::Print(int marker)
+   {
+   float * pointer = leftMatrices[marker];
+
+   // The matrix has one row per reference state. The original bound,
+   // individuals * 2 - 2, equals states only in the exact mode and would read
+   // past the states * (states + 1) / 2 entries of the block otherwise.
+   for (int i = 0; i < states; i++)
+      for (int j = 0; j <= i; j++, pointer++)
+         printf("state (%d,%d) = %g [geno = %d/%d] \n",
+                i, j, *pointer, haplotypes[i][marker], haplotypes[j][marker]);
+   }
+
+void Haplotyper::SetErrorRate(int marker, float rate)
+ {
+ // These are the penetrances for underlying homozygous genotypes
+ Penetrance(marker, 0, 0) = Penetrance(marker, 2, 2) = square(1.0 - rate);
+ Penetrance(marker, 0, 1) = Penetrance(marker, 2, 1) = 2 * (1.0 - rate) * rate;
+ Penetrance(marker, 0, 2) = Penetrance(marker, 2, 0) = square(rate);
+
+ // These are the penetrances for underlying heterozygous genotypes
+ Penetrance(marker, 1, 0) = Penetrance(marker, 1, 2) = (1.0 - rate) * rate;
+ Penetrance(marker, 1, 1) = square(1.0 - rate) + square(rate);
+
+ // Save estimated error rate
+ error_models[marker].rate = rate;
+ }
+
+void Haplotyper::SetErrorRate(float rate)
+ {
+ for (int i = 0; i < markers; i++)
+ SetErrorRate(i, rate);
+ }
+
+// Adds, for every disease, the NPL score selected by the disease status of the
+// individual owning `state` to the score row states / 2 at `marker`.
+void Haplotyper::UpdateDiseaseScores(int marker, int state)
+   {
+   // Scores are laid out marker-major, diseaseCount entries per marker
+   const int offset = marker * diseaseCount;
+   const int source = state / 2;
+
+   for (int d = 0; d < diseaseCount; d++)
+      {
+      int status = diseaseStatus[source][d];
+
+      diseaseScores[states / 2][offset + d] += nplScores[status][d];
+      }
+   }
+
+// Copies the alleles carried by reference states state1 / state2 at `marker`
+// into the two haplotypes being built (rows states and states + 1), updates
+// the marker's error model against the observed genotype and, for an observed
+// heterozygote with two identical imputed alleles, flips one of the two at
+// random.
+void Haplotyper::ImputeAlleles(int marker, int state1, int state2, Random * rand)
+   {
+   int genotype = genotypes[states / 2][marker];
+
+   int imputed1 = haplotypes[state1][marker];
+   int imputed2 = haplotypes[state2][marker];
+
+   // Observed homozygotes keep the alleles already stored for them
+   bool homozygous = genotype == GENOTYPE_HOMOZYGOUS_FOR_ONE ||
+                     genotype == GENOTYPE_HOMOZYGOUS_FOR_TWO;
+
+   if (!homozygous)
+      {
+      haplotypes[states][marker] = imputed1;
+      haplotypes[states + 1][marker] = imputed2;
+      }
+
+   // Nothing to compare against when the genotype is unobserved
+   if (genotype == GENOTYPE_MISSING) return;
+
+   int differences = abs(genotype - imputed1 - imputed2 - 1);
+   bool heterozygous = genotype == GENOTYPE_HETEROZYGOUS;
+
+   if (heterozygous && differences == 0)
+      error_models[marker].uncertain_pairs++;
+   else
+      {
+      error_models[marker].matches += 2 - differences;
+      error_models[marker].mismatches += differences;
+      }
+
+   // Only a heterozygote imputed as two identical alleles needs repair
+   if (!heterozygous || imputed1 != imputed2) return;
+
+   if (rand->Binary())
+      {
+      haplotypes[states][marker] = !imputed2;
+      }
+   else
+      {
+      haplotypes[states + 1][marker] = !imputed1;
+      }
+   }
+
+// Copies the allele that reference row `state` carries at `marker` into row
+// `haplotype`.
+void Haplotyper::ImputeAllele(int haplotype, int marker, int state)
+   {
+   haplotypes[haplotype][marker] =
+      haplotypes[state][marker];
+   }
+
+void Haplotyper::BuildConsensus(int samples)
+ { /* Replaces the haplotype matrix by a consensus: for each of the first
+      individuals - phased individuals it draws `samples` haplotype pairs with
+      SampleChromosomes and merges them into rows i * 2 and i * 2 + 1 of a
+      freshly allocated matrix, which then becomes `haplotypes`.
+      NOTE(review): rows of `consensus` belonging to the last `phased`
+      individuals are never written in this function -- confirm whether
+      AllocateCharMatrix initializes them or whether callers never read them. */
+ char ** sample = AllocateCharMatrix(samples * 2, markers);
+ char ** consensus = AllocateCharMatrix(individuals * 2, markers);
+
+ ResetCrossovers();
+
+ for (int i = 0, slot = individuals - 1; i < individuals - phased; i++)
+ {
+ SwapIndividuals(i, slot); // work on individual i from the last slot; undone at the end of the iteration
+
+ // Initialize sampled haplotypes
+ for (int j = 0; j < markers; j++)
+ if (genotypes[slot][j] == GENOTYPE_HOMOZYGOUS_FOR_ONE ||
+ genotypes[slot][j] == GENOTYPE_HOMOZYGOUS_FOR_TWO)
+ for (int k = 0; k < samples; k++)
+ sample[k * 2][j] = sample[k * 2 + 1][j] = genotypes[slot][j] / 2;
+
+ ScoreLeftConditional();
+
+ for (int j = 0; j < samples; j++)
+ {
+ Swap(haplotypes[slot * 2], sample[j * 2]); // sample rows stand in as the haplotype rows being filled
+ Swap(haplotypes[slot * 2 + 1], sample[j * 2 + 1]);
+
+ SampleChromosomes(&globalRandom);
+
+ Swap(haplotypes[slot * 2], sample[j * 2]); // swap back: the draw stays in sample[], the original rows return
+ Swap(haplotypes[slot * 2 + 1], sample[j * 2 + 1]);
+ }
+
+ BuildConsensus(consensus + i * 2, sample, samples);
+
+ SwapIndividuals(i, slot);
+ }
+
+ FreeCharMatrix(sample, samples * 2);
+ FreeCharMatrix(haplotypes, individuals * 2); // the old matrix is released ...
+
+ haplotypes = consensus; // ... and the consensus matrix takes its place
+ }
+
+// Merges `count` sampled haplotype pairs (rows 2 * s and 2 * s + 1 of
+// `haplotypes`) into one consensus pair written to consensus[0] and
+// consensus[1]. A per-sample phase bit records which row of the pair is
+// currently aligned with consensus[0].
+void Haplotyper::BuildConsensus(char ** consensus, char ** haplotypes, int count)
+   {
+   char * phase = new char [count];
+
+   // Trap out of memory conditions
+   if (phase == NULL)
+      error("Out of memory allocating phase bit-array\n");
+
+   // Initial orientation: taken from the first position where the two rows of
+   // a sample differ (0 when they never differ)
+   for (int s = 0; s < count; s++)
+      {
+      char * top = haplotypes[s * 2];
+      char * bottom = haplotypes[s * 2 + 1];
+
+      phase[s] = 0;
+
+      int m = 0;
+      while (m < markers && top[m] == bottom[m])
+         m++;
+
+      if (m < markers)
+         phase[s] = top[m] > bottom[m];
+      }
+
+   // Vote position by position
+   for (int m = 0; m < markers; m++)
+      {
+      int counts[4] = {0, 0, 0, 0};
+
+      // Tally the ordered allele pair of every sample in its current phase
+      for (int s = 0; s < count; s++)
+         {
+         int lead = s * 2 + phase[s];
+         int trail = s * 2 + (phase[s] ^ 1);
+
+         counts[haplotypes[lead][m] * 2 + haplotypes[trail][m]]++;
+         }
+
+      // The earliest pair with the highest count wins
+      int best = 0;
+
+      for (int code = 1; code < 4; code++)
+         if (counts[code] > counts[best])
+            best = code;
+
+      consensus[0][m] = best / 2;
+      consensus[1][m] = best % 2;
+
+      // For a heterozygous winner (codes 1 and 2), flip every sample that
+      // shows the mirrored pair at this position
+      if (best == 1 || best == 2)
+         {
+         int complement = (best ^ 3);
+
+         for (int s = 0; s < count; s++)
+            {
+            int lead = s * 2 + phase[s];
+            int trail = s * 2 + (phase[s] ^ 1);
+
+            if ((haplotypes[lead][m] * 2 + haplotypes[trail][m]) == complement)
+               phase[s] = phase[s] ^ 1;
+            }
+         }
+      }
+
+   delete [] phase;
+   }
+
+// Runs `rounds` update rounds on the first `seeds` individuals only, then
+// samples haplotypes once for each remaining individual against that seed
+// set. Does nothing when seeds is not in 1 .. individuals or rounds is not
+// positive.
+void Haplotyper::WarmUp(int seeds, int rounds)
+   {
+   // seeds == 0 used to pass this guard; the rounds below would then run with
+   // individuals == 0 and index slot individuals - 1 == -1.
+   if (seeds <= 0 || seeds > individuals || rounds <= 0)
+      return;
+
+   int saved_individuals = individuals;
+
+   // Temporarily shrink the data set to the seed individuals
+   individuals = seeds;
+
+   for (int i = 0; i < rounds; i++)
+      {
+      LoopThroughChromosomes();
+      UpdateThetas();
+      UpdateErrorRate();
+      }
+
+   // Each remaining individual takes the last seed slot in turn, is sampled
+   // there, and is then swapped back
+   for (int i = seeds; i < saved_individuals; i++)
+      {
+      SwapIndividuals(i, individuals - 1);
+
+      ScoreLeftConditional();
+      SampleChromosomes(&globalRandom);
+
+      SwapIndividuals(i, individuals - 1);
+      }
+
+   individuals = saved_individuals;
+   }
+
+// Exchanges everything stored per individual for individuals a and b: the
+// genotype row, both haplotype rows, the disease rows (when diseases are
+// defined) and the weight (when weights are in use).
+void Haplotyper::SwapIndividuals(int a, int b)
+   {
+   Swap(genotypes[a], genotypes[b]);
+
+   // Each individual owns haplotype rows 2 * index and 2 * index + 1
+   for (int strand = 0; strand < 2; strand++)
+      Swap(haplotypes[a * 2 + strand], haplotypes[b * 2 + strand]);
+
+   if (diseaseCount)
+      {
+      Swap(diseaseStatus[a], diseaseStatus[b]);
+      Swap(diseaseScores[a], diseaseScores[b]);
+      }
+
+   if (weights == NULL)
+      return;
+
+   float weightOfB = weights[b];
+   weights[b] = weights[a];
+   weights[a] = weightOfB;
+   }
+
+// Exchanges haplotype rows a and b.
+void Haplotyper::SwapHaplotypes(int a, int b)
+   {
+   Swap(haplotypes[a],
+        haplotypes[b]);
+   }
+
+// Rescales all weights by one common factor, chosen so that the weights of
+// the first states / 2 individuals add up to states / 2.
+void Haplotyper::ScaleWeights()
+   {
+   const int referenceCount = states / 2;
+
+   float sum = 0.0;
+   int i = 0;
+
+   while (i < referenceCount)
+      sum += weights[i++];
+
+   float scale = states / sum * 0.5;
+
+   // The factor is applied to every individual, not just the first states / 2
+   for (i = 0; i < individuals; i++)
+      weights[i] *= scale;
+   }
+
+void Haplotyper::SelectReferenceSet(int * array, int forWhom)
+ { /* Flags in array[] (one entry per individual, non-zero = selected) the
+      individuals to use as reference set, then swaps the flagged individuals
+      into the leading slots in ascending order. The callers visible in this
+      file set array[individuals - 1] = 1 before calling.
+      NOTE(review): the exact contract of globalRandom.Choose (which entries it
+      writes and how many it sets) is not visible here -- the comments below
+      describe only what this function itself does. */
+ if (greedy)
+ {
+ // Sanity check
+ // assert(states == phased * 2);
+
+ if (states == phased * 2)
+ // default greedy
+ {
+ // We exclude inferred haplotypes from the reference set
+ for (int i = 0; i < individuals - phased; i++)
+ array[i] = 0;
+
+ // We include phased haplotypes as our reference set
+ for (int i = individuals - phased; i < individuals - 1; i++)
+ array[i] = 1;
+
+ // For the last entry in the reference set, we may need to pick
+ // a pair of inferred haplotypes
+ if (forWhom < individuals - phased)
+ array[forWhom] = 1;
+ else
+ array[globalRandom.NextInt() % (individuals - phased)] = 1;
+ } // end of default greedy
+ else
+ // approximate greedy
+ {
+ for (int i = 0; i < individuals - phased; i++)
+ array[i] = 0;
+
+ if (forWhom >= individuals-phased)
+ globalRandom.Choose( & array[individuals - phased], phased-1, states / 2);
+ else
+ {
+ // total phased trials, states/2 successes, so could overwrite the original array[individuals - 1]
+ globalRandom.Choose( & array[individuals - phased], phased, states / 2);
+ if (array[individuals - 1] == 1) array[forWhom] = 1; // keep the selection count by flagging forWhom instead
+ array[individuals - 1] = 1; // the last slot is always flagged on exit from this branch
+ }
+ } // end of approximate greedy
+ }
+
+ else if (weights != NULL)
+ globalRandom.Choose(array, weights, individuals - 1, states / 2);
+ else
+ globalRandom.Choose(array, individuals - 1, states / 2);
+
+ // Swap reference set into position
+ for (int j = 0, out = 0; j < individuals; j++)
+ if (array[j])
+ SwapIndividuals(j, out++);
+ }
+
+void Haplotyper::LoopThroughChromosomes()
+ { /* One update pass over all individuals, from the last to the first. Each
+      individual is swapped into the last slot, a reference set is selected
+      when states differs from individuals * 2 - 2, and then either
+      SampleChromosomes (for i < individuals - phased) or two rounds of
+      SampleHaplotypeSource (otherwise) are run. All swaps are undone before
+      the next individual is processed. Crossover and error counters are reset
+      at the start. */
+ bool approximate = (states == individuals * 2 - 2) ? false : true;
+
+ ResetCrossovers();
+
+ int * array = NULL;
+
+ if (approximate)
+ {
+ array = new int [individuals];
+
+ if (array == NULL)
+ error("Out of memory allocating array for sampling individuals\n");
+
+ array[individuals - 1] = 1; // the individual in the last slot is always flagged
+ }
+
+ for (int i = individuals - 1; i >= 0; i--)
+ {
+ SwapIndividuals(i, individuals - 1);
+
+ if (approximate)
+ SelectReferenceSet(array, i);
+
+ if (weights != NULL)
+ ScaleWeights();
+
+ if (updateDiseaseScores)
+ ScoreNPL();
+
+ if (i < individuals - phased)
+ {
+ ScoreLeftConditional();
+ SampleChromosomes(&globalRandom);
+
+ if (updateDiseaseScores && diseaseCount)
+ IntegrateNPL();
+
+#ifdef _DEBUG
+ if (!SanityCheck())
+ {
+ printf("\nProblems above occurred haplotyping individual %d\n\n", i);
+ Print();
+ }
+#endif
+ }
+ else
+ {
+ ScoreLeftConditionalForHaplotype(); // first of the two haplotype rows (states, states + 1)
+ SampleHaplotypeSource(&globalRandom);
+ SwapHaplotypes(states, states + 1); // bring the other row into position `states`
+ ScoreLeftConditionalForHaplotype();
+ SampleHaplotypeSource(&globalRandom);
+ SwapHaplotypes(states, states + 1); // restore the original row order
+ }
+
+ if (approximate)
+ for (int j = individuals - 1, out = states / 2; j >= 0; j--) // reverse order of the swaps made by SelectReferenceSet
+ if (array[j])
+ SwapIndividuals(j, out--);
+
+ SwapIndividuals(i, individuals - 1);
+ }
+
+ if (approximate)
+ delete [] array;
+ }
+
+// Imputes genotypes for each of the first individuals - phased individuals
+// and writes the results to files named after `prefix`:
+//   prefix.mldose.gz  expected allele dosage per marker
+//   prefix.mlgeno.gz  most likely genotype per marker
+//   prefix.mlinfo     per-marker summary (written by OutputMarkerInfo)
+// and, when `mldetails` is set,
+//   prefix.mlqc.gz    posterior probability of the most likely genotype
+//   prefix.mlprob.gz  posterior probabilities of the first two genotypes
+// Where a genotype was masked (stored as 0) but is known in `ped`, the
+// estimate is compared with the known value and error rates are reported.
+// Calls error() when an output file cannot be opened.
+void Haplotyper::OutputMLEs(Pedigree & ped, const String & prefix, bool mldetails)
+   {
+   IFILE dose = ifopen(prefix + ".mldose.gz", "wt");
+   IFILE geno = ifopen(prefix + ".mlgeno.gz", "wt");
+   FILE * info = fopen(prefix + ".mlinfo", "wt");
+
+   // NOTE(review): qc and prob are only assigned when mldetails is set; the
+   // "if (qc)" / "if (prob)" tests below assume a default-constructed IFILE
+   // reads as "not open" -- confirm against InputFile.h.
+   IFILE qc;
+   IFILE prob;
+   if (mldetails)
+      {
+      qc = ifopen(prefix + ".mlqc.gz", "wt");
+      prob = ifopen(prefix + ".mlprob.gz", "wt");
+      }
+
+   printf("Estimating MLE for missing genotypes conditional on current state...\n");
+
+   if (dose == NULL || info == NULL || geno == NULL)
+      error("Failed to open output file for MLE estimates");
+
+   // Logical OR (the original used bitwise '|')
+   if (mldetails && (qc == NULL || prob == NULL))
+      error ("Failed to open output file for detailed MLE estimates");
+
+   ResetMarkerInfo();
+
+   bool approximate = (states == individuals * 2 - 2) ? false : true;
+
+   ResetCrossovers();
+
+   int * array = NULL;
+
+   if (approximate)
+      {
+      array = new int [individuals];
+
+      // Report through error(), as LoopThroughChromosomes does; the original
+      // only printed a message and then wrote through the NULL pointer
+      if (array == NULL)
+         error("Out of memory allocating array for sampling individuals\n");
+
+      array[individuals - 1] = 1;
+      }
+
+   int matches = 0, partialmatches = 0, mismatches = 0;
+   for (int i = 0; i < individuals - phased; i++)
+      {
+      SwapIndividuals(i, individuals - 1);
+
+      if (approximate)
+         SelectReferenceSet(array, i);
+
+      if (weights != NULL)
+         ScaleWeights();
+
+      ScoreLeftConditional();
+      ImputeGenotypes();
+
+      // Undo the reference-set swaps in reverse order, then the initial swap
+      if (approximate)
+         for (int j = individuals - 1, out = states / 2; j >= 0; j--)
+            if (array[j])
+               SwapIndividuals(j, out--);
+
+      SwapIndividuals(i, individuals - 1);
+
+      ifprintf(dose, "%s->%s ML_DOSE", (const char *) ped[i].famid, (const char *) ped[i].pid);
+      ifprintf(geno, "%s->%s ML_GENO", (const char *) ped[i].famid, (const char *) ped[i].pid);
+
+      if (qc)
+         ifprintf(qc, "%s->%s ML_QC", (const char *) ped[i].famid, (const char *) ped[i].pid);
+
+      if (prob)
+         ifprintf(prob, "%s->%s ML_PROB", (const char *) ped[i].famid, (const char *) ped[i].pid);
+
+      for (int marker = 0; marker < markers; marker++)
+         {
+         // Index of the genotype with the highest posterior
+         int best = 0;
+         if (posterior[1][marker] > posterior[best][marker]) best = 1;
+         if (posterior[2][marker] > posterior[best][marker]) best = 2;
+
+         // Named so that it no longer shadows the FILE * info above
+         MarkerInfo * markerInfo = ped.GetMarkerInfo(marker);
+
+         ifprintf(dose, " %.3f", posterior[0][marker] * 2.0 + posterior[1][marker]);
+         ifprintf(geno, " %s/%s", (const char *) markerInfo->GetAlleleLabel((best + 2) / 2),
+                                  (const char *) markerInfo->GetAlleleLabel((best + 3) / 2));
+
+         if (qc)
+            ifprintf(qc, " %.3f", posterior[best][marker]);
+
+         if (prob)
+            ifprintf(prob, " %.3f %.3f", posterior[0][marker], posterior[1][marker]);
+
+         // Masked genotype whose true value is still available in the pedigree
+         if (genotypes[i][marker] == 0 && ped[i].markers[marker].isKnown())
+            {
+            int observed = ped[i].markers[marker].SequenceCoded() - 1;
+
+            if (observed == best)
+               matches++;
+            else if (observed == 1 || best == 1)
+               partialmatches++;
+            else mismatches++;
+            }
+         }
+
+      UpdateMarkerInfo();
+
+      ifprintf(dose, "\n");
+      ifprintf(geno, "\n");
+
+      if (qc) ifprintf(qc, "\n");
+      if (prob) ifprintf(prob, "\n");
+      }
+
+   OutputMarkerInfo(info);
+
+   fclose(info);
+   ifclose(dose);
+   ifclose(geno);
+   if (qc) ifclose(qc);
+   if (prob) ifclose(prob);
+
+   printf(" File [%s.mlinfo] contains marker summary information ...\n"
+          " File [%s.mldose] contains MLE for dosage ...\n"
+          " File [%s.mlgeno] contains MLE for most likely genotype\n",
+          (const char *) prefix, (const char *) prefix, (const char *) prefix);
+
+   // Literal format strings per branch; the probability file is opened as
+   // ".mlprob", so the message now names it that way (it said ".mlprobs")
+   if (mldetails)
+      printf(" File [%s.mlqc] contains MLE for quality score ...\n"
+             " File [%s.mlprob] contains MLE probabilities for each genotype ...\n\n",
+             (const char *) prefix, (const char *) prefix);
+   else
+      printf("\n");
+
+   if (matches + mismatches + partialmatches)
+      {
+      double total = matches + mismatches + partialmatches;
+
+      printf("Comparing %.0f masked genotypes with MLE estimates ...\n", total);
+      printf(" Estimated per genotype error rate is %.4f\n",
+             (mismatches + partialmatches) / total);
+      printf(" Estimated per allele error rate is %.4f\n\n",
+             (mismatches + partialmatches * 0.5) / total);
+      }
+
+   if (approximate)
+      delete [] array;
+   }
+
+// Prints the memory currently used by the haplotyping engine, based on the
+// number of matrix blocks actually allocated. As a side effect, `states` is
+// reset to 2 * individuals - 2 when it lies outside 1 .. 2 * individuals - 2.
+void Haplotyper::ShowMemoryInfo()
+   {
+   // Count the matrix blocks that have been allocated so far
+   int blocks = 0;
+   int slot = 0;
+
+   while (slot < markers)
+      {
+      if (memoryBlock[slot])
+         blocks++;
+      slot++;
+      }
+
+   if (states <= 0 || states > individuals * 2 - 2)
+      states = 2 * individuals - 2;
+
+   double bytes = sizeof(char) * (double) individuals * markers * 3;     // Genotypes, Haplotypes
+   bytes += sizeof(float) * (double) states * 2;                         // Marginals
+   bytes += sizeof(float) * (double) blocks * states * (states + 1) / 2; // matrices
+   bytes += sizeof(float) * (double) markers * 11;                       // penetrances, probabilities, thetas
+   bytes += sizeof(int) * (double) markers;                              // crossover counts
+   bytes += sizeof(Errors) * (double) markers;                           // error model information
+
+   printf(" %40s %s\n", "Haplotyping engine (actual) ...", (const char *) MemoryInfo(bytes));
+   }
+
+// Prints an upper estimate of the memory the haplotyping engine needs for the
+// given problem size. States outside 1 .. 2 * Individuals - 2 is treated as
+// 2 * Individuals - 2.
+void Haplotyper::EstimateMemoryInfo(int Individuals, int Markers, int States, bool Compact, bool Phased)
+   {
+   if (States <= 0 || States > Individuals * 2 - 2)
+      States = 2 * Individuals - 2;
+
+   // Number of marker positions for which a full matrix is kept
+   int positions = Markers;
+
+   if (Compact)
+      positions = 2 * (int) sqrt((double)Markers) + 1;
+
+   if (Phased)
+      {
+      int phasedPositions = Markers / ((States + 1) / 2) + 1;
+
+      if (phasedPositions > positions)
+         positions = phasedPositions;
+      }
+
+   double bytes = sizeof(char) * (double) Individuals * Markers * 3;        // Genotypes, Haplotypes
+   bytes += sizeof(float) * (double) States * 2;                            // Marginals
+   bytes += sizeof(float) * (double) positions * States * (States + 1) / 2; // matrices
+   bytes += sizeof(float) * (double) Markers * 11;                          // penetrances, probabilities, thetas
+   bytes += sizeof(int) * (double) Markers;                                 // crossover counts
+   bytes += sizeof(Errors) * (double) Markers;                              // error model information
+
+   printf(" %40s %s\n", "Haplotyping engine (max) ...", (const char *) MemoryInfo(bytes));
+   }
+
+// Prints the MLE estimator's memory estimate for this object's own
+// dimensions.
+void Haplotyper::ShowMLEMemoryInfo()
+   {
+   EstimateMLEMemoryInfo(individuals,
+                         markers,
+                         states);
+   }
+
+// Prints an estimate of the memory needed by the MLE estimator. States
+// outside 1 .. 2 * Individuals - 2 is treated as 2 * Individuals - 2.
+void Haplotyper::EstimateMLEMemoryInfo(int Individuals, int Markers, int States)
+   {
+   const int maxStates = Individuals * 2 - 2;
+
+   if (States <= 0 || States > maxStates)
+      States = 2 * Individuals - 2;
+
+   double matrixBytes = sizeof(float) * (double) 2 * States * (States + 1) / 2;
+   double markerBytes = sizeof(float) * (double) 7 * Markers;
+
+   double bytes = matrixBytes + markerBytes;
+
+   printf(" %40s %s\n", "MLE Estimator ...", (const char *) MemoryInfo(bytes));
+   }
+
+// Prints an estimate of the memory needed for the non-parametric scores.
+void Haplotyper::EstimateDiseaseMemoryInfo(int Individuals, int Markers, int Diseases)
+   {
+   // TODO -- Disease memory info estimate should take into account the two right matrices
+
+   double scoreBytes = sizeof(short) * (double) Diseases * Markers * Individuals;
+   double statusBytes = sizeof(char) * (double) Diseases * Individuals;
+
+   double bytes = scoreBytes + statusBytes;
+
+   printf(" %40s %s\n", "Non-parametric scores ...", (const char *) MemoryInfo(bytes));
+   }
+
+// Imputes `haplotype` from reference row `state` at every marker strictly
+// between fromMarker and toMarker.
+void Haplotyper::FillPath(int haplotype, int fromMarker, int toMarker, int state)
+   {
+   for (int marker = fromMarker + 1; marker < toMarker; marker++)
+      ImputeAllele(haplotype, marker, state);
+   }
+
+// Imputes row `haplo` at the markers strictly between fromMarker and toMarker,
+// given that the copied state is fromState at one end and toState at the
+// other and that at least one state switch happens in between. Every sampled
+// switch is tallied in crossovers[].
+void Haplotyper::SamplePath(int haplo, int fromMarker, int toMarker, int fromState, int toState, Random * rand)
+   {
+   double theta = 0.0;
+
+   // Calculate overall recombination fraction for the interval
+   for (int i = fromMarker; i < toMarker; i++)
+      theta = thetas[i] + theta - theta * thetas[i];
+
+   // Impute a path between the two end markers, assuming no genotypes
+   // are observed -- the only constraint is that we must start at
+   // fromState and end at toState with at least one intervening recombinant
+   while (fromMarker < toMarker - 1)
+      {
+      double r = rand->Uniform(0.0, theta);
+
+      double theta1 = thetas[fromMarker];
+
+      // Recombination fraction for the remainder of the interval
+      if (theta < 0.9)
+         // Fast closed formula
+         theta = (theta - theta1) / (1.0 - theta1);
+      else
+         {
+         theta = 0.0;
+
+         // More accurate, iterative formula
+         for (int i = fromMarker + 1; i < toMarker; i++)
+            theta = thetas[i] + theta - theta * thetas[i];
+         }
+
+      if (r > theta1)
+         {
+         // No recombinant in the first interval
+         ImputeAllele(haplo, ++fromMarker, fromState);
+         continue;
+         }
+
+      crossovers[fromMarker]++;
+      if (r < theta1 * (1.0 - theta))
+         {
+         // No recombinant in the second interval
+         FillPath(haplo, fromMarker, toMarker, toState);
+         return;
+         }
+      else if (weights != NULL)
+         {
+         // Recombinants in both intervals, so we must sample
+         // an intervening state -- potentially taking weights
+         // into account
+         // NOTE(review): other code in this file indexes weights per
+         // individual (weights[state / 2]); here it is indexed per state --
+         // confirm which is intended.
+         double sum = 0.0;
+
+         for (int i = 0; i < states; i++)
+            sum += weights[i];
+
+         r = rand->Uniform(0, sum);
+
+         sum = weights[0];
+         fromState = 0;
+
+         // Both conditions must hold. The original joined them with the comma
+         // operator, which discards "i < states" and leaves the scan unbounded
+         // if rounding ever keeps sum below r.
+         for (int i = 1; i < states && sum < r; i++)
+            {
+            sum += weights[i];
+            fromState++;
+            }
+
+         ImputeAllele(haplo, ++fromMarker, fromState);
+         }
+      else
+         ImputeAllele(haplo, ++fromMarker, fromState = (int) (rand->Next() * states));
+      }
+
+   // If we get here, record obligate recombinant between two consecutive markers
+   crossovers[fromMarker]++;
+   }
+
+// Memory management functions
+//
+
+// Assigns a matrix block to `marker`. In economy mode, markers other than 0
+// that lie within gridSize of the marker on top of the stack get a reusable
+// block; every other marker is pushed on the stack and gets a large block.
+void Haplotyper::GetMemoryBlock(int marker)
+   {
+   bool reuse = economyMode && marker != 0 &&
+                marker <= stack[stackPtr] + gridSize;
+
+   if (reuse)
+      {
+      leftMatrices[marker] = GetReuseableBlock();
+      return;
+      }
+
+   stack[++stackPtr] = marker;
+   leftMatrices[marker] = GetLargeBlock();
+
+   ResetReuseablePool();
+   }
+
+// Assigns the next small block to `marker`.
+void Haplotyper::GetSmallMemoryBlock(int marker)
+   {
+   float * block = GetSmallBlock();
+
+   leftMatrices[marker] = block;
+   }
+
+// Makes sure the matrix for `marker` is available. When the marker on top of
+// the stack lies beyond `marker`, the stack is popped and the matrices from
+// the new top up to `marker` are recomputed into reusable blocks.
+void Haplotyper::RetrieveMemoryBlock(int marker)
+   {
+   // Nothing to rebuild when the stack top is at or before the marker
+   if (stack[stackPtr] <= marker)
+      return;
+
+   ResetReuseablePool();
+
+   --stackPtr;
+
+   int anchor = stack[stackPtr];
+   float * from = leftMatrices[anchor];
+   double theta = 0.0;
+
+   for (int m = anchor + 1; m <= marker; m++)
+      {
+      // Cumulative recombination fraction allows us to skip uninformative positions
+      theta = theta + thetas[m - 1] - theta * thetas[m - 1];
+
+      // Skip over uninformative positions to save time; the last marker is
+      // always computed
+      if (genotypes[states / 2][m] == GENOTYPE_MISSING && m != markers - 1)
+         continue;
+
+      leftMatrices[m] = GetReuseableBlock();
+
+      Transpose(from, leftMatrices[m], theta);
+      ConditionOnData(leftMatrices[m], m, genotypes[states / 2][m]);
+
+      theta = 0;
+      from = leftMatrices[m];
+      }
+   }
+
+// Allocates one matrix block (states * states floats for ordered genotypes,
+// states * (states + 1) / 2 otherwise) and registers `states`-sized slices of
+// it as small blocks while fewer than `markers` small blocks are registered.
+float * Haplotyper::AllocateMemoryBlock()
+   {
+   int blockSize = states * (states + 1) / 2;
+
+   if (orderedGenotypes)
+      blockSize = states * states;
+
+   float * block = new float [blockSize];
+
+   // Number of whole `states`-sized slices that fit in the block
+   int slices = blockSize / states;
+
+   for (int s = 0; s < slices && smallFree < markers; s++)
+      smallMemoryBlock[smallFree++] = block + s * states;
+
+   return block;
+   }
+
+// Returns the next large block, allocating it on first use, and advances
+// nextAvailable.
+float * Haplotyper::GetLargeBlock()
+   {
+   int slot = nextAvailable++;
+
+   if (memoryBlock[slot] == NULL)
+      memoryBlock[slot] = AllocateMemoryBlock();
+
+   return memoryBlock[slot];
+   }
+
+// Returns the next small block and advances nextSmallAvailable. When no small
+// block is registered at that position, a new large block is allocated in the
+// first empty memoryBlock slot at or after nextAvailable.
+float * Haplotyper::GetSmallBlock()
+   {
+   if (smallMemoryBlock[nextSmallAvailable] == NULL)
+      {
+      // Find the first slot that does not hold a block yet
+      for ( ; memoryBlock[nextAvailable] != NULL; nextAvailable++)
+         ;
+
+      float * fresh = AllocateMemoryBlock();
+
+      memoryBlock[nextAvailable] = fresh;
+      nextAvailable++;
+      }
+
+   float * block = smallMemoryBlock[nextSmallAvailable];
+   nextSmallAvailable++;
+
+   return block;
+   }
+
+// Returns the next reusable block, allocating it on first use, and moves
+// nextReuseable one slot down.
+float * Haplotyper::GetReuseableBlock()
+   {
+   int slot = nextReuseable--;
+
+   if (memoryBlock[slot] == NULL)
+      memoryBlock[slot] = AllocateMemoryBlock();
+
+   return memoryBlock[slot];
+   }
+
+// Resets all block cursors: large and small blocks restart at slot 0,
+// reusable blocks at slot markers - 1, and the marker stack is emptied.
+void Haplotyper::ResetMemoryPool()
+   {
+   nextSmallAvailable = 0;
+   nextAvailable = 0;
+
+   nextReuseable = markers - 1;
+
+   stackPtr = -1;
+   }
+
+// Restarts the reusable-block cursor at the last slot.
+void Haplotyper::ResetReuseablePool()
+   {
+   const int lastSlot = markers - 1;
+
+   nextReuseable = lastSlot;
+   }
+
+// Rebuilds nplScores[][] from the disease status of the first states / 2
+// individuals: each individual with a non-zero status adds its weight (or 1
+// when weights are not in use) to row 3 - status, after which row 1 is
+// negated. Does nothing when no diseases are defined.
+void Haplotyper::ScoreNPL()
+   {
+   if (diseaseCount == 0)
+      return;
+
+   for (int d = 0; d < diseaseCount; d++)
+      {
+      // Start every disease from zero
+      for (int row = 0; row < 3; row++)
+         nplScores[row][d] = 0.0;
+      }
+
+   const int referenceCount = states / 2;
+
+   for (int person = 0; person < referenceCount; person++)
+      for (int d = 0; d < diseaseCount; d++)
+         {
+         if (!diseaseStatus[person][d]) continue;
+
+         nplScores[3 - diseaseStatus[person][d]][d] += weights == NULL ? 1 : weights[person];
+         }
+
+   for (int d = 0; d < diseaseCount; d++)
+      nplScores[1][d] = -nplScores[1][d];
+   }
+
+// Remembers the current stack depth so that RewindMemoryPool() can return to
+// it.
+void Haplotyper::MarkMemoryPool()
+   {
+   const int depth = stackPtr;
+
+   savedStackPtr = depth;
+   }
+
+// Restores the stack depth saved by MarkMemoryPool(). When the depth actually
+// changes and the restored top is not the last marker, the last marker is
+// pushed on top.
+void Haplotyper::RewindMemoryPool()
+   {
+   if (stackPtr == savedStackPtr)
+      return;
+
+   stackPtr = savedStackPtr;
+
+   const int lastMarker = markers - 1;
+
+   if (stack[stackPtr] != lastMarker)
+      {
+      ++stackPtr;
+      stack[stackPtr] = lastMarker;
+      }
+   }
+
+// Loads crossover rates from the named file. Returns false, after printing a
+// warning, when the file cannot be opened; returns true otherwise.
+bool Haplotyper::LoadCrossoverRates(const char * filename)
+   {
+   IFILE f = ifopen(filename, "rb");
+
+   if (f != NULL)
+      {
+      LoadCrossoverRates(f);
+      ifclose(f);
+
+      return true;
+      }
+
+   printf("Warning: crossover rate map file [%s] not available\n", (const char *) filename);
+   return false;
+   }
+
+// Reads crossover rates from an open file into thetas[]. The first line is
+// read and discarded; every following non-empty line must have at least two
+// tokens, of which the second is stored as the rate of the next interval.
+// Reading stops at end of file or after markers - 1 rates.
+void Haplotyper::LoadCrossoverRates(IFILE file)
+   {
+   String buffer;
+   StringArray tokens;
+
+   printf("Loading crossover rates ...\n");
+
+   // First line is not parsed
+   buffer.ReadLine(file);
+
+   const int intervals = markers - 1;
+
+   for (int interval = 0; !ifeof(file) && interval < intervals; )
+      {
+      buffer.ReadLine(file);
+      tokens.ReplaceTokens(buffer);
+
+      // Blank lines are skipped without consuming an interval
+      if (tokens.Length() == 0) continue;
+
+      if (tokens.Length() < 2)
+         error("The following line could not be parsed:\n\n%s\n"
+               "Each line should list an interval followed by the corresponding\n"
+               "crossover rate. Intervals should be in map order.", (const char *) buffer);
+
+      thetas[interval] = tokens[1].AsDouble();
+      interval++;
+      }
+   }
+
+bool Haplotyper::LoadErrorRates(const char * filename)
+ {
+ IFILE f = ifopen(filename, "rb");
+
+ if (f == NULL)
+ {
+ printf("Warning: error rate map file [%s] not available\n", (const char *) filename);
+ return false;
+ }
+
+ LoadErrorRates(f);
+ ifclose(f);
+
+ return true;
+ }
+
+void Haplotyper::LoadErrorRates(IFILE file)
+ {
+ String buffer;
+ StringArray tokens;
+
+ printf("Loading mosaic error rates ...\n");
+ buffer.ReadLine(file);
+
+ int marker = 0;
+ while (!ifeof(file) && marker < markers)
+ {
+ buffer.ReadLine(file);
+ tokens.ReplaceTokens(buffer);
+
+ if (tokens.Length() == 0) continue;
+
+ if (tokens.Length() < 2)
+ error("The following line could not be parsed:\n\n%s\n"
+ "Each line should list a marker followed by the corresponding\n"
+ "error rate. Markers should be in map order.", (const char *) buffer);
+
+ SetErrorRate(marker++, tokens[1].AsDouble());
+ }
+ }
+
+// Zeroes the four per-marker accumulators in mlinfo[0..3].
+void Haplotyper::ResetMarkerInfo()
+   {
+   for (int m = 0; m < markers; m++)
+      for (int row = 3; row >= 0; row--)
+         mlinfo[row][m] = 0.0;
+   }
+
+#ifndef square // fallback: only defined when no earlier square macro exists
+#define square(x) ((x)*(x)) // expands its argument twice, so avoid arguments with side effects
+#endif
+
+// Adds the current posterior[][] values to the per-marker accumulators:
+// mlinfo[0] and mlinfo[1] collect the first two posteriors, mlinfo[2] the
+// squared value of posterior[0] + posterior[1] / 2, and mlinfo[3] the largest
+// of the three posteriors.
+void Haplotyper::UpdateMarkerInfo()
+   {
+   for (int m = 0; m < markers; m++)
+      {
+      // Index of the largest posterior; ties keep the lower index
+      int top = 0;
+
+      for (int g = 1; g <= 2; g++)
+         if (posterior[g][m] > posterior[top][m])
+            top = g;
+
+      mlinfo[0][m] += posterior[0][m];
+      mlinfo[1][m] += posterior[1][m];
+      mlinfo[2][m] += square(posterior[0][m] + posterior[1][m] * 0.50);
+      mlinfo[3][m] += posterior[top][m];
+      }
+   }
+
+void Haplotyper::OutputMarkerInfo(FILE * output)
+   {
+   // Writes a tab-delimited table with one header line and one row per marker,
+   // built from the sums accumulated in mlinfo and averaged over the
+   // (individuals - phased) samples; 1e-30 guards every division against zero.
+   const double samples = individuals - phased + 1e-30;
+
+   fprintf(output, "SNP\tAl1\tAl2\tFreq1\tMAF\tQuality\tRsq\n");
+
+   for (int m = 0; m < markers; m++)
+      {
+      double avg0 = mlinfo[0][m] / samples;
+      double avg1 = mlinfo[1][m] / samples;
+      double freq = avg0 + avg1 * 0.50;
+
+      double rsqDenominator = max(avg0 + avg1 * 0.25 - square(freq), 0);
+      double rsqNumerator   = max(mlinfo[2][m] / samples - square(freq), 0);
+      // To avoid problems due to rounding in calculation of the numerator
+      if (rsqNumerator < 1e-7) rsqNumerator = 0.0;
+      MarkerInfo * info = Pedigree::GetMarkerInfo(m);
+      fprintf(output, "%s\t%s\t%s\t%.4f\t%.4f\t%.4f\t%.4f\n",
+              (const char *) info->name,
+              (const char *) info->GetAlleleLabel(1),
+              info->CountAlleles() > 1 ? (const char *) info->GetAlleleLabel(2) : "-",
+              freq, freq > 0.50 ? 1.0 - freq : freq, mlinfo[3][m] / samples, rsqNumerator / (rsqDenominator + 1e-30));
+      }
+   }
+
+void Haplotyper::IntegrateNPL(float * matrix, int marker)
+   {
+   // Walks the lower-triangular state matrix once per disease, crediting each
+   // cell to the disease status of both states of the pair, then adds the
+   // nplScores mix weighted by those status totals to diseaseScores.
+   const int column = marker * diseaseCount;
+
+   for (int d = 0; d < diseaseCount; d++)
+      {
+      double mass[3] = { 0.0, 0.0, 0.0 };
+      float * cell = matrix;
+
+      for (int row = 0; row < states; row++)
+         {
+         const int rowStatus = diseaseStatus[row/2][d];
+         for (int col = 0; col <= row; col++, cell++)
+            {
+            mass[rowStatus] += *cell;
+            mass[diseaseStatus[col/2][d]] += *cell;
+            }
+         }
+
+      double total = mass[1] + mass[2];
+      if (!(total > 0.0)) continue;   // nothing to add for this disease
+
+      diseaseScores[states/2][column + d] +=
+         nplScores[1][d] * mass[1] / total + nplScores[2][d] * mass[2] / total;
+      }
+   }
+
+void Haplotyper::IntegrateNPL(float * matrix1, float * matrix2, int marker)
+   {
+   // Walks two lower-triangular state matrices in step and, for every disease,
+   // adds an NPL contribution for this marker to diseaseScores[states/2].
+   float * matrixStart1 = matrix1, * matrixStart2 = matrix2;
+   marker *= diseaseCount;
+   if (weights == NULL)
+      for (int disease = 0; disease < diseaseCount; disease++)
+         {
+         double source_status[3] = { 0.0, 0.0, 0.0 };
+         // Each row i holds i off-diagonal cells followed by its diagonal cell
+         matrix1 = matrixStart1; matrix2 = matrixStart2;
+         for (int i = 0; i < states; i++, matrix1++, matrix2++)
+            {
+            int i_status = diseaseStatus[i/2][disease];
+
+            for (int j = 0; j < i; j++, matrix1++, matrix2++)
+               {
+               source_status[i_status] += *matrix1 * *matrix2;
+               source_status[diseaseStatus[j/2][disease]] += *matrix1 * *matrix2;
+               }
+            // Diagonal cell: credited once to the row's status, with a factor of 4
+            source_status[i_status] += *matrix1 * *matrix2 * 4.0;
+            }
+
+         double sum = source_status[1] + source_status[2];
+
+         if (sum > 0.0)
+            diseaseScores[states/2][marker + disease] +=
+               nplScores[1][disease] * source_status[1] / sum +
+               nplScores[2][disease] * source_status[2] / sum;
+         }
+   else
+      {
+      for (int disease = 0; disease < diseaseCount; disease++)
+         {
+         double source_status[3] = { 0.0, 0.0, 0.0 };
+         // Same walk as the unweighted branch, each product divided by the pair's weights
+         matrix1 = matrixStart1; matrix2 = matrixStart2;
+         for (int i = 0; i < states; i++, matrix1++, matrix2++)
+            {
+            int i_status = diseaseStatus[i/2][disease];
+            // Off-diagonal cells only: "j <= i" would step past the diagonal and overrun the matrices
+            for (int j = 0; j < i; j++, matrix1++, matrix2++)
+               {
+               double cell = *matrix1 * *matrix2 / (weights[j / 2] * weights[i / 2] + 1e-30);
+
+               source_status[i_status] += cell;
+               source_status[diseaseStatus[j/2][disease]] += cell;
+               }
+            // Diagonal cell; the outer loop's increment then moves on to the next row
+            source_status[i_status] += *matrix1 * *matrix2 * 4.0 / (weights[i / 2] * weights[i / 2] + 1e-30);
+            }
+
+         double sum = source_status[1] + source_status[2];
+
+         if (sum > 0.0)
+            diseaseScores[states/2][marker + disease] +=
+               nplScores[1][disease] * source_status[1] / sum +
+               nplScores[2][disease] * source_status[2] / sum;
+         }
+      }
+   }
+
+void Haplotyper::IntegrateNPL()
+ { // Integrates NPL scores at every marker, working from the last marker back to the first
+ RewindMemoryPool();
+
+ // Process the last position
+ RetrieveMemoryBlock(markers - 1);
+ IntegrateNPL(leftMatrices[markers - 1], markers - 1);
+ // NOTE(review): the right-hand matrix is seeded with marker 0 although the sweep starts at the last marker -- confirm
+ SetupPrior(rightMatrices[0]);
+ ConditionOnData(rightMatrices[0], 0, genotypes[states / 2][0]);
+ // from/to alternate between the two right-hand buffers; they are swapped at the end of each step
+ float *temp;
+ float *from = rightMatrices[0];
+ float *to = rightMatrices[1];
+
+ for (int i = markers - 2; i >= 0; i--)
+ {
+ // Move things along
+ Transpose(from, to, thetas[i]);
+
+ // Find nearest informative marker
+ double theta = 0.0;
+ int left = i;
+
+ while (left > 0 && genotypes[states / 2][left] == GENOTYPE_MISSING)
+ {
+ // Cumulative recombination fraction to nearest marker
+ theta = theta + thetas[left - 1] - theta * thetas[left - 1];
+ left--;
+ }
+
+ RetrieveMemoryBlock(left);
+ float * leftMatrix = leftMatrices[left];
+ // When an earlier marker is used, carry its matrix forward by theta; "from" is reused as scratch space
+ if (left != i)
+ {
+ Transpose(leftMatrix, from, theta);
+ leftMatrix = from;
+ }
+ // Score marker i from both sides before conditioning the right-hand matrix on its genotype
+ IntegrateNPL(leftMatrix, to, i);
+ ConditionOnData(to, i, genotypes[states / 2][i]);
+
+ temp = from;
+ from = to;
+ to = temp;
+ }
+ }
+
+
diff --git a/mach1/Haplotyper.h b/mach1/Haplotyper.h
new file mode 100644
index 0000000..685af24
--- /dev/null
+++ b/mach1/Haplotyper.h
@@ -0,0 +1,323 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/Haplotyper.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __HAPLOTYPER_H__
+#define __HAPLOTYPER_H__
+
+#include "Random.h"
+#include "StringBasics.h"
+#include "ErrorRate.h"
+#include "InputFile.h"
+
+#ifdef __DOUBLE_HAPLOTYPING__ // optional build flag
+#define float double // NOTE(review): stays active through the #include lines below, until the #undef at the end of this header
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+
+class Pedigree;
+class String;
+
+class Haplotyper // Markov chain haplotyping engine: genotype/haplotype state, model parameters and working memory
+ {
+ public:
+ // These are the basic variables that store information
+ // about the underlying state
+
+ int phased, individuals, states, markers;
+ // genotypes and haplotypes are indexed [row][marker]; thetas holds one crossover rate per marker interval
+ char ** genotypes;
+ char ** haplotypes;
+ float * thetas;
+ float * distances;
+
+ // Store information about estimated error rates
+ // and about observed mismatches between mosaic and
+ // actual genotypes
+ Errors * error_models;
+
+ // These additional variables store optional information
+ // about disease status for non-parametric mapping
+
+ int diseaseCount;
+ char ** diseaseStatus;
+ float ** diseaseScores;
+ float ** nplScores;
+
+ // Determines whether NPL scores are updated simultaneously with
+ // haplotype sources
+ bool updateDiseaseScores;
+
+ // These flags report the current status and determine the
+ // analytical strategy
+
+ bool readyForUse;
+ bool greedy;
+ bool economyMode;
+
+ // These variables store information about the presence of partially
+ // ordered genotypes, both globally and at the individual level
+
+ bool orderedGenotypes;
+ int * orderedGenotypeFlags;
+
+ Haplotyper();
+ ~Haplotyper();
+
+ // Set the error rate. This is an omnibus rate which combines
+ // the effects of mutation, gene-conversion and genotyping error.
+ // With current genotyping technologies a value of 0.001 or higher
+ // is reasonable
+ void SetErrorRate(float rate);
+ void SetErrorRate(int marker, float rate);
+ // Read back the rate stored for one marker; the no-argument overload is defined elsewhere
+ double GetErrorRate(int marker)
+ { return error_models[marker].rate; }
+ double GetErrorRate();
+
+ // Initialization function, allocates memory
+ // necessary for tracking haplotypes and
+ // genotypes
+ bool AllocateMemory(int nIndividuals, int maxHaplos, int nMarkers);
+ bool AllocateDiseaseStatus(int nDiseases);
+ bool AllocateDistances();
+ bool AllocateMLEMemory();
+ bool AllocateRightMatrices();
+
+ // By default, some memory allocation is delayed and carried out
+ // on an as needed basis. This routine forces these allocations
+ // to happen, based on current genotype data
+ bool ForceMemoryAllocation();
+
+ // Functions for initializing haplotype list
+ // based on observed haplotypes
+ virtual void RandomSetup(Random * rand = NULL);
+
+ // Basic markov chain functionality (for genotypes)
+ void Transpose(float * source, float * dest, float theta);
+ void Transpose(float * source, float * dest, float * priors, float theta);
+
+ virtual void ConditionOnData(float * matrix, int marker, char genotype);
+ void ConditionOnData(int * stateKey, float * matrix, char genotype);
+
+ void ScoreLeftConditional();
+ void SampleChromosomes(Random * rand);
+
+ void SetupPrior(float * matrix);
+
+ // Routines for estimating MLEs for NPL scores
+ void IntegrateNPL();
+ void IntegrateNPL(float * matrix, int marker);
+ void IntegrateNPL(float * left, float * right, int marker);
+
+ // Routines for estimating MLEs for missing genotypes
+ void ImputeGenotypes();
+ void ImputeGenotypes(float * matrix, int marker);
+ void ImputeGenotypes(float * left, float * right, int marker);
+
+ // Analogous estimation routines that deal with partially ordered genotypes
+ void ImputeGenotypesFromOrderedData();
+ void ImputeGenotypesFromOrderedData(float * matrix, int marker);
+ void ImputeGenotypesFromOrderedData(float * left, float * right, int marker);
+
+ void OutputMLEs(Pedigree & ped, const String & prefix, bool detailed);
+
+ // Basic markov chain functionality (for phase known haplotypes)
+ void TransposeHaplotype(float * source, float * dest, float theta);
+ void ConditionHaplotypeOnData(float * matrix, int marker, char allele);
+ void ScoreLeftConditionalForHaplotype();
+ void SampleHaplotypeSource(Random * rand);
+
+ // Markov chain functionality for dealing partially phased data
+ void TransposeOrdered(float * source, float * dest, float theta);
+
+ void ConditionOnOrderedData(float * matrix, int marker, char genotype);
+
+ void ScoreLeftConditionalForOrderedGenotypes();
+ void SampleChromosomesFromOrderedData(Random * rand);
+
+ void SetupOrderedPrior(float * matrix);
+
+ // Higher level markov chain functionality
+ void WarmUp(int seeds, int rounds);
+ void LoopThroughChromosomes();
+
+ // Build a set of consensus haplotypes
+ void BuildConsensus(int samples);
+
+ // These functions update parameters based on the last iteration
+ void UpdateThetas();
+ void UpdateThetasWithDistances();
+ double UpdateErrorRate();
+
+ // These functions allow for different weights to be placed on each
+ // possible haplotype. Currently these weights are simply based on
+ // the number of positions where genotype data is available.
+ virtual void CalculateWeights();
+ void AllocateWeights();
+ void FreeWeights();
+ void ScaleWeights();
+
+ int TotalCrossovers();
+
+ // Report memory used by haplotyping engine
+ void ShowMemoryInfo();
+ void ShowMLEMemoryInfo();
+
+ static void EstimateMemoryInfo(int Individuals, int Markers, int States, bool Compact, bool Phased);
+ static void EstimateDiseaseMemoryInfo(int Individuals, int Markers, int Diseases);
+ static void EstimateMLEMemoryInfo(int Individuals, int Markers, int Diseases);
+
+ // Retrieve parameters from file
+ bool LoadCrossoverRates(const char * filename);
+ void LoadCrossoverRates(IFILE file);
+ bool LoadErrorRates(const char * filename);
+ void LoadErrorRates(IFILE file);
+
+ protected:
+ float * marginals;
+ float ** leftMatrices;
+ float * leftProbabilities;
+
+ // These are used to impute missing genotypes in a forward-backward algorithm
+ float ** rightMatrices;
+ float ** posterior;
+ float ** mlinfo; // four per-marker accumulator rows, maintained by ResetMarkerInfo / UpdateMarkerInfo
+ // Weights indexed by individual; a NULL pointer selects the unweighted path in IntegrateNPL
+ float * weights;
+
+ int * crossovers;
+
+ // Impute two alleles at a particular marker, given sampled state
+ virtual void ImputeAlleles(int marker, int state1, int state2, Random * rand);
+ void ImputeAllele(int haplotype, int marker, int state);
+
+ // Merge a set of sampled haplotypes into a consensus pair
+ void BuildConsensus(char ** consensus, char ** haplotypes, int count);
+ // Entry of penetrances[] for a marker, stored as nine values per marker: 3 true x 3 observed genotypes
+ float & Penetrance(int marker, int true_genotype, int observed_genotype)
+ {
+ return penetrances[marker * 9 + true_genotype * 3 + observed_genotype];
+ }
+
+ void FillPath(int haplotype, int fromMarker, int toMarker, int state);
+ void SamplePath(int haplotype, int fromMarker, int toMarker, int fromState, int toState, Random * rand);
+
+ // Routines for producing summary information about MLE estimates of genotypes
+ void ResetMarkerInfo();
+ void UpdateMarkerInfo();
+ void OutputMarkerInfo(FILE * output);
+ // Rescales the three posterior values at a marker to sum to one; left untouched when the sum is not positive
+ void NormalizePosterior(int marker)
+ {
+ double sum = posterior[0][marker] + posterior[1][marker] + posterior[2][marker];
+
+ if (sum > 0.0)
+ {
+ posterior[0][marker] /= sum;
+ posterior[1][marker] /= sum;
+ posterior[2][marker] /= sum;
+ }
+ }
+
+ bool skipSanityCheck;
+
+ private:
+ void Swap(char * & array1, char * & array2)
+ { char * temp = array1; array1 = array2; array2 = temp; }
+
+ void Swap(float * & array1, float * & array2)
+ { float * temp = array1; array1 = array2; array2 = temp; }
+
+ void SelectReferenceSet(int * choices, int forWhom);
+ void SwapIndividuals(int a, int b);
+ void SwapHaplotypes(int a, int b);
+
+ void ScoreNPL();
+ void UpdateDiseaseScores(int marker, int state);
+
+ bool SanityCheck();
+
+ void Print(int marker);
+ void Print();
+ // Small numeric helpers: largest of two, three or four values, and the square of a value
+ float max(float a, float b) { return a > b ? a : b; };
+ float max(float a, float b, float c) { return max(max(a, b), c); }
+ float max(float a, float b, float c, float d) { return max(max(a, b, c), d); }
+
+ float square(float a) { return a * a; }
+
+ float * penetrances;
+
+ void ResetCrossovers();
+
+ // A series of memory management functions lets us delay allocation
+ // of big blocks of memory until they are needed. Even more importantly
+ // it allows us to reuse memory as needed.
+ float ** memoryBlock;
+ float ** smallMemoryBlock;
+ int * stack, stackPtr, savedStackPtr;
+ int smallFree, nextAvailable, nextSmallAvailable, nextReuseable;
+ int gridSize;
+
+ // This is the low level allocator
+ float * AllocateMemoryBlock();
+
+ // This retrieves a large block, used for modeling unphased genotypes
+ float * GetLargeBlock();
+ float * GetReuseableBlock();
+
+ // This retrieves a smaller block, used for modeling a single haplotype
+ float * GetSmallBlock();
+
+ // This resets the memory pool
+ void ResetMemoryPool();
+ void ResetReuseablePool();
+
+ // These commands allow us to run multiple passes through the data
+ void MarkMemoryPool();
+ void RewindMemoryPool();
+
+ // These are the high level interfaces ...
+ void GetMemoryBlock(int marker);
+ void GetSmallMemoryBlock(int marker);
+ void RetrieveMemoryBlock(int marker);
+ };
+
+#ifdef __DOUBLE_HAPLOTYPING__
+#undef float
+#endif
+
+#define GENOTYPE_MISSING 0
+#define GENOTYPE_HOMOZYGOUS_FOR_ONE 1
+#define GENOTYPE_HETEROZYGOUS 2
+#define GENOTYPE_HOMOZYGOUS_FOR_TWO 3
+// Bit flags; FIRST_ALLELE and SECOND_ALLELE are masks covering their two single-allele bits
+#define GENOTYPE_ORDERED 16
+#define FIRST_ALLELE_ONE 1
+#define FIRST_ALLELE_TWO 2
+#define FIRST_ALLELE (1 | 2)
+#define SECOND_ALLELE_ONE 4
+#define SECOND_ALLELE_TWO 8
+#define SECOND_ALLELE (4 | 8)
+
+#define GENOTYPE_LINKED 32
+
+#endif
+
+
diff --git a/mach1/Main.cpp b/mach1/Main.cpp
new file mode 100644
index 0000000..9f1eb7c
--- /dev/null
+++ b/mach1/Main.cpp
@@ -0,0 +1,635 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/Main.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "AssociationAnalysis.h"
+#include "CostCalculator.h"
+#include "OutputHandlers.h"
+#include "MergeHaplotypes.h"
+#include "HaplotypeLoader.h"
+#include "Parameters.h"
+#include "Manners.h"
+#include "Error.h"
+#include "InputFile.h"
+#include <ctime>
+
+float * thetas = NULL; // element-wise sum of engine.thetas over the iterations passed to UpdateVector in main
+int nthetas = 0; // number of vectors summed into thetas; main divides by it to report averages
+
+float * error_rates = NULL; // element-wise sum of per-marker error rates, filled by UpdateErrorRates
+int nerror_rates = 0; // number of vectors summed into error_rates
+
+int CountPedMarkers (const char * filename)
+{
+   // Returns how many lines of the sample marker information file have "M"
+   // as their first field.  Blank lines are ignored; every other line must
+   // hold exactly two fields.
+   IFILE input = ifopen(filename, "rb");
+   if (input == NULL)
+      error("Sample Marker Information File [%s] could not be opened\n", filename);
+
+   String      line;
+   StringArray fields;
+
+   printf("Count number of markers in the sample marker information file ...\n");
+
+   int found = 0;
+   for ( ; !ifeof(input); )
+      {
+      line.ReadLine(input);
+      fields.ReplaceTokens(line);
+
+      const int width = fields.Length();
+      if (width == 0) continue;
+      if (width != 2)
+         error("Each line in sample marker information file should list exactly two fields, but the\n"
+               "following line does not: \n%s\n\n", (const char *) line);
+
+      if (fields[0] == "M") found++;
+      }
+   ifclose(input);
+   return found;
+}
+
+void UpdateVector(float * current, float * & vector, int & n, int length)
+   {
+   // First call (n == 0) allocates vector and copies current into it; later
+   // calls add current element-wise.  n is incremented on every call.
+   const bool first = (n == 0);
+   n++;
+
+   if (first)
+      vector = new float[length];
+
+   for (int k = 0; k < length; k++)
+      vector[k] = first ? current[k] : vector[k] + current[k];
+   }
+
+void UpdateErrorRates(Errors * current, float * & vector, int & n, int length)
+   {
+   // Accumulates current[k].rate into vector: the first call (n == 0)
+   // allocates vector and stores the rates, later calls add them.
+   // n is incremented on every call.
+   const bool first = (n++ == 0);
+
+   if (first) vector = new float[length];
+
+   for (int k = 0; k < length; k++)
+      if (first) vector[k] = current[k].rate;
+      else       vector[k] += current[k].rate;
+   }
+
+void EvaluateHaplotypes(DosageCalculator & doses, Pedigree & ped, char ** genotypes)
+   {
+   // Compares the best imputed genotype with the pedigree genotype wherever
+   // genotypes[][] is 0 but the pedigree genotype is known, then prints error rates.
+   int tally[3] = { 0, 0, 0 };   // [0] equal, [1] differ with either value 1, [2] other differences
+
+   for (int person = 0; person < ped.count; person++)
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         if (genotypes[person][m] != 0 || !ped[person].markers[m].isKnown()) continue;
+
+         int actual  = ped[person].markers[m].SequenceCoded() - 1;
+         int imputed = doses.GetBestGenotype(person, m);
+         if (actual == imputed)
+            tally[0]++;
+         else if (actual == 1 || imputed == 1)
+            tally[1]++;
+         else
+            tally[2]++;
+         }
+
+   double total = tally[0] + tally[1] + tally[2] + 1e-30;
+
+   printf("Comparing %.0f masked genotypes with estimates ...\n", total);
+   printf(" Estimated per genotype error rate is %.4f\n",
+          (tally[2] + tally[1]) / total);
+   printf(" Estimated per allele error rate is %.4f\n\n",
+          (tally[2] + tally[1] * 0.5) / total);
+   }
+
+int MemoryAllocationFailure()
+   { // Prints the allocation failure message and returns the -1 status that main passes on
+   fputs("FATAL ERROR - Memory allocation failed\n", stdout);
+   return -1;
+   }
+
+int main(int argc, char ** argv)
+ {
+ // Entry point: reads the command line, loads reference haplotypes and the pedigree, runs the
+ // Markov chain and writes the requested outputs. Returns -1 on a fatal setup problem, 0 otherwise.
+ setbuf(stdout, NULL); setbuf(stderr, NULL);
+
+ clock_t startt = clock(), endt;
+ String datfile, pedfile, hapfile(""), mapfile, hapsnps, outfile("mach1.out");
+ String crossFile(""), errorFile("");
+
+ double errorRate = 0.001, mask = 0.0;
+ int seed = 123456, warmup = 0, states = 0;
+ int burnin = 0, rounds = 0, polling = 0, samples = 0;
+ bool weighted = false, greedy = false, compact = false;
+ bool association = false, quickNPL = false, mle = false, mledetails = false;
+
+ SetupCrashHandlers();
+ SetCrashExplanation("reading command line options");
+
+#ifndef VERSION
+ printf("Mach 1.0 -- Markov Chain Haplotyping\n"
+#else
+ printf("Mach " VERSION " -- Markov Chain Haplotyping\n"
+#endif
+ "(c) 2005-2007 Goncalo Abecasis, with thanks to Yun Li, Paul Scheet\n\n");
+
+ ParameterList pl;
+
+BEGIN_LONG_PARAMETERS(longParameters)
+ LONG_PARAMETER_GROUP("Input Files")
+ LONG_STRINGPARAMETER("datfile", &datfile)
+ LONG_STRINGPARAMETER("pedfile", &pedfile)
+ LONG_DOUBLEPARAMETER("mask", &mask)
+ LONG_PARAMETER_GROUP("Optional Files")
+ LONG_STRINGPARAMETER("crossoverMap", &crossFile)
+ LONG_STRINGPARAMETER("errorMap", &errorFile)
+ LONG_STRINGPARAMETER("physicalMap", &mapfile)
+ LONG_PARAMETER_GROUP("Phased Data")
+ LONG_STRINGPARAMETER("snps", &hapsnps)
+ LONG_STRINGPARAMETER("haps", &hapfile)
+ LONG_PARAMETER("hapmapFormat", &HaplotypeLoader::hapmapFormat)
+ LONG_PARAMETER("vcfReference", &HaplotypeLoader::vcfReference)
+ LONG_PARAMETER("autoFlip", &HaplotypeLoader::autoFlip)
+ LONG_PARAMETER("greedy", &greedy)
+ LONG_PARAMETER_GROUP("Regional Options")
+ LONG_DOUBLEPARAMETER("startposition", &HaplotypeLoader::startposition)
+ LONG_DOUBLEPARAMETER("endposition", &HaplotypeLoader::endposition)
+ LONG_PARAMETER_GROUP("Markov Sampler")
+ LONG_INTPARAMETER("seed", &seed)
+ LONG_INTPARAMETER("burnin", &burnin)
+ LONG_INTPARAMETER("rounds", &rounds)
+ LONG_PARAMETER_GROUP("Mapping Options")
+ LONG_PARAMETER("npl", &quickNPL)
+ LONG_PARAMETER("association", &association)
+ LONG_PARAMETER_GROUP("Haplotyper")
+ LONG_INTPARAMETER("states", &states)
+ LONG_DOUBLEPARAMETER("errorRate", &errorRate)
+ LONG_PARAMETER("weighted", &weighted)
+ LONG_PARAMETER("compact", &compact)
+ LONG_PARAMETER_GROUP("Imputation")
+ LONG_PARAMETER("forceImputation", &HaplotypeLoader::forceImputation)
+ LONG_PARAMETER("geno", &OutputManager::outputGenotypes)
+ LONG_PARAMETER("quality", &OutputManager::outputQuality)
+ LONG_PARAMETER("dosage", &OutputManager::outputDosage)
+ LONG_PARAMETER("probs", &OutputManager::outputProbabilities)
+ LONG_PARAMETER("mle", &mle)
+ LONG_PARAMETER_GROUP("Output Files")
+ LONG_STRINGPARAMETER("prefix", &outfile)
+ LONG_PARAMETER("phase", &OutputManager::outputHaplotypes)
+ LONG_PARAMETER("uncompressed", &OutputManager::uncompressed)
+ LONG_PARAMETER("mldetails", &mledetails)
+ LONG_PARAMETER_GROUP("Interim Output")
+ LONG_INTPARAMETER("sampleInterval", &samples)
+ LONG_INTPARAMETER("interimInterval", &polling)
+ BEGIN_LEGACY_PARAMETERS()
+ LONG_STRINGPARAMETER("legend", &hapsnps)
+END_LONG_PARAMETERS();
+
+ pl.Add(new LongParameters("Available Options", longParameters));
+
+ pl.Add(new HiddenString('d', "Data File", datfile));
+ pl.Add(new HiddenString('p', "Pedigree File", pedfile));
+ pl.Add(new HiddenString('m', "Map File", mapfile));
+ pl.Add(new HiddenString('h', "Haplotype File", hapfile));
+ pl.Add(new HiddenString('s', "Haplotype SNP File", hapsnps));
+ pl.Add(new HiddenString('o', "Output File", outfile));
+ pl.Add(new HiddenInteger('r', "Haplotyping Rounds", rounds));
+ pl.Add(new HiddenDouble('e', "Error Rate", errorRate));
+
+// pl.Add(new IntParameter('w', "Warm-Up Sample", warmup));
+// pl.Add(new HiddenIntParameter('s', "Random Seed", seed));
+
+ pl.Read(argc, argv);
+ pl.Status();
+
+ // Setup random seed ...
+ globalRandom.Reset(seed);
+
+ SetCrashExplanation("loading phased chromosomes");
+
+ // Load phased haplotypes, if available
+ HaplotypeLoader haps;
+
+ // regional imputation only works with vcf input haplotypes
+ if (!HaplotypeLoader::vcfReference && (HaplotypeLoader::startposition != 0.0 || HaplotypeLoader::endposition != 300000000.0))
+ error("--startposition and --endposition options for regional imputation only work vcf input reference haplotypes.\n\n");
+
+ if (!HaplotypeLoader::vcfReference) haps.LoadMarkerList(hapsnps);
+ haps.LoadHaplotypes(hapfile);
+
+ if (haps.count == 0 || haps.markerCount == 0)
+ {
+ if (hapfile != "") error("Read ZERO reference haplotypes from %s\n\n", (const char *) hapfile);
+ else
+ printf("WARNING: no reference haplotypes provided.\n");
+ }
+
+ if (greedy)
+ {
+ if (haps.count == 0)
+ {
+ printf(" GREEDY SOLUTION NOT AVAILABLE. Although you requested a greedy\n"
+ " solution, no phased haplotypes were provided as input.\n\n");
+ greedy = false;
+ }
+ else
+ {
+ printf(" GREEDY SOLUTION. Phased haplotypes will be used to resolve ambiguous\n"
+ " individuals and generate a greedy solution.\n\n");
+ //states = haps.count;
+ // default --greedy (using all reference haplotypes
+ // 1.0.16.b allows --greedy together with --states SS (SS <= haps.count)
+ if (states == 0 || states > haps.count) {states = haps.count;}
+ }
+ }
+
+ SetCrashExplanation("loading pedigree file");
+
+ // Setup and load pedigree ...
+ Pedigree ped;
+
+ int numPedMarkers = CountPedMarkers(datfile);
+ printf(" Read %d markers from sample marker information file\n", numPedMarkers);
+ ped.Prepare(datfile);
+ ped.Load(pedfile);
+ ped.LoadMarkerMap(mapfile);
+
+ printf("Loaded pedigree with:\n"
+ " %d individuals to be haplotyped at %d markers\n",
+ ped.count, ped.markerCount);
+
+ if (mask > 0.0)
+ printf(" %.0f%% of genotypes will be masked prior to haplotyping\n", mask * 100.0);
+
+ // Check if physical map is available
+ bool positionsAvailable = true;
+
+ for (int i = 0; i < ped.markerCount; i++)
+ if (ped.GetMarkerInfo(i)->chromosome < 0)
+ {
+ positionsAvailable = false;
+ break;
+ }
+
+ if (positionsAvailable)
+ {
+ printf(" Physical map will be used to improve crossover rate estimates.\n");
+
+ for (int i = 1; i < ped.markerCount; i++)
+ if (ped.GetMarkerInfo(i)->position <= ped.GetMarkerInfo(i - 1)->position ||
+ ped.GetMarkerInfo(i)->chromosome != ped.GetMarkerInfo(i - 1)->chromosome)
+ {
+ printf(" FATAL ERROR -- Problems with physical map ...\n\n"
+ " Before continuing, check the following:\n"
+ " * All markers are on the same chromosome\n"
+ " * All marker positions are unique\n"
+ " * Markers in pedigree and haplotype files are ordered by physical position\n\n");
+ return -1;
+ }
+ }
+
+ printf("\n");
+
+ // Check that haplotypes and pedigree are consistent
+ haps.ConsistencyCheck(ped, numPedMarkers);
+
+ printf("Formating genotypes and allocating memory for haplotyping\n");
+ ped.ShowMemoryInfo();
+ haps.ShowMemoryInfo();
+
+ SetCrashExplanation("allocating memory for haplotype engine and consensus builder");
+
+ Haplotyper engine;
+
+ engine.economyMode = compact;
+
+ engine.EstimateMemoryInfo(ped.count + haps.count / 2, ped.markerCount, states, compact, haps.count != 0);
+ engine.AllocateMemory(ped.count + haps.count / 2, states, ped.markerCount);
+
+ // Copy genotypes into haplotyping engine
+ if (engine.readyForUse)
+ for (int i = 0; i < ped.count; i++)
+ for (int j = 0; j < ped.markerCount; j++)
+ {
+ if (mask == 0.0 || globalRandom.Next() > mask)
+ engine.genotypes[i][j] = ped[i].markers[j].SequenceCoded();
+ else
+ engine.genotypes[i][j] = 0;
+ }
+
+ // Verify that no more than two alleles were present for each marker
+ if (engine.readyForUse)
+ {
+ StringHash badMarkers;
+
+ for (int i = 0; i < ped.count; i++)
+ for (int j = 0; j < ped.markerCount; j++)
+ if (engine.genotypes[i][j] > 3)
+ badMarkers.Add(ped.markerNames[j]);
+
+ if (badMarkers.Entries() > 0)
+ {
+ printf("\n\nFATAL ERROR:\n"
+ "This version of MaCH is designed for bi-allelic markers\n"
+ "However, the following marker(s) have >2 alleles:\n ");
+
+ int togo = badMarkers.Entries();
+ for (int i = 0, new_line = 3, lines = 0; i < badMarkers.Capacity(); i++)
+ if (badMarkers.SlotInUse(i))
+ {
+ if (new_line + badMarkers[i].Length() > 78)
+ printf("\n "), new_line = 3, lines++;
+
+ if (lines > 10 && togo > 5) break;
+
+ printf("%s ", (const char *) badMarkers[i]);
+ new_line += badMarkers[i].Length();
+ togo--;
+ }
+
+ if (togo) printf("\n%d additional markers not listed", togo);
+
+ printf("\n\nPlease remove or recode markers with more than 2 alleles\n\n");
+ return(-1);
+ }
+ }
+
+ // Copy phased haplotypes into haplotyping engine
+ engine.phased = haps.count / 2;
+
+ if (engine.readyForUse)
+ for (int i = 0; i < (haps.count & ~1); i++)
+ for (int j = 0; j < ped.markerCount; j++)
+ engine.haplotypes[ped.count * 2 + i][j] = haps.haplotypes[i][j] - 1;
+
+ if (engine.readyForUse == false || engine.ForceMemoryAllocation() == false)
+ return MemoryAllocationFailure();
+
+ if (positionsAvailable && engine.AllocateDistances())
+ {
+ for (int i = 1; i < ped.markerCount; i++)
+ engine.distances[i - 1] = ped.GetMarkerInfo(i)->position -
+ ped.GetMarkerInfo(i-1)->position;
+ }
+
+ engine.ShowMemoryInfo();
+
+ if (mle)
+ {
+ engine.ShowMLEMemoryInfo();
+ if (!engine.AllocateMLEMemory())
+ return MemoryAllocationFailure();
+ }
+
+ if (quickNPL && Pedigree::affectionCount)
+ {
+ engine.EstimateDiseaseMemoryInfo(ped.count, ped.markerCount, ped.affectionCount);
+
+ if (engine.AllocateDiseaseStatus(Pedigree::affectionCount))
+ {
+ for (int i = 0; i < ped.count; i++)
+ for (int j = 0; j < ped.affectionCount; j++)
+ engine.diseaseStatus[i][j] = ped[i].affections[j];
+ }
+ else
+ return MemoryAllocationFailure();
+ }
+ else
+ quickNPL = false;
+
+ ConsensusBuilder::EstimateMemoryInfo(rounds - burnin, ped.count * 2, ped.markerCount);
+ ConsensusBuilder consensus(rounds - burnin, ped.count * 2, ped.markerCount);
+
+ if (consensus.readyForUse == false)
+ return MemoryAllocationFailure();
+
+ DosageCalculator::storeDistribution = OutputManager::outputDosage ||
+ OutputManager::outputQuality ||
+ OutputManager::outputGenotypes ||
+ OutputManager::outputProbabilities ||
+ mask > 0.0;
+
+ DosageCalculator::EstimateMemoryInfo(rounds - burnin, ped.count, ped.markerCount);
+ DosageCalculator doses(rounds - burnin, ped.count, ped.markerCount);
+
+ if (doses.readyForUse == false)
+ return MemoryAllocationFailure();
+
+ if (weighted)
+ engine.CalculateWeights();
+
+ printf("Memory allocated successfully\n\n");
+
+ SetCrashExplanation("loading error rate and cross over maps");
+
+ engine.SetErrorRate(errorRate);
+
+ bool newline = false; // must start false: neither map file may have been requested
+ if (crossFile != "") newline = engine.LoadCrossoverRates(crossFile);
+ if (errorFile != "") newline |= engine.LoadErrorRates(errorFile);
+ if (newline) printf("\n");
+
+ SetCrashExplanation("searching for initial haplotype set");
+
+ engine.greedy = greedy;
+ engine.RandomSetup();
+ printf("Found initial haplotype set\n\n");
+
+ SetCrashExplanation("revving up haplotyping engine");
+
+ if (warmup)
+ {
+ engine.WarmUp(warmup, 5);
+
+ printf("Warmed up haplotyping engine ...\n\n");
+ }
+
+// ParseHaplotypes(engine.haplotypes, engine.individuals * 2 - 2, engine.markers, 32);
+
+// The cost calculator uses heuristics to try and find faster haplotyping
+// strategies -- however, these are not yet implemented!
+// CostCalculator blueSky;
+
+ SetCrashExplanation("interating through markov chain haplotyping procedure");
+
+ for (int i = 0; i < rounds; i++)
+ {
+ engine.LoopThroughChromosomes();
+
+ engine.UpdateThetas();
+ errorRate = engine.UpdateErrorRate();
+
+ printf("Markov Chain iteration %d [%d mosaic crossovers]\n",
+ i + 1, engine.TotalCrossovers() );
+
+ if (i < burnin)
+ continue;
+
+ if (OutputManager::outputHaplotypes)
+ consensus.Store(engine.haplotypes);
+
+ if (doses.storeDosage || doses.storeDistribution)
+ doses.Update(engine.haplotypes);
+
+ // blueSky.OptimizeCost(engine.haplotypes, engine.individuals * 2, engine.markers);
+
+ if (polling > 0 && ((i - burnin) % polling) == 0)
+ {
+ OutputManager::OutputConsensus(ped, consensus, doses, outfile + ".prelim" + (i + 1));
+ // Messages below spell the name the same way as the fopen() call: prefix.prelim<N>.<ext>
+ FILE * file = fopen(outfile + ".prelim" + (i + 1) + ".rec", "wt");
+
+ if (file == NULL)
+ printf("Error opening output file [%s.prelim%d.rec]\n", (const char *) outfile, i+1);
+ else
+ {
+ fprintf(file, "Interval AvgRate LastRate\n");
+ for (int j = 0; j < engine.markers - 1; j++)
+ fprintf(file, "%d-%d %.4f %.4f\n", j + 1, j + 2,
+ nthetas ? thetas[j] / nthetas : engine.thetas[j],
+ engine.thetas[j]);
+ fclose(file);
+
+ printf("Wrote out file [%s.prelim%d.rec] with mosaic crossover rates ...\n", (const char *) outfile, i+1);
+ }
+
+ file = fopen(outfile + ".prelim" + (i + 1) + ".erate", "wt");
+
+ if (file == NULL)
+ printf("Error opening output file [%s.prelim%d.erate]\n", (const char *) outfile, i+1);
+ else
+ {
+ fprintf(file, "Marker AvgRate LastRate\n");
+ for (int j = 0; j < engine.markers; j++)
+ fprintf(file, "%s %.4f %.4f\n", (const char *) ped.markerNames[j],
+ nerror_rates ? error_rates[j] / nerror_rates : engine.GetErrorRate(j),
+ engine.GetErrorRate(j));
+ fclose(file);
+
+ printf("Wrote out file [%s.prelim%d.erate] with per marker error rates ...\n\n",
+ (const char *) outfile, i+1);
+ }
+
+
+ }
+
+ if (samples > 0 && ((i - burnin) % samples) == 0)
+ OutputManager::WriteHaplotypes(outfile + ".sample" + (i + 1), ped, engine.haplotypes);
+
+ UpdateVector(engine.thetas, thetas, nthetas, engine.markers - 1);
+ UpdateErrorRates(engine.error_models, error_rates, nerror_rates, engine.markers);
+
+ engine.updateDiseaseScores = quickNPL;
+ }
+
+ if (rounds) printf("\n");
+
+ SetCrashExplanation("estimating maximum likelihood solution, conditional on current state");
+
+ if (mle)
+ {
+ // Use best available error and crossover rates for MLE
+ if (nerror_rates)
+ for (int i = 0; i < engine.markers; i++)
+ engine.SetErrorRate(i, error_rates[i] / nerror_rates);
+
+ if (nthetas)
+ for (int i = 0; i < engine.markers - 1; i++)
+ engine.thetas[i] = thetas[i] / nthetas;
+
+ engine.OutputMLEs(ped, outfile, mledetails);
+ }
+
+// ParseHaplotypes(engine.haplotypes, engine.individuals * 2 - 2, engine.markers, 32);
+
+ SetCrashExplanation("outputing solution");
+
+ // If we did multiple rounds of haplotyping, then generate consensus
+ if (rounds > 1)
+ OutputManager::OutputConsensus(ped, consensus, doses, outfile);
+ else
+ { // --uncompressed is stored in OutputManager::uncompressed, so test that flag here
+ if (OutputManager::uncompressed) {OutputManager::WriteHaplotypes(outfile, ped, engine.haplotypes);}
+ else {OutputManager::GzWriteHaplotypes(outfile + ".gz", ped, engine.haplotypes);}
+ }
+
+ if (doses.storeDosage || doses.storeDistribution)
+ doses.OutputMarkerInfo(outfile + ".info");
+
+ FILE * file = fopen(outfile + ".rec", "wt");
+
+ if (file == NULL)
+ printf("Error opening output file [%s.rec]\n", (const char *) outfile);
+ else
+ {
+ fprintf(file, "Interval AvgRate LastRate\n");
+ for (int i = 0; i < engine.markers - 1; i++)
+ fprintf(file, "%d-%d %.4f %.4f\n", i + 1, i + 2,
+ nthetas ? thetas[i] / nthetas : engine.thetas[i],
+ engine.thetas[i]);
+ fclose(file);
+
+ if (thetas != NULL) delete [] thetas;
+
+ printf("Wrote out file [%s.rec] with mosaic crossover rates ...\n", (const char *) outfile);
+ }
+
+ file = fopen(outfile + ".erate", "wt");
+
+ if (file == NULL)
+ printf("Error opening output file [%s.erate]\n", (const char *) outfile);
+ else
+ {
+ fprintf(file, "Marker AvgRate LastRate\n");
+ for (int i = 0; i < engine.markers; i++)
+ fprintf(file, "%s %.4f %.4f\n", (const char *) ped.markerNames[i],
+ nerror_rates ? error_rates[i] / nerror_rates : engine.GetErrorRate(i),
+ engine.GetErrorRate(i));
+ fclose(file);
+
+ if (error_rates != NULL) delete [] error_rates;
+
+ printf("Wrote out file [%s.erate] with per marker error rates ...\n\n",
+ (const char *) outfile);
+ }
+
+ if (quickNPL && rounds)
+ AssociationAnalysis::ScoreNPL(outfile, ped, engine, rounds - burnin);
+
+ if (association && rounds)
+ AssociationAnalysis::ScoreMarkers(outfile, ped, doses);
+
+ if (mask > 0.0 && rounds)
+ EvaluateHaplotypes(doses, ped, engine.genotypes);
+
+ printf("Estimated mismatch rate in Markov model is: %.5f\n\n", errorRate);
+
+ endt = clock();
+ int lapsetime = (int) ((double)(endt - startt) / CLOCKS_PER_SEC);
+ printf("Analysis took %d seconds\n\n", lapsetime);
+ return 0;
+ }
+
+
+
diff --git a/mach1/Manners.cpp b/mach1/Manners.cpp
new file mode 100644
index 0000000..1b0ebbb
--- /dev/null
+++ b/mach1/Manners.cpp
@@ -0,0 +1,81 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/Manners.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Manners.h"
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Description of the step currently being executed. OutOfMemory() appends it
+// to the crash report whenever it is not NULL.
+static const char * machCrashExplanation = NULL;
+
+// Registers specialMessage as the description of the current step. Only the
+// pointer is stored (no copy is made), so the string has to stay valid until
+// it is replaced or cleared.
+void SetCrashExplanation(const char * specialMessage)
+   {
+   machCrashExplanation = specialMessage;
+   }
+
+// Clears the description registered through SetCrashExplanation(), so that a
+// later crash report carries no step-specific line.
+void UnsetCrashExplanation()
+   {
+   SetCrashExplanation(NULL);
+   }
+
+// Installs the process-wide signal handlers: SIGINT is routed to UserBreak(),
+// SIGSEGV and SIGABRT are routed to OutOfMemory().
+void SetupCrashHandlers()
+   {
+   // An interrupt from the keyboard only needs a short notice ...
+   signal(SIGINT, UserBreak);
+
+   // ... while faults and aborts get the full crash report
+   const int fatalSignals[] = { SIGSEGV, SIGABRT };
+
+   for (unsigned int i = 0; i < sizeof(fatalSignals) / sizeof(fatalSignals[0]); i++)
+      signal(fatalSignals[i], OutOfMemory);
+   }
+
+// Signal handler installed for SIGSEGV and SIGABRT by SetupCrashHandlers().
+// Prints the crash banner with memory-saving hints, adds the step registered
+// through SetCrashExplanation() when there is one, and exits with
+// EXIT_FAILURE. The signal number is ignored.
+//
+// NOTE(review): printf() and exit() are not async-signal-safe, so this report
+// is best effort only.
+void OutOfMemory(int)
+   {
+   static const char crashReport[] =
+      "\n\nMACH 1.0 HAS CRASHED\n\n"
+      "The operating system has decided to terminate this run,\n"
+      "of the Markov Chain Haplotyper (MACH). Either MACH 1.0\n"
+      "requested too much memory or you have encountered a bug.\n\n"
+      "There are a number of ways to limit the amount of memory\n"
+      "used by MACH. Here are some suggestions that may help:\n\n"
+      " --compact: if there are many markers to haplotype, this\n"
+      " option will significantly reduce the amount of\n"
+      " memory used by the haplotyping engine.\n\n"
+      " --greedy: if you are using haplotypes from a reference\n"
+      " sample to infer missing genotypes or haplotype\n"
+      " your own sample, the --greedy option can\n"
+      " dramatically reduce memory use.\n\n"
+      " --mle: if you are using haplotypes from a reference\n"
+      " sample to infer missing genotypes in your sample\n"
+      " this option will reduce the amount of memory used\n"
+      " by the consensus builder\n\n"
+      "If you don't think this is a memory issue, you can help\n"
+      "improve this program by reporting bugs via a short e-mail\n"
+      "to goncalo at umich.edu. These e-mails are most helpful if you\n"
+      "include a description of the problem and example of how it can\n"
+      "be reproduced.\n\n";
+
+   printf("%s", crashReport);
+
+   // Mention what the program was doing, when somebody told us
+   if (machCrashExplanation)
+      printf("MACH 1.0 crashed while %s\n", machCrashExplanation);
+
+   exit(EXIT_FAILURE);
+   }
+
+// Signal handler installed for SIGINT by SetupCrashHandlers(): reports that
+// the user stopped the run and exits with EXIT_FAILURE. The signal number is
+// ignored.
+void UserBreak(int)
+   {
+   fputs("\n\nMACH 1.0 STOPPED BY USER\n\n", stdout);
+
+   exit(EXIT_FAILURE);
+   }
+
+
diff --git a/mach1/Manners.h b/mach1/Manners.h
new file mode 100644
index 0000000..411216e
--- /dev/null
+++ b/mach1/Manners.h
@@ -0,0 +1,36 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/Manners.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MANNERS_H__
+#define __MANNERS_H__
+
+// This function sets up signal handlers to ensure
+// MaCH terminates with an explanatory message when
+// something goes wrong or the user interrupts the run
+
+void SetupCrashHandlers();
+
+// Signature shared by the handlers below
+typedef void (*signal_handler)(int);
+
+// Handlers installed by SetupCrashHandlers(); both print a message
+// and exit with EXIT_FAILURE
+void OutOfMemory(int);
+void UserBreak(int);
+
+// Register / clear a description of the current step, which is
+// appended to the crash message. The string is not copied.
+void SetCrashExplanation(const char * specialMessage);
+void UnsetCrashExplanation();
+
+#endif
+
diff --git a/mach1/MergeHaplotypes.cpp b/mach1/MergeHaplotypes.cpp
new file mode 100644
index 0000000..d8e1fe4
--- /dev/null
+++ b/mach1/MergeHaplotypes.cpp
@@ -0,0 +1,205 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/MergeHaplotypes.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "MergeHaplotypes.h"
+#include "MemoryAllocators.h"
+#include "OutputHandlers.h"
+#include "MemoryInfo.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Constants used for packing one allele per bit: EIGHT is the number of bits
+// in a char and SEVEN is the index of its highest bit. They follow CHAR_BIT
+// when <limits.h> defines it and fall back to an 8-bit char otherwise.
+#ifdef CHAR_BIT
+#define SEVEN (CHAR_BIT - 1)
+#define EIGHT (CHAR_BIT)
+#else
+#define SEVEN 7
+#define EIGHT 8
+#endif
+
+// Prepares storage for N sampled configurations of H haplotypes spanning M
+// markers.
+//
+// When OutputManager::outputHaplotypes is false nothing is allocated: all
+// dimensions are zero, both buffers are NULL and the object reports itself
+// ready for use. Otherwise the sampled haplotypes are reserved with
+// (M + SEVEN) / EIGHT chars per haplotype (Store() packs one allele per bit)
+// and the consensus with one char per allele; readyForUse is true only when
+// both allocations returned a non-NULL pointer.
+ConsensusBuilder::ConsensusBuilder(int N, int H, int M)
+   {
+   // The quality scores are otherwise assigned only by Merge(), and only
+   // when something was stored; give them a defined value from the start
+   flips = errors = 0;
+
+   stored = 0;
+
+   if (OutputManager::outputHaplotypes == false)
+      {
+      samples = haplotypes = markers = 0;
+
+      sampledHaplotypes = NULL;
+      consensus = NULL;
+
+      readyForUse = true;
+
+      return;
+      }
+
+   samples = N;
+   haplotypes = H;
+   markers = M;
+
+   sampledHaplotypes = AllocateCharCube(samples, haplotypes, (markers + SEVEN) / EIGHT );
+   consensus = AllocateCharMatrix(haplotypes, markers);
+
+   readyForUse = consensus != NULL && sampledHaplotypes != NULL;
+   }
+
+// Releases whichever of the two buffers were allocated by the constructor.
+ConsensusBuilder::~ConsensusBuilder()
+   {
+   if (consensus != NULL)
+      FreeCharMatrix(consensus, haplotypes);
+
+   if (sampledHaplotypes != NULL)
+      FreeCharCube(sampledHaplotypes, samples, haplotypes);
+   }
+
+// Packs one sampled configuration into the next free slot, one allele per
+// bit: for haplotype i, bit (j % EIGHT) of byte (j / EIGHT) is set when
+// newHaplotypes[i][j] is non-zero and cleared otherwise.
+//
+// The call is ignored when no storage was allocated, or when all `samples`
+// slots have already been filled.
+void ConsensusBuilder::Store(char ** newHaplotypes)
+   {
+   if (sampledHaplotypes == NULL) return;
+
+   // Only `samples` slots were reserved by the constructor; one more write
+   // would run past the end of the cube
+   if (stored >= samples) return;
+
+   for (int i = 0; i < haplotypes; i++)
+      {
+      int byte = 0, mask = 1;
+
+      for (int j = 0; j < markers; j++)
+         {
+         if (newHaplotypes[i][j])
+            sampledHaplotypes[stored][i][byte] |= mask;
+         else
+            sampledHaplotypes[stored][i][byte] &= ~mask;
+
+         // Move to the next bit, wrapping to the first bit of the next byte
+         mask = (mask == (1 << SEVEN)) ? (byte++, 1) : mask * 2;
+         }
+      }
+
+   stored++;
+   }
+
+// Builds the consensus haplotypes from the configurations collected by
+// Store() and fills in the flips and errors quality scores.
+//
+// Haplotypes are handled in consecutive pairs (h, h + 1), one pair at a time.
+// For every stored sample a phase flag records which member of the pair is
+// read as the "first" allele; at each marker the most frequent ordered
+// genotype across samples is written to consensus[h] and consensus[h + 1].
+// On return flips and errors hold the accumulated counts divided by the
+// number of stored samples. Nothing happens when no sample was stored.
+//
+// NOTE(review): the pairing assumes an even number of haplotypes - confirm
+// against callers.
+void ConsensusBuilder::Merge()
+ {
+ // Don't try to build consensus with no data
+ if (stored == 0) return;
+
+ // Initialize haplotype quality scores
+ flips = errors = 0;
+
+ // The phase for each pair of haplotypes indicates their ordering
+ // in relation to the consensus (0 keeps the stored order, 1 swaps it)
+ char * phase = new char [stored];
+
+ // Loop through each pair of haplotypes in the set
+ for (int h = 0; h < haplotypes; h += 2)
+ {
+ // Select phase based on the first heterozygous position for each sample,
+ // so that every sample reads as (0, 1) at its own first heterozygous marker
+ for (int i = 0; i < stored; i++)
+ {
+ phase[i] = 0;
+
+ for (int j = 0, byte = 0, mask = 1; j < markers; j++)
+ {
+ if ((sampledHaplotypes[i][h][byte] ^ sampledHaplotypes[i][h + 1][byte]) & mask)
+ {
+ phase[i] = (sampledHaplotypes[i][h][byte] & mask) != 0;
+ break;
+ }
+
+ // Advance to the next bit, moving on to the next byte after the top bit
+ mask = (mask == (1 << SEVEN)) ? (byte++, 1) : mask * 2;
+ }
+ }
+
+ // Build consensus one position at a time ...
+ for (int i = 0, byte = 0, mask = 1; i < markers; i++)
+ {
+ // Indexed by allele1 * 2 + allele2: 0 and 3 are homozygous,
+ // 1 and 2 are the two orderings of the heterozygote
+ int counts[4] = {0, 0, 0, 0};
+
+ // Count the number of occurrences of each ordered genotype
+ for (int j = 0; j < stored; j++)
+ {
+ int allele1 = (sampledHaplotypes[j][h + phase[j]][byte] & mask) != 0;
+ int allele2 = (sampledHaplotypes[j][h + (phase[j] ^ 1)][byte] & mask) != 0;
+
+ counts[allele1 * 2 + allele2]++;
+ }
+
+ // Record the expected number of copies of the common allele (disabled)
+ // dosage[h / 2][i] = (short)((counts[1] + counts[2] + 2 * counts[3]) * scale + 0.5);
+
+ // Select the most frequent genotype (ties keep the lowest index)
+ int best = 0;
+
+ for (int j = 1; j < 4; j++)
+ if (counts[j] > counts[best])
+ best = j;
+
+ // Assign it to the consensus
+ consensus[h][i] = best / 2;
+ consensus[h + 1][i] = best % 2;
+
+ // Count the number of samples with an alternative solution: everything
+ // else when a homozygote won, only the homozygotes when a heterozygote won
+ int alternative_solution_weight = (best == 0 || best == 3) ?
+ (stored - counts[best]) : (counts[0] + counts[3]);
+
+ // Update estimated haplotype quality scores
+ errors += alternative_solution_weight;
+
+ // If a heterozygous genotype was selected, update the phase for other informative
+ // haplotypes (those carrying the opposite ordering, best ^ 3)
+ if (best != 0 && best != 3 && counts[best ^ 3] > 0)
+ {
+ // Update estimated flip scores
+ flips += counts[best ^ 3];
+
+ best = best ^ 3;
+
+ for (int j = 0; j < stored; j++)
+ {
+ int allele1 = (sampledHaplotypes[j][h + phase[j]][byte] & mask) != 0;
+ int allele2 = (sampledHaplotypes[j][h + (phase[j] ^ 1)][byte] & mask) != 0;
+
+ if ((allele1 * 2 + allele2) == best)
+ phase[j] = phase[j] ^ 1;
+ }
+ }
+
+ // Advance to the next bit, moving on to the next byte after the top bit
+ mask = (mask == (1 << SEVEN)) ? (byte++, 1) : mask * 2;
+ }
+ }
+
+ delete [] phase;
+
+ // Turn the accumulated counts into averages per stored sample
+ flips /= stored;
+ errors /= stored;
+ }
+
+// Prints the memory estimate for this builder's own dimensions.
+void ConsensusBuilder::ShowMemoryInfo()
+   {
+   ConsensusBuilder::EstimateMemoryInfo(samples, haplotypes, markers);
+   }
+
+// Prints an estimate of the memory a builder with the given dimensions needs.
+// Nothing is printed when haplotype output is disabled, since no storage is
+// allocated in that case.
+void ConsensusBuilder::EstimateMemoryInfo(int Samples, int Haplotypes, int Markers)
+   {
+   if (!OutputManager::outputHaplotypes)
+      return;
+
+   // Bit-packed sampled haplotypes ...
+   double sampledBytes = sizeof(char) * (double) Samples * Haplotypes * (Markers + SEVEN) / EIGHT;
+
+   // ... plus one char per allele for the consensus
+   double consensusBytes = sizeof(char) * (double) Haplotypes * Markers;
+
+   double bytes = sampledBytes + consensusBytes;
+
+   printf(" %40s %s\n", "Consensus builder ...", (const char *) MemoryInfo(bytes));
+   }
+
diff --git a/mach1/MergeHaplotypes.h b/mach1/MergeHaplotypes.h
new file mode 100644
index 0000000..84d99d4
--- /dev/null
+++ b/mach1/MergeHaplotypes.h
@@ -0,0 +1,56 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/MergeHaplotypes.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __MERGE_HAPLOTYPES__
+#define __MERGE_HAPLOTYPES__
+
+// Collects haplotype configurations sampled during the Markov chain and
+// merges them into a single consensus set of haplotypes.
+class ConsensusBuilder
+ {
+ public:
+ // Number of configurations recorded by Store() so far
+ int stored;
+
+ // Dimensions of the storage: sample slots, haplotypes per sample and
+ // markers per haplotype (all zero when haplotype output is disabled)
+ int samples;
+ int haplotypes;
+ int markers;
+
+ // sampledHaplotypes[sample][haplotype][byte] holds the stored samples
+ // packed one allele per bit; consensus[haplotype][marker] holds the
+ // result of Merge(). Both are NULL when nothing was allocated.
+ char *** sampledHaplotypes;
+ char ** consensus;
+
+ // False when the constructor failed to allocate its buffers
+ bool readyForUse;
+
+ ConsensusBuilder(int samples, int haplotypes, int markers);
+ ~ConsensusBuilder();
+
+ // Store() records one sampled configuration, Merge() builds the consensus
+ void Store(char ** newHaplotypes);
+ void Merge();
+
+ // Quality scores for estimated haplotypes
+ float flips;
+ float errors;
+
+ // Report memory usage
+ void ShowMemoryInfo();
+ static void EstimateMemoryInfo(int Samples, int Haplotypes, int Markers);
+
+ private:
+ };
+
+
+#endif
+
+
+
diff --git a/mach1/OutputHandlers.cpp b/mach1/OutputHandlers.cpp
new file mode 100644
index 0000000..78b5750
--- /dev/null
+++ b/mach1/OutputHandlers.cpp
@@ -0,0 +1,364 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/OutputHandlers.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "OutputHandlers.h"
+
+// Output selection flags; everything is switched off by default.
+// OutputConsensus() writes one file per enabled flag, and `uncompressed`
+// selects the plain Write* routines instead of the GzWrite* ones.
+bool OutputManager::outputHaplotypes = false;
+bool OutputManager::outputGenotypes = false;
+bool OutputManager::outputDosage = false;
+bool OutputManager::outputProbabilities = false;
+bool OutputManager::outputQuality = false;
+bool OutputManager::uncompressed = false;
+
+// Writes one line per chromosome to a file opened through ifopen(): family
+// and person ids, a HAPLO1 / HAPLO2 tag, then one allele label per marker
+// (label number haplotypes[h][m] + 1). Labels are upper case where the
+// person's genotype at that marker reports isKnown() and lower case
+// otherwise.
+//
+// Does nothing unless haplotype output is enabled. Prints a message and
+// exits with status 1 when the file cannot be opened.
+void OutputManager::GzWriteHaplotypes(const char * outfile, Pedigree & ped, char ** haplotypes)
+   {
+   if (OutputManager::outputHaplotypes == false)
+      return;
+
+   IFILE file = ifopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int h = 0; h < ped.count * 2; h++)
+      {
+      // Two consecutive chromosomes belong to the same person
+      int person = h / 2;
+      int copy = h % 2 + 1;
+
+      ifprintf(file, "%s->%s HAPLO%d ",
+               (const char *) ped[person].famid,
+               (const char *) ped[person].pid, copy);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         if (ped[person].markers[m].isKnown())
+            ifprintf(file, "%s", (const char *) ped.GetMarkerInfo(m)->GetAlleleLabel(haplotypes[h][m] + 1).ToUpper());
+         else
+            ifprintf(file, "%s", (const char *) ped.GetMarkerInfo(m)->GetAlleleLabel(haplotypes[h][m] + 1).ToLower());
+         }
+
+      ifprintf(file, "\n");
+      }
+
+   ifclose(file);
+
+   printf("Wrote out file [%s] with phased chromosomes ...\n", outfile);
+   }
+
+// Plain-text counterpart of GzWriteHaplotypes(): writes one line per
+// chromosome with family and person ids, a HAPLO1 / HAPLO2 tag and one allele
+// label per marker (label number haplotypes[h][m] + 1).
+//
+// Unlike the IFILE version, every label is written in lower case, whether or
+// not the genotype was observed.
+//
+// Does nothing unless haplotype output is enabled. Prints a message and
+// exits with status 1 when the file cannot be opened.
+void OutputManager::WriteHaplotypes(const char * outfile, Pedigree & ped, char ** haplotypes)
+   {
+   if (OutputManager::outputHaplotypes == false)
+      return;
+
+   FILE * file = fopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int h = 0; h < ped.count * 2; h++)
+      {
+      // Two consecutive chromosomes belong to the same person
+      int person = h / 2;
+      int copy = h % 2 + 1;
+
+      fprintf(file, "%s->%s HAPLO%d ",
+              (const char *) ped[person].famid,
+              (const char *) ped[person].pid, copy);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         fprintf(file, "%s", (const char *) ped.GetMarkerInfo(m)->GetAlleleLabel(haplotypes[h][m] + 1).ToLower());
+         }
+
+      fprintf(file, "\n");
+      }
+
+   fclose(file);
+
+   printf("Wrote out file [%s] with phased chromosomes ...\n", outfile);
+   }
+
+
+// Merges the stored samples into a consensus, reports its quality scores when
+// any sample was stored, and then writes every output selected by the
+// output* flags. File names are derived from filename by appending a suffix
+// per output (.geno, .qc, .dose, .prob), plus .gz when the IFILE writers are
+// used.
+void OutputManager::OutputConsensus(Pedigree & ped, ConsensusBuilder & consensus,
+                                    DosageCalculator & doses, String filename)
+   {
+   consensus.Merge();
+
+   if (consensus.stored)
+      printf("Merged sampled haplotypes to generate consensus\n"
+             " Consensus estimated to have %.1f errors in missing genotypes and %.1f flips in haplotypes\n\n",
+             consensus.errors, consensus.flips);
+
+   if (!uncompressed)
+      {
+      if (outputHaplotypes) GzWriteHaplotypes(filename + ".gz", ped, consensus.consensus);
+      if (outputGenotypes) GzWriteGenotypes(filename + ".geno.gz", ped, doses);
+      if (outputQuality) GzWriteQuality(filename + ".qc.gz", ped, doses);
+      if (outputDosage) GzWriteDosages(filename + ".dose.gz", ped, doses);
+      if (outputProbabilities) GzWriteProbabilities(filename + ".prob.gz", ped, doses);
+
+      return;
+      }
+
+   // Plain text output was requested
+   if (outputHaplotypes) WriteHaplotypes(filename, ped, consensus.consensus);
+   if (outputGenotypes) WriteGenotypes(filename + ".geno", ped, doses);
+   if (outputQuality) WriteQuality(filename + ".qc", ped, doses);
+   if (outputDosage) WriteDosages(filename + ".dose", ped, doses);
+   if (outputProbabilities) WriteProbabilities(filename + ".prob", ped, doses);
+   }
+
+// Writes one line per person to a file opened through ifopen(): family and
+// person ids, a GENO tag, then the best genotype at each marker as a pair of
+// allele labels. The best-genotype codes 0, 1 and 2 map to label numbers
+// 1/1, 1/2 and 2/2.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::GzWriteGenotypes(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   IFILE file = ifopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      ifprintf(file, "%s->%s GENO ", (const char *) ped[person].famid,
+               (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         int best = doses.GetBestGenotype(person, m);
+
+         MarkerInfo * info = ped.GetMarkerInfo(m);
+
+         // Label numbers of the two alleles making up the genotype
+         int first = (best + 2) / 2;
+         int second = (best + 3) / 2;
+
+         ifprintf(file, " %s/%s", (const char *) info->GetAlleleLabel(first),
+                  (const char *) info->GetAlleleLabel(second));
+         }
+
+      ifprintf(file, "\n");
+      }
+
+   ifclose(file);
+
+   printf("Wrote out file [%s] with imputed genotypes ...\n", outfile);
+   }
+
+// Plain-text counterpart of GzWriteGenotypes(): writes one line per person
+// with family and person ids, a GENO tag, then the best genotype at each
+// marker as a pair of allele labels. The best-genotype codes 0, 1 and 2 map
+// to label numbers 1/1, 1/2 and 2/2.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::WriteGenotypes(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   FILE * file = fopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      fprintf(file, "%s->%s GENO ", (const char *) ped[person].famid,
+              (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         int best = doses.GetBestGenotype(person, m);
+
+         MarkerInfo * info = ped.GetMarkerInfo(m);
+
+         // Label numbers of the two alleles making up the genotype
+         int first = (best + 2) / 2;
+         int second = (best + 3) / 2;
+
+         fprintf(file, " %s/%s", (const char *) info->GetAlleleLabel(first),
+                 (const char *) info->GetAlleleLabel(second));
+         }
+
+      fprintf(file, "\n");
+      }
+
+   fclose(file);
+
+   printf("Wrote out file [%s] with imputed genotypes ...\n", outfile);
+   }
+
+
+// Writes one line per person to a file opened through ifopen(): family and
+// person ids, an ALLELE_DOSE tag, then doses.GetDosage() for each marker
+// printed with three decimals.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::GzWriteDosages(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   IFILE file = ifopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      ifprintf(file, "%s->%s ALLELE_DOSE ", (const char *) ped[person].famid,
+               (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         ifprintf(file, "%.3f ", doses.GetDosage(person, m));
+         }
+
+      ifprintf(file, "\n");
+      }
+
+   ifclose(file);
+
+   printf("Wrote out file [%s] with dosage information...\n", outfile);
+   }
+
+// Plain-text counterpart of GzWriteDosages(): writes one line per person with
+// family and person ids, an ALLELE_DOSE tag, then doses.GetDosage() for each
+// marker printed with three decimals.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::WriteDosages(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   FILE * file = fopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      fprintf(file, "%s->%s ALLELE_DOSE ", (const char *) ped[person].famid,
+              (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         fprintf(file, "%.3f ", doses.GetDosage(person, m));
+         }
+
+      fprintf(file, "\n");
+      }
+
+   fclose(file);
+
+   printf("Wrote out file [%s] with dosage information...\n", outfile);
+   }
+
+
+// Writes one line per person to a file opened through ifopen(): family and
+// person ids, a GENO_QUALITY tag, then doses.GetQuality() for each marker
+// printed with three decimals.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::GzWriteQuality(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   IFILE file = ifopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      ifprintf(file, "%s->%s GENO_QUALITY ", (const char *) ped[person].famid,
+               (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         ifprintf(file, "%.3f ", doses.GetQuality(person, m));
+         }
+
+      ifprintf(file, "\n");
+      }
+
+   ifclose(file);
+
+   printf("Wrote out file [%s] with quality scores...\n", outfile);
+   }
+
+// Plain-text counterpart of GzWriteQuality(): writes one line per person with
+// family and person ids, a GENO_QUALITY tag, then doses.GetQuality() for each
+// marker printed with three decimals.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::WriteQuality(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   FILE * file = fopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      fprintf(file, "%s->%s GENO_QUALITY ", (const char *) ped[person].famid,
+              (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         fprintf(file, "%.3f ", doses.GetQuality(person, m));
+         }
+
+      fprintf(file, "\n");
+      }
+
+   fclose(file);
+
+   printf("Wrote out file [%s] with quality scores...\n", outfile);
+   }
+
+// Writes one line per person to a file opened through ifopen(): family and
+// person ids, a GENO_PROBS tag, then two numbers per marker. The numbers are
+// the first two of the three counts returned by doses.GetCounts(), each
+// divided by the sum of all three; the third fraction is not written.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::GzWriteProbabilities(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   IFILE file = ifopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      ifprintf(file, "%s->%s GENO_PROBS", (const char *) ped[person].famid,
+               (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         unsigned int n0, n1, n2;
+
+         doses.GetCounts(person, m, n0, n1, n2);
+
+         // The tiny offset keeps the division defined when all counts are zero
+         double total = n0 + n1 + n2 + 1e-30;
+
+         ifprintf(file, " %.3f %.3f", n0 / total, n1 / total);
+         }
+
+      ifprintf(file, "\n");
+      }
+
+   ifclose(file);
+
+   printf("Wrote out file [%s] with genotype probabilities...\n", outfile);
+   }
+
+
+// Plain-text counterpart of GzWriteProbabilities(): writes one line per
+// person with family and person ids, a GENO_PROBS tag, then two numbers per
+// marker. The numbers are the first two of the three counts returned by
+// doses.GetCounts(), each divided by the sum of all three; the third fraction
+// is not written.
+//
+// Prints a message and exits with status 1 when the file cannot be opened.
+void OutputManager::WriteProbabilities(const char * outfile, Pedigree & ped, DosageCalculator & doses)
+   {
+   FILE * file = fopen(outfile, "wt");
+
+   if (file == NULL)
+      {
+      printf("Error opening output file [%s]\n", outfile);
+      exit(1);
+      }
+
+   for (int person = 0; person < ped.count; person++)
+      {
+      fprintf(file, "%s->%s GENO_PROBS", (const char *) ped[person].famid,
+              (const char *) ped[person].pid);
+
+      for (int m = 0; m < ped.markerCount; m++)
+         {
+         unsigned int n0, n1, n2;
+
+         doses.GetCounts(person, m, n0, n1, n2);
+
+         // The tiny offset keeps the division defined when all counts are zero
+         double total = n0 + n1 + n2 + 1e-30;
+
+         fprintf(file, " %.3f %.3f", n0 / total, n1 / total);
+         }
+
+      fprintf(file, "\n");
+      }
+
+   fclose(file);
+
+   printf("Wrote out file [%s] with genotype probabilities...\n", outfile);
+   }
+
+
+
diff --git a/mach1/OutputHandlers.h b/mach1/OutputHandlers.h
new file mode 100644
index 0000000..2ea615a
--- /dev/null
+++ b/mach1/OutputHandlers.h
@@ -0,0 +1,56 @@
+//////////////////////////////////////////////////////////////////////
+// mach1/OutputHandlers.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __OUTPUTHANDLERS_H__
+#define __OUTPUTHANDLERS_H__
+
+#include "MergeHaplotypes.h"
+#include "DosageCalculator.h"
+#include "Pedigree.h"
+#include "InputFile.h"
+
+// Static helpers that write haplotypes, genotypes, dosages, quality scores
+// and genotype probabilities to disk, plus the flags selecting which of
+// these outputs are produced.
+class OutputManager
+ {
+ public:
+ // Writers that go through the IFILE interface (ifopen / ifprintf / ifclose)
+ // NOTE(review): the overloads taking `short ** matrix` have no definition
+ // in OutputHandlers.cpp - confirm whether they are still needed
+ static void GzWriteHaplotypes(const char * outfile, Pedigree & ped, char ** haplotypes);
+ static void GzWriteQuality(const char * outfile, Pedigree & ped, short ** matrix);
+ static void GzWriteGenotypes(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+ static void GzWriteDosages(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+ static void GzWriteProbabilities(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+ static void GzWriteQuality(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+
+ // Writers that produce plain text through stdio
+ static void WriteHaplotypes(const char * outfile, Pedigree & ped, char ** haplotypes);
+ static void WriteQuality(const char * outfile, Pedigree & ped, short ** matrix);
+ static void WriteGenotypes(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+ static void WriteDosages(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+ static void WriteProbabilities(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+ static void WriteQuality(const char * outfile, Pedigree & ped, DosageCalculator & doses);
+
+ // Merges the sampled haplotypes and writes every output that is enabled
+ static void OutputConsensus(Pedigree & ped, ConsensusBuilder & consensus, DosageCalculator & doses, String filename);
+
+ // Output selection flags; `uncompressed` picks the plain text writers
+ static bool outputHaplotypes;
+ static bool outputGenotypes;
+ static bool outputDosage;
+ static bool outputProbabilities;
+ static bool outputQuality;
+ static bool uncompressed;
+ };
+
+
+#endif
+
+
diff --git a/thunder/Main.cpp b/thunder/Main.cpp
new file mode 100644
index 0000000..d684de9
--- /dev/null
+++ b/thunder/Main.cpp
@@ -0,0 +1,470 @@
+//////////////////////////////////////////////////////////////////////
+// thunder/Main.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "ShotgunHaplotyper.h"
+#include "ShotgunManners.h"
+#include "OutputHandlers.h"
+#include "MergeHaplotypes.h"
+#include "HaplotypeLoader.h"
+#include "Parameters.h"
+#include "InputFile.h"
+#include "Error.h"
+#include <ctime>
+
+// Accumulator array and update count for crossover rates; the array stays
+// NULL until it is first allocated.
+// NOTE(review): presumably maintained through UpdateVector() - the calls
+// are outside this view, confirm.
+float * thetas = NULL;
+int nthetas = 0;
+
+// Accumulator array and update count for per-marker error rates.
+// NOTE(review): presumably maintained through UpdateErrorRates() - confirm.
+float * error_rates = NULL;
+int nerror_rates = 0;
+
+// Adds current[0 .. length - 1] to a running element-wise sum and counts the
+// call in n. When n is zero on entry, vector is allocated with new [] and
+// initialized with a copy of current; on later calls current is added to the
+// existing array. This function never releases vector.
+void UpdateVector(float * current, float * & vector, int & n, int length)
+   {
+   bool firstUpdate = (n == 0);
+
+   n++;
+
+   if (!firstUpdate)
+      {
+      for (int k = 0; k < length; k++)
+         vector[k] += current[k];
+
+      return;
+      }
+
+   // First update: start the sum from a copy of the current values
+   vector = new float[length];
+
+   for (int k = 0; k < length; k++)
+      vector[k] = current[k];
+   }
+
+// Same accumulation as UpdateVector(), but the values come from the rate
+// field of each Errors entry. When n is zero on entry, vector is allocated
+// with new [] and initialized from current[k].rate; on later calls the rates
+// are added to the existing array. n is incremented on every call.
+void UpdateErrorRates(Errors * current, float * & vector, int & n, int length)
+   {
+   bool firstUpdate = (n == 0);
+
+   n++;
+
+   if (!firstUpdate)
+      {
+      for (int k = 0; k < length; k++)
+         vector[k] += current[k].rate;
+
+      return;
+      }
+
+   // First update: start the sum from a copy of the current rates
+   vector = new float[length];
+
+   for (int k = 0; k < length; k++)
+      vector[k] = current[k].rate;
+   }
+
+// Reports an allocation failure on stdout and returns -1, so callers can
+// write `return MemoryAllocationFailure();`.
+int MemoryAllocationFailure()
+   {
+   const int failureCode = -1;
+
+   printf("FATAL ERROR - Memory allocation failed\n");
+
+   return failureCode;
+   }
+
+// Reads the list of polymorphic sites from filename. Every line with at least
+// three tokens names a site followed by its two allele labels; shorter lines
+// are skipped.
+//
+// Stops through error() when a site's marker id differs from the marker
+// count taken just before the lookup (reported as a duplicated site), when
+// the two labels do not load as alleles 1 and 2, or when no site was loaded
+// at all.
+void LoadPolymorphicSites(const String & filename)
+   {
+   StringArray lines, fields;
+
+   lines.Read(filename);
+
+   for (int line = 0; line < lines.Length(); line++)
+      {
+      fields.ReplaceTokens(lines[line]);
+
+      if (fields.Length() < 3) continue;
+
+      // NOTE(review): presumably GetMarkerID() registers unseen names, so a
+      // new site receives the previous marker count as its id - confirm
+      int expectedId = Pedigree::markerCount;
+      int siteId = Pedigree::GetMarkerID(fields[0]);
+      int firstAllele = Pedigree::LoadAllele(siteId, fields[1]);
+      int secondAllele = Pedigree::LoadAllele(siteId, fields[2]);
+
+      if (siteId != expectedId)
+         error("Each polymorphic site should only occur once, but site %s is duplicated\n",
+               (const char *) fields[0]);
+
+      if (!(firstAllele == 1 && secondAllele == 2))
+         error("Allele labels '%s' and '%s' for polymorphic site '%s' are not valid\n",
+               (const char *) fields[1], (const char *) fields[2], (const char *) fields[0]);
+      }
+
+   if (Pedigree::markerCount == 0)
+      error("No information on polymorphic sites available,\n"
+            "please check you provided correct filename.\n");
+   }
+
+// First pass over the read count file: adds one person to ped for every line
+// with at least five tokens, then sorts the pedigree. Shorter lines are
+// skipped. Stops through error() when the file cannot be opened.
+//
+// The first five columns are the family id, the individual id, the two
+// parent ids and the sex code; read counts only start in the sixth column
+// (see LoadShotgunResults()).
+void LoadShotgunSamples(Pedigree & ped, const String & filename)
+   {
+   // Passed to TranslateSexCode(); its value is not examined here
+   bool fail = false;
+
+   String input;
+   StringArray tokens;
+
+   IFILE f = ifopen(filename, "rb");
+
+   if (f == NULL)
+      error("Failed to open file with read count data,\n"
+            "please check you provided correct filename.\n");
+
+   while (!ifeof(f))
+      {
+      input.ReadLine(f);
+      tokens.ReplaceTokens(input);
+
+      if (tokens.Length() < 5) continue;
+
+      // The sex code is the fifth column, index 4. Index 5 is the first read
+      // count and is not even present on a line with exactly five tokens.
+      ped.AddPerson(tokens[0], tokens[1], tokens[2], tokens[3],
+                    ped.TranslateSexCode(tokens[4], fail), true);
+      }
+
+   ifclose(f);
+
+   ped.Sort();
+   }
+
+// Second pass over the read count file: fills genotypes[person][marker] with
+// the two read counts of each marker packed into one char, the first count
+// in the low four bits and the second in the high four bits. Counts are
+// limited to the range 0 .. 15 before packing.
+//
+// Empty lines are skipped. Every other line must hold exactly five id
+// columns followed by two counts per marker. Stops through error() when the
+// file cannot be opened, when a line has the wrong number of columns, or
+// when the individual named on a line is not part of ped.
+void LoadShotgunResults(Pedigree & ped, char ** genotypes, const String & filename)
+   {
+   String input;
+   StringArray tokens;
+
+   IFILE f = ifopen(filename, "rt");
+
+   if (f == NULL)
+      error("Failed to open file with read count data,\n"
+            "please check you provided correct filename.\n");
+
+   while (!ifeof(f))
+      {
+      input.ReadLine(f);
+      tokens.ReplaceTokens(input);
+
+      if (tokens.Length() == 0) continue;
+
+      if (tokens.Length() != ped.markerCount * 2 + 5)
+         error("Incorrect number of columns for line beggining:\n\n"
+               " %.70s\n\n"
+               "Expecting ids for family, individual, parents and sex, followed\n"
+               "by %d columns with read count summaries\n\n",
+               (const char *) input, ped.markerCount * 2);
+
+      // Refuse to dereference a failed lookup
+      // NOTE(review): assumes FindPerson() returns NULL for unknown ids
+      if (ped.FindPerson(tokens[0], tokens[1]) == NULL)
+         error("Individual %s->%s appears in the read count file,\n"
+               "but could not be found among the loaded samples.\n",
+               (const char *) tokens[0], (const char *) tokens[1]);
+
+      int person = ped.FindPerson(tokens[0], tokens[1])->serial;
+
+      for (int i = 0; i < ped.markerCount; i++)
+         {
+         int allele1 = tokens[5 + i * 2].AsInteger();
+         int allele2 = tokens[5 + i * 2 + 1].AsInteger();
+
+         // Each count must fit in four bits; a negative value would
+         // otherwise spill into the other count when the two are combined
+         if (allele1 < 0) allele1 = 0;
+         if (allele2 < 0) allele2 = 0;
+
+         if (allele1 >= 16) allele1 = 15;
+         if (allele2 >= 16) allele2 = 15;
+
+         genotypes[person][i] = allele2 * 16 + allele1;
+         }
+      }
+
+   ifclose(f);
+   }
+
+
+
+int main(int argc, char ** argv)
+ {
+setbuf(stdout, NULL);
+setbuf(stderr, NULL);
+ String polymorphicSites, readCounts, mapfile, outfile("thunder.out");
+ String crossFile, errorFile;
+
+ double sequencingError = 0.005, errorRate = 0.01;
+ int seed = 123456, warmup = 0, states = 0;
+ int burnin = 0, rounds = 0, polling = 0, samples = 0;
+ bool weighted = false, compact = false;
+ bool mle = false, mledetails = false, uncompressed = false;
+
+ SetupCrashHandlers();
+ SetCrashExplanation("reading command line options");
+
+ printf("Thunder 1.0.16.c -- Markov Chain Haplotyping for Shotgun Sequence Data\n"
+ "(c) 2005-2007 Goncalo Abecasis, with thanks to Yun Li, Paul Scheet\n\n");
+
+ ParameterList pl;
+ clock_t startt, endt;
+ startt = clock();
+
+BEGIN_LONG_PARAMETERS(longParameters)
+ LONG_PARAMETER_GROUP("Shotgun Sequences")
+ LONG_STRINGPARAMETER("polymorphicSites", &polymorphicSites)
+ LONG_STRINGPARAMETER("readCounts", &readCounts)
+ LONG_DOUBLEPARAMETER("seqError", &sequencingError)
+ LONG_PARAMETER_GROUP("Optional Files")
+ LONG_STRINGPARAMETER("crossoverMap", &crossFile)
+ LONG_STRINGPARAMETER("errorMap", &errorFile)
+ LONG_STRINGPARAMETER("physicalMap", &mapfile)
+ LONG_PARAMETER_GROUP("Markov Sampler")
+ LONG_INTPARAMETER("seed", &seed)
+ LONG_INTPARAMETER("burnin", &burnin)
+ LONG_INTPARAMETER("rounds", &rounds)
+ LONG_PARAMETER_GROUP("Haplotyper")
+ LONG_INTPARAMETER("states", &states)
+ LONG_DOUBLEPARAMETER("errorRate", &errorRate)
+ LONG_PARAMETER("weighted", &weighted)
+ LONG_PARAMETER("compact", &compact)
+ LONG_PARAMETER_GROUP("Imputation")
+ LONG_PARAMETER("geno", &OutputManager::outputGenotypes)
+ LONG_PARAMETER("quality", &OutputManager::outputQuality)
+ LONG_PARAMETER("dosage", &OutputManager::outputDosage)
+ LONG_PARAMETER("mle", &mle)
+ LONG_PARAMETER_GROUP("Output Files")
+ LONG_STRINGPARAMETER("prefix", &outfile)
+ LONG_PARAMETER("phase", &OutputManager::outputHaplotypes)
+ LONG_PARAMETER("uncompressed", &OutputManager::uncompressed)
+ LONG_PARAMETER("mldetails", &mledetails)
+ LONG_PARAMETER_GROUP("Interim Output")
+ LONG_INTPARAMETER("sampleInterval", &samples)
+ LONG_INTPARAMETER("interimInterval", &polling)
+END_LONG_PARAMETERS();
+
+ pl.Add(new LongParameters("Available Options", longParameters));
+
+ pl.Add(new HiddenString('m', "Map File", mapfile));
+ pl.Add(new HiddenString('o', "Output File", outfile));
+ pl.Add(new HiddenInteger('r', "Haplotyping Rounds", rounds));
+ pl.Add(new HiddenDouble('e', "Error Rate", errorRate));
+
+ pl.Read(argc, argv);
+ pl.Status();
+
+ // Setup random seed ...
+ globalRandom.Reset(seed);
+
+ SetCrashExplanation("loading information on polymorphic sites");
+
+ // Setup and load a list of polymorphic sites, each with two allele labels ...
+ Pedigree ped;
+
+ SetCrashExplanation("loading shotgun data - first pass");
+
+ LoadShotgunSamples(ped, readCounts);
+
+ LoadPolymorphicSites(polymorphicSites);
+
+ SetCrashExplanation("loading map information for polymorphic sites");
+
+ printf("Loaded information on %d polymorphic sites\n\n", Pedigree::markerCount);
+
+ Pedigree::LoadMarkerMap(mapfile);
+
+ // Check if physical map is available
+ bool positionsAvailable = true;
+
+ for (int i = 0; i < ped.markerCount; i++)
+ if (Pedigree::GetMarkerInfo(i)->chromosome < 0)
+ {
+ positionsAvailable = false;
+ break;
+ }
+
+ if (positionsAvailable)
+ {
+ printf(" Physical map will be used to improve crossover rate estimates.\n");
+
+ for (int i = 1; i < ped.markerCount; i++)
+ if (ped.GetMarkerInfo(i)->position <= ped.GetMarkerInfo(i - 1)->position ||
+ ped.GetMarkerInfo(i)->chromosome != ped.GetMarkerInfo(i - 1)->chromosome)
+ {
+ printf(" FATAL ERROR -- Problems with physical map ...\n\n"
+ " Before continuing, check the following:\n"
+ " * All markers are on the same chromosome\n"
+ " * All marker positions are unique\n"
+ " * Markers in pedigree and haplotype files are ordered by physical position\n\n");
+ return -1;
+ }
+ }
+
+ printf("\n");
+
+ printf("Processing input files and allocating memory for haplotyping\n");
+
+ SetCrashExplanation("allocating memory for haplotype engine and consensus builder");
+
+ ShotgunHaplotyper engine;
+
+ engine.economyMode = compact;
+
+ engine.EstimateMemoryInfo(ped.count, ped.markerCount, states, compact);
+ engine.AllocateMemory(ped.count, states, ped.markerCount);
+
+ // Copy genotypes into haplotyping engine
+ if (engine.readyForUse)
+ LoadShotgunResults(ped, engine.genotypes, readCounts);
+
+ // Copy phased haplotypes into haplotyping engine
+ engine.phased = 0;
+
+ if (engine.readyForUse == false || engine.ForceMemoryAllocation() == false)
+ return MemoryAllocationFailure();
+
+ if (positionsAvailable && engine.AllocateDistances())
+ {
+ for (int i = 1; i < ped.markerCount; i++)
+ engine.distances[i - 1] = ped.GetMarkerInfo(i)->position -
+ ped.GetMarkerInfo(i-1)->position;
+ }
+
+ engine.ShowMemoryInfo();
+
+ if (mle)
+ {
+ engine.ShowMLEMemoryInfo();
+ if (!engine.AllocateMLEMemory())
+ return MemoryAllocationFailure();
+ }
+
+ ConsensusBuilder::EstimateMemoryInfo(rounds - burnin, ped.count * 2, ped.markerCount);
+ ConsensusBuilder consensus(rounds - burnin, ped.count * 2, ped.markerCount);
+
+ if (consensus.readyForUse == false)
+ return MemoryAllocationFailure();
+
+ DosageCalculator::storeDistribution = OutputManager::outputDosage ||
+ OutputManager::outputQuality ||
+ OutputManager::outputGenotypes;
+
+ DosageCalculator::EstimateMemoryInfo(rounds - burnin, ped.count, ped.markerCount);
+ DosageCalculator doses(rounds - burnin, ped.count, ped.markerCount);
+
+ if (doses.readyForUse == false)
+ return MemoryAllocationFailure();
+
+ if (weighted)
+ engine.CalculateWeights();
+
+ printf("Memory allocated successfully\n\n");
+
+ SetCrashExplanation("loading error rate and cross over maps");
+
+ engine.SetErrorRate(errorRate);
+ engine.SetShotgunError(sequencingError);
+
+ bool newline = engine.LoadCrossoverRates(crossFile);
+ newline |= engine.LoadErrorRates(errorFile);
+ if (newline) printf("\n");
+
+ SetCrashExplanation("searching for initial haplotype set");
+
+ engine.RandomSetup();
+ printf("Found initial haplotype set\n\n");
+
+ SetCrashExplanation("revving up haplotyping engine");
+
+ SetCrashExplanation("interating through markov chain haplotyping procedure");
+
+ for (int i = 0; i < rounds; i++)
+ {
+ engine.LoopThroughChromosomes();
+
+ engine.UpdateThetas();
+ errorRate = engine.UpdateErrorRate();
+
+ printf("Markov Chain iteration %d [%d mosaic crossovers]\n",
+ i + 1, engine.TotalCrossovers() );
+
+ if (i < burnin)
+ continue;
+
+ if (OutputManager::outputHaplotypes)
+ consensus.Store(engine.haplotypes);
+
+ if (doses.storeDosage || doses.storeDistribution)
+ doses.Update(engine.haplotypes);
+
+ if (polling > 0 && ((i - burnin) % polling) == 0)
+ OutputManager::OutputConsensus(ped, consensus, doses, outfile + ".prelim" + (i + 1));
+
+ if (samples > 0 && ((i - burnin) % samples) == 0)
+ OutputManager::WriteHaplotypes(outfile + ".sample" + (i + 1), ped, engine.haplotypes);
+
+ UpdateVector(engine.thetas, thetas, nthetas, engine.markers - 1);
+ UpdateErrorRates(engine.error_models, error_rates, nerror_rates, engine.markers);
+ }
+
+ if (rounds) printf("\n");
+
+ SetCrashExplanation("estimating maximum likelihood solution, conditional on current state");
+
+ if (mle)
+ {
+ // Use best available error and crossover rates for MLE
+ if (nerror_rates)
+ for (int i = 0; i < engine.markers; i++)
+ engine.SetErrorRate(i, error_rates[i] / nerror_rates);
+
+ if (nthetas)
+ for (int i = 0; i < engine.markers - 1; i++)
+ engine.thetas[i] = thetas[i] / nthetas;
+
+ engine.OutputMLEs(ped, outfile, mledetails);
+ }
+
+// ParseHaplotypes(engine.haplotypes, engine.individuals * 2 - 2, engine.markers, 32);
+
+ SetCrashExplanation("outputing solution");
+
+ // If we did multiple rounds of haplotyping, then generate consensus
+ if (rounds > 1)
+ OutputManager::OutputConsensus(ped, consensus, doses, outfile);
+ else
+ OutputManager::WriteHaplotypes(outfile, ped, engine.haplotypes);
+
+ if (doses.storeDosage || doses.storeDistribution)
+ doses.OutputMarkerInfo(outfile + ".info");
+
+ FILE * file = fopen(outfile + ".rec", "wt");
+
+ if (file == NULL)
+ printf("Error opening output file [%s.rec]\n", (const char *) outfile);
+ else
+ {
+ fprintf(file, "Interval AvgRate LastRate\n");
+ for (int i = 0; i < engine.markers - 1; i++)
+ fprintf(file, "%d-%d %.4f %.4f\n", i + 1, i + 2,
+ nthetas ? thetas[i] / nthetas : engine.thetas[i],
+ engine.thetas[i]);
+ fclose(file);
+
+ if (thetas != NULL) delete [] thetas;
+
+ printf("Wrote out file [%s.rec] with mosaic crossover rates ...\n", (const char *) outfile);
+ }
+
+ file = fopen(outfile + ".erate", "wt");
+
+ if (file == NULL)
+ printf("Error opening output file [%s.erate]\n", (const char *) outfile);
+ else
+ {
+ fprintf(file, "Marker AvgRate LastRate\n");
+ for (int i = 0; i < engine.markers; i++)
+ fprintf(file, "%s %.4f %.4f\n", (const char *) ped.markerNames[i],
+ nerror_rates ? error_rates[i] / nerror_rates : engine.GetErrorRate(i),
+ engine.GetErrorRate(i));
+ fclose(file);
+
+ if (error_rates != NULL) delete [] error_rates;
+
+ printf("Wrote out file [%s.erate] with per marker error rates ...\n\n",
+ (const char *) outfile);
+ }
+
+ printf("Estimated mismatch rate in Markov model is: %.5f\n", errorRate);
+ endt = clock();
+ int lapsetime = (int) ((double)(endt - startt) / CLOCKS_PER_SEC);
+ printf("Analysis took %d seconds\n\n", lapsetime);
+
+ }
+
+
+
+
+
diff --git a/thunder/ShotgunHaplotyper.cpp b/thunder/ShotgunHaplotyper.cpp
new file mode 100644
index 0000000..5241f9c
--- /dev/null
+++ b/thunder/ShotgunHaplotyper.cpp
@@ -0,0 +1,250 @@
+//////////////////////////////////////////////////////////////////////
+// thunder/ShotgunHaplotyper.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "ShotgunHaplotyper.h"
+#include "MemoryAllocators.h"
+
+#include <math.h>
+
+// Sets up a haplotyper for shotgun (read count) data.
+//
+// Allocates the 3 x 256 likelihood table -- one row per true genotype,
+// one column per packed observation byte -- and fills it in for the
+// default per-read error rate of 0.005.
+ShotgunHaplotyper::ShotgunHaplotyper()
+   {
+   // The table must exist before SetShotgunError() writes into it
+   shotgunErrorMatrix = AllocateFloatMatrix(3, 256);
+   SetShotgunError(0.005);
+
+   skipSanityCheck = true;
+   }
+
+// Releases the three rows of the likelihood table owned by this object.
+ShotgunHaplotyper::~ShotgunHaplotyper()
+   {
+   FreeFloatMatrix(shotgunErrorMatrix, 3);
+   }
+
+// Assigns each unphased individual a weight equal to the total of the two
+// counts stored in its genotype bytes, summed over all markers.
+//
+// The weights are released again when every individual ends up with a
+// weight of zero, i.e. when there is no data to weight by.
+void ShotgunHaplotyper::CalculateWeights()
+   {
+   AllocateWeights();
+
+   float sum = 0.0;
+
+   for (int i = 0; i < individuals - phased; i++)
+      {
+      weights[i] = 0.0;
+
+      for (int j = 0; j < markers; j++)
+         {
+         // Each byte packs two counts (low nibble, high nibble). Decode it
+         // through unsigned char, as the other methods of this class do;
+         // with a signed plain char a high nibble >= 8 would make both
+         // the remainder and the quotient negative.
+         int packed = (unsigned char) genotypes[i][j];
+
+         weights[i] += (packed % 16) + (packed / 16);
+         }
+
+      sum += weights[i];
+      }
+
+   // Give up if there are no genotyped individuals
+   if (sum == 0.0)
+      FreeWeights();
+   }
+
+// Draws a random starting pair of haplotypes for every individual.
+//
+// Marker by marker, the frequency of the allele counted in the high nibble
+// of the packed genotype bytes is estimated from the pooled counts and
+// turned into Hardy-Weinberg genotype priors. Each individual's genotype is
+// then sampled from the posterior obtained by multiplying those priors with
+// the likelihood of the individual's observed counts; sampled heterozygotes
+// are given a random phase.
+//
+// rand - random number source; NULL selects globalRandom.
+void ShotgunHaplotyper::RandomSetup(Random * rand)
+   {
+   if (rand == NULL)
+      rand = &globalRandom;
+
+   for (int j = 0; j < markers; j++)
+      {
+      int alleles = 0, mac = 0;
+
+      for (int i = 0; i < individuals; i++)
+         {
+         int g = (unsigned char) genotypes[i][j];
+
+         alleles += (g % 16);
+         mac += (g / 16);
+         }
+      alleles += mac;
+
+      // Nothing was observed at this marker, so there is no frequency to
+      // estimate: everyone starts out carrying allele 0
+      if (alleles == 0)
+         {
+         for (int i = 0; i < individuals; i++)
+            haplotypes[i * 2][j] = haplotypes[i * 2 + 1][j] = 0;
+         continue;
+         }
+
+      double freq = mac / (double) alleles;
+
+      double prior_11 = (1.0 - freq) * (1.0 - freq);
+      double prior_12 = 2.0 * freq * (1.0 - freq);
+      double prior_22 = freq * freq;
+
+      for (int i = 0; i < individuals; i++)
+         {
+         int observed = (unsigned char) (genotypes[i][j]);
+
+         double posterior_11 = prior_11 * shotgunErrorMatrix[0][observed];
+         double posterior_12 = prior_12 * shotgunErrorMatrix[1][observed];
+         double posterior_22 = prior_22 * shotgunErrorMatrix[2][observed];
+         double sum = posterior_11 + posterior_12 + posterior_22;
+
+         if (sum > 0)
+            {
+            posterior_11 /= sum;
+            posterior_12 /= sum;
+            }
+         else
+            {
+            // The observation has zero likelihood under every genotype.
+            // Normalising would divide by zero and the NaN comparisons
+            // below would silently force a 1/1 genotype, so report the
+            // problem and sample from the priors instead.
+            printf("Problem!\n");
+
+            posterior_11 = prior_11;
+            posterior_12 = prior_12;
+            }
+
+         // Exactly one draw per individual decides the genotype ...
+         double r = rand->Next();
+
+         if (r < posterior_11)
+            {
+            haplotypes[i * 2][j] = 0;
+            haplotypes[i * 2 + 1][j] = 0;
+            }
+         else if (r < posterior_11 + posterior_12)
+            {
+            // ... and heterozygotes need one more to pick their phase
+            bool bit = rand->Binary();
+
+            haplotypes[i * 2][j] = bit;
+            haplotypes[i * 2 + 1][j] = bit ^ 1;
+            }
+         else
+            {
+            haplotypes[i * 2][j] = 1;
+            haplotypes[i * 2 + 1][j] = 1;
+            }
+         }
+      }
+   }
+
+void ShotgunHaplotyper::SetShotgunError(double rate)
+ {
+ // Store the background rate
+ shotgunError = rate;
+
+ // First calculate binomial coefficients
+ int binomial[33][33];
+
+ binomial[0][0] = 1;
+ binomial[1][0] = binomial[1][1] = 1;
+
+ for (int i = 2; i < 32; i++)
+ {
+ binomial[i][0] = binomial[i][i] = 1;
+
+ for (int j = 1; (j < i) && (j < 16); j++)
+ binomial[i][j] = binomial[i-1][j] + binomial[i-1][j-1];
+ }
+
+ // Next setup the error matrices for each possible true genotype
+ for (int i = 0; i < 16; i++)
+ for (int j = 0; j < 16; j++)
+ if (rate == 0)
+ {
+ shotgunErrorMatrix[0][j*16 + i] = j == 0 ? 1.0 : 0.0;
+ shotgunErrorMatrix[1][j*16 + i] = pow(0.5, i + j) * binomial[i+j][j];
+ shotgunErrorMatrix[2][j*16 + i] = i == 0 ? 1.0 : 0.0;
+ }
+ else
+ {
+ shotgunErrorMatrix[0][j*16 + i] = pow(1.0 - rate, i) * pow(rate, j) * binomial[i+j][j];
+ shotgunErrorMatrix[1][j*16 + i] = pow(0.5, i + j) * binomial[i+j][j];
+ shotgunErrorMatrix[2][j*16 + i] = pow(rate, i) * pow(1.0 - rate, j) * binomial[i+j][j];
+ }
+ }
+
+// Scales every entry of the triangular state-pair matrix by the likelihood
+// of the observed counts at `marker`.
+//
+// matrix   - entries for pairs (i, j) with j <= i, stored row after row;
+//            updated in place
+// marker   - index of the marker being processed
+// genotype - packed observation byte; GENOTYPE_MISSING leaves matrix alone
+void ShotgunHaplotyper::ConditionOnData(float * matrix, int marker, char genotype)
+   {
+   // We treat missing genotypes as uninformative about the mosaic's
+   // underlying state. If we were to allow for deletions and the like,
+   // that may no longer be true.
+   if (genotype == GENOTYPE_MISSING)
+      return;
+
+   const int observed = (unsigned char) genotype;
+
+   // likelihood[k] applies when the two template haplotypes carry k copies
+   // of allele 1 between them; Penetrance() links that to each of the three
+   // possible true genotypes
+   double likelihood[3];
+
+   for (int copies = 0; copies < 3; copies++)
+      likelihood[copies] = Penetrance(marker, copies, 0) * shotgunErrorMatrix[0][observed] +
+                           Penetrance(marker, copies, 1) * shotgunErrorMatrix[1][observed] +
+                           Penetrance(marker, copies, 2) * shotgunErrorMatrix[2][observed];
+
+   for (int first = 0; first < states; first++)
+      {
+      // Shift the table by the allele on the first haplotype, so that
+      // indexing with the second haplotype's allele yields the pair total
+      const double * byPartner = likelihood + haplotypes[first][marker];
+
+      for (int second = 0; second <= first; second++)
+         {
+         *matrix *= byPartner[haplotypes[second][marker]];
+         matrix++;
+         }
+      }
+   }
+
+// Samples the two alleles at `marker` for haplotype rows `states` and
+// `states + 1`, given the alleles found on the two template haplotypes
+// (state1, state2) and the observed packed counts, then updates the
+// match / mismatch tallies of the marker's error model.
+//
+// rand is dereferenced unconditionally, so it must not be NULL here.
+void ShotgunHaplotyper::ImputeAlleles(int marker, int state1, int state2, Random * rand)
+ {
+ int copied1 = haplotypes[state1][marker];
+ int copied2 = haplotypes[state2][marker];
+
+ // presumably rows states and states + 1 belong to individual states / 2
+ // (RandomSetup stores individual i in rows 2i and 2i + 1) -- confirm
+ int genotype = (unsigned char) genotypes[states / 2][marker];
+
+ // Unnormalised posterior of each true genotype: penetrance given the
+ // number of copied "1" alleles, times likelihood of the observed counts
+ double posterior_11 = Penetrance(marker, copied1 + copied2, 0) * shotgunErrorMatrix[0][genotype];
+ double posterior_12 = Penetrance(marker, copied1 + copied2, 1) * shotgunErrorMatrix[1][genotype];
+ double posterior_22 = Penetrance(marker, copied1 + copied2, 2) * shotgunErrorMatrix[2][genotype];
+ double sum = posterior_11 + posterior_12 + posterior_22;
+
+ // Only the two homozygote posteriors are normalised; the heterozygote
+ // is whatever probability is left over in the branches below
+ posterior_11 /= sum;
+ posterior_22 /= sum;
+
+ double r = rand->Next();
+
+ if (r < posterior_11)
+ {
+ haplotypes[states][marker] = 0;
+ haplotypes[states + 1][marker] = 0;
+ }
+ else if (r < posterior_11 + posterior_22)
+ {
+ haplotypes[states][marker] = 1;
+ haplotypes[states + 1][marker] = 1;
+ }
+ else if (copied1 != copied2)
+ {
+ // Heterozygote, templates differ: keep the copied phase unless both
+ // alleles are flipped, which happens with probability
+ // rate^2 / (rate^2 + (1 - rate)^2)
+ double rate = GetErrorRate(marker);
+
+ if (rand->Next() < rate * rate / ((rate * rate) + (1.0 - rate) * (1.0 - rate)))
+ {
+ copied1 = !copied1;
+ copied2 = !copied2;
+ }
+
+ haplotypes[states][marker] = copied1;
+ haplotypes[states + 1][marker] = copied2;
+ }
+ else
+ {
+ // Heterozygote, templates agree: the phase is a coin toss
+ bool bit = rand->Binary();
+
+ haplotypes[states][marker] = bit;
+ haplotypes[states + 1][marker] = bit ^ 1;
+ }
+
+ int imputed1 = haplotypes[states][marker];
+ int imputed2 = haplotypes[states + 1][marker];
+
+ // NOTE(review): when the double flip above fires, copied1 / copied2 have
+ // already been overwritten with the flipped values, so this counts zero
+ // differences for that case rather than two -- confirm this is intended
+ int differences = abs(copied1 - imputed1) + abs(copied2 - imputed2);
+
+ error_models[marker].matches += 2 - differences;
+ error_models[marker].mismatches += differences;
+ }
+
+
+
diff --git a/thunder/ShotgunHaplotyper.h b/thunder/ShotgunHaplotyper.h
new file mode 100644
index 0000000..4d81f7e
--- /dev/null
+++ b/thunder/ShotgunHaplotyper.h
@@ -0,0 +1,44 @@
+//////////////////////////////////////////////////////////////////////
+// thunder/ShotgunHaplotyper.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#ifndef __SHOTGUNHAPLOTYPER_H__
+#define __SHOTGUNHAPLOTYPER_H__
+
+#include "Haplotyper.h"
+
+// Haplotyper variant whose observations are packed per-marker counts
+// (two 4-bit counts per genotype byte) instead of called genotypes.
+//
+// The include guard above makes it safe to include this header more than
+// once in a translation unit; without it the class would be redefined.
+class ShotgunHaplotyper : public Haplotyper
+   {
+   public:
+      // Allocates / releases the likelihood table (shotgunErrorMatrix)
+      ShotgunHaplotyper();
+      ~ShotgunHaplotyper();
+
+      // Weights each unphased individual by its summed counts
+      virtual void CalculateWeights();
+
+      // Samples a random starting haplotype pair for every individual;
+      // a NULL generator selects the global one
+      virtual void RandomSetup(Random * rand = NULL);
+
+      // Scales the state-pair matrix by the likelihood of one observation
+      virtual void ConditionOnData(float * matrix, int marker, char genotype);
+
+      // Samples alleles for the current individual given two template states
+      virtual void ImputeAlleles(int marker, int state1, int state2, Random * rand);
+
+      // Stores the per-read error rate and rebuilds the likelihood table
+      void   SetShotgunError(double rate);
+      double GetShotgunError() { return shotgunError; }
+
+      // Forwards to the base class estimate with its last argument fixed
+      // to false (the meaning of that flag is defined in Haplotyper.h)
+      static void EstimateMemoryInfo(int Individuals, int Markers, int States, bool Compact)
+         {
+         Haplotyper::EstimateMemoryInfo(Individuals, Markers, States, Compact, false);
+         }
+
+   protected:
+      // 3 rows (true genotype) x 256 columns (packed observation byte)
+      float ** shotgunErrorMatrix;
+
+      // Rate last passed to SetShotgunError(), kept in single precision
+      float shotgunError;
+   };
+
+#endif
+
+
diff --git a/thunder/ShotgunManners.cpp b/thunder/ShotgunManners.cpp
new file mode 100644
index 0000000..d7d8653
--- /dev/null
+++ b/thunder/ShotgunManners.cpp
@@ -0,0 +1,68 @@
+//////////////////////////////////////////////////////////////////////
+// thunder/ShotgunManners.cpp
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+#include "Manners.h"
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Description of the step currently in progress, or NULL when none has been
+// set; OutOfMemory() appends it to the crash report.
+static const char * machCrashExplanation = NULL;
+
+// Records what the program is about to do, for use in a later crash report.
+// Only the pointer is kept, so the text must stay valid while it is set.
+void SetCrashExplanation(const char * specialMessage)
+   {
+   machCrashExplanation = specialMessage;
+   }
+
+// Forgets the current step description; crash reports then omit it.
+void UnsetCrashExplanation()
+   {
+   machCrashExplanation = NULL;
+   }
+
+// Installs the signal handlers: SIGINT is routed to UserBreak(), while
+// SIGSEGV and SIGABRT are both routed to OutOfMemory().
+void SetupCrashHandlers()
+   {
+   static const int crashSignals[] = { SIGSEGV, SIGABRT };
+
+   signal(SIGINT, (signal_handler) UserBreak);
+
+   for (unsigned int k = 0; k < sizeof(crashSignals) / sizeof(crashSignals[0]); k++)
+      signal(crashSignals[k], (signal_handler) OutOfMemory);
+   }
+
+// Handler for SIGSEGV / SIGABRT: prints the crash banner, adds the step
+// that was in progress when one has been recorded, and exits with
+// EXIT_FAILURE. The signal number is ignored.
+//
+// NOTE(review): stdio output and exit() are not among the functions the C
+// standard allows inside a signal handler -- kept as in the original.
+void OutOfMemory(int)
+   {
+   fputs("\n\nMACH 1.0 FOR SHOTGUN SEQUENCES HAS CRASHED\n\n"
+         "The operating system has decided to terminate this run,\n"
+         "of the Markov Chain Haplotyper (MACH). Either MACH 1.0\n"
+         "requested too much memory or you have encountered a bug.\n\n"
+         "If you don't think this is a memory issue, you can help\n"
+         "improve this program by reporting bugs via a short e-mail\n"
+         "to goncalo at umich.edu. These e-mails are most helpful if you\n"
+         "include a description of the problem and example of how it can\n"
+         "be reproduced.\n\n", stdout);
+
+   const char * stage = machCrashExplanation;
+
+   if (stage != NULL)
+      printf("MACH 1.0 crashed while %s\n", stage);
+
+   exit(EXIT_FAILURE);
+   }
+
+// Handler for SIGINT: reports that the user stopped the run and exits
+// with EXIT_FAILURE. The signal number is ignored.
+void UserBreak(int)
+   {
+   fputs("\n\nMACH 1.0 STOPPED BY USER\n\n", stdout);
+
+   exit(EXIT_FAILURE);
+   }
+
+
diff --git a/thunder/ShotgunManners.h b/thunder/ShotgunManners.h
new file mode 100644
index 0000000..1e5ec9b
--- /dev/null
+++ b/thunder/ShotgunManners.h
@@ -0,0 +1,36 @@
+//////////////////////////////////////////////////////////////////////
+// thunder/ShotgunManners.h
+// (c) 2000-2008 Goncalo Abecasis
+//
+// This file is distributed as part of the MaCH source code package
+// and may not be redistributed in any form, without prior written
+// permission from the author. Permission is granted for you to
+// modify this file for your own personal use, but modified versions
+// must retain this copyright notice and must not be distributed.
+//
+// Permission is granted for you to use this file to compile MaCH.
+//
+// All computer programs have bugs. Use this file at your own risk.
+//
+// Saturday April 12, 2008
+//
+
+// NOTE(review): the guard is __MANNERS_H__ although this file is
+// ShotgunManners.h; if Manners.h uses the same guard, only the first of
+// the two headers included in a translation unit takes effect -- confirm.
+#ifndef __MANNERS_H__
+#define __MANNERS_H__
+
+// This function sets up signal handlers to ensure
+// MaCH terminates gracefully when something goes
+// wrong (user interrupt, segmentation fault, abort)
+
+void SetupCrashHandlers();
+
+// Signature shared by the handlers below
+typedef void (*signal_handler)(int);
+
+// Handlers installed by SetupCrashHandlers(); each prints a message and
+// ends the program with a failure status
+void OutOfMemory(int);
+void UserBreak(int);
+
+// Set or clear the description of the current step that OutOfMemory()
+// includes in its crash report
+void SetCrashExplanation(const char * specialMessage);
+void UnsetCrashExplanation();
+
+#endif
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mach-haplotyper.git
More information about the debian-med-commit
mailing list