[med-svn] [dwgsim] 01/03: Imported Upstream version 0.1.11

Kevin Murray daube-guest at moszumanska.debian.org
Wed Sep 23 01:01:45 UTC 2015


This is an automated email from the git hooks/post-receive script.

daube-guest pushed a commit to branch master
in repository dwgsim.

commit 58f34768b3edd452b2fbaec2aaad11411f3a0818
Author: Kevin Murray <spam at kdmurray.id.au>
Date:   Wed Sep 23 09:14:22 2015 +1000

    Imported Upstream version 0.1.11
---
 .gitmodules                            |    3 +
 INSTALL                                |   33 +
 LICENSE                                |  339 ++++++++++
 Makefile                               |   73 +++
 README                                 |    9 +
 scripts/dwgsim_eval_plot.py            |  120 ++++
 scripts/dwgsim_mut_to_vcf.pl           |  106 ++++
 scripts/dwgsim_pileup_eval.pl          |  202 ++++++
 scripts/galaxy/dwgsim_eval_wrapper.py  |  157 +++++
 scripts/galaxy/dwgsim_eval_wrapper.xml |   73 +++
 scripts/galaxy/dwgsim_wrapper.py       |  182 ++++++
 scripts/galaxy/dwgsim_wrapper.xml      |  129 ++++
 src/contigs.c                          |   54 ++
 src/contigs.h                          |   23 +
 src/dwgsim.c                           | 1085 ++++++++++++++++++++++++++++++++
 src/dwgsim.h                           |   26 +
 src/dwgsim_eval.c                      |  698 ++++++++++++++++++++
 src/dwgsim_eval.h                      |   66 ++
 src/dwgsim_opt.c                       |  381 +++++++++++
 src/dwgsim_opt.h                       |   66 ++
 src/mut.c                              |  888 ++++++++++++++++++++++++++
 src/mut.h                              |   93 +++
 src/mut_bed.c                          |  136 ++++
 src/mut_bed.h                          |   47 ++
 src/mut_input.c                        |   81 +++
 src/mut_input.h                        |   52 ++
 src/mut_txt.c                          |  127 ++++
 src/mut_txt.h                          |   47 ++
 src/mut_vcf.c                          |  267 ++++++++
 src/mut_vcf.h                          |   39 ++
 src/regions_bed.c                      |  148 +++++
 src/regions_bed.h                      |   21 +
 32 files changed, 5771 insertions(+)

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e37d154
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "samtools"]
+	path = samtools
+	url = http://github.com/samtools/samtools.git
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..cc1d7cf
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,33 @@
+Welcome to DWGSIM
+
+=== Prerequisites ===
+
+If you have just cloned this repository using GIT, you must 
+initialize the SAMtools sub-module:
+ git submodule init
+ git submodule update
+You should now have the SAMtools-compatible version checked
+out in the samtools sub-directory.
+
+=== Building DWGSIM ===
+
+To build DWGSIM, execute the following commands:
+ make
+
+To install DWGSIM, after building, copy the following binaries
+to their appropriate installation directories:
+ dwgsim
+ dwgsim_eval
+ dwgims_pileup_eval.pl
+
+=== DWGSIM with Galaxy ===
+
+DWGSIM can be incorporated into galaxy.
+NB: this is out-of-date.
+
+1. Copy over the wrapper scripts:
+  cp scripts/galaxy/dwgsim_wrapper.* galaxy-dist/tools/sr_mapping
+2. Add to the list of available tools.
+  2a. Open galaxy-dist/tool_conf.xml 
+  2b. At the end of the NGS: mapping section, add the following:
+    <tool file="sr_mapping/dwgsim_wrapper.xml" />
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,339 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0d5b9ec
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,73 @@
+PACKAGE_VERSION="0.1.11"
+CC=			gcc
+CFLAGS=		-g -Wall -O3 #-m64 #-arch ppc
+DFLAGS=		-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -DPACKAGE_VERSION=\\\"${PACKAGE_VERSION}\\\"
+DWGSIM_AOBJS = src/dwgsim_opt.o src/mut.o src/contigs.o src/regions_bed.o \
+			   src/mut_txt.o src/mut_bed.o src/mut_vcf.o src/mut_input.o src/dwgsim.o
+DWGSIM_EVAL_AOBJS = src/dwgsim_eval.o \
+					samtools/knetfile.o \
+					samtools/bgzf.o samtools/kstring.o samtools/bam_aux.o samtools/bam.o samtools/bam_import.o samtools/sam.o samtools/bam_index.o \
+					samtools/bam_pileup.o samtools/bam_lpileup.o samtools/bam_md.o samtools/razf.o samtools/faidx.o samtools/bedidx.o \
+					samtools/bam_sort.o samtools/sam_header.o samtools/bam_reheader.o samtools/kprobaln.o samtools/bam_cat.o
+
+PROG=		dwgsim dwgsim_eval
+INCLUDES=	-I.
+SUBDIRS=	samtools . 
+CLEAN_SUBDIRS=	samtools src
+LIBPATH=
+
+.SUFFIXES:.c .o
+
+.c.o:
+		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+		@target=`echo $@ | sed s/-recur//`; \
+		wdir=`pwd`; \
+		list='$(SUBDIRS)'; for subdir in $$list; do \
+			cd $$subdir; \
+			$(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+				INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+			cd $$wdir; \
+		done;
+
+all:$(PROG)
+
+.PHONY:all lib clean cleanlocal
+.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
+dwgsim:lib-recur $(DWGSIM_AOBJS)
+	$(CC) $(CFLAGS) -o $@ $(DWGSIM_AOBJS) -lm -lz -lpthread
+
+dwgsim_eval:lib-recur $(DWGSIM_EVAL_AOBJS)
+	$(CC) $(CFLAGS) -o $@ $(DWGSIM_EVAL_AOBJS) -Lsamtools -lm -lz -lpthread
+
+cleanlocal:
+		rm -vfr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib; \
+		wdir=`pwd`; \
+		list='$(CLEAN_SUBDIRS)'; for subdir in $$list; do \
+			if [ -d $$subdir ]; then \
+				cd $$subdir; \
+				pwd; \
+				rm -vfr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib; \
+				cd $$wdir; \
+			fi; \
+		done;
+
+clean:cleanlocal-recur
+
+dist:clean
+	if [ -f dwgsim-${PACKAGE_VERSION}.tar.gz ]; then \
+        rm -rv dwgsim-${PACKAGE_VERSION}.tar.gz; \
+	fi; \
+	if [ -f dwgsim-${PACKAGE_VERSION}.tar ]; then \
+        rm -rv dwgsim-${PACKAGE_VERSION}.tar; \
+	fi; \
+	if [ -d dwgsim-${PACKAGE_VERSION} ]; then \
+        rm -rv dwgsim-${PACKAGE_VERSION}; \
+	fi; \
+    mkdir dwgsim-${PACKAGE_VERSION}; \
+	cp -r INSTALL LICENSE Makefile README scripts src dwgsim-${PACKAGE_VERSION}/.; \
+	tar -vcf dwgsim-${PACKAGE_VERSION}.tar dwgsim-${PACKAGE_VERSION}; \
+	gzip -9 dwgsim-${PACKAGE_VERSION}.tar; \
+	rm -rv dwgsim-${PACKAGE_VERSION};
diff --git a/README b/README
new file mode 100644
index 0000000..31a7e4f
--- /dev/null
+++ b/README
@@ -0,0 +1,9 @@
+Welcome to DWGSIM.
+
+Please see the file LICENSE for details.
+Please see the file INSTALL for installation instructions;
+
+This software package has limited support since it is in active development.
+
+Please see the following page for more details:
+https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation
diff --git a/scripts/dwgsim_eval_plot.py b/scripts/dwgsim_eval_plot.py
new file mode 100644
index 0000000..b3ea6ec
--- /dev/null
+++ b/scripts/dwgsim_eval_plot.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from optparse import OptionParser
+import re
+
+import string
+import pylab 
+
+class Table:
+
+    def __init__(self, fn, min_mapq):
+        ncol = -1
+
+        # read in the data
+        data = list()
+        fh = open(fn, 'r')
+        for line in fh:
+            line = line.rstrip("\r\n")
+            if '#' == line[0]:
+                continue
+            else:
+                table = self.__parse(line)
+                if table[0] < min_mapq:
+                    continue
+                data.append(table)
+                if -1 == ncol:
+                    ncol = len(table)
+                elif ncol != len(table):
+                    sys.stderr.write("Error: table was misformatted\n")
+                    sys.exit(1)
+                    
+        fh.close()
+
+        # re-arrange the data
+        self.matrix = [[data[i][j] for j in xrange(ncol)] for i in xrange(len(data))]
+
+    # NB: assumes sorted descending
+    def __parse(self, line):
+        tokens = re.findall(r'[\w+-.]+', line)
+        for i in range(0, 13):
+            tokens[i] = int(tokens[i])
+        for i in range(13, len(tokens)):
+            tokens[i] = float(tokens[i])
+        return tokens
+
+def main(options):
+    fmts = ['-', '--', '-.', ':', '.']
+
+    if None == options.fns or 0 == len(options.fns):
+        return
+
+    # read in through the files
+    fmts_i = 0
+    for fn in options.fns:
+        table = Table(fn, options.min_mapq)
+        # extract sensitivity 
+        x = [table.matrix[i][17] for i in range(len(table.matrix))]
+        # extract specificity
+        y = [table.matrix[i][16] for i in range(len(table.matrix))]
+        pylab.plot(x, y, fmts[fmts_i])
+        fmts_i = fmts_i + 1
+        if len(fmts) <= fmts_i:
+            fmts_i = 0
+    # plot values
+    pylab.title('DWGSIM ROC')
+    pylab.xlabel('specificity')
+    pylab.ylabel('sensitivity')
+    # xlim
+    if None != options.xlim:
+        nums = re.findall(r'\d+\.?\d*', options.xlim)
+        if 2 == len(nums):
+            pylab.xlim(xmin=float(nums[0]), xmax=float(nums[1]))
+    # ylim
+    if None != options.ylim:
+        nums = re.findall(r'\d+\.?\d*', options.ylim)
+        if 2 == len(nums):
+            pylab.ylim(ymin=float(nums[0]), ymax=float(nums[1]))
+    # legend
+    if not options.no_legend:
+        if None != options.ids and len(options.fns) == len(options.ids):
+            pylab.legend(options.ids, title='IDs', loc='lower left')
+        elif options.infer_ids:
+            ids = list()
+            for fn in options.fns:
+                i = re.sub(r'^.*\/', '', fn)
+                i = re.sub(r'\.sam\.txt$', '', i)
+                i = re.sub(r'^out', '', i)
+                i = re.sub(r'^dwgsim_eval', '', i)
+                i = re.sub(r'^\.', '', i)
+                ids.append(i)
+            pylab.legend(ids, title='IDs', loc='lower left')
+    # show the plot
+    if None == options.out:
+        pylab.show()
+    else:
+        # check if the file extension is present, if not, add it
+        if len(options.out) <= len(options.outtype) or options.outtype != options.out[len(options.out)-len(options.outtype):]:
+            pylab.savefig("%s.%s" % (options.out, options.outtype), format=options.outtype)
+        else:
+            pylab.savefig(options.out, format=options.outtype)
+        pylab.close()
+
+if __name__ == '__main__':
+    parser = OptionParser()
+    parser.add_option('--fn', help="an output file from dwgsim_eval", action="append", dest="fns")
+    parser.add_option('--id', help="the corresponding ID to add to the plot legened", action="append", dest="ids")
+    parser.add_option('--infer-ids', help="infer the ids from the file names", action="store_true", default=True, dest="infer_ids")
+    parser.add_option('--xlim', help="the x-axis range", dest="xlim")
+    parser.add_option('--ylim', help="the y-axis range", dest="ylim")
+    parser.add_option('--out', help="the output file", dest="out")
+    parser.add_option('--outtype', help="the output file type (ex. pdf, png)", dest="outtype", default="pdf")
+    parser.add_option('--min-mapq', help="minimum mapping quality (inclusive)", dest="min_mapq", type=int, default=0)
+    parser.add_option('--no-legend', help="do not add a legend to the plot", dest="no_legend", action="store_true", default=False)
+    if len(sys.argv[1:]) < 1:
+        parser.print_help()
+    else:
+        options, args = parser.parse_args()
+        main(options)
diff --git a/scripts/dwgsim_mut_to_vcf.pl b/scripts/dwgsim_mut_to_vcf.pl
new file mode 100644
index 0000000..04a9b80
--- /dev/null
+++ b/scripts/dwgsim_mut_to_vcf.pl
@@ -0,0 +1,106 @@
+#!/bin/perl
+
+use strict;
+use warnings;
+
+# TODO:
+# getopt
+# better messaging
+# usage
+# pod...
+
+my $fn = shift || die "No input file";
+my %to_alt = (
+	"AR" => "G", "GR" => "A",
+	"CY" => "T", "TY" => "C",
+	"CS" => "G", "GS" => "C",
+	"AW" => "T", "TW" => "A",
+	"GK" => "T", "TK" => "G",
+	"AM" => "C", "CM" => "A"
+);
+
+open(FH, "$fn") || die "Could not open '$fn' for reading.\n";
+
+print STDOUT "##fileformat=VCFv4.1\n";
+print STDOUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample\n";
+
+my $type = -1; # 1 - SUB, 2 - INS, 3 - DEL
+my $prev_name = -1;
+my $prev_pos = -1;
+my $del_n = 0;
+my $del_name = "";
+my $del_pos = -1;
+my $del_ref = "";
+my $del_s = -1;
+my $line_n = 0;
+while(defined(my $line = <FH>)) {
+	if($line =~ m/^(.+)\t(\d+)\t(.+)\t(.+)\t(\d+)$/) {
+        $line_n++;
+		my $name = $1;
+		my $pos = $2;
+		my $ref = $3;
+		my $alt = $4;
+		my $s = $5;
+		if($prev_name ne $name) {
+			if(0 < $del_n) {
+				printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+					$del_name, $del_pos, $del_ref, $del_s);
+			}
+			# reset
+			$prev_pos = -1; $del_n = 0; $del_s = -1;
+			$del_name = ""; $del_pos = -1; $del_ref = "";
+		}
+		if("-" eq $alt) { # DEL
+			if(0 < $del_n && $del_pos + $del_n == $pos && $s == $del_s) {
+				$del_n++;
+				$del_ref .= $ref;
+			}
+			else {
+				if(0 < $del_n) {
+					printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+						$del_name, $del_pos, $del_ref, $del_s);
+				}
+				$del_n = 1;
+				$del_name = $name;
+				$del_pos = $pos;
+				$del_ref = $ref;
+				$del_s = $s;
+			}
+		}
+		else {
+			# print deletion, if it exists
+			if(0 < $del_n) {
+				printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+					$del_name, $del_pos, $del_ref, $s);
+				# reset
+				$prev_pos = -1; $del_n = 0; $del_s = -1;
+				$del_name = ""; $del_pos = -1; $del_ref = "";
+			}
+			if("-" eq $ref) { # INS
+				printf(STDOUT "%s\t%d\t.\t.\t%s\t100\tPASS\tpl=%d\n",
+					$name, $pos, $alt, $s);
+			}
+			else { # SUB
+				if(3 != $s) {
+					if(!defined($to_alt{"$ref"."$alt"})) {
+						die "Could not convert $ref$alt"." on $line_n.\n";
+					}
+					$alt = $to_alt{"$ref"."$alt"};
+				}
+				printf(STDOUT "%s\t%d\t.\t%s\t%s\t100\tPASS\tpl=%d\n",
+					$name, $pos, $ref, $alt, $s);
+			}
+		}
+		$prev_name = $name;
+		$prev_pos = $pos;
+	}
+	else {
+		die "Input file is not in the proper format.\n";
+	}
+}
+# print deletion, if it exists
+if(0 < $del_n) {
+	printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+		$del_name, $del_pos, $del_ref, $del_s);
+}
+close(FH);
diff --git a/scripts/dwgsim_pileup_eval.pl b/scripts/dwgsim_pileup_eval.pl
new file mode 100644
index 0000000..1d459ac
--- /dev/null
+++ b/scripts/dwgsim_pileup_eval.pl
@@ -0,0 +1,202 @@
+#!/usr/bin/perl -w
+
+# Copied from SAMtools (http://samtools.sourceforge.net)
+
+# This perl code is very cryptic.  Evaluates the fraction 
+# of reads correctly aligned at various mapping quality 
+# thresholds.
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+select STDERR; $| = 1;  # make STDERR unbuffered
+
+&dwgsim_eval;
+exit;
+
+sub dwgsim_eval {
+	my %opts = (g=>5, n=>0);
+	getopts('bpc:gn:', \%opts);
+	print_usage() if (@ARGV == 0 && -t STDIN);
+	my ($flag, $m, $ctr) = (0, 0, 0);
+	my $gap = $opts{g};
+	my $bwa = (defined $opts{b}) ? 1 : 0;
+	$flag |= 1 if (defined $opts{p});
+	$flag |= 2 if (defined $opts{c});
+	my $n = 2*$opts{n};
+	my @a = ();
+
+	my $prev_read_name = "ZZZ";
+	my @reads = ();
+
+	my ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct) = (0, 0, 0, 0, 0, 0);
+
+	print STDERR "Analyzing...\nCurrently on:\n0";
+	while (<>) {
+		$ctr++;
+		if(0 == ($ctr % 10000)) {
+			print STDERR "\r$ctr";
+		}
+		next if (/^\@/);
+		my @t = split("\t");
+		next if (@t < 11);
+		my $sam = $_;
+		my $cur_read_name = $t[0];
+
+		if($prev_read_name ne $cur_read_name) { # process the read(s)
+			die unless (0 == length($prev_read_name) || 0 < scalar(@reads));
+
+			# go through each end
+			# want to know
+			# 	- is the correct location represented in each end?
+			# 	- if so, does the correct location have the best alignment score?
+			#   - what's the combined alignment score of the best pair?
+			@a = process_reads(\@reads, $flag, $gap, $bwa);
+			$found_correct_1 += $a[0];
+			$found_correct_2 += $a[1];
+			$is_correct_1 += $a[2];
+			$is_correct_2 += $a[3];
+			$found_pair_correct += $a[4];
+			$is_pair_correct += $a[5];
+
+			$prev_read_name = $cur_read_name;
+			@reads = ();
+			$m++;
+		}
+		# store the read
+		push(@reads, $sam);
+
+		#print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
+	}
+	@a = process_reads(\@reads, $flag, $gap, $bwa);
+	$found_correct_1 += $a[0];
+	$found_correct_2 += $a[1];
+	$is_correct_1 += $a[2];
+	$is_correct_2 += $a[3];
+	$found_pair_correct += $a[4];
+	$is_pair_correct += $a[5];
+	print STDERR "\r$ctr\n";
+
+	printf(STDERR "%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
+		$m, $found_correct_1, $found_correct_2, 
+		$is_correct_1, $is_correct_2,
+		$found_pair_correct,
+		$is_pair_correct);
+
+	print STDERR "Analysis complete.\n";
+}
+
+sub print_usage {
+	print STDERR "Usage: dwgsim_eval.pl [-pcnd] [-g GAP] <in.sam>\n";
+	print STDERR "\t\t-p\t\tprint incorrect alignments\n";
+	print STDERR "\t\t-c\t\tcolor space alignments\n";
+	print STDERR "\t\t-n\tINT\tnumber of raw input paired-end reads\n";
+	print STDERR "\t\t-d\tINT\tdivide quality by this factor\n";
+	print STDERR "\t\t-i\t\tprint only alignments with indels\n";
+	print STDERR "\t\t-b\t\talignments are from BWA\n";
+	exit(1);
+}
+
+sub swap {
+	my ($a, $b) = @_;
+	return ($b, $a);
+}
+
+sub process_reads {
+	my ($reads, $flag, $gap, $bwa) = @_;
+
+	my ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct) = (0, 0, 0, 0, 0, 0);
+	my ($correct_index_1, $correct_index_2, $max_AS_index_1, $max_AS_index_2, $max_AS_1, $max_AS_2) = (-1, -1, -1, -1, -1000000, -1000000);
+
+	# go through reads
+	for(my $i=0;$i<scalar(@$reads);$i++) {
+		my $line = $reads->[$i];
+
+		my @t = split(/\t/, $line);
+		die if (@t < 11);
+		my ($q, $chr, $left, $rght) = ($t[4], $t[2], $t[3], $t[3]);
+		# right coordinate
+		$_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg;
+		--$rght;
+
+		# skip unmapped reads
+		next if (($t[1]&0x4) || $chr eq '*');
+
+		# parse read name
+		if($t[0] =~ m/^(\S+)_(\d+)_(\d+)_(\d)_(\d)_(\d+):(\d+):(\d+)_(\d+):(\d+):(\d+)_(\S+)/) {
+			my ($o_chr, $pos_1, $pos_2, $str_1, $str_2) = ($1, $2, $3, $4, $5);
+			my ($n_err_1, $n_sub_1, $n_indel_1, $n_err_2, $n_sub_2, $n_indel_2) = ($6, $7, $8, $9, $10, $11);
+			my $end = 2;
+			if(($t[1]&0x40)) { # read #1
+				$end = 1;
+			}
+
+			# check alignment score
+			for(my $j=11;$j<scalar(@t);$j++) {
+				if($t[$j] =~ m/AS:i:(-?\d+)/) {
+					my $AS = $1;
+					if(1 == $end) {
+						if($max_AS_index_1 == -1 || $max_AS_1 < $AS) {
+							$max_AS_index_1 = $i;
+							$max_AS_1 = $AS;
+						}
+					}
+					else {
+						if($max_AS_index_2 == -1 || $max_AS_2 < $AS) {
+							$max_AS_index_2 = $i;
+							$max_AS_2 = $AS;
+						}
+					}
+					last;
+				}
+			}
+
+			if ($o_chr eq $chr) { # same chr
+				if ($flag & 2) { # SOLiD
+					if(1 == $bwa) {
+						# Swap 1 and 2
+						($pos_1, $pos_2) = &swap($pos_1, $pos_2);
+						($str_1, $str_2) = &swap($str_1, $str_2);
+						($n_err_1, $n_err_2) = &swap($n_err_1, $n_err_2);
+						($n_sub_1, $n_sub_2) = &swap($n_sub_1, $n_sub_2);
+						($n_indel_1, $n_indel_2) = &swap($n_indel_1, $n_indel_2);
+					}
+				}
+
+				if(1 == $end) { # first read
+					if(abs($pos_1 - $left) <= $gap) {
+						$found_correct_1=1;
+						$correct_index_1 = $i; 
+					}
+				} else { # second read
+					if(abs($pos_2 - $left) <= $gap) {
+						$found_correct_2=1;
+						$correct_index_1 = $i; 
+					}
+				}
+			}
+		} else {
+			warn("[dwgsim_eval] read '$t[0]' was not generated by dwgsim?\n");
+			next;
+		}
+	}
+
+	# was the max AS the correct one?
+	if(0 < $found_correct_1 && $correct_index_1 == $max_AS_index_1) {
+		$is_correct_1=1;
+	}
+	if(0 < $found_correct_2 && $correct_index_2 == $max_AS_index_2) {
+		$is_correct_2=1;
+	}
+
+	# check pair ???
+	if(0 < $found_correct_1 && 0 < $found_correct_2) {
+		$found_pair_correct = 1;
+	}
+	if(1 == $is_correct_1 && 1 == $is_correct_2) {
+		$is_pair_correct = 1;
+	}
+
+	return ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct);
+}
diff --git a/scripts/galaxy/dwgsim_eval_wrapper.py b/scripts/galaxy/dwgsim_eval_wrapper.py
new file mode 100644
index 0000000..23e84b0
--- /dev/null
+++ b/scripts/galaxy/dwgsim_eval_wrapper.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+
+"""
+Runs DWGSIM_EVAL
+
+usage: dwgsim_eval_wrapper.py [options]
+    -a,--alignmentScore=a: split alignments by alignment score instead of mapping quality
+    -b,--bwa=b: alignments are from BWA (SOLiD only)
+    -c,--colorSpace=c: color space alignments
+    -d,--scoreFactor=d: divide quality/alignment score by this factor
+    -g,--wiggle=g: gap "wiggle"
+    -n,--numReads=n: number of raw input paired-end reads (otherwise, inferred from all SAM records present).
+    -q,--minMapq=q: consider only alignments with this mapping quality or greater.
+    -z,--singleEnd=z: input contains only single end reads
+    -S,--sam=s: input SAM file
+    -B,--bam=B: input BAM file
+    -p,--printIncorrect=p: print incorrect alignments
+    -s,--numSnps=s: consider only alignments with the number of specified SNPs
+    -e,--numErrors=e: consider only alignments with the number of specified errors
+    -i,--indels=i: consider only alignments with indels
+    -o,--output=u: The file to save the output 
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def run_process ( cmd, name, tmp_dir, buffsize ):
+    try:
+        tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        raise Exception, 'Error in \'' + name + '\'. \n' + str( e )
+
+def check_output ( output, canBeEmpty ):
+    if 0 < os.path.getsize( output ):
+        return True
+    elif False == canBeEmpty:
+        raise Exception, 'The output file is empty:' + output
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+
+    parser.add_option( '-a', '--alignmentScore', action='store_true', dest='alignmentScore', default=False, help='split alignments by alignment score instead of mapping quality' )
+    parser.add_option( '-b', '--bwa', action='store_true', dest='bwa', default=False, help='alignments are from BWA (SOLiD only)' )
+    parser.add_option( '-c', '--colorSpace', action='store_true', dest='colorSpace', default=False, help='generate reads in color space (SOLiD reads)' )
+    parser.add_option( '-d', '--scoreFactor', dest='scoreFactor', type='int', help='divide quality/alignment score by this factor' )
+    parser.add_option( '-g', '--wiggle', dest='wiggle', type='int', help='gap "wiggle"' )
+    parser.add_option( '-n', '--numReads', dest='numReads', type='int', help='number of raw input paired-end reads (otherwise, inferred from all SAM records present).' )
+    parser.add_option( '-q', '--minMapq', dest='minMapq', type='int', help='consider only alignments with this mapping quality or greater.' )
+    parser.add_option( '-z', '--singleEnd', action='store_true', dest='singleEnd', default=False, help='input contains only single end reads' )
+    parser.add_option( '-S', '--sam', dest='sam', default=None, help='input SAM' )
+    parser.add_option( '-B', '--bam', dest='bam', default=None, help='input BAM' )
+    parser.add_option( '-p', '--printIncorrect', action='store_true', dest='printIncorrect', default=False, help='print incorrect alignments' )
+    parser.add_option( '-s', '--numSnps', dest='numSnps', type="int", help='consider only alignments with the number of specified SNPs' )
+    parser.add_option( '-e', '--numErrors', dest='numErrors', type="int", default=False, help='consider only alignments with the number of specified errors' )
+    parser.add_option( '-i', '--indels', action='store_true', dest='indels', default=False, help='consider only alignments with indels' )
+    parser.add_option( '-o', '--output', dest='output', help='The file to save the output' )
+    
+    (options, args) = parser.parse_args()
+
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='dwgsim_eval 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'version' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( '%s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine DWGSIM_EVAL version\n' )
+
+    buffsize = 1048576
+
+    # make temp directory for dwgsim, requires trailing slash
+    tmp_dir = '%s/' % tempfile.mkdtemp()
+
+    #'generic' options used in all dwgsim commands here
+
+    try:
+        tmp_dir = '%s/' % tempfile.mkdtemp()
+        dwgsim_eval_cmd = 'dwgsim_eval'
+        if True == options.alignmentScore:
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' -a'
+        if True == options.bwa:
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' -b'
+        if True == options.colorSpace:
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' -c'
+        use_p = False
+        if 0 <= options.numSnps:
+            use_p = True
+            dwgsim_eval_cmd = dwgsim_eval_cmd + (' -s %s' % options.numSnps)
+        if 0 <= options.numErrors:
+            use_p = True
+            dwgsim_eval_cmd = dwgsim_eval_cmd + (' -e %s' % options.numErrors)
+        if True == options.indels:
+            use_p = True
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' -i'
+        if True == use_p or True == options.printIncorrect:
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' -p'
+        if True == options.singleEnd:
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' -z'
+        dwgsim_eval_cmd = '%s -d %s -g %s -n %s -q %s' % (dwgsim_eval_cmd, \
+                options.scoreFactor, \
+                options.wiggle, \
+                options.numReads, \
+                options.minMapq)
+        if None != options.sam:
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' -S ' + options.sam
+        elif None != options.bam:
+            dwgsim_eval_cmd = dwgsim_eval_cmd + ' ' + options.bam
+        else:
+            raise Exception, 'Input file was neither a SAM nor BAM'
+        dwgsim_eval_cmd = dwgsim_eval_cmd + ' > ' + options.output
+
+        # need to nest try-except in try-finally to handle 2.4
+        try:
+            # dwgsim
+            run_process ( dwgsim_eval_cmd, 'dwgsim', tmp_dir, buffsize )
+            # check that there are results in the output file
+            check_output ( options.output, False )
+            sys.stdout.write( 'DWGSIM_EVAL successful' )
+        except Exception, e:
+            stop_err( 'DWGSIM_EVAL failed.\n' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
diff --git a/scripts/galaxy/dwgsim_eval_wrapper.xml b/scripts/galaxy/dwgsim_eval_wrapper.xml
new file mode 100644
index 0000000..7765b83
--- /dev/null
+++ b/scripts/galaxy/dwgsim_eval_wrapper.xml
@@ -0,0 +1,73 @@
+<tool id="dwgsim" name="Evaluate simulated reads" version="1.0.0">
+  <description>from a SAM/BAM file using dwgsim_eval</description>
+  <command interpreter="python">
+	  if 'sam' == input.ext:
+		  dwgsim_wrapper.py \
+		  $alignmentScore \
+		  $bwa \
+		  $colorSpace \
+		  -d $scoreFactor \
+		  -g $wiggle \
+		  -n $numReads \
+		  -q $minMapq \
+		  $singleEnd \
+		  $printIncorrect \
+		  -s $numSnps \
+		  -e $numErrors \
+		  $indels \
+		  -s $input \
+		  -o $output
+      else:
+		  dwgsim_wrapper.py \
+		  $alignmentScore \
+		  $bwa \
+		  $colorSpace \
+		  -d $scoreFactor \
+		  -g $wiggle \
+		  -n $numReads \
+		  -q $minMapq \
+		  $singleEnd \
+		  $printIncorrect \
+		  -s $numSnps \
+		  -e $numErrors \
+		  $indels \
+		  -b $input \
+		  -o $output
+	       
+  </command>
+  <inputs>
+	<param format="sam,bam" name="input" type="data" label="SAM/BAM file to evaluate"/>
+	<param name="alignmentScore" type="boolean" truevalue="-a" falsevalue="" checked="False" label="split alignments by alignment score instead of mapping quality"/>
+	<param name="bwa" type="boolean" truevalue="-b" falsevalue="" checked="False" label="alignments are from BWA (SOLiD only)"/>
+	<param name="colorSpace" type="boolean" truevalue="-c" falsevalue="" checked="False" label="color space alignments"/>
+	<param name="scoreFactor" size="4" type="text" value="1">
+		<label>divide quality/alignment score by this factor</label>
+	</param> 
+	<param name="wiggle" size="4" type="text" value="5">
+		<label>gap "wiggle"</label>
+	</param> 
+	<param name="numReads" size="4" type="text" value="-1">
+		<label>number of raw input paired-end reads (otherwise, inferred from all SAM records present)</label>
+	</param>
+	<param name="minMapq" size="4" type="text" value="0">
+		<label>consider only alignments with this mapping quality or greater</label>
+	</param>
+	<param name="singleEnd" type="boolean" truevalue="-z" falsevalue="" checked="False" label="input contains only single end reads"/>
+	<param name="printIncorrect" type="boolean" truevalue="-p" falsevalue="" checked="False" label="print incorrect alignments"/>
+	<param name="numSnps" size="4" type="text" value="-1">
+		<label>consider only alignments with the number of specified SNPs</label>
+	</param>
+	<param name="numErrors" size="4" type="text" value="-1">
+		<label>consider only alignments with the number of specified errors</label>
+	</param>
+	<param name="indels" type="boolean" truevalue="-i"  falsevalue="" checked="False" label="consider only alignments with indels"/>
+   </inputs>
+  <outputs>
+    <data format="text" name="output"/>
+  </outputs>
+ 
+  <help>
+	  This tool evaluates simulated reads from DWGSIM.  For more information, please see https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation#Evaluating_mapping_-_dwgsim_eval
+  </help>
+ 
+</tool>
diff --git a/scripts/galaxy/dwgsim_wrapper.py b/scripts/galaxy/dwgsim_wrapper.py
new file mode 100644
index 0000000..20fce4e
--- /dev/null
+++ b/scripts/galaxy/dwgsim_wrapper.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+
+"""
+Runs DWGSIM
+
+usage: dwgsim_wrapper.py [options]
+    -e,--eRateFirstread=e: base/color error rate of the first read [0.020]
+    -E,--eRateSecondread=E: base/color error rate of the second read [0.020]
+    -d,--innerDist=d: inner distance between the two ends [500]
+    -s,--stdev=s: standard deviation [50]
+    -N,--numReads=N: number of read pairs [1000000]
+    -1,--loFirstread=1: length of the first read [70]
+    -2,--loSecondread=2: length of the second read [70]
+    -r,--mutRate=r: rate of mutations [0.0010]
+    -R,--fracIndels=R: fraction of mutations that are indels [0.10]
+    -X,--indelExt=X: probability an indel is extended [0.30]
+    -y,--randProb=y: probability of a random DNA read [0.10]
+    -n,--maxN=n: maximum number of Ns allowed in a given read [0]
+    -c,--platformType=c: generate reads for Illumina (nuc space), for SOLiD (in color space) or Ion Torrent 
+    -S,--strand=S: strand 0: default, 1: same strand, 2: opposite strand
+    -f,--flowOrder=f: the flow order for Ion Torrent data [(null)]
+    -O,--splitOutput=O: which output needs to be reported, one: one single; two: two files; all: all output options
+    -H,--haploid=H: haploid mode
+    -i,--input=i: the reference genome FASTA
+    -3,--outputBFAST=3: the BFAST output FASTQ
+    -4,--outputBWA1=4: the first BWA output FASTQ
+    -5,--outputBWA2=5: the second BWA output FASTQ
+    -6,--outputMutations=6: the output mutations TXT
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def run_process ( cmd, name, tmp_dir, buffsize ):
+    try:
+        tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        raise Exception, 'Error in \'' + name + '\'. \n' + str( e )
+
+def check_output ( output, canBeEmpty ):
+    if 0 < os.path.getsize( output ):
+        return True
+    elif False == canBeEmpty:
+        raise Exception, 'The output file is empty:' + output
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-e', '--eRateFirstread', dest='eRateFirstread', type='float', help='base/color error rate of the first read' )
+    parser.add_option( '-E', '--eRateSecondread', dest='eRateSecondread', type='float', help='base/color error rate of the second read' )
+    parser.add_option( '-d', '--innerDist', dest='innerDist', type='int', help='inner distance between the two ends' )
+    parser.add_option( '-s', '--stdev', dest='stdev', type='float', help='standard deviation' )
+    parser.add_option( '-N', '--numReads', dest='numReads', type='int', help='number of read pairs' )
+    parser.add_option( '-r', '--mutRate', dest='mutRate', type='float', help='rate of mutations' )
+    parser.add_option( '-R', '--fracIndels', dest='fracIndels', type='float', help='fraction of mutations that are indels' )
+    parser.add_option( '-X', '--indelExt', dest='indelExt', type='float', help='probability an indel is extended' )
+    parser.add_option( '-y', '--randProb', dest='randProb', type='float', help='probability of a random DNA read' )
+    parser.add_option( '-n', '--maxN', dest='maxN', type='int', help='maximum number of Ns allowed in a given read' )
+    parser.add_option( '-c', '--platformType', dest='platformType', type='int', help='Platform to simulate: 0: Illumina; 1: solid; 2: Ion Torrent' )
+    parser.add_option( '-S', '--strand', dest='strand', type='choice', default='0', choices=('0', '1', '2'), help='strand 0: default, 1: same strand, 2: opposite strand' )
+    parser.add_option( '-f', '--flowOrder', dest='flowOrder', type='int', default='2')
+    parser.add_option( '-H', '--haploid', action='store_true', dest='haploid', default=False, help='haploid mode' )
+    parser.add_option( '-i', '--input', dest='input', help='The reference genome fasta' )
+    parser.add_option( '-O', dest='splitoutput', type='choice', default='two',choices=('two', 'one', 'all'), help='one: one single; two: two files; all: all output options' )
+    parser.add_option( '-1', '--loFirstread', dest='loFirstread', type='int', help='length of the first read' )
+    parser.add_option( '-2', '--loSecondread', dest='loSecondread', type='int', help='length of the second read' )
+    parser.add_option( '-6', '--outputMutations', dest='outputMutations', help='the output mutations TXT' )
+    parser.add_option( '-4', '--outputBWA1', dest='outputBWA1', help='the first BWA output FASTQ' )
+    parser.add_option( '-5', '--outputBWA2', dest='outputBWA2', help='the second BWA output FASTQ' )
+    parser.add_option( '-3', '--outputBFAST', dest='outputBFAST', help='the BFAST output FASTQ' )
+							
+    (options, args) = parser.parse_args()
+
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='dwgsim 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'version' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( '%s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine DWGSIM version\n' )
+
+    buffsize = 1048576
+
+    # make temp directory for dwgsim, requires trailing slash
+    tmp_dir = '%s/' % tempfile.mkdtemp()
+
+    #'generic' options used in all dwgsim commands here
+
+    try:
+        reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
+        os.symlink( options.input, reference_filepath )
+        assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
+        tmp_dir = '%s/' % tempfile.mkdtemp()
+        dwgsim_output_prefix = "dwgsim_output_prefix"
+        dwgsim_cmd = 'dwgsim -e %s -E %s -d %s -s %s -N %s -1 %s -2 %s -r %s -R %s -X %s -y %s -c %s -n %s' % \
+                (options.eRateFirstread, \
+                options.eRateSecondread, \
+                options.innerDist, \
+                options.stdev, \
+                options.numReads, \
+                options.loFirstread, \
+                options.loSecondread, \
+                options.mutRate, \
+                options.fracIndels, \
+                options.indelExt, \
+                options.randProb, \
+                options.platformType, \
+                options.maxN)
+        if options.haploid:
+            dwgsim_cmd = dwgsim_cmd + ' -H'
+        if options.platformType == "2":
+            dwgsim_cmd = dwgsim_cmd + options.flowOrder
+        dwgsim_cmd = dwgsim_cmd + ' ' + options.input
+        dwgsim_cmd = dwgsim_cmd + ' ' + tmp_dir + dwgsim_output_prefix
+        
+        # temporary file to check the command being launched
+        tmpfile = open('/tmp/dwgsim.log', 'w')
+        tmpfile.write("%s" % dwgsim_cmd)
+
+        # need to nest try-except in try-finally to handle 2.4
+        try:
+			# launch dwgsim + accept mutations.txt output
+			run_process ( dwgsim_cmd, 'dwgsim', tmp_dir, buffsize )
+			cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.mutations.txt' + ' ' + options.outputMutations
+			run_process ( cmd, 'mv #1', tmp_dir, buffsize )
+			check_output ( options.outputMutations, True )
+			# check which output required and move files accordingly
+			tmpfile.write("\nsplitoutput: %s \ncondition one: %s \ncondition two: %s \ncondition all: %s\n" % (options.splitoutput, str((options.splitoutput == 'one')), str((options.splitoutput == 'two')), str((options.splitoutput == 'all'))))
+			if  any( [options.splitoutput == 'two' , options.splitoutput == 'all'] ) :	# two files
+				cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read1.fastq' + ' ' + options.outputBWA1
+				run_process ( cmd, 'mv #3', tmp_dir, buffsize )
+				cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read2.fastq' + ' ' + options.outputBWA2
+				run_process ( cmd, 'mv #4', tmp_dir, buffsize )
+				tmpfile.write("\nmoved!!: %s" % options.splitoutput)
+				# check that there are results in the output file
+				check_output ( options.outputBWA1, False )
+				check_output ( options.outputBWA2, False )
+			if any( [options.splitoutput == 'one' , options.splitoutput == 'all'] ):
+				cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bfast.fastq' + ' ' + options.outputBFAST
+				run_process ( cmd, 'mv #2', tmp_dir, buffsize )
+				# check that there are results in the output file
+				check_output ( options.outputBFAST, False )
+			sys.stdout.write( 'DWGSIM successful' )
+        except Exception, e:
+			stop_err( 'DWGSIM failed.\n' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
diff --git a/scripts/galaxy/dwgsim_wrapper.xml b/scripts/galaxy/dwgsim_wrapper.xml
new file mode 100644
index 0000000..3a6de6c
--- /dev/null
+++ b/scripts/galaxy/dwgsim_wrapper.xml
@@ -0,0 +1,129 @@
+<tool id="dwgsim" name="Generate simulated reads" version="1.1.0">
+  <description>from a fasta sequence using dwgsim</description>
+  <command interpreter="python">
+	#if $outputFormat.splitOutput == "two"  #dwgsim_wrapper.py \
+		-4 $outputPEFirst \
+	    -5 $outputPESecond \
+	#else if $outputFormat.splitOutput == "one" #dwgsim_wrapper.py \
+		-3 $outputSingleReads \
+	#else  #dwgsim_wrapper.py \
+		-3 $outputSingleReads \
+		-4 $outputPEFirst \
+	    -5 $outputPESecond \
+	#end if 
+	  -6 $outputMutations \
+	  -e $eRateFirstread \
+	  -E $eRateSecondread \
+	  -d $innerDist \
+	  -s $stDev \
+	  -N $noReadpairs \
+	  -1 $loFirstread \
+	  -2 $loSecondread \
+	  -r $rateOfMut \
+	  -R $rateOfIndels \
+	  -X $probExtRead \
+	  -y $probRandRead \
+	  -n $maxN \
+	  -S $strandDirection \
+	  -c $platform.platformType	\
+	  #if $platform.platformType == "2" #
+	  -f $platform.flowOrder
+	  #end if	  
+	  $haploMode \
+	  -O $outputFormat.splitOutput  \
+	  -i $input \ 
+
+  </command>
+  <inputs>
+	<param format="fasta" name="input" type="data" label="Select fasta file to generate reads from"/>
+	<conditional name="platform">
+	<param name="platformType" type="select" label="Select the platform to simulate reads for">
+		<option value="0" selected="True">Illumina</option>
+		<option value="1" >SOLiD (Color space)</option>
+		<option value="2" >Ion Torrent</option>
+	</param>
+	<when value="0"/>
+	<when value="1"/>
+	<when value="2">
+		<param name="flowOrder" type="text" size="12" value="2">
+			<label> Flow order for Ion Torrent data </label>
+		</param>
+	</when>
+	</conditional>
+ 	<conditional name="outputFormat">
+		<param name="splitOutput" type="select" label="Output mate reads in">
+			<option value="two" selected="True">Two separate files</option>
+			<option value="one" >In a single file (~bfast)</option>
+			<option value="all" >Both output options</option>
+		</param>
+		<when value="two"/>
+		<when value="one"/>
+		<when value="all"/>
+	</conditional>
+	<param name="noReadpairs" size="12" type="text" value="1000000">
+		<label>number of read pairs</label>
+	</param>
+	<param name="noReadpairs" size="12" type="text" value="1000000">
+		<label>number of read pairs</label>
+	</param>
+	<param name="loFirstread" size="4" type="text" value="70">
+		<label>length of the first read </label>
+	</param>
+	<param name="loSecondread" size="4" type="text" value="70">
+		<label>length of the second read </label>
+	</param>
+	<param name="eRateFirstread" size="4" type="text" value="0.020">
+		<label>base/color error rate of the first read</label>
+	</param>
+	<param name="eRateSecondread" size="4" type="text" value="0.020">
+		<label>base/color error rate of the second read</label>
+	</param>
+	<param name="innerDist" size="4" type="text" value="500">
+		<label>inner distance between the two ends</label>
+	</param>
+	<param name="stDev" size="4" type="text" value="50">
+		<label>Standard deviation on the inner distance between two ends</label>
+	</param>	
+	<param name="rateOfMut" size="4" type="text" value="0.0010">
+		<label>rate of mutations</label>
+	</param>
+	<param name="rateOfIndels" size="4" type="text" value="0.10">
+		<label>fraction of mutations that are indels</label>
+	</param>
+	<param name="probExtRead" size="4" type="text" value="0.30">
+		<label>probability an indel is extended</label>
+	</param>
+	<param name="probRandRead" size="4" type="text" value="0.10">
+		<label>probability of a random DNA read</label>
+	</param>
+	<param name="maxN" size="4" type="text" value="0">
+		<label>maximum number of Ns allowed in a given read</label>
+	</param>
+	<param name="strandDirection" type="select" label="Direction of strand (default: opposite strand for Illumina, same strand for SOLiD?Ion Torrent)">
+		<option value="0" selected="True">Default</option>
+		<option value="1">Same strand (mate pair)</option>
+		<option value="2">Opposite strand (paired end)</option>
+	</param>
+	<param name="haploMode" type="boolean" truevalue="-H" falsevalue="" checked="False" label="Haplotype mode"/>
+   
+</inputs>
+ 
+  <outputs>
+    <data format="fastqsanger" name="outputSingleReads" label="Simulated reads from ${on_string}">
+		<filter>(outputFormat['splitOutput'] == 'one') | (outputFormat['splitOutput'] == 'all')</filter>
+	</data>
+    <data format="fastqsanger" name="outputPEFirst" label="Simulated forw reads from ${on_string}">
+		<filter>(outputFormat['splitOutput'] == 'two') | (outputFormat['splitOutput'] == 'all') </filter> 
+	</data>
+    <data format="fastqsanger" name="outputPESecond" label="Simulated rev reads from ${on_string}">
+		<filter>(outputFormat['splitOutput'] == 'two') | (outputFormat['splitOutput'] == 'all')</filter>
+	</data>
+    <data format="text" name="outputMutations" label="Mutation report of reads from ${on_string}"/>
+  </outputs>
+ 
+  <help>
+This tool simulate reads from a given fasta file.
+For more information, please see https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation
+  </help>
+ 
+</tool>
diff --git a/src/contigs.c b/src/contigs.c
new file mode 100644
index 0000000..f2965f7
--- /dev/null
+++ b/src/contigs.c
@@ -0,0 +1,54 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include "contigs.h"
+
+contigs_t* contigs_init()
+{
+  return calloc(1, sizeof(contigs_t));
+}
+
+void contigs_add(contigs_t *c, char *name, uint32_t len)
+{
+  c->n++;
+  c->contigs = realloc(c->contigs, c->n * sizeof(contig_t));
+  c->contigs[c->n-1].name = strdup(name);
+  c->contigs[c->n-1].len = len;
+}
+
+void contigs_destroy(contigs_t *c)
+{
+  int32_t i;
+  for(i=0;i<c->n;i++) {
+      free(c->contigs[i].name);
+  }
+  free(c->contigs);
+  free(c);
+} 
diff --git a/src/contigs.h b/src/contigs.h
new file mode 100644
index 0000000..5bce731
--- /dev/null
+++ b/src/contigs.h
@@ -0,0 +1,23 @@
+#ifndef CONTIGS_H_
+#define CONTIGS_H_
+
+typedef struct {
+    char *name;
+    int32_t len;
+} contig_t;
+
+typedef struct {
+    contig_t *contigs;
+    int32_t n;
+} contigs_t;
+
+contigs_t* 
+contigs_init();
+
+void 
+contigs_add(contigs_t *c, char *name, uint32_t len);
+
+void 
+contigs_destroy(contigs_t *c);
+
+#endif
diff --git a/src/dwgsim.c b/src/dwgsim.c
new file mode 100644
index 0000000..daef8e7
--- /dev/null
+++ b/src/dwgsim.c
@@ -0,0 +1,1085 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+/* Original Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+/* This program is separated from maq's read simulator with Colin
+ * Hercus' modification to allow longer indels. Colin is the chief
+ * developer of novoalign. */
+
+/* This program was modified by Nils Homer for inclusion in the 
+ * DNAA package */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "regions_bed.h"
+#include "dwgsim_opt.h"
+#include "dwgsim.h"
+//#include <config.h>
+
+uint8_t nst_nt4_table[256] = {
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
+};
+
+#define __gen_read(x, start, iter) do {									\
+    for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) {	\
+        mut_t c = currseq->s[i], mut_type = c & mutmsk;			\
+        if (ext_coor[x] < 0) {								\
+            if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
+            ext_coor[x] = i;								\
+            if(1 == strand[x]) ext_coor[x] -= s[x]-1; \
+        }													\
+        if (mut_type == DELETE) { \
+            ++n_indel[x];				\
+            if(1 == strand[x]) ext_coor[x]--; \
+            if(0 == k) n_indel_first[x]++; \
+        } \
+        else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
+            tmp_seq[x][k++] = c & 0xf;						\
+            if (mut_type == SUBSTITUTE) { \
+                ++n_sub[x];			\
+                if(0 == k) n_sub_first[x]++; \
+            } 												\
+        } else {											\
+            mut_t n, ins;										\
+            assert(mut_type == INSERT); \
+            ++n_indel[x];									\
+            n_indel_first[x]++;							\
+            if(1 == mut_get_ins(currseq, i, &n, &ins)) { \
+                if(0 == strand[x]) { \
+                    while(n > 0 && k < s[x]) { \
+                        tmp_seq[x][k++] = ins & 0x3;                \
+                        --n, ins >>= 2; \
+                    } \
+                    if(k < s[x]) tmp_seq[x][k++] = c & 0xf;						\
+                } else { \
+                    tmp_seq[x][k++] = c & 0xf;						\
+                    while(n > 0 && k < s[x]) { \
+                        ext_coor[x]++; \
+                        tmp_seq[x][k++] = (ins >> ((n-1) << 1) & 0x3);                \
+                        --n; \
+                    } \
+                } \
+            } else { \
+                int32_t byte_index, bit_index; \
+                uint32_t num_ins; \
+                uint8_t *insertion = NULL; \
+                insertion = mut_get_ins_long_n(currseq->ins[ins], &num_ins); \
+                if(0 == strand[x]) { \
+                    byte_index = mut_packed_len(num_ins) - 1; bit_index = 3 - (num_ins & 3); \
+                    while(num_ins > 0 && k < s[x]) { \
+                        assert(0 <= byte_index); \
+                        tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3;                \
+                        --num_ins; \
+                        bit_index--; \
+                        if (bit_index < 0) { \
+                            bit_index = 3; \
+                            byte_index--; \
+                        } \
+                    } \
+                    if(k < s[x]) tmp_seq[x][k++] = c & 0xf;						\
+                } else { \
+                    tmp_seq[x][k++] = c & 0xf;						\
+                    byte_index = 0; bit_index = 0; \
+                    while(num_ins > 0 && k < s[x]) { \
+                        ext_coor[x]++; \
+                        tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3;                \
+                        --num_ins; \
+                        bit_index++; \
+                        if (4 == bit_index) { \
+                            bit_index = 0; \
+                            byte_index++; \
+                        } \
+                    } \
+                } \
+            }													\
+        } \
+    }														\
+    if (k != s[x]) ext_coor[x] = -10;						\
+    if (1 == strand[x]) { \
+        for (k = 0; k < s[x]; ++k) tmp_seq[x][k] = tmp_seq[x][k] < 4? 3 - tmp_seq[x][k] : 4; \
+    } 														\
+} while (0)
+
+/* Simple normal random number generator, copied from genran.c */
+
+double ran_normal()
+{ 
+  static int iset = 0; 
+  static double gset; 
+  double fac, rsq, v1, v2; 
+  if (iset == 0) {
+      do { 
+          v1 = 2.0 * drand48() - 1.0;
+          v2 = 2.0 * drand48() - 1.0; 
+          rsq = v1 * v1 + v2 * v2;
+      } while (rsq >= 1.0 || rsq == 0.0);
+      fac = sqrt(-2.0 * log(rsq) / rsq); 
+      gset = v1 * fac; 
+      iset = 1;
+      return v2 * fac;
+  } else {
+      iset = 0;
+      return gset;
+  }
+}
+
+int32_t ran_num(double prob, int32_t n)
+{
+  int32_t i, r;
+  for(i = r = 0; i < n; i++) {
+      if(drand48() < prob) r++;
+  }
+  return r;
+}
+
+int32_t get_muttype(char *str)
+{
+  int32_t i;
+  for(i=0;i<strlen(str);i++) {
+      str[i] = tolower(str[i]);
+  }
+  if(0 == strcmp("snp", str) || 0 == strcmp("substitute", str) || 0 == strcmp("sub", str) || 0 == strcmp("s", str)) {
+      return SUBSTITUTE;
+  }
+  else if(0 == strcmp("insertion", str) || 0 == strcmp("insert", str) || 0 == strcmp("ins", str) || 0 == strcmp("i", str)) {
+      return INSERT;
+  }
+  else if(0 == strcmp("deletion", str) || 0 == strcmp("delet", str) || 0 == strcmp("del", str) || 0 == strcmp("d", str)) {
+      return DELETE;
+  }
+  return -1;
+}
+
+char iupac_and_base_to_mut(char iupac, char base)
+{
+  static char *codes = "XACMGRSVTWYHKDBN";
+  int32_t i;
+  int32_t b = nst_nt4_table[(int)base];
+  for(i=0;i<4;i++) { 
+      if(codes[1<<(b&0x3) | 1<<(i&0x3)] == iupac) {
+          return "ACGTN"[i];
+      }
+  }
+  return 'X';
+}
+
+char bases_to_iupac(char b1, char b2)
+{
+  int32_t a1, a2, n;
+  a1 = nst_nt4_table[(int)b1];
+  a2 = nst_nt4_table[(int)b2];
+  if(4 == a1 || 4 == a2) return 'X';
+  if(a1 == b1) return 'X';
+  if(a1 < a2) {
+      n = a1 + (a2 << 2);
+  }
+  else {
+      n = a2 + (a1 << 2);
+  }
+  switch(n) {
+    case 4: return 'M'; // 0 + 4*1 = M
+    case 8: return 'R'; // 0 + 4*2 = R
+    case 12: return 'W'; // 0 + 4*3 = W
+    case 9: return 'S'; // 1 + 4*2 = S
+    case 13: return 'Y'; // 1 + 4*3 = Y
+    case 14: return 'K'; // 2 + 4*3 = K
+    default: break;
+  }
+  return 'X';
+}
+
+/* Error-checking open, copied from utils.c */
+#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
+
+FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
+{
+  FILE *fp = 0;
+  if (strcmp(fn, "-") == 0)
+    return (strstr(mode, "r"))? stdin : stdout;
+  if ((fp = fopen(fn, mode)) == 0) {
+      fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
+      abort();
+  }
+  return fp;
+}
+
+/* dwgsim */
+
+#define __gen_errors_mismatches(_cur_seq, _j, _start, _iter, _len) do { \
+    for (i = (_start); 0 <= i && i < _len; _iter) { \
+        mut_t c = _cur_seq[_j][i]; \
+        if (c >= 4) c = 4; \
+        else if(drand48() < e[_j]->start + e[_j]->by*i) { \
+            c = (c + (mut_t)(drand48() * 3.0 + 1)) & 3; \
+            ++n_err[_j]; \
+            if(0 == i) ++n_err_first[_j]; \
+        } \
+        _cur_seq[_j][i] = c; \
+    } \
+} while(0)
+
+int32_t
+generate_errors_flows(dwgsim_opt_t *opt, uint8_t **seq, uint8_t **mask, int32_t *mem, int32_t len, uint8_t strand, double e, int32_t *_n_err)
+{
+  int32_t i, j, k, hp_l, flow_i, n_err;
+  uint8_t prev_c, c;
+
+  // remove Ns
+  for(i=0;i<len;i++) {
+      if(4 <= (*seq)[i]) {
+          (*seq)[i] = 0;
+      }
+  }
+
+  if(1 == strand) {
+      for(i=0;i<len>>1;i++) {
+          c = (*seq)[i];
+          (*seq)[i] = (*seq)[len-i-1];
+          (*seq)[len-i-1] = c;
+      }
+  }
+
+  // skip forward to the first flow
+  for(i=0;i<opt->flow_order_len;i++) {
+      c = (4 <= (*seq)[0]) ? 0 : (*seq)[0];
+      if(c == opt->flow_order[i]) {
+          break;
+      }
+      (*mask)[i] = 0;
+  }
+  assert(opt->flow_order_len != i);
+  flow_i = i;
+  prev_c = 4;
+  for(i=0;i<len;i++) {
+      c = (4 <= (*seq)[i]) ? 0 : (*seq)[i];
+      while(c != opt->flow_order[flow_i]) { // skip paste empty flows
+          (*mask)[flow_i] = 0;
+          flow_i = (flow_i + 1) % opt->flow_order_len;
+      }
+      if(prev_c != c) { // new hp
+          (*mask)[flow_i] = 0;
+          n_err = 0;
+          while(drand48() < e) { // how many bases should we insert/delete
+              n_err++;
+          }
+          if(0 < n_err) {
+              if(drand48() < 0.5) { // insert
+                  // more memory
+                  while((*mem) <= len + n_err) {
+                      (*mem) <<= 1; // double
+                      (*seq) = realloc((*seq), sizeof(uint8_t) * (*mem));
+                      (*mask) = realloc((*mask), sizeof(uint8_t) * (*mem));
+                  }
+                  // shift up
+                  for(j=len-1;i<=j;j--) {
+                      (*seq)[j+n_err] = (*seq)[j];
+                  }
+                  for(j=i;j<i+n_err;j++) {
+                      (*seq)[j] = c;
+                  }
+                  len += n_err;
+              }
+              else {
+                  int32_t next_c = 4;
+                  // get the hp length
+                  for(j=i,hp_l=0;j<len;j++,hp_l++) {
+                      next_c = (4 <= (*seq)[j]) ? 0 : (*seq)[j];
+                      if(c != next_c) {
+                          break;
+                      }
+                  }
+                  // bound errors by the hp length
+                  n_err = (hp_l < n_err) ? hp_l : n_err;
+                  // shift down
+                  for(j=i;j<len-n_err;j++) {
+                      (*seq)[j] = (*seq)[j+n_err];
+                  }
+                  len -= n_err;
+                  (*mask)[flow_i] = 1;
+                  // Note: we need to make sure that if we delete all the bases,
+                  // that the neighboring bases are not the same.  If they are,
+                  // we need to simulate a "dot-fill", whereby we add a
+                  // non-incorporating flow.
+                  if(n_err == hp_l && (0 == i || prev_c == next_c)) {
+                      j = 0;
+                      // get the number of subsequent flows until the next base
+                      while(next_c != opt->flow_order[(flow_i + j) % opt->flow_order_len]) { // skip paste empty flows
+                          j++;
+                      }
+                      assert(0 < j);
+                      // pick one to fill in
+                      k = (int)(drand48() * j);
+                      // shift up
+                      for(j=len-1;i<=j;j--) {
+                          (*seq)[j+1] = (*seq)[j];
+                      }
+                      // add the filled in base
+                      (*seq)[i] = opt->flow_order[(flow_i + k) % opt->flow_order_len];
+                      len++;
+                  }
+              }
+              (*_n_err) += n_err;
+          }
+          prev_c = c;
+      }
+  }
+
+  // second pass: add empty flows
+  prev_c = 4;
+  for(i=0;i<len;i++) {
+      c = (4 <= (*seq)[i]) ? 0 : (*seq)[i];
+      while(c != opt->flow_order[flow_i]) {
+          n_err = 0;
+          while(drand48() < e) {
+              n_err++;
+          }
+          if(0 == (*mask)[flow_i] && 0 < n_err) {  // insert
+              // more memory
+              while((*mem) <= len + n_err) {
+                  (*mem) <<= 1; // double
+                  (*seq) = realloc((*seq), sizeof(uint8_t) * (*mem));
+                  (*mask) = realloc((*mask), sizeof(uint8_t) * (*mem));
+              }
+              // shift up
+              for(j=len-1;i<=j;j--) {
+                  (*seq)[j+n_err] = (*seq)[j];
+              }
+              // copy
+              for(j=i;j<i+n_err;j++) {
+                  (*seq)[j] = opt->flow_order[flow_i];
+              }
+              len += n_err;
+              (*_n_err) += n_err;
+          }
+          flow_i = (flow_i + 1) % opt->flow_order_len;
+      }
+  }
+
+  if(1 == strand) {
+      for(i=0;i<len>>1;i++) {
+          c = (*seq)[i];
+          (*seq)[i] = (*seq)[len-i-1];
+          (*seq)[len-i-1] = c;
+      }
+  }
+
+  return len;
+}
+
+void dwgsim_core(dwgsim_opt_t * opt)
+{
+  seq_t seq;
+  mutseq_t *mutseq[2]={NULL,NULL};
+  uint64_t tot_len, ii=0, ctr=0;
+  int i, l, m, n_ref, contig_i;
+  char name[1024], *qstr;
+  int32_t name_len_max=0;
+  int size[2], prev_skip=0, qstr_l=0;
+  int num_n[2];
+  uint8_t *tmp_seq[2]={NULL,NULL};
+  uint8_t *tmp_seq_flow_mask[2]={NULL,NULL};
+  int32_t tmp_seq_mem[2]={0,0};
+  int64_t n_sim = 0;
+  error_t *e[2]={NULL,NULL};
+  FILE *fp_muts_input = NULL;
+  FILE *fp_regions_bed = NULL;
+  muts_input_t *muts_input = NULL;
+  regions_bed_txt *regions_bed = NULL;
+  contigs_t *contigs = NULL;
+
+  e[0] = &opt->e[0]; e[1] = &opt->e[1];
+
+  INIT_SEQ(seq);
+  seq_set_block_size(0x1000000);
+  l = opt->length[0] > opt->length[1]? opt->length[0] : opt->length[1];
+  qstr_l = l;
+  qstr = (char*)calloc(qstr_l+1, 1);
+  tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
+  tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
+  if(IONTORRENT == opt->data_type) {
+      tmp_seq_flow_mask[0] = (uint8_t*)calloc(l+2, 1);
+      tmp_seq_flow_mask[1] = (uint8_t*)calloc(l+2, 1);
+  }
+  tmp_seq_mem[0] = tmp_seq_mem[1] = l+2;
+  size[0] = opt->length[0]; size[1] = opt->length[1];
+  
+  if(0 <= opt->fn_muts_input_type) {
+      contigs = contigs_init();
+  }
+  
+  if(NULL != opt->fn_regions_bed) {
+      fp_regions_bed = xopen(opt->fn_regions_bed, "r");
+      if(NULL == contigs) contigs = contigs_init();
+  }
+
+  tot_len = n_ref = 0;
+  mut_print_header_pre(opt->fp_vcf);
+  if(NULL != opt->fp_fai) {
+      int dummy_int[3];
+      while(0 < fscanf(opt->fp_fai, "%s\t%d\t%d\t%d\t%d", name, &l, &dummy_int[0], &dummy_int[1], &dummy_int[2])) {
+          fprintf(stderr, "[dwgsim_core] %s length: %d\n", name, l);
+          tot_len += l;
+          ++n_ref;
+          mut_print_header_contig(opt->fp_vcf, name, l);
+          if(NULL != contigs) {
+              contigs_add(contigs, name, l);
+          }
+      }
+  }
+  else {
+      while ((l = seq_read_fasta(opt->fp_fa, &seq, name, 0)) >= 0) {
+          fprintf(stderr, "[dwgsim_core] %s length: %d\n", name, l);
+          tot_len += l;
+          ++n_ref;
+          mut_print_header_contig(opt->fp_vcf, name, l);
+          if(NULL != contigs) {
+              contigs_add(contigs, name, l);
+          }
+      }
+  }
+  fprintf(stderr, "[dwgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
+  rewind(opt->fp_fa);
+  mut_print_header_post(opt->fp_vcf);
+
+  if(0 <= opt->fn_muts_input_type) {
+      fp_muts_input = xopen(opt->fn_muts_input, "r");
+      muts_input = muts_input_init(fp_muts_input, contigs, opt->fn_muts_input_type); // read in the mutation file
+  }
+  
+  if(NULL != opt->fn_regions_bed) {
+      regions_bed = regions_bed_init(fp_regions_bed, contigs);
+      // recalculate the total length
+      tot_len = 0;
+      for(i=0;i<regions_bed->n;i++) {
+          tot_len += regions_bed->end[i] - regions_bed->start[i] + 1;
+      }
+  }
+  if(NULL != contigs) {
+      contigs_destroy(contigs);
+      contigs = NULL;
+  }
+
+  if(0 == opt->muts_only) {
+      fprintf(stderr, "[dwgsim_core] Currently on: \n0");
+  }
+  else {
+      fprintf(stderr, "[dwgsim_core] Currently on:");
+  }
+  contig_i = 0;
+  while ((l = seq_read_fasta(opt->fp_fa, &seq, name, 0)) >= 0) {
+      int64_t n_pairs = 0;
+      n_ref--;
+      
+      if(1 == opt->muts_only) {
+          fprintf(stderr, "\r[dwgsim_core] Currently on: %s", name);
+          if(name_len_max < strlen(name)) {
+              name_len_max = strlen(name);
+          } 
+          else {
+              for(i=0;i<name_len_max-strlen(name);i++) {
+                  fputc(' ', stderr);
+              }
+          }
+      }
+
+      if(0 == opt->muts_only) {
+          if(0 == n_ref && opt->C < 0) {
+              n_pairs = opt->N - n_sim;
+          }
+          else {
+              if(NULL != regions_bed) {
+                  // recalculate l
+                  m = 0;
+                  for(i=0;i<regions_bed->n;i++) {
+                      if(contig_i == regions_bed->contig[i]) {
+                          m += regions_bed->end[i] - regions_bed->start[i] + 1;
+                      }
+                  }
+                  if(0 == m) {
+                      fprintf(stderr, "[dwgsim_core] #0 skip sequence '%s' as it is not in the targeted region\n", name);
+                      contig_i++;
+                      continue; // skip this region
+                  }
+                  l = m;
+
+                  int num_n = 0;
+                  for(i=0;i<regions_bed->n;i++) {           
+                      if(contig_i == regions_bed->contig[i]) {                    
+                          int m;                                                                            
+                          for(m=regions_bed->start[i];m<=regions_bed->end[i];m++) {                                               
+                              switch (seq.s[m-1]) {                                                                                                             
+                                case 'a':
+                                case 'A':
+                                case 'c':
+                                case 'C':
+                                case 'g':
+                                case 'G':
+                                case 't':
+                                case 'T':
+                                  break;                                                                                                                                                                         default:     
+                                    num_n++;                                                                                                                                                                         break;     
+                              }                                                                                                                                                                            }                  
+                      }                                                                                                                                                                            } 
+                  if(0.95 < num_n / (double)l) { // TODO: arbitrary cutoff
+                      fprintf(stderr, "[dwgsim_core] #1 skip sequence '%s' as %d out of %d bases are non-ACGT\n", name, num_n, l);
+                      contig_i++;
+                      continue;
+                  }
+              }
+              if(0 < opt->N) {
+                  // based on -N
+                  n_pairs = (uint64_t)((long double)l / tot_len * opt->N + 0.5);
+                  if(opt->N - n_sim < n_pairs) n_pairs = opt->N - n_sim; // make sure we don't simulate too many reads
+              }
+              else {
+                  // based on coverage, with added random reads
+                  n_pairs = (uint64_t)(l * opt->C / ((long double)(size[0] + size[1])) / (1.0 - opt->rand_read) + 0.5);
+              }
+          }
+
+          // for paired end/mate pair, make sure we have enough bases in this
+          // sequence
+          if (0 < opt->length[1] && l < opt->dist + 3 * opt->std_dev) {
+              if(0 == prev_skip) fprintf(stderr, "\n");
+              prev_skip = 1;
+              fprintf(stderr, "[dwgsim_core] #2 skip sequence '%s' as it is shorter than %f!\n", name, opt->dist + 3 * opt->std_dev);
+              contig_i++;
+              continue;
+          }
+          else if (l < opt->length[0] || (0 < opt->length[1] && l < opt->length[1])) {
+              if(0 == prev_skip) fprintf(stderr, "\n");
+              prev_skip = 1;
+              fprintf(stderr, "[dwgsim_core] #3 skip sequence '%s' as it is shorter than %d!\n", name, (l < opt->length[0]) ? opt->length[0] : opt->length[1]);
+              contig_i++;
+              continue;
+          }
+          else if (n_pairs < 0) { // NB: this should not happen
+              // not enough pairs
+              continue;
+          }
+          prev_skip = 0;
+      }
+
+      // generate mutations and print them out
+      mutseq[0] = mutseq_init(); mutseq[1] = mutseq_init();
+      mut_diref(opt, &seq, mutseq[0], mutseq[1], contig_i, muts_input);
+      mut_print(name, &seq, mutseq[0], mutseq[1], opt->fp_mut, opt->fp_vcf);
+
+      if(0 == opt->muts_only) {
+          for (ii = 0; ii != n_pairs; ++ii, ++ctr) { // the core loop
+              if(0 == (ctr % 10000)) {
+                  fprintf(stderr, "\r[dwgsim_core] %llu",
+                          (unsigned long long int)ctr);
+              }
+              double ran;
+              int d, pos, s[2], strand[2];
+              int n_sub[2], n_indel[2], n_err[2], ext_coor[2]={0,0}, j, k;
+              int n_sub_first[2], n_indel_first[2], n_err_first[2]; // need this for SOLID data
+              int c1, c2, c;
+
+              s[0] = size[0]; s[1] = size[1];
+
+              if(opt->rand_read < drand48()) { 
+
+                  if(NULL == regions_bed) {
+                      do { // avoid boundary failure
+                          if(0 < s[1]) { // paired end/mate pair
+                              ran = ran_normal();
+                              ran = ran * opt->std_dev + opt->dist;
+                              d = (int)(ran + 0.5);
+                          }
+                          else {
+                              d = 0;
+                          }
+                          pos = (int)((l - d + 1) * drand48());
+                      } while (pos < 0 
+                               || pos >= seq.l 
+                               || pos + d - 1 >= seq.l 
+                               || (0 < s[1] && 0 == opt->is_inner && ((0 < s[0] && d <= s[1]) || (d <= s[0] && 0 < s[1]))));
+                  } 
+                  else {
+                      do { // avoid boundary failure
+                          if(0 < s[1]) {
+                              ran = ran_normal();
+                              ran = ran * opt->std_dev + opt->dist;
+                              d = (int)(ran + 0.5);
+                          }
+                          else {
+                              d = 0;
+                          }
+                          pos = (int)((l - d + 1) * drand48());
+                          // convert in the bed file
+                          for(i=0;i<regions_bed->n;i++) { // TODO: regions are in sorted order... so optimize
+                              if(contig_i == regions_bed->contig[i]) {
+                                  j = regions_bed->end[i] - regions_bed->start[i] + 1;
+                                  if(pos < j) {
+                                      pos = regions_bed->start[i] + pos - 1; // zero-based
+                                      break;
+                                  }
+                                  else {
+                                      pos -= j;
+                                  }
+                              }
+                          }
+                      } while (pos < 0 
+                               || pos >= seq.l 
+                               || pos + d - 1 >= seq.l 
+                               || (0 < s[1] && 0 == opt->is_inner && ((0 < s[0] && d <= s[1]) || (d <= s[0] && 0 < s[1])))
+                               || 0 == regions_bed_query(regions_bed, contig_i, pos, pos + s[0] + s[1] + d - 1));
+                  }
+
+                  // generate the read sequences
+                  mutseq_t *currseq = mutseq[drand48()<opt->mut_freq?0:1]; // haplotype from which the reads are generated
+                  n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
+                  n_sub_first[0] = n_sub_first[1] = n_indel_first[0] = n_indel_first[1] = n_err_first[0] = n_err_first[1] = 0;
+                  num_n[0]=num_n[1]=0;
+
+                  // strand
+                  if(2 == opt->strandedness || (0 == opt->strandedness && ILLUMINA == opt->data_type)) {
+                      // opposite strand by default for Illumina
+                      strand[0] = 0; strand[1] = 1; 
+                  }
+                  else if(1 == opt->strandedness || (0 == opt->strandedness && (SOLID == opt->data_type || IONTORRENT == opt->data_type))) {
+                      // same strands by default for SOLiD
+                      strand[0] = 0; strand[1] = 0; 
+                  }
+                  else {
+                      // should not reach here
+                      assert(1 == 0);
+                  }
+                  if (drand48() < 0.5) { // which strand ?
+                      // Flip strands 
+                      strand[0] = (1 + strand[0]) % 2;
+                      strand[1] = (1 + strand[1]) % 2;
+                  }
+
+                  // generate the reads in base space
+                  if(0 < s[1]) { // paired end or mate pair
+                      if(strand[0] == strand[1]) { // same strand
+                          if(0 == strand[0]) { // + strand
+                              /*
+                               * 5' E2 -----> .... E1 -----> 3'
+                               * 3'           ....           5'
+                               */
+                              if(0 == opt->is_inner) {
+                                  __gen_read(0, pos + d - s[0], ++i); 
+                              }
+                              else {
+                                  __gen_read(0, pos + s[1] + d, ++i); 
+                              }
+                              __gen_read(1, pos, ++i);
+                          }
+                          else { // - strand
+                              /*
+                               * 3'           ....            5'
+                               * 5' <----- E1 .... <----- E2  3'
+                               */
+                              __gen_read(0, pos + s[0], --i);
+                              if(0 == opt->is_inner) {
+                                  __gen_read(1, pos + d, --i);
+                              }
+                              else {
+                                  __gen_read(1, pos + s[0] + d + s[1], --i);
+                              }
+                          }
+                      }
+                      else { // opposite strand
+                          if(0 == strand[0]) { // + strand
+                              /*
+                               * 5' E1 -----> ....           3'
+                               * 3'           .... <----- E2 5'
+                               */
+                              __gen_read(0, pos, ++i);
+                              if(0 == opt->is_inner) {
+                                  __gen_read(1, pos + d, --i);
+                              }
+                              else {
+                                  __gen_read(1, pos + s[0] + d + s[1], --i);
+                              }
+                          }
+                          else { // - strand
+                              /*
+                               * 5' E2 -----> ....           3'
+                               * 3'           .... <----- E1 5'
+                               */
+                              if(0 == opt->is_inner) {
+                                  __gen_read(0, pos + d, --i);
+                              }
+                              else {
+                                  __gen_read(0, pos + s[1] + d + s[0], --i); 
+                              }
+                              __gen_read(1, pos, i++);
+                          }
+                      }
+                  }
+                  else { // fragment
+                      if(0 == strand[0]) {
+                          __gen_read(0, pos, ++i); // + strand
+                      }
+                      else {
+                          __gen_read(0, pos + s[0] - 1, --i); // - strand
+                      }
+                  }
+
+                  // Count # of Ns
+                  for (j = 0; j < 2; ++j) {
+                      num_n[j]=0;
+                      if(0 < s[j]) {
+                          for (i = 0; i < s[j]; ++i) {
+                              if(tmp_seq[j][i] == 4) num_n[j]++;
+                          }
+                      }
+                  }
+
+                  if (ext_coor[0] < 0 || ext_coor[1] < 0 || opt->max_n < num_n[0] || opt->max_n < num_n[1]) { // fail to generate the read(s)
+                      --ii;
+                      --ctr;
+                      continue;
+                  }
+
+                  if(SOLID == opt->data_type) {
+                      // Convert to color sequence, use the first base as the adaptor
+                      for (j = 0; j < 2; ++j) {
+                          if(0 < s[j]) {
+                              c1 = 0; // adaptor 
+                              for (i = 0; i < s[j]; ++i) {
+                                  c2 = tmp_seq[j][i]; // current base
+                                  c = __gf_add(c1, c2);
+                                  tmp_seq[j][i] = c;
+                                  c1 = c2; // save previous base
+                              }
+                          }
+                      }
+                  }
+
+                  // generate sequencing errors
+                  if(IONTORRENT == opt->data_type) {
+                      s[0] = generate_errors_flows(opt, &tmp_seq[0], &tmp_seq_flow_mask[0], &tmp_seq_mem[0], s[0], strand[0], e[0]->start, &n_err[0]);
+                      s[1] = generate_errors_flows(opt, &tmp_seq[1], &tmp_seq_flow_mask[1], &tmp_seq_mem[1], s[1], strand[1], e[1]->start, &n_err[1]);
+                  }
+                  else { // Illumina/SOLiD
+                      if(0 < s[0]) {
+                          if(0 == strand[0]) { 
+                              __gen_errors_mismatches(tmp_seq, 0, 0, ++i, s[0]); 
+                          }
+                          else { 
+                              __gen_errors_mismatches(tmp_seq, 0, s[0]-1, --i, s[0]); 
+                          }
+                      }
+                      if(0 < s[1]) {
+                          if(0 == strand[1]) { 
+                              __gen_errors_mismatches(tmp_seq, 1, 0, ++i, s[1]); 
+                          }
+                          else { 
+                              __gen_errors_mismatches(tmp_seq, 1, s[1]-1, --i, s[1]); 
+                          }
+                      }
+                  }
+
+                  // print
+                  for (j = 0; j < 2; ++j) {
+                      if(s[j] <= 0) {
+                          continue;
+                      }
+                      if(IONTORRENT == opt->data_type && qstr_l < s[j]) {
+                          qstr_l = s[j];
+                          qstr = realloc(qstr, (1+qstr_l) * sizeof(char));
+                      }
+                      if(NULL != opt->fixed_quality) {
+                          for (i = 0; i < s[j]; ++i) {
+                              qstr[i] = opt->fixed_quality[0];
+                          }
+                      }
+                      else {
+                          for (i = 0; i < s[j]; ++i) {
+                              qstr[i] = (int)(-10.0 * log(e[j]->start + e[j]->by*i) / log(10.0) + 0.499) + 33;
+                              if(0 < opt->quality_std) {
+                                  qstr[i] += (int)((ran_normal() * opt->quality_std) + 0.5);
+                                  if(qstr[i] < 33) qstr[i] = 33;
+                                  if(73 < qstr[i]) qstr[i] = 73;
+                              }
+                          }
+                      }
+                      qstr[i] = 0;
+                      // BWA
+                      FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
+                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                  (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                  (NULL == opt->read_prefix) ? "" : "_",
+                                  name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+                                  n_err[0], n_sub[0], n_indel[0],
+                                  n_err[1], n_sub[1],n_indel[1],
+                                  (long long)ii, j+1);
+                          for (i = 0; i < s[j]; ++i)
+                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+                          fprintf(fpo, "\n+\n%s\n", qstr);
+                      }
+                      else {
+                          // Note: BWA ignores the adapter and the first color, so this is a misrepresentation 
+                          // in samtools.  We must first skip the first color.  Basically, a 50 color read is a 
+                          // 49 color read for BWA.
+                          //
+                          // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+                          // annotated as read "1".
+                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                  (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                  (NULL == opt->read_prefix) ? "" : "_",
+                                  name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+                                  n_err[0] - n_err_first[0], n_sub[0] - n_sub_first[0], n_indel[0] - n_indel_first[0], 
+                                  n_err[1] - n_err_first[1], n_sub[1] - n_sub_first[1], n_indel[1] - n_indel_first[1],
+                                  (long long)ii, 2 - j);
+                          //fputc('A', fpo);
+                          for (i = 1; i < s[j]; ++i)
+                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+                          fprintf(fpo, "\n+\n");
+                          for (i = 1; i < s[j]; ++i) 
+                            fputc(qstr[i], fpo);
+                          fprintf(fpo, "\n");
+                      }
+
+                      // BFAST output
+                      fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", 
+                              (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                              (NULL == opt->read_prefix) ? "" : "_",
+                              name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+                              n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+                              (long long)ii);
+                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                          for (i = 0; i < s[j]; ++i)
+                            fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
+                          fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+                      }
+                      else {
+                          fputc('A', opt->fp_bfast);
+                          for (i = 0; i < s[j]; ++i)
+                            fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
+                          fprintf(opt->fp_bfast, "\n+\n");
+                          for (i = 0; i < s[j]; ++i) 
+                            fputc(qstr[i], opt->fp_bfast);
+                          fprintf(opt->fp_bfast, "\n");
+                      }
+                  }
+                  n_sim++;
+              }
+              else { // random DNA read
+                  for(j=0;j<2;j++) {
+                      if(s[j] <= 0) {
+                          continue;
+                      } 
+                      if(IONTORRENT == opt->data_type && qstr_l < s[j]) {
+                          qstr_l = s[j];
+                          qstr = realloc(qstr, (1+qstr_l) * sizeof(char));
+                      }
+                      // get random sequence
+                      for (i = 0; i < s[j]; ++i) {
+                          tmp_seq[j][i] = (int)(drand48() * 4.0) & 3;
+                      }
+                      if(NULL != opt->fixed_quality) {
+                          for (i = 0; i < s[j]; ++i) {
+                              qstr[i] = opt->fixed_quality[0];
+                          }
+                      }
+                      else {
+                          for (i = 0; i < s[j]; ++i) {
+                              qstr[i] = (int)(-10.0 * log(e[j]->start + e[j]->by*i) / log(10.0) + 0.499) + 33;
+                              if(0 < opt->quality_std) {
+                                  qstr[i] += (int)((ran_normal() * opt->quality_std) + 0.5);
+                                  if(qstr[i] < 33) qstr[i] = 33;
+                                  if(73 < qstr[i]) qstr[i] = 73;
+                              }
+                          }
+                      }
+                      qstr[i] = 0;
+                      if(SOLID == opt->data_type) { // convert to color space
+                          if(0 < s[j]) {
+                              c1 = 0; // adaptor 
+                              for (i = 0; i < s[j]; ++i) {
+                                  c2 = tmp_seq[j][i]; // current base
+                                  c = __gf_add(c1, c2);
+                                  tmp_seq[j][i] = c;
+                                  c1 = c2; // save previous base
+                              }
+                          }
+                      }
+                      // BWA
+                      FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
+                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                  (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                  (NULL == opt->read_prefix) ? "" : "_",
+                                  "rand", 0, 0, 0, 0, 1, 1,
+                                  0, 0, 0, 0, 0, 0,
+                                  (long long)ii,
+                                  j+1);
+                          for (i = 0; i < s[j]; ++i)
+                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+                          fprintf(fpo, "\n+\n%s\n", qstr);
+                      }
+                      else {
+                          // Note: BWA ignores the adapter and the first color, so this is a misrepresentation 
+                          // in samtools.  We must first skip the first color.  Basically, a 50 color read is a 
+                          // 49 color read for BWA.
+                          //
+                          // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+                          // annotated as read "1".
+                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                  (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                  (NULL == opt->read_prefix) ? "" : "_",
+                                  "rand", 0, 0, 0, 0, 1, 1,
+                                  0, 0, 0, 0, 0, 0,
+                                  (long long)ii, 2 - j);
+                          //fputc('A', fpo);
+                          for (i = 1; i < s[j]; ++i)
+                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+                          fprintf(fpo, "\n+\n");
+                          for (i = 1; i < s[j]; ++i) 
+                            fputc(qstr[i], fpo);
+                          fprintf(fpo, "\n");
+                      }
+
+                      // BFAST output
+                      fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", 
+                              (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                              (NULL == opt->read_prefix) ? "" : "_",
+                              "rand", 0, 0, 0, 0, 1, 1,
+                              0, 0, 0, 0, 0, 0,
+                              (long long)ii);
+                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                          for (i = 0; i < s[j]; ++i)
+                            fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
+                          fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+                      }
+                      else {
+                          fputc('A', opt->fp_bfast);
+                          for (i = 0; i < s[j]; ++i)
+                            fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
+                          fprintf(opt->fp_bfast, "\n+\n");
+                          for (i = 0; i < s[j]; ++i) 
+                            fputc(qstr[i], opt->fp_bfast);
+                          fprintf(opt->fp_bfast, "\n");
+                      }
+                  }
+                  n_sim++;
+              }
+          }
+          fprintf(stderr, "\r[dwgsim_core] %llu",
+                  (unsigned long long int)ctr);
+      }
+      mutseq_destroy(mutseq[0]);
+      mutseq_destroy(mutseq[1]);
+      contig_i++;
+  }
+  fprintf(stderr, "\n[dwgsim_core] Complete!\n");
+
+  free(seq.s); free(qstr);
+  free(tmp_seq[0]); free(tmp_seq[1]);
+  if(IONTORRENT == opt->data_type) {
+      free(tmp_seq_flow_mask[0]); free(tmp_seq_flow_mask[1]);
+  }
+  if(0 <= opt->fn_muts_input_type) {
+      muts_input_destroy(muts_input);
+  }
+  if(NULL != opt->fn_regions_bed) {
+      regions_bed_destroy(regions_bed);
+      fclose(fp_regions_bed);
+  }
+}
+
+int main(int argc, char *argv[])
+{
+  dwgsim_opt_t *opt = NULL;
+
+  // update the mutant sequence bounds
+  mutseq_init_bounds();
+
+  opt = dwgsim_opt_init();
+
+  char fn_fai[1024]="\0";
+  char fn_tmp[1024]="\0";
+
+  if(0 == dwgsim_opt_parse(opt, argc, argv)) {
+      return dwgsim_opt_usage(opt);
+  }
+
+  // Open files
+  opt->fp_fa =	xopen(argv[optind+0], "r");
+  strcpy(fn_fai, argv[optind+0]); strcat(fn_fai, ".fai");
+  opt->fp_fai = fopen(fn_fai, "r"); // NB: depends on returning NULL;
+  strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.txt");
+  opt->fp_mut = xopen(fn_tmp, "w");
+  strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.vcf");
+  opt->fp_vcf = xopen(fn_tmp, "w");
+  if(0 == opt->muts_only) {
+      strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bfast.fastq");
+      opt->fp_bfast = xopen(fn_tmp, "w");
+      strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read1.fastq");
+      opt->fp_bwa1 = xopen(fn_tmp, "w");
+      strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read2.fastq");
+      opt->fp_bwa2 = xopen(fn_tmp, "w");
+  }
+
+  // Run simulation
+  dwgsim_core(opt);
+
+  // Close files
+  if(0 == opt->muts_only) {
+      fclose(opt->fp_fa); fclose(opt->fp_bfast); fclose(opt->fp_bwa1); fclose(opt->fp_bwa2); 
+  }
+  if(NULL != opt->fp_fai) fclose(opt->fp_fai);
+  fclose(opt->fp_mut);
+  fclose(opt->fp_vcf);
+
+  dwgsim_opt_destroy(opt);
+
+  return 0;
+}
diff --git a/src/dwgsim.h b/src/dwgsim.h
new file mode 100644
index 0000000..f771736
--- /dev/null
+++ b/src/dwgsim.h
@@ -0,0 +1,26 @@
+#ifndef DWGSIM_H
+#define DWGSIM_H
+
+#include "dwgsim_opt.h"
+
+#define __gf_add(_x, _y) ((_x >= 4 || _y >= 4) ? 4 : (_x ^ _y))
+#define __IS_TRUE(_val) ((_val == 1) ? "True" : "False")
+
+extern uint8_t nst_nt4_table[256];
+
+enum data_type_t {
+    ILLUMINA=0,
+    SOLID=1,
+    IONTORRENT=2
+};
+
+char iupac_and_base_to_mut(char iupac, char base);
+
+char bases_to_iupac(char b1, char b2);
+
+int32_t get_muttype(char *str);
+
+int32_t
+generate_errors_flows(dwgsim_opt_t *opt, uint8_t **seq, uint8_t **mask, int32_t *mem, int32_t len, uint8_t strand, double e, int32_t *_n_err);
+
+#endif
diff --git a/src/dwgsim_eval.c b/src/dwgsim_eval.c
new file mode 100755
index 0000000..4bf4288
--- /dev/null
+++ b/src/dwgsim_eval.c
@@ -0,0 +1,698 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <limits.h>
+#include <unistd.h>
+#include <float.h>
+#include <sys/resource.h>
+#include <math.h>
+#include "samtools/bam.h"
+#include "samtools/sam.h"
+#include "dwgsim_eval.h"
+
+#define __IS_TRUE(_val) ((_val == 1) ? "True" : "False")
+
+int 
+print_usage(dwgsim_eval_args_t *args)
+{
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Program: dwgsim_eval (short read simulation evaluator)\n");
+  fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+  fprintf(stderr, "Contact: Nils Homer <dnaa-help at lists.sourceforge.net>\n\n");
+  fprintf(stderr, "Usage: dwgsim_eval [options] <in.sam/in.bam>\n\n");
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-a\tINT\tsplit by [%d]:\n", args->a);
+  fprintf(stderr, "\t\t\t\t\t0: by mapping quality\n");
+  fprintf(stderr, "\t\t\t\t\t1: by alignment score\n");
+  fprintf(stderr, "\t\t\t\t\t2: by suboptimal alignment score\n");
+  fprintf(stderr, "\t\t\t\t\t3: by alignment score - suboptimal alignment score\n");
+  fprintf(stderr, "\t-b\t\talignments are from BWA (SOLiD only) [%s]\n", __IS_TRUE(args->b));
+  fprintf(stderr, "\t-c\t\tcolor space alignments [%s]\n", __IS_TRUE(args->c));
+  fprintf(stderr, "\t-d\tINT\tdivide quality/alignment score by this factor [%d]\n", args->d);
+  fprintf(stderr, "\t-g\t\tgap \"wiggle\" [%d]\n", args->g);
+  fprintf(stderr, "\t-m\t\tconsecutive alignments with the same name (and end for multi-ends) should be treated as multi-mapped reads [%s]\n", __IS_TRUE(args->m));
+  fprintf(stderr, "\t-n\tINT\tnumber of raw input paired-end reads (otherwise, inferred from all SAM records present) [%d]\n", args->n);
+  fprintf(stderr, "\t-q\tINT\tconsider only alignments with this mapping quality or greater [%d]\n", args->q);
+  fprintf(stderr, "\t-z\t\tinput contains only single end reads [%s]\n", __IS_TRUE(args->z));
+  fprintf(stderr, "\t-S\t\tinput is SAM [%s]\n", __IS_TRUE(args->S));
+  fprintf(stderr, "\t-p\t\tprint incorrect alignments [%s]\n", __IS_TRUE(args->p));
+  fprintf(stderr, "\t-s\tINT\tconsider only alignments with the number of specified SNPs [%d]\n", args->s);
+  fprintf(stderr, "\t-e\tINT\tconsider only alignments with the number of specified errors [%d]\n", args->e);
+  fprintf(stderr, "\t-i\t\tconsider only alignments with indels [%s]\n", __IS_TRUE(args->i));
+  fprintf(stderr, "\t-P\tSTRING\ta read prefix that was prepended to each read name [%s]\n", (NULL == args->P) ? "not using" : args->P);
+  fprintf(stderr, "\t-h\t\tprint this help message\n");
+  return 1;
+}
+
+/* Action */
+enum {Exit, Warn, LastActionType};
+/* Type */
+enum {  
+    Dummy,
+    OutOfRange, /* e.g. command line args */
+    InputArguments, 
+    IllegalFileName,   
+    IllegalPath,
+    OpenFileError,
+    EndOfFile,
+    ReallocMemory,
+    MallocMemory,
+    ThreadError,
+    ReadFileError,
+    WriteFileError,
+    DeleteFileError,
+    LastErrorType,
+};       
+#define BREAK_LINE "************************************************************\n"
+void
+dwgsim_eval_print_error(char* FunctionName, char *VariableName, char* Message, int Action, int type)
+{
+  static char ErrorString[][20]=
+    { "\0", "OutOfRange", "InputArguments", "IllegalFileName", "IllegalPath", "OpenFileError", "EndOfFile", "ReallocMemory", "MallocMemory", "ThreadError", "ReadFileError", "WriteFileError", "DeleteFileError"};
+  static char ActionType[][20]={"Fatal Error", "Warning"};
+  fprintf(stderr, "%s\rIn function \"%s\": %s[%s]. ",
+          BREAK_LINE, FunctionName, ActionType[Action], ErrorString[type]);
+
+  /* Only print variable name if is available */
+  if(VariableName) {
+      fprintf(stderr, "Variable/Value: %s.\n", VariableName);
+  }
+  /* Only print message name if is available */
+  if(Message) {
+      fprintf(stderr, "Message: %s.\n", Message);
+  }
+  if(type == ReadFileError ||
+     type == OpenFileError ||
+     type == WriteFileError) {
+      perror("The file stream error was:");
+  }
+
+  switch(Action) {
+    case Exit:
+      fprintf(stderr, " ***** Exiting due to errors *****\n");
+      fprintf(stderr, "%s", BREAK_LINE);
+      exit(EXIT_FAILURE);
+      break; /* Not necessary actually! */
+    case Warn:
+      fprintf(stderr, " ***** Warning *****\n");
+      fprintf(stderr, "%s", BREAK_LINE);
+      break;
+    default:
+      fprintf(stderr, "Trouble!!!\n");
+      fprintf(stderr, "%s", BREAK_LINE);
+  }
+}
+
+
+int 
+main(int argc, char *argv[])
+{
+  char c;
+  dwgsim_eval_args_t args;
+
+  args.a = args.b = args.c = args.i = args.m = args.n = args.p = args.q = args.z = 0; 
+  args.d = 1;
+  args.e = -1;
+  args.g = 5;
+  args.s = -1;
+  args.S = 0;
+  args.P = NULL;
+
+  while(0 <= (c = getopt(argc, argv, "a:d:e:g:m:n:q:s:bchimpzSP:"))) {
+      switch(c) {
+        case 'a': args.a = atoi(optarg); break;
+        case 'b': args.b = 1; break;
+        case 'c': args.c = 1; break;
+        case 'd': args.d = atoi(optarg); break;
+        case 'g': args.g = atoi(optarg); break;
+        case 'm': args.m = 1; break;
+        case 'h': return print_usage(&args); break;
+        case 'n': args.n = atoi(optarg); break;
+        case 'q': args.q = atoi(optarg); break;
+        case 'z': args.z = 1; break;
+        case 'S': args.S = 1; break;
+        case 'p': args.p = 1; break;
+        case 's': args.s = atoi(optarg); break;
+        case 'e': args.e = atoi(optarg); break;
+        case 'i': args.i = 1; break;
+        case 'P': free(args.P); args.P = strdup(optarg); break;
+        default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 1;
+      }
+  }
+
+  if(argc == optind) {
+      return print_usage(&args);
+  }
+
+  run(&args, argc - optind, argv + optind);
+
+  free(args.P);
+
+  return 0;
+}
+
+void 
+run(dwgsim_eval_args_t *args,
+    int32_t num_files,
+    char *files[]) 
+{
+  char *FnName="run";
+  int32_t i, n = 0;
+  samfile_t *fp_in = NULL;
+  samfile_t *fp_out = NULL;
+  bam1_t *b=NULL;
+  dwgsim_eval_counts_t *counts;
+  char *prev_qname=NULL;
+  int32_t prev_end=-1;
+
+  // initialize counts
+  counts = dwgsim_eval_counts_init();
+
+  fprintf(stderr, "Analyzing...\nCurrently on:\n0");
+  for(i=0;i<num_files;i++) {
+      // Open the file
+      fp_in = samopen(files[i], (1 == args->S) ? "r" : "rb", 0); 
+      if(NULL == fp_in) {
+          dwgsim_eval_print_error(FnName, files[i], "Could not open file for reading", Exit, OpenFileError);
+      }
+
+      if(0 == i && 1 == args->p) {
+          fp_out = samopen("-", "wh", fp_in->header);
+          if(NULL == fp_out) {
+              dwgsim_eval_print_error(FnName, "stdout", "Could not open file stream for writing", Exit, OpenFileError);
+          }
+      }
+
+      b = bam_init1();
+      while(0 < samread(fp_in, b)) {
+          if(1 == args->m &&
+             NULL != prev_qname && 
+             prev_end == (BAM_FREAD1 & b->core.flag) &&
+             0 == strcmp(prev_qname, bam1_qname(b))) {
+              // do nothing
+          }
+          else {
+              if(1 == args->m) {
+                  free(prev_qname);
+                  prev_qname = strdup(bam1_qname(b));
+                  prev_end = (BAM_FREAD1 & b->core.flag);
+              }
+
+              process_bam(counts, args, fp_in->header, b, fp_out);
+
+              if((BAM_FPAIRED & b->core.flag)) { // paired end
+                  if(1 == args->z) { // expect single end
+                      dwgsim_eval_print_error(FnName, NULL, "Found a read that was paired end", Exit, OutOfRange);
+                  }
+                  if((BAM_FREAD1 & b->core.flag)) { // count # of pairs
+                      n++;
+                  }
+              }
+              else { // single end
+                  if(0 == args->z) { // expect paired end
+                      dwgsim_eval_print_error(FnName, NULL, "Found a read that was not paired", Exit, OutOfRange);
+                  }
+                  n++;
+              }
+
+
+              if(0 == (n % 10000)) {
+                  fprintf(stderr, "\r%lld", (long long int)n);
+              }
+          }
+
+          bam_destroy1(b);
+          b = bam_init1();
+      }
+      bam_destroy1(b);
+
+      // Close the file
+      samclose(fp_in);
+  }
+  free(prev_qname);
+
+  if(1 == args->p) samclose(fp_out);
+
+  fprintf(stderr, "\r%lld\n", (long long int)n);
+
+  if(0 < args->n) {
+      if(n != args->n) {
+          fprintf(stderr, "(-n)=%d\tn=%d\n", args->n, n);
+          dwgsim_eval_print_error(FnName, NULL, "Number of reads found differs from the number specified (-n)", Warn, OutOfRange);
+      }
+  }
+  if(0 == args->z) {
+      dwgsim_eval_counts_print(counts, args->a, args->d, (0 < args->n) ? 2*args->n : 2*n);
+  }
+  else {
+      dwgsim_eval_counts_print(counts, args->a, args->d, (0 < args->n) ? args->n : n);
+  }
+
+  dwgsim_eval_counts_destroy(counts);
+
+  fprintf(stderr, "Analysis complete.\n");
+}
+
+uint32_t bam_calclip(const bam1_t *b)
+{
+  const bam1_core_t *c = &b->core;
+  const uint32_t *cigar = bam1_cigar(b);
+  uint32_t k, end = 0;
+  for (k = 0; k < c->n_cigar; ++k) {
+      int op = cigar[k] & BAM_CIGAR_MASK;
+      if(op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) {
+        end += cigar[k] >> BAM_CIGAR_SHIFT;
+      }
+      else {
+        break;
+      }
+  }
+  return end;
+}
+
+
+void
+process_bam(dwgsim_eval_counts_t *counts,
+            dwgsim_eval_args_t *args,
+            bam_header_t *header,
+            bam1_t *b,
+            samfile_t *fp_out)
+{
+  char *FnName="process_bam";
+  int32_t left, metric=INT_MIN;
+  char *chr=NULL;
+  char *name=NULL, *ptr=NULL;
+  char chr_name[1028]="\0";
+  char read_num[1028]="\0";
+  int32_t pos_1, pos_2, str_1, str_2, rand_1, rand2; 
+  int32_t n_err_1, n_sub_1, n_indel_1, n_err_2, n_sub_2, n_indel_2;
+  int32_t pos, str, rand;
+  int32_t i, j, tmp;
+  int32_t predicted_value, actual_value;
+  int32_t clip;
+
+  // mapping quality threshold
+  if(b->core.qual < args->q) return;
+
+  // parse read name
+  name = strdup(bam1_qname(b));
+  ptr = name; // save to be freed
+  char *to_rm="_::_::_______"; // to remove
+  for(i=b->core.l_qname-1,j=0;0<=i && j<13;i--) { // replace with spaces 
+      if(name[i] == to_rm[j]) {
+          name[i] = ' '; j++; 
+      }
+  }
+  // check for the prefix
+  if(NULL != args->P) {
+      j = strlen(name);
+      tmp = strlen(args->P);
+      if(j < tmp || 0 != strncmp(args->P, name, tmp)) {
+          dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] could not match read name with given read name prefix (-P)", Exit, OutOfRange);
+          free(ptr);
+          return;
+      }
+      name += tmp + 1;
+  }
+  if(14 != sscanf(name, "%s %d %d %1d %1d %1d %1d %d %d %d %d %d %d %s",
+                  chr_name, &pos_1, &pos_2, &str_1, &str_2, &rand_1, &rand2,
+                  &n_err_1, &n_sub_1, &n_indel_1,
+                  &n_err_2, &n_sub_2, &n_indel_2,
+                  read_num)) {
+      dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] read was not generated by dwgsim?", Exit, OutOfRange);
+      free(ptr);
+      return;
+  }
+  // check for a prefix, and make sure it was removed correctly
+  if(1 == args->z || (b->core.flag & BAM_FREAD1)) {
+      rand = rand_1;
+  }
+  else {
+      rand = rand2;
+  }
+  if(0 == rand) {
+      for(j=0;j<header->n_targets;j++) {
+          i = strlen(name);
+          tmp = strlen(header->target_name[j]); 
+          i = (i < tmp) ? i : tmp;
+          if(0 == strncmp(name, header->target_name[j], i)) {
+              break;
+          }
+      }
+      if(j == header->n_targets) {
+          dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] the mapped contig does not exist in the SAM header; perhaps you have a read name prefix?", Exit, OutOfRange);
+      }
+  }
+  free(ptr);
+  ptr = name = NULL;
+
+  // get metric value
+  if(0 == args->a) {
+      metric = (b->core.qual / args->d); 
+      if(DWGSIM_EVAL_MAXQ < metric) metric = DWGSIM_EVAL_MAXQ;
+  }
+  else if((BAM_FUNMAP & b->core.flag) || 0 == b->core.qual) { // unmapped or zero quality
+      metric = DWGSIM_EVAL_MINAS;
+  }
+  else {
+      uint8_t *aux_AS=NULL, *aux_XS=NULL;
+      metric = DWGSIM_EVAL_MINAS+1;
+      if(1 == args->a || 3 == args->a) {
+          aux_AS = bam_aux_get(b, "AS");
+          if(NULL == aux_AS) {
+              metric = DWGSIM_EVAL_MINAS;
+          }
+      }
+      if(2 == args->a || 3 == args->a) {
+          aux_XS = bam_aux_get(b, "XS");
+          if(NULL == aux_XS) {
+              metric = DWGSIM_EVAL_MINAS;
+          }
+      }
+      if(metric != DWGSIM_EVAL_MINAS) {
+          switch(args->a) {
+            case 1:
+              metric = bam_aux2i(aux_AS);
+              break;
+            case 2:
+              metric = bam_aux2i(aux_XS);
+              break;
+            case 3:
+              metric = bam_aux2i(aux_AS) - bam_aux2i(aux_XS);
+              break;
+            default:
+              metric = DWGSIM_EVAL_MINAS;
+              break;
+          }
+      }
+  }
+  metric /= args->d;
+  if(metric < DWGSIM_EVAL_MINAS) metric = DWGSIM_EVAL_MINAS;
+
+  if(1 == args->i) { // indels only
+      if(1 == args->z || (b->core.flag & BAM_FREAD1)) {
+          if(0 == n_indel_1) return;
+      }
+      else {
+          if(0 == n_indel_2) return;
+      }
+  }
+  else if(0 <= args->e && n_err_1 !=  args->e) { // # of errors
+      return;
+  }
+  else if(0 <= args->s && n_sub_1 !=  args->s) { // # of snps
+      return;
+  }
+
+  if(1 == args->c && 1 == args->b) { // SOLiD and BWA
+      // Swap 1 and 2
+      tmp=n_err_1; n_err_1=n_err_2; n_err_2=tmp;
+      tmp=n_sub_1; n_sub_1=n_sub_2; n_sub_2=tmp;
+      tmp=n_indel_1; n_indel_1=n_indel_2; n_indel_2=tmp;
+  }
+
+  // copy data
+  if(1 == args->z || (b->core.flag & BAM_FREAD1)) {
+      pos = pos_1; str = str_1; rand = rand_1;
+  }
+  else {
+      pos = pos_2; str = str_2; rand = rand2;
+  }
+
+  // get the actual value 
+  if(1 == rand) {
+      actual_value = DWGSIM_EVAL_UNMAPPABLE;
+  }
+  else {
+      actual_value = DWGSIM_EVAL_MAPPABLE;
+  }
+
+  // get the predicted value
+  if((BAM_FUNMAP & b->core.flag)) { // unmapped
+      predicted_value = DWGSIM_EVAL_UNMAPPED;
+  }
+  else { // mapped (correctly?)
+      clip = bam_calclip(b); 
+      chr = header->target_name[b->core.tid];
+      left = b->core.pos - clip;
+
+      if(1 == rand // should not map 
+         || str != bam1_strand(b) // different strand
+         || 0 != strcmp(chr, chr_name)  // different chromosome
+         || args->g < fabs(pos - left)) { // out of bounds (positionally) 
+          predicted_value = DWGSIM_EVAL_MAPPED_INCORRECTLY;
+      }
+      else {
+          predicted_value = DWGSIM_EVAL_MAPPED_CORRECTLY;
+      }
+  }
+
+  dwgsim_eval_counts_add(counts, metric, actual_value, predicted_value);
+
+  // print incorrect alignments
+  if(1 == args->p && DWGSIM_EVAL_MAPPED_INCORRECTLY == predicted_value) {
+      if(samwrite(fp_out, b) <= 0) {
+          dwgsim_eval_print_error(FnName, "stdout", "Could not write to stream", Exit, WriteFileError);
+      }
+  }
+}
+
+dwgsim_eval_counts_t *
+dwgsim_eval_counts_init()
+{
+  dwgsim_eval_counts_t *counts;
+
+  counts = malloc(sizeof(dwgsim_eval_counts_t));
+
+  counts->min_score = counts->max_score = 0;
+
+  counts->mc = malloc(sizeof(int32_t)); assert(NULL != counts->mc);
+  counts->mi = malloc(sizeof(int32_t)); assert(NULL != counts->mi);
+  counts->mu = malloc(sizeof(int32_t)); assert(NULL != counts->mu);
+  counts->um = malloc(sizeof(int32_t)); assert(NULL != counts->um);
+  counts->uu = malloc(sizeof(int32_t)); assert(NULL != counts->uu);
+
+  counts->mc[0] = counts->mi[0] = counts->mu[0] = 0;
+  counts->um[0] = counts->uu[0] = 0;
+
+  return counts;
+}
+
+void
+dwgsim_eval_counts_destroy(dwgsim_eval_counts_t *counts)
+{
+  free(counts->mc);
+  free(counts->mi);
+  free(counts->mu);
+  free(counts->um);
+  free(counts->uu);
+  free(counts);
+}
+
+void 
+dwgsim_eval_counts_add(dwgsim_eval_counts_t *counts, int32_t score, int32_t actual_value, int32_t predicted_value)
+{
+  char *FnName="dwgsim_eval_counts_add";
+  int32_t i, m, n;
+  if(counts->max_score < score) {
+      m = score - counts->min_score + 1;
+      n = counts->max_score - counts->min_score + 1;
+
+      counts->mc = realloc(counts->mc, sizeof(int32_t)*m); assert(NULL != counts->mc);
+      counts->mi = realloc(counts->mi, sizeof(int32_t)*m); assert(NULL != counts->mi);
+      counts->mu = realloc(counts->mu, sizeof(int32_t)*m); assert(NULL != counts->mu);
+      counts->um = realloc(counts->um, sizeof(int32_t)*m); assert(NULL != counts->um);
+      counts->uu = realloc(counts->uu, sizeof(int32_t)*m); assert(NULL != counts->uu);
+
+      // initialize to zero
+      for(i=n;i<m;i++) {
+          counts->mc[i] = counts->mi[i] = counts->mu[i] = 0;
+          counts->um[i] = counts->uu[i] = 0;
+      }
+      counts->max_score = score;
+  }
+  else if(score < counts->min_score) {
+      m = counts->max_score - score + 1;
+      n = counts->max_score - counts->min_score + 1;
+
+      counts->mc = realloc(counts->mc, sizeof(int32_t)*m); assert(NULL != counts->mc);
+      counts->mi = realloc(counts->mi, sizeof(int32_t)*m); assert(NULL != counts->mi);
+      counts->mu = realloc(counts->mu, sizeof(int32_t)*m); assert(NULL != counts->mu);
+      counts->um = realloc(counts->um, sizeof(int32_t)*m); assert(NULL != counts->um);
+      counts->uu = realloc(counts->uu, sizeof(int32_t)*m); assert(NULL != counts->uu);
+
+      // shift up
+      for(i=m-1;m-n<=i;i--) {
+          counts->mc[i] = counts->mc[i-(m-n)]; 
+          counts->mi[i] = counts->mi[i-(m-n)]; 
+          counts->mu[i] = counts->mu[i-(m-n)]; 
+          counts->um[i] = counts->um[i-(m-n)]; 
+          counts->uu[i] = counts->uu[i-(m-n)]; 
+      }
+      // initialize to zero
+      for(i=0;i<m-n;i++) {
+          counts->mc[i] = counts->mi[i] = counts->mu[i] = 0;
+          counts->um[i] = counts->uu[i] = 0;
+      }
+      counts->min_score = score;
+  }
+
+  // check actual value
+  switch(actual_value) {
+    case DWGSIM_EVAL_MAPPABLE:
+    case DWGSIM_EVAL_UNMAPPABLE:
+      break;
+    default:
+      dwgsim_eval_print_error(FnName, "actual_value", "Could not understand actual value", Exit, OutOfRange);
+  }
+
+  // check predicted value
+  switch(predicted_value) {
+    case DWGSIM_EVAL_MAPPED_CORRECTLY:
+    case DWGSIM_EVAL_MAPPED_INCORRECTLY:
+    case DWGSIM_EVAL_UNMAPPED:
+      break;
+    default:
+      dwgsim_eval_print_error(FnName, "predicted_value", "Could not understand predicted value", Exit, OutOfRange);
+  }
+
+  switch(actual_value) {
+    case DWGSIM_EVAL_MAPPABLE:
+      switch(predicted_value) {
+        case DWGSIM_EVAL_MAPPED_CORRECTLY:
+          counts->mc[score-counts->min_score]++; break;
+        case DWGSIM_EVAL_MAPPED_INCORRECTLY:
+          counts->mi[score-counts->min_score]++; break;
+        case DWGSIM_EVAL_UNMAPPED:
+          counts->mu[score-counts->min_score]++; break;
+        default:
+          break; // should not reach here
+      }
+      break;
+    case DWGSIM_EVAL_UNMAPPABLE:
+      switch(predicted_value) {
+        case DWGSIM_EVAL_MAPPED_CORRECTLY:
+          dwgsim_eval_print_error(FnName, "predicted_value", "predicted value cannot be mapped correctly when the read is unmappable", Exit, OutOfRange); break;
+        case DWGSIM_EVAL_MAPPED_INCORRECTLY:
+          counts->um[score-counts->min_score]++; break;
+        case DWGSIM_EVAL_UNMAPPED:
+          counts->uu[score-counts->min_score]++; break;
+        default:
+          break; // should not reach here
+      }
+      break;
+    default:
+      break; // should not reach here
+  }
+}
+
+void 
+dwgsim_eval_counts_print(dwgsim_eval_counts_t *counts, int32_t a, int32_t d, int32_t n)
+{
+  int32_t i;
+  int32_t max = 0;
+  int32_t mc_sum, mi_sum, mu_sum, um_sum, uu_sum;
+  int32_t m_total, mm_total, u_total;
+  char format[1024]="\0";
+
+  mc_sum = mi_sum = mu_sum = um_sum = uu_sum = 0;
+  m_total = mm_total = u_total = 0;
+
+  // create the format string
+  for(i=counts->max_score - counts->min_score;0<=i;i--) {
+      m_total += counts->mc[i] + counts->mi[i] + counts->mu[i];
+      u_total += counts->um[i] + counts->uu[i];
+      max += counts->mc[i] + counts->mi[i] + counts->mu[i] + counts->um[i] + counts->uu[i];
+  }
+  max = 1 + log10(max);
+  strcat(format, "%.2d ");
+  for(i=0;i<12;i++) {
+      sprintf(format + (int)strlen(format), "%%%dd ", max);
+  }
+  strcat(format + (int)strlen(format), "%.3e %.3e %.3e %.3e %.3e %.3e\n");
+
+  // header
+  fprintf(stdout, "# thr | the minimum %s threshold\n", (0 == a) ? "mapping quality" : "alignment score");
+  fprintf(stdout, "# mc | the number of reads mapped correctly that should be mapped at the threshold\n");
+  fprintf(stdout, "# mi | the number of reads mapped incorrectly that should be mapped be mapped at the threshold\n");
+          
+  fprintf(stdout, "# mu | the number of reads unmapped that should be mapped be mapped at the threshold\n");
+          
+  fprintf(stdout, "# um | the number of reads mapped that should be unmapped be mapped at the threshold\n");
+  fprintf(stdout, "# uu | the number of reads unmapped that should be unmapped be mapped at the threshold\n");
+  fprintf(stdout, "# mc' + mi' + mu' + um' + uu' | the total number of reads mapped at the threshold\n");
+  fprintf(stdout, "# mc' | the number of reads mapped correctly that should be mapped at or greater than that threshold\n");
+  fprintf(stdout, "# mi' | the number of reads mapped incorrectly that should be mapped be mapped at or greater than that threshold\n");
+          
+  fprintf(stdout, "# mu' | the number of reads unmapped that should be mapped be mapped at or greater than that threshold\n");
+          
+  fprintf(stdout, "# um' | the number of reads mapped that should be unmapped be mapped at or greater than that threshold\n");
+  fprintf(stdout, "# uu' | the number of reads unmapped that should be unmapped be mapped at or greater than that threshold\n");
+  fprintf(stdout, "# mc' + mi' + mu' + um' + uu' | the total number of reads mapped at or greater than the threshold\n");
+          
+  fprintf(stdout, "# (mc / (mc' + mi' + mu')) | sensitivity: the fraction of reads that should be mapped that are mapped correctly at the threshold\n");
+  fprintf(stdout, "# (mc / mc' + mi') | positive predictive value: the fraction of mapped reads that are mapped correctly at the threshold\n");
+  fprintf(stdout, "# (um / (um' + uu')) | false discovery rate: the fraction of random reads that are mapped at the threshold\n");
+  fprintf(stdout, "# (mc' / (mc' + mi' + mu')) | sensitivity: the fraction of reads that should be mapped that are mapped correctly at or greater than the threshold\n");
+  fprintf(stdout, "# (mc' / mc' + mi') | positive predictive value: the fraction of mapped reads that are mapped correctly at or greater than the threshold\n");
+  fprintf(stdout, "# (um' / (um' + uu')) | false discovery rate: the fraction of random reads that are mapped at or greater than the threshold\n");
+
+
+  // print
+  for(i=counts->max_score - counts->min_score;0<=i;i--) {
+      double num, den;
+
+      mc_sum += counts->mc[i];
+      mi_sum += counts->mi[i];
+      mu_sum += counts->mu[i];
+      um_sum += counts->um[i];
+      uu_sum += counts->uu[i];
+      mm_total += counts->mc[i] + counts->mi[i];
+
+      /* Notes:
+       *  notice that the denominator for sensitivity (and fdr) for the "ge" 
+       *  (greater than or equal) threshold is the "total", while the denominator 
+       *  for ppv is "@ >= Q".  The reasoning behind this is ppv is a measure of
+       *  the quality of mappings that will be returned when using a Q threshold
+       *  while the sensitivity want to measure the fraction of mappings that
+       *  will be returned compared to the maximum.  Basically, if we accept
+       *  only mappings at a given threshold, and call the rest unmapped, what
+       *  happens?
+       *  - sensitivity tells us the # of correct mappings out of the total possible
+       *  mappings.
+       *  - ppv tells us the # of correct mappings out of the total mappings.
+       *  - fdr tells us the # of random mappings out of the total unmappable.
+       */
+       
+      // "at" sensitivity: mapped correctly @ Q / mappable @ Q
+      num = counts->mc[i];
+      den = counts->mc[i] + counts->mi[i] + counts->mu[i];
+      double sens_at_thr = (0 == den) ? 0. : (num / (double)den);  
+      // "ge" sensitivity: mapped correctly @ >= Q / total mappable
+      double sens_ge_thr = (0 == m_total) ? 0. : (mc_sum / (double)m_total);
+      
+      // "at" positive predictive value: mapped correctly @ Q / mappable and mapped @ Q
+      num = counts->mc[i];
+      den = counts->mc[i] + counts->mi[i];
+      double ppv_at_thr = (0 == den) ? 0. : (num / (double)den);
+      // "ge" positive predictive value: mapped correctly @ >= Q / mappable and mapped @ >= Q
+      double ppv_ge_thr = (0 == mm_total) ? 0. : (mc_sum / (double)mm_total);
+
+      // "at" false discovery rate: unmappable and mapped @ Q / unmappable @ Q
+      num = counts->um[i];
+      den = counts->um[i] + counts->uu[i];
+      double fdr_at_thr = (0 == den) ? 0. : (num / (double)den);
+      // "ge" false discovery rate: unmappable and mapped @ >= Q / unmappable @ >= Q
+      double fdr_ge_thr = (0 == u_total) ? 0. : (um_sum / (double)u_total);
+      
+      fprintf(stdout, format,
+              (i + counts->min_score)*d,
+              counts->mc[i], counts->mi[i], counts->mu[i], counts->um[i], counts->uu[i],
+              counts->mc[i] + counts->mi[i] + counts->mu[i] + counts->um[i] + counts->uu[i],
+              mc_sum, mi_sum, mu_sum, um_sum, uu_sum,
+              mc_sum + mi_sum + mu_sum + um_sum + uu_sum,
+              sens_at_thr, ppv_at_thr, fdr_at_thr,
+              sens_ge_thr, ppv_ge_thr, fdr_ge_thr);
+  }
+}
diff --git a/src/dwgsim_eval.h b/src/dwgsim_eval.h
new file mode 100644
index 0000000..ba85630
--- /dev/null
+++ b/src/dwgsim_eval.h
@@ -0,0 +1,66 @@
+#ifndef DWGSIM_EVAL_H_
+#define DWGSIM_EVAL_H_
+
+#define DWGSIM_EVAL_MAXQ 255
+#define DWGSIM_EVAL_MINAS -5000
+
+typedef struct {
+    int32_t a; // alignment score or not
+    int32_t b; // bwa or not
+    int32_t c; // color space or not
+    int32_t d; // divide by factor
+    int32_t e; // print only alignments with # of errors
+    int32_t g; // gap "wiggle"
+    int32_t i; // indel only
+    int32_t m; // multi-mapped
+    int32_t n; // # of pe alignments
+    int32_t p; // print incorrect alignments or not
+    int32_t q; // consider only alignments with this mapping quality or greater
+    int32_t s; // print only alignments with # of SNPs 
+    int32_t z; // input reads are single end
+    int32_t S; // input reads are in text SAM format
+    char *P; // read name prefix
+} dwgsim_eval_args_t;
+
+// Actual value
+enum {
+    DWGSIM_EVAL_MAPPABLE   =0,
+    DWGSIM_EVAL_UNMAPPABLE =1
+};
+
+// Prediction
+enum {
+    DWGSIM_EVAL_MAPPED_CORRECTLY   =0,
+    DWGSIM_EVAL_MAPPED_INCORRECTLY =1,
+    DWGSIM_EVAL_UNMAPPED           =2
+};
+
+typedef struct {
+    int32_t *mc; // mappable && mapped correctly
+    int32_t *mi; // mappable && mapped incorrectly
+    int32_t *mu; // mappable && unmapped
+    int32_t *um; // unmappable && mapped
+    int32_t *uu; // unmappable && unmapped
+    int32_t min_score, max_score;
+} dwgsim_eval_counts_t;
+
+void 
+run(dwgsim_eval_args_t *args,
+         int32_t num_files,
+         char *files[]);
+void
+process_bam(dwgsim_eval_counts_t *counts,
+                    dwgsim_eval_args_t *args,
+                    bam_header_t *header,
+                    bam1_t *b,
+                    samfile_t *fp_out);
+dwgsim_eval_counts_t *
+dwgsim_eval_counts_init();
+void
+dwgsim_eval_counts_destroy(dwgsim_eval_counts_t *counts);
+void 
+dwgsim_eval_counts_add(dwgsim_eval_counts_t *counts, int32_t score, int32_t actual_value, int32_t predicted_value);
+void 
+dwgsim_eval_counts_print(dwgsim_eval_counts_t *counts, int32_t a, int32_t d, int32_t n);
+
+#endif
diff --git a/src/dwgsim_opt.c b/src/dwgsim_opt.c
new file mode 100644
index 0000000..f1339dd
--- /dev/null
+++ b/src/dwgsim_opt.c
@@ -0,0 +1,381 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "mut.h"
+#include "dwgsim.h"
+#include "dwgsim_opt.h"
+
+dwgsim_opt_t* dwgsim_opt_init()
+{
+  dwgsim_opt_t *opt;
+  opt = calloc(1, sizeof(dwgsim_opt_t));
+  opt->e[0].start = opt->e[0].end = opt->e[1].start = opt->e[1].end = 0.02;
+  opt->e[0].by = opt->e[1].by = 0;
+  opt->is_inner = 0;
+  opt->dist = 500;
+  opt->std_dev = 50;
+  opt->N = -1;
+  opt->C = 100;
+  opt->length[0] = opt->length[1] = 70;
+  opt->mut_rate = 0.001;
+  opt->mut_freq = 0.5;
+  opt->indel_frac = 0.1;
+  opt->indel_extend = 0.3;
+  opt->indel_min = 1;
+  opt->rand_read = 0.05;
+  opt->data_type = ILLUMINA;
+  opt->strandedness = 0;
+  opt->max_n = 0;
+  opt->flow_order = NULL;
+  opt->flow_order_len = 0;
+  opt->use_base_error = 0;
+  opt->seed = -1;
+  opt->muts_only = 0;
+  opt->fixed_quality = NULL;
+  opt->quality_std = 2.0;
+  opt->fn_muts_input = NULL;
+  opt->fn_muts_input_type = -1;
+  opt->fn_regions_bed = NULL;
+  opt->fp_mut = opt->fp_bfast = opt->fp_bwa1 = opt->fp_bwa2 = NULL;
+  opt->fp_fa = opt->fp_fai = NULL;
+  opt->read_prefix = NULL;
+
+  return opt;
+}
+
+void dwgsim_opt_destroy(dwgsim_opt_t *opt)
+{
+  free(opt->fixed_quality);
+  free(opt->fn_muts_input);
+  free(opt->fn_regions_bed);
+  free(opt->flow_order);
+  free(opt->read_prefix);
+  free(opt);
+}
+
+int dwgsim_opt_usage(dwgsim_opt_t *opt)
+{
+  mutseq_init_bounds();
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Program: dwgsim (short read simulator)\n");
+  fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+  fprintf(stderr, "Contact: Nils Homer <dnaa-help at lists.sourceforge.net>\n\n");
+  fprintf(stderr, "Usage:   dwgsim [options] <in.ref.fa> <out.prefix>\n\n");
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "         -e FLOAT      per base/color/flow error rate of the first read [from %.3f to %.3f by %.3f]\n", opt->e[0].start, opt->e[0].end, opt->e[0].by);
+  fprintf(stderr, "         -E FLOAT      per base/color/flow error rate of the second read [from %.3f to %.3f by %.3f]\n", opt->e[1].start, opt->e[1].end, opt->e[1].by);
+  fprintf(stderr, "         -i            use the inner distance instead of the outer distance for pairs [%s]\n", __IS_TRUE(opt->is_inner));
+  fprintf(stderr, "         -d INT        %s distance between the two ends for pairs [%d]\n", 
+          (0 == opt->is_inner) ? "outer" : "inner", opt->dist);
+  fprintf(stderr, "         -s INT        standard deviation of the distance for pairs [%.3f]\n", opt->std_dev);
+  fprintf(stderr, "         -N INT        number of read pairs (-1 to disable) [%lld]\n", (signed long long int)opt->N);
+  fprintf(stderr, "         -C FLOAT      mean coverage across available positions (-1 to disable) [%.2lf]\n", opt->C);
+  fprintf(stderr, "         -1 INT        length of the first read [%d]\n", opt->length[0]);
+  fprintf(stderr, "         -2 INT        length of the second read [%d]\n", opt->length[1]);
+  fprintf(stderr, "         -r FLOAT      rate of mutations [%.4f]\n", opt->mut_rate);
+  fprintf(stderr, "         -F FLOAT      frequency of given mutation to simulate low fequency somatic mutations [%.4f]\n", opt->mut_freq);
+  fprintf(stderr, "                           NB: freqeuncy F refers to the first strand of mutation, therefore mutations \n"); 
+  fprintf(stderr, "                           on the second strand occour with a frequency of 1-F \n");
+  fprintf(stderr, "         -R FLOAT      fraction of mutations that are indels [%.2f]\n", opt->indel_frac);
+  fprintf(stderr, "         -X FLOAT      probability an indel is extended [%.2f]\n", opt->indel_extend);
+  fprintf(stderr, "         -I INT        the minimum length indel [%d]\n", opt->indel_min);
+  fprintf(stderr, "         -y FLOAT      probability of a random DNA read [%.2f]\n", opt->rand_read);
+  fprintf(stderr, "         -n INT        maximum number of Ns allowed in a given read [%d]\n", opt->max_n);
+  fprintf(stderr, "         -c INT        generate reads for [%d]:\n", opt->data_type);
+  fprintf(stderr, "                           0: Illumina\n");
+  fprintf(stderr, "                           1: SOLiD\n");
+  fprintf(stderr, "                           2: Ion Torrent\n");
+  fprintf(stderr, "         -S INT        generate reads [%d]:\n", opt->strandedness);
+  fprintf(stderr, "                           0: default (opposite strand for Illumina, same strand for SOLiD/Ion Torrent)\n");
+  fprintf(stderr, "                           1: same strand (mate pair)\n");
+  fprintf(stderr, "                           2: opposite strand (paired end)\n");
+  fprintf(stderr, "         -f STRING     the flow order for Ion Torrent data [%s]\n", (char*)opt->flow_order);
+  fprintf(stderr, "         -B            use a per-base error rate for Ion Torrent data [%s]\n", __IS_TRUE(opt->use_base_error));
+  fprintf(stderr, "         -H            haploid mode [%s]\n", __IS_TRUE(opt->is_hap));
+  fprintf(stderr, "         -z INT        random seed (-1 uses the current time) [%d]\n", opt->seed);
+  fprintf(stderr, "         -M            generate a mutations file only [%s]\n", __IS_TRUE(opt->muts_only));
+  fprintf(stderr, "         -m FILE       the mutations txt file to re-create [%s]\n", (MUT_INPUT_TXT != opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input);
+  fprintf(stderr, "         -b FILE       the bed-like file set of candidate mutations [%s]\n", (MUT_INPUT_BED == opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input);
+  fprintf(stderr, "         -v FILE       the vcf file set of candidate mutations (use pl tag for strand) [%s]\n", (MUT_INPUT_VCF == opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input);
+  fprintf(stderr, "         -x FILE       the bed of regions to cover [%s]\n", (NULL == opt->fn_regions_bed) ? "not using" : opt->fn_regions_bed);
+  fprintf(stderr, "         -P STRING     a read prefix to prepend to each read name [%s]\n", (NULL == opt->read_prefix) ? "not using" : opt->read_prefix);
+  fprintf(stderr, "         -q STRING     a fixed base quality to apply (single character) [%s]\n", (NULL == opt->fixed_quality) ? "not using" : opt->fixed_quality);
+  fprintf(stderr, "         -Q FLOAT      standard deviation of the base quality scores [%.2lf]\n", (NULL == opt->fixed_quality) ? opt->quality_std : 0.0);
+  fprintf(stderr, "         -s INT        standard deviation of the distance for pairs [%.3f]\n", opt->std_dev);
+  fprintf(stderr, "         -h            print this message\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Note: For SOLiD mate pair reads and BFAST, the first read is F3 and the second is R3. For SOLiD mate pair reads\n");
+  fprintf(stderr, "and BWA, the reads in the first file are R3 the reads annotated as the first read etc.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "Note: The longest supported insertion is %u.\n", UINT32_MAX);
+  fprintf(stderr, "\n");
+  return 1;
+}
+
+static void get_error_rate(const char *str, error_t *e)
+{
+  int32_t i;
+
+  e->start = atof(str);
+  for(i=0;i<strlen(str);i++) {
+      if(',' == str[i] || '-' == str[i]) {
+          break;
+      }
+  }
+  if(i<strlen(str)-1) {
+      i++;
+      e->end = atof(str+i);
+  }
+  else {
+      e->end = e->start;
+  }
+}
+
+int32_t
+dwgsim_opt_is_int(char *optarg)
+{
+  int32_t i;
+  for(i=0;i<strlen(optarg);i++) {
+      if(!isdigit(optarg[i])) return 0;
+  }
+  return 1;
+}
+
+int32_t
+dwgsim_atoi(char *optarg, char flag)
+{
+  if(0 == dwgsim_opt_is_int(optarg)) {
+      fprintf(stderr, "Error: command line option -%c is not a number [%s]\n", flag, optarg);
+      exit(1);
+  }
+  return atoi(optarg);
+}
+
+int32_t
+dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[]) 
+{
+  int32_t i;
+  int c;
+  int muts_input_type = 0;
+  
+  while ((c = getopt(argc, argv, "id:s:N:C:1:2:e:E:r:F:R:X:I:c:S:n:y:BHf:z:Mm:b:v:x:P:q:Q:h")) >= 0) {
+      switch (c) {
+        case 'i': opt->is_inner = 1; break;
+        case 'd': opt->dist = dwgsim_atoi(optarg, 'd'); break;
+        case 's': opt->std_dev = atof(optarg); break;
+        case 'N': opt->N = dwgsim_atoi(optarg, 'N'); opt->C = -1; break;
+        case 'C': opt->C = atof(optarg); opt->N = -1; break;
+        case '1': opt->length[0] = dwgsim_atoi(optarg, '1'); break;
+        case '2': opt->length[1] = dwgsim_atoi(optarg, '2'); break;
+        case 'e': get_error_rate(optarg, &opt->e[0]); break;
+        case 'E': get_error_rate(optarg, &opt->e[1]); break;
+        case 'r': opt->mut_rate = atof(optarg); break;
+        case 'F': opt->mut_freq = atof(optarg); break;
+        case 'R': opt->indel_frac = atof(optarg); break;
+        case 'X': opt->indel_extend = atof(optarg); break;
+        case 'I': opt->indel_min = dwgsim_atoi(optarg, 'I'); break;
+        case 'c': opt->data_type = dwgsim_atoi(optarg, 'c'); break;
+        case 'S': opt->strandedness = dwgsim_atoi(optarg, 'S'); break;
+        case 'n': opt->max_n = dwgsim_atoi(optarg, 'n'); break;
+        case 'y': opt->rand_read = atof(optarg); break;
+        case 'f': 
+                  if(NULL != opt->flow_order) free(opt->flow_order);
+                  opt->flow_order = (int8_t*)strdup(optarg);
+                  break;
+        case 'B': opt->use_base_error = 1; break;
+        case 'H': opt->is_hap = 1; break;
+        case 'h': return 0;
+        case 'z': opt->seed = dwgsim_atoi(optarg, 'n'); break;
+        case 'M': opt->muts_only = 1; break;
+        case 'm': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_TXT; muts_input_type |= 0x1; break;
+        case 'b': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_BED; muts_input_type |= 0x2; break;
+        case 'v': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_VCF; muts_input_type |= 0x4; break;
+        case 'x': free(opt->fn_regions_bed); opt->fn_regions_bed = strdup(optarg); break;
+        case 'P': free(opt->read_prefix); opt->read_prefix = strdup(optarg); break;
+        case 'q': opt->fixed_quality = strdup(optarg); break;
+        case 'Q': opt->quality_std = atof(optarg); break;
+        default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 0;
+      }
+  }
+  if (argc - optind < 2) return 0;
+
+  __check_option(opt->is_inner, 0, 1, "-i");
+  __check_option(opt->dist, 0, INT32_MAX, "-d");
+  __check_option(opt->std_dev, 0, INT32_MAX, "-s");
+  if(opt->N < 0 && opt->C < 0) {
+      fprintf(stderr, "Must use one of -N or -C");
+      return 0;
+  }
+  else if(0 < opt->N && 0 < opt->C) {
+      fprintf(stderr, "Cannot use both -N or -C");
+      return 0;
+  }
+  else if(0 < opt->N) {
+      __check_option(opt->N, 1, INT32_MAX, "-N");
+      __check_option(opt->C, INT32_MIN, -1, "-C");
+  }
+  else {
+      __check_option(opt->N, INT32_MIN, -1, "-N");
+      __check_option(opt->C, 0, INT32_MAX, "-C");
+  }
+  __check_option(opt->length[0], 1, INT32_MAX, "-1");
+  __check_option(opt->length[1], 0, INT32_MAX, "-2");
+  // error rate
+  for(i=0;i<2;i++) {
+      if(opt->e[i].start < 0.0 || 1.0 < opt->e[i].start) {
+          fprintf(stderr, "End %s: the start error is out of range (-e)\n", (0 == i) ? "one" : "two");
+          return 0;
+      }
+      if(opt->e[i].end < 0.0 || 1.0 < opt->e[i].end) {
+          fprintf(stderr, "End %s: the end error is out of range (-e)\n", (0 == i) ? "one" : "two");
+          return 0;
+      }
+      if(IONTORRENT == opt->data_type) {
+          if(opt->e[i].end != opt->e[i].start) {
+              fprintf(stderr, "End %s: a uniform error rate must be given for Ion Torrent data\n", (0 == i) ? "one" : "two");
+              return 0;
+          }
+      }
+  }
+  __check_option(opt->mut_rate, 0, 1.0, "-r");
+  __check_option(opt->indel_frac, 0, 1.0, "-R");
+  __check_option(opt->indel_extend, 0, 1.0, "-X");
+  __check_option(opt->indel_min, 1, INT32_MAX, "-I");
+  __check_option(opt->data_type, 0, 2, "-c");
+  __check_option(opt->strandedness, 0, 2, "-S");
+  __check_option(opt->max_n, 0, INT32_MAX, "-n");
+  __check_option(opt->rand_read, 0, 1.0, "-y");
+  if(IONTORRENT == opt->data_type && NULL == opt->flow_order) {
+      fprintf(stderr, "Error: command line option -f is required\n");
+      return 0;
+  }
+  __check_option(opt->use_base_error, 0, 1, "-B");
+  __check_option(opt->is_hap, 0, 1, "-H");
+
+  if(NULL != opt->fixed_quality && 1 != strlen(opt->fixed_quality)) {
+      fprintf(stderr, "Error: command line option -q requires one character\n");
+      return 0;
+  }
+  __check_option(opt->quality_std, 0, INT32_MAX, "-Q");
+
+  if(NULL != opt->read_prefix) {
+      fprintf(stderr, "Warning: remember to use the -P option with dwgsim_eval\n");
+  }
+  
+  switch(muts_input_type) {
+    case 0x0:
+    case 0x1:
+    case 0x2:
+    case 0x4:
+      break;
+    default:
+      fprintf(stderr, "Error: -m/-b/-v cannot be used together\n");
+      return 0;
+      break;
+  }
+  
+  // random seed
+  srand48((-1 == opt->seed) ? time(0) : opt->seed);
+
+  if(IONTORRENT == opt->data_type) {
+      if(NULL != opt->flow_order) {
+          // uniform error rates only (so far)
+          if(opt->e[0].start != opt->e[0].end || opt->e[1].start != opt->e[1].end) {
+              fprintf(stderr, "Error: non-uniform error rate not support for Ion Torrent data\n");
+              return 0;
+          }
+          // update flow order
+          opt->flow_order_len = strlen((char*)opt->flow_order);
+          for(i=0;i<opt->flow_order_len;i++) {
+              opt->flow_order[i] = nst_nt4_table[opt->flow_order[i]];
+          }
+      }
+      else {
+          fprintf(stderr, "Error: -f must be given for Ion Torrent data\n");
+          return 0;
+      }
+  }
+  // use base error rate
+  if(IONTORRENT == opt->data_type && NULL != opt->flow_order && 1 == opt->use_base_error) {
+      uint8_t *tmp_seq=NULL;
+      uint8_t *tmp_seq_flow_mask=NULL;
+      int32_t tmp_seq_mem, s, cur_n_err, n_err, counts;
+      int32_t j, k;
+      double sf = 0.0;
+      for(i=0;i<2;i++) {
+          if(opt->length[i] <= 0) continue;
+          fprintf(stderr, "[dwgsim_core] Updating error rate for end %d\n", i+1);
+          if(0 < i && opt->length[i] == opt->length[1-i]) {
+              opt->e[i].start = opt->e[1-i].start;
+              opt->e[i].end = opt->e[1-i].end;
+              opt->e[i].by = opt->e[1-i].by;
+              fprintf(stderr, "[dwgsim_core] Using scaling factor from previous end\n[dwgsim_core] Updated with scaling factor %.5lf\n", sf);
+              continue;
+          }
+          tmp_seq_mem = opt->length[i]+2;
+          tmp_seq = (uint8_t*)calloc(tmp_seq_mem, 1);
+          tmp_seq_flow_mask = (uint8_t*)calloc(tmp_seq_mem, 1);
+          n_err = counts = 0;
+          for(j=0;j<ERROR_RATE_NUM_RANDOM_READS;j++) {
+              if(0 == (j % 10000)) {
+                  fprintf(stderr, "\r[dwgsim_core] %d", j);
+              }
+              for(k=0;k<opt->length[i];k++) {
+                  tmp_seq[k] = (int)(drand48() * 4.0) & 3;
+              }
+              cur_n_err = 0;
+              s = opt->length[i];
+              s = generate_errors_flows(opt, &tmp_seq, &tmp_seq_flow_mask, &tmp_seq_mem, s, 0, opt->e[i].start, &cur_n_err);
+              n_err += cur_n_err;
+              counts += s;
+          }
+          //fprintf(stderr, "before %lf,%lf,%lf\n", opt->e[i].start, opt->e[i].by, opt->e[i].end); 
+          sf = opt->e[i].start / (n_err / (1.0 * counts));
+          opt->e[i].start = opt->e[i].end *= sf;
+          opt->e[i].by = (opt->e[i].end - opt->e[i].start) / opt->length[i];
+          //fprintf(stderr, "after %lf,%lf,%lf\n", opt->e[i].start, opt->e[i].by, opt->e[i].end); 
+          free(tmp_seq);
+          free(tmp_seq_flow_mask);
+          fprintf(stderr, "\r[dwgsim_core] %d\n[dwgsim_core] Updated with scaling factor %.5lf!\n", j, sf);
+      }
+  }
+  else {
+      opt->e[0].by = (opt->e[0].end - opt->e[0].start) / opt->length[0];
+      opt->e[1].by = (opt->e[1].end - opt->e[1].start) / opt->length[1];
+  }
+  
+  __check_option(opt->muts_only, 0, 1, "-M");
+
+  return 1;
+}
diff --git a/src/dwgsim_opt.h b/src/dwgsim_opt.h
new file mode 100644
index 0000000..e535165
--- /dev/null
+++ b/src/dwgsim_opt.h
@@ -0,0 +1,66 @@
+#ifndef DWGSIM_OPT_H
+#define DWGSIM_OPT_H
+
+#define ERROR_RATE_NUM_RANDOM_READS 1000000
+
+typedef struct {
+    double start, by, end;
+} error_t;
+
+typedef struct {
+    error_t e[2];
+    int32_t is_inner;
+    int32_t dist;
+    double std_dev;
+    int64_t N;
+    double C;
+    int32_t length[2];
+    double mut_rate;
+    double mut_freq;
+    double indel_frac;
+    double indel_extend;
+    int32_t indel_min;
+    double rand_read;
+    int32_t max_n;
+    int32_t data_type;
+    int32_t strandedness;
+    int8_t *flow_order;
+    int32_t flow_order_len;
+    int32_t use_base_error;
+    int32_t is_hap;
+    int32_t seed;
+    int32_t muts_only;
+    char *fixed_quality;
+    double quality_std;
+    char *fn_muts_input;
+    int32_t fn_muts_input_type;
+    char *fn_regions_bed;
+    FILE *fp_mut;
+    FILE *fp_vcf;
+    FILE *fp_bfast;
+    FILE *fp_bwa1;
+    FILE *fp_bwa2;
+    FILE *fp_fa;
+    FILE *fp_fai;
+    char *read_prefix;
+} dwgsim_opt_t;
+
+dwgsim_opt_t* 
+dwgsim_opt_init();
+
+void 
+dwgsim_opt_destroy(dwgsim_opt_t *opt);
+
+#define __check_option(_val, _min, _max, _opt) \
+  if(_val < _min || _max < _val) { \
+      fprintf(stderr, "Error: command line option %s was out of range\n", _opt); \
+      return 0; \
+  } 
+
+int 
+dwgsim_opt_usage(dwgsim_opt_t *opt);
+
+int32_t
+dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[]); 
+
+#endif
diff --git a/src/mut.c b/src/mut.c
new file mode 100644
index 0000000..92b8764
--- /dev/null
+++ b/src/mut.c
@@ -0,0 +1,888 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "regions_bed.h"
+#include "dwgsim_opt.h"
+#include "mut.h"
+
+static int SEQ_BLOCK_SIZE = 512;
+
+void seq_set_block_size(int size)
+{
+    SEQ_BLOCK_SIZE = size;
+}
+
+int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
+{
+  int c, l, max;
+  char *p;
+
+  c = 0;
+  while (!feof(fp) && fgetc(fp) != '>');
+  if (feof(fp)) return -1;
+  p = locus;
+  while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
+    if (c != '\r') *p++ = c;
+  *p = '\0';
+  if (comment) {
+      p = comment;
+      if (c != '\n') {
+          while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
+          if (c != '\n') {
+              *p++ = c;
+              while (!feof(fp) && (c = fgetc(fp)) != '\n')
+                if (c != '\r') *p++ = c;
+          }
+      }
+      *p = '\0';
+  } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
+  l = 0; max = seq->m;
+  while (!feof(fp) && (c = fgetc(fp)) != '>') {
+      if (isalpha(c) || c == '-' || c == '.') {
+          if (l + 1 >= max) {
+              max += SEQ_BLOCK_SIZE;
+              seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
+          }
+          seq->s[l++] = (unsigned char)c;
+      }
+  }
+  if (c == '>') ungetc(c,fp);
+  seq->s[l] = 0;
+  seq->m = max; seq->l = l;
+  return l;
+} 
+
+mut_t mutmsk;
+mut_t mut_and_type_mask;
+mut_t muttype_shift;
+mut_t ins_length_shift;
+mut_t ins_length_mask;
+mut_t ins_length_max;
+mut_t ins_long_length_max;
+mut_t ins_mask;
+
+void
+mutseq_init_bounds()
+{
+  int32_t i;
+  mutmsk = (mut_t)0x30;
+  mut_and_type_mask = (mut_t)0x3F;
+  muttype_shift = 6; // bits 5-6 store the mutation type
+  ins_length_shift = 59; // bits 60-64 store the insertion length
+  ins_length_mask = 0x1F; // bits 60-64 store the insertion length
+  ins_length_max = ((ins_length_shift - muttype_shift) >> 1);
+  ins_long_length_max = UINT32_MAX;
+  if(ins_length_mask < ins_length_max) { // we exceed storing the length of the indel
+      ins_length_max = ins_length_mask;
+  }
+  // create the insertion mask based on the maximum length insertion
+  ins_mask = 0;
+  for(i=0;i<ins_length_max;i++) {
+      ins_mask = (ins_mask << 2) | 0x3; 
+  }
+}
+
+mutseq_t *
+mutseq_init()
+{
+  mutseq_t *seq = NULL;
+  seq = calloc(1, sizeof(mutseq_t));
+  mutseq_init_bounds();
+  return seq;
+}
+
+void
+mutseq_destroy(mutseq_t *seq)
+{
+  int32_t i;
+  for(i=0;i<seq->ins_l;i++) {
+      free(seq->ins[i]);
+  }
+  free(seq->ins);
+  free(seq->s);
+  free(seq);
+}
+
+mut_t
+mut_get_ins_length(mutseq_t *seq, int32_t i)
+{
+  mut_t m, n, index;
+  assert(0 <= i && i < seq->l);
+  m = seq->s[i];
+  assert(INSERT == (m & mutmsk));
+  n = (m >> ins_length_shift) & ins_length_mask;
+  if(0 == n) {
+      uint32_t l = 0;
+      index = (m >> muttype_shift) & ins_mask;
+      assert(0 <= index && index < seq->ins_l);
+      mut_get_ins_long_n(seq->ins[index], &l);
+      return (mut_t)l;
+  }
+  else {
+      return n;
+  }
+}
+
+// returns 0 if it is a long insertion, 1 otherwise
+// if 0 is returned, the index into the long insertion list
+// is returned in "ins" and "n" is 0.
+int32_t
+mut_get_ins(mutseq_t *seq, int32_t i, mut_t *n, mut_t *ins)
+{
+  mut_t m;
+  assert(0 <= i && i < seq->l);
+  m = seq->s[i];
+  assert(INSERT == (m & mutmsk));
+  (*n) = (m >> ins_length_shift) & ins_length_mask;
+  (*ins) = (m >> muttype_shift) & ins_mask;
+  if(0 == (*n)) {
+      assert(0 <= (*ins) && (*ins) < seq->ins_l);
+      return 0;
+  }
+  else {
+      assert(0 < (*n));
+      assert((*n) <= ins_length_max);
+      return 1;
+  }
+}
+
+inline int32_t
+mut_get_ins_bytes(int32_t n)
+{
+  int32_t num_bytes = 1; // one-byte for the type
+  // variable number of bytes to store the length
+  if(n <= UINT8_MAX) num_bytes += sizeof(uint8_t); 
+  else if(n <= UINT16_MAX) num_bytes += sizeof(uint16_t); 
+  else if(n <= UINT32_MAX) num_bytes += sizeof(uint32_t); 
+  else {
+      fprintf(stderr, "UINT32_MAX < n");
+      exit(1);
+  }
+  // store the insertion
+  num_bytes += mut_packed_len(n);
+  return num_bytes;
+}
+
+uint8_t*
+mut_get_ins_long_n(uint8_t *ins, uint32_t *n)
+{
+  switch(ins[0]) {
+    case sizeof(uint8_t):
+      (*n) = (uint32_t)(ins[1]);
+      return ins + 2;
+      break;
+    case sizeof(uint16_t):
+      (*n) = (uint32_t)(((uint16_t*)(ins+1))[0]);
+      return ins + 3;
+      break;
+    case sizeof(uint32_t):
+    default:
+      (*n) = (uint32_t)(((uint32_t*)(ins+1))[0]);
+      return ins + 5;
+      break;
+  }
+  return NULL;
+}
+
+inline uint8_t*
+mut_set_ins_long_n(uint8_t *ins, uint32_t n)
+{
+  // set type
+  if(n <= UINT8_MAX) ins[0] = (uint8_t)sizeof(uint8_t);
+  else if(n <= UINT16_MAX) ins[0] = (uint8_t)sizeof(uint16_t);
+  else ins[0] = (uint8_t)sizeof(uint32_t);
+      
+  // set length
+  switch(ins[0]) {
+    case sizeof(uint8_t):
+      ins[1] = (uint8_t)n;
+      return ins + 2;
+      break;
+    case sizeof(uint16_t):
+      (*((uint16_t*)(ins+1))) = (uint16_t)n;
+      return ins + 3;
+      break;
+    case sizeof(uint32_t):
+    default:
+      (*((uint32_t*)(ins+1))) = n;
+      return ins + 5;
+      break;
+  }
+  return NULL;
+}
+
+
+static inline void
+mut_print_ins(FILE *fp, mutseq_t *seq, int32_t i)
+{
+  mut_t n, ins;
+  if(1 == mut_get_ins(seq, i, &n, &ins)) {
+      while (n > 0) {
+          fputc("ACGTN"[ins & 0x3], fp);
+          ins >>= 2;
+          n--;
+      }
+  }
+  else { // long insertion
+      int32_t byte_index, bit_index;
+      uint32_t num_ins = 0;
+      uint8_t *insertion = NULL;
+      assert(NULL != seq->ins[ins]);
+      insertion = mut_get_ins_long_n(seq->ins[ins], &num_ins);
+      // reverse order
+      byte_index = mut_packed_len(num_ins)-1;
+      bit_index = (num_ins+3) & 3; // % 4
+      while(0 < num_ins) {
+          fputc("ACGTN"[(insertion[byte_index] >> (bit_index << 1)) & 0x3], fp);
+          bit_index--;
+          if(bit_index < 0) {
+              bit_index = 3;
+              byte_index--;
+          }
+          num_ins--;
+      }
+  }
+}
+
+// bases is NULL if we are to randomly simulate the bases
+void mut_add_ins(dwgsim_opt_t *opt, mutseq_t *hap1, mutseq_t *hap2, int32_t i, int32_t c, int8_t hap, char *bases, mut_t num_ins)
+{
+  mut_t ins = 0;
+  int64_t j;
+
+  if (NULL == bases) {
+      if(num_ins == 0) {
+          // get the new insertion length
+          do {
+              num_ins++;
+          } while (num_ins < ins_long_length_max && (num_ins < opt->indel_min || drand48() < opt->indel_extend));
+      }
+  } else {
+      num_ins = strlen(bases); // ignores num_ins
+  }
+  if (ins_long_length_max < num_ins) num_ins = ins_long_length_max;
+
+  if (hap < 0) {
+      // set ploidy
+      if (opt->is_hap || drand48() < 0.333333) { // hom-ins
+          hap = 3;
+      } else if (drand48() < 0.5) {
+          hap = 1;
+      } else {
+          hap = 2;
+      }
+  }
+
+  if (num_ins <= ins_length_max) { // short
+      // generate the insertion
+      if (NULL == bases) {
+          for (j=0;j<num_ins;j++) {
+              ins = (ins << 2) | (mut_t)(drand48() * 4.0);
+          }
+      } else {
+          for (j = num_ins; 0 <= j; --j) {
+              ins = (ins << 2) | nst_nt4_table[(int)bases[j]];
+          } 
+      }
+      // store
+      if (hap & 0x1) hap1->s[i] = (num_ins << ins_length_shift) | (ins << muttype_shift) | INSERT | c;
+      if (hap & 0x2) hap2->s[i] = (num_ins << ins_length_shift) | (ins << muttype_shift) | INSERT | c;
+  } else { // long
+      int32_t byte_index, bit_index;
+      uint8_t *hap1_ins = NULL, *hap2_ins = NULL;
+      if (hap & 0x1) {
+          while (hap1->ins_m <= hap1->ins_l) { // realloc
+              hap1->ins_m = (hap1->ins_m < 16) ? 16 : (hap1->ins_m << 1); 
+              hap1->ins = realloc(hap1->ins, sizeof(uint8_t*) * hap1->ins_m);
+          }
+          hap1->ins[hap1->ins_l] = calloc(mut_get_ins_bytes(num_ins), sizeof(uint8_t));
+          hap1_ins = mut_set_ins_long_n(hap1->ins[hap1->ins_l], num_ins); 
+      }
+      if (hap & 0x2) {
+          while (hap2->ins_m <= hap2->ins_l) { // realloc
+              hap2->ins_m = (hap2->ins_m < 16) ? 16 : (hap2->ins_m << 1); 
+              hap2->ins = realloc(hap2->ins, sizeof(uint8_t*) * hap2->ins_m);
+          }
+          hap2->ins[hap2->ins_l] = calloc(mut_get_ins_bytes(num_ins), sizeof(uint8_t));
+          hap2_ins = mut_set_ins_long_n(hap2->ins[hap2->ins_l], num_ins); 
+      }
+      byte_index = 0;
+      bit_index = 0;
+      while(0 < num_ins) {
+          uint8_t b;
+          if (NULL == bases) {
+              b = ((uint8_t)(drand48() * 4.0)) << (bit_index << 1);
+          } else {
+              b = nst_nt4_table[(int)bases[num_ins-1]] << (bit_index << 1);
+          }
+          if (hap & 0x1) hap1_ins[byte_index] |= b;
+          if (hap & 0x2) hap2_ins[byte_index] |= b;
+          bit_index++;
+          if(4 == bit_index) {
+              bit_index=0;
+              byte_index++;
+          }
+          num_ins--;
+      }
+      if (hap & 0x1) {
+          assert(hap1->ins_l <= ins_mask);
+          hap1->s[i] = (hap1->ins_l << muttype_shift) | INSERT | c;
+          hap1->ins_l++;
+      }
+      if (hap & 0x2) {
+          assert(hap2->ins_l <= ins_mask);
+          hap2->s[i] = (hap2->ins_l << muttype_shift) | INSERT | c;
+          hap2->ins_l++;
+      }
+  }
+}
+
+void mut_debug(const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
+{
+  int32_t i;
+  // DEBUG
+  for (i = 0; i < seq->l; ++i) {
+      mut_t c[3];
+      c[0] = nst_nt4_table[(mut_t)seq->s[i]];
+      c[1] = hap1->s[i]; c[2] = hap2->s[i];
+      if (c[0] >= 4) continue;
+      if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
+          if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom
+              if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+                  assert((c[1]&0x3) == (c[2]&0x3));
+                  assert((c[0]&0x3) != (c[1]&0x3));
+                  continue;
+              } else if ((c[1]&mutmsk) == DELETE) { // del
+                  continue;
+              } else if ((c[1] & mutmsk) == INSERT) { // ins
+                  mut_t n1 = mut_get_ins_length(hap1, i);
+                  mut_t n2 = mut_get_ins_length(hap1, i);
+                  assert(n1 > 0);
+                  assert(n1 <= ins_long_length_max);
+                  assert(n1 == n2);
+              } else assert(0);
+          } else { // het
+              if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+                  assert((c[1]&0x3) != (c[2]&0x3));
+                  assert((c[0]&0x3) == (c[1]&0x3) || (c[0]&0x3) == (c[2]&0x3));
+                  assert((c[0]&0x3) != (c[1]&0x3) || (c[0]&0x3) != (c[2]&0x3));
+                  continue;
+              } else if ((c[1]&mutmsk) == DELETE) {
+                  continue;
+              } else if ((c[2]&mutmsk) == DELETE) {
+                  continue;
+              } else if ((c[1]&mutmsk) == INSERT) { // ins 1
+                  mut_t n = mut_get_ins_length(hap1, i);
+                  assert(n > 0);
+                  assert(n <= ins_long_length_max);
+              } else if ((c[2]&mutmsk) == INSERT) { // ins 2
+                  mut_t n = mut_get_ins_length(hap2, i);
+                  assert(n > 0);
+                  assert(n <= ins_long_length_max);
+              } else assert(0);
+          }
+      }
+  }
+}
+
+static void
+mut_left_justify_ins(mutseq_t *hap1, int32_t i)
+{
+  // NB: should we also be checking both haps for homozygous cases
+  mut_t n, ins, j;
+  if(1 == mut_get_ins(hap1, i, &n, &ins)) { // short
+      assert(n > 0);
+      j=i;
+      while(0 < j
+            && INSERT != (hap1->s[j-1]&mutmsk) // no insertion
+            && SUBSTITUTE != (hap1->s[j-1]&mutmsk) // no substitution
+            && DELETE != (hap1->s[j-1]&mutmsk) // no deletion 
+            && ((ins >> ((n-1) << 1)) & 3) == (hap1->s[j-1]&3)) { // end of insertion matches previous base
+          // update ins
+          ins = (ins & ~((mut_t)3 << ((n-1) << 1))); // zero out the last base
+          ins <<= 2; // shift over
+          ins |= (hap1->s[j-1]&3); // insert the first base
+          hap1->s[j] = (hap1->s[j]&3); // make it NOCHANGE
+          j--;
+      }
+      hap1->s[j] = (n << ins_length_shift) | (ins << muttype_shift) | INSERT | (hap1->s[j]&3); // re-insert
+  } else { // long
+      int32_t bit_index, byte_index;
+      uint32_t num_ins;
+      uint8_t *insertion = NULL;
+      insertion = mut_get_ins_long_n(hap1->ins[ins], &num_ins);
+      assert(num_ins > 0);
+      j=i;
+      while(0 < j
+            && INSERT != (hap1->s[j-1]&mutmsk) // no insertion
+            && SUBSTITUTE != (hap1->s[j-1]&mutmsk) // no substitution
+            && DELETE != (hap1->s[j-1]&mutmsk) // no deletion 
+            // NB: last base of the insertion has a bit index of zero and byte index of zero
+            && ((insertion[0] >> (0 << 1)) & 3) == (hap1->s[j-1]&3)) { // end of insertion matches previous base
+          // start with the last base
+          for (byte_index = 0; byte_index < mut_packed_len(num_ins); byte_index++) {
+              insertion[byte_index] >>= 2; // shift down, get rid of first base (last two bits)
+              if (byte_index + 1 < mut_packed_len(num_ins)) {
+                  uint8_t b = (insertion[byte_index+1] >> (0 << 1)) & 3; // last base of the next byte (first two bits)
+                  insertion[byte_index] |= b << 6; // move the new base to the last two bits
+              }
+          }
+          // add a base to the front
+          bit_index = (num_ins+3) & 3; // % 4
+          byte_index = mut_packed_len(num_ins)-1;
+          insertion[byte_index] |= (hap1->s[j-1]&3) << (bit_index << 1);
+          hap1->s[j] = (hap1->s[j]&3); // make it NOCHANGE
+          j--;
+      }
+      hap1->s[j] = (ins << muttype_shift) | INSERT | (hap1->s[j]&3); // re-insert
+  }
+}
+
+// left-justify all the insertions and deletions
+static void
+mut_left_justify(const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
+{
+  mut_t i, j;
+  int del_length;
+  int prev_del[2] = {0, 0};
+  assert(hap1->l == seq->l);
+  assert(hap2->l == seq->l);
+  for (i = 0; i < seq->l; ++i) {
+      mut_t c[3];
+      c[0] = nst_nt4_table[(mut_t)seq->s[i]];
+      c[1] = hap1->s[i]; c[2] = hap2->s[i];
+      if (c[0] >= 4) continue;
+      if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
+          if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom
+              // TODO: code re-use
+              if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+                  prev_del[0] = prev_del[1] = 0;
+                  continue;
+              } else if ((c[1]&mutmsk) == DELETE) { // del
+                  if(prev_del[0] == 1 || prev_del[1] == 1) continue;
+                  prev_del[0] = prev_del[1] = 1;
+                  for(j=i+1,del_length=1;j<seq->l && (hap1->s[j]&mutmsk) == DELETE;j++) { // get del length
+                      del_length++;
+                  }
+                  if(seq->l <= i+del_length) continue;
+                  // left-justify
+                  for(j=i-1;0 < i && 0<=j;j--) {
+                      if(INSERT != (hap1->s[j]&mutmsk) && INSERT != (hap2->s[j]&mutmsk) // no insertion
+                         && DELETE != (hap1->s[j]&mutmsk) && DELETE != (hap2->s[j]&mutmsk) // no deletion 
+                         && (hap1->s[j]&3) == (hap1->s[j+del_length]&3)  // hap1 bases match
+                         && (hap2->s[j]&3) == (hap2->s[j+del_length]&3)) { // hap2 bases match
+                          // shift and make it NOCHANGE
+                          mut_t t;
+                          t = hap1->s[j]; hap1->s[j] = hap1->s[j+del_length]; hap1->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+                          t = hap2->s[j]; hap2->s[j] = hap2->s[j+del_length]; hap2->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+                      }
+                      else {
+                          break;
+                      }
+                      if(0 == j) break;
+                  }
+              } else if ((c[1] & mutmsk) == INSERT) { // ins
+                  prev_del[0] = prev_del[1] = 0;
+                  mut_left_justify_ins(hap1, i);
+                  mut_left_justify_ins(hap2, i);
+              }  else assert(0);
+          } else { // het
+              if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+                  prev_del[0] = prev_del[1] = 0;
+                  continue;
+              } else if ((c[1]&mutmsk) == DELETE) {
+                  if(prev_del[0] == 1) continue;
+                  prev_del[0] = 1;
+                  for(j=i+1,del_length=1;j<seq->l && (hap1->s[j]&mutmsk) == DELETE;j++) { // get del length
+                      del_length++;
+                  }
+                  if(seq->l <= i+del_length) continue;
+                  // left-justify
+                  for(j=i-1;0 < i && 0<=j;j--) {
+                      if(INSERT != (hap1->s[j]&mutmsk) // no insertion
+                         && SUBSTITUTE != (hap1->s[j]&mutmsk) // no substitution
+                         && DELETE != (hap1->s[j]&mutmsk) // no deletion 
+                         && (hap1->s[j]&3) == (hap1->s[j+del_length]&3))  { // hap1 bases match
+                          // shift and make it NOCHANGE
+                          mut_t t;
+                          t = hap1->s[j]; hap1->s[j] = hap1->s[j+del_length]; hap1->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+                      }
+                      else {
+                          break;
+                      }
+                      if(0 == j) break;
+                  }
+              } else if ((c[2]&mutmsk) == DELETE) {
+                  if(prev_del[1] == 1) continue;
+                  prev_del[1] = 1;
+                  for(j=i+1,del_length=1;j<seq->l && (hap2->s[j]&mutmsk) == DELETE;j++) { // get del length
+                      del_length++;
+                  }
+                  if(seq->l <= i+del_length) continue;
+                  // left-justify
+                  for(j=i-1;0 < i && 0<=j;j--) {
+                      if(INSERT != (hap2->s[j]&mutmsk) // no insertion
+                         && SUBSTITUTE != (hap2->s[j]&mutmsk) // no substitution
+                         && DELETE != (hap2->s[j]&mutmsk) // no deletion 
+                         && (hap2->s[j]&3) == (hap2->s[j+del_length]&3))  { // hap2 bases match
+                          // shift and make it NOCHANGE
+                          mut_t t;
+                          t = hap2->s[j]; hap2->s[j] = hap2->s[j+del_length]; hap2->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+                      }
+                      else {
+                          break;
+                      }
+                      if(0 == j) break;
+                  }
+              } else if ((c[1]&mutmsk) == INSERT) { // ins 1
+                  prev_del[0] = prev_del[1] = 0;
+                  mut_left_justify_ins(hap1, i);
+              } else if ((c[2]&mutmsk) == INSERT) { // ins 2
+                  prev_del[0] = prev_del[1] = 0;
+                  mut_left_justify_ins(hap2, i);
+              } else assert(0);
+          }
+      }
+      else {
+          prev_del[0] = prev_del[1] = 0;
+      }
+  }
+}
+
+void mut_diref(dwgsim_opt_t *opt, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, 
+               int32_t contig_i, muts_input_t *muts_input)
+{
+  int32_t i, j, deleting = 0, deletion_length = 0;
+  mutseq_t *ret[2];
+
+  ret[0] = hap1; ret[1] = hap2;
+  ret[0]->l = seq->l; ret[1]->l = seq->l;
+  ret[0]->m = seq->m; ret[1]->m = seq->m;
+  ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
+  ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
+  ret[0]->ins = NULL; ret[1]->ins = NULL;
+  ret[0]->ins_l = 0; ret[1]->ins_l = 0;
+  ret[0]->ins_m = 0; ret[1]->ins_m = 0;
+
+  if(NULL == muts_input) {
+      for (i = 0; i < seq->l; ++i) {
+          mut_t c;
+          c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+          if (deleting) {
+              if (deletion_length < opt->indel_min || drand48() < opt->indel_extend) {
+                  if (deleting & 1) ret[0]->s[i] |= DELETE|c;
+                  if (deleting & 2) ret[1]->s[i] |= DELETE|c;
+                  deletion_length++;
+                  continue;
+              } else deleting = deletion_length = 0;
+          }
+          if (c < 4 && drand48() < opt->mut_rate) { // mutation
+              if (drand48() >= opt->indel_frac) { // substitution
+                  double r = drand48();
+                  c = (c + (mut_t)(r * 3.0 + 1)) & 3;
+                  if (opt->is_hap || drand48() < 0.333333) { // hom
+                      ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
+                  } else { // het
+                      ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
+                  }
+              } else { // indel
+                  if (drand48() < 0.5) { // deletion
+                      if (opt->is_hap || drand48() < 0.3333333) { // hom-del
+                          ret[0]->s[i] = ret[1]->s[i] = DELETE|c;
+                          deleting = 3;
+                      } else { // het-del
+                          deleting = drand48()<0.5?1:2;
+                          ret[deleting-1]->s[i] = DELETE|c;
+                      }
+                      deletion_length = 1;
+                  } else { // insertion
+                      mut_add_ins(opt, ret[0], ret[1], i, c, -1, NULL, 0);
+                  }
+              }
+          }
+      }
+  }
+  else {
+      if(MUT_INPUT_BED == muts_input->type) { // BED
+          muts_bed_t *muts_bed = muts_input->data.bed; 
+          // seed
+          for (i = 0; i < seq->l; ++i) {
+              ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+          }
+          // mutates exactly based on a BED file
+          for (i = 0; i < muts_bed->n; ++i) {
+              if (muts_bed->muts[i].contig == contig_i) {
+                  int32_t has_bases = 1, is_hom = 0, hap = 0;
+                  int32_t which_hap = 0;
+                  mut_t c;
+
+                  // does this mutation have random bases?
+                  if (0 == strcmp("*", muts_bed->muts[i].bases)) has_bases = 0; // random bases
+
+                  // het or hom?
+                  if (opt->is_hap || drand48() < 0.333333) {
+                      is_hom = 1; // hom
+                      hap = 3;
+                  }
+                  else {
+                      which_hap = drand48()<0.5?0:1;
+                      hap = 1 << which_hap;
+                  }
+
+                  // mutate
+                  if (SUBSTITUTE == muts_bed->muts[i].type) {
+                      for (j = muts_bed->muts[i].start; j < muts_bed->muts[i].end; ++j) { // for each base
+                          c = (mut_t)nst_nt4_table[(int)seq->s[j]];
+                          if (0 == has_bases) { // random DNA base
+                              double r = drand48();
+                              c = (c + (mut_t)(r * 3.0 + 1)) & 3;
+                          }
+                          else {
+                              c = (mut_t)nst_nt4_table[(int)muts_bed->muts[i].bases[j - muts_bed->muts[i].start]]; 
+                          }
+                          if (1 == is_hom) {
+                              ret[0]->s[j] = ret[1]->s[j] = SUBSTITUTE|c;
+                          } else { // het
+                              ret[which_hap]->s[j] = SUBSTITUTE|c;
+                          }
+                      }
+                  }
+                  else if (DELETE == muts_bed->muts[i].type) {
+                      for (j = muts_bed->muts[i].start; j < muts_bed->muts[i].end; ++j) { // for each base
+                          c = (mut_t)nst_nt4_table[(int)seq->s[j]];
+                          if (1 == is_hom) {
+                              ret[0]->s[j] = ret[1]->s[j] = DELETE|c;
+                          } else { // het-del
+                              ret[which_hap]->s[j] = DELETE|c;
+                          }
+                      }
+                  }
+                  else if (INSERT == muts_bed->muts[i].type) {
+                      c = (mut_t)nst_nt4_table[(int)seq->s[muts_bed->muts[i].start]];
+                      if (0 == has_bases) {
+                          mut_add_ins(opt, ret[0], ret[1], muts_bed->muts[i].start, c, hap, NULL, muts_bed->muts[i].end - muts_bed->muts[i].start);
+                      } else {
+                          mut_add_ins(opt, ret[0], ret[1], muts_bed->muts[i].start, c, hap, muts_bed->muts[i].bases, 0);
+                      }
+                  }
+              }
+              else if (contig_i < muts_bed->muts[i].contig) {
+                  break;
+              }
+          }
+      }
+      else if(MUT_INPUT_TXT == muts_input->type || MUT_INPUT_VCF == muts_input->type) { // TXT or VCF
+          muts_txt_t *muts_txt = NULL;
+          if(MUT_INPUT_TXT == muts_input->type) {
+              muts_txt = muts_input->data.txt; 
+          }
+          else {
+              muts_txt = muts_input->data.vcf; 
+          }
+          // seed
+          for (i = 0; i < seq->l; ++i) {
+              ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+          }
+          for (i = 0; i < muts_txt->n; ++i) {
+              if (muts_txt->muts[i].contig == contig_i) {
+                  int8_t type = muts_txt->muts[i].type;
+                  uint32_t pos = muts_txt->muts[i].pos;
+                  int8_t is_hap = muts_txt->muts[i].is_hap;
+                  mut_t c = (mut_t)nst_nt4_table[(int)seq->s[pos-1]];
+
+                  if (DELETE == type) {
+                      if (is_hap & 1) ret[0]->s[pos-1] |= DELETE|c;
+                      if (is_hap & 2) ret[1]->s[pos-1] |= DELETE|c;
+                  }
+                  else if (SUBSTITUTE == type) {
+                      if (is_hap & 1) ret[0]->s[pos-1] = SUBSTITUTE|nst_nt4_table[(int)muts_txt->muts[i].bases[0]];
+                      if (is_hap & 2) ret[1]->s[pos-1] = SUBSTITUTE|nst_nt4_table[(int)muts_txt->muts[i].bases[0]];
+                  }
+                  else if (INSERT == type) {
+                      mut_add_ins(opt, ret[0], ret[1], pos-1, c, is_hap, muts_txt->muts[i].bases, 0);
+                  }
+              }
+          }
+      }
+      else {
+          fprintf(stderr, "Error: unknown mutation input type!\n");
+          exit(1);
+      }
+  }
+
+  // DEBUG
+  mut_debug(seq, hap1, hap2);
+  
+  // DEBUG
+  mut_left_justify(seq, hap1, hap2);
+  mut_debug(seq, hap1, hap2);
+}
+
+void mut_print_header_pre(FILE *fpout_vcf)
+{
+  fprintf(fpout_vcf, "##fileformat=VCFv4.1\n");
+}
+
+void mut_print_header_post(FILE *fpout_vcf)
+{
+  fprintf(fpout_vcf, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n");
+  fprintf(fpout_vcf, "##INFO=<ID=pl,Number=1,Type=Integer,Description=\"Phasing: 1 - HET contig 1, #2 - HET contig #2, 3 - HOM both contigs\">\n"); 
+  fprintf(fpout_vcf, "##INFO=<ID=mt,Number=1,Type=String,Description=\"Variant Type: SUBSTITUTE/INSERT/DELETE\">\n");
+  fprintf(fpout_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
+}
+
+void mut_print_header_contig(FILE *fpout_vcf, const char *name, int32_t length)
+{
+  // TODO: does mutating the reference change the sequence length?  If so, this
+  // is not correct.
+  fprintf(fpout_vcf, "##contig=<ID=%s,length=%d>\n", name, length);
+}
+
+// TODO: add AD, DP, GQ, GT, PL
+void mut_print(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, FILE *fpout_txt, FILE *fpout_vcf)
+{
+  int32_t i, j, hap;
+  
+  // body
+  mut_t mut_prev[2] = {NOCHANGE,NOCHANGE}; // for deletions
+  for (i = 0; i < seq->l; ++i) {
+      mut_t c[3];
+      c[0] = nst_nt4_table[(int)seq->s[i]];
+      c[1] = hap1->s[i]; c[2] = hap2->s[i];
+      if (c[0] >= 4) {
+          // do nothing
+      } else if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
+          fprintf(fpout_txt, "%s\t%d\t", name, i+1);
+          if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom
+              if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+                  fprintf(fpout_txt, "%c\t%c\t3\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+                  fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=1.0;pl=3;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+              } else if ((c[1]&mutmsk) == DELETE) { // del
+                  fprintf(fpout_txt, "%c\t-\t3\n", "ACGTN"[c[0]]);
+                  if(0 == mut_prev[0] || 0 == mut_prev[1]) {
+                      fprintf(fpout_vcf, "%s\t%d\t.\t", name, i);
+                      if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf);
+                      for (j = i; j < seq->l && (c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask) && (c[1]&mutmsk) == DELETE; ++j) {
+                          fputc("ACGTN"[c[0]], fpout_vcf);
+                          // NB: this modifies 'c'
+                          if (j+1 < seq->l) { 
+                              c[0] = nst_nt4_table[(int)seq->s[j+1]];
+                              c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1];
+                          }
+                      }
+                      if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]);
+                      else fprintf(fpout_vcf, "\t.");
+                      fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=3;mt=DELETE\n"); 
+                  }
+                  // NB: convert back 'c'
+                  c[0] = nst_nt4_table[(int)seq->s[i]];
+                  c[1] = hap1->s[i]; c[2] = hap2->s[i];
+              } else if ((c[1] & mutmsk) == INSERT) { // ins
+                  fprintf(fpout_txt, "-\t");
+                  mut_print_ins(fpout_txt, hap1, i);
+                  fprintf(fpout_txt, "\t3\n");
+                  fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]);
+                  mut_print_ins(fpout_vcf, hap1, i);
+                  fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=3;mt=INSERT\n");
+              }  else assert(0);
+          } else { // het
+              if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+                  hap = ((c[1]&mutmsk) == SUBSTITUTE) ? 1 : 2;
+                  fprintf(fpout_txt, "%c\t%c\t%d\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)], hap);
+                  if(1 == hap) fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=0.5;pl=1;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+                  else fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=0.5;pl=2;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[2]&0xf]);
+              } else if ((c[1]&mutmsk) == DELETE) {
+                  fprintf(fpout_txt, "%c\t-\t1\n", "ACGTN"[c[0]]);
+                  if(0 == mut_prev[0]) {
+                      fprintf(fpout_vcf, "%s\t%d\t.\t", name, i);
+                      if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf);
+                      for (j = i; j < seq->l && (c[1] & mut_and_type_mask) != (c[2] & mut_and_type_mask) && (c[1]&mutmsk) == DELETE; ++j) {
+                          fputc("ACGTN"[c[0]], fpout_vcf);
+                          // NB: this modifies 'c'
+                          if (j+1 < seq->l) { 
+                              c[0] = nst_nt4_table[(int)seq->s[j+1]];
+                              c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1];
+                          }
+                      }
+                      if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]);
+                      else fprintf(fpout_vcf, "\t.");
+                      fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=1;mt=DELETE\n"); 
+                  }
+                  // NB: convert back 'c'
+                  c[0] = nst_nt4_table[(int)seq->s[i]];
+                  c[1] = hap1->s[i]; c[2] = hap2->s[i];
+              } else if ((c[2]&mutmsk) == DELETE) {
+                  fprintf(fpout_txt, "%c\t-\t2\n", "ACGTN"[c[0]]);
+                  if(0 == mut_prev[1]) {
+                      fprintf(fpout_vcf, "%s\t%d\t.\t", name, i);
+                      if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf);
+                      for (j = i; j < seq->l && (c[1] & mut_and_type_mask) != (c[2] & mut_and_type_mask) && (c[2]&mutmsk) == DELETE; ++j) {
+                          fputc("ACGTN"[c[0]], fpout_vcf);
+                          // NB: this modifies 'c'
+                          if (j+1 < seq->l) { 
+                              c[0] = nst_nt4_table[(int)seq->s[j+1]];
+                              c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1];
+                          }
+                      }
+                      if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]);
+                      else fprintf(fpout_vcf, "\t.");
+                      fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=2;mt=DELETE\n"); 
+                  }
+                  // NB: convert back 'c'
+                  c[0] = nst_nt4_table[(int)seq->s[i]];
+                  c[1] = hap1->s[i]; c[2] = hap2->s[i];
+              } else if ((c[1]&mutmsk) == INSERT) { // ins 1
+                  fprintf(fpout_txt, "-\t");
+                  mut_print_ins(fpout_txt, hap1, i);
+                  fprintf(fpout_txt, "\t1\n");
+                  fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]);
+                  mut_print_ins(fpout_vcf, hap1, i);
+                  fprintf(fpout_vcf, "\t100\tPASS\tAF=0.5;pl=1;mt=INSERT\n");
+              } else if ((c[2]&mutmsk) == INSERT) { // ins 2
+                  fprintf(fpout_txt, "-\t");
+                  mut_print_ins(fpout_txt, hap2, i);
+                  fprintf(fpout_txt, "\t2\n");
+                  fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]);
+                  mut_print_ins(fpout_vcf, hap2, i);
+                  fprintf(fpout_vcf, "\t100\tPASS\tAF=0.5;pl=2;mt=INSERT\n");
+              } else assert(0);
+          }
+      }
+      mut_prev[0] = (c[1] & mutmsk) != NOCHANGE;
+      mut_prev[1] = (c[2] & mutmsk) != NOCHANGE;
+  }
+}
diff --git a/src/mut.h b/src/mut.h
new file mode 100644
index 0000000..56504a4
--- /dev/null
+++ b/src/mut.h
@@ -0,0 +1,93 @@
+#ifndef MUT_H
+#define MUT_H
+
+#include "contigs.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "mut_input.h"
+#include "dwgsim_opt.h"
+#include "dwgsim.h"
+
+/* FASTA parser, copied from seq.c */
+typedef struct {
+    int l, m; /* length and maximum buffer size */
+    unsigned char *s; /* sequence */
+} seq_t;
+
+#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
+
+void 
+seq_set_block_size(int size);
+
+int 
+seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment);
+
+enum muttype_t {
+    NOCHANGE = 0, 
+    INSERT = 0x10, 
+    SUBSTITUTE = 0x20, 
+    DELETE = 0x30
+};
+
+typedef uint64_t mut_t;
+extern mut_t mutmsk;
+extern mut_t mut_and_type_mask;
+extern mut_t muttype_shift; 
+extern mut_t ins_length_shift;
+extern mut_t ins_length_mask;
+extern mut_t ins_length_max;
+extern mut_t ins_long_length_max;
+extern mut_t ins_mask;
+
+typedef struct {
+    int l, m; /* length and maximum buffer size */
+    mut_t *s; /* sequence */
+    uint8_t **ins; /* long insertions */
+    int ins_l, ins_m; /* length and maximum buffer size for long insertions */
+} mutseq_t;
+
+void
+mutseq_init_bounds();
+
+mutseq_t *
+mutseq_init();
+
+void
+mutseq_destroy(mutseq_t *seq);
+
+inline int32_t
+mut_get_ins_bytes(int32_t n);
+
+inline uint8_t*
+mut_get_ins_long_n(uint8_t *ins, uint32_t *n);
+
+mut_t
+mut_get_ins_length(mutseq_t *seq, int32_t i);
+
+int32_t
+mut_get_ins(mutseq_t *seq, int32_t i, mut_t *n, mut_t *ins);
+
+void 
+mut_diref(dwgsim_opt_t *opt, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, int32_t contig_i, muts_input_t *muts_input);
+
+void mut_print_header_pre(FILE *fpout_vcf);
+void mut_print_header_post(FILE *fpout_vcf);
+void mut_print_header_contig(FILE *fpout_vcf, const char *name, int32_t length);
+
+// Columns:
+// 1 - chromosome name
+// 2 - position (one-based)
+// 3 - reference base (dash if there was an insertion)
+// 4 - variant allele (IUPAC code or insertion base(s))
+// 5 - '-' for homozygous, '+' for heterozygous
+void 
+mut_print(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, FILE *fpout_txt, FILE *fpout_vcf);
+
+// 0 - 0
+// 1 - 1
+// 5 - 2
+// 9 - 3
+// ...
+#define mut_packed_len(_n) ((_n + 3) >> 2)
+
+#endif
diff --git a/src/mut_bed.c b/src/mut_bed.c
new file mode 100644
index 0000000..1af655b
--- /dev/null
+++ b/src/mut_bed.c
@@ -0,0 +1,136 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_bed.h"
+
+muts_bed_t *muts_bed_init(FILE *fp, contigs_t *c)
+{
+  muts_bed_t *m = NULL;
+  int32_t i;
+  char name[1024];
+  uint32_t start, end;
+  char type[1024];
+  char bases[1024];
+  uint32_t prev_contig=0, max_end=0;
+
+  m = calloc(1, sizeof(muts_bed_t));
+  m->n = 0;
+  m->mem = 1024;
+  m->muts = malloc(m->mem * sizeof(mut_bed_t));
+
+  i = 0;
+  // start is zero-based
+  // one is one-based
+  while(0 < fscanf(fp, "%s\t%u\t%u\t%s\t%s", name, &start, &end, bases, type)) {
+      // find the contig
+      while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+          i++;
+      }
+      if(c->n == i) {
+          fprintf(stderr, "Error: contig not found [%s]\n", name);
+          exit(1);
+      }
+      else if(c->contigs[i].len <= start) {
+          fprintf(stderr, "Error: start out of range [%s,%u]\n", name, start);
+          exit(1);
+      }
+      else if(c->contigs[i].len < end) {
+          fprintf(stderr, "Error: end out of range [%s,%u]\n", name, end);
+          exit(1);
+      }
+      else if(end <= start) {
+          fprintf(stderr, "Error: end <= start [%s,%u,%u]\n", name, start, end);
+          exit(1);
+      }
+      else if(0 != strcmp("*", bases) && (end - start) != strlen(bases)) {
+          fprintf(stderr, "Error: bases did not match start and end [%s,%u,%u,%s]\n", name, start, end, bases);
+          exit(1);
+      }
+      else if(prev_contig == i && start+1 <= max_end) {
+          fprintf(stderr, "Warning: overlapping entries, ignoring entry [%s\t%u\t%u\t%s\t%s]\n", name, start, end, bases, type);
+          continue;
+      }
+
+      if(prev_contig != i || max_end < end) {
+          prev_contig = i;
+          max_end = end;
+      }
+
+      if(m->n == m->mem) {
+          m->mem <<= 1;
+          m->muts = realloc(m->muts, m->mem * sizeof(mut_bed_t)); 
+      }
+      m->muts[m->n].contig = i;
+      m->muts[m->n].start = start;
+      m->muts[m->n].end = end; // one-based
+
+      m->muts[m->n].type = get_muttype(type);
+      switch(m->muts[m->n].type) {
+        case SUBSTITUTE:
+          break;
+        case INSERT:
+          if(ins_length_max < end - start) {
+              fprintf(stderr, "Error: insertion of length %d exceeded the maximum supported length of %d\n",
+                      end - start,
+                      (int32_t)ins_length_max);
+              exit(1);
+          }
+          break;
+        case DELETE:
+          break;
+        default:
+          // error
+          fprintf(stderr, "Error: mutation type unrecognized [%s]\n", type);
+          exit(1);
+      }
+      m->muts[m->n].bases = strdup(bases);
+      m->n++;
+  }
+  if(m->n < m->mem) {
+      m->mem = m->n;
+      m->muts = realloc(m->muts, m->mem * sizeof(mut_bed_t)); 
+  }
+
+  return m;
+}
+
+void muts_bed_destroy(muts_bed_t *m)
+{
+  int32_t i;
+  for(i=0;i<m->n;i++) {
+      free(m->muts[i].bases);
+  }
+  free(m->muts);
+  free(m);
+}
diff --git a/src/mut_bed.h b/src/mut_bed.h
new file mode 100644
index 0000000..772e714
--- /dev/null
+++ b/src/mut_bed.h
@@ -0,0 +1,47 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#ifndef MUT_BED_H
+#define MUT_BED_H
+
+typedef struct {
+    uint32_t contig; // zero-based
+    uint32_t start; // zero-based
+    uint32_t end; // zero-based end position
+    uint8_t type; // mutation type
+    char *bases; // mutation bases
+} mut_bed_t;
+
+typedef struct {
+    mut_bed_t *muts;
+    int32_t n;
+    int32_t mem;
+} muts_bed_t;
+
+muts_bed_t *muts_bed_init(FILE *fp, contigs_t *c);
+
+void muts_bed_destroy(muts_bed_t *m);
+
+#endif
diff --git a/src/mut_input.c b/src/mut_input.c
new file mode 100644
index 0000000..1e6c9ff
--- /dev/null
+++ b/src/mut_input.c
@@ -0,0 +1,81 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "mut_vcf.h"
+#include "mut_input.h"
+
+muts_input_t *muts_input_init(FILE *fp, contigs_t *c, int32_t type)
+{
+  muts_input_t *m = calloc(1, sizeof(muts_input_t));
+  m->type = type;
+  switch(type) {
+    case MUT_INPUT_BED:
+      m->data.bed = muts_bed_init(fp, c);
+      break;
+    case MUT_INPUT_TXT:
+      m->data.txt = muts_txt_init(fp, c);
+      break;
+    case MUT_INPUT_VCF:
+      m->data.vcf = muts_vcf_init(fp, c);
+      break;
+    default:
+      fprintf(stderr, "Error: mutation input type unrecognized!\n");
+      exit(1);
+  }
+  return m;
+}
+
+void muts_input_destroy(muts_input_t *m)
+{
+  switch(m->type) {
+    case MUT_INPUT_BED:
+      muts_bed_destroy(m->data.bed);
+      break;
+    case MUT_INPUT_TXT:
+      muts_txt_destroy(m->data.txt);
+      break;
+    case MUT_INPUT_VCF:
+      muts_vcf_destroy(m->data.vcf);
+      break;
+    default:
+      fprintf(stderr, "Error: mutation input type unrecognized!\n");
+      exit(1);
+  }
+  free(m);
+}
diff --git a/src/mut_input.h b/src/mut_input.h
new file mode 100644
index 0000000..2ae9e7f
--- /dev/null
+++ b/src/mut_input.h
@@ -0,0 +1,52 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#ifndef MUT_INPUT_H
+#define MUT_INPUT_H
+
+#include "mut_bed.h"
+#include "mut_txt.h"
+#include "mut_vcf.h"
+
+enum {
+    MUT_INPUT_BED = 0,
+    MUT_INPUT_TXT = 1,
+    MUT_INPUT_VCF = 2
+};
+
+typedef struct {
+    int32_t type;
+    union {
+        muts_bed_t *bed;
+        muts_txt_t *txt;
+        muts_vcf_t *vcf;
+    } data;
+} muts_input_t;
+
+muts_input_t *muts_input_init(FILE *fp, contigs_t *c, int32_t type);
+
+void muts_input_destroy(muts_input_t *m);
+
+#endif
diff --git a/src/mut_txt.c b/src/mut_txt.c
new file mode 100644
index 0000000..5019563
--- /dev/null
+++ b/src/mut_txt.c
@@ -0,0 +1,127 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_txt.h"
+
+muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c)
+{
+  muts_txt_t *m = NULL;
+  int32_t i;
+  char name[1024];
+  uint32_t pos, prev_pos = 0, is_hap;
+  char ref, mut[1024];
+
+  m = calloc(1, sizeof(muts_txt_t));
+  m->n = 0;
+  m->mem = 1024;
+  m->muts = malloc(m->mem * sizeof(mut_txt_t));
+
+  i = 0;
+  while(0 < fscanf(fp, "%s\t%u\t%c\t%s\t%d", name, &pos, &ref, mut, &is_hap)) {
+      // find the contig
+      while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+          i++;
+          prev_pos = 0;
+      }
+      if(c->n == i) {
+          fprintf(stderr, "Error: contig not found [%s]\n", name);
+          exit(1);
+      }
+      else if(pos <= 0 || c->contigs[i].len < pos) {
+          fprintf(stderr, "Error: start out of range [%s,%u]\n", name, pos);
+          exit(1);
+      }
+      else if(pos < prev_pos) {
+          fprintf(stderr, "Error: out of order [%s,%u]\n", name, pos);
+          exit(1);
+      }
+
+      if(m->n == m->mem) {
+          m->mem <<= 1;
+          m->muts = realloc(m->muts, m->mem * sizeof(mut_txt_t)); 
+      }
+      m->muts[m->n].contig = i;
+      m->muts[m->n].pos = pos;
+      m->muts[m->n].bases = strdup(mut);
+
+      if('-' == ref && '-' != mut[0]) {
+          m->muts[m->n].type = INSERT;
+      }
+      else if('-' != ref && '-' == mut[0]) {
+          m->muts[m->n].type = DELETE;
+      }
+      else if('-' != ref && '-' != mut[0]) {
+          m->muts[m->n].type = SUBSTITUTE;
+          if(is_hap < 3) { // heterozygous
+              if(nst_nt4_table[(int)m->muts[m->n].bases[0]] < 4) {
+                  fprintf(stderr, "Error: heterozygous bases must be in IUPAC form\n");
+                  exit(1);
+              }
+              m->muts[m->n].bases[0] = iupac_and_base_to_mut(m->muts[m->n].bases[0], ref);
+              if('X' == m->muts[m->n].bases[0]) {
+                  fprintf(stderr, "Error: out of range\n");
+                  exit(1);
+              }
+              m->muts[m->n].bases[1] = '\0';
+          }
+      }
+      else {
+          fprintf(stderr, "Error: out of range\n");
+          exit(1);
+      }
+      
+      m->muts[m->n].is_hap = is_hap;
+
+      m->n++;
+  }
+  if(m->n < m->mem) {
+      m->mem = m->n;
+      m->muts = realloc(m->muts, m->mem * sizeof(mut_txt_t)); 
+  }
+
+  return m;
+}
+
+void muts_txt_destroy(muts_txt_t *m)
+{
+  int32_t i;
+  for(i=0;i<m->n;i++) {
+      free(m->muts[i].bases);
+  }
+  free(m->muts);
+  free(m);
+}
diff --git a/src/mut_txt.h b/src/mut_txt.h
new file mode 100644
index 0000000..0b7ab3c
--- /dev/null
+++ b/src/mut_txt.h
@@ -0,0 +1,47 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#ifndef MUT_TXT_H
+#define MUT_TXT_H
+
+typedef struct {
+    uint32_t contig;
+    uint32_t pos;
+    uint8_t type;
+    uint8_t is_hap;
+    char *bases;
+} mut_txt_t;
+
+typedef struct {
+    mut_txt_t *muts;
+    int32_t n;
+    int32_t mem;
+} muts_txt_t;
+
+muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c);
+
+void muts_txt_destroy(muts_txt_t *m);
+
+#endif
diff --git a/src/mut_vcf.c b/src/mut_vcf.c
new file mode 100644
index 0000000..4ff9159
--- /dev/null
+++ b/src/mut_vcf.c
@@ -0,0 +1,267 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_vcf.h"
+
+#define BUFFER_L 1048576
+
+muts_vcf_t *muts_vcf_init(FILE *fp, contigs_t *c)
+{
+  static int32_t warned = 0;
+  muts_vcf_t *m = NULL;
+  int32_t i, j;
+  char name[1024]; // 1. #CHROM
+  uint32_t pos; // 2. POS
+  char id[1024]; // 3. ID
+  char ref[1024]; // 4. REF
+  char alt[1025]; // 5. ALT
+  size_t n_read = -1;
+  uint32_t prev_pos = 0, is_hap;
+  int32_t ref_l, alt_l;
+  char buffer[BUFFER_L];
+  int32_t s, n, len;
+
+  m = calloc(1, sizeof(muts_vcf_t));
+  m->n = 0;
+  m->mem = 1024;
+  m->muts = malloc(m->mem * sizeof(mut_vcf_t));
+  
+  // read in some buffer
+  n_read = fread(buffer, sizeof(char), BUFFER_L, fp);
+
+  i = s = n = 0;
+  while(0 != s || 0 != n_read) {
+      len = n_read + s;
+      s = 0;
+      while(s < len) { // while more characters in the buffer
+          // find the next newline
+          for(n=s;n<len;n++) {
+              if('\n' == buffer[n] || '\r' == buffer[n]) break;
+          }
+          if(n == len && 0 < n_read) { // there are more characters to read
+              break;
+          }
+          else if(s == n) { // newline found, skip
+              s++; 
+              continue;
+          }
+          else if(buffer[s] == '#') { // skip headers
+              s = n;
+              continue;
+          }
+
+          // process
+          if(EOF == sscanf(buffer+s, "%s\t%u\t%s\t%s\t%s", name, &pos, id, ref, alt)) {
+              fprintf(stderr, "Error: VCF parsing error\n"); 
+              exit(1);
+          }
+          // find ploidy in the "pl" (lowercase) tag
+          is_hap = 4;
+          while(s+4 < n) { // [\t;]pl=[1-3]
+              if(('\t' == buffer[s] || ';' == buffer[s]) && 'p' == buffer[s+1] && 'l' == buffer[s+2] && '=' == buffer[s+3]) {
+                  s += 4; // skip over '[\t;]pl='
+                  switch(buffer[s]) {
+                    case '1':
+                      is_hap = 1;
+                      break;
+                    case '2':
+                      is_hap = 2;
+                      break;
+                    case '3':
+                      is_hap = 3;
+                      break;
+                    default:
+                      fprintf(stderr, "Error: Could not determine the strand of the mutation from the 'pl' tag.\n");
+                      exit(1);
+                      break;
+                  }
+                  break;
+              }
+              s++;
+          }
+          if(4 == is_hap && 0 == warned) {
+              fprintf(stderr, "Warning: strand of the mutation not found; please use the 'pl' tag.\n"); 
+              warned = 1;
+              is_hap = 3;
+          }
+
+          s = n+1; // move past the next newline
+
+          // find the contig
+          while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+              i++;
+              prev_pos = 0;
+          }
+          if(c->n == i) {
+              fprintf(stderr, "Error: contig not found [%s]\n", name);
+              exit(1);
+          }
+          else if(pos <= 0 || c->contigs[i].len < pos) {
+              fprintf(stderr, "Error: start out of range [%s,%u]\n", name, pos);
+              exit(1);
+          }
+          else if(pos < prev_pos) {
+              fprintf(stderr, "Error: out of order [%s,%u]\n", name, pos);
+              exit(1);
+          }
+
+          ref_l = strlen(ref);
+          alt_l = strlen(alt);
+
+          if(1 == ref_l && ref[0] == '.') {
+              ref[0] = '\0';
+              ref_l = 0;
+          }
+          if(1 == alt_l && alt[0] == '.') {
+              alt[0] = '\0';
+              alt_l = 0;
+          }
+          if(0 == alt_l && 0 == ref_l) {
+              fprintf(stderr, "Error: empty alleles\n");
+              exit(1);
+          }
+
+          // TODO: support multiple alleles
+          for(j=0;j<alt_l;j++) {
+              if(',' == alt[j]) {
+                  fprintf(stderr, "Error: multiple alleles are not supported\n");
+                  exit(1);
+              }
+          }
+          
+          // to upper, and check for non-ACGT bases
+          for(j=0;j<ref_l;j++) {
+              ref[j] = "ACGTN"[nst_nt4_table[(int)ref[j]]];
+              if('N' == ref[j]) {
+                  fprintf(stderr, "Error: non-ACGT base found\n");
+                  exit(1);
+              }
+          }
+          for(j=0;j<alt_l;j++) {
+              alt[j] = "ACGTN"[nst_nt4_table[(int)alt[j]]];
+              if('N' == alt[j]) {
+                  fprintf(stderr, "Error: non-ACGT base found\n");
+                  exit(1);
+              }
+          }
+
+          if(ref_l == alt_l) { // SNP
+              while(m->mem <= m->n + ref_l - 1) {
+                  m->mem <<= 1;
+                  m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); 
+              }
+              for(j=0;j<ref_l;j++) { // multiple snps
+                  m->muts[m->n].contig = i;
+                  m->muts[m->n].pos = pos + j;
+                  m->muts[m->n].type = SUBSTITUTE;
+                  m->muts[m->n].is_hap = is_hap;
+                  m->muts[m->n].bases = malloc(sizeof(char) * 2);
+                  m->muts[m->n].bases[0] = alt[j];
+                  m->muts[m->n].bases[1] = '\0';
+                  m->n++;
+              }
+          }
+          else if(ref_l < alt_l) { // INS
+              if(m->n == m->mem) {
+                  m->mem <<= 1;
+                  m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); 
+              }
+              // skip over match bases
+              for(j=0;j<ref_l;j++,pos++) {
+                  if(ref[j] != alt[j]) break;
+              }
+              //pos--; // make 0-based for INS
+              m->muts[m->n].contig = i;
+              m->muts[m->n].pos = pos;
+              m->muts[m->n].type = INSERT;
+              m->muts[m->n].is_hap = is_hap;
+              m->muts[m->n].bases = strdup(alt+j);
+              m->n++;
+          }
+          else { // DEL
+              // skip over match bases
+              for(j=0;j<alt_l;j++,pos++) {
+                  if(ref[j] != alt[j]) break;
+              }
+              if(j == ref_l) {
+                  fprintf(stderr, "Error: no deleted bases\n");
+                  exit(1);
+              }
+              while(m->mem < m->n + (ref_l - j)) {
+                  m->mem <<= 1;
+                  m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); 
+              }
+              for(;j<ref_l;j++,pos++) {
+                  m->muts[m->n].contig = i;
+                  m->muts[m->n].pos = pos;
+                  m->muts[m->n].type = DELETE;
+                  m->muts[m->n].is_hap = is_hap;
+                  m->muts[m->n].bases = NULL;
+                  m->n++;
+              }
+          }
+
+          prev_pos = pos;
+
+          // move to 'n'
+          s = n;
+      }
+      n++; // move past last character
+      // shift unused characters down 
+      for(s = 0; n < len; n++, s++) {
+          buffer[s] = buffer[n];
+      }
+      // read in some buffer
+      n_read = fread(buffer + s, sizeof(char), BUFFER_L - s, fp);
+  }
+  if(m->n < m->mem) {
+      m->mem = m->n;
+      m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); 
+  }
+
+  return m;
+}
+
+void muts_vcf_destroy(muts_vcf_t *m)
+{
+  int32_t i;
+  for(i=0;i<m->n;i++) {
+      free(m->muts[i].bases);
+  }
+  free(m->muts);
+  free(m);
+}
diff --git a/src/mut_vcf.h b/src/mut_vcf.h
new file mode 100644
index 0000000..fe8ce76
--- /dev/null
+++ b/src/mut_vcf.h
@@ -0,0 +1,39 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#ifndef MUT_VCF_H
+#define MUT_VCF_H
+
+#include "mut_txt.h"
+
+typedef muts_txt_t muts_vcf_t;
+typedef mut_txt_t mut_vcf_t;
+
+
+muts_vcf_t *muts_vcf_init(FILE *fp, contigs_t *c);
+
+void muts_vcf_destroy(muts_vcf_t *m);
+
+#endif
diff --git a/src/regions_bed.c b/src/regions_bed.c
new file mode 100644
index 0000000..0b70117
--- /dev/null
+++ b/src/regions_bed.c
@@ -0,0 +1,148 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "regions_bed.h"
+
+regions_bed_txt *regions_bed_init(FILE *fp, contigs_t *c)
+{
+  regions_bed_txt *r = NULL;
+  char name[1024];
+  uint32_t start, end, len;
+  int32_t i, prev_contig, prev_start, prev_end, b;
+
+  r = calloc(1, sizeof(regions_bed_txt));
+  r->n = 0;
+  r->mem = 4;
+  r->contig = malloc(r->mem * sizeof(uint32_t));
+  r->start = malloc(r->mem * sizeof(uint32_t));
+  r->end = malloc(r->mem * sizeof(uint32_t));
+  
+  i = 0;
+  prev_contig = prev_start = prev_end = -1;
+  while(0 < fscanf(fp, "%s\t%u\t%u", name, &start, &end)) {
+      len = end - start + 1;
+      // find the contig
+      while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+          i++;
+      }
+      if(c->n == i) {
+          fprintf(stderr, "Error: contig not found [%s]\n", name);
+          exit(1);
+      }
+      else if(c->contigs[i].len < start) {
+          fprintf(stderr, "Error: start out of range [%s,%u]\n", name, start);
+          exit(1);
+      }
+      else if(c->contigs[i].len < end) {
+          fprintf(stderr, "Error: end out of range [%s,%u]\n", name, end);
+          exit(1);
+      }
+      else if(end < start) {
+          fprintf(stderr, "Error: end < start [%s,%u,%u]\n", name, start, end);
+          exit(1);
+      }
+      else if(prev_contig == i && start < prev_start) {
+          fprintf(stderr, "Error: the input was not sorted [%s,%u,%u,%u]\n", name, start, end, len);
+          exit(1);
+      }
+      
+      if(end - start + 1 != len) {
+          fprintf(stderr, "Warning: len != end - start + 1 [%s,%u,%u,%u]\n", name, start, end, len);
+      }
+      
+      if(prev_contig == i && start <= prev_end && prev_start <= start) {
+          if(prev_end < end) {
+              r->end[r->n-1] = end;
+              prev_end = end;
+          }
+      }
+      else {
+          prev_contig = i;
+          prev_start = start;
+          prev_end = end;
+          while(r->mem <= r->n) {
+              r->mem <<= 1;
+              r->contig = realloc(r->contig, r->mem * sizeof(uint32_t));
+              r->start = realloc(r->start, r->mem * sizeof(uint32_t));
+              r->end = realloc(r->end, r->mem * sizeof(uint32_t));
+          }
+          r->contig[r->n] = i;
+          r->start[r->n] = start;
+          r->end[r->n] = end;
+          r->n++;
+      }
+      // move to the end of the line
+      while(EOF != (b = fgetc(fp))) {
+          if('\n' == b || '\r' == b) break;
+      }
+  }
+  return r;
+}
+
+void regions_bed_destroy(regions_bed_txt *r)
+{
+  free(r->contig);
+  free(r->start);
+  free(r->end);
+  free(r);
+}
+
+int32_t regions_bed_query(regions_bed_txt *r, uint32_t contig, uint32_t start, uint32_t end) 
+{
+  int32_t low, high, mid;
+  if(NULL == r) return 1;
+
+  low = 0;
+  high = r->n-1;
+  
+  while(low <= high) {
+      mid = (low + high) / 2;
+      if(contig < r->contig[mid] ||
+         (contig == r->contig[mid] && start < r->start[mid])) {
+          high = mid - 1;
+      }
+      else if(r->contig[mid] < contig ||
+              (r->contig[mid] == contig && r->end[mid] < end)) {
+          low = mid + 1;
+      }
+      else if(r->contig[mid] == contig && r->start[mid] <= start && end <= r->end[mid]) {
+          return 1;
+      }
+      else {
+          break;
+      }
+  }
+  return 0;
+}
diff --git a/src/regions_bed.h b/src/regions_bed.h
new file mode 100644
index 0000000..9243bb9
--- /dev/null
+++ b/src/regions_bed.h
@@ -0,0 +1,21 @@
+#ifndef REGIONS_BED_H
+#define REGIONS_BED_H
+
+typedef struct {
+    uint32_t *contig;
+    uint32_t *start;
+    uint32_t *end;
+    uint32_t n;
+    uint32_t mem;
+} regions_bed_txt;
+
+regions_bed_txt *
+regions_bed_init(FILE *fp, contigs_t *c);
+
+void 
+regions_bed_destroy(regions_bed_txt *r);
+
+int32_t 
+regions_bed_query(regions_bed_txt *r, uint32_t contig, uint32_t start, uint32_t end); 
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dwgsim.git



More information about the debian-med-commit mailing list