[med-svn] [dwgsim] 01/03: Imported Upstream version 0.1.11
Kevin Murray
daube-guest at moszumanska.debian.org
Wed Sep 23 01:01:45 UTC 2015
This is an automated email from the git hooks/post-receive script.
daube-guest pushed a commit to branch master
in repository dwgsim.
commit 58f34768b3edd452b2fbaec2aaad11411f3a0818
Author: Kevin Murray <spam at kdmurray.id.au>
Date: Wed Sep 23 09:14:22 2015 +1000
Imported Upstream version 0.1.11
---
.gitmodules | 3 +
INSTALL | 33 +
LICENSE | 339 ++++++++++
Makefile | 73 +++
README | 9 +
scripts/dwgsim_eval_plot.py | 120 ++++
scripts/dwgsim_mut_to_vcf.pl | 106 ++++
scripts/dwgsim_pileup_eval.pl | 202 ++++++
scripts/galaxy/dwgsim_eval_wrapper.py | 157 +++++
scripts/galaxy/dwgsim_eval_wrapper.xml | 73 +++
scripts/galaxy/dwgsim_wrapper.py | 182 ++++++
scripts/galaxy/dwgsim_wrapper.xml | 129 ++++
src/contigs.c | 54 ++
src/contigs.h | 23 +
src/dwgsim.c | 1085 ++++++++++++++++++++++++++++++++
src/dwgsim.h | 26 +
src/dwgsim_eval.c | 698 ++++++++++++++++++++
src/dwgsim_eval.h | 66 ++
src/dwgsim_opt.c | 381 +++++++++++
src/dwgsim_opt.h | 66 ++
src/mut.c | 888 ++++++++++++++++++++++++++
src/mut.h | 93 +++
src/mut_bed.c | 136 ++++
src/mut_bed.h | 47 ++
src/mut_input.c | 81 +++
src/mut_input.h | 52 ++
src/mut_txt.c | 127 ++++
src/mut_txt.h | 47 ++
src/mut_vcf.c | 267 ++++++++
src/mut_vcf.h | 39 ++
src/regions_bed.c | 148 +++++
src/regions_bed.h | 21 +
32 files changed, 5771 insertions(+)
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e37d154
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "samtools"]
+ path = samtools
+ url = http://github.com/samtools/samtools.git
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..cc1d7cf
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,33 @@
+Welcome to DWGSIM
+
+=== Prerequisites ===
+
+If you have just cloned this repository using GIT, you must
+initialize the SAMtools sub-module:
+ git submodule init
+ git submodule update
+You should now have the SAMtools-compatible version checked
+out in the samtools sub-directory.
+
+=== Building DWGSIM ===
+
+To build DWGSIM, execute the following commands:
+ make
+
+To install DWGSIM, after building, copy the following binaries
+to their appropriate installation directories:
+ dwgsim
+ dwgsim_eval
+ dwgims_pileup_eval.pl
+
+=== DWGSIM with Galaxy ===
+
+DWGSIM can be incorporated into galaxy.
+NB: this is out-of-date.
+
+1. Copy over the wrapper scripts:
+ cp scripts/galaxy/dwgsim_wrapper.* galaxy-dist/tools/sr_mapping
+2. Add to the list of available tools.
+ 2a. Open galaxy-dist/tool_conf.xml
+ 2b. At the end of the NGS: mapping section, add the following:
+ <tool file="sr_mapping/dwgsim_wrapper.xml" />
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0d5b9ec
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,73 @@
+PACKAGE_VERSION="0.1.11"
+CC= gcc
+CFLAGS= -g -Wall -O3 #-m64 #-arch ppc
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -DPACKAGE_VERSION=\\\"${PACKAGE_VERSION}\\\"
+DWGSIM_AOBJS = src/dwgsim_opt.o src/mut.o src/contigs.o src/regions_bed.o \
+ src/mut_txt.o src/mut_bed.o src/mut_vcf.o src/mut_input.o src/dwgsim.o
+DWGSIM_EVAL_AOBJS = src/dwgsim_eval.o \
+ samtools/knetfile.o \
+ samtools/bgzf.o samtools/kstring.o samtools/bam_aux.o samtools/bam.o samtools/bam_import.o samtools/sam.o samtools/bam_index.o \
+ samtools/bam_pileup.o samtools/bam_lpileup.o samtools/bam_md.o samtools/razf.o samtools/faidx.o samtools/bedidx.o \
+ samtools/bam_sort.o samtools/sam_header.o samtools/bam_reheader.o samtools/kprobaln.o samtools/bam_cat.o
+
+PROG= dwgsim dwgsim_eval
+INCLUDES= -I.
+SUBDIRS= samtools .
+CLEAN_SUBDIRS= samtools src
+LIBPATH=
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+all:$(PROG)
+
+.PHONY:all lib clean cleanlocal
+.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
+dwgsim:lib-recur $(DWGSIM_AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(DWGSIM_AOBJS) -lm -lz -lpthread
+
+dwgsim_eval:lib-recur $(DWGSIM_EVAL_AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(DWGSIM_EVAL_AOBJS) -Lsamtools -lm -lz -lpthread
+
+cleanlocal:
+ rm -vfr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib; \
+ wdir=`pwd`; \
+ list='$(CLEAN_SUBDIRS)'; for subdir in $$list; do \
+ if [ -d $$subdir ]; then \
+ cd $$subdir; \
+ pwd; \
+ rm -vfr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib; \
+ cd $$wdir; \
+ fi; \
+ done;
+
+clean:cleanlocal-recur
+
+dist:clean
+ if [ -f dwgsim-${PACKAGE_VERSION}.tar.gz ]; then \
+ rm -rv dwgsim-${PACKAGE_VERSION}.tar.gz; \
+ fi; \
+ if [ -f dwgsim-${PACKAGE_VERSION}.tar ]; then \
+ rm -rv dwgsim-${PACKAGE_VERSION}.tar; \
+ fi; \
+ if [ -d dwgsim-${PACKAGE_VERSION} ]; then \
+ rm -rv dwgsim-${PACKAGE_VERSION}; \
+ fi; \
+ mkdir dwgsim-${PACKAGE_VERSION}; \
+ cp -r INSTALL LICENSE Makefile README scripts src dwgsim-${PACKAGE_VERSION}/.; \
+ tar -vcf dwgsim-${PACKAGE_VERSION}.tar dwgsim-${PACKAGE_VERSION}; \
+ gzip -9 dwgsim-${PACKAGE_VERSION}.tar; \
+ rm -rv dwgsim-${PACKAGE_VERSION};
diff --git a/README b/README
new file mode 100644
index 0000000..31a7e4f
--- /dev/null
+++ b/README
@@ -0,0 +1,9 @@
+Welcome to DWGSIM.
+
+Please see the file LICENSE for details.
+Please see the file INSTALL for installation instructions;
+
+This software package has limited support since it is in active development.
+
+Please see the following page for more details:
+https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation
diff --git a/scripts/dwgsim_eval_plot.py b/scripts/dwgsim_eval_plot.py
new file mode 100644
index 0000000..b3ea6ec
--- /dev/null
+++ b/scripts/dwgsim_eval_plot.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from optparse import OptionParser
+import re
+
+import string
+import pylab
+
+class Table:
+
+ def __init__(self, fn, min_mapq):
+ ncol = -1
+
+ # read in the data
+ data = list()
+ fh = open(fn, 'r')
+ for line in fh:
+ line = line.rstrip("\r\n")
+ if '#' == line[0]:
+ continue
+ else:
+ table = self.__parse(line)
+ if table[0] < min_mapq:
+ continue
+ data.append(table)
+ if -1 == ncol:
+ ncol = len(table)
+ elif ncol != len(table):
+ sys.stderr.write("Error: table was misformatted\n")
+ sys.exit(1)
+
+ fh.close()
+
+ # re-arrange the data
+ self.matrix = [[data[i][j] for j in xrange(ncol)] for i in xrange(len(data))]
+
+ # NB: assumes sorted descending
+ def __parse(self, line):
+ tokens = re.findall(r'[\w+-.]+', line)
+ for i in range(0, 13):
+ tokens[i] = int(tokens[i])
+ for i in range(13, len(tokens)):
+ tokens[i] = float(tokens[i])
+ return tokens
+
+def main(options):
+ fmts = ['-', '--', '-.', ':', '.']
+
+ if None == options.fns or 0 == len(options.fns):
+ return
+
+ # read in through the files
+ fmts_i = 0
+ for fn in options.fns:
+ table = Table(fn, options.min_mapq)
+ # extract sensitivity
+ x = [table.matrix[i][17] for i in range(len(table.matrix))]
+ # extract specificity
+ y = [table.matrix[i][16] for i in range(len(table.matrix))]
+ pylab.plot(x, y, fmts[fmts_i])
+ fmts_i = fmts_i + 1
+ if len(fmts) <= fmts_i:
+ fmts_i = 0
+ # plot values
+ pylab.title('DWGSIM ROC')
+ pylab.xlabel('specificity')
+ pylab.ylabel('sensitivity')
+ # xlim
+ if None != options.xlim:
+ nums = re.findall(r'\d+\.?\d*', options.xlim)
+ if 2 == len(nums):
+ pylab.xlim(xmin=float(nums[0]), xmax=float(nums[1]))
+ # ylim
+ if None != options.ylim:
+ nums = re.findall(r'\d+\.?\d*', options.ylim)
+ if 2 == len(nums):
+ pylab.ylim(ymin=float(nums[0]), ymax=float(nums[1]))
+ # legend
+ if not options.no_legend:
+ if None != options.ids and len(options.fns) == len(options.ids):
+ pylab.legend(options.ids, title='IDs', loc='lower left')
+ elif options.infer_ids:
+ ids = list()
+ for fn in options.fns:
+ i = re.sub(r'^.*\/', '', fn)
+ i = re.sub(r'\.sam\.txt$', '', i)
+ i = re.sub(r'^out', '', i)
+ i = re.sub(r'^dwgsim_eval', '', i)
+ i = re.sub(r'^\.', '', i)
+ ids.append(i)
+ pylab.legend(ids, title='IDs', loc='lower left')
+ # show the plot
+ if None == options.out:
+ pylab.show()
+ else:
+ # check if the file extension is present, if not, add it
+ if len(options.out) <= len(options.outtype) or options.outtype != options.out[len(options.out)-len(options.outtype):]:
+ pylab.savefig("%s.%s" % (options.out, options.outtype), format=options.outtype)
+ else:
+ pylab.savefig(options.out, format=options.outtype)
+ pylab.close()
+
+if __name__ == '__main__':
+ parser = OptionParser()
+ parser.add_option('--fn', help="an output file from dwgsim_eval", action="append", dest="fns")
+ parser.add_option('--id', help="the corresponding ID to add to the plot legened", action="append", dest="ids")
+ parser.add_option('--infer-ids', help="infer the ids from the file names", action="store_true", default=True, dest="infer_ids")
+ parser.add_option('--xlim', help="the x-axis range", dest="xlim")
+ parser.add_option('--ylim', help="the y-axis range", dest="ylim")
+ parser.add_option('--out', help="the output file", dest="out")
+ parser.add_option('--outtype', help="the output file type (ex. pdf, png)", dest="outtype", default="pdf")
+ parser.add_option('--min-mapq', help="minimum mapping quality (inclusive)", dest="min_mapq", type=int, default=0)
+ parser.add_option('--no-legend', help="do not add a legend to the plot", dest="no_legend", action="store_true", default=False)
+ if len(sys.argv[1:]) < 1:
+ parser.print_help()
+ else:
+ options, args = parser.parse_args()
+ main(options)
diff --git a/scripts/dwgsim_mut_to_vcf.pl b/scripts/dwgsim_mut_to_vcf.pl
new file mode 100644
index 0000000..04a9b80
--- /dev/null
+++ b/scripts/dwgsim_mut_to_vcf.pl
@@ -0,0 +1,106 @@
+#!/bin/perl
+
+use strict;
+use warnings;
+
+# TODO:
+# getopt
+# better messaging
+# usage
+# pod...
+
+my $fn = shift || die "No input file";
+my %to_alt = (
+ "AR" => "G", "GR" => "A",
+ "CY" => "T", "TY" => "C",
+ "CS" => "G", "GS" => "C",
+ "AW" => "T", "TW" => "A",
+ "GK" => "T", "TK" => "G",
+ "AM" => "C", "CM" => "A"
+);
+
+open(FH, "$fn") || die "Could not open '$fn' for reading.\n";
+
+print STDOUT "##fileformat=VCFv4.1\n";
+print STDOUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample\n";
+
+my $type = -1; # 1 - SUB, 2 - INS, 3 - DEL
+my $prev_name = -1;
+my $prev_pos = -1;
+my $del_n = 0;
+my $del_name = "";
+my $del_pos = -1;
+my $del_ref = "";
+my $del_s = -1;
+my $line_n = 0;
+while(defined(my $line = <FH>)) {
+ if($line =~ m/^(.+)\t(\d+)\t(.+)\t(.+)\t(\d+)$/) {
+ $line_n++;
+ my $name = $1;
+ my $pos = $2;
+ my $ref = $3;
+ my $alt = $4;
+ my $s = $5;
+ if($prev_name ne $name) {
+ if(0 < $del_n) {
+ printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+ $del_name, $del_pos, $del_ref, $del_s);
+ }
+ # reset
+ $prev_pos = -1; $del_n = 0; $del_s = -1;
+ $del_name = ""; $del_pos = -1; $del_ref = "";
+ }
+ if("-" eq $alt) { # DEL
+ if(0 < $del_n && $del_pos + $del_n == $pos && $s == $del_s) {
+ $del_n++;
+ $del_ref .= $ref;
+ }
+ else {
+ if(0 < $del_n) {
+ printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+ $del_name, $del_pos, $del_ref, $del_s);
+ }
+ $del_n = 1;
+ $del_name = $name;
+ $del_pos = $pos;
+ $del_ref = $ref;
+ $del_s = $s;
+ }
+ }
+ else {
+ # print deletion, if it exists
+ if(0 < $del_n) {
+ printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+ $del_name, $del_pos, $del_ref, $s);
+ # reset
+ $prev_pos = -1; $del_n = 0; $del_s = -1;
+ $del_name = ""; $del_pos = -1; $del_ref = "";
+ }
+ if("-" eq $ref) { # INS
+ printf(STDOUT "%s\t%d\t.\t.\t%s\t100\tPASS\tpl=%d\n",
+ $name, $pos, $alt, $s);
+ }
+ else { # SUB
+ if(3 != $s) {
+ if(!defined($to_alt{"$ref"."$alt"})) {
+ die "Could not convert $ref$alt"." on $line_n.\n";
+ }
+ $alt = $to_alt{"$ref"."$alt"};
+ }
+ printf(STDOUT "%s\t%d\t.\t%s\t%s\t100\tPASS\tpl=%d\n",
+ $name, $pos, $ref, $alt, $s);
+ }
+ }
+ $prev_name = $name;
+ $prev_pos = $pos;
+ }
+ else {
+ die "Input file is not in the proper format.\n";
+ }
+}
+# print deletion, if it exists
+if(0 < $del_n) {
+ printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n",
+ $del_name, $del_pos, $del_ref, $del_s);
+}
+close(FH);
diff --git a/scripts/dwgsim_pileup_eval.pl b/scripts/dwgsim_pileup_eval.pl
new file mode 100644
index 0000000..1d459ac
--- /dev/null
+++ b/scripts/dwgsim_pileup_eval.pl
@@ -0,0 +1,202 @@
+#!/usr/bin/perl -w
+
+# Copied from SAMtools (http://samtools.sourceforge.net)
+
+# This perl code is very cryptic. Evaluates the fraction
+# of reads correctly aligned at various mapping quality
+# thresholds.
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+select STDERR; $| = 1; # make STDERR unbuffered
+
+&dwgsim_eval;
+exit;
+
+sub dwgsim_eval {
+ my %opts = (g=>5, n=>0);
+ getopts('bpc:gn:', \%opts);
+ print_usage() if (@ARGV == 0 && -t STDIN);
+ my ($flag, $m, $ctr) = (0, 0, 0);
+ my $gap = $opts{g};
+ my $bwa = (defined $opts{b}) ? 1 : 0;
+ $flag |= 1 if (defined $opts{p});
+ $flag |= 2 if (defined $opts{c});
+ my $n = 2*$opts{n};
+ my @a = ();
+
+ my $prev_read_name = "ZZZ";
+ my @reads = ();
+
+ my ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct) = (0, 0, 0, 0, 0, 0);
+
+ print STDERR "Analyzing...\nCurrently on:\n0";
+ while (<>) {
+ $ctr++;
+ if(0 == ($ctr % 10000)) {
+ print STDERR "\r$ctr";
+ }
+ next if (/^\@/);
+ my @t = split("\t");
+ next if (@t < 11);
+ my $sam = $_;
+ my $cur_read_name = $t[0];
+
+ if($prev_read_name ne $cur_read_name) { # process the read(s)
+ die unless (0 == length($prev_read_name) || 0 < scalar(@reads));
+
+ # go through each end
+ # want to know
+ # - is the correct location represented in each end?
+ # - if so, does the correct location have the best alignment score?
+ # - what's the combined alignment score of the best pair?
+ @a = process_reads(\@reads, $flag, $gap, $bwa);
+ $found_correct_1 += $a[0];
+ $found_correct_2 += $a[1];
+ $is_correct_1 += $a[2];
+ $is_correct_2 += $a[3];
+ $found_pair_correct += $a[4];
+ $is_pair_correct += $a[5];
+
+ $prev_read_name = $cur_read_name;
+ @reads = ();
+ $m++;
+ }
+ # store the read
+ push(@reads, $sam);
+
+ #print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
+ }
+ @a = process_reads(\@reads, $flag, $gap, $bwa);
+ $found_correct_1 += $a[0];
+ $found_correct_2 += $a[1];
+ $is_correct_1 += $a[2];
+ $is_correct_2 += $a[3];
+ $found_pair_correct += $a[4];
+ $is_pair_correct += $a[5];
+ print STDERR "\r$ctr\n";
+
+ printf(STDERR "%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
+ $m, $found_correct_1, $found_correct_2,
+ $is_correct_1, $is_correct_2,
+ $found_pair_correct,
+ $is_pair_correct);
+
+ print STDERR "Analysis complete.\n";
+}
+
+sub print_usage {
+ print STDERR "Usage: dwgsim_eval.pl [-pcnd] [-g GAP] <in.sam>\n";
+ print STDERR "\t\t-p\t\tprint incorrect alignments\n";
+ print STDERR "\t\t-c\t\tcolor space alignments\n";
+ print STDERR "\t\t-n\tINT\tnumber of raw input paired-end reads\n";
+ print STDERR "\t\t-d\tINT\tdivide quality by this factor\n";
+ print STDERR "\t\t-i\t\tprint only alignments with indels\n";
+ print STDERR "\t\t-b\t\talignments are from BWA\n";
+ exit(1);
+}
+
+sub swap {
+ my ($a, $b) = @_;
+ return ($b, $a);
+}
+
+sub process_reads {
+ my ($reads, $flag, $gap, $bwa) = @_;
+
+ my ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct) = (0, 0, 0, 0, 0, 0);
+ my ($correct_index_1, $correct_index_2, $max_AS_index_1, $max_AS_index_2, $max_AS_1, $max_AS_2) = (-1, -1, -1, -1, -1000000, -1000000);
+
+ # go through reads
+ for(my $i=0;$i<scalar(@$reads);$i++) {
+ my $line = $reads->[$i];
+
+ my @t = split(/\t/, $line);
+ die if (@t < 11);
+ my ($q, $chr, $left, $rght) = ($t[4], $t[2], $t[3], $t[3]);
+ # right coordinate
+ $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg;
+ --$rght;
+
+ # skip unmapped reads
+ next if (($t[1]&0x4) || $chr eq '*');
+
+ # parse read name
+ if($t[0] =~ m/^(\S+)_(\d+)_(\d+)_(\d)_(\d)_(\d+):(\d+):(\d+)_(\d+):(\d+):(\d+)_(\S+)/) {
+ my ($o_chr, $pos_1, $pos_2, $str_1, $str_2) = ($1, $2, $3, $4, $5);
+ my ($n_err_1, $n_sub_1, $n_indel_1, $n_err_2, $n_sub_2, $n_indel_2) = ($6, $7, $8, $9, $10, $11);
+ my $end = 2;
+ if(($t[1]&0x40)) { # read #1
+ $end = 1;
+ }
+
+ # check alignment score
+ for(my $j=11;$j<scalar(@t);$j++) {
+ if($t[$j] =~ m/AS:i:(-?\d+)/) {
+ my $AS = $1;
+ if(1 == $end) {
+ if($max_AS_index_1 == -1 || $max_AS_1 < $AS) {
+ $max_AS_index_1 = $i;
+ $max_AS_1 = $AS;
+ }
+ }
+ else {
+ if($max_AS_index_2 == -1 || $max_AS_2 < $AS) {
+ $max_AS_index_2 = $i;
+ $max_AS_2 = $AS;
+ }
+ }
+ last;
+ }
+ }
+
+ if ($o_chr eq $chr) { # same chr
+ if ($flag & 2) { # SOLiD
+ if(1 == $bwa) {
+ # Swap 1 and 2
+ ($pos_1, $pos_2) = &swap($pos_1, $pos_2);
+ ($str_1, $str_2) = &swap($str_1, $str_2);
+ ($n_err_1, $n_err_2) = &swap($n_err_1, $n_err_2);
+ ($n_sub_1, $n_sub_2) = &swap($n_sub_1, $n_sub_2);
+ ($n_indel_1, $n_indel_2) = &swap($n_indel_1, $n_indel_2);
+ }
+ }
+
+ if(1 == $end) { # first read
+ if(abs($pos_1 - $left) <= $gap) {
+ $found_correct_1=1;
+ $correct_index_1 = $i;
+ }
+ } else { # second read
+ if(abs($pos_2 - $left) <= $gap) {
+ $found_correct_2=1;
+ $correct_index_1 = $i;
+ }
+ }
+ }
+ } else {
+ warn("[dwgsim_eval] read '$t[0]' was not generated by dwgsim?\n");
+ next;
+ }
+ }
+
+ # was the max AS the correct one?
+ if(0 < $found_correct_1 && $correct_index_1 == $max_AS_index_1) {
+ $is_correct_1=1;
+ }
+ if(0 < $found_correct_2 && $correct_index_2 == $max_AS_index_2) {
+ $is_correct_2=1;
+ }
+
+ # check pair ???
+ if(0 < $found_correct_1 && 0 < $found_correct_2) {
+ $found_pair_correct = 1;
+ }
+ if(1 == $is_correct_1 && 1 == $is_correct_2) {
+ $is_pair_correct = 1;
+ }
+
+ return ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct);
+}
diff --git a/scripts/galaxy/dwgsim_eval_wrapper.py b/scripts/galaxy/dwgsim_eval_wrapper.py
new file mode 100644
index 0000000..23e84b0
--- /dev/null
+++ b/scripts/galaxy/dwgsim_eval_wrapper.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+
+"""
+Runs DWGSIM_EVAL
+
+usage: dwgsim_eval_wrapper.py [options]
+ -a,--alignmentScore=a: split alignments by alignment score instead of mapping quality
+ -b,--bwa=b: alignments are from BWA (SOLiD only)
+ -c,--colorSpace=c: color space alignments
+ -d,--scoreFactor=d: divide quality/alignment score by this factor
+ -g,--wiggle=g: gap "wiggle"
+ -n,--numReads=n: number of raw input paired-end reads (otherwise, inferred from all SAM records present).
+ -q,--minMapq=q: consider only alignments with this mapping quality or greater.
+ -z,--singleEnd=z: input contains only single end reads
+ -S,--sam=s: input SAM file
+ -B,--bam=B: input BAM file
+ -p,--printIncorrect=p: print incorrect alignments
+ -s,--numSnps=s: consider only alignments with the number of specified SNPs
+ -e,--numErrors=e: consider only alignments with the number of specified errors
+ -i,--indels=i: consider only alignments with indels
+ -o,--output=u: The file to save the output
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+ sys.stderr.write( '%s\n' % msg )
+ sys.exit()
+
+def run_process ( cmd, name, tmp_dir, buffsize ):
+ try:
+ tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+ tmp_stderr = open( tmp, 'wb' )
+ proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+ returncode = proc.wait()
+ tmp_stderr.close()
+ # get stderr, allowing for case where it's very large
+ tmp_stderr = open( tmp, 'rb' )
+ stderr = ''
+ try:
+ while True:
+ stderr += tmp_stderr.read( buffsize )
+ if not stderr or len( stderr ) % buffsize != 0:
+ break
+ except OverflowError:
+ pass
+ tmp_stderr.close()
+ if returncode != 0:
+ raise Exception, stderr
+ except Exception, e:
+ raise Exception, 'Error in \'' + name + '\'. \n' + str( e )
+
+def check_output ( output, canBeEmpty ):
+ if 0 < os.path.getsize( output ):
+ return True
+ elif False == canBeEmpty:
+ raise Exception, 'The output file is empty:' + output
+
+def __main__():
+ #Parse Command Line
+ parser = optparse.OptionParser()
+
+ parser.add_option( '-a', '--alignmentScore', action='store_true', dest='alignmentScore', default=False, help='split alignments by alignment score instead of mapping quality' )
+ parser.add_option( '-b', '--bwa', action='store_true', dest='bwa', default=False, help='alignments are from BWA (SOLiD only)' )
+ parser.add_option( '-c', '--colorSpace', action='store_true', dest='colorSpace', default=False, help='generate reads in color space (SOLiD reads)' )
+ parser.add_option( '-d', '--scoreFactor', dest='scoreFactor', type='int', help='divide quality/alignment score by this factor' )
+ parser.add_option( '-g', '--wiggle', dest='wiggle', type='int', help='gap "wiggle"' )
+ parser.add_option( '-n', '--numReads', dest='numReads', type='int', help='number of raw input paired-end reads (otherwise, inferred from all SAM records present).' )
+ parser.add_option( '-q', '--minMapq', dest='minMapq', type='int', help='consider only alignments with this mapping quality or greater.' )
+ parser.add_option( '-z', '--singleEnd', action='store_true', dest='singleEnd', default=False, help='input contains only single end reads' )
+ parser.add_option( '-S', '--sam', dest='sam', default=None, help='input SAM' )
+ parser.add_option( '-B', '--bam', dest='bam', default=None, help='input BAM' )
+ parser.add_option( '-p', '--printIncorrect', action='store_true', dest='printIncorrect', default=False, help='print incorrect alignments' )
+ parser.add_option( '-s', '--numSnps', dest='numSnps', type="int", help='consider only alignments with the number of specified SNPs' )
+ parser.add_option( '-e', '--numErrors', dest='numErrors', type="int", default=False, help='consider only alignments with the number of specified errors' )
+ parser.add_option( '-i', '--indels', action='store_true', dest='indels', default=False, help='consider only alignments with indels' )
+ parser.add_option( '-o', '--output', dest='output', help='The file to save the output' )
+
+ (options, args) = parser.parse_args()
+
+ # output version # of tool
+ try:
+ tmp = tempfile.NamedTemporaryFile().name
+ tmp_stdout = open( tmp, 'wb' )
+ proc = subprocess.Popen( args='dwgsim_eval 2>&1', shell=True, stdout=tmp_stdout )
+ tmp_stdout.close()
+ returncode = proc.wait()
+ stdout = None
+ for line in open( tmp_stdout.name, 'rb' ):
+ if line.lower().find( 'version' ) >= 0:
+ stdout = line.strip()
+ break
+ if stdout:
+ sys.stdout.write( '%s\n' % stdout )
+ else:
+ raise Exception
+ except:
+ sys.stdout.write( 'Could not determine DWGSIM_EVAL version\n' )
+
+ buffsize = 1048576
+
+ # make temp directory for dwgsim, requires trailing slash
+ tmp_dir = '%s/' % tempfile.mkdtemp()
+
+ #'generic' options used in all dwgsim commands here
+
+ try:
+ tmp_dir = '%s/' % tempfile.mkdtemp()
+ dwgsim_eval_cmd = 'dwgsim_eval'
+ if True == options.alignmentScore:
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' -a'
+ if True == options.bwa:
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' -b'
+ if True == options.colorSpace:
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' -c'
+ use_p = False
+ if 0 <= options.numSnps:
+ use_p = True
+ dwgsim_eval_cmd = dwgsim_eval_cmd + (' -s %s' % options.numSnps)
+ if 0 <= options.numErrors:
+ use_p = True
+ dwgsim_eval_cmd = dwgsim_eval_cmd + (' -e %s' % options.numErrors)
+ if True == options.indels:
+ use_p = True
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' -i'
+ if True == use_p or True == options.printIncorrect:
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' -p'
+ if True == options.singleEnd:
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' -z'
+ dwgsim_eval_cmd = '%s -d %s -g %s -n %s -q %s' % (dwgsim_eval_cmd, \
+ options.scoreFactor, \
+ options.wiggle, \
+ options.numReads, \
+ options.minMapq)
+ if None != options.sam:
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' -S ' + options.sam
+ elif None != options.bam:
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' ' + options.bam
+ else:
+ raise Exception, 'Input file was neither a SAM nor BAM'
+ dwgsim_eval_cmd = dwgsim_eval_cmd + ' > ' + options.output
+
+ # need to nest try-except in try-finally to handle 2.4
+ try:
+ # dwgsim
+ run_process ( dwgsim_eval_cmd, 'dwgsim', tmp_dir, buffsize )
+ # check that there are results in the output file
+ check_output ( options.output, False )
+ sys.stdout.write( 'DWGSIM_EVAL successful' )
+ except Exception, e:
+ stop_err( 'DWGSIM_EVAL failed.\n' + str( e ) )
+ finally:
+ # clean up temp dir
+ if os.path.exists( tmp_dir ):
+ shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
diff --git a/scripts/galaxy/dwgsim_eval_wrapper.xml b/scripts/galaxy/dwgsim_eval_wrapper.xml
new file mode 100644
index 0000000..7765b83
--- /dev/null
+++ b/scripts/galaxy/dwgsim_eval_wrapper.xml
@@ -0,0 +1,73 @@
+<tool id="dwgsim" name="Evaluate simulated reads" version="1.0.0">
+ <description>from a SAM/BAM file using dwgsim_eval</description>
+ <command interpreter="python">
+ if 'sam' == input.ext:
+ dwgsim_wrapper.py \
+ $alignmentScore \
+ $bwa \
+ $colorSpace \
+ -d $scoreFactor \
+ -g $wiggle \
+ -n $numReads \
+ -q $minMapq \
+ $singleEnd \
+ $printIncorrect \
+ -s $numSnps \
+ -e $numErrors \
+ $indels \
+ -s $input \
+ -o $output
+ else:
+ dwgsim_wrapper.py \
+ $alignmentScore \
+ $bwa \
+ $colorSpace \
+ -d $scoreFactor \
+ -g $wiggle \
+ -n $numReads \
+ -q $minMapq \
+ $singleEnd \
+ $printIncorrect \
+ -s $numSnps \
+ -e $numErrors \
+ $indels \
+ -b $input \
+ -o $output
+
+ </command>
+ <inputs>
+ <param format="sam,bam" name="input" type="data" label="SAM/BAM file to evaluate"/>
+ <param name="alignmentScore" type="boolean" truevalue="-a" falsevalue="" checked="False" label="split alignments by alignment score instead of mapping quality"/>
+ <param name="bwa" type="boolean" truevalue="-b" falsevalue="" checked="False" label="alignments are from BWA (SOLiD only)"/>
+ <param name="colorSpace" type="boolean" truevalue="-c" falsevalue="" checked="False" label="color space alignments"/>
+ <param name="scoreFactor" size="4" type="text" value="1">
+ <label>divide quality/alignment score by this factor</label>
+ </param>
+ <param name="wiggle" size="4" type="text" value="5">
+ <label>gap "wiggle"</label>
+ </param>
+ <param name="numReads" size="4" type="text" value="-1">
+ <label>number of raw input paired-end reads (otherwise, inferred from all SAM records present)</label>
+ </param>
+ <param name="minMapq" size="4" type="text" value="0">
+ <label>consider only alignments with this mapping quality or greater</label>
+ </param>
+ <param name="singleEnd" type="boolean" truevalue="-z" falsevalue="" checked="False" label="input contains only single end reads"/>
+ <param name="printIncorrect" type="boolean" truevalue="-p" falsevalue="" checked="False" label="print incorrect alignments"/>
+ <param name="numSnps" size="4" type="text" value="-1">
+ <label>consider only alignments with the number of specified SNPs</label>
+ </param>
+ <param name="numErrors" size="4" type="text" value="-1">
+ <label>consider only alignments with the number of specified errors</label>
+ </param>
+ <param name="indels" type="boolean" truevalue="-i" falsevalue="" checked="False" label="consider only alignments with indels"/>
+ </inputs>
+ <outputs>
+ <data format="text" name="output"/>
+ </outputs>
+
+ <help>
+ This tool evaluates simulated reads from DWGSIM. For more information, please see https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation#Evaluating_mapping_-_dwgsim_eval
+ </help>
+
+</tool>
diff --git a/scripts/galaxy/dwgsim_wrapper.py b/scripts/galaxy/dwgsim_wrapper.py
new file mode 100644
index 0000000..20fce4e
--- /dev/null
+++ b/scripts/galaxy/dwgsim_wrapper.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+
+"""
+Runs DWGSIM
+
+usage: dwgsim_wrapper.py [options]
+ -e,--eRateFirstread=e: base/color error rate of the first read [0.020]
+ -E,--eRateSecondread=E: base/color error rate of the second read [0.020]
+ -d,--innerDist=d: inner distance between the two ends [500]
+ -s,--stdev=s: standard deviation [50]
+ -N,--numReads=N: number of read pairs [1000000]
+ -1,--loFirstread=1: length of the first read [70]
+ -2,--loSecondread=2: length of the second read [70]
+ -r,--mutRate=r: rate of mutations [0.0010]
+ -R,--fracIndels=R: fraction of mutations that are indels [0.10]
+ -X,--indelExt=X: probability an indel is extended [0.30]
+ -y,--randProb=y: probability of a random DNA read [0.10]
+ -n,--maxN=n: maximum number of Ns allowed in a given read [0]
+ -c,--platformType=c: generate reads for Illumina (nuc space), for SOLiD (in color space) or Ion Torrent
+ -S,--strand=S: strand 0: default, 1: same strand, 2: opposite strand
+ -f,--flowOrder=f: the flow order for Ion Torrent data [(null)]
+ -O,--splitOutput=O: which output needs to be reported, one: one single; two: two files; all: all output options
+ -H,--haploid=H: haploid mode
+ -i,--input=i: the reference genome FASTA
+ -3,--outputBFAST=3: the BFAST output FASTQ
+ -4,--outputBWA1=4: the first BWA output FASTQ
+ -5,--outputBWA2=5: the second BWA output FASTQ
+ -6,--outputMutations=6: the output mutations TXT
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+ sys.stderr.write( '%s\n' % msg )
+ sys.exit()
+
+def run_process ( cmd, name, tmp_dir, buffsize ):
+ try:
+ tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+ tmp_stderr = open( tmp, 'wb' )
+ proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+ returncode = proc.wait()
+ tmp_stderr.close()
+ # get stderr, allowing for case where it's very large
+ tmp_stderr = open( tmp, 'rb' )
+ stderr = ''
+ try:
+ while True:
+ stderr += tmp_stderr.read( buffsize )
+ if not stderr or len( stderr ) % buffsize != 0:
+ break
+ except OverflowError:
+ pass
+ tmp_stderr.close()
+ if returncode != 0:
+ raise Exception, stderr
+ except Exception, e:
+ raise Exception, 'Error in \'' + name + '\'. \n' + str( e )
+
+def check_output ( output, canBeEmpty ):
+ if 0 < os.path.getsize( output ):
+ return True
+ elif False == canBeEmpty:
+ raise Exception, 'The output file is empty:' + output
+
+def __main__():
+ #Parse Command Line
+ parser = optparse.OptionParser()
+ parser.add_option( '-e', '--eRateFirstread', dest='eRateFirstread', type='float', help='base/color error rate of the first read' )
+ parser.add_option( '-E', '--eRateSecondread', dest='eRateSecondread', type='float', help='base/color error rate of the second read' )
+ parser.add_option( '-d', '--innerDist', dest='innerDist', type='int', help='inner distance between the two ends' )
+ parser.add_option( '-s', '--stdev', dest='stdev', type='float', help='standard deviation' )
+ parser.add_option( '-N', '--numReads', dest='numReads', type='int', help='number of read pairs' )
+ parser.add_option( '-r', '--mutRate', dest='mutRate', type='float', help='rate of mutations' )
+ parser.add_option( '-R', '--fracIndels', dest='fracIndels', type='float', help='fraction of mutations that are indels' )
+ parser.add_option( '-X', '--indelExt', dest='indelExt', type='float', help='probability an indel is extended' )
+ parser.add_option( '-y', '--randProb', dest='randProb', type='float', help='probability of a random DNA read' )
+ parser.add_option( '-n', '--maxN', dest='maxN', type='int', help='maximum number of Ns allowed in a given read' )
+ parser.add_option( '-c', '--platformType', dest='platformType', type='int', help='Platform to simulate: 0: Illumina; 1: solid; 2: Ion Torrent' )
+ parser.add_option( '-S', '--strand', dest='strand', type='choice', default='0', choices=('0', '1', '2'), help='strand 0: default, 1: same strand, 2: opposite strand' )
+ parser.add_option( '-f', '--flowOrder', dest='flowOrder', type='int', default='2')
+ parser.add_option( '-H', '--haploid', action='store_true', dest='haploid', default=False, help='haploid mode' )
+ parser.add_option( '-i', '--input', dest='input', help='The reference genome fasta' )
+ parser.add_option( '-O', dest='splitoutput', type='choice', default='two',choices=('two', 'one', 'all'), help='one: one single; two: two files; all: all output options' )
+ parser.add_option( '-1', '--loFirstread', dest='loFirstread', type='int', help='length of the first read' )
+ parser.add_option( '-2', '--loSecondread', dest='loSecondread', type='int', help='length of the second read' )
+ parser.add_option( '-6', '--outputMutations', dest='outputMutations', help='the output mutations TXT' )
+ parser.add_option( '-4', '--outputBWA1', dest='outputBWA1', help='the first BWA output FASTQ' )
+ parser.add_option( '-5', '--outputBWA2', dest='outputBWA2', help='the second BWA output FASTQ' )
+ parser.add_option( '-3', '--outputBFAST', dest='outputBFAST', help='the BFAST output FASTQ' )
+
+ (options, args) = parser.parse_args()
+
+ # output version # of tool
+ try:
+ tmp = tempfile.NamedTemporaryFile().name
+ tmp_stdout = open( tmp, 'wb' )
+ proc = subprocess.Popen( args='dwgsim 2>&1', shell=True, stdout=tmp_stdout )
+ tmp_stdout.close()
+ returncode = proc.wait()
+ stdout = None
+ for line in open( tmp_stdout.name, 'rb' ):
+ if line.lower().find( 'version' ) >= 0:
+ stdout = line.strip()
+ break
+ if stdout:
+ sys.stdout.write( '%s\n' % stdout )
+ else:
+ raise Exception
+ except:
+ sys.stdout.write( 'Could not determine DWGSIM version\n' )
+
+ buffsize = 1048576
+
+ # make temp directory for dwgsim, requires trailing slash
+ tmp_dir = '%s/' % tempfile.mkdtemp()
+
+ #'generic' options used in all dwgsim commands here
+
+ try:
+ reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
+ os.symlink( options.input, reference_filepath )
+ assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
+ tmp_dir = '%s/' % tempfile.mkdtemp()
+ dwgsim_output_prefix = "dwgsim_output_prefix"
+ dwgsim_cmd = 'dwgsim -e %s -E %s -d %s -s %s -N %s -1 %s -2 %s -r %s -R %s -X %s -y %s -c %s -n %s' % \
+ (options.eRateFirstread, \
+ options.eRateSecondread, \
+ options.innerDist, \
+ options.stdev, \
+ options.numReads, \
+ options.loFirstread, \
+ options.loSecondread, \
+ options.mutRate, \
+ options.fracIndels, \
+ options.indelExt, \
+ options.randProb, \
+ options.platformType, \
+ options.maxN)
+ if options.haploid:
+ dwgsim_cmd = dwgsim_cmd + ' -H'
+ if options.platformType == "2":
+ dwgsim_cmd = dwgsim_cmd + options.flowOrder
+ dwgsim_cmd = dwgsim_cmd + ' ' + options.input
+ dwgsim_cmd = dwgsim_cmd + ' ' + tmp_dir + dwgsim_output_prefix
+
+ # temporary file to check the command being launched
+ tmpfile = open('/tmp/dwgsim.log', 'w')
+ tmpfile.write("%s" % dwgsim_cmd)
+
+ # need to nest try-except in try-finally to handle 2.4
+ try:
+ # launch dwgsim + accept mutations.txt output
+ run_process ( dwgsim_cmd, 'dwgsim', tmp_dir, buffsize )
+ cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.mutations.txt' + ' ' + options.outputMutations
+ run_process ( cmd, 'mv #1', tmp_dir, buffsize )
+ check_output ( options.outputMutations, True )
+ # check which output required and move files accordingly
+ tmpfile.write("\nsplitoutput: %s \ncondition one: %s \ncondition two: %s \ncondition all: %s\n" % (options.splitoutput, str((options.splitoutput == 'one')), str((options.splitoutput == 'two')), str((options.splitoutput == 'all'))))
+ if any( [options.splitoutput == 'two' , options.splitoutput == 'all'] ) : # two files
+ cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read1.fastq' + ' ' + options.outputBWA1
+ run_process ( cmd, 'mv #3', tmp_dir, buffsize )
+ cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read2.fastq' + ' ' + options.outputBWA2
+ run_process ( cmd, 'mv #4', tmp_dir, buffsize )
+ tmpfile.write("\nmoved!!: %s" % options.splitoutput)
+ # check that there are results in the output file
+ check_output ( options.outputBWA1, False )
+ check_output ( options.outputBWA2, False )
+ if any( [options.splitoutput == 'one' , options.splitoutput == 'all'] ):
+ cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bfast.fastq' + ' ' + options.outputBFAST
+ run_process ( cmd, 'mv #2', tmp_dir, buffsize )
+ # check that there are results in the output file
+ check_output ( options.outputBFAST, False )
+ sys.stdout.write( 'DWGSIM successful' )
+ except Exception, e:
+ stop_err( 'DWGSIM failed.\n' + str( e ) )
+ finally:
+ # clean up temp dir
+ if os.path.exists( tmp_dir ):
+ shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
diff --git a/scripts/galaxy/dwgsim_wrapper.xml b/scripts/galaxy/dwgsim_wrapper.xml
new file mode 100644
index 0000000..3a6de6c
--- /dev/null
+++ b/scripts/galaxy/dwgsim_wrapper.xml
@@ -0,0 +1,129 @@
+<tool id="dwgsim" name="Generate simulated reads" version="1.1.0">
+ <description>from a fasta sequence using dwgsim</description>
+ <command interpreter="python">
+ #if $outputFormat.splitOutput == "two" #dwgsim_wrapper.py \
+ -4 $outputPEFirst \
+ -5 $outputPESecond \
+ #else if $outputFormat.splitOutput == "one" #dwgsim_wrapper.py \
+ -3 $outputSingleReads \
+ #else #dwgsim_wrapper.py \
+ -3 $outputSingleReads \
+ -4 $outputPEFirst \
+ -5 $outputPESecond \
+ #end if
+ -6 $outputMutations \
+ -e $eRateFirstread \
+ -E $eRateSecondread \
+ -d $innerDist \
+ -s $stDev \
+ -N $noReadpairs \
+ -1 $loFirstread \
+ -2 $loSecondread \
+ -r $rateOfMut \
+ -R $rateOfIndels \
+ -X $probExtRead \
+ -y $probRandRead \
+ -n $maxN \
+ -S $strandDirection \
+ -c $platform.platformType \
+ #if $platform.platformType == "2" #
+ -f $platform.flowOrder
+ #end if
+ $haploMode \
+ -O $outputFormat.splitOutput \
+ -i $input \
+
+ </command>
+ <inputs>
+ <param format="fasta" name="input" type="data" label="Select fasta file to generate reads from"/>
+ <conditional name="platform">
+ <param name="platformType" type="select" label="Select the platform to simulate reads for">
+ <option value="0" selected="True">Illumina</option>
+ <option value="1" >SOLiD (Color space)</option>
+ <option value="2" >Ion Torrent</option>
+ </param>
+ <when value="0"/>
+ <when value="1"/>
+ <when value="2">
+ <param name="flowOrder" type="text" size="12" value="2">
+ <label> Flow order for Ion Torrent data </label>
+ </param>
+ </when>
+ </conditional>
+ <conditional name="outputFormat">
+ <param name="splitOutput" type="select" label="Output mate reads in">
+ <option value="two" selected="True">Two separate files</option>
+ <option value="one" >In a single file (~bfast)</option>
+ <option value="all" >Both output options</option>
+ </param>
+ <when value="two"/>
+ <when value="one"/>
+ <when value="all"/>
+ </conditional>
+ <param name="noReadpairs" size="12" type="text" value="1000000">
+ <label>number of read pairs</label>
+ </param>
+ <param name="noReadpairs" size="12" type="text" value="1000000">
+ <label>number of read pairs</label>
+ </param>
+ <param name="loFirstread" size="4" type="text" value="70">
+ <label>length of the first read </label>
+ </param>
+ <param name="loSecondread" size="4" type="text" value="70">
+ <label>length of the second read </label>
+ </param>
+ <param name="eRateFirstread" size="4" type="text" value="0.020">
+ <label>base/color error rate of the first read</label>
+ </param>
+ <param name="eRateSecondread" size="4" type="text" value="0.020">
+ <label>base/color error rate of the second read</label>
+ </param>
+ <param name="innerDist" size="4" type="text" value="500">
+ <label>inner distance between the two ends</label>
+ </param>
+ <param name="stDev" size="4" type="text" value="50">
+ <label>Standard deviation on the inner distance between two ends</label>
+ </param>
+ <param name="rateOfMut" size="4" type="text" value="0.0010">
+ <label>rate of mutations</label>
+ </param>
+ <param name="rateOfIndels" size="4" type="text" value="0.10">
+ <label>fraction of mutations that are indels</label>
+ </param>
+ <param name="probExtRead" size="4" type="text" value="0.30">
+ <label>probability an indel is extended</label>
+ </param>
+ <param name="probRandRead" size="4" type="text" value="0.10">
+ <label>probability of a random DNA read</label>
+ </param>
+ <param name="maxN" size="4" type="text" value="0">
+ <label>maximum number of Ns allowed in a given read</label>
+ </param>
+ <param name="strandDirection" type="select" label="Direction of strand (default: opposite strand for Illumina, same strand for SOLiD?Ion Torrent)">
+ <option value="0" selected="True">Default</option>
+ <option value="1">Same strand (mate pair)</option>
+ <option value="2">Opposite strand (paired end)</option>
+ </param>
+ <param name="haploMode" type="boolean" truevalue="-H" falsevalue="" checked="False" label="Haplotype mode"/>
+
+</inputs>
+
+ <outputs>
+ <data format="fastqsanger" name="outputSingleReads" label="Simulated reads from ${on_string}">
+ <filter>(outputFormat['splitOutput'] == 'one') | (outputFormat['splitOutput'] == 'all')</filter>
+ </data>
+ <data format="fastqsanger" name="outputPEFirst" label="Simulated forw reads from ${on_string}">
+ <filter>(outputFormat['splitOutput'] == 'two') | (outputFormat['splitOutput'] == 'all') </filter>
+ </data>
+ <data format="fastqsanger" name="outputPESecond" label="Simulated rev reads from ${on_string}">
+ <filter>(outputFormat['splitOutput'] == 'two') | (outputFormat['splitOutput'] == 'all')</filter>
+ </data>
+ <data format="text" name="outputMutations" label="Mutation report of reads from ${on_string}"/>
+ </outputs>
+
+ <help>
+This tool simulate reads from a given fasta file.
+For more information, please see https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation
+ </help>
+
+</tool>
diff --git a/src/contigs.c b/src/contigs.c
new file mode 100644
index 0000000..f2965f7
--- /dev/null
+++ b/src/contigs.c
@@ -0,0 +1,54 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include "contigs.h"
+
+contigs_t* contigs_init()
+{
+ return calloc(1, sizeof(contigs_t));
+}
+
+void contigs_add(contigs_t *c, char *name, uint32_t len)
+{
+ c->n++;
+ c->contigs = realloc(c->contigs, c->n * sizeof(contig_t));
+ c->contigs[c->n-1].name = strdup(name);
+ c->contigs[c->n-1].len = len;
+}
+
+void contigs_destroy(contigs_t *c)
+{
+ int32_t i;
+ for(i=0;i<c->n;i++) {
+ free(c->contigs[i].name);
+ }
+ free(c->contigs);
+ free(c);
+}
diff --git a/src/contigs.h b/src/contigs.h
new file mode 100644
index 0000000..5bce731
--- /dev/null
+++ b/src/contigs.h
@@ -0,0 +1,23 @@
+#ifndef CONTIGS_H_
+#define CONTIGS_H_
+
+typedef struct {
+ char *name;
+ int32_t len;
+} contig_t;
+
+typedef struct {
+ contig_t *contigs;
+ int32_t n;
+} contigs_t;
+
+contigs_t*
+contigs_init();
+
+void
+contigs_add(contigs_t *c, char *name, uint32_t len);
+
+void
+contigs_destroy(contigs_t *c);
+
+#endif
diff --git a/src/dwgsim.c b/src/dwgsim.c
new file mode 100644
index 0000000..daef8e7
--- /dev/null
+++ b/src/dwgsim.c
@@ -0,0 +1,1085 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+/* Original Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+/* This program is separated from maq's read simulator with Colin
+ * Hercus' modification to allow longer indels. Colin is the chief
+ * developer of novoalign. */
+
+/* This program was modified by Nils Homer for inclusion in the
+ * DNAA package */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "regions_bed.h"
+#include "dwgsim_opt.h"
+#include "dwgsim.h"
+//#include <config.h>
+
+uint8_t nst_nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+#define __gen_read(x, start, iter) do { \
+ for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
+ mut_t c = currseq->s[i], mut_type = c & mutmsk; \
+ if (ext_coor[x] < 0) { \
+ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
+ ext_coor[x] = i; \
+ if(1 == strand[x]) ext_coor[x] -= s[x]-1; \
+ } \
+ if (mut_type == DELETE) { \
+ ++n_indel[x]; \
+ if(1 == strand[x]) ext_coor[x]--; \
+ if(0 == k) n_indel_first[x]++; \
+ } \
+ else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
+ tmp_seq[x][k++] = c & 0xf; \
+ if (mut_type == SUBSTITUTE) { \
+ ++n_sub[x]; \
+ if(0 == k) n_sub_first[x]++; \
+ } \
+ } else { \
+ mut_t n, ins; \
+ assert(mut_type == INSERT); \
+ ++n_indel[x]; \
+ n_indel_first[x]++; \
+ if(1 == mut_get_ins(currseq, i, &n, &ins)) { \
+ if(0 == strand[x]) { \
+ while(n > 0 && k < s[x]) { \
+ tmp_seq[x][k++] = ins & 0x3; \
+ --n, ins >>= 2; \
+ } \
+ if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \
+ } else { \
+ tmp_seq[x][k++] = c & 0xf; \
+ while(n > 0 && k < s[x]) { \
+ ext_coor[x]++; \
+ tmp_seq[x][k++] = (ins >> ((n-1) << 1) & 0x3); \
+ --n; \
+ } \
+ } \
+ } else { \
+ int32_t byte_index, bit_index; \
+ uint32_t num_ins; \
+ uint8_t *insertion = NULL; \
+ insertion = mut_get_ins_long_n(currseq->ins[ins], &num_ins); \
+ if(0 == strand[x]) { \
+ byte_index = mut_packed_len(num_ins) - 1; bit_index = 3 - (num_ins & 3); \
+ while(num_ins > 0 && k < s[x]) { \
+ assert(0 <= byte_index); \
+ tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3; \
+ --num_ins; \
+ bit_index--; \
+ if (bit_index < 0) { \
+ bit_index = 3; \
+ byte_index--; \
+ } \
+ } \
+ if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \
+ } else { \
+ tmp_seq[x][k++] = c & 0xf; \
+ byte_index = 0; bit_index = 0; \
+ while(num_ins > 0 && k < s[x]) { \
+ ext_coor[x]++; \
+ tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3; \
+ --num_ins; \
+ bit_index++; \
+ if (4 == bit_index) { \
+ bit_index = 0; \
+ byte_index++; \
+ } \
+ } \
+ } \
+ } \
+ } \
+ } \
+ if (k != s[x]) ext_coor[x] = -10; \
+ if (1 == strand[x]) { \
+ for (k = 0; k < s[x]; ++k) tmp_seq[x][k] = tmp_seq[x][k] < 4? 3 - tmp_seq[x][k] : 4; \
+ } \
+} while (0)
+
+/* Simple normal random number generator, copied from genran.c */
+
+double ran_normal()
+{
+ static int iset = 0;
+ static double gset;
+ double fac, rsq, v1, v2;
+ if (iset == 0) {
+ do {
+ v1 = 2.0 * drand48() - 1.0;
+ v2 = 2.0 * drand48() - 1.0;
+ rsq = v1 * v1 + v2 * v2;
+ } while (rsq >= 1.0 || rsq == 0.0);
+ fac = sqrt(-2.0 * log(rsq) / rsq);
+ gset = v1 * fac;
+ iset = 1;
+ return v2 * fac;
+ } else {
+ iset = 0;
+ return gset;
+ }
+}
+
+int32_t ran_num(double prob, int32_t n)
+{
+ int32_t i, r;
+ for(i = r = 0; i < n; i++) {
+ if(drand48() < prob) r++;
+ }
+ return r;
+}
+
+int32_t get_muttype(char *str)
+{
+ int32_t i;
+ for(i=0;i<strlen(str);i++) {
+ str[i] = tolower(str[i]);
+ }
+ if(0 == strcmp("snp", str) || 0 == strcmp("substitute", str) || 0 == strcmp("sub", str) || 0 == strcmp("s", str)) {
+ return SUBSTITUTE;
+ }
+ else if(0 == strcmp("insertion", str) || 0 == strcmp("insert", str) || 0 == strcmp("ins", str) || 0 == strcmp("i", str)) {
+ return INSERT;
+ }
+ else if(0 == strcmp("deletion", str) || 0 == strcmp("delet", str) || 0 == strcmp("del", str) || 0 == strcmp("d", str)) {
+ return DELETE;
+ }
+ return -1;
+}
+
+char iupac_and_base_to_mut(char iupac, char base)
+{
+ static char *codes = "XACMGRSVTWYHKDBN";
+ int32_t i;
+ int32_t b = nst_nt4_table[(int)base];
+ for(i=0;i<4;i++) {
+ if(codes[1<<(b&0x3) | 1<<(i&0x3)] == iupac) {
+ return "ACGTN"[i];
+ }
+ }
+ return 'X';
+}
+
+char bases_to_iupac(char b1, char b2)
+{
+ int32_t a1, a2, n;
+ a1 = nst_nt4_table[(int)b1];
+ a2 = nst_nt4_table[(int)b2];
+ if(4 == a1 || 4 == a2) return 'X';
+ if(a1 == b1) return 'X';
+ if(a1 < a2) {
+ n = a1 + (a2 << 2);
+ }
+ else {
+ n = a2 + (a1 << 2);
+ }
+ switch(n) {
+ case 4: return 'M'; // 0 + 4*1 = M
+ case 8: return 'R'; // 0 + 4*2 = R
+ case 12: return 'W'; // 0 + 4*3 = W
+ case 9: return 'S'; // 1 + 4*2 = S
+ case 13: return 'Y'; // 1 + 4*3 = Y
+ case 14: return 'K'; // 2 + 4*3 = K
+ default: break;
+ }
+ return 'X';
+}
+
+/* Error-checking open, copied from utils.c */
+#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
+
+FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
+{
+ FILE *fp = 0;
+ if (strcmp(fn, "-") == 0)
+ return (strstr(mode, "r"))? stdin : stdout;
+ if ((fp = fopen(fn, mode)) == 0) {
+ fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
+ abort();
+ }
+ return fp;
+}
+
+/* dwgsim */
+
+#define __gen_errors_mismatches(_cur_seq, _j, _start, _iter, _len) do { \
+ for (i = (_start); 0 <= i && i < _len; _iter) { \
+ mut_t c = _cur_seq[_j][i]; \
+ if (c >= 4) c = 4; \
+ else if(drand48() < e[_j]->start + e[_j]->by*i) { \
+ c = (c + (mut_t)(drand48() * 3.0 + 1)) & 3; \
+ ++n_err[_j]; \
+ if(0 == i) ++n_err_first[_j]; \
+ } \
+ _cur_seq[_j][i] = c; \
+ } \
+} while(0)
+
+int32_t
+generate_errors_flows(dwgsim_opt_t *opt, uint8_t **seq, uint8_t **mask, int32_t *mem, int32_t len, uint8_t strand, double e, int32_t *_n_err)
+{
+ int32_t i, j, k, hp_l, flow_i, n_err;
+ uint8_t prev_c, c;
+
+ // remove Ns
+ for(i=0;i<len;i++) {
+ if(4 <= (*seq)[i]) {
+ (*seq)[i] = 0;
+ }
+ }
+
+ if(1 == strand) {
+ for(i=0;i<len>>1;i++) {
+ c = (*seq)[i];
+ (*seq)[i] = (*seq)[len-i-1];
+ (*seq)[len-i-1] = c;
+ }
+ }
+
+ // skip forward to the first flow
+ for(i=0;i<opt->flow_order_len;i++) {
+ c = (4 <= (*seq)[0]) ? 0 : (*seq)[0];
+ if(c == opt->flow_order[i]) {
+ break;
+ }
+ (*mask)[i] = 0;
+ }
+ assert(opt->flow_order_len != i);
+ flow_i = i;
+ prev_c = 4;
+ for(i=0;i<len;i++) {
+ c = (4 <= (*seq)[i]) ? 0 : (*seq)[i];
+ while(c != opt->flow_order[flow_i]) { // skip paste empty flows
+ (*mask)[flow_i] = 0;
+ flow_i = (flow_i + 1) % opt->flow_order_len;
+ }
+ if(prev_c != c) { // new hp
+ (*mask)[flow_i] = 0;
+ n_err = 0;
+ while(drand48() < e) { // how many bases should we insert/delete
+ n_err++;
+ }
+ if(0 < n_err) {
+ if(drand48() < 0.5) { // insert
+ // more memory
+ while((*mem) <= len + n_err) {
+ (*mem) <<= 1; // double
+ (*seq) = realloc((*seq), sizeof(uint8_t) * (*mem));
+ (*mask) = realloc((*mask), sizeof(uint8_t) * (*mem));
+ }
+ // shift up
+ for(j=len-1;i<=j;j--) {
+ (*seq)[j+n_err] = (*seq)[j];
+ }
+ for(j=i;j<i+n_err;j++) {
+ (*seq)[j] = c;
+ }
+ len += n_err;
+ }
+ else {
+ int32_t next_c = 4;
+ // get the hp length
+ for(j=i,hp_l=0;j<len;j++,hp_l++) {
+ next_c = (4 <= (*seq)[j]) ? 0 : (*seq)[j];
+ if(c != next_c) {
+ break;
+ }
+ }
+ // bound errors by the hp length
+ n_err = (hp_l < n_err) ? hp_l : n_err;
+ // shift down
+ for(j=i;j<len-n_err;j++) {
+ (*seq)[j] = (*seq)[j+n_err];
+ }
+ len -= n_err;
+ (*mask)[flow_i] = 1;
+ // Note: we need to make sure that if we delete all the bases,
+ // that the neighboring bases are not the same. If they are,
+ // we need to simulate a "dot-fill", whereby we add a
+ // non-incorporating flow.
+ if(n_err == hp_l && (0 == i || prev_c == next_c)) {
+ j = 0;
+ // get the number of subsequent flows until the next base
+ while(next_c != opt->flow_order[(flow_i + j) % opt->flow_order_len]) { // skip paste empty flows
+ j++;
+ }
+ assert(0 < j);
+ // pick one to fill in
+ k = (int)(drand48() * j);
+ // shift up
+ for(j=len-1;i<=j;j--) {
+ (*seq)[j+1] = (*seq)[j];
+ }
+ // add the filled in base
+ (*seq)[i] = opt->flow_order[(flow_i + k) % opt->flow_order_len];
+ len++;
+ }
+ }
+ (*_n_err) += n_err;
+ }
+ prev_c = c;
+ }
+ }
+
+ // second pass: add empty flows
+ prev_c = 4;
+ for(i=0;i<len;i++) {
+ c = (4 <= (*seq)[i]) ? 0 : (*seq)[i];
+ while(c != opt->flow_order[flow_i]) {
+ n_err = 0;
+ while(drand48() < e) {
+ n_err++;
+ }
+ if(0 == (*mask)[flow_i] && 0 < n_err) { // insert
+ // more memory
+ while((*mem) <= len + n_err) {
+ (*mem) <<= 1; // double
+ (*seq) = realloc((*seq), sizeof(uint8_t) * (*mem));
+ (*mask) = realloc((*mask), sizeof(uint8_t) * (*mem));
+ }
+ // shift up
+ for(j=len-1;i<=j;j--) {
+ (*seq)[j+n_err] = (*seq)[j];
+ }
+ // copy
+ for(j=i;j<i+n_err;j++) {
+ (*seq)[j] = opt->flow_order[flow_i];
+ }
+ len += n_err;
+ (*_n_err) += n_err;
+ }
+ flow_i = (flow_i + 1) % opt->flow_order_len;
+ }
+ }
+
+ if(1 == strand) {
+ for(i=0;i<len>>1;i++) {
+ c = (*seq)[i];
+ (*seq)[i] = (*seq)[len-i-1];
+ (*seq)[len-i-1] = c;
+ }
+ }
+
+ return len;
+}
+
+void dwgsim_core(dwgsim_opt_t * opt)
+{
+ seq_t seq;
+ mutseq_t *mutseq[2]={NULL,NULL};
+ uint64_t tot_len, ii=0, ctr=0;
+ int i, l, m, n_ref, contig_i;
+ char name[1024], *qstr;
+ int32_t name_len_max=0;
+ int size[2], prev_skip=0, qstr_l=0;
+ int num_n[2];
+ uint8_t *tmp_seq[2]={NULL,NULL};
+ uint8_t *tmp_seq_flow_mask[2]={NULL,NULL};
+ int32_t tmp_seq_mem[2]={0,0};
+ int64_t n_sim = 0;
+ error_t *e[2]={NULL,NULL};
+ FILE *fp_muts_input = NULL;
+ FILE *fp_regions_bed = NULL;
+ muts_input_t *muts_input = NULL;
+ regions_bed_txt *regions_bed = NULL;
+ contigs_t *contigs = NULL;
+
+ e[0] = &opt->e[0]; e[1] = &opt->e[1];
+
+ INIT_SEQ(seq);
+ seq_set_block_size(0x1000000);
+ l = opt->length[0] > opt->length[1]? opt->length[0] : opt->length[1];
+ qstr_l = l;
+ qstr = (char*)calloc(qstr_l+1, 1);
+ tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
+ tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
+ if(IONTORRENT == opt->data_type) {
+ tmp_seq_flow_mask[0] = (uint8_t*)calloc(l+2, 1);
+ tmp_seq_flow_mask[1] = (uint8_t*)calloc(l+2, 1);
+ }
+ tmp_seq_mem[0] = tmp_seq_mem[1] = l+2;
+ size[0] = opt->length[0]; size[1] = opt->length[1];
+
+ if(0 <= opt->fn_muts_input_type) {
+ contigs = contigs_init();
+ }
+
+ if(NULL != opt->fn_regions_bed) {
+ fp_regions_bed = xopen(opt->fn_regions_bed, "r");
+ if(NULL == contigs) contigs = contigs_init();
+ }
+
+ tot_len = n_ref = 0;
+ mut_print_header_pre(opt->fp_vcf);
+ if(NULL != opt->fp_fai) {
+ int dummy_int[3];
+ while(0 < fscanf(opt->fp_fai, "%s\t%d\t%d\t%d\t%d", name, &l, &dummy_int[0], &dummy_int[1], &dummy_int[2])) {
+ fprintf(stderr, "[dwgsim_core] %s length: %d\n", name, l);
+ tot_len += l;
+ ++n_ref;
+ mut_print_header_contig(opt->fp_vcf, name, l);
+ if(NULL != contigs) {
+ contigs_add(contigs, name, l);
+ }
+ }
+ }
+ else {
+ while ((l = seq_read_fasta(opt->fp_fa, &seq, name, 0)) >= 0) {
+ fprintf(stderr, "[dwgsim_core] %s length: %d\n", name, l);
+ tot_len += l;
+ ++n_ref;
+ mut_print_header_contig(opt->fp_vcf, name, l);
+ if(NULL != contigs) {
+ contigs_add(contigs, name, l);
+ }
+ }
+ }
+ fprintf(stderr, "[dwgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
+ rewind(opt->fp_fa);
+ mut_print_header_post(opt->fp_vcf);
+
+ if(0 <= opt->fn_muts_input_type) {
+ fp_muts_input = xopen(opt->fn_muts_input, "r");
+ muts_input = muts_input_init(fp_muts_input, contigs, opt->fn_muts_input_type); // read in the mutation file
+ }
+
+ if(NULL != opt->fn_regions_bed) {
+ regions_bed = regions_bed_init(fp_regions_bed, contigs);
+ // recalculate the total length
+ tot_len = 0;
+ for(i=0;i<regions_bed->n;i++) {
+ tot_len += regions_bed->end[i] - regions_bed->start[i] + 1;
+ }
+ }
+ if(NULL != contigs) {
+ contigs_destroy(contigs);
+ contigs = NULL;
+ }
+
+ if(0 == opt->muts_only) {
+ fprintf(stderr, "[dwgsim_core] Currently on: \n0");
+ }
+ else {
+ fprintf(stderr, "[dwgsim_core] Currently on:");
+ }
+ contig_i = 0;
+ while ((l = seq_read_fasta(opt->fp_fa, &seq, name, 0)) >= 0) {
+ int64_t n_pairs = 0;
+ n_ref--;
+
+ if(1 == opt->muts_only) {
+ fprintf(stderr, "\r[dwgsim_core] Currently on: %s", name);
+ if(name_len_max < strlen(name)) {
+ name_len_max = strlen(name);
+ }
+ else {
+ for(i=0;i<name_len_max-strlen(name);i++) {
+ fputc(' ', stderr);
+ }
+ }
+ }
+
+ if(0 == opt->muts_only) {
+ if(0 == n_ref && opt->C < 0) {
+ n_pairs = opt->N - n_sim;
+ }
+ else {
+ if(NULL != regions_bed) {
+ // recalculate l
+ m = 0;
+ for(i=0;i<regions_bed->n;i++) {
+ if(contig_i == regions_bed->contig[i]) {
+ m += regions_bed->end[i] - regions_bed->start[i] + 1;
+ }
+ }
+ if(0 == m) {
+ fprintf(stderr, "[dwgsim_core] #0 skip sequence '%s' as it is not in the targeted region\n", name);
+ contig_i++;
+ continue; // skip this region
+ }
+ l = m;
+
+ int num_n = 0;
+ for(i=0;i<regions_bed->n;i++) {
+ if(contig_i == regions_bed->contig[i]) {
+ int m;
+ for(m=regions_bed->start[i];m<=regions_bed->end[i];m++) {
+ switch (seq.s[m-1]) {
+ case 'a':
+ case 'A':
+ case 'c':
+ case 'C':
+ case 'g':
+ case 'G':
+ case 't':
+ case 'T':
+ break; default:
+ num_n++; break;
+ } }
+ } }
+ if(0.95 < num_n / (double)l) { // TODO: arbitrary cutoff
+ fprintf(stderr, "[dwgsim_core] #1 skip sequence '%s' as %d out of %d bases are non-ACGT\n", name, num_n, l);
+ contig_i++;
+ continue;
+ }
+ }
+ if(0 < opt->N) {
+ // based on -N
+ n_pairs = (uint64_t)((long double)l / tot_len * opt->N + 0.5);
+ if(opt->N - n_sim < n_pairs) n_pairs = opt->N - n_sim; // make sure we don't simulate too many reads
+ }
+ else {
+ // based on coverage, with added random reads
+ n_pairs = (uint64_t)(l * opt->C / ((long double)(size[0] + size[1])) / (1.0 - opt->rand_read) + 0.5);
+ }
+ }
+
+ // for paired end/mate pair, make sure we have enough bases in this
+ // sequence
+ if (0 < opt->length[1] && l < opt->dist + 3 * opt->std_dev) {
+ if(0 == prev_skip) fprintf(stderr, "\n");
+ prev_skip = 1;
+ fprintf(stderr, "[dwgsim_core] #2 skip sequence '%s' as it is shorter than %f!\n", name, opt->dist + 3 * opt->std_dev);
+ contig_i++;
+ continue;
+ }
+ else if (l < opt->length[0] || (0 < opt->length[1] && l < opt->length[1])) {
+ if(0 == prev_skip) fprintf(stderr, "\n");
+ prev_skip = 1;
+ fprintf(stderr, "[dwgsim_core] #3 skip sequence '%s' as it is shorter than %d!\n", name, (l < opt->length[0]) ? opt->length[0] : opt->length[1]);
+ contig_i++;
+ continue;
+ }
+ else if (n_pairs < 0) { // NB: this should not happen
+ // not enough pairs
+ continue;
+ }
+ prev_skip = 0;
+ }
+
+ // generate mutations and print them out
+ mutseq[0] = mutseq_init(); mutseq[1] = mutseq_init();
+ mut_diref(opt, &seq, mutseq[0], mutseq[1], contig_i, muts_input);
+ mut_print(name, &seq, mutseq[0], mutseq[1], opt->fp_mut, opt->fp_vcf);
+
+ if(0 == opt->muts_only) {
+ for (ii = 0; ii != n_pairs; ++ii, ++ctr) { // the core loop
+ if(0 == (ctr % 10000)) {
+ fprintf(stderr, "\r[dwgsim_core] %llu",
+ (unsigned long long int)ctr);
+ }
+ double ran;
+ int d, pos, s[2], strand[2];
+ int n_sub[2], n_indel[2], n_err[2], ext_coor[2]={0,0}, j, k;
+ int n_sub_first[2], n_indel_first[2], n_err_first[2]; // need this for SOLID data
+ int c1, c2, c;
+
+ s[0] = size[0]; s[1] = size[1];
+
+ if(opt->rand_read < drand48()) {
+
+ if(NULL == regions_bed) {
+ do { // avoid boundary failure
+ if(0 < s[1]) { // paired end/mate pair
+ ran = ran_normal();
+ ran = ran * opt->std_dev + opt->dist;
+ d = (int)(ran + 0.5);
+ }
+ else {
+ d = 0;
+ }
+ pos = (int)((l - d + 1) * drand48());
+ } while (pos < 0
+ || pos >= seq.l
+ || pos + d - 1 >= seq.l
+ || (0 < s[1] && 0 == opt->is_inner && ((0 < s[0] && d <= s[1]) || (d <= s[0] && 0 < s[1]))));
+ }
+ else {
+ do { // avoid boundary failure
+ if(0 < s[1]) {
+ ran = ran_normal();
+ ran = ran * opt->std_dev + opt->dist;
+ d = (int)(ran + 0.5);
+ }
+ else {
+ d = 0;
+ }
+ pos = (int)((l - d + 1) * drand48());
+ // convert in the bed file
+ for(i=0;i<regions_bed->n;i++) { // TODO: regions are in sorted order... so optimize
+ if(contig_i == regions_bed->contig[i]) {
+ j = regions_bed->end[i] - regions_bed->start[i] + 1;
+ if(pos < j) {
+ pos = regions_bed->start[i] + pos - 1; // zero-based
+ break;
+ }
+ else {
+ pos -= j;
+ }
+ }
+ }
+ } while (pos < 0
+ || pos >= seq.l
+ || pos + d - 1 >= seq.l
+ || (0 < s[1] && 0 == opt->is_inner && ((0 < s[0] && d <= s[1]) || (d <= s[0] && 0 < s[1])))
+ || 0 == regions_bed_query(regions_bed, contig_i, pos, pos + s[0] + s[1] + d - 1));
+ }
+
+ // generate the read sequences
+ mutseq_t *currseq = mutseq[drand48()<opt->mut_freq?0:1]; // haplotype from which the reads are generated
+ n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
+ n_sub_first[0] = n_sub_first[1] = n_indel_first[0] = n_indel_first[1] = n_err_first[0] = n_err_first[1] = 0;
+ num_n[0]=num_n[1]=0;
+
+ // strand
+ if(2 == opt->strandedness || (0 == opt->strandedness && ILLUMINA == opt->data_type)) {
+ // opposite strand by default for Illumina
+ strand[0] = 0; strand[1] = 1;
+ }
+ else if(1 == opt->strandedness || (0 == opt->strandedness && (SOLID == opt->data_type || IONTORRENT == opt->data_type))) {
+ // same strands by default for SOLiD
+ strand[0] = 0; strand[1] = 0;
+ }
+ else {
+ // should not reach here
+ assert(1 == 0);
+ }
+ if (drand48() < 0.5) { // which strand ?
+ // Flip strands
+ strand[0] = (1 + strand[0]) % 2;
+ strand[1] = (1 + strand[1]) % 2;
+ }
+
+ // generate the reads in base space
+ if(0 < s[1]) { // paired end or mate pair
+ if(strand[0] == strand[1]) { // same strand
+ if(0 == strand[0]) { // + strand
+ /*
+ * 5' E2 -----> .... E1 -----> 3'
+ * 3' .... 5'
+ */
+ if(0 == opt->is_inner) {
+ __gen_read(0, pos + d - s[0], ++i);
+ }
+ else {
+ __gen_read(0, pos + s[1] + d, ++i);
+ }
+ __gen_read(1, pos, ++i);
+ }
+ else { // - strand
+ /*
+ * 3' .... 5'
+ * 5' <----- E1 .... <----- E2 3'
+ */
+ __gen_read(0, pos + s[0], --i);
+ if(0 == opt->is_inner) {
+ __gen_read(1, pos + d, --i);
+ }
+ else {
+ __gen_read(1, pos + s[0] + d + s[1], --i);
+ }
+ }
+ }
+ else { // opposite strand
+ if(0 == strand[0]) { // + strand
+ /*
+ * 5' E1 -----> .... 3'
+ * 3' .... <----- E2 5'
+ */
+ __gen_read(0, pos, ++i);
+ if(0 == opt->is_inner) {
+ __gen_read(1, pos + d, --i);
+ }
+ else {
+ __gen_read(1, pos + s[0] + d + s[1], --i);
+ }
+ }
+ else { // - strand
+ /*
+ * 5' E2 -----> .... 3'
+ * 3' .... <----- E1 5'
+ */
+ if(0 == opt->is_inner) {
+ __gen_read(0, pos + d, --i);
+ }
+ else {
+ __gen_read(0, pos + s[1] + d + s[0], --i);
+ }
+ __gen_read(1, pos, i++);
+ }
+ }
+ }
+ else { // fragment
+ if(0 == strand[0]) {
+ __gen_read(0, pos, ++i); // + strand
+ }
+ else {
+ __gen_read(0, pos + s[0] - 1, --i); // - strand
+ }
+ }
+
+ // Count # of Ns
+ for (j = 0; j < 2; ++j) {
+ num_n[j]=0;
+ if(0 < s[j]) {
+ for (i = 0; i < s[j]; ++i) {
+ if(tmp_seq[j][i] == 4) num_n[j]++;
+ }
+ }
+ }
+
+ if (ext_coor[0] < 0 || ext_coor[1] < 0 || opt->max_n < num_n[0] || opt->max_n < num_n[1]) { // fail to generate the read(s)
+ --ii;
+ --ctr;
+ continue;
+ }
+
+ if(SOLID == opt->data_type) {
+ // Convert to color sequence, use the first base as the adaptor
+ for (j = 0; j < 2; ++j) {
+ if(0 < s[j]) {
+ c1 = 0; // adaptor
+ for (i = 0; i < s[j]; ++i) {
+ c2 = tmp_seq[j][i]; // current base
+ c = __gf_add(c1, c2);
+ tmp_seq[j][i] = c;
+ c1 = c2; // save previous base
+ }
+ }
+ }
+ }
+
+ // generate sequencing errors
+ if(IONTORRENT == opt->data_type) {
+ s[0] = generate_errors_flows(opt, &tmp_seq[0], &tmp_seq_flow_mask[0], &tmp_seq_mem[0], s[0], strand[0], e[0]->start, &n_err[0]);
+ s[1] = generate_errors_flows(opt, &tmp_seq[1], &tmp_seq_flow_mask[1], &tmp_seq_mem[1], s[1], strand[1], e[1]->start, &n_err[1]);
+ }
+ else { // Illumina/SOLiD
+ if(0 < s[0]) {
+ if(0 == strand[0]) {
+ __gen_errors_mismatches(tmp_seq, 0, 0, ++i, s[0]);
+ }
+ else {
+ __gen_errors_mismatches(tmp_seq, 0, s[0]-1, --i, s[0]);
+ }
+ }
+ if(0 < s[1]) {
+ if(0 == strand[1]) {
+ __gen_errors_mismatches(tmp_seq, 1, 0, ++i, s[1]);
+ }
+ else {
+ __gen_errors_mismatches(tmp_seq, 1, s[1]-1, --i, s[1]);
+ }
+ }
+ }
+
+ // print
+ for (j = 0; j < 2; ++j) {
+ if(s[j] <= 0) {
+ continue;
+ }
+ if(IONTORRENT == opt->data_type && qstr_l < s[j]) {
+ qstr_l = s[j];
+ qstr = realloc(qstr, (1+qstr_l) * sizeof(char));
+ }
+ if(NULL != opt->fixed_quality) {
+ for (i = 0; i < s[j]; ++i) {
+ qstr[i] = opt->fixed_quality[0];
+ }
+ }
+ else {
+ for (i = 0; i < s[j]; ++i) {
+ qstr[i] = (int)(-10.0 * log(e[j]->start + e[j]->by*i) / log(10.0) + 0.499) + 33;
+ if(0 < opt->quality_std) {
+ qstr[i] += (int)((ran_normal() * opt->quality_std) + 0.5);
+ if(qstr[i] < 33) qstr[i] = 33;
+ if(73 < qstr[i]) qstr[i] = 73;
+ }
+ }
+ }
+ qstr[i] = 0;
+ // BWA
+ FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+ n_err[0], n_sub[0], n_indel[0],
+ n_err[1], n_sub[1],n_indel[1],
+ (long long)ii, j+1);
+ for (i = 0; i < s[j]; ++i)
+ fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+ fprintf(fpo, "\n+\n%s\n", qstr);
+ }
+ else {
+ // Note: BWA ignores the adapter and the first color, so this is a misrepresentation
+ // in samtools. We must first skip the first color. Basically, a 50 color read is a
+ // 49 color read for BWA.
+ //
+ // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+ // annotated as read "1".
+ fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+ n_err[0] - n_err_first[0], n_sub[0] - n_sub_first[0], n_indel[0] - n_indel_first[0],
+ n_err[1] - n_err_first[1], n_sub[1] - n_sub_first[1], n_indel[1] - n_indel_first[1],
+ (long long)ii, 2 - j);
+ //fputc('A', fpo);
+ for (i = 1; i < s[j]; ++i)
+ fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+ fprintf(fpo, "\n+\n");
+ for (i = 1; i < s[j]; ++i)
+ fputc(qstr[i], fpo);
+ fprintf(fpo, "\n");
+ }
+
+ // BFAST output
+ fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+ n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+ (long long)ii);
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ for (i = 0; i < s[j]; ++i)
+ fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
+ fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+ }
+ else {
+ fputc('A', opt->fp_bfast);
+ for (i = 0; i < s[j]; ++i)
+ fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
+ fprintf(opt->fp_bfast, "\n+\n");
+ for (i = 0; i < s[j]; ++i)
+ fputc(qstr[i], opt->fp_bfast);
+ fprintf(opt->fp_bfast, "\n");
+ }
+ }
+ n_sim++;
+ }
+ else { // random DNA read
+ for(j=0;j<2;j++) {
+ if(s[j] <= 0) {
+ continue;
+ }
+ if(IONTORRENT == opt->data_type && qstr_l < s[j]) {
+ qstr_l = s[j];
+ qstr = realloc(qstr, (1+qstr_l) * sizeof(char));
+ }
+ // get random sequence
+ for (i = 0; i < s[j]; ++i) {
+ tmp_seq[j][i] = (int)(drand48() * 4.0) & 3;
+ }
+ if(NULL != opt->fixed_quality) {
+ for (i = 0; i < s[j]; ++i) {
+ qstr[i] = opt->fixed_quality[0];
+ }
+ }
+ else {
+ for (i = 0; i < s[j]; ++i) {
+ qstr[i] = (int)(-10.0 * log(e[j]->start + e[j]->by*i) / log(10.0) + 0.499) + 33;
+ if(0 < opt->quality_std) {
+ qstr[i] += (int)((ran_normal() * opt->quality_std) + 0.5);
+ if(qstr[i] < 33) qstr[i] = 33;
+ if(73 < qstr[i]) qstr[i] = 73;
+ }
+ }
+ }
+ qstr[i] = 0;
+ if(SOLID == opt->data_type) { // convert to color space
+ if(0 < s[j]) {
+ c1 = 0; // adaptor
+ for (i = 0; i < s[j]; ++i) {
+ c2 = tmp_seq[j][i]; // current base
+ c = __gf_add(c1, c2);
+ tmp_seq[j][i] = c;
+ c1 = c2; // save previous base
+ }
+ }
+ }
+ // BWA
+ FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ "rand", 0, 0, 0, 0, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ (long long)ii,
+ j+1);
+ for (i = 0; i < s[j]; ++i)
+ fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+ fprintf(fpo, "\n+\n%s\n", qstr);
+ }
+ else {
+ // Note: BWA ignores the adapter and the first color, so this is a misrepresentation
+ // in samtools. We must first skip the first color. Basically, a 50 color read is a
+ // 49 color read for BWA.
+ //
+ // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+ // annotated as read "1".
+ fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ "rand", 0, 0, 0, 0, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ (long long)ii, 2 - j);
+ //fputc('A', fpo);
+ for (i = 1; i < s[j]; ++i)
+ fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
+ fprintf(fpo, "\n+\n");
+ for (i = 1; i < s[j]; ++i)
+ fputc(qstr[i], fpo);
+ fprintf(fpo, "\n");
+ }
+
+ // BFAST output
+ fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ "rand", 0, 0, 0, 0, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ (long long)ii);
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ for (i = 0; i < s[j]; ++i)
+ fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
+ fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+ }
+ else {
+ fputc('A', opt->fp_bfast);
+ for (i = 0; i < s[j]; ++i)
+ fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
+ fprintf(opt->fp_bfast, "\n+\n");
+ for (i = 0; i < s[j]; ++i)
+ fputc(qstr[i], opt->fp_bfast);
+ fprintf(opt->fp_bfast, "\n");
+ }
+ }
+ n_sim++;
+ }
+ }
+ fprintf(stderr, "\r[dwgsim_core] %llu",
+ (unsigned long long int)ctr);
+ }
+ mutseq_destroy(mutseq[0]);
+ mutseq_destroy(mutseq[1]);
+ contig_i++;
+ }
+ fprintf(stderr, "\n[dwgsim_core] Complete!\n");
+
+ free(seq.s); free(qstr);
+ free(tmp_seq[0]); free(tmp_seq[1]);
+ if(IONTORRENT == opt->data_type) {
+ free(tmp_seq_flow_mask[0]); free(tmp_seq_flow_mask[1]);
+ }
+ if(0 <= opt->fn_muts_input_type) {
+ muts_input_destroy(muts_input);
+ }
+ if(NULL != opt->fn_regions_bed) {
+ regions_bed_destroy(regions_bed);
+ fclose(fp_regions_bed);
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ dwgsim_opt_t *opt = NULL;
+
+ // update the mutant sequence bounds
+ mutseq_init_bounds();
+
+ opt = dwgsim_opt_init();
+
+ char fn_fai[1024]="\0";
+ char fn_tmp[1024]="\0";
+
+ if(0 == dwgsim_opt_parse(opt, argc, argv)) {
+ return dwgsim_opt_usage(opt);
+ }
+
+ // Open files
+ opt->fp_fa = xopen(argv[optind+0], "r");
+ strcpy(fn_fai, argv[optind+0]); strcat(fn_fai, ".fai");
+ opt->fp_fai = fopen(fn_fai, "r"); // NB: depends on returning NULL;
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.txt");
+ opt->fp_mut = xopen(fn_tmp, "w");
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.vcf");
+ opt->fp_vcf = xopen(fn_tmp, "w");
+ if(0 == opt->muts_only) {
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bfast.fastq");
+ opt->fp_bfast = xopen(fn_tmp, "w");
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read1.fastq");
+ opt->fp_bwa1 = xopen(fn_tmp, "w");
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read2.fastq");
+ opt->fp_bwa2 = xopen(fn_tmp, "w");
+ }
+
+ // Run simulation
+ dwgsim_core(opt);
+
+ // Close files
+ if(0 == opt->muts_only) {
+ fclose(opt->fp_fa); fclose(opt->fp_bfast); fclose(opt->fp_bwa1); fclose(opt->fp_bwa2);
+ }
+ if(NULL != opt->fp_fai) fclose(opt->fp_fai);
+ fclose(opt->fp_mut);
+ fclose(opt->fp_vcf);
+
+ dwgsim_opt_destroy(opt);
+
+ return 0;
+}
diff --git a/src/dwgsim.h b/src/dwgsim.h
new file mode 100644
index 0000000..f771736
--- /dev/null
+++ b/src/dwgsim.h
@@ -0,0 +1,26 @@
+#ifndef DWGSIM_H
+#define DWGSIM_H
+
+#include "dwgsim_opt.h"
+
+#define __gf_add(_x, _y) ((_x >= 4 || _y >= 4) ? 4 : (_x ^ _y))
+#define __IS_TRUE(_val) ((_val == 1) ? "True" : "False")
+
+extern uint8_t nst_nt4_table[256];
+
+enum data_type_t {
+ ILLUMINA=0,
+ SOLID=1,
+ IONTORRENT=2
+};
+
+char iupac_and_base_to_mut(char iupac, char base);
+
+char bases_to_iupac(char b1, char b2);
+
+int32_t get_muttype(char *str);
+
+int32_t
+generate_errors_flows(dwgsim_opt_t *opt, uint8_t **seq, uint8_t **mask, int32_t *mem, int32_t len, uint8_t strand, double e, int32_t *_n_err);
+
+#endif
diff --git a/src/dwgsim_eval.c b/src/dwgsim_eval.c
new file mode 100755
index 0000000..4bf4288
--- /dev/null
+++ b/src/dwgsim_eval.c
@@ -0,0 +1,698 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <limits.h>
+#include <unistd.h>
+#include <float.h>
+#include <sys/resource.h>
+#include <math.h>
+#include "samtools/bam.h"
+#include "samtools/sam.h"
+#include "dwgsim_eval.h"
+
+#define __IS_TRUE(_val) ((_val == 1) ? "True" : "False")
+
+int
+print_usage(dwgsim_eval_args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Program: dwgsim_eval (short read simulation evaluator)\n");
+ fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+ fprintf(stderr, "Contact: Nils Homer <dnaa-help at lists.sourceforge.net>\n\n");
+ fprintf(stderr, "Usage: dwgsim_eval [options] <in.sam/in.bam>\n\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, "\t-a\tINT\tsplit by [%d]:\n", args->a);
+ fprintf(stderr, "\t\t\t\t\t0: by mapping quality\n");
+ fprintf(stderr, "\t\t\t\t\t1: by alignment score\n");
+ fprintf(stderr, "\t\t\t\t\t2: by suboptimal alignment score\n");
+ fprintf(stderr, "\t\t\t\t\t3: by alignment score - suboptimal alignment score\n");
+ fprintf(stderr, "\t-b\t\talignments are from BWA (SOLiD only) [%s]\n", __IS_TRUE(args->b));
+ fprintf(stderr, "\t-c\t\tcolor space alignments [%s]\n", __IS_TRUE(args->c));
+ fprintf(stderr, "\t-d\tINT\tdivide quality/alignment score by this factor [%d]\n", args->d);
+ fprintf(stderr, "\t-g\t\tgap \"wiggle\" [%d]\n", args->g);
+ fprintf(stderr, "\t-m\t\tconsecutive alignments with the same name (and end for multi-ends) should be treated as multi-mapped reads [%s]\n", __IS_TRUE(args->m));
+ fprintf(stderr, "\t-n\tINT\tnumber of raw input paired-end reads (otherwise, inferred from all SAM records present) [%d]\n", args->n);
+ fprintf(stderr, "\t-q\tINT\tconsider only alignments with this mapping quality or greater [%d]\n", args->q);
+ fprintf(stderr, "\t-z\t\tinput contains only single end reads [%s]\n", __IS_TRUE(args->z));
+ fprintf(stderr, "\t-S\t\tinput is SAM [%s]\n", __IS_TRUE(args->S));
+ fprintf(stderr, "\t-p\t\tprint incorrect alignments [%s]\n", __IS_TRUE(args->p));
+ fprintf(stderr, "\t-s\tINT\tconsider only alignments with the number of specified SNPs [%d]\n", args->s);
+ fprintf(stderr, "\t-e\tINT\tconsider only alignments with the number of specified errors [%d]\n", args->e);
+ fprintf(stderr, "\t-i\t\tconsider only alignments with indels [%s]\n", __IS_TRUE(args->i));
+ fprintf(stderr, "\t-P\tSTRING\ta read prefix that was prepended to each read name [%s]\n", (NULL == args->P) ? "not using" : args->P);
+ fprintf(stderr, "\t-h\t\tprint this help message\n");
+ return 1;
+}
+
+/* Action */
+enum {Exit, Warn, LastActionType};
+/* Type */
+enum {
+ Dummy,
+ OutOfRange, /* e.g. command line args */
+ InputArguments,
+ IllegalFileName,
+ IllegalPath,
+ OpenFileError,
+ EndOfFile,
+ ReallocMemory,
+ MallocMemory,
+ ThreadError,
+ ReadFileError,
+ WriteFileError,
+ DeleteFileError,
+ LastErrorType,
+};
+#define BREAK_LINE "************************************************************\n"
+void
+dwgsim_eval_print_error(char* FunctionName, char *VariableName, char* Message, int Action, int type)
+{
+ static char ErrorString[][20]=
+ { "\0", "OutOfRange", "InputArguments", "IllegalFileName", "IllegalPath", "OpenFileError", "EndOfFile", "ReallocMemory", "MallocMemory", "ThreadError", "ReadFileError", "WriteFileError", "DeleteFileError"};
+ static char ActionType[][20]={"Fatal Error", "Warning"};
+ fprintf(stderr, "%s\rIn function \"%s\": %s[%s]. ",
+ BREAK_LINE, FunctionName, ActionType[Action], ErrorString[type]);
+
+ /* Only print variable name if is available */
+ if(VariableName) {
+ fprintf(stderr, "Variable/Value: %s.\n", VariableName);
+ }
+ /* Only print message name if is available */
+ if(Message) {
+ fprintf(stderr, "Message: %s.\n", Message);
+ }
+ if(type == ReadFileError ||
+ type == OpenFileError ||
+ type == WriteFileError) {
+ perror("The file stream error was:");
+ }
+
+ switch(Action) {
+ case Exit:
+ fprintf(stderr, " ***** Exiting due to errors *****\n");
+ fprintf(stderr, "%s", BREAK_LINE);
+ exit(EXIT_FAILURE);
+ break; /* Not necessary actually! */
+ case Warn:
+ fprintf(stderr, " ***** Warning *****\n");
+ fprintf(stderr, "%s", BREAK_LINE);
+ break;
+ default:
+ fprintf(stderr, "Trouble!!!\n");
+ fprintf(stderr, "%s", BREAK_LINE);
+ }
+}
+
+
+int
+main(int argc, char *argv[])
+{
+ char c;
+ dwgsim_eval_args_t args;
+
+ args.a = args.b = args.c = args.i = args.m = args.n = args.p = args.q = args.z = 0;
+ args.d = 1;
+ args.e = -1;
+ args.g = 5;
+ args.s = -1;
+ args.S = 0;
+ args.P = NULL;
+
+ while(0 <= (c = getopt(argc, argv, "a:d:e:g:m:n:q:s:bchimpzSP:"))) {
+ switch(c) {
+ case 'a': args.a = atoi(optarg); break;
+ case 'b': args.b = 1; break;
+ case 'c': args.c = 1; break;
+ case 'd': args.d = atoi(optarg); break;
+ case 'g': args.g = atoi(optarg); break;
+ case 'm': args.m = 1; break;
+ case 'h': return print_usage(&args); break;
+ case 'n': args.n = atoi(optarg); break;
+ case 'q': args.q = atoi(optarg); break;
+ case 'z': args.z = 1; break;
+ case 'S': args.S = 1; break;
+ case 'p': args.p = 1; break;
+ case 's': args.s = atoi(optarg); break;
+ case 'e': args.e = atoi(optarg); break;
+ case 'i': args.i = 1; break;
+ case 'P': free(args.P); args.P = strdup(optarg); break;
+ default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 1;
+ }
+ }
+
+ if(argc == optind) {
+ return print_usage(&args);
+ }
+
+ run(&args, argc - optind, argv + optind);
+
+ free(args.P);
+
+ return 0;
+}
+
+void
+run(dwgsim_eval_args_t *args,
+ int32_t num_files,
+ char *files[])
+{
+ char *FnName="run";
+ int32_t i, n = 0;
+ samfile_t *fp_in = NULL;
+ samfile_t *fp_out = NULL;
+ bam1_t *b=NULL;
+ dwgsim_eval_counts_t *counts;
+ char *prev_qname=NULL;
+ int32_t prev_end=-1;
+
+ // initialize counts
+ counts = dwgsim_eval_counts_init();
+
+ fprintf(stderr, "Analyzing...\nCurrently on:\n0");
+ for(i=0;i<num_files;i++) {
+ // Open the file
+ fp_in = samopen(files[i], (1 == args->S) ? "r" : "rb", 0);
+ if(NULL == fp_in) {
+ dwgsim_eval_print_error(FnName, files[i], "Could not open file for reading", Exit, OpenFileError);
+ }
+
+ if(0 == i && 1 == args->p) {
+ fp_out = samopen("-", "wh", fp_in->header);
+ if(NULL == fp_out) {
+ dwgsim_eval_print_error(FnName, "stdout", "Could not open file stream for writing", Exit, OpenFileError);
+ }
+ }
+
+ b = bam_init1();
+ while(0 < samread(fp_in, b)) {
+ if(1 == args->m &&
+ NULL != prev_qname &&
+ prev_end == (BAM_FREAD1 & b->core.flag) &&
+ 0 == strcmp(prev_qname, bam1_qname(b))) {
+ // do nothing
+ }
+ else {
+ if(1 == args->m) {
+ free(prev_qname);
+ prev_qname = strdup(bam1_qname(b));
+ prev_end = (BAM_FREAD1 & b->core.flag);
+ }
+
+ process_bam(counts, args, fp_in->header, b, fp_out);
+
+ if((BAM_FPAIRED & b->core.flag)) { // paired end
+ if(1 == args->z) { // expect single end
+ dwgsim_eval_print_error(FnName, NULL, "Found a read that was paired end", Exit, OutOfRange);
+ }
+ if((BAM_FREAD1 & b->core.flag)) { // count # of pairs
+ n++;
+ }
+ }
+ else { // single end
+ if(0 == args->z) { // expect paired end
+ dwgsim_eval_print_error(FnName, NULL, "Found a read that was not paired", Exit, OutOfRange);
+ }
+ n++;
+ }
+
+
+ if(0 == (n % 10000)) {
+ fprintf(stderr, "\r%lld", (long long int)n);
+ }
+ }
+
+ bam_destroy1(b);
+ b = bam_init1();
+ }
+ bam_destroy1(b);
+
+ // Close the file
+ samclose(fp_in);
+ }
+ free(prev_qname);
+
+ if(1 == args->p) samclose(fp_out);
+
+ fprintf(stderr, "\r%lld\n", (long long int)n);
+
+ if(0 < args->n) {
+ if(n != args->n) {
+ fprintf(stderr, "(-n)=%d\tn=%d\n", args->n, n);
+ dwgsim_eval_print_error(FnName, NULL, "Number of reads found differs from the number specified (-n)", Warn, OutOfRange);
+ }
+ }
+ if(0 == args->z) {
+ dwgsim_eval_counts_print(counts, args->a, args->d, (0 < args->n) ? 2*args->n : 2*n);
+ }
+ else {
+ dwgsim_eval_counts_print(counts, args->a, args->d, (0 < args->n) ? args->n : n);
+ }
+
+ dwgsim_eval_counts_destroy(counts);
+
+ fprintf(stderr, "Analysis complete.\n");
+}
+
+uint32_t bam_calclip(const bam1_t *b)
+{
+ const bam1_core_t *c = &b->core;
+ const uint32_t *cigar = bam1_cigar(b);
+ uint32_t k, end = 0;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k] & BAM_CIGAR_MASK;
+ if(op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) {
+ end += cigar[k] >> BAM_CIGAR_SHIFT;
+ }
+ else {
+ break;
+ }
+ }
+ return end;
+}
+
+
+void
+process_bam(dwgsim_eval_counts_t *counts,
+ dwgsim_eval_args_t *args,
+ bam_header_t *header,
+ bam1_t *b,
+ samfile_t *fp_out)
+{
+ char *FnName="process_bam";
+ int32_t left, metric=INT_MIN;
+ char *chr=NULL;
+ char *name=NULL, *ptr=NULL;
+ char chr_name[1028]="\0";
+ char read_num[1028]="\0";
+ int32_t pos_1, pos_2, str_1, str_2, rand_1, rand2;
+ int32_t n_err_1, n_sub_1, n_indel_1, n_err_2, n_sub_2, n_indel_2;
+ int32_t pos, str, rand;
+ int32_t i, j, tmp;
+ int32_t predicted_value, actual_value;
+ int32_t clip;
+
+ // mapping quality threshold
+ if(b->core.qual < args->q) return;
+
+ // parse read name
+ name = strdup(bam1_qname(b));
+ ptr = name; // save to be freed
+ char *to_rm="_::_::_______"; // to remove
+ for(i=b->core.l_qname-1,j=0;0<=i && j<13;i--) { // replace with spaces
+ if(name[i] == to_rm[j]) {
+ name[i] = ' '; j++;
+ }
+ }
+ // check for the prefix
+ if(NULL != args->P) {
+ j = strlen(name);
+ tmp = strlen(args->P);
+ if(j < tmp || 0 != strncmp(args->P, name, tmp)) {
+ dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] could not match read name with given read name prefix (-P)", Exit, OutOfRange);
+ free(ptr);
+ return;
+ }
+ name += tmp + 1;
+ }
+ if(14 != sscanf(name, "%s %d %d %1d %1d %1d %1d %d %d %d %d %d %d %s",
+ chr_name, &pos_1, &pos_2, &str_1, &str_2, &rand_1, &rand2,
+ &n_err_1, &n_sub_1, &n_indel_1,
+ &n_err_2, &n_sub_2, &n_indel_2,
+ read_num)) {
+ dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] read was not generated by dwgsim?", Exit, OutOfRange);
+ free(ptr);
+ return;
+ }
+ // check for a prefix, and make sure it was removed correctly
+ if(1 == args->z || (b->core.flag & BAM_FREAD1)) {
+ rand = rand_1;
+ }
+ else {
+ rand = rand2;
+ }
+ if(0 == rand) {
+ for(j=0;j<header->n_targets;j++) {
+ i = strlen(name);
+ tmp = strlen(header->target_name[j]);
+ i = (i < tmp) ? i : tmp;
+ if(0 == strncmp(name, header->target_name[j], i)) {
+ break;
+ }
+ }
+ if(j == header->n_targets) {
+ dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] the mapped contig does not exist in the SAM header; perhaps you have a read name prefix?", Exit, OutOfRange);
+ }
+ }
+ free(ptr);
+ ptr = name = NULL;
+
+ // get metric value
+ if(0 == args->a) {
+ metric = (b->core.qual / args->d);
+ if(DWGSIM_EVAL_MAXQ < metric) metric = DWGSIM_EVAL_MAXQ;
+ }
+ else if((BAM_FUNMAP & b->core.flag) || 0 == b->core.qual) { // unmapped or zero quality
+ metric = DWGSIM_EVAL_MINAS;
+ }
+ else {
+ uint8_t *aux_AS=NULL, *aux_XS=NULL;
+ metric = DWGSIM_EVAL_MINAS+1;
+ if(1 == args->a || 3 == args->a) {
+ aux_AS = bam_aux_get(b, "AS");
+ if(NULL == aux_AS) {
+ metric = DWGSIM_EVAL_MINAS;
+ }
+ }
+ if(2 == args->a || 3 == args->a) {
+ aux_XS = bam_aux_get(b, "XS");
+ if(NULL == aux_XS) {
+ metric = DWGSIM_EVAL_MINAS;
+ }
+ }
+ if(metric != DWGSIM_EVAL_MINAS) {
+ switch(args->a) {
+ case 1:
+ metric = bam_aux2i(aux_AS);
+ break;
+ case 2:
+ metric = bam_aux2i(aux_XS);
+ break;
+ case 3:
+ metric = bam_aux2i(aux_AS) - bam_aux2i(aux_XS);
+ break;
+ default:
+ metric = DWGSIM_EVAL_MINAS;
+ break;
+ }
+ }
+ }
+ metric /= args->d;
+ if(metric < DWGSIM_EVAL_MINAS) metric = DWGSIM_EVAL_MINAS;
+
+ if(1 == args->i) { // indels only
+ if(1 == args->z || (b->core.flag & BAM_FREAD1)) {
+ if(0 == n_indel_1) return;
+ }
+ else {
+ if(0 == n_indel_2) return;
+ }
+ }
+ else if(0 <= args->e && n_err_1 != args->e) { // # of errors
+ return;
+ }
+ else if(0 <= args->s && n_sub_1 != args->s) { // # of snps
+ return;
+ }
+
+ if(1 == args->c && 1 == args->b) { // SOLiD and BWA
+ // Swap 1 and 2
+ tmp=n_err_1; n_err_1=n_err_2; n_err_2=tmp;
+ tmp=n_sub_1; n_sub_1=n_sub_2; n_sub_2=tmp;
+ tmp=n_indel_1; n_indel_1=n_indel_2; n_indel_2=tmp;
+ }
+
+ // copy data
+ if(1 == args->z || (b->core.flag & BAM_FREAD1)) {
+ pos = pos_1; str = str_1; rand = rand_1;
+ }
+ else {
+ pos = pos_2; str = str_2; rand = rand2;
+ }
+
+ // get the actual value
+ if(1 == rand) {
+ actual_value = DWGSIM_EVAL_UNMAPPABLE;
+ }
+ else {
+ actual_value = DWGSIM_EVAL_MAPPABLE;
+ }
+
+ // get the predicted value
+ if((BAM_FUNMAP & b->core.flag)) { // unmapped
+ predicted_value = DWGSIM_EVAL_UNMAPPED;
+ }
+ else { // mapped (correctly?)
+ clip = bam_calclip(b);
+ chr = header->target_name[b->core.tid];
+ left = b->core.pos - clip;
+
+ if(1 == rand // should not map
+ || str != bam1_strand(b) // different strand
+ || 0 != strcmp(chr, chr_name) // different chromosome
+ || args->g < fabs(pos - left)) { // out of bounds (positionally)
+ predicted_value = DWGSIM_EVAL_MAPPED_INCORRECTLY;
+ }
+ else {
+ predicted_value = DWGSIM_EVAL_MAPPED_CORRECTLY;
+ }
+ }
+
+ dwgsim_eval_counts_add(counts, metric, actual_value, predicted_value);
+
+ // print incorrect alignments
+ if(1 == args->p && DWGSIM_EVAL_MAPPED_INCORRECTLY == predicted_value) {
+ if(samwrite(fp_out, b) <= 0) {
+ dwgsim_eval_print_error(FnName, "stdout", "Could not write to stream", Exit, WriteFileError);
+ }
+ }
+}
+
+dwgsim_eval_counts_t *
+dwgsim_eval_counts_init()
+{
+ dwgsim_eval_counts_t *counts;
+
+ counts = malloc(sizeof(dwgsim_eval_counts_t));
+
+ counts->min_score = counts->max_score = 0;
+
+ counts->mc = malloc(sizeof(int32_t)); assert(NULL != counts->mc);
+ counts->mi = malloc(sizeof(int32_t)); assert(NULL != counts->mi);
+ counts->mu = malloc(sizeof(int32_t)); assert(NULL != counts->mu);
+ counts->um = malloc(sizeof(int32_t)); assert(NULL != counts->um);
+ counts->uu = malloc(sizeof(int32_t)); assert(NULL != counts->uu);
+
+ counts->mc[0] = counts->mi[0] = counts->mu[0] = 0;
+ counts->um[0] = counts->uu[0] = 0;
+
+ return counts;
+}
+
+void
+dwgsim_eval_counts_destroy(dwgsim_eval_counts_t *counts)
+{
+ free(counts->mc);
+ free(counts->mi);
+ free(counts->mu);
+ free(counts->um);
+ free(counts->uu);
+ free(counts);
+}
+
+void
+dwgsim_eval_counts_add(dwgsim_eval_counts_t *counts, int32_t score, int32_t actual_value, int32_t predicted_value)
+{
+ char *FnName="dwgsim_eval_counts_add";
+ int32_t i, m, n;
+ if(counts->max_score < score) {
+ m = score - counts->min_score + 1;
+ n = counts->max_score - counts->min_score + 1;
+
+ counts->mc = realloc(counts->mc, sizeof(int32_t)*m); assert(NULL != counts->mc);
+ counts->mi = realloc(counts->mi, sizeof(int32_t)*m); assert(NULL != counts->mi);
+ counts->mu = realloc(counts->mu, sizeof(int32_t)*m); assert(NULL != counts->mu);
+ counts->um = realloc(counts->um, sizeof(int32_t)*m); assert(NULL != counts->um);
+ counts->uu = realloc(counts->uu, sizeof(int32_t)*m); assert(NULL != counts->uu);
+
+ // initialize to zero
+ for(i=n;i<m;i++) {
+ counts->mc[i] = counts->mi[i] = counts->mu[i] = 0;
+ counts->um[i] = counts->uu[i] = 0;
+ }
+ counts->max_score = score;
+ }
+ else if(score < counts->min_score) {
+ m = counts->max_score - score + 1;
+ n = counts->max_score - counts->min_score + 1;
+
+ counts->mc = realloc(counts->mc, sizeof(int32_t)*m); assert(NULL != counts->mc);
+ counts->mi = realloc(counts->mi, sizeof(int32_t)*m); assert(NULL != counts->mi);
+ counts->mu = realloc(counts->mu, sizeof(int32_t)*m); assert(NULL != counts->mu);
+ counts->um = realloc(counts->um, sizeof(int32_t)*m); assert(NULL != counts->um);
+ counts->uu = realloc(counts->uu, sizeof(int32_t)*m); assert(NULL != counts->uu);
+
+ // shift up
+ for(i=m-1;m-n<=i;i--) {
+ counts->mc[i] = counts->mc[i-(m-n)];
+ counts->mi[i] = counts->mi[i-(m-n)];
+ counts->mu[i] = counts->mu[i-(m-n)];
+ counts->um[i] = counts->um[i-(m-n)];
+ counts->uu[i] = counts->uu[i-(m-n)];
+ }
+ // initialize to zero
+ for(i=0;i<m-n;i++) {
+ counts->mc[i] = counts->mi[i] = counts->mu[i] = 0;
+ counts->um[i] = counts->uu[i] = 0;
+ }
+ counts->min_score = score;
+ }
+
+ // check actual value
+ switch(actual_value) {
+ case DWGSIM_EVAL_MAPPABLE:
+ case DWGSIM_EVAL_UNMAPPABLE:
+ break;
+ default:
+ dwgsim_eval_print_error(FnName, "actual_value", "Could not understand actual value", Exit, OutOfRange);
+ }
+
+ // check predicted value
+ switch(predicted_value) {
+ case DWGSIM_EVAL_MAPPED_CORRECTLY:
+ case DWGSIM_EVAL_MAPPED_INCORRECTLY:
+ case DWGSIM_EVAL_UNMAPPED:
+ break;
+ default:
+ dwgsim_eval_print_error(FnName, "predicted_value", "Could not understand predicted value", Exit, OutOfRange);
+ }
+
+ switch(actual_value) {
+ case DWGSIM_EVAL_MAPPABLE:
+ switch(predicted_value) {
+ case DWGSIM_EVAL_MAPPED_CORRECTLY:
+ counts->mc[score-counts->min_score]++; break;
+ case DWGSIM_EVAL_MAPPED_INCORRECTLY:
+ counts->mi[score-counts->min_score]++; break;
+ case DWGSIM_EVAL_UNMAPPED:
+ counts->mu[score-counts->min_score]++; break;
+ default:
+ break; // should not reach here
+ }
+ break;
+ case DWGSIM_EVAL_UNMAPPABLE:
+ switch(predicted_value) {
+ case DWGSIM_EVAL_MAPPED_CORRECTLY:
+ dwgsim_eval_print_error(FnName, "predicted_value", "predicted value cannot be mapped correctly when the read is unmappable", Exit, OutOfRange); break;
+ case DWGSIM_EVAL_MAPPED_INCORRECTLY:
+ counts->um[score-counts->min_score]++; break;
+ case DWGSIM_EVAL_UNMAPPED:
+ counts->uu[score-counts->min_score]++; break;
+ default:
+ break; // should not reach here
+ }
+ break;
+ default:
+ break; // should not reach here
+ }
+}
+
+void
+dwgsim_eval_counts_print(dwgsim_eval_counts_t *counts, int32_t a, int32_t d, int32_t n)
+{
+ int32_t i;
+ int32_t max = 0;
+ int32_t mc_sum, mi_sum, mu_sum, um_sum, uu_sum;
+ int32_t m_total, mm_total, u_total;
+ char format[1024]="\0";
+
+ mc_sum = mi_sum = mu_sum = um_sum = uu_sum = 0;
+ m_total = mm_total = u_total = 0;
+
+ // create the format string
+ for(i=counts->max_score - counts->min_score;0<=i;i--) {
+ m_total += counts->mc[i] + counts->mi[i] + counts->mu[i];
+ u_total += counts->um[i] + counts->uu[i];
+ max += counts->mc[i] + counts->mi[i] + counts->mu[i] + counts->um[i] + counts->uu[i];
+ }
+ max = 1 + log10(max);
+ strcat(format, "%.2d ");
+ for(i=0;i<12;i++) {
+ sprintf(format + (int)strlen(format), "%%%dd ", max);
+ }
+ strcat(format + (int)strlen(format), "%.3e %.3e %.3e %.3e %.3e %.3e\n");
+
+ // header
+ fprintf(stdout, "# thr | the minimum %s threshold\n", (0 == a) ? "mapping quality" : "alignment score");
+ fprintf(stdout, "# mc | the number of reads mapped correctly that should be mapped at the threshold\n");
+ fprintf(stdout, "# mi | the number of reads mapped incorrectly that should be mapped be mapped at the threshold\n");
+
+ fprintf(stdout, "# mu | the number of reads unmapped that should be mapped be mapped at the threshold\n");
+
+ fprintf(stdout, "# um | the number of reads mapped that should be unmapped be mapped at the threshold\n");
+ fprintf(stdout, "# uu | the number of reads unmapped that should be unmapped be mapped at the threshold\n");
+ fprintf(stdout, "# mc' + mi' + mu' + um' + uu' | the total number of reads mapped at the threshold\n");
+ fprintf(stdout, "# mc' | the number of reads mapped correctly that should be mapped at or greater than that threshold\n");
+ fprintf(stdout, "# mi' | the number of reads mapped incorrectly that should be mapped be mapped at or greater than that threshold\n");
+
+ fprintf(stdout, "# mu' | the number of reads unmapped that should be mapped be mapped at or greater than that threshold\n");
+
+ fprintf(stdout, "# um' | the number of reads mapped that should be unmapped be mapped at or greater than that threshold\n");
+ fprintf(stdout, "# uu' | the number of reads unmapped that should be unmapped be mapped at or greater than that threshold\n");
+ fprintf(stdout, "# mc' + mi' + mu' + um' + uu' | the total number of reads mapped at or greater than the threshold\n");
+
+ fprintf(stdout, "# (mc / (mc' + mi' + mu')) | sensitivity: the fraction of reads that should be mapped that are mapped correctly at the threshold\n");
+ fprintf(stdout, "# (mc / mc' + mi') | positive predictive value: the fraction of mapped reads that are mapped correctly at the threshold\n");
+ fprintf(stdout, "# (um / (um' + uu')) | false discovery rate: the fraction of random reads that are mapped at the threshold\n");
+ fprintf(stdout, "# (mc' / (mc' + mi' + mu')) | sensitivity: the fraction of reads that should be mapped that are mapped correctly at or greater than the threshold\n");
+ fprintf(stdout, "# (mc' / mc' + mi') | positive predictive value: the fraction of mapped reads that are mapped correctly at or greater than the threshold\n");
+ fprintf(stdout, "# (um' / (um' + uu')) | false discovery rate: the fraction of random reads that are mapped at or greater than the threshold\n");
+
+
+ // print
+ for(i=counts->max_score - counts->min_score;0<=i;i--) {
+ double num, den;
+
+ mc_sum += counts->mc[i];
+ mi_sum += counts->mi[i];
+ mu_sum += counts->mu[i];
+ um_sum += counts->um[i];
+ uu_sum += counts->uu[i];
+ mm_total += counts->mc[i] + counts->mi[i];
+
+ /* Notes:
+ * notice that the denominator for sensitivity (and fdr) for the "ge"
+ * (greater than or equal) threshold is the "total", while the denominator
+ * for ppv is "@ >= Q". The reasoning behind this is ppv is a measure of
+ * the quality of mappings that will be returned when using a Q threshold
+ * while the sensitivity want to measure the fraction of mappings that
+ * will be returned compared to the maximum. Basically, if we accept
+ * only mappings at a given threshold, and call the rest unmapped, what
+ * happens?
+ * - sensitivity tells us the # of correct mappings out of the total possible
+ * mappings.
+ * - ppv tells us the # of correct mappings out of the total mappings.
+ * - fdr tells us the # of random mappings out of the total unmappable.
+ */
+
+ // "at" sensitivity: mapped correctly @ Q / mappable @ Q
+ num = counts->mc[i];
+ den = counts->mc[i] + counts->mi[i] + counts->mu[i];
+ double sens_at_thr = (0 == den) ? 0. : (num / (double)den);
+ // "ge" sensitivity: mapped correctly @ >= Q / total mappable
+ double sens_ge_thr = (0 == m_total) ? 0. : (mc_sum / (double)m_total);
+
+ // "at" positive predictive value: mapped correctly @ Q / mappable and mapped @ Q
+ num = counts->mc[i];
+ den = counts->mc[i] + counts->mi[i];
+ double ppv_at_thr = (0 == den) ? 0. : (num / (double)den);
+ // "ge" positive predictive value: mapped correctly @ >= Q / mappable and mapped @ >= Q
+ double ppv_ge_thr = (0 == mm_total) ? 0. : (mc_sum / (double)mm_total);
+
+ // "at" false discovery rate: unmappable and mapped @ Q / unmappable @ Q
+ num = counts->um[i];
+ den = counts->um[i] + counts->uu[i];
+ double fdr_at_thr = (0 == den) ? 0. : (num / (double)den);
+ // "ge" false discovery rate: unmappable and mapped @ >= Q / unmappable @ >= Q
+ double fdr_ge_thr = (0 == u_total) ? 0. : (um_sum / (double)u_total);
+
+ fprintf(stdout, format,
+ (i + counts->min_score)*d,
+ counts->mc[i], counts->mi[i], counts->mu[i], counts->um[i], counts->uu[i],
+ counts->mc[i] + counts->mi[i] + counts->mu[i] + counts->um[i] + counts->uu[i],
+ mc_sum, mi_sum, mu_sum, um_sum, uu_sum,
+ mc_sum + mi_sum + mu_sum + um_sum + uu_sum,
+ sens_at_thr, ppv_at_thr, fdr_at_thr,
+ sens_ge_thr, ppv_ge_thr, fdr_ge_thr);
+ }
+}
diff --git a/src/dwgsim_eval.h b/src/dwgsim_eval.h
new file mode 100644
index 0000000..ba85630
--- /dev/null
+++ b/src/dwgsim_eval.h
@@ -0,0 +1,66 @@
+#ifndef DWGSIM_EVAL_H_
+#define DWGSIM_EVAL_H_
+
+#define DWGSIM_EVAL_MAXQ 255
+#define DWGSIM_EVAL_MINAS -5000
+
+typedef struct {
+ int32_t a; // alignment score or not
+ int32_t b; // bwa or not
+ int32_t c; // color space or not
+ int32_t d; // divide by factor
+ int32_t e; // print only alignments with # of errors
+ int32_t g; // gap "wiggle"
+ int32_t i; // indel only
+ int32_t m; // multi-mapped
+ int32_t n; // # of pe alignments
+ int32_t p; // print incorrect alignments or not
+ int32_t q; // consider only alignments with this mapping quality or greater
+ int32_t s; // print only alignments with # of SNPs
+ int32_t z; // input reads are single end
+ int32_t S; // input reads are in text SAM format
+ char *P; // read name prefix
+} dwgsim_eval_args_t;
+
+// Actual value
+enum {
+ DWGSIM_EVAL_MAPPABLE =0,
+ DWGSIM_EVAL_UNMAPPABLE =1
+};
+
+// Prediction
+enum {
+ DWGSIM_EVAL_MAPPED_CORRECTLY =0,
+ DWGSIM_EVAL_MAPPED_INCORRECTLY =1,
+ DWGSIM_EVAL_UNMAPPED =2
+};
+
+typedef struct {
+ int32_t *mc; // mappable && mapped correctly
+ int32_t *mi; // mappable && mapped incorrectly
+ int32_t *mu; // mappable && unmapped
+ int32_t *um; // unmappable && mapped
+ int32_t *uu; // unmappable && unmapped
+ int32_t min_score, max_score;
+} dwgsim_eval_counts_t;
+
+void
+run(dwgsim_eval_args_t *args,
+ int32_t num_files,
+ char *files[]);
+void
+process_bam(dwgsim_eval_counts_t *counts,
+ dwgsim_eval_args_t *args,
+ bam_header_t *header,
+ bam1_t *b,
+ samfile_t *fp_out);
+dwgsim_eval_counts_t *
+dwgsim_eval_counts_init();
+void
+dwgsim_eval_counts_destroy(dwgsim_eval_counts_t *counts);
+void
+dwgsim_eval_counts_add(dwgsim_eval_counts_t *counts, int32_t score, int32_t actual_value, int32_t predicted_value);
+void
+dwgsim_eval_counts_print(dwgsim_eval_counts_t *counts, int32_t a, int32_t d, int32_t n);
+
+#endif
diff --git a/src/dwgsim_opt.c b/src/dwgsim_opt.c
new file mode 100644
index 0000000..f1339dd
--- /dev/null
+++ b/src/dwgsim_opt.c
@@ -0,0 +1,381 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "mut.h"
+#include "dwgsim.h"
+#include "dwgsim_opt.h"
+
+dwgsim_opt_t* dwgsim_opt_init()
+{
+ dwgsim_opt_t *opt;
+ opt = calloc(1, sizeof(dwgsim_opt_t));
+ opt->e[0].start = opt->e[0].end = opt->e[1].start = opt->e[1].end = 0.02;
+ opt->e[0].by = opt->e[1].by = 0;
+ opt->is_inner = 0;
+ opt->dist = 500;
+ opt->std_dev = 50;
+ opt->N = -1;
+ opt->C = 100;
+ opt->length[0] = opt->length[1] = 70;
+ opt->mut_rate = 0.001;
+ opt->mut_freq = 0.5;
+ opt->indel_frac = 0.1;
+ opt->indel_extend = 0.3;
+ opt->indel_min = 1;
+ opt->rand_read = 0.05;
+ opt->data_type = ILLUMINA;
+ opt->strandedness = 0;
+ opt->max_n = 0;
+ opt->flow_order = NULL;
+ opt->flow_order_len = 0;
+ opt->use_base_error = 0;
+ opt->seed = -1;
+ opt->muts_only = 0;
+ opt->fixed_quality = NULL;
+ opt->quality_std = 2.0;
+ opt->fn_muts_input = NULL;
+ opt->fn_muts_input_type = -1;
+ opt->fn_regions_bed = NULL;
+ opt->fp_mut = opt->fp_bfast = opt->fp_bwa1 = opt->fp_bwa2 = NULL;
+ opt->fp_fa = opt->fp_fai = NULL;
+ opt->read_prefix = NULL;
+
+ return opt;
+}
+
+void dwgsim_opt_destroy(dwgsim_opt_t *opt)
+{
+ free(opt->fixed_quality);
+ free(opt->fn_muts_input);
+ free(opt->fn_regions_bed);
+ free(opt->flow_order);
+ free(opt->read_prefix);
+ free(opt);
+}
+
+int dwgsim_opt_usage(dwgsim_opt_t *opt)
+{
+ mutseq_init_bounds();
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Program: dwgsim (short read simulator)\n");
+ fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+ fprintf(stderr, "Contact: Nils Homer <dnaa-help at lists.sourceforge.net>\n\n");
+ fprintf(stderr, "Usage: dwgsim [options] <in.ref.fa> <out.prefix>\n\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -e FLOAT per base/color/flow error rate of the first read [from %.3f to %.3f by %.3f]\n", opt->e[0].start, opt->e[0].end, opt->e[0].by);
+ fprintf(stderr, " -E FLOAT per base/color/flow error rate of the second read [from %.3f to %.3f by %.3f]\n", opt->e[1].start, opt->e[1].end, opt->e[1].by);
+ fprintf(stderr, " -i use the inner distance instead of the outer distance for pairs [%s]\n", __IS_TRUE(opt->is_inner));
+ fprintf(stderr, " -d INT %s distance between the two ends for pairs [%d]\n",
+ (0 == opt->is_inner) ? "outer" : "inner", opt->dist);
+ fprintf(stderr, " -s INT standard deviation of the distance for pairs [%.3f]\n", opt->std_dev);
+ fprintf(stderr, " -N INT number of read pairs (-1 to disable) [%lld]\n", (signed long long int)opt->N);
+ fprintf(stderr, " -C FLOAT mean coverage across available positions (-1 to disable) [%.2lf]\n", opt->C);
+ fprintf(stderr, " -1 INT length of the first read [%d]\n", opt->length[0]);
+ fprintf(stderr, " -2 INT length of the second read [%d]\n", opt->length[1]);
+ fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", opt->mut_rate);
+ fprintf(stderr, " -F FLOAT frequency of given mutation to simulate low fequency somatic mutations [%.4f]\n", opt->mut_freq);
+ fprintf(stderr, " NB: freqeuncy F refers to the first strand of mutation, therefore mutations \n");
+ fprintf(stderr, " on the second strand occour with a frequency of 1-F \n");
+ fprintf(stderr, " -R FLOAT fraction of mutations that are indels [%.2f]\n", opt->indel_frac);
+ fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", opt->indel_extend);
+ fprintf(stderr, " -I INT the minimum length indel [%d]\n", opt->indel_min);
+ fprintf(stderr, " -y FLOAT probability of a random DNA read [%.2f]\n", opt->rand_read);
+ fprintf(stderr, " -n INT maximum number of Ns allowed in a given read [%d]\n", opt->max_n);
+ fprintf(stderr, " -c INT generate reads for [%d]:\n", opt->data_type);
+ fprintf(stderr, " 0: Illumina\n");
+ fprintf(stderr, " 1: SOLiD\n");
+ fprintf(stderr, " 2: Ion Torrent\n");
+ fprintf(stderr, " -S INT generate reads [%d]:\n", opt->strandedness);
+ fprintf(stderr, " 0: default (opposite strand for Illumina, same strand for SOLiD/Ion Torrent)\n");
+ fprintf(stderr, " 1: same strand (mate pair)\n");
+ fprintf(stderr, " 2: opposite strand (paired end)\n");
+ fprintf(stderr, " -f STRING the flow order for Ion Torrent data [%s]\n", (char*)opt->flow_order);
+ fprintf(stderr, " -B use a per-base error rate for Ion Torrent data [%s]\n", __IS_TRUE(opt->use_base_error));
+ fprintf(stderr, " -H haploid mode [%s]\n", __IS_TRUE(opt->is_hap));
+ fprintf(stderr, " -z INT random seed (-1 uses the current time) [%d]\n", opt->seed);
+ fprintf(stderr, " -M generate a mutations file only [%s]\n", __IS_TRUE(opt->muts_only));
+ fprintf(stderr, " -m FILE the mutations txt file to re-create [%s]\n", (MUT_INPUT_TXT != opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input);
+ fprintf(stderr, " -b FILE the bed-like file set of candidate mutations [%s]\n", (MUT_INPUT_BED == opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input);
+ fprintf(stderr, " -v FILE the vcf file set of candidate mutations (use pl tag for strand) [%s]\n", (MUT_INPUT_VCF == opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input);
+ fprintf(stderr, " -x FILE the bed of regions to cover [%s]\n", (NULL == opt->fn_regions_bed) ? "not using" : opt->fn_regions_bed);
+ fprintf(stderr, " -P STRING a read prefix to prepend to each read name [%s]\n", (NULL == opt->read_prefix) ? "not using" : opt->read_prefix);
+ fprintf(stderr, " -q STRING a fixed base quality to apply (single character) [%s]\n", (NULL == opt->fixed_quality) ? "not using" : opt->fixed_quality);
+ fprintf(stderr, " -Q FLOAT standard deviation of the base quality scores [%.2lf]\n", (NULL == opt->fixed_quality) ? opt->quality_std : 0.0);
+ fprintf(stderr, " -s INT standard deviation of the distance for pairs [%.3f]\n", opt->std_dev);
+ fprintf(stderr, " -h print this message\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Note: For SOLiD mate pair reads and BFAST, the first read is F3 and the second is R3. For SOLiD mate pair reads\n");
+ fprintf(stderr, "and BWA, the reads in the first file are R3 the reads annotated as the first read etc.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Note: The longest supported insertion is %u.\n", UINT32_MAX);
+ fprintf(stderr, "\n");
+ return 1;
+}
+
+static void get_error_rate(const char *str, error_t *e)
+{
+ int32_t i;
+
+ e->start = atof(str);
+ for(i=0;i<strlen(str);i++) {
+ if(',' == str[i] || '-' == str[i]) {
+ break;
+ }
+ }
+ if(i<strlen(str)-1) {
+ i++;
+ e->end = atof(str+i);
+ }
+ else {
+ e->end = e->start;
+ }
+}
+
+int32_t
+dwgsim_opt_is_int(char *optarg)
+{
+ int32_t i;
+ for(i=0;i<strlen(optarg);i++) {
+ if(!isdigit(optarg[i])) return 0;
+ }
+ return 1;
+}
+
+int32_t
+dwgsim_atoi(char *optarg, char flag)
+{
+ if(0 == dwgsim_opt_is_int(optarg)) {
+ fprintf(stderr, "Error: command line option -%c is not a number [%s]\n", flag, optarg);
+ exit(1);
+ }
+ return atoi(optarg);
+}
+
+int32_t
+dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
+{
+ int32_t i;
+ int c;
+ int muts_input_type = 0;
+
+ while ((c = getopt(argc, argv, "id:s:N:C:1:2:e:E:r:F:R:X:I:c:S:n:y:BHf:z:Mm:b:v:x:P:q:Q:h")) >= 0) {
+ switch (c) {
+ case 'i': opt->is_inner = 1; break;
+ case 'd': opt->dist = dwgsim_atoi(optarg, 'd'); break;
+ case 's': opt->std_dev = atof(optarg); break;
+ case 'N': opt->N = dwgsim_atoi(optarg, 'N'); opt->C = -1; break;
+ case 'C': opt->C = atof(optarg); opt->N = -1; break;
+ case '1': opt->length[0] = dwgsim_atoi(optarg, '1'); break;
+ case '2': opt->length[1] = dwgsim_atoi(optarg, '2'); break;
+ case 'e': get_error_rate(optarg, &opt->e[0]); break;
+ case 'E': get_error_rate(optarg, &opt->e[1]); break;
+ case 'r': opt->mut_rate = atof(optarg); break;
+ case 'F': opt->mut_freq = atof(optarg); break;
+ case 'R': opt->indel_frac = atof(optarg); break;
+ case 'X': opt->indel_extend = atof(optarg); break;
+ case 'I': opt->indel_min = dwgsim_atoi(optarg, 'I'); break;
+ case 'c': opt->data_type = dwgsim_atoi(optarg, 'c'); break;
+ case 'S': opt->strandedness = dwgsim_atoi(optarg, 'S'); break;
+ case 'n': opt->max_n = dwgsim_atoi(optarg, 'n'); break;
+ case 'y': opt->rand_read = atof(optarg); break;
+ case 'f':
+ if(NULL != opt->flow_order) free(opt->flow_order);
+ opt->flow_order = (int8_t*)strdup(optarg);
+ break;
+ case 'B': opt->use_base_error = 1; break;
+ case 'H': opt->is_hap = 1; break;
+ case 'h': return 0;
+ case 'z': opt->seed = dwgsim_atoi(optarg, 'n'); break;
+ case 'M': opt->muts_only = 1; break;
+ case 'm': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_TXT; muts_input_type |= 0x1; break;
+ case 'b': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_BED; muts_input_type |= 0x2; break;
+ case 'v': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_VCF; muts_input_type |= 0x4; break;
+ case 'x': free(opt->fn_regions_bed); opt->fn_regions_bed = strdup(optarg); break;
+ case 'P': free(opt->read_prefix); opt->read_prefix = strdup(optarg); break;
+ case 'q': opt->fixed_quality = strdup(optarg); break;
+ case 'Q': opt->quality_std = atof(optarg); break;
+ default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 0;
+ }
+ }
+ if (argc - optind < 2) return 0;
+
+ __check_option(opt->is_inner, 0, 1, "-i");
+ __check_option(opt->dist, 0, INT32_MAX, "-d");
+ __check_option(opt->std_dev, 0, INT32_MAX, "-s");
+ if(opt->N < 0 && opt->C < 0) {
+ fprintf(stderr, "Must use one of -N or -C");
+ return 0;
+ }
+ else if(0 < opt->N && 0 < opt->C) {
+ fprintf(stderr, "Cannot use both -N or -C");
+ return 0;
+ }
+ else if(0 < opt->N) {
+ __check_option(opt->N, 1, INT32_MAX, "-N");
+ __check_option(opt->C, INT32_MIN, -1, "-C");
+ }
+ else {
+ __check_option(opt->N, INT32_MIN, -1, "-N");
+ __check_option(opt->C, 0, INT32_MAX, "-C");
+ }
+ __check_option(opt->length[0], 1, INT32_MAX, "-1");
+ __check_option(opt->length[1], 0, INT32_MAX, "-2");
+ // error rate
+ for(i=0;i<2;i++) {
+ if(opt->e[i].start < 0.0 || 1.0 < opt->e[i].start) {
+ fprintf(stderr, "End %s: the start error is out of range (-e)\n", (0 == i) ? "one" : "two");
+ return 0;
+ }
+ if(opt->e[i].end < 0.0 || 1.0 < opt->e[i].end) {
+ fprintf(stderr, "End %s: the end error is out of range (-e)\n", (0 == i) ? "one" : "two");
+ return 0;
+ }
+ if(IONTORRENT == opt->data_type) {
+ if(opt->e[i].end != opt->e[i].start) {
+ fprintf(stderr, "End %s: a uniform error rate must be given for Ion Torrent data\n", (0 == i) ? "one" : "two");
+ return 0;
+ }
+ }
+ }
+ __check_option(opt->mut_rate, 0, 1.0, "-r");
+ __check_option(opt->indel_frac, 0, 1.0, "-R");
+ __check_option(opt->indel_extend, 0, 1.0, "-X");
+ __check_option(opt->indel_min, 1, INT32_MAX, "-I");
+ __check_option(opt->data_type, 0, 2, "-c");
+ __check_option(opt->strandedness, 0, 2, "-S");
+ __check_option(opt->max_n, 0, INT32_MAX, "-n");
+ __check_option(opt->rand_read, 0, 1.0, "-y");
+ if(IONTORRENT == opt->data_type && NULL == opt->flow_order) {
+ fprintf(stderr, "Error: command line option -f is required\n");
+ return 0;
+ }
+ __check_option(opt->use_base_error, 0, 1, "-B");
+ __check_option(opt->is_hap, 0, 1, "-H");
+
+ if(NULL != opt->fixed_quality && 1 != strlen(opt->fixed_quality)) {
+ fprintf(stderr, "Error: command line option -q requires one character\n");
+ return 0;
+ }
+ __check_option(opt->quality_std, 0, INT32_MAX, "-Q");
+
+ if(NULL != opt->read_prefix) {
+ fprintf(stderr, "Warning: remember to use the -P option with dwgsim_eval\n");
+ }
+
+ switch(muts_input_type) {
+ case 0x0:
+ case 0x1:
+ case 0x2:
+ case 0x4:
+ break;
+ default:
+ fprintf(stderr, "Error: -m/-b/-v cannot be used together\n");
+ return 0;
+ break;
+ }
+
+ // random seed
+ srand48((-1 == opt->seed) ? time(0) : opt->seed);
+
+ if(IONTORRENT == opt->data_type) {
+ if(NULL != opt->flow_order) {
+ // uniform error rates only (so far)
+ if(opt->e[0].start != opt->e[0].end || opt->e[1].start != opt->e[1].end) {
+ fprintf(stderr, "Error: non-uniform error rate not support for Ion Torrent data\n");
+ return 0;
+ }
+ // update flow order
+ opt->flow_order_len = strlen((char*)opt->flow_order);
+ for(i=0;i<opt->flow_order_len;i++) {
+ opt->flow_order[i] = nst_nt4_table[opt->flow_order[i]];
+ }
+ }
+ else {
+ fprintf(stderr, "Error: -f must be given for Ion Torrent data\n");
+ return 0;
+ }
+ }
+ // use base error rate
+ if(IONTORRENT == opt->data_type && NULL != opt->flow_order && 1 == opt->use_base_error) {
+ uint8_t *tmp_seq=NULL;
+ uint8_t *tmp_seq_flow_mask=NULL;
+ int32_t tmp_seq_mem, s, cur_n_err, n_err, counts;
+ int32_t j, k;
+ double sf = 0.0;
+ for(i=0;i<2;i++) {
+ if(opt->length[i] <= 0) continue;
+ fprintf(stderr, "[dwgsim_core] Updating error rate for end %d\n", i+1);
+ if(0 < i && opt->length[i] == opt->length[1-i]) {
+ opt->e[i].start = opt->e[1-i].start;
+ opt->e[i].end = opt->e[1-i].end;
+ opt->e[i].by = opt->e[1-i].by;
+ fprintf(stderr, "[dwgsim_core] Using scaling factor from previous end\n[dwgsim_core] Updated with scaling factor %.5lf\n", sf);
+ continue;
+ }
+ tmp_seq_mem = opt->length[i]+2;
+ tmp_seq = (uint8_t*)calloc(tmp_seq_mem, 1);
+ tmp_seq_flow_mask = (uint8_t*)calloc(tmp_seq_mem, 1);
+ n_err = counts = 0;
+ for(j=0;j<ERROR_RATE_NUM_RANDOM_READS;j++) {
+ if(0 == (j % 10000)) {
+ fprintf(stderr, "\r[dwgsim_core] %d", j);
+ }
+ for(k=0;k<opt->length[i];k++) {
+ tmp_seq[k] = (int)(drand48() * 4.0) & 3;
+ }
+ cur_n_err = 0;
+ s = opt->length[i];
+ s = generate_errors_flows(opt, &tmp_seq, &tmp_seq_flow_mask, &tmp_seq_mem, s, 0, opt->e[i].start, &cur_n_err);
+ n_err += cur_n_err;
+ counts += s;
+ }
+ //fprintf(stderr, "before %lf,%lf,%lf\n", opt->e[i].start, opt->e[i].by, opt->e[i].end);
+ sf = opt->e[i].start / (n_err / (1.0 * counts));
+ opt->e[i].start = opt->e[i].end *= sf;
+ opt->e[i].by = (opt->e[i].end - opt->e[i].start) / opt->length[i];
+ //fprintf(stderr, "after %lf,%lf,%lf\n", opt->e[i].start, opt->e[i].by, opt->e[i].end);
+ free(tmp_seq);
+ free(tmp_seq_flow_mask);
+ fprintf(stderr, "\r[dwgsim_core] %d\n[dwgsim_core] Updated with scaling factor %.5lf!\n", j, sf);
+ }
+ }
+ else {
+ opt->e[0].by = (opt->e[0].end - opt->e[0].start) / opt->length[0];
+ opt->e[1].by = (opt->e[1].end - opt->e[1].start) / opt->length[1];
+ }
+
+ __check_option(opt->muts_only, 0, 1, "-M");
+
+ return 1;
+}
diff --git a/src/dwgsim_opt.h b/src/dwgsim_opt.h
new file mode 100644
index 0000000..e535165
--- /dev/null
+++ b/src/dwgsim_opt.h
@@ -0,0 +1,66 @@
+#ifndef DWGSIM_OPT_H
+#define DWGSIM_OPT_H
+
+#define ERROR_RATE_NUM_RANDOM_READS 1000000
+
+typedef struct {
+ double start, by, end;
+} error_t;
+
+typedef struct {
+ error_t e[2];
+ int32_t is_inner;
+ int32_t dist;
+ double std_dev;
+ int64_t N;
+ double C;
+ int32_t length[2];
+ double mut_rate;
+ double mut_freq;
+ double indel_frac;
+ double indel_extend;
+ int32_t indel_min;
+ double rand_read;
+ int32_t max_n;
+ int32_t data_type;
+ int32_t strandedness;
+ int8_t *flow_order;
+ int32_t flow_order_len;
+ int32_t use_base_error;
+ int32_t is_hap;
+ int32_t seed;
+ int32_t muts_only;
+ char *fixed_quality;
+ double quality_std;
+ char *fn_muts_input;
+ int32_t fn_muts_input_type;
+ char *fn_regions_bed;
+ FILE *fp_mut;
+ FILE *fp_vcf;
+ FILE *fp_bfast;
+ FILE *fp_bwa1;
+ FILE *fp_bwa2;
+ FILE *fp_fa;
+ FILE *fp_fai;
+ char *read_prefix;
+} dwgsim_opt_t;
+
+dwgsim_opt_t*
+dwgsim_opt_init();
+
+void
+dwgsim_opt_destroy(dwgsim_opt_t *opt);
+
+#define __check_option(_val, _min, _max, _opt) \
+ if(_val < _min || _max < _val) { \
+ fprintf(stderr, "Error: command line option %s was out of range\n", _opt); \
+ return 0; \
+ }
+
+int
+dwgsim_opt_usage(dwgsim_opt_t *opt);
+
+int32_t
+dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[]);
+
+#endif
diff --git a/src/mut.c b/src/mut.c
new file mode 100644
index 0000000..92b8764
--- /dev/null
+++ b/src/mut.c
@@ -0,0 +1,888 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "regions_bed.h"
+#include "dwgsim_opt.h"
+#include "mut.h"
+
+static int SEQ_BLOCK_SIZE = 512;
+
+void seq_set_block_size(int size)
+{
+ SEQ_BLOCK_SIZE = size;
+}
+
+int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
+{
+ int c, l, max;
+ char *p;
+
+ c = 0;
+ while (!feof(fp) && fgetc(fp) != '>');
+ if (feof(fp)) return -1;
+ p = locus;
+ while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
+ if (c != '\r') *p++ = c;
+ *p = '\0';
+ if (comment) {
+ p = comment;
+ if (c != '\n') {
+ while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
+ if (c != '\n') {
+ *p++ = c;
+ while (!feof(fp) && (c = fgetc(fp)) != '\n')
+ if (c != '\r') *p++ = c;
+ }
+ }
+ *p = '\0';
+ } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
+ l = 0; max = seq->m;
+ while (!feof(fp) && (c = fgetc(fp)) != '>') {
+ if (isalpha(c) || c == '-' || c == '.') {
+ if (l + 1 >= max) {
+ max += SEQ_BLOCK_SIZE;
+ seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
+ }
+ seq->s[l++] = (unsigned char)c;
+ }
+ }
+ if (c == '>') ungetc(c,fp);
+ seq->s[l] = 0;
+ seq->m = max; seq->l = l;
+ return l;
+}
+
+mut_t mutmsk;
+mut_t mut_and_type_mask;
+mut_t muttype_shift;
+mut_t ins_length_shift;
+mut_t ins_length_mask;
+mut_t ins_length_max;
+mut_t ins_long_length_max;
+mut_t ins_mask;
+
+void
+mutseq_init_bounds()
+{
+ int32_t i;
+ mutmsk = (mut_t)0x30;
+ mut_and_type_mask = (mut_t)0x3F;
+ muttype_shift = 6; // bits 5-6 store the mutation type
+ ins_length_shift = 59; // bits 60-64 store the insertion length
+ ins_length_mask = 0x1F; // bits 60-64 store the insertion length
+ ins_length_max = ((ins_length_shift - muttype_shift) >> 1);
+ ins_long_length_max = UINT32_MAX;
+ if(ins_length_mask < ins_length_max) { // we exceed storing the length of the indel
+ ins_length_max = ins_length_mask;
+ }
+ // create the insertion mask based on the maximum length insertion
+ ins_mask = 0;
+ for(i=0;i<ins_length_max;i++) {
+ ins_mask = (ins_mask << 2) | 0x3;
+ }
+}
+
+mutseq_t *
+mutseq_init()
+{
+ mutseq_t *seq = NULL;
+ seq = calloc(1, sizeof(mutseq_t));
+ mutseq_init_bounds();
+ return seq;
+}
+
+void
+mutseq_destroy(mutseq_t *seq)
+{
+ int32_t i;
+ for(i=0;i<seq->ins_l;i++) {
+ free(seq->ins[i]);
+ }
+ free(seq->ins);
+ free(seq->s);
+ free(seq);
+}
+
+mut_t
+mut_get_ins_length(mutseq_t *seq, int32_t i)
+{
+ mut_t m, n, index;
+ assert(0 <= i && i < seq->l);
+ m = seq->s[i];
+ assert(INSERT == (m & mutmsk));
+ n = (m >> ins_length_shift) & ins_length_mask;
+ if(0 == n) {
+ uint32_t l = 0;
+ index = (m >> muttype_shift) & ins_mask;
+ assert(0 <= index && index < seq->ins_l);
+ mut_get_ins_long_n(seq->ins[index], &l);
+ return (mut_t)l;
+ }
+ else {
+ return n;
+ }
+}
+
+// returns 0 if it is a long insertion, 1 otherwise
+// if 0 is returned, the index into the long insertion list
+// is returned in "ins" and "n" is 0.
+int32_t
+mut_get_ins(mutseq_t *seq, int32_t i, mut_t *n, mut_t *ins)
+{
+ mut_t m;
+ assert(0 <= i && i < seq->l);
+ m = seq->s[i];
+ assert(INSERT == (m & mutmsk));
+ (*n) = (m >> ins_length_shift) & ins_length_mask;
+ (*ins) = (m >> muttype_shift) & ins_mask;
+ if(0 == (*n)) {
+ assert(0 <= (*ins) && (*ins) < seq->ins_l);
+ return 0;
+ }
+ else {
+ assert(0 < (*n));
+ assert((*n) <= ins_length_max);
+ return 1;
+ }
+}
+
+inline int32_t
+mut_get_ins_bytes(int32_t n)
+{
+ int32_t num_bytes = 1; // one-byte for the type
+ // variable number of bytes to store the length
+ if(n <= UINT8_MAX) num_bytes += sizeof(uint8_t);
+ else if(n <= UINT16_MAX) num_bytes += sizeof(uint16_t);
+ else if(n <= UINT32_MAX) num_bytes += sizeof(uint32_t);
+ else {
+ fprintf(stderr, "UINT32_MAX < n");
+ exit(1);
+ }
+ // store the insertion
+ num_bytes += mut_packed_len(n);
+ return num_bytes;
+}
+
+uint8_t*
+mut_get_ins_long_n(uint8_t *ins, uint32_t *n)
+{
+ switch(ins[0]) {
+ case sizeof(uint8_t):
+ (*n) = (uint32_t)(ins[1]);
+ return ins + 2;
+ break;
+ case sizeof(uint16_t):
+ (*n) = (uint32_t)(((uint16_t*)(ins+1))[0]);
+ return ins + 3;
+ break;
+ case sizeof(uint32_t):
+ default:
+ (*n) = (uint32_t)(((uint32_t*)(ins+1))[0]);
+ return ins + 5;
+ break;
+ }
+ return NULL;
+}
+
+inline uint8_t*
+mut_set_ins_long_n(uint8_t *ins, uint32_t n)
+{
+ // set type
+ if(n <= UINT8_MAX) ins[0] = (uint8_t)sizeof(uint8_t);
+ else if(n <= UINT16_MAX) ins[0] = (uint8_t)sizeof(uint16_t);
+ else ins[0] = (uint8_t)sizeof(uint32_t);
+
+ // set length
+ switch(ins[0]) {
+ case sizeof(uint8_t):
+ ins[1] = (uint8_t)n;
+ return ins + 2;
+ break;
+ case sizeof(uint16_t):
+ (*((uint16_t*)(ins+1))) = (uint16_t)n;
+ return ins + 3;
+ break;
+ case sizeof(uint32_t):
+ default:
+ (*((uint32_t*)(ins+1))) = n;
+ return ins + 5;
+ break;
+ }
+ return NULL;
+}
+
+
+static inline void
+mut_print_ins(FILE *fp, mutseq_t *seq, int32_t i)
+{
+ mut_t n, ins;
+ if(1 == mut_get_ins(seq, i, &n, &ins)) {
+ while (n > 0) {
+ fputc("ACGTN"[ins & 0x3], fp);
+ ins >>= 2;
+ n--;
+ }
+ }
+ else { // long insertion
+ int32_t byte_index, bit_index;
+ uint32_t num_ins = 0;
+ uint8_t *insertion = NULL;
+ assert(NULL != seq->ins[ins]);
+ insertion = mut_get_ins_long_n(seq->ins[ins], &num_ins);
+ // reverse order
+ byte_index = mut_packed_len(num_ins)-1;
+ bit_index = (num_ins+3) & 3; // % 4
+ while(0 < num_ins) {
+ fputc("ACGTN"[(insertion[byte_index] >> (bit_index << 1)) & 0x3], fp);
+ bit_index--;
+ if(bit_index < 0) {
+ bit_index = 3;
+ byte_index--;
+ }
+ num_ins--;
+ }
+ }
+}
+
+// bases is NULL if we are to randomly simulate the bases
+void mut_add_ins(dwgsim_opt_t *opt, mutseq_t *hap1, mutseq_t *hap2, int32_t i, int32_t c, int8_t hap, char *bases, mut_t num_ins)
+{
+ mut_t ins = 0;
+ int64_t j;
+
+ if (NULL == bases) {
+ if(num_ins == 0) {
+ // get the new insertion length
+ do {
+ num_ins++;
+ } while (num_ins < ins_long_length_max && (num_ins < opt->indel_min || drand48() < opt->indel_extend));
+ }
+ } else {
+ num_ins = strlen(bases); // ignores num_ins
+ }
+ if (ins_long_length_max < num_ins) num_ins = ins_long_length_max;
+
+ if (hap < 0) {
+ // set ploidy
+ if (opt->is_hap || drand48() < 0.333333) { // hom-ins
+ hap = 3;
+ } else if (drand48() < 0.5) {
+ hap = 1;
+ } else {
+ hap = 2;
+ }
+ }
+
+ if (num_ins <= ins_length_max) { // short
+ // generate the insertion
+ if (NULL == bases) {
+ for (j=0;j<num_ins;j++) {
+ ins = (ins << 2) | (mut_t)(drand48() * 4.0);
+ }
+ } else {
+ for (j = num_ins; 0 <= j; --j) {
+ ins = (ins << 2) | nst_nt4_table[(int)bases[j]];
+ }
+ }
+ // store
+ if (hap & 0x1) hap1->s[i] = (num_ins << ins_length_shift) | (ins << muttype_shift) | INSERT | c;
+ if (hap & 0x2) hap2->s[i] = (num_ins << ins_length_shift) | (ins << muttype_shift) | INSERT | c;
+ } else { // long
+ int32_t byte_index, bit_index;
+ uint8_t *hap1_ins = NULL, *hap2_ins = NULL;
+ if (hap & 0x1) {
+ while (hap1->ins_m <= hap1->ins_l) { // realloc
+ hap1->ins_m = (hap1->ins_m < 16) ? 16 : (hap1->ins_m << 1);
+ hap1->ins = realloc(hap1->ins, sizeof(uint8_t*) * hap1->ins_m);
+ }
+ hap1->ins[hap1->ins_l] = calloc(mut_get_ins_bytes(num_ins), sizeof(uint8_t));
+ hap1_ins = mut_set_ins_long_n(hap1->ins[hap1->ins_l], num_ins);
+ }
+ if (hap & 0x2) {
+ while (hap2->ins_m <= hap2->ins_l) { // realloc
+ hap2->ins_m = (hap2->ins_m < 16) ? 16 : (hap2->ins_m << 1);
+ hap2->ins = realloc(hap2->ins, sizeof(uint8_t*) * hap2->ins_m);
+ }
+ hap2->ins[hap2->ins_l] = calloc(mut_get_ins_bytes(num_ins), sizeof(uint8_t));
+ hap2_ins = mut_set_ins_long_n(hap2->ins[hap2->ins_l], num_ins);
+ }
+ byte_index = 0;
+ bit_index = 0;
+ while(0 < num_ins) {
+ uint8_t b;
+ if (NULL == bases) {
+ b = ((uint8_t)(drand48() * 4.0)) << (bit_index << 1);
+ } else {
+ b = nst_nt4_table[(int)bases[num_ins-1]] << (bit_index << 1);
+ }
+ if (hap & 0x1) hap1_ins[byte_index] |= b;
+ if (hap & 0x2) hap2_ins[byte_index] |= b;
+ bit_index++;
+ if(4 == bit_index) {
+ bit_index=0;
+ byte_index++;
+ }
+ num_ins--;
+ }
+ if (hap & 0x1) {
+ assert(hap1->ins_l <= ins_mask);
+ hap1->s[i] = (hap1->ins_l << muttype_shift) | INSERT | c;
+ hap1->ins_l++;
+ }
+ if (hap & 0x2) {
+ assert(hap2->ins_l <= ins_mask);
+ hap2->s[i] = (hap2->ins_l << muttype_shift) | INSERT | c;
+ hap2->ins_l++;
+ }
+ }
+}
+
+void mut_debug(const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
+{
+ int32_t i;
+ // DEBUG
+ for (i = 0; i < seq->l; ++i) {
+ mut_t c[3];
+ c[0] = nst_nt4_table[(mut_t)seq->s[i]];
+ c[1] = hap1->s[i]; c[2] = hap2->s[i];
+ if (c[0] >= 4) continue;
+ if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
+ if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom
+ if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+ assert((c[1]&0x3) == (c[2]&0x3));
+ assert((c[0]&0x3) != (c[1]&0x3));
+ continue;
+ } else if ((c[1]&mutmsk) == DELETE) { // del
+ continue;
+ } else if ((c[1] & mutmsk) == INSERT) { // ins
+ mut_t n1 = mut_get_ins_length(hap1, i);
+ mut_t n2 = mut_get_ins_length(hap1, i);
+ assert(n1 > 0);
+ assert(n1 <= ins_long_length_max);
+ assert(n1 == n2);
+ } else assert(0);
+ } else { // het
+ if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+ assert((c[1]&0x3) != (c[2]&0x3));
+ assert((c[0]&0x3) == (c[1]&0x3) || (c[0]&0x3) == (c[2]&0x3));
+ assert((c[0]&0x3) != (c[1]&0x3) || (c[0]&0x3) != (c[2]&0x3));
+ continue;
+ } else if ((c[1]&mutmsk) == DELETE) {
+ continue;
+ } else if ((c[2]&mutmsk) == DELETE) {
+ continue;
+ } else if ((c[1]&mutmsk) == INSERT) { // ins 1
+ mut_t n = mut_get_ins_length(hap1, i);
+ assert(n > 0);
+ assert(n <= ins_long_length_max);
+ } else if ((c[2]&mutmsk) == INSERT) { // ins 2
+ mut_t n = mut_get_ins_length(hap2, i);
+ assert(n > 0);
+ assert(n <= ins_long_length_max);
+ } else assert(0);
+ }
+ }
+ }
+}
+
+static void
+mut_left_justify_ins(mutseq_t *hap1, int32_t i)
+{
+ // NB: should we also be checking both haps for homozygous cases
+ mut_t n, ins, j;
+ if(1 == mut_get_ins(hap1, i, &n, &ins)) { // short
+ assert(n > 0);
+ j=i;
+ while(0 < j
+ && INSERT != (hap1->s[j-1]&mutmsk) // no insertion
+ && SUBSTITUTE != (hap1->s[j-1]&mutmsk) // no substitution
+ && DELETE != (hap1->s[j-1]&mutmsk) // no deletion
+ && ((ins >> ((n-1) << 1)) & 3) == (hap1->s[j-1]&3)) { // end of insertion matches previous base
+ // update ins
+ ins = (ins & ~((mut_t)3 << ((n-1) << 1))); // zero out the last base
+ ins <<= 2; // shift over
+ ins |= (hap1->s[j-1]&3); // insert the first base
+ hap1->s[j] = (hap1->s[j]&3); // make it NOCHANGE
+ j--;
+ }
+ hap1->s[j] = (n << ins_length_shift) | (ins << muttype_shift) | INSERT | (hap1->s[j]&3); // re-insert
+ } else { // long
+ int32_t bit_index, byte_index;
+ uint32_t num_ins;
+ uint8_t *insertion = NULL;
+ insertion = mut_get_ins_long_n(hap1->ins[ins], &num_ins);
+ assert(num_ins > 0);
+ j=i;
+ while(0 < j
+ && INSERT != (hap1->s[j-1]&mutmsk) // no insertion
+ && SUBSTITUTE != (hap1->s[j-1]&mutmsk) // no substitution
+ && DELETE != (hap1->s[j-1]&mutmsk) // no deletion
+ // NB: last base of the insertion has a bit index of zero and byte index of zero
+ && ((insertion[0] >> (0 << 1)) & 3) == (hap1->s[j-1]&3)) { // end of insertion matches previous base
+ // start with the last base
+ for (byte_index = 0; byte_index < mut_packed_len(num_ins); byte_index++) {
+ insertion[byte_index] >>= 2; // shift down, get rid of first base (last two bits)
+ if (byte_index + 1 < mut_packed_len(num_ins)) {
+ uint8_t b = (insertion[byte_index+1] >> (0 << 1)) & 3; // last base of the next byte (first two bits)
+ insertion[byte_index] |= b << 6; // move the new base to the last two bits
+ }
+ }
+ // add a base to the front
+ bit_index = (num_ins+3) & 3; // % 4
+ byte_index = mut_packed_len(num_ins)-1;
+ insertion[byte_index] |= (hap1->s[j-1]&3) << (bit_index << 1);
+ hap1->s[j] = (hap1->s[j]&3); // make it NOCHANGE
+ j--;
+ }
+ hap1->s[j] = (ins << muttype_shift) | INSERT | (hap1->s[j]&3); // re-insert
+ }
+}
+
+// left-justify all the insertions and deletions
+static void
+mut_left_justify(const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
+{
+ mut_t i, j;
+ int del_length;
+ int prev_del[2] = {0, 0};
+ assert(hap1->l == seq->l);
+ assert(hap2->l == seq->l);
+ for (i = 0; i < seq->l; ++i) {
+ mut_t c[3];
+ c[0] = nst_nt4_table[(mut_t)seq->s[i]];
+ c[1] = hap1->s[i]; c[2] = hap2->s[i];
+ if (c[0] >= 4) continue;
+ if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
+ if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom
+ // TODO: code re-use
+ if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+ prev_del[0] = prev_del[1] = 0;
+ continue;
+ } else if ((c[1]&mutmsk) == DELETE) { // del
+ if(prev_del[0] == 1 || prev_del[1] == 1) continue;
+ prev_del[0] = prev_del[1] = 1;
+ for(j=i+1,del_length=1;j<seq->l && (hap1->s[j]&mutmsk) == DELETE;j++) { // get del length
+ del_length++;
+ }
+ if(seq->l <= i+del_length) continue;
+ // left-justify
+ for(j=i-1;0 < i && 0<=j;j--) {
+ if(INSERT != (hap1->s[j]&mutmsk) && INSERT != (hap2->s[j]&mutmsk) // no insertion
+ && DELETE != (hap1->s[j]&mutmsk) && DELETE != (hap2->s[j]&mutmsk) // no deletion
+ && (hap1->s[j]&3) == (hap1->s[j+del_length]&3) // hap1 bases match
+ && (hap2->s[j]&3) == (hap2->s[j+del_length]&3)) { // hap2 bases match
+ // shift and make it NOCHANGE
+ mut_t t;
+ t = hap1->s[j]; hap1->s[j] = hap1->s[j+del_length]; hap1->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+ t = hap2->s[j]; hap2->s[j] = hap2->s[j+del_length]; hap2->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+ }
+ else {
+ break;
+ }
+ if(0 == j) break;
+ }
+ } else if ((c[1] & mutmsk) == INSERT) { // ins
+ prev_del[0] = prev_del[1] = 0;
+ mut_left_justify_ins(hap1, i);
+ mut_left_justify_ins(hap2, i);
+ } else assert(0);
+ } else { // het
+ if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+ prev_del[0] = prev_del[1] = 0;
+ continue;
+ } else if ((c[1]&mutmsk) == DELETE) {
+ if(prev_del[0] == 1) continue;
+ prev_del[0] = 1;
+ for(j=i+1,del_length=1;j<seq->l && (hap1->s[j]&mutmsk) == DELETE;j++) { // get del length
+ del_length++;
+ }
+ if(seq->l <= i+del_length) continue;
+ // left-justify
+ for(j=i-1;0 < i && 0<=j;j--) {
+ if(INSERT != (hap1->s[j]&mutmsk) // no insertion
+ && SUBSTITUTE != (hap1->s[j]&mutmsk) // no substitution
+ && DELETE != (hap1->s[j]&mutmsk) // no deletion
+ && (hap1->s[j]&3) == (hap1->s[j+del_length]&3)) { // hap1 bases match
+ // shift and make it NOCHANGE
+ mut_t t;
+ t = hap1->s[j]; hap1->s[j] = hap1->s[j+del_length]; hap1->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+ }
+ else {
+ break;
+ }
+ if(0 == j) break;
+ }
+ } else if ((c[2]&mutmsk) == DELETE) {
+ if(prev_del[1] == 1) continue;
+ prev_del[1] = 1;
+ for(j=i+1,del_length=1;j<seq->l && (hap2->s[j]&mutmsk) == DELETE;j++) { // get del length
+ del_length++;
+ }
+ if(seq->l <= i+del_length) continue;
+ // left-justify
+ for(j=i-1;0 < i && 0<=j;j--) {
+ if(INSERT != (hap2->s[j]&mutmsk) // no insertion
+ && SUBSTITUTE != (hap2->s[j]&mutmsk) // no substitution
+ && DELETE != (hap2->s[j]&mutmsk) // no deletion
+ && (hap2->s[j]&3) == (hap2->s[j+del_length]&3)) { // hap2 bases match
+ // shift and make it NOCHANGE
+ mut_t t;
+ t = hap2->s[j]; hap2->s[j] = hap2->s[j+del_length]; hap2->s[j+del_length] = (t | mutmsk) ^ mutmsk;
+ }
+ else {
+ break;
+ }
+ if(0 == j) break;
+ }
+ } else if ((c[1]&mutmsk) == INSERT) { // ins 1
+ prev_del[0] = prev_del[1] = 0;
+ mut_left_justify_ins(hap1, i);
+ } else if ((c[2]&mutmsk) == INSERT) { // ins 2
+ prev_del[0] = prev_del[1] = 0;
+ mut_left_justify_ins(hap2, i);
+ } else assert(0);
+ }
+ }
+ else {
+ prev_del[0] = prev_del[1] = 0;
+ }
+ }
+}
+
+void mut_diref(dwgsim_opt_t *opt, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2,
+ int32_t contig_i, muts_input_t *muts_input)
+{
+ int32_t i, j, deleting = 0, deletion_length = 0;
+ mutseq_t *ret[2];
+
+ ret[0] = hap1; ret[1] = hap2;
+ ret[0]->l = seq->l; ret[1]->l = seq->l;
+ ret[0]->m = seq->m; ret[1]->m = seq->m;
+ ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
+ ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
+ ret[0]->ins = NULL; ret[1]->ins = NULL;
+ ret[0]->ins_l = 0; ret[1]->ins_l = 0;
+ ret[0]->ins_m = 0; ret[1]->ins_m = 0;
+
+ if(NULL == muts_input) {
+ for (i = 0; i < seq->l; ++i) {
+ mut_t c;
+ c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+ if (deleting) {
+ if (deletion_length < opt->indel_min || drand48() < opt->indel_extend) {
+ if (deleting & 1) ret[0]->s[i] |= DELETE|c;
+ if (deleting & 2) ret[1]->s[i] |= DELETE|c;
+ deletion_length++;
+ continue;
+ } else deleting = deletion_length = 0;
+ }
+ if (c < 4 && drand48() < opt->mut_rate) { // mutation
+ if (drand48() >= opt->indel_frac) { // substitution
+ double r = drand48();
+ c = (c + (mut_t)(r * 3.0 + 1)) & 3;
+ if (opt->is_hap || drand48() < 0.333333) { // hom
+ ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
+ } else { // het
+ ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
+ }
+ } else { // indel
+ if (drand48() < 0.5) { // deletion
+ if (opt->is_hap || drand48() < 0.3333333) { // hom-del
+ ret[0]->s[i] = ret[1]->s[i] = DELETE|c;
+ deleting = 3;
+ } else { // het-del
+ deleting = drand48()<0.5?1:2;
+ ret[deleting-1]->s[i] = DELETE|c;
+ }
+ deletion_length = 1;
+ } else { // insertion
+ mut_add_ins(opt, ret[0], ret[1], i, c, -1, NULL, 0);
+ }
+ }
+ }
+ }
+ }
+ else {
+ if(MUT_INPUT_BED == muts_input->type) { // BED
+ muts_bed_t *muts_bed = muts_input->data.bed;
+ // seed
+ for (i = 0; i < seq->l; ++i) {
+ ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+ }
+ // mutates exactly based on a BED file
+ for (i = 0; i < muts_bed->n; ++i) {
+ if (muts_bed->muts[i].contig == contig_i) {
+ int32_t has_bases = 1, is_hom = 0, hap = 0;
+ int32_t which_hap = 0;
+ mut_t c;
+
+ // does this mutation have random bases?
+ if (0 == strcmp("*", muts_bed->muts[i].bases)) has_bases = 0; // random bases
+
+ // het or hom?
+ if (opt->is_hap || drand48() < 0.333333) {
+ is_hom = 1; // hom
+ hap = 3;
+ }
+ else {
+ which_hap = drand48()<0.5?0:1;
+ hap = 1 << which_hap;
+ }
+
+ // mutate
+ if (SUBSTITUTE == muts_bed->muts[i].type) {
+ for (j = muts_bed->muts[i].start; j < muts_bed->muts[i].end; ++j) { // for each base
+ c = (mut_t)nst_nt4_table[(int)seq->s[j]];
+ if (0 == has_bases) { // random DNA base
+ double r = drand48();
+ c = (c + (mut_t)(r * 3.0 + 1)) & 3;
+ }
+ else {
+ c = (mut_t)nst_nt4_table[(int)muts_bed->muts[i].bases[j - muts_bed->muts[i].start]];
+ }
+ if (1 == is_hom) {
+ ret[0]->s[j] = ret[1]->s[j] = SUBSTITUTE|c;
+ } else { // het
+ ret[which_hap]->s[j] = SUBSTITUTE|c;
+ }
+ }
+ }
+ else if (DELETE == muts_bed->muts[i].type) {
+ for (j = muts_bed->muts[i].start; j < muts_bed->muts[i].end; ++j) { // for each base
+ c = (mut_t)nst_nt4_table[(int)seq->s[j]];
+ if (1 == is_hom) {
+ ret[0]->s[j] = ret[1]->s[j] = DELETE|c;
+ } else { // het-del
+ ret[which_hap]->s[j] = DELETE|c;
+ }
+ }
+ }
+ else if (INSERT == muts_bed->muts[i].type) {
+ c = (mut_t)nst_nt4_table[(int)seq->s[muts_bed->muts[i].start]];
+ if (0 == has_bases) {
+ mut_add_ins(opt, ret[0], ret[1], muts_bed->muts[i].start, c, hap, NULL, muts_bed->muts[i].end - muts_bed->muts[i].start);
+ } else {
+ mut_add_ins(opt, ret[0], ret[1], muts_bed->muts[i].start, c, hap, muts_bed->muts[i].bases, 0);
+ }
+ }
+ }
+ else if (contig_i < muts_bed->muts[i].contig) {
+ break;
+ }
+ }
+ }
+ else if(MUT_INPUT_TXT == muts_input->type || MUT_INPUT_VCF == muts_input->type) { // TXT or VCF
+ muts_txt_t *muts_txt = NULL;
+ if(MUT_INPUT_TXT == muts_input->type) {
+ muts_txt = muts_input->data.txt;
+ }
+ else {
+ muts_txt = muts_input->data.vcf;
+ }
+ // seed
+ for (i = 0; i < seq->l; ++i) {
+ ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+ }
+ for (i = 0; i < muts_txt->n; ++i) {
+ if (muts_txt->muts[i].contig == contig_i) {
+ int8_t type = muts_txt->muts[i].type;
+ uint32_t pos = muts_txt->muts[i].pos;
+ int8_t is_hap = muts_txt->muts[i].is_hap;
+ mut_t c = (mut_t)nst_nt4_table[(int)seq->s[pos-1]];
+
+ if (DELETE == type) {
+ if (is_hap & 1) ret[0]->s[pos-1] |= DELETE|c;
+ if (is_hap & 2) ret[1]->s[pos-1] |= DELETE|c;
+ }
+ else if (SUBSTITUTE == type) {
+ if (is_hap & 1) ret[0]->s[pos-1] = SUBSTITUTE|nst_nt4_table[(int)muts_txt->muts[i].bases[0]];
+ if (is_hap & 2) ret[1]->s[pos-1] = SUBSTITUTE|nst_nt4_table[(int)muts_txt->muts[i].bases[0]];
+ }
+ else if (INSERT == type) {
+ mut_add_ins(opt, ret[0], ret[1], pos-1, c, is_hap, muts_txt->muts[i].bases, 0);
+ }
+ }
+ }
+ }
+ else {
+ fprintf(stderr, "Error: unknown mutation input type!\n");
+ exit(1);
+ }
+ }
+
+ // DEBUG
+ mut_debug(seq, hap1, hap2);
+
+ // DEBUG
+ mut_left_justify(seq, hap1, hap2);
+ mut_debug(seq, hap1, hap2);
+}
+
+void mut_print_header_pre(FILE *fpout_vcf)
+{
+ fprintf(fpout_vcf, "##fileformat=VCFv4.1\n");
+}
+
+void mut_print_header_post(FILE *fpout_vcf)
+{
+ fprintf(fpout_vcf, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n");
+ fprintf(fpout_vcf, "##INFO=<ID=pl,Number=1,Type=Integer,Description=\"Phasing: 1 - HET contig 1, #2 - HET contig #2, 3 - HOM both contigs\">\n");
+ fprintf(fpout_vcf, "##INFO=<ID=mt,Number=1,Type=String,Description=\"Variant Type: SUBSTITUTE/INSERT/DELETE\">\n");
+ fprintf(fpout_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
+}
+
+void mut_print_header_contig(FILE *fpout_vcf, const char *name, int32_t length)
+{
+ // TODO: does mutating the reference change the sequence length? If so, this
+ // is not correct.
+ fprintf(fpout_vcf, "##contig=<ID=%s,length=%d>\n", name, length);
+}
+
+// TODO: add AD, DP, GQ, GT, PL
+void mut_print(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, FILE *fpout_txt, FILE *fpout_vcf)
+{
+ int32_t i, j, hap;
+
+ // body
+ mut_t mut_prev[2] = {NOCHANGE,NOCHANGE}; // for deletions
+ for (i = 0; i < seq->l; ++i) {
+ mut_t c[3];
+ c[0] = nst_nt4_table[(int)seq->s[i]];
+ c[1] = hap1->s[i]; c[2] = hap2->s[i];
+ if (c[0] >= 4) {
+ // do nothing
+ } else if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
+ fprintf(fpout_txt, "%s\t%d\t", name, i+1);
+ if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom
+ if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
+ fprintf(fpout_txt, "%c\t%c\t3\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+ fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=1.0;pl=3;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+ } else if ((c[1]&mutmsk) == DELETE) { // del
+ fprintf(fpout_txt, "%c\t-\t3\n", "ACGTN"[c[0]]);
+ if(0 == mut_prev[0] || 0 == mut_prev[1]) {
+ fprintf(fpout_vcf, "%s\t%d\t.\t", name, i);
+ if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf);
+ for (j = i; j < seq->l && (c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask) && (c[1]&mutmsk) == DELETE; ++j) {
+ fputc("ACGTN"[c[0]], fpout_vcf);
+ // NB: this modifies 'c'
+ if (j+1 < seq->l) {
+ c[0] = nst_nt4_table[(int)seq->s[j+1]];
+ c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1];
+ }
+ }
+ if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]);
+ else fprintf(fpout_vcf, "\t.");
+ fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=3;mt=DELETE\n");
+ }
+ // NB: convert back 'c'
+ c[0] = nst_nt4_table[(int)seq->s[i]];
+ c[1] = hap1->s[i]; c[2] = hap2->s[i];
+ } else if ((c[1] & mutmsk) == INSERT) { // ins
+ fprintf(fpout_txt, "-\t");
+ mut_print_ins(fpout_txt, hap1, i);
+ fprintf(fpout_txt, "\t3\n");
+ fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]);
+ mut_print_ins(fpout_vcf, hap1, i);
+ fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=3;mt=INSERT\n");
+ } else assert(0);
+ } else { // het
+ if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
+ hap = ((c[1]&mutmsk) == SUBSTITUTE) ? 1 : 2;
+ fprintf(fpout_txt, "%c\t%c\t%d\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)], hap);
+ if(1 == hap) fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=0.5;pl=1;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
+ else fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=0.5;pl=2;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[2]&0xf]);
+ } else if ((c[1]&mutmsk) == DELETE) {
+ fprintf(fpout_txt, "%c\t-\t1\n", "ACGTN"[c[0]]);
+ if(0 == mut_prev[0]) {
+ fprintf(fpout_vcf, "%s\t%d\t.\t", name, i);
+ if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf);
+ for (j = i; j < seq->l && (c[1] & mut_and_type_mask) != (c[2] & mut_and_type_mask) && (c[1]&mutmsk) == DELETE; ++j) {
+ fputc("ACGTN"[c[0]], fpout_vcf);
+ // NB: this modifies 'c'
+ if (j+1 < seq->l) {
+ c[0] = nst_nt4_table[(int)seq->s[j+1]];
+ c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1];
+ }
+ }
+ if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]);
+ else fprintf(fpout_vcf, "\t.");
+ fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=1;mt=DELETE\n");
+ }
+ // NB: convert back 'c'
+ c[0] = nst_nt4_table[(int)seq->s[i]];
+ c[1] = hap1->s[i]; c[2] = hap2->s[i];
+ } else if ((c[2]&mutmsk) == DELETE) {
+ fprintf(fpout_txt, "%c\t-\t2\n", "ACGTN"[c[0]]);
+ if(0 == mut_prev[1]) {
+ fprintf(fpout_vcf, "%s\t%d\t.\t", name, i);
+ if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf);
+ for (j = i; j < seq->l && (c[1] & mut_and_type_mask) != (c[2] & mut_and_type_mask) && (c[2]&mutmsk) == DELETE; ++j) {
+ fputc("ACGTN"[c[0]], fpout_vcf);
+ // NB: this modifies 'c'
+ if (j+1 < seq->l) {
+ c[0] = nst_nt4_table[(int)seq->s[j+1]];
+ c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1];
+ }
+ }
+ if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]);
+ else fprintf(fpout_vcf, "\t.");
+ fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=2;mt=DELETE\n");
+ }
+ // NB: convert back 'c'
+ c[0] = nst_nt4_table[(int)seq->s[i]];
+ c[1] = hap1->s[i]; c[2] = hap2->s[i];
+ } else if ((c[1]&mutmsk) == INSERT) { // ins 1
+ fprintf(fpout_txt, "-\t");
+ mut_print_ins(fpout_txt, hap1, i);
+ fprintf(fpout_txt, "\t1\n");
+ fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]);
+ mut_print_ins(fpout_vcf, hap1, i);
+ fprintf(fpout_vcf, "\t100\tPASS\tAF=0.5;pl=1;mt=INSERT\n");
+ } else if ((c[2]&mutmsk) == INSERT) { // ins 2
+ fprintf(fpout_txt, "-\t");
+ mut_print_ins(fpout_txt, hap2, i);
+ fprintf(fpout_txt, "\t2\n");
+ fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]);
+ mut_print_ins(fpout_vcf, hap2, i);
+ fprintf(fpout_vcf, "\t100\tPASS\tAF=0.5;pl=2;mt=INSERT\n");
+ } else assert(0);
+ }
+ }
+ mut_prev[0] = (c[1] & mutmsk) != NOCHANGE;
+ mut_prev[1] = (c[2] & mutmsk) != NOCHANGE;
+ }
+}
diff --git a/src/mut.h b/src/mut.h
new file mode 100644
index 0000000..56504a4
--- /dev/null
+++ b/src/mut.h
@@ -0,0 +1,93 @@
+#ifndef MUT_H
+#define MUT_H
+
+#include "contigs.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "mut_input.h"
+#include "dwgsim_opt.h"
+#include "dwgsim.h"
+
+/* FASTA parser, copied from seq.c */
+typedef struct {
+ int l, m; /* length and maximum buffer size */
+ unsigned char *s; /* sequence */
+} seq_t;
+
+#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
+
+void
+seq_set_block_size(int size);
+
+int
+seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment);
+
+enum muttype_t {
+ NOCHANGE = 0,
+ INSERT = 0x10,
+ SUBSTITUTE = 0x20,
+ DELETE = 0x30
+};
+
+typedef uint64_t mut_t;
+extern mut_t mutmsk;
+extern mut_t mut_and_type_mask;
+extern mut_t muttype_shift;
+extern mut_t ins_length_shift;
+extern mut_t ins_length_mask;
+extern mut_t ins_length_max;
+extern mut_t ins_long_length_max;
+extern mut_t ins_mask;
+
+typedef struct {
+ int l, m; /* length and maximum buffer size */
+ mut_t *s; /* sequence */
+ uint8_t **ins; /* long insertions */
+ int ins_l, ins_m; /* length and maximum buffer size for long insertions */
+} mutseq_t;
+
+void
+mutseq_init_bounds();
+
+mutseq_t *
+mutseq_init();
+
+void
+mutseq_destroy(mutseq_t *seq);
+
+inline int32_t
+mut_get_ins_bytes(int32_t n);
+
+inline uint8_t*
+mut_get_ins_long_n(uint8_t *ins, uint32_t *n);
+
+mut_t
+mut_get_ins_length(mutseq_t *seq, int32_t i);
+
+int32_t
+mut_get_ins(mutseq_t *seq, int32_t i, mut_t *n, mut_t *ins);
+
+void
+mut_diref(dwgsim_opt_t *opt, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, int32_t contig_i, muts_input_t *muts_input);
+
+void mut_print_header_pre(FILE *fpout_vcf);
+void mut_print_header_post(FILE *fpout_vcf);
+void mut_print_header_contig(FILE *fpout_vcf, const char *name, int32_t length);
+
+// Columns:
+// 1 - chromosome name
+// 2 - position (one-based)
+// 3 - reference base (dash if there was an insertion)
+// 4 - variant allele (IUPAC code or insertion base(s))
+// 5 - '-' for homozygous, '+' for heterozygous
+void
+mut_print(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, FILE *fpout_txt, FILE *fpout_vcf);
+
+// 0 - 0
+// 1 - 1
+// 5 - 2
+// 9 - 3
+// ...
+#define mut_packed_len(_n) ((_n + 3) >> 2)
+
+#endif
diff --git a/src/mut_bed.c b/src/mut_bed.c
new file mode 100644
index 0000000..1af655b
--- /dev/null
+++ b/src/mut_bed.c
@@ -0,0 +1,136 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_bed.h"
+
+muts_bed_t *muts_bed_init(FILE *fp, contigs_t *c)
+{
+ muts_bed_t *m = NULL;
+ int32_t i;
+ char name[1024];
+ uint32_t start, end;
+ char type[1024];
+ char bases[1024];
+ uint32_t prev_contig=0, max_end=0;
+
+ m = calloc(1, sizeof(muts_bed_t));
+ m->n = 0;
+ m->mem = 1024;
+ m->muts = malloc(m->mem * sizeof(mut_bed_t));
+
+ i = 0;
+ // start is zero-based
+ // one is one-based
+ while(0 < fscanf(fp, "%s\t%u\t%u\t%s\t%s", name, &start, &end, bases, type)) {
+ // find the contig
+ while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+ i++;
+ }
+ if(c->n == i) {
+ fprintf(stderr, "Error: contig not found [%s]\n", name);
+ exit(1);
+ }
+ else if(c->contigs[i].len <= start) {
+ fprintf(stderr, "Error: start out of range [%s,%u]\n", name, start);
+ exit(1);
+ }
+ else if(c->contigs[i].len < end) {
+ fprintf(stderr, "Error: end out of range [%s,%u]\n", name, end);
+ exit(1);
+ }
+ else if(end <= start) {
+ fprintf(stderr, "Error: end <= start [%s,%u,%u]\n", name, start, end);
+ exit(1);
+ }
+ else if(0 != strcmp("*", bases) && (end - start) != strlen(bases)) {
+ fprintf(stderr, "Error: bases did not match start and end [%s,%u,%u,%s]\n", name, start, end, bases);
+ exit(1);
+ }
+ else if(prev_contig == i && start+1 <= max_end) {
+ fprintf(stderr, "Warning: overlapping entries, ignoring entry [%s\t%u\t%u\t%s\t%s]\n", name, start, end, bases, type);
+ continue;
+ }
+
+ if(prev_contig != i || max_end < end) {
+ prev_contig = i;
+ max_end = end;
+ }
+
+ if(m->n == m->mem) {
+ m->mem <<= 1;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_bed_t));
+ }
+ m->muts[m->n].contig = i;
+ m->muts[m->n].start = start;
+ m->muts[m->n].end = end; // one-based
+
+ m->muts[m->n].type = get_muttype(type);
+ switch(m->muts[m->n].type) {
+ case SUBSTITUTE:
+ break;
+ case INSERT:
+ if(ins_length_max < end - start) {
+ fprintf(stderr, "Error: insertion of length %d exceeded the maximum supported length of %d\n",
+ end - start,
+ (int32_t)ins_length_max);
+ exit(1);
+ }
+ break;
+ case DELETE:
+ break;
+ default:
+ // error
+ fprintf(stderr, "Error: mutation type unrecognized [%s]\n", type);
+ exit(1);
+ }
+ m->muts[m->n].bases = strdup(bases);
+ m->n++;
+ }
+ if(m->n < m->mem) {
+ m->mem = m->n;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_bed_t));
+ }
+
+ return m;
+}
+
+void muts_bed_destroy(muts_bed_t *m)
+{
+ int32_t i;
+ for(i=0;i<m->n;i++) {
+ free(m->muts[i].bases);
+ }
+ free(m->muts);
+ free(m);
+}
diff --git a/src/mut_bed.h b/src/mut_bed.h
new file mode 100644
index 0000000..772e714
--- /dev/null
+++ b/src/mut_bed.h
@@ -0,0 +1,47 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#ifndef MUT_BED_H
+#define MUT_BED_H
+
+typedef struct {
+ uint32_t contig; // zero-based
+ uint32_t start; // zero-based
+ uint32_t end; // zero-based end position
+ uint8_t type; // mutation type
+ char *bases; // mutation bases
+} mut_bed_t;
+
+typedef struct {
+ mut_bed_t *muts;
+ int32_t n;
+ int32_t mem;
+} muts_bed_t;
+
+muts_bed_t *muts_bed_init(FILE *fp, contigs_t *c);
+
+void muts_bed_destroy(muts_bed_t *m);
+
+#endif
diff --git a/src/mut_input.c b/src/mut_input.c
new file mode 100644
index 0000000..1e6c9ff
--- /dev/null
+++ b/src/mut_input.c
@@ -0,0 +1,81 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_txt.h"
+#include "mut_bed.h"
+#include "mut_vcf.h"
+#include "mut_input.h"
+
+muts_input_t *muts_input_init(FILE *fp, contigs_t *c, int32_t type)
+{
+ muts_input_t *m = calloc(1, sizeof(muts_input_t));
+ m->type = type;
+ switch(type) {
+ case MUT_INPUT_BED:
+ m->data.bed = muts_bed_init(fp, c);
+ break;
+ case MUT_INPUT_TXT:
+ m->data.txt = muts_txt_init(fp, c);
+ break;
+ case MUT_INPUT_VCF:
+ m->data.vcf = muts_vcf_init(fp, c);
+ break;
+ default:
+ fprintf(stderr, "Error: mutation input type unrecognized!\n");
+ exit(1);
+ }
+ return m;
+}
+
+void muts_input_destroy(muts_input_t *m)
+{
+ switch(m->type) {
+ case MUT_INPUT_BED:
+ muts_bed_destroy(m->data.bed);
+ break;
+ case MUT_INPUT_TXT:
+ muts_txt_destroy(m->data.txt);
+ break;
+ case MUT_INPUT_VCF:
+ muts_vcf_destroy(m->data.vcf);
+ break;
+ default:
+ fprintf(stderr, "Error: mutation input type unrecognized!\n");
+ exit(1);
+ }
+ free(m);
+}
diff --git a/src/mut_input.h b/src/mut_input.h
new file mode 100644
index 0000000..2ae9e7f
--- /dev/null
+++ b/src/mut_input.h
@@ -0,0 +1,52 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#ifndef MUT_INPUT_H
+#define MUT_INPUT_H
+
+#include "mut_bed.h"
+#include "mut_txt.h"
+#include "mut_vcf.h"
+
+enum {
+ MUT_INPUT_BED = 0,
+ MUT_INPUT_TXT = 1,
+ MUT_INPUT_VCF = 2
+};
+
+typedef struct {
+ int32_t type;
+ union {
+ muts_bed_t *bed;
+ muts_txt_t *txt;
+ muts_vcf_t *vcf;
+ } data;
+} muts_input_t;
+
+muts_input_t *muts_input_init(FILE *fp, contigs_t *c, int32_t type);
+
+void muts_input_destroy(muts_input_t *m);
+
+#endif
diff --git a/src/mut_txt.c b/src/mut_txt.c
new file mode 100644
index 0000000..5019563
--- /dev/null
+++ b/src/mut_txt.c
@@ -0,0 +1,127 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_txt.h"
+
+muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c)
+{
+ muts_txt_t *m = NULL;
+ int32_t i;
+ char name[1024];
+ uint32_t pos, prev_pos = 0, is_hap;
+ char ref, mut[1024];
+
+ m = calloc(1, sizeof(muts_txt_t));
+ m->n = 0;
+ m->mem = 1024;
+ m->muts = malloc(m->mem * sizeof(mut_txt_t));
+
+ i = 0;
+ while(0 < fscanf(fp, "%s\t%u\t%c\t%s\t%d", name, &pos, &ref, mut, &is_hap)) {
+ // find the contig
+ while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+ i++;
+ prev_pos = 0;
+ }
+ if(c->n == i) {
+ fprintf(stderr, "Error: contig not found [%s]\n", name);
+ exit(1);
+ }
+ else if(pos <= 0 || c->contigs[i].len < pos) {
+ fprintf(stderr, "Error: start out of range [%s,%u]\n", name, pos);
+ exit(1);
+ }
+ else if(pos < prev_pos) {
+ fprintf(stderr, "Error: out of order [%s,%u]\n", name, pos);
+ exit(1);
+ }
+
+ if(m->n == m->mem) {
+ m->mem <<= 1;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_txt_t));
+ }
+ m->muts[m->n].contig = i;
+ m->muts[m->n].pos = pos;
+ m->muts[m->n].bases = strdup(mut);
+
+ if('-' == ref && '-' != mut[0]) {
+ m->muts[m->n].type = INSERT;
+ }
+ else if('-' != ref && '-' == mut[0]) {
+ m->muts[m->n].type = DELETE;
+ }
+ else if('-' != ref && '-' != mut[0]) {
+ m->muts[m->n].type = SUBSTITUTE;
+ if(is_hap < 3) { // heterozygous
+ if(nst_nt4_table[(int)m->muts[m->n].bases[0]] < 4) {
+ fprintf(stderr, "Error: heterozygous bases must be in IUPAC form\n");
+ exit(1);
+ }
+ m->muts[m->n].bases[0] = iupac_and_base_to_mut(m->muts[m->n].bases[0], ref);
+ if('X' == m->muts[m->n].bases[0]) {
+ fprintf(stderr, "Error: out of range\n");
+ exit(1);
+ }
+ m->muts[m->n].bases[1] = '\0';
+ }
+ }
+ else {
+ fprintf(stderr, "Error: out of range\n");
+ exit(1);
+ }
+
+ m->muts[m->n].is_hap = is_hap;
+
+ m->n++;
+ }
+ if(m->n < m->mem) {
+ m->mem = m->n;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_txt_t));
+ }
+
+ return m;
+}
+
+void muts_txt_destroy(muts_txt_t *m)
+{
+ int32_t i;
+ for(i=0;i<m->n;i++) {
+ free(m->muts[i].bases);
+ }
+ free(m->muts);
+ free(m);
+}
diff --git a/src/mut_txt.h b/src/mut_txt.h
new file mode 100644
index 0000000..0b7ab3c
--- /dev/null
+++ b/src/mut_txt.h
@@ -0,0 +1,47 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#ifndef MUT_TXT_H
+#define MUT_TXT_H
+
+typedef struct {
+ uint32_t contig;
+ uint32_t pos;
+ uint8_t type;
+ uint8_t is_hap;
+ char *bases;
+} mut_txt_t;
+
+typedef struct {
+ mut_txt_t *muts;
+ int32_t n;
+ int32_t mem;
+} muts_txt_t;
+
+muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c);
+
+void muts_txt_destroy(muts_txt_t *m);
+
+#endif
diff --git a/src/mut_vcf.c b/src/mut_vcf.c
new file mode 100644
index 0000000..4ff9159
--- /dev/null
+++ b/src/mut_vcf.c
@@ -0,0 +1,267 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "mut.h"
+#include "dwgsim.h"
+#include "mut_vcf.h"
+
+#define BUFFER_L 1048576
+
+muts_vcf_t *muts_vcf_init(FILE *fp, contigs_t *c)
+{
+ static int32_t warned = 0;
+ muts_vcf_t *m = NULL;
+ int32_t i, j;
+ char name[1024]; // 1. #CHROM
+ uint32_t pos; // 2. POS
+ char id[1024]; // 3. ID
+ char ref[1024]; // 4. REF
+ char alt[1025]; // 5. ALT
+ size_t n_read = -1;
+ uint32_t prev_pos = 0, is_hap;
+ int32_t ref_l, alt_l;
+ char buffer[BUFFER_L];
+ int32_t s, n, len;
+
+ m = calloc(1, sizeof(muts_vcf_t));
+ m->n = 0;
+ m->mem = 1024;
+ m->muts = malloc(m->mem * sizeof(mut_vcf_t));
+
+ // read in some buffer
+ n_read = fread(buffer, sizeof(char), BUFFER_L, fp);
+
+ i = s = n = 0;
+ while(0 != s || 0 != n_read) {
+ len = n_read + s;
+ s = 0;
+ while(s < len) { // while more characters in the buffer
+ // find the next newline
+ for(n=s;n<len;n++) {
+ if('\n' == buffer[n] || '\r' == buffer[n]) break;
+ }
+ if(n == len && 0 < n_read) { // there are more characters to read
+ break;
+ }
+ else if(s == n) { // newline found, skip
+ s++;
+ continue;
+ }
+ else if(buffer[s] == '#') { // skip headers
+ s = n;
+ continue;
+ }
+
+ // process
+ if(EOF == sscanf(buffer+s, "%s\t%u\t%s\t%s\t%s", name, &pos, id, ref, alt)) {
+ fprintf(stderr, "Error: VCF parsing error\n");
+ exit(1);
+ }
+ // find ploidy in the "pl" (lowercase) tag
+ is_hap = 4;
+ while(s+4 < n) { // [\t;]pl=[1-3]
+ if(('\t' == buffer[s] || ';' == buffer[s]) && 'p' == buffer[s+1] && 'l' == buffer[s+2] && '=' == buffer[s+3]) {
+ s += 4; // skip over '[\t;]pl='
+ switch(buffer[s]) {
+ case '1':
+ is_hap = 1;
+ break;
+ case '2':
+ is_hap = 2;
+ break;
+ case '3':
+ is_hap = 3;
+ break;
+ default:
+ fprintf(stderr, "Error: Could not determine the strand of the mutation from the 'pl' tag.\n");
+ exit(1);
+ break;
+ }
+ break;
+ }
+ s++;
+ }
+ if(4 == is_hap && 0 == warned) {
+ fprintf(stderr, "Warning: strand of the mutation not found; please use the 'pl' tag.\n");
+ warned = 1;
+ is_hap = 3;
+ }
+
+ s = n+1; // move past the next newline
+
+ // find the contig
+ while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+ i++;
+ prev_pos = 0;
+ }
+ if(c->n == i) {
+ fprintf(stderr, "Error: contig not found [%s]\n", name);
+ exit(1);
+ }
+ else if(pos <= 0 || c->contigs[i].len < pos) {
+ fprintf(stderr, "Error: start out of range [%s,%u]\n", name, pos);
+ exit(1);
+ }
+ else if(pos < prev_pos) {
+ fprintf(stderr, "Error: out of order [%s,%u]\n", name, pos);
+ exit(1);
+ }
+
+ ref_l = strlen(ref);
+ alt_l = strlen(alt);
+
+ if(1 == ref_l && ref[0] == '.') {
+ ref[0] = '\0';
+ ref_l = 0;
+ }
+ if(1 == alt_l && alt[0] == '.') {
+ alt[0] = '\0';
+ alt_l = 0;
+ }
+ if(0 == alt_l && 0 == ref_l) {
+ fprintf(stderr, "Error: empty alleles\n");
+ exit(1);
+ }
+
+ // TODO: support multiple alleles
+ for(j=0;j<alt_l;j++) {
+ if(',' == alt[j]) {
+ fprintf(stderr, "Error: multiple alleles are not supported\n");
+ exit(1);
+ }
+ }
+
+ // to upper, and check for non-ACGT bases
+ for(j=0;j<ref_l;j++) {
+ ref[j] = "ACGTN"[nst_nt4_table[(int)ref[j]]];
+ if('N' == ref[j]) {
+ fprintf(stderr, "Error: non-ACGT base found\n");
+ exit(1);
+ }
+ }
+ for(j=0;j<alt_l;j++) {
+ alt[j] = "ACGTN"[nst_nt4_table[(int)alt[j]]];
+ if('N' == alt[j]) {
+ fprintf(stderr, "Error: non-ACGT base found\n");
+ exit(1);
+ }
+ }
+
+ if(ref_l == alt_l) { // SNP
+ while(m->mem <= m->n + ref_l - 1) {
+ m->mem <<= 1;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t));
+ }
+ for(j=0;j<ref_l;j++) { // multiple snps
+ m->muts[m->n].contig = i;
+ m->muts[m->n].pos = pos + j;
+ m->muts[m->n].type = SUBSTITUTE;
+ m->muts[m->n].is_hap = is_hap;
+ m->muts[m->n].bases = malloc(sizeof(char) * 2);
+ m->muts[m->n].bases[0] = alt[j];
+ m->muts[m->n].bases[1] = '\0';
+ m->n++;
+ }
+ }
+ else if(ref_l < alt_l) { // INS
+ if(m->n == m->mem) {
+ m->mem <<= 1;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t));
+ }
+ // skip over match bases
+ for(j=0;j<ref_l;j++,pos++) {
+ if(ref[j] != alt[j]) break;
+ }
+ //pos--; // make 0-based for INS
+ m->muts[m->n].contig = i;
+ m->muts[m->n].pos = pos;
+ m->muts[m->n].type = INSERT;
+ m->muts[m->n].is_hap = is_hap;
+ m->muts[m->n].bases = strdup(alt+j);
+ m->n++;
+ }
+ else { // DEL
+ // skip over match bases
+ for(j=0;j<alt_l;j++,pos++) {
+ if(ref[j] != alt[j]) break;
+ }
+ if(j == ref_l) {
+ fprintf(stderr, "Error: no deleted bases\n");
+ exit(1);
+ }
+ while(m->mem < m->n + (ref_l - j)) {
+ m->mem <<= 1;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t));
+ }
+ for(;j<ref_l;j++,pos++) {
+ m->muts[m->n].contig = i;
+ m->muts[m->n].pos = pos;
+ m->muts[m->n].type = DELETE;
+ m->muts[m->n].is_hap = is_hap;
+ m->muts[m->n].bases = NULL;
+ m->n++;
+ }
+ }
+
+ prev_pos = pos;
+
+ // move to 'n'
+ s = n;
+ }
+ n++; // move past last character
+ // shift unused characters down
+ for(s = 0; n < len; n++, s++) {
+ buffer[s] = buffer[n];
+ }
+ // read in some buffer
+ n_read = fread(buffer + s, sizeof(char), BUFFER_L - s, fp);
+ }
+ if(m->n < m->mem) {
+ m->mem = m->n;
+ m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t));
+ }
+
+ return m;
+}
+
+void muts_vcf_destroy(muts_vcf_t *m)
+{
+ int32_t i;
+ for(i=0;i<m->n;i++) {
+ free(m->muts[i].bases);
+ }
+ free(m->muts);
+ free(m);
+}
diff --git a/src/mut_vcf.h b/src/mut_vcf.h
new file mode 100644
index 0000000..fe8ce76
--- /dev/null
+++ b/src/mut_vcf.h
@@ -0,0 +1,39 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#ifndef MUT_VCF_H
+#define MUT_VCF_H
+
+#include "mut_txt.h"
+
+typedef muts_txt_t muts_vcf_t;
+typedef mut_txt_t mut_vcf_t;
+
+
+muts_vcf_t *muts_vcf_init(FILE *fp, contigs_t *c);
+
+void muts_vcf_destroy(muts_vcf_t *m);
+
+#endif
diff --git a/src/regions_bed.c b/src/regions_bed.c
new file mode 100644
index 0000000..0b70117
--- /dev/null
+++ b/src/regions_bed.c
@@ -0,0 +1,148 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include "contigs.h"
+#include "regions_bed.h"
+
+regions_bed_txt *regions_bed_init(FILE *fp, contigs_t *c)
+{
+ regions_bed_txt *r = NULL;
+ char name[1024];
+ uint32_t start, end, len;
+ int32_t i, prev_contig, prev_start, prev_end, b;
+
+ r = calloc(1, sizeof(regions_bed_txt));
+ r->n = 0;
+ r->mem = 4;
+ r->contig = malloc(r->mem * sizeof(uint32_t));
+ r->start = malloc(r->mem * sizeof(uint32_t));
+ r->end = malloc(r->mem * sizeof(uint32_t));
+
+ i = 0;
+ prev_contig = prev_start = prev_end = -1;
+ while(0 < fscanf(fp, "%s\t%u\t%u", name, &start, &end)) {
+ len = end - start + 1;
+ // find the contig
+ while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) {
+ i++;
+ }
+ if(c->n == i) {
+ fprintf(stderr, "Error: contig not found [%s]\n", name);
+ exit(1);
+ }
+ else if(c->contigs[i].len < start) {
+ fprintf(stderr, "Error: start out of range [%s,%u]\n", name, start);
+ exit(1);
+ }
+ else if(c->contigs[i].len < end) {
+ fprintf(stderr, "Error: end out of range [%s,%u]\n", name, end);
+ exit(1);
+ }
+ else if(end < start) {
+ fprintf(stderr, "Error: end < start [%s,%u,%u]\n", name, start, end);
+ exit(1);
+ }
+ else if(prev_contig == i && start < prev_start) {
+ fprintf(stderr, "Error: the input was not sorted [%s,%u,%u,%u]\n", name, start, end, len);
+ exit(1);
+ }
+
+ if(end - start + 1 != len) {
+ fprintf(stderr, "Warning: len != end - start + 1 [%s,%u,%u,%u]\n", name, start, end, len);
+ }
+
+ if(prev_contig == i && start <= prev_end && prev_start <= start) {
+ if(prev_end < end) {
+ r->end[r->n-1] = end;
+ prev_end = end;
+ }
+ }
+ else {
+ prev_contig = i;
+ prev_start = start;
+ prev_end = end;
+ while(r->mem <= r->n) {
+ r->mem <<= 1;
+ r->contig = realloc(r->contig, r->mem * sizeof(uint32_t));
+ r->start = realloc(r->start, r->mem * sizeof(uint32_t));
+ r->end = realloc(r->end, r->mem * sizeof(uint32_t));
+ }
+ r->contig[r->n] = i;
+ r->start[r->n] = start;
+ r->end[r->n] = end;
+ r->n++;
+ }
+ // move to the end of the line
+ while(EOF != (b = fgetc(fp))) {
+ if('\n' == b || '\r' == b) break;
+ }
+ }
+ return r;
+}
+
+void regions_bed_destroy(regions_bed_txt *r)
+{
+ free(r->contig);
+ free(r->start);
+ free(r->end);
+ free(r);
+}
+
+int32_t regions_bed_query(regions_bed_txt *r, uint32_t contig, uint32_t start, uint32_t end)
+{
+ int32_t low, high, mid;
+ if(NULL == r) return 1;
+
+ low = 0;
+ high = r->n-1;
+
+ while(low <= high) {
+ mid = (low + high) / 2;
+ if(contig < r->contig[mid] ||
+ (contig == r->contig[mid] && start < r->start[mid])) {
+ high = mid - 1;
+ }
+ else if(r->contig[mid] < contig ||
+ (r->contig[mid] == contig && r->end[mid] < end)) {
+ low = mid + 1;
+ }
+ else if(r->contig[mid] == contig && r->start[mid] <= start && end <= r->end[mid]) {
+ return 1;
+ }
+ else {
+ break;
+ }
+ }
+ return 0;
+}
diff --git a/src/regions_bed.h b/src/regions_bed.h
new file mode 100644
index 0000000..9243bb9
--- /dev/null
+++ b/src/regions_bed.h
@@ -0,0 +1,21 @@
+#ifndef REGIONS_BED_H
+#define REGIONS_BED_H
+
+typedef struct {
+ uint32_t *contig;
+ uint32_t *start;
+ uint32_t *end;
+ uint32_t n;
+ uint32_t mem;
+} regions_bed_txt;
+
+regions_bed_txt *
+regions_bed_init(FILE *fp, contigs_t *c);
+
+void
+regions_bed_destroy(regions_bed_txt *r);
+
+int32_t
+regions_bed_query(regions_bed_txt *r, uint32_t contig, uint32_t start, uint32_t end);
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dwgsim.git
More information about the debian-med-commit
mailing list