[med-svn] [stacks] 01/05: Imported Upstream version 1.38
Andreas Tille
tille at debian.org
Tue Apr 19 09:03:01 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository stacks.
commit 83ef6c396fa114322b8ddcf12e7a60e623c439a2
Author: Andreas Tille <tille at debian.org>
Date: Tue Apr 19 10:43:14 2016 +0200
Imported Upstream version 1.38
---
ChangeLog | 24 +
Makefile.am | 10 +-
Makefile.in | 195 +-
aclocal.m4 | 65 +-
config/compile | 2 +-
config/depcomp | 2 +-
config/install-sh | 366 +-
config/missing | 2 +-
config/test-driver | 15 +-
configure | 33 +-
configure.ac | 2 +-
php/CatalogClass.php | 8 +-
php/catalog.php | 5 +-
php/export_batch.php | 6 +
scripts/denovo_map.pl | 1017 +++--
scripts/ref_map.pl | 972 +++--
sql/stacks.sql | 1 +
src/DNANSeq.cc | 2 +-
src/DNANSeq.h | 4 +-
src/GappedAln.h | 594 +++
src/PopMap.h | 254 +-
src/PopSum.h | 1113 ++---
src/SamI.h | 31 +-
src/aln_utils.cc | 216 +
src/aln_utils.h | 42 +
src/catalog_utils.cc | 378 +-
src/constants.h | 4 +-
src/cstacks.cc | 642 ++-
src/cstacks.h | 7 +-
src/kmers.cc | 273 +-
src/kmers.h | 25 +-
src/locus.cc | 62 +-
src/locus.h | 42 +-
src/mstack.cc | 88 +-
src/mstack.h | 18 +-
src/populations.cc | 11098 +++++++++++++++++++++++------------------------
src/process_radtags.cc | 4 +-
src/pstacks.cc | 30 +-
src/sql_utilities.h | 660 +--
src/sstacks.cc | 496 ++-
src/sstacks.h | 32 +-
src/stacks.cc | 28 +-
src/stacks.h | 163 +-
src/ustacks.cc | 2810 +++++++-----
src/ustacks.h | 43 +-
src/utils.cc | 8 +
src/utils.h | 2 +
47 files changed, 12550 insertions(+), 9344 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index dbd196d..2614b51 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+Stacks 1.38 - April 18, 2016
+----------------------------
+ Feature: denovo_map.pl and ref_map.pl now print depth of coverage for each sample. The ustacks
+ program now prints depth of coverage after each algorithm stage to see how each stage improves
+ (or not) the depth of coverage.
+ Feature: complete refactoring of denovo_map.pl and ref_map.pl. Separated computation from
+ SQL loading. Added auto creation/deletion of database. Enabled samples to be read from population
+ map instead of specifying them on the command line.
+ Feature: added Needleman–Wunsch algorithm to ustacks, cstacks, sstacks to provide for gapped
+ alignments. Includes --max_gaps and --min_aln_len parameters to contain crazy
+ alignments. sstacks now includes a CIGAR string describing the alignment to the catalog.
+ Feature: optimized ustacks for a 33% decrease in run time.
+ Feature: added new file, sample_X.models.tsv.gz, produced by ustacks and pstacks. Contains a subset
+ of the information in the sample_X.tags.tsv.gz file, allows for data to be loaded much faster in the
+ later stages of the pipeline, greatly speeding up run times.
+ Bugfix: added code to prevent populations from improperly reading SNP positions past the length of
+ a particular locus (that is shorter than the catalog locus).
+ Bugfix: corrected bug in process_radtags when using inline barcodes on paired-end reads. The paired-
+ end reads were not being truncated uniformly.
+ Bugfix: corrected bug in populations where if enough empty files were fed into the program
+ it could place files in the wrong population or segfault.
+ Bugfix: corrected PHP files for exporting to include LnL filter.
+ Bugfix: corrected mappable markers filter in web interface.
+
Stacks 1.37 - Feb 24, 2016
--------------------------
Feature: converted PHP database code from MDB2 to MySQLi. MDB2 is no longer a
diff --git a/Makefile.am b/Makefile.am
index f3f958e..00c7114 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,7 +5,7 @@ ustacks_SOURCES = src/ustacks.h src/ustacks.cc src/stacks.h src/stacks.cc src/co
src/mstack.h src/mstack.cc src/mst.h src/mst.cc src/cmb.h src/cmb.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/models.h src/models.cc src/utils.h src/utils.cc \
- src/kmers.h src/kmers.cc \
+ src/kmers.h src/kmers.cc src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/input.h src/input.cc src/Tsv.h src/BowtieI.h src/FastaI.h src/FastqI.h src/SamI.h \
src/gzFasta.h src/gzFastq.h
@@ -25,6 +25,7 @@ estacks_SOURCES = src/estacks.h src/estacks.cc src/stacks.h src/stacks.cc src/co
cstacks_SOURCES = src/cstacks.h src/cstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/kmers.h src/kmers.cc src/utils.h src/utils.cc \
+ src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/input.h src/input.cc src/sql_utilities.h
@@ -37,6 +38,8 @@ hstacks_SOURCES = src/hstacks.h src/hstacks.cc src/constants.h \
sstacks_SOURCES = src/sstacks.h src/sstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
+ src/kmers.h src/kmers.cc \
+ src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc src/utils.h src/utils.cc \
src/input.h src/input.cc src/sql_utilities.h
@@ -45,7 +48,7 @@ rxstacks_SOURCES = src/rxstacks.h src/rxstacks.cc src/constants.h \
src/DNANSeq.h src/DNANSeq.cc src/DNASeq.h src/DNASeq.cc \
src/mst.h src/mst.cc \
src/models.h src/models.cc \
- src/utils.h src/utils.cc \
+ src/aln_utils.h src/aln_utils.cc src/utils.h src/utils.cc \
src/input.h src/input.cc src/sql_utilities.h
process_radtags_SOURCES = src/process_radtags.h src/process_radtags.cc src/constants.h \
@@ -84,6 +87,7 @@ genotypes_SOURCES = src/genotypes.h src/genotypes.cc src/constants.h \
src/log_utils.h src/log_utils.cc \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
+ src/aln_utils.h src/aln_utils.cc \
src/PopMap.h src/genotype_dictionaries.h \
src/input.h src/input.cc src/sql_utilities.h src/renz.h
@@ -92,7 +96,7 @@ populations_SOURCES = src/populations.h src/populations.cc src/constants.h \
src/log_utils.h src/log_utils.cc \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
- src/PopMap.h src/PopSum.h src/genotype_dictionaries.h \
+ src/PopMap.h src/PopSum.h src/aln_utils.h src/aln_utils.cc src/genotype_dictionaries.h \
src/input.h src/input.cc src/sql_utilities.h src/renz.h \
src/bootstrap.h src/ordered.h src/smoothing.h src/smoothing_utils.h
diff --git a/Makefile.in b/Makefile.in
index b585d3d..a3958c0 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# Makefile.in generated by automake 1.15 from Makefile.am.
# @configure_input@
-# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -17,7 +17,17 @@
VPATH = @srcdir@
-am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__is_gnu_make = { \
+ if test -z '$(MAKELEVEL)'; then \
+ false; \
+ elif test -n '$(MAKE_HOST)'; then \
+ true; \
+ elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+ true; \
+ else \
+ false; \
+ fi; \
+}
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
@@ -85,19 +95,14 @@ bin_PROGRAMS = ustacks$(EXEEXT) pstacks$(EXEEXT) estacks$(EXEEXT) \
clone_filter$(EXEEXT) genotypes$(EXEEXT) populations$(EXEEXT) \
phasedstacks$(EXEEXT)
subdir = .
-DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
- $(top_srcdir)/configure $(am__configure_deps) \
- $(srcdir)/config.h.in $(dist_bin_SCRIPTS) \
- $(dist_noinst_SCRIPTS) $(top_srcdir)/config/depcomp \
- $(top_srcdir)/config/test-driver ChangeLog INSTALL README \
- config/compile config/depcomp config/install-sh config/missing \
- $(top_srcdir)/config/compile $(top_srcdir)/config/install-sh \
- $(top_srcdir)/config/missing
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
+ $(am__configure_deps) $(dist_bin_SCRIPTS) \
+ $(dist_noinst_SCRIPTS) $(am__DIST_COMMON)
am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
configure.lineno config.status.lineno
mkinstalldirs = $(install_sh) -d
@@ -123,8 +128,8 @@ clone_filter_LINK = $(CXXLD) $(clone_filter_CXXFLAGS) $(CXXFLAGS) \
am_cstacks_OBJECTS = src/cstacks-cstacks.$(OBJEXT) \
src/cstacks-stacks.$(OBJEXT) src/cstacks-locus.$(OBJEXT) \
src/cstacks-kmers.$(OBJEXT) src/cstacks-utils.$(OBJEXT) \
- src/cstacks-DNASeq.$(OBJEXT) src/cstacks-DNANSeq.$(OBJEXT) \
- src/cstacks-input.$(OBJEXT)
+ src/cstacks-aln_utils.$(OBJEXT) src/cstacks-DNASeq.$(OBJEXT) \
+ src/cstacks-DNANSeq.$(OBJEXT) src/cstacks-input.$(OBJEXT)
cstacks_OBJECTS = $(am_cstacks_OBJECTS)
cstacks_LDADD = $(LDADD)
cstacks_LINK = $(CXXLD) $(cstacks_CXXFLAGS) $(CXXFLAGS) \
@@ -144,6 +149,7 @@ am_genotypes_OBJECTS = src/genotypes-genotypes.$(OBJEXT) \
src/genotypes-log_utils.$(OBJEXT) \
src/genotypes-stacks.$(OBJEXT) src/genotypes-locus.$(OBJEXT) \
src/genotypes-DNASeq.$(OBJEXT) src/genotypes-DNANSeq.$(OBJEXT) \
+ src/genotypes-aln_utils.$(OBJEXT) \
src/genotypes-input.$(OBJEXT)
genotypes_OBJECTS = $(am_genotypes_OBJECTS)
genotypes_LDADD = $(LDADD)
@@ -185,6 +191,7 @@ am_populations_OBJECTS = src/populations-populations.$(OBJEXT) \
src/populations-locus.$(OBJEXT) \
src/populations-DNASeq.$(OBJEXT) \
src/populations-DNANSeq.$(OBJEXT) \
+ src/populations-aln_utils.$(OBJEXT) \
src/populations-input.$(OBJEXT)
populations_OBJECTS = $(am_populations_OBJECTS)
populations_LDADD = $(LDADD)
@@ -228,13 +235,15 @@ am_rxstacks_OBJECTS = src/rxstacks-rxstacks.$(OBJEXT) \
src/rxstacks-stacks.$(OBJEXT) src/rxstacks-locus.$(OBJEXT) \
src/rxstacks-DNANSeq.$(OBJEXT) src/rxstacks-DNASeq.$(OBJEXT) \
src/rxstacks-mst.$(OBJEXT) src/rxstacks-models.$(OBJEXT) \
- src/rxstacks-utils.$(OBJEXT) src/rxstacks-input.$(OBJEXT)
+ src/rxstacks-aln_utils.$(OBJEXT) src/rxstacks-utils.$(OBJEXT) \
+ src/rxstacks-input.$(OBJEXT)
rxstacks_OBJECTS = $(am_rxstacks_OBJECTS)
rxstacks_LDADD = $(LDADD)
rxstacks_LINK = $(CXXLD) $(rxstacks_CXXFLAGS) $(CXXFLAGS) \
$(rxstacks_LDFLAGS) $(LDFLAGS) -o $@
am_sstacks_OBJECTS = src/sstacks-sstacks.$(OBJEXT) \
src/sstacks-stacks.$(OBJEXT) src/sstacks-locus.$(OBJEXT) \
+ src/sstacks-kmers.$(OBJEXT) src/sstacks-aln_utils.$(OBJEXT) \
src/sstacks-DNASeq.$(OBJEXT) src/sstacks-DNANSeq.$(OBJEXT) \
src/sstacks-utils.$(OBJEXT) src/sstacks-input.$(OBJEXT)
sstacks_OBJECTS = $(am_sstacks_OBJECTS)
@@ -246,7 +255,8 @@ am_ustacks_OBJECTS = src/ustacks-ustacks.$(OBJEXT) \
src/ustacks-mst.$(OBJEXT) src/ustacks-cmb.$(OBJEXT) \
src/ustacks-DNASeq.$(OBJEXT) src/ustacks-DNANSeq.$(OBJEXT) \
src/ustacks-models.$(OBJEXT) src/ustacks-utils.$(OBJEXT) \
- src/ustacks-kmers.$(OBJEXT) src/ustacks-input.$(OBJEXT)
+ src/ustacks-kmers.$(OBJEXT) src/ustacks-aln_utils.$(OBJEXT) \
+ src/ustacks-input.$(OBJEXT)
ustacks_OBJECTS = $(am_ustacks_OBJECTS)
ustacks_LDADD = $(LDADD)
ustacks_LINK = $(CXXLD) $(ustacks_CXXFLAGS) $(CXXFLAGS) \
@@ -539,6 +549,11 @@ TEST_LOGS = $(am__test_logs2:.test.log=.log)
TEST_LOG_DRIVER = $(SHELL) $(top_srcdir)/config/test-driver
TEST_LOG_COMPILE = $(TEST_LOG_COMPILER) $(AM_TEST_LOG_FLAGS) \
$(TEST_LOG_FLAGS)
+am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/config.h.in \
+ $(top_srcdir)/config/compile $(top_srcdir)/config/depcomp \
+ $(top_srcdir)/config/install-sh $(top_srcdir)/config/missing \
+ $(top_srcdir)/config/test-driver ChangeLog INSTALL README \
+ config/compile config/depcomp config/install-sh config/missing
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
distdir = $(PACKAGE)-$(VERSION)
top_distdir = $(distdir)
@@ -656,7 +671,7 @@ ustacks_SOURCES = src/ustacks.h src/ustacks.cc src/stacks.h src/stacks.cc src/co
src/mstack.h src/mstack.cc src/mst.h src/mst.cc src/cmb.h src/cmb.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/models.h src/models.cc src/utils.h src/utils.cc \
- src/kmers.h src/kmers.cc \
+ src/kmers.h src/kmers.cc src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/input.h src/input.cc src/Tsv.h src/BowtieI.h src/FastaI.h src/FastqI.h src/SamI.h \
src/gzFasta.h src/gzFastq.h
@@ -676,6 +691,7 @@ estacks_SOURCES = src/estacks.h src/estacks.cc src/stacks.h src/stacks.cc src/co
cstacks_SOURCES = src/cstacks.h src/cstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/kmers.h src/kmers.cc src/utils.h src/utils.cc \
+ src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
src/input.h src/input.cc src/sql_utilities.h
@@ -688,6 +704,8 @@ hstacks_SOURCES = src/hstacks.h src/hstacks.cc src/constants.h \
sstacks_SOURCES = src/sstacks.h src/sstacks.cc src/constants.h \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
+ src/kmers.h src/kmers.cc \
+ src/GappedAln.h src/aln_utils.h src/aln_utils.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc src/utils.h src/utils.cc \
src/input.h src/input.cc src/sql_utilities.h
@@ -696,7 +714,7 @@ rxstacks_SOURCES = src/rxstacks.h src/rxstacks.cc src/constants.h \
src/DNANSeq.h src/DNANSeq.cc src/DNASeq.h src/DNASeq.cc \
src/mst.h src/mst.cc \
src/models.h src/models.cc \
- src/utils.h src/utils.cc \
+ src/aln_utils.h src/aln_utils.cc src/utils.h src/utils.cc \
src/input.h src/input.cc src/sql_utilities.h
process_radtags_SOURCES = src/process_radtags.h src/process_radtags.cc src/constants.h \
@@ -735,6 +753,7 @@ genotypes_SOURCES = src/genotypes.h src/genotypes.cc src/constants.h \
src/log_utils.h src/log_utils.cc \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
+ src/aln_utils.h src/aln_utils.cc \
src/PopMap.h src/genotype_dictionaries.h \
src/input.h src/input.cc src/sql_utilities.h src/renz.h
@@ -743,7 +762,7 @@ populations_SOURCES = src/populations.h src/populations.cc src/constants.h \
src/log_utils.h src/log_utils.cc \
src/stacks.h src/stacks.cc src/locus.h src/locus.cc \
src/DNASeq.h src/DNASeq.cc src/DNANSeq.h src/DNANSeq.cc \
- src/PopMap.h src/PopSum.h src/genotype_dictionaries.h \
+ src/PopMap.h src/PopSum.h src/aln_utils.h src/aln_utils.cc src/genotype_dictionaries.h \
src/input.h src/input.cc src/sql_utilities.h src/renz.h \
src/bootstrap.h src/ordered.h src/smoothing.h src/smoothing_utils.h
@@ -825,7 +844,6 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --foreign Makefile
-.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
@@ -935,6 +953,8 @@ src/cstacks-kmers.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/cstacks-utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/cstacks-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/cstacks-DNASeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/cstacks-DNANSeq.$(OBJEXT): src/$(am__dirstamp) \
@@ -981,6 +1001,8 @@ src/genotypes-DNASeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/genotypes-DNANSeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/genotypes-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/genotypes-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
@@ -1053,6 +1075,8 @@ src/populations-DNASeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/populations-DNANSeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/populations-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/populations-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
@@ -1131,6 +1155,8 @@ src/rxstacks-mst.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/rxstacks-models.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/rxstacks-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/rxstacks-utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/rxstacks-input.$(OBJEXT): src/$(am__dirstamp) \
@@ -1145,6 +1171,10 @@ src/sstacks-stacks.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/sstacks-locus.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/sstacks-kmers.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
+src/sstacks-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/sstacks-DNASeq.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/sstacks-DNANSeq.$(OBJEXT): src/$(am__dirstamp) \
@@ -1177,6 +1207,8 @@ src/ustacks-utils.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
src/ustacks-kmers.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
+src/ustacks-aln_utils.$(OBJEXT): src/$(am__dirstamp) \
+ src/$(DEPDIR)/$(am__dirstamp)
src/ustacks-input.$(OBJEXT): src/$(am__dirstamp) \
src/$(DEPDIR)/$(am__dirstamp)
@@ -1235,6 +1267,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/clone_filter-write.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/cstacks-DNANSeq.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/cstacks-DNASeq.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/cstacks-aln_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/cstacks-cstacks.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/cstacks-input.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/cstacks-kmers.Po at am__quote@
@@ -1251,6 +1284,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/estacks-utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/genotypes-DNANSeq.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/genotypes-DNASeq.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/genotypes-aln_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/genotypes-catalog_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/genotypes-genotypes.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/genotypes-input.Po at am__quote@
@@ -1279,6 +1313,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/phasedstacks-utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/populations-DNANSeq.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/populations-DNASeq.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/populations-aln_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/populations-catalog_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/populations-input.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/populations-locus.Po at am__quote@
@@ -1311,6 +1346,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/pstacks-utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/rxstacks-DNANSeq.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/rxstacks-DNASeq.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/rxstacks-aln_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/rxstacks-input.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/rxstacks-locus.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/rxstacks-models.Po at am__quote@
@@ -1320,13 +1356,16 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/rxstacks-utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-DNANSeq.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-DNASeq.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-aln_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-input.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-kmers.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-locus.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-sstacks.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-stacks.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/sstacks-utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/ustacks-DNANSeq.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/ustacks-DNASeq.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/ustacks-aln_utils.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/ustacks-cmb.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/ustacks-input.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at src/$(DEPDIR)/ustacks-kmers.Po at am__quote@
@@ -1521,6 +1560,20 @@ src/cstacks-utils.obj: src/utils.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/cstacks-utils.obj `if test -f 'src/utils.cc'; then $(CYGPATH_W) 'src/utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/utils.cc'; fi`
+src/cstacks-aln_utils.o: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -MT src/cstacks-aln_utils.o -MD -MP -MF src/$(DEPDIR)/cstacks-aln_utils.Tpo -c -o src/cstacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/cstacks-aln_utils.Tpo src/$(DEPDIR)/cstacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/cstacks-aln_utils.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/cstacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+
+src/cstacks-aln_utils.obj: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -MT src/cstacks-aln_utils.obj -MD -MP -MF src/$(DEPDIR)/cstacks-aln_utils.Tpo -c -o src/cstacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/cstacks-aln_utils.Tpo src/$(DEPDIR)/cstacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/cstacks-aln_utils.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/cstacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+
src/cstacks-DNASeq.o: src/DNASeq.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(cstacks_CXXFLAGS) $(CXXFLAGS) -MT src/cstacks-DNASeq.o -MD -MP -MF src/$(DEPDIR)/cstacks-DNASeq.Tpo -c -o src/cstacks-DNASeq.o `test -f 'src/DNASeq.cc' || echo '$(srcdir)/'`src/DNASeq.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/cstacks-DNASeq.Tpo src/$(DEPDIR)/cstacks-DNASeq.Po
@@ -1787,6 +1840,20 @@ src/genotypes-DNANSeq.obj: src/DNANSeq.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-DNANSeq.obj `if test -f 'src/DNANSeq.cc'; then $(CYGPATH_W) 'src/DNANSeq.cc'; else $(CYGPATH_W) '$(srcdir)/src/DNANSeq.cc'; fi`
+src/genotypes-aln_utils.o: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-aln_utils.o -MD -MP -MF src/$(DEPDIR)/genotypes-aln_utils.Tpo -c -o src/genotypes-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-aln_utils.Tpo src/$(DEPDIR)/genotypes-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/genotypes-aln_utils.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+
+src/genotypes-aln_utils.obj: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-aln_utils.obj -MD -MP -MF src/$(DEPDIR)/genotypes-aln_utils.Tpo -c -o src/genotypes-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-aln_utils.Tpo src/$(DEPDIR)/genotypes-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/genotypes-aln_utils.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -c -o src/genotypes-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+
src/genotypes-input.o: src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(genotypes_CXXFLAGS) $(CXXFLAGS) -MT src/genotypes-input.o -MD -MP -MF src/$(DEPDIR)/genotypes-input.Tpo -c -o src/genotypes-input.o `test -f 'src/input.cc' || echo '$(srcdir)/'`src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/genotypes-input.Tpo src/$(DEPDIR)/genotypes-input.Po
@@ -2179,6 +2246,20 @@ src/populations-DNANSeq.obj: src/DNANSeq.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-DNANSeq.obj `if test -f 'src/DNANSeq.cc'; then $(CYGPATH_W) 'src/DNANSeq.cc'; else $(CYGPATH_W) '$(srcdir)/src/DNANSeq.cc'; fi`
+src/populations-aln_utils.o: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-aln_utils.o -MD -MP -MF src/$(DEPDIR)/populations-aln_utils.Tpo -c -o src/populations-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-aln_utils.Tpo src/$(DEPDIR)/populations-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/populations-aln_utils.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+
+src/populations-aln_utils.obj: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-aln_utils.obj -MD -MP -MF src/$(DEPDIR)/populations-aln_utils.Tpo -c -o src/populations-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-aln_utils.Tpo src/$(DEPDIR)/populations-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/populations-aln_utils.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -c -o src/populations-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+
src/populations-input.o: src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(populations_CXXFLAGS) $(CXXFLAGS) -MT src/populations-input.o -MD -MP -MF src/$(DEPDIR)/populations-input.Tpo -c -o src/populations-input.o `test -f 'src/input.cc' || echo '$(srcdir)/'`src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/populations-input.Tpo src/$(DEPDIR)/populations-input.Po
@@ -2613,6 +2694,20 @@ src/rxstacks-models.obj: src/models.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-models.obj `if test -f 'src/models.cc'; then $(CYGPATH_W) 'src/models.cc'; else $(CYGPATH_W) '$(srcdir)/src/models.cc'; fi`
+src/rxstacks-aln_utils.o: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -MT src/rxstacks-aln_utils.o -MD -MP -MF src/$(DEPDIR)/rxstacks-aln_utils.Tpo -c -o src/rxstacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/rxstacks-aln_utils.Tpo src/$(DEPDIR)/rxstacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/rxstacks-aln_utils.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+
+src/rxstacks-aln_utils.obj: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -MT src/rxstacks-aln_utils.obj -MD -MP -MF src/$(DEPDIR)/rxstacks-aln_utils.Tpo -c -o src/rxstacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/rxstacks-aln_utils.Tpo src/$(DEPDIR)/rxstacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/rxstacks-aln_utils.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/rxstacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+
src/rxstacks-utils.o: src/utils.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rxstacks_CXXFLAGS) $(CXXFLAGS) -MT src/rxstacks-utils.o -MD -MP -MF src/$(DEPDIR)/rxstacks-utils.Tpo -c -o src/rxstacks-utils.o `test -f 'src/utils.cc' || echo '$(srcdir)/'`src/utils.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/rxstacks-utils.Tpo src/$(DEPDIR)/rxstacks-utils.Po
@@ -2683,6 +2778,34 @@ src/sstacks-locus.obj: src/locus.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-locus.obj `if test -f 'src/locus.cc'; then $(CYGPATH_W) 'src/locus.cc'; else $(CYGPATH_W) '$(srcdir)/src/locus.cc'; fi`
+src/sstacks-kmers.o: src/kmers.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-kmers.o -MD -MP -MF src/$(DEPDIR)/sstacks-kmers.Tpo -c -o src/sstacks-kmers.o `test -f 'src/kmers.cc' || echo '$(srcdir)/'`src/kmers.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-kmers.Tpo src/$(DEPDIR)/sstacks-kmers.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/kmers.cc' object='src/sstacks-kmers.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-kmers.o `test -f 'src/kmers.cc' || echo '$(srcdir)/'`src/kmers.cc
+
+src/sstacks-kmers.obj: src/kmers.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-kmers.obj -MD -MP -MF src/$(DEPDIR)/sstacks-kmers.Tpo -c -o src/sstacks-kmers.obj `if test -f 'src/kmers.cc'; then $(CYGPATH_W) 'src/kmers.cc'; else $(CYGPATH_W) '$(srcdir)/src/kmers.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-kmers.Tpo src/$(DEPDIR)/sstacks-kmers.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/kmers.cc' object='src/sstacks-kmers.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-kmers.obj `if test -f 'src/kmers.cc'; then $(CYGPATH_W) 'src/kmers.cc'; else $(CYGPATH_W) '$(srcdir)/src/kmers.cc'; fi`
+
+src/sstacks-aln_utils.o: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-aln_utils.o -MD -MP -MF src/$(DEPDIR)/sstacks-aln_utils.Tpo -c -o src/sstacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-aln_utils.Tpo src/$(DEPDIR)/sstacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/sstacks-aln_utils.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+
+src/sstacks-aln_utils.obj: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-aln_utils.obj -MD -MP -MF src/$(DEPDIR)/sstacks-aln_utils.Tpo -c -o src/sstacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-aln_utils.Tpo src/$(DEPDIR)/sstacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/sstacks-aln_utils.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -c -o src/sstacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+
src/sstacks-DNASeq.o: src/DNASeq.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sstacks_CXXFLAGS) $(CXXFLAGS) -MT src/sstacks-DNASeq.o -MD -MP -MF src/$(DEPDIR)/sstacks-DNASeq.Tpo -c -o src/sstacks-DNASeq.o `test -f 'src/DNASeq.cc' || echo '$(srcdir)/'`src/DNASeq.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/sstacks-DNASeq.Tpo src/$(DEPDIR)/sstacks-DNASeq.Po
@@ -2879,6 +3002,20 @@ src/ustacks-kmers.obj: src/kmers.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ustacks_CXXFLAGS) $(CXXFLAGS) -c -o src/ustacks-kmers.obj `if test -f 'src/kmers.cc'; then $(CYGPATH_W) 'src/kmers.cc'; else $(CYGPATH_W) '$(srcdir)/src/kmers.cc'; fi`
+src/ustacks-aln_utils.o: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ustacks_CXXFLAGS) $(CXXFLAGS) -MT src/ustacks-aln_utils.o -MD -MP -MF src/$(DEPDIR)/ustacks-aln_utils.Tpo -c -o src/ustacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/ustacks-aln_utils.Tpo src/$(DEPDIR)/ustacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/ustacks-aln_utils.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ustacks_CXXFLAGS) $(CXXFLAGS) -c -o src/ustacks-aln_utils.o `test -f 'src/aln_utils.cc' || echo '$(srcdir)/'`src/aln_utils.cc
+
+src/ustacks-aln_utils.obj: src/aln_utils.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ustacks_CXXFLAGS) $(CXXFLAGS) -MT src/ustacks-aln_utils.obj -MD -MP -MF src/$(DEPDIR)/ustacks-aln_utils.Tpo -c -o src/ustacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/ustacks-aln_utils.Tpo src/$(DEPDIR)/ustacks-aln_utils.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/aln_utils.cc' object='src/ustacks-aln_utils.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ustacks_CXXFLAGS) $(CXXFLAGS) -c -o src/ustacks-aln_utils.obj `if test -f 'src/aln_utils.cc'; then $(CYGPATH_W) 'src/aln_utils.cc'; else $(CYGPATH_W) '$(srcdir)/src/aln_utils.cc'; fi`
+
src/ustacks-input.o: src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ustacks_CXXFLAGS) $(CXXFLAGS) -MT src/ustacks-input.o -MD -MP -MF src/$(DEPDIR)/ustacks-input.Tpo -c -o src/ustacks-input.o `test -f 'src/input.cc' || echo '$(srcdir)/'`src/input.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/$(DEPDIR)/ustacks-input.Tpo src/$(DEPDIR)/ustacks-input.Po
@@ -3006,7 +3143,7 @@ $(TEST_SUITE_LOG): $(TEST_LOGS)
if test -n "$$am__remaking_logs"; then \
echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \
"recursion detected" >&2; \
- else \
+ elif test -n "$$redo_logs"; then \
am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \
fi; \
if $(am__make_dryrun); then :; else \
@@ -3216,15 +3353,15 @@ dist-xz: distdir
$(am__post_remove_distdir)
dist-tarZ: distdir
- @echo WARNING: "Support for shar distribution archives is" \
- "deprecated." >&2
+ @echo WARNING: "Support for distribution archives compressed with" \
+ "legacy program 'compress' is deprecated." >&2
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
$(am__post_remove_distdir)
dist-shar: distdir
- @echo WARNING: "Support for distribution archives compressed with" \
- "legacy program 'compress' is deprecated." >&2
+ @echo WARNING: "Support for shar distribution archives is" \
+ "deprecated." >&2
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
$(am__post_remove_distdir)
@@ -3260,17 +3397,17 @@ distcheck: dist
esac
chmod -R a-w $(distdir)
chmod u+w $(distdir)
- mkdir $(distdir)/_build $(distdir)/_inst
+ mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
chmod a-w $(distdir)
test -d $(distdir)/_build || exit 0; \
dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
&& dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
&& am__cwd=`pwd` \
- && $(am__cd) $(distdir)/_build \
- && ../configure \
+ && $(am__cd) $(distdir)/_build/sub \
+ && ../../configure \
$(AM_DISTCHECK_CONFIGURE_FLAGS) \
$(DISTCHECK_CONFIGURE_FLAGS) \
- --srcdir=.. --prefix="$$dc_install_base" \
+ --srcdir=../.. --prefix="$$dc_install_base" \
&& $(MAKE) $(AM_MAKEFLAGS) \
&& $(MAKE) $(AM_MAKEFLAGS) dvi \
&& $(MAKE) $(AM_MAKEFLAGS) check \
@@ -3464,6 +3601,8 @@ uninstall-am: uninstall-binPROGRAMS uninstall-dist_binSCRIPTS \
uninstall-dist_binSCRIPTS uninstall-hook \
uninstall-nobase_pkgdataDATA
+.PRECIOUS: Makefile
+
debug:
$(MAKE) all "CXXFLAGS=-g -Wall -DDEBUG -std=gnu++0x"
diff --git a/aclocal.m4 b/aclocal.m4
index d2198f2..8b1f091 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.15 -*- Autoconf -*-
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
-# Copyright (C) 2002-2013 Free Software Foundation, Inc.
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
# generated from the m4 files accompanying Automake X.Y.
# (This private macro should not be called outside this file.)
AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.14'
+[am__api_version='1.15'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
-m4_if([$1], [1.14.1], [],
+m4_if([$1], [1.15], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.14.1])dnl
+[AM_AUTOMAKE_VERSION([1.15])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# configured tree to be moved without reconfiguration.
AC_DEFUN([AM_AUX_DIR_EXPAND],
-[dnl Rely on autoconf to set up CDPATH properly.
-AC_PREREQ([2.50])dnl
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
])
# AM_CONDITIONAL -*- Autoconf -*-
-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
-# We need awk for the "check" target. The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver). The
+# system "awk" is bad on some platforms.
AC_REQUIRE([AC_PROG_AWK])dnl
AC_REQUIRE([AC_PROG_MAKE_SET])dnl
AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -573,7 +572,11 @@ to "yes", and re-run configure.
END
AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
fi
-fi])
+fi
+dnl The trailing newline in this macro's definition is deliberate, for
+dnl backward compatibility and to allow trailing 'dnl'-style comments
+dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
+])
dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not
dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@@ -602,7 +605,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
# Define $install_sh.
AC_DEFUN([AM_PROG_INSTALL_SH],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then
fi
AC_SUBST([install_sh])])
-# Copyright (C) 2003-2013 Free Software Foundation, Inc.
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])])
# Check to see how 'make' treats includes. -*- Autoconf -*-
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -694,7 +697,7 @@ rm -f confinc confmf
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -733,7 +736,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -762,7 +765,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
AC_DEFUN([_AM_IF_OPTION],
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -809,7 +812,7 @@ AC_LANG_POP([C])])
# For backward compatibility.
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -828,7 +831,7 @@ AC_DEFUN([AM_RUN_LOG],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -909,7 +912,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
-# Copyright (C) 2009-2013 Free Software Foundation, Inc.
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -969,7 +972,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -997,7 +1000,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
-# Copyright (C) 2006-2013 Free Software Foundation, Inc.
+# Copyright (C) 2006-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1016,7 +1019,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
-# Copyright (C) 2004-2013 Free Software Foundation, Inc.
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
diff --git a/config/compile b/config/compile
index 531136b..a85b723 100755
--- a/config/compile
+++ b/config/compile
@@ -3,7 +3,7 @@
scriptversion=2012-10-14.11; # UTC
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
# Written by Tom Tromey <tromey at cygnus.com>.
#
# This program is free software; you can redistribute it and/or modify
diff --git a/config/depcomp b/config/depcomp
index 4ebd5b3..fc98710 100755
--- a/config/depcomp
+++ b/config/depcomp
@@ -3,7 +3,7 @@
scriptversion=2013-05-30.07; # UTC
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/config/install-sh b/config/install-sh
index 377bb86..0b0fdcb 100755
--- a/config/install-sh
+++ b/config/install-sh
@@ -1,7 +1,7 @@
#!/bin/sh
# install - install a program, script, or datafile
-scriptversion=2011-11-20.07; # UTC
+scriptversion=2013-12-25.23; # UTC
# This originates from X11R5 (mit/util/scripts/install.sh), which was
# later released in X11R6 (xc/config/util/install.sh) with the
@@ -41,19 +41,15 @@ scriptversion=2011-11-20.07; # UTC
# This script is compatible with the BSD install script, but was written
# from scratch.
+tab=' '
nl='
'
-IFS=" "" $nl"
+IFS=" $tab$nl"
-# set DOITPROG to echo to test this script
+# Set DOITPROG to "echo" to test this script.
-# Don't use :- since 4.3BSD and earlier shells don't like it.
doit=${DOITPROG-}
-if test -z "$doit"; then
- doit_exec=exec
-else
- doit_exec=$doit
-fi
+doit_exec=${doit:-exec}
# Put in absolute file names if you don't have them in your path;
# or use environment vars.
@@ -68,17 +64,6 @@ mvprog=${MVPROG-mv}
rmprog=${RMPROG-rm}
stripprog=${STRIPPROG-strip}
-posix_glob='?'
-initialize_posix_glob='
- test "$posix_glob" != "?" || {
- if (set -f) 2>/dev/null; then
- posix_glob=
- else
- posix_glob=:
- fi
- }
-'
-
posix_mkdir=
# Desired mode of installed file.
@@ -97,7 +82,7 @@ dir_arg=
dst_arg=
copy_on_change=false
-no_target_directory=
+is_target_a_directory=possibly
usage="\
Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
@@ -137,46 +122,57 @@ while test $# -ne 0; do
-d) dir_arg=true;;
-g) chgrpcmd="$chgrpprog $2"
- shift;;
+ shift;;
--help) echo "$usage"; exit $?;;
-m) mode=$2
- case $mode in
- *' '* | *' '* | *'
-'* | *'*'* | *'?'* | *'['*)
- echo "$0: invalid mode: $mode" >&2
- exit 1;;
- esac
- shift;;
+ case $mode in
+ *' '* | *"$tab"* | *"$nl"* | *'*'* | *'?'* | *'['*)
+ echo "$0: invalid mode: $mode" >&2
+ exit 1;;
+ esac
+ shift;;
-o) chowncmd="$chownprog $2"
- shift;;
+ shift;;
-s) stripcmd=$stripprog;;
- -t) dst_arg=$2
- # Protect names problematic for 'test' and other utilities.
- case $dst_arg in
- -* | [=\(\)!]) dst_arg=./$dst_arg;;
- esac
- shift;;
+ -t)
+ is_target_a_directory=always
+ dst_arg=$2
+ # Protect names problematic for 'test' and other utilities.
+ case $dst_arg in
+ -* | [=\(\)!]) dst_arg=./$dst_arg;;
+ esac
+ shift;;
- -T) no_target_directory=true;;
+ -T) is_target_a_directory=never;;
--version) echo "$0 $scriptversion"; exit $?;;
- --) shift
- break;;
+ --) shift
+ break;;
- -*) echo "$0: invalid option: $1" >&2
- exit 1;;
+ -*) echo "$0: invalid option: $1" >&2
+ exit 1;;
*) break;;
esac
shift
done
+# We allow the use of options -d and -T together, by making -d
+# take the precedence; this is for compatibility with GNU install.
+
+if test -n "$dir_arg"; then
+ if test -n "$dst_arg"; then
+ echo "$0: target directory not allowed when installing a directory." >&2
+ exit 1
+ fi
+fi
+
if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then
# When -d is used, all remaining arguments are directories to create.
# When -t is used, the destination is already specified.
@@ -208,6 +204,15 @@ if test $# -eq 0; then
fi
if test -z "$dir_arg"; then
+ if test $# -gt 1 || test "$is_target_a_directory" = always; then
+ if test ! -d "$dst_arg"; then
+ echo "$0: $dst_arg: Is not a directory." >&2
+ exit 1
+ fi
+ fi
+fi
+
+if test -z "$dir_arg"; then
do_exit='(exit $ret); exit $ret'
trap "ret=129; $do_exit" 1
trap "ret=130; $do_exit" 2
@@ -223,16 +228,16 @@ if test -z "$dir_arg"; then
*[0-7])
if test -z "$stripcmd"; then
- u_plus_rw=
+ u_plus_rw=
else
- u_plus_rw='% 200'
+ u_plus_rw='% 200'
fi
cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
*)
if test -z "$stripcmd"; then
- u_plus_rw=
+ u_plus_rw=
else
- u_plus_rw=,u+rw
+ u_plus_rw=,u+rw
fi
cp_umask=$mode$u_plus_rw;;
esac
@@ -269,41 +274,15 @@ do
# If destination is a directory, append the input filename; won't work
# if double slashes aren't ignored.
if test -d "$dst"; then
- if test -n "$no_target_directory"; then
- echo "$0: $dst_arg: Is a directory" >&2
- exit 1
+ if test "$is_target_a_directory" = never; then
+ echo "$0: $dst_arg: Is a directory" >&2
+ exit 1
fi
dstdir=$dst
dst=$dstdir/`basename "$src"`
dstdir_status=0
else
- # Prefer dirname, but fall back on a substitute if dirname fails.
- dstdir=`
- (dirname "$dst") 2>/dev/null ||
- expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
- X"$dst" : 'X\(//\)[^/]' \| \
- X"$dst" : 'X\(//\)$' \| \
- X"$dst" : 'X\(/\)' \| . 2>/dev/null ||
- echo X"$dst" |
- sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
- s//\1/
- q
- }
- /^X\(\/\/\)[^/].*/{
- s//\1/
- q
- }
- /^X\(\/\/\)$/{
- s//\1/
- q
- }
- /^X\(\/\).*/{
- s//\1/
- q
- }
- s/.*/./; q'
- `
-
+ dstdir=`dirname "$dst"`
test -d "$dstdir"
dstdir_status=$?
fi
@@ -314,74 +293,74 @@ do
if test $dstdir_status != 0; then
case $posix_mkdir in
'')
- # Create intermediate dirs using mode 755 as modified by the umask.
- # This is like FreeBSD 'install' as of 1997-10-28.
- umask=`umask`
- case $stripcmd.$umask in
- # Optimize common cases.
- *[2367][2367]) mkdir_umask=$umask;;
- .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
-
- *[0-7])
- mkdir_umask=`expr $umask + 22 \
- - $umask % 100 % 40 + $umask % 20 \
- - $umask % 10 % 4 + $umask % 2
- `;;
- *) mkdir_umask=$umask,go-w;;
- esac
-
- # With -d, create the new directory with the user-specified mode.
- # Otherwise, rely on $mkdir_umask.
- if test -n "$dir_arg"; then
- mkdir_mode=-m$mode
- else
- mkdir_mode=
- fi
-
- posix_mkdir=false
- case $umask in
- *[123567][0-7][0-7])
- # POSIX mkdir -p sets u+wx bits regardless of umask, which
- # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
- ;;
- *)
- tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
- trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
-
- if (umask $mkdir_umask &&
- exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
- then
- if test -z "$dir_arg" || {
- # Check for POSIX incompatibilities with -m.
- # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
- # other-writable bit of parent directory when it shouldn't.
- # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
- ls_ld_tmpdir=`ls -ld "$tmpdir"`
- case $ls_ld_tmpdir in
- d????-?r-*) different_mode=700;;
- d????-?--*) different_mode=755;;
- *) false;;
- esac &&
- $mkdirprog -m$different_mode -p -- "$tmpdir" && {
- ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
- test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
- }
- }
- then posix_mkdir=:
- fi
- rmdir "$tmpdir/d" "$tmpdir"
- else
- # Remove any dirs left behind by ancient mkdir implementations.
- rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
- fi
- trap '' 0;;
- esac;;
+ # Create intermediate dirs using mode 755 as modified by the umask.
+ # This is like FreeBSD 'install' as of 1997-10-28.
+ umask=`umask`
+ case $stripcmd.$umask in
+ # Optimize common cases.
+ *[2367][2367]) mkdir_umask=$umask;;
+ .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
+
+ *[0-7])
+ mkdir_umask=`expr $umask + 22 \
+ - $umask % 100 % 40 + $umask % 20 \
+ - $umask % 10 % 4 + $umask % 2
+ `;;
+ *) mkdir_umask=$umask,go-w;;
+ esac
+
+ # With -d, create the new directory with the user-specified mode.
+ # Otherwise, rely on $mkdir_umask.
+ if test -n "$dir_arg"; then
+ mkdir_mode=-m$mode
+ else
+ mkdir_mode=
+ fi
+
+ posix_mkdir=false
+ case $umask in
+ *[123567][0-7][0-7])
+ # POSIX mkdir -p sets u+wx bits regardless of umask, which
+ # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
+ ;;
+ *)
+ tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
+ trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
+
+ if (umask $mkdir_umask &&
+ exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
+ then
+ if test -z "$dir_arg" || {
+ # Check for POSIX incompatibilities with -m.
+ # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
+ # other-writable bit of parent directory when it shouldn't.
+ # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
+ ls_ld_tmpdir=`ls -ld "$tmpdir"`
+ case $ls_ld_tmpdir in
+ d????-?r-*) different_mode=700;;
+ d????-?--*) different_mode=755;;
+ *) false;;
+ esac &&
+ $mkdirprog -m$different_mode -p -- "$tmpdir" && {
+ ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
+ test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
+ }
+ }
+ then posix_mkdir=:
+ fi
+ rmdir "$tmpdir/d" "$tmpdir"
+ else
+ # Remove any dirs left behind by ancient mkdir implementations.
+ rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
+ fi
+ trap '' 0;;
+ esac;;
esac
if
$posix_mkdir && (
- umask $mkdir_umask &&
- $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
+ umask $mkdir_umask &&
+ $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
)
then :
else
@@ -391,53 +370,51 @@ do
# directory the slow way, step by step, checking for races as we go.
case $dstdir in
- /*) prefix='/';;
- [-=\(\)!]*) prefix='./';;
- *) prefix='';;
+ /*) prefix='/';;
+ [-=\(\)!]*) prefix='./';;
+ *) prefix='';;
esac
- eval "$initialize_posix_glob"
-
oIFS=$IFS
IFS=/
- $posix_glob set -f
+ set -f
set fnord $dstdir
shift
- $posix_glob set +f
+ set +f
IFS=$oIFS
prefixes=
for d
do
- test X"$d" = X && continue
-
- prefix=$prefix$d
- if test -d "$prefix"; then
- prefixes=
- else
- if $posix_mkdir; then
- (umask=$mkdir_umask &&
- $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
- # Don't fail if two instances are running concurrently.
- test -d "$prefix" || exit 1
- else
- case $prefix in
- *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
- *) qprefix=$prefix;;
- esac
- prefixes="$prefixes '$qprefix'"
- fi
- fi
- prefix=$prefix/
+ test X"$d" = X && continue
+
+ prefix=$prefix$d
+ if test -d "$prefix"; then
+ prefixes=
+ else
+ if $posix_mkdir; then
+ (umask=$mkdir_umask &&
+ $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
+ # Don't fail if two instances are running concurrently.
+ test -d "$prefix" || exit 1
+ else
+ case $prefix in
+ *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) qprefix=$prefix;;
+ esac
+ prefixes="$prefixes '$qprefix'"
+ fi
+ fi
+ prefix=$prefix/
done
if test -n "$prefixes"; then
- # Don't fail if two instances are running concurrently.
- (umask $mkdir_umask &&
- eval "\$doit_exec \$mkdirprog $prefixes") ||
- test -d "$dstdir" || exit 1
- obsolete_mkdir_used=true
+ # Don't fail if two instances are running concurrently.
+ (umask $mkdir_umask &&
+ eval "\$doit_exec \$mkdirprog $prefixes") ||
+ test -d "$dstdir" || exit 1
+ obsolete_mkdir_used=true
fi
fi
fi
@@ -472,15 +449,12 @@ do
# If -C, don't bother to copy if it wouldn't change the file.
if $copy_on_change &&
- old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` &&
- new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` &&
-
- eval "$initialize_posix_glob" &&
- $posix_glob set -f &&
+ old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` &&
+ new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` &&
+ set -f &&
set X $old && old=:$2:$4:$5:$6 &&
set X $new && new=:$2:$4:$5:$6 &&
- $posix_glob set +f &&
-
+ set +f &&
test "$old" = "$new" &&
$cmpprog "$dst" "$dsttmp" >/dev/null 2>&1
then
@@ -493,24 +467,24 @@ do
# to itself, or perhaps because mv is so ancient that it does not
# support -f.
{
- # Now remove or move aside any old file at destination location.
- # We try this two ways since rm can't unlink itself on some
- # systems and the destination file might be busy for other
- # reasons. In this case, the final cleanup might fail but the new
- # file should still install successfully.
- {
- test ! -f "$dst" ||
- $doit $rmcmd -f "$dst" 2>/dev/null ||
- { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
- { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
- } ||
- { echo "$0: cannot unlink or rename $dst" >&2
- (exit 1); exit 1
- }
- } &&
-
- # Now rename the file to the real destination.
- $doit $mvcmd "$dsttmp" "$dst"
+ # Now remove or move aside any old file at destination location.
+ # We try this two ways since rm can't unlink itself on some
+ # systems and the destination file might be busy for other
+ # reasons. In this case, the final cleanup might fail but the new
+ # file should still install successfully.
+ {
+ test ! -f "$dst" ||
+ $doit $rmcmd -f "$dst" 2>/dev/null ||
+ { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
+ { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
+ } ||
+ { echo "$0: cannot unlink or rename $dst" >&2
+ (exit 1); exit 1
+ }
+ } &&
+
+ # Now rename the file to the real destination.
+ $doit $mvcmd "$dsttmp" "$dst"
}
fi || exit 1
diff --git a/config/missing b/config/missing
index db98974..f62bbae 100755
--- a/config/missing
+++ b/config/missing
@@ -3,7 +3,7 @@
scriptversion=2013-10-28.13; # UTC
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# Originally written by Fran,cois Pinard <pinard at iro.umontreal.ca>, 1996.
# This program is free software; you can redistribute it and/or modify
diff --git a/config/test-driver b/config/test-driver
index d306056..8e575b0 100755
--- a/config/test-driver
+++ b/config/test-driver
@@ -3,7 +3,7 @@
scriptversion=2013-07-13.22; # UTC
-# Copyright (C) 2011-2013 Free Software Foundation, Inc.
+# Copyright (C) 2011-2014 Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -106,11 +106,14 @@ trap "st=143; $do_exit" 15
# Test script is run here.
"$@" >$log_file 2>&1
estatus=$?
+
if test $enable_hard_errors = no && test $estatus -eq 99; then
- estatus=1
+ tweaked_estatus=1
+else
+ tweaked_estatus=$estatus
fi
-case $estatus:$expect_failure in
+case $tweaked_estatus:$expect_failure in
0:yes) col=$red res=XPASS recheck=yes gcopy=yes;;
0:*) col=$grn res=PASS recheck=no gcopy=no;;
77:*) col=$blu res=SKIP recheck=no gcopy=yes;;
@@ -119,6 +122,12 @@ case $estatus:$expect_failure in
*:*) col=$red res=FAIL recheck=yes gcopy=yes;;
esac
+# Report the test outcome and exit status in the logs, so that one can
+# know whether the test passed or failed simply by looking at the '.log'
+# file, without the need of also peeking into the corresponding '.trs'
+# file (automake bug#11814).
+echo "$res $test_name (exit status: $estatus)" >>$log_file
+
# Report outcome to console.
echo "${col}${res}${std}: $test_name"
diff --git a/configure b/configure
index 58fc063..bed374a 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for Stacks 1.37.
+# Generated by GNU Autoconf 2.69 for Stacks 1.38.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Stacks'
PACKAGE_TARNAME='stacks'
-PACKAGE_VERSION='1.37'
-PACKAGE_STRING='Stacks 1.37'
+PACKAGE_VERSION='1.38'
+PACKAGE_STRING='Stacks 1.38'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1283,7 +1283,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Stacks 1.37 to adapt to many kinds of systems.
+\`configure' configures Stacks 1.38 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1349,7 +1349,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Stacks 1.37:";;
+ short | recursive ) echo "Configuration of Stacks 1.38:";;
esac
cat <<\_ACEOF
@@ -1456,7 +1456,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Stacks configure 1.37
+Stacks configure 1.38
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1913,7 +1913,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Stacks $as_me 1.37, which was
+It was created by Stacks $as_me 1.38, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2290,7 +2290,7 @@ ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var.
ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var.
-am__api_version='1.14'
+am__api_version='1.15'
# Find a good install program. We prefer a C program (faster),
# so one script is as good as another. But avoid the broken or
@@ -2462,8 +2462,8 @@ test "$program_suffix" != NONE &&
ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
if test x"${MISSING+set}" != xset; then
case $am_aux_dir in
@@ -2482,7 +2482,7 @@ else
$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
fi
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -2776,7 +2776,7 @@ fi
# Define the identity of the package.
PACKAGE='stacks'
- VERSION='1.37'
+ VERSION='1.38'
cat >>confdefs.h <<_ACEOF
@@ -2810,8 +2810,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
mkdir_p='$(MKDIR_P)'
-# We need awk for the "check" target. The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver). The
+# system "awk" is bad on some platforms.
# Always define AMTAR for backward compatibility. Yes, it's still used
# in the wild :-( We should find a proper way to deprecate it ...
AMTAR='$${TAR-tar}'
@@ -2869,6 +2869,7 @@ END
fi
fi
+
ac_config_headers="$ac_config_headers config.h"
@@ -6224,7 +6225,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Stacks $as_me 1.37, which was
+This file was extended by Stacks $as_me 1.38, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6290,7 +6291,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-Stacks config.status 1.37
+Stacks config.status 1.38
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index ba9ec98..71b3f52 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([Stacks], [1.37])
+AC_INIT([Stacks], [1.38])
AC_CONFIG_AUX_DIR([config])
AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
AC_CONFIG_SRCDIR([src/ustacks.cc])
diff --git a/php/CatalogClass.php b/php/CatalogClass.php
index 938a47d..d0af8ec 100644
--- a/php/CatalogClass.php
+++ b/php/CatalogClass.php
@@ -119,7 +119,12 @@ class Catalog {
$this->params[] = &$this->display['filter_vprog'];
$typestr .= "i";
- } else if ($filter == "cata") {
+ } else if ($filter == "lnl") {
+ $this->params[] = &$this->display['filter_lnl_l'];
+ $this->params[] = &$this->display['filter_lnl_u'];
+ $typestr .= "ii";
+
+ } else if ($filter == "cata") {
$this->params[] = &$this->display['filter_cata'];
$typestr .= "i";
@@ -153,6 +158,7 @@ class Catalog {
"pare" => "(parents >= ? AND parents <= ?)",
"prog" => "(progeny >= ?)",
"vprog" => "(valid_progeny >= ?)",
+ "lnl" => "(lnl >= ? AND lnl <= ?)",
"mark" => "(marker LIKE ?)",
"est" => "(ests > 0)",
"pe" => "(pe_radtags > 0)",
diff --git a/php/catalog.php b/php/catalog.php
index 15abced..4ecf125 100644
--- a/php/catalog.php
+++ b/php/catalog.php
@@ -626,11 +626,11 @@ function write_pagination($num_tags, &$start_gene, &$end_gene, $destination) {
$num_pages = floor($num_tags / $per_page);
$num_pages += $num_tags % $per_page >= 1 ? 1 : 0;
- if ($page > $num_pages) {
+ if ($num_pages > 0 && $page > $num_pages) {
$page = $num_pages;
$cur_page = $num_pages;
}
-
+
// Determine the start and end gene numbers
$start_gene = 1 + (($page - 1) * $per_page);
$end_gene =
@@ -1391,6 +1391,7 @@ function prepare_filter_parameters(&$display_params, &$param) {
} else if ($filter == "mark") {
$param[] = &$display_params['filter_mark'];
+ $typestr .= "s";
}
}
diff --git a/php/export_batch.php b/php/export_batch.php
index fcd2bbd..9fcab4c 100644
--- a/php/export_batch.php
+++ b/php/export_batch.php
@@ -115,6 +115,12 @@ function process_filter(&$display_params, &$filters) {
$display_params['filter_vprog'] = $_GET['filter_vprog'];
array_push($filters, "vprog=" . $_GET['filter_vprog']);
+ } else if ($filter == "lnl") {
+ $display_params['filter_lnl_l'] = $_GET['filter_lnl_l'];
+ array_push($filters, "lnl_l=" . $_GET['filter_lnl_l']);
+ $display_params['filter_lnl_u'] = $_GET['filter_lnl_u'];
+ array_push($filters, "lnl_u=" . $_GET['filter_lnl_u']);
+
} else if ($filter == "cata") {
$display_params['filter_cata'] = $_GET['filter_cata'];
array_push($filters, "cata=" . $_GET['filter_cata']);
diff --git a/scripts/denovo_map.pl b/scripts/denovo_map.pl
index 858d80c..c5ddaaf 100755
--- a/scripts/denovo_map.pl
+++ b/scripts/denovo_map.pl
@@ -1,6 +1,6 @@
#!/usr/bin/env perl
#
-# Copyright 2010-2015, Julian Catchen <jcatchen at illinois.edu>
+# Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
#
# This file is part of Stacks.
#
@@ -33,12 +33,20 @@ use File::Temp qw/ mktemp /;
use File::Spec;
use constant stacks_version => "_VERSION_";
-my $dry_run = 0;
-my $sql = 1;
+use constant true => 1;
+use constant false => 0;
+
+my $dry_run = false;
+my $sql = true;
+my $create_db = false;
+my $overw_db = false;
+my $gapped_alns = false;
my $mysql_config = "_PKGDATADIR_" . "sql/mysql.cnf";
+my $mysql_tables = "_PKGDATADIR_" . "sql/stacks.sql";
my $exe_path = "_BINDIR_";
my $out_path = "";
my $popmap_path = "";
+my $sample_path = "";
my $db = "";
my $data_type = "map";
my $min_cov = 0;
@@ -47,6 +55,7 @@ my $batch_id = -1;
my $sample_id = 1;
my $desc = ""; # Database description of this dataset
my $date = ""; # Date relevent to this data, formatted for SQL: 2009-05-31
+my $gzip = false;
my @parents;
my @progeny;
@@ -58,48 +67,25 @@ my $cmd_str = $0 . " " . join(" ", @ARGV);
parse_command_line();
-check_input_files(\@parents, \@progeny, \@samples);
-
my $cnf = (-e $ENV{"HOME"} . "/.my.cnf") ? $ENV{"HOME"} . "/.my.cnf" : $mysql_config;
#
# Check for the existence of the necessary pipeline programs
#
-die ("Unable to find '" . $exe_path . "ustacks'.\n") if (!-e $exe_path . "ustacks" || !-x $exe_path . "ustacks");
-die ("Unable to find '" . $exe_path . "cstacks'.\n") if (!-e $exe_path . "cstacks" || !-x $exe_path . "cstacks");
-die ("Unable to find '" . $exe_path . "sstacks'.\n") if (!-e $exe_path . "sstacks" || !-x $exe_path . "sstacks");
-die ("Unable to find '" . $exe_path . "genotypes'.\n") if (!-e $exe_path . "genotypes" || !-x $exe_path . "genotypes");
-die ("Unable to find '" . $exe_path . "populations'.\n") if (!-e $exe_path . "populations" || !-x $exe_path . "populations");
+die ("Unable to find '" . $exe_path . "ustacks'.\n") if (!-e $exe_path . "ustacks" || !-x $exe_path . "ustacks");
+die ("Unable to find '" . $exe_path . "cstacks'.\n") if (!-e $exe_path . "cstacks" || !-x $exe_path . "cstacks");
+die ("Unable to find '" . $exe_path . "sstacks'.\n") if (!-e $exe_path . "sstacks" || !-x $exe_path . "sstacks");
+die ("Unable to find '" . $exe_path . "genotypes'.\n") if (!-e $exe_path . "genotypes" || !-x $exe_path . "genotypes");
+die ("Unable to find '" . $exe_path . "populations'.\n") if (!-e $exe_path . "populations" || !-x $exe_path . "populations");
die ("Unable to find '" . $exe_path . "index_radtags.pl'.\n") if (!-e $exe_path . "index_radtags.pl" || !-x $exe_path . "index_radtags.pl");
-my ($i, $log, $log_fh, $pipe_fh, $pfile, $file, $num_files, $parent, $sample, %map);
-
-$i = 1;
-$num_files = scalar(@parents) + scalar(@progeny) + scalar(@samples);
-
-my (@types, $type, @pop_ids, $pop, %pops, @grp_ids, $grp, %grps);
+my ($log, $log_fh, $sample);
-parse_population_map(\@samples, \@pop_ids, \%pops, \@grp_ids, \%grps) if ($data_type eq "population");
-
-foreach $parent (@parents) {
- push(@types, "parent");
- push(@pop_ids, "1");
- push(@grp_ids, "1");
-}
-foreach $parent (@progeny) {
- push(@types, "progeny");
- push(@pop_ids, "1");
- push(@grp_ids, "1");
-}
-foreach $parent (@samples) {
- push(@types, "sample");
-}
+my (@sample_list, %pop_ids, %pops, %grp_ids, %grps, %sample_ids);
-my (@results, $minc, $minrc, $cmd, $ppath, $pop_cnt);
+parse_population_map(\@sample_list, \%pop_ids, \%pops, \%grp_ids, \%grps);
-$pop_cnt = scalar(keys %pops);
-$minc = $min_cov > 0 ? "-m $min_cov" : "";
-$minrc = $min_rcov > 0 ? "-m $min_rcov" : $minc;
+initialize_samples(\@parents, \@progeny, \@samples, \@sample_list, \%pop_ids, \%grp_ids);
#
# Open the log file
@@ -108,447 +94,586 @@ $log = "$out_path/denovo_map.log";
open($log_fh, ">$log") or die("Unable to open log file '$log'; $!\n");
print $log_fh
- "denovo_map.pl version ", stacks_version, " started at ", strftime("%Y-%m-%d %H:%M:%S",(localtime(time))), "\n",
- $cmd_str, "\n";
+ "denovo_map.pl version ", stacks_version, " started at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n",
+ $cmd_str, "\n\n";
-if ($sql == 1) {
- #
- # SQL Batch ID for this set of Radtags, along with description and date of
- # sequencing. Insert this batch data into the database.
- #
- `mysql --defaults-file=$cnf $db -e "INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'"` if ($dry_run == 0);
+initialize_database($log_fh, \@parents, \@progeny, \@samples, \%sample_ids) if ($sql == true);
- print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'\"\n";
-}
+execute_stacks($log_fh, $sample_id, \@parents, \@progeny, \@samples, \%sample_ids);
-my $gzip = 0;
+load_sql_data($log_fh, \%pops, \@parents, \@progeny, \@samples) if ($sql == true);
-foreach $sample (@parents, @progeny, @samples) {
- my ($ftype, $pfile) = "";
+print $log_fh "\ndenovo_map.pl completed at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n";
+close($log_fh);
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
- if ($suffix eq "gz") {
- $gzip = 1;
- ($prefix, $suffix) = ($prefix =~ /^(.+)\.(.+)$/);
- }
+sub execute_stacks {
+ my ($log_fh, $sample_id, $parents, $progeny, $samples, $sample_ids) = @_;
+
+ my (@results, @depths_of_cov);
+ my ($pop_cnt, $sample, $num_files, $i, $cmd, $pipe_fh, $path, $cat_file);
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $pfile = $prefix;
- }
+ my $minc = $min_cov > 0 ? "-m $min_cov" : "";
+ my $minrc = $min_rcov > 0 ? "-m $min_rcov" : $minc;
- if ($gzip == 1) {
- if ($suffix =~ /^fa_?\d?$/ || $suffix =~ /^fasta_?\d?$/) {
- $ftype = "gzfasta";
- } elsif ($suffix =~ /^fq_?\d?$/ || $suffix =~ /^fastq_?\d?$/) {
- $ftype = "gzfastq";
- } else {
- die("Unknown input file type.\n");
- }
- } else {
- if ($suffix =~ /^fa_?\d?$/ || $suffix =~ /^fasta_?\d?$/) {
- $ftype = "fasta";
- } elsif ($suffix =~ /^fq_?\d?$/ || $suffix =~ /^fastq_?\d?$/) {
- $ftype = "fastq";
- } else {
- die("Unknown input file type.\n");
- }
- }
+ $i = 1;
+ $num_files = scalar(@{$parents}) + scalar(@{$progeny}) + scalar(@{$samples});
- $type = shift @types;
- $pop = shift @pop_ids;
- $grp = shift @grp_ids;
+ #
+ # Assemble RAD loci in each individual.
+ #
+ foreach $sample (@parents, @progeny, @samples) {
- printf("Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $pfile);
- printf($log_fh "Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $pfile);
+ printf("Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $sample->{'file'});
+ printf($log_fh "Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $sample->{'file'});
+
+ if (scalar(keys %{$sample_ids}) > 0) {
+ $sample_id = $sample_ids->{$sample->{'file'}};
+ }
+
+ $path = $sample->{'path'} . $sample->{'file'} . "." . $sample->{'suffix'};
+
+ if ($sample->{'type'} eq "sample") {
+ $cmd = $exe_path . "ustacks -t $sample->{'fmt'} -f $path -o $out_path -i $sample_id -r $minc " . join(" ", @_ustacks) . " 2>&1";
+ } elsif ($sample->{'type'} eq "parent") {
+ $cmd = $exe_path . "ustacks -t $sample->{'fmt'} -f $path -o $out_path -i $sample_id -r $minc " . join(" ", @_ustacks) . " 2>&1";
+ } elsif ($sample->{'type'} eq "progeny") {
+ $cmd = $exe_path . "ustacks -t $sample->{'fmt'} -f $path -o $out_path -i $sample_id -r $minrc " . join(" ", @_ustacks) . " 2>&1";
+ }
+ print STDERR " $cmd\n";
+ print $log_fh "$cmd\n";
+
+ if ($dry_run == false) {
+ @results = `$cmd`;
+
+ #
+ # Pull the depth of coverage from ustacks.
+ #
+ my $depth;
+ if ($gapped_alns) {
+ my @lines = grep(/^After gapped alignments, coverage depth Mean/, @results);
+ ($depth) = ($lines[0] =~ /^After gapped alignments, coverage depth Mean: (\d+\.?\d*); Std Dev: .+; Max: .+$/);
+ } else {
+ my @lines = grep(/^After remainders merged, coverage depth Mean/, @results);
+ ($depth) = ($lines[0] =~ /^After remainders merged, coverage depth Mean: (\d+\.?\d*); Std Dev: .+; Max: .+$/);
+ }
+ push(@depths_of_cov, [$sample->{'file'}, $depth]);
+ }
+ write_results(\@results, $log_fh);
+
+ $i++;
+ $sample_id++;
+ }
+
+ write_depths_of_cov(\@depths_of_cov, $log_fh);
- if ($sql == 1) {
- if ($dry_run == 0) {
- `mysql --defaults-file=$cnf $db -e "INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$type', file='$pfile', pop_id='$pop', group_id='$grp'"`;
- @results = `mysql --defaults-file=$cnf $db -N -B -e "SELECT id FROM samples WHERE sample_id=$i AND batch_id=$batch_id AND type='$type' AND file='$pfile'"`;
- chomp $results[0];
- $sample_id = $results[0];
- }
- print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$type', file='$pfile', pop_id='$pop', group_id='$grp'\"\n";
+ #
+ # Generate catalog of RAD loci.
+ #
+ print STDERR "Generating catalog...\n";
+ my $file_paths = "";
+ foreach $sample (@parents, @samples) {
+ $file_paths .= "-s $out_path/$sample->{'file'} ";
}
- $map{$pfile} = $sample_id;
-
- if ($type eq "parent" || $type eq "sample") {
- $cmd = $exe_path . "ustacks -t $ftype -f $sample -o $out_path -i $sample_id $minc " . join(" ", @_ustacks) . " 2>&1";
- } elsif ($type eq "progeny") {
- $cmd = $exe_path . "ustacks -t $ftype -f $sample -o $out_path -i $sample_id $minrc " . join(" ", @_ustacks) . " 2>&1";
- }
+ $cmd = $exe_path . "cstacks -b $batch_id -o $out_path $file_paths " . join(" ", @_cstacks) . " 2>&1";
print STDERR " $cmd\n";
- print $log_fh "$cmd\n";
- @results = `$cmd` if ($dry_run == 0);
- write_results(\@results, $log_fh);
+ print $log_fh "$cmd\n\n";
- print STDERR " Loading ustacks output to $db..." if ($sql == 1);
+ if ($dry_run == false) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ if ($_ =~ /failed/i) { print STDERR "Catalog construction failed.\n"; exit(1); }
+ }
+ close($pipe_fh);
+ }
- if ($gzip == 1) {
- $file = "$out_path/$pfile" . ".tags.tsv.gz";
- import_gzsql_file($log_fh, $file, "unique_tags", 1);
+ #
+ # Match parents, progeny, or samples to the catalog.
+ #
+ $file_paths = "";
+ print STDERR "Matching samples to the catalog...\n";
- $file = "$out_path/$pfile" . ".snps.tsv.gz";
- import_gzsql_file($log_fh, $file, "snps", 1);
+ foreach $sample (@parents, @progeny, @samples) {
+ $file_paths .= "-s $out_path/$sample->{'file'} ";
+ }
- $file = "$out_path/$pfile" . ".alleles.tsv.gz";
- import_gzsql_file($log_fh, $file, "alleles", 1);
+ $cat_file = "batch_" . $batch_id;
+ $cmd = $exe_path . "sstacks -b $batch_id -c $out_path/$cat_file -o $out_path $file_paths " . join(" ", @_sstacks) . " 2>&1";
+ print STDERR " $cmd\n";
+ print $log_fh "$cmd\n\n";
+
+ if ($dry_run == false) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ }
+ close($pipe_fh);
+ }
+
+ if ($data_type eq "map") {
+ #
+ # Generate a set of observed haplotypes and a set of markers and generic genotypes
+ #
+ printf(STDERR "Generating genotypes...\n");
+
+ $cmd = $exe_path . "genotypes -b $batch_id -P $out_path -r 1 -c -s " . join(" ", @_genotypes) . " 2>&1";
+ print STDERR "$cmd\n";
+ print $log_fh "$cmd\n";
+
+ if ($dry_run == 0) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ }
+ close($pipe_fh);
+ }
} else {
- $file = "$out_path/$pfile" . ".tags.tsv";
- import_sql_file($log_fh, $file, "unique_tags", 1);
+ printf(STDERR "Calculating population-level summary statistics\n");
- $file = "$out_path/$pfile" . ".snps.tsv";
- import_sql_file($log_fh, $file, "snps", 1);
+ $cmd = $exe_path . "populations -b $batch_id -P $out_path -s " . join(" ", @_populations) . " 2>&1";
+ print STDERR "$cmd\n";
+ print $log_fh "$cmd\n";
- $file = "$out_path/$pfile" . ".alleles.tsv";
- import_sql_file($log_fh, $file, "alleles", 1);
+ if ($dry_run == 0) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ }
+ close($pipe_fh);
+ }
}
- print STDERR "done.\n" if ($sql == 1);
-
- $i++;
-
- $sample_id++ if ($sql == 0);
}
-my ($pfile, $cat_file);
-
-#
-# Generate catalog of RAD-Tags
-#
-print STDERR "Generating catalog...\n";
-my $file_paths = "";
-foreach $sample (@parents, @samples) {
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
-
- if ($suffix eq "gz") {
- ($prefix, $suffix) = ($prefix =~ /^(.+)\.(.+)$/);
- }
+sub parse_population_map {
+ my ($sample_list, $pop_ids, $pops, $grp_ids, $grps) = @_;
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $pfile = $prefix;
- }
+ my ($fh, @parts, $line, $sample);
- $file_paths .= "-s $out_path/$pfile ";
-}
+ return if (length($popmap_path) == 0);
+
+ open($fh, "<$popmap_path") or die("Unable to open population map, '$popmap_path', $!\n");
-$cat_file = "batch_" . $batch_id;
-$cmd = $exe_path . "cstacks -b $batch_id -o $out_path $file_paths " . join(" ", @_cstacks) . " 2>&1";
-print STDERR " $cmd\n";
-print $log_fh "$cmd\n";
+ while ($line = <$fh>) {
+ chomp $line;
-if ($dry_run == 0) {
- open($pipe_fh, "$cmd |");
- while (<$pipe_fh>) {
- print $log_fh $_;
- if ($_ =~ /failed/i) { print STDERR "Catalog construction failed.\n"; exit(1); }
- }
- close($pipe_fh);
-}
+ next if ($line =~ /^\s*#/);
+
+ @parts = split(/\t/, $line);
-print STDERR " Importing catalog to MySQL database..." if ($sql == 1);
+ if (scalar(@parts) > 3) {
+ die("Unable to parse population map, '$popmap_path' (map should contain no more than three columns).\n");
+ }
-if ($gzip == 1) {
- $file = "$out_path/$cat_file" . ".catalog.tags.tsv.gz";
- import_gzsql_file($log_fh, $file, "catalog_tags", 1);
+ push(@{$sample_list}, $parts[0]);
- $file = "$out_path/$cat_file" . ".catalog.snps.tsv.gz";
- import_gzsql_file($log_fh, $file, "catalog_snps", 1);
+ $pop_ids->{$parts[0]} = $parts[1];
+ $pops->{$parts[1]}++;
+
+ if (scalar(@parts) > 2) {
+ $grp_ids->{$parts[0]} = $parts[2];
+ $grps->{$parts[2]}++;
+ }
+ }
- $file = "$out_path/$cat_file" . ".catalog.alleles.tsv.gz";
- import_gzsql_file($log_fh, $file, "catalog_alleles", 1);
+ if (scalar(keys %{$grps}) == 0) {
+ $grps->{"1"}++;
-} else {
- $file = "$out_path/$cat_file" . ".catalog.tags.tsv";
- import_sql_file($log_fh, $file, "catalog_tags", 1);
+ foreach $sample (@{$sample_list}) {
+ $grp_ids->{$sample} = "1";
+ }
+ }
- $file = "$out_path/$cat_file" . ".catalog.snps.tsv";
- import_sql_file($log_fh, $file, "catalog_snps", 1);
+ print STDERR "Parsed population map: ", scalar(@{$sample_list}), " files in ", scalar(keys %{$pops});
+ scalar(keys %{$pops}) == 1 ? print STDERR " population" : print STDERR " populations";
+ print STDERR " and ", scalar(keys %{$grps});
+ scalar(keys %{$grps}) == 1 ? print STDERR " group.\n" : print STDERR " groups.\n";
- $file = "$out_path/$cat_file" . ".catalog.alleles.tsv";
- import_sql_file($log_fh, $file, "catalog_alleles", 1);
+ close($fh);
}
-print STDERR "done.\n" if ($sql == 1);
-#
-# Match parents and progeny to the catalog
-#
-$file_paths = "";
-print STDERR "Matching samples to the catalog...\n";
-
-foreach $sample (@parents, @progeny, @samples) {
+sub initialize_samples {
+ my ($parents, $progeny, $samples, $sample_list, $pop_ids, $grp_ids) = @_;
+
+ my ($local_gzip, $file, $prefix, $suffix, $path, $found, $i);
+
+ if (length(scalar(@{$sample_list})) > 0 && scalar(@{$samples}) == 0) {
+ my @suffixes = ("fq", "fastq", "fq.gz", "fastq.gz", "fa", "fasta", "fa.gz", "fasta.gz");
+ my @fmts = ("fastq", "fastq", "gzfastq", "gzfastq", "fasta", "fasta", "gzfasta", "gzfasta");
+
+ #
+ # If a population map was specified and no samples were provided on the command line.
+ #
+ foreach $sample (@{$sample_list}) {
+ $found = false;
+
+ for ($i = 0; $i < scalar(@suffixes); $i++) {
+ $path = $sample_path . $sample . "." . $suffixes[$i];
+ if (-e $path) {
+
+ if ($i == 2 || $i == 3 || $i == 6 || $i == 7) {
+ $gzip = true;
+ }
+
+ push(@{$samples}, {'path' => $sample_path,
+ 'file' => $sample,
+ 'suffix' => $suffixes[$i],
+ 'type' => "sample",
+ 'fmt' => $fmts[$i]});
+ $found = true;
+ last;
+ }
+ }
+
+ if ($found == false) {
+ die("Unable to find sample '$sample' in directory '$sample_path' as specified in the population map, '$popmap_path'.\n");
+ }
+ }
+
+ } else {
+ #
+ # Process any samples that were specified on the command line.
+ #
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
+ $local_gzip = false;
+
+ ($prefix, $suffix) = ($sample->{'path'} =~ /^(.+)\.(.+)$/);
+
+ if ($suffix eq "gz") {
+ $gzip = true;
+ $local_gzip = true;
+ ($prefix, $suffix) = ($prefix =~ /^(.+)\.(.+)$/);
+ }
+
+ $sample->{'suffix'} = $suffix;
+ $sample->{'suffix'} .= ".gz" if ($local_gzip == true);
+
+ if ($prefix =~ /^.*\/.+$/) {
+ ($path, $file) = ($prefix =~ /^(.*\/)(.+)$/);
+ } else {
+ $file = $prefix;
+ $path = "";
+ }
+
+ $sample->{'path'} = $path;
+ $sample->{'file'} = $file;
+
+ if ($local_gzip == true) {
+ if ($suffix =~ /^fa$/ || $suffix =~ /^fasta$/) {
+ $sample->{'fmt'} = "gzfasta";
+ } elsif ($suffix =~ /^fq$/ || $suffix =~ /^fastq$/) {
+ $sample->{'fmt'} = "gzfastq";
+ } else {
+ die("Unknown input file type for file '" . $sample->{'path'} . "'.\n");
+ }
+ } else {
+ if ($suffix =~ /^fa$/ || $suffix =~ /^fasta$/) {
+ $sample->{'fmt'} = "fasta";
+ } elsif ($suffix =~ /^fq$/ || $suffix =~ /^fastq$/) {
+ $sample->{'fmt'} = "fastq";
+ } else {
+ die("Unknown input file type for file '" . $sample->{'path'} . "'.\n");
+ }
+ }
+
+ $path = $sample->{'path'} . $sample->{'file'} . "." . $sample->{'suffix'};
+
+ if (!-e $path) {
+ die("Unable to locate sample file '$path'\n");
+ }
+ }
+
+ foreach $sample (@parents) {
+ $sample->{'type'} = "parent";
+ }
+ foreach $sample (@progeny) {
+ $sample->{'type'} = "progeny";
+ }
+ foreach $sample (@samples) {
+ $sample->{'type'} = "sample";
+ }
+ }
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
+ #
+ # If a population map was specified, make sure all samples in the list were found (and vice versa) and assign population IDs.
+ #
+ if (scalar(@{$sample_list}) > 0) {
+
+ my %sample_hash;
+
+ foreach $sample (@{$samples}) {
+ $sample_hash{$sample->{'file'}}++;
+
+ if (!defined($pop_ids->{$sample->{'file'}})) {
+ die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n");
+ } else {
+ $sample->{'pop_id'} = $pop_ids->{$sample->{'file'}};
+ }
+ if (!defined($grp_ids->{$sample->{'file'}})) {
+ die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n");
+ } else {
+ $sample->{'grp_id'} = $grp_ids->{$sample->{'file'}};
+ }
+ }
+
+ foreach $sample (@{$sample_list}) {
+ if (!defined($sample_hash{$sample})) {
+ die("Unable to find a file corresponding to the population map entry '" . $sample . "' in the population map, '$popmap_path'.\n");
+ }
+ }
- if ($suffix eq "gz") {
- ($prefix, $suffix) = ($prefix =~ /^(.+)\.(.+)$/);
+ } else {
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
+ $sample->{'pop_id'} = "1";
+ $sample->{'grp_id'} = "1";
+ $pop_ids->{$sample->{'file'}} = $sample->{'pop_id'};
+ $grp_ids->{$sample->{'file'}} = $sample->{'grp_id'};
+ }
}
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $pfile = $prefix;
+ #
+ # Check that no duplicate files were specified.
+ #
+ my (%files, $file);
+ foreach $file (@{$parents}, @{$progeny}, @{$samples}) {
+ $files{$file}++;
+ }
+ foreach $file (keys %files) {
+ if ($files{$file} > 1) {
+ die("A duplicate file was specified which may create undefined results, '$file'\n");
+ }
}
- $file_paths .= "-s $out_path/$pfile ";
+ print STDERR "Found ", scalar(@{$parents}), " parental file(s).\n" if (scalar(@{$parents}) > 0);
+ print STDERR "Found ", scalar(@{$progeny}), " progeny file(s).\n" if (scalar(@{$progeny}) > 0);
+ print STDERR "Found ", scalar(@{$samples}), " sample file(s).\n" if (scalar(@{$samples}) > 0);
}
-$cmd = $exe_path . "sstacks -b $batch_id -c $out_path/$cat_file -o $out_path $file_paths " . join(" ", @_sstacks) . " 2>&1";
-print STDERR " $cmd\n";
-print $log_fh "$cmd\n";
- at results = `$cmd` if ($dry_run == 0);
-print $log_fh @results;
+sub initialize_database {
+ my ($log_fh, $parents, $progeny, $samples, $sample_ids) = @_;
-#
-# Load the sstacks results to the database if requested.
-#
-if ($sql == 1) {
- $i = 1;
- $num_files = scalar(@parents) + scalar(@progeny) + scalar(@samples);
+ my (@results, $sample_id, $sample);
- foreach $sample (@parents, @progeny, @samples) {
+ print $log_fh "Initializing the database...\n";
+
+ #
+ # Create the database.
+ #
+ if ($create_db) {
+ #
+ # Check that the database doesn't already exist.
+ #
+ if ($dry_run == false) {
+ @results = `mysql --defaults-file=$cnf -N -B -e "SHOW DATABASES LIKE '$db'"`;
+ if (scalar(@results) > 0 && $overw_db == false) {
+ die("Unable to create database '$db', it already exists.\n");
+ }
+ }
+
+ if ($overw_db == true) {
+ `mysql --defaults-file=$cnf -N -B -e "DROP DATABASE IF EXISTS $db"` if ($dry_run == false);
+ print $log_fh "mysql --defaults-file=$cnf -N -B -e \"DROP DATABASE IF EXISTS $db\"\n";
+ }
+
+ `mysql --defaults-file=$cnf -e "CREATE DATABASE $db"` if ($dry_run == false);
+ print $log_fh "mysql --defaults-file=$cnf -e \"CREATE DATABASE $db\"\n";
+ `mysql --defaults-file=$cnf $db < $mysql_tables` if ($dry_run == false);
+ print $log_fh "mysql --defaults-file=$cnf $db < $mysql_tables\n";
+ }
+
+ #
+ # Set the SQL Batch ID for this set of loci, along with description and date of
+ # sequencing. Insert this batch data into the database.
+ #
+ `mysql --defaults-file=$cnf $db -e "INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'"` if ($dry_run == false);
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
+ print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'\"\n";
- if ($suffix eq "gz") {
- ($prefix, $suffix) = ($prefix =~ /^(.+)\.(.+)$/);
- }
+ print $log_fh "Loading sample data into the MySQL database...\n";
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $pfile = $prefix;
- }
+ my $i = 1;
- printf(STDERR "Loading sstacks output to $db; file % 3s of % 3s [%s]\n", $i, $num_files, $pfile);
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
- if ($gzip == 1) {
- $file = "$out_path/" . $pfile . ".matches.tsv.gz";
- import_gzsql_file($log_fh, $file, "matches", 1);
+ if ($dry_run == false) {
+ `mysql --defaults-file=$cnf $db -e "INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$sample->{'type'}', file='$sample->{'file'}', pop_id='$sample->{'pop_id'}', group_id='$sample->{'grp_id'}'"`;
+ @results = `mysql --defaults-file=$cnf $db -N -B -e "SELECT id FROM samples WHERE sample_id=$i AND batch_id=$batch_id AND type='$sample->{'type'}' AND file='$sample->{'file'}'"`;
+ chomp $results[0];
+ $sample_id = $results[0];
- } else {
- $file = "$out_path/" . $pfile . ".matches.tsv";
- import_sql_file($log_fh, $file, "matches", 1);
+ #
+ # Save the sample ID to use when running ustacks.
+ #
+ $sample_ids->{$sample->{'file'}} = $sample_id;
}
+ print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$sample->{'type'}', file='$sample->{'file'}', pop_id='$sample->{'pop_id'}', group_id='$sample->{'grp_id'}'\"\n";
- $i++;
+ $i++;
}
+
+ print $log_fh "\n";
}
-if ($data_type eq "map") {
- #
- # Generate a set of observed haplotypes and a set of markers and generic genotypes
- #
- printf(STDERR "Generating genotypes...\n");
+sub load_sql_data {
+ my ($log_fh, $pops, $parents, $progeny, $samples) = @_;
- $cmd = $exe_path . "genotypes -b $batch_id -P $out_path -r 1 -c -s " . join(" ", @_genotypes) . " 2>&1";
- print STDERR "$cmd\n";
- print $log_fh "$cmd\n";
+ my ($pop_cnt, $sample, $num_files, $i, $file);
- if ($dry_run == 0) {
- open($pipe_fh, "$cmd |");
- while (<$pipe_fh>) {
- print $log_fh $_;
- }
- close($pipe_fh);
- }
+ print STDERR "\nComputation is complete, loading results to the database '$db'.\n";
+
+ my $pop_cnt = scalar(keys %{$pops});
- $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
- import_sql_file($log_fh, $file, "markers", 1);
+ $i = 1;
+ $num_files = scalar(@{$parents}) + scalar(@{$progeny}) + scalar(@{$samples});
- $file = "$out_path/batch_" . $batch_id . ".genotypes_1.txt";
- import_sql_file($log_fh, $file, "catalog_genotypes", 1);
-} else {
- printf(STDERR "Calculating population-level summary statistics\n");
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
- $cmd = $exe_path . "populations -b $batch_id -P $out_path -s " . join(" ", @_populations) . " 2>&1";
- print STDERR "$cmd\n";
- print $log_fh "$cmd\n";
+ printf(STDERR "Loading ustacks output to $db; file % 3s of % 3s [%s]...", $i, $num_files, $sample->{'file'});
- if ($dry_run == 0) {
- open($pipe_fh, "$cmd |");
- while (<$pipe_fh>) {
- print $log_fh $_;
- }
- close($pipe_fh);
- }
+ if ($gzip == true) {
+ $file = "$out_path/$sample->{'file'}" . ".tags.tsv.gz";
+ import_gzsql_file($log_fh, $file, "unique_tags", 1);
- $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
- import_sql_file($log_fh, $file, "markers", 1);
+ $file = "$out_path/$sample->{'file'}" . ".snps.tsv.gz";
+ import_gzsql_file($log_fh, $file, "snps", 1);
- $file = "$out_path/batch_" . $batch_id . ".sumstats.tsv";
- import_sql_file($log_fh, $file, "sumstats", $pop_cnt+1);
+ $file = "$out_path/$sample->{'file'}" . ".alleles.tsv.gz";
+ import_gzsql_file($log_fh, $file, "alleles", 1);
- $file = "$out_path/batch_" . $batch_id . ".hapstats.tsv";
- import_sql_file($log_fh, $file, "hapstats", $pop_cnt+1);
+ } else {
+ $file = "$out_path/$sample->{'file'}" . ".tags.tsv";
+ import_sql_file($log_fh, $file, "unique_tags", 1);
- #
- # Import the Fst files.
- #
- my $fst_cnt = 0;
- my (@keys, $m, $n);
- @keys = sort keys %pops;
- for ($m = 0; $m < scalar(@keys); $m++) {
- for ($n = 0; $n < scalar(@keys); $n++) {
- $file = "$out_path/batch_" . $batch_id . ".fst_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
-
- if (-e $file) {
- import_sql_file($log_fh, $file, "fst", 1);
- $fst_cnt++;
- }
- }
- }
- print STDERR "Imported $fst_cnt Fst file(s).\n";
+ $file = "$out_path/$sample->{'file'}" . ".snps.tsv";
+ import_sql_file($log_fh, $file, "snps", 1);
- #
- # Import the Phi_st files.
- #
- $fst_cnt = 0;
- for ($m = 0; $m < scalar(@keys); $m++) {
- for ($n = 0; $n < scalar(@keys); $n++) {
- $file = "$out_path/batch_" . $batch_id . ".phistats_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
-
- if (-e $file) {
- import_sql_file($log_fh, $file, "phist", 3);
- $fst_cnt++;
- }
- }
+ $file = "$out_path/$sample->{'file'}" . ".alleles.tsv";
+ import_sql_file($log_fh, $file, "alleles", 1);
+ }
+ print STDERR "done.\n";
+
+ $i++;
}
- print STDERR "Imported $fst_cnt Haplotype Fst file(s).\n";
-}
-if ($sql) {
- #
- # Index the radtags database
- #
- print STDERR "Indexing the database...\n";
- $cmd = $exe_path . "index_radtags.pl -D $db -t -c 2>&1";
- print STDERR "$cmd\n";
- print $log_fh "$cmd\n";
- @results = `$cmd` if ($dry_run == 0);
- print $log_fh @results;
-}
+ print STDERR "Importing catalog to $db...";
-print $log_fh "denovo_map.pl completed at ", strftime("%Y-%m-%d %H:%M:%S",(localtime(time))), "\n";
+ my $cat_file = "batch_" . $batch_id;
-close($log_fh);
+ if ($gzip == true) {
+ $file = "$out_path/$cat_file" . ".catalog.tags.tsv.gz";
+ import_gzsql_file($log_fh, $file, "catalog_tags", 1);
-sub parse_population_map {
- my ($samples, $pop_ids, $pops, $grp_ids, $grps) = @_;
-
- my ($fh, @parts, $line, %ids, $file, $path);
-
- if (length($popmap_path) == 0) {
- foreach $path (@{$samples}) {
- push(@{$pop_ids}, "1");
- push(@{$grp_ids}, "1");
- $pops->{"1"}++;
- $grps->{"1"}++;
- }
- return;
- }
-
- open($fh, "<$popmap_path") or die("Unable to open population map, '$popmap_path', $!\n");
+ $file = "$out_path/$cat_file" . ".catalog.snps.tsv.gz";
+ import_gzsql_file($log_fh, $file, "catalog_snps", 1);
- while ($line = <$fh>) {
- chomp $line;
- @parts = split(/\t/, $line);
+ $file = "$out_path/$cat_file" . ".catalog.alleles.tsv.gz";
+ import_gzsql_file($log_fh, $file, "catalog_alleles", 1);
- if (scalar(@parts) > 3) {
- die("Unable to parse population map, '$popmap_path' (map should contain no more than three columns).\n");
- }
+ } else {
+ $file = "$out_path/$cat_file" . ".catalog.tags.tsv";
+ import_sql_file($log_fh, $file, "catalog_tags", 1);
- $ids{$parts[0]} = $parts[1];
+ $file = "$out_path/$cat_file" . ".catalog.snps.tsv";
+ import_sql_file($log_fh, $file, "catalog_snps", 1);
- if (scalar(@parts) > 2) {
- push(@{$grp_ids}, $parts[2]);
- $grps->{$parts[2]}++;
- }
+ $file = "$out_path/$cat_file" . ".catalog.alleles.tsv";
+ import_sql_file($log_fh, $file, "catalog_alleles", 1);
}
+ print STDERR "done.\n";
- if (scalar(keys %{$grps}) == 0) {
- $grps->{"1"}++;
- }
+ #
+ # Load the sstacks results to the database if requested.
+ #
+ $i = 1;
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
- foreach $path (@{$samples}) {
- my ($prefix, $suffix);
- if ($path =~ /^.+\..+\.gz$/) {
- ($prefix, $suffix) = ($path =~ /^(.+)\.(.+)\.gz$/);
- } else {
- ($prefix, $suffix) = ($path =~ /^(.+)\.(.+)$/);
- }
+ printf(STDERR "Loading sstacks output to $db; file % 3s of % 3s [%s]...", $i, $num_files, $sample->{'file'});
- if ($prefix =~ /^.*\/.+$/) {
- ($file) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $file = $prefix;
- }
+ if ($gzip == true) {
+ $file = "$out_path/" . $sample->{'file'} . ".matches.tsv.gz";
+ import_gzsql_file($log_fh, $file, "matches", 1);
- if (!defined($ids{$file})) {
- die("Unable to find '$file' in the population map, '$popmap_path'.\n");
+ } else {
+ $file = "$out_path/" . $sample->{'file'} . ".matches.tsv";
+ import_sql_file($log_fh, $file, "matches", 1);
}
+ print STDERR "done.\n";
- push(@{$pop_ids}, $ids{$file});
- $pops->{$ids{$file}}++;
+ $i++;
}
- print STDERR "Parsed population map: ", scalar(@{$samples}), " files in ", scalar(keys %{$pops});
- scalar(keys %{$pops}) == 1 ? print STDERR " population" : print STDERR " populations";
- print STDERR " and ", scalar(keys %{$grps});
- scalar(keys %{$grps}) == 1 ? print STDERR " group.\n" : print STDERR " groups.\n";
-
- close($fh);
-}
+ if ($data_type eq "map") {
+ #
+ # Load the outputs generated by genotypes to the database.
+ #
+ $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
+ import_sql_file($log_fh, $file, "markers", 1);
-sub check_input_files {
- my ($parents, $progeny, $samples) = @_;
-
- #
- # Check that no duplicate files were specified.
- #
- my (%files, $file);
- foreach $file (@{$parents}, @{$progeny}, @{$samples}) {
- $files{$file}++;
- }
- foreach $file (keys %files) {
- if ($files{$file} > 1) {
- print STDERR "A duplicate file was specified which may create undefined results, '$file'\n";
- usage();
- }
- }
+ $file = "$out_path/batch_" . $batch_id . ".genotypes_1.txt";
+ import_sql_file($log_fh, $file, "catalog_genotypes", 1);
+
+ } else {
+ #
+ # Load the outputs generated by populations to the database.
+ #
+ $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
+ import_sql_file($log_fh, $file, "markers", 1);
+
+ $file = "$out_path/batch_" . $batch_id . ".sumstats.tsv";
+ import_sql_file($log_fh, $file, "sumstats", $pop_cnt+1);
+
+ $file = "$out_path/batch_" . $batch_id . ".hapstats.tsv";
+ import_sql_file($log_fh, $file, "hapstats", $pop_cnt+1);
+
+ #
+ # Import the Fst files.
+ #
+ my $fst_cnt = 0;
+ my (@keys, $m, $n);
+ @keys = sort keys %pops;
+ for ($m = 0; $m < scalar(@keys); $m++) {
+ for ($n = 0; $n < scalar(@keys); $n++) {
+ $file = "$out_path/batch_" . $batch_id . ".fst_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
+
+ if (-e $file) {
+ import_sql_file($log_fh, $file, "fst", 1);
+ $fst_cnt++;
+ }
+ }
+ }
+ print STDERR "Imported $fst_cnt Fst file(s).\n";
+
+ #
+ # Import the Phi_st files.
+ #
+ $fst_cnt = 0;
+ for ($m = 0; $m < scalar(@keys); $m++) {
+ for ($n = 0; $n < scalar(@keys); $n++) {
+ $file = "$out_path/batch_" . $batch_id . ".phistats_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
+
+ if (-e $file) {
+ import_sql_file($log_fh, $file, "phist", 3);
+ $fst_cnt++;
+ }
+ }
+ }
+ print STDERR "Imported $fst_cnt Haplotype Fst file(s).\n";
+ }
+
+ print $log_fh "\n";
#
- # Check that all the files exist and are accessible.
+ # Index the radtags database
#
- foreach $file (@{$parents}) {
- if (!-e $file) {
- print STDERR "Unable to locate parental file '$file'\n";
- usage();
- }
- }
- print STDERR "Found ", scalar(@{$parents}), " parental file(s).\n" if (scalar(@{$parents}) > 0);
-
- foreach $file (@{$progeny}) {
- if (!-e $file) {
- print STDERR "Unable to locate progeny file '$file'\n";
- usage();
- }
- }
- print STDERR "Found ", scalar(@{$progeny}), " progeny file(s).\n" if (scalar(@{$progeny}) > 0);
-
- foreach $file (@{$samples}) {
- if (!-e $file) {
- print STDERR "Unable to locate sample file '$file'\n";
- usage();
- }
- }
- print STDERR "Found ", scalar(@{$samples}), " sample file(s).\n" if (scalar(@{$samples}) > 0);
+ my ($cmd, @results);
+ print STDERR "Indexing the database...\n";
+ $cmd = $exe_path . "index_radtags.pl -D $db -t -c 2>&1";
+ print STDERR "$cmd\n";
+ print $log_fh "$cmd\n";
+ @results = `$cmd` if ($dry_run == false);
+ print $log_fh @results;
}
sub write_results {
@@ -564,6 +689,20 @@ sub write_results {
}
}
+sub write_depths_of_cov {
+ my ($depths, $log_fh) = @_;
+
+ print STDERR "\nDepths of Coverage for Processed Samples:\n";
+ print $log_fh "\nDepths of Coverage for Processed Samples:\n";
+
+ foreach $a (@{$depths}) {
+ print STDERR $a->[0], ": ", $a->[1], "x\n";
+ print $log_fh $a->[0], ": ", $a->[1], "x\n";
+ }
+ print STDERR "\n";
+ print $log_fh "\n";
+}
+
sub import_sql_file {
my ($log_fh, $file, $table, $skip_lines) = @_;
@@ -571,9 +710,9 @@ sub import_sql_file {
$ignore = "IGNORE $skip_lines LINES" if ($skip_lines > 0);
- @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$file' INTO TABLE $table $ignore"` if ($sql == 1 && $dry_run == 0);
+ @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$file' INTO TABLE $table $ignore"` if ($sql == true && $dry_run == false);
- if ($sql == 1) {
+ if ($sql == true) {
print $log_fh "mysql --defaults-file=$cnf $db -e \"LOAD DATA LOCAL INFILE '$file' INTO TABLE $table $ignore\"\n", @results;
}
}
@@ -590,7 +729,7 @@ sub import_gzsql_file {
#
my $tmpdir = File::Spec->tmpdir();
my $named_pipe = mktemp($tmpdir . "/denovo_map_XXXXXX");
- if ($sql == 1 && $dry_run == 0) {
+ if ($sql == true && $dry_run == false) {
mkfifo($named_pipe, 0700) || die("Unable to create named pipe for loading gzipped data: $named_pipe, $!");
print $log_fh "Streaming $file into named pipe $named_pipe.\n";
}
@@ -598,18 +737,18 @@ sub import_gzsql_file {
#
# Dump our gzipped data onto the named pipe.
#
- system("gunzip -c $file > $named_pipe &") if ($sql == 1 && $dry_run == 0);
+ system("gunzip -c $file > $named_pipe &") if ($sql == true && $dry_run == false);
- @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$named_pipe' INTO TABLE $table $ignore"` if ($sql == 1 && $dry_run == 0);
+ @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$named_pipe' INTO TABLE $table $ignore"` if ($sql == true && $dry_run == false);
- if ($sql == 1) {
+ if ($sql == true) {
print $log_fh "mysql --defaults-file=$cnf $db -e \"LOAD DATA LOCAL INFILE '$named_pipe' INTO TABLE $table $ignore\"\n", @results;
}
#
# Remove the pipe.
#
- unlink($named_pipe) if ($sql == 1 && $dry_run == 0);
+ unlink($named_pipe) if ($sql == true && $dry_run == false);
}
sub parse_command_line {
@@ -617,25 +756,43 @@ sub parse_command_line {
while (@ARGV) {
$_ = shift @ARGV;
- if ($_ =~ /^-p$/) { push(@parents, shift @ARGV); }
- elsif ($_ =~ /^-r$/) { push(@progeny, shift @ARGV); }
- elsif ($_ =~ /^-s$/) { push(@samples, shift @ARGV); }
- elsif ($_ =~ /^-d$/) { $dry_run++; }
- elsif ($_ =~ /^-o$/) { $out_path = shift @ARGV; }
- elsif ($_ =~ /^-D$/) { $desc = shift @ARGV; }
- elsif ($_ =~ /^-e$/) { $exe_path = shift @ARGV; }
- elsif ($_ =~ /^-b$/) { $batch_id = shift @ARGV; }
- elsif ($_ =~ /^-i$/) { $sample_id = shift @ARGV; }
- elsif ($_ =~ /^-a$/) { $date = shift @ARGV; }
- elsif ($_ =~ /^-S$/) { $sql = 0; }
- elsif ($_ =~ /^-B$/) { $db = shift @ARGV; }
- elsif ($_ =~ /^-m$/) { $min_cov = shift @ARGV; }
- elsif ($_ =~ /^-P$/) { $min_rcov = shift @ARGV; }
- elsif ($_ =~ /^-O$/) {
+ if ($_ =~ /^-v$/) { version(); exit(); }
+ elsif ($_ =~ /^-h$/) { usage(); }
+ elsif ($_ =~ /^-p$/) { push(@parents, { 'path' => shift @ARGV }); }
+ elsif ($_ =~ /^-r$/) { push(@progeny, { 'path' => shift @ARGV }); }
+ elsif ($_ =~ /^-s$/) { push(@samples, { 'path' => shift @ARGV }); }
+ elsif ($_ =~ /^-d$/) { $dry_run = true; }
+ elsif ($_ =~ /^-o$/) { $out_path = shift @ARGV; }
+ elsif ($_ =~ /^-D$/) { $desc = shift @ARGV; }
+ elsif ($_ =~ /^-e$/) { $exe_path = shift @ARGV; }
+ elsif ($_ =~ /^-b$/) { $batch_id = shift @ARGV; }
+ elsif ($_ =~ /^-i$/) { $sample_id = shift @ARGV; }
+ elsif ($_ =~ /^-a$/) { $date = shift @ARGV; }
+ elsif ($_ =~ /^-S$/) { $sql = false; }
+ elsif ($_ =~ /^-B$/) { $db = shift @ARGV; }
+ elsif ($_ =~ /^-m$/) { $min_cov = shift @ARGV; }
+ elsif ($_ =~ /^-P$/) { $min_rcov = shift @ARGV; }
+ elsif ($_ =~ /^--samples$/) {
+ $sample_path = shift @ARGV;
+
+ } elsif ($_ =~ /^-O$/) {
$popmap_path = shift @ARGV;
push(@_populations, "-M " . $popmap_path);
- } elsif ($_ =~ /^-A$/) {
+ } elsif ($_ =~ /^--gapped$/) {
+ $gapped_alns = true;
+ push(@_ustacks, "--gapped ");
+ push(@_cstacks, "--gapped ");
+ push(@_sstacks, "--gapped ");
+
+ } elsif ($_ =~ /^--create_db$/) {
+ $create_db = true;
+
+ } elsif ($_ =~ /^--overw_db$/) {
+ $overw_db = true;
+ $create_db = true;
+
+ } elsif ($_ =~ /^-A$/) {
$arg = shift @ARGV;
push(@_genotypes, "-t " . $arg);
@@ -646,7 +803,7 @@ sub parse_command_line {
}
} elsif ($_ =~ /^-t$/) {
- push(@_ustacks, "-d -r");
+ push(@_ustacks, "-d ");
} elsif ($_ =~ /^-T$/) {
$arg = shift @ARGV;
@@ -706,8 +863,6 @@ sub parse_command_line {
usage();
}
}
- elsif ($_ =~ /^-v$/) { version(); exit(); }
- elsif ($_ =~ /^-h$/) { usage(); }
else {
print STDERR "Unknown command line option: '$_'\n";
usage();
@@ -722,7 +877,7 @@ sub parse_command_line {
usage();
}
- if ($sql > 0 && length($date) == 0) {
+ if ($sql == true && length($date) == 0) {
$date = strftime("%Y-%m-%d", (localtime(time)));
}
@@ -731,12 +886,16 @@ sub parse_command_line {
usage();
}
- if (scalar(@parents) == 0 && scalar(@samples) == 0) {
+ if (scalar(@parents) == 0 && scalar(@samples) == 0 && length($popmap_path) == 0) {
print STDERR "You must specify at least one parent or sample file.\n";
usage();
}
- if (scalar(@samples) > 0) {
+ if (length($sample_path) > 0) {
+ $sample_path .= "/" if (substr($sample_path, -1) ne "/");
+ }
+
+ if (scalar(@samples) > 0 || length($popmap_path) > 0) {
$data_type = "population";
} else {
$data_type = "map";
@@ -752,33 +911,41 @@ sub usage {
print STDERR <<EOQ;
denovo_map.pl -p path -r path [-s path] -o path [-t] [-m min_cov] [-M mismatches] [-n mismatches] [-T num_threads] [-A type] [-O popmap] [-B db -b batch_id -D "desc"] [-S -i num] [-e path] [-d] [-h]
- p: path to a FASTQ/FASTA file containing parent sequences from a mapping cross.
- r: path to a FASTQ/FASTA file containing progeny sequences from a mapping cross.
- s: path to a FASTQ/FASTA file containing an individual sample from a population.
+ b: batch ID representing this dataset (an integer, e.g. 1, 2, 3).
o: path to write pipeline output files.
- A: if processing a genetic map, specify the cross type, 'CP', 'F2', 'BC1', 'DH', or 'GEN'.
O: if analyzing one or more populations, specify a pOpulation map.
+ A: if processing a genetic map, specify the cross type, 'CP', 'F2', 'BC1', 'DH', or 'GEN'.
T: specify the number of threads to execute.
e: executable path, location of pipeline programs.
d: perform a dry run. Do not actually execute any programs, just print what would be executed.
h: display this help message.
+ Specify each sample separately:
+ p: path to a FASTQ/FASTA file containing one set of parent sequences from a mapping cross.
+ r: path to a FASTQ/FASTA file containing one set of progeny sequences from a mapping cross.
+ s: path to a FASTQ/FASTA file containing an individual sample from a population.
+ Specify a path to samples and provide a population map:
+ --samples <path>: specify a path to the directory of samples (samples will be read from population map).
+
Stack assembly options:
m: specify a minimum number of identical, raw reads required to create a stack.
- P: specify a minimum number of identical, raw reads required to create a stack in 'progeny' individuals.
M: specify the number of mismatches allowed between loci when processing a single individual (default 2).
- N: specify the number of mismatches allowed when aligning secondary reads to primary stacks (default M+2).
n: specify the number of mismatches allowed between loci when building the catalog (default 1).
- t: remove, or break up, highly repetitive RAD-Tags in the ustacks program.
- H: disable calling haplotypes from secondary reads.
+ --gapped: perform gapped assemblies in ustacks, cstacks, and sstacks (default: off).
+
+ Advanced (rarely used) options:
+ P: specify a minimum number of identical, raw reads required to create a stack in 'progeny' individuals.
+ N: specify the number of mismatches allowed when aligning secondary reads to primary stacks (default M+2).
+ t: remove, or break up, highly repetitive RAD-Tags in the ustacks program.
+ H: disable calling haplotypes from secondary reads.
Database options:
- b: batch ID representing this dataset.
B: specify a database to load data into.
- D: batch description.
- a: batch run date, yyyy-mm-dd, if not provided, current date will be used.
+ D: a description of this batch to be stored in the database.
S: disable recording SQL data in the database.
i: starting sample_id, this is determined automatically if database interaction is enabled.
+ --create_db: create the database specified by '-B' and populate the tables.
+ --overw_db: delete the database before creating a new copy of it (turns on --create_db).
SNP Model Options (these options are passed on to ustacks):
--bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).
@@ -790,5 +957,5 @@ denovo_map.pl -p path -r path [-s path] -o path [-t] [-m min_cov] [-M mismatches
EOQ
-exit(0);
+ exit(0);
}
diff --git a/scripts/ref_map.pl b/scripts/ref_map.pl
index 0448dcb..d6ed872 100755
--- a/scripts/ref_map.pl
+++ b/scripts/ref_map.pl
@@ -1,6 +1,6 @@
#!/usr/bin/env perl
#
-# Copyright 2010-2015, Julian Catchen <jcatchen at illinois.edu>
+# Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
#
# This file is part of Stacks.
#
@@ -33,18 +33,28 @@ use File::Temp qw/ mktemp /;
use File::Spec;
use constant stacks_version => "_VERSION_";
-my $dry_run = 0;
-my $sql = 1;
+use constant true => 1;
+use constant false => 0;
+
+my $dry_run = false;
+my $sql = true;
+my $create_db = false;
+my $overw_db = false;
my $mysql_config = "_PKGDATADIR_" . "sql/mysql.cnf";
+my $mysql_tables = "_PKGDATADIR_" . "sql/stacks.sql";
my $exe_path = "_BINDIR_";
my $out_path = "";
my $popmap_path = "";
+my $sample_path = "";
my $db = "";
my $data_type = "map";
+my $min_cov = 0;
+my $min_rcov = 0;
my $batch_id = -1;
my $sample_id = 1;
my $desc = ""; # Database description of this dataset
my $date = ""; # Date relevent to this data, formatted for SQL: 2009-05-31
+my $gzip = false;
my @parents;
my @progeny;
@@ -56,46 +66,25 @@ my $cmd_str = $0 . " " . join(" ", @ARGV);
parse_command_line();
-check_input_files(\@parents, \@progeny, \@samples);
-
my $cnf = (-e $ENV{"HOME"} . "/.my.cnf") ? $ENV{"HOME"} . "/.my.cnf" : $mysql_config;
#
# Check for the existence of the necessary pipeline programs
#
-die ("Unable to find '" . $exe_path . "pstacks'.\n") if (!-e $exe_path . "pstacks" || !-x $exe_path . "pstacks");
-die ("Unable to find '" . $exe_path . "cstacks'.\n") if (!-e $exe_path . "cstacks" || !-x $exe_path . "cstacks");
-die ("Unable to find '" . $exe_path . "sstacks'.\n") if (!-e $exe_path . "sstacks" || !-x $exe_path . "sstacks");
-die ("Unable to find '" . $exe_path . "genotypes'.\n") if (!-e $exe_path . "genotypes" || !-x $exe_path . "genotypes");
-die ("Unable to find '" . $exe_path . "populations'.\n") if (!-e $exe_path . "populations" || !-x $exe_path . "populations");
+die ("Unable to find '" . $exe_path . "pstacks'.\n") if (!-e $exe_path . "pstacks" || !-x $exe_path . "pstacks");
+die ("Unable to find '" . $exe_path . "cstacks'.\n") if (!-e $exe_path . "cstacks" || !-x $exe_path . "cstacks");
+die ("Unable to find '" . $exe_path . "sstacks'.\n") if (!-e $exe_path . "sstacks" || !-x $exe_path . "sstacks");
+die ("Unable to find '" . $exe_path . "genotypes'.\n") if (!-e $exe_path . "genotypes" || !-x $exe_path . "genotypes");
+die ("Unable to find '" . $exe_path . "populations'.\n") if (!-e $exe_path . "populations" || !-x $exe_path . "populations");
die ("Unable to find '" . $exe_path . "index_radtags.pl'.\n") if (!-e $exe_path . "index_radtags.pl" || !-x $exe_path . "index_radtags.pl");
-my ($i, $log, $log_fh, $pipe_fh, $pfile, $file, $num_files, $parent, $sample, %map);
-
-$i = 1;
-$num_files = scalar(@parents) + scalar(@progeny) + scalar(@samples);
-
-my (@types, $type, @pop_ids, $pop, %pops, @grp_ids, $grp, %grps);
+my ($log, $log_fh, $sample);
-parse_population_map(\@samples, \@pop_ids, \%pops, \@grp_ids, \%grps) if ($data_type eq "population");
-
-foreach $parent (@parents) {
- push(@types, "parent");
- push(@pop_ids, "1");
- push(@grp_ids, "1");
-}
-foreach $parent (@progeny) {
- push(@types, "progeny");
- push(@pop_ids, "1");
- push(@grp_ids, "1");
-}
-foreach $parent (@samples) {
- push(@types, "sample");
-}
+my (@sample_list, %pop_ids, %pops, %grp_ids, %grps, %sample_ids);
-my (@results, $cmd, $pop_cnt);
+parse_population_map(\@sample_list, \%pop_ids, \%pops, \%grp_ids, \%grps);
-$pop_cnt = scalar(keys %pops);
+initialize_samples(\@parents, \@progeny, \@samples, \@sample_list, \%pop_ids, \%grp_ids);
#
# Open the log file
@@ -104,423 +93,567 @@ $log = "$out_path/ref_map.log";
open($log_fh, ">$log") or die("Unable to open log file '$log'; $!\n");
print $log_fh
- "ref_map.pl version ", stacks_version, " started at ", strftime("%Y-%m-%d %H:%M:%S",(localtime(time))), "\n",
- $cmd_str, "\n";
+ "ref_map.pl version ", stacks_version, " started at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n",
+ $cmd_str, "\n\n";
-if ($sql == 1) {
- #
- # SQL Batch ID for this set of Radtags, along with description and date of
- # sequencing. Insert this batch data into the database.
- #
- `mysql --defaults-file=$cnf $db -e "INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'"` if ($dry_run == 0);
+initialize_database($log_fh, \@parents, \@progeny, \@samples, \%sample_ids) if ($sql == true);
- print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'\"\n";
-}
+execute_stacks($log_fh, $sample_id, \@parents, \@progeny, \@samples, \%sample_ids);
-my $gzip = 0;
+load_sql_data($log_fh, \%pops, \@parents, \@progeny, \@samples) if ($sql == true);
-foreach $sample (@parents, @progeny, @samples) {
- my ($ftype, $pfile) = "";
+print $log_fh "\nref_map.pl completed at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n";
+close($log_fh);
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $pfile = $prefix;
- }
+sub execute_stacks {
+ my ($log_fh, $sample_id, $parents, $progeny, $samples, $sample_ids) = @_;
+
+ my (@results, @depths_of_cov);
+ my ($pop_cnt, $sample, $num_files, $i, $cmd, $pipe_fh, $path, $cat_file);
- if ($suffix =~ /^bowtie$/) {
- $ftype = "bowtie";
- } elsif ($suffix =~ /^sam$/) {
- $ftype = "sam";
- } elsif ($suffix =~ /^bam$/) {
- $ftype = "bam";
- $gzip = 1;
- } elsif ($suffix =~ /^map$/) {
- $ftype = "tsv";
- } else {
- die("Unknown input file type.\n");
- }
+ my $minc = $min_cov > 0 ? "-m $min_cov" : "";
+ my $minrc = $min_rcov > 0 ? "-m $min_rcov" : $minc;
- $type = shift @types;
- $pop = shift @pop_ids;
- $grp = shift @grp_ids;
+ $i = 1;
+ $num_files = scalar(@{$parents}) + scalar(@{$progeny}) + scalar(@{$samples});
- printf("Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $pfile);
- printf($log_fh "Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $pfile);
+ #
+ # Assemble RAD loci in each individual.
+ #
+ foreach $sample (@parents, @progeny, @samples) {
- if ($sql == 1) {
- if ($dry_run == 0) {
- `mysql --defaults-file=$cnf $db -e "INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$type', file='$pfile', pop_id='$pop', group_id='$grp'"`;
- @results = `mysql --defaults-file=$cnf $db -N -B -e "SELECT id FROM samples WHERE sample_id=$i AND batch_id=$batch_id AND type='$type' AND file='$pfile'"`;
- chomp $results[0];
- $sample_id = $results[0];
- }
- print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$type', file='$pfile', pop_id='$pop', group_id='$grp'\"\n";
+ printf("Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $sample->{'file'});
+ printf($log_fh "Identifying unique stacks; file % 3s of % 3s [%s]\n", $i, $num_files, $sample->{'file'});
+
+ if (scalar(keys %{$sample_ids}) > 0) {
+ $sample_id = $sample_ids->{$sample->{'file'}};
+ }
+
+ $path = $sample->{'path'} . $sample->{'file'} . "." . $sample->{'suffix'};
+
+ if ($sample->{'type'} eq "sample") {
+ $cmd = $exe_path . "pstacks -t $sample->{'fmt'} -f $path -o $out_path -i $sample_id $minc " . join(" ", @_pstacks) . " 2>&1";
+ } elsif ($sample->{'type'} eq "parent") {
+ $cmd = $exe_path . "pstacks -t $sample->{'fmt'} -f $path -o $out_path -i $sample_id $minc " . join(" ", @_pstacks) . " 2>&1";
+ } elsif ($sample->{'type'} eq "progeny") {
+ $cmd = $exe_path . "pstacks -t $sample->{'fmt'} -f $path -o $out_path -i $sample_id $minrc " . join(" ", @_pstacks) . " 2>&1";
+ }
+ print STDERR " $cmd\n";
+ print $log_fh "$cmd\n";
+
+ if ($dry_run == false) {
+ @results = `$cmd`;
+
+ #
+ # Pull the depth of coverage from pstacks.
+ #
+ my @lines = grep(/Mean coverage depth is/, @results);
+ my ($depth) = ($lines[0] =~ /^ Mean coverage depth is (\d+\.?\d*); Std Dev: .+; Max: .+$/);
+ push(@depths_of_cov, [$sample->{'file'}, $depth]);
+ }
+ write_results(\@results, $log_fh);
+
+ $i++;
+ $sample_id++;
}
- $map{$pfile} = $sample_id;
-
- $cmd = $exe_path . "pstacks -t $ftype -f $sample -o $out_path -i $sample_id " . join(" ", @_pstacks) . " 2>&1";
- print STDERR " $cmd\n";
- print $log_fh "$cmd\n";
- @results = `$cmd` if ($dry_run == 0);
- write_results(\@results, $log_fh);
-
- print STDERR " Loading pstacks output to $db..." if ($sql == 1);
+ write_depths_of_cov(\@depths_of_cov, $log_fh);
- if ($gzip == 1) {
- $file = "$out_path/$pfile" . ".tags.tsv.gz";
- import_gzsql_file($log_fh, $file, "unique_tags", 1);
-
- $file = "$out_path/$pfile" . ".snps.tsv.gz";
- import_gzsql_file($log_fh, $file, "snps", 1);
+ #
+ # Generate catalog of RAD loci.
+ #
+ print STDERR "Generating catalog...\n";
+ my $file_paths = "";
+ foreach $sample (@parents, @samples) {
+ $file_paths .= "-s $out_path/$sample->{'file'} ";
+ }
- $file = "$out_path/$pfile" . ".alleles.tsv.gz";
- import_gzsql_file($log_fh, $file, "alleles", 1);
+ $cmd = $exe_path . "cstacks -g -b $batch_id -o $out_path $file_paths " . join(" ", @_cstacks) . " 2>&1";
+ print STDERR " $cmd\n";
+ print $log_fh "$cmd\n\n";
+
+ if ($dry_run == false) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ if ($_ =~ /failed/i) { print STDERR "Catalog construction failed.\n"; exit(1); }
+ }
+ close($pipe_fh);
+ }
- } else {
- $file = "$out_path/$pfile" . ".tags.tsv";
- import_sql_file($log_fh, $file, "unique_tags", 1);
+ #
+ # Match parents, progeny, or samples to the catalog.
+ #
+ $file_paths = "";
+ print STDERR "Matching samples to the catalog...\n";
- $file = "$out_path/$pfile" . ".snps.tsv";
- import_sql_file($log_fh, $file, "snps", 1);
+ foreach $sample (@parents, @progeny, @samples) {
+ $file_paths .= "-s $out_path/$sample->{'file'} ";
+ }
- $file = "$out_path/$pfile" . ".alleles.tsv";
- import_sql_file($log_fh, $file, "alleles", 1);
+ $cat_file = "batch_" . $batch_id;
+ $cmd = $exe_path . "sstacks -g -b $batch_id -c $out_path/$cat_file -o $out_path $file_paths " . join(" ", @_sstacks) . " 2>&1";
+ print STDERR " $cmd\n";
+ print $log_fh "$cmd\n\n";
+
+ if ($dry_run == false) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ }
+ close($pipe_fh);
}
- print STDERR "done.\n" if ($sql == 1);
- $i++;
+ if ($data_type eq "map") {
+ #
+ # Generate a set of observed haplotypes and a set of markers and generic genotypes
+ #
+ printf(STDERR "Generating genotypes...\n");
- $sample_id++ if ($sql == 0);
-}
+ $cmd = $exe_path . "genotypes -b $batch_id -P $out_path -r 1 -c -s " . join(" ", @_genotypes) . " 2>&1";
+ print STDERR "$cmd\n";
+ print $log_fh "$cmd\n";
-my ($pfile, $cat_file);
+ if ($dry_run == 0) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ }
+ close($pipe_fh);
+ }
-#
-# Generate catalog of RAD-Tags
-#
-print STDERR "Generating catalog...\n";
-my $file_paths = "";
-
-foreach $sample (@parents, @samples) {
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
-
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
} else {
- $pfile = $prefix;
- }
-
- $file_paths .= "-s $out_path/$pfile ";
-}
-
-$cat_file = "batch_" . $batch_id;
-$cmd = $exe_path . "cstacks -g -b $batch_id -o $out_path $file_paths " . join(" ", @_cstacks) . " 2>&1";
-print STDERR " $cmd\n";
-print $log_fh "$cmd\n";
-
-if ($dry_run == 0) {
- open($pipe_fh, "$cmd |");
- while (<$pipe_fh>) {
- print $log_fh $_;
- if ($_ =~ /failed/i) { print STDERR "Catalog construction failed.\n"; exit(1); }
+ printf(STDERR "Calculating population-level summary statistics\n");
+
+ $cmd = $exe_path . "populations -b $batch_id -P $out_path -s " . join(" ", @_populations) . " 2>&1";
+ print STDERR "$cmd\n";
+ print $log_fh "$cmd\n";
+
+ if ($dry_run == 0) {
+ open($pipe_fh, "$cmd |");
+ while (<$pipe_fh>) {
+ print $log_fh $_;
+ }
+ close($pipe_fh);
+ }
}
- close($pipe_fh);
}
-print STDERR " Importing catalog to MySQL database..." if ($sql == 1);
-
-if ($gzip == 1) {
- $file = "$out_path/$cat_file" . ".catalog.tags.tsv.gz";
- import_gzsql_file($log_fh, $file, "catalog_tags", 1);
+sub parse_population_map {
+ my ($sample_list, $pop_ids, $pops, $grp_ids, $grps) = @_;
- $file = "$out_path/$cat_file" . ".catalog.snps.tsv.gz";
- import_gzsql_file($log_fh, $file, "catalog_snps", 1);
+ my ($fh, @parts, $line, $sample);
- $file = "$out_path/$cat_file" . ".catalog.alleles.tsv.gz";
- import_gzsql_file($log_fh, $file, "catalog_alleles", 1);
+ return if (length($popmap_path) == 0);
+
+ open($fh, "<$popmap_path") or die("Unable to open population map, '$popmap_path', $!\n");
-} else {
- $file = "$out_path/$cat_file" . ".catalog.tags.tsv";
- import_sql_file($log_fh, $file, "catalog_tags", 1);
+ while ($line = <$fh>) {
+ chomp $line;
- $file = "$out_path/$cat_file" . ".catalog.snps.tsv";
- import_sql_file($log_fh, $file, "catalog_snps", 1);
+ next if ($line =~ /^\s*#/);
+
+ @parts = split(/\t/, $line);
- $file = "$out_path/$cat_file" . ".catalog.alleles.tsv";
- import_sql_file($log_fh, $file, "catalog_alleles", 1);
-}
-print STDERR "done.\n" if ($sql == 1);
+ if (scalar(@parts) > 3) {
+ die("Unable to parse population map, '$popmap_path' (map should contain no more than three columns).\n");
+ }
-#
-# Match parents and progeny to the catalog
-#
-$file_paths = "";
-print STDERR "Matching samples to the catalog...\n";
+ push(@{$sample_list}, $parts[0]);
-foreach $sample (@parents, @progeny, @samples) {
+ $pop_ids->{$parts[0]} = $parts[1];
+ $pops->{$parts[1]}++;
+
+ if (scalar(@parts) > 2) {
+ $grp_ids->{$parts[0]} = $parts[2];
+ $grps->{$parts[2]}++;
+ }
+ }
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
+ if (scalar(keys %{$grps}) == 0) {
+ $grps->{"1"}++;
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $pfile = $prefix;
+ foreach $sample (@{$sample_list}) {
+ $grp_ids->{$sample} = "1";
+ }
}
- $file_paths .= "-s $out_path/$pfile ";
+ print STDERR "Parsed population map: ", scalar(@{$sample_list}), " files in ", scalar(keys %{$pops});
+ scalar(keys %{$pops}) == 1 ? print STDERR " population" : print STDERR " populations";
+ print STDERR " and ", scalar(keys %{$grps});
+ scalar(keys %{$grps}) == 1 ? print STDERR " group.\n" : print STDERR " groups.\n";
+
+ close($fh);
}
-$cmd = $exe_path . "sstacks -g -b $batch_id -c $out_path/$cat_file -o $out_path $file_paths " . join(" ", @_sstacks) . " 2>&1";
-print STDERR " $cmd\n";
-print $log_fh "$cmd\n";
-@results = `$cmd` if ($dry_run == 0);
-print $log_fh @results;
+sub initialize_samples {
+ my ($parents, $progeny, $samples, $sample_list, $pop_ids, $grp_ids) = @_;
+
+ my ($local_gzip, $file, $prefix, $suffix, $path, $found, $i);
+
+ if (length(scalar(@{$sample_list})) > 0 && scalar(@{$samples}) == 0) {
+ my @suffixes = ("sam", "bam", "map", "bowtie");
+ my @fmts = ("sam", "bam", "map", "bowtie");
+
+ #
+ # If a population map was specified and no samples were provided on the command line.
+ #
+ foreach $sample (@{$sample_list}) {
+ $found = false;
+
+ for ($i = 0; $i < scalar(@suffixes); $i++) {
+ $path = $sample_path . $sample . "." . $suffixes[$i];
+ if (-e $path) {
+
+ if ($i == 1) {
+ $gzip = true;
+ }
+
+ push(@{$samples}, {'path' => $sample_path,
+ 'file' => $sample,
+ 'suffix' => $suffixes[$i],
+ 'type' => "sample",
+ 'fmt' => $fmts[$i]});
+ $found = true;
+ last;
+ }
+ }
+
+ if ($found == false) {
+ die("Unable to find sample '$sample' in directory '$sample_path' as specified in the population map, '$popmap_path'.\n");
+ }
+ }
+
+ } else {
+ #
+ # Process any samples were specified on the command line.
+ #
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
+
+ ($prefix, $suffix) = ($sample->{'path'} =~ /^(.+)\.(.+)$/);
+
+ $sample->{'suffix'} = $suffix;
+
+ if ($prefix =~ /^.*\/.+$/) {
+ ($path, $file) = ($prefix =~ /^(.*\/)(.+)$/);
+ } else {
+ $file = $prefix;
+ $path = "";
+ }
+
+ $sample->{'path'} = $path;
+ $sample->{'file'} = $file;
+
+ if ($suffix =~ /^bam$/) {
+ $sample->{'fmt'} = "bam";
+ $gzip = true;
+ } elsif ($suffix =~ /^sam$/) {
+ $sample->{'fmt'} = "sam";
+ } elsif ($suffix =~ /^map$/) {
+ $sample->{'fmt'} = "map";
+ } elsif ($suffix =~ /^bowtie$/) {
+ $sample->{'fmt'} = "bowtie";
+ } else {
+ die("Unknown input file type for file '" . $sample->{'path'} . "'.\n");
+ }
-#
-# Load the sstacks results to the database if requested.
-#
-if ($sql == 1) {
- $i = 1;
- $num_files = scalar(@parents) + scalar(@progeny) + scalar(@samples);
+ $path = $sample->{'path'} . $sample->{'file'} . "." . $sample->{'suffix'};
+
+ if (!-e $path) {
+ die("Unable to locate sample file '$path'\n");
+ }
+ }
+
+ foreach $sample (@parents) {
+ $sample->{'type'} = "parent";
+ }
+ foreach $sample (@progeny) {
+ $sample->{'type'} = "progeny";
+ }
+ foreach $sample (@samples) {
+ $sample->{'type'} = "sample";
+ }
+ }
- foreach $sample (@parents, @progeny, @samples) {
+ #
+ # If a population map was specified, make sure all samples in the list were found (and vice versa) and assign popualtion IDs.
+ #
+ if (scalar(@{$sample_list}) > 0) {
+
+ my %sample_hash;
+
+ foreach $sample (@{$samples}) {
+ $sample_hash{$sample->{'file'}}++;
+
+ if (!defined($pop_ids->{$sample->{'file'}})) {
+ die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n");
+ } else {
+ $sample->{'pop_id'} = $pop_ids->{$sample->{'file'}};
+ }
+ if (!defined($grp_ids->{$sample->{'file'}})) {
+ die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n");
+ } else {
+ $sample->{'grp_id'} = $grp_ids->{$sample->{'file'}};
+ }
+ }
+
+ foreach $sample (@{$sample_list}) {
+ if (!defined($sample_hash{$sample})) {
+ die("Unable to find a file corresponding to the population map entry '" . $sample . "' in the population map, '$popmap_path'.\n");
+ }
+ }
- my ($prefix, $suffix) = ($sample =~ /^(.+)\.(.+)$/);
+ } else {
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
+ $sample->{'pop_id'} = "1";
+ $sample->{'grp_id'} = "1";
+ $pop_ids->{$sample->{'file'}} = $sample->{'pop_id'};
+ $grp_ids->{$sample->{'file'}} = $sample->{'grp_id'};
+ }
+ }
- if ($prefix =~ /^.*\/.+$/) {
- ($pfile) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $pfile = $prefix;
+ #
+ # Check that no duplicate files were specified.
+ #
+ my (%files, $file);
+ foreach $file (@{$parents}, @{$progeny}, @{$samples}) {
+ $files{$file}++;
+ }
+ foreach $file (keys %files) {
+ if ($files{$file} > 1) {
+ die("A duplicate file was specified which may create undefined results, '$file'\n");
}
+ }
- printf(STDERR "Loading sstacks output to $db; file % 3s of % 3s [%s]\n", $i, $num_files, $pfile);
+ print STDERR "Found ", scalar(@{$parents}), " parental file(s).\n" if (scalar(@{$parents}) > 0);
+ print STDERR "Found ", scalar(@{$progeny}), " progeny file(s).\n" if (scalar(@{$progeny}) > 0);
+ print STDERR "Found ", scalar(@{$samples}), " sample file(s).\n" if (scalar(@{$samples}) > 0);
+}
- if ($gzip == 1) {
- $file = "$out_path/" . $pfile . ".matches.tsv.gz";
- import_gzsql_file($log_fh, $file, "matches", 1);
+sub initialize_database {
+ my ($log_fh, $parents, $progeny, $samples, $sample_ids) = @_;
- } else {
- $file = "$out_path/" . $pfile . ".matches.tsv";
- import_sql_file($log_fh, $file, "matches", 1);
- }
+ my (@results, $sample_id, $sample);
- $i++;
- }
-}
+ print $log_fh "Initializing the database...\n";
-if ($data_type eq "map") {
#
- # Generate a set of observed haplotypes and a set of markers and generic genotypes
+ # Create the database.
#
- printf(STDERR "Generating genotypes...\n");
-
- $cmd = $exe_path . "genotypes -b $batch_id -P $out_path -r 1 -c -s " . join(" ", @_genotypes) . " 2>&1";
- print STDERR "$cmd\n";
- print $log_fh "$cmd\n";
-
- if ($dry_run == 0) {
- open($pipe_fh, "$cmd |");
- while (<$pipe_fh>) {
- print $log_fh $_;
- }
- close($pipe_fh);
+ if ($create_db) {
+ #
+ # Check that the database doesn't already exist.
+ #
+ if ($dry_run == false) {
+ @results = `mysql --defaults-file=$cnf -N -B -e "SHOW DATABASES LIKE '$db'"`;
+ if (scalar(@results) > 0 && $overw_db == false) {
+ die("Unable to create database '$db', it already exists.\n");
+ }
+ }
+
+ if ($overw_db == true) {
+ `mysql --defaults-file=$cnf -N -B -e "DROP DATABASE IF EXISTS $db"` if ($dry_run == false);
+ print $log_fh "mysql --defaults-file=$cnf -N -B -e \"DROP DATABASE IF EXISTS '$db'\"\n";
+ }
+
+ `mysql --defaults-file=$cnf -e "CREATE DATABASE $db"` if ($dry_run == false);
+ print $log_fh "mysql --defaults-file=$cnf $db -e \"CREATE DATABASE $db\"\n";
+ `mysql --defaults-file=$cnf $db < $mysql_tables` if ($dry_run == false);
+ print $log_fh "mysql --defaults-file=$cnf $db < $mysql_tables\n";
}
- $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
- import_sql_file($log_fh, $file, "markers", 1);
-
- $file = "$out_path/batch_" . $batch_id . ".genotypes_1.txt";
- import_sql_file($log_fh, $file, "catalog_genotypes", 1);
-
-} else {
- printf(STDERR "Calculating population-level summary statistics\n");
+ #
+ # Set the SQL Batch ID for this set of loci, along with description and date of
+ # sequencing. Insert this batch data into the database.
+ #
+ `mysql --defaults-file=$cnf $db -e "INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'"` if ($dry_run == false);
- $cmd = $exe_path . "populations -b $batch_id -P $out_path -s " . join(" ", @_populations) . " 2>&1";
- print STDERR "$cmd\n";
- print $log_fh "$cmd\n";
+ print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO batches SET id=$batch_id, description='$desc', date='$date', type='$data_type'\"\n";
- if ($dry_run == 0) {
- open($pipe_fh, "$cmd |");
- while (<$pipe_fh>) {
- print $log_fh $_;
- }
- close($pipe_fh);
- }
+ print $log_fh "Loading sample data into the MySQL database...\n";
- $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
- import_sql_file($log_fh, $file, "markers", 1);
+ my $i = 1;
- $file = "$out_path/batch_" . $batch_id . ".sumstats.tsv";
- import_sql_file($log_fh, $file, "sumstats", $pop_cnt+1);
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
- $file = "$out_path/batch_" . $batch_id . ".hapstats.tsv";
- import_sql_file($log_fh, $file, "hapstats", $pop_cnt+1);
+ if ($dry_run == false) {
+ `mysql --defaults-file=$cnf $db -e "INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$sample->{'type'}', file='$sample->{'file'}', pop_id='$sample->{'pop_id'}', group_id='$sample->{'grp_id'}'"`;
+ @results = `mysql --defaults-file=$cnf $db -N -B -e "SELECT id FROM samples WHERE sample_id=$i AND batch_id=$batch_id AND type='$sample->{'type'}' AND file='$sample->{'file'}'"`;
+ chomp $results[0];
+ $sample_id = $results[0];
- #
- # Import the Fst files.
- #
- my $fst_cnt = 0;
- my (@keys, $m, $n);
- @keys = sort keys %pops;
- for ($m = 0; $m < scalar(@keys); $m++) {
- for ($n = 0; $n < scalar(@keys); $n++) {
- $file = "$out_path/batch_" . $batch_id . ".fst_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
-
- if (-e $file) {
- import_sql_file($log_fh, $file, "fst", 1);
- $fst_cnt++;
- }
+ #
+ # Save the sample ID to use when running pstacks.
+ #
+ $sample_ids->{$sample->{'file'}} = $sample_id;
}
- }
- print STDERR "Imported $fst_cnt SNP Fst file(s).\n";
+ print $log_fh "mysql --defaults-file=$cnf $db -e \"INSERT INTO samples SET sample_id=$i, batch_id=$batch_id, type='$sample->{'type'}', file='$sample->{'file'}', pop_id='$sample->{'pop_id'}', group_id='$sample->{'grp_id'}'\"\n";
- #
- # Import the Phi_st files.
- #
- $fst_cnt = 0;
- for ($m = 0; $m < scalar(@keys); $m++) {
- for ($n = 0; $n < scalar(@keys); $n++) {
- $file = "$out_path/batch_" . $batch_id . ".phistats_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
-
- if (-e $file) {
- import_sql_file($log_fh, $file, "phist", 3);
- $fst_cnt++;
- }
- }
+ $i++;
}
- print STDERR "Imported $fst_cnt Haplotype Fst file(s).\n";
-}
-if ($sql) {
- #
- # Index the radtags database
- #
- print STDERR "Indexing the database...\n";
- $cmd = $exe_path . "index_radtags.pl -D $db -t -c 2>&1";
- print STDERR "$cmd\n";
- print $log_fh "$cmd\n";
- @results = `$cmd` if ($dry_run == 0);
- print $log_fh @results;
+ print $log_fh "\n";
}
-print $log_fh "refmap_map.pl completed at ", strftime("%Y-%m-%d %H:%M:%S",(localtime(time))), "\n";
+sub load_sql_data {
+ my ($log_fh, $pops, $parents, $progeny, $samples) = @_;
-close($log_fh);
+ my ($pop_cnt, $sample, $num_files, $i, $file);
-sub parse_population_map {
- my ($samples, $pop_ids, $pops, $grp_ids, $grps) = @_;
+ print STDERR "\nComputation is complete, loading results to the database '$db'.\n";
+
+ my $pop_cnt = scalar(keys %{$pops});
- my ($fh, @parts, $line, %ids, $file, $path);
+ $i = 1;
+ $num_files = scalar(@{$parents}) + scalar(@{$progeny}) + scalar(@{$samples});
- if (length($popmap_path) == 0) {
- foreach $path (@{$samples}) {
- push(@{$pop_ids}, "1");
- push(@{$grp_ids}, "1");
- $pops->{"1"}++;
- $grps->{"1"}++;
- }
- return;
- }
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
- open($fh, "<$popmap_path") or die("Unable to open population map, '$popmap_path', $!\n");
+ printf(STDERR "Loading pstacks output to $db; file % 3s of % 3s [%s]...", $i, $num_files, $sample->{'file'});
- while ($line = <$fh>) {
- chomp $line;
- @parts = split(/\t/, $line);
+ if ($gzip == true) {
+ $file = "$out_path/$sample->{'file'}" . ".tags.tsv.gz";
+ import_gzsql_file($log_fh, $file, "unique_tags", 1);
- if (scalar(@parts) > 3) {
- die("Unable to parse population map, '$popmap_path' (map should contain no more than three columns).\n");
- }
+ $file = "$out_path/$sample->{'file'}" . ".snps.tsv.gz";
+ import_gzsql_file($log_fh, $file, "snps", 1);
- $ids{$parts[0]} = $parts[1];
+ $file = "$out_path/$sample->{'file'}" . ".alleles.tsv.gz";
+ import_gzsql_file($log_fh, $file, "alleles", 1);
- if (scalar(@parts) > 2) {
- push(@{$grp_ids}, $parts[2]);
- $grps->{$parts[2]}++;
- }
- }
+ } else {
+ $file = "$out_path/$sample->{'file'}" . ".tags.tsv";
+ import_sql_file($log_fh, $file, "unique_tags", 1);
- if (scalar(keys %{$grps}) == 0) {
- $grps->{"1"}++;
+ $file = "$out_path/$sample->{'file'}" . ".snps.tsv";
+ import_sql_file($log_fh, $file, "snps", 1);
+
+ $file = "$out_path/$sample->{'file'}" . ".alleles.tsv";
+ import_sql_file($log_fh, $file, "alleles", 1);
+ }
+ print STDERR "done.\n";
+
+ $i++;
}
- foreach $path (@{$samples}) {
- my ($prefix, $suffix);
- if ($path =~ /^.+\..+\.gz$/) {
- ($prefix, $suffix) = ($path =~ /^(.+)\.(.+)\.gz$/);
- } else {
- ($prefix, $suffix) = ($path =~ /^(.+)\.(.+)$/);
- }
+ print STDERR "Importing catalog to $db...";
- if ($prefix =~ /^.*\/.+$/) {
- ($file) = ($prefix =~ /^.*\/(.+)$/);
- } else {
- $file = $prefix;
- }
+ my $cat_file = "batch_" . $batch_id;
- if (!defined($ids{$file})) {
- die("Unable to find '$file' in the population map, '$popmap_path'.\n");
- }
+ if ($gzip == true) {
+ $file = "$out_path/$cat_file" . ".catalog.tags.tsv.gz";
+ import_gzsql_file($log_fh, $file, "catalog_tags", 1);
- push(@{$pop_ids}, $ids{$file});
- $pops->{$ids{$file}}++;
- }
+ $file = "$out_path/$cat_file" . ".catalog.snps.tsv.gz";
+ import_gzsql_file($log_fh, $file, "catalog_snps", 1);
- print STDERR "Parsed population map: ", scalar(@{$samples}), " files in ", scalar(keys %{$pops});
- scalar(keys %{$pops}) == 1 ? print STDERR " population" : print STDERR " populations";
- print STDERR " and ", scalar(keys %{$grps});
- scalar(keys %{$grps}) == 1 ? print STDERR " group.\n" : print STDERR " groups.\n";
+ $file = "$out_path/$cat_file" . ".catalog.alleles.tsv.gz";
+ import_gzsql_file($log_fh, $file, "catalog_alleles", 1);
- close($fh);
-}
+ } else {
+ $file = "$out_path/$cat_file" . ".catalog.tags.tsv";
+ import_sql_file($log_fh, $file, "catalog_tags", 1);
-sub check_input_files {
- my ($parents, $progeny, $samples) = @_;
+ $file = "$out_path/$cat_file" . ".catalog.snps.tsv";
+ import_sql_file($log_fh, $file, "catalog_snps", 1);
- #
- # Check that no duplicate files were specified.
- #
- my (%files, $file);
- foreach $file (@{$parents}, @{$progeny}, @{$samples}) {
- $files{$file}++;
- }
- foreach $file (keys %files) {
- if ($files{$file} > 1) {
- print STDERR "A duplicate file was specified which may create undefined results, '$file'\n";
- usage();
- }
+ $file = "$out_path/$cat_file" . ".catalog.alleles.tsv";
+ import_sql_file($log_fh, $file, "catalog_alleles", 1);
}
+ print STDERR "done.\n";
#
- # Check that all the files exist and are accessible.
+ # Load the sstacks results to the database if requested.
#
- foreach $file (@{$parents}) {
- if (!-e $file) {
- print STDERR "Unable to locate parental file '$file'\n";
- usage();
- }
- }
- print STDERR "Found ", scalar(@{$parents}), " parental file(s).\n" if (scalar(@{$parents}) > 0);
+ $i = 1;
+ foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
- foreach $file (@{$progeny}) {
- if (!-e $file) {
- print STDERR "Unable to locate progeny file '$file'\n";
- usage();
+ printf(STDERR "Loading sstacks output to $db; file % 3s of % 3s [%s]...", $i, $num_files, $sample->{'file'});
+
+ if ($gzip == true) {
+ $file = "$out_path/" . $sample->{'file'} . ".matches.tsv.gz";
+ import_gzsql_file($log_fh, $file, "matches", 1);
+
+ } else {
+ $file = "$out_path/" . $sample->{'file'} . ".matches.tsv";
+ import_sql_file($log_fh, $file, "matches", 1);
}
+ print STDERR "done.\n";
+
+ $i++;
}
- print STDERR "Found ", scalar(@{$progeny}), " progeny file(s).\n" if (scalar(@{$progeny}) > 0);
- foreach $file (@{$samples}) {
- if (!-e $file) {
- print STDERR "Unable to locate sample file '$file'\n";
- usage();
- }
+ if ($data_type eq "map") {
+ #
+ # Load the outputs generated by genotypes to the database.
+ #
+ $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
+ import_sql_file($log_fh, $file, "markers", 1);
+
+ $file = "$out_path/batch_" . $batch_id . ".genotypes_1.txt";
+ import_sql_file($log_fh, $file, "catalog_genotypes", 1);
+
+ } else {
+ #
+ # Load the outputs generated by populations to the database.
+ #
+ $file = "$out_path/batch_" . $batch_id . ".markers.tsv";
+ import_sql_file($log_fh, $file, "markers", 1);
+
+ $file = "$out_path/batch_" . $batch_id . ".sumstats.tsv";
+ import_sql_file($log_fh, $file, "sumstats", $pop_cnt+1);
+
+ $file = "$out_path/batch_" . $batch_id . ".hapstats.tsv";
+ import_sql_file($log_fh, $file, "hapstats", $pop_cnt+1);
+
+ #
+ # Import the Fst files.
+ #
+ my $fst_cnt = 0;
+ my (@keys, $m, $n);
+ @keys = sort keys %pops;
+ for ($m = 0; $m < scalar(@keys); $m++) {
+ for ($n = 0; $n < scalar(@keys); $n++) {
+ $file = "$out_path/batch_" . $batch_id . ".fst_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
+
+ if (-e $file) {
+ import_sql_file($log_fh, $file, "fst", 1);
+ $fst_cnt++;
+ }
+ }
+ }
+ print STDERR "Imported $fst_cnt Fst file(s).\n";
+
+ #
+ # Import the Phi_st files.
+ #
+ $fst_cnt = 0;
+ for ($m = 0; $m < scalar(@keys); $m++) {
+ for ($n = 0; $n < scalar(@keys); $n++) {
+ $file = "$out_path/batch_" . $batch_id . ".phistats_" . $keys[$m] . "-" . $keys[$n] . ".tsv";
+
+ if (-e $file) {
+ import_sql_file($log_fh, $file, "phist", 3);
+ $fst_cnt++;
+ }
+ }
+ }
+ print STDERR "Imported $fst_cnt Haplotype Fst file(s).\n";
}
- print STDERR "Found ", scalar(@{$samples}), " sample file(s).\n" if (scalar(@{$samples}) > 0);
+
+ print $log_fh "\n";
+
+ #
+ # Index the radtags database
+ #
+ my ($cmd, @results);
+ print STDERR "Indexing the database...\n";
+ $cmd = $exe_path . "index_radtags.pl -D $db -t -c 2>&1";
+ print STDERR "$cmd\n";
+ print $log_fh "$cmd\n";
+ @results = `$cmd` if ($dry_run == false);
+ print $log_fh @results;
}
sub write_results {
@@ -536,6 +669,20 @@ sub write_results {
}
}
+sub write_depths_of_cov {
+ my ($depths, $log_fh) = @_;
+
+ print STDERR "\nDepths of Coverage for Processed Samples:\n";
+ print $log_fh "\nDepths of Coverage for Processed Samples:\n";
+
+ foreach $a (@{$depths}) {
+ print STDERR $a->[0], ": ", $a->[1], "x\n";
+ print $log_fh $a->[0], ": ", $a->[1], "x\n";
+ }
+ print STDERR "\n";
+ print $log_fh "\n";
+}
+
sub import_sql_file {
my ($log_fh, $file, $table, $skip_lines) = @_;
@@ -543,9 +690,9 @@ sub import_sql_file {
$ignore = "IGNORE $skip_lines LINES" if ($skip_lines > 0);
- @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$file' INTO TABLE $table $ignore"` if ($sql == 1 && $dry_run == 0);
+ @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$file' INTO TABLE $table $ignore"` if ($sql == true && $dry_run == false);
- if ($sql == 1) {
+ if ($sql == true) {
print $log_fh "mysql --defaults-file=$cnf $db -e \"LOAD DATA LOCAL INFILE '$file' INTO TABLE $table $ignore\"\n", @results;
}
}
@@ -562,7 +709,7 @@ sub import_gzsql_file {
#
my $tmpdir = File::Spec->tmpdir();
my $named_pipe = mktemp($tmpdir . "/denovo_map_XXXXXX");
- if ($sql == 1 && $dry_run == 0) {
+ if ($sql == true && $dry_run == false) {
mkfifo($named_pipe, 0700) || die("Unable to create named pipe for loading gzipped data: $named_pipe, $!");
print $log_fh "Streaming $file into named pipe $named_pipe.\n";
}
@@ -570,9 +717,9 @@ sub import_gzsql_file {
#
# Dump our gzipped data onto the named pipe.
#
- system("gunzip -c $file > $named_pipe &") if ($sql == 1 && $dry_run == 0);
+ system("gunzip -c $file > $named_pipe &") if ($sql == true && $dry_run == false);
- @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$named_pipe' INTO TABLE $table $ignore"` if ($sql == 1 && $dry_run == 0);
+ @results = `mysql --defaults-file=$cnf $db -e "LOAD DATA LOCAL INFILE '$named_pipe' INTO TABLE $table $ignore"` if ($sql == true && $dry_run == false);
if ($sql == 1) {
print $log_fh "mysql --defaults-file=$cnf $db -e \"LOAD DATA LOCAL INFILE '$named_pipe' INTO TABLE $table $ignore\"\n", @results;
@@ -581,31 +728,45 @@ sub import_gzsql_file {
#
# Remove the pipe.
#
- unlink($named_pipe) if ($sql == 1 && $dry_run == 0);
+ unlink($named_pipe) if ($sql == true && $dry_run == false);
}
sub parse_command_line {
- my $arg;
+ my ($arg);
while (@ARGV) {
$_ = shift @ARGV;
- if ($_ =~ /^-p$/) { push(@parents, shift @ARGV); }
- elsif ($_ =~ /^-r$/) { push(@progeny, shift @ARGV); }
- elsif ($_ =~ /^-s$/) { push(@samples, shift @ARGV); }
- elsif ($_ =~ /^-o$/) { $out_path = shift @ARGV; }
- elsif ($_ =~ /^-D$/) { $desc = shift @ARGV; }
- elsif ($_ =~ /^-e$/) { $exe_path = shift @ARGV; }
- elsif ($_ =~ /^-b$/) { $batch_id = shift @ARGV; }
- elsif ($_ =~ /^-i$/) { $sample_id = shift @ARGV; }
- elsif ($_ =~ /^-a$/) { $date = shift @ARGV; }
- elsif ($_ =~ /^-S$/) { $sql = 0; }
- elsif ($_ =~ /^-B$/) { $db = shift @ARGV; }
- elsif ($_ =~ /^-d$/) { $dry_run++; }
- elsif ($_ =~ /^-O$/) {
+ if ($_ =~ /^-v$/) { version(); exit(); }
+ elsif ($_ =~ /^-h$/) { usage(); }
+ elsif ($_ =~ /^-p$/) { push(@parents, { 'path' => shift @ARGV }); }
+ elsif ($_ =~ /^-r$/) { push(@progeny, { 'path' => shift @ARGV }); }
+ elsif ($_ =~ /^-s$/) { push(@samples, { 'path' => shift @ARGV }); }
+ elsif ($_ =~ /^-d$/) { $dry_run = true; }
+ elsif ($_ =~ /^-o$/) { $out_path = shift @ARGV; }
+ elsif ($_ =~ /^-D$/) { $desc = shift @ARGV; }
+ elsif ($_ =~ /^-e$/) { $exe_path = shift @ARGV; }
+ elsif ($_ =~ /^-b$/) { $batch_id = shift @ARGV; }
+ elsif ($_ =~ /^-i$/) { $sample_id = shift @ARGV; }
+ elsif ($_ =~ /^-a$/) { $date = shift @ARGV; }
+ elsif ($_ =~ /^-S$/) { $sql = false; }
+ elsif ($_ =~ /^-B$/) { $db = shift @ARGV; }
+ elsif ($_ =~ /^-m$/) { $min_cov = shift @ARGV; }
+ elsif ($_ =~ /^-P$/) { $min_rcov = shift @ARGV; }
+ elsif ($_ =~ /^--samples$/) {
+ $sample_path = shift @ARGV;
+
+ } elsif ($_ =~ /^-O$/) {
$popmap_path = shift @ARGV;
push(@_populations, "-M " . $popmap_path);
- } elsif ($_ =~ /^-A$/) {
+ } elsif ($_ =~ /^--create_db$/) {
+ $create_db = true;
+
+ } elsif ($_ =~ /^--overw_db$/) {
+ $overw_db = true;
+ $create_db = true;
+
+ } elsif ($_ =~ /^-A$/) {
$arg = shift @ARGV;
push(@_genotypes, "-t " . $arg);
@@ -615,6 +776,9 @@ sub parse_command_line {
usage();
}
+ } elsif ($_ =~ /^-t$/) {
+ push(@_pstacks, "-d ");
+
} elsif ($_ =~ /^-T$/) {
$arg = shift @ARGV;
push(@_pstacks, "-p " . $arg);
@@ -622,12 +786,6 @@ sub parse_command_line {
push(@_sstacks, "-p " . $arg);
push(@_populations, "-t " . $arg);
- } elsif ($_ =~ /^-m$/) {
- push(@_pstacks, "-m " . shift @ARGV);
-
- } elsif ($_ =~ /^-n$/) {
- push(@_cstacks, "-n " . shift @ARGV);
-
} elsif ($_ =~ /^--bound_low$/) {
push(@_pstacks, "--bound_low " . shift @ARGV);
push(@_pstacks, "--model_type bounded");
@@ -635,6 +793,7 @@ sub parse_command_line {
} elsif ($_ =~ /^--bound_high$/) {
push(@_pstacks, "--bound_high " . shift @ARGV);
push(@_pstacks, "--model_type bounded");
+
} elsif ($_ =~ /^--alpha$/) {
push(@_pstacks, "--alpha " . shift @ARGV);
@@ -666,8 +825,6 @@ sub parse_command_line {
usage();
}
}
- elsif ($_ =~ /^-v$/) { version(); exit(); }
- elsif ($_ =~ /^-h$/) { usage(); }
else {
print STDERR "Unknown command line option: '$_'\n";
usage();
@@ -682,7 +839,7 @@ sub parse_command_line {
usage();
}
- if ($sql > 0 && length($date) == 0) {
+ if ($sql == true && length($date) == 0) {
$date = strftime("%Y-%m-%d", (localtime(time)));
}
@@ -691,12 +848,16 @@ sub parse_command_line {
usage();
}
- if (scalar(@parents) == 0 && scalar(@samples) == 0) {
+ if (scalar(@parents) == 0 && scalar(@samples) == 0 && length($popmap_path) == 0) {
print STDERR "You must specify at least one parent or sample file.\n";
usage();
}
- if (scalar(@samples) > 0) {
+ if (length($sample_path) > 0) {
+ $sample_path .= "/" if (substr($sample_path, -1) ne "/");
+ }
+
+ if (scalar(@samples) > 0 || length($popmap_path) > 0) {
$data_type = "population";
} else {
$data_type = "map";
@@ -711,29 +872,36 @@ sub usage {
version();
print STDERR <<EOQ;
-ref_map.pl -p path -r path [-s path] -o path [-n mismatches] [-m min_cov] [-T num_threads] [-A type] [-O popmap] [-B db -b batch_id -D "desc" -a yyyy-mm-dd] [-S -i id] [-e path] [-d] [-h]
- p: path to a Bowtie/SAM/BAM file containing parent sequences from a mapping cross.
- r: path to a Bowtie/SAM/BAM file containing progeny sequences from a mapping cross.
- s: path to a Bowtie/SAM/BAM file containing an individual sample from a population.
+ref_map.pl -p path -r path [-s path] -o path [-t] [-m min_cov] [-M mismatches] [-n mismatches] [-T num_threads] [-A type] [-O popmap] [-B db -b batch_id -D "desc"] [-S -i num] [-e path] [-d] [-h]
+ b: batch ID representing this dataset (an integer, e.g. 1, 2, 3).
o: path to write pipeline output files.
+ O: if analyzing one or more populations, specify a pOpulation map.
A: if processing a genetic map, specify the cross type, 'CP', 'F2', 'BC1', 'DH', or 'GEN'.
- n: specify the number of mismatches allowed between loci when building the catalog (default 1).
T: specify the number of threads to execute.
- m: specify the minimum depth of coverage to report a stack in pstacks (default 1).
- O: if analyzing one or more populations, specify a pOpulation map
e: executable path, location of pipeline programs.
d: perform a dry run. Do not actually execute any programs, just print what would be executed.
h: display this help message.
- Database options:
+ Specify each sample separately:
+ p: path to a Bowtie/SAM/BAM file containing one set of parent sequences from a mapping cross.
+ r: path to a Bowtie/SAM/BAM file containing one set of progeny sequences from a mapping cross.
+ s: path to a Bowtie/SAM/BAM file containing an individual sample from a population.
+ Specify a path to samples and provide a population map:
+ --samples <path>: specify a path to the directory of samples (samples will be read from population map).
+
+ Stack assembly options:
+ m: specify a minimum number of identical, raw reads required to create a stack (default 3).
+ P: specify a minimum number of identical, raw reads required to create a stack in 'progeny' individuals.
+
+ Database options:
B: specify a database to load data into.
- b: batch ID representing this dataset in the database.
- D: batch description
- a: batch run date, yyyy-mm-dd
+ D: a description of this batch to be stored in the database.
S: disable recording SQL data in the database.
i: starting sample_id, this is determined automatically if database interaction is enabled.
+ --create_db: create the database specified by '-B' and populate the tables.
+ --overw_db: delete the database before creating a new copy of it (turns on --create_db).
- SNP Model Options (these options are passed on to pstacks):
+ SNP Model Options (these options are passed on to pstacks):
--bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).
--bound_high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).
--alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.
@@ -743,5 +911,5 @@ ref_map.pl -p path -r path [-s path] -o path [-n mismatches] [-m min_cov] [-T nu
EOQ
-exit(0);
+ exit(0);
}
diff --git a/sql/stacks.sql b/sql/stacks.sql
index e3e30d3..487cb46 100644
--- a/sql/stacks.sql
+++ b/sql/stacks.sql
@@ -258,6 +258,7 @@ create table matches (
allele varchar(256),
depth int unsigned not null,
lnl float,
+ cigar varchar(256),
INDEX batch_id_index (batch_id),
INDEX catalog_id_index (catalog_id),
INDEX sample_id_index (sample_id),
diff --git a/src/DNANSeq.cc b/src/DNANSeq.cc
index b4e3905..6b9a2da 100644
--- a/src/DNANSeq.cc
+++ b/src/DNANSeq.cc
@@ -151,7 +151,7 @@ char DNANSeq::operator[](int pos) {
return base;
}
-int DNANSeq::size() {
+int DNANSeq::size() const {
return this->bits / bits_per_nuc;
}
diff --git a/src/DNANSeq.h b/src/DNANSeq.h
index 96b1355..b7f09e0 100644
--- a/src/DNANSeq.h
+++ b/src/DNANSeq.h
@@ -40,7 +40,7 @@ const unsigned short int byte_size = 8;
//
// DNA Sequence Storage Class
//
-// Two-bit compression, four bases per byte of storage:
+// Three-bit compression, 2.667 bases per byte of storage:
// A == 000
// C == 001
// G == 010
@@ -64,7 +64,7 @@ public:
~DNANSeq();
char operator[](int);
- int size();
+ int size() const;
char *seq(char *);
char *seq();
char *subseq(char *, int, int);
diff --git a/src/GappedAln.h b/src/GappedAln.h
new file mode 100644
index 0000000..fc05ab0
--- /dev/null
+++ b/src/GappedAln.h
@@ -0,0 +1,594 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2016, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef __GAPPEDALN_H__
+#define __GAPPEDALN_H__
+
+enum dynprog {dynp_down, dynp_right, dynp_diag};
+
+class AlignRes {
+public:
+ string cigar;
+ uint gap_cnt;
+ uint contiguity;
+ double pct_id;
+ AlignRes() {
+ this->gap_cnt = 0;
+ this->contiguity = 0;
+ this->pct_id = 0.0;
+ }
+ AlignRes(string cigar, uint gapcnt, uint contiguity, double pct_id) {
+ this->cigar = cigar;
+ this->gap_cnt = gapcnt;
+ this->contiguity = contiguity;
+ this->pct_id = pct_id;
+ }
+};
+
+class AlignPath {
+public:
+ bool diag;
+ bool left;
+ bool up;
+
+ AlignPath() {
+ diag = false;
+ left = false;
+ up = false;
+ }
+ int count() {
+ int cnt;
+ cnt = this->up ? 1 : 0;
+ cnt += this->left ? 1 : 0;
+ cnt += this->diag ? 1 : 0;
+
+ return cnt;
+ }
+};
+
+//
+// Needleman-Wunsch Alignment
+//
+const double gapopen_score = -10;
+const double gapext_score = -0.5;
+const double mismatch_score = -4;
+const double match_score = 5;
+
+class GappedAln {
+ uint _m;
+ uint _n;
+ uint _m_size;
+ uint _n_size;
+ double **matrix;
+ AlignPath **path;
+ AlignRes _aln;
+
+ inline int swap(double *, dynprog *, int, int);
+ int trace_alignment(string, string);
+
+ public:
+ GappedAln();
+ GappedAln(int i) : GappedAln(i, i) {};
+ GappedAln(int, int);
+ ~GappedAln();
+
+ int init(int, int);
+ int align(string, string);
+ AlignRes result();
+
+ int parse_cigar(vector<pair<char, uint> > &);
+ int dump_alignment(string, string);
+};
+
+GappedAln::GappedAln()
+{
+ this->_m = 0;
+ this->_n = 0;
+ this->_m_size = this->_m;
+ this->_n_size = this->_n;
+ this->matrix = NULL;
+ this->path = NULL;
+}
+
+GappedAln::GappedAln(int len_1, int len_2)
+{
+ this->_m = len_1 + 1;
+ this->_n = len_2 + 1;
+ this->_m_size = this->_m;
+ this->_n_size = this->_n;
+
+ this->matrix = new double * [this->_m];
+ for (uint i = 0; i < this->_m; i++)
+ this->matrix[i] = new double [this->_n];
+
+ this->path = new AlignPath * [this->_m];
+ for (uint i = 0; i < this->_m; i++)
+ this->path[i] = new AlignPath [this->_n];
+}
+
+GappedAln::~GappedAln()
+{
+ for (int i = 0; i < this->_m; i++) {
+ delete [] this->matrix[i];
+ delete [] this->path[i];
+ }
+ delete [] this->matrix;
+ delete [] this->path;
+}
+
+int
+GappedAln::init(int size_1, int size_2)
+{
+ //
+ // Resize the underlying matrix and path arrays, if necessary.
+ //
+ if ((size_1 + 1) > this->_m_size || (size_2 + 1) > this->_n_size) {
+ for (int i = 0; i < this->_m_size; i++) {
+ delete [] this->matrix[i];
+ delete [] this->path[i];
+ }
+ if (this->_m_size > 0) {
+ delete [] this->matrix;
+ delete [] this->path;
+ }
+
+ this->_m_size = size_1 + 1;
+ this->_n_size = size_2 + 1;
+ //
+ // Resize the arrays to be 25% larger than the requested size.
+ //
+ int new_size = this->_m_size > this->_n_size ? this->_m_size : this->_n_size;
+ new_size += int((double) new_size * 0.25);
+ this->_m_size = new_size;
+ this->_n_size = new_size;
+
+ this->matrix = new double * [this->_m_size];
+ for (uint i = 0; i < this->_m_size; i++)
+ this->matrix[i] = new double [this->_n_size];
+
+ this->path = new AlignPath * [this->_m_size];
+ for (uint i = 0; i < this->_m_size; i++)
+ this->path[i] = new AlignPath [this->_n_size];
+ }
+
+ //
+ // Otherwise, set the dimensions of the matrix and path arrays to be the sequence lengths.
+ //
+ this->_m = size_1 + 1;
+ this->_n = size_2 + 1;
+
+ return 0;
+}
+
+int
+GappedAln::align(string tag_1, string tag_2)
+{
+ // j---->
+ // [0][1][2][3]...[n-1]
+ // +--------------------
+ // i [0] | [i][j]
+ // | [1] |
+ // | [2] |
+ // v [3] |
+ // ... |
+ // [m-1] |
+ //
+
+ //
+ // Initialize the first column and row of the dynamic programming
+ // matrix and the path array.
+ //
+ path[0][0].diag = false;
+ path[0][0].up = false;
+ path[0][0].left = false;
+ matrix[0][0] = 0.0;
+ for (uint i = 1; i < this->_m; i++) {
+ this->matrix[i][0] = this->path[i - 1][0].up ? this->matrix[i - 1][0] + gapext_score : this->matrix[i - 1][0] + gapopen_score;
+ this->path[i][0].diag = false;
+ this->path[i][0].up = true;
+ this->path[i][0].left = false;
+ }
+ for (uint j = 1; j < this->_n; j++) {
+ this->matrix[0][j] = this->path[0][j - 1].left ? this->matrix[0][j - 1] + gapext_score : this->matrix[0][j - 1] + gapopen_score;
+ this->path[0][j].diag = false;
+ this->path[0][j].up = false;
+ this->path[0][j].left = true;
+ }
+
+ double score_down, score_diag, score_right;
+ double scores[3];
+ dynprog direction[3];
+
+ for (uint i = 1; i < this->_m; i++) {
+ for (uint j = 1; j < this->_n; j++) {
+ // Calculate the score:
+ // 1) If we were to move down from the above cell.
+ score_down = this->matrix[i - 1][j];
+ score_down += this->path[i - 1][j].up ? gapext_score : gapopen_score;
+ // 2) If we were to move diagonally from the above and left cell.
+ score_diag = this->matrix[i - 1][j - 1] + (tag_1[i - 1] == tag_2[j - 1] ? match_score : mismatch_score);
+ // 3) If we were to move over from the cell left of us.
+ score_right = this->matrix[i][j - 1];
+ score_right += this->path[i][j - 1].left ? gapext_score : gapopen_score;
+
+ //
+ // Sort the scores, highest to lowest.
+ //
+ scores[0] = score_down;
+ direction[0] = dynp_down;
+ scores[1] = score_diag;
+ direction[1] = dynp_diag;
+ scores[2] = score_right;
+ direction[2] = dynp_right;
+
+ if (scores[0] < scores[1])
+ this->swap(scores, direction, 0, 1);
+ if (scores[1] < scores[2])
+ this->swap(scores, direction, 1, 2);
+ if (scores[0] < scores[1])
+ this->swap(scores, direction, 0, 1);
+
+ this->matrix[i][j] = scores[0];
+
+ if (scores[0] > scores[1]) {
+ //
+ // One path is best.
+ //
+ switch (direction[0]) {
+ case dynp_diag:
+ this->path[i][j].diag = true;
+ this->path[i][j].up = false;
+ this->path[i][j].left = false;
+ break;
+ case dynp_down:
+ this->path[i][j].diag = false;
+ this->path[i][j].up = true;
+ this->path[i][j].left = false;
+ break;
+ case dynp_right:
+ default:
+ this->path[i][j].diag = false;
+ this->path[i][j].up = false;
+ this->path[i][j].left = true;
+ }
+
+ } else if (scores[0] == scores[1]) {
+ //
+ // Two of the paths are equivalent.
+ //
+ switch (direction[0]) {
+ case dynp_diag:
+ this->path[i][j].diag = true;
+
+ switch (direction[1]) {
+ case dynp_down:
+ this->path[i][j].up = true;
+ this->path[i][j].left = false;
+ break;
+ default:
+ case dynp_right:
+ this->path[i][j].up = false;
+ this->path[i][j].left = true;
+ break;
+ }
+ break;
+ case dynp_down:
+ this->path[i][j].up = true;
+
+ switch (direction[1]) {
+ case dynp_right:
+ this->path[i][j].diag = false;
+ this->path[i][j].left = true;
+ break;
+ default:
+ case dynp_diag:
+ this->path[i][j].diag = true;
+ this->path[i][j].left = false;
+ break;
+ }
+ break;
+ default:
+ case dynp_right:
+ this->path[i][j].left = true;
+
+ switch (direction[1]) {
+ case dynp_diag:
+ this->path[i][j].diag = true;
+ this->path[i][j].up = false;
+ break;
+ default:
+ case dynp_down:
+ this->path[i][j].diag = false;
+ this->path[i][j].up = true;
+ break;
+ }
+ break;
+ }
+
+ } else {
+ //
+ // All paths equivalent.
+ //
+ this->path[i][j].diag = true;
+ this->path[i][j].up = true;
+ this->path[i][j].left = true;
+ }
+ }
+ }
+
+ // dump_alignment(tag_1, tag_2, matrix, path);
+
+ if (this->trace_alignment(tag_1, tag_2))
+ return 1;
+
+ return 0;
+}
+
+inline int
+GappedAln::swap(double *scores, dynprog *direction, int index_1, int index_2)
+{
+ double swap = scores[index_1];
+ scores[index_1] = scores[index_2];
+ scores[index_2] = swap;
+ dynprog swapdir = direction[index_1];
+ direction[index_1] = direction[index_2];
+ direction[index_2] = swapdir;
+
+ return 0;
+}
+
+bool
+compare_alignres(AlignRes a, AlignRes b)
+{
+ if (a.gap_cnt == b.gap_cnt) {
+
+ if (a.pct_id == b.pct_id)
+ return (a.contiguity > b.contiguity);
+ else
+ return (a.pct_id > b.pct_id);
+
+ } else {
+ return (a.gap_cnt < b.gap_cnt);
+ }
+}
+
+int
+GappedAln::trace_alignment(string tag_1, string tag_2)
+{
+ // j---->
+ // [0][1][2][3]...[n-1]
+ // +--------------------
+ // i [0] | [i][j]
+ // | [1] |
+ // | [2] |
+ // v [3] |
+ // ... |
+ // [m-1] |
+ //
+ int i, j, cnt, len, gaps, contiguity;
+ double ident;
+ string cigar;
+ char buf[id_len];
+
+ vector<AlignRes> alns;
+ bool more_paths = true;
+ bool seq_break = false;
+
+ do {
+ more_paths = false;
+
+ i = this->_m - 1;
+ j = this->_n - 1;
+
+ string aln_1, aln_2;
+
+ while (i > 0 || j > 0) {
+ cnt = this->path[i][j].count();
+
+ if (cnt > 1) more_paths = true;
+
+ if (this->path[i][j].diag) {
+ aln_1 += tag_1[i - 1];
+ aln_2 += tag_2[j - 1];
+ if (cnt > 1) this->path[i][j].diag = false;
+ i--;
+ j--;
+ } else if (this->path[i][j].up) {
+ aln_1 += tag_1[i - 1];
+ aln_2 += "-";
+ if (cnt > 1) this->path[i][j].up = false;
+ i--;
+ } else if (this->path[i][j].left) {
+ aln_1 += "-";
+ aln_2 += tag_2[j - 1];
+ if (cnt > 1) this->path[i][j].left = false;
+ j--;
+ }
+ }
+
+ reverse(aln_1.begin(), aln_1.end());
+ reverse(aln_2.begin(), aln_2.end());
+
+ //
+ // Convert to CIGAR strings.
+ //
+ cigar = "";
+ len = aln_1.length();
+ gaps = 0;
+ contiguity = 0;
+ seq_break = false;
+ ident = 0.0;
+ i = 0;
+ while (i < len) {
+ if (aln_1[i] != '-' && aln_2[i] != '-') {
+ cnt = 0;
+ do {
+ if (aln_1[i] == aln_2[i]) ident++;
+ cnt++;
+ i++;
+ if (seq_break == false) contiguity++;
+ } while (i < len && aln_1[i] != '-' && aln_2[i] != '-');
+ sprintf(buf, "%dM", cnt);
+
+ } else if (aln_1[i] == '-') {
+ cnt = 0;
+ do {
+ cnt++;
+ i++;
+ } while (i < len && aln_1[i] == '-');
+ sprintf(buf, "%dD", cnt);
+ gaps++;
+ seq_break = true;
+
+ } else {
+ cnt = 0;
+ do {
+ cnt++;
+ i++;
+ } while (i < len && aln_2[i] == '-');
+ sprintf(buf, "%dI", cnt);
+ gaps++;
+ seq_break = true;
+ }
+
+ cigar += buf;
+ }
+
+ alns.push_back(AlignRes(cigar, gaps, contiguity, (ident / (double) len)));
+
+ // cerr << aln_1 << " [" << cigar << ", contiguity: " << contiguity << ", gaps: " << gaps << "]\n"
+ // << aln_2 << "\n";
+
+ } while (more_paths);
+
+
+ sort(alns.begin(), alns.end(), compare_alignres);
+ this->_aln = alns[0];
+ // cerr << "Final alignment: " << this->_aln.cigar << "; contiguity: " << contiguity << "; gaps: " << this->_aln.gap_cnt << "\n";
+
+ return 1;
+}
+
+AlignRes
+GappedAln::result()
+{
+ return this->_aln;
+}
+
+int
+GappedAln::parse_cigar(vector<pair<char, uint> > &cigar)
+{
+ char buf[id_len];
+ int dist;
+ const char *p, *q;
+
+ p = this->_aln.cigar.c_str();
+
+ cigar.clear();
+
+ while (*p != '\0') {
+ q = p + 1;
+
+ while (*q != '\0' && isdigit(*q))
+ q++;
+ strncpy(buf, p, q - p);
+ buf[q-p] = '\0';
+ dist = atoi(buf);
+
+ cigar.push_back(make_pair(*q, dist));
+
+ p = q + 1;
+ }
+
+ return 0;
+}
+
+int
+GappedAln::dump_alignment(string tag_1, string tag_2)
+{
+ // j---->
+ // [0][1][2][3]...[n-1]
+ // +--------------------
+ // i [0] | [i][j]
+ // | [1] |
+ // | [2] |
+ // v [3] |
+ // ... |
+ // [m-1] |
+ //
+
+ //
+ // Output the score matrix.
+ //
+ cout << " ";
+ for (uint j = 0; j < this->_n; j++)
+ cout << " " << tag_2[j] << " |";
+ cout << "\n";
+
+ cout << " ";
+ for (uint j = 0; j < this->_n; j++)
+ printf("% 6.1f|", this->matrix[0][j]);
+ cout << "\n";
+
+ for (uint i = 1; i < this->_m; i++) {
+ cout << tag_1[i - 1] << " ";
+ for (uint j = 0; j < this->_n; j++)
+ printf("% 6.1f|", this->matrix[i][j]);
+ cout << "\n";
+ }
+
+ cout << "\n";
+
+ //
+ // Output the path matrix.
+ //
+ cout << " ";
+ for (uint j = 0; j < this->_n; j++)
+ cout << " " << tag_2[j] << " |";
+ cout << "\n";
+
+ cout << " ";
+ for (uint j = 0; j < this->_n; j++) {
+ cout << " ";
+ this->path[0][j].diag ? cout << "d" : cout << " ";
+ this->path[0][j].up ? cout << "u" : cout << " ";
+ this->path[0][j].left ? cout << "l" : cout << " ";
+ cout << "|";
+ }
+ cout << "\n";
+
+ for (uint i = 1; i < this->_m; i++) {
+ cout << tag_1[i - 1] << " ";
+ for (uint j = 0; j < this->_n; j++) {
+ cout << " ";
+ this->path[i][j].diag ? cout << "d" : cout << " ";
+ this->path[i][j].up ? cout << "u" : cout << " ";
+ this->path[i][j].left ? cout << "l" : cout << " ";
+ cout << "|";
+ }
+ cout << "\n";
+ }
+
+ cout << "\n";
+
+ return 0;
+}
+
+#endif // __GAPPEDALN_H__
diff --git a/src/PopMap.h b/src/PopMap.h
index bd5d574..495d42a 100644
--- a/src/PopMap.h
+++ b/src/PopMap.h
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -21,8 +21,6 @@
#ifndef __POPMAP_H__
#define __POPMAP_H__
-#include "stacks.h"
-#include "locus.h"
#include <string.h>
#include <string>
using std::string;
@@ -37,6 +35,10 @@ using std::set;
using std::pair;
using std::make_pair;
+#include "stacks.h"
+#include "locus.h"
+#include "aln_utils.h"
+
class Datum {
public:
int id; // Stack ID
@@ -48,18 +50,48 @@ public:
char *model; // String representing SNP model output for each nucleotide at this locus.
char *gtype; // Genotype
char *trans_gtype; // Translated Genotype
+ char *cigar; // CIGAR string describing how the datum aligns to the catalog locus.
double lnl; // Log likelihood of this locus.
vector<char *> obshap; // Observed Haplotypes
vector<SNP *> snps;
- Datum() { corrected = false; gtype = NULL; trans_gtype = NULL; model = NULL; tot_depth = 0; len = 0; lnl = 0.0; merge_partner = 0; }
+ Datum() {
+ this->corrected = false;
+ this->gtype = NULL;
+ this->trans_gtype = NULL;
+ this->model = NULL;
+ this->cigar = NULL;
+ this->tot_depth = 0;
+ this->len = 0;
+ this->lnl = 0.0;
+ this->merge_partner = 0;
+ }
~Datum() {
- for (uint i = 0; i < this->obshap.size(); i++)
- delete [] this->obshap[i];
- for (uint i = 0; i < this->snps.size(); i++)
- delete this->snps[i];
- delete [] this->gtype;
- delete [] this->trans_gtype;
- delete [] this->model;
+ for (uint i = 0; i < this->obshap.size(); i++)
+ delete [] this->obshap[i];
+ for (uint i = 0; i < this->snps.size(); i++)
+ delete this->snps[i];
+ delete [] this->gtype;
+ delete [] this->trans_gtype;
+ delete [] this->model;
+ delete [] this->cigar;
+ }
+ int add_model(const char *model)
+ {
+ if (this->cigar == NULL) {
+ this->len = strlen(model);
+ this->model = new char[this->len + 1];
+ strcpy(this->model, model);
+
+ } else {
+ vector<pair<char, uint> > c;
+ this->len = parse_cigar(this->cigar, c);
+ this->model = new char[this->len + 1];
+ apply_cigar_to_model_seq(this->model, this->len, model, c);
+ // cerr << "Cigar: " << this->cigar << "\n"
+ // << "Old model: " << model << "\n"
+ // << "Gapped model: " << this->model << "\n";
+ }
+ return 0;
}
};
@@ -102,10 +134,10 @@ PopMap<LocusT>::PopMap(int num_samples, int num_loci) {
this->data = new Datum **[num_loci];
for (int i = 0; i < num_loci; i++) {
- this->data[i] = new Datum *[num_samples];
+ this->data[i] = new Datum *[num_samples];
- for (int j = 0; j < num_samples; j++)
- this->data[i][j] = NULL;
+ for (int j = 0; j < num_samples; j++)
+ this->data[i][j] = NULL;
}
this->num_samples = num_samples;
@@ -115,23 +147,23 @@ PopMap<LocusT>::PopMap(int num_samples, int num_loci) {
template<class LocusT>
PopMap<LocusT>::~PopMap() {
for (int i = 0; i < this->num_loci; i++) {
- for (int j = 0; j < this->num_samples; j++)
- delete this->data[i][j];
- delete [] this->data[i];
+ for (int j = 0; j < this->num_samples; j++)
+ delete this->data[i][j];
+ delete [] this->data[i];
}
delete [] this->data;
}
template<class LocusT>
int PopMap<LocusT>::populate(vector<int> &sample_ids,
- map<int, LocusT*> &catalog,
- vector<vector<CatMatch *> > &matches) {
+ map<int, LocusT*> &catalog,
+ vector<vector<CatMatch *> > &matches) {
//
// Record the array position of each sample that we will load.
//
for (uint i = 0; i < sample_ids.size(); i++) {
- this->sample_order[sample_ids[i]] = i;
- this->rev_sample_order[i] = sample_ids[i];
+ this->sample_order[sample_ids[i]] = i;
+ this->rev_sample_order[i] = sample_ids[i];
}
//
@@ -141,9 +173,9 @@ int PopMap<LocusT>::populate(vector<int> &sample_ids,
typename std::map<int, LocusT*>::iterator it;
uint i = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- this->locus_order[it->first] = i;
- this->rev_locus_order[i] = it->first;
- i++;
+ this->locus_order[it->first] = i;
+ this->rev_locus_order[i] = it->first;
+ i++;
}
//
@@ -158,58 +190,70 @@ int PopMap<LocusT>::populate(vector<int> &sample_ids,
int locus, sample;
for (i = 0; i < matches.size(); i++) {
- for (uint j = 0; j < matches[i].size(); j++) {
- sample = this->sample_order[matches[i][j]->sample_id];
-
- if (this->locus_order.count(matches[i][j]->cat_id) == 0)
- continue;
-
- locus = this->locus_order[matches[i][j]->cat_id];
-
- // cerr << "Translating sample id: " << matches[i][j]->sample_id << " to index " << sample << "\n";
- // cerr << "Translating locus id: " << matches[i][j]->cat_id << " to index " << locus << "\n";
-
- if (this->data[locus][sample] == NULL) {
-
- if (this->blacklist.count(make_pair(matches[i][j]->sample_id, matches[i][j]->cat_id)) == 0) {
- // cerr << "Creating new datum for tag ID: " << matches[i][j]->tag_id << "\n";
- d = new Datum;
- d->id = matches[i][j]->tag_id;
- char *h = new char[strlen(matches[i][j]->haplotype) + 1];
- strcpy(h, matches[i][j]->haplotype);
- d->obshap.push_back(h);
- d->depth.push_back(matches[i][j]->depth);
- d->tot_depth += matches[i][j]->depth;
- d->lnl = matches[i][j]->lnl;
- this->data[locus][sample] = d;
-
- catalog[matches[i][j]->cat_id]->hcnt++;
- catalog[matches[i][j]->cat_id]->cnt++;
- }
- } else {
- // cerr << " Adding haplotype to existing datum: sample: " << matches[i][j]->sample_id << ". tag: " << matches[i][j]->tag_id << "\n";
- //
- // Check that the IDs of the two matches are the same. If not, then two tags
- // match this locus and the locus is invalid, set back to NULL.
- //
- if (matches[i][j]->tag_id == this->data[locus][sample]->id) {
- char *h = new char[strlen(matches[i][j]->haplotype) + 1];
- strcpy(h, matches[i][j]->haplotype);
- this->data[locus][sample]->obshap.push_back(h);
- this->data[locus][sample]->depth.push_back(matches[i][j]->depth);
- this->data[locus][sample]->tot_depth += matches[i][j]->depth;
- this->data[locus][sample]->lnl = matches[i][j]->lnl;
-
- } else {
- //cerr << " Deleting sample, multiple tag matches\n";
- delete this->data[locus][sample];
- this->data[locus][sample] = NULL;
- this->blacklist.insert(make_pair(matches[i][j]->sample_id, matches[i][j]->cat_id));
- catalog[matches[i][j]->cat_id]->hcnt--;
- catalog[matches[i][j]->cat_id]->confounded_cnt++;
- }
- }
- }
+ for (uint j = 0; j < matches[i].size(); j++) {
+ sample = this->sample_order[matches[i][j]->sample_id];
+
+ if (this->locus_order.count(matches[i][j]->cat_id) == 0)
+ continue;
+
+ locus = this->locus_order[matches[i][j]->cat_id];
+
+ // cerr << "Translating sample id: " << matches[i][j]->sample_id << " to index " << sample << "\n";
+ // cerr << "Translating locus id: " << matches[i][j]->cat_id << " to index " << locus << "\n";
+
+ if (this->data[locus][sample] == NULL) {
+
+ if (this->blacklist.count(make_pair(matches[i][j]->sample_id, matches[i][j]->cat_id)) == 0) {
+ // cerr << "Creating new datum for tag ID: " << matches[i][j]->tag_id << "\n";
+ d = new Datum;
+ d->id = matches[i][j]->tag_id;
+ char *h = new char[strlen(matches[i][j]->haplotype) + 1];
+ strcpy(h, matches[i][j]->haplotype);
+ d->obshap.push_back(h);
+ d->depth.push_back(matches[i][j]->depth);
+ d->tot_depth += matches[i][j]->depth;
+ d->lnl = matches[i][j]->lnl;
+
+ if (matches[i][j]->cigar != NULL) {
+ d->cigar = new char[strlen(matches[i][j]->cigar) + 1];
+ strcpy(d->cigar, matches[i][j]->cigar);
+ }
+
+ this->data[locus][sample] = d;
+
+ catalog[matches[i][j]->cat_id]->hcnt++;
+ catalog[matches[i][j]->cat_id]->cnt++;
+ }
+ } else {
+ // cerr << " Adding haplotype to existing datum: sample: " << matches[i][j]->sample_id << ". tag: " << matches[i][j]->tag_id << "\n";
+ //
+ // Check that the IDs of the two matches are the same. If not, then two tags
+ // match this locus and the locus is invalid, set back to NULL.
+ //
+ if (matches[i][j]->tag_id == this->data[locus][sample]->id) {
+ char *h = new char[strlen(matches[i][j]->haplotype) + 1];
+ strcpy(h, matches[i][j]->haplotype);
+
+ if (matches[i][j]->cigar != NULL && strcmp(this->data[locus][sample]->cigar, matches[i][j]->cigar) != 0)
+ cerr << "Warning: disparate CIGAR strings, catalog locus " << matches[i][j]->cat_id
+ << "; sample ID: " << matches[i][j]->sample_id << "; sample locus ID: " << matches[i][j]->tag_id
+ << "; datum cigar: " << this->data[locus][sample]->cigar << "; matches cigar: " << matches[i][j]->cigar << "\n";
+
+ this->data[locus][sample]->obshap.push_back(h);
+ this->data[locus][sample]->depth.push_back(matches[i][j]->depth);
+ this->data[locus][sample]->tot_depth += matches[i][j]->depth;
+ this->data[locus][sample]->lnl = matches[i][j]->lnl;
+
+ } else {
+ //cerr << " Deleting sample, multiple tag matches\n";
+ delete this->data[locus][sample];
+ this->data[locus][sample] = NULL;
+ this->blacklist.insert(make_pair(matches[i][j]->sample_id, matches[i][j]->cat_id));
+ catalog[matches[i][j]->cat_id]->hcnt--;
+ catalog[matches[i][j]->cat_id]->confounded_cnt++;
+ }
+ }
+ }
}
return 0;
@@ -223,8 +267,8 @@ int PopMap<LocusT>::order_loci(map<int, LocusT*> &catalog)
typename std::map<int, LocusT*>::iterator it;
for (it = catalog.begin(); it != catalog.end(); it++) {
- if (strlen(it->second->loc.chr) > 0)
- this->ordered_loci[it->second->loc.chr].push_back(it->second);
+ if (strlen(it->second->loc.chr) > 0)
+ this->ordered_loci[it->second->loc.chr].push_back(it->second);
}
//
@@ -232,7 +276,7 @@ int PopMap<LocusT>::order_loci(map<int, LocusT*> &catalog)
//
typename map<string, vector<LocusT*> >::iterator cit;
for (cit = this->ordered_loci.begin(); cit != this->ordered_loci.end(); cit++)
- sort(cit->second.begin(), cit->second.end(), bp_compare);
+ sort(cit->second.begin(), cit->second.end(), bp_compare);
return 0;
}
@@ -248,25 +292,25 @@ int PopMap<LocusT>::prune(set<int> &remove_ids) {
int j = 0;
for (int i = 0; i < this->num_loci; i++) {
- loc_id = this->rev_locus_order[i];
-
- //
- // Keep this locus.
- //
- if (remove_ids.count(loc_id) == 0) {
- d[j] = this->data[i];
- new_loc_order[loc_id] = j;
- new_rev_loc_order[j] = loc_id;
- j++;
-
- } else {
- //
- // Remove this locus.
- //
- for (int k = 0; k < this->num_samples; k++)
- delete this->data[i][k];
- delete [] this->data[i];
- }
+ loc_id = this->rev_locus_order[i];
+
+ //
+ // Keep this locus.
+ //
+ if (remove_ids.count(loc_id) == 0) {
+ d[j] = this->data[i];
+ new_loc_order[loc_id] = j;
+ new_rev_loc_order[j] = loc_id;
+ j++;
+
+ } else {
+ //
+ // Remove this locus.
+ //
+ for (int k = 0; k < this->num_samples; k++)
+ delete this->data[i][k];
+ delete [] this->data[i];
+ }
}
delete [] this->data;
@@ -285,17 +329,17 @@ int PopMap<LocusT>::prune(set<int> &remove_ids) {
typename map<string, vector<LocusT*> >::iterator cit;
for (cit = this->ordered_loci.begin(); cit != this->ordered_loci.end(); cit++) {
- for (uint k = 0; k < cit->second.size(); k++) {
- if (remove_ids.count(cit->second[k]->id) == 0)
- new_ordered_loci[cit->first].push_back(cit->second[k]);
- }
+ for (uint k = 0; k < cit->second.size(); k++) {
+ if (remove_ids.count(cit->second[k]->id) == 0)
+ new_ordered_loci[cit->first].push_back(cit->second[k]);
+ }
}
this->ordered_loci.clear();
this->ordered_loci = new_ordered_loci;
for (cit = this->ordered_loci.begin(); cit != this->ordered_loci.end(); cit++)
- sort(cit->second.begin(), cit->second.end(), bp_compare);
+ sort(cit->second.begin(), cit->second.end(), bp_compare);
return new_size;
}
@@ -313,9 +357,9 @@ Datum *PopMap<LocusT>::datum(int locus, int sample) {
template<class LocusT>
bool PopMap<LocusT>::blacklisted(int locus, int sample) {
if (this->blacklist.count(make_pair(sample, locus)) > 0)
- return true;
+ return true;
else
- return false;
+ return false;
}
#endif // __POPMAP_H__
diff --git a/src/PopSum.h b/src/PopSum.h
index 04c0929..9b53c3e 100644
--- a/src/PopSum.h
+++ b/src/PopSum.h
@@ -56,17 +56,17 @@ public:
double bs[PopStatSize];
PopStat() {
- this->loc_id = 0;
- this->bp = 0;
- this->fixed = false;
- this->alleles = 0.0;
- this->snp_cnt = 0;
-
- for (uint i = 0; i < PopStatSize; i++) {
- this->stat[i] = 0.0;
- this->smoothed[i] = 0.0;
- this->bs[i] = 0.0;
- }
+ this->loc_id = 0;
+ this->bp = 0;
+ this->fixed = false;
+ this->alleles = 0.0;
+ this->snp_cnt = 0;
+
+ for (uint i = 0; i < PopStatSize; i++) {
+ this->stat[i] = 0.0;
+ this->smoothed[i] = 0.0;
+ this->bs[i] = 0.0;
+ }
}
virtual ~PopStat() {
}
@@ -83,11 +83,11 @@ public:
uint popcnt;
HapStat(): PopStat() {
- comp = NULL;
+ comp = NULL;
}
~HapStat() {
- if (this->comp != NULL)
- delete [] comp;
+ if (this->comp != NULL)
+ delete [] comp;
}
};
@@ -99,7 +99,7 @@ public:
string hap_str; // Human-readable string of haplotype counts.
LocStat(): PopStat() {
- this->hap_cnt = 0;
+ this->hap_cnt = 0;
}
~LocStat() {};
};
@@ -121,21 +121,21 @@ public:
double *comp;
PopPair() {
- col = 0;
- pi = 0.0;
- fst = 0.0;
- fet_p = 0.0;
- fet_or = 0.0;
- or_se = 0.0;
- lod = 0.0;
- ci_low = 0.0;
- ci_high = 0.0;
- amova_fst = 0.0;
- comp = NULL;
+ col = 0;
+ pi = 0.0;
+ fst = 0.0;
+ fet_p = 0.0;
+ fet_or = 0.0;
+ or_se = 0.0;
+ lod = 0.0;
+ ci_low = 0.0;
+ ci_high = 0.0;
+ amova_fst = 0.0;
+ comp = NULL;
}
~PopPair() {
- if (this->comp != NULL)
- delete [] comp;
+ if (this->comp != NULL)
+ delete [] comp;
}
};
@@ -156,17 +156,17 @@ public:
double π
SumStat(): PopStat(), pi(this->stat[0]) {
- num_indv = 0.0;
- p = 0.0;
- p_nuc = 0;
- q_nuc = 0;
- obs_het = 0.0;
- obs_hom = 0.0;
- exp_het = 0.0;
- exp_hom = 0.0;
- snp_cnt = 0;
- incompatible_site = false;
- filtered_site = false;
+ num_indv = 0.0;
+ p = 0.0;
+ p_nuc = 0;
+ q_nuc = 0;
+ obs_het = 0.0;
+ obs_hom = 0.0;
+ exp_het = 0.0;
+ exp_hom = 0.0;
+ snp_cnt = 0;
+ incompatible_site = false;
+ filtered_site = false;
}
};
@@ -175,10 +175,10 @@ public:
SumStat *nucs; // Array containing summary statistics for
// each nucleotide position at this locus.
LocSum(int len) {
- this->nucs = new SumStat[len];
+ this->nucs = new SumStat[len];
}
~LocSum() {
- delete [] this->nucs;
+ delete [] this->nucs;
}
};
@@ -198,18 +198,18 @@ public:
int priv_allele;
NucTally() {
- loc_id = 0;
- bp = 0;
- col = 0;
- num_indv = 0;
- pop_cnt = 0;
- allele_cnt = 0;
- p_allele = 0;
- q_allele = 0;
- p_freq = 0.0;
- obs_het = 0.0;
- priv_allele = -1;
- fixed = true;
+ loc_id = 0;
+ bp = 0;
+ col = 0;
+ num_indv = 0;
+ pop_cnt = 0;
+ allele_cnt = 0;
+ p_allele = 0;
+ q_allele = 0;
+ p_freq = 0.0;
+ obs_het = 0.0;
+ priv_allele = -1;
+ fixed = true;
}
};
@@ -218,10 +218,10 @@ public:
NucTally *nucs;
LocTally(int len) {
- this->nucs = new NucTally[len];
+ this->nucs = new NucTally[len];
}
~LocTally() {
- delete [] this->nucs;
+ delete [] this->nucs;
}
};
@@ -297,10 +297,10 @@ PopSum<LocusT>::PopSum(int num_loci, int num_populations) {
this->data = new LocSum **[num_loci];
for (int i = 0; i < num_loci; i++) {
- this->data[i] = new LocSum *[num_populations];
+ this->data[i] = new LocSum *[num_populations];
- for (int j = 0; j < num_populations; j++)
- this->data[i][j] = NULL;
+ for (int j = 0; j < num_populations; j++)
+ this->data[i][j] = NULL;
}
this->num_pops = num_populations;
@@ -310,10 +310,10 @@ PopSum<LocusT>::PopSum(int num_loci, int num_populations) {
template<class LocusT>
PopSum<LocusT>::~PopSum() {
for (int i = 0; i < this->num_loci; i++) {
- for (int j = 0; j < this->num_pops; j++)
- delete this->data[i][j];
- delete [] this->data[i];
- delete this->loc_tally[i];
+ for (int j = 0; j < this->num_pops; j++)
+ delete this->data[i][j];
+ delete [] this->data[i];
+ delete this->loc_tally[i];
}
delete [] this->data;
delete [] this->loc_tally;
@@ -324,9 +324,9 @@ int PopSum<LocusT>::initialize(PopMap<LocusT> *pmap) {
int locus_id;
for (int i = 0; i < this->num_loci; i++) {
- locus_id = pmap->rev_locus_index(i);
- this->locus_order[locus_id] = i;
- this->rev_locus_order[i] = locus_id;
+ locus_id = pmap->rev_locus_index(i);
+ this->locus_order[locus_id] = i;
+ this->rev_locus_order[i] = locus_id;
}
return 0;
@@ -334,10 +334,10 @@ int PopSum<LocusT>::initialize(PopMap<LocusT> *pmap) {
template<class LocusT>
int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
- PopMap<LocusT> *pmap,
- uint population_id,
- uint start_index, uint end_index,
- bool verbose, ofstream &log_fh) {
+ PopMap<LocusT> *pmap,
+ uint population_id,
+ uint start_index, uint end_index,
+ bool verbose, ofstream &log_fh) {
LocusT *loc;
Datum **d;
LocSum **s;
@@ -348,9 +348,9 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
int incompatible_loci = 0;
if (verbose)
- log_fh << "\n#\n# Recording sites that have incompatible loci -- loci with too many alleles present.\n"
- << "#\n"
- << "# Level\tAction\tLocus ID\tChr\tBP\tColumn\tPopID\n#\n";
+ log_fh << "\n#\n# Recording sites that have incompatible loci -- loci with too many alleles present.\n"
+ << "#\n"
+ << "# Level\tAction\tLocus ID\tChr\tBP\tColumn\tPopID\n#\n";
//
// Determine the index for this population
@@ -365,66 +365,66 @@ int PopSum<LocusT>::add_population(map<int, LocusT *> &catalog,
this->pop_sizes[population_id] = end_index - start_index + 1;
for (int i = 0; i < this->num_loci; i++) {
- locus_id = pmap->rev_locus_index(i);
- d = pmap->locus(locus_id);
- s = this->locus(locus_id);
- loc = catalog[locus_id];
- //
- // Create an array of SumStat objects
- //
- len = strlen(loc->con);
- s[pop_index] = new LocSum(len);
-
- //
- // Check if this locus has already been filtered and is NULL in all individuals.
- //
- bool filtered = true;
- for (uint k = start_index; k <= end_index; k++) {
- if (d[k] != NULL) filtered = false;
- }
- if (filtered == true) {
- for (uint k = 0; k < len; k++) {
- s[pop_index]->nucs[k].filtered_site = true;
- }
- continue;
- }
-
- //
- // The catalog records which nucleotides are heterozygous. For these nucleotides we will
- // calculate observed genotype frequencies, allele frequencies, and expected genotype frequencies.
- //
- for (uint k = 0; k < loc->snps.size(); k++) {
- res = this->tally_heterozygous_pos(loc, d, s[pop_index],
- loc->snps[k]->col, k, start_index, end_index);
- //
- // If site is incompatible (too many alleles present), log it.
- //
- if (res < 0) {
- s[pop_index]->nucs[loc->snps[k]->col].incompatible_site = true;
-
- incompatible_loci++;
- if (verbose)
- log_fh << "within_population\t"
- << "incompatible_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp(loc->snps[k]->col) << "\t"
- << loc->snps[k]->col << "\t"
- << pop_key[population_id] << "\n";
- }
-
- snp_cols.insert(loc->snps[k]->col);
- }
- //
- // For all other fixed sites, we just need to record them.
- //
- for (uint k = 0; k < len; k++) {
- if (snp_cols.count(k)) continue;
- this->tally_fixed_pos(loc, d, s[pop_index],
- k, start_index, end_index);
- }
-
- snp_cols.clear();
+ locus_id = pmap->rev_locus_index(i);
+ d = pmap->locus(locus_id);
+ s = this->locus(locus_id);
+ loc = catalog[locus_id];
+ //
+ // Create an array of SumStat objects
+ //
+ len = strlen(loc->con);
+ s[pop_index] = new LocSum(len);
+
+ //
+ // Check if this locus has already been filtered and is NULL in all individuals.
+ //
+ bool filtered = true;
+ for (uint k = start_index; k <= end_index; k++) {
+ if (d[k] != NULL) filtered = false;
+ }
+ if (filtered == true) {
+ for (uint k = 0; k < len; k++) {
+ s[pop_index]->nucs[k].filtered_site = true;
+ }
+ continue;
+ }
+
+ //
+ // The catalog records which nucleotides are heterozygous. For these nucleotides we will
+ // calculate observed genotype frequencies, allele frequencies, and expected genotype frequencies.
+ //
+ for (uint k = 0; k < loc->snps.size(); k++) {
+ res = this->tally_heterozygous_pos(loc, d, s[pop_index],
+ loc->snps[k]->col, k, start_index, end_index);
+ //
+ // If site is incompatible (too many alleles present), log it.
+ //
+ if (res < 0) {
+ s[pop_index]->nucs[loc->snps[k]->col].incompatible_site = true;
+
+ incompatible_loci++;
+ if (verbose)
+ log_fh << "within_population\t"
+ << "incompatible_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp(loc->snps[k]->col) << "\t"
+ << loc->snps[k]->col << "\t"
+ << pop_key[population_id] << "\n";
+ }
+
+ snp_cols.insert(loc->snps[k]->col);
+ }
+ //
+ // For all other fixed sites, we just need to record them.
+ //
+ for (uint k = 0; k < len; k++) {
+ if (snp_cols.count(k)) continue;
+ this->tally_fixed_pos(loc, d, s[pop_index],
+ k, start_index, end_index);
+ }
+
+ snp_cols.clear();
}
cerr << "Population '" << pop_key[population_id] << "' contained " << incompatible_loci << " incompatible loci -- more than two alleles present.\n";
@@ -443,93 +443,93 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
uint16_t p_cnt, q_cnt, len, col;
for (int n = 0; n < this->num_loci; n++) {
- locus_id = this->rev_locus_index(n);
- loc = catalog[locus_id];
- s = this->locus(locus_id);
- len = strlen(loc->con);
-
- ltally = new LocTally(len);
- this->loc_tally[n] = ltally;
-
- // for (uint i = 0; i < loc->snps.size(); i++) {
- // uint col = loc->snps[i]->col;
- for (col = 0; col < len; col++) {
-
- ltally->nucs[col].col = col;
- ltally->nucs[col].bp = loc->sort_bp(col);
- ltally->nucs[col].loc_id = locus_id;
-
- this->tally_ref_alleles(s, col,
- ltally->nucs[col].allele_cnt,
- ltally->nucs[col].p_allele,
- ltally->nucs[col].q_allele,
- p_cnt, q_cnt);
-
- //
- // Is this site variable?
- //
- if (ltally->nucs[col].allele_cnt > 1)
- ltally->nucs[col].fixed = false;
-
- for (int j = 0; j < this->num_pops; j++) {
- //
- // Sum the number of individuals examined at this locus across populations.
- //
- ltally->nucs[col].num_indv += s[j]->nucs[col].num_indv;
- ltally->nucs[col].pop_cnt += s[j]->nucs[col].num_indv > 0 ? 1 : 0;
- }
-
- for (int j = 0; j < this->num_pops; j++) {
- //
- // Sum the most frequent allele across populations.
- //
- if (s[j]->nucs[col].p_nuc == ltally->nucs[col].p_allele)
- ltally->nucs[col].p_freq +=
- s[j]->nucs[col].p * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
- else
- ltally->nucs[col].p_freq +=
- (1 - s[j]->nucs[col].p) * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
- //
- // Sum observed heterozygosity across populations.
- //
- ltally->nucs[col].obs_het +=
- s[j]->nucs[col].obs_het * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
- }
-
- //
- // We want to report the most frequent allele as the P allele. Reorder the alleles
- // if necessary.
- //
- if (ltally->nucs[col].p_freq < 0.5) {
- char a = ltally->nucs[col].p_allele;
- ltally->nucs[col].p_allele = ltally->nucs[col].q_allele;
- ltally->nucs[col].q_allele = a;
- ltally->nucs[col].p_freq = 1 - ltally->nucs[col].p_freq;
- uint b = p_cnt;
- p_cnt = q_cnt;
- q_cnt = b;
- }
-
- //
- // Check if this is a private allele. Either the site is variable and
- // the allele exists in one population, or the site is fixed and one
- // population is homozygous for the private allele.
- //
- variable_pop = -1;
-
- if (p_cnt == 1 && q_cnt > 1) {
- for (int j = 0; j < this->num_pops; j++)
- if (s[j]->nucs[col].p_nuc == ltally->nucs[col].p_allele ||
- s[j]->nucs[col].q_nuc == ltally->nucs[col].p_allele)
- variable_pop = j;
- } else if (p_cnt > 1 && q_cnt == 1) {
- for (int j = 0; j < this->num_pops; j++)
- if (s[j]->nucs[col].p_nuc == ltally->nucs[col].q_allele ||
- s[j]->nucs[col].q_nuc == ltally->nucs[col].q_allele)
- variable_pop = j;
- }
- ltally->nucs[col].priv_allele = variable_pop;
- }
+ locus_id = this->rev_locus_index(n);
+ loc = catalog[locus_id];
+ s = this->locus(locus_id);
+ len = strlen(loc->con);
+
+ ltally = new LocTally(len);
+ this->loc_tally[n] = ltally;
+
+ // for (uint i = 0; i < loc->snps.size(); i++) {
+ // uint col = loc->snps[i]->col;
+ for (col = 0; col < len; col++) {
+
+ ltally->nucs[col].col = col;
+ ltally->nucs[col].bp = loc->sort_bp(col);
+ ltally->nucs[col].loc_id = locus_id;
+
+ this->tally_ref_alleles(s, col,
+ ltally->nucs[col].allele_cnt,
+ ltally->nucs[col].p_allele,
+ ltally->nucs[col].q_allele,
+ p_cnt, q_cnt);
+
+ //
+ // Is this site variable?
+ //
+ if (ltally->nucs[col].allele_cnt > 1)
+ ltally->nucs[col].fixed = false;
+
+ for (int j = 0; j < this->num_pops; j++) {
+ //
+ // Sum the number of individuals examined at this locus across populations.
+ //
+ ltally->nucs[col].num_indv += s[j]->nucs[col].num_indv;
+ ltally->nucs[col].pop_cnt += s[j]->nucs[col].num_indv > 0 ? 1 : 0;
+ }
+
+ for (int j = 0; j < this->num_pops; j++) {
+ //
+ // Sum the most frequent allele across populations.
+ //
+ if (s[j]->nucs[col].p_nuc == ltally->nucs[col].p_allele)
+ ltally->nucs[col].p_freq +=
+ s[j]->nucs[col].p * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
+ else
+ ltally->nucs[col].p_freq +=
+ (1 - s[j]->nucs[col].p) * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
+ //
+ // Sum observed heterozygosity across populations.
+ //
+ ltally->nucs[col].obs_het +=
+ s[j]->nucs[col].obs_het * (s[j]->nucs[col].num_indv / (double) ltally->nucs[col].num_indv);
+ }
+
+ //
+ // We want to report the most frequent allele as the P allele. Reorder the alleles
+ // if necessary.
+ //
+ if (ltally->nucs[col].p_freq < 0.5) {
+ char a = ltally->nucs[col].p_allele;
+ ltally->nucs[col].p_allele = ltally->nucs[col].q_allele;
+ ltally->nucs[col].q_allele = a;
+ ltally->nucs[col].p_freq = 1 - ltally->nucs[col].p_freq;
+ uint b = p_cnt;
+ p_cnt = q_cnt;
+ q_cnt = b;
+ }
+
+ //
+ // Check if this is a private allele. Either the site is variable and
+ // the allele exists in one population, or the site is fixed and one
+ // population is homozygous for the private allele.
+ //
+ variable_pop = -1;
+
+ if (p_cnt == 1 && q_cnt > 1) {
+ for (int j = 0; j < this->num_pops; j++)
+ if (s[j]->nucs[col].p_nuc == ltally->nucs[col].p_allele ||
+ s[j]->nucs[col].q_nuc == ltally->nucs[col].p_allele)
+ variable_pop = j;
+ } else if (p_cnt > 1 && q_cnt == 1) {
+ for (int j = 0; j < this->num_pops; j++)
+ if (s[j]->nucs[col].p_nuc == ltally->nucs[col].q_allele ||
+ s[j]->nucs[col].q_nuc == ltally->nucs[col].q_allele)
+ variable_pop = j;
+ }
+ ltally->nucs[col].priv_allele = variable_pop;
+ }
}
return 0;
@@ -537,9 +537,9 @@ int PopSum<LocusT>::tally(map<int, LocusT *> &catalog)
template<class LocusT>
int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
- short unsigned int &allele_cnt,
- char &p_allele, char &q_allele,
- short unsigned int &p_cnt, short unsigned int &q_cnt)
+ short unsigned int &allele_cnt,
+ char &p_allele, char &q_allele,
+ short unsigned int &p_cnt, short unsigned int &q_cnt)
{
int nucs[4] = {0};
char nuc[2];
@@ -549,30 +549,30 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
allele_cnt = 0;
for (int j = 0; j < this->num_pops; j++) {
- nuc[0] = 0;
- nuc[1] = 0;
+ nuc[0] = 0;
+ nuc[1] = 0;
nuc[0] = s[j]->nucs[snp_index].p_nuc;
nuc[1] = s[j]->nucs[snp_index].q_nuc;
- for (uint k = 0; k < 2; k++)
- switch(nuc[k]) {
- case 'A':
- case 'a':
- nucs[0]++;
- break;
- case 'C':
- case 'c':
- nucs[1]++;
- break;
- case 'G':
- case 'g':
- nucs[2]++;
- break;
- case 'T':
- case 't':
- nucs[3]++;
- break;
- }
+ for (uint k = 0; k < 2; k++)
+ switch(nuc[k]) {
+ case 'A':
+ case 'a':
+ nucs[0]++;
+ break;
+ case 'C':
+ case 'c':
+ nucs[1]++;
+ break;
+ case 'G':
+ case 'g':
+ nucs[2]++;
+ break;
+ case 'T':
+ case 't':
+ nucs[3]++;
+ break;
+ }
}
//
@@ -582,12 +582,12 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
//
int i;
for (i = 0; i < 4; i++)
- if (nucs[i] > 0) allele_cnt++;
+ if (nucs[i] > 0) allele_cnt++;
if (allele_cnt > 2) {
- p_allele = 0;
- q_allele = 0;
- return 0;
+ p_allele = 0;
+ q_allele = 0;
+ return 0;
}
//
@@ -595,39 +595,39 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
//
i = 0;
while (p_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 0:
- p_allele = 'A';
- break;
- case 1:
- p_allele = 'C';
- break;
- case 2:
- p_allele = 'G';
- break;
- case 3:
- p_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 0:
+ p_allele = 'A';
+ break;
+ case 1:
+ p_allele = 'C';
+ break;
+ case 2:
+ p_allele = 'G';
+ break;
+ case 3:
+ p_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
while (q_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 1:
- q_allele = 'C';
- break;
- case 2:
- q_allele = 'G';
- break;
- case 3:
- q_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 1:
+ q_allele = 'C';
+ break;
+ case 2:
+ q_allele = 'G';
+ break;
+ case 3:
+ q_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
//
@@ -637,16 +637,16 @@ int PopSum<LocusT>::tally_ref_alleles(LocSum **s, int snp_index,
q_cnt = 0;
for (int j = 0; j < this->num_pops; j++) {
- nuc[0] = 0;
- nuc[1] = 0;
+ nuc[0] = 0;
+ nuc[1] = 0;
nuc[0] = s[j]->nucs[snp_index].p_nuc;
nuc[1] = s[j]->nucs[snp_index].q_nuc;
- for (uint k = 0; k < 2; k++)
- if (nuc[k] != 0 && nuc[k] == p_allele)
- p_cnt++;
- else if (nuc[k] != 0 && nuc[k] == q_allele)
- q_cnt++;
+ for (uint k = 0; k < 2; k++)
+ if (nuc[k] != 0 && nuc[k] == p_allele)
+ p_cnt++;
+ else if (nuc[k] != 0 && nuc[k] == q_allele)
+ q_cnt++;
}
return 1;
@@ -663,7 +663,7 @@ PopPair *PopSum<LocusT>::Fst(int locus, int pop_1, int pop_2, int pos)
// If this locus only appears in one population do not calculate Fst.
//
if (s_1->nucs[pos].num_indv == 0 || s_2->nucs[pos].num_indv == 0)
- return pair;
+ return pair;
//
// Calculate Fst at a locus, sub-population relative to that found in the entire population
@@ -677,7 +677,7 @@ PopPair *PopSum<LocusT>::Fst(int locus, int pop_1, int pop_2, int pos)
pi_2 = s_2->nucs[pos].pi;
if (pi_1 == 0 && pi_2 == 0 && s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc)
- return pair;
+ return pair;
//
// Calculate Pi over the entire pooled population.
@@ -693,34 +693,34 @@ PopPair *PopSum<LocusT>::Fst(int locus, int pop_1, int pop_2, int pos)
nucs[3] = s_2->nucs[pos].q_nuc;
for (int i = 0; i < 4; i++)
- switch(nucs[i]) {
- case 'A':
- ncnt[0]++;
- break;
- case 'C':
- ncnt[1]++;
- break;
- case 'G':
- ncnt[2]++;
- break;
- case 'T':
- ncnt[3]++;
- break;
- }
+ switch(nucs[i]) {
+ case 'A':
+ ncnt[0]++;
+ break;
+ case 'C':
+ ncnt[1]++;
+ break;
+ case 'G':
+ ncnt[2]++;
+ break;
+ case 'T':
+ ncnt[3]++;
+ break;
+ }
int allele_cnt = 0;
for (int i = 0; i < 4; i++)
- if (ncnt[i] > 0) allele_cnt++;
+ if (ncnt[i] > 0) allele_cnt++;
if (allele_cnt > 2)
- return NULL;
+ return NULL;
double tot_alleles = n_1 + n_2;
double p_1 = round(n_1 * s_1->nucs[pos].p);
double q_1 = n_1 - p_1;
double p_2 =
- s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
- s_2->nucs[pos].p : (1 - s_2->nucs[pos].p);
+ s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
+ s_2->nucs[pos].p : (1 - s_2->nucs[pos].p);
p_2 = round(n_2 * p_2);
double q_2 = n_2 - p_2;
@@ -741,15 +741,15 @@ PopPair *PopSum<LocusT>::Fst(int locus, int pop_1, int pop_2, int pos)
this->fishers_exact_test(pair, p_1, q_1, p_2, q_2);
// cerr << "Locus: " << locus << ", pos: " << pos << "\n"
- // << " p_1.nuc: " << s_1->nucs[pos].p_nuc << "; q_1.nuc: " << s_1->nucs[pos].q_nuc
- // << "; p_2.nuc: " << s_2->nucs[pos].p_nuc << "; q_2.nuc: " << s_2->nucs[pos].q_nuc << "\n"
- // << " Total alleles: " << tot_alleles << "; " << " s_1.p: " << s_1->nucs[pos].p
- // << "; s_2.p: " << s_2->nucs[pos].p << "\n"
- // << " p_1: " << p_1 << "; q_1: " << q_1 << " p_2: " << p_2 << "; q_2: " << q_2 << "\n"
- // << " Pi1: " << pi_1 << "; Pi2: " << pi_2 << "; PiAll: " << pi_all << "\n"
- // << " N1: " << n_1 << "; N1 choose 2: " << bcoeff_1 << "\n"
- // << " N2: " << n_2 << "; N2 choose 2: " << bcoeff_2 << "\n"
- // << " Fst: " << Fst << "\n";
+ // << " p_1.nuc: " << s_1->nucs[pos].p_nuc << "; q_1.nuc: " << s_1->nucs[pos].q_nuc
+ // << "; p_2.nuc: " << s_2->nucs[pos].p_nuc << "; q_2.nuc: " << s_2->nucs[pos].q_nuc << "\n"
+ // << " Total alleles: " << tot_alleles << "; " << " s_1.p: " << s_1->nucs[pos].p
+ // << "; s_2.p: " << s_2->nucs[pos].p << "\n"
+ // << " p_1: " << p_1 << "; q_1: " << q_1 << " p_2: " << p_2 << "; q_2: " << q_2 << "\n"
+ // << " Pi1: " << pi_1 << "; Pi2: " << pi_2 << "; PiAll: " << pi_all << "\n"
+ // << " N1: " << n_1 << "; N1 choose 2: " << bcoeff_1 << "\n"
+ // << " N2: " << n_2 << "; N2 choose 2: " << bcoeff_2 << "\n"
+ // << " Fst: " << Fst << "\n";
//
// Calculate Fst (corrected for different samples sizes) using an AMOVA method,
@@ -759,45 +759,45 @@ PopPair *PopSum<LocusT>::Fst(int locus, int pop_1, int pop_2, int pos)
double p_1_freq = s_1->nucs[pos].p;
double q_1_freq = 1 - p_1_freq;
double p_2_freq =
- s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
- s_2->nucs[pos].p : (1 - s_2->nucs[pos].p);
+ s_1->nucs[pos].p_nuc == s_2->nucs[pos].p_nuc ?
+ s_2->nucs[pos].p : (1 - s_2->nucs[pos].p);
double q_2_freq = 1 - p_2_freq;
double p_avg_cor =
- ( (s_1->nucs[pos].num_indv * p_1_freq) + (s_2->nucs[pos].num_indv * p_2_freq) ) /
- ( s_1->nucs[pos].num_indv + s_2->nucs[pos].num_indv );
+ ( (s_1->nucs[pos].num_indv * p_1_freq) + (s_2->nucs[pos].num_indv * p_2_freq) ) /
+ ( s_1->nucs[pos].num_indv + s_2->nucs[pos].num_indv );
double n_avg_cor = 2 * ((s_1->nucs[pos].num_indv / 2) + (s_2->nucs[pos].num_indv / 2));
pair->amova_fst =
- (
- (s_1->nucs[pos].num_indv * pow((p_1_freq - p_avg_cor), 2) +
- s_2->nucs[pos].num_indv * pow((p_2_freq - p_avg_cor), 2))
- /
- n_avg_cor
- )
- /
- (p_avg_cor * (1 - p_avg_cor));
+ (
+ (s_1->nucs[pos].num_indv * pow((p_1_freq - p_avg_cor), 2) +
+ s_2->nucs[pos].num_indv * pow((p_2_freq - p_avg_cor), 2))
+ /
+ n_avg_cor
+ )
+ /
+ (p_avg_cor * (1 - p_avg_cor));
if (log_fst_comp) {
- pair->comp = new double[18];
- pair->comp[0] = n_1;
- pair->comp[1] = n_2;
- pair->comp[2] = tot_alleles;
- pair->comp[3] = p_1;
- pair->comp[4] = q_1;
- pair->comp[5] = p_2;
- pair->comp[6] = q_2;
- pair->comp[7] = pi_1;
- pair->comp[8] = pi_2;
- pair->comp[9] = pi_all;
- pair->comp[10] = bcoeff_1;
- pair->comp[11] = bcoeff_2;
- pair->comp[12] = p_1_freq;
- pair->comp[13] = q_1_freq;
- pair->comp[14] = p_2_freq;
- pair->comp[15] = q_2_freq;
- pair->comp[16] = p_avg_cor;
- pair->comp[17] = n_avg_cor;
+ pair->comp = new double[18];
+ pair->comp[0] = n_1;
+ pair->comp[1] = n_2;
+ pair->comp[2] = tot_alleles;
+ pair->comp[3] = p_1;
+ pair->comp[4] = q_1;
+ pair->comp[5] = p_2;
+ pair->comp[6] = q_2;
+ pair->comp[7] = pi_1;
+ pair->comp[8] = pi_2;
+ pair->comp[9] = pi_all;
+ pair->comp[10] = bcoeff_1;
+ pair->comp[11] = bcoeff_2;
+ pair->comp[12] = p_1_freq;
+ pair->comp[13] = q_1_freq;
+ pair->comp[14] = p_2_freq;
+ pair->comp[15] = q_2_freq;
+ pair->comp[16] = p_avg_cor;
+ pair->comp[17] = n_avg_cor;
}
// //
@@ -822,17 +822,18 @@ int PopSum<LocusT>::tally_fixed_pos(LocusT *locus, Datum **d, LocSum *s, int pos
char p_nuc = 0;
for (uint i = start; i <= end; i++) {
- if (d[i] == NULL || pos >= d[i]->len) continue;
- //
- // Before counting this individual, make sure the model definitively called this
- // position as hEterozygous or hOmozygous.
- //
- if (d[i]->model[pos] == 'E') {
- cerr << "Warning: heterozygous model call at fixed nucleotide position: "
- << "locus " << locus->id << " individual " << d[i]->id << "; position: " << pos << "\n";
- }
- num_indv++;
- p_nuc = locus->con[pos];
+ if (d[i] == NULL || pos >= d[i]->len) continue;
+ //
+ // Before counting this individual, make sure the model definitively called this
+ // position as hEterozygous or hOmozygous.
+ //
+ if (d[i]->model[pos] == 'E') {
+ cerr << "Model: " << d[i]->model << "\n";
+ cerr << "Warning: heterozygous model call at fixed nucleotide position: "
+ << "locus " << locus->id << " individual " << d[i]->id << "; position: " << pos << "\n";
+ }
+ num_indv++;
+ p_nuc = locus->con[pos];
}
//
// Record the results in the PopSum object.
@@ -844,14 +845,14 @@ int PopSum<LocusT>::tally_fixed_pos(LocusT *locus, Datum **d, LocSum *s, int pos
s->nucs[pos].alleles = 2 * num_indv;
if (num_indv > 0) {
- s->nucs[pos].p = 1.0;
- s->nucs[pos].p_nuc = p_nuc;
- s->nucs[pos].obs_hom = 1.0;
- s->nucs[pos].obs_het = 0.0;
- s->nucs[pos].exp_hom = 1.0;
- s->nucs[pos].exp_het = 0.0;
- s->nucs[pos].stat[0] = 0.0; // pi
- s->nucs[pos].stat[1] = -7.0; // fis
+ s->nucs[pos].p = 1.0;
+ s->nucs[pos].p_nuc = p_nuc;
+ s->nucs[pos].obs_hom = 1.0;
+ s->nucs[pos].obs_het = 0.0;
+ s->nucs[pos].exp_hom = 1.0;
+ s->nucs[pos].exp_het = 0.0;
+ s->nucs[pos].stat[0] = 0.0; // pi
+ s->nucs[pos].stat[1] = -7.0; // fis
}
return 0;
@@ -859,7 +860,7 @@ int PopSum<LocusT>::tally_fixed_pos(LocusT *locus, Datum **d, LocSum *s, int pos
template<class LocusT>
int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
- int pos, int snp_index, uint start, uint end)
+ int pos, int snp_index, uint start, uint end)
{
//
// Tally up the genotype frequencies.
@@ -874,33 +875,33 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
// Iterate over each individual in this sub-population.
//
for (i = start; i <= end; i++) {
- if (d[i] == NULL || pos >= d[i]->len || d[i]->model[pos] == 'U') continue;
-
- //
- // Pull each allele for this SNP from the observed haplotype.
- //
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- nuc = d[i]->obshap[j][snp_index];
-
- switch(nuc) {
- case 'A':
- case 'a':
- nucs[0]++;
- break;
- case 'C':
- case 'c':
- nucs[1]++;
- break;
- case 'G':
- case 'g':
- nucs[2]++;
- break;
- case 'T':
- case 't':
- nucs[3]++;
- break;
- }
- }
+ if (d[i] == NULL || pos >= d[i]->len || d[i]->model[pos] == 'U') continue;
+
+ //
+ // Pull each allele for this SNP from the observed haplotype.
+ //
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ nuc = d[i]->obshap[j][snp_index];
+
+ switch(nuc) {
+ case 'A':
+ case 'a':
+ nucs[0]++;
+ break;
+ case 'C':
+ case 'c':
+ nucs[1]++;
+ break;
+ case 'G':
+ case 'g':
+ nucs[2]++;
+ break;
+ case 'T':
+ case 't':
+ nucs[3]++;
+ break;
+ }
+ }
}
//
@@ -910,10 +911,10 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
//
int allele_cnt = 0;
for (i = 0; i < 4; i++)
- if (nucs[i] > 0) allele_cnt++;
+ if (nucs[i] > 0) allele_cnt++;
if (allele_cnt > 2)
- return -1;
+ return -1;
//
// Record which nucleotide is the P allele and which is the Q allele.
@@ -923,39 +924,39 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
i = 0;
while (p_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 0:
- p_allele = 'A';
- break;
- case 1:
- p_allele = 'C';
- break;
- case 2:
- p_allele = 'G';
- break;
- case 3:
- p_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 0:
+ p_allele = 'A';
+ break;
+ case 1:
+ p_allele = 'C';
+ break;
+ case 2:
+ p_allele = 'G';
+ break;
+ case 3:
+ p_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
while (q_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 1:
- q_allele = 'C';
- break;
- case 2:
- q_allele = 'G';
- break;
- case 3:
- q_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 1:
+ q_allele = 'C';
+ break;
+ case 2:
+ q_allele = 'G';
+ break;
+ case 3:
+ q_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
//cerr << " P Allele: " << p_allele << "; Q Allele: " << q_allele << "\n";
@@ -968,23 +969,23 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
double obs_q = 0.0;
for (i = start; i <= end; i++) {
- if (d[i] == NULL || pos >= d[i]->len) continue;
- //
- // Before counting this individual, make sure the model definitively called this
- // position as hEterozygous or hOmozygous.
- //
- if (d[i]->model[pos] == 'E' || d[i]->model[pos] == 'O')
- num_indv++;
- else
- continue;
-
- if (d[i]->obshap.size() > 1 &&
- this->tally_observed_haplotypes(d[i]->obshap, snp_index) == 2)
- obs_het++;
- else if (d[i]->obshap[0][snp_index] == p_allele)
- obs_p++;
- else if (d[i]->obshap[0][snp_index] == q_allele)
- obs_q++;
+ if (d[i] == NULL || pos >= d[i]->len) continue;
+ //
+ // Before counting this individual, make sure the model definitively called this
+ // position as hEterozygous or hOmozygous.
+ //
+ if (d[i]->model[pos] == 'E' || d[i]->model[pos] == 'O')
+ num_indv++;
+ else
+ continue;
+
+ if (d[i]->obshap.size() > 1 &&
+ this->tally_observed_haplotypes(d[i]->obshap, snp_index) == 2)
+ obs_het++;
+ else if (d[i]->obshap[0][snp_index] == p_allele)
+ obs_p++;
+ else if (d[i]->obshap[0][snp_index] == q_allele)
+ obs_q++;
}
//cerr << " Num Individuals: " << num_indv << "; Obs Hets: " << obs_het << "; Obs P: " << obs_p << "; Obs Q: " << obs_q << "\n";
@@ -1003,7 +1004,7 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
s->nucs[pos].stat[0] = this->pi(tot_alleles, allele_p, allele_q);
if (s->nucs[pos].stat[0] == 0.0)
- s->nucs[pos].fixed = true;
+ s->nucs[pos].fixed = true;
//
// Convert to allele frequencies
@@ -1017,21 +1018,21 @@ int PopSum<LocusT>::tally_heterozygous_pos(LocusT *locus, Datum **d, LocSum *s,
// // If the minor allele frequency is below the cutoff, set it to zero.
// //
// if (minor_allele_freq > 0) {
- // if (allele_p < allele_q) {
- // if (allele_p < minor_allele_freq) {
- // s->nucs[pos].pi = 0.0;
- // s->nucs[pos].fixed = true;
- // s->nucs[pos].filtered_site = true;
- // return 0;
- // }
- // } else {
- // if (allele_q < minor_allele_freq) {
- // s->nucs[pos].pi = 0.0;
- // s->nucs[pos].fixed = true;
- // s->nucs[pos].filtered_site = true;
- // return 0;
- // }
- // }
+ // if (allele_p < allele_q) {
+ // if (allele_p < minor_allele_freq) {
+ // s->nucs[pos].pi = 0.0;
+ // s->nucs[pos].fixed = true;
+ // s->nucs[pos].filtered_site = true;
+ // return 0;
+ // }
+ // } else {
+ // if (allele_q < minor_allele_freq) {
+ // s->nucs[pos].pi = 0.0;
+ // s->nucs[pos].fixed = true;
+ // s->nucs[pos].filtered_site = true;
+ // return 0;
+ // }
+ // }
// }
//
@@ -1087,29 +1088,29 @@ int PopSum<LocusT>::tally_observed_haplotypes(vector<char *> &obshap, int snp_in
for (uint j = 0; j < obshap.size(); j++) {
nuc = obshap[j][snp_index];
- switch(nuc) {
- case 'A':
- case 'a':
- nucs[0]++;
- break;
- case 'C':
- case 'c':
- nucs[1]++;
- break;
- case 'G':
- case 'g':
- nucs[2]++;
- break;
- case 'T':
- case 't':
- nucs[3]++;
- break;
- }
+ switch(nuc) {
+ case 'A':
+ case 'a':
+ nucs[0]++;
+ break;
+ case 'C':
+ case 'c':
+ nucs[1]++;
+ break;
+ case 'G':
+ case 'g':
+ nucs[2]++;
+ break;
+ case 'T':
+ case 't':
+ nucs[3]++;
+ break;
+ }
}
int allele_cnt = 0;
for (int i = 0; i < 4; i++)
- if (nucs[i] > 0) allele_cnt++;
+ if (nucs[i] > 0) allele_cnt++;
return allele_cnt;
}
@@ -1151,10 +1152,10 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
// sprintf(q2_str, "% 3.0f", q_2);
//
// cerr
- // << " | Allele1 | Allele2 | " << "\n"
- // << "-----+---------+---------+" << "\n"
- // << "Pop1 | " << p1_str << " | " << q1_str << " |" << "\n"
- // << "Pop2 | " << p2_str << " | " << q2_str << " |" << "\n\n";
+ // << " | Allele1 | Allele2 | " << "\n"
+ // << "-----+---------+---------+" << "\n"
+ // << "Pop1 | " << p1_str << " | " << q1_str << " |" << "\n"
+ // << "Pop2 | " << p2_str << " | " << q2_str << " |" << "\n\n";
//
// Compute the first tail.
@@ -1171,31 +1172,31 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
// Compute p and repeat until one or more cells equal 0.
//
if (d_1 - d_2 < 0) {
- do {
- p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
+ do {
+ p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
- tail_1 += p;
- p1--;
- q2--;
- p2++;
- q1++;
- } while (p1 >= 0 && q2 >= 0);
+ tail_1 += p;
+ p1--;
+ q2--;
+ p2++;
+ q1++;
+ } while (p1 >= 0 && q2 >= 0);
} else {
- //
- // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
- // Compute p and repeat until one or more cells equal 0.
- //
- do {
- p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
-
- tail_1 += p;
-
- p2--;
- q1--;
- p1++;
- q2++;
- } while (p2 >= 0 && q1 >= 0);
+ //
+ // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
+ // Compute p and repeat until one or more cells equal 0.
+ //
+ do {
+ p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
+
+ tail_1 += p;
+
+ p2--;
+ q1--;
+ p1++;
+ q2++;
+ } while (p2 >= 0 && q1 >= 0);
}
//
@@ -1209,29 +1210,29 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
// to keep the marginals the same.
//
if (d_1 - d_2 < 0) {
- if (p2 < q1) {
- q2 += p2;
- p1 += p2;
- q1 -= p2;
- p2 = 0;
- } else {
- p1 += q1;
- q2 += q1;
- p2 -= q1;
- q1 = 0;
- }
+ if (p2 < q1) {
+ q2 += p2;
+ p1 += p2;
+ q1 -= p2;
+ p2 = 0;
+ } else {
+ p1 += q1;
+ q2 += q1;
+ p2 -= q1;
+ q1 = 0;
+ }
} else {
- if (p1 < q2) {
- q1 += p1;
- p2 += p1;
- q2 -= p1;
- p1 = 0;
- } else {
- p2 += q2;
- q1 += q2;
- p1 -= q2;
- q2 = 0;
- }
+ if (p1 < q2) {
+ q1 += p1;
+ p2 += p1;
+ q2 -= p1;
+ p1 = 0;
+ } else {
+ p2 += q2;
+ q1 += q2;
+ p1 -= q2;
+ q2 = 0;
+ }
}
//
@@ -1239,36 +1240,36 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
// Compute p and repeat until tail_2 > tail_1.
//
if (d_1 - d_2 < 0) {
- do {
- p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
+ do {
+ p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
- tail_2 += p;
+ tail_2 += p;
- p1--;
- q2--;
- p2++;
- q1++;
- } while (tail_2 < tail_1 && p1 >= 0 && q2 >= 0);
+ p1--;
+ q2--;
+ p2++;
+ q1++;
+ } while (tail_2 < tail_1 && p1 >= 0 && q2 >= 0);
- tail_2 -= p;
+ tail_2 -= p;
} else {
- //
- // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
- // Compute p and repeat until one or more cells equal 0.
- //
- do {
- p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
-
- tail_2 += p;
-
- p2--;
- q1--;
- p1++;
- q2++;
- } while (tail_2 < tail_1 && p2 >= 0 && q1 >= 0);
-
- tail_2 -= p;
+ //
+ // Else, if (p_1*q_2 - p_2*q_1) > 0 decrease cells p_2 and q_1 by one and add one to p_1 and q_2.
+ // Compute p and repeat until one or more cells equal 0.
+ //
+ do {
+ p = (this->binomial_coeff(r_1, p1) * this->binomial_coeff(r_2, p2)) / den;
+
+ tail_2 += p;
+
+ p2--;
+ q1--;
+ p1++;
+ q2++;
+ } while (tail_2 < tail_1 && p2 >= 0 && q1 >= 0);
+
+ tail_2 -= p;
}
pair->fet_p = tail_1 + tail_2;
@@ -1280,10 +1281,10 @@ int PopSum<LocusT>::fishers_exact_test(PopPair *pair, double p_1, double q_1, do
// zero, we will increment all allele frequencies by one.
//
if (p_1 == 0 || q_1 == 0 || p_2 == 0 || q_2 == 0) {
- p_1++;
- q_1++;
- p_2++;
- q_2++;
+ p_1++;
+ q_1++;
+ p_2++;
+ q_2++;
}
pair->fet_or = (p_1 * q_2) / (q_1 * p_2);
@@ -1318,8 +1319,8 @@ double PopSum<LocusT>::pi(double tot_alleles, double p, double q)
// pi = 1 - Sum_i( (n_i choose 2) ) / (n choose 2)
//
double pi =
- this->binomial_coeff(p, 2) +
- this->binomial_coeff(q, 2);
+ this->binomial_coeff(p, 2) +
+ this->binomial_coeff(q, 2);
pi = pi / binomial_coeff(tot_alleles, 2);
pi = 1 - pi;
diff --git a/src/SamI.h b/src/SamI.h
index d12c290..8bab0d1 100644
--- a/src/SamI.h
+++ b/src/SamI.h
@@ -195,14 +195,17 @@ Sam::find_start_bp_pos(int aln_bp, vector<pair<char, uint> > &cigar)
int
Sam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
{
- char buf[id_len];
- uint size = cigar.size();
- char op;
- uint dist, bp, len, buf_len, j, k, stop;
+ char *buf;
+ uint size = cigar.size();
+ char op;
+ uint dist, bp, len, buf_len, buf_size, j, k, stop;
len = strlen(seq);
bp = 0;
+ buf = new char[len + 1];
+ buf_size = len + 1;
+
for (uint i = 0; i < size; i++) {
op = cigar[i].first;
dist = cigar[i].second;
@@ -222,8 +225,10 @@ Sam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
// Pad the read with sufficent Ns to match the deletion, shifting the existing
// sequence down. Trim the final length to keep the read length consistent.
//
- strncpy(buf, seq + bp, id_len - 1);
- buf[id_len - 1] = '\0';
+ k = bp >= len ? len : bp;
+
+ strncpy(buf, seq + k, buf_size - 1);
+ buf[buf_size - 1] = '\0';
buf_len = strlen(buf);
stop = bp + dist;
@@ -246,10 +251,12 @@ Sam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
// An insertion has occurred in the read relative to the reference genome. Delete the
// inserted bases and pad the end of the read with Ns.
//
- k = bp + dist;
- strncpy(buf, seq + k, id_len - 1);
- buf[id_len - 1] = '\0';
- buf_len = strlen(buf);
+ if (bp >= len) break;
+
+ k = bp + dist > len ? len : bp + dist;
+ strncpy(buf, seq + k, buf_size - 1);
+ buf[buf_size - 1] = '\0';
+ buf_len = strlen(buf);
j = bp;
k = 0;
@@ -269,9 +276,13 @@ Sam::edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
case 'M':
bp += dist;
break;
+ default:
+ break;
}
}
+ delete [] buf;
+
return 0;
}
diff --git a/src/aln_utils.cc b/src/aln_utils.cc
new file mode 100644
index 0000000..a253981
--- /dev/null
+++ b/src/aln_utils.cc
@@ -0,0 +1,216 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2016, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+//
+
+//
+// aln_utils.cc -- common routines needed for dealing with gapped alignments.
+//
+
+#include "aln_utils.h"
+
+string
+invert_cigar(string cigar)
+{
+ for (uint i = 0; i < cigar.length(); i++) {
+ if (cigar[i] == 'I')
+ cigar[i] = 'D';
+ else if (cigar[i] == 'D')
+ cigar[i] = 'I';
+ }
+
+ return cigar;
+}
+
+int
+parse_cigar(const char *cigar_str, vector<pair<char, uint> > &cigar)
+{
+ char buf[id_len];
+ int dist;
+ const char *p, *q;
+
+ uint seqlen = 0;
+
+ cigar.clear();
+
+ p = cigar_str;
+
+ while (*p != '\0') {
+ q = p + 1;
+
+ while (*q != '\0' && isdigit(*q))
+ q++;
+ strncpy(buf, p, q - p);
+ buf[q-p] = '\0';
+ dist = is_integer(buf);
+
+ cigar.push_back(make_pair(*q, dist));
+
+ seqlen += dist;
+
+ p = q + 1;
+ }
+
+ return seqlen;
+}
+
+string
+apply_cigar_to_seq(const char *seq, vector<pair<char, uint> > &cigar)
+{
+ uint size = cigar.size();
+ char op;
+ uint dist, bp, edited_bp, stop;
+ string edited_seq;
+
+ //
+ // Calculate the overall sequence length.
+ //
+ uint seqlen = 0;
+ for (uint i = 0; i < size; i++)
+ seqlen += cigar[i].second;
+
+ bp = 0;
+
+ edited_seq.reserve(seqlen);
+
+ for (uint i = 0; i < size; i++) {
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'S':
+ stop = bp + dist;
+ while (bp < stop) {
+ edited_seq.push_back('N');
+ bp++;
+ }
+ break;
+ case 'D':
+ edited_bp = 0;
+ while (edited_bp < dist) {
+ edited_seq.push_back('N');
+ edited_bp++;
+ }
+ break;
+ case 'I':
+ case 'M':
+ stop = bp + dist;
+ while (bp < stop) {
+ edited_seq.push_back(seq[bp]);
+ bp++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return edited_seq;
+}
+
+int
+apply_cigar_to_model_seq(char *seq, uint seq_len, const char *model, vector<pair<char, uint> > &cigar)
+{
+ uint size = cigar.size();
+ char op;
+ uint dist, model_bp, seq_bp, model_len, stop;
+
+ model_len = strlen(model);
+ model_bp = 0;
+ seq_bp = 0;
+
+ for (uint i = 0; i < size; i++) {
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'S':
+ stop = seq_bp + dist;
+ stop = stop > seq_len ? seq_len : stop;
+ while (seq_bp < stop) {
+ seq[seq_bp] = 'U';
+ seq_bp++;
+ model_bp++;
+ }
+ break;
+ case 'D':
+ stop = seq_bp + dist;
+ stop = stop > seq_len ? seq_len : stop;
+ while (seq_bp < stop) {
+ seq[seq_bp] = 'U';
+ seq_bp++;
+ }
+ break;
+ case 'I':
+ case 'M':
+ stop = model_bp + dist;
+ stop = stop > seq_len ? seq_len : stop;
+ while (model_bp < stop) {
+ seq[seq_bp] = model[model_bp];
+ seq_bp++;
+ model_bp++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ seq[seq_len] = '\0';
+
+ return 0;
+}
+
+int
+adjust_snps_for_gaps(vector<pair<char, uint> > &cigar, Locus *loc)
+{
+ uint size = cigar.size();
+ char op;
+ uint dist, bp, stop, offset, snp_index;
+
+ bp = 0;
+ offset = 0;
+ snp_index = 0;
+
+ for (uint i = 0; i < size; i++) {
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'D':
+ offset += dist;
+ break;
+ case 'I':
+ case 'M':
+ case 'S':
+ stop = bp + dist;
+ while (bp < stop && snp_index < loc->snps.size()) {
+ if (loc->snps[snp_index]->col == bp) {
+ loc->snps[snp_index]->col += offset;
+ snp_index++;
+ }
+ bp++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
diff --git a/src/aln_utils.h b/src/aln_utils.h
new file mode 100644
index 0000000..795fd24
--- /dev/null
+++ b/src/aln_utils.h
@@ -0,0 +1,42 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2016, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef __ALN_UTILS_H__
+#define __ALN_UTILS_H__
+
+#include <utility>
+using std::pair;
+using std::make_pair;
+#include <string>
+using std::string;
+#include <vector>
+using std::vector;
+
+#include "locus.h"
+#include "constants.h"
+#include "utils.h"
+
+string invert_cigar(string);
+int parse_cigar(const char *, vector<pair<char, uint> > &);
+string apply_cigar_to_seq(const char *, vector<pair<char, uint> > &);
+int apply_cigar_to_model_seq(char *, uint, const char *, vector<pair<char, uint> > &);
+int adjust_snps_for_gaps(vector<pair<char, uint> > &, Locus *);
+
+#endif // __ALN_UTILS_H__
diff --git a/src/catalog_utils.cc b/src/catalog_utils.cc
index b01316e..61d666d 100644
--- a/src/catalog_utils.cc
+++ b/src/catalog_utils.cc
@@ -35,17 +35,17 @@ reduce_catalog(map<int, CSLocus *> &catalog, set<int> &whitelist, set<int> &blac
CSLocus *loc;
if (whitelist.size() == 0 && blacklist.size() == 0)
- return 0;
+ return 0;
int i = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (whitelist.size() > 0 && whitelist.count(loc->id) == 0) continue;
- if (blacklist.count(loc->id)) continue;
+ if (whitelist.size() > 0 && whitelist.count(loc->id) == 0) continue;
+ if (blacklist.count(loc->id)) continue;
- list[it->first] = it->second;
- i++;
+ list[it->first] = it->second;
+ i++;
}
catalog = list;
@@ -61,47 +61,47 @@ implement_single_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *ps
LocTally *t;
if (whitelist.size() > 0) {
- map<int, set<int> >::iterator it;
-
- for (it = whitelist.begin(); it != whitelist.end(); it++) {
- loc = catalog[it->first];
- t = psum->locus_tally(loc->id);
-
- //
- // If no specific SNPs are specified in the whitelist all SNPs are included, choose the first variant.
- //
- if (it->second.size() == 0) {
- for (uint i = 0; i < loc->snps.size(); i++)
- if (t->nucs[loc->snps[i]->col].fixed == false) {
- new_wl[loc->id].insert(loc->snps[i]->col);
- break;
- }
- } else {
- //
- // Otherwise, choose the first SNP that is already in the whitelist.
- //
- for (uint i = 0; i < loc->snps.size(); i++) {
- if (it->second.count(loc->snps[i]->col) == 0 ||
- t->nucs[loc->snps[i]->col].fixed == true)
- continue;
- new_wl[loc->id].insert(loc->snps[i]->col);
- break;
- }
- }
- }
+ map<int, set<int> >::iterator it;
+
+ for (it = whitelist.begin(); it != whitelist.end(); it++) {
+ loc = catalog[it->first];
+ t = psum->locus_tally(loc->id);
+
+ //
+ // If no specific SNPs are specified in the whitelist all SNPs are included, choose the first variant.
+ //
+ if (it->second.size() == 0) {
+ for (uint i = 0; i < loc->snps.size(); i++)
+ if (t->nucs[loc->snps[i]->col].fixed == false) {
+ new_wl[loc->id].insert(loc->snps[i]->col);
+ break;
+ }
+ } else {
+ //
+ // Otherwise, choose the first SNP that is already in the whitelist.
+ //
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ if (it->second.count(loc->snps[i]->col) == 0 ||
+ t->nucs[loc->snps[i]->col].fixed == true)
+ continue;
+ new_wl[loc->id].insert(loc->snps[i]->col);
+ break;
+ }
+ }
+ }
} else {
- map<int, CSLocus *>::iterator it;
-
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++)
- if (t->nucs[loc->snps[i]->col].fixed == false) {
- new_wl[loc->id].insert(loc->snps[i]->col);
- break;
- }
- }
+ map<int, CSLocus *>::iterator it;
+
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ loc = it->second;
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++)
+ if (t->nucs[loc->snps[i]->col].fixed == false) {
+ new_wl[loc->id].insert(loc->snps[i]->col);
+ break;
+ }
+ }
}
whitelist = new_wl;
@@ -117,34 +117,34 @@ implement_random_snp_whitelist(map<int, CSLocus *> &catalog, PopSum<CSLocus> *ps
uint index;
if (whitelist.size() > 0) {
- map<int, set<int> >::iterator it;
-
- for (it = whitelist.begin(); it != whitelist.end(); it++) {
- loc = catalog[it->first];
-
- if (loc->snps.size() == 0) continue;
-
- if (it->second.size() == 0) {
- index = rand() % loc->snps.size();
- new_wl[loc->id].insert(loc->snps[index]->col);
- } else {
- do {
- index = rand() % loc->snps.size();
- } while (it->second.count(loc->snps[index]->col) == 0);
- new_wl[loc->id].insert(loc->snps[index]->col);
- }
- }
+ map<int, set<int> >::iterator it;
+
+ for (it = whitelist.begin(); it != whitelist.end(); it++) {
+ loc = catalog[it->first];
+
+ if (loc->snps.size() == 0) continue;
+
+ if (it->second.size() == 0) {
+ index = rand() % loc->snps.size();
+ new_wl[loc->id].insert(loc->snps[index]->col);
+ } else {
+ do {
+ index = rand() % loc->snps.size();
+ } while (it->second.count(loc->snps[index]->col) == 0);
+ new_wl[loc->id].insert(loc->snps[index]->col);
+ }
+ }
} else {
- map<int, CSLocus *>::iterator it;
+ map<int, CSLocus *>::iterator it;
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ loc = it->second;
- if (loc->snps.size() > 0) {
- index = rand() % loc->snps.size();
- new_wl[loc->id].insert(loc->snps[index]->col);
- }
- }
+ if (loc->snps.size() > 0) {
+ index = rand() % loc->snps.size();
+ new_wl[loc->id].insert(loc->snps[index]->col);
+ }
+ }
}
whitelist = new_wl;
@@ -168,29 +168,29 @@ check_whitelist_integrity(map<int, CSLocus *> &catalog, map<int, set<int> > &whi
cerr << "Checking the integrity of the whitelist...";
for (it = whitelist.begin(); it != whitelist.end(); it++) {
- if (catalog.count(it->first) == 0) {
- rm_loci++;
- cerr << "\n Removing locus " << it->first << " from whitelist as it does not exist in the catalog.";
- } else {
- loc = catalog[it->first];
-
- if (it->second.size() == 0) {
- new_wl.insert(make_pair(it->first, std::set<int>()));
- continue;
- }
-
- set<int> cat_snps;
- for (uint i = 0; i < loc->snps.size(); i++)
- cat_snps.insert(loc->snps[i]->col);
-
- for (sit = it->second.begin(); sit != it->second.end(); sit++)
- if (cat_snps.count(*sit)) {
- new_wl[it->first].insert(*sit);
- } else {
- rm_snps++;
- cerr << "\n Removing SNP at column " << *sit << " in locus " << it->first << " from whitelist as it does not exist in the catalog.";
- }
- }
+ if (catalog.count(it->first) == 0) {
+ rm_loci++;
+ cerr << "\n Removing locus " << it->first << " from whitelist as it does not exist in the catalog.";
+ } else {
+ loc = catalog[it->first];
+
+ if (it->second.size() == 0) {
+ new_wl.insert(make_pair(it->first, std::set<int>()));
+ continue;
+ }
+
+ set<int> cat_snps;
+ for (uint i = 0; i < loc->snps.size(); i++)
+ cat_snps.insert(loc->snps[i]->col);
+
+ for (sit = it->second.begin(); sit != it->second.end(); sit++)
+ if (cat_snps.count(*sit)) {
+ new_wl[it->first].insert(*sit);
+ } else {
+ rm_snps++;
+ cerr << "\n Removing SNP at column " << *sit << " in locus " << it->first << " from whitelist as it does not exist in the catalog.";
+ }
+ }
}
whitelist = new_wl;
@@ -198,7 +198,7 @@ check_whitelist_integrity(map<int, CSLocus *> &catalog, map<int, set<int> > &whi
if (rm_loci > 0 || rm_snps > 0) cerr << "\n";
cerr << "done.\n"
- << "Removed " << rm_loci << " loci and " << rm_snps << " SNPs from the whitelist that were not found in the catalog.\n";
+ << "Removed " << rm_loci << " loci and " << rm_snps << " SNPs from the whitelist that were not found in the catalog.\n";
return 0;
}
@@ -211,17 +211,17 @@ reduce_catalog(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist, set
CSLocus *loc;
if (whitelist.size() == 0 && blacklist.size() == 0)
- return 0;
+ return 0;
int i = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (whitelist.size() > 0 && whitelist.count(loc->id) == 0) continue;
- if (blacklist.count(loc->id)) continue;
+ if (whitelist.size() > 0 && whitelist.count(loc->id) == 0) continue;
+ if (blacklist.count(loc->id)) continue;
- list[it->first] = it->second;
- i++;
+ list[it->first] = it->second;
+ i++;
}
catalog = list;
@@ -237,7 +237,7 @@ reduce_catalog_snps(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist
Datum **d;
if (whitelist.size() == 0)
- return 0;
+ return 0;
//
// We want to prune out SNP objects that are not in the whitelist.
@@ -248,93 +248,93 @@ reduce_catalog_snps(map<int, CSLocus *> &catalog, map<int, set<int> > &whitelist
map<string, int> obshaps;
map<string, int>::iterator sit;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
-
- if (whitelist[loc->id].size() == 0)
- continue;
-
- tmp.clear();
- cols.clear();
-
- d = pmap->locus(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- if (whitelist[loc->id].count(loc->snps[i]->col) > 0) {
- tmp.push_back(loc->snps[i]);
- cols.push_back(i);
- } else {
- //
- // Change the model calls in the samples to no longer contain this SNP.
- //
- pos = loc->snps[i]->col;
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL || pos >= d[j]->len)
- continue;
- if (d[j]->model != NULL) {
- d[j]->model[pos] = 'U';
- }
- }
-
- delete loc->snps[i];
- }
- }
- loc->snps.clear();
- for (uint i = 0; i < tmp.size(); i++)
- loc->snps.push_back(tmp[i]);
-
- map<string, int>::iterator it;
- char allele_old[id_len], allele_new[id_len];
- //
- // We need to adjust the catalog's list of haplotypes/alleles
- // for this locus to account for the pruned SNPs.
- //
- for (it = loc->alleles.begin(); it != loc->alleles.end(); it++) {
- strncpy(allele_old, it->first.c_str(), id_len - 1);
- allele_old[id_len - 1] = '\0';
-
- for (uint k = 0; k < cols.size(); k++)
- allele_new[k] = allele_old[cols[k]];
- allele_new[cols.size()] = '\0';
- obshaps[string(allele_new)] += it->second;
- }
- loc->alleles.clear();
- for (sit = obshaps.begin(); sit != obshaps.end(); sit++) {
- loc->alleles[sit->first] = sit->second;
- }
- obshaps.clear();
-
- loc->populate_alleles();
-
- //
- // Now we need to adjust the matched haplotypes to sync to
- // the SNPs left in the catalog.
- //
- // Reducing the lengths of the haplotypes may create
- // redundant (shorter) haplotypes, we need to remove these.
- //
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
-
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- for (uint k = 0; k < cols.size(); k++)
- d[i]->obshap[j][k] = d[i]->obshap[j][cols[k]];
- d[i]->obshap[j][cols.size()] = '\0';
- obshaps[d[i]->obshap[j]] += d[i]->depth[j];
- }
- uint j = 0;
- for (sit = obshaps.begin(); sit != obshaps.end(); sit++) {
- strcpy(d[i]->obshap[j], sit->first.c_str());
- d[i]->depth[j] = sit->second;
- j++;
- }
- while (j < d[i]->obshap.size()) {
- delete [] d[i]->obshap[j];
- j++;
- }
- d[i]->obshap.resize(obshaps.size());
- d[i]->depth.resize(obshaps.size());
- obshaps.clear();
- }
+ loc = it->second;
+
+ if (whitelist[loc->id].size() == 0)
+ continue;
+
+ tmp.clear();
+ cols.clear();
+
+ d = pmap->locus(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ if (whitelist[loc->id].count(loc->snps[i]->col) > 0) {
+ tmp.push_back(loc->snps[i]);
+ cols.push_back(i);
+ } else {
+ //
+ // Change the model calls in the samples to no longer contain this SNP.
+ //
+ pos = loc->snps[i]->col;
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ if (d[j] == NULL || pos >= d[j]->len)
+ continue;
+ if (d[j]->model != NULL) {
+ d[j]->model[pos] = 'U';
+ }
+ }
+
+ delete loc->snps[i];
+ }
+ }
+ loc->snps.clear();
+ for (uint i = 0; i < tmp.size(); i++)
+ loc->snps.push_back(tmp[i]);
+
+ map<string, int>::iterator it;
+ char allele_old[id_len], allele_new[id_len];
+ //
+ // We need to adjust the catalog's list of haplotypes/alleles
+ // for this locus to account for the pruned SNPs.
+ //
+ for (it = loc->alleles.begin(); it != loc->alleles.end(); it++) {
+ strncpy(allele_old, it->first.c_str(), id_len - 1);
+ allele_old[id_len - 1] = '\0';
+
+ for (uint k = 0; k < cols.size(); k++)
+ allele_new[k] = allele_old[cols[k]];
+ allele_new[cols.size()] = '\0';
+ obshaps[string(allele_new)] += it->second;
+ }
+ loc->alleles.clear();
+ for (sit = obshaps.begin(); sit != obshaps.end(); sit++) {
+ loc->alleles[sit->first] = sit->second;
+ }
+ obshaps.clear();
+
+ loc->populate_alleles();
+
+ //
+ // Now we need to adjust the matched haplotypes to sync to
+ // the SNPs left in the catalog.
+ //
+ // Reducing the lengths of the haplotypes may create
+ // redundant (shorter) haplotypes, we need to remove these.
+ //
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL) continue;
+
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ for (uint k = 0; k < cols.size(); k++)
+ d[i]->obshap[j][k] = d[i]->obshap[j][cols[k]];
+ d[i]->obshap[j][cols.size()] = '\0';
+ obshaps[d[i]->obshap[j]] += d[i]->depth[j];
+ }
+ uint j = 0;
+ for (sit = obshaps.begin(); sit != obshaps.end(); sit++) {
+ strcpy(d[i]->obshap[j], sit->first.c_str());
+ d[i]->depth[j] = sit->second;
+ j++;
+ }
+ while (j < d[i]->obshap.size()) {
+ delete [] d[i]->obshap[j];
+ j++;
+ }
+ d[i]->obshap.resize(obshaps.size());
+ d[i]->depth.resize(obshaps.size());
+ obshaps.clear();
+ }
}
return 0;
diff --git a/src/constants.h b/src/constants.h
index 9e1f3d9..1bd1f03 100644
--- a/src/constants.h
+++ b/src/constants.h
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -28,6 +28,8 @@
#include "config.h"
#endif
+typedef unsigned int uint;
+
//
//
//
diff --git a/src/cstacks.cc b/src/cstacks.cc
index 13b0304..464cc99 100644
--- a/src/cstacks.cc
+++ b/src/cstacks.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -28,16 +28,19 @@
queue<pair<int, string> > samples;
string out_path;
string catalog_path;
-FileT in_file_type = FileT::sql;
-int batch_id = 0;
-int ctag_dist = 1;
-bool set_kmer_len = true;
-int kmer_len = 0;
-searcht search_type = sequence;
-int num_threads = 1;
-bool mult_matches = false;
-bool report_mmatches = false;
+FileT in_file_type = FileT::sql;
+int batch_id = 0;
+int ctag_dist = 1;
+bool set_kmer_len = true;
+int kmer_len = 0;
+searcht search_type = sequence;
+int num_threads = 1;
+bool mult_matches = false;
+bool report_mmatches = false;
bool require_uniq_haplotypes = false;
+bool gapped_alignments = false;
+double min_match_len = 0.80;
+double max_gaps = 2.0;
int main (int argc, char* argv[]) {
@@ -114,21 +117,35 @@ int main (int argc, char* argv[]) {
if (search_type == sequence) {
cerr << "Searching for sequence matches...\n";
find_kmer_matches_by_sequence(catalog, sample, ctag_dist);
- } else if (search_type == genomic_loc) {
+
+ if (gapped_alignments) {
+ cerr << "Searching for gapped alignments...\n";
+ search_for_gaps(catalog, sample, min_match_len, ctag_dist);
+ }
+
+ } else if (search_type == genomic_loc) {
cerr << "Searching for matches by genomic location...\n";
find_matches_by_genomic_loc(cat_index, sample);
}
cerr << "Merging matches into catalog...\n";
uint mmatches = 0;
- merge_matches(catalog, sample, s, ctag_dist, mmatches);
- cerr << " " << mmatches << " loci matched more than one catalog locus and were excluded.\n";
+ uint gmatches = 0;
+ uint umatches = 0;
+ uint nmatches = 0;
+ merge_matches(catalog, sample, s, ctag_dist, nmatches, umatches, gmatches, mmatches);
+ cerr << " " << umatches << " loci were matched to a catalog locus.\n"
+ << " " << gmatches << " loci were matched to a catalog locus using gapped alignments.\n"
+ << " " << nmatches << " loci were newly added to the catalog.\n"
+ << " " << mmatches << " loci matched more than one catalog locus and were excluded.\n";
//
// Regenerate the alleles for the catalog tags after merging the new sample into the catalog.
//
- for (cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++)
+ for (cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++) {
cat_it->second->populate_alleles();
+ cat_it->second->match_cnt = 0;
+ }
if (search_type == genomic_loc) {
cerr << " Updating catalog index...\n";
@@ -192,6 +209,7 @@ int characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag) {
if (snp_cols.count(i) == 0 &&
(*c != *q) && (*c != 'N' && *q != 'N')) {
+ // cerr << "Adding a new SNP at position " << c - c_beg << ", " << *c << "/" << *q << "\n";
SNP *s = new SNP;
s->type = snp_type_het;
s->col = c - c_beg;
@@ -222,12 +240,17 @@ int characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag) {
}
int
-merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int, string> &sample_file, int ctag_dist, uint &mmatches)
+merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int, string> &sample_file, int ctag_dist,
+ uint &new_matches, uint &unique_matches, uint &gapped_matches, uint &multiple_matches)
{
map<int, QLocus *>::iterator i;
- vector<Match *>::iterator mat_it;
CLocus *ctag;
QLocus *qtag;
+ string cseq, qseq, cigar_str;
+ int cseq_len, qseq_len, match_index;
+ vector<pair<char, uint> > cigar;
+
+ GappedAln *aln = new GappedAln();
for (i = sample.begin(); i != sample.end(); i++) {
qtag = i->second;
@@ -238,6 +261,7 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
//
if (qtag->matches.size() == 0) {
add_unique_tag(sample_file, catalog, qtag);
+ new_matches++;
continue;
}
@@ -247,24 +271,24 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
// for a locus.
//
map<int, uint> local_matches;
- map<int, uint>::iterator j;
- for (mat_it = qtag->matches.begin(); mat_it != qtag->matches.end(); mat_it++) {
- if (local_matches.count((*mat_it)->cat_id) == 0)
- local_matches[(*mat_it)->cat_id] = (*mat_it)->dist;
- else if ((*mat_it)->dist < local_matches[(*mat_it)->cat_id])
- local_matches[(*mat_it)->cat_id] = (*mat_it)->dist;
+ for (uint k = 0; k < qtag->matches.size(); k++) {
+ if (local_matches.count(qtag->matches[k]->cat_id) == 0)
+ local_matches[qtag->matches[k]->cat_id] = qtag->matches[k]->dist;
+ else if (qtag->matches[k]->dist < local_matches[qtag->matches[k]->cat_id])
+ local_matches[qtag->matches[k]->cat_id] = qtag->matches[k]->dist;
}
uint min_dist = 1000;
uint num_matches = 0;
int min_cat_id = -1;
+
//
// Find the minimum distance and then check how many matches have that distance.
//
- for (j = local_matches.begin(); j != local_matches.end(); j++)
+ for (map<int, uint>::iterator j = local_matches.begin(); j != local_matches.end(); j++)
min_dist = j->second < min_dist ? j->second : min_dist;
- for (j = local_matches.begin(); j != local_matches.end(); j++)
+ for (map<int, uint>::iterator j = local_matches.begin(); j != local_matches.end(); j++)
if (j->second == min_dist) {
num_matches++;
min_cat_id = j->first;
@@ -274,12 +298,12 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
// Emit a warning if the query tag matches more than one tag in the catalog.
//
if (num_matches > 1) {
- mmatches++;
+ multiple_matches++;
if (report_mmatches) {
cerr <<
" Warning: sample " << sample_file.second << ", tag " << qtag->id <<
", matches more than one tag in the catalog and was excluded: ";
- for (j = local_matches.begin(); j != local_matches.end(); j++)
+ for (map<int, uint>::iterator j = local_matches.begin(); j != local_matches.end(); j++)
cerr << j->first << " ";
cerr << "\n";
}
@@ -295,6 +319,78 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
if (ctag == NULL)
cerr << " Unable to locate catalog tag " << min_cat_id << "\n";
+ cigar_str = "";
+
+ for (uint k = 0; k < qtag->matches.size(); k++)
+ if (qtag->matches[k]->cat_id == min_cat_id) {
+ cigar_str = qtag->matches[k]->cigar;
+ match_index = k;
+ break;
+ }
+
+ bool gapped_aln = false;
+ if (cigar_str.length() > 0)
+ gapped_aln = true;
+
+ //
+ // If the match was a gapped alignment, adjust the lengths of the consensus sequences.
+ // Adjust the postition of any SNPs that were shifted down sequence due to a gap.
+ //
+ if (gapped_aln) {
+ qseq_len = parse_cigar(cigar_str.c_str(), cigar);
+
+ if (ctag->match_cnt > 0) {
+ string query_allele, query_seq, cat_allele, cat_seq;
+ // cerr << " Warning: Catalog locus " << ctag->id
+ // << ", Sample " << qtag->sample_id << ", locus " << qtag->id
+ // << "; sequence length has changed since original alignment: "
+ // << qseq_len << " <-> " << ctag->len
+ // << "; re-aligning.\n";
+
+ //
+ // Find the proper query allele to align against the catalog. We can align
+ // against the catalog consensus because the allele strings may have changed.
+ //
+ query_allele = qtag->matches[match_index]->query_type;
+ for (uint k = 0; k < qtag->strings.size(); k++)
+ if (qtag->strings[k].first == query_allele) {
+ query_seq = qtag->strings[k].second;
+ break;
+ }
+ aln->init(ctag->len, qtag->len);
+ aln->align(ctag->con, query_seq);
+ cigar_str = invert_cigar(aln->result().cigar);
+ parse_cigar(cigar_str.c_str(), cigar);
+ }
+
+ qseq = apply_cigar_to_seq(qtag->con, cigar);
+ adjust_snps_for_gaps(cigar, qtag);
+
+ cigar_str = invert_cigar(cigar_str);
+ cseq_len = parse_cigar(cigar_str.c_str(), cigar);
+ cseq = apply_cigar_to_seq(ctag->con, cigar);
+ adjust_snps_for_gaps(cigar, ctag);
+
+ //
+ // If the alignment modified the catalog locus, record it so we can re-align
+ // any other matching sequences from this sample.
+ //
+ if (cseq_len > ctag->len)
+ ctag->match_cnt++;
+
+ //
+ // Adjust the consensus sequences for both loci.
+ //
+ ctag->add_consensus(cseq.c_str());
+ qtag->add_consensus(qseq.c_str());
+
+ gapped_matches++;
+
+ } else {
+ unique_matches++;
+ ctag->match_cnt++;
+ }
+
//
// If mismatches are allowed between query and catalog tags, identify the
// mismatches and convert them into SNP objects to be merged into the catalog tag.
@@ -309,14 +405,19 @@ merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int,
// Merge the SNPs and alleles from the sample into the catalog tag.
//
if (!ctag->merge_snps(qtag)) {
- cerr << "Error merging " << sample_file.second << ", tag " << qtag->id <<
- " with catalog tag " << ctag->id << "\n";
+ cerr << "Error merging " << sample_file.second << ", tag " << qtag->id
+ << " with catalog tag " << ctag->id << "\n";
}
- //
- // If the catalog consensus tag is shorter than the query tag, replace it.
- //
- if (strlen(ctag->con) < strlen(qtag->con)) {
+ //
+ // Add any new sequence information into the catalog consensus.
+ //
+ if (gapped_aln) {
+ for (uint k = 0; k < ctag->len; k++)
+ if (qtag->con[k] != 'N' && ctag->con[k] == 'N')
+ ctag->con[k] = qtag->con[k];
+
+ } else if (strlen(ctag->con) < strlen(qtag->con)) {
ctag->add_consensus(qtag->con);
}
@@ -373,13 +474,13 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
// Calculate the distance (number of mismatches) between each pair
// of Radtags. We expect all radtags to be the same length;
//
- CatKmerHashMap kmer_map;
- vector<char *> kmer_map_keys;
- map<int, QLocus *>::iterator it;
+ KmerHashMap kmer_map;
+ map<int, pair<allele_type, int> > allele_map;
+ vector<char *> kmer_map_keys;
+ map<int, QLocus *>::iterator it;
vector<pair<allele_type, string> >::iterator allele;
QLocus *tag_1;
CLocus *tag_2;
- int i, j;
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
@@ -395,102 +496,356 @@ int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *
if (set_kmer_len) kmer_len = determine_kmer_length(con_len, ctag_dist);
int num_kmers = con_len - kmer_len + 1;
- cerr << " Distance allowed between stacks: " << ctag_dist << "\n"
- << " Using a k-mer length of " << kmer_len << "\n"
- << " Number of kmers per sequence: " << num_kmers << "\n";
-
//
// Calculate the minimum number of matching k-mers required for a possible sequence match.
//
int min_hits = calc_min_kmer_matches(kmer_len, ctag_dist, con_len, set_kmer_len ? true : false);
- populate_kmer_hash(catalog, kmer_map, kmer_map_keys, kmer_len);
+ cerr << " Distance allowed between stacks: " << ctag_dist
+ << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
+ << min_hits << " k-mer hits required.\n";
+ // clock_t time_1, time_2, time_3, time_4;
+ // double per_locus = 0.0;
+
+ // time_1 = clock();
+ populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);
+ // time_2 = clock();
+
cerr << " " << catalog.size() << " loci in the catalog, " << kmer_map.size() << " kmers in the catalog hash.\n";
-
- #pragma omp parallel private(i, j, tag_1, tag_2, allele)
- {
+
+ #pragma omp parallel private(tag_1, tag_2, allele)
+ {
+ KmerHashMap::iterator h;
+ vector<char *> kmers;
+ set<string> uniq_kmers;
+ vector<int> hits;
+ vector<pair<int, int> > ordered_hits;
+ uint hit_cnt, index, prev_id, allele_id, hits_size;
+ int d;
+ pair<allele_type, int> cat_hit;
+
+ initialize_kmers(kmer_len, num_kmers, kmers);
+
#pragma omp for
- for (i = 0; i < (int) keys.size(); i++) {
+ for (uint i = 0; i < keys.size(); i++) {
tag_1 = sample[keys[i]];
- for (allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
+ // time_3 = clock();
- vector<char *> kmers;
- generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
+ for (allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
+
+ generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);
- map<int, vector<allele_type> > hits;
- vector<pair<allele_type, int> >::iterator map_it;
- int d;
//
- // Lookup the occurances of each k-mer in the kmer_map
- //
- for (j = 0; j < num_kmers; j++) {
+ // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
+ // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
+ //
+ uniq_kmers.clear();
+ for (uint j = 0; j < num_kmers; j++)
+ uniq_kmers.insert(kmers[j]);
+
+ hits.clear();
+ ordered_hits.clear();
+
+ //
+ // Lookup the occurances of each k-mer in the kmer_map
+ //
+ for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
+
+ h = kmer_map.find(j->c_str());
+
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits.push_back(h->second[k]);
+ }
+
+ //
+ // Sort the vector of indexes; provides the number of hits to each allele/locus
+ // and orders them largest to smallest.
+ //
+ sort(hits.begin(), hits.end());
+
+ //
+ // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
+ //
+ hits_size = hits.size();
+
+ if (hits_size == 0)
+ continue;
+
+ prev_id = hits[0];
+ index = 0;
+
+ do {
+ hit_cnt = 0;
+ allele_id = prev_id;
+
+ while (hits[index] == prev_id) {
+ hit_cnt++;
+ index++;
+ }
+
+ if (index < hits_size)
+ prev_id = hits[index];
- if (kmer_map.count(kmers[j]) > 0)
- for (map_it = kmer_map[kmers[j]].begin();
- map_it != kmer_map[kmers[j]].end();
- map_it++)
- hits[map_it->second].push_back(map_it->first);
+ if (hit_cnt >= min_hits)
+ ordered_hits.push_back(make_pair(allele_id, hit_cnt));
+
+ } while (index < hits_size);
+
+ for (uint j = 0; j < ordered_hits.size(); j++) {
+ cat_hit = allele_map.at(ordered_hits[j].first);
+ hit_cnt = ordered_hits[j].second;
+
+ tag_2 = catalog[cat_hit.second];
+
+ d = dist(allele->second.c_str(), tag_2, cat_hit.first);
+
+ if (d < 0)
+ cerr <<
+ "Unknown error calculating distance between " <<
+ tag_1->id << " and " << tag_2->id << "; query allele: " << allele->first << "\n";
+
+ //
+ // Add a match to the query sequence: catalog ID, catalog allele, query allele, distance
+ //
+ if (d <= ctag_dist)
+ tag_1->add_match(tag_2->id, cat_hit.first, allele->first, d);
}
+ }
- //
- // Free the allocated k-mers.
- //
- for (j = 0; j < num_kmers; j++)
- delete [] kmers[j];
- kmers.clear();
+ // Sort the vector of distances.
+ sort(tag_1->matches.begin(), tag_1->matches.end(), compare_matches);
+
+ // time_4 = clock();
+ // per_locus += (time_4 - time_3);
+ }
+
+ //
+ // Free the allocated k-mers.
+ //
+ for (uint j = 0; j < kmers.size(); j++)
+ delete [] kmers[j];
+ kmers.clear();
+ }
+
+ // cerr << "Time to kmerize catalog: " << time_2 - time_1 << "\n"
+ // << "Average time per locus: " << per_locus / (double) keys.size() << "\n";
+
+ free_kmer_hash(kmer_map, kmer_map_keys);
+
+ return 0;
+}
+
+int
+search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double min_match_len, double ctag_dist)
+{
+ //
+ // Search for loci that can be merged with a gapped alignment.
+ //
+ KmerHashMap kmer_map;
+ map<int, pair<allele_type, int> > allele_map;
+ vector<char *> kmer_map_keys;
+ map<int, QLocus *>::iterator it;
+ QLocus *tag_1;
+ CLocus *tag_2;
+
+ //
+ // OpenMP can't parallelize random access iterators, so we convert
+ // our map to a vector of integer keys.
+ //
+ vector<int> keys;
+ for (it = sample.begin(); it != sample.end(); it++)
+ keys.push_back(it->first);
- //cerr << " Tag " << tag_1->id << " hit " << hits.size() << " kmers.\n";
+ //
+ // Calculate the number of k-mers we will generate. If kmer_len == 0,
+ // determine the optimal length for k-mers.
+ //
+ int con_len = strlen(sample[keys[0]]->con);
+ int kmer_len = 19;
+ int num_kmers = con_len - kmer_len + 1;
+
+ //
+ // Calculate the minimum number of matching k-mers required for a possible sequence match.
+ //
+ int min_hits = (round((double) con_len * min_match_len) - (kmer_len * max_gaps)) - kmer_len + 1;
- map<int, vector<allele_type> >::iterator hit_it;
- vector<allele_type>::iterator all_it;
+ cerr << " Searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); " << min_hits << " k-mer hits required.\n";
+
+ // clock_t time_1, time_2, time_3, time_4;
+ // double per_locus = 0.0;
+
+ // time_1 = clock();
+ populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);
+ // time_2 = clock();
+
+ #pragma omp parallel private(tag_1, tag_2)
+ {
+ KmerHashMap::iterator h;
+ AlignRes aln_res;
+ vector<char *> kmers;
+ set<string> uniq_kmers;
+ vector<int> hits;
+ vector<pair<int, int> > ordered_hits;
+ uint hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
+ int d;
+ vector<pair<char, uint> > cigar;
+ pair<allele_type, int> cat_hit;
+ string cat_seq;
+
+ GappedAln *aln = new GappedAln();
+
+ initialize_kmers(kmer_len, num_kmers, kmers);
+
+ #pragma omp for schedule(dynamic)
+ for (uint i = 0; i < keys.size(); i++) {
+ tag_1 = sample[keys[i]];
+
+ //
+ // If we already matched this locus to the catalog without using gapped alignments, skip it now.
+ //
+ if (tag_1->matches.size() > 0)
+ continue;
+
+ // time_3 = clock();
+
+ for (vector<pair<allele_type, string> >::iterator allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
+
+ generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);
//
- // Iterate through the list of hits. For each hit, total up the hits to the various alleles.
- // Any allele that has more than min_hits check its full length to verify a match.
+ // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
+ // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
+ //
+ uniq_kmers.clear();
+ for (uint j = 0; j < num_kmers; j++)
+ uniq_kmers.insert(kmers[j]);
+
+ hits.clear();
+ ordered_hits.clear();
+
//
- for (hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {
- //cerr << " Tag " << hit_it->first << " has " << hit_it->second << " hits (min hits: " << min_hits << ")\n";
+ // Lookup the occurances of each k-mer in the kmer_map
+ //
+ for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
+
+ h = kmer_map.find(j->c_str());
- map<allele_type, int> allele_cnts;
- map<allele_type, int>::iterator cnt_it;
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits.push_back(h->second[k]);
+ }
+
+ //
+ // Sort the vector of indexes; provides the number of hits to each allele/locus
+ // and orders them largest to smallest.
+ //
+ sort(hits.begin(), hits.end());
- for (all_it = hit_it->second.begin(); all_it != hit_it->second.end(); all_it++)
- allele_cnts[*all_it]++;
+ //
+ // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
+ //
+ hits_size = hits.size();
- for (cnt_it = allele_cnts.begin(); cnt_it != allele_cnts.end(); cnt_it++) {
- //cerr << " allele " << cnt_it->first << " has " << cnt_it->second << " hits\n";
+ if (hits_size == 0)
+ continue;
- if (cnt_it->second < min_hits) continue;
+ prev_id = hits[0];
+ index = 0;
- //cerr << " Match found, checking full-length match\n";
+ do {
+ hit_cnt = 0;
+ allele_id = prev_id;
- tag_2 = catalog[hit_it->first];
+ while (hits[index] == prev_id) {
+ hit_cnt++;
+ index++;
+ }
- d = dist(allele->second.c_str(), tag_2, cnt_it->first);
+ if (index < hits_size)
+ prev_id = hits[index];
+
+ if (hit_cnt >= min_hits)
+ ordered_hits.push_back(make_pair(allele_id, hit_cnt));
+
+ } while (index < hits_size);
+
+ if (ordered_hits.size() == 0)
+ continue;
+
+ //
+ // Process the hits from most kmer hits to least kmer hits.
+ //
+ sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);
+
+ //
+ // Only try to align the sequences with the most kmers in common.
+ //
+ top_hit = ordered_hits[0].second;
+ stop = 1;
+ for (uint j = 1; j < ordered_hits.size(); j++)
+ if (ordered_hits[j].second < top_hit) {
+ stop = j;
+ break;
+ }
+
+ for (uint j = 0; j < stop; j++) {
+ cat_hit = allele_map.at(ordered_hits[j].first);
+ hit_cnt = ordered_hits[j].second;
+
+ tag_2 = catalog[cat_hit.second];
+
+ cat_seq = "";
+ for (uint k = 0; k < tag_2->strings.size(); k++)
+ if (tag_2->strings[k].first == cat_hit.first) {
+ cat_seq = tag_2->strings[k].second;
+ break;
+ }
- if (d < 0)
- cerr <<
- "Unknown error calculating distance between " <<
- tag_1->id << " and " << tag_2->id << "; query allele: " << allele->first << "\n";
+ aln->init(tag_2->len, tag_1->len);
- //cerr << " Distance: " << d << " CTAG_DIST: " << ctag_dist << "\n";
+ if (aln->align(cat_seq, allele->second)) {
+ cigar.clear();
+ aln->parse_cigar(cigar);
+ aln_res = aln->result();
+ d = dist(cat_seq.c_str(), allele->second.c_str(), cigar);
//
- // Add a match to the query sequence: catalog ID, catalog allele, query allele, distance
+ // If the alignment has too many gaps, skip it.
//
- if (d <= ctag_dist)
- tag_1->add_match(tag_2->id, cnt_it->first, allele->first, d);
- }
- }
+ if (aln_res.gap_cnt <= (max_gaps + 1)) {
+ //
+ // If the alignment doesn't span enough of the two sequences, skip it.
+ //
+ if (aln_res.pct_id >= min_match_len) {
+
+ if (d <= ctag_dist)
+ tag_1->add_match(tag_2->id, cat_hit.first, allele->first, d, invert_cigar(aln_res.cigar));
+ }
+ }
+ }
+ }
}
- // Sort the vector of distances.
- sort(tag_1->matches.begin(), tag_1->matches.end(), compare_matches);
+ // time_4 = clock();
+ // per_locus += (time_4 - time_3);
}
+
+ //
+ // Free the k-mers we generated for this query
+ //
+ for (uint j = 0; j < kmers.size(); j++)
+ delete [] kmers[j];
+ kmers.clear();
+
+ delete aln;
}
+ // cerr << "Time to kmerize catalog: " << time_2 - time_1 << "\n"
+ // << "Average time per locus: " << per_locus / (double) keys.size() << "\n";
+
free_kmer_hash(kmer_map, kmer_map_keys);
return 0;
@@ -779,7 +1134,7 @@ int merge_allele(Locus *locus, SNP *snp) {
new_allele = "";
pos = 0;
- //cerr << "Allele length: " << allele.size() << "\n";
+ // cerr << "Allele length: " << allele.size() << "\n";
for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
//
@@ -788,10 +1143,10 @@ int merge_allele(Locus *locus, SNP *snp) {
//
if ((*k).first == "merge") {
new_allele += locus->con[(*k).second->col];
- //cerr << " Adding char from consensus position " << (*k).second->col << "\n";
+ // cerr << " Adding char '" << locus->con[k->second->col] << "' from consensus position " << (*k).second->col << "\n";
} else {
new_allele += allele[pos];
- //cerr << " Adding char from allele position " << pos << "\n";
+ // cerr << " Adding char '" << allele[pos] << "' from allele position " << pos << "\n";
pos++;
}
}
@@ -878,11 +1233,42 @@ int CLocus::merge_snps(QLocus *matched_tag) {
sort(merged_snps.begin(), merged_snps.end(), compare_pair_snp);
//
- // Merge the alleles accounting for any SNPs added from either of the two samples.
+ // If the catalog tag has no defined alleles, create a matching haplotype
+ // from the consensus sequence before merging in the new alleles.
//
string allele, new_allele;
int pos;
+ if (this->alleles.size() == 0) {
+ char c;
+ new_allele = "";
+ for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
+ csnp = k->second;
+ c = this->con[k->second->col];
+
+ new_allele += (csnp->col > this->len - 1) ? 'N' : c;
+
+ if (csnp->col > this->len - 1) continue;
+
+ if (c != csnp->rank_1 &&
+ c != csnp->rank_2 &&
+ c != csnp->rank_3 &&
+ c != csnp->rank_4) {
+
+ if (csnp->rank_3 == 0)
+ csnp->rank_3 = c;
+ else
+ csnp->rank_4 = c;
+ }
+ }
+
+ if (new_allele.length() > 0)
+ merged_alleles.insert(new_allele);
+ }
+
+ //
+ // Merge the alleles accounting for any SNPs added from either of the two samples.
+ //
for (j = this->alleles.begin(); j != this->alleles.end(); j++) {
allele = j->first;
new_allele = "";
@@ -896,7 +1282,7 @@ int CLocus::merge_snps(QLocus *matched_tag) {
if (k->first == "sample") {
new_allele += k->second->col > this->len - 1 ? 'N' : this->con[k->second->col];
} else {
- new_allele += allele[pos];
+ new_allele += allele[pos];
pos++;
}
}
@@ -953,12 +1339,12 @@ int CLocus::merge_snps(QLocus *matched_tag) {
merged_alleles.insert(new_allele);
}
- //
- // If the newly merged alleles contain Ns due to different sequence lengths,
- // check if we can reduce the alleles as one of the longer allele haplotypes
- // may fully encompass a shorter allele haplotype that has been padded with Ns.
- //
- if (require_uniq_haplotypes) this->reduce_alleles(merged_alleles);
+ // //
+ // // If the newly merged alleles contain Ns due to different sequence lengths,
+ // // check if we can reduce the alleles as one of the longer allele haplotypes
+ // // may fully encompass a shorter allele haplotype that has been padded with Ns.
+ // //
+ // if (require_uniq_haplotypes) this->reduce_alleles(merged_alleles);
//
// Update the catalog entry's list of SNPs and alleles
@@ -1315,6 +1701,8 @@ initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
k++;
}
+ cerr << " " << catalog.size() << " loci were newly added to the catalog.\n";
+
return 1;
}
@@ -1374,26 +1762,29 @@ int parse_command_line(int argc, char* argv[]) {
while (1) {
static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
- {"version", no_argument, NULL, 'v'},
- {"mmatches", no_argument, NULL, 'm'},
- {"genomic_loc", no_argument, NULL, 'g'},
- {"uniq_haplotypes", no_argument, NULL, 'u'},
- {"report_mmatches", no_argument, NULL, 'R'},
- {"batch_id", required_argument, NULL, 'b'},
- {"ctag_dist", required_argument, NULL, 'n'},
- {"k_len", required_argument, NULL, 'k'},
- {"catalog", required_argument, NULL, 'c'},
- {"sample", required_argument, NULL, 's'},
- {"outpath", required_argument, NULL, 'o'},
- {"num_threads", required_argument, NULL, 'p'},
+ {"help", no_argument, NULL, 'h'},
+ {"version", no_argument, NULL, 'v'},
+ {"mmatches", no_argument, NULL, 'm'},
+ {"genomic_loc", no_argument, NULL, 'g'},
+ {"uniq_haplotypes", no_argument, NULL, 'u'},
+ {"report_mmatches", no_argument, NULL, 'R'},
+ {"gapped", no_argument, NULL, 'G'},
+ {"max_gaps", required_argument, NULL, 'X'},
+ {"min_aln_len", required_argument, NULL, 'x'},
+ {"batch_id", required_argument, NULL, 'b'},
+ {"ctag_dist", required_argument, NULL, 'n'},
+ {"k_len", required_argument, NULL, 'k'},
+ {"catalog", required_argument, NULL, 'c'},
+ {"sample", required_argument, NULL, 's'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"num_threads", required_argument, NULL, 'p'},
{0, 0, 0, 0}
};
// getopt_long stores the option index here.
int option_index = 0;
- c = getopt_long(argc, argv, "hgvuRmo:s:c:b:p:n:k:", long_options, &option_index);
+ c = getopt_long(argc, argv, "hgvuRmGX:x:o:s:c:b:p:n:k:", long_options, &option_index);
// Detect the end of the options.
if (c == -1)
@@ -1439,6 +1830,15 @@ int parse_command_line(int argc, char* argv[]) {
case 'u':
require_uniq_haplotypes = true;
break;
+ case 'G':
+ gapped_alignments = true;
+ break;
+ case 'X':
+ max_gaps = is_double(optarg);
+ break;
+ case 'x':
+ min_match_len = is_double(optarg);
+ break;
case 'v':
version();
break;
@@ -1493,6 +1893,10 @@ void help() {
<< " h: display this help messsage." << "\n\n"
<< " Catalog editing:\n"
<< " --catalog <path>: provide the path to an existing catalog. cstacks will add data to this existing catalog.\n\n"
+ << " Gapped assembly options:\n"
+ << " --gapped: preform gapped alignments between stacks.\n"
+ << " --max_gaps: number of gaps allowed between stacks before merging (default: 2).\n"
+ << " --min_aln_len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n\n"
<< " Advanced options:\n"
<< " --k_len <len>: specify k-mer size for matching between between catalog loci (automatically calculated by default).\n"
<< " --report_mmatches: report query loci that match more than one catalog locus.\n";
diff --git a/src/cstacks.h b/src/cstacks.h
index 8ced945..13f0c0e 100644
--- a/src/cstacks.h
+++ b/src/cstacks.h
@@ -63,7 +63,9 @@ using std::queue;
#include "stacks.h"
#include "kmers.h"
#include "locus.h"
+#include "GappedAln.h"
#include "sql_utilities.h"
+#include "aln_utils.h"
#include "utils.h"
void help( void );
@@ -73,11 +75,12 @@ int initialize_new_catalog(pair<int, string> &, map<int, CLocus *> &);
int initialize_existing_catalog(string, map<int, CLocus *> &);
int update_catalog_index(map<int, CLocus *> &, map<string, int> &);
int find_kmer_matches_by_sequence(map<int, CLocus *> &, map<int, QLocus *> &, int);
+int search_for_gaps(map<int, CLocus *> &, map<int, QLocus *> &, double, double);
int find_matches_by_sequence(map<int, CLocus *> &, map<int, QLocus *> &);
int find_matches_by_genomic_loc(map<string, int> &, map<int, QLocus *> &);
int characterize_mismatch_snps(CLocus *, QLocus *);
-int merge_allele(Locus *locus, SNP *snp);
-int merge_matches(map<int, CLocus *> &, map<int, QLocus *> &, pair<int, string> &, int, uint &);
+int merge_allele(Locus *, SNP *);
+int merge_matches(map<int, CLocus *> &, map<int, QLocus *> &, pair<int, string> &, int, uint &, uint &, uint &, uint &);
int add_unique_tag(pair<int, string> &, map<int, CLocus *> &, QLocus *);
bool compare_dist(pair<int, int>, pair<int, int>);
diff --git a/src/kmers.cc b/src/kmers.cc
index fc68e54..60bf0c5 100644
--- a/src/kmers.cc
+++ b/src/kmers.cc
@@ -71,14 +71,52 @@ int calc_min_kmer_matches(int kmer_len, int dist, int read_len, bool exit_err) {
if (min_matches <= 0 && exit_err)
exit(1);
else if (min_matches <= 0)
- min_matches = 1;
-
- cerr << " Minimum number of k-mers to define a match: " << min_matches << "\n";
+ min_matches = 1;
return min_matches;
}
-int generate_kmers(const char *seq, int kmer_len, int num_kmers, vector<char *> &kmers) {
+int
+initialize_kmers(int kmer_len, int num_kmers, vector<char *> &kmers)
+{
+ char *kmer;
+
+ for (int i = 0; i < num_kmers; i++) {
+ kmer = new char[kmer_len + 1];
+ kmers.push_back(kmer);
+ }
+
+ return 0;
+}
+
+int
+generate_kmers_lazily(const char *seq, uint kmer_len, uint num_kmers, vector<char *> &kmers)
+{
+ char *kmer;
+ const char *k = seq;
+
+ if (num_kmers > kmers.size()) {
+ int new_kmers = num_kmers - kmers.size();
+
+ for (uint i = 0; i < new_kmers; i++) {
+ kmer = new char[kmer_len + 1];
+ kmers.push_back(kmer);
+ }
+ }
+
+ for (uint i = 0; i < num_kmers; i++) {
+ kmer = kmers.at(i);
+ strncpy(kmer, k, kmer_len);
+ kmer[kmer_len] = '\0';
+ k++;
+ }
+
+ return 0;
+}
+
+int
+generate_kmers(const char *seq, int kmer_len, int num_kmers, vector<char *> &kmers)
+{
char *kmer;
const char *k = seq;
@@ -151,7 +189,9 @@ int generate_permutations(map<int, char **> &pstrings, int width) {
return 0;
}
-int populate_kmer_hash(map<int, MergedStack *> &merged, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len) {
+int
+populate_kmer_hash(map<int, MergedStack *> &merged, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
+{
map<int, MergedStack *>::iterator it;
MergedStack *tag;
vector<char *> kmers;
@@ -173,16 +213,16 @@ int populate_kmer_hash(map<int, MergedStack *> &merged, KmerHashMap &kmer_map, v
// Hash the kmers
for (int j = 0; j < num_kmers; j++) {
- exists = kmer_map.count(kmers[j]) == 0 ? false : true;
+ exists = kmer_map.count(kmers[j]) == 0 ? false : true;
kmer_map[kmers[j]].push_back(tag->id);
- if (exists)
- delete [] kmers[j];
- else
- kmer_map_keys.push_back(kmers[j]);
- }
- kmers.clear();
+ if (exists)
+ delete [] kmers[j];
+ else
+ kmer_map_keys.push_back(kmers[j]);
+ }
+ kmers.clear();
}
//dump_kmer_map(kmer_map);
@@ -190,23 +230,26 @@ int populate_kmer_hash(map<int, MergedStack *> &merged, KmerHashMap &kmer_map, v
return 0;
}
-int populate_kmer_hash(map<int, Locus *> &catalog, CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len) {
+int
+populate_kmer_hash(map<int, Locus *> &catalog, CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
+{
map<int, Locus *>::iterator it;
vector<pair<allele_type, string> >::iterator allele;
vector<char *> kmers;
Locus *tag;
char *hash_key;
bool exists;
+ int num_kmers;
//
// Break each stack down into k-mers and create a hash map of those k-mers
// recording in which sequences they occur.
//
- int num_kmers = strlen(catalog.begin()->second->con) - kmer_len + 1;
-
for (it = catalog.begin(); it != catalog.end(); it++) {
tag = it->second;
+ num_kmers = strlen(tag->con) - kmer_len + 1;
+
//
// Iterate through the possible Catalog alleles
//
@@ -217,17 +260,137 @@ int populate_kmer_hash(map<int, Locus *> &catalog, CatKmerHashMap &kmer_map, vec
generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
for (int j = 0; j < num_kmers; j++) {
- hash_key = kmers[j];
+ hash_key = kmers[j];
exists = kmer_map.count(hash_key) == 0 ? false : true;
kmer_map[hash_key].push_back(make_pair(allele->first, tag->id));
if (exists)
- delete [] kmers[j];
+ delete [] kmers[j];
else
- kmer_map_keys.push_back(hash_key);
+ kmer_map_keys.push_back(hash_key);
+ }
+ kmers.clear();
+ }
+ }
+
+ //dump_kmer_map(kmer_map);
+
+ return 0;
+}
+
+int
+populate_kmer_hash(map<int, Locus *> &catalog, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, map<int, pair<allele_type, int> > &allele_map, int kmer_len)
+{
+ map<int, Locus *>::iterator it;
+ KmerHashMap::iterator map_it;
+ vector<pair<allele_type, string> >::iterator allele;
+ map<int, pair<allele_type, int> >::iterator allele_it;
+ vector<char *> kmers;
+ Locus *tag;
+ char *hash_key;
+
+ //
+ // Break each stack down into k-mers and create a hash map of those k-mers
+ // recording in which sequences they occur.
+ //
+ int num_kmers;
+ int allele_index = 0;
+
+ allele_it = allele_map.begin();
+
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ tag = it->second;
+
+ num_kmers = strlen(tag->con) - kmer_len + 1;
+
+ //
+ // Iterate through the possible Catalog alleles
+ //
+ for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
+ //
+ // Generate and hash the kmers for this allele string
+ //
+ generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
+
+ allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
+
+ for (int j = 0; j < num_kmers; j++) {
+ hash_key = kmers[j];
+
+ map_it = kmer_map.find(hash_key);
+
+ if (map_it != kmer_map.end()) {
+ map_it->second.push_back(allele_index);
+ delete [] kmers[j];
+ } else {
+ kmer_map[hash_key].push_back(allele_index);
+ kmer_map_keys.push_back(hash_key);
+ }
}
kmers.clear();
+
+ allele_index++;
+ }
+ }
+
+ //dump_kmer_map(kmer_map);
+
+ return 0;
+}
+
+int
+populate_kmer_hash(map<int, CLocus *> &catalog, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, map<int, pair<allele_type, int> > &allele_map, int kmer_len)
+{
+ map<int, CLocus *>::iterator it;
+ KmerHashMap::iterator map_it;
+ vector<pair<allele_type, string> >::iterator allele;
+ map<int, pair<allele_type, int> >::iterator allele_it;
+ vector<char *> kmers;
+ Locus *tag;
+ char *hash_key;
+
+ //
+ // Break each stack down into k-mers and create a hash map of those k-mers
+ // recording in which sequences they occur.
+ //
+ int num_kmers;
+ int allele_index = 0;
+
+ allele_it = allele_map.begin();
+
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ tag = it->second;
+
+ num_kmers = strlen(tag->con) - kmer_len + 1;
+
+ //
+ // Iterate through the possible Catalog alleles
+ //
+ for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
+ //
+ // Generate and hash the kmers for this allele string
+ //
+ generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
+
+ allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
+
+ for (int j = 0; j < num_kmers; j++) {
+ hash_key = kmers[j];
+
+ map_it = kmer_map.find(hash_key);
+
+ if (map_it != kmer_map.end()) {
+ map_it->second.push_back(allele_index);
+ delete [] kmers[j];
+ } else {
+ kmer_map[hash_key].push_back(allele_index);
+ kmer_map_keys.push_back(hash_key);
+ }
+ }
+ kmers.clear();
+
+ allele_index++;
}
}
@@ -245,7 +408,7 @@ free_kmer_hash(CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
kmer_map.clear();
for (uint i = 0; i < kmer_map_keys.size(); i++) {
- delete [] kmer_map_keys[i];
+ delete [] kmer_map_keys[i];
}
kmer_map_keys.clear();
@@ -261,7 +424,7 @@ free_kmer_hash(KmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
kmer_map.clear();
for (uint i = 0; i < kmer_map_keys.size(); i++) {
- delete [] kmer_map_keys[i];
+ delete [] kmer_map_keys[i];
}
kmer_map_keys.clear();
@@ -288,14 +451,60 @@ int dist(const char *tag_1, Locus *tag_2, allele_type allele) {
// Count the number of characters that are different
// between the two sequences.
while (p < p_end && q < q_end) {
- dist += (*p == *q) ? 0 : 1;
- p++;
- q++;
+ dist += (*p == *q) ? 0 : 1;
+ p++;
+ q++;
}
return dist;
}
+int
+dist(const char *tag_1, const char *tag_2, vector<pair<char, uint> > &cigar)
+{
+ uint size = cigar.size();
+ char op;
+ uint dist, len, pos_1, pos_2, stop;
+ int mismatches = 0;
+
+ len = strlen(tag_1);
+ pos_1 = 0;
+ pos_2 = 0;
+
+ for (uint i = 0; i < size; i++) {
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'D':
+ //
+ // A deletion has occured in tag_1 relative to tag_2.
+ //
+ pos_2 += dist;
+ break;
+ case 'I':
+ //
+ // An insertion has occured in tag_1 relative to tag_2.
+ //
+ pos_1 += dist;
+ break;
+ case 'M':
+ stop = pos_1 + dist;
+ while (pos_1 < stop && pos_1 < len && pos_2 < len) {
+ if (tag_1[pos_1] != 'N' && tag_2[pos_2] != 'N' && tag_1[pos_1] != tag_2[pos_2])
+ mismatches++;
+ pos_1++;
+ pos_2++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return mismatches;
+}
+
int dist(Locus *tag_1, Locus *tag_2) {
int dist = 0;
char *p = tag_1->con;
@@ -315,9 +524,9 @@ int dist(Locus *tag_1, Locus *tag_2) {
// between the two sequences.
//
while (p < p_end && q < q_end) {
- dist += (*p == *q) ? 0 : 1;
- p++;
- q++;
+ dist += (*p == *q) ? 0 : 1;
+ p++;
+ q++;
}
return dist;
@@ -346,9 +555,9 @@ int dist(MergedStack *tag_1, MergedStack *tag_2) {
// between the two sequences.
//
while (p < p_end && q < q_end) {
- dist += (*p == *q) ? 0 : 1;
- p++;
- q++;
+ dist += (*p == *q) ? 0 : 1;
+ p++;
+ q++;
}
return dist;
@@ -378,9 +587,9 @@ int dist(MergedStack *tag_1, char *seq) {
// between the two sequences.
//
while (p < p_end && q < q_end) {
- dist += (*p == *q) ? 0 : 1;
- p++;
- q++;
+ dist += (*p == *q) ? 0 : 1;
+ p++;
+ q++;
}
return dist;
diff --git a/src/kmers.h b/src/kmers.h
index 653e79c..565f982 100644
--- a/src/kmers.h
+++ b/src/kmers.h
@@ -56,19 +56,19 @@ using google::sparse_hash_map;
struct hash_charptr {
size_t operator()(const char *__s) const
{
- size_t __result = static_cast<size_t>(14695981039346656037ULL);
- unsigned int __len = strlen(__s);
- for (unsigned int i = 0; i < __len; i++) {
- __result ^= static_cast<size_t>(__s[i]);
- __result *= static_cast<size_t>(1099511628211ULL);
- }
-
- return __result;
+ size_t __result = static_cast<size_t>(14695981039346656037ULL);
+ unsigned int __len = strlen(__s);
+ for (unsigned int i = 0; i < __len; i++) {
+ __result ^= static_cast<size_t>(__s[i]);
+ __result *= static_cast<size_t>(1099511628211ULL);
+ }
+
+ return __result;
}
};
struct eqstr {
bool operator()(const char* s1, const char* s2) const {
- return strcmp(s1, s2) == 0;
+ return strcmp(s1, s2) == 0;
}
};
@@ -82,9 +82,15 @@ typedef unordered_map<const char *, vector<pair<string, int> >, hash_charptr, eq
int determine_kmer_length(int, int);
int calc_min_kmer_matches(int, int, int, bool);
+int initialize_kmers(int, int, vector<char *> &);
int generate_kmers(const char *, int, int, vector<char *> &);
+int generate_kmers_lazily(const char *, uint, uint, vector<char *> &);
+
int populate_kmer_hash(map<int, MergedStack *> &, KmerHashMap &, vector<char *> &, int);
int populate_kmer_hash(map<int, Locus *> &, CatKmerHashMap &, vector<char *> &, int);
+int populate_kmer_hash(map<int, Locus *> &, KmerHashMap &, vector<char *> &, map<int, pair<allele_type, int> > &, int);
+int populate_kmer_hash(map<int, CLocus *> &, KmerHashMap &, vector<char *> &, map<int, pair<allele_type, int> > &, int);
+
int free_kmer_hash(KmerHashMap &, vector<char *> &);
int free_kmer_hash(CatKmerHashMap &, vector<char *> &);
@@ -93,6 +99,7 @@ int generate_permutations(map<int, char **> &, int);
//
// Utilities
//
+int dist(const char *, const char *, vector<pair<char, uint> > &);
int dist(const char *, Locus *, allele_type);
int dist(Locus *, Locus *);
int dist(MergedStack *, MergedStack *);
diff --git a/src/locus.cc b/src/locus.cc
index b726665..be6e4a4 100644
--- a/src/locus.cc
+++ b/src/locus.cc
@@ -27,17 +27,17 @@ uint
Locus::sort_bp(uint k)
{
if (this->loc.strand == plus)
- return this->loc.bp + k;
+ return this->loc.bp + k;
else
- return (k == 0 ? this->loc.bp - this->len + 1 : this->loc.bp - k);
+ return (k == 0 ? this->loc.bp - this->len + 1 : this->loc.bp - k);
}
int
Locus::snp_index(uint col)
{
for (uint i = 0; i < this->snps.size(); i++)
- if (this->snps[i]->col == col)
- return i;
+ if (this->snps[i]->col == col)
+ return i;
return -1;
}
@@ -45,7 +45,7 @@ int
Locus::add_consensus(const char *seq)
{
if (this->con != NULL)
- delete [] this->con;
+ delete [] this->con;
this->len = strlen(seq);
this->con = new char[this->len + 1];
@@ -66,27 +66,27 @@ Locus::populate_alleles()
// Is this effective?
//
for (uint n = 0; n < this->strings.size(); n++) {
- this->strings[n].first.clear();
- this->strings[n].second.clear();
+ this->strings[n].first.clear();
+ this->strings[n].second.clear();
}
this->strings.clear();
if (this->snps.size() == 0) {
- this->strings.push_back(make_pair("consensus", this->con));
- return 0;
+ this->strings.push_back(make_pair("consensus", this->con));
+ return 0;
}
for (j = this->alleles.begin(); j != this->alleles.end(); j++) {
- s = this->con;
- k = 0;
+ s = this->con;
+ k = 0;
- for (i = this->snps.begin(); i != this->snps.end(); i++) {
- if ((*i)->col < this->len)
- s.replace((*i)->col, 1, 1, j->first[k]);
- k++;
- }
+ for (i = this->snps.begin(); i != this->snps.end(); i++) {
+ if ((*i)->col < this->len)
+ s.replace((*i)->col, 1, 1, j->first[k]);
+ k++;
+ }
- this->strings.push_back(make_pair(j->first, s));
+ this->strings.push_back(make_pair(j->first, s));
}
return 0;
@@ -122,6 +122,22 @@ QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type,
}
int
+QLocus::add_match(int catalog_id, allele_type cat_type, allele_type query_type, int distance, string cigar)
+{
+ Match *m = new Match;
+
+ m->cat_id = catalog_id;
+ m->cat_type = cat_type;
+ m->query_type = query_type;
+ m->dist = distance;
+ m->cigar = cigar;
+
+ this->matches.push_back(m);
+
+ return 0;
+}
+
+int
QLocus::add_match(int catalog_id, allele_type cat_type)
{
Match *m = new Match;
@@ -135,3 +151,15 @@ QLocus::add_match(int catalog_id, allele_type cat_type)
return 0;
}
+
+int
+QLocus::clear_matches()
+{
+ vector<Match *>::iterator it;
+
+ for (it = this->matches.begin(); it != this->matches.end(); it++)
+ delete *it;
+ this->matches.clear();
+
+ return 0;
+}
diff --git a/src/locus.h b/src/locus.h
index 495415d..2d2e78f 100644
--- a/src/locus.h
+++ b/src/locus.h
@@ -40,6 +40,7 @@ typedef struct match {
uint cat_id;
allele_type cat_type;
allele_type query_type;
+ string cigar;
uint dist;
} Match;
@@ -70,20 +71,20 @@ class Locus {
vector<pair<allele_type, string> > strings; // Strings for matching (representing the various allele combinations)
Locus() {
- id = 0;
- sample_id = 0;
- depth = 0;
- model = NULL;
- con = NULL;
- len = 0;
- lnl = 0.0;
- blacklisted = false;
+ id = 0;
+ sample_id = 0;
+ depth = 0;
+ model = NULL;
+ con = NULL;
+ len = 0;
+ lnl = 0.0;
+ blacklisted = false;
deleveraged = false;
- lumberjackstack = false;
+ lumberjackstack = false;
}
virtual ~Locus() {
delete [] con;
- delete [] model;
+ delete [] model;
for (uint i = 0; i < snps.size(); i++)
delete snps[i];
for (uint i = 0; i < comp.size(); i++)
@@ -107,8 +108,10 @@ class QLocus : public Locus {
QLocus(): Locus() {}
~QLocus();
+ int add_match(int, allele_type, allele_type, int, string);
int add_match(int, allele_type, allele_type, int);
int add_match(int, allele_type);
+ int clear_matches();
};
//
@@ -118,6 +121,11 @@ class QLocus : public Locus {
class CLocus : public Locus {
public:
vector<pair<int, int> > sources; // Sample/ID pairs for the sources contributing to this catalog entry
+ uint match_cnt;
+
+ CLocus() : Locus() {
+ this->match_cnt = 0;
+ };
int merge_snps(QLocus *);
int reduce_alleles(set<string> &);
@@ -130,13 +138,13 @@ class CLocus : public Locus {
class CSLocus : public Locus {
public:
CSLocus() : Locus() {
- this->f = 0.0;
- this->cnt = 0;
- this->hcnt = 0;
- this->gcnt = 0;
- this->trans_gcnt = 0;
- this->chisq = 1.0;
- this->confounded_cnt = 0;
+ this->f = 0.0;
+ this->cnt = 0;
+ this->hcnt = 0;
+ this->gcnt = 0;
+ this->trans_gcnt = 0;
+ this->chisq = 1.0;
+ this->confounded_cnt = 0;
};
string annotation;
string marker;
diff --git a/src/mstack.cc b/src/mstack.cc
index d2a1c60..32127bc 100644
--- a/src/mstack.cc
+++ b/src/mstack.cc
@@ -100,7 +100,9 @@ int MergedStack::add_dist(const int id, const int dist) {
return 0;
}
-DNASeq **MergedStack::gen_matrix(map<int, Stack *> &unique, map<int, Rem *> &rem) {
+DNANSeq **
+MergedStack::gen_matrix(map<int, Stack *> &unique, map<int, Rem *> &rem)
+{
Stack *tag;
//
@@ -114,7 +116,7 @@ DNASeq **MergedStack::gen_matrix(map<int, Stack *> &unique, map<int, Rem *> &rem
uint cnt = this->count + this->remtags.size();
if (this->matrix != NULL)
delete [] this->matrix;
- this->matrix = new DNASeq * [cnt];
+ this->matrix = new DNANSeq * [cnt];
vector<int>::iterator j;
int i = 0;
@@ -149,9 +151,9 @@ MergedStack::gen_matrix(map<int, PStack *> &unique)
// reuse the existing char arrays in the unique and rem maps
//
uint cnt = this->count;
- if (this->pmatrix != NULL)
+ if (this->matrix != NULL)
delete [] this->matrix;
- this->pmatrix = new DNANSeq * [cnt];
+ this->matrix = new DNANSeq * [cnt];
vector<int>::iterator j;
int i = 0;
@@ -159,12 +161,12 @@ MergedStack::gen_matrix(map<int, PStack *> &unique)
tag = unique[*j];
for (uint k = 0; k < tag->count; k++) {
- this->pmatrix[i] = tag->seq;
+ this->matrix[i] = tag->seq;
i++;
}
}
- return this->pmatrix;
+ return this->matrix;
}
double
@@ -177,79 +179,29 @@ MergedStack::calc_likelihood()
// Iterate over each column of the array and call the consensus base.
//
int row, col, tot;
- int length = this->matrix[0]->size;
+ int length = this->matrix[0]->size();
int height = this->count + this->remtags.size();
map<char, int> nuc;
map<char, int>::iterator max, n;
- DNASeq *d;
+ DNANSeq *d;
+
+ uint cur_gap = this->gaps.size() > 0 ? 0 : 1;
this->lnl = 0;
for (col = 0; col < length; col++) {
- nuc['A'] = 0;
- nuc['G'] = 0;
- nuc['C'] = 0;
- nuc['T'] = 0;
//
- // Count the nucleotide type at each position in the column.
- //
- for (row = 0; row < height; row++) {
- d = this->matrix[row];
- nuc[(*d)[col]]++;
- }
- //
- // Find the base with a plurality of occurances and call it.
+ // Don't invoke the model within gaps.
//
- max = nuc.end();
- tot = 0;
- for (n = nuc.begin(); n != nuc.end(); n++) {
- tot += n->second;
- if (max == nuc.end() || n->second > max->second)
- max = n;
+ if (cur_gap < this->gaps.size() && col == this->gaps[cur_gap].start) {
+ do {
+ col++;
+ } while (col < this->gaps[cur_gap].end && col < length);
+ col--;
+ continue;
}
- //
- // For nucleotide positions with potential polymorphism (i.e. two or more alleles at
- // the locus that differ at that position), first find the ML genotype (call_multinomial_snp).
- // If it returns 'het' calculate the heterozygous_likelihood(), otherwise calculate homozygous
- // likelihood.
- //
- snp_type res = this->snps[col]->type;
-
- if (res == snp_type_het)
- this->lnl += heterozygous_likelihood(col, nuc);
- else if (res == snp_type_hom)
- this->lnl += homozygous_likelihood(col, nuc);
- else {
- double homlnl = homozygous_likelihood(col, nuc);
- double hetlnl = heterozygous_likelihood(col, nuc);
- this->lnl += hetlnl > homlnl ? hetlnl : homlnl;
- }
- }
-
- return this->lnl;
-}
-
-double
-MergedStack::calc_likelihood_pstacks()
-{
- if (this->pmatrix == NULL || this->snps.size() == 0)
- return 0;
-
- //
- // Iterate over each column of the array and call the consensus base.
- //
- int row, col, tot;
- int length = this->pmatrix[0]->size();
- int height = this->count;
- map<char, int> nuc;
- map<char, int>::iterator max, n;
- DNANSeq *d;
-
- this->lnl = 0;
-
- for (col = 0; col < length; col++) {
nuc['A'] = 0;
nuc['G'] = 0;
nuc['C'] = 0;
@@ -259,7 +211,7 @@ MergedStack::calc_likelihood_pstacks()
// Count the nucleotide type at each position in the column.
//
for (row = 0; row < height; row++) {
- d = this->pmatrix[row];
+ d = this->matrix[row];
nuc[(*d)[col]]++;
}
//
diff --git a/src/mstack.h b/src/mstack.h
index a14ee3e..d899d59 100644
--- a/src/mstack.h
+++ b/src/mstack.h
@@ -44,12 +44,12 @@ class MergedStack {
//
// Stack component parts
//
- int count; // Number of merged stacks
- vector<int> utags; // Stack IDs that have been merged into this MergedStack
- vector<pair<int, int> > dist; // Vector describing the distance between this stack and other stacks.
- vector<int> remtags; // Remainder tag IDs that have been merged into this Stack
- DNASeq **matrix; // Two-dimensional array for iterating over the combined stack (stacks and remainders).
- DNANSeq **pmatrix; // Two-dimensional array for iterating over the combined stack aligned to a reference..
+ int count; // Number of merged stacks
+ vector<int> utags; // Stack IDs that have been merged into this MergedStack
+ vector<int> remtags; // Remainder tag IDs that have been merged into this Stack
+ DNANSeq **matrix; // Two-dimensional array for iterating over the combined stack (stacks and remainders).
+ vector<pair<int, int> > dist; // Vector describing the distance between this stack and other stacks.
+ vector<Aln> alns; // Vector describing gapped alignments between this stack and other stacks.
int cohort_id; // Group ID of all stacks that were originally part of the same subgraph
double lnl; // Log likelihood of this stack
@@ -60,12 +60,15 @@ class MergedStack {
PhyLoc loc; // Physical genome location of this Stack.
vector<SNP *> snps; // Single Nucleotide Polymorphisms found in this Stack
map<string, int> alleles; // Set of alleles defined by the SNPs found in this Stack
+ vector<Gap> gaps;
+
//
// Flags
//
bool deleveraged;
bool masked;
bool blacklisted;
+ bool gappedlumberjack;
bool lumberjackstack;
MergedStack();
@@ -74,10 +77,9 @@ class MergedStack {
int add_consensus(DNASeq *);
int add_consensus(DNANSeq *);
int add_dist(const int id, const int dist);
- DNASeq **gen_matrix(map<int, Stack *> &, map<int, Rem *> &);
+ DNANSeq **gen_matrix(map<int, Stack *> &, map<int, Rem *> &);
DNANSeq **gen_matrix(map<int, PStack *> &);
double calc_likelihood();
- double calc_likelihood_pstacks();
string write_cmb();
};
diff --git a/src/populations.cc b/src/populations.cc
index cf17cf2..97bbfa5 100644
--- a/src/populations.cc
+++ b/src/populations.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2012-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2012-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -107,33 +107,33 @@ int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
cerr
- << "Fst kernel smoothing: " << (kernel_smoothed == true ? "on" : "off") << "\n"
- << "Bootstrap resampling: ";
+ << "Fst kernel smoothing: " << (kernel_smoothed == true ? "on" : "off") << "\n"
+ << "Bootstrap resampling: ";
if (bootstrap)
- cerr << "on, " << (bootstrap_type == bs_exact ? "exact; " : "approximate; ") << bootstrap_reps << " reptitions\n";
+ cerr << "on, " << (bootstrap_type == bs_exact ? "exact; " : "approximate; ") << bootstrap_reps << " reptitions\n";
else
- cerr << "off\n";
+ cerr << "off\n";
cerr
- << "Percent samples limit per population: " << sample_limit << "\n"
- << "Locus Population limit: " << population_limit << "\n"
- << "Minimum stack depth: " << min_stack_depth << "\n"
- << "Log liklihood filtering: " << (filter_lnl == true ? "on" : "off") << "; threshold: " << lnl_limit << "\n"
- << "Minor allele frequency cutoff: " << minor_allele_freq << "\n"
- << "Maximum observed heterozygosity cutoff: " << max_obs_het << "\n"
- << "Applying Fst correction: ";
+ << "Percent samples limit per population: " << sample_limit << "\n"
+ << "Locus Population limit: " << population_limit << "\n"
+ << "Minimum stack depth: " << min_stack_depth << "\n"
+ << "Log liklihood filtering: " << (filter_lnl == true ? "on" : "off") << "; threshold: " << lnl_limit << "\n"
+ << "Minor allele frequency cutoff: " << minor_allele_freq << "\n"
+ << "Maximum observed heterozygosity cutoff: " << max_obs_het << "\n"
+ << "Applying Fst correction: ";
switch(fst_correction) {
case p_value:
- cerr << "P-value correction.\n";
- break;
+ cerr << "P-value correction.\n";
+ break;
case bonferroni_win:
- cerr << "Bonferroni correction within sliding window.\n";
- break;
+ cerr << "Bonferroni correction within sliding window.\n";
+ break;
case bonferroni_gen:
- cerr << "Bonferroni correction across genome wide sites.\n";
- break;
+ cerr << "Bonferroni correction across genome wide sites.\n";
+ break;
case no_correction:
- cerr << "none.\n";
- break;
+ cerr << "none.\n";
+ break;
}
//
@@ -150,19 +150,19 @@ int main (int argc, char* argv[]) {
vector<pair<int, string> > files;
if (!build_file_list(files, pop_indexes, grp_members))
- exit(1);
+ exit(1);
if (wl_file.length() > 0) {
- load_marker_column_list(wl_file, whitelist);
- cerr << "Loaded " << whitelist.size() << " whitelisted markers.\n";
+ load_marker_column_list(wl_file, whitelist);
+ cerr << "Loaded " << whitelist.size() << " whitelisted markers.\n";
}
if (bl_file.length() > 0) {
- load_marker_list(bl_file, blacklist);
- cerr << "Loaded " << blacklist.size() << " blacklisted markers.\n";
+ load_marker_list(bl_file, blacklist);
+ cerr << "Loaded " << blacklist.size() << " blacklisted markers.\n";
}
if (bs_wl_file.length() > 0) {
- load_marker_list(bs_wl_file, bootstraplist);
- cerr << "Loaded " << bootstraplist.size() << " markers to include when bootstrapping.\n";
+ load_marker_list(bs_wl_file, bootstraplist);
+ cerr << "Loaded " << bootstraplist.size() << " markers to include when bootstrapping.\n";
}
//
@@ -174,7 +174,7 @@ int main (int argc, char* argv[]) {
ofstream log_fh(log_path.c_str(), ofstream::out);
if (log_fh.fail()) {
cerr << "Error opening log file '" << log_path << "'\n";
- exit(1);
+ exit(1);
}
init_log(log_fh, argc, argv);
@@ -187,8 +187,8 @@ int main (int argc, char* argv[]) {
int res;
catalog_file << in_path << "batch_" << batch_id << ".catalog";
if ((res = load_loci(catalog_file.str(), catalog, false, false, compressed)) == 0) {
- cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
- return 0;
+ cerr << "Unable to load the catalog '" << catalog_file.str() << "'\n";
+ return 0;
}
//
@@ -212,41 +212,43 @@ int main (int argc, char* argv[]) {
vector<vector<CatMatch *> > catalog_matches;
map<int, string> samples;
vector<int> sample_ids;
+ uint removed_cnt = 0;
+
for (int i = 0; i < (int) files.size(); i++) {
- vector<CatMatch *> m;
- load_catalog_matches(in_path + files[i].second, m);
-
- if (m.size() == 0) {
- cerr << "Warning: unable to find any matches in file '" << files[i].second << "', excluding this sample from population analysis.\n";
- //
- // This case is generated by an existing, but empty file.
- // Remove this sample from the population index which was built from
- // existing files, but we couldn't yet check for empty files.
- //
- map<int, pair<int, int> >::iterator pit;
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- if (i >= pit->second.first && i <= pit->second.second) {
- pit->second.second--;
- pit++;
- while (pit != pop_indexes.end()) {
- pit->second.first--;
- pit->second.second--;
- pit++;
- }
- break;
- }
-
- continue;
- }
-
- catalog_matches.push_back(m);
- if (samples.count(m[0]->sample_id) == 0) {
- samples[m[0]->sample_id] = files[i].second;
- sample_ids.push_back(m[0]->sample_id);
- } else {
- cerr << "Fatal error: sample ID " << m[0]->sample_id << " occurs twice in this data set, likely the pipeline was run incorrectly.\n";
- exit(0);
- }
+ vector<CatMatch *> m;
+ load_catalog_matches(in_path + files[i].second, m);
+
+ if (m.size() == 0) {
+ cerr << " Warning: unable to find any matches in file '" << files[i].second << "', excluding this sample from population analysis.\n";
+ //
+ // This case is generated by an existing, but empty file.
+ // Remove this sample from the population index which was built from
+ // existing files, but we couldn't yet check for empty files.
+ //
+ uint index = i - removed_cnt;
+ for (map<int, pair<int, int> >::iterator pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
+ if ((index >= pit->second.first && index <= pit->second.second)) {
+ pit->second.second--;
+ removed_cnt++;
+ pit++;
+ while (pit != pop_indexes.end()) {
+ pit->second.first--;
+ pit->second.second--;
+ pit++;
+ }
+ break;
+ }
+ continue;
+ }
+
+ catalog_matches.push_back(m);
+ if (samples.count(m[0]->sample_id) == 0) {
+ samples[m[0]->sample_id] = files[i].second;
+ sample_ids.push_back(m[0]->sample_id);
+ } else {
+ cerr << "Fatal error: sample ID " << m[0]->sample_id << " occurs twice in this data set, likely the pipeline was run incorrectly.\n";
+ exit(0);
+ }
}
//
@@ -265,7 +267,7 @@ int main (int argc, char* argv[]) {
// Output a list of heterozygous loci and the associate haplotype frequencies.
//
if (sql_out)
- write_sql(catalog, pmap);
+ write_sql(catalog, pmap);
log_fh << "# Distribution of population loci.\n";
log_haplotype_cnts(catalog, log_fh);
@@ -288,34 +290,32 @@ int main (int argc, char* argv[]) {
// and records model calls for each nucleotide: O (hOmozygous), E (hEterozygous), U (Unknown)
//
for (uint i = 0; i < sample_ids.size(); i++) {
- map<int, ModRes *> modres;
- load_model_results(in_path + samples[sample_ids[i]], modres);
-
- if (modres.size() == 0) {
- cerr << "Warning: unable to find any model results in file '" << samples[sample_ids[i]] << "', excluding this sample from population analysis.\n";
- continue;
- }
-
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->datum(loc->id, sample_ids[i]);
-
- if (d != NULL) {
- if (modres.count(d->id) == 0) {
- cerr << "Fatal error: Unable to find model data for catalog locus " << loc->id
- << ", sample ID " << sample_ids[i] << ", sample locus " << d->id
- << "; likely IDs were mismatched when running pipeline.\n";
- exit(0);
- }
- d->len = strlen(modres[d->id]->model);
- d->model = new char[d->len + 1];
- strcpy(d->model, modres[d->id]->model);
- }
- }
-
- for (mit = modres.begin(); mit != modres.end(); mit++)
- delete mit->second;
- modres.clear();
+ map<int, ModRes *> modres;
+ load_model_results(in_path + samples[sample_ids[i]], modres);
+
+ if (modres.size() == 0) {
+ cerr << " Warning: unable to find any model results in file '" << samples[sample_ids[i]] << "', excluding this sample from population analysis.\n";
+ continue;
+ }
+
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ loc = it->second;
+ d = pmap->datum(loc->id, sample_ids[i]);
+
+ if (d != NULL) {
+ if (modres.count(d->id) == 0) {
+ cerr << "Fatal error: Unable to find model data for catalog locus " << loc->id
+ << ", sample ID " << sample_ids[i] << ", sample locus " << d->id
+ << "; likely IDs were mismatched when running pipeline.\n";
+ exit(0);
+ }
+ d->add_model(modres[d->id]->model);
+ }
+ }
+
+ for (mit = modres.begin(); mit != modres.end(); mit++)
+ delete mit->second;
+ modres.clear();
}
uint pop_id, start_index, end_index;
@@ -325,11 +325,11 @@ int main (int argc, char* argv[]) {
psum->initialize(pmap);
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start_index = pit->second.first;
- end_index = pit->second.second;
- pop_id = pit->first;
- cerr << "Generating nucleotide-level summary statistics for population '" << pop_key[pop_id] << "'\n";
- psum->add_population(catalog, pmap, pop_id, start_index, end_index, verbose, log_fh);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+ pop_id = pit->first;
+ cerr << "Generating nucleotide-level summary statistics for population '" << pop_key[pop_id] << "'\n";
+ psum->add_population(catalog, pmap, pop_id, start_index, end_index, verbose, log_fh);
}
cerr << "Tallying loci across populations...";
@@ -346,15 +346,15 @@ int main (int argc, char* argv[]) {
cerr << "Pruned " << pruned_snps << " variant sites due to filter constraints.\n";
if (!verbose)
- cerr << " (enable the --verbose flag to record the reason why each site was filtered in the batch_X.populations.log file.)\n";
+ cerr << " (enable the --verbose flag to record the reason why each site was filtered in the batch_X.populations.log file.)\n";
//
// Create an artificial whitelist if the user requested only the first or a random SNP per locus.
//
if (write_single_snp)
- implement_single_snp_whitelist(catalog, psum, whitelist);
+ implement_single_snp_whitelist(catalog, psum, whitelist);
else if (write_random_snp)
- implement_random_snp_whitelist(catalog, psum, whitelist);
+ implement_random_snp_whitelist(catalog, psum, whitelist);
//
// Remove the accumulated SNPs
@@ -371,7 +371,7 @@ int main (int argc, char* argv[]) {
//
map<int, pair<merget, int> > merge_map;
if (merge_sites && loci_ordered)
- merge_shared_cutsite_loci(catalog, pmap, psum, merge_map, log_fh);
+ merge_shared_cutsite_loci(catalog, pmap, psum, merge_map, log_fh);
//
// Regenerate summary statistics after pruning SNPs and merging loci.
@@ -380,30 +380,30 @@ int main (int argc, char* argv[]) {
psum = new PopSum<CSLocus>(pmap->loci_cnt(), pop_indexes.size());
psum->initialize(pmap);
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start_index = pit->second.first;
- end_index = pit->second.second;
- pop_id = pit->first;
- cerr << "Regenerating nucleotide-level summary statistics for population '" << pop_key[pop_id] << "'\n";
- psum->add_population(catalog, pmap, pop_id, start_index, end_index, verbose, log_fh);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+ pop_id = pit->first;
+ cerr << "Regenerating nucleotide-level summary statistics for population '" << pop_key[pop_id] << "'\n";
+ psum->add_population(catalog, pmap, pop_id, start_index, end_index, verbose, log_fh);
}
cerr << "Re-tallying loci across populations...";
psum->tally(catalog);
cerr << "done.\n";
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = pit->first;
- if (kernel_smoothed && loci_ordered) {
- cerr << " Generating kernel-smoothed population statistics...\n";
- kernel_smoothed_popstats(catalog, pmap, psum, pop_id, log_fh);
- }
+ pop_id = pit->first;
+ if (kernel_smoothed && loci_ordered) {
+ cerr << " Generating kernel-smoothed population statistics...\n";
+ kernel_smoothed_popstats(catalog, pmap, psum, pop_id, log_fh);
+ }
}
calculate_haplotype_stats(files, pop_indexes, catalog, pmap, psum);
if (calc_fstats) {
- calculate_haplotype_divergence(files, pop_indexes, grp_members, catalog, pmap, psum);
+ calculate_haplotype_divergence(files, pop_indexes, grp_members, catalog, pmap, psum);
- calculate_haplotype_divergence_pairwise(files, pop_indexes, grp_members, catalog, pmap, psum);
+ calculate_haplotype_divergence_pairwise(files, pop_indexes, grp_members, catalog, pmap, psum);
}
//
@@ -420,67 +420,67 @@ int main (int argc, char* argv[]) {
// Output data in requested formats
//
if (fasta_out)
- write_fasta(catalog, pmap, samples, sample_ids);
+ write_fasta(catalog, pmap, samples, sample_ids);
if (fasta_strict_out)
- write_strict_fasta(catalog, pmap, samples, sample_ids);
+ write_strict_fasta(catalog, pmap, samples, sample_ids);
if (genepop_out && ordered_export)
- write_genepop_ordered(catalog, pmap, psum, pop_indexes, samples, log_fh);
+ write_genepop_ordered(catalog, pmap, psum, pop_indexes, samples, log_fh);
else if (genepop_out)
- write_genepop(catalog, pmap, psum, pop_indexes, samples);
+ write_genepop(catalog, pmap, psum, pop_indexes, samples);
if (structure_out && ordered_export)
- write_structure_ordered(catalog, pmap, psum, pop_indexes, samples, log_fh);
+ write_structure_ordered(catalog, pmap, psum, pop_indexes, samples, log_fh);
else if (structure_out)
- write_structure(catalog, pmap, psum, pop_indexes, samples);
+ write_structure(catalog, pmap, psum, pop_indexes, samples);
if (fastphase_out)
- write_fastphase(catalog, pmap, psum, pop_indexes, samples);
+ write_fastphase(catalog, pmap, psum, pop_indexes, samples);
if (phase_out)
- write_phase(catalog, pmap, psum, pop_indexes, samples);
+ write_phase(catalog, pmap, psum, pop_indexes, samples);
if (beagle_out)
- write_beagle(catalog, pmap, psum, pop_indexes, samples);
+ write_beagle(catalog, pmap, psum, pop_indexes, samples);
if (beagle_phased_out)
- write_beagle_phased(catalog, pmap, psum, pop_indexes, samples);
+ write_beagle_phased(catalog, pmap, psum, pop_indexes, samples);
if (plink_out)
- write_plink(catalog, pmap, psum, pop_indexes, samples);
+ write_plink(catalog, pmap, psum, pop_indexes, samples);
if (hzar_out)
- write_hzar(catalog, pmap, psum, pop_indexes, samples);
+ write_hzar(catalog, pmap, psum, pop_indexes, samples);
if (treemix_out)
- write_treemix(catalog, pmap, psum, pop_indexes, samples);
+ write_treemix(catalog, pmap, psum, pop_indexes, samples);
if (phylip_out || phylip_var)
- write_phylip(catalog, pmap, psum, pop_indexes, samples);
+ write_phylip(catalog, pmap, psum, pop_indexes, samples);
if (phylip_var_all)
- write_fullseq_phylip(catalog, pmap, psum, pop_indexes, samples);
+ write_fullseq_phylip(catalog, pmap, psum, pop_indexes, samples);
if (vcf_haplo_out)
- write_vcf_haplotypes(catalog, pmap, psum, samples, sample_ids);
+ write_vcf_haplotypes(catalog, pmap, psum, samples, sample_ids);
if (vcf_out && ordered_export)
- write_vcf_ordered(catalog, pmap, psum, samples, sample_ids, merge_map, log_fh);
+ write_vcf_ordered(catalog, pmap, psum, samples, sample_ids, merge_map, log_fh);
else if (vcf_out)
- write_vcf(catalog, pmap, psum, samples, sample_ids, merge_map);
+ write_vcf(catalog, pmap, psum, samples, sample_ids, merge_map);
//
// Calculate and write Fst.
//
if (calc_fstats)
- write_fst_stats(files, pop_indexes, catalog, pmap, psum, log_fh);
+ write_fst_stats(files, pop_indexes, catalog, pmap, psum, log_fh);
//
// Output nucleotide-level genotype calls for each individual.
//
if (genomic_out)
- write_genomic(catalog, pmap);
+ write_genomic(catalog, pmap);
log_fh.close();
@@ -489,9 +489,9 @@ int main (int argc, char* argv[]) {
int
apply_locus_constraints(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- map<int, pair<int, int> > &pop_indexes,
- ofstream &log_fh)
+ PopMap<CSLocus> *pmap,
+ map<int, pair<int, int> > &pop_indexes,
+ ofstream &log_fh)
{
uint pop_id, start_index, end_index;
CSLocus *loc;
@@ -500,8 +500,8 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
if (sample_limit == 0 && population_limit == 0 && min_stack_depth == 0) return 0;
if (verbose)
- log_fh << "\n#\n# List of loci removed by first filtering stage of sample and population constraints\n#\n"
- << "# Action\tLocus ID\tChr\tBP\tColumn\tReason\n";
+ log_fh << "\n#\n# List of loci removed by first filtering stage of sample and population constraints\n#\n"
+ << "# Action\tLocus ID\tChr\tBP\tColumn\tReason\n";
map<int, CSLocus *>::iterator it;
map<int, pair<int, int> >::iterator pit;
@@ -520,20 +520,20 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
pop_id = 0;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start_index = pit->second.first;
- end_index = pit->second.second;
- pop_tot[pop_id] = 0;
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+ pop_tot[pop_id] = 0;
- for (uint i = start_index; i <= end_index; i++) {
- samples[i] = pop_id;
- pop_tot[pop_id]++;
- }
- pop_order[pop_id] = pit->first;
- pop_id++;
+ for (uint i = start_index; i <= end_index; i++) {
+ samples[i] = pop_id;
+ pop_tot[pop_id]++;
+ }
+ pop_order[pop_id] = pit->first;
+ pop_id++;
}
for (uint i = 0; i < pop_cnt; i++)
- pop_cnts[i] = 0;
+ pop_cnts[i] = 0;
double pct = 0.0;
bool pop_limit = false;
@@ -543,100 +543,100 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
set<int> blacklist;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->locus(loc->id);
-
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- //
- // Check that each sample is over the minimum stack depth for this locus.
- //
- if (d[i] != NULL &&
- min_stack_depth > 0 &&
- d[i]->tot_depth < min_stack_depth) {
- below_stack_dep++;
- delete d[i];
- d[i] = NULL;
- loc->hcnt--;
- }
-
- //
- // Check that each sample is over the log likelihood threshold.
- //
- if (d[i] != NULL &&
- filter_lnl &&
- d[i]->lnl < lnl_limit) {
- below_lnl_thresh++;
- delete d[i];
- d[i] = NULL;
- loc->hcnt--;
- }
- }
-
- //
- // Tally up the count of samples in this population.
- //
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] != NULL)
- pop_cnts[samples[i]]++;
- }
-
- //
- // Check that the counts for each population are over sample_limit. If not, zero out
- // the members of that population.
- //
- for (uint i = 0; i < pop_cnt; i++) {
- pct = (double) pop_cnts[i] / (double) pop_tot[i];
-
- if (pop_cnts[i] > 0 && pct < sample_limit) {
- //cerr << "Removing population " << pop_order[i] << " at locus: " << loc->id << "; below sample limit: " << pct << "\n";
- start_index = pop_indexes[pop_order[i]].first;
- end_index = pop_indexes[pop_order[i]].second;
-
- for (uint j = start_index; j <= end_index; j++) {
- if (d[j] != NULL) {
- delete d[j];
- d[j] = NULL;
- loc->hcnt--;
- }
- }
- pop_cnts[i] = 0;
- }
- }
-
- //
- // Check that this locus is present in enough populations.
- //
- for (uint i = 0; i < pop_cnt; i++)
- if (pop_cnts[i] > 0) pops++;
- if (pops < population_limit) {
- //cerr << "Removing locus: " << loc->id << "; below population limit: " << pops << "\n";
- pop_limit = true;
- }
-
- if (pop_limit) {
- blacklist.insert(loc->id);
-
- if (verbose)
- log_fh << "removed_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp() << "\t"
- << 0 << "\tfailed_population_limit\n";
- }
-
- for (uint i = 0; i < pop_cnt; i++)
- pop_cnts[i] = 0;
- pop_limit = false;
- pops = 0;
+ loc = it->second;
+ d = pmap->locus(loc->id);
+
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ //
+ // Check that each sample is over the minimum stack depth for this locus.
+ //
+ if (d[i] != NULL &&
+ min_stack_depth > 0 &&
+ d[i]->tot_depth < min_stack_depth) {
+ below_stack_dep++;
+ delete d[i];
+ d[i] = NULL;
+ loc->hcnt--;
+ }
+
+ //
+ // Check that each sample is over the log likelihood threshold.
+ //
+ if (d[i] != NULL &&
+ filter_lnl &&
+ d[i]->lnl < lnl_limit) {
+ below_lnl_thresh++;
+ delete d[i];
+ d[i] = NULL;
+ loc->hcnt--;
+ }
+ }
+
+ //
+ // Tally up the count of samples in this population.
+ //
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] != NULL)
+ pop_cnts[samples[i]]++;
+ }
+
+ //
+ // Check that the counts for each population are over sample_limit. If not, zero out
+ // the members of that population.
+ //
+ for (uint i = 0; i < pop_cnt; i++) {
+ pct = (double) pop_cnts[i] / (double) pop_tot[i];
+
+ if (pop_cnts[i] > 0 && pct < sample_limit) {
+ //cerr << "Removing population " << pop_order[i] << " at locus: " << loc->id << "; below sample limit: " << pct << "\n";
+ start_index = pop_indexes[pop_order[i]].first;
+ end_index = pop_indexes[pop_order[i]].second;
+
+ for (uint j = start_index; j <= end_index; j++) {
+ if (d[j] != NULL) {
+ delete d[j];
+ d[j] = NULL;
+ loc->hcnt--;
+ }
+ }
+ pop_cnts[i] = 0;
+ }
+ }
+
+ //
+ // Check that this locus is present in enough populations.
+ //
+ for (uint i = 0; i < pop_cnt; i++)
+ if (pop_cnts[i] > 0) pops++;
+ if (pops < population_limit) {
+ //cerr << "Removing locus: " << loc->id << "; below population limit: " << pops << "\n";
+ pop_limit = true;
+ }
+
+ if (pop_limit) {
+ blacklist.insert(loc->id);
+
+ if (verbose)
+ log_fh << "removed_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp() << "\t"
+ << 0 << "\tfailed_population_limit\n";
+ }
+
+ for (uint i = 0; i < pop_cnt; i++)
+ pop_cnts[i] = 0;
+ pop_limit = false;
+ pops = 0;
}
//
// Remove loci
//
if (min_stack_depth > 0)
- cerr << "Removed " << below_stack_dep << " samples from loci that are below the minimum stack depth of " << min_stack_depth << "x\n";
+ cerr << "Removed " << below_stack_dep << " samples from loci that are below the minimum stack depth of " << min_stack_depth << "x\n";
if (filter_lnl)
- cerr << "Removed " << below_lnl_thresh << " samples from loci that are below the log likelihood threshold of " << lnl_limit << "\n";
+ cerr << "Removed " << below_lnl_thresh << " samples from loci that are below the log likelihood threshold of " << lnl_limit << "\n";
cerr << "Removing " << blacklist.size() << " loci that did not pass sample/population constraints...";
set<int> whitelist;
reduce_catalog(catalog, whitelist, blacklist);
@@ -649,18 +649,18 @@ apply_locus_constraints(map<int, CSLocus *> &catalog,
delete [] samples;
if (retained == 0)
- exit(0);
+ exit(0);
return 0;
}
int
prune_polymorphic_sites(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, set<int> > &whitelist, set<int> &blacklist,
- ofstream &log_fh)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, set<int> > &whitelist, set<int> &blacklist,
+ ofstream &log_fh)
{
map<int, set<int> > new_wl;
vector<int> pop_prune_list;
@@ -673,8 +673,8 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
uint pop_id, start_index, end_index;
if (verbose)
- log_fh << "\n#\n# List of pruned nucleotide sites\n#\n"
- << "# Action\tLocus ID\tChr\tBP\tColumn\tReason\n";
+ log_fh << "\n#\n# List of pruned nucleotide sites\n#\n"
+ << "# Action\tLocus ID\tChr\tBP\tColumn\tReason\n";
//
// If the whitelist is populated, use it as a guide for what loci to consider.
@@ -682,248 +682,248 @@ prune_polymorphic_sites(map<int, CSLocus *> &catalog,
// Construct a new whitelist along the way, that is a subset of the existing list.
//
if (whitelist.size() > 0) {
- map<int, set<int> >::iterator it;
-
- for (it = whitelist.begin(); it != whitelist.end(); it++) {
- //
- // A locus on the whitelist may have already been filtered out.
- //
- if (catalog.count(it->first) == 0)
- continue;
-
- loc = catalog[it->first];
- t = psum->locus_tally(loc->id);
- s = psum->locus(loc->id);
-
- //
- // Check that each SNP in this locus is above the sample_limit and that
- // each SNP is above the minor allele frequency. If so, add it back to
- // the whiteliest.
- //
- size = it->second.size();
- for (uint i = 0; i < loc->snps.size(); i++) {
-
- //
- // If it is not already in the whitelist, ignore it.
- //
- if (size > 0 && it->second.count(loc->snps[i]->col) == 0)
- continue;
-
- //
- // If the site is fixed, ignore it.
- //
- if (t->nucs[loc->snps[i]->col].fixed == true)
- continue;
-
- sample_prune = false;
- maf_prune = false;
- het_prune = false;
- inc_prune = false;
- pop_prune_list.clear();
-
- for (int j = 0; j < psum->pop_cnt(); j++) {
- pop_id = psum->rev_pop_index(j);
-
- if (s[j]->nucs[loc->snps[i]->col].incompatible_site)
- inc_prune = true;
-
- else if (s[j]->nucs[loc->snps[i]->col].num_indv == 0 ||
- (double) s[j]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(pop_id) < sample_limit)
- pop_prune_list.push_back(pop_id);
- }
-
- //
- // Check how many populations have to be pruned out due to sample limit. If less than
- // population limit, prune them; if more than population limit, mark locus for deletion.
- //
- if ((psum->pop_cnt() - pop_prune_list.size()) < (uint) population_limit) {
- sample_prune = true;
- } else {
- for (uint j = 0; j < pop_prune_list.size(); j++) {
- if (s[psum->pop_index(pop_prune_list[j])]->nucs[loc->snps[i]->col].num_indv == 0) continue;
-
- start_index = pop_indexes[pop_prune_list[j]].first;
- end_index = pop_indexes[pop_prune_list[j]].second;
- d = pmap->locus(loc->id);
-
- for (uint k = start_index; k <= end_index; k++) {
- if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
- continue;
- if (d[k]->model != NULL) {
- d[k]->model[loc->snps[i]->col] = 'U';
- }
- }
- }
- }
-
- if (t->nucs[loc->snps[i]->col].allele_cnt > 1) {
- //
- // Test for minor allele frequency.
- //
- if ((1 - t->nucs[loc->snps[i]->col].p_freq) < minor_allele_freq)
- maf_prune = true;
- //
- // Test for observed heterozygosity.
- //
- if (t->nucs[loc->snps[i]->col].obs_het > max_obs_het)
- het_prune = true;
- }
-
- if (maf_prune == false && het_prune == false && sample_prune == false && inc_prune == false) {
- new_wl[loc->id].insert(loc->snps[i]->col);
- } else {
- pruned++;
- if (verbose) {
- log_fh << "pruned_polymorphic_site\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp(loc->snps[i]->col) << "\t"
- << loc->snps[i]->col << "\t";
- if (inc_prune)
- log_fh << "incompatible_site\n";
- else if (sample_prune)
- log_fh << "sample_limit\n";
- else if (maf_prune)
- log_fh << "maf_limit\n";
- else if (het_prune)
- log_fh << "obshet_limit\n";
- else
- log_fh << "unknown_reason\n";
- }
- }
- }
-
- //
- // If no SNPs were retained for this locus, then mark it to be removed entirely.
- //
- if (new_wl.count(loc->id) == 0) {
- if (verbose)
- log_fh << "removed_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp() << "\t"
- << 0 << "\tno_snps_remaining\n";
- blacklist.insert(loc->id);
- }
- }
+ map<int, set<int> >::iterator it;
+
+ for (it = whitelist.begin(); it != whitelist.end(); it++) {
+ //
+ // A locus on the whitelist may have already been filtered out.
+ //
+ if (catalog.count(it->first) == 0)
+ continue;
+
+ loc = catalog[it->first];
+ t = psum->locus_tally(loc->id);
+ s = psum->locus(loc->id);
+
+ //
+ // Check that each SNP in this locus is above the sample_limit and that
+ // each SNP is above the minor allele frequency. If so, add it back to
+ // the whiteliest.
+ //
+ size = it->second.size();
+ for (uint i = 0; i < loc->snps.size(); i++) {
+
+ //
+ // If it is not already in the whitelist, ignore it.
+ //
+ if (size > 0 && it->second.count(loc->snps[i]->col) == 0)
+ continue;
+
+ //
+ // If the site is fixed, ignore it.
+ //
+ if (t->nucs[loc->snps[i]->col].fixed == true)
+ continue;
+
+ sample_prune = false;
+ maf_prune = false;
+ het_prune = false;
+ inc_prune = false;
+ pop_prune_list.clear();
+
+ for (int j = 0; j < psum->pop_cnt(); j++) {
+ pop_id = psum->rev_pop_index(j);
+
+ if (s[j]->nucs[loc->snps[i]->col].incompatible_site)
+ inc_prune = true;
+
+ else if (s[j]->nucs[loc->snps[i]->col].num_indv == 0 ||
+ (double) s[j]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(pop_id) < sample_limit)
+ pop_prune_list.push_back(pop_id);
+ }
+
+ //
+ // Check how many populations have to be pruned out due to sample limit. If less than
+ // population limit, prune them; if more than population limit, mark locus for deletion.
+ //
+ if ((psum->pop_cnt() - pop_prune_list.size()) < (uint) population_limit) {
+ sample_prune = true;
+ } else {
+ for (uint j = 0; j < pop_prune_list.size(); j++) {
+ if (s[psum->pop_index(pop_prune_list[j])]->nucs[loc->snps[i]->col].num_indv == 0) continue;
+
+ start_index = pop_indexes[pop_prune_list[j]].first;
+ end_index = pop_indexes[pop_prune_list[j]].second;
+ d = pmap->locus(loc->id);
+
+ for (uint k = start_index; k <= end_index; k++) {
+ if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
+ continue;
+ if (d[k]->model != NULL) {
+ d[k]->model[loc->snps[i]->col] = 'U';
+ }
+ }
+ }
+ }
+
+ if (t->nucs[loc->snps[i]->col].allele_cnt > 1) {
+ //
+ // Test for minor allele frequency.
+ //
+ if ((1 - t->nucs[loc->snps[i]->col].p_freq) < minor_allele_freq)
+ maf_prune = true;
+ //
+ // Test for observed heterozygosity.
+ //
+ if (t->nucs[loc->snps[i]->col].obs_het > max_obs_het)
+ het_prune = true;
+ }
+
+ if (maf_prune == false && het_prune == false && sample_prune == false && inc_prune == false) {
+ new_wl[loc->id].insert(loc->snps[i]->col);
+ } else {
+ pruned++;
+ if (verbose) {
+ log_fh << "pruned_polymorphic_site\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp(loc->snps[i]->col) << "\t"
+ << loc->snps[i]->col << "\t";
+ if (inc_prune)
+ log_fh << "incompatible_site\n";
+ else if (sample_prune)
+ log_fh << "sample_limit\n";
+ else if (maf_prune)
+ log_fh << "maf_limit\n";
+ else if (het_prune)
+ log_fh << "obshet_limit\n";
+ else
+ log_fh << "unknown_reason\n";
+ }
+ }
+ }
+
+ //
+ // If no SNPs were retained for this locus, then mark it to be removed entirely.
+ //
+ if (new_wl.count(loc->id) == 0) {
+ if (verbose)
+ log_fh << "removed_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp() << "\t"
+ << 0 << "\tno_snps_remaining\n";
+ blacklist.insert(loc->id);
+ }
+ }
} else {
- //
- // Otherwise, just iterate over the catalog.
- //
- map<int, CSLocus *>::iterator it;
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
-
- //
- // If this locus is fixed, don't try to filter it out.
- //
- if (loc->snps.size() == 0) {
- new_wl.insert(make_pair(loc->id, std::set<int>()));
- continue;
- }
-
- t = psum->locus_tally(loc->id);
- s = psum->locus(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
-
- //
- // If the site is fixed, ignore it.
- //
- if (t->nucs[loc->snps[i]->col].fixed == true)
- continue;
-
- sample_prune = false;
- maf_prune = false;
- het_prune = false;
- inc_prune = false;
- pop_prune_list.clear();
-
- for (int j = 0; j < psum->pop_cnt(); j++) {
- pop_id = psum->rev_pop_index(j);
-
- if (s[j]->nucs[loc->snps[i]->col].incompatible_site)
- inc_prune = true;
- else if (s[j]->nucs[loc->snps[i]->col].num_indv == 0 ||
- (double) s[j]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(pop_id) < sample_limit)
- pop_prune_list.push_back(pop_id);
- }
-
- //
- // Check how many populations have to be pruned out due to sample limit. If less than
- // population limit, prune them; if more than population limit, mark locus for deletion.
- //
- if ((psum->pop_cnt() - pop_prune_list.size()) < (uint) population_limit) {
- sample_prune = true;
- } else {
- for (uint j = 0; j < pop_prune_list.size(); j++) {
- if (s[psum->pop_index(pop_prune_list[j])]->nucs[loc->snps[i]->col].num_indv == 0) continue;
-
- start_index = pop_indexes[pop_prune_list[j]].first;
- end_index = pop_indexes[pop_prune_list[j]].second;
- d = pmap->locus(loc->id);
-
- for (uint k = start_index; k <= end_index; k++) {
- if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
- continue;
- if (d[k]->model != NULL) {
- d[k]->model[loc->snps[i]->col] = 'U';
- }
- }
- }
- }
-
- if (t->nucs[loc->snps[i]->col].allele_cnt > 1) {
- //
- // Test for minor allele frequency.
- //
- if ((1 - t->nucs[loc->snps[i]->col].p_freq) < minor_allele_freq)
- maf_prune = true;
- //
- // Test for observed heterozygosity.
- //
- if (t->nucs[loc->snps[i]->col].obs_het > max_obs_het)
- het_prune = true;
- }
-
- if (maf_prune == false && het_prune == false && sample_prune == false && inc_prune == false) {
- new_wl[loc->id].insert(loc->snps[i]->col);
- } else {
- pruned++;
- if (verbose) {
- log_fh << "pruned_polymorphic_site\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp(loc->snps[i]->col) << "\t"
- << loc->snps[i]->col << "\t";
- if (inc_prune)
- log_fh << "incompatible_site\n";
- else if (sample_prune)
- log_fh << "sample_limit\n";
- else if (maf_prune)
- log_fh << "maf_limit\n";
- else if (het_prune)
- log_fh << "obshet_limit\n";
- else
- log_fh << "unknown_reason\n";
- }
- }
- }
-
- //
- // If no SNPs were retained for this locus, then mark it to be removed entirely.
- //
- if (new_wl.count(loc->id) == 0) {
- if (verbose)
- log_fh << "removed_locus\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp() << "\t"
- << 0 << "\tno_snps_remaining\n";
- blacklist.insert(loc->id);
- }
- }
+ //
+ // Otherwise, just iterate over the catalog.
+ //
+ map<int, CSLocus *>::iterator it;
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ loc = it->second;
+
+ //
+ // If this locus is fixed, don't try to filter it out.
+ //
+ if (loc->snps.size() == 0) {
+ new_wl.insert(make_pair(loc->id, std::set<int>()));
+ continue;
+ }
+
+ t = psum->locus_tally(loc->id);
+ s = psum->locus(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+
+ //
+ // If the site is fixed, ignore it.
+ //
+ if (t->nucs[loc->snps[i]->col].fixed == true)
+ continue;
+
+ sample_prune = false;
+ maf_prune = false;
+ het_prune = false;
+ inc_prune = false;
+ pop_prune_list.clear();
+
+ for (int j = 0; j < psum->pop_cnt(); j++) {
+ pop_id = psum->rev_pop_index(j);
+
+ if (s[j]->nucs[loc->snps[i]->col].incompatible_site)
+ inc_prune = true;
+ else if (s[j]->nucs[loc->snps[i]->col].num_indv == 0 ||
+ (double) s[j]->nucs[loc->snps[i]->col].num_indv / (double) psum->pop_size(pop_id) < sample_limit)
+ pop_prune_list.push_back(pop_id);
+ }
+
+ //
+ // Check how many populations have to be pruned out due to sample limit. If less than
+ // population limit, prune them; if more than population limit, mark locus for deletion.
+ //
+ if ((psum->pop_cnt() - pop_prune_list.size()) < (uint) population_limit) {
+ sample_prune = true;
+ } else {
+ for (uint j = 0; j < pop_prune_list.size(); j++) {
+ if (s[psum->pop_index(pop_prune_list[j])]->nucs[loc->snps[i]->col].num_indv == 0) continue;
+
+ start_index = pop_indexes[pop_prune_list[j]].first;
+ end_index = pop_indexes[pop_prune_list[j]].second;
+ d = pmap->locus(loc->id);
+
+ for (uint k = start_index; k <= end_index; k++) {
+ if (d[k] == NULL || loc->snps[i]->col >= (uint) d[k]->len)
+ continue;
+ if (d[k]->model != NULL) {
+ d[k]->model[loc->snps[i]->col] = 'U';
+ }
+ }
+ }
+ }
+
+ if (t->nucs[loc->snps[i]->col].allele_cnt > 1) {
+ //
+ // Test for minor allele frequency.
+ //
+ if ((1 - t->nucs[loc->snps[i]->col].p_freq) < minor_allele_freq)
+ maf_prune = true;
+ //
+ // Test for observed heterozygosity.
+ //
+ if (t->nucs[loc->snps[i]->col].obs_het > max_obs_het)
+ het_prune = true;
+ }
+
+ if (maf_prune == false && het_prune == false && sample_prune == false && inc_prune == false) {
+ new_wl[loc->id].insert(loc->snps[i]->col);
+ } else {
+ pruned++;
+ if (verbose) {
+ log_fh << "pruned_polymorphic_site\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp(loc->snps[i]->col) << "\t"
+ << loc->snps[i]->col << "\t";
+ if (inc_prune)
+ log_fh << "incompatible_site\n";
+ else if (sample_prune)
+ log_fh << "sample_limit\n";
+ else if (maf_prune)
+ log_fh << "maf_limit\n";
+ else if (het_prune)
+ log_fh << "obshet_limit\n";
+ else
+ log_fh << "unknown_reason\n";
+ }
+ }
+ }
+
+ //
+ // If no SNPs were retained for this locus, then mark it to be removed entirely.
+ //
+ if (new_wl.count(loc->id) == 0) {
+ if (verbose)
+ log_fh << "removed_locus\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp() << "\t"
+ << 0 << "\tno_snps_remaining\n";
+ blacklist.insert(loc->id);
+ }
+ }
}
whitelist = new_wl;
@@ -939,27 +939,27 @@ order_unordered_loci(map<int, CSLocus *> &catalog)
set<string> chrs;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- if (strlen(loc->loc.chr) > 0)
- chrs.insert(loc->loc.chr);
+ loc = it->second;
+ if (strlen(loc->loc.chr) > 0)
+ chrs.insert(loc->loc.chr);
}
//
// This data is already reference aligned.
//
if (chrs.size() > 0)
- return true;
+ return true;
cerr << "Catalog is not reference aligned, arbitrarily ordering catalog loci.\n";
uint bp = 1;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- loc->loc.chr = new char[3];
- strcpy(loc->loc.chr, "un");
- loc->loc.bp = bp;
+ loc = it->second;
+ loc->loc.chr = new char[3];
+ strcpy(loc->loc.chr, "un");
+ loc->loc.bp = bp;
- bp += strlen(loc->con);
+ bp += strlen(loc->con);
}
return false;
@@ -975,42 +975,42 @@ log_haplotype_cnts(map<int, CSLocus *> &catalog, ofstream &log_fh)
int missing;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- if (valid.count(loc->hcnt) == 0)
- valid[loc->hcnt] = 1;
- else
- valid[loc->hcnt]++;
+ if (valid.count(loc->hcnt) == 0)
+ valid[loc->hcnt] = 1;
+ else
+ valid[loc->hcnt]++;
- if (confounded.count(loc->confounded_cnt) == 0)
- confounded[loc->confounded_cnt] = 1;
- else
- confounded[loc->confounded_cnt]++;
+ if (confounded.count(loc->confounded_cnt) == 0)
+ confounded[loc->confounded_cnt] = 1;
+ else
+ confounded[loc->confounded_cnt]++;
- missing = loc->cnt - loc->hcnt;
+ missing = loc->cnt - loc->hcnt;
- if (absent.count(missing) == 0)
- absent[missing] = 1;
- else
- absent[missing]++;
+ if (absent.count(missing) == 0)
+ absent[missing] = 1;
+ else
+ absent[missing]++;
}
map<int, int>::iterator cnt_it;
log_fh << "# Distribution of valid loci matched to catalog locus.\n"
- << "# Valid samples at locus\tCount\n";
+ << "# Valid samples at locus\tCount\n";
for (cnt_it = valid.begin(); cnt_it != valid.end(); cnt_it++)
- log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
+ log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
log_fh << "# Distribution of confounded loci at catalog locus.\n"
- << "# Confounded samples at locus\tCount\n";
+ << "# Confounded samples at locus\tCount\n";
for (cnt_it = confounded.begin(); cnt_it != confounded.end(); cnt_it++)
- log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
+ log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
log_fh << "# Distribution of missing loci at catalog loci.\n"
- << "# Absent samples at locus\tCount\n";
+ << "# Absent samples at locus\tCount\n";
for (cnt_it = absent.begin(); cnt_it != absent.end(); cnt_it++)
- log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
+ log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
return 0;
}
@@ -1025,29 +1025,29 @@ tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
double mean, cnt;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->locus(loc->id);
+ loc = it->second;
+ d = pmap->locus(loc->id);
- mean = 0.0;
- cnt = 0.0;
+ mean = 0.0;
+ cnt = 0.0;
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL)
- continue;
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d[i] == NULL)
+ continue;
- if (d[i]->obshap.size() > 1)
- loc->marker = "heterozygous";
+ if (d[i]->obshap.size() > 1)
+ loc->marker = "heterozygous";
- mean += d[i]->lnl;
- cnt++;
- }
+ mean += d[i]->lnl;
+ cnt++;
+ }
- if (loc->marker.length() > 0) {
- create_genotype_map(loc, pmap);
- call_population_genotypes(loc, pmap);
- }
+ if (loc->marker.length() > 0) {
+ create_genotype_map(loc, pmap);
+ call_population_genotypes(loc, pmap);
+ }
- loc->lnl = mean / cnt;
+ loc->lnl = mean / cnt;
}
return 0;
@@ -1055,9 +1055,9 @@ tabulate_haplotypes(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
int
merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, pair<merget, int> > &merge_map,
- ofstream &log_fh)
+ PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+ map<int, pair<merget, int> > &merge_map,
+ ofstream &log_fh)
{
map<string, vector<CSLocus *> >::iterator it;
CSLocus *cur, *next;
@@ -1081,125 +1081,125 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
map<int, int> missing_samps_dist;
cerr << "To merge adjacent loci at least " << merge_prune_lim * 100 << "% of samples must have both adjacent loci;"
- << " the remaining " << 100 - (merge_prune_lim * 100) << "% of individuals will be pruned.\n"
- << "Attempting to merge adjacent loci that share a cutsite...";
-
+ << " the remaining " << 100 - (merge_prune_lim * 100) << "% of individuals will be pruned.\n"
+ << "Attempting to merge adjacent loci that share a cutsite...";
+
if (verbose)
- log_fh << "\n#\n# List of locus pairs that share a cutsite that failed to merge because they could not be phased.\n#\n";
+ log_fh << "\n#\n# List of locus pairs that share a cutsite that failed to merge because they could not be phased.\n#\n";
//
// Iterate over each chromosome.
//
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- //
- // Iterate over each ordered locus on this chromosome.
- //
- next = it->second[0];
- for (uint pos = 1; pos < it->second.size(); pos++) {
- cur = next;
- next = it->second[pos];
-
- //
- // Do these two loci overlap?
- // +Must occur on opposite strands
- // +Must overlap according to the length of the cutsite.
- //
- if (((cur->loc.strand == minus && next->loc.strand == plus) &&
- ((int) (cur->loc.bp - next->loc.bp + 1) == renz_olap[enz])) ||
- ((cur->loc.strand == plus && next->loc.strand == minus) &&
- ((int) (next->loc.bp - cur->loc.bp + 1) == renz_olap[enz]))) {
- overlap++;
-
- d_1 = pmap->locus(cur->id);
- d_2 = pmap->locus(next->id);
- unmergable = 0;
- tot_samp = 0;
-
- //
- // Check if all members of the population contain these two loci (or are missing both).
- //
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d_1[i] != NULL || d_2[i] != NULL)
- tot_samp++;
- if ((d_1[i] != NULL && d_2[i] == NULL) ||
- (d_1[i] == NULL && d_2[i] != NULL))
- unmergable++;
- }
-
- prune_pct = (double) (tot_samp - unmergable) / (double) tot_samp;
-
- //
- // If some of the individuals only have one locus and not the other, prune them out.
- //
- if (prune_pct < 1.0 && prune_pct >= merge_prune_lim) {
- for (int i = 0; i < pmap->sample_cnt(); i++)
- if (d_1[i] != NULL && d_2[i] == NULL) {
- delete d_1[i];
- d_1[i] = NULL;
- } else if (d_1[i] == NULL && d_2[i] != NULL) {
- delete d_2[i];
- d_2[i] = NULL;
- }
- }
-
- //
- // If possible, merge the two loci together.
- //
- if (prune_pct < merge_prune_lim) {
- int pct = (int) (prune_pct * 100);
- missing_samps_dist[pct]++;
- if (verbose) log_fh << "Missing samples, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
- << pct << "% present (" << 100 - pct << "% missing)\n";
- missing_samps_cnt++;
- failure++;
- continue;
- }
-
- phaset res = merge_and_phase_loci(pmap, cur, next, loci_to_destroy, log_fh);
- switch(res) {
- case multiple_fails:
- if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
- << "multiple failures\n";
- multifails_cnt++;
- phase_fail_cnt++;
- failure++;
- break;
- case multimapping_fail:
- if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
- << "multimapping in one or more individuals\n";
- multimapping_cnt++;
- phase_fail_cnt++;
- failure++;
- break;
- case nomapping_fail:
- if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
- << "no mapping in one or more individuals\n";
- nomapping_cnt++;
- phase_fail_cnt++;
- failure++;
- break;
- case complex_phase:
- if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
- << "a complex phasing operation.\n";
- complex_merge_cnt++;
- success++;
- merge_map[cur->id] = make_pair(merge_sink, next->id);
- merge_map[next->id] = make_pair(merge_src, cur->id);
- break;
- case simple_merge:
- if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
- << "a simple merge operation.\n";
- simple_merge_cnt++;
- success++;
- merge_map[cur->id] = make_pair(merge_sink, next->id);
- merge_map[next->id] = make_pair(merge_src, cur->id);
- break;
- default:
- cerr << "Warning: Merge failure.\n";
- break;
- }
- }
- }
+ //
+ // Iterate over each ordered locus on this chromosome.
+ //
+ next = it->second[0];
+ for (uint pos = 1; pos < it->second.size(); pos++) {
+ cur = next;
+ next = it->second[pos];
+
+ //
+ // Do these two loci overlap?
+ // +Must occur on opposite strands
+ // +Must overlap according to the length of the cutsite.
+ //
+ if (((cur->loc.strand == minus && next->loc.strand == plus) &&
+ ((int) (cur->loc.bp - next->loc.bp + 1) == renz_olap[enz])) ||
+ ((cur->loc.strand == plus && next->loc.strand == minus) &&
+ ((int) (next->loc.bp - cur->loc.bp + 1) == renz_olap[enz]))) {
+ overlap++;
+
+ d_1 = pmap->locus(cur->id);
+ d_2 = pmap->locus(next->id);
+ unmergable = 0;
+ tot_samp = 0;
+
+ //
+ // Check if all members of the population contain these two loci (or are missing both).
+ //
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ if (d_1[i] != NULL || d_2[i] != NULL)
+ tot_samp++;
+ if ((d_1[i] != NULL && d_2[i] == NULL) ||
+ (d_1[i] == NULL && d_2[i] != NULL))
+ unmergable++;
+ }
+
+ prune_pct = (double) (tot_samp - unmergable) / (double) tot_samp;
+
+ //
+ // If some of the individuals only have one locus and not the other, prune them out.
+ //
+ if (prune_pct < 1.0 && prune_pct >= merge_prune_lim) {
+ for (int i = 0; i < pmap->sample_cnt(); i++)
+ if (d_1[i] != NULL && d_2[i] == NULL) {
+ delete d_1[i];
+ d_1[i] = NULL;
+ } else if (d_1[i] == NULL && d_2[i] != NULL) {
+ delete d_2[i];
+ d_2[i] = NULL;
+ }
+ }
+
+ //
+ // If possible, merge the two loci together.
+ //
+ if (prune_pct < merge_prune_lim) {
+ int pct = (int) (prune_pct * 100);
+ missing_samps_dist[pct]++;
+ if (verbose) log_fh << "Missing samples, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ << pct << "% present (" << 100 - pct << "% missing)\n";
+ missing_samps_cnt++;
+ failure++;
+ continue;
+ }
+
+ phaset res = merge_and_phase_loci(pmap, cur, next, loci_to_destroy, log_fh);
+ switch(res) {
+ case multiple_fails:
+ if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ << "multiple failures\n";
+ multifails_cnt++;
+ phase_fail_cnt++;
+ failure++;
+ break;
+ case multimapping_fail:
+ if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ << "multimapping in one or more individuals\n";
+ multimapping_cnt++;
+ phase_fail_cnt++;
+ failure++;
+ break;
+ case nomapping_fail:
+ if (verbose) log_fh << "Failed to phase, Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ << "no mapping in one or more individuals\n";
+ nomapping_cnt++;
+ phase_fail_cnt++;
+ failure++;
+ break;
+ case complex_phase:
+ if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ << "a complex phasing operation.\n";
+ complex_merge_cnt++;
+ success++;
+ merge_map[cur->id] = make_pair(merge_sink, next->id);
+ merge_map[next->id] = make_pair(merge_src, cur->id);
+ break;
+ case simple_merge:
+ if (verbose) log_fh << "Phased Sink Locus: " << cur->id << "; Source Locus: " << next->id << "; "
+ << "a simple merge operation.\n";
+ simple_merge_cnt++;
+ success++;
+ merge_map[cur->id] = make_pair(merge_sink, next->id);
+ merge_map[next->id] = make_pair(merge_src, cur->id);
+ break;
+ default:
+ cerr << "Warning: Merge failure.\n";
+ break;
+ }
+ }
+ }
}
//
@@ -1210,35 +1210,35 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
reduce_catalog(catalog, emptyset, loci_to_destroy);
cerr << "done.\n"
- << "Of " << tot_loci << " loci, "
- << overlap << " pairs share a cutsite; "
- << success << " pairs were merged; "
- << failure << " pairs failed to merge; "
- << pmap->loci_cnt() << " remaining loci.\n"
- << " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
- << "while " << complex_merge_cnt << " required phasing.\n"
- << " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
- << "while " << phase_fail_cnt << " failed to be phased.\n"
- << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
- << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
+ << "Of " << tot_loci << " loci, "
+ << overlap << " pairs share a cutsite; "
+ << success << " pairs were merged; "
+ << failure << " pairs failed to merge; "
+ << pmap->loci_cnt() << " remaining loci.\n"
+ << " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
+ << "while " << complex_merge_cnt << " required phasing.\n"
+ << " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
+ << "while " << phase_fail_cnt << " failed to be phased.\n"
+ << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
+ << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
log_fh << "\n#\n# Merging adjacent loci with a shared restriction enzyme cutsite\n#\n"
- << "Of " << tot_loci << " loci, "
- << overlap << " pairs share a cutsite; "
- << success << " pairs were merged; "
- << failure << " pairs failed to merge; "
- << pmap->loci_cnt() << " remaining loci.\n"
- << " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
- << "while " << complex_merge_cnt << " required phasing.\n"
- << " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
- << "while " << phase_fail_cnt << " failed to be phased.\n"
- << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
- << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
+ << "Of " << tot_loci << " loci, "
+ << overlap << " pairs share a cutsite; "
+ << success << " pairs were merged; "
+ << failure << " pairs failed to merge; "
+ << pmap->loci_cnt() << " remaining loci.\n"
+ << " Of those merged, " << simple_merge_cnt << " required only a simple merge without phasing; "
+ << "while " << complex_merge_cnt << " required phasing.\n"
+ << " Of those that failed to merge, " << missing_samps_cnt << " were missing one of the two haplotypes in one or more samples; "
+ << "while " << phase_fail_cnt << " failed to be phased.\n"
+ << " Of those that failed to phase, " << nomapping_cnt << " failed due to a lack of haplotype mappings; "
+ << multimapping_cnt << " failed due to multiple haplotype mappings; " << multifails_cnt << " failed due to both.\n";
log_fh << "#\n# Distribution of loci with samples missing one of two loci to be merged\n"
- << "# Percent samples with both loci present\tNumber of cases\n";
+ << "# Percent samples with both loci present\tNumber of cases\n";
map<int, int>::iterator mit;
for (mit = missing_samps_dist.begin(); mit != missing_samps_dist.end(); mit++)
- log_fh << mit->first << "\t" << mit->second << "\n";
+ log_fh << mit->first << "\t" << mit->second << "\n";
log_fh << "\n";
return 0;
@@ -1246,8 +1246,8 @@ merge_shared_cutsite_loci(map<int, CSLocus *> &catalog,
phaset
merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
- set<int> &loci_to_destroy,
- ofstream &log_fh)
+ set<int> &loci_to_destroy,
+ ofstream &log_fh)
{
Datum **d_1 = pmap->locus(cur->id);
Datum **d_2 = pmap->locus(next->id);
@@ -1273,44 +1273,44 @@ merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
// recorded as "consensus." Check that condition before we start merging.
//
if (cur->snps.size() > 0 && next->snps.size() > 0)
- merge_type = 0;
+ merge_type = 0;
else if (cur->snps.size() == 0)
- merge_type = 1;
+ merge_type = 1;
else if (next->snps.size() == 0)
- merge_type = 2;
+ merge_type = 2;
else
- merge_type = 3;
+ merge_type = 3;
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d_1[i] == NULL || d_2[i] == NULL)
- continue;
- else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1)
- continue;
- else {
- for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
- for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
- switch (merge_type) {
- case 0:
- merged_hap = string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]);
- break;
- case 1:
- merged_hap = string(d_2[i]->obshap[k]);
- break;
- case 2:
- merged_hap = string(d_1[i]->obshap[j]);
- break;
- case 3:
- default:
- merged_hap = "consensus";
- break;
- }
- phased_haplotypes.insert(merged_hap);
- // cerr << "Phasing: '" << d_1[i]->obshap[j] << "' + '" << d_2[i]->obshap[k] << "' => '" << merged_hap << "'\n";
- }
- }
- phased_sample_cnt++;
- sample_cnt++;
- }
+ if (d_1[i] == NULL || d_2[i] == NULL)
+ continue;
+ else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1)
+ continue;
+ else {
+ for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
+ for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
+ switch (merge_type) {
+ case 0:
+ merged_hap = string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]);
+ break;
+ case 1:
+ merged_hap = string(d_2[i]->obshap[k]);
+ break;
+ case 2:
+ merged_hap = string(d_1[i]->obshap[j]);
+ break;
+ case 3:
+ default:
+ merged_hap = "consensus";
+ break;
+ }
+ phased_haplotypes.insert(merged_hap);
+ // cerr << "Phasing: '" << d_1[i]->obshap[j] << "' + '" << d_2[i]->obshap[k] << "' => '" << merged_hap << "'\n";
+ }
+ }
+ phased_sample_cnt++;
+ sample_cnt++;
+ }
}
//
@@ -1322,104 +1322,104 @@ merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
// Now we need to check if we can phase the remaining haplotypes.
//
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d_1[i] == NULL || d_2[i] == NULL)
- continue;
- else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1) {
- // cerr << "Attempting to phase individual " << i << ": " << d_1[i]->id << " / " << d_2[i]->id << "\n";
-
- sample_cnt++;
- //
- // We should be able to find a sinlge phasing mapping for each haplotype from d_1 to d_2
- // that includes all the haplotypes in these two loci.
- //
- vector<pair<char *, char *> > seen_phased;
- uint tot_obshap = d_1[i]->obshap.size() + d_2[i]->obshap.size();
- uint phased_cnt = 0;
- for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
- for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
- // cerr << " " << d_1[i]->obshap[j] << " + " << d_2[i]->obshap[k];
- //
- // Record each pair of haplotypes that has been seen phased previously.
- //
- if (phased_haplotypes.count(string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]))) {
- seen_phased.push_back(make_pair(d_1[i]->obshap[j], d_2[i]->obshap[k]));
- // cerr << " => " << d_1[i]->obshap[j] << d_2[i]->obshap[k];
- }
- // cerr << "\n";
- }
- }
- //
- // Now, we will iterate over all sets of phased haplotypes and look
- // for combinations that use all four individual haplotypes.
- //
- for (uint j = 0; j < seen_phased.size(); j++) {
- for (uint k = j; k < seen_phased.size(); k++) {
- set<char *> incorporated_haplotypes;
- //
- // Count the number of distinct char pointers. If this combination
- // of haplotypes includes all unphased haplotypes, count it.
- //
- incorporated_haplotypes.insert(seen_phased[j].first);
- incorporated_haplotypes.insert(seen_phased[j].second);
- incorporated_haplotypes.insert(seen_phased[k].first);
- incorporated_haplotypes.insert(seen_phased[k].second);
- if (incorporated_haplotypes.size() == tot_obshap)
- phased_cnt++;
- }
- }
-
- //
- // If one pair of haplotypes is mapped, but the other is not, assume the second pair or
- // haplotypes must be phased by process of elimination.
- //
- if (phased_cnt == 0 && seen_phased.size() == 1) {
- h_1 = seen_phased[0].first == d_1[i]->obshap[1] ?
- d_1[i]->obshap[0] : d_1[i]->obshap[1];
- h_2 = seen_phased[0].second == d_2[i]->obshap[1] ?
- d_2[i]->obshap[0] : d_2[i]->obshap[1];
- phased_haplotypes.insert(string(h_1) + string(h_2));
- phased_cnt++;
- // cerr << " Phasing: '" << hap_1 << "' + '" << hap_2 << "' => '" << string(hap_1) + string(hap_2) << "'\n";
- }
-
- if (phased_cnt == 0) {
- phased_results.insert(nomapping_fail);
- if (verbose) log_fh << " Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " no mapping found.\n";
- } else if (phased_cnt == 1) {
- phased_sample_cnt++;
- phased_results.insert(complex_phase);
- } else {
- phased_results.insert(multimapping_fail);
- if (verbose) log_fh << " Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " multiple mappings found.\n";
- }
- }
+ if (d_1[i] == NULL || d_2[i] == NULL)
+ continue;
+ else if (d_1[i]->obshap.size() > 1 && d_2[i]->obshap.size() > 1) {
+ // cerr << "Attempting to phase individual " << i << ": " << d_1[i]->id << " / " << d_2[i]->id << "\n";
+
+ sample_cnt++;
+ //
+	    // We should be able to find a single phasing mapping for each haplotype from d_1 to d_2
+ // that includes all the haplotypes in these two loci.
+ //
+ vector<pair<char *, char *> > seen_phased;
+ uint tot_obshap = d_1[i]->obshap.size() + d_2[i]->obshap.size();
+ uint phased_cnt = 0;
+ for (uint j = 0; j < d_1[i]->obshap.size(); j++) {
+ for (uint k = 0; k < d_2[i]->obshap.size(); k++) {
+ // cerr << " " << d_1[i]->obshap[j] << " + " << d_2[i]->obshap[k];
+ //
+ // Record each pair of haplotypes that has been seen phased previously.
+ //
+ if (phased_haplotypes.count(string(d_1[i]->obshap[j]) + string(d_2[i]->obshap[k]))) {
+ seen_phased.push_back(make_pair(d_1[i]->obshap[j], d_2[i]->obshap[k]));
+ // cerr << " => " << d_1[i]->obshap[j] << d_2[i]->obshap[k];
+ }
+ // cerr << "\n";
+ }
+ }
+ //
+ // Now, we will iterate over all sets of phased haplotypes and look
+ // for combinations that use all four individual haplotypes.
+ //
+ for (uint j = 0; j < seen_phased.size(); j++) {
+ for (uint k = j; k < seen_phased.size(); k++) {
+ set<char *> incorporated_haplotypes;
+ //
+ // Count the number of distinct char pointers. If this combination
+ // of haplotypes includes all unphased haplotypes, count it.
+ //
+ incorporated_haplotypes.insert(seen_phased[j].first);
+ incorporated_haplotypes.insert(seen_phased[j].second);
+ incorporated_haplotypes.insert(seen_phased[k].first);
+ incorporated_haplotypes.insert(seen_phased[k].second);
+ if (incorporated_haplotypes.size() == tot_obshap)
+ phased_cnt++;
+ }
+ }
+
+ //
+ // If one pair of haplotypes is mapped, but the other is not, assume the second pair or
+ // haplotypes must be phased by process of elimination.
+ //
+ if (phased_cnt == 0 && seen_phased.size() == 1) {
+ h_1 = seen_phased[0].first == d_1[i]->obshap[1] ?
+ d_1[i]->obshap[0] : d_1[i]->obshap[1];
+ h_2 = seen_phased[0].second == d_2[i]->obshap[1] ?
+ d_2[i]->obshap[0] : d_2[i]->obshap[1];
+ phased_haplotypes.insert(string(h_1) + string(h_2));
+ phased_cnt++;
+ // cerr << " Phasing: '" << hap_1 << "' + '" << hap_2 << "' => '" << string(hap_1) + string(hap_2) << "'\n";
+ }
+
+ if (phased_cnt == 0) {
+ phased_results.insert(nomapping_fail);
+ if (verbose) log_fh << " Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " no mapping found.\n";
+ } else if (phased_cnt == 1) {
+ phased_sample_cnt++;
+ phased_results.insert(complex_phase);
+ } else {
+ phased_results.insert(multimapping_fail);
+ if (verbose) log_fh << " Locus NOT phased in individual " << i << "; loci " << d_1[i]->id << " / " << d_2[i]->id << " multiple mappings found.\n";
+ }
+ }
}
if (phased_sample_cnt != sample_cnt) {
- if (phased_results.count(nomapping_fail) > 0 &&
- phased_results.count(multimapping_fail) > 0)
- return multiple_fails;
- else if (phased_results.count(nomapping_fail) > 0)
- return nomapping_fail;
- else if (phased_results.count(multimapping_fail) > 0)
- return multimapping_fail;
- else {
- cerr << "WE SHOULD NOT GET HERE\n";
- return merge_failure;
- }
+ if (phased_results.count(nomapping_fail) > 0 &&
+ phased_results.count(multimapping_fail) > 0)
+ return multiple_fails;
+ else if (phased_results.count(nomapping_fail) > 0)
+ return nomapping_fail;
+ else if (phased_results.count(multimapping_fail) > 0)
+ return multimapping_fail;
+ else {
+ cerr << "WE SHOULD NOT GET HERE\n";
+ return merge_failure;
+ }
}
//
// Okay, merge these two loci together.
//
if (!merge_datums(pmap->sample_cnt(), cur->len, d_1, d_2, phased_haplotypes, merge_type))
- return merge_failure;
+ return merge_failure;
//
// Merge the catalog entries together.
//
if (!merge_csloci(cur, next, phased_haplotypes))
- return merge_failure;
+ return merge_failure;
//
// Mark the merged locus for destruction.
@@ -1427,7 +1427,7 @@ merge_and_phase_loci(PopMap<CSLocus> *pmap, CSLocus *cur, CSLocus *next,
loci_to_destroy.insert(next->id);
if (phased_results.count(complex_phase) > 0)
- return complex_phase;
+ return complex_phase;
return simple_merge;
}
@@ -1445,18 +1445,18 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
// enumerated on the positive strand. Complement the alleles as well.
//
for (uint j = 0; j < sink->snps.size(); j++) {
- sink->snps[j]->col = sink->len - sink->snps[j]->col - 1;
- sink->snps[j]->rank_1 = reverse(sink->snps[j]->rank_1);
- sink->snps[j]->rank_2 = reverse(sink->snps[j]->rank_2);
- sink->snps[j]->rank_3 = reverse(sink->snps[j]->rank_3);
- sink->snps[j]->rank_4 = reverse(sink->snps[j]->rank_4);
+ sink->snps[j]->col = sink->len - sink->snps[j]->col - 1;
+ sink->snps[j]->rank_1 = reverse(sink->snps[j]->rank_1);
+ sink->snps[j]->rank_2 = reverse(sink->snps[j]->rank_2);
+ sink->snps[j]->rank_3 = reverse(sink->snps[j]->rank_3);
+ sink->snps[j]->rank_4 = reverse(sink->snps[j]->rank_4);
}
//
// 2. Adjust the SNP coordinates in the src locus to account for the now, longer length.
//
for (uint j = 0; j < src->snps.size(); j++)
- src->snps[j]->col = sink->len + src->snps[j]->col - renz_olap[enz];
+ src->snps[j]->col = sink->len + src->snps[j]->col - renz_olap[enz];
//
// 3. Combine SNPs between the two catalog loci: add the SNPs from the sink (formerly on the
@@ -1464,12 +1464,12 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
//
vector<SNP *> tmpsnp;
for (int j = (int) sink->snps.size() - 1; j >= 0; j--)
- tmpsnp.push_back(sink->snps[j]);
+ tmpsnp.push_back(sink->snps[j]);
for (uint j = 0; j < src->snps.size(); j++)
- tmpsnp.push_back(src->snps[j]);
+ tmpsnp.push_back(src->snps[j]);
sink->snps.clear();
for (uint j = 0; j < tmpsnp.size(); j++)
- sink->snps.push_back(tmpsnp[j]);
+ sink->snps.push_back(tmpsnp[j]);
//
// 4. Adjust the genomic location of the sink locus.
@@ -1502,29 +1502,29 @@ merge_csloci(CSLocus *sink, CSLocus *src, set<string> &phased_haplotypes)
sink->alleles.clear();
set<string>::iterator it;
for (it = phased_haplotypes.begin(); it != phased_haplotypes.end(); it++)
- sink->alleles[*it] = 0;
+ sink->alleles[*it] = 0;
// cerr << "CSLocus " << sink->id << ":\n"
- // << "Length: " << sink->len << "; Chr: " << sink->loc.chr << "; BP: " << sink->sort_bp() << "; strand: " << (sink->loc.strand == plus ? "+" : "-") << "\n"
- // << " SNPs:\n";
+ // << "Length: " << sink->len << "; Chr: " << sink->loc.chr << "; BP: " << sink->sort_bp() << "; strand: " << (sink->loc.strand == plus ? "+" : "-") << "\n"
+ // << " SNPs:\n";
// for (uint j = 0; j < sink->snps.size(); j++)
- // cerr << " Col: " << sink->snps[j]->col
- // << " Rank 1: " << sink->snps[j]->rank_1
- // << " Rank 2: " << sink->snps[j]->rank_2 << "\n";
+ // cerr << " Col: " << sink->snps[j]->col
+ // << " Rank 1: " << sink->snps[j]->rank_1
+ // << " Rank 2: " << sink->snps[j]->rank_2 << "\n";
// cerr << " Alleles:\n";
// map<string, int>::iterator ait;
// for (ait = sink->alleles.begin(); ait != sink->alleles.end(); ait++)
- // cerr << " " << ait->first << "\n";
+ // cerr << " " << ait->first << "\n";
return 1;
}
int
merge_datums(int sample_cnt,
- int sink_locus_len,
- Datum **sink, Datum **src,
- set<string> &phased_haplotypes,
- int merge_type)
+ int sink_locus_len,
+ Datum **sink, Datum **src,
+ set<string> &phased_haplotypes,
+ int merge_type)
{
char tmphap[id_len], *new_hap;
uint haplen, model_len, offset;
@@ -1538,52 +1538,52 @@ merge_datums(int sample_cnt,
// -The sink datum is assumed to be on the negative strand.
//
for (int i = 0; i < sample_cnt; i++) {
- if (sink[i] == NULL && src[i] == NULL)
- continue;
- else if (sink[i] == NULL || src[i] == NULL)
- cerr << "Unexpected condition in merging datums: one datum is NULL while the other is not.\n";
-
- //
- // 1. Reverse complement the SNP coordinates in the sink locus so that they are
- // enumerated on the positive strand. Complement the alleles as well.
- //
- for (uint j = 0; j < sink[i]->snps.size(); j++) {
- sink[i]->snps[j]->col = sink[i]->len - sink[i]->snps[j]->col - 1;
- sink[i]->snps[j]->rank_1 = reverse(sink[i]->snps[j]->rank_1);
- sink[i]->snps[j]->rank_2 = reverse(sink[i]->snps[j]->rank_2);
- sink[i]->snps[j]->rank_3 = reverse(sink[i]->snps[j]->rank_3);
- sink[i]->snps[j]->rank_4 = reverse(sink[i]->snps[j]->rank_4);
- }
-
- //
- // 2. Adjust the SNP coordinates in the src locus to account for the now, longer length.
- //
- for (uint j = 0; j < src[i]->snps.size(); j++)
- src[i]->snps[j]->col = sink[i]->len + src[i]->snps[j]->col - renz_olap[enz];
-
- //
- // 3. Reverse complement the observed haplotypes in the sink locus.
- //
- haplen = strlen(sink[i]->obshap[0]);
- for (uint j = 0; j < sink[i]->obshap.size(); j++) {
- for (uint k = 0; k < haplen; k++)
- tmphap[k] = reverse(sink[i]->obshap[j][haplen - k - 1]);
- tmphap[haplen] = '\0';
- strcpy(sink[i]->obshap[j], tmphap);
- }
-
- //
- // 4. Combine SNPs between the two datums: add the SNPs from the sink (formerly on the
- // negative strand) in reverse order, followed by the SNPs from the src.
- //
- tmpsnp.clear();
- for (int j = (int) sink[i]->snps.size() - 1; j >= 0; j--)
- tmpsnp.push_back(sink[i]->snps[j]);
- for (uint j = 0; j < src[i]->snps.size(); j++)
- tmpsnp.push_back(src[i]->snps[j]);
- sink[i]->snps.clear();
- for (uint j = 0; j < tmpsnp.size(); j++)
- sink[i]->snps.push_back(tmpsnp[j]);
+ if (sink[i] == NULL && src[i] == NULL)
+ continue;
+ else if (sink[i] == NULL || src[i] == NULL)
+ cerr << "Unexpected condition in merging datums: one datum is NULL while the other is not.\n";
+
+ //
+ // 1. Reverse complement the SNP coordinates in the sink locus so that they are
+ // enumerated on the positive strand. Complement the alleles as well.
+ //
+ for (uint j = 0; j < sink[i]->snps.size(); j++) {
+ sink[i]->snps[j]->col = sink[i]->len - sink[i]->snps[j]->col - 1;
+ sink[i]->snps[j]->rank_1 = reverse(sink[i]->snps[j]->rank_1);
+ sink[i]->snps[j]->rank_2 = reverse(sink[i]->snps[j]->rank_2);
+ sink[i]->snps[j]->rank_3 = reverse(sink[i]->snps[j]->rank_3);
+ sink[i]->snps[j]->rank_4 = reverse(sink[i]->snps[j]->rank_4);
+ }
+
+ //
+ // 2. Adjust the SNP coordinates in the src locus to account for the now, longer length.
+ //
+ for (uint j = 0; j < src[i]->snps.size(); j++)
+ src[i]->snps[j]->col = sink[i]->len + src[i]->snps[j]->col - renz_olap[enz];
+
+ //
+ // 3. Reverse complement the observed haplotypes in the sink locus.
+ //
+ haplen = strlen(sink[i]->obshap[0]);
+ for (uint j = 0; j < sink[i]->obshap.size(); j++) {
+ for (uint k = 0; k < haplen; k++)
+ tmphap[k] = reverse(sink[i]->obshap[j][haplen - k - 1]);
+ tmphap[haplen] = '\0';
+ strcpy(sink[i]->obshap[j], tmphap);
+ }
+
+ //
+ // 4. Combine SNPs between the two datums: add the SNPs from the sink (formerly on the
+ // negative strand) in reverse order, followed by the SNPs from the src.
+ //
+ tmpsnp.clear();
+ for (int j = (int) sink[i]->snps.size() - 1; j >= 0; j--)
+ tmpsnp.push_back(sink[i]->snps[j]);
+ for (uint j = 0; j < src[i]->snps.size(); j++)
+ tmpsnp.push_back(src[i]->snps[j]);
+ sink[i]->snps.clear();
+ for (uint j = 0; j < tmpsnp.size(); j++)
+ sink[i]->snps.push_back(tmpsnp[j]);
}
//
@@ -1594,93 +1594,93 @@ merge_datums(int sample_cnt,
vector<int> to_be_phased;
phased_haplotypes.clear();
for (int i = 0; i < sample_cnt; i++) {
- if (sink[i] == NULL && src[i] == NULL)
- continue;
-
- if (sink[i]->obshap.size() > 1 && src[i]->obshap.size() > 1) {
- to_be_phased.push_back(i);
- continue;
- } else {
- tmpobshap.clear();
- tmpobsdep.clear();
- for (uint j = 0; j < sink[i]->obshap.size(); j++) {
- for (uint k = 0; k < src[i]->obshap.size(); k++) {
- switch (merge_type) {
- case 0:
- merged_hap = string(sink[i]->obshap[j]) + string(src[i]->obshap[k]);
- break;
- case 1:
- merged_hap = string(src[i]->obshap[j]);
- break;
- case 2:
- merged_hap = string(sink[i]->obshap[j]);
- break;
- case 3:
- default:
- merged_hap = "consensus";
- break;
- }
- phased_haplotypes.insert(merged_hap);
- tmpobshap.push_back(merged_hap);
- tmpobsdep.push_back((sink[i]->depth[j] + src[i]->depth[k]) / 2);
- }
- }
- sink[i]->depth.clear();
- for (uint j = 0; j < sink[i]->obshap.size(); j++)
- delete [] sink[i]->obshap[j];
- sink[i]->obshap.clear();
- for (uint j = 0; j < tmpobshap.size(); j++) {
- new_hap = new char[tmpobshap[j].length() + 1];
- strcpy(new_hap, tmpobshap[j].c_str());
- sink[i]->obshap.push_back(new_hap);
- sink[i]->depth.push_back(tmpobsdep[j]);
- }
- }
+ if (sink[i] == NULL && src[i] == NULL)
+ continue;
+
+ if (sink[i]->obshap.size() > 1 && src[i]->obshap.size() > 1) {
+ to_be_phased.push_back(i);
+ continue;
+ } else {
+ tmpobshap.clear();
+ tmpobsdep.clear();
+ for (uint j = 0; j < sink[i]->obshap.size(); j++) {
+ for (uint k = 0; k < src[i]->obshap.size(); k++) {
+ switch (merge_type) {
+ case 0:
+ merged_hap = string(sink[i]->obshap[j]) + string(src[i]->obshap[k]);
+ break;
+ case 1:
+ merged_hap = string(src[i]->obshap[j]);
+ break;
+ case 2:
+ merged_hap = string(sink[i]->obshap[j]);
+ break;
+ case 3:
+ default:
+ merged_hap = "consensus";
+ break;
+ }
+ phased_haplotypes.insert(merged_hap);
+ tmpobshap.push_back(merged_hap);
+ tmpobsdep.push_back((sink[i]->depth[j] + src[i]->depth[k]) / 2);
+ }
+ }
+ sink[i]->depth.clear();
+ for (uint j = 0; j < sink[i]->obshap.size(); j++)
+ delete [] sink[i]->obshap[j];
+ sink[i]->obshap.clear();
+ for (uint j = 0; j < tmpobshap.size(); j++) {
+ new_hap = new char[tmpobshap[j].length() + 1];
+ strcpy(new_hap, tmpobshap[j].c_str());
+ sink[i]->obshap.push_back(new_hap);
+ sink[i]->depth.push_back(tmpobsdep[j]);
+ }
+ }
}
//
// 5.2 Phase and combine the haplotypes from the remaining samples.
//
int index;
for (uint i = 0; i < to_be_phased.size(); i++) {
- index = to_be_phased[i];
- tmpobshap.clear();
- tmpobsdep.clear();
-
- vector<pair<char *, char *> > seen_phased;
- uint tot_obshap = sink[index]->obshap.size() + src[index]->obshap.size();
-
- for (uint j = 0; j < sink[index]->obshap.size(); j++) {
- for (uint k = 0; k < src[index]->obshap.size(); k++) {
- if (phased_haplotypes.count(string(sink[index]->obshap[j]) + string(src[index]->obshap[k])))
- seen_phased.push_back(make_pair(sink[index]->obshap[j], src[index]->obshap[k]));
- }
- }
-
- for (uint j = 0; j < seen_phased.size(); j++) {
- for (uint k = j; k < seen_phased.size(); k++) {
- set<char *> incorporated_haplotypes;
- incorporated_haplotypes.insert(seen_phased[j].first);
- incorporated_haplotypes.insert(seen_phased[j].second);
- incorporated_haplotypes.insert(seen_phased[k].first);
- incorporated_haplotypes.insert(seen_phased[k].second);
- if (incorporated_haplotypes.size() == tot_obshap) {
- tmpobshap.push_back(string(seen_phased[j].first) + string(seen_phased[j].second));
- tmpobshap.push_back(string(seen_phased[k].first) + string(seen_phased[k].second));
- //tmpobsdep.push_back((sink[index]->depth[j] + src[index]->depth[k]) / 2);
- }
- }
- }
-
- sink[index]->depth.clear();
- for (uint j = 0; j < sink[index]->obshap.size(); j++)
- delete [] sink[index]->obshap[j];
- sink[index]->obshap.clear();
- for (uint j = 0; j < tmpobshap.size(); j++) {
- new_hap = new char[tmpobshap[j].length() + 1];
- strcpy(new_hap, tmpobshap[j].c_str());
- sink[index]->obshap.push_back(new_hap);
- // sink[index]->depth.push_back(tmpobsdep[j]);
- }
+ index = to_be_phased[i];
+ tmpobshap.clear();
+ tmpobsdep.clear();
+
+ vector<pair<char *, char *> > seen_phased;
+ uint tot_obshap = sink[index]->obshap.size() + src[index]->obshap.size();
+
+ for (uint j = 0; j < sink[index]->obshap.size(); j++) {
+ for (uint k = 0; k < src[index]->obshap.size(); k++) {
+ if (phased_haplotypes.count(string(sink[index]->obshap[j]) + string(src[index]->obshap[k])))
+ seen_phased.push_back(make_pair(sink[index]->obshap[j], src[index]->obshap[k]));
+ }
+ }
+
+ for (uint j = 0; j < seen_phased.size(); j++) {
+ for (uint k = j; k < seen_phased.size(); k++) {
+ set<char *> incorporated_haplotypes;
+ incorporated_haplotypes.insert(seen_phased[j].first);
+ incorporated_haplotypes.insert(seen_phased[j].second);
+ incorporated_haplotypes.insert(seen_phased[k].first);
+ incorporated_haplotypes.insert(seen_phased[k].second);
+ if (incorporated_haplotypes.size() == tot_obshap) {
+ tmpobshap.push_back(string(seen_phased[j].first) + string(seen_phased[j].second));
+ tmpobshap.push_back(string(seen_phased[k].first) + string(seen_phased[k].second));
+ //tmpobsdep.push_back((sink[index]->depth[j] + src[index]->depth[k]) / 2);
+ }
+ }
+ }
+
+ sink[index]->depth.clear();
+ for (uint j = 0; j < sink[index]->obshap.size(); j++)
+ delete [] sink[index]->obshap[j];
+ sink[index]->obshap.clear();
+ for (uint j = 0; j < tmpobshap.size(); j++) {
+ new_hap = new char[tmpobshap[j].length() + 1];
+ strcpy(new_hap, tmpobshap[j].c_str());
+ sink[index]->obshap.push_back(new_hap);
+ // sink[index]->depth.push_back(tmpobsdep[j]);
+ }
}
//
@@ -1690,38 +1690,38 @@ merge_datums(int sample_cnt,
char *p;
for (int i = 0; i < sample_cnt; i++) {
- if (sink[i] == NULL && src[i] == NULL)
- continue;
-
- //
- // Merge the two strings of model calls together.
- // We need to check if the locus for this individual is shorter than the catalog
- // locus. If so, we need to expand out the model call array to be the proper length.
- //
- reverse_string(sink[i]->model);
- offset = 0;
- model_calls.clear();
- if (sink_locus_len > sink[i]->len) {
- offset = sink_locus_len - sink[i]->len;
- model_calls.assign(offset, 'N');
- }
- model_len = offset + sink[i]->len + src[i]->len - renz_olap[enz];
- model_calls.append(sink[i]->model);
- delete [] sink[i]->model;
- sink[i]->model = new char[model_len + 1];
- strcpy(sink[i]->model, model_calls.c_str());
- p = sink[i]->model;
- p += offset + sink[i]->len - renz_olap[enz];
- strcpy(p, src[i]->model);
-
- sink[i]->len = model_len;
- sink[i]->tot_depth = (sink[i]->tot_depth + src[i]->tot_depth) / 2;
- sink[i]->lnl = (sink[i]->lnl + src[i]->lnl) / 2.0;
-
- //
- // Record which datum was merged into this one.
- //
- sink[i]->merge_partner = src[i]->id;
+ if (sink[i] == NULL && src[i] == NULL)
+ continue;
+
+ //
+ // Merge the two strings of model calls together.
+ // We need to check if the locus for this individual is shorter than the catalog
+ // locus. If so, we need to expand out the model call array to be the proper length.
+ //
+ reverse_string(sink[i]->model);
+ offset = 0;
+ model_calls.clear();
+ if (sink_locus_len > sink[i]->len) {
+ offset = sink_locus_len - sink[i]->len;
+ model_calls.assign(offset, 'N');
+ }
+ model_len = offset + sink[i]->len + src[i]->len - renz_olap[enz];
+ model_calls.append(sink[i]->model);
+ delete [] sink[i]->model;
+ sink[i]->model = new char[model_len + 1];
+ strcpy(sink[i]->model, model_calls.c_str());
+ p = sink[i]->model;
+ p += offset + sink[i]->len - renz_olap[enz];
+ strcpy(p, src[i]->model);
+
+ sink[i]->len = model_len;
+ sink[i]->tot_depth = (sink[i]->tot_depth + src[i]->tot_depth) / 2;
+ sink[i]->lnl = (sink[i]->lnl + src[i]->lnl) / 2.0;
+
+ //
+ // Record which datum was merged into this one.
+ //
+ sink[i]->merge_partner = src[i]->id;
}
return 1;
@@ -1729,8 +1729,8 @@ merge_datums(int sample_cnt,
int
datum_adjust_snp_positions(map<int, pair<merget, int> > &merge_map,
- CSLocus *loc, Datum *datum,
- map<int, SNPRes *> &snpres)
+ CSLocus *loc, Datum *datum,
+ map<int, SNPRes *> &snpres)
{
//
// We will start with the 'sink' locus, which was originally on the negative strand:
@@ -1748,25 +1748,25 @@ datum_adjust_snp_positions(map<int, pair<merget, int> > &merge_map,
// if it was necessary when originally merging.
//
while (datum->model[index] == 'N') {
- snp = new SNP;
- snp->col = index;
- snp->lratio = 0.0;
- snp->rank_1 = 'N';
- snp->type = snp_type_unk;
- datum->snps.push_back(snp);
- index++;
+ snp = new SNP;
+ snp->col = index;
+ snp->lratio = 0.0;
+ snp->rank_1 = 'N';
+ snp->type = snp_type_unk;
+ datum->snps.push_back(snp);
+ index++;
}
for (int j = snpr->snps.size() - 1; j > stop_pos; j--) {
- snp = new SNP;
- snp->col = index;
- snp->lratio = snpr->snps[j]->lratio;
- snp->rank_1 = reverse(snpr->snps[j]->rank_1);
- snp->rank_2 = reverse(snpr->snps[j]->rank_2);
- snp->rank_3 = reverse(snpr->snps[j]->rank_3);
- snp->rank_4 = reverse(snpr->snps[j]->rank_4);
- datum->snps.push_back(snp);
- index++;
+ snp = new SNP;
+ snp->col = index;
+ snp->lratio = snpr->snps[j]->lratio;
+ snp->rank_1 = reverse(snpr->snps[j]->rank_1);
+ snp->rank_2 = reverse(snpr->snps[j]->rank_2);
+ snp->rank_3 = reverse(snpr->snps[j]->rank_3);
+ snp->rank_4 = reverse(snpr->snps[j]->rank_4);
+ datum->snps.push_back(snp);
+ index++;
}
//
@@ -1776,15 +1776,15 @@ datum_adjust_snp_positions(map<int, pair<merget, int> > &merge_map,
snpr = snpres[datum->merge_partner];
for (uint j = 0; j < snpres[datum->id]->snps.size(); j++) {
- snp = new SNP;
- snp->col = index;
- snp->lratio = snpr->snps[j]->lratio;
- snp->rank_1 = snpr->snps[j]->rank_1;
- snp->rank_2 = snpr->snps[j]->rank_2;
- snp->rank_3 = snpr->snps[j]->rank_3;
- snp->rank_4 = snpr->snps[j]->rank_4;
- datum->snps.push_back(snp);
- index++;
+ snp = new SNP;
+ snp->col = index;
+ snp->lratio = snpr->snps[j]->lratio;
+ snp->rank_1 = snpr->snps[j]->rank_1;
+ snp->rank_2 = snpr->snps[j]->rank_2;
+ snp->rank_3 = snpr->snps[j]->rank_3;
+ snp->rank_4 = snpr->snps[j]->rank_4;
+ datum->snps.push_back(snp);
+ index++;
}
return 0;
@@ -1803,8 +1803,8 @@ create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
//cerr << "Creating genotype map for catalog ID " << locus->id << ", marker: " << locus->marker << ".\n";
char gtypes[26] ={'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
- 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
- 'u', 'v', 'w', 'x', 'y', 'z'};
+ 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+ 'u', 'v', 'w', 'x', 'y', 'z'};
Datum **d;
map<string, int> haplotypes;
@@ -1815,9 +1815,9 @@ create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] != NULL)
- for (uint n = 0; n < d[i]->obshap.size(); n++)
- haplotypes[d[i]->obshap[n]]++;
+ if (d[i] != NULL)
+ for (uint n = 0; n < d[i]->obshap.size(); n++)
+ haplotypes[d[i]->obshap[n]]++;
}
//
@@ -1829,71 +1829,71 @@ create_genotype_map(CSLocus *locus, PopMap<CSLocus> *pmap)
// Sort the haplotypes map by value
//
for (k = haplotypes.begin(); k != haplotypes.end(); k++)
- sorted_haplotypes.push_back(*k);
+ sorted_haplotypes.push_back(*k);
sort(sorted_haplotypes.begin(), sorted_haplotypes.end(), hap_compare);
for (uint n = 0, index = 0; n < sorted_haplotypes.size() && index <= 26; n++, index++) {
- locus->gmap[sorted_haplotypes[n].first] = gtypes[index];
- //cerr << "GMAP: " << sorted_haplotypes[n].first << " == " << gtypes[index] << "\n";
+ locus->gmap[sorted_haplotypes[n].first] = gtypes[index];
+ //cerr << "GMAP: " << sorted_haplotypes[n].first << " == " << gtypes[index] << "\n";
}
return 0;
}
int call_population_genotypes(CSLocus *locus,
- PopMap<CSLocus> *pmap) {
+ PopMap<CSLocus> *pmap) {
//
// Fetch the array of observed haplotypes from the population
//
Datum **d = pmap->locus(locus->id);
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL)
- continue;
-
- vector<string> gtypes;
- string gtype;
-
- //cerr << "Sample Id: " << pmap->rev_sample_index(i) << "\n";
-
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- //
- // Impossible allele encountered.
- //
- if (locus->gmap.count(d[i]->obshap[j]) == 0) {
- gtypes.clear();
- gtypes.push_back("-");
- goto impossible;
- }
-
- gtypes.push_back(locus->gmap[d[i]->obshap[j]]);
- //cerr << " Observed Haplotype: " << d[i]->obshap[j] << ", Genotype: " << locus->gmap[d[i]->obshap[j]] << "\n";
- }
+ if (d[i] == NULL)
+ continue;
+
+ vector<string> gtypes;
+ string gtype;
+
+ //cerr << "Sample Id: " << pmap->rev_sample_index(i) << "\n";
+
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ //
+ // Impossible allele encountered.
+ //
+ if (locus->gmap.count(d[i]->obshap[j]) == 0) {
+ gtypes.clear();
+ gtypes.push_back("-");
+ goto impossible;
+ }
+
+ gtypes.push_back(locus->gmap[d[i]->obshap[j]]);
+ //cerr << " Observed Haplotype: " << d[i]->obshap[j] << ", Genotype: " << locus->gmap[d[i]->obshap[j]] << "\n";
+ }
impossible:
- sort(gtypes.begin(), gtypes.end());
- for (uint j = 0; j < gtypes.size(); j++) {
- gtype += gtypes[j];
- //cerr << " Adding genotype to string: " << gtypes[j] << "; " << gtype << "\n";
- }
+ sort(gtypes.begin(), gtypes.end());
+ for (uint j = 0; j < gtypes.size(); j++) {
+ gtype += gtypes[j];
+ //cerr << " Adding genotype to string: " << gtypes[j] << "; " << gtype << "\n";
+ }
- string m = gtype.length() == 1 ?
- gtype + gtype : gtype;
+ string m = gtype.length() == 1 ?
+ gtype + gtype : gtype;
- d[i]->gtype = new char[m.length() + 1];
- strcpy(d[i]->gtype, m.c_str());
+ d[i]->gtype = new char[m.length() + 1];
+ strcpy(d[i]->gtype, m.c_str());
- if (m != "-")
- locus->gcnt++;
+ if (m != "-")
+ locus->gcnt++;
- //cerr << "Assigning datum, marker: " << locus->marker << ", string: " << m << ", haplotype: " << d[i]->obshap[0] << ", gtype: " << gtype << "\n";
+ //cerr << "Assigning datum, marker: " << locus->marker << ", string: " << m << ", haplotype: " << d[i]->obshap[0] << ", gtype: " << gtype << "\n";
}
return 0;
}
int tally_haplotype_freq(CSLocus *locus, PopMap<CSLocus> *pmap,
- int &total, double &max, string &freq_str) {
+ int &total, double &max, string &freq_str) {
map<string, double> freq;
Datum **d = pmap->locus(locus->id);
@@ -1904,27 +1904,27 @@ int tally_haplotype_freq(CSLocus *locus, PopMap<CSLocus> *pmap,
//cerr << "Examining marker: " << locus->id << "\n";
for (int i = 0; i < pmap->sample_cnt(); i++) {
- if (d[i] == NULL) continue;
+ if (d[i] == NULL) continue;
- //cerr << " Sample: " << i << "; Haplotype: " << d[i]->obshap[0] << "; Genotype: " << d[i]->gtype << "\n";
- if (d[i]->gtype[0] != '-') {
- freq[d[i]->gtype]++;
- total++;
- }
+ //cerr << " Sample: " << i << "; Haplotype: " << d[i]->obshap[0] << "; Genotype: " << d[i]->gtype << "\n";
+ if (d[i]->gtype[0] != '-') {
+ freq[d[i]->gtype]++;
+ total++;
+ }
}
if (total == 0)
- return 0;
+ return 0;
double frac;
stringstream s;
char f[id_len];
map<string, double>::iterator it;
for (it = freq.begin(); it != freq.end(); it++) {
- frac = (double) it->second / (double) total * 100;
- if (frac > max) max = frac;
- sprintf(f, "(%0.1f%%);", frac);
- s << it->first << ":" << it->second << f;
+ frac = (double) it->second / (double) total * 100;
+ if (frac > max) max = frac;
+ sprintf(f, "(%0.1f%%);", frac);
+ s << it->first << ":" << it->second << f;
}
freq_str = s.str();
@@ -1942,7 +1942,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
if (fh.fail()) {
cerr << "Error opening genomic output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -1953,9 +1953,9 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
int num_loci = 0;
for (cit = catalog.begin(); cit != catalog.end(); cit++) {
- loc = cit->second;
+ loc = cit->second;
- num_loci += loc->len - renz_len[enz];
+ num_loci += loc->len - renz_len[enz];
}
cerr << "Writing " << num_loci << " nucleotide positions to genomic file, '" << file << "'\n";
@@ -1975,68 +1975,68 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
char *p;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint i = 0; i < it->second.size(); i++) {
- loc = it->second[i];
-
- Datum **d = pmap->locus(loc->id);
- set<int> snp_locs;
- string obshap;
-
- for (uint i = 0; i < loc->snps.size(); i++)
- snp_locs.insert(loc->snps[i]->col);
-
- uint start = 0;
- uint end = loc->len;
- //
- // Check for the existence of the restriction enzyme cut site, mask off
- // its output.
- //
- for (uint n = 0; n < rcnt; n++)
- if (strncmp(loc->con, renz[enz][n], rlen) == 0)
- start += renz_len[enz];
- if (start == 0) {
- p = loc->con + (loc->len - rlen);
- for (uint n = rcnt; n < rcnt + rcnt; n++)
- if (strncmp(p, renz[enz][n], rlen) == 0)
- end -= renz_len[enz];
- }
-
- uint k = 0;
- for (uint n = start; n < end; n++) {
- fh << loc->id << "\t" << loc->loc.chr << "\t" << loc->sort_bp(n);
-
- if (snp_locs.count(n) == 0) {
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- a = encode_gtype(loc->con[n]);
- fh << "\t" << encoded_gtypes[a][a];
- }
- } else {
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
-
- if (d[j] == NULL)
- fh << "0";
- else
- switch (d[j]->obshap.size()) {
- case 1:
- a = encode_gtype(d[j]->obshap[0][k]);
- fh << encoded_gtypes[a][a];
- break;
- case 2:
- a = encode_gtype(d[j]->obshap[0][k]);
- b = encode_gtype(d[j]->obshap[1][k]);
- fh << encoded_gtypes[a][b];
- break;
- default:
- fh << "0";
- break;
- }
- }
- k++;
- }
- fh << "\n";
- }
- }
+ for (uint i = 0; i < it->second.size(); i++) {
+ loc = it->second[i];
+
+ Datum **d = pmap->locus(loc->id);
+ set<int> snp_locs;
+ string obshap;
+
+ for (uint i = 0; i < loc->snps.size(); i++)
+ snp_locs.insert(loc->snps[i]->col);
+
+ uint start = 0;
+ uint end = loc->len;
+ //
+ // Check for the existence of the restriction enzyme cut site, mask off
+ // its output.
+ //
+ for (uint n = 0; n < rcnt; n++)
+ if (strncmp(loc->con, renz[enz][n], rlen) == 0)
+ start += renz_len[enz];
+ if (start == 0) {
+ p = loc->con + (loc->len - rlen);
+ for (uint n = rcnt; n < rcnt + rcnt; n++)
+ if (strncmp(p, renz[enz][n], rlen) == 0)
+ end -= renz_len[enz];
+ }
+
+ uint k = 0;
+ for (uint n = start; n < end; n++) {
+ fh << loc->id << "\t" << loc->loc.chr << "\t" << loc->sort_bp(n);
+
+ if (snp_locs.count(n) == 0) {
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ a = encode_gtype(loc->con[n]);
+ fh << "\t" << encoded_gtypes[a][a];
+ }
+ } else {
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ fh << "\t";
+
+ if (d[j] == NULL)
+ fh << "0";
+ else
+ switch (d[j]->obshap.size()) {
+ case 1:
+ a = encode_gtype(d[j]->obshap[0][k]);
+ fh << encoded_gtypes[a][a];
+ break;
+ case 2:
+ a = encode_gtype(d[j]->obshap[0][k]);
+ b = encode_gtype(d[j]->obshap[1][k]);
+ fh << encoded_gtypes[a][b];
+ break;
+ default:
+ fh << "0";
+ break;
+ }
+ }
+ k++;
+ }
+ fh << "\n";
+ }
+ }
}
fh.close();
@@ -2046,7 +2046,7 @@ int write_genomic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap) {
int
calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &pop_indexes,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+ map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
CSLocus *loc;
@@ -2060,8 +2060,8 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
OHaplotypes<LocStat> *ord;
Bootstrap<LocStat> *bs;
if (kernel_smoothed && loci_ordered) {
- ks = new KSmooth<LocStat>(2);
- ord = new OHaplotypes<LocStat>();
+ ks = new KSmooth<LocStat>(2);
+ ord = new OHaplotypes<LocStat>();
}
//
@@ -2074,7 +2074,7 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
cerr << "Error opening haplotype stats file '" << file << "'\n";
- exit(1);
+ exit(1);
}
fh.precision(fieldw);
fh.setf(std::ios::fixed);
@@ -2086,14 +2086,14 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
// Write the population members.
//
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- fh << "# " << pop_key[pit->first] << "\t";
- for (int i = start; i <= end; i++) {
- fh << files[i].second;
- if (i < end) fh << ",";
- }
- fh << "\n";
+ start = pit->second.first;
+ end = pit->second.second;
+ fh << "# " << pop_key[pit->first] << "\t";
+ for (int i = start; i <= end; i++) {
+ fh << files[i].second;
+ if (i < end) fh << ",";
+ }
+ fh << "\n";
}
fh << "# Batch ID " << "\t"
@@ -2115,89 +2115,89 @@ calculate_haplotype_stats(vector<pair<int, string> > &files, map<int, pair<int,
// Iterate over the members of each population.
//
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- pop_id = pit->first;
-
- cerr << "Generating haplotype-level summary statistics for population '" << pop_key[pop_id] << "'\n";
- map<string, vector<LocStat *> > genome_locstats;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-
- if (bootstrap_div)
- bs = new Bootstrap<LocStat>(2);
-
- vector<LocStat *> &locstats = genome_locstats[it->first];
- map<uint, uint> locstats_key;
- ord->order(locstats, locstats_key, it->second);
-
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- d = pmap->locus(loc->id);
-
- if (loc->snps.size() == 0)
- continue;
-
- // cerr << "Looking at locus " << loc->id << "\n";
-
- l = haplotype_diversity(start, end, d);
-
- if (l != NULL) {
- l->loc_id = loc->id;
- l->bp = loc->sort_bp();
- locstats[locstats_key[l->bp]] = l;
- }
- }
-
- if (kernel_smoothed && loci_ordered) {
- cerr << " Generating kernel-smoothed statistics on chromosome " << it->first << "\n";
- ks->smooth(locstats);
- }
-
- if (bootstrap_div)
- bs->add_data(locstats);
- }
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<LocStat *> &locstats = genome_locstats[it->first];
-
- if (bootstrap_div)
- bs->execute(locstats);
-
- //
- // Write results.
- //
- for (uint k = 0; k < locstats.size(); k++) {
- l = locstats[k];
- if (l == NULL) continue;
-
- fh << batch_id << "\t"
- << l->loc_id << "\t"
- << it->first << "\t"
- << l->bp + 1 << "\t"
- << pop_key[pop_id] << "\t"
- << (int) l->alleles << "\t"
- << l->hap_cnt << "\t"
- << l->stat[0] << "\t"
- << l->smoothed[0] << "\t"
- << l->bs[0] << "\t"
- << l->stat[1] << "\t"
- << l->smoothed[1] << "\t"
- << l->bs[1] << "\t"
- << l->hap_str << "\n";
- }
-
- for (uint k = 0; k < locstats.size(); k++)
- delete locstats[k];
- }
-
- if (bootstrap_div)
- delete bs;
+ start = pit->second.first;
+ end = pit->second.second;
+ pop_id = pit->first;
+
+ cerr << "Generating haplotype-level summary statistics for population '" << pop_key[pop_id] << "'\n";
+ map<string, vector<LocStat *> > genome_locstats;
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+
+ if (bootstrap_div)
+ bs = new Bootstrap<LocStat>(2);
+
+ vector<LocStat *> &locstats = genome_locstats[it->first];
+ map<uint, uint> locstats_key;
+ ord->order(locstats, locstats_key, it->second);
+
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ d = pmap->locus(loc->id);
+
+ if (loc->snps.size() == 0)
+ continue;
+
+ // cerr << "Looking at locus " << loc->id << "\n";
+
+ l = haplotype_diversity(start, end, d);
+
+ if (l != NULL) {
+ l->loc_id = loc->id;
+ l->bp = loc->sort_bp();
+ locstats[locstats_key[l->bp]] = l;
+ }
+ }
+
+ if (kernel_smoothed && loci_ordered) {
+ cerr << " Generating kernel-smoothed statistics on chromosome " << it->first << "\n";
+ ks->smooth(locstats);
+ }
+
+ if (bootstrap_div)
+ bs->add_data(locstats);
+ }
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ vector<LocStat *> &locstats = genome_locstats[it->first];
+
+ if (bootstrap_div)
+ bs->execute(locstats);
+
+ //
+ // Write results.
+ //
+ for (uint k = 0; k < locstats.size(); k++) {
+ l = locstats[k];
+ if (l == NULL) continue;
+
+ fh << batch_id << "\t"
+ << l->loc_id << "\t"
+ << it->first << "\t"
+ << l->bp + 1 << "\t"
+ << pop_key[pop_id] << "\t"
+ << (int) l->alleles << "\t"
+ << l->hap_cnt << "\t"
+ << l->stat[0] << "\t"
+ << l->smoothed[0] << "\t"
+ << l->bs[0] << "\t"
+ << l->stat[1] << "\t"
+ << l->smoothed[1] << "\t"
+ << l->bs[1] << "\t"
+ << l->hap_str << "\n";
+ }
+
+ for (uint k = 0; k < locstats.size(); k++)
+ delete locstats[k];
+ }
+
+ if (bootstrap_div)
+ delete bs;
}
if (kernel_smoothed && loci_ordered) {
- delete ks;
- delete ord;
+ delete ks;
+ delete ord;
}
fh.close();
@@ -2213,27 +2213,27 @@ nuc_substitution_dist(map<string, int> &hap_index, double **hdists)
uint i, j;
for (it = hap_index.begin(); it != hap_index.end(); it++)
- haplotypes.push_back(it->first);
+ haplotypes.push_back(it->first);
const char *p, *q;
double dist;
for (i = 0; i < haplotypes.size(); i++) {
- for (j = i; j < haplotypes.size(); j++) {
+ for (j = i; j < haplotypes.size(); j++) {
- dist = 0.0;
- p = haplotypes[i].c_str();
- q = haplotypes[j].c_str();
+ dist = 0.0;
+ p = haplotypes[i].c_str();
+ q = haplotypes[j].c_str();
- while (*p != '\0' && *q != '\0') {
- if (*p != *q) dist++;
- p++;
- q++;
- }
+ while (*p != '\0' && *q != '\0') {
+ if (*p != *q) dist++;
+ p++;
+ q++;
+ }
- hdists[i][j] = dist;
- hdists[j][i] = dist;
- }
+ hdists[i][j] = dist;
+ hdists[j][i] = dist;
+ }
}
// //
@@ -2241,13 +2241,13 @@ nuc_substitution_dist(map<string, int> &hap_index, double **hdists)
// //
// cerr << " ";
// for (hit = loc_hap_index.begin(); hit != loc_hap_index.end(); hit++)
- // cerr << "\t" << hit->first;
+ // cerr << "\t" << hit->first;
// cerr << "\n";
// for (hit = loc_hap_index.begin(); hit != loc_hap_index.end(); hit++) {
- // cerr << " " << hit->first;
- // for (hit_2 = loc_hap_index.begin(); hit_2 != loc_hap_index.end(); hit_2++)
- // cerr << "\t" << hdists[hit->second][hit_2->second];
- // cerr << "\n";
+ // cerr << " " << hit->first;
+ // for (hit_2 = loc_hap_index.begin(); hit_2 != loc_hap_index.end(); hit_2++)
+ // cerr << "\t" << hdists[hit->second][hit_2->second];
+ // cerr << "\n";
// }
// cerr << "\n";
@@ -2262,21 +2262,21 @@ nuc_substitution_identity(map<string, int> &hap_index, double **hdists)
uint i, j;
for (it = hap_index.begin(); it != hap_index.end(); it++)
- haplotypes.push_back(it->first);
+ haplotypes.push_back(it->first);
double dist;
for (i = 0; i < haplotypes.size(); i++) {
- for (j = i; j < haplotypes.size(); j++) {
+ for (j = i; j < haplotypes.size(); j++) {
- if (haplotypes[i] == haplotypes[j])
- dist = 0.0;
- else
- dist = 1.0;
+ if (haplotypes[i] == haplotypes[j])
+ dist = 0.0;
+ else
+ dist = 1.0;
- hdists[i][j] = dist;
- hdists[j][i] = dist;
- }
+ hdists[i][j] = dist;
+ hdists[j][i] = dist;
+ }
}
return 0;
@@ -2290,13 +2290,13 @@ nuc_substitution_identity_max(map<string, int> &hap_index, double **hdists)
uint i, j;
for (it = hap_index.begin(); it != hap_index.end(); it++)
- haplotypes.push_back(it->first);
+ haplotypes.push_back(it->first);
for (i = 0; i < haplotypes.size(); i++) {
- for (j = i; j < haplotypes.size(); j++) {
- hdists[i][j] = 1.0;
- hdists[j][i] = 1.0;
- }
+ for (j = i; j < haplotypes.size(); j++) {
+ hdists[i][j] = 1.0;
+ hdists[j][i] = 1.0;
+ }
}
return 0;
@@ -2304,16 +2304,16 @@ nuc_substitution_identity_max(map<string, int> &hap_index, double **hdists)
int
calculate_haplotype_divergence(vector<pair<int, string> > &files,
- map<int, pair<int, int> > &pop_indexes,
- map<int, vector<int> > &master_grp_members,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, vector<int> > &master_grp_members,
+ map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
if (bootstrap_phist)
- cerr << "Calculating halotype F statistics across all populations/groups and bootstrap resampling...\n";
+ cerr << "Calculating halotype F statistics across all populations/groups and bootstrap resampling...\n";
else
- cerr << "Calculating haplotype F statistics across all populations/groups...\n";
+ cerr << "Calculating haplotype F statistics across all populations/groups...\n";
//
// Create a list of all the groups we have.
@@ -2321,8 +2321,8 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
map<int, vector<int> >::iterator git;
map<int, int> pop_grp_key;
for (git = master_grp_members.begin(); git != master_grp_members.end(); git++)
- for (uint i = 0; i < git->second.size(); i++)
- pop_grp_key[git->second[i]] = git->first;
+ for (uint i = 0; i < git->second.size(); i++)
+ pop_grp_key[git->second[i]] = git->first;
//
// Create a list of all the populations we have.
@@ -2331,7 +2331,7 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
map<int, pair<int, int> >::iterator pit;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- pop_ids.push_back(pit->first);
+ pop_ids.push_back(pit->first);
//
// Instantiate the kernel smoothing object and associated ordering object if requested.
@@ -2340,92 +2340,92 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
OHaplotypes<HapStat> *ord;
Bootstrap<HapStat> *bs;
if (kernel_smoothed && loci_ordered) {
- ks = new KSmooth<HapStat>(5);
- ord = new OHaplotypes<HapStat>();
+ ks = new KSmooth<HapStat>(5);
+ ord = new OHaplotypes<HapStat>();
}
if (bootstrap_phist)
- bs = new Bootstrap<HapStat>(5);
+ bs = new Bootstrap<HapStat>(5);
map<string, vector<HapStat *> > genome_hapstats;
uint cnt = 0;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- string chr = it->first;
+ string chr = it->first;
- cerr << " Generating haplotype F statistics for " << chr << "...";
+ cerr << " Generating haplotype F statistics for " << chr << "...";
- map<uint, uint> hapstats_key;
- vector<HapStat *> &hapstats = genome_hapstats[chr];
- ord->order(hapstats, hapstats_key, it->second);
+ map<uint, uint> hapstats_key;
+ vector<HapStat *> &hapstats = genome_hapstats[chr];
+ ord->order(hapstats, hapstats_key, it->second);
#pragma omp parallel
- {
- CSLocus *loc;
- LocSum **s;
- Datum **d;
- HapStat *h;
+ {
+ CSLocus *loc;
+ LocSum **s;
+ Datum **d;
+ HapStat *h;
#pragma omp for schedule(dynamic, 1) reduction(+:cnt)
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
-
- if (loc->snps.size() == 0)
- continue;
-
- //
- // If this locus only appears in one population or there is only a single haplotype,
- // do not calculate haplotype F stats.
- //
- if (fixed_locus(pop_indexes, d, pop_ids))
- continue;
-
- cnt++;
- // cerr << "Processing locus " << loc->id << "\n";
-
- h = haplotype_amova(pop_grp_key, pop_indexes, d, s, pop_ids);
-
- if (h != NULL) {
- h->stat[4] = haplotype_d_est(pop_indexes, d, s, pop_ids);
-
- h->loc_id = loc->id;
- h->bp = loc->sort_bp();
- hapstats[hapstats_key[h->bp]] = h;
- }
- }
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+
+ if (loc->snps.size() == 0)
+ continue;
+
+ //
+ // If this locus only appears in one population or there is only a single haplotype,
+ // do not calculate haplotype F stats.
+ //
+ if (fixed_locus(pop_indexes, d, pop_ids))
+ continue;
+
+ cnt++;
+ // cerr << "Processing locus " << loc->id << "\n";
+
+ h = haplotype_amova(pop_grp_key, pop_indexes, d, s, pop_ids);
+
+ if (h != NULL) {
+ h->stat[4] = haplotype_d_est(pop_indexes, d, s, pop_ids);
+
+ h->loc_id = loc->id;
+ h->bp = loc->sort_bp();
+ hapstats[hapstats_key[h->bp]] = h;
+ }
+ }
+ }
- if (bootstrap_phist)
- bs->add_data(hapstats);
+ if (bootstrap_phist)
+ bs->add_data(hapstats);
- cerr << "done.\n";
+ cerr << "done.\n";
- //
- // Calculate kernel-smoothed Fst values.
- //
- if (kernel_smoothed && loci_ordered) {
- cerr << " Generating kernel-smoothed haplotype F statistics for " << it->first << "...";
- ks->smooth(hapstats);
- cerr << "done.\n";
- }
+ //
+ // Calculate kernel-smoothed Fst values.
+ //
+ if (kernel_smoothed && loci_ordered) {
+ cerr << " Generating kernel-smoothed haplotype F statistics for " << it->first << "...";
+ ks->smooth(hapstats);
+ cerr << "done.\n";
+ }
}
if (bootstrap_phist) {
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++)
- bs->execute(genome_hapstats[it->first]);
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++)
+ bs->execute(genome_hapstats[it->first]);
}
cerr << "done.\n";
if (kernel_smoothed && loci_ordered) {
- delete ks;
- delete ord;
+ delete ks;
+ delete ord;
}
if (bootstrap_phist)
- delete bs;
+ delete bs;
cerr << "Writing haplotype F statistics... ";
@@ -2437,7 +2437,7 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
cerr << "Error opening haplotype Phi_st file '" << file << "'\n";
- exit(1);
+ exit(1);
}
fh.precision(fieldw);
fh.setf(std::ios::fixed);
@@ -2447,27 +2447,27 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
//
int start, end;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- fh << "# Population " << pop_key[pit->first] << "\t";
- for (int k = start; k <= end; k++) {
- fh << files[k].second;
- if (k < end) fh << ",";
- }
- fh << "\n";
+ start = pit->second.first;
+ end = pit->second.second;
+ fh << "# Population " << pop_key[pit->first] << "\t";
+ for (int k = start; k <= end; k++) {
+ fh << files[k].second;
+ if (k < end) fh << ",";
+ }
+ fh << "\n";
}
//
// Write the group members.
//
for (git = grp_members.begin(); git != grp_members.end(); git++) {
- end = git->second.size();
- fh << "# Group " << grp_key[git->first] << "\t";
- for (int k = 0; k < end; k++) {
- fh << pop_key[git->second[k]];
- if (k < end - 1) fh << ",";
- }
- fh << "\n";
+ end = git->second.size();
+ fh << "# Group " << grp_key[git->first] << "\t";
+ for (int k = 0; k < end; k++) {
+ fh << pop_key[git->second[k]];
+ if (k < end - 1) fh << ",";
+ }
+ fh << "\n";
}
fh << "# Batch ID " << "\t"
@@ -2476,21 +2476,21 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
<< "BP" << "\t"
<< "PopCnt" << "\t";
if (log_fst_comp)
- fh << "SSD(WP)" << "\t"
- << "SSD(AP/WG)" << "\t"
- << "SSD(AG)" << "\t"
- << "SSD(TOTAL)" << "\t"
- << "MSD(WP)" << "\t"
- << "MSD(AP/WG)" << "\t"
- << "MSD(AG)" << "\t"
- << "MSD(TOTAL)" << "\t"
- << "n" << "\t"
- << "n'" << "\t"
- << "n''" << "\t"
- << "Sigma2_a" << "\t"
- << "Sigma2_b" << "\t"
- << "Sigma2_c" << "\t"
- << "Sigma_Total" << "\t";
+ fh << "SSD(WP)" << "\t"
+ << "SSD(AP/WG)" << "\t"
+ << "SSD(AG)" << "\t"
+ << "SSD(TOTAL)" << "\t"
+ << "MSD(WP)" << "\t"
+ << "MSD(AP/WG)" << "\t"
+ << "MSD(AG)" << "\t"
+ << "MSD(TOTAL)" << "\t"
+ << "n" << "\t"
+ << "n'" << "\t"
+ << "n''" << "\t"
+ << "Sigma2_a" << "\t"
+ << "Sigma2_b" << "\t"
+ << "Sigma2_c" << "\t"
+ << "Sigma_Total" << "\t";
fh << "phi_st" << "\t"
<< "Smoothed Phi_st" << "\t"
<< "Smoothed Phi_st P-value" << "\t"
@@ -2508,52 +2508,52 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
<< "Smoothed D_est P-value" << "\n";
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- string chr = it->first;
-
- vector<HapStat *> &hapstats = genome_hapstats[chr];
-
- for (uint k = 0; k < hapstats.size(); k++) {
- if (hapstats[k] == NULL) continue;
-
- fh << batch_id << "\t"
- << hapstats[k]->loc_id << "\t"
- << chr << "\t"
- << hapstats[k]->bp << "\t"
- << hapstats[k]->popcnt << "\t";
- if (log_fst_comp)
- fh << hapstats[k]->comp[0] << "\t"
- << hapstats[k]->comp[1] << "\t"
- << hapstats[k]->comp[2] << "\t"
- << hapstats[k]->comp[3] << "\t"
- << hapstats[k]->comp[4] << "\t"
- << hapstats[k]->comp[5] << "\t"
- << hapstats[k]->comp[6] << "\t"
- << hapstats[k]->comp[7] << "\t"
- << hapstats[k]->comp[8] << "\t"
- << hapstats[k]->comp[9] << "\t"
- << hapstats[k]->comp[10] << "\t"
- << hapstats[k]->comp[11] << "\t"
- << hapstats[k]->comp[12] << "\t"
- << hapstats[k]->comp[13] << "\t"
- << hapstats[k]->comp[14] << "\t";
- fh << hapstats[k]->stat[0] << "\t"
- << hapstats[k]->smoothed[0] << "\t"
- << hapstats[k]->bs[0] << "\t"
- << hapstats[k]->stat[1] << "\t"
- << hapstats[k]->smoothed[1] << "\t"
- << hapstats[k]->bs[1] << "\t"
- << hapstats[k]->stat[2] << "\t"
- << hapstats[k]->smoothed[2] << "\t"
- << hapstats[k]->bs[2] << "\t"
- << hapstats[k]->stat[3] << "\t"
- << hapstats[k]->smoothed[3] << "\t"
- << hapstats[k]->bs[3] << "\t"
- << hapstats[k]->stat[4] << "\t"
- << hapstats[k]->smoothed[4] << "\t"
- << hapstats[k]->bs[4] << "\n";
-
- delete hapstats[k];
- }
+ string chr = it->first;
+
+ vector<HapStat *> &hapstats = genome_hapstats[chr];
+
+ for (uint k = 0; k < hapstats.size(); k++) {
+ if (hapstats[k] == NULL) continue;
+
+ fh << batch_id << "\t"
+ << hapstats[k]->loc_id << "\t"
+ << chr << "\t"
+ << hapstats[k]->bp << "\t"
+ << hapstats[k]->popcnt << "\t";
+ if (log_fst_comp)
+ fh << hapstats[k]->comp[0] << "\t"
+ << hapstats[k]->comp[1] << "\t"
+ << hapstats[k]->comp[2] << "\t"
+ << hapstats[k]->comp[3] << "\t"
+ << hapstats[k]->comp[4] << "\t"
+ << hapstats[k]->comp[5] << "\t"
+ << hapstats[k]->comp[6] << "\t"
+ << hapstats[k]->comp[7] << "\t"
+ << hapstats[k]->comp[8] << "\t"
+ << hapstats[k]->comp[9] << "\t"
+ << hapstats[k]->comp[10] << "\t"
+ << hapstats[k]->comp[11] << "\t"
+ << hapstats[k]->comp[12] << "\t"
+ << hapstats[k]->comp[13] << "\t"
+ << hapstats[k]->comp[14] << "\t";
+ fh << hapstats[k]->stat[0] << "\t"
+ << hapstats[k]->smoothed[0] << "\t"
+ << hapstats[k]->bs[0] << "\t"
+ << hapstats[k]->stat[1] << "\t"
+ << hapstats[k]->smoothed[1] << "\t"
+ << hapstats[k]->bs[1] << "\t"
+ << hapstats[k]->stat[2] << "\t"
+ << hapstats[k]->smoothed[2] << "\t"
+ << hapstats[k]->bs[2] << "\t"
+ << hapstats[k]->stat[3] << "\t"
+ << hapstats[k]->smoothed[3] << "\t"
+ << hapstats[k]->bs[3] << "\t"
+ << hapstats[k]->stat[4] << "\t"
+ << hapstats[k]->smoothed[4] << "\t"
+ << hapstats[k]->bs[4] << "\n";
+
+ delete hapstats[k];
+ }
}
fh.close();
@@ -2565,16 +2565,16 @@ calculate_haplotype_divergence(vector<pair<int, string> > &files,
int
calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
- map<int, pair<int, int> > &pop_indexes,
- map<int, vector<int> > &master_grp_members,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, vector<int> > &master_grp_members,
+ map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
if (bootstrap_phist)
- cerr << "Calculating pairwise halotype F statistics and bootstrap resampling...\n";
+ cerr << "Calculating pairwise halotype F statistics and bootstrap resampling...\n";
else
- cerr << "Calculating pairwise haplotype F statistics...\n";
+ cerr << "Calculating pairwise haplotype F statistics...\n";
//
// Assign all individuals to one group for the pairwise calculations.
@@ -2582,13 +2582,13 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
map<int, vector<int> >::iterator git;
map<int, int> pop_grp_key;
for (git = master_grp_members.begin(); git != master_grp_members.end(); git++)
- for (uint i = 0; i < git->second.size(); i++)
- pop_grp_key[git->second[i]] = 1;
+ for (uint i = 0; i < git->second.size(); i++)
+ pop_grp_key[git->second[i]] = 1;
map<int, pair<int, int> >::iterator pit;
vector<int> pop_ids;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- pop_ids.push_back(pit->first);
+ pop_ids.push_back(pit->first);
//
// Instantiate the kernel smoothing object if requested.
@@ -2597,212 +2597,212 @@ calculate_haplotype_divergence_pairwise(vector<pair<int, string> > &files,
OHaplotypes<HapStat> *ord;
Bootstrap<HapStat> *bs;
if (kernel_smoothed && loci_ordered) {
- ks = new KSmooth<HapStat>(5);
- ord = new OHaplotypes<HapStat>();
+ ks = new KSmooth<HapStat>(5);
+ ord = new OHaplotypes<HapStat>();
}
for (uint i = 0; i < pop_ids.size(); i++) {
- for (uint j = i + 1; j < pop_ids.size(); j++) {
+ for (uint j = i + 1; j < pop_ids.size(); j++) {
- if (bootstrap_phist)
- bs = new Bootstrap<HapStat>(5);
+ if (bootstrap_phist)
+ bs = new Bootstrap<HapStat>(5);
- map<string, vector<HapStat *> > genome_hapstats;
- vector<int> subpop_ids;
+ map<string, vector<HapStat *> > genome_hapstats;
+ vector<int> subpop_ids;
- subpop_ids.push_back(pop_ids[i]);
- subpop_ids.push_back(pop_ids[j]);
+ subpop_ids.push_back(pop_ids[i]);
+ subpop_ids.push_back(pop_ids[j]);
- cerr << " Processing populations '" << pop_key[pop_ids[i]] << "' and '" << pop_key[pop_ids[j]] << "'\n";
+ cerr << " Processing populations '" << pop_key[pop_ids[i]] << "' and '" << pop_key[pop_ids[j]] << "'\n";
- uint cnt = 0;
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- string chr = it->first;
+ uint cnt = 0;
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ string chr = it->first;
- cerr << " Generating pairwise haplotype F statistics for " << chr << "...";
+ cerr << " Generating pairwise haplotype F statistics for " << chr << "...";
- map<uint, uint> hapstats_key;
- vector<HapStat *> &hapstats = genome_hapstats[chr];
- ord->order(hapstats, hapstats_key, it->second);
+ map<uint, uint> hapstats_key;
+ vector<HapStat *> &hapstats = genome_hapstats[chr];
+ ord->order(hapstats, hapstats_key, it->second);
#pragma omp parallel
- {
- CSLocus *loc;
- LocSum **s;
- Datum **d;
- HapStat *h;
+ {
+ CSLocus *loc;
+ LocSum **s;
+ Datum **d;
+ HapStat *h;
#pragma omp for schedule(dynamic, 1) reduction(+:cnt)
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
-
- if (loc->snps.size() == 0)
- continue;
-
- //
- // If this locus only appears in one population or there is only a single haplotype,
- // do not calculate haplotype F stats.
- //
- if (fixed_locus(pop_indexes, d, subpop_ids))
- continue;
-
- cnt++;
- // cerr << "Processing locus " << loc->id << "\n";
-
- h = haplotype_amova(pop_grp_key, pop_indexes, d, s, subpop_ids);
-
- if (h != NULL) {
- h->stat[4] = haplotype_d_est(pop_indexes, d, s, subpop_ids);
-
- h->loc_id = loc->id;
- h->bp = loc->sort_bp();
- hapstats[hapstats_key[h->bp]] = h;
- }
- }
- }
-
- if (bootstrap_phist)
- bs->add_data(hapstats);
-
- cerr << "done.\n";
-
- //
- // Calculate kernel-smoothed Fst values.
- //
- if (kernel_smoothed && loci_ordered) {
- cerr << " Generating kernel-smoothed Phi_st for " << it->first << "...";
- ks->smooth(hapstats);
- cerr << "done.\n";
- }
- }
-
- if (bootstrap_phist) {
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++)
- bs->execute(genome_hapstats[it->first]);
- }
-
- cerr << "done.\n";
-
- if (bootstrap_phist)
- delete bs;
-
- cerr << "Writing haplotype F statistics... ";
-
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".phistats_" << pop_key[pop_ids[i]] << "-" << pop_key[pop_ids[j]] << ".tsv";
-
- string file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
- if (fh.fail()) {
- cerr << "Error opening haplotype Phi_st file '" << file << "'\n";
- exit(1);
- }
- fh.precision(fieldw);
- fh.setf(std::ios::fixed);
-
- //
- // Write the population members.
- //
- int start, end;
- for (uint k = 0; k < subpop_ids.size(); k++) {
- start = pop_indexes[subpop_ids[k]].first;
- end = pop_indexes[subpop_ids[k]].second;
- fh << "# Population " << pop_key[subpop_ids[k]] << "\t";
- for (int n = start; n <= end; n++) {
- fh << files[n].second;
- if (n < end) fh << ",";
- }
- fh << "\n";
- }
-
- fh << "# Batch ID " << "\t"
- << "Locus ID" << "\t"
- << "Pop 1 ID" << "\t"
- << "Pop 2 ID" << "\t"
- << "Chr" << "\t"
- << "BP" << "\t";
- if (log_fst_comp)
- fh << "SSD(WP)" << "\t"
- << "SSD(AP/WG)" << "\t"
- << "SSD(AG)" << "\t"
- << "SSD(TOTAL)" << "\t"
- << "MSD(WP)" << "\t"
- << "MSD(AP/WG)" << "\t"
- << "MSD(AG)" << "\t"
- << "MSD(TOTAL)" << "\t"
- << "n" << "\t"
- << "n'" << "\t"
- << "n''" << "\t"
- << "Sigma2_a" << "\t"
- << "Sigma2_b" << "\t"
- << "Sigma2_c" << "\t"
- << "Sigma_Total" << "\t";
- fh << "phi_st" << "\t"
- << "Smoothed Phi_st" << "\t"
- << "Smoothed Phi_st P-value" << "\t"
- << "Fst'" << "\t"
- << "Smoothed Fst'" << "\t"
- << "Smoothed Fst' P-value" << "\t"
- << "D_est" << "\t"
- << "Smoothed D_est" << "\t"
- << "Smoothed D_est P-value" << "\n";
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- string chr = it->first;
-
- vector<HapStat *> &hapstats = genome_hapstats[chr];
-
- for (uint k = 0; k < hapstats.size(); k++) {
- if (hapstats[k] == NULL) continue;
-
- fh << batch_id << "\t"
- << hapstats[k]->loc_id << "\t"
- << pop_key[pop_ids[i]] << "\t"
- << pop_key[pop_ids[j]] << "\t"
- << chr << "\t"
- << hapstats[k]->bp << "\t";
- if (log_fst_comp)
- fh << hapstats[k]->comp[0] << "\t"
- << hapstats[k]->comp[1] << "\t"
- << hapstats[k]->comp[2] << "\t"
- << hapstats[k]->comp[3] << "\t"
- << hapstats[k]->comp[4] << "\t"
- << hapstats[k]->comp[5] << "\t"
- << hapstats[k]->comp[6] << "\t"
- << hapstats[k]->comp[7] << "\t"
- << hapstats[k]->comp[8] << "\t"
- << hapstats[k]->comp[9] << "\t"
- << hapstats[k]->comp[10] << "\t"
- << hapstats[k]->comp[11] << "\t"
- << hapstats[k]->comp[12] << "\t"
- << hapstats[k]->comp[13] << "\t"
- << hapstats[k]->comp[14] << "\t";
- fh << hapstats[k]->stat[0] << "\t"
- << hapstats[k]->smoothed[0] << "\t"
- << hapstats[k]->bs[0] << "\t"
- << hapstats[k]->stat[3] << "\t"
- << hapstats[k]->smoothed[3] << "\t"
- << hapstats[k]->bs[3] << "\t"
- << hapstats[k]->stat[4] << "\t"
- << hapstats[k]->smoothed[4] << "\t"
- << hapstats[k]->bs[4] << "\n";
-
- delete hapstats[k];
- }
- }
-
- fh.close();
-
- cerr << "wrote " << cnt << " loci to pairwise haplotype file, '" << file << "'\n";
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+
+ if (loc->snps.size() == 0)
+ continue;
+
+ //
+ // If this locus only appears in one population or there is only a single haplotype,
+ // do not calculate haplotype F stats.
+ //
+ if (fixed_locus(pop_indexes, d, subpop_ids))
+ continue;
+
+ cnt++;
+ // cerr << "Processing locus " << loc->id << "\n";
+
+ h = haplotype_amova(pop_grp_key, pop_indexes, d, s, subpop_ids);
+
+ if (h != NULL) {
+ h->stat[4] = haplotype_d_est(pop_indexes, d, s, subpop_ids);
+
+ h->loc_id = loc->id;
+ h->bp = loc->sort_bp();
+ hapstats[hapstats_key[h->bp]] = h;
+ }
+ }
+ }
+
+ if (bootstrap_phist)
+ bs->add_data(hapstats);
+
+ cerr << "done.\n";
+
+ //
+ // Calculate kernel-smoothed Fst values.
+ //
+ if (kernel_smoothed && loci_ordered) {
+ cerr << " Generating kernel-smoothed Phi_st for " << it->first << "...";
+ ks->smooth(hapstats);
+ cerr << "done.\n";
+ }
+ }
+
+ if (bootstrap_phist) {
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++)
+ bs->execute(genome_hapstats[it->first]);
+ }
+
+ cerr << "done.\n";
+
+ if (bootstrap_phist)
+ delete bs;
+
+ cerr << "Writing haplotype F statistics... ";
+
+ stringstream pop_name;
+ pop_name << "batch_" << batch_id << ".phistats_" << pop_key[pop_ids[i]] << "-" << pop_key[pop_ids[j]] << ".tsv";
+
+ string file = in_path + pop_name.str();
+
+ ofstream fh(file.c_str(), ofstream::out);
+ if (fh.fail()) {
+ cerr << "Error opening haplotype Phi_st file '" << file << "'\n";
+ exit(1);
+ }
+ fh.precision(fieldw);
+ fh.setf(std::ios::fixed);
+
+ //
+ // Write the population members.
+ //
+ int start, end;
+ for (uint k = 0; k < subpop_ids.size(); k++) {
+ start = pop_indexes[subpop_ids[k]].first;
+ end = pop_indexes[subpop_ids[k]].second;
+ fh << "# Population " << pop_key[subpop_ids[k]] << "\t";
+ for (int n = start; n <= end; n++) {
+ fh << files[n].second;
+ if (n < end) fh << ",";
+ }
+ fh << "\n";
+ }
+
+ fh << "# Batch ID " << "\t"
+ << "Locus ID" << "\t"
+ << "Pop 1 ID" << "\t"
+ << "Pop 2 ID" << "\t"
+ << "Chr" << "\t"
+ << "BP" << "\t";
+ if (log_fst_comp)
+ fh << "SSD(WP)" << "\t"
+ << "SSD(AP/WG)" << "\t"
+ << "SSD(AG)" << "\t"
+ << "SSD(TOTAL)" << "\t"
+ << "MSD(WP)" << "\t"
+ << "MSD(AP/WG)" << "\t"
+ << "MSD(AG)" << "\t"
+ << "MSD(TOTAL)" << "\t"
+ << "n" << "\t"
+ << "n'" << "\t"
+ << "n''" << "\t"
+ << "Sigma2_a" << "\t"
+ << "Sigma2_b" << "\t"
+ << "Sigma2_c" << "\t"
+ << "Sigma_Total" << "\t";
+ fh << "phi_st" << "\t"
+ << "Smoothed Phi_st" << "\t"
+ << "Smoothed Phi_st P-value" << "\t"
+ << "Fst'" << "\t"
+ << "Smoothed Fst'" << "\t"
+ << "Smoothed Fst' P-value" << "\t"
+ << "D_est" << "\t"
+ << "Smoothed D_est" << "\t"
+ << "Smoothed D_est P-value" << "\n";
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ string chr = it->first;
+
+ vector<HapStat *> &hapstats = genome_hapstats[chr];
+
+ for (uint k = 0; k < hapstats.size(); k++) {
+ if (hapstats[k] == NULL) continue;
+
+ fh << batch_id << "\t"
+ << hapstats[k]->loc_id << "\t"
+ << pop_key[pop_ids[i]] << "\t"
+ << pop_key[pop_ids[j]] << "\t"
+ << chr << "\t"
+ << hapstats[k]->bp << "\t";
+ if (log_fst_comp)
+ fh << hapstats[k]->comp[0] << "\t"
+ << hapstats[k]->comp[1] << "\t"
+ << hapstats[k]->comp[2] << "\t"
+ << hapstats[k]->comp[3] << "\t"
+ << hapstats[k]->comp[4] << "\t"
+ << hapstats[k]->comp[5] << "\t"
+ << hapstats[k]->comp[6] << "\t"
+ << hapstats[k]->comp[7] << "\t"
+ << hapstats[k]->comp[8] << "\t"
+ << hapstats[k]->comp[9] << "\t"
+ << hapstats[k]->comp[10] << "\t"
+ << hapstats[k]->comp[11] << "\t"
+ << hapstats[k]->comp[12] << "\t"
+ << hapstats[k]->comp[13] << "\t"
+ << hapstats[k]->comp[14] << "\t";
+ fh << hapstats[k]->stat[0] << "\t"
+ << hapstats[k]->smoothed[0] << "\t"
+ << hapstats[k]->bs[0] << "\t"
+ << hapstats[k]->stat[3] << "\t"
+ << hapstats[k]->smoothed[3] << "\t"
+ << hapstats[k]->bs[3] << "\t"
+ << hapstats[k]->stat[4] << "\t"
+ << hapstats[k]->smoothed[4] << "\t"
+ << hapstats[k]->bs[4] << "\n";
+
+ delete hapstats[k];
+ }
+ }
+
+ fh.close();
+
+ cerr << "wrote " << cnt << " loci to pairwise haplotype file, '" << file << "'\n";
+ }
}
if (kernel_smoothed && loci_ordered) {
- delete ks;
- delete ord;
+ delete ks;
+ delete ord;
}
return 0;
@@ -2817,53 +2817,53 @@ fixed_locus(map<int, pair<int, int> > &pop_indexes, Datum **d, vector<int> &pop_
int pop_cnt = pop_ids.size();
for (int p = 0; p < pop_cnt; p++) {
- start = pop_indexes[pop_ids[p]].first;
- end = pop_indexes[pop_ids[p]].second;
- pop_id = pop_ids[p];
-
- for (int i = start; i <= end; i++) {
- if (d[i] == NULL) continue;
-
- if (d[i]->obshap.size() > 2) {
- continue;
-
- } else if (d[i]->obshap.size() == 1) {
- if (!uncalled_haplotype(d[i]->obshap[0])) {
- loc_haplotypes.insert(d[i]->obshap[0]);
- pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
- pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
- }
- } else {
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- if (!uncalled_haplotype(d[i]->obshap[0])) {
- loc_haplotypes.insert(d[i]->obshap[j]);
- pop_haplotypes[pop_id].push_back(d[i]->obshap[j]);
- }
- }
- }
- }
+ start = pop_indexes[pop_ids[p]].first;
+ end = pop_indexes[pop_ids[p]].second;
+ pop_id = pop_ids[p];
+
+ for (int i = start; i <= end; i++) {
+ if (d[i] == NULL) continue;
+
+ if (d[i]->obshap.size() > 2) {
+ continue;
+
+ } else if (d[i]->obshap.size() == 1) {
+ if (!uncalled_haplotype(d[i]->obshap[0])) {
+ loc_haplotypes.insert(d[i]->obshap[0]);
+ pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
+ pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
+ }
+ } else {
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ if (!uncalled_haplotype(d[i]->obshap[0])) {
+ loc_haplotypes.insert(d[i]->obshap[j]);
+ pop_haplotypes[pop_id].push_back(d[i]->obshap[j]);
+ }
+ }
+ }
+ }
}
uint valid_pops = 0;
for (int p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
+ pop_id = pop_ids[p];
- if (pop_haplotypes[pop_id].size() > 0)
- valid_pops++;
+ if (pop_haplotypes[pop_id].size() > 0)
+ valid_pops++;
}
//
// Check that more than one population has data for this locus.
//
if (valid_pops <= 1)
- return true;
+ return true;
//
// Check that there is more than one haplotype at this locus.
//
if (loc_haplotypes.size() == 1)
- return true;
+ return true;
return false;
}
@@ -2872,8 +2872,8 @@ inline bool
uncalled_haplotype(const char *haplotype)
{
for (const char *p = haplotype; *p != '\0'; p++)
- if (*p == 'N' || *p == 'n')
- return true;
+ if (*p == 'N' || *p == 'n')
+ return true;
return false;
}
@@ -2883,24 +2883,24 @@ count_haplotypes_at_locus(int start, int end, Datum **d, map<string, double> &ha
double n = 0.0;
for (int i = start; i <= end; i++) {
- if (d[i] == NULL) continue;
-
- if (d[i]->obshap.size() > 2) {
- continue;
-
- } else if (d[i]->obshap.size() == 1) {
- if(!uncalled_haplotype(d[i]->obshap[0])) {
- n += 2;
- hap_cnts[d[i]->obshap[0]] += 2;
- }
- } else {
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- if(!uncalled_haplotype(d[i]->obshap[0])) {
- n++;
- hap_cnts[d[i]->obshap[j]]++;
- }
- }
- }
+ if (d[i] == NULL) continue;
+
+ if (d[i]->obshap.size() > 2) {
+ continue;
+
+ } else if (d[i]->obshap.size() == 1) {
+ if(!uncalled_haplotype(d[i]->obshap[0])) {
+ n += 2;
+ hap_cnts[d[i]->obshap[0]] += 2;
+ }
+ } else {
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ if(!uncalled_haplotype(d[i]->obshap[0])) {
+ n++;
+ hap_cnts[d[i]->obshap[j]]++;
+ }
+ }
+ }
}
return n;
@@ -2929,7 +2929,7 @@ haplotype_diversity(int start, int end, Datum **d)
// If this haplotype is fixed, don't calculate any statistics.
//
if (n == 0)
- return NULL;
+ return NULL;
lstat = new LocStat;
@@ -2938,7 +2938,7 @@ haplotype_diversity(int start, int end, Datum **d)
//
stringstream sstr;
for (hit = hap_freq.begin(); hit != hap_freq.end(); hit++)
- sstr << hit->first << ":" << hit->second << ";";
+ sstr << hit->first << ":" << hit->second << ";";
lstat->hap_str = sstr.str().substr(0, sstr.str().length() - 1);
//
@@ -2946,23 +2946,23 @@ haplotype_diversity(int start, int end, Datum **d)
//
uint k = 0;
for (hit = hap_freq.begin(); hit != hap_freq.end(); hit++) {
- hap_index[hit->first] = k;
- haplotypes.push_back(hit->first);
- k++;
+ hap_index[hit->first] = k;
+ haplotypes.push_back(hit->first);
+ k++;
- // cerr << " Haplotype '" << hit->first << "' occured " << hit->second << " times; ";
+ // cerr << " Haplotype '" << hit->first << "' occured " << hit->second << " times; ";
- hit->second = hit->second / n;
+ hit->second = hit->second / n;
- // cerr << " frequency of " << hit->second << "%\n";
+ // cerr << " frequency of " << hit->second << "%\n";
}
//
// Initialize a two-dimensional array to hold distances between haplotyes.
//
double **hdists = new double *[hap_index.size()];
for (k = 0; k < hap_index.size(); k++) {
- hdists[k] = new double[hap_index.size()];
- memset(hdists[k], 0, hap_index.size());
+ hdists[k] = new double[hap_index.size()];
+ memset(hdists[k], 0, hap_index.size());
}
//
@@ -2974,12 +2974,12 @@ haplotype_diversity(int start, int end, Datum **d)
// Calculate haplotype diversity, Pi.
//
for (uint i = 0; i < haplotypes.size(); i++) {
- for (uint j = 0; j < haplotypes.size(); j++) {
- hapl_diversity +=
- hap_freq[haplotypes[i]] *
- hap_freq[haplotypes[j]] *
- hdists[hap_index[haplotypes[i]]][hap_index[haplotypes[j]]];
- }
+ for (uint j = 0; j < haplotypes.size(); j++) {
+ hapl_diversity +=
+ hap_freq[haplotypes[i]] *
+ hap_freq[haplotypes[j]] *
+ hdists[hap_index[haplotypes[i]]][hap_index[haplotypes[j]]];
+ }
}
hapl_diversity = (n / (n-1)) * hapl_diversity;
@@ -2987,7 +2987,7 @@ haplotype_diversity(int start, int end, Datum **d)
// Calculate gene diversity.
//
for (uint i = 0; i < haplotypes.size(); i++) {
- gene_diversity += hap_freq[haplotypes[i]] * hap_freq[haplotypes[i]];
+ gene_diversity += hap_freq[haplotypes[i]] * hap_freq[haplotypes[i]];
}
gene_diversity = (n / (n - 1)) * (1 - gene_diversity);
@@ -2999,15 +2999,15 @@ haplotype_diversity(int start, int end, Datum **d)
// cerr << " Population " << pop_id << " has haplotype diversity (pi) of " << s[pop_index]->pi << "\n";
for (k = 0; k < hap_index.size(); k++)
- delete hdists[k];
- delete hdists;
+ delete hdists[k];
+ delete [] hdists;
return lstat;
}
HapStat *
haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_indexes,
- Datum **d, LocSum **s, vector<int> &pop_ids)
+ Datum **d, LocSum **s, vector<int> &pop_ids)
{
map<string, int> loc_hap_index;
vector<string> loc_haplotypes;
@@ -3026,34 +3026,34 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
// Tabulate the occurences of haplotypes at this locus.
//
for (int p = 0; p < pop_cnt; p++) {
- start = pop_indexes[pop_ids[p]].first;
- end = pop_indexes[pop_ids[p]].second;
- pop_id = pop_ids[p];
-
- for (int i = start; i <= end; i++) {
- if (d[i] == NULL) continue;
-
- if (d[i]->obshap.size() > 2) {
- continue;
-
- } else if (d[i]->obshap.size() == 1) {
- if(!uncalled_haplotype(d[i]->obshap[0])) {
- loc_hap_index[d[i]->obshap[0]]++;
- loc_haplotypes.push_back(d[i]->obshap[0]);
- loc_haplotypes.push_back(d[i]->obshap[0]);
- pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
- pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
- }
- } else {
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- if(!uncalled_haplotype(d[i]->obshap[0])) {
- loc_hap_index[d[i]->obshap[j]]++;
- loc_haplotypes.push_back(d[i]->obshap[j]);
- pop_haplotypes[pop_id].push_back(d[i]->obshap[j]);
- }
- }
- }
- }
+ start = pop_indexes[pop_ids[p]].first;
+ end = pop_indexes[pop_ids[p]].second;
+ pop_id = pop_ids[p];
+
+ for (int i = start; i <= end; i++) {
+ if (d[i] == NULL) continue;
+
+ if (d[i]->obshap.size() > 2) {
+ continue;
+
+ } else if (d[i]->obshap.size() == 1) {
+ if(!uncalled_haplotype(d[i]->obshap[0])) {
+ loc_hap_index[d[i]->obshap[0]]++;
+ loc_haplotypes.push_back(d[i]->obshap[0]);
+ loc_haplotypes.push_back(d[i]->obshap[0]);
+ pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
+ pop_haplotypes[pop_id].push_back(d[i]->obshap[0]);
+ }
+ } else {
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ if(!uncalled_haplotype(d[i]->obshap[0])) {
+ loc_hap_index[d[i]->obshap[j]]++;
+ loc_haplotypes.push_back(d[i]->obshap[j]);
+ pop_haplotypes[pop_id].push_back(d[i]->obshap[j]);
+ }
+ }
+ }
+ }
}
//
@@ -3061,9 +3061,9 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
//
double valid_pop_cnt = 0.0;
for (int p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
- if (pop_haplotypes[pop_id].size() > 0)
- valid_pop_cnt++;
+ pop_id = pop_ids[p];
+ if (pop_haplotypes[pop_id].size() > 0)
+ valid_pop_cnt++;
}
//
@@ -3072,19 +3072,19 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
//
set<int> uniq_grps;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = pit->first;
+ pop_id = pit->first;
- if (pop_haplotypes.count(pop_id) > 0) {
- uniq_grps.insert(pop_grp_key[pop_id]);
- grp_members[pop_grp_key[pop_id]].push_back(pop_id);
- }
+ if (pop_haplotypes.count(pop_id) > 0) {
+ uniq_grps.insert(pop_grp_key[pop_id]);
+ grp_members[pop_grp_key[pop_id]].push_back(pop_id);
+ }
}
set<int>::iterator uit;
for (uit = uniq_grps.begin(); uit != uniq_grps.end(); uit++)
- grps.push_back(*uit);
+ grps.push_back(*uit);
if (grps.size() == 0)
- return NULL;
+ return NULL;
// cerr << "Groups: ";
// for (uint i = 0; i < grps.size(); i++)
@@ -3093,7 +3093,7 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
// for (git = grp_members.begin(); git != grp_members.end(); git++) {
// cerr << "Group " << git->first << ": ";
// for (uint i = 0; i < git->second.size(); i++)
- // cerr << git->second[i] << ", ";
+ // cerr << git->second[i] << ", ";
// cerr << "\n";
// }
@@ -3102,8 +3102,8 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
//
uint m = 0;
for (hit = loc_hap_index.begin(); hit != loc_hap_index.end(); hit++) {
- loc_hap_index[hit->first] = m;
- m++;
+ loc_hap_index[hit->first] = m;
+ m++;
}
//
@@ -3112,10 +3112,10 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
double **hdists = new double *[loc_hap_index.size()];
double **hdists_max = new double *[loc_hap_index.size()];
for (uint k = 0; k < loc_hap_index.size(); k++) {
- hdists[k] = new double[loc_hap_index.size()];
- memset(hdists[k], 0, loc_hap_index.size());
- hdists_max[k] = new double[loc_hap_index.size()];
- memset(hdists_max[k], 0, loc_hap_index.size());
+ hdists[k] = new double[loc_hap_index.size()];
+ memset(hdists[k], 0, loc_hap_index.size());
+ hdists_max[k] = new double[loc_hap_index.size()];
+ memset(hdists_max[k], 0, loc_hap_index.size());
}
//
@@ -3146,58 +3146,58 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
double b = 0.0;
for (uint g = 0; g < num_grps; g++) {
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
- tot_cnt += (double) pop_haplotypes[pop_id_1].size();
- }
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
+ tot_cnt += (double) pop_haplotypes[pop_id_1].size();
+ }
}
for (uint g = 0; g < num_grps; g++) {
- grp_cnt = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
- grp_cnt += (double) pop_haplotypes[pop_id_1].size();
- }
-
- a = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
- a += (double) (pop_haplotypes[pop_id_1].size() * pop_haplotypes[pop_id_1].size()) / grp_cnt;
- }
- s_g += a;
+ grp_cnt = 0.0;
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
+ grp_cnt += (double) pop_haplotypes[pop_id_1].size();
+ }
+
+ a = 0.0;
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
+ a += (double) (pop_haplotypes[pop_id_1].size() * pop_haplotypes[pop_id_1].size()) / grp_cnt;
+ }
+ s_g += a;
}
n = (tot_cnt - s_g) / (double) (valid_pop_cnt - num_grps);
// cerr << " n: "<< n << "\n";
if (num_grps > 1) {
- //
- // Calculate n'
- //
- a = 0.0;
- for (uint g = 0; g < num_grps; g++) {
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
- a += ((double) (pop_haplotypes[pop_id_1].size() * pop_haplotypes[pop_id_1].size()) / tot_cnt);
- }
- }
- n_1 = (s_g - a) / (double) (num_grps - 1.0);
-
- // cerr << " n': "<< n_1 << "\n";
-
- //
- // Calculate n''
- //
- for (uint g = 0; g < num_grps; g++) {
- a = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
- a += pop_haplotypes[pop_id_1].size();
- }
- b += ((a * a) / tot_cnt);
- }
- n_2 = (tot_cnt - b) / (double) (num_grps - 1);
-
- // cerr << " n'': "<< n_2 << "\n";
+ //
+ // Calculate n'
+ //
+ a = 0.0;
+ for (uint g = 0; g < num_grps; g++) {
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
+ a += ((double) (pop_haplotypes[pop_id_1].size() * pop_haplotypes[pop_id_1].size()) / tot_cnt);
+ }
+ }
+ n_1 = (s_g - a) / (double) (num_grps - 1.0);
+
+ // cerr << " n': "<< n_1 << "\n";
+
+ //
+ // Calculate n''
+ //
+ for (uint g = 0; g < num_grps; g++) {
+ a = 0.0;
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
+ a += pop_haplotypes[pop_id_1].size();
+ }
+ b += ((a * a) / tot_cnt);
+ }
+ n_2 = (tot_cnt - b) / (double) (num_grps - 1);
+
+ // cerr << " n'': "<< n_2 << "\n";
}
//
@@ -3213,7 +3213,7 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
double sigma_a = 0.0;
if (grps.size() > 1)
- sigma_a = (msd_ag - sigma_c - (n_1 * sigma_b)) / n_2;
+ sigma_a = (msd_ag - sigma_c - (n_1 * sigma_b)) / n_2;
// Arlequin seems to sum the variance components instead of independently calculating sigma_total: MSD(total) = SSD(total)/degrees.of.freedom
double sigma_total = sigma_a + sigma_b + sigma_c; // msd_total;
@@ -3223,11 +3223,11 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
double phi_sc = 0.0;
if (grps.size() > 1) {
- phi_st = sigma_total > 0.0 ? (sigma_a + sigma_b) / sigma_total : 0.0;
- phi_ct = sigma_total > 0.0 ? sigma_a / sigma_total : 0.0;
- phi_sc = (sigma_a + sigma_b) > 0.0 ? sigma_b / (sigma_b + sigma_c) : 0.0;
+ phi_st = sigma_total > 0.0 ? (sigma_a + sigma_b) / sigma_total : 0.0;
+ phi_ct = sigma_total > 0.0 ? sigma_a / sigma_total : 0.0;
+ phi_sc = (sigma_a + sigma_b) > 0.0 ? sigma_b / (sigma_b + sigma_c) : 0.0;
} else {
- phi_st = sigma_total > 0.0 ? sigma_b / sigma_total : 0.0;
+ phi_st = sigma_total > 0.0 ? sigma_b / sigma_total : 0.0;
}
// cerr << " MSD(AG): " << msd_ag << "; MSD(AP/WG): " << msd_ap_wg << "; MSD(WP): " << msd_wp << "; MSD(TOTAL): " << msd_total << "\n"
@@ -3282,22 +3282,22 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
h->popcnt = valid_pop_cnt;
if (log_fst_comp) {
- h->comp = new double[15];
- h->comp[0] = ssd_wp;
- h->comp[1] = ssd_ap_wg;
- h->comp[2] = ssd_ag;
- h->comp[3] = ssd_total;
- h->comp[4] = msd_wp;
- h->comp[5] = msd_ap_wg;
- h->comp[6] = msd_ag;
- h->comp[7] = msd_total;
- h->comp[8] = n;
- h->comp[9] = n_1;
- h->comp[10] = n_2;
- h->comp[11] = sigma_a;
- h->comp[12] = sigma_b;
- h->comp[13] = sigma_c;
- h->comp[14] = sigma_total;
+ h->comp = new double[15];
+ h->comp[0] = ssd_wp;
+ h->comp[1] = ssd_ap_wg;
+ h->comp[2] = ssd_ag;
+ h->comp[3] = ssd_total;
+ h->comp[4] = msd_wp;
+ h->comp[5] = msd_ap_wg;
+ h->comp[6] = msd_ag;
+ h->comp[7] = msd_total;
+ h->comp[8] = n;
+ h->comp[9] = n_1;
+ h->comp[10] = n_2;
+ h->comp[11] = sigma_a;
+ h->comp[12] = sigma_b;
+ h->comp[13] = sigma_c;
+ h->comp[14] = sigma_total;
}
h->stat[0] = phi_st;
@@ -3306,8 +3306,8 @@ haplotype_amova(map<int, int> &pop_grp_key, map<int, pair<int, int> > &pop_index
h->stat[3] = fst_1;
for (uint k = 0; k < loc_hap_index.size(); k++) {
- delete [] hdists[k];
- delete [] hdists_max[k];
+ delete [] hdists[k];
+ delete [] hdists_max[k];
}
delete [] hdists;
delete [] hdists_max;
@@ -3324,14 +3324,14 @@ amova_ssd_total(vector<string> &loc_haplotypes, map<string, int> &loc_hap_index,
double ssd_total = 0.0;
for (uint j = 0; j < loc_haplotypes.size(); j++) {
- for (uint k = 0; k < loc_haplotypes.size(); k++) {
- ssd_total += hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]];
- // cerr << j << "\t"
- // << k << "\t"
- // << loc_haplotypes[j] << "\t"
- // << loc_haplotypes[k] << "\t"
- // << hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]] << "\n";
- }
+ for (uint k = 0; k < loc_haplotypes.size(); k++) {
+ ssd_total += hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]];
+ // cerr << j << "\t"
+ // << k << "\t"
+ // << loc_haplotypes[j] << "\t"
+ // << loc_haplotypes[k] << "\t"
+ // << hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]] << "\n";
+ }
}
ssd_total = (1.0 / (double) (2*loc_haplotypes.size())) * ssd_total;
// cerr << " ssd_total: "<< ssd_total << "\n";
@@ -3341,8 +3341,8 @@ amova_ssd_total(vector<string> &loc_haplotypes, map<string, int> &loc_hap_index,
double
amova_ssd_wp(vector<int> &grps, map<int, vector<int> > &grp_members,
- map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
- double **hdists)
+ map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
+ double **hdists)
{
//
// Calculate the sum of squared deviations within populations, SSD(WP)
@@ -3352,25 +3352,25 @@ amova_ssd_wp(vector<int> &grps, map<int, vector<int> > &grp_members,
int pop_id;
for (uint g = 0; g < grps.size(); g++) {
- for (uint i = 0; i < grp_members[grps[g]].size(); i++) {
- pop_id = grp_members[grps[g]][i];
- ssd = 0.0;
-
- for (uint j = 0; j < pop_haplotypes[pop_id].size(); j++) {
- for (uint k = 0; k < pop_haplotypes[pop_id].size(); k++) {
- ssd += hdists[loc_hap_index[pop_haplotypes[pop_id][j]]][loc_hap_index[pop_haplotypes[pop_id][k]]];
- // cerr << pop_id << "\t"
- // << j << "\t"
- // << k << "\t"
- // << loc_haplotypes[j] << "\t"
- // << loc_haplotypes[k] << "\t"
- // << hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]] << "\n";
- }
- }
-
- if (pop_haplotypes[pop_id].size() > 0)
- ssd_wp += (1.0 / (double) (2*pop_haplotypes[pop_id].size())) * ssd;
- }
+ for (uint i = 0; i < grp_members[grps[g]].size(); i++) {
+ pop_id = grp_members[grps[g]][i];
+ ssd = 0.0;
+
+ for (uint j = 0; j < pop_haplotypes[pop_id].size(); j++) {
+ for (uint k = 0; k < pop_haplotypes[pop_id].size(); k++) {
+ ssd += hdists[loc_hap_index[pop_haplotypes[pop_id][j]]][loc_hap_index[pop_haplotypes[pop_id][k]]];
+ // cerr << pop_id << "\t"
+ // << j << "\t"
+ // << k << "\t"
+ // << loc_haplotypes[j] << "\t"
+ // << loc_haplotypes[k] << "\t"
+ // << hdists[loc_hap_index[loc_haplotypes[j]]][loc_hap_index[loc_haplotypes[k]]] << "\n";
+ }
+ }
+
+ if (pop_haplotypes[pop_id].size() > 0)
+ ssd_wp += (1.0 / (double) (2*pop_haplotypes[pop_id].size())) * ssd;
+ }
}
// cerr << " ssd_wp: "<< ssd_wp << "\n";
@@ -3379,8 +3379,8 @@ amova_ssd_wp(vector<int> &grps, map<int, vector<int> > &grp_members,
double
amova_ssd_ap_wg(vector<int> &grps, map<int, vector<int> > &grp_members,
- map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
- double **hdists_1, double **hdists_2)
+ map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
+ double **hdists_1, double **hdists_2)
{
//
// Calculate the sum of squared deviations across populations and within groups, SSD(AP/WG)
@@ -3394,49 +3394,49 @@ amova_ssd_ap_wg(vector<int> &grps, map<int, vector<int> > &grp_members,
for (uint g = 0; g < grps.size(); g++) {
- ssd_1 = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
+ ssd_1 = 0.0;
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
- for (uint j = 0; j < pop_haplotypes[pop_id_1].size(); j++) {
+ for (uint j = 0; j < pop_haplotypes[pop_id_1].size(); j++) {
- for (uint s = 0; s < grp_members[grps[g]].size(); s++) {
- pop_id_2 = grp_members[grps[g]][s];
+ for (uint s = 0; s < grp_members[grps[g]].size(); s++) {
+ pop_id_2 = grp_members[grps[g]][s];
- for (uint k = 0; k < pop_haplotypes[pop_id_2].size(); k++) {
- if (pop_id_1 == pop_id_2)
- ssd_1 += hdists_1[loc_hap_index[pop_haplotypes[pop_id_1][j]]][loc_hap_index[pop_haplotypes[pop_id_2][k]]];
- else
- ssd_1 += hdists_2[loc_hap_index[pop_haplotypes[pop_id_1][j]]][loc_hap_index[pop_haplotypes[pop_id_2][k]]];
- }
- }
- }
- }
+ for (uint k = 0; k < pop_haplotypes[pop_id_2].size(); k++) {
+ if (pop_id_1 == pop_id_2)
+ ssd_1 += hdists_1[loc_hap_index[pop_haplotypes[pop_id_1][j]]][loc_hap_index[pop_haplotypes[pop_id_2][k]]];
+ else
+ ssd_1 += hdists_2[loc_hap_index[pop_haplotypes[pop_id_1][j]]][loc_hap_index[pop_haplotypes[pop_id_2][k]]];
+ }
+ }
+ }
+ }
- den = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
- den += 2 * pop_haplotypes[pop_id_1].size();
- }
+ den = 0.0;
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
+ den += 2 * pop_haplotypes[pop_id_1].size();
+ }
- ssd_1 = ssd_1 / den;
+ ssd_1 = ssd_1 / den;
- ssd_2 = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id = grp_members[grps[g]][r];
- ssd = 0.0;
+ ssd_2 = 0.0;
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id = grp_members[grps[g]][r];
+ ssd = 0.0;
- for (uint j = 0; j < pop_haplotypes[pop_id].size(); j++) {
- for (uint k = 0; k < pop_haplotypes[pop_id].size(); k++) {
- ssd += hdists_1[loc_hap_index[pop_haplotypes[pop_id][j]]][loc_hap_index[pop_haplotypes[pop_id][k]]];
- }
- }
+ for (uint j = 0; j < pop_haplotypes[pop_id].size(); j++) {
+ for (uint k = 0; k < pop_haplotypes[pop_id].size(); k++) {
+ ssd += hdists_1[loc_hap_index[pop_haplotypes[pop_id][j]]][loc_hap_index[pop_haplotypes[pop_id][k]]];
+ }
+ }
- if (pop_haplotypes[pop_id].size() > 0)
- ssd_2 += (1.0 / (double) (2*pop_haplotypes[pop_id].size())) * ssd;
- }
+ if (pop_haplotypes[pop_id].size() > 0)
+ ssd_2 += (1.0 / (double) (2*pop_haplotypes[pop_id].size())) * ssd;
+ }
- ssd_ap_wg += ssd_1 - ssd_2;
+ ssd_ap_wg += ssd_1 - ssd_2;
}
// cerr << " ssd_ap_wg: "<< ssd_ap_wg << "\n";
@@ -3445,8 +3445,8 @@ amova_ssd_ap_wg(vector<int> &grps, map<int, vector<int> > &grp_members,
double
amova_ssd_ag(vector<int> &grps, map<int, vector<int> > &grp_members,
- map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
- double **hdists, double ssd_total)
+ map<string, int> &loc_hap_index, map<int, vector<string> > &pop_haplotypes,
+ double **hdists, double ssd_total)
{
//
// Calculate the sum of squared deviations across groups, SSD(AG)
@@ -3458,30 +3458,30 @@ amova_ssd_ag(vector<int> &grps, map<int, vector<int> > &grp_members,
double den = 0.0;
for (uint g = 0; g < grps.size(); g++) {
- ssd_1 = 0.0;
+ ssd_1 = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
- for (uint j = 0; j < pop_haplotypes[pop_id_1].size(); j++) {
+ for (uint j = 0; j < pop_haplotypes[pop_id_1].size(); j++) {
- for (uint s = 0; s < grp_members[grps[g]].size(); s++) {
- pop_id_2 = grp_members[grps[g]][s];
+ for (uint s = 0; s < grp_members[grps[g]].size(); s++) {
+ pop_id_2 = grp_members[grps[g]][s];
- for (uint k = 0; k < pop_haplotypes[pop_id_2].size(); k++) {
- ssd_1 += hdists[loc_hap_index[pop_haplotypes[pop_id_1][j]]][loc_hap_index[pop_haplotypes[pop_id_2][k]]];
- }
- }
- }
- }
+ for (uint k = 0; k < pop_haplotypes[pop_id_2].size(); k++) {
+ ssd_1 += hdists[loc_hap_index[pop_haplotypes[pop_id_1][j]]][loc_hap_index[pop_haplotypes[pop_id_2][k]]];
+ }
+ }
+ }
+ }
- den = 0.0;
- for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
- pop_id_1 = grp_members[grps[g]][r];
- den += 2 * pop_haplotypes[pop_id_1].size();
- }
+ den = 0.0;
+ for (uint r = 0; r < grp_members[grps[g]].size(); r++) {
+ pop_id_1 = grp_members[grps[g]][r];
+ den += 2 * pop_haplotypes[pop_id_1].size();
+ }
- ssd += ssd_1 / den;
+ ssd += ssd_1 / den;
}
ssd_ag = ssd_total - ssd;
@@ -3515,57 +3515,57 @@ haplotype_d_est(map<int, pair<int, int> > &pop_indexes, Datum **d, LocSum **s, v
// Tabulate the occurences of haplotypes at this locus.
//
for (uint p = 0; p < pop_cnt; p++) {
- start = pop_indexes[pop_ids[p]].first;
- end = pop_indexes[pop_ids[p]].second;
- pop_id = pop_ids[p];
-
- for (int i = start; i <= end; i++) {
- if (d[i] == NULL) continue;
-
- if (d[i]->obshap.size() > 2) {
- continue;
-
- } else if (d[i]->obshap.size() == 1) {
- loc_haplotypes[d[i]->obshap[0]] += 2;
- pop_haplotypes[pop_id][d[i]->obshap[0]] += 2;
-
- } else {
- for (uint j = 0; j < d[i]->obshap.size(); j++) {
- loc_haplotypes[d[i]->obshap[j]]++;
- pop_haplotypes[pop_id][d[i]->obshap[j]]++;
- }
- }
- }
+ start = pop_indexes[pop_ids[p]].first;
+ end = pop_indexes[pop_ids[p]].second;
+ pop_id = pop_ids[p];
+
+ for (int i = start; i <= end; i++) {
+ if (d[i] == NULL) continue;
+
+ if (d[i]->obshap.size() > 2) {
+ continue;
+
+ } else if (d[i]->obshap.size() == 1) {
+ loc_haplotypes[d[i]->obshap[0]] += 2;
+ pop_haplotypes[pop_id][d[i]->obshap[0]] += 2;
+
+ } else {
+ for (uint j = 0; j < d[i]->obshap.size(); j++) {
+ loc_haplotypes[d[i]->obshap[j]]++;
+ pop_haplotypes[pop_id][d[i]->obshap[j]]++;
+ }
+ }
+ }
- for (it = pop_haplotypes[pop_id].begin(); it != pop_haplotypes[pop_id].end(); it++)
- pop_totals[pop_id] += it->second;
+ for (it = pop_haplotypes[pop_id].begin(); it != pop_haplotypes[pop_id].end(); it++)
+ pop_totals[pop_id] += it->second;
}
double x = 0.0;
for (it = loc_haplotypes.begin(); it != loc_haplotypes.end(); it++) {
- double freq_sum_sq = 0.0;
- double freq_sq_sum = 0.0;
- for (uint p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
- freq_sum_sq += (pop_haplotypes[pop_id][it->first] / pop_totals[pop_id]);
- freq_sq_sum += pow((pop_haplotypes[pop_id][it->first] / pop_totals[pop_id]), 2);
- }
- freq_sum_sq = pow(freq_sum_sq, 2);
+ double freq_sum_sq = 0.0;
+ double freq_sq_sum = 0.0;
+ for (uint p = 0; p < pop_cnt; p++) {
+ pop_id = pop_ids[p];
+ freq_sum_sq += (pop_haplotypes[pop_id][it->first] / pop_totals[pop_id]);
+ freq_sq_sum += pow((pop_haplotypes[pop_id][it->first] / pop_totals[pop_id]), 2);
+ }
+ freq_sum_sq = pow(freq_sum_sq, 2);
- x += (freq_sum_sq - freq_sq_sum) / (pop_cnt - 1);
+ x += (freq_sum_sq - freq_sq_sum) / (pop_cnt - 1);
}
double y = 0.0;
for (it = loc_haplotypes.begin(); it != loc_haplotypes.end(); it++) {
- for (uint p = 0; p < pop_cnt; p++) {
- pop_id = pop_ids[p];
+ for (uint p = 0; p < pop_cnt; p++) {
+ pop_id = pop_ids[p];
- y += (pop_haplotypes[pop_id][it->first] * (pop_haplotypes[pop_id][it->first] - 1)) /
- (pop_totals[pop_id] * (pop_totals[pop_id] - 1));
- }
+ y += (pop_haplotypes[pop_id][it->first] * (pop_haplotypes[pop_id][it->first] - 1)) /
+ (pop_totals[pop_id] * (pop_totals[pop_id] - 1));
+ }
}
double d_est = 1.0 - (x / y);
@@ -3575,7 +3575,7 @@ haplotype_d_est(map<int, pair<int, int> > &pop_indexes, Datum **d, LocSum **s, v
int
calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &pop_indexes,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
+ map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum)
{
map<string, vector<CSLocus *> >::iterator it;
CSLocus *loc;
@@ -3632,127 +3632,127 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
fis_var_all = new double[pop_cnt];
for (int j = 0; j < pop_cnt; j++) {
- private_cnt[j] = 0;
- n[j] = 0.0;
- var_sites[j] = 0.0;
- num_indv_mean[j] = 0.0;
- num_indv_var[j] = 0.0;
- p_mean[j] = 0.0;
- p_var[j] = 0.0;
- obs_het_mean[j] = 0.0;
- obs_het_var[j] = 0.0;
- obs_hom_mean[j] = 0.0;
- obs_hom_var[j] = 0.0;
- exp_het_mean[j] = 0.0;
- exp_het_var[j] = 0.0;
- exp_hom_mean[j] = 0.0;
- exp_hom_var[j] = 0.0;
- pi_mean[j] = 0.0;
- pi_var[j] = 0.0;
- fis_mean[j] = 0.0;
- fis_var[j] = 0.0;
-
- n_all[j] = 0.0;
- num_indv_mean_all[j] = 0.0;
- num_indv_var_all[j] = 0.0;
- p_mean_all[j] = 0.0;
- p_var_all[j] = 0.0;
- obs_het_mean_all[j] = 0.0;
- obs_het_var_all[j] = 0.0;
- obs_hom_mean_all[j] = 0.0;
- obs_hom_var_all[j] = 0.0;
- exp_het_mean_all[j] = 0.0;
- exp_het_var_all[j] = 0.0;
- exp_hom_mean_all[j] = 0.0;
- exp_hom_var_all[j] = 0.0;
- pi_mean_all[j] = 0.0;
- pi_var_all[j] = 0.0;
- fis_mean_all[j] = 0.0;
- fis_var_all[j] = 0.0;
+ private_cnt[j] = 0;
+ n[j] = 0.0;
+ var_sites[j] = 0.0;
+ num_indv_mean[j] = 0.0;
+ num_indv_var[j] = 0.0;
+ p_mean[j] = 0.0;
+ p_var[j] = 0.0;
+ obs_het_mean[j] = 0.0;
+ obs_het_var[j] = 0.0;
+ obs_hom_mean[j] = 0.0;
+ obs_hom_var[j] = 0.0;
+ exp_het_mean[j] = 0.0;
+ exp_het_var[j] = 0.0;
+ exp_hom_mean[j] = 0.0;
+ exp_hom_var[j] = 0.0;
+ pi_mean[j] = 0.0;
+ pi_var[j] = 0.0;
+ fis_mean[j] = 0.0;
+ fis_var[j] = 0.0;
+
+ n_all[j] = 0.0;
+ num_indv_mean_all[j] = 0.0;
+ num_indv_var_all[j] = 0.0;
+ p_mean_all[j] = 0.0;
+ p_var_all[j] = 0.0;
+ obs_het_mean_all[j] = 0.0;
+ obs_het_var_all[j] = 0.0;
+ obs_hom_mean_all[j] = 0.0;
+ obs_hom_var_all[j] = 0.0;
+ exp_het_mean_all[j] = 0.0;
+ exp_het_var_all[j] = 0.0;
+ exp_hom_mean_all[j] = 0.0;
+ exp_hom_var_all[j] = 0.0;
+ pi_mean_all[j] = 0.0;
+ pi_var_all[j] = 0.0;
+ fis_mean_all[j] = 0.0;
+ fis_var_all[j] = 0.0;
}
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
- len = strlen(loc->con);
-
- for (int i = 0; i < len; i++) {
- //
- // Compile private alleles
- //
- if (t->nucs[i].priv_allele >= 0)
- private_cnt[t->nucs[i].priv_allele]++;
-
- if (t->nucs[i].allele_cnt == 2) {
-
- for (int j = 0; j < pop_cnt; j++) {
-
- if (s[j]->nucs[i].num_indv == 0) continue;
-
- n[j]++;
-
- if (s[j]->nucs[i].pi > 0) var_sites[j]++;
-
- num_indv_mean[j] += s[j]->nucs[i].num_indv;
- p_mean[j] += s[j]->nucs[i].p;
- obs_het_mean[j] += s[j]->nucs[i].obs_het;
- obs_hom_mean[j] += s[j]->nucs[i].obs_hom;
- exp_het_mean[j] += s[j]->nucs[i].exp_het;
- exp_hom_mean[j] += s[j]->nucs[i].exp_hom;
- pi_mean[j] += s[j]->nucs[i].stat[0];
- fis_mean[j] += s[j]->nucs[i].stat[1] != -7.0 ? s[j]->nucs[i].stat[1] : 0.0;
-
- n_all[j]++;
- num_indv_mean_all[j] += s[j]->nucs[i].num_indv;
- p_mean_all[j] += s[j]->nucs[i].p;
- obs_het_mean_all[j] += s[j]->nucs[i].obs_het;
- obs_hom_mean_all[j] += s[j]->nucs[i].obs_hom;
- exp_het_mean_all[j] += s[j]->nucs[i].exp_het;
- exp_hom_mean_all[j] += s[j]->nucs[i].exp_hom;
- pi_mean_all[j] += s[j]->nucs[i].stat[0];
- fis_mean_all[j] += s[j]->nucs[i].stat[1] != -7.0 ? s[j]->nucs[i].stat[1] : 0.0;
- }
-
- } else if (t->nucs[i].allele_cnt == 1) {
- for (int j = 0; j < pop_cnt; j++) {
- if (s[j]->nucs[i].num_indv == 0) continue;
-
- n_all[j]++;
- num_indv_mean_all[j] += s[j]->nucs[i].num_indv;
- p_mean_all[j] += s[j]->nucs[i].p;
- obs_het_mean_all[j] += s[j]->nucs[i].obs_het;
- obs_hom_mean_all[j] += s[j]->nucs[i].obs_hom;
- exp_het_mean_all[j] += s[j]->nucs[i].exp_het;
- exp_hom_mean_all[j] += s[j]->nucs[i].exp_hom;
- pi_mean_all[j] += s[j]->nucs[i].stat[0];
- fis_mean_all[j] += s[j]->nucs[i].stat[1] != -7.0 ? s[j]->nucs[i].stat[1] : 0.0;
- }
- }
- }
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+ len = strlen(loc->con);
+
+ for (int i = 0; i < len; i++) {
+ //
+ // Compile private alleles
+ //
+ if (t->nucs[i].priv_allele >= 0)
+ private_cnt[t->nucs[i].priv_allele]++;
+
+ if (t->nucs[i].allele_cnt == 2) {
+
+ for (int j = 0; j < pop_cnt; j++) {
+
+ if (s[j]->nucs[i].num_indv == 0) continue;
+
+ n[j]++;
+
+ if (s[j]->nucs[i].pi > 0) var_sites[j]++;
+
+ num_indv_mean[j] += s[j]->nucs[i].num_indv;
+ p_mean[j] += s[j]->nucs[i].p;
+ obs_het_mean[j] += s[j]->nucs[i].obs_het;
+ obs_hom_mean[j] += s[j]->nucs[i].obs_hom;
+ exp_het_mean[j] += s[j]->nucs[i].exp_het;
+ exp_hom_mean[j] += s[j]->nucs[i].exp_hom;
+ pi_mean[j] += s[j]->nucs[i].stat[0];
+ fis_mean[j] += s[j]->nucs[i].stat[1] != -7.0 ? s[j]->nucs[i].stat[1] : 0.0;
+
+ n_all[j]++;
+ num_indv_mean_all[j] += s[j]->nucs[i].num_indv;
+ p_mean_all[j] += s[j]->nucs[i].p;
+ obs_het_mean_all[j] += s[j]->nucs[i].obs_het;
+ obs_hom_mean_all[j] += s[j]->nucs[i].obs_hom;
+ exp_het_mean_all[j] += s[j]->nucs[i].exp_het;
+ exp_hom_mean_all[j] += s[j]->nucs[i].exp_hom;
+ pi_mean_all[j] += s[j]->nucs[i].stat[0];
+ fis_mean_all[j] += s[j]->nucs[i].stat[1] != -7.0 ? s[j]->nucs[i].stat[1] : 0.0;
+ }
+
+ } else if (t->nucs[i].allele_cnt == 1) {
+ for (int j = 0; j < pop_cnt; j++) {
+ if (s[j]->nucs[i].num_indv == 0) continue;
+
+ n_all[j]++;
+ num_indv_mean_all[j] += s[j]->nucs[i].num_indv;
+ p_mean_all[j] += s[j]->nucs[i].p;
+ obs_het_mean_all[j] += s[j]->nucs[i].obs_het;
+ obs_hom_mean_all[j] += s[j]->nucs[i].obs_hom;
+ exp_het_mean_all[j] += s[j]->nucs[i].exp_het;
+ exp_hom_mean_all[j] += s[j]->nucs[i].exp_hom;
+ pi_mean_all[j] += s[j]->nucs[i].stat[0];
+ fis_mean_all[j] += s[j]->nucs[i].stat[1] != -7.0 ? s[j]->nucs[i].stat[1] : 0.0;
+ }
+ }
+ }
+ }
}
for (int j = 0; j < pop_cnt; j++) {
- num_indv_mean[j] = num_indv_mean[j] / n[j];
- p_mean[j] = p_mean[j] / n[j];
- obs_het_mean[j] = obs_het_mean[j] / n[j];
- obs_hom_mean[j] = obs_hom_mean[j] / n[j];
- exp_het_mean[j] = exp_het_mean[j] / n[j];
- exp_hom_mean[j] = exp_hom_mean[j] / n[j];
- pi_mean[j] = pi_mean[j] / n[j];
- fis_mean[j] = fis_mean[j] / n[j];
-
- num_indv_mean_all[j] = num_indv_mean_all[j] / n_all[j];
- p_mean_all[j] = p_mean_all[j] / n_all[j];
- obs_het_mean_all[j] = obs_het_mean_all[j] / n_all[j];
- obs_hom_mean_all[j] = obs_hom_mean_all[j] / n_all[j];
- exp_het_mean_all[j] = exp_het_mean_all[j] / n_all[j];
- exp_hom_mean_all[j] = exp_hom_mean_all[j] / n_all[j];
- pi_mean_all[j] = pi_mean_all[j] / n_all[j];
- fis_mean_all[j] = fis_mean_all[j] / n_all[j];
+ num_indv_mean[j] = num_indv_mean[j] / n[j];
+ p_mean[j] = p_mean[j] / n[j];
+ obs_het_mean[j] = obs_het_mean[j] / n[j];
+ obs_hom_mean[j] = obs_hom_mean[j] / n[j];
+ exp_het_mean[j] = exp_het_mean[j] / n[j];
+ exp_hom_mean[j] = exp_hom_mean[j] / n[j];
+ pi_mean[j] = pi_mean[j] / n[j];
+ fis_mean[j] = fis_mean[j] / n[j];
+
+ num_indv_mean_all[j] = num_indv_mean_all[j] / n_all[j];
+ p_mean_all[j] = p_mean_all[j] / n_all[j];
+ obs_het_mean_all[j] = obs_het_mean_all[j] / n_all[j];
+ obs_hom_mean_all[j] = obs_hom_mean_all[j] / n_all[j];
+ exp_het_mean_all[j] = exp_het_mean_all[j] / n_all[j];
+ exp_hom_mean_all[j] = exp_hom_mean_all[j] / n_all[j];
+ pi_mean_all[j] = pi_mean_all[j] / n_all[j];
+ fis_mean_all[j] = fis_mean_all[j] / n_all[j];
}
stringstream pop_name;
@@ -3763,7 +3763,7 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
cerr << "Error opening sumstats file '" << file << "'\n";
- exit(1);
+ exit(1);
}
fh.precision(fieldw);
fh.setf(std::ios::fixed);
@@ -3775,14 +3775,14 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
//
map<int, pair<int, int> >::iterator pit;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- start = pit->second.first;
- end = pit->second.second;
- fh << "# " << pit->first << "\t";
- for (int i = start; i <= end; i++) {
- fh << files[i].second;
- if (i < end) fh << ",";
- }
- fh << "\n";
+ start = pit->second.first;
+ end = pit->second.second;
+ fh << "# " << pit->first << "\t";
+ for (int i = start; i <= end; i++) {
+ fh << files[i].second;
+ if (i < end) fh << ",";
+ }
+ fh << "\n";
}
cerr << "Writing " << catalog.size() << " loci to summary statistics file, '" << file << "'\n";
@@ -3810,124 +3810,124 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
<< "Private" << "\n";
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
- len = strlen(loc->con);
-
- for (int i = 0; i < len; i++) {
-
- //
- // If this site is fixed in all populations, DON'T output it. If it is variable,
- // or fixed within populations but variable among, DO output it.
- //
- if (t->nucs[i].allele_cnt == 2) {
-
- for (int j = 0; j < pop_cnt; j++) {
-
- if (s[j]->nucs[i].num_indv == 0) continue;
-
- fh << batch_id << "\t"
- << loc->id << "\t"
- << loc->loc.chr << "\t"
- << loc->sort_bp(i) + 1 << "\t"
- << i << "\t"
- << pop_key[psum->rev_pop_index(j)] << "\t";
-
- //
- // Output the p and q alleles in the same order in each population.
- //
- if (t->nucs[i].p_allele == s[j]->nucs[i].p_nuc) {
- if (s[j]->nucs[i].q_nuc == 0)
- fh << s[j]->nucs[i].p_nuc << "\t" << "-";
- else
- fh << s[j]->nucs[i].p_nuc << "\t" << s[j]->nucs[i].q_nuc;
- p_freq = s[j]->nucs[i].p;
-
- } else {
- if (s[j]->nucs[i].q_nuc == 0)
- fh << "-\t" << s[j]->nucs[i].p_nuc;
- else
- fh << s[j]->nucs[i].q_nuc << "\t" << s[j]->nucs[i].p_nuc;
- p_freq = 1 - s[j]->nucs[i].p;
- }
-
- fh << "\t" << (int) s[j]->nucs[i].num_indv << "\t"
- << std::setprecision(8) << p_freq << "\t"
- << std::setprecision(fieldw) << s[j]->nucs[i].obs_het << "\t"
- << s[j]->nucs[i].obs_hom << "\t"
- << s[j]->nucs[i].exp_het << "\t"
- << s[j]->nucs[i].exp_hom << "\t"
- << s[j]->nucs[i].stat[0] << "\t" // Pi
- << s[j]->nucs[i].smoothed[0] << "\t" // Smoothed Pi
- << s[j]->nucs[i].bs[0] << "\t" // Pi bootstrapped p-value
- << (s[j]->nucs[i].stat[1] == -7.0 ? 0.0 : s[j]->nucs[i].stat[1]) << "\t" // Fis
- << s[j]->nucs[i].smoothed[1] << "\t" // Smoothed Fis
- << s[j]->nucs[i].bs[1] << "\t"; // Fis bootstrapped p-value.
- (t->nucs[i].priv_allele == j) ? fh << "1\n" : fh << "0\n";
-
- //
- // Tabulate the residuals to calculate the variance.
- //
- num_indv_var[j] += pow((s[j]->nucs[i].num_indv - num_indv_mean[j]), 2);
- p_var[j] += pow((s[j]->nucs[i].p - p_mean[j]), 2);
- obs_het_var[j] += pow((s[j]->nucs[i].obs_het - obs_het_mean[j]), 2);
- obs_hom_var[j] += pow((s[j]->nucs[i].obs_hom - obs_hom_mean[j]), 2);
- exp_het_var[j] += pow((s[j]->nucs[i].exp_het - exp_het_mean[j]), 2);
- exp_hom_var[j] += pow((s[j]->nucs[i].exp_hom - exp_hom_mean[j]), 2);
- pi_var[j] += pow((s[j]->nucs[i].stat[0] - pi_mean[j]), 2);
- fis_var[j] += pow((s[j]->nucs[i].stat[1] - fis_mean[j]), 2);
-
- num_indv_var_all[j] += pow((s[j]->nucs[i].num_indv - num_indv_mean_all[j]), 2);
- p_var_all[j] += pow((s[j]->nucs[i].p - p_mean_all[j]), 2);
- obs_het_var_all[j] += pow((s[j]->nucs[i].obs_het - obs_het_mean_all[j]), 2);
- obs_hom_var_all[j] += pow((s[j]->nucs[i].obs_hom - obs_hom_mean_all[j]), 2);
- exp_het_var_all[j] += pow((s[j]->nucs[i].exp_het - exp_het_mean_all[j]), 2);
- exp_hom_var_all[j] += pow((s[j]->nucs[i].exp_hom - exp_hom_mean_all[j]), 2);
- pi_var_all[j] += pow((s[j]->nucs[i].stat[0] - pi_mean_all[j]), 2);
- fis_var_all[j] += pow((s[j]->nucs[i].stat[1] - fis_mean_all[j]), 2);
- }
- } else if (t->nucs[i].allele_cnt == 1) {
- for (int j = 0; j < pop_cnt; j++) {
- if (s[j]->nucs[i].num_indv == 0) continue;
-
- num_indv_var_all[j] += pow((s[j]->nucs[i].num_indv - num_indv_mean_all[j]), 2);
- p_var_all[j] += pow((s[j]->nucs[i].p - p_mean_all[j]), 2);
- obs_het_var_all[j] += pow((s[j]->nucs[i].obs_het - obs_het_mean_all[j]), 2);
- obs_hom_var_all[j] += pow((s[j]->nucs[i].obs_hom - obs_hom_mean_all[j]), 2);
- exp_het_var_all[j] += pow((s[j]->nucs[i].exp_het - exp_het_mean_all[j]), 2);
- exp_hom_var_all[j] += pow((s[j]->nucs[i].exp_hom - exp_hom_mean_all[j]), 2);
- pi_var_all[j] += pow((s[j]->nucs[i].stat[0] - pi_mean_all[j]), 2);
- fis_var_all[j] += pow((s[j]->nucs[i].stat[1] - fis_mean_all[j]), 2);
- }
- }
- }
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+ len = strlen(loc->con);
+
+ for (int i = 0; i < len; i++) {
+
+ //
+ // If this site is fixed in all populations, DON'T output it. If it is variable,
+ // or fixed within populations but variable among, DO output it.
+ //
+ if (t->nucs[i].allele_cnt == 2) {
+
+ for (int j = 0; j < pop_cnt; j++) {
+
+ if (s[j]->nucs[i].num_indv == 0) continue;
+
+ fh << batch_id << "\t"
+ << loc->id << "\t"
+ << loc->loc.chr << "\t"
+ << loc->sort_bp(i) + 1 << "\t"
+ << i << "\t"
+ << pop_key[psum->rev_pop_index(j)] << "\t";
+
+ //
+ // Output the p and q alleles in the same order in each population.
+ //
+ if (t->nucs[i].p_allele == s[j]->nucs[i].p_nuc) {
+ if (s[j]->nucs[i].q_nuc == 0)
+ fh << s[j]->nucs[i].p_nuc << "\t" << "-";
+ else
+ fh << s[j]->nucs[i].p_nuc << "\t" << s[j]->nucs[i].q_nuc;
+ p_freq = s[j]->nucs[i].p;
+
+ } else {
+ if (s[j]->nucs[i].q_nuc == 0)
+ fh << "-\t" << s[j]->nucs[i].p_nuc;
+ else
+ fh << s[j]->nucs[i].q_nuc << "\t" << s[j]->nucs[i].p_nuc;
+ p_freq = 1 - s[j]->nucs[i].p;
+ }
+
+ fh << "\t" << (int) s[j]->nucs[i].num_indv << "\t"
+ << std::setprecision(8) << p_freq << "\t"
+ << std::setprecision(fieldw) << s[j]->nucs[i].obs_het << "\t"
+ << s[j]->nucs[i].obs_hom << "\t"
+ << s[j]->nucs[i].exp_het << "\t"
+ << s[j]->nucs[i].exp_hom << "\t"
+ << s[j]->nucs[i].stat[0] << "\t" // Pi
+ << s[j]->nucs[i].smoothed[0] << "\t" // Smoothed Pi
+ << s[j]->nucs[i].bs[0] << "\t" // Pi bootstrapped p-value
+ << (s[j]->nucs[i].stat[1] == -7.0 ? 0.0 : s[j]->nucs[i].stat[1]) << "\t" // Fis
+ << s[j]->nucs[i].smoothed[1] << "\t" // Smoothed Fis
+ << s[j]->nucs[i].bs[1] << "\t"; // Fis bootstrapped p-value.
+ (t->nucs[i].priv_allele == j) ? fh << "1\n" : fh << "0\n";
+
+ //
+ // Tabulate the residuals to calculate the variance.
+ //
+ num_indv_var[j] += pow((s[j]->nucs[i].num_indv - num_indv_mean[j]), 2);
+ p_var[j] += pow((s[j]->nucs[i].p - p_mean[j]), 2);
+ obs_het_var[j] += pow((s[j]->nucs[i].obs_het - obs_het_mean[j]), 2);
+ obs_hom_var[j] += pow((s[j]->nucs[i].obs_hom - obs_hom_mean[j]), 2);
+ exp_het_var[j] += pow((s[j]->nucs[i].exp_het - exp_het_mean[j]), 2);
+ exp_hom_var[j] += pow((s[j]->nucs[i].exp_hom - exp_hom_mean[j]), 2);
+ pi_var[j] += pow((s[j]->nucs[i].stat[0] - pi_mean[j]), 2);
+ fis_var[j] += pow((s[j]->nucs[i].stat[1] - fis_mean[j]), 2);
+
+ num_indv_var_all[j] += pow((s[j]->nucs[i].num_indv - num_indv_mean_all[j]), 2);
+ p_var_all[j] += pow((s[j]->nucs[i].p - p_mean_all[j]), 2);
+ obs_het_var_all[j] += pow((s[j]->nucs[i].obs_het - obs_het_mean_all[j]), 2);
+ obs_hom_var_all[j] += pow((s[j]->nucs[i].obs_hom - obs_hom_mean_all[j]), 2);
+ exp_het_var_all[j] += pow((s[j]->nucs[i].exp_het - exp_het_mean_all[j]), 2);
+ exp_hom_var_all[j] += pow((s[j]->nucs[i].exp_hom - exp_hom_mean_all[j]), 2);
+ pi_var_all[j] += pow((s[j]->nucs[i].stat[0] - pi_mean_all[j]), 2);
+ fis_var_all[j] += pow((s[j]->nucs[i].stat[1] - fis_mean_all[j]), 2);
+ }
+ } else if (t->nucs[i].allele_cnt == 1) {
+ for (int j = 0; j < pop_cnt; j++) {
+ if (s[j]->nucs[i].num_indv == 0) continue;
+
+ num_indv_var_all[j] += pow((s[j]->nucs[i].num_indv - num_indv_mean_all[j]), 2);
+ p_var_all[j] += pow((s[j]->nucs[i].p - p_mean_all[j]), 2);
+ obs_het_var_all[j] += pow((s[j]->nucs[i].obs_het - obs_het_mean_all[j]), 2);
+ obs_hom_var_all[j] += pow((s[j]->nucs[i].obs_hom - obs_hom_mean_all[j]), 2);
+ exp_het_var_all[j] += pow((s[j]->nucs[i].exp_het - exp_het_mean_all[j]), 2);
+ exp_hom_var_all[j] += pow((s[j]->nucs[i].exp_hom - exp_hom_mean_all[j]), 2);
+ pi_var_all[j] += pow((s[j]->nucs[i].stat[0] - pi_mean_all[j]), 2);
+ fis_var_all[j] += pow((s[j]->nucs[i].stat[1] - fis_mean_all[j]), 2);
+ }
+ }
+ }
+ }
}
//
// Calculate the variance.
//
for (int j = 0; j < pop_cnt; j++) {
- num_indv_var[j] = num_indv_var[j] / (n[j] - 1);
- p_var[j] = p_var[j] / (n[j] - 1);
- obs_het_var[j] = obs_het_var[j] / (n[j] - 1);
- obs_hom_var[j] = obs_hom_var[j] / (n[j] - 1);
- exp_het_var[j] = exp_het_var[j] / (n[j] - 1);
- exp_hom_var[j] = exp_hom_var[j] / (n[j] - 1);
- pi_var[j] = pi_var[j] / (n[j] - 1);
- fis_var[j] = fis_var[j] / (n[j] - 1);
-
- num_indv_var_all[j] = num_indv_var_all[j] / (n_all[j] - 1);
- p_var_all[j] = p_var_all[j] / (n_all[j] - 1);
- obs_het_var_all[j] = obs_het_var_all[j] / (n_all[j] - 1);
- obs_hom_var_all[j] = obs_hom_var_all[j] / (n_all[j] - 1);
- exp_het_var_all[j] = exp_het_var_all[j] / (n_all[j] - 1);
- exp_hom_var_all[j] = exp_hom_var_all[j] / (n_all[j] - 1);
- pi_var_all[j] = pi_var_all[j] / (n_all[j] - 1);
- fis_var_all[j] = fis_var_all[j] / (n_all[j] - 1);
+ num_indv_var[j] = num_indv_var[j] / (n[j] - 1);
+ p_var[j] = p_var[j] / (n[j] - 1);
+ obs_het_var[j] = obs_het_var[j] / (n[j] - 1);
+ obs_hom_var[j] = obs_hom_var[j] / (n[j] - 1);
+ exp_het_var[j] = exp_het_var[j] / (n[j] - 1);
+ exp_hom_var[j] = exp_hom_var[j] / (n[j] - 1);
+ pi_var[j] = pi_var[j] / (n[j] - 1);
+ fis_var[j] = fis_var[j] / (n[j] - 1);
+
+ num_indv_var_all[j] = num_indv_var_all[j] / (n_all[j] - 1);
+ p_var_all[j] = p_var_all[j] / (n_all[j] - 1);
+ obs_het_var_all[j] = obs_het_var_all[j] / (n_all[j] - 1);
+ obs_hom_var_all[j] = obs_hom_var_all[j] / (n_all[j] - 1);
+ exp_het_var_all[j] = exp_het_var_all[j] / (n_all[j] - 1);
+ exp_hom_var_all[j] = exp_hom_var_all[j] / (n_all[j] - 1);
+ pi_var_all[j] = pi_var_all[j] / (n_all[j] - 1);
+ fis_var_all[j] = fis_var_all[j] / (n_all[j] - 1);
}
fh.close();
@@ -3941,7 +3941,7 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
if (fh.fail()) {
cerr << "Error opening sumstats summary file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -3979,37 +3979,37 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
double *sq_n_all = new double[pop_cnt];
for (int j = 0; j < pop_cnt; j++) {
- sq_n[j] = sqrt(n[j]);
- sq_n_all[j] = sqrt(n_all[j]);
+ sq_n[j] = sqrt(n[j]);
+ sq_n_all[j] = sqrt(n_all[j]);
}
for (int j = 0; j < pop_cnt; j++)
- fh << pop_key[psum->rev_pop_index(j)] << "\t"
- << private_cnt[j] << "\t"
- << num_indv_mean[j] << "\t"
- << num_indv_var[j] << "\t"
- << sqrt(num_indv_var[j]) / sq_n[j] << "\t"
- << p_mean[j] << "\t"
- << p_var[j] << "\t"
- << sqrt(p_var[j]) / sq_n[j] << "\t"
- << obs_het_mean[j] << "\t"
- << obs_het_var[j] << "\t"
- << sqrt(obs_het_var[j]) / sq_n[j] << "\t"
- << obs_hom_mean[j] << "\t"
- << obs_hom_var[j] << "\t"
- << sqrt(obs_hom_var[j]) / sq_n[j] << "\t"
- << exp_het_mean[j] << "\t"
- << exp_het_var[j] << "\t"
- << sqrt(exp_het_var[j]) / sq_n[j] << "\t"
- << exp_hom_mean[j] << "\t"
- << exp_hom_var[j] << "\t"
- << sqrt(exp_hom_var[j]) / sq_n[j] << "\t"
- << pi_mean[j] << "\t"
- << pi_var[j] << "\t"
- << sqrt(pi_var[j]) / sq_n[j] << "\t"
- << fis_mean[j] << "\t"
- << fis_var[j] << "\t"
- << sqrt(num_indv_var[j]) / sq_n[j] << "\n";
+ fh << pop_key[psum->rev_pop_index(j)] << "\t"
+ << private_cnt[j] << "\t"
+ << num_indv_mean[j] << "\t"
+ << num_indv_var[j] << "\t"
+ << sqrt(num_indv_var[j]) / sq_n[j] << "\t"
+ << p_mean[j] << "\t"
+ << p_var[j] << "\t"
+ << sqrt(p_var[j]) / sq_n[j] << "\t"
+ << obs_het_mean[j] << "\t"
+ << obs_het_var[j] << "\t"
+ << sqrt(obs_het_var[j]) / sq_n[j] << "\t"
+ << obs_hom_mean[j] << "\t"
+ << obs_hom_var[j] << "\t"
+ << sqrt(obs_hom_var[j]) / sq_n[j] << "\t"
+ << exp_het_mean[j] << "\t"
+ << exp_het_var[j] << "\t"
+ << sqrt(exp_het_var[j]) / sq_n[j] << "\t"
+ << exp_hom_mean[j] << "\t"
+ << exp_hom_var[j] << "\t"
+ << sqrt(exp_hom_var[j]) / sq_n[j] << "\t"
+ << pi_mean[j] << "\t"
+ << pi_var[j] << "\t"
+ << sqrt(pi_var[j]) / sq_n[j] << "\t"
+ << fis_mean[j] << "\t"
+ << fis_var[j] << "\t"
+ << sqrt(num_indv_var[j]) / sq_n[j] << "\n";
fh << "# All positions (variant and fixed)\n"
<< "# Pop ID\t"
@@ -4044,36 +4044,36 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
<< "StdErr\n";
for (int j = 0; j < pop_cnt; j++) {
- fh << pop_key[psum->rev_pop_index(j)] << "\t"
- << private_cnt[j] << "\t"
- << n_all[j] << "\t"
- << n[j] << "\t"
- << var_sites[j] << "\t"
- << var_sites[j] / n_all[j] * 100 << "\t"
- << num_indv_mean_all[j] << "\t"
- << num_indv_var_all[j] << "\t"
- << sqrt(num_indv_var_all[j]) / sq_n_all[j] << "\t"
- << p_mean_all[j] << "\t"
- << p_var_all[j] << "\t"
- << sqrt(p_var_all[j]) / sq_n_all[j] << "\t"
- << obs_het_mean_all[j] << "\t"
- << obs_het_var_all[j] << "\t"
- << sqrt(obs_het_var_all[j]) / sq_n_all[j] << "\t"
- << obs_hom_mean_all[j] << "\t"
- << obs_hom_var_all[j] << "\t"
- << sqrt(obs_hom_var_all[j]) / sq_n_all[j] << "\t"
- << exp_het_mean_all[j] << "\t"
- << exp_het_var_all[j] << "\t"
- << sqrt(exp_het_var_all[j]) / sq_n_all[j] << "\t"
- << exp_hom_mean_all[j] << "\t"
- << exp_hom_var_all[j] << "\t"
- << sqrt(exp_hom_var_all[j]) / sq_n_all[j] << "\t"
- << pi_mean_all[j] << "\t"
- << pi_var_all[j] << "\t"
- << sqrt(pi_var_all[j]) / sq_n_all[j] << "\t"
- << fis_mean_all[j] << "\t"
- << fis_var_all[j] << "\t"
- << sqrt(num_indv_var_all[j]) / sq_n_all[j] << "\n";
+ fh << pop_key[psum->rev_pop_index(j)] << "\t"
+ << private_cnt[j] << "\t"
+ << n_all[j] << "\t"
+ << n[j] << "\t"
+ << var_sites[j] << "\t"
+ << var_sites[j] / n_all[j] * 100 << "\t"
+ << num_indv_mean_all[j] << "\t"
+ << num_indv_var_all[j] << "\t"
+ << sqrt(num_indv_var_all[j]) / sq_n_all[j] << "\t"
+ << p_mean_all[j] << "\t"
+ << p_var_all[j] << "\t"
+ << sqrt(p_var_all[j]) / sq_n_all[j] << "\t"
+ << obs_het_mean_all[j] << "\t"
+ << obs_het_var_all[j] << "\t"
+ << sqrt(obs_het_var_all[j]) / sq_n_all[j] << "\t"
+ << obs_hom_mean_all[j] << "\t"
+ << obs_hom_var_all[j] << "\t"
+ << sqrt(obs_hom_var_all[j]) / sq_n_all[j] << "\t"
+ << exp_het_mean_all[j] << "\t"
+ << exp_het_var_all[j] << "\t"
+ << sqrt(exp_het_var_all[j]) / sq_n_all[j] << "\t"
+ << exp_hom_mean_all[j] << "\t"
+ << exp_hom_var_all[j] << "\t"
+ << sqrt(exp_hom_var_all[j]) / sq_n_all[j] << "\t"
+ << pi_mean_all[j] << "\t"
+ << pi_var_all[j] << "\t"
+ << sqrt(pi_var_all[j]) / sq_n_all[j] << "\t"
+ << fis_mean_all[j] << "\t"
+ << fis_var_all[j] << "\t"
+ << sqrt(num_indv_var_all[j]) / sq_n_all[j] << "\n";
}
delete [] private_cnt;
@@ -4123,7 +4123,7 @@ calculate_summary_stats(vector<pair<int, string> > &files, map<int, pair<int, in
int
write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &pop_indexes,
- map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum, ofstream &log_fh)
+ map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum, ofstream &log_fh)
{
//
// We want to iterate over each pair of populations and calculate Fst at each
@@ -4133,7 +4133,7 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
vector<int> pops;
map<int, pair<int, int> >::iterator pit;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- pops.push_back(pit->first);
+ pops.push_back(pit->first);
if (pops.size() == 1) return 0;
@@ -4144,244 +4144,244 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
KSmooth<PopPair> *ks;
Bootstrap<PopPair> *bs;
if (kernel_smoothed && loci_ordered)
- ks = new KSmooth<PopPair>(2);
+ ks = new KSmooth<PopPair>(2);
for (uint i = 0; i < pops.size(); i++) {
- for (uint j = i + 1; j < pops.size(); j++) {
- int pop_1 = pops[i];
- int pop_2 = pops[j];
-
- double sum = 0.0;
- double cnt = 0.0;
-
- stringstream pop_name;
- pop_name << "batch_" << batch_id << ".fst_" << pop_key[pop_1] << "-" << pop_key[pop_2] << ".tsv";
-
- string file = in_path + pop_name.str();
- ofstream fh(file.c_str(), ofstream::out);
- if (fh.fail()) {
- cerr << "Error opening Fst output file '" << file << "'\n";
- exit(1);
- }
- fh.precision(fieldw);
- fh.setf(std::ios::fixed);
-
- cerr << "Calculating Fst for populations '" << pop_key[pop_1] << "' and '" << pop_key[pop_2] << "' and writing it to file, '" << file << "'\n";
-
- fh << "# Batch ID" << "\t"
- << "Locus ID" << "\t"
- << "Pop 1 ID" << "\t"
- << "Pop 2 ID" << "\t"
- << "Chr" << "\t"
- << "BP" << "\t"
- << "Column" << "\t"
- << "Overall Pi" << "\t"
- << "Fst" << "\t"
- << "Fisher's P" << "\t"
- << "Odds Ratio" << "\t"
- << "CI Low" << "\t"
- << "CI High" << "\t"
- << "LOD" << "\t"
- << "Corrected Fst" << "\t"
- << "Smoothed Fst" << "\t"
- << "AMOVA Fst" << "\t"
- << "Corrected AMOVA Fst" << "\t"
- << "Smoothed AMOVA Fst" << "\t"
- << "Smoothed AMOVA Fst P-value" << "\t"
- << "Window SNP Count";
-
- //
- // If requested, log Fst component calculations to a file.
- //
- if (log_fst_comp) {
- fh << "\t"
- << "n_1" << "\t"
- << "n_2" << "\t"
- << "tot_alleles" << "\t"
- << "p_1" << "\t"
- << "q_1" << "\t"
- << "p_2" << "\t"
- << "q_2" << "\t"
- << "pi_1" << "\t"
- << "pi_2" << "\t"
- << "pi_all" << "\t"
- << "bcoeff_1" << "\t"
- << "bcoeff_2" << "\t"
- << "binomial_fst" << "\t"
- << "p_1_freq" << "\t"
- << "q_1_freq" << "\t"
- << "p_2_freq" << "\t"
- << "q_2_freq" << "\t"
- << "p_avg_cor" << "\t"
- << "n_avg_cor" << "\t"
- << "amova_fst" << "\n";
- } else {
- fh << "\n";
- }
-
- if (bootstrap_fst)
- bs = new Bootstrap<PopPair>(2);
-
- map<string, vector<CSLocus *> >::iterator it;
- map<string, vector<PopPair *> > genome_pairs;
- // int snp_dist[max_snp_dist] = {0};
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- string chr = it->first;
-
- map<uint, uint> pairs_key;
- vector<PopPair *> &pairs = genome_pairs[chr];
-
- //
- // Order loci between the two populations and calculate Fst
- //
- ord->order(pairs, pairs_key, it->second, pop_1, pop_2);
-
- //
- // Apply user-selected correction to the Fst values.
- //
- double correction;
- switch(fst_correction) {
- case p_value:
- for (uint i = 0; i < pairs.size(); i++) {
- if (pairs[i] != NULL) {
- pairs[i]->stat[0] = pairs[i]->fet_p < p_value_cutoff ? pairs[i]->fst : 0;
- pairs[i]->stat[1] = pairs[i]->fet_p < p_value_cutoff ? pairs[i]->amova_fst : 0;
- }
- }
- break;
- case bonferroni_win:
- correct_fst_bonferroni_win(pairs);
- break;
- case bonferroni_gen:
- correction = p_value_cutoff / catalog.size();
- for (uint i = 0; i < pairs.size(); i++) {
- if (pairs[i] != NULL) {
- pairs[i]->stat[0] = pairs[i]->fet_p < correction ? pairs[i]->fst : 0;
- pairs[i]->stat[1] = pairs[i]->fet_p < correction ? pairs[i]->amova_fst : 0;
- }
- }
- break;
- case no_correction:
- for (uint i = 0; i < pairs.size(); i++) {
- if (pairs[i] != NULL) {
- pairs[i]->stat[0] = pairs[i]->fst;
- pairs[i]->stat[1] = pairs[i]->amova_fst;
- }
- }
- break;
- }
-
- //
- // If bootstrapping is enabled, record all Fst values.
- //
- if (bootstrap_fst)
- bs->add_data(pairs);
-
- //
- // Calculate kernel-smoothed Fst values.
- //
- if (kernel_smoothed && loci_ordered) {
- cerr << " Generating kernel-smoothed Fst for " << it->first << ".\n";
- ks->smooth(pairs);
- }
- }
-
- //
- // If bootstrap resampling method is approximate, generate our single, empirical distribution.
- //
- map<int, vector<double> > approx_fst_dist;
- // if (bootstrap_fst && bootstrap_type == bs_approx)
- // bootstrap_fst_approximate_dist(fst_samples, allele_depth_samples, weights, snp_dist, approx_fst_dist);
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- string chr = it->first;
- vector<PopPair *> &pairs = genome_pairs[chr];
-
- //
- // Bootstrap resample this chromosome.
- //
- if (bootstrap_fst && bootstrap_type == bs_exact) {
- cerr << " Bootstrap resampling kernel-smoothed Fst for " << it->first << ".\n";
- bs->execute(pairs);
- }
-
- for (uint i = 0; i < pairs.size(); i++) {
-
- if (pairs[i] == NULL)
- continue;
-
- //
- // Calculate Fst P-value from approximate distribution.
- //
- // if (bootstrap_fst && bootstrap_type == bs_approx)
- // pairs[i]->bs[0] = bootstrap_approximate_pval(pairs[i]->snp_cnt, pairs[i]->stat[0], approx_fst_dist);
-
- cnt++;
- sum += pairs[i]->stat[1]; // Corrected AMOVA Fst
-
- fh << batch_id << "\t"
- << pairs[i]->loc_id << "\t"
- << pop_key[pop_1] << "\t"
- << pop_key[pop_2] << "\t"
- << chr << "\t"
- << pairs[i]->bp << "\t"
- << pairs[i]->col << "\t"
- << pairs[i]->pi << "\t"
- << pairs[i]->fst << "\t"
- << std::setprecision(9) << pairs[i]->fet_p << "\t"
- << pairs[i]->fet_or << "\t"
- << pairs[i]->ci_low << "\t"
- << pairs[i]->ci_high << "\t"
- << pairs[i]->lod << "\t"
- << pairs[i]->stat[0] << "\t"
- << pairs[i]->smoothed[0] << "\t"
- << pairs[i]->amova_fst << "\t"
- << pairs[i]->stat[1] << "\t"
- << pairs[i]->smoothed[1] << "\t"
- << pairs[i]->bs[1] << "\t"
- << pairs[i]->snp_cnt;
-
- if (log_fst_comp) {
- fh << "\t"
- << pairs[i]->comp[0] << "\t"
- << pairs[i]->comp[1] << "\t"
- << pairs[i]->comp[2] << "\t"
- << pairs[i]->comp[3] << "\t"
- << pairs[i]->comp[4] << "\t"
- << pairs[i]->comp[5] << "\t"
- << pairs[i]->comp[6] << "\t"
- << pairs[i]->comp[7] << "\t"
- << pairs[i]->comp[8] << "\t"
- << pairs[i]->comp[9] << "\t"
- << pairs[i]->comp[10] << "\t"
- << pairs[i]->comp[11] << "\t"
- << pairs[i]->fst << "\t"
- << pairs[i]->comp[12] << "\t"
- << pairs[i]->comp[13] << "\t"
- << pairs[i]->comp[14] << "\t"
- << pairs[i]->comp[15] << "\t"
- << pairs[i]->comp[16] << "\t"
- << pairs[i]->comp[17] << "\t"
- << pairs[i]->amova_fst << "\n";
- } else {
- fh << "\n";
- }
-
- delete pairs[i];
- }
- }
- cerr << "Pop 1: " << pop_key[pop_1] << "; Pop 2: " << pop_key[pop_2] << "; mean Fst: " << (sum / cnt) << "\n";
- means.push_back(sum / cnt);
-
- cerr << "Pooled populations '" << pop_key[pop_1] << "' and '" << pop_key[pop_2] << "' contained: " << ord->incompatible_loci << " incompatible loci; "
- << ord->multiple_loci << " nucleotides covered by more than one RAD locus.\n";
- fh.close();
-
- if (bootstrap_fst)
- delete bs;
- }
+ for (uint j = i + 1; j < pops.size(); j++) {
+ int pop_1 = pops[i];
+ int pop_2 = pops[j];
+
+ double sum = 0.0;
+ double cnt = 0.0;
+
+ stringstream pop_name;
+ pop_name << "batch_" << batch_id << ".fst_" << pop_key[pop_1] << "-" << pop_key[pop_2] << ".tsv";
+
+ string file = in_path + pop_name.str();
+ ofstream fh(file.c_str(), ofstream::out);
+ if (fh.fail()) {
+ cerr << "Error opening Fst output file '" << file << "'\n";
+ exit(1);
+ }
+ fh.precision(fieldw);
+ fh.setf(std::ios::fixed);
+
+ cerr << "Calculating Fst for populations '" << pop_key[pop_1] << "' and '" << pop_key[pop_2] << "' and writing it to file, '" << file << "'\n";
+
+ fh << "# Batch ID" << "\t"
+ << "Locus ID" << "\t"
+ << "Pop 1 ID" << "\t"
+ << "Pop 2 ID" << "\t"
+ << "Chr" << "\t"
+ << "BP" << "\t"
+ << "Column" << "\t"
+ << "Overall Pi" << "\t"
+ << "Fst" << "\t"
+ << "Fisher's P" << "\t"
+ << "Odds Ratio" << "\t"
+ << "CI Low" << "\t"
+ << "CI High" << "\t"
+ << "LOD" << "\t"
+ << "Corrected Fst" << "\t"
+ << "Smoothed Fst" << "\t"
+ << "AMOVA Fst" << "\t"
+ << "Corrected AMOVA Fst" << "\t"
+ << "Smoothed AMOVA Fst" << "\t"
+ << "Smoothed AMOVA Fst P-value" << "\t"
+ << "Window SNP Count";
+
+ //
+ // If requested, log Fst component calculations to a file.
+ //
+ if (log_fst_comp) {
+ fh << "\t"
+ << "n_1" << "\t"
+ << "n_2" << "\t"
+ << "tot_alleles" << "\t"
+ << "p_1" << "\t"
+ << "q_1" << "\t"
+ << "p_2" << "\t"
+ << "q_2" << "\t"
+ << "pi_1" << "\t"
+ << "pi_2" << "\t"
+ << "pi_all" << "\t"
+ << "bcoeff_1" << "\t"
+ << "bcoeff_2" << "\t"
+ << "binomial_fst" << "\t"
+ << "p_1_freq" << "\t"
+ << "q_1_freq" << "\t"
+ << "p_2_freq" << "\t"
+ << "q_2_freq" << "\t"
+ << "p_avg_cor" << "\t"
+ << "n_avg_cor" << "\t"
+ << "amova_fst" << "\n";
+ } else {
+ fh << "\n";
+ }
+
+ if (bootstrap_fst)
+ bs = new Bootstrap<PopPair>(2);
+
+ map<string, vector<CSLocus *> >::iterator it;
+ map<string, vector<PopPair *> > genome_pairs;
+ // int snp_dist[max_snp_dist] = {0};
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ string chr = it->first;
+
+ map<uint, uint> pairs_key;
+ vector<PopPair *> &pairs = genome_pairs[chr];
+
+ //
+ // Order loci between the two populations and calculate Fst
+ //
+ ord->order(pairs, pairs_key, it->second, pop_1, pop_2);
+
+ //
+ // Apply user-selected correction to the Fst values.
+ //
+ double correction;
+ switch(fst_correction) {
+ case p_value:
+ for (uint i = 0; i < pairs.size(); i++) {
+ if (pairs[i] != NULL) {
+ pairs[i]->stat[0] = pairs[i]->fet_p < p_value_cutoff ? pairs[i]->fst : 0;
+ pairs[i]->stat[1] = pairs[i]->fet_p < p_value_cutoff ? pairs[i]->amova_fst : 0;
+ }
+ }
+ break;
+ case bonferroni_win:
+ correct_fst_bonferroni_win(pairs);
+ break;
+ case bonferroni_gen:
+ correction = p_value_cutoff / catalog.size();
+ for (uint i = 0; i < pairs.size(); i++) {
+ if (pairs[i] != NULL) {
+ pairs[i]->stat[0] = pairs[i]->fet_p < correction ? pairs[i]->fst : 0;
+ pairs[i]->stat[1] = pairs[i]->fet_p < correction ? pairs[i]->amova_fst : 0;
+ }
+ }
+ break;
+ case no_correction:
+ for (uint i = 0; i < pairs.size(); i++) {
+ if (pairs[i] != NULL) {
+ pairs[i]->stat[0] = pairs[i]->fst;
+ pairs[i]->stat[1] = pairs[i]->amova_fst;
+ }
+ }
+ break;
+ }
+
+ //
+ // If bootstrapping is enabled, record all Fst values.
+ //
+ if (bootstrap_fst)
+ bs->add_data(pairs);
+
+ //
+ // Calculate kernel-smoothed Fst values.
+ //
+ if (kernel_smoothed && loci_ordered) {
+ cerr << " Generating kernel-smoothed Fst for " << it->first << ".\n";
+ ks->smooth(pairs);
+ }
+ }
+
+ //
+ // If bootstrap resampling method is approximate, generate our single, empirical distribution.
+ //
+ map<int, vector<double> > approx_fst_dist;
+ // if (bootstrap_fst && bootstrap_type == bs_approx)
+ // bootstrap_fst_approximate_dist(fst_samples, allele_depth_samples, weights, snp_dist, approx_fst_dist);
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ string chr = it->first;
+ vector<PopPair *> &pairs = genome_pairs[chr];
+
+ //
+ // Bootstrap resample this chromosome.
+ //
+ if (bootstrap_fst && bootstrap_type == bs_exact) {
+ cerr << " Bootstrap resampling kernel-smoothed Fst for " << it->first << ".\n";
+ bs->execute(pairs);
+ }
+
+ for (uint i = 0; i < pairs.size(); i++) {
+
+ if (pairs[i] == NULL)
+ continue;
+
+ //
+ // Calculate Fst P-value from approximate distribution.
+ //
+ // if (bootstrap_fst && bootstrap_type == bs_approx)
+ // pairs[i]->bs[0] = bootstrap_approximate_pval(pairs[i]->snp_cnt, pairs[i]->stat[0], approx_fst_dist);
+
+ cnt++;
+ sum += pairs[i]->stat[1]; // Corrected AMOVA Fst
+
+ fh << batch_id << "\t"
+ << pairs[i]->loc_id << "\t"
+ << pop_key[pop_1] << "\t"
+ << pop_key[pop_2] << "\t"
+ << chr << "\t"
+ << pairs[i]->bp << "\t"
+ << pairs[i]->col << "\t"
+ << pairs[i]->pi << "\t"
+ << pairs[i]->fst << "\t"
+ << std::setprecision(9) << pairs[i]->fet_p << "\t"
+ << pairs[i]->fet_or << "\t"
+ << pairs[i]->ci_low << "\t"
+ << pairs[i]->ci_high << "\t"
+ << pairs[i]->lod << "\t"
+ << pairs[i]->stat[0] << "\t"
+ << pairs[i]->smoothed[0] << "\t"
+ << pairs[i]->amova_fst << "\t"
+ << pairs[i]->stat[1] << "\t"
+ << pairs[i]->smoothed[1] << "\t"
+ << pairs[i]->bs[1] << "\t"
+ << pairs[i]->snp_cnt;
+
+ if (log_fst_comp) {
+ fh << "\t"
+ << pairs[i]->comp[0] << "\t"
+ << pairs[i]->comp[1] << "\t"
+ << pairs[i]->comp[2] << "\t"
+ << pairs[i]->comp[3] << "\t"
+ << pairs[i]->comp[4] << "\t"
+ << pairs[i]->comp[5] << "\t"
+ << pairs[i]->comp[6] << "\t"
+ << pairs[i]->comp[7] << "\t"
+ << pairs[i]->comp[8] << "\t"
+ << pairs[i]->comp[9] << "\t"
+ << pairs[i]->comp[10] << "\t"
+ << pairs[i]->comp[11] << "\t"
+ << pairs[i]->fst << "\t"
+ << pairs[i]->comp[12] << "\t"
+ << pairs[i]->comp[13] << "\t"
+ << pairs[i]->comp[14] << "\t"
+ << pairs[i]->comp[15] << "\t"
+ << pairs[i]->comp[16] << "\t"
+ << pairs[i]->comp[17] << "\t"
+ << pairs[i]->amova_fst << "\n";
+ } else {
+ fh << "\n";
+ }
+
+ delete pairs[i];
+ }
+ }
+ cerr << "Pop 1: " << pop_key[pop_1] << "; Pop 2: " << pop_key[pop_2] << "; mean Fst: " << (sum / cnt) << "\n";
+ means.push_back(sum / cnt);
+
+ cerr << "Pooled populations '" << pop_key[pop_1] << "' and '" << pop_key[pop_2] << "' contained: " << ord->incompatible_loci << " incompatible loci; "
+ << ord->multiple_loci << " nucleotides covered by more than one RAD locus.\n";
+ fh.close();
+
+ if (bootstrap_fst)
+ delete bs;
+ }
}
//
@@ -4394,36 +4394,36 @@ write_fst_stats(vector<pair<int, string> > &files, map<int, pair<int, int> > &po
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
- cerr << "Error opening generic output file '" << file << "'\n";
- exit(1);
+ cerr << "Error opening generic output file '" << file << "'\n";
+ exit(1);
}
//
// Write out X-axis header.
//
for (uint i = 0; i < pops.size(); i++)
- fh << "\t" << pop_key[pops[i]];
+ fh << "\t" << pop_key[pops[i]];
fh << "\n";
uint n = 0;
for (uint i = 0; i < pops.size() - 1; i++) {
- fh << pop_key[pops[i]];
+ fh << pop_key[pops[i]];
- for (uint k = 0; k <= i; k++)
- fh << "\t";
+ for (uint k = 0; k <= i; k++)
+ fh << "\t";
- for (uint j = i + 1; j < pops.size(); j++) {
- fh << "\t" << means[n];
- n++;
- }
- fh << "\n";
+ for (uint j = i + 1; j < pops.size(); j++) {
+ fh << "\t" << means[n];
+ n++;
+ }
+ fh << "\n";
}
fh.close();
delete ord;
if (kernel_smoothed && loci_ordered) {
- delete ks;
+ delete ks;
}
return 0;
@@ -4441,40 +4441,40 @@ correct_fst_bonferroni_win(vector<PopPair *> &pairs)
pos_u = 0;
for (uint pos_c = 0; pos_c < pairs.size(); pos_c++) {
- if (pairs[pos_c] == NULL) continue;
-
- limit_l = pairs[pos_c]->bp - limit > 0 ? pairs[pos_c]->bp - limit : 0;
- limit_u = pairs[pos_c]->bp + limit;
-
- while (pos_l < pairs.size()) {
- if (pairs[pos_l] == NULL) {
- pos_l++;
- } else {
- if (pairs[pos_l]->bp < limit_l)
- pos_l++;
- else
- break;
- }
- }
- while (pos_u < pairs.size()) {
- if (pairs[pos_u] == NULL) {
- pos_u++;
- } else {
- if (pairs[pos_u]->bp < limit_u)
- pos_u++;
- else
- break;
- }
- }
-
- cnt = 0;
- for (uint i = pos_l; i < pos_u; i++) {
- if (pairs[i] == NULL) continue;
- cnt++;
- }
-
- correction = p_value_cutoff / cnt;
- pairs[pos_c]->stat[0] = pairs[pos_c]->fet_p < correction ? pairs[pos_c]->fst : 0;
+ if (pairs[pos_c] == NULL) continue;
+
+ limit_l = pairs[pos_c]->bp - limit > 0 ? pairs[pos_c]->bp - limit : 0;
+ limit_u = pairs[pos_c]->bp + limit;
+
+ while (pos_l < pairs.size()) {
+ if (pairs[pos_l] == NULL) {
+ pos_l++;
+ } else {
+ if (pairs[pos_l]->bp < limit_l)
+ pos_l++;
+ else
+ break;
+ }
+ }
+ while (pos_u < pairs.size()) {
+ if (pairs[pos_u] == NULL) {
+ pos_u++;
+ } else {
+ if (pairs[pos_u]->bp < limit_u)
+ pos_u++;
+ else
+ break;
+ }
+ }
+
+ cnt = 0;
+ for (uint i = pos_l; i < pos_u; i++) {
+ if (pairs[i] == NULL) continue;
+ cnt++;
+ }
+
+ correction = p_value_cutoff / cnt;
+ pairs[pos_c]->stat[0] = pairs[pos_c]->fet_p < correction ? pairs[pos_c]->fst : 0;
}
return 0;
@@ -4497,29 +4497,29 @@ kernel_smoothed_popstats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, Po
Bootstrap<SumStat> *bs;
if (bootstrap_pifis)
- bs = new Bootstrap<SumStat>(2);
+ bs = new Bootstrap<SumStat>(2);
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<SumStat *> &sites = genome_sites[it->first];
+ vector<SumStat *> &sites = genome_sites[it->first];
- ord->order(sites, it->second, pop_id);
- if (bootstrap_pifis) bs->add_data(sites);
+ ord->order(sites, it->second, pop_id);
+ if (bootstrap_pifis) bs->add_data(sites);
}
cerr << " Population '" << pop_key[pop_id] << "' contained " << ord->multiple_loci << " nucleotides covered by more than one RAD locus.\n";
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- if (bootstrap_pifis)
- cerr << " Smoothing and bootstrapping chromosome " << it->first << "\n";
- else
- cerr << " Smoothing chromosome " << it->first << "\n";
+ if (bootstrap_pifis)
+ cerr << " Smoothing and bootstrapping chromosome " << it->first << "\n";
+ else
+ cerr << " Smoothing chromosome " << it->first << "\n";
- vector<SumStat *> &sites = genome_sites[it->first];
+ vector<SumStat *> &sites = genome_sites[it->first];
- ks->smooth(sites);
+ ks->smooth(sites);
- if (bootstrap_pifis && bootstrap_type == bs_exact)
- bs->execute_mixed(sites);
+ if (bootstrap_pifis && bootstrap_type == bs_exact)
+ bs->execute_mixed(sites);
}
delete ks;
@@ -4532,31 +4532,31 @@ kernel_smoothed_popstats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, Po
// map<int, vector<double> > approx_fis_dist;
// map<int, vector<double> > approx_pi_dist;
// if (bootstrap && bootstrap_type == bs_approx) {
-// sites_per_snp = sites_per_snp / tot_windows;
-
-// // cerr << "Sites per snp: " << sites_per_snp << "\n";
-
-// bootstrap_popstats_approximate_dist(fis_samples, pi_samples, allele_depth_samples,
-// weights, snp_dist, sites_per_snp,
-// approx_fis_dist, approx_pi_dist);
-
-// for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
-
-// for (uint pos = 0; pos < it->second.size(); pos++) {
-// loc = it->second[pos];
-// len = strlen(loc->con);
-// lsum = psum->pop(loc->id, pop_id);
-
-// for (int k = 0; k < len; k++)
-// if (lsum->nucs[k].num_indv > 0 && bootstrap && lsum->nucs[k].pi > 0) {
-// //
-// // Calculate Fis/Pi p-values from approximate distribution.
-// //
-// lsum->nucs[k].wFis_pval = bootstrap_approximate_pval(lsum->nucs[k].snp_cnt, lsum->nucs[k].wFis, approx_fis_dist);
-// lsum->nucs[k].wPi_pval = bootstrap_approximate_pval(lsum->nucs[k].snp_cnt, lsum->nucs[k].wPi, approx_pi_dist);
-// }
-// }
-// }
+// sites_per_snp = sites_per_snp / tot_windows;
+
+// // cerr << "Sites per snp: " << sites_per_snp << "\n";
+
+// bootstrap_popstats_approximate_dist(fis_samples, pi_samples, allele_depth_samples,
+// weights, snp_dist, sites_per_snp,
+// approx_fis_dist, approx_pi_dist);
+
+// for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+
+// for (uint pos = 0; pos < it->second.size(); pos++) {
+// loc = it->second[pos];
+// len = strlen(loc->con);
+// lsum = psum->pop(loc->id, pop_id);
+
+// for (int k = 0; k < len; k++)
+// if (lsum->nucs[k].num_indv > 0 && bootstrap && lsum->nucs[k].pi > 0) {
+// //
+// // Calculate Fis/Pi p-values from approximate distribution.
+// //
+// lsum->nucs[k].wFis_pval = bootstrap_approximate_pval(lsum->nucs[k].snp_cnt, lsum->nucs[k].wFis, approx_fis_dist);
+// lsum->nucs[k].wPi_pval = bootstrap_approximate_pval(lsum->nucs[k].snp_cnt, lsum->nucs[k].wPi, approx_pi_dist);
+// }
+// }
+// }
// }
return 0;
@@ -4564,11 +4564,11 @@ kernel_smoothed_popstats(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, Po
int
bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
- vector<double> &pi_samples,
- vector<int> &allele_samples,
- double *weights, int *snp_dist, int sites_per_snp,
- map<int, vector<double> > &approx_fis_dist,
- map<int, vector<double> > &approx_pi_dist)
+ vector<double> &pi_samples,
+ vector<int> &allele_samples,
+ double *weights, int *snp_dist, int sites_per_snp,
+ map<int, vector<double> > &approx_fis_dist,
+ map<int, vector<double> > &approx_pi_dist)
{
//
// Allocate an array of bootstrap resampling objects.
@@ -4580,14 +4580,14 @@ bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
// Initialize the Fst distribution map.
//
for (int i = 0; i < max_snp_dist; i++) {
- if (snp_dist[i] == 0.0) continue;
+ if (snp_dist[i] == 0.0) continue;
- // cerr << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
- approx_fis_dist[i] = vector<double> ();
- approx_fis_dist[i].reserve(bootstrap_reps);
+ // cerr << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
+ approx_fis_dist[i] = vector<double> ();
+ approx_fis_dist[i].reserve(bootstrap_reps);
- approx_pi_dist[i] = vector<double> ();
- approx_pi_dist[i].reserve(bootstrap_reps);
+ approx_pi_dist[i] = vector<double> ();
+ approx_pi_dist[i].reserve(bootstrap_reps);
}
vector<int> poss;
@@ -4598,128 +4598,128 @@ bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
int half = sites_per_snp / 2;
for (int i = 0; i < max_snp_dist; i++) {
- if (snp_dist[i] == 0.0) continue;
+ if (snp_dist[i] == 0.0) continue;
- cerr << " Generating NULL distribution for " << i << " SNPs...\n";
+ cerr << " Generating NULL distribution for " << i << " SNPs...\n";
- // #pragma omp parallel private(poss, pos, index_1, index_2, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
+ // #pragma omp parallel private(poss, pos, index_1, index_2, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
#pragma omp parallel private(poss, pos, index_3, dist, sum_fis, sum_pi, weighted_fis, weighted_pi, final_weight_fis, final_weight_pi)
- {
- BSample *bs = new BSample[win_size];
+ {
+ BSample *bs = new BSample[win_size];
- //
- // Populate the BSample objects.
- //
- for (int n = 0; n < win_size; n++)
- bs[n].bp = n + 1;
+ //
+ // Populate the BSample objects.
+ //
+ for (int n = 0; n < win_size; n++)
+ bs[n].bp = n + 1;
- vector<double> fiss, pis;
+ vector<double> fiss, pis;
- //
- // Bootstrap this bitch.
- //
+ //
+ // Bootstrap this bitch.
+ //
#pragma omp for schedule(dynamic, 1)
- for (int j = 0; j < bootstrap_reps; j++) {
- // cerr << " Bootsrap rep " << j << "\n";
-
- //
- // First SNP is always placed at the center of the window.
- //
- pos = win_cntr;
- // index_1 = (int) (fis_samples.size() * (random() / (RAND_MAX + 1.0)));
- // index_2 = (int) (pi_samples.size() * (random() / (RAND_MAX + 1.0)));
- index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
- //
- // Fill in the area around the SNP with fixed sites.
- //
- start = pos - half > 0 ? pos - half : 0;
- end = pos + half < win_size ? pos + half : win_size;
- for (int n = start; n < end; n++) {
- // bs[n].f = 0;
- // bs[n].pi = 0;
- bs[n].alleles = bs[pos].alleles;
- poss.push_back(n);
- }
- // bs[pos].f = fis_samples[index_1];
- // bs[pos].pi = pi_samples[index_2];
- bs[pos].alleles = allele_samples[index_3];
- // cerr << " Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
-
- //
- // Randomly select the positions and values for each SNP to populate the window
- //
- for (int k = 0; k < i - 1; k++) {
- pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
- // index_1 = (int) (fis_samples.size() * (random() / (RAND_MAX + 1.0)));
- // index_2 = (int) (pi_samples.size() * (random() / (RAND_MAX + 1.0)));
- index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
-
- poss.push_back(pos);
- //
- // Fill in the area around the SNP with fixed sites.
- //
- start = pos - half > 0 ? pos - half : 0;
- end = pos + half < win_size ? pos + half : win_size;
- for (int n = start; n < end; n++) {
- // bs[n].f = 0;
- // bs[n].pi = 0;
- bs[n].alleles = bs[pos].alleles;
- poss.push_back(n);
- }
- // bs[pos].f = fis_samples[index_1];
- // bs[pos].pi = pi_samples[index_2];
- bs[pos].alleles = allele_samples[index_3];
- // cerr << " Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
- }
-
- weighted_fis = 0.0;
- sum_fis = 0.0;
- weighted_pi = 0.0;
- sum_pi = 0.0;
-
- for (int n = 0; n < win_size; n++) {
- // if (bs[n].pi < 0.0)
- // continue;
- //
- // Calculate weighted Fst at this position.
- //
- dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
-
- final_weight_fis = (bs[n].alleles - 1) * weights[dist];
- // weighted_fis += bs[n].f * final_weight_fis;
- sum_fis += final_weight_fis;
-
- final_weight_pi = (bs[n].alleles - 1) * weights[dist];
- // weighted_pi += bs[n].pi * final_weight_pi;
- sum_pi += final_weight_pi;
- }
-
- fiss.push_back(weighted_fis / sum_fis);
- pis.push_back(weighted_pi / sum_pi);
- // cerr << " New weighted fis value: " << weighted_fis / sum_fis << "; size: " << fiss.size() << "\n";
-
- for (uint n = 0; n < poss.size(); n++) {
- // bs[poss[n]].f = 0.0;
- // bs[poss[n]].pi = -1.0;
- }
- poss.clear();
- }
-
-// #pragma omp critical
-// {
-// vector<double> &f = approx_fis_dist[i];
-// for (uint n = 0; n < fiss.size(); n++)
-// f.push_back(fiss[n]);
-// vector<double> &p = approx_pi_dist[i];
-// for (uint n = 0; n < pis.size(); n++)
-// p.push_back(pis[n]);
-// }
-
- delete [] bs;
- }
-
- sort(approx_fis_dist[i].begin(), approx_fis_dist[i].end());
- sort(approx_pi_dist[i].begin(), approx_pi_dist[i].end());
+ for (int j = 0; j < bootstrap_reps; j++) {
+ // cerr << " Bootsrap rep " << j << "\n";
+
+ //
+ // First SNP is always placed at the center of the window.
+ //
+ pos = win_cntr;
+ // index_1 = (int) (fis_samples.size() * (random() / (RAND_MAX + 1.0)));
+ // index_2 = (int) (pi_samples.size() * (random() / (RAND_MAX + 1.0)));
+ index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
+ //
+ // Fill in the area around the SNP with fixed sites.
+ //
+ start = pos - half > 0 ? pos - half : 0;
+ end = pos + half < win_size ? pos + half : win_size;
+ for (int n = start; n < end; n++) {
+ // bs[n].f = 0;
+ // bs[n].pi = 0;
+ bs[n].alleles = bs[pos].alleles;
+ poss.push_back(n);
+ }
+ // bs[pos].f = fis_samples[index_1];
+ // bs[pos].pi = pi_samples[index_2];
+ bs[pos].alleles = allele_samples[index_3];
+ // cerr << " Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
+
+ //
+ // Randomly select the positions and values for each SNP to populate the window
+ //
+ for (int k = 0; k < i - 1; k++) {
+ pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
+ // index_1 = (int) (fis_samples.size() * (random() / (RAND_MAX + 1.0)));
+ // index_2 = (int) (pi_samples.size() * (random() / (RAND_MAX + 1.0)));
+ index_3 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
+
+ poss.push_back(pos);
+ //
+ // Fill in the area around the SNP with fixed sites.
+ //
+ start = pos - half > 0 ? pos - half : 0;
+ end = pos + half < win_size ? pos + half : win_size;
+ for (int n = start; n < end; n++) {
+ // bs[n].f = 0;
+ // bs[n].pi = 0;
+ bs[n].alleles = bs[pos].alleles;
+ poss.push_back(n);
+ }
+ // bs[pos].f = fis_samples[index_1];
+ // bs[pos].pi = pi_samples[index_2];
+ bs[pos].alleles = allele_samples[index_3];
+ // cerr << " Placing SNP at position: " << pos << "; with data from " << index_1 << " filling area from " << start << " to " << end << "\n";
+ }
+
+ weighted_fis = 0.0;
+ sum_fis = 0.0;
+ weighted_pi = 0.0;
+ sum_pi = 0.0;
+
+ for (int n = 0; n < win_size; n++) {
+ // if (bs[n].pi < 0.0)
+ // continue;
+ //
+ // Calculate weighted Fst at this position.
+ //
+ dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
+
+ final_weight_fis = (bs[n].alleles - 1) * weights[dist];
+ // weighted_fis += bs[n].f * final_weight_fis;
+ sum_fis += final_weight_fis;
+
+ final_weight_pi = (bs[n].alleles - 1) * weights[dist];
+ // weighted_pi += bs[n].pi * final_weight_pi;
+ sum_pi += final_weight_pi;
+ }
+
+ fiss.push_back(weighted_fis / sum_fis);
+ pis.push_back(weighted_pi / sum_pi);
+ // cerr << " New weighted fis value: " << weighted_fis / sum_fis << "; size: " << fiss.size() << "\n";
+
+ for (uint n = 0; n < poss.size(); n++) {
+ // bs[poss[n]].f = 0.0;
+ // bs[poss[n]].pi = -1.0;
+ }
+ poss.clear();
+ }
+
+// #pragma omp critical
+// {
+// vector<double> &f = approx_fis_dist[i];
+// for (uint n = 0; n < fiss.size(); n++)
+// f.push_back(fiss[n]);
+// vector<double> &p = approx_pi_dist[i];
+// for (uint n = 0; n < pis.size(); n++)
+// p.push_back(pis[n]);
+// }
+
+ delete [] bs;
+ }
+
+ sort(approx_fis_dist[i].begin(), approx_fis_dist[i].end());
+ sort(approx_pi_dist[i].begin(), approx_pi_dist[i].end());
}
return 0;
@@ -4727,9 +4727,9 @@ bootstrap_popstats_approximate_dist(vector<double> &fis_samples,
int
bootstrap_fst_approximate_dist(vector<double> &fst_samples,
- vector<int> &allele_samples,
- double *weights, int *snp_dist,
- map<int, vector<double> > &approx_fst_dist)
+ vector<int> &allele_samples,
+ double *weights, int *snp_dist,
+ map<int, vector<double> > &approx_fst_dist)
{
//
// Allocate an array of bootstrap resampling objects.
@@ -4741,11 +4741,11 @@ bootstrap_fst_approximate_dist(vector<double> &fst_samples,
// Initialize the Fst distribution map.
//
for (int i = 0; i < max_snp_dist; i++) {
- if (snp_dist[i] == 0.0) continue;
+ if (snp_dist[i] == 0.0) continue;
- // cerr << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
- approx_fst_dist[i] = vector<double> ();
- approx_fst_dist[i].reserve(bootstrap_reps);
+ // cerr << "SNP Dist: " << i << " snps occurred " << snp_dist[i] << "\n";
+ approx_fst_dist[i] = vector<double> ();
+ approx_fst_dist[i].reserve(bootstrap_reps);
}
vector<int> poss;
@@ -4755,88 +4755,88 @@ bootstrap_fst_approximate_dist(vector<double> &fst_samples,
int pos, index_2, dist;
for (int i = 0; i < max_snp_dist; i++) {
- if (snp_dist[i] == 0.0) continue;
+ if (snp_dist[i] == 0.0) continue;
- cerr << " Generating NULL distribution for " << i << " SNPs...\n";
+ cerr << " Generating NULL distribution for " << i << " SNPs...\n";
- // #pragma omp parallel private(poss, pos, index_1, index_2, dist, sum, weighted_fst, final_weight)
+ // #pragma omp parallel private(poss, pos, index_1, index_2, dist, sum, weighted_fst, final_weight)
#pragma omp parallel private(poss, pos, index_2, dist, sum, weighted_fst, final_weight)
- {
- BSample *bs = new BSample[win_size];
+ {
+ BSample *bs = new BSample[win_size];
- //
- // Populate the BSample objects.
- //
- for (int n = 0; n < win_size; n++)
- bs[n].bp = n + 1;
+ //
+ // Populate the BSample objects.
+ //
+ for (int n = 0; n < win_size; n++)
+ bs[n].bp = n + 1;
- vector<double> fsts;
+ vector<double> fsts;
- //
- // Bootstrap this bitch.
- //
+ //
+ // Bootstrap this bitch.
+ //
#pragma omp for schedule(dynamic, 1)
- for (int j = 0; j < bootstrap_reps; j++) {
- // cerr << "Bootsrap rep " << j << "\n";
-
- //
- // First SNP is always placed at the center of the window.
- //
- pos = win_cntr;
- // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
- index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
- // bs[pos].f = fst_samples[index_1];
- bs[pos].alleles = allele_samples[index_2];
-
- //
- // Randomly select the positions and values for each SNP to populate the window
- //
- for (int k = 0; k < i - 1; k++) {
- pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
- // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
- index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
- // bs[pos].f = fst_samples[index_1];
- // bs[pos].alleles = allele_samples[index_2];
- // cerr << " " << j << ": Placing SNP at position: " << pos << " with data from index " << index_1 << "\n";
-
- poss.push_back(pos);
- }
-
- weighted_fst = 0.0;
- sum = 0.0;
-
- for (int n = 0; n < win_size; n++) {
- // if (bs[n].f == 0.0)
- // continue;
- //
- // Calculate weighted Fst at this position.
- //
- dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
-
- final_weight = (bs[n].alleles - 1) * weights[dist];
- // weighted_fst += bs[n].f * final_weight;
- sum += final_weight;
- }
-
- fsts.push_back(weighted_fst / sum);
- // cerr << " New weighted Fst value: " << weighted_fst / sum << "; size: " << fsts.size() << "\n";
-
- // for (uint n = 0; n < poss.size(); n++)
- // bs[poss[n]].f = 0.0;
- poss.clear();
- }
-
-// #pragma omp critical
-// {
-// vector<double> &f = approx_fst_dist[i];
-// for (uint n = 0; n < fsts.size(); n++)
-// f.push_back(fsts[n]);
-// }
-
- delete [] bs;
- }
-
- sort(approx_fst_dist[i].begin(), approx_fst_dist[i].end());
+ for (int j = 0; j < bootstrap_reps; j++) {
+ // cerr << "Bootsrap rep " << j << "\n";
+
+ //
+ // First SNP is always placed at the center of the window.
+ //
+ pos = win_cntr;
+ // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
+ index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
+ // bs[pos].f = fst_samples[index_1];
+ bs[pos].alleles = allele_samples[index_2];
+
+ //
+ // Randomly select the positions and values for each SNP to populate the window
+ //
+ for (int k = 0; k < i - 1; k++) {
+ pos = (int) (win_size * (random() / (RAND_MAX + 1.0)));
+ // index_1 = (int) (fst_samples.size() * (random() / (RAND_MAX + 1.0)));
+ index_2 = (int) (allele_samples.size() * (random() / (RAND_MAX + 1.0)));
+ // bs[pos].f = fst_samples[index_1];
+ // bs[pos].alleles = allele_samples[index_2];
+ // cerr << " " << j << ": Placing SNP at position: " << pos << " with data from index " << index_1 << "\n";
+
+ poss.push_back(pos);
+ }
+
+ weighted_fst = 0.0;
+ sum = 0.0;
+
+ for (int n = 0; n < win_size; n++) {
+ // if (bs[n].f == 0.0)
+ // continue;
+ //
+ // Calculate weighted Fst at this position.
+ //
+ dist = bs[n].bp > bs[win_cntr].bp ? bs[n].bp - bs[win_cntr].bp : bs[win_cntr].bp - bs[n].bp;
+
+ final_weight = (bs[n].alleles - 1) * weights[dist];
+ // weighted_fst += bs[n].f * final_weight;
+ sum += final_weight;
+ }
+
+ fsts.push_back(weighted_fst / sum);
+ // cerr << " New weighted Fst value: " << weighted_fst / sum << "; size: " << fsts.size() << "\n";
+
+ // for (uint n = 0; n < poss.size(); n++)
+ // bs[poss[n]].f = 0.0;
+ poss.clear();
+ }
+
+// #pragma omp critical
+// {
+// vector<double> &f = approx_fst_dist[i];
+// for (uint n = 0; n < fsts.size(); n++)
+// f.push_back(fsts[n]);
+// }
+
+ delete [] bs;
+ }
+
+ sort(approx_fst_dist[i].begin(), approx_fst_dist[i].end());
}
return 0;
@@ -4846,7 +4846,7 @@ double
bootstrap_approximate_pval(int snp_cnt, double stat, map<int, vector<double> > &approx_dist)
{
if (approx_dist.count(snp_cnt) == 0)
- return 1.0;
+ return 1.0;
vector<double>::iterator up;
vector<double> &dist = approx_dist[snp_cnt];
@@ -4855,35 +4855,35 @@ bootstrap_approximate_pval(int snp_cnt, double stat, map<int, vector<double> > &
up = upper_bound(dist.begin(), dist.end(), stat);
if (up == dist.begin())
- pos = 1;
+ pos = 1;
else if (up == dist.end())
- pos = dist.size();
+ pos = dist.size();
else
- pos = up - dist.begin() + 1;
+ pos = up - dist.begin() + 1;
double res = 1.0 - (pos / (double) dist.size());
// cerr << "Generated Approx Smoothed Fst Distribution:\n";
// for (uint n = 0; n < dist.size(); n++)
- // cerr << " n: " << n << "; Fst: " << dist[n] << "\n";
+ // cerr << " n: " << n << "; Fst: " << dist[n] << "\n";
// cerr << "Comparing Fst value: " << stat
- // << " at position " << (up - dist.begin()) << " out of "
- // << dist.size() << " positions (converted position: " << pos << "); pvalue: " << res << ".\n";
+ // << " at position " << (up - dist.begin()) << " out of "
+ // << dist.size() << " positions (converted position: " << pos << "); pvalue: " << res << ".\n";
return res;
}
int
write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
- map<int, string> &samples, bool write_gtypes)
+ map<int, string> &samples, bool write_gtypes)
{
stringstream pop_name;
pop_name << "batch_" << batch_id;
if (write_gtypes)
- pop_name << ".genotypes.tsv";
+ pop_name << ".genotypes.tsv";
else
- pop_name << ".haplotypes.tsv";
+ pop_name << ".haplotypes.tsv";
string file = in_path + pop_name.str();
@@ -4891,7 +4891,7 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
if (fh.fail()) {
cerr << "Error opening generic output file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -4908,15 +4908,15 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
//
fh << "Catalog ID\t";
if (expand_id)
- fh << "\t";
+ fh << "\t";
if (write_gtypes)
- fh << "Marker\t";
+ fh << "Marker\t";
fh << "Cnt\t";
for (int i = 0; i < pmap->sample_cnt(); i++) {
- fh << samples[pmap->rev_sample_index(i)];
- if (i < pmap->sample_cnt() - 1)
- fh << "\t";
+ fh << samples[pmap->rev_sample_index(i)];
+ if (i < pmap->sample_cnt() - 1)
+ fh << "\t";
}
fh << "\n";
@@ -4924,49 +4924,49 @@ write_generic(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
// Output each locus.
//
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
+ loc = it->second;
- stringstream id;
+ stringstream id;
loc->annotation.length() > 0 ?
id << loc->id << "|" << loc->annotation : id << loc->id;
- fh << id.str();
+ fh << id.str();
if (expand_id) {
if (loc->annotation.length() > 0)
id << "\t" << loc->id << "\t" << loc->annotation;
- else if (strlen(loc->loc.chr) > 0)
- id << "\t" << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp;
- else
+ else if (strlen(loc->loc.chr) > 0)
+ id << "\t" << loc->id << "\t" << loc->loc.chr << "_" << loc->loc.bp;
+ else
id << "\t" << loc->id << "\t";
}
- if (write_gtypes)
- fh << "\t" << loc->marker;
-
- write_gtypes ? fh << "\t" << loc->gcnt : fh << "\t" << loc->hcnt;
-
- Datum **d = pmap->locus(loc->id);
- string obshap;
-
- for (int i = 0; i < pmap->sample_cnt(); i++) {
- fh << "\t";
-
- if (d[i] == NULL)
- fh << "-";
- else
- if (write_gtypes) {
- fh << d[i]->gtype;
- } else {
- obshap = "";
- for (uint j = 0; j < d[i]->obshap.size(); j++)
- obshap += string(d[i]->obshap[j]) + "/";
- obshap = obshap.substr(0, obshap.length()-1);
- fh << obshap;
- }
- }
+ if (write_gtypes)
+ fh << "\t" << loc->marker;
+
+ write_gtypes ? fh << "\t" << loc->gcnt : fh << "\t" << loc->hcnt;
+
+ Datum **d = pmap->locus(loc->id);
+ string obshap;
+
+ for (int i = 0; i < pmap->sample_cnt(); i++) {
+ fh << "\t";
+
+ if (d[i] == NULL)
+ fh << "-";
+ else
+ if (write_gtypes) {
+ fh << d[i]->gtype;
+ } else {
+ obshap = "";
+ for (uint j = 0; j < d[i]->obshap.size(); j++)
+ obshap += string(d[i]->obshap[j]) + "/";
+ obshap = obshap.substr(0, obshap.length()-1);
+ fh << obshap;
+ }
+ }
- fh << "\n";
+ fh << "\n";
}
fh.close();
@@ -4986,7 +4986,7 @@ write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
cerr << "Error opening markers SQL file '" << file << "'\n";
- exit(1);
+ exit(1);
}
fh.precision(fieldw);
fh.setf(std::ios::fixed);
@@ -5008,35 +5008,35 @@ write_sql(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap)
stringstream gtype_map;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
-
- string freq = "";
- double max = 0.0;
- int total = 0;
- gtype_map.str("");
-
- if (loc->marker.length() > 0) {
- tally_haplotype_freq(loc, pmap, total, max, freq);
-
- //
- // Record the haplotype to genotype map.
- //
- map<string, string>::iterator j;
- for (j = loc->gmap.begin(); j != loc->gmap.end(); j++)
- gtype_map << j->first << ":" << j->second << ";";
- }
-
- fh << 0 << "\t"
- << batch_id << "\t"
- << loc->id << "\t"
- << "\t" // Marker
- << total << "\t"
- << max << "\t"
- << freq << "\t"
- << loc->f << "\t"
- << loc->lnl << "\t"
+ loc = it->second;
+
+ string freq = "";
+ double max = 0.0;
+ int total = 0;
+ gtype_map.str("");
+
+ if (loc->marker.length() > 0) {
+ tally_haplotype_freq(loc, pmap, total, max, freq);
+
+ //
+ // Record the haplotype to genotype map.
+ //
+ map<string, string>::iterator j;
+ for (j = loc->gmap.begin(); j != loc->gmap.end(); j++)
+ gtype_map << j->first << ":" << j->second << ";";
+ }
+
+ fh << 0 << "\t"
+ << batch_id << "\t"
+ << loc->id << "\t"
+ << "\t" // Marker
+ << total << "\t"
+ << max << "\t"
+ << freq << "\t"
+ << loc->f << "\t"
+ << loc->lnl << "\t"
<< gtype_map.str() << "\t"
- << "\n";
+ << "\n";
}
fh.close();
@@ -5061,7 +5061,7 @@ write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int, string
if (fh.fail()) {
cerr << "Error opening FASTA file '" << file << "'\n";
- exit(1);
+ exit(1);
}
map<string, vector<CSLocus *> >::iterator it;
@@ -5070,37 +5070,37 @@ write_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int, string
char *seq;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- d = pmap->locus(loc->id);
- seq = new char[loc->len + 1];
- strcpy(seq, loc->con);
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL)
- continue;
-
- for (uint k = 0; k < d[j]->obshap.size(); k++) {
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
- }
-
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << k
- << " [" << samples[pmap->rev_sample_index(j)];
-
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
- }
- }
- delete [] seq;
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ d = pmap->locus(loc->id);
+ seq = new char[loc->len + 1];
+ strcpy(seq, loc->con);
+
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ if (d[j] == NULL)
+ continue;
+
+ for (uint k = 0; k < d[j]->obshap.size(); k++) {
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+ seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
+ }
+
+ fh << ">CLocus_" << loc->id
+ << "_Sample_" << pmap->rev_sample_index(j)
+ << "_Locus_" << d[j]->id
+ << "_Allele_" << k
+ << " [" << samples[pmap->rev_sample_index(j)];
+
+ if (strcmp(loc->loc.chr, "un") != 0)
+ fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
+ fh << "]\n"
+ << seq << "\n";
+ }
+ }
+ delete [] seq;
+ }
}
fh.close();
@@ -5125,7 +5125,7 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
if (fh.fail()) {
cerr << "Error opening strict FASTA file '" << file << "'\n";
- exit(1);
+ exit(1);
}
map<string, vector<CSLocus *> >::iterator it;
@@ -5134,67 +5134,67 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
char *seq;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- d = pmap->locus(loc->id);
- seq = new char[loc->len + 1];
- strcpy(seq, loc->con);
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] == NULL)
- continue;
- if (d[j]->obshap.size() > 2)
- continue;
-
- if (d[j]->obshap.size() == 1) {
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- seq[col] = col < loc->len ? d[j]->obshap[0][i] : loc->con[col];
- }
-
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << 0
- << " [" << samples[pmap->rev_sample_index(j)];
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
-
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << 1
- << " [" << samples[pmap->rev_sample_index(j)];
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
-
- } else {
- for (uint k = 0; k < d[j]->obshap.size(); k++) {
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
- }
-
- fh << ">CLocus_" << loc->id
- << "_Sample_" << pmap->rev_sample_index(j)
- << "_Locus_" << d[j]->id
- << "_Allele_" << k
- << " [" << samples[pmap->rev_sample_index(j)];
- if (strcmp(loc->loc.chr, "un") != 0)
- fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
- fh << "]\n"
- << seq << "\n";
- }
- }
- }
-
- delete [] seq;
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ d = pmap->locus(loc->id);
+ seq = new char[loc->len + 1];
+ strcpy(seq, loc->con);
+
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ if (d[j] == NULL)
+ continue;
+ if (d[j]->obshap.size() > 2)
+ continue;
+
+ if (d[j]->obshap.size() == 1) {
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+ seq[col] = col < loc->len ? d[j]->obshap[0][i] : loc->con[col];
+ }
+
+ fh << ">CLocus_" << loc->id
+ << "_Sample_" << pmap->rev_sample_index(j)
+ << "_Locus_" << d[j]->id
+ << "_Allele_" << 0
+ << " [" << samples[pmap->rev_sample_index(j)];
+ if (strcmp(loc->loc.chr, "un") != 0)
+ fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
+ fh << "]\n"
+ << seq << "\n";
+
+ fh << ">CLocus_" << loc->id
+ << "_Sample_" << pmap->rev_sample_index(j)
+ << "_Locus_" << d[j]->id
+ << "_Allele_" << 1
+ << " [" << samples[pmap->rev_sample_index(j)];
+ if (strcmp(loc->loc.chr, "un") != 0)
+ fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
+ fh << "]\n"
+ << seq << "\n";
+
+ } else {
+ for (uint k = 0; k < d[j]->obshap.size(); k++) {
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+ seq[col] = col < loc->len ? d[j]->obshap[k][i] : loc->con[col];
+ }
+
+ fh << ">CLocus_" << loc->id
+ << "_Sample_" << pmap->rev_sample_index(j)
+ << "_Locus_" << d[j]->id
+ << "_Allele_" << k
+ << " [" << samples[pmap->rev_sample_index(j)];
+ if (strcmp(loc->loc.chr, "un") != 0)
+ fh << "; " << loc->loc.chr << ", " << loc->sort_bp() + 1 << ", " << (loc->loc.strand == plus ? "+" : "-");
+ fh << "]\n"
+ << seq << "\n";
+ }
+ }
+ }
+
+ delete [] seq;
+ }
}
fh.close();
@@ -5204,9 +5204,9 @@ write_strict_fasta(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap, map<int,
int
write_vcf_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, string> &samples, vector<int> &sample_ids,
- map<int, pair<merget, int> > &merge_map, ofstream &log_fh)
+ PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+ map<int, string> &samples, vector<int> &sample_ids,
+ map<int, pair<merget, int> > &merge_map, ofstream &log_fh)
{
//
// Write a VCF file as defined here: http://www.1000genomes.org/node/101
@@ -5219,7 +5219,7 @@ write_vcf_ordered(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening VCF file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -5259,7 +5259,7 @@ write_vcf_ordered(map<int, CSLocus *> &catalog,
<< "QUAL" << "\t" << "FILTER" << "\t" << "INFO" << "\t" << "FORMAT";
for (int i = 0; i < pmap->sample_cnt(); i++)
- fh << "\t" << samples[pmap->rev_sample_index(i)];
+ fh << "\t" << samples[pmap->rev_sample_index(i)];
fh << "\n";
map<string, vector<CSLocus *> >::iterator it;
@@ -5276,96 +5276,96 @@ write_vcf_ordered(map<int, CSLocus *> &catalog,
OLocTally<NucTally> *ord = new OLocTally<NucTally>(psum, log_fh);
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> sites;
- ord->order(sites, it->second);
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- if (catalog.count(sites[pos]->loc_id) == 0) {
- cerr << "Unable to find locus id " << sites[pos]->loc_id << "\n";
- continue;
- }
- loc = catalog[sites[pos]->loc_id];
- col = sites[pos]->col;
-
- sprintf(p_str, "%0.3f", sites[pos]->p_freq);
- sprintf(q_str, "%0.3f", 1 - sites[pos]->p_freq);
-
- //
- // If on the negative strand, complement the alleles.
- //
- p_allele = loc->loc.strand == minus ? reverse(sites[pos]->p_allele) : sites[pos]->p_allele;
- q_allele = loc->loc.strand == minus ? reverse(sites[pos]->q_allele) : sites[pos]->q_allele;
-
- fh << loc->loc.chr << "\t"
- << loc->sort_bp(col) + 1 << "\t"
- << loc->id << "\t"
- << p_allele << "\t" // REFerence allele
- << q_allele << "\t" // ALTernate allele
- << "." << "\t" // QUAL
- << "PASS" << "\t" // FILTER
- << "NS=" << sites[pos]->num_indv << ";" // INFO
- << "AF=" << p_str << "," << q_str << "\t" // INFO
- << "GT:DP:AD:GL"; // FORMAT
-
- snp_index = loc->snp_index(col);
- if (snp_index < 0) {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << "\n";
- fh << "\n";
- continue;
- }
-
- d = pmap->locus(loc->id);
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
-
- if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "./.:0:.,.:.,.,.";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- find_datum_allele_depths(d[j], snp_index, sites[pos]->p_allele, sites[pos]->q_allele, p_allele+q_allele, dp_1, dp_2);
-
- if (p_allele == 0) {
- gt_1 = q_allele == sites[pos]->p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else if (q_allele == 0) {
- gt_1 = p_allele == sites[pos]->p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else {
- gt_1 = p_allele == sites[pos]->p_allele ? 0 : 1;
- gt_2 = q_allele == sites[pos]->p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_2 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- }
- //
- // Output the likelihood for this model call.
- //
- if (col < d[j]->snps.size()) {
- fh << ":.," << d[j]->snps[col]->lratio << ",.";
- } else {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
- fh << ":.,.,.";
- }
- }
- }
- }
- fh << "\n";
- }
+ vector<NucTally *> sites;
+ ord->order(sites, it->second);
+
+ for (uint pos = 0; pos < sites.size(); pos++) {
+ if (catalog.count(sites[pos]->loc_id) == 0) {
+ cerr << "Unable to find locus id " << sites[pos]->loc_id << "\n";
+ continue;
+ }
+ loc = catalog[sites[pos]->loc_id];
+ col = sites[pos]->col;
+
+ sprintf(p_str, "%0.3f", sites[pos]->p_freq);
+ sprintf(q_str, "%0.3f", 1 - sites[pos]->p_freq);
+
+ //
+ // If on the negative strand, complement the alleles.
+ //
+ p_allele = loc->loc.strand == minus ? reverse(sites[pos]->p_allele) : sites[pos]->p_allele;
+ q_allele = loc->loc.strand == minus ? reverse(sites[pos]->q_allele) : sites[pos]->q_allele;
+
+ fh << loc->loc.chr << "\t"
+ << loc->sort_bp(col) + 1 << "\t"
+ << loc->id << "\t"
+ << p_allele << "\t" // REFerence allele
+ << q_allele << "\t" // ALTernate allele
+ << "." << "\t" // QUAL
+ << "PASS" << "\t" // FILTER
+ << "NS=" << sites[pos]->num_indv << ";" // INFO
+ << "AF=" << p_str << "," << q_str << "\t" // INFO
+ << "GT:DP:AD:GL"; // FORMAT
+
+ snp_index = loc->snp_index(col);
+ if (snp_index < 0) {
+ cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << "\n";
+ fh << "\n";
+ continue;
+ }
+
+ d = pmap->locus(loc->id);
+
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ fh << "\t";
+
+ if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "./.:0:.,.:.,.,.";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0) {
+ // More than two potential alleles.
+ fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
+ } else {
+ find_datum_allele_depths(d[j], snp_index, sites[pos]->p_allele, sites[pos]->q_allele, p_allele+q_allele, dp_1, dp_2);
+
+ if (p_allele == 0) {
+ gt_1 = q_allele == sites[pos]->p_allele ? 0 : 1;
+ fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
+ } else if (q_allele == 0) {
+ gt_1 = p_allele == sites[pos]->p_allele ? 0 : 1;
+ fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
+ } else {
+ gt_1 = p_allele == sites[pos]->p_allele ? 0 : 1;
+ gt_2 = q_allele == sites[pos]->p_allele ? 0 : 1;
+ fh << gt_1 << "/" << gt_2 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
+ }
+ //
+ // Output the likelihood for this model call.
+ //
+ if (col < d[j]->snps.size()) {
+ fh << ":.," << d[j]->snps[col]->lratio << ",.";
+ } else {
+ cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
+ fh << ":.,.,.";
+ }
+ }
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -5374,9 +5374,9 @@ write_vcf_ordered(map<int, CSLocus *> &catalog,
int
write_vcf(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, string> &samples, vector<int> &sample_ids,
- map<int, pair<merget, int> > &merge_map)
+ PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+ map<int, string> &samples, vector<int> &sample_ids,
+ map<int, pair<merget, int> > &merge_map)
{
//
// Write a VCF file as defined here: http://www.1000genomes.org/node/101
@@ -5389,7 +5389,7 @@ write_vcf(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening VCF file '" << file << "'\n";
- exit(1);
+ exit(1);
}
cerr << "In preparation for VCF export, loading SNP data for " << samples.size() << " samples.\n";
@@ -5426,7 +5426,7 @@ write_vcf(map<int, CSLocus *> &catalog,
<< "QUAL" << "\t" << "FILTER" << "\t" << "INFO" << "\t" << "FORMAT";
for (int i = 0; i < pmap->sample_cnt(); i++)
- fh << "\t" << samples[pmap->rev_sample_index(i)];
+ fh << "\t" << samples[pmap->rev_sample_index(i)];
fh << "\n";
map<string, vector<CSLocus *> >::iterator it;
@@ -5440,109 +5440,109 @@ write_vcf(map<int, CSLocus *> &catalog,
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- //
- // We need to order the SNPs so negative and positive strand SNPs are properly ordered.
- //
- vector<GenPos> ordered_loci;
- uint col;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
- }
- }
- sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
- t = psum->locus_tally(loc->id);
-
- num_indv = (double) t->nucs[col].num_indv;
-
- sprintf(p_str, "%0.3f", t->nucs[col].p_freq);
- sprintf(q_str, "%0.3f", 1 - t->nucs[col].p_freq);
-
- //
- // If on the negative strand, complement the alleles.
- //
- p_allele = loc->loc.strand == minus ? reverse(t->nucs[col].p_allele) : t->nucs[col].p_allele;
- q_allele = loc->loc.strand == minus ? reverse(t->nucs[col].q_allele) : t->nucs[col].q_allele;
-
- fh << loc->loc.chr << "\t"
- << loc->sort_bp(col) + 1 << "\t"
- << loc->id << "\t"
- << p_allele << "\t" // REFerence allele
- << q_allele << "\t" // ALTernate allele
- << "." << "\t" // QUAL
- << "PASS" << "\t" // FILTER
- << "NS=" << num_indv << ";" // INFO
- << "AF=" << p_str << "," << q_str << "\t" // INFO
- << "GT:DP:AD:GL"; // FORMAT
-
- snp_index = loc->snp_index(col);
- if (snp_index < 0) {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << "\n";
- fh << "\n";
- continue;
- }
-
- d = pmap->locus(loc->id);
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
-
- if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "./.:0:.,.:.,.,.";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
- } else {
- find_datum_allele_depths(d[j], snp_index, t->nucs[col].p_allele, t->nucs[col].q_allele, p_allele+q_allele, dp_1, dp_2);
-
- if (p_allele == 0) {
- gt_1 = q_allele == t->nucs[col].p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else if (q_allele == 0) {
- gt_1 = p_allele == t->nucs[col].p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- } else {
- gt_1 = p_allele == t->nucs[col].p_allele ? 0 : 1;
- gt_2 = q_allele == t->nucs[col].p_allele ? 0 : 1;
- fh << gt_1 << "/" << gt_2 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
- }
- //
- // Output the likelihood measure for this model call.
- //
- if (snp_index >= 0) {
- fh << ":.," << d[j]->snps[snp_index]->lratio << ",.";
- } else {
- cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
- fh << ":.,.,.";
- }
- }
- }
- }
- fh << "\n";
- }
+ //
+ // We need to order the SNPs so negative and positive strand SNPs are properly ordered.
+ //
+ vector<GenPos> ordered_loci;
+ uint col;
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ col = loc->snps[i]->col;
+ if (t->nucs[col].allele_cnt == 2)
+ ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
+ }
+ }
+ sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
+
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+ col = loc->snps[ordered_loci[pos].snp_index]->col;
+ t = psum->locus_tally(loc->id);
+
+ num_indv = (double) t->nucs[col].num_indv;
+
+ sprintf(p_str, "%0.3f", t->nucs[col].p_freq);
+ sprintf(q_str, "%0.3f", 1 - t->nucs[col].p_freq);
+
+ //
+ // If on the negative strand, complement the alleles.
+ //
+ p_allele = loc->loc.strand == minus ? reverse(t->nucs[col].p_allele) : t->nucs[col].p_allele;
+ q_allele = loc->loc.strand == minus ? reverse(t->nucs[col].q_allele) : t->nucs[col].q_allele;
+
+ fh << loc->loc.chr << "\t"
+ << loc->sort_bp(col) + 1 << "\t"
+ << loc->id << "\t"
+ << p_allele << "\t" // REFerence allele
+ << q_allele << "\t" // ALTernate allele
+ << "." << "\t" // QUAL
+ << "PASS" << "\t" // FILTER
+ << "NS=" << num_indv << ";" // INFO
+ << "AF=" << p_str << "," << q_str << "\t" // INFO
+ << "GT:DP:AD:GL"; // FORMAT
+
+ snp_index = loc->snp_index(col);
+ if (snp_index < 0) {
+ cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << "\n";
+ fh << "\n";
+ continue;
+ }
+
+ d = pmap->locus(loc->id);
+
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ fh << "\t";
+
+ if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "./.:0:.,.:.,.,.";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0) {
+ // More than two potential alleles.
+ fh << "./.:" << d[j]->tot_depth << ":.,.:.,.,.";
+ } else {
+ find_datum_allele_depths(d[j], snp_index, t->nucs[col].p_allele, t->nucs[col].q_allele, p_allele+q_allele, dp_1, dp_2);
+
+ if (p_allele == 0) {
+ gt_1 = q_allele == t->nucs[col].p_allele ? 0 : 1;
+ fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
+ } else if (q_allele == 0) {
+ gt_1 = p_allele == t->nucs[col].p_allele ? 0 : 1;
+ fh << gt_1 << "/" << gt_1 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
+ } else {
+ gt_1 = p_allele == t->nucs[col].p_allele ? 0 : 1;
+ gt_2 = q_allele == t->nucs[col].p_allele ? 0 : 1;
+ fh << gt_1 << "/" << gt_2 << ":" << d[j]->tot_depth << ":" << dp_1 << "," << dp_2;
+ }
+ //
+ // Output the likelihood measure for this model call.
+ //
+ if (snp_index >= 0) {
+ fh << ":.," << d[j]->snps[snp_index]->lratio << ",.";
+ } else {
+ cerr << "Warning, unable to locate SNP call in column " << col << " for catalog locus " << loc->id << ", tag ID " << d[j]->id << "\n";
+ fh << ":.,.,.";
+ }
+ }
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -5551,8 +5551,8 @@ write_vcf(map<int, CSLocus *> &catalog,
int
populate_snp_calls(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
- map<int, string> &samples, vector<int> &sample_ids,
- map<int, pair<merget, int> > &merge_map)
+ map<int, string> &samples, vector<int> &sample_ids,
+ map<int, pair<merget, int> > &merge_map)
{
map<int, CSLocus *>::iterator cit;
map<int, SNPRes *>::iterator sit;
@@ -5562,39 +5562,39 @@ populate_snp_calls(map<int, CSLocus *> &catalog, PopMap<CSLocus> *pmap,
SNP *snp;
for (uint i = 0; i < sample_ids.size(); i++) {
- map<int, SNPRes *> snpres;
- load_snp_calls(in_path + samples[sample_ids[i]], snpres);
-
- for (cit = catalog.begin(); cit != catalog.end(); cit++) {
- loc = cit->second;
- datum = pmap->datum(loc->id, sample_ids[i]);
-
- if (datum != NULL && snpres.count(datum->id)) {
-
- if (merge_sites && merge_map.count(loc->id)) {
- datum_adjust_snp_positions(merge_map, loc, datum, snpres);
- } else {
- //
- // Deep copy the SNP objects.
- //
- snpr = snpres[datum->id];
- for (uint j = 0; j < snpr->snps.size(); j++) {
- snp = new SNP;
- snp->col = snpr->snps[j]->col;
- snp->lratio = snpr->snps[j]->lratio;
- snp->rank_1 = snpr->snps[j]->rank_1;
- snp->rank_2 = snpr->snps[j]->rank_2;
- snp->rank_3 = snpr->snps[j]->rank_3;
- snp->rank_4 = snpr->snps[j]->rank_4;
-
- datum->snps.push_back(snp);
- }
- }
- }
- }
-
- for (sit = snpres.begin(); sit != snpres.end(); sit++)
- delete sit->second;
+ map<int, SNPRes *> snpres;
+ load_snp_calls(in_path + samples[sample_ids[i]], snpres);
+
+ for (cit = catalog.begin(); cit != catalog.end(); cit++) {
+ loc = cit->second;
+ datum = pmap->datum(loc->id, sample_ids[i]);
+
+ if (datum != NULL && snpres.count(datum->id)) {
+
+ if (merge_sites && merge_map.count(loc->id)) {
+ datum_adjust_snp_positions(merge_map, loc, datum, snpres);
+ } else {
+ //
+ // Deep copy the SNP objects.
+ //
+ snpr = snpres[datum->id];
+ for (uint j = 0; j < snpr->snps.size(); j++) {
+ snp = new SNP;
+ snp->col = snpr->snps[j]->col;
+ snp->lratio = snpr->snps[j]->lratio;
+ snp->rank_1 = snpr->snps[j]->rank_1;
+ snp->rank_2 = snpr->snps[j]->rank_2;
+ snp->rank_3 = snpr->snps[j]->rank_3;
+ snp->rank_4 = snpr->snps[j]->rank_4;
+
+ datum->snps.push_back(snp);
+ }
+ }
+ }
+ }
+
+ for (sit = snpres.begin(); sit != snpres.end(); sit++)
+ delete sit->second;
}
return 0;
@@ -5608,53 +5608,53 @@ find_datum_allele_depths(Datum *d, int snp_index, char p_allele, char q_allele,
if (allele_cnt == 1) {
- //
- // There is a single observed haplotype for this locus, e.g. GA.
- //
- if (d->obshap.size() == 1) {
- if (d->obshap[0][snp_index] == p_allele) {
- dp_1 = d->depth[0];
- dp_2 = 0;
- } else {
- dp_1 = 0;
- dp_2 = d->depth[0];
- }
- } else {
- //
- // This SNP position is homozygous, but the locus is heterozygous, so there is more
- // than one observed haplotype, e.g. GA / TA.
- //
- if (d->obshap[0][snp_index] == p_allele) {
- dp_1 = d->tot_depth;
- dp_2 = 0;
- } else {
- dp_1 = 0;
- dp_2 = d->tot_depth;
- }
- }
+ //
+ // There is a single observed haplotype for this locus, e.g. GA.
+ //
+ if (d->obshap.size() == 1) {
+ if (d->obshap[0][snp_index] == p_allele) {
+ dp_1 = d->depth[0];
+ dp_2 = 0;
+ } else {
+ dp_1 = 0;
+ dp_2 = d->depth[0];
+ }
+ } else {
+ //
+ // This SNP position is homozygous, but the locus is heterozygous, so there is more
+ // than one observed haplotype, e.g. GA / TA.
+ //
+ if (d->obshap[0][snp_index] == p_allele) {
+ dp_1 = d->tot_depth;
+ dp_2 = 0;
+ } else {
+ dp_1 = 0;
+ dp_2 = d->tot_depth;
+ }
+ }
} else {
- //
- // This SNP position is heterozygous.
- //
- for (uint i = 0; i < d->obshap.size(); i++) {
- if (d->obshap[i][snp_index] == p_allele)
- dp_1 = d->depth[i];
- else if (d->obshap[i][snp_index] == q_allele)
- dp_2 = d->depth[i];
- }
+ //
+ // This SNP position is heterozygous.
+ //
+ for (uint i = 0; i < d->obshap.size(); i++) {
+ if (d->obshap[i][snp_index] == p_allele)
+ dp_1 = d->depth[i];
+ else if (d->obshap[i][snp_index] == q_allele)
+ dp_2 = d->depth[i];
+ }
}
if (dp_1 == 0 && dp_2 == 0)
- cerr << "Warning: Unable to find allele depths for datum " << d->id << "\n";
+ cerr << "Warning: Unable to find allele depths for datum " << d->id << "\n";
return 0;
}
int
write_vcf_haplotypes(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
- map<int, string> &samples, vector<int> &sample_ids)
+ PopMap<CSLocus> *pmap, PopSum<CSLocus> *psum,
+ map<int, string> &samples, vector<int> &sample_ids)
{
//
// Write a VCF file as defined here: http://samtools.github.io/hts-specs/
@@ -5669,7 +5669,7 @@ write_vcf_haplotypes(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening VCF file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -5696,7 +5696,7 @@ write_vcf_haplotypes(map<int, CSLocus *> &catalog,
<< "QUAL" << "\t" << "FILTER" << "\t" << "INFO" << "\t" << "FORMAT";
for (int i = 0; i < pmap->sample_cnt(); i++)
- fh << "\t" << samples[pmap->rev_sample_index(i)];
+ fh << "\t" << samples[pmap->rev_sample_index(i)];
fh << "\n";
map<string, vector<CSLocus *> >::iterator it;
@@ -5710,84 +5710,84 @@ write_vcf_haplotypes(map<int, CSLocus *> &catalog,
char allele[id_len];
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- d = pmap->locus(loc->id);
-
- hap_freq.clear();
- hap_index.clear();
- ordered_hap.clear();
-
- num_hap = count_haplotypes_at_locus(0, pmap->sample_cnt() - 1, d, hap_freq);
-
- if (num_hap == 0 || hap_freq.size() == 1)
- continue;
-
- num_indv = num_hap / 2.0;
-
- //
- // Order the haplotypes according to most frequent. Record the ordered position or each
- // haplotype and convert them from counts to frequencies.
- //
- for (hit = hap_freq.begin(); hit != hap_freq.end(); hit++) {
- ordered_hap.push_back(make_pair(hit->first, hit->second));
- hit->second = hit->second / num_hap;
- }
- sort(ordered_hap.begin(), ordered_hap.end(), compare_pair_haplotype);
- for (uint i = 0; i < ordered_hap.size(); i++)
- hap_index[ordered_hap[i].first] = i;
-
- string alt_str, freq_str;
- for (uint i = 1; i < ordered_hap.size(); i++) {
- alt_str += ordered_hap[i].first;
- sprintf(allele, "%0.3f", hap_freq[ordered_hap[i].first]);
- freq_str += allele;
- if (i < ordered_hap.size() - 1) {
- alt_str += ",";
- freq_str += ",";
- }
- }
-
- fh << loc->loc.chr << "\t"
- << loc->sort_bp() + 1 << "\t"
- << loc->id << "\t"
- << ordered_hap[0].first << "\t" // REFerence haplotypes
- << alt_str << "\t" // ALTernate haplotypes
- << "." << "\t" // QUAL
- << "PASS" << "\t" // FILTER
- << "NS=" << num_indv << ";" // INFO
- << "AF=" << freq_str << "\t" // INFO
- << "GT:DP"; // FORMAT
-
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- fh << "\t";
-
- if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "./.:0";
-
- } else if (d[j]->obshap.size() > 2) {
- fh << "./.:" << d[j]->tot_depth;
-
- } else if (d[j]->obshap.size() == 1) {
- if(uncalled_haplotype(d[j]->obshap[0]))
- fh << "./.:" << d[j]->tot_depth;
- else
- fh << hap_index[d[j]->obshap[0]] << "/" << hap_index[d[j]->obshap[0]] << ":" << d[j]->tot_depth;
- } else {
- if(!uncalled_haplotype(d[j]->obshap[0]) &&
- !uncalled_haplotype(d[j]->obshap[1]))
- fh << hap_index[d[j]->obshap[0]] << "/" << hap_index[d[j]->obshap[1]] << ":" << d[j]->tot_depth;
- else if (!uncalled_haplotype(d[j]->obshap[0]))
- fh << hap_index[d[j]->obshap[0]] << "/" << "." << ":" << d[j]->tot_depth;
- else if (!uncalled_haplotype(d[j]->obshap[1]))
- fh << "." << "/" << hap_index[d[j]->obshap[1]] << ":" << d[j]->tot_depth;
- }
- }
- fh << "\n";
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ d = pmap->locus(loc->id);
+
+ hap_freq.clear();
+ hap_index.clear();
+ ordered_hap.clear();
+
+ num_hap = count_haplotypes_at_locus(0, pmap->sample_cnt() - 1, d, hap_freq);
+
+ if (num_hap == 0 || hap_freq.size() == 1)
+ continue;
+
+ num_indv = num_hap / 2.0;
+
+ //
+ // Order the haplotypes according to most frequent. Record the ordered position or each
+ // haplotype and convert them from counts to frequencies.
+ //
+ for (hit = hap_freq.begin(); hit != hap_freq.end(); hit++) {
+ ordered_hap.push_back(make_pair(hit->first, hit->second));
+ hit->second = hit->second / num_hap;
+ }
+ sort(ordered_hap.begin(), ordered_hap.end(), compare_pair_haplotype);
+ for (uint i = 0; i < ordered_hap.size(); i++)
+ hap_index[ordered_hap[i].first] = i;
+
+ string alt_str, freq_str;
+ for (uint i = 1; i < ordered_hap.size(); i++) {
+ alt_str += ordered_hap[i].first;
+ sprintf(allele, "%0.3f", hap_freq[ordered_hap[i].first]);
+ freq_str += allele;
+ if (i < ordered_hap.size() - 1) {
+ alt_str += ",";
+ freq_str += ",";
+ }
+ }
+
+ fh << loc->loc.chr << "\t"
+ << loc->sort_bp() + 1 << "\t"
+ << loc->id << "\t"
+ << ordered_hap[0].first << "\t" // REFerence haplotypes
+ << alt_str << "\t" // ALTernate haplotypes
+ << "." << "\t" // QUAL
+ << "PASS" << "\t" // FILTER
+ << "NS=" << num_indv << ";" // INFO
+ << "AF=" << freq_str << "\t" // INFO
+ << "GT:DP"; // FORMAT
+
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ fh << "\t";
+
+ if (d[j] == NULL) {
+ //
+ // Data does not exist.
+ //
+ fh << "./.:0";
+
+ } else if (d[j]->obshap.size() > 2) {
+ fh << "./.:" << d[j]->tot_depth;
+
+ } else if (d[j]->obshap.size() == 1) {
+ if(uncalled_haplotype(d[j]->obshap[0]))
+ fh << "./.:" << d[j]->tot_depth;
+ else
+ fh << hap_index[d[j]->obshap[0]] << "/" << hap_index[d[j]->obshap[0]] << ":" << d[j]->tot_depth;
+ } else {
+ if(!uncalled_haplotype(d[j]->obshap[0]) &&
+ !uncalled_haplotype(d[j]->obshap[1]))
+ fh << hap_index[d[j]->obshap[0]] << "/" << hap_index[d[j]->obshap[1]] << ":" << d[j]->tot_depth;
+ else if (!uncalled_haplotype(d[j]->obshap[0]))
+ fh << hap_index[d[j]->obshap[0]] << "/" << "." << ":" << d[j]->tot_depth;
+ else if (!uncalled_haplotype(d[j]->obshap[1]))
+ fh << "." << "/" << hap_index[d[j]->obshap[1]] << ":" << d[j]->tot_depth;
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -5797,10 +5797,10 @@ write_vcf_haplotypes(map<int, CSLocus *> &catalog,
int
write_genepop(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a GenePop file as defined here: http://kimura.univ-montp2.fr/~rousset/Genepop.htm
@@ -5815,7 +5815,7 @@ write_genepop(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening GenePop file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -5847,33 +5847,33 @@ write_genepop(map<int, CSLocus *> &catalog,
//
uint cnt = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- for (uint j = 0; j < loc->snps.size(); j++) {
- col = loc->snps[j]->col;
- t = psum->locus_tally(loc->id);
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
- cnt++;
- }
+ loc = it->second;
+ for (uint j = 0; j < loc->snps.size(); j++) {
+ col = loc->snps[j]->col;
+ t = psum->locus_tally(loc->id);
+
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+ cnt++;
+ }
}
uint i = 0;
for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- for (uint j = 0; j < loc->snps.size(); j++) {
- col = loc->snps[j]->col;
- t = psum->locus_tally(loc->id);
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
- i++;
- fh << loc->id << "_" << col;
- if (i < cnt) fh << ",";
- }
+ loc = it->second;
+ for (uint j = 0; j < loc->snps.size(); j++) {
+ col = loc->snps[j]->col;
+ t = psum->locus_tally(loc->id);
+
+ //
+ // If this site is fixed in all populations or has too many alleles don't output it.
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+ i++;
+ fh << loc->id << "_" << col;
+ if (i < cnt) fh << ",";
+ }
}
fh << "\n";
@@ -5884,68 +5884,68 @@ write_genepop(map<int, CSLocus *> &catalog,
nuc_map['T'] = "04";
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- fh << "pop\n";
-
- for (int j = start_index; j <= end_index; j++) {
-
- fh << samples[pmap->rev_sample_index(j)] << ",";
-
- for (it = catalog.begin(); it != catalog.end(); it++) {
- loc = it->second;
- d = pmap->locus(loc->id);
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t0000";
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "\t0000";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t0000";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "\t0000";
- } else if (p_allele == 0) {
- fh << "\t" << nuc_map[q_allele] << nuc_map[q_allele];
-
- } else if (q_allele == 0) {
- fh << "\t" << nuc_map[p_allele] << nuc_map[p_allele];
-
- } else {
- fh << "\t" << nuc_map[p_allele] << nuc_map[q_allele];
- }
- }
- }
- }
- fh << "\n";
- }
+ pop_id = psum->pop_index(pit->first);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ fh << "pop\n";
+
+ for (int j = start_index; j <= end_index; j++) {
+
+ fh << samples[pmap->rev_sample_index(j)] << ",";
+
+ for (it = catalog.begin(); it != catalog.end(); it++) {
+ loc = it->second;
+ d = pmap->locus(loc->id);
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ for (i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ fh << "\t0000";
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "\t0000";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "\t0000";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0) {
+ // More than two potential alleles.
+ fh << "\t0000";
+ } else if (p_allele == 0) {
+ fh << "\t" << nuc_map[q_allele] << nuc_map[q_allele];
+
+ } else if (q_allele == 0) {
+ fh << "\t" << nuc_map[p_allele] << nuc_map[p_allele];
+
+ } else {
+ fh << "\t" << nuc_map[p_allele] << nuc_map[q_allele];
+ }
+ }
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -5955,10 +5955,10 @@ write_genepop(map<int, CSLocus *> &catalog,
int
write_genepop_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples, ofstream &log_fh)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples, ofstream &log_fh)
{
//
// Write a GenePop file as defined here: http://kimura.univ-montp2.fr/~rousset/Genepop.htm
@@ -5973,7 +5973,7 @@ write_genepop_ordered(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening GenePop file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -6012,14 +6012,14 @@ write_genepop_ordered(map<int, CSLocus *> &catalog,
int chrs = pmap->ordered_loci.size();
int cnt = 0;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
- ord->order(sites, it->second);
- cnt++;
+ vector<NucTally *> &sites = genome_sites[it->first];
+ ord->order(sites, it->second);
+ cnt++;
- for (uint pos = 0; pos < sites.size(); pos++) {
- fh << sites[pos]->loc_id << "_" << sites[pos]->col;
- if (cnt < chrs || pos < sites.size() - 1) fh << ",";
- }
+ for (uint pos = 0; pos < sites.size(); pos++) {
+ fh << sites[pos]->loc_id << "_" << sites[pos]->col;
+ if (cnt < chrs || pos < sites.size() - 1) fh << ",";
+ }
}
fh << "\n";
@@ -6030,66 +6030,66 @@ write_genepop_ordered(map<int, CSLocus *> &catalog,
nuc_map['T'] = "04";
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- fh << "pop\n";
-
- for (int j = start_index; j <= end_index; j++) {
-
- fh << samples[pmap->rev_sample_index(j)] << ",";
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- loc = catalog[sites[pos]->loc_id];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- col = sites[pos]->col;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t0000";
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "\t0000";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t0000";
- } else {
- snp_index = loc->snp_index(col);
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0) {
- // More than two potential alleles.
- fh << "\t0000";
- } else if (p_allele == 0) {
- fh << "\t" << nuc_map[q_allele] << nuc_map[q_allele];
-
- } else if (q_allele == 0) {
- fh << "\t" << nuc_map[p_allele] << nuc_map[p_allele];
-
- } else {
- fh << "\t" << nuc_map[p_allele] << nuc_map[q_allele];
- }
- }
- }
- }
- fh << "\n";
- }
+ pop_id = psum->pop_index(pit->first);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ fh << "pop\n";
+
+ for (int j = start_index; j <= end_index; j++) {
+
+ fh << samples[pmap->rev_sample_index(j)] << ",";
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ vector<NucTally *> &sites = genome_sites[it->first];
+
+ for (uint pos = 0; pos < sites.size(); pos++) {
+ loc = catalog[sites[pos]->loc_id];
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ col = sites[pos]->col;
+
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ fh << "\t0000";
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "\t0000";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "\t0000";
+ } else {
+ snp_index = loc->snp_index(col);
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0) {
+ // More than two potential alleles.
+ fh << "\t0000";
+ } else if (p_allele == 0) {
+ fh << "\t" << nuc_map[q_allele] << nuc_map[q_allele];
+
+ } else if (q_allele == 0) {
+ fh << "\t" << nuc_map[p_allele] << nuc_map[p_allele];
+
+ } else {
+ fh << "\t" << nuc_map[p_allele] << nuc_map[q_allele];
+ }
+ }
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -6099,10 +6099,10 @@ write_genepop_ordered(map<int, CSLocus *> &catalog,
int
write_structure(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a Structure file as defined here: http://pritch.bsd.uchicago.edu/structure.html
@@ -6120,7 +6120,7 @@ write_structure(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening Structure file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -6145,16 +6145,16 @@ write_structure(map<int, CSLocus *> &catalog,
LocTally *t;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- fh << "\t" << loc->id << "_" << col;
- }
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+ if (t->nucs[col].allele_cnt == 2)
+ fh << "\t" << loc->id << "_" << col;
+ }
+ }
}
fh << "\n";
@@ -6169,110 +6169,110 @@ write_structure(map<int, CSLocus *> &catalog,
char p_allele, q_allele;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
- pop_id = pit->first;
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output all the loci for this sample, printing only the p allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t" << "0";
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t" << "0";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (p_allele == 0)
- fh << "\t" << nuc_map[q_allele];
- else
- fh << "\t" << nuc_map[p_allele];
- }
- }
- }
- }
- fh << "\n";
-
- //
- // Output all the loci for this sample again, now for the q allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- fh << "\t" << "0";
- } else if (d[j] == NULL) {
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- fh << "\t" << "0";
- } else {
- tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (q_allele == 0)
- fh << "\t" << nuc_map[p_allele];
- else
- fh << "\t" << nuc_map[q_allele];
- }
- }
- }
- }
- fh << "\n";
- }
+ p = psum->pop_index(pit->first);
+ pop_id = pit->first;
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ for (int j = start_index; j <= end_index; j++) {
+ //
+ // Output all the loci for this sample, printing only the p allele
+ //
+ fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ //
+ // If this site is fixed in all populations or has too many alleles don't output it.
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ if (s[p]->nucs[col].incompatible_site ||
+ s[p]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ fh << "\t" << "0";
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "\t" << "0";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "\t" << "0";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ fh << "\t" << "0";
+ else if (p_allele == 0)
+ fh << "\t" << nuc_map[q_allele];
+ else
+ fh << "\t" << nuc_map[p_allele];
+ }
+ }
+ }
+ }
+ fh << "\n";
+
+ //
+ // Output all the loci for this sample again, now for the q allele
+ //
+ fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ if (s[p]->nucs[col].incompatible_site ||
+ s[p]->nucs[col].filtered_site) {
+ fh << "\t" << "0";
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ fh << "\t" << "0";
+ } else if (d[j]->model[col] == 'U') {
+ fh << "\t" << "0";
+ } else {
+ tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ fh << "\t" << "0";
+ else if (q_allele == 0)
+ fh << "\t" << nuc_map[p_allele];
+ else
+ fh << "\t" << nuc_map[q_allele];
+ }
+ }
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -6284,10 +6284,10 @@ write_structure(map<int, CSLocus *> &catalog,
int
write_structure_ordered(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples, ofstream &log_fh)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples, ofstream &log_fh)
{
//
// Write a Structure file as defined here: http://pritch.bsd.uchicago.edu/structure.html
@@ -6305,7 +6305,7 @@ write_structure_ordered(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening Structure file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -6335,11 +6335,11 @@ write_structure_ordered(map<int, CSLocus *> &catalog,
OLocTally<NucTally> *ord = new OLocTally<NucTally>(psum, log_fh);
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
- ord->order(sites, it->second);
+ vector<NucTally *> &sites = genome_sites[it->first];
+ ord->order(sites, it->second);
- for (uint pos = 0; pos < sites.size(); pos++)
- fh << "\t" << sites[pos]->loc_id << "_" << sites[pos]->col;
+ for (uint pos = 0; pos < sites.size(); pos++)
+ fh << "\t" << sites[pos]->loc_id << "_" << sites[pos]->col;
}
fh << "\n";
@@ -6355,97 +6355,97 @@ write_structure_ordered(map<int, CSLocus *> &catalog,
uint col, snp_index;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
- pop_id = pit->first;
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output all the loci for this sample, printing only the p allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- loc = catalog[sites[pos]->loc_id];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- col = sites[pos]->col;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t" << "0";
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t" << "0";
- } else {
- snp_index = loc->snp_index(col);
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (p_allele == 0)
- fh << "\t" << nuc_map[q_allele];
- else
- fh << "\t" << nuc_map[p_allele];
- }
- }
- }
- fh << "\n";
-
- //
- // Output all the loci for this sample again, now for the q allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- vector<NucTally *> &sites = genome_sites[it->first];
-
- for (uint pos = 0; pos < sites.size(); pos++) {
- loc = catalog[sites[pos]->loc_id];
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- col = sites[pos]->col;
-
- if (s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- fh << "\t" << "0";
- } else if (d[j] == NULL) {
- fh << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- fh << "\t" << "0";
- } else {
- snp_index = loc->snp_index(col);
- tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0";
- else if (q_allele == 0)
- fh << "\t" << nuc_map[p_allele];
- else
- fh << "\t" << nuc_map[q_allele];
- }
- }
- }
- fh << "\n";
- }
+ p = psum->pop_index(pit->first);
+ pop_id = pit->first;
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ for (int j = start_index; j <= end_index; j++) {
+ //
+ // Output all the loci for this sample, printing only the p allele
+ //
+ fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ vector<NucTally *> &sites = genome_sites[it->first];
+
+ for (uint pos = 0; pos < sites.size(); pos++) {
+ loc = catalog[sites[pos]->loc_id];
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ col = sites[pos]->col;
+
+ if (s[p]->nucs[col].incompatible_site ||
+ s[p]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ fh << "\t" << "0";
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "\t" << "0";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "\t" << "0";
+ } else {
+ snp_index = loc->snp_index(col);
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ fh << "\t" << "0";
+ else if (p_allele == 0)
+ fh << "\t" << nuc_map[q_allele];
+ else
+ fh << "\t" << nuc_map[p_allele];
+ }
+ }
+ }
+ fh << "\n";
+
+ //
+ // Output all the loci for this sample again, now for the q allele
+ //
+ fh << samples[pmap->rev_sample_index(j)] << "\t" << pop_id;
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ vector<NucTally *> &sites = genome_sites[it->first];
+
+ for (uint pos = 0; pos < sites.size(); pos++) {
+ loc = catalog[sites[pos]->loc_id];
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ col = sites[pos]->col;
+
+ if (s[p]->nucs[col].incompatible_site ||
+ s[p]->nucs[col].filtered_site) {
+ fh << "\t" << "0";
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ fh << "\t" << "0";
+ } else if (d[j]->model[col] == 'U') {
+ fh << "\t" << "0";
+ } else {
+ snp_index = loc->snp_index(col);
+ tally_observed_haplotypes(d[j]->obshap, snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ fh << "\t" << "0";
+ else if (q_allele == 0)
+ fh << "\t" << nuc_map[p_allele];
+ else
+ fh << "\t" << nuc_map[q_allele];
+ }
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -6457,10 +6457,10 @@ write_structure_ordered(map<int, CSLocus *> &catalog,
int
write_hzar(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a Hybrid Zone Analysis using R (HZAR) file as defined here:
@@ -6476,7 +6476,7 @@ write_hzar(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening HZAR file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -6501,20 +6501,20 @@ write_hzar(map<int, CSLocus *> &catalog,
LocTally *t;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2) {
- fh << "," << loc->id << "_" << col << ".A"
- << "," << loc->id << "_" << col << ".B"
- << "," << loc->id << "_" << col << ".N";
- }
- }
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+ if (t->nucs[col].allele_cnt == 2) {
+ fh << "," << loc->id << "_" << col << ".A"
+ << "," << loc->id << "_" << col << ".B"
+ << "," << loc->id << "_" << col << ".N";
+ }
+ }
+ }
}
fh << "\n";
@@ -6522,44 +6522,44 @@ write_hzar(map<int, CSLocus *> &catalog,
int pop_id, p;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
- pop_id = pit->first;
-
- fh << pop_key[pop_id] << ",";
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[p]->nucs[col].num_indv == 0 ||
- s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- fh << ",0,0,0";
- continue;
- }
-
- if (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc)
- fh << "," << s[p]->nucs[col].p << "," << 1 - s[p]->nucs[col].p << ",";
- else
- fh << "," << 1 - s[p]->nucs[col].p << "," << s[p]->nucs[col].p << ",";
-
- fh << s[p]->nucs[col].num_indv * 2;
- }
- }
- }
- fh << "\n";
+ p = psum->pop_index(pit->first);
+ pop_id = pit->first;
+
+ fh << pop_key[pop_id] << ",";
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ //
+ // If this site is fixed in all populations or has too many alleles don't output it.
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ if (s[p]->nucs[col].num_indv == 0 ||
+ s[p]->nucs[col].incompatible_site ||
+ s[p]->nucs[col].filtered_site) {
+ fh << ",0,0,0";
+ continue;
+ }
+
+ if (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc)
+ fh << "," << s[p]->nucs[col].p << "," << 1 - s[p]->nucs[col].p << ",";
+ else
+ fh << "," << 1 - s[p]->nucs[col].p << "," << s[p]->nucs[col].p << ",";
+
+ fh << s[p]->nucs[col].num_indv * 2;
+ }
+ }
+ }
+ fh << "\n";
}
fh.close();
@@ -6571,10 +6571,10 @@ write_hzar(map<int, CSLocus *> &catalog,
int
write_treemix(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a TreeMix file (Pickrell and Pritchard, 2012 PLoS Genetics)
@@ -6590,7 +6590,7 @@ write_treemix(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening TreeMix file '" << file << "'\n";
- exit(1);
+ exit(1);
}
pop_name << ".log";
@@ -6602,7 +6602,7 @@ write_treemix(map<int, CSLocus *> &catalog,
if (log_fh.fail()) {
cerr << "Error opening Phylip Log file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -6616,7 +6616,7 @@ write_treemix(map<int, CSLocus *> &catalog,
strftime(date, 32, "%B %d, %Y", timeinfo);
log_fh << "# Stacks v" << VERSION << "; " << " TreeMix v1.1; " << date << "\n"
- << "# Line\tLocus ID\tColumn\tChr\tBasepair\n";
+ << "# Line\tLocus ID\tColumn\tChr\tBasepair\n";
//
// Output the header.
@@ -6635,7 +6635,7 @@ write_treemix(map<int, CSLocus *> &catalog,
//
stringstream sstr;
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++)
- sstr << pop_key[pit->first] << " ";
+ sstr << pop_key[pit->first] << " ";
fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
@@ -6643,51 +6643,51 @@ write_treemix(map<int, CSLocus *> &catalog,
long int line = 1;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- sstr.str("");
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- p = psum->pop_index(pit->first);
-
- if (s[p]->nucs[col].num_indv == 0 ||
- s[p]->nucs[col].incompatible_site ||
- s[p]->nucs[col].filtered_site) {
- sstr << "0,0 ";
- continue;
- }
-
- p_freq = (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc) ?
- s[p]->nucs[col].p :
- 1 - s[p]->nucs[col].p;
-
- allele_cnt = s[p]->nucs[col].num_indv * 2;
- p_cnt = round(allele_cnt * p_freq);
- q_cnt = allele_cnt - p_cnt;
- sstr << (int) p_cnt << "," << (int) q_cnt << " ";
- }
-
- if (sstr.str().length() == 0)
- continue;
-
- fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
- log_fh << line << "\t" << loc->id << "\t" << col << "\t" << loc->loc.chr << "\t" << loc->sort_bp(col) + 1 << "\n";
- line++;
- }
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ sstr.str("");
+
+ //
+ // If this site is fixed in all populations or has too many alleles don't output it.
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
+ p = psum->pop_index(pit->first);
+
+ if (s[p]->nucs[col].num_indv == 0 ||
+ s[p]->nucs[col].incompatible_site ||
+ s[p]->nucs[col].filtered_site) {
+ sstr << "0,0 ";
+ continue;
+ }
+
+ p_freq = (t->nucs[col].p_allele == s[p]->nucs[col].p_nuc) ?
+ s[p]->nucs[col].p :
+ 1 - s[p]->nucs[col].p;
+
+ allele_cnt = s[p]->nucs[col].num_indv * 2;
+ p_cnt = round(allele_cnt * p_freq);
+ q_cnt = allele_cnt - p_cnt;
+ sstr << (int) p_cnt << "," << (int) q_cnt << " ";
+ }
+
+ if (sstr.str().length() == 0)
+ continue;
+
+ fh << sstr.str().substr(0, sstr.str().length() - 1) << "\n";
+ log_fh << line << "\t" << loc->id << "\t" << col << "\t" << loc->loc.chr << "\t" << loc->sort_bp(col) + 1 << "\n";
+ line++;
+ }
+ }
}
fh.close();
@@ -6700,10 +6700,10 @@ write_treemix(map<int, CSLocus *> &catalog,
int
write_fastphase(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a fastPHASE file as defined here: http://stephenslab.uchicago.edu/software.html
@@ -6720,187 +6720,187 @@ write_fastphase(map<int, CSLocus *> &catalog,
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- stringstream pop_name;
- pop_name << "batch_" << batch_id << "." << it->first << ".fastphase.inp";
- string file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening fastPHASE file '" << file << "'\n";
- exit(1);
- }
-
- //
- // Tally up the number of sites
- //
- int total_sites = 0;
- uint col;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- total_sites++;
- }
- }
-
- //
- // Output the total number of SNP sites and the number of individuals.
- //
- fh << samples.size() << "\n"
- << total_sites << "\n";
-
- //
- // We need to determine an ordering that can take into account overlapping RAD sites.
- //
- vector<GenPos> ordered_loci;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
- }
- }
- sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
- //
- // Output the position of each site according to its basepair.
- //
- fh << "P";
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
- fh << " " << ordered_loci[pos].bp;
- }
- fh << "\n";
-
- //
- // Output a line of 'S' characters, one per site, indicating that these are SNP markers.
- //
- string snp_markers, gtypes_str;
- snp_markers.assign(total_sites, 'S');
- fh << snp_markers << '\n';
-
- //
- // Now output each sample name followed by a new line, then all of the genotypes for that sample
- // on two lines.
- //
-
- map<int, pair<int, int> >::iterator pit;
- int start_index, end_index, pop_id;
- char p_allele, q_allele;
- stringstream gtypes;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output all the loci for this sample, printing only the p allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\n";
-
- gtypes.str("");
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- gtypes << "? ";
-
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- gtypes << "? ";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- gtypes << "? ";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- gtypes << "? ";
- else if (p_allele == 0)
- gtypes << q_allele << " ";
- else
- gtypes << p_allele << " ";
- }
- }
- gtypes_str = gtypes.str();
- fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
-
- //
- // Output all the loci for this sample again, now for the q allele
- //
- gtypes.str("");
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- col = loc->snps[ordered_loci[pos].snp_index]->col;
-
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- gtypes << "? ";
-
- } else if (d[j] == NULL) {
- gtypes << "? ";
-
- } else if (d[j]->model[col] == 'U') {
- gtypes << "? ";
-
- } else {
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- gtypes << "? ";
- else if (q_allele == 0)
- gtypes << p_allele << " ";
- else
- gtypes << q_allele << " ";
- }
- }
- gtypes_str = gtypes.str();
- fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
- }
- }
-
- fh.close();
+ stringstream pop_name;
+ pop_name << "batch_" << batch_id << "." << it->first << ".fastphase.inp";
+ string file = in_path + pop_name.str();
+
+ ofstream fh(file.c_str(), ofstream::out);
+
+ if (fh.fail()) {
+ cerr << "Error opening fastPHASE file '" << file << "'\n";
+ exit(1);
+ }
+
+ //
+ // Tally up the number of sites
+ //
+ int total_sites = 0;
+ uint col;
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ col = loc->snps[i]->col;
+ if (t->nucs[col].allele_cnt == 2)
+ total_sites++;
+ }
+ }
+
+ //
+ // Output the total number of SNP sites and the number of individuals.
+ //
+ fh << samples.size() << "\n"
+ << total_sites << "\n";
+
+ //
+ // We need to determine an ordering that can take into account overlapping RAD sites.
+ //
+ vector<GenPos> ordered_loci;
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ col = loc->snps[i]->col;
+ if (t->nucs[col].allele_cnt == 2)
+ ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
+ }
+ }
+ sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
+
+ //
+ // Output the position of each site according to its basepair.
+ //
+ fh << "P";
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+ col = loc->snps[ordered_loci[pos].snp_index]->col;
+ fh << " " << ordered_loci[pos].bp;
+ }
+ fh << "\n";
+
+ //
+ // Output a line of 'S' characters, one per site, indicating that these are SNP markers.
+ //
+ string snp_markers, gtypes_str;
+ snp_markers.assign(total_sites, 'S');
+ fh << snp_markers << '\n';
+
+ //
+ // Now output each sample name followed by a new line, then all of the genotypes for that sample
+ // on two lines.
+ //
+
+ map<int, pair<int, int> >::iterator pit;
+ int start_index, end_index, pop_id;
+ char p_allele, q_allele;
+ stringstream gtypes;
+
+ for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
+ pop_id = psum->pop_index(pit->first);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ for (int j = start_index; j <= end_index; j++) {
+ //
+ // Output all the loci for this sample, printing only the p allele
+ //
+ fh << samples[pmap->rev_sample_index(j)] << "\n";
+
+ gtypes.str("");
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+ col = loc->snps[ordered_loci[pos].snp_index]->col;
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ //
+ // If this site is fixed in all populations or has too many alleles don't output it.
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ gtypes << "? ";
+
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ gtypes << "? ";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ gtypes << "? ";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ gtypes << "? ";
+ else if (p_allele == 0)
+ gtypes << q_allele << " ";
+ else
+ gtypes << p_allele << " ";
+ }
+ }
+ gtypes_str = gtypes.str();
+ fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
+
+ //
+ // Output all the loci for this sample again, now for the q allele
+ //
+ gtypes.str("");
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+ col = loc->snps[ordered_loci[pos].snp_index]->col;
+
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ gtypes << "? ";
+
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ gtypes << "? ";
+
+ } else if (d[j]->model[col] == 'U') {
+ gtypes << "? ";
+
+ } else {
+ tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ gtypes << "? ";
+ else if (q_allele == 0)
+ gtypes << p_allele << " ";
+ else
+ gtypes << q_allele << " ";
+ }
+ }
+ gtypes_str = gtypes.str();
+ fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
+ }
+ }
+
+ fh.close();
}
cerr << "done.\n";
@@ -6910,10 +6910,10 @@ write_fastphase(map<int, CSLocus *> &catalog,
int
write_phase(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a PHASE file as defined here: http://stephenslab.uchicago.edu/software.html
@@ -6932,265 +6932,265 @@ write_phase(map<int, CSLocus *> &catalog,
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- stringstream pop_name;
- pop_name << "batch_" << batch_id << "." << it->first << ".phase.inp";
- string file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
-
- if (fh.fail()) {
- cerr << "Error opening PHASE file '" << file << "'\n";
- exit(1);
- }
-
- //
- // We need to determine an ordering for all legitimate loci/SNPs.
- //
- uint col;
- vector<GenPos> ordered_loci;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- if (loc->snps.size() == 0) continue;
-
- //
- // Will we output this locus as a haplotype or as a SNP?
- //
- if (loc->snps.size() > 1) {
- //
- // Check that there aren't too many haplotypes (PHASE has a max of 50).
- //
- if (loc->alleles.size() > 40) continue;
-
- //
- // Iterate over the population to determine that this subset of the population
- // has data at this locus.
- //
- d = pmap->locus(loc->id);
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] != NULL &&
- d[j]->obshap.size() > 0 &&
- d[j]->obshap.size() <= 2) {
- //
- // Data exists, and there are the correct number of haplotypes.
- //
- ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
- break;
- }
- }
- } else {
- col = loc->snps[0]->col;
-
- if (t->nucs[col].allele_cnt == 2)
- ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(col), snp));
- }
- }
- sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
- //
- // Output the total number of SNP sites and the number of individuals.
- //
- fh << samples.size() << "\n"
- << ordered_loci.size() << "\n";
-
- //
- // Output the position of each site according to its basepair.
- //
- fh << "P";
- for (uint pos = 0; pos < ordered_loci.size(); pos++)
- fh << " " << ordered_loci[pos].bp;
- fh << "\n";
-
- //
- // Output a line of 'S' characters for SNP markers, 'M' characters for multiallelic haplotypes.
- //
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- if (pos > 0) fh << " ";
- fh << (ordered_loci[pos].type == snp ? "S" : "M");
- }
- fh << "\n";
-
- //
- // Now output each sample name followed by a new line, then all of the genotypes for that sample
- // on two lines.
- //
-
- map<int, pair<int, int> >::iterator pit;
- string gtypes_str;
- bool found;
- int start_index, end_index, pop_id;
- char p_allele, q_allele;
- stringstream gtypes;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output all the loci for this sample, printing only the p allele
- //
- fh << samples[pmap->rev_sample_index(j)] << "\n";
-
- gtypes.str("");
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- //
- // Will we output this locus as a haplotype or as a SNP?
- //
- if (ordered_loci[pos].type == haplotype) {
- if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- gtypes << "-1 ";
- } else {
- //
- // Data exists, output the first haplotype. We will assume the haplotypes are
- // numbered by their position in the loc->strings vector.
- //
- if (d[j]->obshap.size() > 2) {
- // cerr << "Warning: too many haplotypes, catalog locus: " << loc->id << "\n";
- gtypes << "-1 ";
- } else {
- found = false;
- for (uint k = 0; k < loc->strings.size(); k++)
- if (d[j]->obshap[0] == loc->strings[k].first) {
- found = true;
- gtypes << k + 1 << " ";
- }
- if (found == false)
- cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
- << samples[pmap->rev_sample_index(j)] << "; catalog locus: " << loc->id << "\n";
- }
- }
- } else {
- col = loc->snps[ordered_loci[pos].snp_index]->col;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- gtypes << "? ";
-
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- gtypes << "? ";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- gtypes << "? ";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- gtypes << "? ";
- else if (p_allele == 0)
- gtypes << q_allele << " ";
- else
- gtypes << p_allele << " ";
- }
- }
- }
- gtypes_str = gtypes.str();
- fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
-
- //
- // Output all the loci for this sample again, now for the q allele
- //
- gtypes.str("");
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- //
- // Will we output this locus as a haplotype or as a SNP?
- //
- if (ordered_loci[pos].type == haplotype) {
- if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- gtypes << "-1 ";
- } else {
- //
- // Data exists, output the second haplotype. We will assume the haplotypes are
- // numbered by their position in the loc->strings vector.
- //
- if (d[j]->obshap.size() > 2) {
- // cerr << "Warning: too many haplotypes, catalog locus: " << loc->id << "\n";
- gtypes << "-1 ";
- } else if (d[j]->obshap.size() > 1) {
- found = false;
- for (uint k = 0; k < loc->strings.size(); k++)
- if (d[j]->obshap[1] == loc->strings[k].first) {
- found = true;
- gtypes << k + 1 << " ";
- }
- if (found == false)
- cerr << "Unable to find haplotype " << d[j]->obshap[1] << " from individual "
- << samples[pmap->rev_sample_index(j)] << "; catalog locus: " << loc->id << "\n";
- } else {
- found = false;
- for (uint k = 0; k < loc->strings.size(); k++)
- if (d[j]->obshap[0] == loc->strings[k].first) {
- found = true;
- gtypes << k + 1 << " ";
- }
- if (found == false)
- cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
- << samples[pmap->rev_sample_index(j)] << "; catalog locus: " << loc->id << "\n";
- }
- }
- } else {
- col = loc->snps[ordered_loci[pos].snp_index]->col;
-
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- gtypes << "? ";
-
- } else if (d[j] == NULL) {
- gtypes << "? ";
-
- } else if (d[j]->model[col] == 'U') {
- gtypes << "? ";
-
- } else {
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- gtypes << "? ";
- else if (q_allele == 0)
- gtypes << p_allele << " ";
- else
- gtypes << q_allele << " ";
- }
- }
- }
- gtypes_str = gtypes.str();
- fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
- }
- }
-
- fh.close();
+ stringstream pop_name;
+ pop_name << "batch_" << batch_id << "." << it->first << ".phase.inp";
+ string file = in_path + pop_name.str();
+
+ ofstream fh(file.c_str(), ofstream::out);
+
+ if (fh.fail()) {
+ cerr << "Error opening PHASE file '" << file << "'\n";
+ exit(1);
+ }
+
+ //
+ // We need to determine an ordering for all legitimate loci/SNPs.
+ //
+ uint col;
+ vector<GenPos> ordered_loci;
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ t = psum->locus_tally(loc->id);
+
+ if (loc->snps.size() == 0) continue;
+
+ //
+ // Will we output this locus as a haplotype or as a SNP?
+ //
+ if (loc->snps.size() > 1) {
+ //
+ // Check that there aren't too many haplotypes (PHASE has a max of 50).
+ //
+ if (loc->alleles.size() > 40) continue;
+
+ //
+ // Iterate over the population to determine that this subset of the population
+ // has data at this locus.
+ //
+ d = pmap->locus(loc->id);
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ if (d[j] != NULL &&
+ d[j]->obshap.size() > 0 &&
+ d[j]->obshap.size() <= 2) {
+ //
+ // Data exists, and there are the correct number of haplotypes.
+ //
+ ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
+ break;
+ }
+ }
+ } else {
+ col = loc->snps[0]->col;
+
+ if (t->nucs[col].allele_cnt == 2)
+ ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(col), snp));
+ }
+ }
+ sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
+
+ //
+ // Output the total number of SNP sites and the number of individuals.
+ //
+ fh << samples.size() << "\n"
+ << ordered_loci.size() << "\n";
+
+ //
+ // Output the position of each site according to its basepair.
+ //
+ fh << "P";
+ for (uint pos = 0; pos < ordered_loci.size(); pos++)
+ fh << " " << ordered_loci[pos].bp;
+ fh << "\n";
+
+ //
+ // Output a line of 'S' characters for SNP markers, 'M' characters for multiallelic haplotypes.
+ //
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ if (pos > 0) fh << " ";
+ fh << (ordered_loci[pos].type == snp ? "S" : "M");
+ }
+ fh << "\n";
+
+ //
+ // Now output each sample name followed by a new line, then all of the genotypes for that sample
+ // on two lines.
+ //
+
+ map<int, pair<int, int> >::iterator pit;
+ string gtypes_str;
+ bool found;
+ int start_index, end_index, pop_id;
+ char p_allele, q_allele;
+ stringstream gtypes;
+
+ for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
+ pop_id = psum->pop_index(pit->first);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ for (int j = start_index; j <= end_index; j++) {
+ //
+ // Output all the loci for this sample, printing only the p allele
+ //
+ fh << samples[pmap->rev_sample_index(j)] << "\n";
+
+ gtypes.str("");
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ //
+ // Will we output this locus as a haplotype or as a SNP?
+ //
+ if (ordered_loci[pos].type == haplotype) {
+ if (d[j] == NULL) {
+ //
+ // Data does not exist.
+ //
+ gtypes << "-1 ";
+ } else {
+ //
+ // Data exists, output the first haplotype. We will assume the haplotypes are
+ // numbered by their position in the loc->strings vector.
+ //
+ if (d[j]->obshap.size() > 2) {
+ // cerr << "Warning: too many haplotypes, catalog locus: " << loc->id << "\n";
+ gtypes << "-1 ";
+ } else {
+ found = false;
+ for (uint k = 0; k < loc->strings.size(); k++)
+ if (d[j]->obshap[0] == loc->strings[k].first) {
+ found = true;
+ gtypes << k + 1 << " ";
+ }
+ if (found == false)
+ cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
+ << samples[pmap->rev_sample_index(j)] << "; catalog locus: " << loc->id << "\n";
+ }
+ }
+ } else {
+ col = loc->snps[ordered_loci[pos].snp_index]->col;
+
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ gtypes << "? ";
+
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ gtypes << "? ";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ gtypes << "? ";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ gtypes << "? ";
+ else if (p_allele == 0)
+ gtypes << q_allele << " ";
+ else
+ gtypes << p_allele << " ";
+ }
+ }
+ }
+ gtypes_str = gtypes.str();
+ fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
+
+ //
+ // Output all the loci for this sample again, now for the q allele
+ //
+ gtypes.str("");
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ //
+ // Will we output this locus as a haplotype or as a SNP?
+ //
+ if (ordered_loci[pos].type == haplotype) {
+ if (d[j] == NULL) {
+ //
+ // Data does not exist.
+ //
+ gtypes << "-1 ";
+ } else {
+ //
+ // Data exists, output the second haplotype. We will assume the haplotypes are
+ // numbered by their position in the loc->strings vector.
+ //
+ if (d[j]->obshap.size() > 2) {
+ // cerr << "Warning: too many haplotypes, catalog locus: " << loc->id << "\n";
+ gtypes << "-1 ";
+ } else if (d[j]->obshap.size() > 1) {
+ found = false;
+ for (uint k = 0; k < loc->strings.size(); k++)
+ if (d[j]->obshap[1] == loc->strings[k].first) {
+ found = true;
+ gtypes << k + 1 << " ";
+ }
+ if (found == false)
+ cerr << "Unable to find haplotype " << d[j]->obshap[1] << " from individual "
+ << samples[pmap->rev_sample_index(j)] << "; catalog locus: " << loc->id << "\n";
+ } else {
+ found = false;
+ for (uint k = 0; k < loc->strings.size(); k++)
+ if (d[j]->obshap[0] == loc->strings[k].first) {
+ found = true;
+ gtypes << k + 1 << " ";
+ }
+ if (found == false)
+ cerr << "Unable to find haplotype " << d[j]->obshap[0] << " from individual "
+ << samples[pmap->rev_sample_index(j)] << "; catalog locus: " << loc->id << "\n";
+ }
+ }
+ } else {
+ col = loc->snps[ordered_loci[pos].snp_index]->col;
+
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ gtypes << "? ";
+
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ gtypes << "? ";
+
+ } else if (d[j]->model[col] == 'U') {
+ gtypes << "? ";
+
+ } else {
+ tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ gtypes << "? ";
+ else if (q_allele == 0)
+ gtypes << p_allele << " ";
+ else
+ gtypes << q_allele << " ";
+ }
+ }
+ }
+ gtypes_str = gtypes.str();
+ fh << gtypes_str.substr(0, gtypes_str.length() - 1) << "\n";
+ }
+ }
+
+ fh.close();
}
cerr << "done.\n";
@@ -7200,10 +7200,10 @@ write_phase(map<int, CSLocus *> &catalog,
int
write_plink(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a PLINK file as defined here: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
@@ -7240,8 +7240,8 @@ write_plink(map<int, CSLocus *> &catalog,
ofstream fh(file.c_str(), ofstream::out);
if (fh.fail()) {
- cerr << "Error opening PLINK markers file '" << file << "'\n";
- exit(1);
+ cerr << "Error opening PLINK markers file '" << file << "'\n";
+ exit(1);
}
//
@@ -7250,21 +7250,21 @@ write_plink(map<int, CSLocus *> &catalog,
fh << "# Stacks v" << VERSION << "; " << " PLINK v1.07; " << date << "\n";
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- chr = it->first;
-
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- fh << chr << "\t"
- << loc->id << "_" << col << "\t"
- << "0\t"
- << loc->sort_bp(col) << "\n";
- }
- }
+ chr = it->first;
+
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+ if (t->nucs[col].allele_cnt == 2)
+ fh << chr << "\t"
+ << loc->id << "_" << col << "\t"
+ << "0\t"
+ << loc->sort_bp(col) << "\n";
+ }
+ }
}
fh.close();
@@ -7278,8 +7278,8 @@ write_plink(map<int, CSLocus *> &catalog,
fh.open(file.c_str(), ofstream::out);
if (fh.fail()) {
- cerr << "Error opening PLINK markers file '" << file << "'\n";
- exit(1);
+ cerr << "Error opening PLINK markers file '" << file << "'\n";
+ exit(1);
}
fh << "# Stacks v" << VERSION << "; " << " PLINK v1.07; " << date << "\n";
@@ -7292,75 +7292,75 @@ write_plink(map<int, CSLocus *> &catalog,
// marker, output the genotypes for each sample in two successive columns.
//
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- for (int j = start_index; j <= end_index; j++) {
-
- fh << pit->first << "\t"
- << samples[pmap->rev_sample_index(j)] << "\t"
- << "0\t" // Paternal ID
- << "0\t" // Maternal ID
- << "0\t" // Sex
- << "0"; // Phenotype
-
- for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
+ pop_id = psum->pop_index(pit->first);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ for (int j = start_index; j <= end_index; j++) {
+
+ fh << pit->first << "\t"
+ << samples[pmap->rev_sample_index(j)] << "\t"
+ << "0\t" // Paternal ID
+ << "0\t" // Maternal ID
+ << "0\t" // Sex
+ << "0"; // Phenotype
+
+ for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
- //
- // Output the p and q alleles
- //
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t" << "0" << "\t" << "0";
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "\t" << "0" << "\t" << "0";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t" << "0" << "\t" << "0";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "0" << "\t" << "0";
- else if (p_allele == 0)
- fh << "\t" << q_allele << "\t" << q_allele;
- else if (q_allele == 0)
- fh << "\t" << p_allele << "\t" << p_allele;
- else
- fh << "\t" << p_allele << "\t" << q_allele;
- }
- }
- }
- }
- fh << "\n";
- }
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ //
+ // If this site is fixed in all populations or has too many alleles don't output it.
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+ //
+ // Output the p and q alleles
+ //
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ fh << "\t" << "0" << "\t" << "0";
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "\t" << "0" << "\t" << "0";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "\t" << "0" << "\t" << "0";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, i, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ fh << "\t" << "0" << "\t" << "0";
+ else if (p_allele == 0)
+ fh << "\t" << q_allele << "\t" << q_allele;
+ else if (q_allele == 0)
+ fh << "\t" << p_allele << "\t" << p_allele;
+ else
+ fh << "\t" << p_allele << "\t" << q_allele;
+ }
+ }
+ }
+ }
+ fh << "\n";
+ }
}
fh.close();
@@ -7372,10 +7372,10 @@ write_plink(map<int, CSLocus *> &catalog,
int
write_beagle(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a Beagle file as defined here: http://faculty.washington.edu/browning/beagle/beagle.html
@@ -7406,176 +7406,176 @@ write_beagle(map<int, CSLocus *> &catalog,
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- //
- // We need to determine an ordering that can take into account overlapping RAD sites.
- //
- vector<GenPos> ordered_loci;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt == 2)
- ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
- }
- }
- sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
- //
- // Now output the genotypes in a separate file for each population.
- //
- map<int, pair<int, int> >::iterator pit;
- int start_index, end_index, pop_id;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- //
- // Open a markers file containing each marker, its genomic position in basepairs
- // and the two alternative alleles at this position.
- //
- pop_name.str("");
- pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".unphased.bgl.markers";
- file = in_path + pop_name.str();
-
- ofstream mfh(file.c_str(), ofstream::out);
- if (mfh.fail()) {
- cerr << "Error opening Beagle markers file '" << file << "'\n";
- exit(1);
- }
- mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
- //
- // Open the genotypes file.
- //
- pop_name.str("");
- pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".unphased.bgl";
- file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
- if (fh.fail()) {
- cerr << "Error opening Beagle genotypes file '" << file << "'\n";
- exit(1);
- }
- fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
- char p_allele, q_allele;
- //
- // Output a list of all the samples in this population.
- //
- fh << "I\tid";
- for (int j = start_index; j <= end_index; j++)
- fh << "\t" << samples[pmap->rev_sample_index(j)] << "\t" << samples[pmap->rev_sample_index(j)];
- fh << "\n";
-
- //
- // Output population IDs for each sample.
- //
- fh << "S\tid";
- for (int j = start_index; j <= end_index; j++)
- fh << "\t" << pit->first << "\t" << pit->first;
- fh << "\n";
-
- //
- // For each marker, output the genotypes for each sample in two successive columns.
- //
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
-
- s = psum->locus(loc->id);
- d = pmap->locus(loc->id);
- t = psum->locus_tally(loc->id);
- col = loc->snps[ordered_loci[pos].snp_index]->col;
-
- //
- // If this site is fixed in all populations or has too many alleles don't output it.
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- //
- // If this site is monomorphic in this population don't output it.
- //
- if (s[pop_id]->nucs[col].pi == 0.0)
- continue;
-
- //
- // Output this locus to the markers file.
- //
- mfh << loc->id << "_" << col << "\t"
- << loc->sort_bp(col) << "\t"
- << t->nucs[col].p_allele << "\t"
- << t->nucs[col].q_allele << "\n";
-
- fh << "M" << "\t" << loc->id << "_" << col;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output the p allele
- //
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- //
- // This site contains more than two alleles in this population or was filtered
- // due to a minor allele frequency that is too low.
- //
- fh << "\t" << "?";
-
- } else if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "\t" << "?";
- } else if (d[j]->model[col] == 'U') {
- //
- // Data exists, but the model call was uncertain.
- //
- fh << "\t" << "?";
- } else {
- //
- // Tally up the nucleotide calls.
- //
- tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
-
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "?";
- else if (p_allele == 0)
- fh << "\t" << q_allele;
- else
- fh << "\t" << p_allele;
- }
-
- //
- // Now output the q allele
- //
- if (s[pop_id]->nucs[col].incompatible_site ||
- s[pop_id]->nucs[col].filtered_site) {
- fh << "\t" << "?";
-
- } else if (d[j] == NULL) {
- fh << "\t" << "?";
-
- } else if (d[j]->model[col] == 'U') {
- fh << "\t" << "?";
-
- } else {
- if (p_allele == 0 && q_allele == 0)
- fh << "\t" << "?";
- else if (q_allele == 0)
- fh << "\t" << p_allele;
- else
- fh << "\t" << q_allele;
- }
- }
- fh << "\n";
- }
-
- fh.close();
- mfh.close();
- }
+ //
+ // We need to determine an ordering that can take into account overlapping RAD sites.
+ //
+ vector<GenPos> ordered_loci;
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ col = loc->snps[i]->col;
+ if (t->nucs[col].allele_cnt == 2)
+ ordered_loci.push_back(GenPos(loc->id, i, loc->sort_bp(col)));
+ }
+ }
+ sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
+
+ //
+ // Now output the genotypes in a separate file for each population.
+ //
+ map<int, pair<int, int> >::iterator pit;
+ int start_index, end_index, pop_id;
+
+ for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
+ pop_id = psum->pop_index(pit->first);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ //
+ // Open a markers file containing each marker, its genomic position in basepairs
+ // and the two alternative alleles at this position.
+ //
+ pop_name.str("");
+ pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".unphased.bgl.markers";
+ file = in_path + pop_name.str();
+
+ ofstream mfh(file.c_str(), ofstream::out);
+ if (mfh.fail()) {
+ cerr << "Error opening Beagle markers file '" << file << "'\n";
+ exit(1);
+ }
+ mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+ //
+ // Open the genotypes file.
+ //
+ pop_name.str("");
+ pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".unphased.bgl";
+ file = in_path + pop_name.str();
+
+ ofstream fh(file.c_str(), ofstream::out);
+ if (fh.fail()) {
+ cerr << "Error opening Beagle genotypes file '" << file << "'\n";
+ exit(1);
+ }
+ fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+ char p_allele, q_allele;
+ //
+ // Output a list of all the samples in this population.
+ //
+ fh << "I\tid";
+ for (int j = start_index; j <= end_index; j++)
+ fh << "\t" << samples[pmap->rev_sample_index(j)] << "\t" << samples[pmap->rev_sample_index(j)];
+ fh << "\n";
+
+ //
+ // Output population IDs for each sample.
+ //
+ fh << "S\tid";
+ for (int j = start_index; j <= end_index; j++)
+ fh << "\t" << pit->first << "\t" << pit->first;
+ fh << "\n";
+
+ //
+ // For each marker, output the genotypes for each sample in two successive columns.
+ //
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+
+ s = psum->locus(loc->id);
+ d = pmap->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+ col = loc->snps[ordered_loci[pos].snp_index]->col;
+
+ //
+ // If this site is fixed in all populations or has too many alleles don't output it.
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ //
+ // If this site is monomorphic in this population don't output it.
+ //
+ if (s[pop_id]->nucs[col].pi == 0.0)
+ continue;
+
+ //
+ // Output this locus to the markers file.
+ //
+ mfh << loc->id << "_" << col << "\t"
+ << loc->sort_bp(col) << "\t"
+ << t->nucs[col].p_allele << "\t"
+ << t->nucs[col].q_allele << "\n";
+
+ fh << "M" << "\t" << loc->id << "_" << col;
+
+ for (int j = start_index; j <= end_index; j++) {
+ //
+ // Output the p allele
+ //
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ //
+ // This site contains more than two alleles in this population or was filtered
+ // due to a minor allele frequency that is too low.
+ //
+ fh << "\t" << "?";
+
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ //
+ // Data does not exist.
+ //
+ fh << "\t" << "?";
+ } else if (d[j]->model[col] == 'U') {
+ //
+ // Data exists, but the model call was uncertain.
+ //
+ fh << "\t" << "?";
+ } else {
+ //
+ // Tally up the nucleotide calls.
+ //
+ tally_observed_haplotypes(d[j]->obshap, ordered_loci[pos].snp_index, p_allele, q_allele);
+
+ if (p_allele == 0 && q_allele == 0)
+ fh << "\t" << "?";
+ else if (p_allele == 0)
+ fh << "\t" << q_allele;
+ else
+ fh << "\t" << p_allele;
+ }
+
+ //
+ // Now output the q allele
+ //
+ if (s[pop_id]->nucs[col].incompatible_site ||
+ s[pop_id]->nucs[col].filtered_site) {
+ fh << "\t" << "?";
+
+ } else if (d[j] == NULL || col >= d[j]->len) {
+ fh << "\t" << "?";
+
+ } else if (d[j]->model[col] == 'U') {
+ fh << "\t" << "?";
+
+ } else {
+ if (p_allele == 0 && q_allele == 0)
+ fh << "\t" << "?";
+ else if (q_allele == 0)
+ fh << "\t" << p_allele;
+ else
+ fh << "\t" << q_allele;
+ }
+ }
+ fh << "\n";
+ }
+
+ fh.close();
+ mfh.close();
+ }
}
cerr << "done.\n";
@@ -7585,10 +7585,10 @@ write_beagle(map<int, CSLocus *> &catalog,
int
write_beagle_phased(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// Write a Beagle file as a set of haplotpyes as defined here:
@@ -7617,156 +7617,156 @@ write_beagle_phased(map<int, CSLocus *> &catalog,
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- //
- // We need to determine an ordering for all legitimate loci/SNPs.
- //
- vector<GenPos> ordered_loci;
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- if (loc->snps.size() == 0) continue;
-
- //
- // Check that there aren't too many haplotypes (PHASE has a max of 50).
- //
- if (loc->alleles.size() > 40) continue;
-
- //
- // Iterate over the population to determine that this subset of the population
- // has data at this locus.
- //
- d = pmap->locus(loc->id);
- for (int j = 0; j < pmap->sample_cnt(); j++) {
- if (d[j] != NULL &&
- d[j]->obshap.size() > 0 &&
- d[j]->obshap.size() <= 2) {
- //
- // Data exists, and their are the corrent number of haplotypes.
- //
- ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
- break;
- }
- }
- }
- sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
-
- //
- // Now output the genotypes in a separate file for each population.
- //
- map<int, pair<int, int> >::iterator pit;
- int start_index, end_index, pop_id;
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = psum->pop_index(pit->first);
- start_index = pit->second.first;
- end_index = pit->second.second;
-
- //
- // Open a file for writing the markers: their genomic position in basepairs
- // and the two alternative alleles at this position.
- //
- pop_name.str("");
- pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".phased.bgl.markers";
- file = in_path + pop_name.str();
-
- ofstream mfh(file.c_str(), ofstream::out);
- if (mfh.fail()) {
- cerr << "Error opening Beagle markers file '" << file << "'\n";
- exit(1);
- }
- mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
- //
- // Now output the haplotypes in a separate file.
- //
- pop_name.str("");
- pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".phased.bgl";
- file = in_path + pop_name.str();
-
- ofstream fh(file.c_str(), ofstream::out);
- if (fh.fail()) {
- cerr << "Error opening Beagle markers file '" << file << "'\n";
- exit(1);
- }
- fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
-
- //
- // Output a list of all the samples in the data set.
- //
- fh << "I\tid";
- for (int j = start_index; j <= end_index; j++)
- fh << "\t" << samples[pmap->rev_sample_index(j)] << "\t" << samples[pmap->rev_sample_index(j)];
- fh << "\n";
-
- //
- // Output population IDs for each sample.
- //
- fh << "S\tid";
- for (int j = start_index; j <= end_index; j++)
- fh << "\t" << pop_id << "\t" << pop_id;
- fh << "\n";
-
- for (uint pos = 0; pos < ordered_loci.size(); pos++) {
- loc = catalog[ordered_loci[pos].id];
- d = pmap->locus(loc->id);
-
- //
- // If this locus is monomorphic in this population don't output it.
- //
- set<string> haplotypes;
- for (int j = start_index; j <= end_index; j++) {
- if (d[j] == NULL) continue;
-
- if (d[j]->obshap.size() == 2) {
- haplotypes.insert(d[j]->obshap[0]);
- haplotypes.insert(d[j]->obshap[1]);
- } else {
- haplotypes.insert(d[j]->obshap[0]);
- }
- }
- if (haplotypes.size() == 1) continue;
-
- //
- // Output this locus to the markers file.
- //
- mfh << loc->id << "\t"
- << loc->sort_bp();
- for (uint j = 0; j < loc->strings.size(); j++)
- mfh << "\t" << loc->strings[j].first;
- mfh << "\n";
-
- //
- // For each marker, output the genotypes for each sample in two successive columns.
- //
- fh << "M" << "\t" << loc->id;
-
- for (int j = start_index; j <= end_index; j++) {
- //
- // Output the p and the q haplotype
- //
- if (d[j] == NULL) {
- //
- // Data does not exist.
- //
- fh << "\t" << "?" << "\t" << "?";
- } else {
- //
- // Data exists, output the first haplotype. We will assume the haplotypes are
- // numbered by their position in the loc->strings vector.
- //
- if (d[j]->obshap.size() > 2)
- fh << "\t" << "?" << "\t" << "?";
- else if (d[j]->obshap.size() == 2)
- fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[1];
- else
- fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[0];
- }
- }
- fh << "\n";
- }
- fh.close();
- mfh.close();
- }
+ //
+ // We need to determine an ordering for all legitimate loci/SNPs.
+ //
+ vector<GenPos> ordered_loci;
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ if (loc->snps.size() == 0) continue;
+
+ //
+ // Check that there aren't too many haplotypes (PHASE has a max of 50).
+ //
+ if (loc->alleles.size() > 40) continue;
+
+ //
+ // Iterate over the population to determine that this subset of the population
+ // has data at this locus.
+ //
+ d = pmap->locus(loc->id);
+ for (int j = 0; j < pmap->sample_cnt(); j++) {
+ if (d[j] != NULL &&
+ d[j]->obshap.size() > 0 &&
+ d[j]->obshap.size() <= 2) {
+ //
+ // Data exists, and their are the corrent number of haplotypes.
+ //
+ ordered_loci.push_back(GenPos(loc->id, 0, loc->sort_bp(), haplotype));
+ break;
+ }
+ }
+ }
+ sort(ordered_loci.begin(), ordered_loci.end(), compare_genpos);
+
+ //
+ // Now output the genotypes in a separate file for each population.
+ //
+ map<int, pair<int, int> >::iterator pit;
+ int start_index, end_index, pop_id;
+
+ for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
+ pop_id = psum->pop_index(pit->first);
+ start_index = pit->second.first;
+ end_index = pit->second.second;
+
+ //
+ // Open a file for writing the markers: their genomic position in basepairs
+ // and the two alternative alleles at this position.
+ //
+ pop_name.str("");
+ pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".phased.bgl.markers";
+ file = in_path + pop_name.str();
+
+ ofstream mfh(file.c_str(), ofstream::out);
+ if (mfh.fail()) {
+ cerr << "Error opening Beagle markers file '" << file << "'\n";
+ exit(1);
+ }
+ mfh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+ //
+ // Now output the haplotypes in a separate file.
+ //
+ pop_name.str("");
+ pop_name << "batch_" << batch_id << "." << pop_key[pit->first] << "-" << it->first << ".phased.bgl";
+ file = in_path + pop_name.str();
+
+ ofstream fh(file.c_str(), ofstream::out);
+ if (fh.fail()) {
+ cerr << "Error opening Beagle markers file '" << file << "'\n";
+ exit(1);
+ }
+ fh << "# Stacks v" << VERSION << "; " << " Beagle v3.3; " << date << "\n";
+
+ //
+ // Output a list of all the samples in the data set.
+ //
+ fh << "I\tid";
+ for (int j = start_index; j <= end_index; j++)
+ fh << "\t" << samples[pmap->rev_sample_index(j)] << "\t" << samples[pmap->rev_sample_index(j)];
+ fh << "\n";
+
+ //
+ // Output population IDs for each sample.
+ //
+ fh << "S\tid";
+ for (int j = start_index; j <= end_index; j++)
+ fh << "\t" << pop_id << "\t" << pop_id;
+ fh << "\n";
+
+ for (uint pos = 0; pos < ordered_loci.size(); pos++) {
+ loc = catalog[ordered_loci[pos].id];
+ d = pmap->locus(loc->id);
+
+ //
+ // If this locus is monomorphic in this population don't output it.
+ //
+ set<string> haplotypes;
+ for (int j = start_index; j <= end_index; j++) {
+ if (d[j] == NULL) continue;
+
+ if (d[j]->obshap.size() == 2) {
+ haplotypes.insert(d[j]->obshap[0]);
+ haplotypes.insert(d[j]->obshap[1]);
+ } else {
+ haplotypes.insert(d[j]->obshap[0]);
+ }
+ }
+ if (haplotypes.size() == 1) continue;
+
+ //
+ // Output this locus to the markers file.
+ //
+ mfh << loc->id << "\t"
+ << loc->sort_bp();
+ for (uint j = 0; j < loc->strings.size(); j++)
+ mfh << "\t" << loc->strings[j].first;
+ mfh << "\n";
+
+ //
+ // For each marker, output the genotypes for each sample in two successive columns.
+ //
+ fh << "M" << "\t" << loc->id;
+
+ for (int j = start_index; j <= end_index; j++) {
+ //
+ // Output the p and the q haplotype
+ //
+ if (d[j] == NULL) {
+ //
+ // Data does not exist.
+ //
+ fh << "\t" << "?" << "\t" << "?";
+ } else {
+ //
+ // Data exists, output the first haplotype. We will assume the haplotypes are
+ // numbered by their position in the loc->strings vector.
+ //
+ if (d[j]->obshap.size() > 2)
+ fh << "\t" << "?" << "\t" << "?";
+ else if (d[j]->obshap.size() == 2)
+ fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[1];
+ else
+ fh << "\t" << d[j]->obshap[0] << "\t" << d[j]->obshap[0];
+ }
+ }
+ fh << "\n";
+ }
+ fh.close();
+ mfh.close();
+ }
}
cerr << "done.\n";
@@ -7776,10 +7776,10 @@ write_beagle_phased(map<int, CSLocus *> &catalog,
int
write_phylip(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// We want to find loci where each locus is fixed within a population but variable between populations.
@@ -7797,7 +7797,7 @@ write_phylip(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening Phylip file '" << file << "'\n";
- exit(1);
+ exit(1);
}
pop_name << ".log";
@@ -7809,7 +7809,7 @@ write_phylip(map<int, CSLocus *> &catalog,
if (log_fh.fail()) {
cerr << "Error opening Phylip Log file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -7823,7 +7823,7 @@ write_phylip(map<int, CSLocus *> &catalog,
strftime(date, 32, "%B %d, %Y", timeinfo);
log_fh << "# Stacks v" << VERSION << "; " << " Phylip sequential; " << date << "\n"
- << "# Seq Pos\tLocus ID\tColumn\tPopulation\n";
+ << "# Seq Pos\tLocus ID\tColumn\tPopulation\n";
map<string, vector<CSLocus *> >::iterator it;
CSLocus *loc;
@@ -7842,146 +7842,146 @@ write_phylip(map<int, CSLocus *> &catalog,
int index = 0;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- if (phylip_var == false) {
- //
- // We are looking for loci that are fixed within each population, but are
- // variable between one or more populations.
- //
- if (t->nucs[col].fixed == true || t->nucs[col].allele_cnt != 2 || t->nucs[col].pop_cnt < 2)
- continue;
-
- bool fixed_within = true;
- for (int j = 0; j < pop_cnt; j++) {
- if (s[j]->nucs[col].num_indv == 0)
- continue;
- if (s[j]->nucs[col].fixed == false) {
- fixed_within = false;
- break;
- }
- }
- if (fixed_within == false) continue;
-
- log_fh << index << "\t" << loc->id << "\t" << col << "\t";
-
- for (int j = 0; j < pop_cnt; j++) {
- pop_id = psum->rev_pop_index(j);
-
- if (s[j]->nucs[col].num_indv > 0) {
- interspecific_nucs[pop_id] += s[j]->nucs[col].p_nuc;
- log_fh << pop_key[pop_id] << ":" << s[j]->nucs[col].p_nuc << ",";
- } else {
- interspecific_nucs[pop_id] += 'N';
- log_fh << pop_key[pop_id] << ":N" << ",";
- }
- }
- log_fh << "\n";
- index++;
-
- } else {
- //
- // Encode SNPs that are variable within a population as well, using IUPAC notation:
- // http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
- //
- if (t->nucs[col].allele_cnt != 2)
- continue;
-
- log_fh << index << "\t" << loc->id << "\t" << col << "\t";
-
- for (int j = 0; j < pop_cnt; j++) {
- pop_id = psum->rev_pop_index(j);
-
- switch(s[j]->nucs[col].p_nuc) {
- case 0:
- nuc = 'N';
- break;
- case 'A':
- switch(s[j]->nucs[col].q_nuc) {
- case 'C':
- nuc = 'M';
- break;
- case 'G':
- nuc = 'R';
- break;
- case 'T':
- nuc = 'W';
- break;
- case 0:
- nuc = 'A';
- break;
- }
- break;
- case 'C':
- switch(s[j]->nucs[col].q_nuc) {
- case 'A':
- nuc = 'M';
- break;
- case 'G':
- nuc = 'S';
- break;
- case 'T':
- nuc = 'Y';
- break;
- case 0:
- nuc = 'C';
- break;
- }
- break;
- case 'G':
- switch(s[j]->nucs[col].q_nuc) {
- case 'A':
- nuc = 'R';
- break;
- case 'C':
- nuc = 'S';
- break;
- case 'T':
- nuc = 'K';
- break;
- case 0:
- nuc = 'G';
- break;
- }
- break;
- case 'T':
- switch(s[j]->nucs[col].q_nuc) {
- case 'A':
- nuc = 'W';
- break;
- case 'C':
- nuc = 'Y';
- break;
- case 'G':
- nuc = 'K';
- break;
- case 0:
- nuc = 'T';
- break;
- }
- break;
- }
- interspecific_nucs[pop_id] += nuc;
- log_fh << pop_key[pop_id] << ":" << nuc << ",";
-
- }
- log_fh << "\n";
- index++;
- }
- }
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ if (phylip_var == false) {
+ //
+ // We are looking for loci that are fixed within each population, but are
+ // variable between one or more populations.
+ //
+ if (t->nucs[col].fixed == true || t->nucs[col].allele_cnt != 2 || t->nucs[col].pop_cnt < 2)
+ continue;
+
+ bool fixed_within = true;
+ for (int j = 0; j < pop_cnt; j++) {
+ if (s[j]->nucs[col].num_indv == 0)
+ continue;
+ if (s[j]->nucs[col].fixed == false) {
+ fixed_within = false;
+ break;
+ }
+ }
+ if (fixed_within == false) continue;
+
+ log_fh << index << "\t" << loc->id << "\t" << col << "\t";
+
+ for (int j = 0; j < pop_cnt; j++) {
+ pop_id = psum->rev_pop_index(j);
+
+ if (s[j]->nucs[col].num_indv > 0) {
+ interspecific_nucs[pop_id] += s[j]->nucs[col].p_nuc;
+ log_fh << pop_key[pop_id] << ":" << s[j]->nucs[col].p_nuc << ",";
+ } else {
+ interspecific_nucs[pop_id] += 'N';
+ log_fh << pop_key[pop_id] << ":N" << ",";
+ }
+ }
+ log_fh << "\n";
+ index++;
+
+ } else {
+ //
+ // Encode SNPs that are variable within a population as well, using IUPAC notation:
+ // http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
+ //
+ if (t->nucs[col].allele_cnt != 2)
+ continue;
+
+ log_fh << index << "\t" << loc->id << "\t" << col << "\t";
+
+ for (int j = 0; j < pop_cnt; j++) {
+ pop_id = psum->rev_pop_index(j);
+
+ switch(s[j]->nucs[col].p_nuc) {
+ case 0:
+ nuc = 'N';
+ break;
+ case 'A':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'C':
+ nuc = 'M';
+ break;
+ case 'G':
+ nuc = 'R';
+ break;
+ case 'T':
+ nuc = 'W';
+ break;
+ case 0:
+ nuc = 'A';
+ break;
+ }
+ break;
+ case 'C':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'A':
+ nuc = 'M';
+ break;
+ case 'G':
+ nuc = 'S';
+ break;
+ case 'T':
+ nuc = 'Y';
+ break;
+ case 0:
+ nuc = 'C';
+ break;
+ }
+ break;
+ case 'G':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'A':
+ nuc = 'R';
+ break;
+ case 'C':
+ nuc = 'S';
+ break;
+ case 'T':
+ nuc = 'K';
+ break;
+ case 0:
+ nuc = 'G';
+ break;
+ }
+ break;
+ case 'T':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'A':
+ nuc = 'W';
+ break;
+ case 'C':
+ nuc = 'Y';
+ break;
+ case 'G':
+ nuc = 'K';
+ break;
+ case 0:
+ nuc = 'T';
+ break;
+ }
+ break;
+ }
+ interspecific_nucs[pop_id] += nuc;
+ log_fh << pop_key[pop_id] << ":" << nuc << ",";
+
+ }
+ log_fh << "\n";
+ index++;
+ }
+ }
+ }
}
if (interspecific_nucs.size() == 0) {
- cerr << " No data is available to write to the Phylip file.\n";
- return 0;
+ cerr << " No data is available to write to the Phylip file.\n";
+ return 0;
}
char id_str[id_len];
@@ -7989,15 +7989,15 @@ write_phylip(map<int, CSLocus *> &catalog,
fh << pop_indexes.size() << " " << interspecific_nucs.begin()->second.length() << "\n";
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = pit->first;
+ pop_id = pit->first;
- sprintf(id_str, "%s", pop_key[pop_id].c_str());
- len = strlen(id_str);
- for (uint j = len; j < 10; j++)
- id_str[j] = ' ';
- id_str[9] = '\0';
+ sprintf(id_str, "%s", pop_key[pop_id].c_str());
+ len = strlen(id_str);
+ for (uint j = len; j < 10; j++)
+ id_str[j] = ' ';
+ id_str[9] = '\0';
- fh << id_str << " " << interspecific_nucs[pop_id] << "\n";
+ fh << id_str << " " << interspecific_nucs[pop_id] << "\n";
}
//
@@ -8015,10 +8015,10 @@ write_phylip(map<int, CSLocus *> &catalog,
int
write_fullseq_phylip(map<int, CSLocus *> &catalog,
- PopMap<CSLocus> *pmap,
- PopSum<CSLocus> *psum,
- map<int, pair<int, int> > &pop_indexes,
- map<int, string> &samples)
+ PopMap<CSLocus> *pmap,
+ PopSum<CSLocus> *psum,
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, string> &samples)
{
//
// We want to write all variable loci in Phylip interleaved format. Polymorphic positions
@@ -8037,7 +8037,7 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
if (fh.fail()) {
cerr << "Error opening Phylip file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -8052,7 +8052,7 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
if (par_fh.fail()) {
cerr << "Error opening Phylip partitions file '" << file << "'\n";
- exit(1);
+ exit(1);
}
pop_name.str("");
@@ -8065,7 +8065,7 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
if (log_fh.fail()) {
cerr << "Error opening Phylip Log file '" << file << "'\n";
- exit(1);
+ exit(1);
}
//
@@ -8079,7 +8079,7 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
strftime(date, 32, "%B %d, %Y", timeinfo);
log_fh << "# Stacks v" << VERSION << "; " << " Phylip interleaved; " << date << "\n"
- << "# Locus ID\tLine Number";
+ << "# Locus ID\tLine Number";
if (loci_ordered) log_fh << "\tChr\tBasepair";
log_fh << "\n";
@@ -8101,22 +8101,22 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
// Determine the length of sequence we will output.
//
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
- t = psum->locus_tally(loc->id);
+ t = psum->locus_tally(loc->id);
- include = true;
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
+ include = true;
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
- if (t->nucs[col].allele_cnt != 2)
- include = false;
- }
+ if (t->nucs[col].allele_cnt != 2)
+ include = false;
+ }
- if (include)
- len += strlen(loc->con);
- }
+ if (include)
+ len += strlen(loc->con);
+ }
}
map<int, string> outstrs;
@@ -8124,16 +8124,16 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
fh << pop_indexes.size() << " " << len << "\n";
for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = pit->first;
+ pop_id = pit->first;
- outstrs[pop_id] = "";
- sprintf(id_str, "%s", pop_key[pop_id].c_str());
- len = strlen(id_str);
- for (uint j = len; j < 10; j++)
- id_str[j] = ' ';
- id_str[9] = '\0';
+ outstrs[pop_id] = "";
+ sprintf(id_str, "%s", pop_key[pop_id].c_str());
+ len = strlen(id_str);
+ for (uint j = len; j < 10; j++)
+ id_str[j] = ' ';
+ id_str[9] = '\0';
- outstrs[pop_id] += string(id_str) + " ";
+ outstrs[pop_id] += string(id_str) + " ";
}
char *seq;
@@ -8142,130 +8142,130 @@ write_fullseq_phylip(map<int, CSLocus *> &catalog,
int cnt = 1;
for (it = pmap->ordered_loci.begin(); it != pmap->ordered_loci.end(); it++) {
- for (uint pos = 0; pos < it->second.size(); pos++) {
- loc = it->second[pos];
-
- s = psum->locus(loc->id);
- t = psum->locus_tally(loc->id);
-
- include = true;
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- if (t->nucs[col].allele_cnt != 2)
- include = false;
- }
-
- if (!include)
- continue;
-
- seq = new char[loc->len + 1];
- strcpy(seq, loc->con);
-
- for (int j = 0; j < pop_cnt; j++) {
- pop_id = psum->rev_pop_index(j);
-
- for (uint i = 0; i < loc->snps.size(); i++) {
- uint col = loc->snps[i]->col;
-
- //
- // Encode SNPs that are variable within a population using IUPAC notation:
- // http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
- //
- switch(s[j]->nucs[col].p_nuc) {
- case 0:
- nuc = 'N';
- break;
- case 'A':
- switch(s[j]->nucs[col].q_nuc) {
- case 'C':
- nuc = 'M';
- break;
- case 'G':
- nuc = 'R';
- break;
- case 'T':
- nuc = 'W';
- break;
- case 0:
- nuc = 'A';
- break;
- }
- break;
- case 'C':
- switch(s[j]->nucs[col].q_nuc) {
- case 'A':
- nuc = 'M';
- break;
- case 'G':
- nuc = 'S';
- break;
- case 'T':
- nuc = 'Y';
- break;
- case 0:
- nuc = 'C';
- break;
- }
- break;
- case 'G':
- switch(s[j]->nucs[col].q_nuc) {
- case 'A':
- nuc = 'R';
- break;
- case 'C':
- nuc = 'S';
- break;
- case 'T':
- nuc = 'K';
- break;
- case 0:
- nuc = 'G';
- break;
- }
- break;
- case 'T':
- switch(s[j]->nucs[col].q_nuc) {
- case 'A':
- nuc = 'W';
- break;
- case 'C':
- nuc = 'Y';
- break;
- case 'G':
- nuc = 'K';
- break;
- case 0:
- nuc = 'T';
- break;
- }
- break;
- }
-
- seq[col] = nuc;
- }
-
- outstrs[pop_id] += string(seq);
- }
- delete [] seq;
-
- log_fh << line << "\t" << loc->id;
- if (loci_ordered) log_fh << "\t" << loc->loc.chr << "\t" << loc->sort_bp() + 1;
- log_fh << "\n";
-
- for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
- pop_id = pit->first;
- fh << outstrs[pop_id] << "\n";
- outstrs[pop_id] = "";
- line++;
- }
- fh << "\n";
- line++;
-
- par_fh << "DNA, p" << cnt << "=" << index << "-" << index + loc->len - 1 << "\n";
- index += loc->len;
- cnt++;
- }
+ for (uint pos = 0; pos < it->second.size(); pos++) {
+ loc = it->second[pos];
+
+ s = psum->locus(loc->id);
+ t = psum->locus_tally(loc->id);
+
+ include = true;
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ if (t->nucs[col].allele_cnt != 2)
+ include = false;
+ }
+
+ if (!include)
+ continue;
+
+ seq = new char[loc->len + 1];
+ strcpy(seq, loc->con);
+
+ for (int j = 0; j < pop_cnt; j++) {
+ pop_id = psum->rev_pop_index(j);
+
+ for (uint i = 0; i < loc->snps.size(); i++) {
+ uint col = loc->snps[i]->col;
+
+ //
+ // Encode SNPs that are variable within a population using IUPAC notation:
+ // http://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation
+ //
+ switch(s[j]->nucs[col].p_nuc) {
+ case 0:
+ nuc = 'N';
+ break;
+ case 'A':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'C':
+ nuc = 'M';
+ break;
+ case 'G':
+ nuc = 'R';
+ break;
+ case 'T':
+ nuc = 'W';
+ break;
+ case 0:
+ nuc = 'A';
+ break;
+ }
+ break;
+ case 'C':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'A':
+ nuc = 'M';
+ break;
+ case 'G':
+ nuc = 'S';
+ break;
+ case 'T':
+ nuc = 'Y';
+ break;
+ case 0:
+ nuc = 'C';
+ break;
+ }
+ break;
+ case 'G':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'A':
+ nuc = 'R';
+ break;
+ case 'C':
+ nuc = 'S';
+ break;
+ case 'T':
+ nuc = 'K';
+ break;
+ case 0:
+ nuc = 'G';
+ break;
+ }
+ break;
+ case 'T':
+ switch(s[j]->nucs[col].q_nuc) {
+ case 'A':
+ nuc = 'W';
+ break;
+ case 'C':
+ nuc = 'Y';
+ break;
+ case 'G':
+ nuc = 'K';
+ break;
+ case 0:
+ nuc = 'T';
+ break;
+ }
+ break;
+ }
+
+ seq[col] = nuc;
+ }
+
+ outstrs[pop_id] += string(seq);
+ }
+ delete [] seq;
+
+ log_fh << line << "\t" << loc->id;
+ if (loci_ordered) log_fh << "\t" << loc->loc.chr << "\t" << loc->sort_bp() + 1;
+ log_fh << "\n";
+
+ for (pit = pop_indexes.begin(); pit != pop_indexes.end(); pit++) {
+ pop_id = pit->first;
+ fh << outstrs[pop_id] << "\n";
+ outstrs[pop_id] = "";
+ line++;
+ }
+ fh << "\n";
+ line++;
+
+ par_fh << "DNA, p" << cnt << "=" << index << "-" << index + loc->len - 1 << "\n";
+ index += loc->len;
+ cnt++;
+ }
}
//
@@ -8289,30 +8289,30 @@ tally_ref_alleles(LocSum **s, int pop_cnt, int snp_index, char &p_allele, char &
char nuc[2];
for (int j = 0; j < pop_cnt; j++) {
- nuc[0] = 0;
- nuc[1] = 0;
+ nuc[0] = 0;
+ nuc[1] = 0;
nuc[0] = s[j]->nucs[snp_index].p_nuc;
nuc[1] = s[j]->nucs[snp_index].q_nuc;
- for (uint k = 0; k < 2; k++)
- switch(nuc[k]) {
- case 'A':
- case 'a':
- nucs[0]++;
- break;
- case 'C':
- case 'c':
- nucs[1]++;
- break;
- case 'G':
- case 'g':
- nucs[2]++;
- break;
- case 'T':
- case 't':
- nucs[3]++;
- break;
- }
+ for (uint k = 0; k < 2; k++)
+ switch(nuc[k]) {
+ case 'A':
+ case 'a':
+ nucs[0]++;
+ break;
+ case 'C':
+ case 'c':
+ nucs[1]++;
+ break;
+ case 'G':
+ case 'g':
+ nucs[2]++;
+ break;
+ case 'T':
+ case 't':
+ nucs[3]++;
+ break;
+ }
}
//
@@ -8323,12 +8323,12 @@ tally_ref_alleles(LocSum **s, int pop_cnt, int snp_index, char &p_allele, char &
int i;
int allele_cnt = 0;
for (i = 0; i < 4; i++)
- if (nucs[i] > 0) allele_cnt++;
+ if (nucs[i] > 0) allele_cnt++;
if (allele_cnt > 2) {
- p_allele = 0;
- q_allele = 0;
- return 0;
+ p_allele = 0;
+ q_allele = 0;
+ return 0;
}
//
@@ -8339,39 +8339,39 @@ tally_ref_alleles(LocSum **s, int pop_cnt, int snp_index, char &p_allele, char &
i = 0;
while (p_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 0:
- p_allele = 'A';
- break;
- case 1:
- p_allele = 'C';
- break;
- case 2:
- p_allele = 'G';
- break;
- case 3:
- p_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 0:
+ p_allele = 'A';
+ break;
+ case 1:
+ p_allele = 'C';
+ break;
+ case 2:
+ p_allele = 'G';
+ break;
+ case 3:
+ p_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
while (q_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 1:
- q_allele = 'C';
- break;
- case 2:
- q_allele = 'G';
- break;
- case 3:
- q_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 1:
+ q_allele = 'C';
+ break;
+ case 2:
+ q_allele = 'G';
+ break;
+ case 3:
+ q_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
return 1;
@@ -8389,24 +8389,24 @@ tally_observed_haplotypes(vector<char *> &obshap, int snp_index, char &p_allele,
for (uint j = 0; j < obshap.size(); j++) {
nuc = obshap[j][snp_index];
- switch(nuc) {
- case 'A':
- case 'a':
- nucs[0]++;
- break;
- case 'C':
- case 'c':
- nucs[1]++;
- break;
- case 'G':
- case 'g':
- nucs[2]++;
- break;
- case 'T':
- case 't':
- nucs[3]++;
- break;
- }
+ switch(nuc) {
+ case 'A':
+ case 'a':
+ nucs[0]++;
+ break;
+ case 'C':
+ case 'c':
+ nucs[1]++;
+ break;
+ case 'G':
+ case 'g':
+ nucs[2]++;
+ break;
+ case 'T':
+ case 't':
+ nucs[3]++;
+ break;
+ }
}
//
@@ -8417,12 +8417,12 @@ tally_observed_haplotypes(vector<char *> &obshap, int snp_index, char &p_allele,
int i;
int allele_cnt = 0;
for (i = 0; i < 4; i++)
- if (nucs[i] > 0) allele_cnt++;
+ if (nucs[i] > 0) allele_cnt++;
if (allele_cnt > 2) {
- p_allele = 0;
- q_allele = 0;
- return -1;
+ p_allele = 0;
+ q_allele = 0;
+ return -1;
}
//
@@ -8433,39 +8433,39 @@ tally_observed_haplotypes(vector<char *> &obshap, int snp_index, char &p_allele,
i = 0;
while (p_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 0:
- p_allele = 'A';
- break;
- case 1:
- p_allele = 'C';
- break;
- case 2:
- p_allele = 'G';
- break;
- case 3:
- p_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 0:
+ p_allele = 'A';
+ break;
+ case 1:
+ p_allele = 'C';
+ break;
+ case 2:
+ p_allele = 'G';
+ break;
+ case 3:
+ p_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
while (q_allele == 0 && i < 4) {
- if (nucs[i] > 0) {
- switch(i) {
- case 1:
- q_allele = 'C';
- break;
- case 2:
- q_allele = 'G';
- break;
- case 3:
- q_allele = 'T';
- break;
- }
- }
- i++;
+ if (nucs[i] > 0) {
+ switch(i) {
+ case 1:
+ q_allele = 'C';
+ break;
+ case 2:
+ q_allele = 'G';
+ break;
+ case 3:
+ q_allele = 'T';
+ break;
+ }
+ }
+ i++;
}
return 0;
@@ -8477,34 +8477,34 @@ int load_marker_list(string path, set<int> &list) {
if (fh.fail()) {
cerr << "Error opening white/black list file '" << path << "'\n";
- exit(1);
+ exit(1);
}
int marker;
char *p, *e;
while (fh.good()) {
- fh.getline(line, id_len);
+ fh.getline(line, id_len);
- if (strlen(line) == 0) continue;
+ if (strlen(line) == 0) continue;
- //
- // Skip commented lines.
- //
- for (p = line; isspace(*p) && *p != '\0'; p++);
- if (*p == '#') continue;
+ //
+ // Skip commented lines.
+ //
+ for (p = line; isspace(*p) && *p != '\0'; p++);
+ if (*p == '#') continue;
- marker = (int) strtol(line, &e, 10);
+ marker = (int) strtol(line, &e, 10);
- if (*e == '\0')
- list.insert(marker);
+ if (*e == '\0')
+ list.insert(marker);
}
fh.close();
if (list.size() == 0) {
- cerr << "Unable to load any markers from '" << path << "'\n";
- exit(1);
+ cerr << "Unable to load any markers from '" << path << "'\n";
+ exit(1);
}
return 0;
@@ -8516,7 +8516,7 @@ int load_marker_column_list(string path, map<int, set<int> > &list) {
if (fh.fail()) {
cerr << "Error opening white/black list file '" << path << "'\n";
- exit(1);
+ exit(1);
}
vector<string> parts;
@@ -8525,56 +8525,56 @@ int load_marker_column_list(string path, map<int, set<int> > &list) {
uint line_num = 1;
while (fh.good()) {
- fh.getline(line, id_len);
-
- if (strlen(line) == 0) continue;
-
- //
- // Skip commented lines.
- //
- for (p = line; isspace(*p) && *p != '\0'; p++);
- if (*p == '#') continue;
-
- //
- // Parse the whitelist, we expect:
- // <marker>[<tab><snp column>]
- //
- parse_tsv(line, parts);
-
- if (parts.size() > 2) {
- cerr << "Too many columns in whitelist " << path << "' at line " << line_num << "\n";
- exit(1);
-
- } else if (parts.size() == 2) {
- marker = (int) strtol(parts[0].c_str(), &e, 10);
- if (*e != '\0') {
- cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
- exit(1);
- }
- col = (int) strtol(parts[1].c_str(), &e, 10);
- if (*e != '\0') {
- cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
- exit(1);
- }
- list[marker].insert(col);
-
- } else {
- marker = (int) strtol(parts[0].c_str(), &e, 10);
- if (*e != '\0') {
- cerr << "Unable to parse whitelist, '" << path << "' at line " << line << "\n";
- exit(1);
- }
- list.insert(make_pair(marker, std::set<int>()));
- }
-
- line_num++;
+ fh.getline(line, id_len);
+
+ if (strlen(line) == 0) continue;
+
+ //
+ // Skip commented lines.
+ //
+ for (p = line; isspace(*p) && *p != '\0'; p++);
+ if (*p == '#') continue;
+
+ //
+ // Parse the whitelist, we expect:
+ // <marker>[<tab><snp column>]
+ //
+ parse_tsv(line, parts);
+
+ if (parts.size() > 2) {
+ cerr << "Too many columns in whitelist " << path << "' at line " << line_num << "\n";
+ exit(1);
+
+ } else if (parts.size() == 2) {
+ marker = (int) strtol(parts[0].c_str(), &e, 10);
+ if (*e != '\0') {
+ cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
+ exit(1);
+ }
+ col = (int) strtol(parts[1].c_str(), &e, 10);
+ if (*e != '\0') {
+ cerr << "Unable to parse whitelist, '" << path << "' at line " << line_num << "\n";
+ exit(1);
+ }
+ list[marker].insert(col);
+
+ } else {
+ marker = (int) strtol(parts[0].c_str(), &e, 10);
+ if (*e != '\0') {
+ cerr << "Unable to parse whitelist, '" << path << "' at line " << line << "\n";
+ exit(1);
+ }
+ list.insert(make_pair(marker, std::set<int>()));
+ }
+
+ line_num++;
}
fh.close();
if (list.size() == 0) {
- cerr << "Unable to load any markers from '" << path << "'\n";
- help();
+ cerr << "Unable to load any markers from '" << path << "'\n";
+ help();
}
return 0;
@@ -8582,8 +8582,8 @@ int load_marker_column_list(string path, map<int, set<int> > &list) {
int
build_file_list(vector<pair<int, string> > &files,
- map<int, pair<int, int> > &pop_indexes,
- map<int, vector<int> > &grp_members)
+ map<int, pair<int, int> > &pop_indexes,
+ map<int, vector<int> > &grp_members)
{
char line[max_len];
vector<string> parts;
@@ -8593,145 +8593,145 @@ build_file_list(vector<pair<int, string> > &files,
uint len;
if (pmap_path.length() > 0) {
- cerr << "Parsing population map.\n";
-
- ifstream fh(pmap_path.c_str(), ifstream::in);
-
- if (fh.fail()) {
- cerr << "Error opening population map '" << pmap_path << "'\n";
- return 0;
- }
-
- uint pop_id = 0;
- uint grp_id = 0;
-
- while (fh.good()) {
- fh.getline(line, max_len);
-
- len = strlen(line);
- if (len == 0) continue;
-
- //
- // Check that there is no carraige return in the buffer.
- //
- if (line[len - 1] == '\r') line[len - 1] = '\0';
-
- //
- // Ignore comments
- //
- if (line[0] == '#') continue;
-
- //
- // Parse the population map, we expect:
- // <file name><tab><population string>[<tab><group string>]
- //
- parse_tsv(line, parts);
-
- if (parts.size() < 2 || parts.size() > 3) {
- cerr << "Population map is not formated correctly: expecting two or three, tab separated columns, found " << parts.size() << ".\n";
- return 0;
- }
-
- //
- // Have we seen this population or group before?
- //
- if (pop_names.count(parts[1]) == 0) {
- pop_names.insert(parts[1]);
- pop_id++;
- pop_key[pop_id] = parts[1];
- pop_key_rev[parts[1]] = pop_id;
-
- //
- // If this is the first time we have seen this population, but not the
- // first time we have seen this group, add the population to the group list.
- //
- if (parts.size() == 3 && grp_key_rev.count(parts[2]) > 0)
- grp_members[grp_key_rev[parts[2]]].push_back(pop_id);
- }
- if (parts.size() == 3 && grp_names.count(parts[2]) == 0) {
- grp_names.insert(parts[2]);
- grp_id++;
- grp_key[grp_id] = parts[2];
- grp_key_rev[parts[2]] = grp_id;
-
- //
- // Associate the current population with the group.
- //
- grp_members[grp_id].push_back(pop_id);
- }
-
- //
- // Test that file exists before adding to list.
- //
- ifstream test_fh;
- gzFile gz_test_fh;
-
- f = in_path.c_str() + parts[0] + ".matches.tsv";
- test_fh.open(f.c_str());
-
- if (test_fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = in_path.c_str() + parts[0] + ".matches.tsv.gz";
- gz_test_fh = gzopen(f.c_str(), "rb");
- if (!gz_test_fh) {
- cerr << " Unable to find " << f.c_str() << ", excluding it from the analysis.\n";
- } else {
- gzclose(gz_test_fh);
- files.push_back(make_pair(pop_key_rev[parts[1]], parts[0]));
- }
- } else {
- test_fh.close();
- files.push_back(make_pair(pop_key_rev[parts[1]], parts[0]));
- }
- }
-
- fh.close();
+ cerr << "Parsing population map.\n";
+
+ ifstream fh(pmap_path.c_str(), ifstream::in);
+
+ if (fh.fail()) {
+ cerr << "Error opening population map '" << pmap_path << "'\n";
+ return 0;
+ }
+
+ uint pop_id = 0;
+ uint grp_id = 0;
+
+ while (fh.good()) {
+ fh.getline(line, max_len);
+
+ len = strlen(line);
+ if (len == 0) continue;
+
+ //
+ // Check that there is no carriage return in the buffer.
+ //
+ if (line[len - 1] == '\r') line[len - 1] = '\0';
+
+ //
+ // Ignore comments
+ //
+ if (line[0] == '#') continue;
+
+ //
+ // Parse the population map, we expect:
+ // <file name><tab><population string>[<tab><group string>]
+ //
+ parse_tsv(line, parts);
+
+ if (parts.size() < 2 || parts.size() > 3) {
+ cerr << "Population map is not formatted correctly: expecting two or three, tab separated columns, found " << parts.size() << ".\n";
+ return 0;
+ }
+
+ //
+ // Have we seen this population or group before?
+ //
+ if (pop_names.count(parts[1]) == 0) {
+ pop_names.insert(parts[1]);
+ pop_id++;
+ pop_key[pop_id] = parts[1];
+ pop_key_rev[parts[1]] = pop_id;
+
+ //
+ // If this is the first time we have seen this population, but not the
+ // first time we have seen this group, add the population to the group list.
+ //
+ if (parts.size() == 3 && grp_key_rev.count(parts[2]) > 0)
+ grp_members[grp_key_rev[parts[2]]].push_back(pop_id);
+ }
+ if (parts.size() == 3 && grp_names.count(parts[2]) == 0) {
+ grp_names.insert(parts[2]);
+ grp_id++;
+ grp_key[grp_id] = parts[2];
+ grp_key_rev[parts[2]] = grp_id;
+
+ //
+ // Associate the current population with the group.
+ //
+ grp_members[grp_id].push_back(pop_id);
+ }
+
+ //
+ // Test that file exists before adding to list.
+ //
+ ifstream test_fh;
+ gzFile gz_test_fh;
+
+ f = in_path.c_str() + parts[0] + ".matches.tsv";
+ test_fh.open(f.c_str());
+
+ if (test_fh.fail()) {
+ //
+ // Test for a gzipped file.
+ //
+ f = in_path.c_str() + parts[0] + ".matches.tsv.gz";
+ gz_test_fh = gzopen(f.c_str(), "rb");
+ if (!gz_test_fh) {
+ cerr << " Unable to find " << f.c_str() << ", excluding it from the analysis.\n";
+ } else {
+ gzclose(gz_test_fh);
+ files.push_back(make_pair(pop_key_rev[parts[1]], parts[0]));
+ }
+ } else {
+ test_fh.close();
+ files.push_back(make_pair(pop_key_rev[parts[1]], parts[0]));
+ }
+ }
+
+ fh.close();
} else {
- cerr << "No population map specified, building file list.\n";
+ cerr << "No population map specified, building file list.\n";
- //
- // If no population map is specified, read all the files from the Stacks directory.
- //
- uint pos;
- string file;
- struct dirent *direntry;
+ //
+ // If no population map is specified, read all the files from the Stacks directory.
+ //
+ uint pos;
+ string file;
+ struct dirent *direntry;
- DIR *dir = opendir(in_path.c_str());
+ DIR *dir = opendir(in_path.c_str());
- if (dir == NULL) {
- cerr << "Unable to open directory '" << in_path << "' for reading.\n";
- exit(1);
- }
+ if (dir == NULL) {
+ cerr << "Unable to open directory '" << in_path << "' for reading.\n";
+ exit(1);
+ }
- while ((direntry = readdir(dir)) != NULL) {
- file = direntry->d_name;
+ while ((direntry = readdir(dir)) != NULL) {
+ file = direntry->d_name;
- if (file == "." || file == "..")
- continue;
+ if (file == "." || file == "..")
+ continue;
- if (file.substr(0, 6) == "batch_")
- continue;
+ if (file.substr(0, 6) == "batch_")
+ continue;
- pos = file.rfind(".tags.tsv");
- if (pos < file.length()) {
- files.push_back(make_pair(1, file.substr(0, pos)));
- } else {
- pos = file.rfind(".tags.tsv.gz");
- if (pos < file.length())
- files.push_back(make_pair(1, file.substr(0, pos)));
- }
- }
+ pos = file.rfind(".tags.tsv");
+ if (pos < file.length()) {
+ files.push_back(make_pair(1, file.substr(0, pos)));
+ } else {
+ pos = file.rfind(".tags.tsv.gz");
+ if (pos < file.length())
+ files.push_back(make_pair(1, file.substr(0, pos)));
+ }
+ }
- pop_key[1] = "1";
+ pop_key[1] = "1";
- closedir(dir);
+ closedir(dir);
}
if (files.size() == 0) {
- cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
- return 0;
+ cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
+ return 0;
}
//
@@ -8749,36 +8749,36 @@ build_file_list(vector<pair<int, string> > &files,
int pop_id = files[0].first;
do {
- end++;
- if (pop_id != files[end].first) {
- pop_indexes[pop_id] = make_pair(start, end - 1);
- start = end;
- pop_id = files[end].first;
- }
+ end++;
+ if (pop_id != files[end].first) {
+ pop_indexes[pop_id] = make_pair(start, end - 1);
+ start = end;
+ pop_id = files[end].first;
+ }
} while (end < (int) files.size());
pop_indexes.size() == 1 ?
- cerr << " " << pop_indexes.size() << " population found\n" :
- cerr << " " << pop_indexes.size() << " populations found\n";
+ cerr << " " << pop_indexes.size() << " population found\n" :
+ cerr << " " << pop_indexes.size() << " populations found\n";
if (population_limit > (int) pop_indexes.size()) {
- cerr << "Population limit ("
- << population_limit
- << ") larger than number of popualtions present, adjusting parameter to "
- << pop_indexes.size() << "\n";
- population_limit = pop_indexes.size();
+ cerr << "Population limit ("
+ << population_limit
+ << ") larger than number of populations present, adjusting parameter to "
+ << pop_indexes.size() << "\n";
+ population_limit = pop_indexes.size();
}
map<int, pair<int, int> >::iterator it;
for (it = pop_indexes.begin(); it != pop_indexes.end(); it++) {
- start = it->second.first;
- end = it->second.second;
- cerr << " " << pop_key[it->first] << ": ";
- for (int i = start; i <= end; i++) {
- cerr << files[i].second;
- if (i < end) cerr << ", ";
- }
- cerr << "\n";
+ start = it->second.first;
+ end = it->second.second;
+ cerr << " " << pop_key[it->first] << ": ";
+ for (int i = start; i <= end; i++) {
+ cerr << files[i].second;
+ if (i < end) cerr << ", ";
+ }
+ cerr << "\n";
}
//
@@ -8786,22 +8786,22 @@ build_file_list(vector<pair<int, string> > &files,
// group with each population ID as a member.
//
if (grp_members.size() == 0) {
- for (it = pop_indexes.begin(); it != pop_indexes.end(); it++)
- grp_members[1].push_back(it->first);
- grp_key[1] = "1";
+ for (it = pop_indexes.begin(); it != pop_indexes.end(); it++)
+ grp_members[1].push_back(it->first);
+ grp_key[1] = "1";
}
grp_members.size() == 1 ?
- cerr << " " << grp_members.size() << " group of populations found\n" :
- cerr << " " << grp_members.size() << " groups of populations found\n";
+ cerr << " " << grp_members.size() << " group of populations found\n" :
+ cerr << " " << grp_members.size() << " groups of populations found\n";
map<int, vector<int> >::iterator git;
for (git = grp_members.begin(); git != grp_members.end(); git++) {
- cerr << " " << grp_key[git->first] << ": ";
- for (uint i = 0; i < git->second.size(); i++) {
- cerr << pop_key[git->second[i]];
- if (i < git->second.size() - 1) cerr << ", ";
- }
- cerr << "\n";
+ cerr << " " << grp_key[git->first] << ": ";
+ for (uint i = 0; i < git->second.size(); i++) {
+ cerr << pop_key[git->second[i]];
+ if (i < git->second.size() - 1) cerr << ", ";
+ }
+ cerr << "\n";
}
return 1;
@@ -8809,7 +8809,7 @@ build_file_list(vector<pair<int, string> > &files,
bool compare_pop_map(pair<int, string> a, pair<int, string> b) {
if (a.first == b.first)
- return (a.second < b.second);
+ return (a.second < b.second);
return (a.first < b.first);
}
@@ -8825,8 +8825,8 @@ int parse_command_line(int argc, char* argv[]) {
int c;
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'v'},
{"verbose", no_argument, NULL, 'd'},
{"sql", no_argument, NULL, 's'},
@@ -8841,324 +8841,324 @@ int parse_command_line(int argc, char* argv[]) {
{"beagle_phased", no_argument, NULL, 'H'},
{"plink", no_argument, NULL, 'K'},
{"genomic", no_argument, NULL, 'g'},
- {"genepop", no_argument, NULL, 'G'},
- {"phylip", no_argument, NULL, 'Y'},
- {"phylip_var", no_argument, NULL, 'L'},
- {"phylip_var_all", no_argument, NULL, 'T'},
- {"hzar", no_argument, NULL, 'Z'},
- {"treemix", no_argument, NULL, 'U'},
- {"merge_sites", no_argument, NULL, 'D'},
- {"window_size", required_argument, NULL, 'w'},
- {"num_threads", required_argument, NULL, 't'},
- {"batch_id", required_argument, NULL, 'b'},
- {"in_path", required_argument, NULL, 'P'},
- {"progeny", required_argument, NULL, 'r'},
- {"min_depth", required_argument, NULL, 'm'},
- {"renz", required_argument, NULL, 'e'},
- {"pop_map", required_argument, NULL, 'M'},
- {"whitelist", required_argument, NULL, 'W'},
- {"blacklist", required_argument, NULL, 'B'},
- {"write_single_snp", no_argument, NULL, 'I'},
- {"write_random_snp", no_argument, NULL, 'j'},
- {"ordered_export", no_argument, NULL, 'N'},
+ {"genepop", no_argument, NULL, 'G'},
+ {"phylip", no_argument, NULL, 'Y'},
+ {"phylip_var", no_argument, NULL, 'L'},
+ {"phylip_var_all", no_argument, NULL, 'T'},
+ {"hzar", no_argument, NULL, 'Z'},
+ {"treemix", no_argument, NULL, 'U'},
+ {"merge_sites", no_argument, NULL, 'D'},
+ {"window_size", required_argument, NULL, 'w'},
+ {"num_threads", required_argument, NULL, 't'},
+ {"batch_id", required_argument, NULL, 'b'},
+ {"in_path", required_argument, NULL, 'P'},
+ {"progeny", required_argument, NULL, 'r'},
+ {"min_depth", required_argument, NULL, 'm'},
+ {"renz", required_argument, NULL, 'e'},
+ {"pop_map", required_argument, NULL, 'M'},
+ {"whitelist", required_argument, NULL, 'W'},
+ {"blacklist", required_argument, NULL, 'B'},
+ {"write_single_snp", no_argument, NULL, 'I'},
+ {"write_random_snp", no_argument, NULL, 'j'},
+ {"ordered_export", no_argument, NULL, 'N'},
{"kernel_smoothed", no_argument, NULL, 'k'},
{"fstats", no_argument, NULL, '6'},
{"log_fst_comp", no_argument, NULL, 'l'},
{"bootstrap_type", required_argument, NULL, 'O'},
- {"bootstrap_reps", required_argument, NULL, 'R'},
- {"bootstrap_wl", required_argument, NULL, 'Q'},
+ {"bootstrap_reps", required_argument, NULL, 'R'},
+ {"bootstrap_wl", required_argument, NULL, 'Q'},
{"bootstrap", no_argument, NULL, '1'},
{"bootstrap_fst", no_argument, NULL, '2'},
{"bootstrap_phist", no_argument, NULL, '3'},
{"bootstrap_div", no_argument, NULL, '4'},
{"bootstrap_pifis", no_argument, NULL, '5'},
- {"min_populations", required_argument, NULL, 'p'},
- {"min_maf", required_argument, NULL, 'a'},
- {"max_obs_het", required_argument, NULL, 'q'},
- {"lnl_lim", required_argument, NULL, 'c'},
- {"merge_prune_lim", required_argument, NULL, 'i'},
- {"fst_correction", required_argument, NULL, 'f'},
- {"p_value_cutoff", required_argument, NULL, 'u'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
+ {"min_populations", required_argument, NULL, 'p'},
+ {"min_maf", required_argument, NULL, 'a'},
+ {"max_obs_het", required_argument, NULL, 'q'},
+ {"lnl_lim", required_argument, NULL, 'c'},
+ {"merge_prune_lim", required_argument, NULL, 'i'},
+ {"fst_correction", required_argument, NULL, 'f'},
+ {"p_value_cutoff", required_argument, NULL, 'u'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
- c = getopt_long(argc, argv, "ACDEFGHJKLNSTUVYZ123456dghjklnsva:b:c:e:f:i:m:o:p:q:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, &option_index);
+ c = getopt_long(argc, argv, "ACDEFGHJKLNSTUVYZ123456dghjklnsva:b:c:e:f:i:m:o:p:q:r:t:u:w:B:I:M:O:P:R:Q:W:", long_options, &option_index);
- // Detect the end of the options.
- if (c == -1)
- break;
+ // Detect the end of the options.
+ if (c == -1)
+ break;
- switch (c) {
- case 'h':
- help();
- break;
- case 'd':
- verbose = true;
- break;
- case 't':
- num_threads = atoi(optarg);
- break;
- case 'P':
- in_path = optarg;
- break;
- case 'M':
- pmap_path = optarg;
- break;
- case 'D':
- merge_sites = true;
- break;
- case 'i':
- merge_prune_lim = is_double(optarg);
- break;
- case 'q':
- max_obs_het = is_double(optarg);
- break;
- case 'b':
- batch_id = is_integer(optarg);
- if (batch_id < 0) {
- cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 'r':
- sample_limit = atof(optarg);
- break;
- case 'p':
- population_limit = atoi(optarg);
- break;
- case 'k':
- kernel_smoothed = true;
- calc_fstats = true;
- break;
- case '6':
- calc_fstats = true;
- break;
- case 'l':
- log_fst_comp = true;
- break;
- case '1':
- bootstrap = true;
- bootstrap_fst = true;
- bootstrap_phist = true;
- bootstrap_pifis = true;
- bootstrap_div = true;
- break;
- case '2':
- bootstrap_fst = true;
- break;
- case '3':
- bootstrap_phist = true;
- break;
- case '4':
- bootstrap_div = true;
- break;
- case '5':
- bootstrap_pifis = true;
- break;
- case 'O':
- if (strcasecmp(optarg, "exact") == 0)
- bootstrap_type = bs_exact;
- else if (strcasecmp(optarg, "approx") == 0)
- bootstrap_type = bs_approx;
- else {
- cerr << "Unknown bootstrap type specified '" << optarg << "'\n";
- help();
- }
- break;
- case 'R':
- bootstrap_reps = atoi(optarg);
- break;
- case 'Q':
- bs_wl_file = optarg;
- bootstrap_wl = true;
- break;
- case 'c':
- lnl_limit = is_double(optarg);
- filter_lnl = true;
- break;
- case 'I':
- write_single_snp = true;
- break;
- case 'j':
- write_random_snp = true;
- break;
- case 'N':
- ordered_export = true;
- break;
- case 's':
- sql_out = true;
- break;
- case 'V':
- vcf_out = true;
- break;
- case 'n':
- vcf_haplo_out = true;
- break;
- case 'F':
- fasta_out = true;
- break;
- case 'J':
- fasta_strict_out = true;
- break;
- case 'G':
- genepop_out = true;
- break;
- case 'S':
- structure_out = true;
- break;
- case 'A':
- fastphase_out = true;
- break;
- case 'C':
- phase_out = true;
- break;
- case 'E':
- beagle_out = true;
- break;
- case 'H':
- beagle_phased_out = true;
- break;
- case 'K':
- plink_out = true;
- break;
- case 'Z':
- hzar_out = true;
- break;
- case 'Y':
- phylip_out = true;
- break;
- case 'L':
- phylip_var = true;
- break;
- case 'T':
- phylip_var_all = true;
- break;
- case 'U':
- treemix_out = true;
- break;
- case 'g':
- genomic_out = true;
- break;
- case 'W':
- wl_file = optarg;
- break;
- case 'B':
- bl_file = optarg;
- break;
- case 'm':
- min_stack_depth = atoi(optarg);
- break;
- case 'a':
- minor_allele_freq = atof(optarg);
- break;
- case 'f':
- if (strcasecmp(optarg, "p_value") == 0)
- fst_correction = p_value;
- else if (strcasecmp(optarg, "bonferroni_win") == 0)
- fst_correction = bonferroni_win;
- else if (strcasecmp(optarg, "bonferroni_gen") == 0)
- fst_correction = bonferroni_gen;
- else {
- cerr << "Unknown Fst correction specified '" << optarg << "'\n";
- help();
- }
- break;
- case 'u':
- p_value_cutoff = atof(optarg);
- break;
- case 'e':
- enz = optarg;
- break;
- case 'w':
- sigma = atof(optarg);
- break;
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 'd':
+ verbose = true;
+ break;
+ case 't':
+ num_threads = atoi(optarg);
+ break;
+ case 'P':
+ in_path = optarg;
+ break;
+ case 'M':
+ pmap_path = optarg;
+ break;
+ case 'D':
+ merge_sites = true;
+ break;
+ case 'i':
+ merge_prune_lim = is_double(optarg);
+ break;
+ case 'q':
+ max_obs_het = is_double(optarg);
+ break;
+ case 'b':
+ batch_id = is_integer(optarg);
+ if (batch_id < 0) {
+ cerr << "Batch ID (-b) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 'r':
+ sample_limit = atof(optarg);
+ break;
+ case 'p':
+ population_limit = atoi(optarg);
+ break;
+ case 'k':
+ kernel_smoothed = true;
+ calc_fstats = true;
+ break;
+ case '6':
+ calc_fstats = true;
+ break;
+ case 'l':
+ log_fst_comp = true;
+ break;
+ case '1':
+ bootstrap = true;
+ bootstrap_fst = true;
+ bootstrap_phist = true;
+ bootstrap_pifis = true;
+ bootstrap_div = true;
+ break;
+ case '2':
+ bootstrap_fst = true;
+ break;
+ case '3':
+ bootstrap_phist = true;
+ break;
+ case '4':
+ bootstrap_div = true;
+ break;
+ case '5':
+ bootstrap_pifis = true;
+ break;
+ case 'O':
+ if (strcasecmp(optarg, "exact") == 0)
+ bootstrap_type = bs_exact;
+ else if (strcasecmp(optarg, "approx") == 0)
+ bootstrap_type = bs_approx;
+ else {
+ cerr << "Unknown bootstrap type specified '" << optarg << "'\n";
+ help();
+ }
+ break;
+ case 'R':
+ bootstrap_reps = atoi(optarg);
+ break;
+ case 'Q':
+ bs_wl_file = optarg;
+ bootstrap_wl = true;
+ break;
+ case 'c':
+ lnl_limit = is_double(optarg);
+ filter_lnl = true;
+ break;
+ case 'I':
+ write_single_snp = true;
+ break;
+ case 'j':
+ write_random_snp = true;
+ break;
+ case 'N':
+ ordered_export = true;
+ break;
+ case 's':
+ sql_out = true;
+ break;
+ case 'V':
+ vcf_out = true;
+ break;
+ case 'n':
+ vcf_haplo_out = true;
+ break;
+ case 'F':
+ fasta_out = true;
+ break;
+ case 'J':
+ fasta_strict_out = true;
+ break;
+ case 'G':
+ genepop_out = true;
+ break;
+ case 'S':
+ structure_out = true;
+ break;
+ case 'A':
+ fastphase_out = true;
+ break;
+ case 'C':
+ phase_out = true;
+ break;
+ case 'E':
+ beagle_out = true;
+ break;
+ case 'H':
+ beagle_phased_out = true;
+ break;
+ case 'K':
+ plink_out = true;
+ break;
+ case 'Z':
+ hzar_out = true;
+ break;
+ case 'Y':
+ phylip_out = true;
+ break;
+ case 'L':
+ phylip_var = true;
+ break;
+ case 'T':
+ phylip_var_all = true;
+ break;
+ case 'U':
+ treemix_out = true;
+ break;
+ case 'g':
+ genomic_out = true;
+ break;
+ case 'W':
+ wl_file = optarg;
+ break;
+ case 'B':
+ bl_file = optarg;
+ break;
+ case 'm':
+ min_stack_depth = atoi(optarg);
+ break;
+ case 'a':
+ minor_allele_freq = atof(optarg);
+ break;
+ case 'f':
+ if (strcasecmp(optarg, "p_value") == 0)
+ fst_correction = p_value;
+ else if (strcasecmp(optarg, "bonferroni_win") == 0)
+ fst_correction = bonferroni_win;
+ else if (strcasecmp(optarg, "bonferroni_gen") == 0)
+ fst_correction = bonferroni_gen;
+ else {
+ cerr << "Unknown Fst correction specified '" << optarg << "'\n";
+ help();
+ }
+ break;
+ case 'u':
+ p_value_cutoff = atof(optarg);
+ break;
+ case 'e':
+ enz = optarg;
+ break;
+ case 'w':
+ sigma = atof(optarg);
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
- default:
- cerr << "Unknown command line option: '" << (char) c << "'\n";
- help();
- abort();
- }
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
+ default:
+ cerr << "Unknown command line option: '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (in_path.length() == 0) {
- cerr << "You must specify a path to the directory containing Stacks output files.\n";
- help();
+ cerr << "You must specify a path to the directory containing Stacks output files.\n";
+ help();
}
if (in_path.at(in_path.length() - 1) != '/')
- in_path += "/";
+ in_path += "/";
if (pmap_path.length() == 0) {
- cerr << "A population map was not specified, all samples will be read from '" << in_path << "' as a single popultaion.\n";
+ cerr << "A population map was not specified, all samples will be read from '" << in_path << "' as a single population.\n";
}
if (batch_id < 0) {
- cerr << "You must specify a batch ID.\n";
- help();
+ cerr << "You must specify a batch ID.\n";
+ help();
}
if (enz.length() > 0 && renz.count(enz) == 0) {
- cerr << "Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
- help();
+ cerr << "Unrecognized restriction enzyme specified: '" << enz.c_str() << "'.\n";
+ help();
}
if (merge_prune_lim != 1.0) {
- if (merge_prune_lim > 1.0)
- merge_prune_lim = merge_prune_lim / 100;
+ if (merge_prune_lim > 1.0)
+ merge_prune_lim = merge_prune_lim / 100;
- if (merge_prune_lim < 0 || merge_prune_lim > 1.0) {
- cerr << "Unable to parse the merge sites pruning limit.\n";
- help();
- }
+ if (merge_prune_lim < 0 || merge_prune_lim > 1.0) {
+ cerr << "Unable to parse the merge sites pruning limit.\n";
+ help();
+ }
}
if (minor_allele_freq > 0) {
- if (minor_allele_freq > 1)
- minor_allele_freq = minor_allele_freq / 100;
+ if (minor_allele_freq > 1)
+ minor_allele_freq = minor_allele_freq / 100;
- if (minor_allele_freq > 0.5) {
- cerr << "Unable to parse the minor allele frequency.\n";
- help();
- }
+ if (minor_allele_freq > 0.5) {
+ cerr << "Unable to parse the minor allele frequency.\n";
+ help();
+ }
}
if (max_obs_het != 1.0) {
- if (max_obs_het > 1)
- max_obs_het = max_obs_het / 100;
+ if (max_obs_het > 1)
+ max_obs_het = max_obs_het / 100;
- if (max_obs_het < 0 || max_obs_het > 1.0) {
- cerr << "Unable to parse the maximum observed heterozygosity.\n";
- help();
- }
+ if (max_obs_het < 0 || max_obs_het > 1.0) {
+ cerr << "Unable to parse the maximum observed heterozygosity.\n";
+ help();
+ }
}
if (sample_limit > 0) {
- if (sample_limit > 1)
- sample_limit = sample_limit / 100;
+ if (sample_limit > 1)
+ sample_limit = sample_limit / 100;
- if (sample_limit > 1.0) {
- cerr << "Unable to parse the sample limit frequency\n";
- help();
- }
+ if (sample_limit > 1.0) {
+ cerr << "Unable to parse the sample limit frequency\n";
+ help();
+ }
}
if (write_single_snp && write_random_snp) {
- cerr << "Please specify either '--write_single_snp' or '--write_random_snp', not both.\n";
- help();
+ cerr << "Please specify either '--write_single_snp' or '--write_random_snp', not both.\n";
+ help();
}
if (merge_sites == true && enz.length() == 0) {
- cerr << "You must specify the restriction enzyme associated with this data set to merge overlaping cutsites.\n";
- help();
+ cerr << "You must specify the restriction enzyme associated with this data set to merge overlapping cutsites.\n";
+ help();
}
return 0;
@@ -9173,65 +9173,65 @@ void version() {
void help() {
std::cerr << "populations " << VERSION << "\n"
<< "populations -b batch_id -P path -M path [-r min] [-m min] [-B blacklist] [-W whitelist] [-s] [-e renz] [-t threads] [-v] [-h]" << "\n"
- << " b: Batch ID to examine when exporting from the catalog.\n"
- << " P: path to the Stacks output files.\n"
- << " M: path to the population map, a tab separated file describing which individuals belong in which population.\n"
- << " s: output a file to import results into an SQL database.\n"
- << " B: specify a file containing Blacklisted markers to be excluded from the export.\n"
- << " W: specify a file containing Whitelisted markers to include in the export.\n"
- << " e: restriction enzyme, required if generating 'genomic' output.\n"
- << " t: number of threads to run in parallel sections of code.\n"
- << " v: print program version." << "\n"
- << " h: display this help messsage." << "\n\n"
- << " Merging and Phasing:\n"
- << " --merge_sites: merge loci that were produced from the same restriction enzyme cutsite (requires reference-aligned data).\n"
- << " --merge_prune_lim: when merging adjacent loci, if at least X% samples posses both loci prune the remaining samples out of the analysis.\n"
- << " Data Filtering:\n"
- << " r: minimum percentage of individuals in a population required to process a locus for that population.\n"
- << " p: minimum number of populations a locus must be present in to process a locus.\n"
- << " m: specify a minimum stack depth required for individuals at a locus.\n"
- << " f: specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'.\n"
- << " --min_maf: specify a minimum minor allele frequency required to process a nucleotide site at a locus (0 < min_maf < 0.5).\n"
- << " --max_obs_het: specify a maximum observed heterozygosity required to process a nucleotide site at a locus.\n"
- << " --p_value_cutoff [num]: required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction.\n"
- << " --lnl_lim [num]: filter loci with log likelihood values below this threshold.\n"
- << " --write_single_snp: restrict data analysis to only the first SNP per locus.\n"
- << " --write_random_snp: restrict data analysis to one random SNP per locus.\n\n"
- << " Fstats:\n"
- << " --fstats: enable SNP and haplotype-based F statistics.\n\n"
- << " Kernel-smoothing algorithm:\n"
- << " k: enable kernel-smoothed Pi, Fis, Fst, Fst', and Phi_st calculations.\n"
- << " --window_size [num]: distance over which to average values (sigma, default 150,000bp; window is 3sigma in length).\n\n"
- << " Bootstrap Resampling:\n"
- << " --bootstrap: turn on boostrap resampling for all smoothed statistics.\n"
- << " --bootstrap_pifis: turn on boostrap resampling for smoothed SNP-based Pi and Fis calculations.\n"
- << " --bootstrap_fst: turn on boostrap resampling for smoothed Fst calculations based on pairwise population comparison of SNPs.\n"
- << " --bootstrap_div: turn on boostrap resampling for smoothed haplotype diveristy and gene diversity calculations based on haplotypes.\n"
- << " --bootstrap_phist: turn on boostrap resampling for smoothed Phi_st calculations based on haplotypes.\n"
- << " --bootstrap_reps [num]: number of bootstrap resamplings to calculate (default 100).\n"
- << " --bootstrap_wl [path]: only bootstrap loci contained in this whitelist.\n\n"
- << " File ouput options:\n"
- << " --ordered_export: if data is reference aligned, exports will be ordered; only a single representative of each overlapping site.\n"
- << " --genomic: output each nucleotide position (fixed or polymorphic) in all population members to a file.\n"
- << " --fasta: output full sequence for each unique haplotype, from each sample locus in FASTA format, regardless of plausibility.\n"
- << " --fasta_strict: output full sequence for each haplotype, from each sample locus in FASTA format, only for biologically plausible loci.\n"
- << " --vcf: output SNPs in Variant Call Format (VCF).\n"
- << " --vcf_haplotypes: output haplotypes in Variant Call Format (VCF).\n"
- << " --genepop: output results in GenePop format.\n"
- << " --structure: output results in Structure format.\n"
- << " --phase: output genotypes in PHASE format.\n"
- << " --fastphase: output genotypes in fastPHASE format.\n"
- << " --beagle: output genotypes in Beagle format.\n"
- << " --beagle_phased: output haplotypes in Beagle format.\n"
- << " --plink: output genotypes in PLINK format.\n"
- << " --hzar: output genotypes in Hybrid Zone Analysis using R (HZAR) format.\n"
- << " --phylip: output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction.\n"
- << " --phylip_var: include variable sites in the phylip output encoded using IUPAC notation.\n"
- << " --phylip_var_all: include all sequence as well as variable sites in the phylip output encoded using IUPAC notation.\n"
- << " --treemix: output SNPs in a format useable for the TreeMix program (Pickrell and Pritchard).\n\n"
- << " Debugging:\n"
- << " --verbose: turn on additional logging.\n"
- << " --log_fst_comp: log components of Fst/Phi_st calculations to a file.\n";
+ << " b: Batch ID to examine when exporting from the catalog.\n"
+ << " P: path to the Stacks output files.\n"
+ << " M: path to the population map, a tab separated file describing which individuals belong in which population.\n"
+ << " s: output a file to import results into an SQL database.\n"
+ << " B: specify a file containing Blacklisted markers to be excluded from the export.\n"
+ << " W: specify a file containing Whitelisted markers to include in the export.\n"
+ << " e: restriction enzyme, required if generating 'genomic' output.\n"
+ << " t: number of threads to run in parallel sections of code.\n"
+ << " v: print program version." << "\n"
+ << " h: display this help message." << "\n\n"
+ << " Merging and Phasing:\n"
+ << " --merge_sites: merge loci that were produced from the same restriction enzyme cutsite (requires reference-aligned data).\n"
+ << " --merge_prune_lim: when merging adjacent loci, if at least X% samples possess both loci prune the remaining samples out of the analysis.\n"
+ << " Data Filtering:\n"
+ << " r: minimum percentage of individuals in a population required to process a locus for that population.\n"
+ << " p: minimum number of populations a locus must be present in to process a locus.\n"
+ << " m: specify a minimum stack depth required for individuals at a locus.\n"
+ << " f: specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'.\n"
+ << " --min_maf: specify a minimum minor allele frequency required to process a nucleotide site at a locus (0 < min_maf < 0.5).\n"
+ << " --max_obs_het: specify a maximum observed heterozygosity required to process a nucleotide site at a locus.\n"
+ << " --p_value_cutoff [num]: required p-value to keep an Fst measurement (0.05 by default). Also used as base for Bonferroni correction.\n"
+ << " --lnl_lim [num]: filter loci with log likelihood values below this threshold.\n"
+ << " --write_single_snp: restrict data analysis to only the first SNP per locus.\n"
+ << " --write_random_snp: restrict data analysis to one random SNP per locus.\n\n"
+ << " Fstats:\n"
+ << " --fstats: enable SNP and haplotype-based F statistics.\n\n"
+ << " Kernel-smoothing algorithm:\n"
+ << " k: enable kernel-smoothed Pi, Fis, Fst, Fst', and Phi_st calculations.\n"
+ << " --window_size [num]: distance over which to average values (sigma, default 150,000bp; window is 3sigma in length).\n\n"
+ << " Bootstrap Resampling:\n"
+ << " --bootstrap: turn on bootstrap resampling for all smoothed statistics.\n"
+ << " --bootstrap_pifis: turn on bootstrap resampling for smoothed SNP-based Pi and Fis calculations.\n"
+ << " --bootstrap_fst: turn on bootstrap resampling for smoothed Fst calculations based on pairwise population comparison of SNPs.\n"
+ << " --bootstrap_div: turn on bootstrap resampling for smoothed haplotype diversity and gene diversity calculations based on haplotypes.\n"
+ << " --bootstrap_phist: turn on bootstrap resampling for smoothed Phi_st calculations based on haplotypes.\n"
+ << " --bootstrap_reps [num]: number of bootstrap resamplings to calculate (default 100).\n"
+ << " --bootstrap_wl [path]: only bootstrap loci contained in this whitelist.\n\n"
+ << " File output options:\n"
+ << " --ordered_export: if data is reference aligned, exports will be ordered; only a single representative of each overlapping site.\n"
+ << " --genomic: output each nucleotide position (fixed or polymorphic) in all population members to a file.\n"
+ << " --fasta: output full sequence for each unique haplotype, from each sample locus in FASTA format, regardless of plausibility.\n"
+ << " --fasta_strict: output full sequence for each haplotype, from each sample locus in FASTA format, only for biologically plausible loci.\n"
+ << " --vcf: output SNPs in Variant Call Format (VCF).\n"
+ << " --vcf_haplotypes: output haplotypes in Variant Call Format (VCF).\n"
+ << " --genepop: output results in GenePop format.\n"
+ << " --structure: output results in Structure format.\n"
+ << " --phase: output genotypes in PHASE format.\n"
+ << " --fastphase: output genotypes in fastPHASE format.\n"
+ << " --beagle: output genotypes in Beagle format.\n"
+ << " --beagle_phased: output haplotypes in Beagle format.\n"
+ << " --plink: output genotypes in PLINK format.\n"
+ << " --hzar: output genotypes in Hybrid Zone Analysis using R (HZAR) format.\n"
+ << " --phylip: output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction.\n"
+ << " --phylip_var: include variable sites in the phylip output encoded using IUPAC notation.\n"
+ << " --phylip_var_all: include all sequence as well as variable sites in the phylip output encoded using IUPAC notation.\n"
+ << " --treemix: output SNPs in a format useable for the TreeMix program (Pickrell and Pritchard).\n\n"
+ << " Debugging:\n"
+ << " --verbose: turn on additional logging.\n"
+ << " --log_fst_comp: log components of Fst/Phi_st calculations to a file.\n";
// << " --bootstrap_type [exact|approx]: enable bootstrap resampling for population statistics (reference genome required).\n"
exit(0);
diff --git a/src/process_radtags.cc b/src/process_radtags.cc
index 4cfe498..afca39f 100644
--- a/src/process_radtags.cc
+++ b/src/process_radtags.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -351,7 +351,7 @@ process_paired_reads(string prefix_1,
} else {
if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
r_1->set_len(r_1->len - (max_bc_size_1 - r_1->inline_bc_len));
- if (barcode_type == inline_index || barcode_type == index_index)
+ if (barcode_type == index_inline || barcode_type == inline_inline)
r_2->set_len(r_2->len - (max_bc_size_2 - r_2->inline_bc_len));
}
diff --git a/src/pstacks.cc b/src/pstacks.cc
index 5965ff6..45d3159 100644
--- a/src/pstacks.cc
+++ b/src/pstacks.cc
@@ -32,7 +32,7 @@ string in_file;
FileT out_file_type;
string out_path;
int sql_id = 0;
-int min_stack_cov = 1;
+int min_stack_cov = 3;
int num_threads = 1;
//
@@ -377,18 +377,20 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
string tag_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".tags.tsv";
string snp_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".snps.tsv";
string all_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".alleles.tsv";
-
+ string mod_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".models.tsv";
+
if (gzip) {
tag_file += ".gz";
snp_file += ".gz";
all_file += ".gz";
+ mod_file += ".gz";
}
//
// Open the output files for writing.
//
- gzFile gz_tags, gz_snps, gz_alle;
- ofstream tags, snps, alle;
+ gzFile gz_tags, gz_snps, gz_alle, gz_mods;
+ ofstream tags, snps, alle, mods;
if (gzip) {
gz_tags = gzopen(tag_file.c_str(), "wb");
if (!gz_tags) {
@@ -398,6 +400,14 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
#if ZLIB_VERNUM >= 0x1240
gzbuffer(gz_tags, libz_buffer_size);
#endif
+ gz_mods = gzopen(mod_file.c_str(), "wb");
+ if (!gz_mods) {
+ cerr << "Error: Unable to open gzipped model file '" << mod_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_mods, libz_buffer_size);
+ #endif
gz_snps = gzopen(snp_file.c_str(), "wb");
if (!gz_snps) {
cerr << "Error: Unable to open gzipped snps file '" << snp_file << "': " << strerror(errno) << ".\n";
@@ -420,6 +430,11 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
cerr << "Error: Unable to open tag file for writing.\n";
exit(1);
}
+ mods.open(mod_file.c_str());
+ if (mods.fail()) {
+ cerr << "Error: Unable to open model file for writing.\n";
+ exit(1);
+ }
snps.open(snp_file.c_str());
if (snps.fail()) {
cerr << "Error: Unable to open SNPs file for writing.\n";
@@ -447,10 +462,12 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
log << "# pstacks version " << VERSION << "; generated on " << date << "\n";
if (gzip) {
gzputs(gz_tags, log.str().c_str());
+ gzputs(gz_mods, log.str().c_str());
gzputs(gz_snps, log.str().c_str());
gzputs(gz_alle, log.str().c_str());
} else {
tags << log.str();
+ mods << log.str();
snps << log.str();
alle << log.str();
}
@@ -478,7 +495,7 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
// Calculate the log likelihood of this merged stack.
//
tag_1->gen_matrix(u);
- tag_1->calc_likelihood_pstacks();
+ tag_1->calc_likelihood();
wrote++;
@@ -529,6 +546,7 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
<< "\n";
if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ if (gzip) gzputs(gz_mods, sstr.str().c_str()); else mods << sstr.str();
sstr.str("");
// Now write out the components of each unique tag merged into this one.
@@ -595,10 +613,12 @@ int write_results(map<int, MergedStack *> &m, map<int, PStack *> &u) {
if (gzip) {
gzclose(gz_tags);
+ gzclose(gz_mods);
gzclose(gz_snps);
gzclose(gz_alle);
} else {
tags.close();
+ mods.close();
snps.close();
alle.close();
}
diff --git a/src/sql_utilities.h b/src/sql_utilities.h
index 2d30c7d..1eafe7d 100644
--- a/src/sql_utilities.h
+++ b/src/sql_utilities.h
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2014, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -21,10 +21,6 @@
//
// sql_utilities.h -- template routines to read and write Stacks SQL file formats.
//
-// Julian Catchen
-// jcatchen at uoregon.edu
-// University of Oregon
-//
#ifndef __SQL_UTILITIES_H__
#define __SQL_UTILITIES_H__
@@ -37,7 +33,7 @@
const uint num_tags_fields = 14;
const uint num_snps_fields = 10;
const uint num_alleles_fields = 6;
-const uint num_matches_fields = 8;
+const uint num_matches_fields = 9;
template <class LocusT>
int
@@ -58,31 +54,72 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
char *line = (char *) malloc(sizeof(char) * max_len);
int size = max_len;
bool gzip = false;
+ bool open_fail = false;
int fh_status = 1;
//
- // First, parse the tag file and pull in the consensus sequence
- // for each locus.
+ // First, try to parse the models file to pull in the consensus sequence and model string
+ // for each locus. If the models file is not available or we are requested to store the
+ // reads from each stack, fall back to the tags file.
//
- f = sample + ".tags.tsv";
- fh.open(f.c_str(), ifstream::in);
+ if (!store_reads) {
+ f = sample + ".models.tsv";
+ fh.open(f.c_str(), ifstream::in);
+ if (fh.fail())
+ open_fail = true;
+ }
- if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".tags.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
- compressed = true;
+ if (!store_reads && open_fail) {
+ //
+ // Test for a gzipped MODELs file.
+ //
+ f = sample + ".models.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ open_fail = true;
+ } else {
+ open_fail = false;
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
+ }
+ }
+
+ if (open_fail) {
+ //
+ // Test for a TAGs file.
+ //
+ f = sample + ".tags.tsv";
+ fh.open(f.c_str(), ifstream::in);
+ if (fh.fail())
+ open_fail = true;
+ else
+ open_fail = false;
+ }
+
+ if (open_fail) {
+ //
+ // Test for a gzipped TAGs file.
+ //
+ f = sample + ".tags.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ open_fail = true;
+ } else {
+ open_fail = false;
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
+ }
}
+
+ if (open_fail) {
+ cerr << " Unable to open '" << sample << "'\n";
+ return 0;
+ }
+
cerr << " Parsing " << f.c_str() << "\n";
uint id;
@@ -92,11 +129,11 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
if (!fh_status && strlen(line) == 0)
- continue;
+ continue;
- if (is_comment(line)) continue;
+ if (is_comment(line)) continue;
- parse_tsv(line, parts);
+ parse_tsv(line, parts);
if (parts.size() != num_tags_fields) {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
@@ -105,111 +142,111 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
id = atoi(parts[2].c_str());
- if (parts[6] != "consensus") {
+ if (parts[6] != "consensus") {
if (blacklisted.count(id)) continue;
- //
- // Make sure this locus has already been defined (consensus sequence SHOULD always
- // be specified first in the file for a particular locus).
- //
- if (loci.count(id) > 0) {
- //
- // Read the model sequence, a series of letters specifying if the model called a
- // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
- //
- if (parts[6] == "model") {
- loci[id]->model = new char[parts[9].length() + 1];
- strcpy(loci[id]->model, parts[9].c_str());
-
- } else {
- //
- // Otherwise, we expect a primary or secondary read, record these if specified.
- //
- loci[id]->depth++;
-
- if (store_reads) {
- char *read = new char[parts[9].length() + 1];
- strcpy(read, parts[9].c_str());
- loci[id]->reads.push_back(read);
-
- char *read_id = new char[parts[8].length() + 1];
- strcpy(read_id, parts[8].c_str());
- loci[id]->comp.push_back(read_id);
- //
- // Store the internal stack number for this read.
- //
- loci[id]->comp_cnt.push_back(atoi(parts[7].c_str()));
-
- //
- // Store the read type.
- //
- if (parts[6] == "primary")
- loci[id]->comp_type.push_back(primary);
- else
- loci[id]->comp_type.push_back(secondary);
- }
- }
-
- continue;
+ //
+ // Make sure this locus has already been defined (consensus sequence SHOULD always
+ // be specified first in the file for a particular locus).
+ //
+ if (loci.count(id) > 0) {
+ //
+ // Read the model sequence, a series of letters specifying if the model called a
+ // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
+ //
+ if (parts[6] == "model") {
+ loci[id]->model = new char[parts[9].length() + 1];
+ strcpy(loci[id]->model, parts[9].c_str());
+
+ } else {
+ //
+ // Otherwise, we expect a primary or secondary read, record these if specified.
+ //
+ loci[id]->depth++;
+
+ if (store_reads) {
+ char *read = new char[parts[9].length() + 1];
+ strcpy(read, parts[9].c_str());
+ loci[id]->reads.push_back(read);
+
+ char *read_id = new char[parts[8].length() + 1];
+ strcpy(read_id, parts[8].c_str());
+ loci[id]->comp.push_back(read_id);
+ //
+ // Store the internal stack number for this read.
+ //
+ loci[id]->comp_cnt.push_back(atoi(parts[7].c_str()));
+
+ //
+ // Store the read type.
+ //
+ if (parts[6] == "primary")
+ loci[id]->comp_type.push_back(primary);
+ else
+ loci[id]->comp_type.push_back(secondary);
+ }
+ }
+
+ continue;
} else {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (stack " << id << " does not exist).\n";
return 0;
}
}
- //
- // Do not include blacklisted tags in the catalog. They are tags that are composed
- // of noise and/or repetitive sequence.
- //
- if (parts[11] == "1") {
- blacklisted.insert(id);
- continue;
- }
+ //
+ // Do not include blacklisted tags in the catalog. They are tags that are composed
+ // of noise and/or repetitive sequence.
+ //
+ if (parts[11] == "1") {
+ blacklisted.insert(id);
+ continue;
+ }
- c = new LocusT;
+ c = new LocusT;
c->sample_id = atoi(parts[1].c_str());
- c->id = id;
- c->add_consensus(parts[9].c_str());
+ c->id = id;
+ c->add_consensus(parts[9].c_str());
- //
- // Read in the flags
- //
- c->deleveraged = (parts[10] == "1" ? true : false);
- c->lumberjackstack = (parts[12] == "1" ? true : false);
+ //
+ // Read in the flags
+ //
+ c->deleveraged = (parts[10] == "1" ? true : false);
+ c->lumberjackstack = (parts[12] == "1" ? true : false);
- //
- // Read in the log likelihood of the locus.
- //
- c->lnl = is_double(parts[13].c_str());
+ //
+ // Read in the log likelihood of the locus.
+ //
+ c->lnl = is_double(parts[13].c_str());
//
// Parse the physical genome location of this locus.
//
- c->loc.set(parts[3].c_str(), atoi(parts[4].c_str()), (parts[5] == "+" ? plus : minus));
+ c->loc.set(parts[3].c_str(), atoi(parts[4].c_str()), (parts[5] == "+" ? plus : minus));
- //
- // Parse the components of this stack (either the Illumina ID, or the catalog constituents)
- //
- q = parts[8].c_str();
- while (*q != '\0') {
- for (p = q; *q != ',' && *q != '\0'; q++);
- len = q - p;
- cmp = new char[len + 1];
- strncpy(cmp, p, len);
- cmp[len] = '\0';
- c->comp.push_back(cmp);
- if (*q != '\0') q++;
- }
+ //
+ // Parse the components of this stack (either the Illumina ID, or the catalog constituents)
+ //
+ q = parts[8].c_str();
+ while (*q != '\0') {
+ for (p = q; *q != ',' && *q != '\0'; q++);
+ len = q - p;
+ cmp = new char[len + 1];
+ strncpy(cmp, p, len);
+ cmp[len] = '\0';
+ c->comp.push_back(cmp);
+ if (*q != '\0') q++;
+ }
- loci[c->id] = c;
+ loci[c->id] = c;
line_num++;
}
if (gzip)
- gzclose(gz_fh);
+ gzclose(gz_fh);
else
- fh.close();
+ fh.close();
//
// Next, parse the SNP file and load model calls.
@@ -221,20 +258,20 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
f = sample + ".snps.tsv";
fh.open(f.c_str(), ifstream::in);
if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".snps.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
+ //
+ // Test for a gzipped file.
+ //
+ f = sample + ".snps.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ cerr << " Unable to open '" << sample << "'\n";
+ return 0;
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
- compressed = true;
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
+ compressed = true;
}
cerr << " Parsing " << f.c_str() << "\n";
@@ -242,11 +279,11 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
if (!fh_status && strlen(line) == 0)
- continue;
+ continue;
- if (is_comment(line)) continue;
+ if (is_comment(line)) continue;
- parse_tsv(line, parts);
+ parse_tsv(line, parts);
if (parts.size() != num_snps_fields && parts.size() != num_snps_fields - 2) {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
@@ -255,39 +292,39 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
id = atoi(parts[2].c_str());
- if (blacklisted.count(id))
- continue;
-
- //
- // Only load heterozygous model calls.
- //
- if (load_all_model_calls == false && parts[4] != "E")
- continue;
-
- snp = new SNP;
- snp->col = atoi(parts[3].c_str());
- snp->lratio = atof(parts[5].c_str());
- snp->rank_1 = parts[6].at(0);
- snp->rank_2 = parts[7].at(0) == '-' ? 0 : parts[7].at(0);
-
- if (parts[4] == "E")
- snp->type = snp_type_het;
- else if (parts[4] == "O")
- snp->type = snp_type_hom;
- else
- snp->type = snp_type_unk;
-
- if (parts.size() == 10) {
- if (parts[8].length() == 0 || parts[8].at(0) == '-')
- snp->rank_3 = 0;
- else
- snp->rank_3 = parts[8].at(0);
-
- if (parts[9].length() == 0 || parts[9].at(0) == '-')
- snp->rank_4 = 0;
- else
- snp->rank_4 = parts[9].at(0);
- }
+ if (blacklisted.count(id))
+ continue;
+
+ //
+ // Only load heterozygous model calls.
+ //
+ if (load_all_model_calls == false && parts[4] != "E")
+ continue;
+
+ snp = new SNP;
+ snp->col = atoi(parts[3].c_str());
+ snp->lratio = atof(parts[5].c_str());
+ snp->rank_1 = parts[6].at(0);
+ snp->rank_2 = parts[7].at(0) == '-' ? 0 : parts[7].at(0);
+
+ if (parts[4] == "E")
+ snp->type = snp_type_het;
+ else if (parts[4] == "O")
+ snp->type = snp_type_hom;
+ else
+ snp->type = snp_type_unk;
+
+ if (parts.size() == 10) {
+ if (parts[8].length() == 0 || parts[8].at(0) == '-')
+ snp->rank_3 = 0;
+ else
+ snp->rank_3 = parts[8].at(0);
+
+ if (parts[9].length() == 0 || parts[9].at(0) == '-')
+ snp->rank_4 = 0;
+ else
+ snp->rank_4 = parts[9].at(0);
+ }
if (loci.count(id) > 0) {
loci[id]->snps.push_back(snp);
@@ -300,9 +337,9 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
}
if (gzip)
- gzclose(gz_fh);
+ gzclose(gz_fh);
else
- fh.close();
+ fh.close();
//
// Finally, parse the Alleles file
@@ -314,20 +351,20 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
f = sample + ".alleles.tsv";
fh.open(f.c_str(), ifstream::in);
if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".alleles.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
+ //
+ // Test for a gzipped file.
+ //
+ f = sample + ".alleles.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ cerr << " Unable to open '" << sample << "'\n";
+ return 0;
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
- compressed = true;
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
+ compressed = true;
}
cerr << " Parsing " << f.c_str() << "\n";
@@ -335,11 +372,11 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
if (!fh_status && strlen(line) == 0)
- continue;
+ continue;
- if (is_comment(line)) continue;
+ if (is_comment(line)) continue;
- parse_tsv(line, parts);
+ parse_tsv(line, parts);
if (parts.size() != num_alleles_fields) {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
@@ -348,8 +385,8 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
id = atoi(parts[2].c_str());
- if (blacklisted.count(id))
- continue;
+ if (blacklisted.count(id))
+ continue;
if (loci.count(id) > 0) {
loci[id]->alleles[parts[3]] = atoi(parts[5].c_str());
@@ -362,9 +399,9 @@ load_loci(string sample, map<int, LocusT *> &loci, bool store_reads, bool load_
}
if (gzip)
- gzclose(gz_fh);
+ gzclose(gz_fh);
else
- fh.close();
+ fh.close();
//
// Populate the strings member with the sequence for each allele for each Locus.
@@ -385,15 +422,15 @@ int dump_loci(map<int, LocusT *> &u) {
for (i = u.begin(); i != u.end(); i++) {
- cerr << "Locus ID: " << i->second->id << "\n"
- << " Consensus: " << i->second->con << "\n"
+ cerr << "Locus ID: " << i->second->id << "\n"
+ << " Consensus: " << i->second->con << "\n"
<< " Genomic Location: " << i->second->loc.chr << "; " << i->second->loc.bp << "bp\n"
- << " SNPs:\n";
+ << " SNPs:\n";
- for (s = i->second->snps.begin(); s != i->second->snps.end(); s++)
- cerr << " Col: " << (*s)->col << " rank 1: " << (*s)->rank_1 << " rank 2: " << (*s)->rank_2 << "\n";
+ for (s = i->second->snps.begin(); s != i->second->snps.end(); s++)
+ cerr << " Col: " << (*s)->col << " rank 1: " << (*s)->rank_1 << " rank 2: " << (*s)->rank_2 << "\n";
- cerr << "\n";
+ cerr << "\n";
}
return 0;
@@ -409,25 +446,26 @@ int load_catalog_matches(string sample, vector<CatMatch *> &matches) {
char *line = (char *) malloc(sizeof(char) * max_len);
int size = max_len;
+ int cnt = 0;
bool gzip = false;
int fh_status = 1;
f = sample + ".matches.tsv";
fh.open(f.c_str(), ifstream::in);
if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".matches.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
+ //
+ // Test for a gzipped file.
+ //
+ f = sample + ".matches.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ cerr << " Unable to open '" << sample << "'\n";
+ return 0;
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
}
cerr << " Parsing " << f.c_str() << "\n";
@@ -437,33 +475,41 @@ int load_catalog_matches(string sample, vector<CatMatch *> &matches) {
line_num++;
if (!fh_status && strlen(line) == 0)
- continue;
+ continue;
+
+ if (is_comment(line)) continue;
- if (is_comment(line)) continue;
+ parse_tsv(line, parts);
- parse_tsv(line, parts);
+ cnt = parts.size();
- if (parts.size() != num_matches_fields) {
+ if (cnt != num_matches_fields && cnt != num_matches_fields - 1) {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
return 0;
}
- m = new CatMatch;
- m->batch_id = atoi(parts[1].c_str());
- m->cat_id = atoi(parts[2].c_str());
+ m = new CatMatch;
+ m->batch_id = atoi(parts[1].c_str());
+ m->cat_id = atoi(parts[2].c_str());
m->sample_id = atoi(parts[3].c_str());
- m->tag_id = atoi(parts[4].c_str());
- m->haplotype = new char[parts[5].length() + 1];
- strcpy(m->haplotype, parts[5].c_str());
- m->depth = atoi(parts[6].c_str());
- m->lnl = is_double(parts[7].c_str());
- matches.push_back(m);
+ m->tag_id = atoi(parts[4].c_str());
+ m->haplotype = new char[parts[5].length() + 1];
+ strcpy(m->haplotype, parts[5].c_str());
+ m->depth = atoi(parts[6].c_str());
+ m->lnl = is_double(parts[7].c_str());
+
+ if (cnt == num_matches_fields && parts[8].length() > 0) {
+ m->cigar = new char[parts[8].length() + 1];
+ strcpy(m->cigar, parts[8].c_str());
+ }
+
+ matches.push_back(m);
}
if (gzip)
- gzclose(gz_fh);
+ gzclose(gz_fh);
else
- fh.close();
+ fh.close();
return 0;
}
@@ -478,33 +524,73 @@ int load_model_results(string sample, map<int, ModRes *> &modres) {
char *line = (char *) malloc(sizeof(char) * max_len);
int size = max_len;
bool gzip = false;
+ bool open_fail = false;
int fh_status = 1;
//
- // First, parse the tag file and pull in the consensus sequence
- // for each Radtag.
+ // Parse the models file (if it exists), otherwise parse the tag file to
+ // pull in the model calls for each locus.
//
gzip = false;
fh_status = 1;
line_num = 1;
- f = sample + ".tags.tsv";
+ f = sample + ".models.tsv";
fh.open(f.c_str(), ifstream::in);
- if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".tags.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
- #if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
+ if (fh.fail())
+ open_fail = true;
+
+ if (open_fail) {
+ //
+ // Test for a gzipped MODELs file.
+ //
+ f = sample + ".models.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ open_fail = true;
+ } else {
+ open_fail = false;
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
+ }
}
+
+ if (open_fail) {
+ //
+ // Test for a TAGs file.
+ //
+ f = sample + ".tags.tsv";
+ fh.open(f.c_str(), ifstream::in);
+ if (fh.fail())
+ open_fail = true;
+ else
+ open_fail = false;
+ }
+
+ if (open_fail) {
+ //
+ // Test for a gzipped TAGs file.
+ //
+ f = sample + ".tags.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ open_fail = true;
+ } else {
+ open_fail = false;
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
+ }
+ }
+
+ if (open_fail) {
+ cerr << " Unable to open '" << sample << "'\n";
+ return 0;
+ }
+
cerr << " Parsing " << f.c_str() << "\n";
ModRes *mod;
@@ -515,33 +601,33 @@ int load_model_results(string sample, map<int, ModRes *> &modres) {
line_num++;
if (!fh_status && strlen(line) == 0)
- continue;
- if (is_comment(line)) continue;
+ continue;
+ if (is_comment(line)) continue;
- parse_tsv(line, parts);
+ parse_tsv(line, parts);
if (parts.size() != num_tags_fields) {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
return 0;
}
- //
- // Read the model sequence, a series of letters specifying if the model called a
- // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
- //
- if (parts[6] != "model") continue;
+ //
+ // Read the model sequence, a series of letters specifying if the model called a
+ // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
+ //
+ if (parts[6] != "model") continue;
- samp_id = atoi(parts[1].c_str());
+ samp_id = atoi(parts[1].c_str());
tag_id = atoi(parts[2].c_str());
mod = new ModRes(samp_id, tag_id, parts[9].c_str());
- modres[tag_id] = mod;
+ modres[tag_id] = mod;
}
if (gzip)
- gzclose(gz_fh);
+ gzclose(gz_fh);
else
- fh.close();
+ fh.close();
delete [] line;
@@ -569,19 +655,19 @@ int load_snp_calls(string sample, map<int, SNPRes *> &snpres) {
f = sample + ".snps.tsv";
fh.open(f.c_str(), ifstream::in);
if (fh.fail()) {
- //
- // Test for a gzipped file.
- //
- f = sample + ".snps.tsv.gz";
- gz_fh = gzopen(f.c_str(), "rb");
- if (!gz_fh) {
- cerr << " Unable to open '" << sample << "'\n";
- return 0;
- }
+ //
+ // Test for a gzipped file.
+ //
+ f = sample + ".snps.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh) {
+ cerr << " Unable to open '" << sample << "'\n";
+ return 0;
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_fh, libz_buffer_size);
- #endif
- gzip = true;
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
}
cerr << " Parsing " << f.c_str() << "\n";
@@ -590,57 +676,57 @@ int load_snp_calls(string sample, map<int, SNPRes *> &snpres) {
fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
if (!fh_status && strlen(line) == 0)
- continue;
- if (is_comment(line)) continue;
+ continue;
+ if (is_comment(line)) continue;
- parse_tsv(line, parts);
+ parse_tsv(line, parts);
if (parts.size() != num_snps_fields && parts.size() != num_snps_fields - 2) {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
return 0;
}
- samp_id = atoi(parts[1].c_str());
+ samp_id = atoi(parts[1].c_str());
id = atoi(parts[2].c_str());
- snp = new SNP;
- snp->col = atoi(parts[3].c_str());
-
- if (parts[4] == "O")
- snp->type = snp_type_hom;
- else if (parts[4] == "E")
- snp->type = snp_type_het;
- else
- snp->type = snp_type_unk;
-
- snp->lratio = atof(parts[5].c_str());
- snp->rank_1 = parts[6].at(0);
- snp->rank_2 = parts[7].at(0) == '-' ? 0 : parts[7].at(0);
-
- if (parts.size() == 10) {
- if (parts[8].length() == 0 || parts[8].at(0) == '-')
- snp->rank_3 = 0;
- else
- snp->rank_3 = parts[8].at(0);
- if (parts[9].length() == 0 || parts[9].at(0) == '-')
- snp->rank_4 = 0;
- else
- snp->rank_4 = parts[9].at(0);
- }
+ snp = new SNP;
+ snp->col = atoi(parts[3].c_str());
+
+ if (parts[4] == "O")
+ snp->type = snp_type_hom;
+ else if (parts[4] == "E")
+ snp->type = snp_type_het;
+ else
+ snp->type = snp_type_unk;
+
+ snp->lratio = atof(parts[5].c_str());
+ snp->rank_1 = parts[6].at(0);
+ snp->rank_2 = parts[7].at(0) == '-' ? 0 : parts[7].at(0);
+
+ if (parts.size() == 10) {
+ if (parts[8].length() == 0 || parts[8].at(0) == '-')
+ snp->rank_3 = 0;
+ else
+ snp->rank_3 = parts[8].at(0);
+ if (parts[9].length() == 0 || parts[9].at(0) == '-')
+ snp->rank_4 = 0;
+ else
+ snp->rank_4 = parts[9].at(0);
+ }
if (snpres.count(id) == 0) {
- snpr = new SNPRes(samp_id, id);
- snpres[id] = snpr;
+ snpr = new SNPRes(samp_id, id);
+ snpres[id] = snpr;
}
- snpres[id]->snps.push_back(snp);
+ snpres[id]->snps.push_back(snp);
line_num++;
}
if (gzip)
- gzclose(gz_fh);
+ gzclose(gz_fh);
else
- fh.close();
+ fh.close();
delete [] line;
diff --git a/src/sstacks.cc b/src/sstacks.cc
index adf81a7..d795af6 100644
--- a/src/sstacks.cc
+++ b/src/sstacks.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -36,8 +36,13 @@ int catalog = 0;
bool verify_haplotypes = true;
bool impute_haplotypes = true;
bool require_uniq_haplotypes = false;
+bool gapped_alignments = false;
searcht search_type = sequence;
+double min_match_len = 0.80;
+double max_gaps = 2.0;
+int gapped_kmer_len = 19;
+
int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
@@ -68,6 +73,16 @@ int main (int argc, char* argv[]) {
return 0;
}
+ KmerHashMap kmer_map;
+ map<int, pair<allele_type, int> > allele_map;
+ vector<char *> kmer_map_keys;
+
+ if (gapped_alignments) {
+ cerr << "Populating kmer dictionary for gapped alignments...";
+ populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, gapped_kmer_len);
+ cerr << "done.\n";
+ }
+
string sample_path;
int i = 1;
@@ -100,6 +115,11 @@ int main (int argc, char* argv[]) {
cerr << "Searching for sequence matches...\n";
find_matches_by_sequence(catalog, sample);
+ if (gapped_alignments) {
+ cerr << "Searching for gapped alignments...\n";
+ search_for_gaps(catalog, sample, kmer_map, allele_map, min_match_len);
+ }
+
} else if (search_type == genomic_loc) {
cerr << "Searching for matches by genomic location...\n";
find_matches_by_genomic_loc(catalog, sample);
@@ -116,10 +136,15 @@ int main (int argc, char* argv[]) {
sample.clear();
}
+ if (gapped_alignments)
+ free_kmer_hash(kmer_map, kmer_map_keys);
+
return 0;
}
-int find_matches_by_genomic_loc(map<int, Locus *> &sample_1, map<int, QLocus *> &sample_2) {
+int
+find_matches_by_genomic_loc(map<int, Locus *> &sample_1, map<int, QLocus *> &sample_2)
+{
//
// Calculate the distance (number of mismatches) between each pair
// of Radtags. We expect all radtags to be the same length;
@@ -556,6 +581,7 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
unsigned long mmatch = 0;
unsigned long nosnps = 0;
unsigned long nomatch = 0;
+ unsigned long no_haps = 0;
unsigned long tot_hap = 0;
unsigned long ver_hap = 0;
@@ -600,13 +626,16 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
}
}
- if (loci_hit.size() > 0) matches++;
+ if (loci_hit.size() == 0)
+ nomatch++;
+ else if (loci_hit.size() > 0)
+ matches++;
if (verify_haplotypes && loci_hit.size() > 0) {
uint verified = verify_sequence_match(sample_1, query, loci_hit, haplo_hits,
min_tag_len, mmatch, nosnps);
ver_hap += verified;
- if (verified == 0) nomatch++;
+ if (verified == 0) no_haps++;
}
}
}
@@ -622,7 +651,7 @@ find_matches_by_sequence(map<int, Locus *> &sample_1, map<int, QLocus *> &sample
sample_1_map_keys.clear();
cerr << keys.size() << " stacks compared against the catalog containing " << sample_1.size() << " loci.\n"
- << " " << matches << " matching loci, " << nomatch << " contained no verified haplotypes.\n"
+ << " " << matches << " matching loci, " << no_haps << " contained no verified haplotypes.\n"
<< " " << mmatch << " loci matched more than one catalog locus and were excluded.\n"
<< " " << nosnps << " loci contained SNPs unaccounted for in the catalog and were excluded.\n"
<< " " << tot_hap << " total haplotypes examined from matching loci, " << ver_hap << " verified.\n";
@@ -677,7 +706,7 @@ int verify_sequence_match(map<int, Locus *> &sample_1, QLocus *query,
//
map<string, vector<string> >::iterator it;
map<string, int> cat_hap, query_hap;
-
+
for (it = haplo_hits.begin(); it != haplo_hits.end(); it++) {
query_hap[it->first] = it->second.size();
for (uint j = 0; j < it->second.size(); j++)
@@ -708,6 +737,425 @@ int verify_sequence_match(map<int, Locus *> &sample_1, QLocus *query,
}
int
+search_for_gaps(map<int, Locus *> &catalog, map<int, QLocus *> &sample,
+ KmerHashMap &kmer_map, map<int, pair<allele_type, int> > &allele_map,
+ double min_match_len)
+{
+ //
+ // Search for loci that can be merged with a gapped alignment.
+ //
+
+ //
+ // OpenMP can't parallelize random access iterators, so we convert
+ // our map to a vector of integer keys.
+ //
+ map<int, QLocus *>::iterator it;
+ vector<int> keys;
+ for (it = sample.begin(); it != sample.end(); it++)
+ keys.push_back(it->first);
+
+ //
+ // Calculate the number of k-mers we will generate. If kmer_len == 0,
+ // determine the optimal length for k-mers.
+ //
+ int con_len = strlen(sample[keys[0]]->con);
+ int num_kmers = con_len - gapped_kmer_len + 1;
+
+ //
+ // Calculate the minimum number of matching k-mers required for a possible sequence match.
+ //
+ int min_hits = (round((double) con_len * min_match_len) - (gapped_kmer_len * max_gaps)) - gapped_kmer_len + 1;
+
+ cerr << " Searching with a k-mer length of " << gapped_kmer_len << " (" << num_kmers << " k-mers per read); " << min_hits << " k-mer hits required.\n";
+
+ uint gapped_aln = 0;
+ uint matches = 0;
+ uint mmatches = 0;
+ uint nomatches = 0;
+ uint ver_hap = 0;
+ uint tot_hap = 0;
+ uint bad_aln = 0;
+ uint no_haps = 0;
+ uint nosnps = 0;
+
+ #pragma omp parallel
+ {
+ QLocus *query;
+ Locus *tag_2;
+ KmerHashMap::iterator h;
+ AlignRes aln_res;
+ vector<char *> kmers;
+ set<string> uniq_kmers;
+ vector<int> hits;
+ vector<pair<int, int> > ordered_hits;
+ uint hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
+ pair<allele_type, int> cat_hit;
+ string query_allele, query_seq, cat_allele, cat_seq;
+ map<string, vector<string> > haplo_hits;
+ set<int> loci_hit;
+ vector<pair<char, uint> > cigar;
+
+ GappedAln *aln = new GappedAln();
+
+ initialize_kmers(gapped_kmer_len, num_kmers, kmers);
+
+ #pragma omp for schedule(dynamic) reduction(+:matches) reduction(+:nomatches) reduction(+:mmatches) reduction(+:gapped_aln) reduction(+:ver_hap) reduction(+:tot_hap) reduction(+:bad_aln) reduction(+:no_haps)
+ for (uint i = 0; i < keys.size(); i++) {
+ query = sample[keys[i]];
+
+ //
+ // If we already matched this locus to the catalog without using gapped alignments, skip it now.
+ //
+ if (query->matches.size() > 0)
+ continue;
+
+ gapped_aln++;
+
+ map<allele_type, map<allele_type, AlignRes> > query_hits;
+
+ loci_hit.clear();
+
+ for (vector<pair<allele_type, string> >::iterator allele = query->strings.begin(); allele != query->strings.end(); allele++) {
+ // cerr << "Allele: " << query_allele << "\n";
+
+ query_allele = allele->first;
+ query_seq = allele->second;
+ tot_hap++;
+
+ generate_kmers_lazily(allele->second.c_str(), gapped_kmer_len, num_kmers, kmers);
+
+ //
+ // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
+ // generate multiple spurious hits in sequences with multiple copies of the same kmer.
+ //
+ uniq_kmers.clear();
+ for (uint j = 0; j < num_kmers; j++)
+ uniq_kmers.insert(kmers[j]);
+
+ hits.clear();
+ ordered_hits.clear();
+
+ //
+ // Lookup the occurrences of each k-mer in the kmer_map
+ //
+ for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
+
+ h = kmer_map.find(j->c_str());
+
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits.push_back(h->second[k]);
+ }
+
+ //
+ // Sort the vector of indexes; provides the number of hits to each allele/locus
+ // and orders them largest to smallest.
+ //
+ sort(hits.begin(), hits.end());
+
+ //
+ // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
+ //
+ hits_size = hits.size();
+
+ if (hits_size == 0)
+ continue;
+
+ prev_id = hits[0];
+ index = 0;
+
+ do {
+ hit_cnt = 0;
+ allele_id = prev_id;
+
+ while (hits[index] == prev_id) {
+ hit_cnt++;
+ index++;
+ }
+
+ if (index < hits_size)
+ prev_id = hits[index];
+
+ if (hit_cnt >= min_hits)
+ ordered_hits.push_back(make_pair(allele_id, hit_cnt));
+
+ } while (index < hits_size);
+
+ if (ordered_hits.size() == 0)
+ continue;
+
+ //
+ // Process the hits from most kmer hits to least kmer hits.
+ //
+ sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);
+
+ //
+ // Only try to align the sequences with the most kmers in common.
+ //
+ top_hit = ordered_hits[0].second;
+ stop = 1;
+ for (uint j = 1; j < ordered_hits.size(); j++)
+ if (ordered_hits[j].second < top_hit) {
+ stop = j;
+ break;
+ }
+
+ for (uint j = 0; j < stop; j++) {
+ cat_hit = allele_map.at(ordered_hits[j].first);
+ hit_cnt = ordered_hits[j].second;
+
+ tag_2 = catalog[cat_hit.second];
+
+ cat_allele = cat_hit.first;
+ cat_seq = "";
+ for (uint k = 0; k < tag_2->strings.size(); k++)
+ if (tag_2->strings[k].first == cat_hit.first) {
+ cat_seq = tag_2->strings[k].second;
+ break;
+ }
+
+ aln->init(tag_2->len, query->len);
+
+ // cerr << "Attempting to align: cat id " << tag_2->id << " with locus id " << query->id << "\n"
+ // << "Cat allele: " << cat_allele << "; seq: " << cat_seq << "\n"
+ // << "Allele: " << query_allele << "; seq: " << allele->second << "\n";
+
+ if (aln->align(cat_seq, query_seq)) {
+ aln->parse_cigar(cigar);
+
+ aln_res = aln->result();
+
+ //
+ // At this point in the analysis, all possible alleles we want to detect must already
+ // be present in the catalog. Therefore, we should reject any alignment that implies a
+ // change in the catalog sequence (with the presence of a deletion element) as
+ // spurious.
+ //
+ if (aln_res.cigar.find('D') != string::npos)
+ continue;
+
+ //
+ // If the alignment has too many gaps, skip it.
+ // If the alignment doesn't span enough of the two sequences, skip it.
+ //
+ if (aln_res.gap_cnt <= (max_gaps + 1) &&
+ aln_res.pct_id >= min_match_len &&
+ dist(cat_seq.c_str(), query_seq.c_str(), cigar) == 0) {
+ // cerr << "Adding match: " << aln_res.cigar << "\n";
+ loci_hit.insert(tag_2->id);
+ query_hits[query_allele][cat_allele] = aln_res;
+ }
+ }
+ }
+ }
+
+ if (verify_gapped_match(catalog, query, loci_hit, query_hits, mmatches, nosnps, no_haps, bad_aln, ver_hap))
+ matches++;
+ else
+ nomatches++;
+ }
+
+ //
+ // Free the k-mers we generated for this query and the alignment class.
+ //
+ for (uint j = 0; j < kmers.size(); j++)
+ delete [] kmers[j];
+ kmers.clear();
+
+ delete aln;
+ }
+
+ cerr << "Out of " << keys.size() << " query loci, " << gapped_aln << " gapped alignments attempted.\n"
+ << " " << matches << " loci matched one catalog locus; " << tot_hap << " total haplotypes examined, " << ver_hap << " verified.\n"
+ << " " << nomatches << " loci matched no catalog locus;\n"
+ << " " << mmatches << " loci matched more than one catalog locus and were excluded.\n"
+ << " " << nosnps << " loci contained SNPs unaccounted for in the catalog and were excluded.\n"
+ << " " << no_haps << " loci had no verified haplotypes.\n"
+ << " " << bad_aln << " loci had inconsistent alignments to a catalog locus and were excluded.\n";
+
+
+ return 0;
+}
+
+bool
+verify_gapped_match(map<int, Locus *> &catalog, QLocus *query,
+ set<int> &loci_hit, map<allele_type, map<allele_type, AlignRes> > &query_hits,
+ uint &mmatch, uint &nosnps, uint &no_haps, uint &bad_aln, uint &ver_hits) {
+ //
+ // 1. Check that this query locus matches just a single catalog locus.
+ //
+ if (loci_hit.size() == 0) {
+ return false;
+ } else if (loci_hit.size() > 1) {
+ mmatch++;
+ return false;
+ }
+
+ int cat_id = *(loci_hit.begin());
+ Locus *cat = catalog[cat_id];
+
+ map<allele_type, map<allele_type, AlignRes> >::iterator query_it;
+ map<allele_type, AlignRes>::iterator cat_it;
+ AlignRes aln_res;
+ string query_allele, cat_allele, converted_query_allele, qseq;
+ int query_len;
+ uint verified = 0;
+
+ //
+ // 2. Check if there was a consistent alignment between the alleles to the catalog locus.
+ //
+ set<string> cigars;
+ for (query_it = query_hits.begin(); query_it != query_hits.end(); query_it++) {
+ map<allele_type, AlignRes> &cat_hits = query_it->second;
+ for (cat_it = cat_hits.begin(); cat_it != cat_hits.end(); cat_it++)
+ cigars.insert(cat_it->second.cigar);
+ }
+ if (cigars.size() > 1) {
+ bad_aln++;
+ return false;
+ }
+
+ //
+ // 3. Make sure the query has no SNPs unaccounted for in the catalog.
+ //
+ vector<pair<char, uint> > cigar;
+ query_len = parse_cigar(invert_cigar(*cigars.begin()).c_str(), cigar);
+ adjust_snps_for_gaps(cigar, query);
+ qseq = apply_cigar_to_seq(query->con, cigar);
+ query->add_consensus(qseq.c_str());
+
+ int min_tag_len = query_len > cat->len ? query_len : cat->len;
+
+ vector<SNP *>::iterator i, j;
+ bool found;
+
+ for (i = query->snps.begin(); i != query->snps.end(); i++) {
+ found = false;
+ //
+ // SNP occurs in a column that is beyond the length of the catalog
+ //
+ if ((*i)->col > min_tag_len - 1)
+ continue;
+
+ for (j = cat->snps.begin(); j != cat->snps.end(); j++) {
+ if ((*i)->col == (*j)->col)
+ found = true;
+ }
+ //
+ // Query locus possesses a SNP not present in the catalog.
+ //
+ if (found == false) {
+ nosnps++;
+ return false;
+ }
+ }
+
+ //
+ // 4. Assign the allele hits after verifying there is a match between catalog and query allele.
+ //
+ for (query_it = query_hits.begin(); query_it != query_hits.end(); query_it++) {
+ query_allele = query_it->first;
+
+ map<allele_type, AlignRes> &cat_hits = query_it->second;
+
+ if (cat_hits.size() == 1 && cat_hits.begin()->first == "consensus") {
+ cat_it = cat_hits.begin();
+
+ cat_allele = cat_it->first;
+ aln_res = cat_it->second;
+
+ verified++;
+ query->add_match(cat_id, cat_allele, query_allele, 0, invert_cigar(aln_res.cigar));
+ continue;
+ }
+
+ converted_query_allele = generate_query_allele(cat, query, query_allele);
+
+ cat_it = cat_hits.find(converted_query_allele);
+
+ if (cat_it != cat_hits.end()) {
+ cat_allele = cat_it->first;
+ aln_res = cat_it->second;
+
+ verified++;
+ query->add_match(cat_id, cat_allele, query_allele, 0, invert_cigar(aln_res.cigar));
+ }
+ // } else {
+ // //
+ // // Check for alleles that have Ns in the query, but not in the catalog, but otherwise match.
+ // //
+ // uint match_cnt = 0;
+ // for (cat_it = cat_hits.begin(); cat_it != cat_hits.end(); cat_it++) {
+ // cat_allele = cat_it->first;
+
+ // if (match_alleles(cat_allele, converted_query_allele)) {
+ // match_cnt++;
+ // aln_res = cat_it->second;
+ // }
+ // }
+
+ // if (match_cnt == 1) {
+ // verified++;
+ // query->add_match(cat_id, cat_allele, query_allele, 0, invert_cigar(aln_res.cigar));
+ // }
+ // }
+ }
+
+
+ if (verified > 0) {
+ ver_hits += verified;
+ } else {
+ no_haps++;
+ return false;
+ }
+
+ return true;
+}
+
+bool
+match_alleles(allele_type catalog_allele, allele_type query_allele)
+{
+ const char *q = catalog_allele.c_str();
+ const char *p = query_allele.c_str();
+ const char *stop = p + query_allele.length();
+
+ while (p < stop) {
+ if (*p != 'N' && *p != *q)
+ return false;
+ p++;
+ q++;
+ }
+ return true;
+}
+
+string
+generate_query_allele(Locus *ctag, Locus *qtag, allele_type allele)
+{
+ string new_allele = "";
+
+ if (qtag->snps.size() == 0) {
+ for (uint i = 0; i < ctag->snps.size(); i++)
+ new_allele += ctag->snps[i]->col > qtag->len - 1 ? 'N' : qtag->con[ctag->snps[i]->col];
+
+ } else {
+ uint pos = 0;
+ uint index = 0;
+
+ for (uint i = 0; i < ctag->snps.size(); i++) {
+ if (index < qtag->snps.size() && qtag->snps[index]->col == ctag->snps[i]->col) {
+ new_allele += allele[pos];
+ index++;
+ pos++;
+ } else {
+ new_allele += ctag->snps[i]->col > qtag->len - 1 ? 'N' : qtag->con[ctag->snps[i]->col];
+ }
+ }
+ }
+
+ return new_allele;
+}
+
+int
populate_hash(map<int, Locus *> &sample, HashMap &hash_map, vector<char *> &hash_map_keys, int min_tag_len)
{
map<int, Locus *>::iterator it;
@@ -808,25 +1256,25 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
qloc->alleles.count(qloc->matches[j]->cat_type) > 0 ?
qloc->alleles[qloc->matches[j]->cat_type] : qloc->depth;
- sstr <<
- "0" << "\t" <<
- batch_id << "\t" <<
- qloc->matches[j]->cat_id << "\t" <<
- samp_id << "\t" <<
- qloc->id << "\t" <<
- qloc->matches[j]->cat_type << "\t" <<
- match_depth << "\t" <<
- qloc->lnl << "\n";
+ sstr << "0" << "\t"
+ << batch_id << "\t"
+ << qloc->matches[j]->cat_id << "\t"
+ << samp_id << "\t"
+ << qloc->id << "\t"
+ << qloc->matches[j]->cat_type << "\t"
+ << match_depth << "\t"
+ << qloc->lnl << "\t"
+ << qloc->matches[j]->cigar << "\n";
}
if (in_file_type == FileT::gzsql) gzputs(gz_matches, sstr.str().c_str()); else matches << sstr.str();
sstr.str("");
}
- if (in_file_type == FileT::gzsql)
- gzclose(gz_matches);
- else
- matches.close();
+ if (in_file_type == FileT::gzsql)
+ gzclose(gz_matches);
+ else
+ matches.close();
return 0;
}
@@ -842,6 +1290,7 @@ int parse_command_line(int argc, char* argv[]) {
{"genomic_loc", no_argument, NULL, 'g'},
{"verify_hap", no_argument, NULL, 'x'},
{"uniq_haplotypes", no_argument, NULL, 'u'},
+ {"gapped", no_argument, NULL, 'G'},
{"num_threads", required_argument, NULL, 'p'},
{"batch_id", required_argument, NULL, 'b'},
{"catalog", required_argument, NULL, 'c'},
@@ -853,7 +1302,7 @@ int parse_command_line(int argc, char* argv[]) {
// getopt_long stores the option index here.
int option_index = 0;
- c = getopt_long(argc, argv, "hgxuvs:c:o:b:p:", long_options, &option_index);
+ c = getopt_long(argc, argv, "hgGxuvs:c:o:b:p:", long_options, &option_index);
// Detect the end of the options.
if (c == -1)
@@ -892,6 +1341,9 @@ int parse_command_line(int argc, char* argv[]) {
case 'u':
require_uniq_haplotypes = true;
break;
+ case 'G':
+ gapped_alignments = true;
+ break;
case 'v':
version();
break;
@@ -941,7 +1393,9 @@ void help() {
<< " g: base matching on genomic location, not sequence identity." << "\n"
<< " x: don't verify haplotype of matching locus." << "\n"
<< " v: print program version." << "\n"
- << " h: display this help messsage." << "\n\n";
+ << " h: display this help message." << "\n\n"
+ << " Gapped assembly options:\n"
+ << " --gapped: perform gapped alignments between stacks.\n";
exit(0);
}
diff --git a/src/sstacks.h b/src/sstacks.h
index 49a794e..5b978e3 100644
--- a/src/sstacks.h
+++ b/src/sstacks.h
@@ -27,6 +27,7 @@
#include <omp.h> // OpenMP library
#endif
#include <getopt.h> // Process command-line options
+#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
@@ -66,8 +67,11 @@ using google::sparse_hash_map;
#include "kmers.h"
#include "stacks.h"
#include "locus.h"
+#include "GappedAln.h"
+#include "kmers.h"
#include "sql_utilities.h"
#include "utils.h"
+#include "aln_utils.h"
#ifdef HAVE_SPARSEHASH
typedef sparse_hash_map<const char *, vector<pair<int, allele_type> >, hash_charptr, eqstr> HashMap;
@@ -75,17 +79,21 @@ typedef sparse_hash_map<const char *, vector<pair<int, allele_type> >, hash_char
typedef unordered_map<const char *, vector<pair<int, allele_type> >, hash_charptr, eqstr> HashMap;
#endif
-void help( void );
-void version( void );
-int parse_command_line(int, char**);
-int populate_hash(map<int, Locus *> &, HashMap &, vector<char *> &, int);
-int find_matches_by_sequence(map<int, Locus *> &, map<int, QLocus *> &);
-int find_matches_by_genomic_loc(map<int, Locus *> &, map<int, QLocus *> &);
-int verify_sequence_match(map<int, Locus *> &, QLocus *, set<int> &, map<string, vector<string> > &, uint, unsigned long &, unsigned long &);
-int verify_genomic_loc_match(Locus *, QLocus *, set<string> &, unsigned long &);
-int generate_query_haplotypes(Locus *, QLocus *, set<string> &);
-int impute_haplotype(string, vector<pair<allele_type, string> > &, string &);
-bool compare_dist(pair<int, int>, pair<int, int>);
-int write_matches(string, map<int, QLocus *> &);
+void help( void );
+void version( void );
+int parse_command_line(int, char**);
+int populate_hash(map<int, Locus *> &, HashMap &, vector<char *> &, int);
+int find_matches_by_sequence(map<int, Locus *> &, map<int, QLocus *> &);
+int find_matches_by_genomic_loc(map<int, Locus *> &, map<int, QLocus *> &);
+int verify_sequence_match(map<int, Locus *> &, QLocus *, set<int> &, map<string, vector<string> > &, uint, unsigned long &, unsigned long &);
+int search_for_gaps(map<int, Locus *> &, map<int, QLocus *> &, KmerHashMap &, map<int, pair<allele_type, int> > &, double);
+bool verify_gapped_match(map<int, Locus *> &, QLocus *, set<int> &, map<allele_type, map<allele_type, AlignRes> > &, uint &, uint &, uint &, uint &, uint &);
+int verify_genomic_loc_match(Locus *, QLocus *, set<string> &, unsigned long &);
+string generate_query_allele(Locus *, Locus *, allele_type);
+bool match_alleles(allele_type, allele_type);
+int generate_query_haplotypes(Locus *, QLocus *, set<string> &);
+int impute_haplotype(string, vector<pair<allele_type, string> > &, string &);
+bool compare_dist(pair<int, int>, pair<int, int>);
+int write_matches(string, map<int, QLocus *> &);
#endif // __SSTACKS_H__
diff --git a/src/stacks.cc b/src/stacks.cc
index 66f8d07..4294586 100644
--- a/src/stacks.cc
+++ b/src/stacks.cc
@@ -35,13 +35,13 @@ Rem::Rem() {
this->utilized = false;
}
-Rem::Rem(int id, uint seq_id, DNASeq *seq) {
+Rem::Rem(int id, uint seq_id, DNANSeq *seq) {
this->id = id;
this->utilized = false;
this->map.push_back(seq_id);
- this->seq = new DNASeq(seq->size, seq->s);
+ this->seq = new DNANSeq(seq->size(), seq->s);
}
int Rem::add_id(uint id) {
@@ -50,20 +50,20 @@ int Rem::add_id(uint id) {
return 0;
}
-int Rem::add_seq(const DNASeq *seq) {
+int Rem::add_seq(const DNANSeq *seq) {
if (this->seq != NULL)
- delete this->seq;
+ delete this->seq;
- this->seq = new DNASeq(seq->size, seq->s);
+ this->seq = new DNANSeq(seq->size(), seq->s);
return 0;
}
int Rem::add_seq(const char *seq) {
if (this->seq != NULL)
- delete this->seq;
+ delete this->seq;
- this->seq = new DNASeq(strlen(seq), seq);
+ this->seq = new DNANSeq(strlen(seq), seq);
return 0;
}
@@ -78,7 +78,7 @@ int PStack::add_id(const char *id) {
int PStack::add_seq(const char *seq) {
if (this->seq != NULL)
- delete this->seq;
+ delete this->seq;
this->len = strlen(seq);
this->seq = new DNANSeq(this->len, seq);
@@ -88,7 +88,7 @@ int PStack::add_seq(const char *seq) {
int PStack::add_seq(DNANSeq *seq) {
if (this->seq != NULL)
- delete this->seq;
+ delete this->seq;
this->seq = new DNANSeq(seq->size(), seq->s);
@@ -103,18 +103,18 @@ int Stack::add_id(uint id) {
int Stack::add_seq(const char *seq) {
if (this->seq != NULL)
- delete this->seq;
+ delete this->seq;
- this->seq = new DNASeq(strlen(seq), seq);
+ this->seq = new DNANSeq(strlen(seq), seq);
return 0;
}
-int Stack::add_seq(const DNASeq *seq) {
+int Stack::add_seq(const DNANSeq *seq) {
if (this->seq != NULL)
- delete this->seq;
+ delete this->seq;
- this->seq = new DNASeq(seq->size, seq->s);
+ this->seq = new DNANSeq(seq->size(), seq->s);
return 0;
}
diff --git a/src/stacks.h b/src/stacks.h
index 3393ae3..dc9958d 100644
--- a/src/stacks.h
+++ b/src/stacks.h
@@ -59,32 +59,32 @@ public:
strand_type strand;
void set(const char *chr, uint bp, strand_type strand) {
- if (this->chr != NULL)
- delete [] this->chr;
- this->chr = new char[strlen(chr) + 1];
- this->bp = bp;
- this->strand = strand;
- strcpy(this->chr, chr);
+ if (this->chr != NULL)
+ delete [] this->chr;
+ this->chr = new char[strlen(chr) + 1];
+ this->bp = bp;
+ this->strand = strand;
+ strcpy(this->chr, chr);
}
PhyLoc() {
- chr = NULL;
- bp = 0;
- strand = plus;
+ chr = NULL;
+ bp = 0;
+ strand = plus;
}
PhyLoc(const char *chr, uint bp) {
- this->chr = new char[strlen(chr) + 1];
- this->bp = bp;
- this->strand = plus;
- strcpy(this->chr, chr);
+ this->chr = new char[strlen(chr) + 1];
+ this->bp = bp;
+ this->strand = plus;
+ strcpy(this->chr, chr);
}
PhyLoc(const char *chr, uint bp, strand_type strnd) {
- this->chr = new char[strlen(chr) + 1];
- this->bp = bp;
- this->strand = strnd;
- strcpy(this->chr, chr);
+ this->chr = new char[strlen(chr) + 1];
+ this->bp = bp;
+ this->strand = strnd;
+ strcpy(this->chr, chr);
}
~PhyLoc() {
- delete [] chr;
+ delete [] chr;
}
};
@@ -99,34 +99,64 @@ class SNP {
char rank_4;
SNP() {
- col = 0;
- lratio = 0.0;
- rank_1 = 0;
- rank_2 = 0;
- rank_3 = 0;
- rank_4 = 0;
+ col = 0;
+ lratio = 0.0;
+ rank_1 = 0;
+ rank_2 = 0;
+ rank_3 = 0;
+ rank_4 = 0;
+ }
+};
+
+class Gap {
+public:
+ uint start;
+ uint end;
+
+ Gap(uint s, uint e) {
+ start = s;
+ end = e;
+ }
+};
+
+class Aln {
+public:
+ uint id;
+ uint gap_cnt;
+ double pct_id;
+ string cigar;
+ Aln() {
+ this->id = 0;
+ this->pct_id = 0.0;
+ this->gap_cnt = 0;
+ }
+ Aln(uint id, string cigar, double pct_id, uint gap_cnt) {
+ this->id = id;
+ this->cigar = cigar;
+ this->pct_id = pct_id;
+ this->gap_cnt = gap_cnt;
}
};
class PStack {
public:
- uint id;
- uint count; // Number of identical reads forming this stack
- DNANSeq *seq; // Sequence read
- uint len; // Read length
- vector<char *> map; // List of sequence read IDs merged into this stack
- PhyLoc loc; // Physical genome location of this stack.
+ uint id;
+ uint count; // Number of identical reads forming this stack
+ DNANSeq *seq; // Sequence read
+ uint len; // Read length
+ vector<char *> map; // List of sequence read IDs merged into this stack
+ PhyLoc loc; // Physical genome location of this stack.
PStack() {
- id = 0;
- count = 0;
- seq = NULL;
- len = 0;
+ id = 0;
+ count = 0;
+ seq = NULL;
+ len = 0;
}
~PStack() {
- delete this->seq;
- for (unsigned int i = 0; i < this->map.size(); i++)
- delete [] this->map[i];
+ delete this->seq;
+ for (unsigned int i = 0; i < this->map.size(); i++)
+ delete [] this->map[i];
}
int add_id(const char *);
int add_seq(const char *);
@@ -136,38 +166,38 @@ class PStack {
class Stack {
public:
uint id;
- DNASeq *seq; // Sequence read
+ DNANSeq *seq; // Sequence read
vector<uint> map; // List of sequence read IDs merged into this stack
Stack() {
- id = 0;
- seq = NULL;
+ id = 0;
+ seq = NULL;
}
~Stack() {
- delete this->seq;
+ delete this->seq;
}
uint count() { return this->map.size(); }
int add_id(uint);
int add_seq(const char *);
- int add_seq(const DNASeq *);
+ int add_seq(const DNANSeq *);
};
class Rem {
public:
uint id;
vector<uint> map; // List of sequence read IDs merged into this stack
- DNASeq *seq; // Sequence read
+ DNANSeq *seq; // Sequence read
bool utilized;
Rem();
- Rem(int, uint, DNASeq *);
+ Rem(int, uint, DNANSeq *);
~Rem() {
- delete this->seq;
+ delete this->seq;
}
uint count() { return this->map.size(); }
int add_id(uint);
int add_seq(const char *);
- int add_seq(const DNASeq *);
+ int add_seq(const DNANSeq *);
};
class CatMatch {
@@ -179,18 +209,21 @@ public:
int depth;
double lnl;
char *haplotype;
+ char *cigar;
CatMatch() {
- batch_id = 0;
- cat_id = 0;
- sample_id = 0;
- tag_id = 0;
- depth = 0;
- lnl = 0.0;
- haplotype = NULL;
+ batch_id = 0;
+ cat_id = 0;
+ sample_id = 0;
+ tag_id = 0;
+ depth = 0;
+ lnl = 0.0;
+ haplotype = NULL;
+ cigar = NULL;
}
- ~CatMatch() {
- delete [] haplotype;
+ ~CatMatch() {
+ delete [] haplotype;
+ delete [] cigar;
}
};
@@ -201,13 +234,13 @@ public:
char *model;
ModRes(int samp_id, int tag_id, const char *model) {
- this->sample_id = samp_id;
- this->tag_id = tag_id;
- this->model = new char[strlen(model) + 1];
- strcpy(this->model, model);
+ this->sample_id = samp_id;
+ this->tag_id = tag_id;
+ this->model = new char[strlen(model) + 1];
+ strcpy(this->model, model);
}
~ModRes() {
- delete [] this->model;
+ delete [] this->model;
}
};
@@ -218,13 +251,13 @@ public:
vector<SNP *> snps;
SNPRes(int samp_id, int tag_id) {
- this->sample_id = samp_id;
- this->tag_id = tag_id;
+ this->sample_id = samp_id;
+ this->tag_id = tag_id;
}
~SNPRes() {
- for (uint i = 0; i < this->snps.size(); i++)
- delete this->snps[i];
- this->snps.clear();
+ for (uint i = 0; i < this->snps.size(); i++)
+ delete this->snps[i];
+ this->snps.clear();
}
};
diff --git a/src/ustacks.cc b/src/ustacks.cc
index e04c32b..b0bd7ae 100644
--- a/src/ustacks.cc
+++ b/src/ustacks.cc
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -27,28 +27,28 @@
//
// Global variables to hold command-line options.
//
-FileT in_file_type;
-string in_file;
-string out_path;
-int num_threads = 1;
-int sql_id = 0;
-bool call_sec_hapl = true;
-bool set_kmer_len = true;
-int kmer_len = 0;
-int max_kmer_len = 19;
-int min_merge_cov = 3;
-uint max_subgraph = 3;
-int dump_graph = 0;
-int retain_rem_reads = false;
-int deleverage_stacks = 0;
-int remove_rep_stacks = 0;
-int max_utag_dist = 2;
-int max_rem_dist = -1;
-double cov_mean = 0.0;
-double cov_stdev = 0.0;
-double cov_scale = 1;
-int deleverage_trigger;
-int removal_trigger;
+FileT in_file_type;
+string in_file;
+string out_path;
+int num_threads = 1;
+int sql_id = 0;
+bool call_sec_hapl = true;
+bool set_kmer_len = true;
+int kmer_len = 0;
+int max_kmer_len = 19;
+int min_merge_cov = 3;
+uint max_subgraph = 3;
+int dump_graph = 0;
+int retain_rem_reads = false;
+int deleverage_stacks = 0;
+int remove_rep_stacks = 0;
+int max_utag_dist = 2;
+int max_rem_dist = -1;
+bool gapped_alignments = false;
+double min_match_len = 0.80;
+double max_gaps = 2.0;
+int deleverage_trigger;
+int removal_trigger;
//
// For use with the multinomial model to call fixed nucleotides.
//
@@ -71,25 +71,27 @@ int main (int argc, char* argv[]) {
//
if (max_rem_dist == -1) max_rem_dist = max_utag_dist + 2;
- cerr << "Min depth of coverage to create a stack: " << min_merge_cov << "\n"
- << "Max distance allowed between stacks: " << max_utag_dist << "\n"
- << "Max distance allowed to align secondary reads: " << max_rem_dist << "\n"
- << "Max number of stacks allowed per de novo locus: " << max_subgraph << "\n"
- << "Deleveraging algorithm: " << (deleverage_stacks ? "enabled" : "disabled") << "\n"
- << "Removal algorithm: " << (remove_rep_stacks ? "enabled" : "disabled") << "\n"
- << "Model type: ";
+ cerr << "ustacks parameters selected:\n"
+ << " Min depth of coverage to create a stack: " << min_merge_cov << "\n"
+ << " Max distance allowed between stacks: " << max_utag_dist << "\n"
+ << " Max distance allowed to align secondary reads: " << max_rem_dist << "\n"
+ << " Max number of stacks allowed per de novo locus: " << max_subgraph << "\n"
+ << " Deleveraging algorithm: " << (deleverage_stacks ? "enabled" : "disabled") << "\n"
+ << " Removal algorithm: " << (remove_rep_stacks ? "enabled" : "disabled") << "\n"
+ << " Model type: ";
switch (model_type) {
case snp:
- cerr << "SNP\n";
- break;
+ cerr << "SNP\n";
+ break;
case fixed:
- cerr << "Fixed\n";
- break;
+ cerr << "Fixed\n";
+ break;
case bounded:
- cerr << "Bounded; lower epsilon bound: " << bound_low << "; upper bound: " << bound_high << "\n";
- break;
+ cerr << "Bounded; lower epsilon bound: " << bound_low << "; upper bound: " << bound_high << "\n";
+ break;
}
- cerr << "Alpha significance level for model: " << alpha << "\n";
+ cerr << " Alpha significance level for model: " << alpha << "\n"
+ << " Gapped alignments: " << (gapped_alignments ? "enabled" : "disabled") << "\n";
//
// Set limits to call het or homozygote according to chi-square distribution with one
@@ -97,17 +99,17 @@ int main (int argc, char* argv[]) {
// http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value
//
if (alpha == 0.1) {
- heterozygote_limit = -2.71;
- homozygote_limit = 2.71;
+ heterozygote_limit = -2.71;
+ homozygote_limit = 2.71;
} else if (alpha == 0.05) {
- heterozygote_limit = -3.84;
- homozygote_limit = 3.84;
+ heterozygote_limit = -3.84;
+ homozygote_limit = 3.84;
} else if (alpha == 0.01) {
- heterozygote_limit = -6.64;
- homozygote_limit = 6.64;
+ heterozygote_limit = -6.64;
+ homozygote_limit = 6.64;
} else if (alpha == 0.001) {
- heterozygote_limit = -10.83;
- homozygote_limit = 10.83;
+ heterozygote_limit = -10.83;
+ homozygote_limit = 10.83;
}
//
@@ -118,7 +120,7 @@ int main (int argc, char* argv[]) {
#endif
DNASeqHashMap radtags;
- vector<DNASeq *> radtags_keys;
+ vector<DNANSeq *> radtags_keys;
map<int, Rem *> remainders;
set<int> merge_map;
map<int, Stack *> unique;
@@ -131,12 +133,12 @@ int main (int argc, char* argv[]) {
// dump_unique_tags(unique);
- if (cov_mean == 0 || cov_stdev == 0)
- calc_coverage_distribution(unique, cov_mean, cov_stdev);
+ double cov_mean, cov_stdev, cov_max;
+
+ calc_coverage_distribution(unique, cov_mean, cov_stdev, cov_max);
+ cerr << "Initial coverage mean: " << cov_mean << "; Std Dev: " << cov_stdev << "; Max: " << cov_max << "\n";
- cerr << "Coverage mean: " << cov_mean << "; stdev: " << cov_stdev << "\n";
-
- calc_triggers(cov_mean, cov_stdev, deleverage_trigger, removal_trigger);
+ calc_triggers(cov_mean, cov_stdev, 1, deleverage_trigger, removal_trigger);
cerr << "Deleveraging trigger: " << deleverage_trigger << "; Removal trigger: " << removal_trigger << "\n";
@@ -144,13 +146,18 @@ int main (int argc, char* argv[]) {
populate_merged_tags(unique, merged);
+ cerr << merged.size() << " initial stacks were populated; " << remainders.size() << " stacks were set aside as secondary reads.\n";
+
if (remove_rep_stacks) {
- cerr << "Calculating distance for removing repetitive stacks.\n";
- calc_kmer_distance(merged, 1);
- cerr << "Removing repetitive stacks.\n";
- remove_repetitive_stacks(unique, merged);
+ cerr << "Calculating distance for removing repetitive stacks.\n";
+ calc_kmer_distance(merged, 1);
+ cerr << "Removing repetitive stacks.\n";
+ remove_repetitive_stacks(unique, merged);
}
+ calc_coverage_distribution(unique, merged, cov_mean, cov_stdev, cov_max);
+ cerr << "Post-Repeat Removal, coverage depth Mean: " << cov_mean << "; Std Dev: " << cov_stdev << "; Max: " << cov_max << "\n";
+
cerr << "Calculating distance between stacks...\n";
calc_kmer_distance(merged, max_utag_dist);
@@ -159,14 +166,33 @@ int main (int argc, char* argv[]) {
call_consensus(merged, unique, remainders, false);
- calc_merged_coverage_distribution(unique, merged);
+ calc_coverage_distribution(unique, merged, cov_mean, cov_stdev, cov_max);
+ cerr << "After merging, coverage depth Mean: " << cov_mean << "; Std Dev: " << cov_stdev << "; Max: " << cov_max << "\n";
//dump_merged_tags(merged);
cerr << "Merging remainder radtags\n";
merge_remainders(merged, remainders);
- // Call the consensus sequence again, now that remainder tags have been merged.
+ calc_coverage_distribution(unique, remainders, merged, cov_mean, cov_stdev, cov_max);
+ cerr << "After remainders merged, coverage depth Mean: " << cov_mean << "; Std Dev: " << cov_stdev << "; Max: " << cov_max << "\n";
+
+ if (gapped_alignments) {
+ call_consensus(merged, unique, remainders, false);
+
+ cerr << "Searching for gaps between merged stacks...\n";
+ search_for_gaps(merged, min_match_len);
+
+ merge_gapped_alns(unique, remainders, merged);
+
+ calc_coverage_distribution(unique, remainders, merged, cov_mean, cov_stdev, cov_max);
+ cerr << "After gapped alignments, coverage depth Mean: " << cov_mean << "; Std Dev: " << cov_stdev << "; Max: " << cov_max << "\n";
+ }
+
+ //
+ // Call the final consensus sequence and invoke the SNP model.
+ //
+ cerr << "Calling final consensus sequences, invoking SNP-calling model...\n";
call_consensus(merged, unique, remainders, true);
count_raw_reads(unique, remainders, merged);
@@ -178,24 +204,390 @@ int main (int argc, char* argv[]) {
return 0;
}
-int merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem) {
+int
+merge_gapped_alns(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, MergedStack *> &merged)
+{
+ map<int, MergedStack *> new_merged;
+ map<int, MergedStack *>::iterator it;
+ MergedStack *tag_1, *tag_2, *merged_tag;
+
+ int id = 1;
+ uint merge_cnt = 0;
+
+ set<int> processed;
+ string cigar_1, cigar_2;
+ vector<pair<char, uint> > cigar;
+
+ for (it = merged.begin(); it != merged.end(); it++) {
+ if (processed.count(it->first) > 0)
+ continue;
+
+ tag_1 = it->second;
+ sort(tag_1->alns.begin(), tag_1->alns.end(), rank_alignments);
+
+ //
+ // No gapped alignments, or no optimal alignment for this stack, or
+ // this stack has already been set aside.
+ //
+ if (tag_1->masked || tag_1->alns.size() == 0)
+ continue;
+ if (tag_1->alns.size() > 1 && tag_1->alns[0].pct_id == tag_1->alns[1].pct_id)
+ continue;
+
+ //
+ // Found one or more gapped alignments. Make sure the best alignment for each of the aligned pairs
+ // of tags are, reciprocal to each other..
+ //
+ tag_2 = merged[tag_1->alns[0].id];
+ sort(tag_2->alns.begin(), tag_2->alns.end(), rank_alignments);
+
+ if (tag_2->masked || tag_2->alns.size() == 0)
+ continue;
+ if (tag_2->alns.size() > 1 && tag_2->alns[0].pct_id == tag_2->alns[1].pct_id)
+ continue;
+
+ if (tag_1->id != tag_2->alns[0].id)
+ continue;
+
+ cigar_1 = invert_cigar(tag_1->alns[0].cigar);
+ cigar_2 = tag_2->alns[0].cigar;
+
+ if (cigar_1 == cigar_2) {
+ parse_cigar(tag_1->alns[0].cigar.c_str(), cigar);
+
+ //
+ // Check that the alignment still contains fewer than
+ // max_utag_dist mismatches.
+ //
+ if (dist(tag_1->con, tag_2->con, cigar) > max_utag_dist)
+ continue;
+
+ //
+ // If the alignment has too many gaps, skip it.
+ //
+ if (tag_1->alns[0].gap_cnt > (max_gaps + 1))
+ continue;
+
+ //
+ // If the alignment doesn't span enough of the two sequences, skip it.
+ //
+ if (tag_1->alns[0].pct_id < min_match_len)
+ continue;
+
+ //
+ // Edit the sequences to accommodate any added gaps.
+ //
+ edit_gapped_seqs(unique, rem, tag_1, cigar);
+
+ parse_cigar(tag_2->alns[0].cigar.c_str(), cigar);
+ edit_gapped_seqs(unique, rem, tag_2, cigar);
+
+ //
+ // Merge the tags.
+ //
+ merged_tag = merge_tags(tag_1, tag_2, id);
+ new_merged[id] = merged_tag;
+ id++;
+
+ //
+ // Record the gaps.
+ //
+ uint pos = 0;
+ for (uint j = 0; j < cigar.size(); j++) {
+ if (cigar[j].first == 'I' || cigar[j].first == 'D')
+ merged_tag->gaps.push_back(Gap(pos, pos + cigar[j].second));
+ pos += cigar[j].second;
+ }
+
+ processed.insert(tag_1->id);
+ processed.insert(tag_2->id);
+
+ merge_cnt++;
+ }
+ }
+
+ set<int> merge_set;
+ for (it = merged.begin(); it != merged.end(); it++) {
+ if (processed.count(it->first))
+ continue;
+ tag_1 = it->second;
+ merge_set.insert(tag_1->id);
+ tag_2 = merge_tags(merged, merge_set, id);
+ new_merged[id] = tag_2;
+ merge_set.clear();
+ id++;
+ }
+
+ uint new_cnt = new_merged.size();
+ uint old_cnt = merged.size();
+
+ //
+ // Free the memory from the old map of merged tags.
+ //
+ for (it = merged.begin(); it != merged.end(); it++)
+ delete it->second;
+
+ merged = new_merged;
+
+ cerr << " " << old_cnt << " stacks merged into " << new_cnt
+ << " stacks; merged " << merge_cnt
+ << " gapped alignments.\n";
+
+ return 0;
+}
+
+bool
+rank_alignments(Aln a, Aln b)
+{
+ return a.pct_id > b.pct_id;
+}
+
+int
+edit_gapped_seqs(map<int, Stack *> &unique, map<int, Rem *> &rem, MergedStack *tag, vector<pair<char, uint> > &cigar)
+{
+ int stack_id;
+ Stack *s;
+ Rem *r;
+ char *buf = new char[tag->len + 1];
+
+ for (uint i = 0; i < tag->utags.size(); i++) {
+ stack_id = tag->utags[i];
+ s = unique[stack_id];
+
+ buf = s->seq->seq(buf);
+ edit_gaps(cigar, buf);
+
+ delete s->seq;
+ s->seq = new DNANSeq(tag->len, buf);
+ }
+
+ for (uint i = 0; i < tag->remtags.size(); i++) {
+ stack_id = tag->remtags[i];
+ r = rem[stack_id];
+
+ buf = r->seq->seq(buf);
+ edit_gaps(cigar, buf);
+
+ delete r->seq;
+ r->seq = new DNANSeq(tag->len, buf);
+ }
+
+ delete [] buf;
+
+ return 0;
+}
+
+int
+edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
+{
+ char *buf;
+ uint size = cigar.size();
+ char op;
+ uint dist, bp, len, buf_len, buf_size, j, k, stop;
+
+ len = strlen(seq);
+ bp = 0;
+
+ buf = new char[len + 1];
+ buf_size = len + 1;
+
+ for (uint i = 0; i < size; i++) {
+ op = cigar[i].first;
+ dist = cigar[i].second;
+
+ switch(op) {
+ case 'S':
+ stop = bp + dist;
+ stop = stop > len ? len : stop;
+ while (bp < stop) {
+ seq[bp] = 'N';
+ bp++;
+ }
+ break;
+ case 'D':
+ //
+ // A deletion has occured in the read relative to the reference genome.
+ // Pad the read with sufficent Ns to match the deletion, shifting the existing
+ // sequence down. Trim the final length to keep the read length consistent.
+ //
+ k = bp >= len ? len : bp;
+
+ strncpy(buf, seq + k, buf_size - 1);
+ buf[buf_size - 1] = '\0';
+ buf_len = strlen(buf);
+
+ stop = bp + dist;
+ stop = stop > len ? len : stop;
+ while (bp < stop) {
+ seq[bp] = 'N';
+ bp++;
+ }
+
+ j = bp;
+ k = 0;
+ while (j < len && k < buf_len) {
+ seq[j] = buf[k];
+ k++;
+ j++;
+ }
+ break;
+ case 'I':
+ case 'M':
+ bp += dist;
+ break;
+ default:
+ break;
+ }
+ }
+
+ delete [] buf;
+
+ return 0;
+}
+
+int
+search_for_gaps(map<int, MergedStack *> &merged, double min_match_len)
+{
+ //
+ // Search for loci that can be merged with a gapped alignment.
+ //
+ KmerHashMap kmer_map;
+ vector<char *> kmer_map_keys;
+ MergedStack *tag_1, *tag_2;
+ map<int, MergedStack *>::iterator it;
+
+ //
+ // OpenMP can't parallelize random access iterators, so we convert
+ // our map to a vector of integer keys.
+ //
+ vector<int> keys;
+ for (it = merged.begin(); it != merged.end(); it++)
+ keys.push_back(it->first);
+
+ //
+ // Calculate the number of k-mers we will generate. If kmer_len == 0,
+ // determine the optimal length for k-mers.
+ //
+ int con_len = strlen(merged[keys[0]]->con);
+ int kmer_len = 19;
+ int num_kmers = con_len - kmer_len + 1;
+
+ //
+ // Calculate the minimum number of matching k-mers required for a possible sequence match.
+ //
+ int min_hits = (round((double) con_len * min_match_len) - (kmer_len * max_gaps)) - kmer_len + 1;
+
+ cerr << " Searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); " << min_hits << " k-mer hits required.\n";
+
+ populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
+
+ #pragma omp parallel private(tag_1, tag_2)
+ {
+ KmerHashMap::iterator h;
+ vector<char *> query_kmers;
+ set<string> uniq_kmers;
+ GappedAln *aln = new GappedAln(con_len);
+ AlignRes a;
+
+ initialize_kmers(kmer_len, num_kmers, query_kmers);
+
+ #pragma omp for schedule(dynamic)
+ for (uint i = 0; i < keys.size(); i++) {
+ tag_1 = merged[keys[i]];
+
+ //
+ // Don't compute distances for masked tags.
+ //
+ if (tag_1->masked) continue;
+
+ //
+ // Don't compare tags that are already at or above max_locus_stacks.
+ //
+ if (tag_1->utags.size() >= max_subgraph)
+ continue;
+
+ generate_kmers_lazily(tag_1->con, kmer_len, num_kmers, query_kmers);
+
+ uniq_kmers.clear();
+ for (int j = 0; j < num_kmers; j++)
+ uniq_kmers.insert(query_kmers[j]);
+
+ map<int, int> hits;
+ //
+ // Lookup the occurances of each k-mer in the kmer_map
+ //
+ for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
+ h = kmer_map.find(j->c_str());
+
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits[h->second[k]]++;
+ }
+
+ //
+ // Iterate through the list of hits. For each hit that has more than min_hits
+ // check its full length to verify a match.
+ //
+ map<int, int>::iterator hit_it;
+ for (hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {
+
+ if (hit_it->second < min_hits) continue;
+
+ tag_2 = merged[hit_it->first];
+
+ // Don't compute distances for masked tags
+ if (tag_2->masked) continue;
+
+ // Don't compare tag_1 against itself.
+ if (tag_1 == tag_2) continue;
+
+ //
+ // Don't compare tags that are already at or above max_locus_stacks.
+ //
+ if (tag_2->utags.size() >= max_subgraph)
+ continue;
+
+ if (aln->align(tag_1->con, tag_2->con)) {
+ a = aln->result();
+ tag_1->alns.push_back(Aln(tag_2->id, a.cigar, a.pct_id, a.gap_cnt));
+ }
+ }
+ }
+
+ //
+ // Free the k-mers we generated for this query.
+ //
+ for (int j = 0; j < query_kmers.size(); j++)
+ delete [] query_kmers[j];
+ query_kmers.clear();
+
+ delete aln;
+ }
+
+ free_kmer_hash(kmer_map, kmer_map_keys);
+
+ return 0;
+}
+
+int
+merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem)
+{
map<int, Rem *>::iterator it;
- int j, k;
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
vector<int> keys;
uint tot = 0;
for (it = rem.begin(); it != rem.end(); it++) {
- keys.push_back(it->first);
- tot += it->second->count();
+ keys.push_back(it->first);
+ tot += it->second->count();
}
cerr << " " << tot << " remainder sequences left to merge.\n";
if (max_rem_dist <= 0) {
- cerr << " Matched 0 remainder reads; unable to match " << tot << " remainder reads.\n";
- return 0;
+ cerr << " Matched 0 remainder reads; unable to match " << tot << " remainder reads.\n";
+ return 0;
}
//
@@ -206,60 +598,54 @@ int merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem) {
if (set_kmer_len) kmer_len = determine_kmer_length(con_len, max_rem_dist);
int num_kmers = con_len - kmer_len + 1;
- cerr << " Distance allowed between stacks: " << max_rem_dist << "\n"
- << " Using a k-mer length of " << kmer_len << "\n"
- << " Number of kmers per sequence: " << num_kmers << "\n";
-
//
// Calculate the minimum number of matching k-mers required for a possible sequence match.
//
int min_hits = calc_min_kmer_matches(kmer_len, max_rem_dist, con_len, set_kmer_len ? true : false);
+ cerr << " Distance allowed between stacks: " << max_rem_dist
+ << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
+ << min_hits << " k-mer hits required.\n";
+
KmerHashMap kmer_map;
vector<char *> kmer_map_keys;
populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
int utilized = 0;
- //
- // Create a character buffer to hold the Rem sequence, this is faster
- // than repeatedly decoding the DNASeq buffers.
- //
- //it = rem.find(keys[0]);
- //char *buf = new char[it->second->seq->size + 1];
+ #pragma omp parallel private(it)
+ {
+ KmerHashMap::iterator h;
+ vector<char *> rem_kmers;
+ char *buf = new char[con_len + 1];
- #pragma omp parallel private(it, k)
- {
#pragma omp for schedule(dynamic)
- for (j = 0; j < (int) keys.size(); j++) {
- it = rem.find(keys[j]);
- Rem *r = it->second;
- char *buf = new char[r->seq->size + 1];
+ for (uint j = 0; j < keys.size(); j++) {
+ it = rem.find(keys[j]);
+ Rem *r = it->second;
//
// Generate the k-mers for this remainder sequence
//
- vector<char *> rem_kmers;
buf = r->seq->seq(buf);
- generate_kmers(buf, kmer_len, num_kmers, rem_kmers);
+ generate_kmers_lazily(buf, kmer_len, num_kmers, rem_kmers);
map<int, int> hits;
- vector<int>::iterator map_it;
//
// Lookup the occurances of each remainder k-mer in the MergedStack k-mer map
//
- for (k = 0; k < num_kmers; k++) {
- if (kmer_map.find(rem_kmers[k]) != kmer_map.end())
- for (map_it = kmer_map[rem_kmers[k]].begin();
- map_it != kmer_map[rem_kmers[k]].end();
- map_it++)
- hits[*map_it]++;
+ for (uint k = 0; k < num_kmers; k++) {
+ h = kmer_map.find(rem_kmers[k]);
+
+ if (h != kmer_map.end())
+ for (uint n = 0; n < h->second.size(); n++)
+ hits[h->second[n]]++;
}
//
// Iterate through the list of hits. For each hit that has more than min_hits
// check its full length to verify a match.
//
- map<int, int> dists;
+ map<int, int> dists;
map<int, int>::iterator hit_it;
for (hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {
if (hit_it->second < min_hits) continue;
@@ -274,42 +660,42 @@ int merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem) {
}
}
- //
- // Free the k-mers we generated for this remainder read
- //
- for (k = 0; k < num_kmers; k++)
- delete [] rem_kmers[k];
-
- // Check to see if there is a uniquely low distance, if so,
- // merge this remainder tag. If not, discard it, since we
- // can't locate a single best-fitting Stack to merge it into.
- map<int, int>::iterator s;
- int min_id = -1;
- int count = 0;
- int dist = max_rem_dist + 1;
-
- for (s = dists.begin(); s != dists.end(); s++) {
- if ((*s).second < dist) {
- min_id = (*s).first;
- count = 1;
- dist = (*s).second;
- } else if ((*s).second == dist) {
- count++;
- }
- }
-
- delete [] buf;
-
- // Found a merge partner.
- if (min_id >= 0 && count == 1) {
- r->utilized = true;
+ // Check to see if there is a uniquely low distance, if so,
+ // merge this remainder tag. If not, discard it, since we
+ // can't locate a single best-fitting Stack to merge it into.
+ map<int, int>::iterator s;
+ int min_id = -1;
+ int count = 0;
+ int dist = max_rem_dist + 1;
+
+ for (s = dists.begin(); s != dists.end(); s++) {
+ if ((*s).second < dist) {
+ min_id = (*s).first;
+ count = 1;
+ dist = (*s).second;
+ } else if ((*s).second == dist) {
+ count++;
+ }
+ }
+
+ // Found a merge partner.
+ if (min_id >= 0 && count == 1) {
+ r->utilized = true;
#pragma omp critical
- {
- merged[min_id]->remtags.push_back(it->first);
- utilized += it->second->count();
- }
- }
- }
+ {
+ merged[min_id]->remtags.push_back(it->first);
+ utilized += it->second->count();
+ }
+ }
+ }
+
+ //
+ // Free the k-mers we generated for this remainder read
+ //
+ for (uint k = 0; k < rem_kmers.size(); k++)
+ delete [] rem_kmers[k];
+
+ delete [] buf;
}
free_kmer_hash(kmer_map, kmer_map_keys);
@@ -320,50 +706,54 @@ int merge_remainders(map<int, MergedStack *> &merged, map<int, Rem *> &rem) {
return 0;
}
-int call_alleles(MergedStack *mtag, vector<DNASeq *> &reads, vector<read_type> &read_types) {
+int
+call_alleles(MergedStack *mtag, vector<DNANSeq *> &reads, vector<read_type> &read_types)
+{
int row;
int height = reads.size();
string allele;
- DNASeq *d;
+ DNANSeq *d;
char base;
vector<SNP *>::iterator snp;
for (row = 0; row < height; row++) {
- allele.clear();
+ allele.clear();
- uint snp_cnt = 0;
+ uint snp_cnt = 0;
- //
- // Only call a haplotype from primary reads.
- //
- if (!call_sec_hapl && read_types[row] == secondary) continue;
+ //
+ // Only call a haplotype from primary reads.
+ //
+ if (!call_sec_hapl && read_types[row] == secondary) continue;
- for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
- if ((*snp)->type != snp_type_het) continue;
+ for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
+ if ((*snp)->type != snp_type_het) continue;
- snp_cnt++;
+ snp_cnt++;
d = reads[row];
- base = (*d)[(*snp)->col];
-
- //
- // Check to make sure the nucleotide at the location of this SNP is
- // of one of the two possible states the multinomial model called.
- //
- if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
- allele += base;
- else
- break;
- }
-
- if (snp_cnt > 0 && allele.length() == snp_cnt)
- mtag->alleles[allele]++;
+ base = (*d)[(*snp)->col];
+
+ //
+ // Check to make sure the nucleotide at the location of this SNP is
+ // of one of the two possible states the multinomial model called.
+ //
+ if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
+ allele += base;
+ else
+ break;
+ }
+
+ if (snp_cnt > 0 && allele.length() == snp_cnt)
+ mtag->alleles[allele]++;
}
return 0;
}
-int call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem, bool invoke_model) {
+int
+call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem, bool invoke_model)
+{
//
// OpenMP can't parallelize random access iterators, so we convert
// our map to a vector of integer keys.
@@ -371,101 +761,122 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, m
map<int, MergedStack *>::iterator it;
vector<int> keys;
for (it = merged.begin(); it != merged.end(); it++)
- keys.push_back(it->first);
+ keys.push_back(it->first);
int i;
#pragma omp parallel private(i)
{
- MergedStack *mtag;
- Stack *utag;
- Rem *r;
+ MergedStack *mtag;
+ Stack *utag;
+ Rem *r;
#pragma omp for schedule(dynamic)
- for (i = 0; i < (int) keys.size(); i++) {
- mtag = merged[keys[i]];
-
- //
- // Create a two-dimensional array, each row containing one read. For
- // each unique tag that has been merged together, add the sequence for
- // that tag into our array as many times as it originally occurred.
- //
- vector<int>::iterator j;
- vector<DNASeq *> reads;
- vector<read_type> read_types;
-
- for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
- utag = unique[*j];
-
- for (uint k = 0; k < utag->count(); k++) {
- reads.push_back(utag->seq);
- read_types.push_back(primary);
- }
- }
-
- // For each remainder tag that has been merged into this Stack, add the sequence.
- for (j = mtag->remtags.begin(); j != mtag->remtags.end(); j++) {
- r = rem[*j];
-
- for (uint k = 0; k < r->count(); k++) {
- reads.push_back(r->seq);
- read_types.push_back(secondary);
- }
- }
-
- //
- // Iterate over each column of the array and call the consensus base.
- //
- int row, col;
- int length = reads[0]->size;
- int height = reads.size();
- string con;
- map<char, int> nuc;
- map<char, int>::iterator max, n;
- DNASeq *d;
-
- for (col = 0; col < length; col++) {
- nuc['A'] = 0;
- nuc['G'] = 0;
- nuc['C'] = 0;
- nuc['T'] = 0;
-
- for (row = 0; row < height; row++) {
- d = reads[row];
- if (nuc.count((*d)[col]))
- nuc[(*d)[col]]++;
- }
-
- //
- // Find the base with a plurality of occurances and call it.
- //
- max = nuc.end();
-
- for (n = nuc.begin(); n != nuc.end(); n++) {
-
- if (max == nuc.end() || n->second > max->second)
- max = n;
- }
- con += max->first;
-
- //
- // Search this column for the presence of a SNP
- //
- if (invoke_model)
- switch(model_type) {
- case snp:
+ for (i = 0; i < (int) keys.size(); i++) {
+ mtag = merged[keys[i]];
+
+ //
+ // Create a two-dimensional array, each row containing one read. For
+ // each unique tag that has been merged together, add the sequence for
+ // that tag into our array as many times as it originally occurred.
+ //
+ vector<int>::iterator j;
+ vector<DNANSeq *> reads;
+ vector<read_type> read_types;
+
+ for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
+ utag = unique[*j];
+
+ for (uint k = 0; k < utag->count(); k++) {
+ reads.push_back(utag->seq);
+ read_types.push_back(primary);
+ }
+ }
+
+ // For each remainder tag that has been merged into this Stack, add the sequence.
+ for (j = mtag->remtags.begin(); j != mtag->remtags.end(); j++) {
+ r = rem[*j];
+
+ for (uint k = 0; k < r->count(); k++) {
+ reads.push_back(r->seq);
+ read_types.push_back(secondary);
+ }
+ }
+
+ //
+ // Iterate over each column of the array and call the consensus base.
+ //
+ uint row, col;
+ uint length = reads[0]->size();
+ uint height = reads.size();
+ string con;
+ map<char, int> nuc;
+ map<char, int>::iterator max, n;
+ DNANSeq *d;
+
+ uint cur_gap = mtag->gaps.size() > 0 ? 0 : 1;
+
+ for (col = 0; col < length; col++) {
+ //
+ // Don't invoke the model within gaps.
+ //
+ if (cur_gap < mtag->gaps.size() && col == mtag->gaps[cur_gap].start) {
+ do {
+ con += 'N';
+ SNP *snp = new SNP;
+ snp->type = snp_type_unk;
+ snp->col = col;
+ snp->rank_1 = '-';
+ snp->rank_2 = '-';
+ mtag->snps.push_back(snp);
+ col++;
+ } while (col < mtag->gaps[cur_gap].end && col < length);
+ col--;
+ cur_gap++;
+ continue;
+ }
+
+ nuc['A'] = 0;
+ nuc['G'] = 0;
+ nuc['C'] = 0;
+ nuc['T'] = 0;
+
+ for (row = 0; row < height; row++) {
+ d = reads[row];
+ if (nuc.count((*d)[col]))
+ nuc[(*d)[col]]++;
+ }
+
+ //
+ // Find the base with a plurality of occurances and call it.
+ //
+ max = nuc.end();
+
+ for (n = nuc.begin(); n != nuc.end(); n++) {
+
+ if (max == nuc.end() || n->second > max->second)
+ max = n;
+ }
+ con += max->first;
+
+ //
+ // Search this column for the presence of a SNP
+ //
+ if (invoke_model)
+ switch(model_type) {
+ case snp:
call_multinomial_snp(mtag, col, nuc, true);
- break;
- case bounded:
- call_bounded_multinomial_snp(mtag, col, nuc, true);
- break;
- case fixed:
+ break;
+ case bounded:
+ call_bounded_multinomial_snp(mtag, col, nuc, true);
+ break;
+ case fixed:
call_multinomial_fixed(mtag, col, nuc);
- break;
- }
- }
+ break;
+ }
+ }
- if (invoke_model) {
- call_alleles(mtag, reads, read_types);
+ if (invoke_model) {
+ call_alleles(mtag, reads, read_types);
if (model_type == fixed) {
//
@@ -473,20 +884,22 @@ int call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, m
//
vector<SNP *>::iterator s;
for (s = mtag->snps.begin(); s != mtag->snps.end(); s++) {
- if ((*s)->type == snp_type_unk)
- con.replace((*s)->col, 1, "N");
+ if ((*s)->type == snp_type_unk)
+ con.replace((*s)->col, 1, "N");
}
}
}
- mtag->add_consensus(con.c_str());
- }
+ mtag->add_consensus(con.c_str());
+ }
}
return 0;
}
-int populate_merged_tags(map<int, Stack *> &unique, map<int, MergedStack *> &merged) {
+int
+populate_merged_tags(map<int, Stack *> &unique, map<int, MergedStack *> &merged)
+{
map<int, Stack *>::iterator i;
map<int, MergedStack *>::iterator it_new, it_old;
Stack *utag;
@@ -496,25 +909,27 @@ int populate_merged_tags(map<int, Stack *> &unique, map<int, MergedStack *> &mer
it_old = merged.begin();
for (i = unique.begin(); i != unique.end(); i++) {
- utag = (*i).second;
- mtag = new MergedStack;
-
- mtag->id = k;
- mtag->count = utag->count();
- mtag->utags.push_back(utag->id);
- mtag->add_consensus(utag->seq);
-
- // Insert the new MergedStack giving a hint as to which position
- // to insert it at.
- it_new = merged.insert(it_old, pair<int, MergedStack *>(k, mtag));
- it_old = it_new;
- k++;
+ utag = (*i).second;
+ mtag = new MergedStack;
+
+ mtag->id = k;
+ mtag->count = utag->count();
+ mtag->utags.push_back(utag->id);
+ mtag->add_consensus(utag->seq);
+
+ // Insert the new MergedStack giving a hint as to which position
+ // to insert it at.
+ it_new = merged.insert(it_old, pair<int, MergedStack *>(k, mtag));
+ it_old = it_new;
+ k++;
}
return 0;
}
-int merge_stacks(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, MergedStack *> &merged, set<int> &merge_map, int round) {
+int
+merge_stacks(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, MergedStack *> &merged, set<int> &merge_map, int round)
+{
map<int, MergedStack *> new_merged;
map<int, MergedStack *>::iterator it, it_old, it_new;
MergedStack *tag_1, *tag_2;
@@ -527,157 +942,157 @@ int merge_stacks(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Merge
uint blist_cnt = 0;
for (it = merged.begin(); it != merged.end(); it++) {
- tag_1 = it->second;
+ tag_1 = it->second;
- //
- // This tag may already have been merged by an earlier operation.
- //
- if (merge_map.count(tag_1->id) > 0)
- continue;
+ //
+ // This tag may already have been merged by an earlier operation.
+ //
+ if (merge_map.count(tag_1->id) > 0)
+ continue;
- queue<int> merge_list;
- pair<set<int>::iterator,bool> ret;
- vector<pair<int, int> >::iterator k;
+ queue<int> merge_list;
+ pair<set<int>::iterator,bool> ret;
+ vector<pair<int, int> >::iterator k;
- merge_lists.push_back(set<int>());
+ merge_lists.push_back(set<int>());
- if (tag_1->masked) {
- merge_lists[index].insert(tag_1->id);
- index++;
- continue;
- }
+ if (tag_1->masked) {
+ merge_lists[index].insert(tag_1->id);
+ index++;
+ continue;
+ }
- //
- // Construct a list of MergedStacks that are within a particular distance
- // of this tag.
- //
- merge_lists[index].insert(tag_1->id);
- merge_list.push(tag_1->id);
-
- while (!merge_list.empty()) {
- tag_2 = merged[merge_list.front()];
- merge_list.pop();
-
- for (k = tag_2->dist.begin(); k != tag_2->dist.end(); k++) {
- ret = merge_lists[index].insert(k->first);
-
- //
- // If this Tag has not already been added to the merge list (i.e. we were able
- // to insert it in to our unique_merge_list, which is a set), add it for consideration
- // later in the loop.
- //
- if (ret.second == true)
- merge_list.push((*k).first);
- }
- }
+ //
+ // Construct a list of MergedStacks that are within a particular distance
+ // of this tag.
+ //
+ merge_lists[index].insert(tag_1->id);
+ merge_list.push(tag_1->id);
- //
- // Record the nodes that have been merged in this round.
- //
- set<int>::iterator j;
- for (j = merge_lists[index].begin(); j != merge_lists[index].end(); j++)
- merge_map.insert(*j);
+ while (!merge_list.empty()) {
+ tag_2 = merged[merge_list.front()];
+ merge_list.pop();
+
+ for (k = tag_2->dist.begin(); k != tag_2->dist.end(); k++) {
+ ret = merge_lists[index].insert(k->first);
- index++;
+ //
+ // If this Tag has not already been added to the merge list (i.e. we were able
+ // to insert it in to our unique_merge_list, which is a set), add it for consideration
+ // later in the loop.
+ //
+ if (ret.second == true)
+ merge_list.push((*k).first);
+ }
+ }
+
+ //
+ // Record the nodes that have been merged in this round.
+ //
+ set<int>::iterator j;
+ for (j = merge_lists[index].begin(); j != merge_lists[index].end(); j++)
+ merge_map.insert(*j);
+
+ index++;
}
#pragma omp parallel private(tag_1, tag_2)
{
- vector<MergedStack *> merged_tags;
+ vector<MergedStack *> merged_tags;
#pragma omp for reduction(+:delev_cnt) reduction(+:blist_cnt)
- for (uint index = 0; index < merge_lists.size(); index++) {
- //
- // Deal with the simple case of a single locus that does not need to be merged.
- //
- if (merge_lists[index].size() == 1) {
- tag_1 = merged[*(merge_lists[index].begin())];
- tag_2 = merge_tags(merged, merge_lists[index], 0);
-
- //
- // If this tag is masked, keep the old cohort_id.
- //
- if (tag_1->masked) {
- tag_2->cohort_id = tag_1->cohort_id;
- } else {
- tag_2->cohort_id = cohort_id;
- #pragma omp atomic
- cohort_id++;
- }
- merged_tags.push_back(tag_2);
- continue;
- }
-
- //
- // Break large loci down by constructing a minimum
- // spanning tree and severing long distance edges.
- //
- if (deleverage_stacks) {
- vector<MergedStack *> tags;
- bool delev;
-
- deleverage(unique, rem, merged, merge_lists[index], cohort_id, tags);
-
- if (tags.size() == 1) {
- delev = false;
- } else {
- delev_cnt++;
- delev = true;
- }
+ for (uint index = 0; index < merge_lists.size(); index++) {
+ //
+ // Deal with the simple case of a single locus that does not need to be merged.
+ //
+ if (merge_lists[index].size() == 1) {
+ tag_1 = merged[*(merge_lists[index].begin())];
+ tag_2 = merge_tags(merged, merge_lists[index], 0);
- for (uint t = 0; t < tags.size(); t++) {
- //tags[t]->id = id;
- tags[t]->deleveraged = delev;
+ //
+ // If this tag is masked, keep the old cohort_id.
+ //
+ if (tag_1->masked) {
+ tag_2->cohort_id = tag_1->cohort_id;
+ } else {
+ tag_2->cohort_id = cohort_id;
+ #pragma omp atomic
+ cohort_id++;
+ }
+ merged_tags.push_back(tag_2);
+ continue;
+ }
- if (tags[t]->utags.size() > max_subgraph) {
- tags[t]->masked = true;
- tags[t]->blacklisted = true;
- blist_cnt++;
- }
+ //
+ // Break large loci down by constructing a minimum
+ // spanning tree and severing long distance edges.
+ //
+ if (deleverage_stacks) {
+ vector<MergedStack *> tags;
+ bool delev;
- //new_merged.insert(pair<int, MergedStack *>(id, tags[t]));
- merged_tags.push_back(tags[t]);
- //id++;
- }
+ deleverage(unique, rem, merged, merge_lists[index], cohort_id, tags);
- #pragma omp atomic
- cohort_id++;
+ if (tags.size() == 1) {
+ delev = false;
+ } else {
+ delev_cnt++;
+ delev = true;
+ }
- } else {
- //
- // If not deleveraging, merge these tags together into a new MergedStack object.
- //
- tag_2 = merge_tags(merged, merge_lists[index], 0);
- tag_2->cohort_id = cohort_id;
+ for (uint t = 0; t < tags.size(); t++) {
+ //tags[t]->id = id;
+ tags[t]->deleveraged = delev;
- if (tag_2->utags.size() > max_subgraph) {
- tag_2->masked = true;
- tag_2->blacklisted = true;
- blist_cnt++;
- }
+ if (tags[t]->utags.size() > max_subgraph) {
+ tags[t]->masked = true;
+ tags[t]->blacklisted = true;
+ blist_cnt++;
+ }
- //new_merged.insert(pair<int, MergedStack *>(id, tag_2));
- merged_tags.push_back(tag_2);
+ //new_merged.insert(pair<int, MergedStack *>(id, tags[t]));
+ merged_tags.push_back(tags[t]);
+ //id++;
+ }
#pragma omp atomic
- cohort_id++;
- //id++;
- }
- }
+ cohort_id++;
- //
- // Merge the accumulated tags into the new_merged map.
- //
+ } else {
+ //
+ // If not deleveraging, merge these tags together into a new MergedStack object.
+ //
+ tag_2 = merge_tags(merged, merge_lists[index], 0);
+ tag_2->cohort_id = cohort_id;
+
+ if (tag_2->utags.size() > max_subgraph) {
+ tag_2->masked = true;
+ tag_2->blacklisted = true;
+ blist_cnt++;
+ }
+
+ //new_merged.insert(pair<int, MergedStack *>(id, tag_2));
+ merged_tags.push_back(tag_2);
+
+ #pragma omp atomic
+ cohort_id++;
+ //id++;
+ }
+ }
+
+ //
+ // Merge the accumulated tags into the new_merged map.
+ //
#pragma omp critical
- {
- it_old = merged.begin();
- for (uint j = 0; j < merged_tags.size(); j++) {
- merged_tags[j]->id = id;
- it_new = new_merged.insert(it_old, pair<int, MergedStack *>(id, merged_tags[j]));
- it_old = it_new;
- id++;
- }
- }
+ {
+ it_old = merged.begin();
+ for (uint j = 0; j < merged_tags.size(); j++) {
+ merged_tags[j]->id = id;
+ it_new = new_merged.insert(it_old, pair<int, MergedStack *>(id, merged_tags[j]));
+ it_old = it_new;
+ id++;
+ }
+ }
}
uint new_cnt = new_merged.size();
@@ -687,17 +1102,50 @@ int merge_stacks(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Merge
// Free the memory from the old map of merged tags.
//
for (it = merged.begin(); it != merged.end(); it++)
- delete it->second;
+ delete it->second;
merged = new_merged;
cerr << " " << old_cnt << " stacks merged into " << new_cnt
- << " stacks; deleveraged " << delev_cnt
- << " stacks; removed " << blist_cnt << " stacks.\n";
+ << " stacks; deleveraged " << delev_cnt
+ << " stacks; removed " << blist_cnt << " stacks.\n";
return 0;
}
+MergedStack *
+merge_tags(MergedStack *tag_1, MergedStack *tag_2, int id)
+{
+ MergedStack *new_tag;
+
+ new_tag = new MergedStack;
+ new_tag->id = id;
+
+ new_tag->deleveraged = tag_2->deleveraged || tag_1->deleveraged;
+ new_tag->masked = tag_2->masked || tag_1->masked;
+ new_tag->blacklisted = tag_2->blacklisted || tag_1->blacklisted;
+ new_tag->gappedlumberjack = tag_2->gappedlumberjack || tag_1->gappedlumberjack;
+ new_tag->lumberjackstack = tag_2->lumberjackstack || tag_1->lumberjackstack;
+
+ for (uint i = 0; i < tag_1->utags.size(); i++)
+ new_tag->utags.push_back(tag_1->utags[i]);
+
+ for (uint i = 0; i < tag_1->remtags.size(); i++)
+ new_tag->remtags.push_back(tag_1->remtags[i]);
+
+ new_tag->count = tag_1->count;
+
+ for (uint i = 0; i < tag_2->utags.size(); i++)
+ new_tag->utags.push_back(tag_2->utags[i]);
+
+ for (uint i = 0; i < tag_2->remtags.size(); i++)
+ new_tag->remtags.push_back(tag_2->remtags[i]);
+
+ new_tag->count += tag_2->count;
+
+ return new_tag;
+}
+
MergedStack *merge_tags(map<int, MergedStack *> &merged, set<int> &merge_list, int id) {
set<int>::iterator i;
vector<int>::iterator j;
@@ -707,18 +1155,18 @@ MergedStack *merge_tags(map<int, MergedStack *> &merged, set<int> &merge_list, i
tag_1->id = id;
for (i = merge_list.begin(); i != merge_list.end(); i++) {
- tag_2 = merged[(*i)];
+ tag_2 = merged[(*i)];
- tag_1->deleveraged = tag_2->deleveraged ? true : tag_1->deleveraged;
- tag_1->masked = tag_2->masked ? true : tag_1->masked;
- tag_1->blacklisted = tag_2->blacklisted ? true : tag_1->blacklisted;
- tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
+ tag_1->deleveraged = tag_2->deleveraged ? true : tag_1->deleveraged;
+ tag_1->masked = tag_2->masked ? true : tag_1->masked;
+ tag_1->blacklisted = tag_2->blacklisted ? true : tag_1->blacklisted;
+ tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
- for (j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
- tag_1->utags.push_back(*j);
+ for (j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
+ tag_1->utags.push_back(*j);
- for (j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
- tag_1->remtags.push_back(*j);
+ for (j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
+ tag_1->remtags.push_back(*j);
tag_1->count += tag_2->count;
}
@@ -735,18 +1183,18 @@ MergedStack *merge_tags(map<int, MergedStack *> &merged, int *merge_list, int me
tag_1->id = id;
for (i = 0; i < merge_list_size; i++) {
- tag_2 = merged[merge_list[i]];
+ tag_2 = merged[merge_list[i]];
- tag_1->deleveraged = tag_2->deleveraged ? true : tag_1->deleveraged;
- tag_1->masked = tag_2->masked ? true : tag_1->masked;
- tag_1->blacklisted = tag_2->blacklisted ? true : tag_1->blacklisted;
- tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
+ tag_1->deleveraged = tag_2->deleveraged ? true : tag_1->deleveraged;
+ tag_1->masked = tag_2->masked ? true : tag_1->masked;
+ tag_1->blacklisted = tag_2->blacklisted ? true : tag_1->blacklisted;
+ tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
- for (j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
- tag_1->utags.push_back(*j);
+ for (j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
+ tag_1->utags.push_back(*j);
- for (j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
- tag_1->remtags.push_back(*j);
+ for (j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
+ tag_1->remtags.push_back(*j);
tag_1->count += tag_2->count;
}
@@ -754,7 +1202,9 @@ MergedStack *merge_tags(map<int, MergedStack *> &merged, int *merge_list, int me
return tag_1;
}
-int remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *> &merged) {
+int
+remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *> &merged)
+{
//
// If enabled, check the depth of coverage of each unique tag, and remove
// from consideration any tags with depths greater than removal_trigger. These tags
@@ -771,70 +1221,92 @@ int remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *>
//
// First, iterate through the stacks and populate a list of tags that will be removed
- // (those above the removal_trigger and those 1 nucleotide away). If we don't construct
- // this list first, we will inadvertantly merge short stacks that end up being a
- // single nucleotide away from one of the lumberjack stacks found later in the process.
+ // (those above the removal_trigger and those 1 nucleotide away). Sort the list of
+ // stacks so that we process them from largest depth to shortest so the same stacks
+ // are always merged/removed.
//
+ vector<pair<int, int> > ordered_tags;
+ for (i = merged.begin(); i != merged.end(); i++) {
+ if (i->second->count > removal_trigger)
+ ordered_tags.push_back(make_pair(i->second->id, i->second->count));
+ }
+ sort(ordered_tags.begin(), ordered_tags.end(), compare_pair_intint);
+ pair<set<int>::iterator,bool> ret;
int id = 0;
//
// Merge all stacks that are over the removal trigger with their nearest neighbors and
// mask them so they are not further processed by the program.
//
- for (i = merged.begin(); i != merged.end(); i++) {
- tag_1 = i->second;
+ for (uint j = 0; j < ordered_tags.size(); j++) {
+ tag_1 = merged[ordered_tags[j].first];
- //
- // Don't process a tag that has already been merged.
- //
+ //
+ // Don't process a tag that has already been merged.
+ //
if (already_merged.count(tag_1->id) > 0)
- continue;
-
- if (tag_1->count > removal_trigger) {
- set<int> unique_merge_list;
- unique_merge_list.insert(tag_1->id);
- already_merged.insert(tag_1->id);
-
- for (k = tag_1->dist.begin(); k != tag_1->dist.end(); k++) {
- if (already_merged.count(k->first) == 0) {
- already_merged.insert(k->first);
- unique_merge_list.insert(k->first);
+ continue;
+
+ //
+ // Construct a list of MergedStacks that are either:
+ // within a distance of 1 nucleotide of this tag, or
+ // are themselves above the lumberjack stack limit.
+ //
+ queue<int> merge_queue;
+ set<int> merge_list;
+ merge_queue.push(tag_1->id);
+ merge_list.insert(tag_1->id);
+ already_merged.insert(tag_1->id);
+
+ while (!merge_queue.empty()) {
+ tag_2 = merged[merge_queue.front()];
+ merge_queue.pop();
+
+ if (tag_2->count < removal_trigger)
+ continue;
+
+ for (k = tag_2->dist.begin(); k != tag_2->dist.end(); k++) {
+ ret = already_merged.insert(k->first);
+
+ if (ret.second == true) {
+ merge_queue.push(k->first);
+ merge_list.insert(k->first);
}
- }
-
- tag_1->lumberjackstack = true;
- tag_1->masked = true;
- tag_1->blacklisted = true;
-
- //
- // Merge these tags together into a new MergedStack object.
- //
- tag_2 = merge_tags(merged, unique_merge_list, id);
- tag_2->add_consensus(tag_1->con);
-
- new_merged.insert(make_pair(id, tag_2));
- id++;
- }
+ }
+ }
+
+ //
+ // Merge these tags together into a new MergedStack object.
+ //
+ tag_2 = merge_tags(merged, merge_list, id);
+ tag_2->add_consensus(tag_1->con);
+
+ tag_2->lumberjackstack = true;
+ tag_2->masked = true;
+ tag_2->blacklisted = true;
+
+ new_merged.insert(make_pair(id, tag_2));
+ id++;
}
//
// Move the non-lumberjack stacks, unmodified, into the new merged map.
//
for (i = merged.begin(); i != merged.end(); i++) {
- tag_1 = i->second;
+ tag_1 = i->second;
- if (already_merged.count(tag_1->id) > 0)
- continue;
+ if (already_merged.count(tag_1->id) > 0)
+ continue;
- set<int> unique_merge_list;
- unique_merge_list.insert(tag_1->id);
+ set<int> merge_list;
+ merge_list.insert(tag_1->id);
- tag_2 = merge_tags(merged, unique_merge_list, id);
- tag_2->add_consensus(tag_1->con);
+ tag_2 = merge_tags(merged, merge_list, id);
+ tag_2->add_consensus(tag_1->con);
- new_merged.insert(make_pair(id, tag_2));
- id++;
+ new_merged.insert(make_pair(id, tag_2));
+ id++;
}
cerr << " Removed " << already_merged.size() << " stacks.\n";
@@ -844,7 +1316,7 @@ int remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *>
//
map<int, MergedStack *>::iterator it;
for (it = merged.begin(); it != merged.end(); it++)
- delete it->second;
+ delete it->second;
merged = new_merged;
@@ -855,10 +1327,10 @@ int remove_repetitive_stacks(map<int, Stack *> &unique, map<int, MergedStack *>
int deleverage(map<int, Stack *> &unique,
map<int, Rem *> &rem,
- map<int, MergedStack *> &merged,
- set<int> &merge_list,
- int cohort_id,
- vector<MergedStack *> &deleveraged_tags) {
+ map<int, MergedStack *> &merged,
+ set<int> &merge_list,
+ int cohort_id,
+ vector<MergedStack *> &deleveraged_tags) {
set<int>::iterator it;
vector<pair<int, int> >::iterator j;
MergedStack *tag_1, *tag_2;
@@ -872,11 +1344,11 @@ int deleverage(map<int, Stack *> &unique,
vector<int> keys;
for (it = merge_list.begin(); it != merge_list.end(); it++) {
- keys.push_back(*it);
+ keys.push_back(*it);
mst->add_node(*it);
tag_1 = merged[*it];
- // cerr << " " << *it << " -> " << tag_1->utags[0] << "\n";
+ // cerr << " " << *it << " -> " << tag_1->utags[0] << "\n";
}
//
@@ -888,15 +1360,15 @@ int deleverage(map<int, Stack *> &unique,
tag_1 = merged[keys[k]];
n_1 = mst->node(keys[k]);
- for (l = k+1; l < keys.size(); l++) {
- tag_2 = merged[keys[l]];
+ for (l = k+1; l < keys.size(); l++) {
+ tag_2 = merged[keys[l]];
n_2 = mst->node(keys[l]);
- int d = dist(tag_1, tag_2);
+ int d = dist(tag_1, tag_2);
n_1->add_edge(mst->node(keys[l]), d);
n_2->add_edge(mst->node(keys[k]), d);
- }
+ }
}
//
@@ -933,17 +1405,17 @@ int deleverage(map<int, Stack *> &unique,
for (uint i = 0; i < n->min_adj_list.size(); i++) {
if (visited.count(n->min_adj_list[i]->id) == 0) {
q.push(n->min_adj_list[i]);
- // cerr << n->id << " -> " << n->min_adj_list[i]->id << ": ";
-
- //
- // Find the edge distance.
- //
- for (uint j = 0; j < n->edges.size(); j++)
- if (n->edges[j]->child == n->min_adj_list[i]) {
- // cerr << n->edges[j]->dist << "\n";
- dists.insert(n->edges[j]->dist);
- }
- }
+ // cerr << n->id << " -> " << n->min_adj_list[i]->id << ": ";
+
+ //
+ // Find the edge distance.
+ //
+ for (uint j = 0; j < n->edges.size(); j++)
+ if (n->edges[j]->child == n->min_adj_list[i]) {
+ // cerr << n->edges[j]->dist << "\n";
+ dists.insert(n->edges[j]->dist);
+ }
+ }
}
}
@@ -952,10 +1424,10 @@ int deleverage(map<int, Stack *> &unique,
// distance separating stacks.
//
if (dists.size() == 1) {
- tag_1 = merge_tags(merged, merge_list, 0);
- deleveraged_tags.push_back(tag_1);
- delete mst;
- return 0;
+ tag_1 = merge_tags(merged, merge_list, 0);
+ deleveraged_tags.push_back(tag_1);
+ delete mst;
+ return 0;
}
uint min_dist = *(dists.begin());
@@ -980,33 +1452,33 @@ int deleverage(map<int, Stack *> &unique,
if (visited.count(n->min_adj_list[i]->id) == 0) {
q.push(n->min_adj_list[i]);
- for (uint j = 0; j < n->edges.size(); j++) {
- if (n->edges[j]->child == n->min_adj_list[i])
- if (n->edges[j]->dist > min_dist) {
-
- // cerr << "Merging the following stacks into a locus:\n";
- for (it = uniq_merge_list.begin(); it != uniq_merge_list.end(); it++) {
- tag_1 = merged[*it];
- // cerr << " " << *it << " -> " << tag_1->utags[0] << "\n";
- }
-
- tag_1 = merge_tags(merged, uniq_merge_list, id);
- tag_1->cohort_id = cohort_id;
- deleveraged_tags.push_back(tag_1);
- uniq_merge_list.clear();
- id++;
- }
- }
+ for (uint j = 0; j < n->edges.size(); j++) {
+ if (n->edges[j]->child == n->min_adj_list[i])
+ if (n->edges[j]->dist > min_dist) {
+
+ // cerr << "Merging the following stacks into a locus:\n";
+ for (it = uniq_merge_list.begin(); it != uniq_merge_list.end(); it++) {
+ tag_1 = merged[*it];
+ // cerr << " " << *it << " -> " << tag_1->utags[0] << "\n";
+ }
+
+ tag_1 = merge_tags(merged, uniq_merge_list, id);
+ tag_1->cohort_id = cohort_id;
+ deleveraged_tags.push_back(tag_1);
+ uniq_merge_list.clear();
+ id++;
+ }
+ }
- uniq_merge_list.insert(n->min_adj_list[i]->id);
- }
+ uniq_merge_list.insert(n->min_adj_list[i]->id);
+ }
}
}
// cerr << "Merging the following stacks into a locus:\n";
for (it = uniq_merge_list.begin(); it != uniq_merge_list.end(); it++) {
- tag_1 = merged[*it];
- // cerr << " " << *it << " -> " << tag_1->utags[0] << "\n";
+ tag_1 = merged[*it];
+ // cerr << " " << *it << " -> " << tag_1->utags[0] << "\n";
}
tag_1 = merge_tags(merged, uniq_merge_list, id);
@@ -1033,7 +1505,7 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
// our map to a vector of integer keys.
vector<int> keys;
for (it = merged.begin(); it != merged.end(); it++)
- keys.push_back(it->first);
+ keys.push_back(it->first);
//
// Calculate the number of k-mers we will generate. If kmer_len == 0,
@@ -1044,19 +1516,24 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
kmer_len = determine_kmer_length(con_len, utag_dist);
int num_kmers = con_len - kmer_len + 1;
- cerr << " Distance allowed between stacks: " << utag_dist << "\n"
- << " Using a k-mer length of " << kmer_len << "\n"
- << " Number of kmers per sequence: " << num_kmers << "\n";
-
//
// Calculate the minimum number of matching k-mers required for a possible sequence match.
//
int min_hits = calc_min_kmer_matches(kmer_len, utag_dist, con_len, set_kmer_len ? true : false);
+ cerr << " Distance allowed between stacks: " << utag_dist
+ << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
+ << min_hits << " k-mer hits required.\n";
+
populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
#pragma omp parallel private(tag_1, tag_2)
{
+ KmerHashMap::iterator h;
+ vector<char *> query_kmers;
+
+ initialize_kmers(kmer_len, num_kmers, query_kmers);
+
#pragma omp for schedule(dynamic)
for (uint i = 0; i < keys.size(); i++) {
tag_1 = merged[keys[i]];
@@ -1064,8 +1541,7 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
// Don't compute distances for masked tags
if (tag_1->masked) continue;
- vector<char *> query_kmers;
- generate_kmers(tag_1->con, kmer_len, num_kmers, query_kmers);
+ generate_kmers_lazily(tag_1->con, kmer_len, num_kmers, query_kmers);
map<int, int> hits;
int d;
@@ -1073,17 +1549,12 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
// Lookup the occurances of each k-mer in the kmer_map
//
for (int j = 0; j < num_kmers; j++) {
- for (uint k = 0; k < kmer_map[query_kmers[j]].size(); k++)
- hits[kmer_map[query_kmers[j]][k]]++;
- }
-
- //
- // Free the k-mers we generated for this query
- //
- for (int j = 0; j < num_kmers; j++)
- delete [] query_kmers[j];
+ h = kmer_map.find(query_kmers[j]);
- // cerr << " Tag " << tag_1->id << " hit " << hits.size() << " kmers.\n";
+ if (h != kmer_map.end())
+ for (uint k = 0; k < h->second.size(); k++)
+ hits[h->second[k]]++;
+ }
//
// Iterate through the list of hits. For each hit that has more than min_hits
@@ -1091,12 +1562,9 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
//
map<int, int>::iterator hit_it;
for (hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {
- // cerr << " Tag " << hit_it->first << " has " << hit_it->second << " hits (min hits: " << min_hits << ")\n";
if (hit_it->second < min_hits) continue;
- // cerr << " Match found, checking full-length match\n";
-
tag_2 = merged[hit_it->first];
// Don't compute distances for masked tags
@@ -1106,7 +1574,6 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
if (tag_1 == tag_2) continue;
d = dist(tag_1, tag_2);
- // cerr << " Distance: " << d << "\n";
//
// Store the distance between these two sequences if it is
@@ -1120,6 +1587,12 @@ int calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist) {
// Sort the vector of distances.
sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
}
+
+ //
+ // Free the k-mers we generated for this query
+ //
+ for (int j = 0; j < query_kmers.size(); j++)
+ delete [] query_kmers[j];
}
free_kmer_hash(kmer_map, kmer_map_keys);
@@ -1140,47 +1613,47 @@ int calc_distance(map<int, MergedStack *> &merged, int utag_dist) {
// our map to a vector of integer keys.
vector<int> keys;
for (it = merged.begin(); it != merged.end(); it++)
- keys.push_back(it->first);
+ keys.push_back(it->first);
#pragma omp parallel private(i, j, tag_1, tag_2)
{
#pragma omp for schedule(dynamic)
- for (i = 0; i < (int) keys.size(); i++) {
+ for (i = 0; i < (int) keys.size(); i++) {
- tag_1 = merged[keys[i]];
+ tag_1 = merged[keys[i]];
- // Don't compute distances for masked tags
- if (tag_1->masked) continue;
+ // Don't compute distances for masked tags
+ if (tag_1->masked) continue;
- int d;
+ int d;
- for (j = 0; j < (int) keys.size(); j++) {
- tag_2 = merged[keys[j]];
+ for (j = 0; j < (int) keys.size(); j++) {
+ tag_2 = merged[keys[j]];
- // Don't compute distances for masked tags
- if (tag_2->masked) continue;
+ // Don't compute distances for masked tags
+ if (tag_2->masked) continue;
- // Don't compare tag_1 against itself.
- if (tag_1 == tag_2) continue;
+ // Don't compare tag_1 against itself.
+ if (tag_1 == tag_2) continue;
- d = dist(tag_1, tag_2);
+ d = dist(tag_1, tag_2);
//cerr << " Distance: " << d << "\n";
- //
- // Store the distance between these two sequences if it is
- // below the maximum distance (which governs those
- // sequences to be merged in the following step of the
- // algorithm.)
- //
- if (d == utag_dist) {
- tag_1->add_dist(tag_2->id, d);
+ //
+ // Store the distance between these two sequences if it is
+ // below the maximum distance (which governs those
+ // sequences to be merged in the following step of the
+ // algorithm.)
+ //
+ if (d == utag_dist) {
+ tag_1->add_dist(tag_2->id, d);
//cerr << " HIT.\n";
- }
- }
+ }
+ }
- // Sort the vector of distances.
- sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
- }
+ // Sort the vector of distances.
+ sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
+ }
}
return 0;
@@ -1192,41 +1665,41 @@ int reduce_radtags(DNASeqHashMap &radtags, map<int, Stack *> &unique, map<int, R
Rem *r;
Stack *u;
int global_id = 1;
-
+
for (it = radtags.begin(); it != radtags.end(); it++) {
- if (it->second.count() < min_merge_cov) {
- //
- // Don't record this unique RAD-Tag if its coverage is below
- // the specified cutoff. However, add the reads to the remainder
- // vector for later processing.
- //
- r = new Rem;
- r->id = global_id;
- r->add_seq(it->first);
-
- for (uint i = 0; i < it->second.ids.size(); i++)
- r->add_id(it->second.ids[i]);
-
- rem[r->id] = r;
- global_id++;
-
- } else {
- //
- // Populate a Stack object for this unique radtag. Create a
- // map of the IDs for the sequences that have been
- // collapsed into this radtag.
- //
- u = new Stack;
- u->id = global_id;
- u->add_seq(it->first);
-
- // Copy the original Fastq IDs from which this unique radtag was built.
- for (uint i = 0; i < it->second.ids.size(); i++)
- u->add_id(it->second.ids[i]);
-
- unique[u->id] = u;
- global_id++;
- }
+ if (it->second.count() < min_merge_cov) {
+ //
+ // Don't record this unique RAD-Tag if its coverage is below
+ // the specified cutoff. However, add the reads to the remainder
+ // vector for later processing.
+ //
+ r = new Rem;
+ r->id = global_id;
+ r->add_seq(it->first);
+
+ for (uint i = 0; i < it->second.ids.size(); i++)
+ r->add_id(it->second.ids[i]);
+
+ rem[r->id] = r;
+ global_id++;
+
+ } else {
+ //
+ // Populate a Stack object for this unique radtag. Create a
+ // map of the IDs for the sequences that have been
+ // collapsed into this radtag.
+ //
+ u = new Stack;
+ u->id = global_id;
+ u->add_seq(it->first);
+
+ // Copy the original Fastq IDs from which this unique radtag was built.
+ for (uint i = 0; i < it->second.ids.size(); i++)
+ u->add_id(it->second.ids[i]);
+
+ unique[u->id] = u;
+ global_id++;
+ }
}
if (unique.size() == 0) {
@@ -1238,40 +1711,46 @@ int reduce_radtags(DNASeqHashMap &radtags, map<int, Stack *> &unique, map<int, R
}
int
-free_radtags_hash(DNASeqHashMap &radtags, vector<DNASeq *> &radtags_keys)
+free_radtags_hash(DNASeqHashMap &radtags, vector<DNANSeq *> &radtags_keys)
{
for (uint i = 0; i < radtags_keys.size(); i++)
- delete radtags_keys[i];
+ delete radtags_keys[i];
radtags.clear();
return 0;
}
-int calc_coverage_distribution(map<int, Stack *> &unique, double &mean, double &stdev) {
+int
+calc_coverage_distribution(map<int, Stack *> &unique,
+ double &mean, double &stdev, double &max)
+{
map<int, Stack *>::iterator i;
double m = 0.0;
double s = 0.0;
double sum = 0.0;
- uint max = 0;
uint cnt = 0;
double total = 0.0;
+ mean = 0.0;
+ max = 0.0;
+ stdev = 0.0;
+
map<int, int> depth_dist;
map<int, int>::iterator j;
for (i = unique.begin(); i != unique.end(); i++) {
- cnt = i->second->count();
- m += cnt;
- total++;
+ cnt = i->second->count();
+ m += cnt;
+ total++;
depth_dist[cnt]++;
- if (cnt > max)
- max = cnt;
+ if (cnt > max)
+ max = cnt;
}
- mean = round(m / total);
+ mean = m / total;
//
// Calculate the standard deviation
@@ -1279,65 +1758,120 @@ int calc_coverage_distribution(map<int, Stack *> &unique, double &mean, double &
total = 0.0;
for (i = unique.begin(); i != unique.end(); i++) {
- total++;
- s = i->second->count();
- sum += pow((s - mean), 2);
+ total++;
+ s = i->second->count();
+ sum += pow((s - mean), 2);
}
stdev = sqrt(sum / (total - 1));
- cerr << " Mean coverage depth is " << mean << "; Std Dev: " << stdev << " Max: " << max << "\n";
+ return 0;
+}
+
+int
+calc_coverage_distribution(map<int, Stack *> &unique,
+ map<int, MergedStack *> &merged,
+ double &mean, double &stdev, double &max)
+{
+ map<int, MergedStack *>::iterator it;
+ vector<int>::iterator k;
+ Stack *tag;
+ double m = 0.0;
+ double s = 0.0;
+ double sum = 0.0;
+ double cnt = 0.0;
+
+ mean = 0.0;
+ max = 0.0;
+ stdev = 0.0;
+
+ for (it = merged.begin(); it != merged.end(); it++) {
+ if (it->second->blacklisted) continue;
+
+ cnt++;
+ m = 0.0;
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ m += tag->count();
+ }
+ if (m > max) max = m;
+
+ sum += m;
+ }
+
+ mean = sum / cnt; sum = 0.0; // reset accumulator before reusing it for the variance sum below
//
- // Output the distribution of stack depths
+ // Calculate the standard deviation
//
- //for (j = depth_dist.begin(); j != depth_dist.end(); j++)
- // cerr << j->first << "\t" << j->second << "\n";
+ for (it = merged.begin(); it != merged.end(); it++) {
+ s = 0.0;
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ s += tag->count();
+ }
+ sum += pow((s - mean), 2);
+ }
+
+ stdev = sqrt(sum / (cnt - 1));
return 0;
}
-double calc_merged_coverage_distribution(map<int, Stack *> &unique, map<int, MergedStack *> &merged) {
+int
+calc_coverage_distribution(map<int, Stack *> &unique,
+ map<int, Rem *> &rem,
+ map<int, MergedStack *> &merged,
+ double &mean, double &stdev, double &max)
+{
map<int, MergedStack *>::iterator it;
- vector<int>::iterator k;
+ vector<int>::iterator k;
Stack *tag;
- double m = 0.0;
- double s = 0.0;
- double sum = 0.0;
- double mean = 0.0;
- double max = 0.0;
- double stdev = 0.0;
+ double m = 0.0;
+ double s = 0.0;
+ double sum = 0.0;
+ double cnt = 0.0;
+
+ mean = 0.0;
+ max = 0.0;
+ stdev = 0.0;
for (it = merged.begin(); it != merged.end(); it++) {
- m = 0.0;
- for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
- tag = unique[*k];
- m += tag->count();
- }
- if (m > max) max = m;
-
- sum += m;
+ if (it->second->blacklisted) continue;
+
+ cnt++;
+ m = 0.0;
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ m += tag->count();
+ }
+ for (uint j = 0; j < it->second->remtags.size(); j++)
+ m += rem[it->second->remtags[j]]->count();
+
+ if (m > max) max = m;
+
+ sum += m;
}
- mean = sum / (double) merged.size();
+ mean = sum / cnt; sum = 0.0; // reset accumulator before reusing it for the variance sum below
//
// Calculate the standard deviation
//
for (it = merged.begin(); it != merged.end(); it++) {
- s = 0.0;
- for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
- tag = unique[*k];
- s += tag->count();
- }
- sum += pow((s - mean), 2);
+ s = 0.0;
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ s += tag->count();
+ }
+ for (uint j = 0; j < it->second->remtags.size(); j++)
+ s += rem[it->second->remtags[j]]->count();
+ sum += pow((s - mean), 2);
}
- stdev = sqrt(sum / (merged.size() - 1));
+ stdev = sqrt(sum / (cnt - 1));
- cerr << " Mean merged coverage depth is " << mean << "; Std Dev: " << stdev << "; Max: " << max << "\n";
-
- return mean;
+ return 0;
}
int count_raw_reads(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, MergedStack *> &merged) {
@@ -1351,17 +1885,17 @@ int count_raw_reads(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, Me
map<int, int>::iterator uit;
for (it = merged.begin(); it != merged.end(); it++) {
- for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
- tag = unique[*k];
- m += tag->count();
+ for (k = it->second->utags.begin(); k != it->second->utags.end(); k++) {
+ tag = unique[*k];
+ m += tag->count();
if (uniq_ids.count(*k) == 0)
uniq_ids[*k] = 0;
uniq_ids[*k]++;
- }
- for (uint j = 0; j < it->second->remtags.size(); j++)
- m += rem[it->second->remtags[j]]->count();
- //m += it->second->remtags.size();
+ }
+ for (uint j = 0; j < it->second->remtags.size(); j++)
+ m += rem[it->second->remtags[j]]->count();
+ //m += it->second->remtags.size();
}
for (uit = uniq_ids.begin(); uit != uniq_ids.end(); uit++)
@@ -1404,66 +1938,81 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
size_t pos_2 = in_file.find_last_of(".");
if (in_file.substr(pos_2) == ".gz") {
- in_file = in_file.substr(0, pos_2);
- pos_2 = in_file.find_last_of(".");
+ in_file = in_file.substr(0, pos_2);
+ pos_2 = in_file.find_last_of(".");
}
string tag_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".tags.tsv";
string snp_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".snps.tsv";
string all_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".alleles.tsv";
-
+ string mod_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".models.tsv";
+
if (gzip) {
- tag_file += ".gz";
- snp_file += ".gz";
- all_file += ".gz";
+ tag_file += ".gz";
+ snp_file += ".gz";
+ all_file += ".gz";
+ mod_file += ".gz";
}
//
// Open the output files for writing.
//
- gzFile gz_tags, gz_snps, gz_alle;
- ofstream tags, snps, alle;
+ gzFile gz_tags, gz_snps, gz_alle, gz_mods;
+ ofstream tags, snps, alle, mods;
if (gzip) {
- gz_tags = gzopen(tag_file.c_str(), "wb");
- if (!gz_tags) {
- cerr << "Error: Unable to open gzipped tag file '" << tag_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gz_tags = gzopen(tag_file.c_str(), "wb");
+ if (!gz_tags) {
+ cerr << "Error: Unable to open gzipped tag file '" << tag_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_tags, libz_buffer_size);
+ #endif
+ gz_mods = gzopen(mod_file.c_str(), "wb");
+ if (!gz_mods) {
+ cerr << "Error: Unable to open gzipped model file '" << mod_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_tags, libz_buffer_size);
- #endif
- gz_snps = gzopen(snp_file.c_str(), "wb");
- if (!gz_snps) {
- cerr << "Error: Unable to open gzipped snps file '" << snp_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gzbuffer(gz_mods, libz_buffer_size);
+ #endif
+ gz_snps = gzopen(snp_file.c_str(), "wb");
+ if (!gz_snps) {
+ cerr << "Error: Unable to open gzipped snps file '" << snp_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_snps, libz_buffer_size);
- #endif
- gz_alle = gzopen(all_file.c_str(), "wb");
- if (!gz_alle) {
- cerr << "Error: Unable to open gzipped alleles file '" << all_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ gzbuffer(gz_snps, libz_buffer_size);
+ #endif
+ gz_alle = gzopen(all_file.c_str(), "wb");
+ if (!gz_alle) {
+ cerr << "Error: Unable to open gzipped alleles file '" << all_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_alle, libz_buffer_size);
- #endif
+ gzbuffer(gz_alle, libz_buffer_size);
+ #endif
} else {
- tags.open(tag_file.c_str());
- if (tags.fail()) {
- cerr << "Error: Unable to open tag file for writing.\n";
- exit(1);
- }
- snps.open(snp_file.c_str());
- if (snps.fail()) {
- cerr << "Error: Unable to open SNPs file for writing.\n";
- exit(1);
- }
- alle.open(all_file.c_str());
- if (alle.fail()) {
- cerr << "Error: Unable to open allele file for writing.\n";
- exit(1);
- }
+ tags.open(tag_file.c_str());
+ if (tags.fail()) {
+ cerr << "Error: Unable to open tag file for writing.\n";
+ exit(1);
+ }
+ mods.open(mod_file.c_str());
+ if (mods.fail()) {
+ cerr << "Error: Unable to open model file for writing.\n";
+ exit(1);
+ }
+ snps.open(snp_file.c_str());
+ if (snps.fail()) {
+ cerr << "Error: Unable to open SNPs file for writing.\n";
+ exit(1);
+ }
+ alle.open(all_file.c_str());
+ if (alle.fail()) {
+ cerr << "Error: Unable to open allele file for writing.\n";
+ exit(1);
+ }
}
//
@@ -1481,12 +2030,14 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
log << "# ustacks version " << VERSION << "; generated on " << date << "\n";
if (gzip) {
gzputs(gz_tags, log.str().c_str());
+ gzputs(gz_mods, log.str().c_str());
gzputs(gz_snps, log.str().c_str());
gzputs(gz_alle, log.str().c_str());
} else {
tags << log.str();
- snps << log.str();
- alle << log.str();
+ mods << log.str();
+ snps << log.str();
+ alle << log.str();
}
int id;
@@ -1494,218 +2045,221 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
char *buf = new char[m.begin()->second->len + 1];
for (i = m.begin(); i != m.end(); i++) {
- float total = 0;
- tag_1 = i->second;
-
- //
- // Calculate the log likelihood of this merged stack.
- //
- tag_1->gen_matrix(u, r);
- tag_1->calc_likelihood();
-
- // First write the consensus sequence
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- //<< tag_1->cohort_id << "\t"
- << "" << "\t" // chr
- << 0 << "\t" // bp
- << "+" << "\t" // strand
- << "consensus\t" << "\t"
- << "\t"
- << tag_1->con << "\t"
- << tag_1->deleveraged << "\t"
- << tag_1->blacklisted << "\t"
- << tag_1->lumberjackstack << "\t"
- << tag_1->lnl << "\n";
+ float total = 0;
+ tag_1 = i->second;
+
+ //
+ // Calculate the log likelihood of this merged stack.
+ //
+ tag_1->gen_matrix(u, r);
+ tag_1->calc_likelihood();
+
+ // First write the consensus sequence
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ //<< tag_1->cohort_id << "\t"
+ << "" << "\t" // chr
+ << 0 << "\t" // bp
+ << "+" << "\t" // strand
+ << "consensus\t" << "\t"
+ << "\t"
+ << tag_1->con << "\t"
+ << tag_1->deleveraged << "\t"
+ << tag_1->blacklisted << "\t"
+ << tag_1->lumberjackstack << "\t"
+ << tag_1->lnl << "\n";
+
+ //
+ // Write a sequence recording the output of the SNP model for each nucleotide.
+ //
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ //<< "\t" // cohort_id
+ << "\t" // chr
+ << "\t" // bp
+ << "\t" // strand
+ << "model\t" << "\t"
+ << "\t";
+ for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
+ switch((*s)->type) {
+ case snp_type_het:
+ sstr << "E";
+ break;
+ case snp_type_hom:
+ sstr << "O";
+ break;
+ default:
+ sstr << "U";
+ break;
+ }
+ }
+ sstr << "\t"
+ << "\t"
+ << "\t"
+ << "\t"
+ << "\n";
+
+ if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ if (gzip) gzputs(gz_mods, sstr.str().c_str()); else mods << sstr.str();
+ sstr.str("");
+
+ //
+ // Now write out the components of each unique tag merged into this locus.
+ //
+ id = 0;
+ for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
+ tag_2 = u[*k];
+ total += tag_2->count();
+
+ for (uint j = 0; j < tag_2->map.size(); j++) {
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ //<< "\t" // cohort_id
+ << "\t" // chr
+ << "\t" // bp
+ << "\t" // strand
+ << "primary\t"
+ << id << "\t"
+ << seq_ids[tag_2->map[j]] << "\t"
+ << tag_2->seq->seq(buf)
+ << "\t\t\t\t\n";
+
+ if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ sstr.str("");
+ }
- //
- // Write a sequence recording the output of the SNP model for each nucleotide.
- //
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- //<< "\t" // cohort_id
- << "\t" // chr
- << "\t" // bp
- << "\t" // strand
- << "model\t" << "\t"
- << "\t";
- for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
- switch((*s)->type) {
- case snp_type_het:
- sstr << "E";
- break;
- case snp_type_hom:
- sstr << "O";
- break;
- default:
- sstr << "U";
- break;
- }
- }
- sstr << "\t"
- << "\t"
- << "\t"
- << "\t"
- << "\n";
-
- if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
- sstr.str("");
+ id++;
+ }
- //
- // Now write out the components of each unique tag merged into this locus.
- //
- id = 0;
- for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
- tag_2 = u[*k];
- total += tag_2->count();
-
- for (uint j = 0; j < tag_2->map.size(); j++) {
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- //<< "\t" // cohort_id
- << "\t" // chr
- << "\t" // bp
- << "\t" // strand
- << "primary\t"
- << id << "\t"
- << seq_ids[tag_2->map[j]] << "\t"
- << tag_2->seq->seq(buf)
- << "\t\t\t\t\n";
-
- if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
- sstr.str("");
- }
-
- id++;
- }
+ //
+ // Write out the remainder tags merged into this unique tag.
+ //
+ for (k = tag_1->remtags.begin(); k != tag_1->remtags.end(); k++) {
+ rem = r[*k];
+ total += rem->map.size();
+
+ for (uint j = 0; j < rem->map.size(); j++)
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ //<< "\t" // cohort_id
+ << "\t" // chr
+ << "\t" // bp
+ << "\t" // strand
+ << "secondary\t"
+ << "\t"
+ << seq_ids[rem->map[j]] << "\t"
+ << rem->seq->seq(buf)
+ << "\t\t\t\t\n";
+
+ if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
+ sstr.str("");
+ }
- //
- // Write out the remainder tags merged into this unique tag.
- //
- for (k = tag_1->remtags.begin(); k != tag_1->remtags.end(); k++) {
- rem = r[*k];
- total += rem->map.size();
-
- for (uint j = 0; j < rem->map.size(); j++)
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- //<< "\t" // cohort_id
- << "\t" // chr
- << "\t" // bp
- << "\t" // strand
- << "secondary\t"
- << "\t"
- << seq_ids[rem->map[j]] << "\t"
- << rem->seq->seq(buf)
- << "\t\t\t\t\n";
-
- if (gzip) gzputs(gz_tags, sstr.str().c_str()); else tags << sstr.str();
- sstr.str("");
- }
+ //
+ // Write out the model calls for each nucleotide in this locus.
+ //
+ for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ << (*s)->col << "\t";
+
+ switch((*s)->type) {
+ case snp_type_het:
+ sstr << "E\t";
+ break;
+ case snp_type_hom:
+ sstr << "O\t";
+ break;
+ default:
+ sstr << "U\t";
+ break;
+ }
- //
- // Write out the model calls for each nucleotide in this locus.
- //
- for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- << (*s)->col << "\t";
-
- switch((*s)->type) {
- case snp_type_het:
- sstr << "E\t";
- break;
- case snp_type_hom:
- sstr << "O\t";
- break;
- default:
- sstr << "U\t";
- break;
- }
-
- sstr << std::fixed << std::setprecision(2)
- << (*s)->lratio << "\t"
- << (*s)->rank_1 << "\t"
- << (*s)->rank_2 << "\t\t\n";
- }
-
- if (gzip) gzputs(gz_snps, sstr.str().c_str()); else snps << sstr.str();
- sstr.str("");
+ sstr << std::fixed << std::setprecision(2)
+ << (*s)->lratio << "\t"
+ << (*s)->rank_1 << "\t"
+ << (*s)->rank_2 << "\t\t\n";
+ }
- //
- // Write the expressed alleles seen for the recorded SNPs and
- // the percentage of tags a particular allele occupies.
- //
- for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
- sstr << "0" << "\t"
- << sql_id << "\t"
- << tag_1->id << "\t"
- << (*t).first << "\t"
- << (((*t).second/total) * 100) << "\t"
- << (*t).second << "\n";
- }
- if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
- sstr.str("");
+ if (gzip) gzputs(gz_snps, sstr.str().c_str()); else snps << sstr.str();
+ sstr.str("");
+
+ //
+ // Write the expressed alleles seen for the recorded SNPs and
+ // the percentage of tags a particular allele occupies.
+ //
+ for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
+ sstr << "0" << "\t"
+ << sql_id << "\t"
+ << tag_1->id << "\t"
+ << (*t).first << "\t"
+ << (((*t).second/total) * 100) << "\t"
+ << (*t).second << "\n";
+ }
+ if (gzip) gzputs(gz_alle, sstr.str().c_str()); else alle << sstr.str();
+ sstr.str("");
}
if (gzip) {
- gzclose(gz_tags);
- gzclose(gz_snps);
- gzclose(gz_alle);
+ gzclose(gz_tags);
+ gzclose(gz_mods);
+ gzclose(gz_snps);
+ gzclose(gz_alle);
} else {
- tags.close();
- snps.close();
- alle.close();
+ tags.close();
+ mods.close();
+ snps.close();
+ alle.close();
}
//
// Free sequence IDs.
//
for (uint i = 0; i < seq_ids.size(); i++)
- delete [] seq_ids[i];
+ delete [] seq_ids[i];
//
// If specified, output reads not utilized in any stacks.
//
if (retain_rem_reads) {
- string unused_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".unused.fa";
-
- gzFile gz_unused;
- ofstream unused;
-
- if (gzip) {
- unused_file += ".gz";
- gz_unused = gzopen(unused_file.c_str(), "wb");
- if (!gz_unused) {
- cerr << "Error: Unable to open gzipped discard file '" << unused_file << "': " << strerror(errno) << ".\n";
- exit(1);
- }
+ string unused_file = out_path + in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1)) + ".unused.fa";
+
+ gzFile gz_unused;
+ ofstream unused;
+
+ if (gzip) {
+ unused_file += ".gz";
+ gz_unused = gzopen(unused_file.c_str(), "wb");
+ if (!gz_unused) {
+ cerr << "Error: Unable to open gzipped discard file '" << unused_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
#if ZLIB_VERNUM >= 0x1240
- gzbuffer(gz_unused, libz_buffer_size);
- #endif
- } else {
- unused.open(unused_file.c_str());
- if (unused.fail()) {
- cerr << "Error: Unable to open discard file for writing.\n";
- exit(1);
- }
- }
-
- map<int, Rem *>::iterator r_it;
- for (r_it = r.begin(); r_it != r.end(); r_it++) {
- if (r_it->second->utilized == false)
- sstr << ">" << r_it->second->id << "\n" << r_it->second->seq->seq(buf) << "\n";
- if (gzip) gzputs(gz_unused, sstr.str().c_str()); else unused << sstr.str();
- sstr.str("");
- }
-
- if (gzip) gzclose(gz_unused); else unused.close();
+ gzbuffer(gz_unused, libz_buffer_size);
+ #endif
+ } else {
+ unused.open(unused_file.c_str());
+ if (unused.fail()) {
+ cerr << "Error: Unable to open discard file for writing.\n";
+ exit(1);
+ }
+ }
+
+ map<int, Rem *>::iterator r_it;
+ for (r_it = r.begin(); r_it != r.end(); r_it++) {
+ if (r_it->second->utilized == false)
+ sstr << ">" << r_it->second->id << "\n" << r_it->second->seq->seq(buf) << "\n";
+ if (gzip) gzputs(gz_unused, sstr.str().c_str()); else unused << sstr.str();
+ sstr.str("");
+ }
+
+ if (gzip) gzclose(gz_unused); else unused.close();
}
delete [] buf;
@@ -1714,11 +2268,11 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
}
int dump_stack_graph(string data_file,
- map<int, Stack *> &unique,
- map<int, MergedStack *> &merged,
- vector<int> &keys,
- map<int, map<int, double> > &dist_map,
- map<int, set<int> > &cluster_map) {
+ map<int, Stack *> &unique,
+ map<int, MergedStack *> &merged,
+ vector<int> &keys,
+ map<int, map<int, double> > &dist_map,
+ map<int, set<int> > &cluster_map) {
uint s, t;
double d, scale, scaled_d;
char label[32];
@@ -1733,18 +2287,18 @@ int dump_stack_graph(string data_file,
// Output a list of IDs so we can locate these stacks in the final results.
//
for (s = 0; s < keys.size(); s++)
- data << "/* " << keys[s] << ": " << unique[merged[keys[s]]->utags[0]]->map[0] << "; depth: " << merged[keys[s]]->count << " */\n";
+ data << "/* " << keys[s] << ": " << unique[merged[keys[s]]->utags[0]]->map[0] << "; depth: " << merged[keys[s]]->count << " */\n";
//
// Output a specification to visualize the stack graph using graphviz:
// http://www.graphviz.org/
//
data << "graph " << title.c_str() << " {\n"
- << "rankdir=LR\n"
- << "size=\"20!\"\n"
- << "overlap=false\n"
- << "node [shape=circle style=filled fillcolor=\"#3875d7\" fontname=\"Arial\"];\n"
- << "edge [fontsize=8.0 fontname=\"Arial\" color=\"#aaaaaa\"];\n";
+ << "rankdir=LR\n"
+ << "size=\"20!\"\n"
+ << "overlap=false\n"
+ << "node [shape=circle style=filled fillcolor=\"#3875d7\" fontname=\"Arial\"];\n"
+ << "edge [fontsize=8.0 fontname=\"Arial\" color=\"#aaaaaa\"];\n";
colors.push_back("red");
colors.push_back("blue");
@@ -1760,34 +2314,34 @@ int dump_stack_graph(string data_file,
// Write out the clusters created by R, prior to writing all the nodes and connections.
s = 0;
for (c = cluster_map.begin(); c != cluster_map.end(); c++) {
- data << "subgraph " << s << " {\n"
- << " edge [penwidth=5 fontsize=12.0 fontcolor=\"black\" color=\"black\"]\n";
-
- if ((*c).second.size() == 1) {
- color = "white";
- data << " node [fillcolor=" << color.c_str() << " fontcolor=\"black\"]\n";
- } else {
- color = colors[color_index % colors.size()];
- data << " node [fillcolor=" << color.c_str() << " fontcolor=\"white\"]\n";
- color_index++;
- }
-
- for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
- data << " " << *it << "\n";
- }
-
- if ((*c).second.size() > 1) {
- uint j = 0;
- for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
- data << *it;
- if (j < (*c).second.size() - 1)
- data << " -- ";
- j++;
- }
- }
-
- data << "}\n";
- s++;
+ data << "subgraph " << s << " {\n"
+ << " edge [penwidth=5 fontsize=12.0 fontcolor=\"black\" color=\"black\"]\n";
+
+ if ((*c).second.size() == 1) {
+ color = "white";
+ data << " node [fillcolor=" << color.c_str() << " fontcolor=\"black\"]\n";
+ } else {
+ color = colors[color_index % colors.size()];
+ data << " node [fillcolor=" << color.c_str() << " fontcolor=\"white\"]\n";
+ color_index++;
+ }
+
+ for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
+ data << " " << *it << "\n";
+ }
+
+ if ((*c).second.size() > 1) {
+ uint j = 0;
+ for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
+ data << *it;
+ if (j < (*c).second.size() - 1)
+ data << " -- ";
+ j++;
+ }
+ }
+
+ data << "}\n";
+ s++;
}
//
@@ -1795,18 +2349,18 @@ int dump_stack_graph(string data_file,
// and scale the edge lengths to fit the canvas.
//
for (s = 0; s < keys.size(); s++)
- for (t = s+1; t < keys.size(); t++)
- scale = dist_map[keys[s]][keys[t]] > scale ? dist_map[keys[s]][keys[t]] : scale;
+ for (t = s+1; t < keys.size(); t++)
+ scale = dist_map[keys[s]][keys[t]] > scale ? dist_map[keys[s]][keys[t]] : scale;
scale = scale / 20;
for (s = 0; s < keys.size(); s++) {
- for (t = s+1; t < keys.size(); t++) {
- d = dist_map[keys[s]][keys[t]];
- scaled_d = d / scale;
- scaled_d = scaled_d < 0.75 ? 0.75 : scaled_d;
- sprintf(label, "%.1f", d);
- data << keys[s] << " -- " << keys[t] << " [len=" << scaled_d << ", label=" << label << "];\n";
- }
+ for (t = s+1; t < keys.size(); t++) {
+ d = dist_map[keys[s]][keys[t]];
+ scaled_d = d / scale;
+ scaled_d = scaled_d < 0.75 ? 0.75 : scaled_d;
+ sprintf(label, "%.1f", d);
+ data << keys[s] << " -- " << keys[t] << " [len=" << scaled_d << ", label=" << label << "];\n";
+ }
}
data << "}\n";
@@ -1823,18 +2377,18 @@ int dump_unique_tags(map<int, Stack *> &u) {
char *c;
for (it = u.begin(); it != u.end(); it++) {
- c = (*it).second->seq->seq();
+ c = (*it).second->seq->seq();
- cerr << "UniqueTag UID: " << (*it).second->id << "\n"
- << " Seq: " << c << "\n"
- << " IDs: ";
+ cerr << "UniqueTag UID: " << (*it).second->id << "\n"
+ << " Seq: " << c << "\n"
+ << " IDs: ";
- for (uint j = 0; j < it->second->map.size(); j++)
- cerr << it->second->map[j] << " ";
+ for (uint j = 0; j < it->second->map.size(); j++)
+ cerr << it->second->map[j] << " ";
- cerr << "\n\n";
+ cerr << "\n\n";
- delete [] c;
+ delete [] c;
}
return 0;
@@ -1847,32 +2401,32 @@ int dump_merged_tags(map<int, MergedStack *> &m) {
for (it = m.begin(); it != m.end(); it++) {
- cerr << "MergedStack ID: " << it->second->id << "\n"
- << " Consensus: ";
- if (it->second->con != NULL)
- cerr << it->second->con << "\n";
- else
- cerr << "\n";
- cerr << " IDs: ";
+ cerr << "MergedStack ID: " << it->second->id << "\n"
+ << " Consensus: ";
+ if (it->second->con != NULL)
+ cerr << it->second->con << "\n";
+ else
+ cerr << "\n";
+ cerr << " IDs: ";
- for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
- cerr << (*fit) << " ";
+ for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
+ cerr << (*fit) << " ";
- cerr << "\n"
- << " Distances: ";
+ cerr << "\n"
+ << " Distances: ";
- for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
- cerr << (*pit).first << ": " << (*pit).second << ", ";
+ for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
+ cerr << (*pit).first << ": " << (*pit).second << ", ";
- cerr << "\n\n";
+ cerr << "\n\n";
}
return 0;
}
-int load_radtags(string in_file, DNASeqHashMap &radtags, vector<DNASeq *> &radtags_keys) {
+int load_radtags(string in_file, DNASeqHashMap &radtags, vector<DNANSeq *> &radtags_keys) {
Input *fh = NULL;
- DNASeq *d;
+ DNANSeq *d;
if (in_file_type == FileT::fasta)
fh = new Fasta(in_file.c_str());
@@ -1898,28 +2452,27 @@ int load_radtags(string in_file, DNASeqHashMap &radtags, vector<DNASeq *> &radta
while ((fh->next_seq(c)) != 0) {
if (i % 10000 == 0) cerr << " Loading RAD-Tag " << i << " \r";
- prev_seql = seql;
- seql = 0;
+ prev_seql = seql;
+ seql = 0;
- for (char *p = c.seq; *p != '\0'; p++, seql++)
- switch (*p) {
- case 'N':
- case 'n':
- case '.':
- *p = 'A';
- corrected++;
- }
+ for (char *p = c.seq; *p != '\0'; p++, seql++)
+ switch (*p) {
+ case 'N':
+ case 'n':
+ case '.':
+ *p = 'A';
+ corrected++;
+ }
- cerr << "seql: " << seql << "; prev_seql: " << prev_seql << "\n";
- if (seql != prev_seql && prev_seql > 0) len_mismatch = true;
+ if (seql != prev_seql && prev_seql > 0) len_mismatch = true;
- d = new DNASeq(seql, c.seq);
+ d = new DNANSeq(seql, c.seq);
- pair<DNASeqHashMap::iterator, bool> r;
+ pair<DNASeqHashMap::iterator, bool> r;
- r = radtags.insert(make_pair(d, HVal()));
- (*r.first).second.add_id(i);
- radtags_keys.push_back(d);
+ r = radtags.insert(make_pair(d, HVal()));
+ (*r.first).second.add_id(i);
+ radtags_keys.push_back(d);
i++;
}
cerr << "Loaded " << i << " RAD-Tags; inserted " << radtags.size() << " elements into the RAD-Tags hash map.\n";
@@ -1932,7 +2485,7 @@ int load_radtags(string in_file, DNASeqHashMap &radtags, vector<DNASeq *> &radta
cerr << " " << corrected << " reads contained uncalled nucleotides that were modified.\n";
if (len_mismatch)
- cerr << " Warning: different sequence lengths detected, this will interfere with Stacks algorithms.\n";
+ cerr << " Warning: different sequence lengths detected, this will interfere with Stacks algorithms.\n";
//
// Close the file and delete the Input object.
@@ -1965,9 +2518,9 @@ load_seq_ids(vector<char *> &seq_ids)
c.qual = new char[max_len];
while ((fh->next_seq(c)) != 0) {
- id = new char[strlen(c.id) + 1];
- strcpy(id, c.id);
- seq_ids.push_back(id);
+ id = new char[strlen(c.id) + 1];
+ strcpy(id, c.id);
+ seq_ids.push_back(id);
}
cerr << "read " << seq_ids.size() << " sequence IDs.\n";
@@ -1976,7 +2529,12 @@ load_seq_ids(vector<char *> &seq_ids)
return 0;
}
-int calc_triggers(double cov_mean, double cov_stdev, int &deleverage_trigger, int &removal_trigger) {
+int
+calc_triggers(double cov_mean,
+ double cov_stdev,
+ double cov_scale,
+ int &deleverage_trigger, int &removal_trigger)
+{
deleverage_trigger = (int) round(cov_mean + cov_stdev * cov_scale);
removal_trigger = (int) round(cov_mean + (cov_stdev * 2) * cov_scale);
@@ -1999,15 +2557,15 @@ int calc_triggers(double cov_mean, double cov_stdev, int &deleverage_trigger, in
// long double i = 0.0;
// do {
-// e = exp(-1 * lambda);
-// g = pow(lambda, k);
-// f = factorial(k);
-// h = (e * g);
-// i = h / f;
-// d += i;
-
-// //cerr << "iteration " << k << "; e: " << e << " h: " << h << " g: " << g << " F: " << f << " i: " << i << " D: " << d << "\n";
-// k++;
+// e = exp(-1 * lambda);
+// g = pow(lambda, k);
+// f = factorial(k);
+// h = (e * g);
+// i = h / f;
+// d += i;
+
+// //cerr << "iteration " << k << "; e: " << e << " h: " << h << " g: " << g << " F: " << f << " i: " << i << " D: " << d << "\n";
+// k++;
// } while (d < 0.999999);
// return k - 1;
@@ -2019,8 +2577,8 @@ long double factorial(int i) {
if (i == 0) return 1;
do {
- f = f * i;
- i--;
+ f = f * i;
+ i--;
} while (i > 0);
return f;
@@ -2030,113 +2588,113 @@ int parse_command_line(int argc, char* argv[]) {
int c;
while (1) {
- static struct option long_options[] = {
- {"help", no_argument, NULL, 'h'},
- {"version", no_argument, NULL, 'v'},
- {"infile_type", required_argument, NULL, 't'},
- {"file", required_argument, NULL, 'f'},
- {"outpath", required_argument, NULL, 'o'},
- {"id", required_argument, NULL, 'i'},
- {"min_cov", required_argument, NULL, 'm'},
- {"max_dist", required_argument, NULL, 'M'},
- {"max_sec_dist", required_argument, NULL, 'N'},
- {"max_locus_stacks", required_argument, NULL, 'K'},
- {"k_len", required_argument, NULL, 'k'},
- {"num_threads", required_argument, NULL, 'p'},
- {"deleverage", no_argument, NULL, 'd'},
- {"remove_rep", no_argument, NULL, 'r'},
- {"retain_rem", no_argument, NULL, 'R'},
- {"graph", no_argument, NULL, 'g'},
- {"exp_cov", no_argument, NULL, 'E'},
- {"cov_stdev", no_argument, NULL, 's'},
- {"cov_scale", no_argument, NULL, 'S'},
- {"sec_hapl", no_argument, NULL, 'H'},
- {"model_type", required_argument, NULL, 'T'},
- {"bc_err_freq", required_argument, NULL, 'e'},
- {"bound_low", required_argument, NULL, 'L'},
- {"bound_high", required_argument, NULL, 'U'},
- {"alpha", required_argument, NULL, 'A'},
- {0, 0, 0, 0}
- };
-
- // getopt_long stores the option index here.
- int option_index = 0;
+ static struct option long_options[] = {
+ {"help", no_argument, NULL, 'h'},
+ {"version", no_argument, NULL, 'v'},
+ {"infile_type", required_argument, NULL, 't'},
+ {"file", required_argument, NULL, 'f'},
+ {"outpath", required_argument, NULL, 'o'},
+ {"id", required_argument, NULL, 'i'},
+ {"min_cov", required_argument, NULL, 'm'},
+ {"max_dist", required_argument, NULL, 'M'},
+ {"max_sec_dist", required_argument, NULL, 'N'},
+ {"max_locus_stacks", required_argument, NULL, 'K'},
+ {"k_len", required_argument, NULL, 'k'},
+ {"num_threads", required_argument, NULL, 'p'},
+ {"deleverage", no_argument, NULL, 'd'},
+ {"remove_rep", no_argument, NULL, 'r'},
+ {"retain_rem", no_argument, NULL, 'R'},
+ {"graph", no_argument, NULL, 'g'},
+ {"sec_hapl", no_argument, NULL, 'H'},
+ {"gapped", no_argument, NULL, 'G'},
+ {"max_gaps", required_argument, NULL, 'X'},
+ {"min_aln_len", required_argument, NULL, 'x'},
+ {"model_type", required_argument, NULL, 'T'},
+ {"bc_err_freq", required_argument, NULL, 'e'},
+ {"bound_low", required_argument, NULL, 'L'},
+ {"bound_high", required_argument, NULL, 'U'},
+ {"alpha", required_argument, NULL, 'A'},
+ {0, 0, 0, 0}
+ };
+
+ // getopt_long stores the option index here.
+ int option_index = 0;
- c = getopt_long(argc, argv, "hHvdrgRA:L:U:f:o:i:m:e:E:s:S:p:t:M:N:K:k:T:", long_options, &option_index);
+ c = getopt_long(argc, argv, "GhHvdrgRA:L:U:f:o:i:m:e:p:t:M:N:K:k:T:X:x:", long_options, &option_index);
- // Detect the end of the options.
- if (c == -1)
- break;
+ // Detect the end of the options.
+ if (c == -1)
+ break;
- switch (c) {
- case 'h':
- help();
- break;
- case 't':
+ switch (c) {
+ case 'h':
+ help();
+ break;
+ case 't':
if (strcmp(optarg, "tsv") == 0)
in_file_type = FileT::tsv;
else if (strcmp(optarg, "fasta") == 0)
in_file_type = FileT::fasta;
else if (strcmp(optarg, "fastq") == 0)
in_file_type = FileT::fastq;
- else if (strcasecmp(optarg, "gzfasta") == 0)
+ else if (strcasecmp(optarg, "gzfasta") == 0)
in_file_type = FileT::gzfasta;
- else if (strcasecmp(optarg, "gzfastq") == 0)
+ else if (strcasecmp(optarg, "gzfastq") == 0)
in_file_type = FileT::gzfastq;
else
in_file_type = FileT::unknown;
- break;
- case 'f':
- in_file = optarg;
- break;
- case 'o':
- out_path = optarg;
- break;
- case 'i':
- sql_id = is_integer(optarg);
- if (sql_id < 0) {
- cerr << "SQL ID (-i) must be an integer, e.g. 1, 2, 3\n";
- help();
- }
- break;
- case 'm':
- min_merge_cov = is_integer(optarg);
- break;
- case 'M':
- max_utag_dist = is_integer(optarg);
- break;
- case 'N':
- max_rem_dist = is_integer(optarg);
- break;
- case 'd':
- deleverage_stacks++;
- break;
- case 'r':
- remove_rep_stacks++;
- break;
- case 'K':
- max_subgraph = is_integer(optarg);
- break;
- case 'k':
- set_kmer_len = false;
- kmer_len = is_integer(optarg);
- break;
- case 'R':
- retain_rem_reads = true;
- break;
- case 'g':
- dump_graph++;
- break;
- case 'E':
- cov_mean = atof(optarg);
- break;
- case 's':
- cov_stdev = atof(optarg);
- break;
- case 'S':
- cov_scale = atof(optarg);
- break;
- case 'T':
+ break;
+ case 'f':
+ in_file = optarg;
+ break;
+ case 'o':
+ out_path = optarg;
+ break;
+ case 'i':
+ sql_id = is_integer(optarg);
+ if (sql_id < 0) {
+ cerr << "SQL ID (-i) must be an integer, e.g. 1, 2, 3\n";
+ help();
+ }
+ break;
+ case 'm':
+ min_merge_cov = is_integer(optarg);
+ break;
+ case 'M':
+ max_utag_dist = is_integer(optarg);
+ break;
+ case 'N':
+ max_rem_dist = is_integer(optarg);
+ break;
+ case 'd':
+ deleverage_stacks++;
+ break;
+ case 'r':
+ remove_rep_stacks++;
+ break;
+ case 'K':
+ max_subgraph = is_integer(optarg);
+ break;
+ case 'k':
+ set_kmer_len = false;
+ kmer_len = is_integer(optarg);
+ break;
+ case 'R':
+ retain_rem_reads = true;
+ break;
+ case 'g':
+ dump_graph++;
+ break;
+ case 'G':
+ gapped_alignments = true;
+ break;
+ case 'X':
+ max_gaps = is_double(optarg);
+ break;
+ case 'x':
+ min_match_len = is_double(optarg);
+ break;
+ case 'T':
if (strcmp(optarg, "snp") == 0) {
model_type = snp;
} else if (strcmp(optarg, "fixed") == 0) {
@@ -2147,37 +2705,37 @@ int parse_command_line(int argc, char* argv[]) {
cerr << "Unknown model type specified '" << optarg << "'\n";
help();
}
- case 'e':
- barcode_err_freq = atof(optarg);
- break;
- case 'L':
- bound_low = atof(optarg);
- break;
- case 'U':
- bound_high = atof(optarg);
- break;
- case 'A':
- alpha = atof(optarg);
- break;
- case 'H':
- call_sec_hapl = false;
- break;
- case 'p':
- num_threads = is_integer(optarg);
- break;
+ case 'e':
+ barcode_err_freq = is_double(optarg);
+ break;
+ case 'L':
+ bound_low = is_double(optarg);
+ break;
+ case 'U':
+ bound_high = is_double(optarg);
+ break;
+ case 'A':
+ alpha = is_double(optarg);
+ break;
+ case 'H':
+ call_sec_hapl = false;
+ break;
+ case 'p':
+ num_threads = is_integer(optarg);
+ break;
case 'v':
version();
break;
- case '?':
- // getopt_long already printed an error message.
- help();
- break;
+ case '?':
+ // getopt_long already printed an error message.
+ help();
+ break;
- default:
- cerr << "Unknown command line option '" << (char) c << "'\n";
- help();
- abort();
- }
+ default:
+ cerr << "Unknown command line option '" << (char) c << "'\n";
+ help();
+ abort();
+ }
}
if (set_kmer_len == false && (kmer_len < 5 || kmer_len > 31)) {
@@ -2186,38 +2744,38 @@ int parse_command_line(int argc, char* argv[]) {
}
if (alpha != 0.1 && alpha != 0.05 && alpha != 0.01 && alpha != 0.001) {
- cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
- help();
+ cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
+ help();
}
if (bound_low != 0 && (bound_low < 0 || bound_low >= 1.0)) {
- cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
- help();
+ cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
+ help();
}
if (bound_high != 1 && (bound_high <= 0 || bound_high > 1.0)) {
- cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
- help();
+ cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
+ help();
}
if (bound_low > 0 || bound_high < 1.0) {
- model_type = bounded;
+ model_type = bounded;
}
if (in_file.length() == 0 || in_file_type == FileT::unknown) {
- cerr << "You must specify an input file of a supported type.\n";
- help();
+ cerr << "You must specify an input file of a supported type.\n";
+ help();
}
if (out_path.length() == 0)
- out_path = ".";
+ out_path = ".";
if (out_path.at(out_path.length() - 1) != '/')
- out_path += "/";
+ out_path += "/";
if (model_type == fixed && barcode_err_freq == 0) {
- cerr << "You must specify the barcode error frequency.\n";
- help();
+ cerr << "You must specify the barcode error frequency.\n";
+ help();
}
return 0;
@@ -2232,31 +2790,35 @@ void version() {
void help() {
std::cerr << "ustacks " << VERSION << "\n"
<< "ustacks -t file_type -f file_path [-d] [-r] [-o path] [-i id] [-m min_cov] [-M max_dist] [-p num_threads] [-R] [-H] [-h]" << "\n"
- << " t: input file Type. Supported types: fasta, fastq, gzfasta, or gzfastq.\n"
+ << " t: input file Type. Supported types: fasta, fastq, gzfasta, or gzfastq.\n"
<< " f: input file path.\n"
- << " o: output path to write results." << "\n"
- << " i: SQL ID to insert into the output to identify this sample." << "\n"
- << " m: Minimum depth of coverage required to create a stack (default 3)." << "\n"
- << " M: Maximum distance (in nucleotides) allowed between stacks (default 2)." << "\n"
- << " N: Maximum distance allowed to align secondary reads to primary stacks (default: M + 2).\n"
- << " R: retain unused reads.\n"
- << " H: disable calling haplotypes from secondary reads.\n"
+ << " o: output path to write results.\n"
+ << " i: SQL ID to insert into the output to identify this sample.\n"
+ << " m: Minimum depth of coverage required to create a stack (default 3).\n"
+ << " M: Maximum distance (in nucleotides) allowed between stacks (default 2).\n"
+ << " N: Maximum distance allowed to align secondary reads to primary stacks (default: M + 2).\n"
+ << " R: retain unused reads.\n"
+ << " H: disable calling haplotypes from secondary reads.\n"
<< " p: enable parallel execution with num_threads threads.\n"
- << " h: display this help messsage.\n\n"
- << " Stack assembly options:\n"
- << " r: enable the Removal algorithm, to drop highly-repetitive stacks (and nearby errors) from the algorithm." << "\n"
- << " d: enable the Deleveraging algorithm, used for resolving over merged tags." << "\n"
- << " --max_locus_stacks <num>: maximum number of stacks at a single de novo locus (default 3).\n"
- << " --k_len <len>: specify k-mer size for matching between alleles and loci (automatically calculated by default).\n\n"
- << " Model options:\n"
- << " --model_type: either 'snp' (default), 'bounded', or 'fixed'\n"
- << " For the SNP or Bounded SNP model:\n"
- << " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
- << " For the Bounded SNP model:\n"
- << " --bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
- << " --bound_high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
- << " For the Fixed model:\n"
- << " --bc_err_freq <num>: specify the barcode error frequency, between 0 and 1.0.\n";
+ << " h: display this help messsage.\n\n"
+ << " Stack assembly options:\n"
+ << " r: enable the Removal algorithm, to drop highly-repetitive stacks (and nearby errors) from the algorithm.\n"
+ << " d: enable the Deleveraging algorithm, used for resolving over merged tags.\n"
+ << " --max_locus_stacks <num>: maximum number of stacks at a single de novo locus (default 3).\n"
+ << " --k_len <len>: specify k-mer size for matching between alleles and loci (automatically calculated by default).\n\n"
+ << " Gapped assembly options:\n"
+ << " --gapped: preform gapped alignments between stacks.\n"
+ << " --max_gaps: number of gaps allowed between stacks before merging (default: 2).\n"
+ << " --min_aln_len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n\n"
+ << " Model options:\n"
+ << " --model_type: either 'snp' (default), 'bounded', or 'fixed'\n"
+ << " For the SNP or Bounded SNP model:\n"
+ << " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
+ << " For the Bounded SNP model:\n"
+ << " --bound_low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
+ << " --bound_high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
+ << " For the Fixed model:\n"
+ << " --bc_err_freq <num>: specify the barcode error frequency, between 0 and 1.0.\n";
exit(0);
}
diff --git a/src/ustacks.h b/src/ustacks.h
index 1008054..f5ed095 100644
--- a/src/ustacks.h
+++ b/src/ustacks.h
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -78,41 +78,41 @@ using google::sparse_hash_map;
#include "FastqI.h" // Reading input files in FASTQ format
#include "gzFasta.h" // Reading gzipped input files in FASTA format
#include "gzFastq.h" // Reading gzipped input files in FASTQ format
-
-typedef unsigned int uint;
-
-const int barcode_size = 5;
+#include "aln_utils.h"
+#include "GappedAln.h"
class HVal {
public:
vector<int> ids;
int count() {
- return this->ids.size();
+ return this->ids.size();
}
int add_id(int id) {
- this->ids.push_back(id);
- return 0;
+ this->ids.push_back(id);
+ return 0;
}
};
+const int barcode_size = 5;
+
#ifdef HAVE_SPARSEHASH
-typedef sparse_hash_map<DNASeq *, HVal, hash_dnaseq, dnaseq_eqstr> DNASeqHashMap;
+typedef sparse_hash_map<DNANSeq *, HVal, hash_dnanseq, dnanseq_eqstr> DNASeqHashMap;
#else
-typedef unordered_map<DNASeq *, HVal, hash_dnaseq, dnaseq_eqstr> DNASeqHashMap;
+typedef unordered_map<DNANSeq *, HVal, hash_dnanseq, dnanseq_eqstr> DNASeqHashMap;
#endif
void help( void );
void version( void );
int parse_command_line(int, char**);
-int load_radtags(string, DNASeqHashMap &, vector<DNASeq *> &);
+int load_radtags(string, DNASeqHashMap &, vector<DNANSeq *> &);
int load_seq_ids(vector<char *> &);
int reduce_radtags(DNASeqHashMap &, map<int, Stack *> &, map<int, Rem *> &);
-int free_radtags_hash(DNASeqHashMap &, vector<DNASeq *> &);
+int free_radtags_hash(DNASeqHashMap &, vector<DNANSeq *> &);
int populate_merged_tags(map<int, Stack *> &, map<int, MergedStack *> &);
int merge_stacks(map<int, Stack *> &, map<int, Rem *> &, map<int, MergedStack *> &, set<int> &, int);
int call_consensus(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem *> &, bool);
-int call_alleles(MergedStack *, vector<DNASeq *> &, vector<read_type> &);
+int call_alleles(MergedStack *, vector<DNANSeq *> &, vector<read_type> &);
int merge_remainders(map<int, MergedStack *> &, map<int, Rem *> &);
int write_results(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem *> &);
@@ -120,18 +120,24 @@ int write_results(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem
// Match MergedStacks using a k-mer hashing algorithm
//
int calc_kmer_distance(map<int, MergedStack *> &, int);
-
+int search_for_gaps(map<int, MergedStack *> &, double);
+int merge_gapped_alns(map<int, Stack *> &, map<int, Rem *> &, map<int, MergedStack *> &);
+int edit_gapped_seqs(map<int, Stack *> &, map<int, Rem *> &, MergedStack *, vector<pair<char, uint> > &);
+int edit_gaps(vector<pair<char, uint> > &, char *);
+int dist(MergedStack *, MergedStack *, vector<pair<char, uint> > &);
+bool rank_alignments(Aln, Aln);
//
// Calculate depth of coverage statistics for stacks
//
-int calc_coverage_distribution(map<int, Stack *> &, double &, double &);
-double calc_merged_coverage_distribution(map<int, Stack *> &, map<int, MergedStack *> &);
-int count_raw_reads(map<int, Stack *> &, map<int, Rem *> &, map<int, MergedStack *> &);
+int calc_coverage_distribution(map<int, Stack *> &, double &, double &, double &);
+int calc_coverage_distribution(map<int, Stack *> &, map<int, MergedStack *> &, double &, double &, double &);
+int calc_coverage_distribution(map<int, Stack *> &, map<int, Rem *> &, map<int, MergedStack *> &, double &, double &, double &);
+int count_raw_reads(map<int, Stack *> &, map<int, Rem *> &, map<int, MergedStack *> &);
//
// Dealing with lumberjack (huge) stacks
//
-int calc_triggers(double, double, int &, int &);
+int calc_triggers(double, double, double, int &, int &);
int remove_repetitive_stacks(map<int, Stack *> &, map<int, MergedStack *> &);
int deleverage(map<int, Stack *> &, map<int, Rem *> &, map<int, MergedStack *> &, set<int> &, int, vector<MergedStack *> &);
@@ -145,6 +151,7 @@ int dump_stack_graph(string, map<int, Stack *> &, map<int, MergedStack *> &, ve
//
// Utilities
//
+MergedStack *merge_tags(MergedStack *, MergedStack *, int);
MergedStack *merge_tags(map<int, MergedStack *> &, set<int> &, int);
MergedStack *merge_tags(map<int, MergedStack *> &, int *, int, int);
long double factorial(int);
diff --git a/src/utils.cc b/src/utils.cc
index c1e32b6..8a83e74 100644
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -239,10 +239,18 @@ bool compare_pair(pair<char, int> a, pair<char, int> b) {
return (a.second > b.second);
}
+bool compare_pair_intint(pair<int, int> a, pair<int, int> b) {
+ return (a.second > b.second);
+}
+
bool compare_pair_intdouble(pair<int, double> a, pair<int, double> b) {
return (a.second < b.second);
}
+bool compare_pair_stringint(pair<string, int> a, pair<string, int> b) {
+ return (a.second < b.second);
+}
+
bool compare_ints(int a, int b) {
return (a > b);
}
diff --git a/src/utils.h b/src/utils.h
index 6d016b4..a5c26e2 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -50,7 +50,9 @@ double reduced_log_factorial(double, double);
//
bool compare_ints(int, int);
bool compare_pair(pair<char, int>, pair<char, int>);
+bool compare_pair_intint(pair<int, int>, pair<int, int>);
bool compare_pair_intdouble(pair<int, double>, pair<int, double>);
+bool compare_pair_stringint(pair<string, int>, pair<string, int>);
bool compare_pair_snp(pair<string, SNP *>, pair<string, SNP *>);
bool compare_pair_haplotype(pair<string, double>, pair<string, double>);
bool compare_pair_haplotype_rev(pair<string, double>, pair<string, double>);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/stacks.git
More information about the debian-med-commit
mailing list