[med-svn] [plink1.9] 02/04: Imported Upstream version 1.90~b3.31-160203

Dylan Aïssi bob.dybian-guest at moszumanska.debian.org
Fri Feb 5 06:42:41 UTC 2016


This is an automated email from the git hooks/post-receive script.

bob.dybian-guest pushed a commit to branch master
in repository plink1.9.

commit 203f5898ae9654ddae76f382837f9602bac50da5
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date:   Thu Feb 4 22:11:03 2016 +0100

    Imported Upstream version 1.90~b3.31-160203
---
 Rconnection.cc  |    6 +-
 Rconnection.h   |    2 +-
 SFMT.c          |   12 +-
 hfile.c         |    2 +-
 khash.h         |    6 +-
 plink.c         | 1207 ++++++++++-----------
 plink_assoc.c   | 2170 ++++++++++++++-----------------------
 plink_assoc.h   |    2 -
 plink_calc.c    | 1057 +++++++++---------
 plink_calc.h    |    2 +-
 plink_cluster.c |  661 ++++++------
 plink_cnv.c     |  174 ++-
 plink_common.c  | 3243 +++++++++++++++++++++++++++++++------------------------
 plink_common.h  | 1321 ++++++++++++----------
 plink_data.c    | 2860 ++++++++++++++++++++++++------------------------
 plink_dosage.c  |  460 ++++----
 plink_family.c  | 1011 ++++++++---------
 plink_filter.c  |  641 ++++++-----
 plink_filter.h  |    2 +-
 plink_glm.c     | 1657 ++++++++++++----------------
 plink_help.c    |   26 +-
 plink_homozyg.c |  392 +++----
 plink_lasso.c   |  141 +--
 plink_ld.c      | 2070 ++++++++++++++++++-----------------
 plink_matrix.c  |    6 +-
 plink_misc.c    | 1167 ++++++++++----------
 plink_perm.c    |  553 +++++++++-
 plink_perm.h    |   67 +-
 plink_rserve.c  |  110 +-
 plink_set.c     |  661 +++++-------
 plink_stats.c   |   12 +-
 yarn.c          |    1 -
 32 files changed, 10866 insertions(+), 10836 deletions(-)

diff --git a/Rconnection.cc b/Rconnection.cc
index ba6b2a9..23692e4 100644
--- a/Rconnection.cc
+++ b/Rconnection.cc
@@ -41,7 +41,7 @@
    -11 - operation is unsupported (e.g. unix login while crypt is not linked)
    -12 - eval didn't return a SEXP (possibly the server is too old/buggy or crashed)
  */
-#if defined __cplusplus && !defined _WIN32
+#if defined (__cplusplus) && !defined (_WIN32)
 
 #include "Rconnection.h"
 
@@ -371,7 +371,7 @@ const char **Rexp::attributeNames() {
 }
 
 void Rinteger::fix_content() {
-    if (len<0 || !data) return;
+    if (!data) return;
 #ifdef SWAPEND
     int *i = (int*) data;
     int *j = (int*) (data+len);
@@ -380,7 +380,7 @@ void Rinteger::fix_content() {
 }
 
 void Rdouble::fix_content() {
-    if (len<0 || !data) return;
+    if (!data) return;
 #ifdef SWAPEND
     double *i = (double*) data;
     double *j = (double*) (data+len);
diff --git a/Rconnection.h b/Rconnection.h
index 4367b5a..340ea7c 100644
--- a/Rconnection.h
+++ b/Rconnection.h
@@ -229,7 +229,7 @@ public:
     /*Rstring(const char *str) : Rexp(XT_STR, str, strlen(str)+1) {}*/
     
     char **strings() { return cont; }
-    char *stringAt(unsigned int i) { return (i < 0 || i >= nel) ? 0 : cont[i]; }
+    char *stringAt(unsigned int i) { return (i >= nel) ? 0 : cont[i]; }
     char *string() { return stringAt(0); }
     virtual Rsize_t length() { return nel; }
 
diff --git a/SFMT.c b/SFMT.c
index b79f2b2..5de9e89 100644
--- a/SFMT.c
+++ b/SFMT.c
@@ -53,9 +53,10 @@ inline static void do_recursion(w128_t * r, w128_t * a, w128_t * b,
 				w128_t * c, w128_t * d);
 #endif
 
-inline static void rshift128(w128_t *out,  w128_t const *in, int shift);
-inline static void lshift128(w128_t *out,  w128_t const *in, int shift);
-
+#ifndef __LP64__
+  inline static void rshift128(w128_t *out,  w128_t const *in, int shift);
+  inline static void lshift128(w128_t *out,  w128_t const *in, int shift);
+  
 /**
  * This function simulates SIMD 128-bit right shift by the standard C.
  * The 128-bit integer given in in is shifted by (shift * 8) bits.
@@ -79,6 +80,7 @@ inline static void rshift128(w128_t *out, w128_t const *in, int shift)
     out->u[3] = (uint32_t)(oh >> 32);
     out->u[2] = (uint32_t)oh;
 }
+
 /**
  * This function simulates SIMD 128-bit left shift by the standard C.
  * The 128-bit integer given in in is shifted by (shift * 8) bits.
@@ -102,6 +104,7 @@ inline static void lshift128(w128_t *out, w128_t const *in, int shift)
     out->u[3] = (uint32_t)(oh >> 32);
     out->u[2] = (uint32_t)oh;
 }
+
 /**
  * This function represents the recursion formula.
  * @param r output
@@ -110,7 +113,6 @@ inline static void lshift128(w128_t *out, w128_t const *in, int shift)
  * @param c a 128-bit part of the internal state array
  * @param d a 128-bit part of the internal state array
  */
-#ifndef __LP64__
 inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
 				w128_t *c, w128_t *d)
 {
@@ -133,8 +135,10 @@ inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
 /**
  * parameters used by sse2.
  */
+#ifdef __LP64__
 static const w128_t sse2_param_mask = {{SFMT_MSK1, SFMT_MSK2,
 					SFMT_MSK3, SFMT_MSK4}};
+#endif
 /*----------------
   STATIC FUNCTIONS
   ----------------*/
diff --git a/hfile.c b/hfile.c
index 9ab1ea9..81fffc6 100644
--- a/hfile.c
+++ b/hfile.c
@@ -486,6 +486,7 @@ typedef struct {
     size_t length, pos;
 } hFILE_mem;
 
+/*
 static ssize_t mem_read(hFILE *fpv, void *buffer, size_t nbytes)
 {
     hFILE_mem *fp = (hFILE_mem *) fpv;
@@ -529,7 +530,6 @@ static const struct hFILE_backend mem_backend =
     mem_read, NULL, mem_seek, NULL, mem_close
 };
 
-/*
 static hFILE *hopen_mem(const char *data, const char *mode)
 {
     // TODO Implement write modes, which will require memory allocation
diff --git a/khash.h b/khash.h
index 5e55088..e900842 100644
--- a/khash.h
+++ b/khash.h
@@ -194,7 +194,7 @@ static const double __ac_HASH_UPPER = 0.77;
 #define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
 	extern kh_##name##_t *kh_init_##name(void);							\
 	extern void kh_destroy_##name(kh_##name##_t *h);					\
-	extern void kh_clear_##name(kh_##name##_t *h);						\
+	/* extern void kh_clear_##name(kh_##name##_t *h); */		\
 	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
 	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
 	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
@@ -212,13 +212,15 @@ static const double __ac_HASH_UPPER = 0.77;
 			kfree(h);													\
 		}																\
 	}																	\
+	/*
 	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
 	{																	\
 		if (h && h->flags) {											\
 			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
 			h->size = h->n_occupied = 0;								\
 		}																\
-	}																	\
+	}
+*/									\
 	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
 	{																	\
 		if (h->n_buckets) {												\
diff --git a/plink.c b/plink.c
index de1f23c..1ec7bae 100644
--- a/plink.c
+++ b/plink.c
@@ -1,5 +1,5 @@
 // PLINK 1.90
-// Copyright (C) 2005-2015 Shaun Purcell, Christopher Chang
+// Copyright (C) 2005-2016 Shaun Purcell, Christopher Chang
 
 // This program is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -40,6 +40,7 @@
 #include "plink_lasso.h"
 #include "plink_ld.h"
 #include "plink_misc.h"
+#include "plink_perm.h"
 #ifdef __cplusplus
   #ifndef _WIN32
     #include "plink_rserve.h"
@@ -89,9 +90,9 @@
 // shouldn't be larger than 2^31 - 1
 #define PARALLEL_MAX 32768
 
-const char ver_str[] =
+static const char ver_str[] =
 #ifdef STABLE_BUILD
-  "PLINK v1.90b3.28"
+  "PLINK v1.90b3.31"
 #else
   "PLINK v1.90p"
 #endif
@@ -103,10 +104,10 @@ const char ver_str[] =
 #else
   " 32-bit"
 #endif
-  " (16 Dec 2015)";
-const char ver_str2[] =
+  " (3 Feb 2016)";
+static const char ver_str2[] =
   // include leading space if day < 10, so character length stays the same
-  ""
+  " "
 #ifdef STABLE_BUILD
   "" // (don't want this when version number has a trailing letter)
 #else
@@ -116,24 +117,22 @@ const char ver_str2[] =
   "  "
 #endif
   "    https://www.cog-genomics.org/plink2\n"
-  "(C) 2005-2015 Shaun Purcell, Christopher Chang   GNU General Public License v3\n";
-const char errstr_append[] = "For more information, try '" PROG_NAME_STR " --help [flag name]' or '" PROG_NAME_STR " --help | more'.\n";
+  "(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3\n";
+static const char errstr_append[] = "For more information, try '" PROG_NAME_STR " --help [flag name]' or '" PROG_NAME_STR " --help | more'.\n";
 #ifdef STABLE_BUILD
   #ifndef NOLAPACK
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,\n--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,\n--make-perm-phen [...]
+static const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,\n--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,\n--make-pe [...]
   #else
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --neighbour, --ibs-test, --regress-distance, --model,\n--bd, --gxe, --logistic, --dosage, --lasso, --test-missing, --make-perm-pheno,\n--td [...]
+static const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --neighbour, --ibs-test, --regress-distance, --model,\n--bd, --gxe, --logistic, --dosage, --lasso, --test-missing, --make-perm-pheno [...]
   #endif
 #else
   #ifndef NOLAPACK
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,\n--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,\n--make-perm-phen [...]
+static const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,\n--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,\n--make-pe [...]
   #else
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --neighbour, --ibs-test, --regress-distance, --model,\n--bd, --gxe, --logistic, --dosage, --lasso, --test-missing, --make-perm-pheno,\n--td [...]
+static const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --neighbour, --ibs-test, --regress-distance, --model,\n--bd, --gxe, --logistic, --dosage, --lasso, --test-missing, --make-perm-pheno [...]
   #endif
 #endif
 
-unsigned char* wkspace;
-
 void disp_exit_msg(int32_t retval) {
   switch (retval) {
   case RET_NOMEM:
@@ -243,7 +242,7 @@ void calc_marker_reverse_bin(uintptr_t* marker_reverse, uintptr_t* marker_exclud
     for (; marker_uidx < marker_uidx_stop; marker_uidx++) {
       dxx = set_allele_freqs[marker_uidx];
       if (dxx < 0.5) {
-	SET_BIT(marker_reverse, marker_uidx);
+	SET_BIT(marker_uidx, marker_reverse);
 	set_allele_freqs[marker_uidx] = 1.0 - dxx;
       }
     }
@@ -254,7 +253,7 @@ void swap_reversed_marker_alleles(uintptr_t unfiltered_marker_ct, uintptr_t* mar
   uintptr_t marker_uidx = 0;
   char* swap_ptr;
   while (1) {
-    next_set_ul_ck(marker_reverse, &marker_uidx, unfiltered_marker_ct);
+    next_set_ul_ck(marker_reverse, unfiltered_marker_ct, &marker_uidx);
     if (marker_uidx == unfiltered_marker_ct) {
       return;
     }
@@ -265,6 +264,10 @@ void swap_reversed_marker_alleles(uintptr_t unfiltered_marker_ct, uintptr_t* mar
   }
 }
 
+static inline int32_t bed_suffix_conflict(uint64_t calculation_type, uint32_t recode_modifier) {
+  return (calculation_type & CALC_MAKE_BED) || ((calculation_type & CALC_RECODE) && (recode_modifier & (RECODE_LGEN | RECODE_LGEN_REF | RECODE_RLIST)));
+}
+
 static inline uint32_t are_marker_pos_needed(uint64_t calculation_type, uint64_t misc_flags, char* cm_map_fname, char* set_fname, uint32_t min_bp_space, uint32_t genome_skip_write, uint32_t ld_modifier, uint32_t epi_modifier, uint32_t cluster_modifier) {
   return (calculation_type & (CALC_MAKE_BED | CALC_MAKE_BIM | CALC_RECODE | CALC_GENOME | CALC_HOMOZYG | CALC_LD_PRUNE | CALC_REGRESS_PCS | CALC_MODEL | CALC_GLM | CALC_CLUMP | CALC_BLOCKS | CALC_FLIPSCAN | CALC_TDT | CALC_QFAM | CALC_FST | CALC_SHOW_TAGS | CALC_DUPVAR | CALC_RPLUGIN)) || (misc_flags & (MISC_EXTRACT_RANGE | MISC_EXCLUDE_RANGE)) || cm_map_fname || set_fname || min_bp_space || genome_skip_write || ((calculation_type & CALC_LD) && (!(ld_modifier & LD_MATRIX_SHAPEMASK))) ||  [...]
 }
@@ -299,7 +302,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   // set_allele_freqs = .bed set bit frequency in middle of loading process, A2
   //   allele frequency later.
   double* set_allele_freqs = NULL;
-  uintptr_t topsize = 0;
   uintptr_t unfiltered_sample_ct = 0;
   uintptr_t unfiltered_sample_ct4 = 0;
   uintptr_t unfiltered_sample_ctl = 0;
@@ -354,7 +356,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   uintptr_t max_paternal_id_len = 2;
   char* maternal_ids = NULL;
   uintptr_t max_maternal_id_len = 2;
-  unsigned char* wkspace_mark = NULL;
+  unsigned char* bigstack_mark = NULL;
   uintptr_t cluster_ct = 0;
   uint32_t* cluster_map = NULL; // unfiltered sample IDs
   // index for cluster_map, length (cluster_ct + 1)
@@ -366,7 +368,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   uintptr_t* cluster_merge_prevented = NULL;
   double* cluster_sorted_ibs = NULL;
   char* cptr = NULL;
-  uint64_t dists_alloc = 0;
   double missing_phenod = (double)missing_pheno;
   double ci_zt = 0.0;
   uintptr_t bed_offset = 3;
@@ -391,9 +392,9 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   uint32_t sample_male_ct = 0;
   uint32_t sample_f_ct = 0;
   uint32_t sample_f_male_ct = 0;
-  unsigned char* wkspace_mark2 = NULL;
-  unsigned char* wkspace_mark_precluster = NULL;
-  unsigned char* wkspace_mark_postcluster = NULL;
+  unsigned char* bigstack_mark2 = NULL;
+  unsigned char* bigstack_mark_precluster = NULL;
+  unsigned char* bigstack_mark_postcluster = NULL;
   uint32_t* nchrobs = NULL;
   int32_t* hwe_lls = NULL;
   int32_t* hwe_lhs = NULL;
@@ -436,17 +437,17 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   // famname[0] is nonzero iff we're not in the --merge-list special case
   if ((calculation_type & CALC_MAKE_BED) && famname[0]) {
 #ifdef _WIN32
-    uii = GetFullPathName(bedname, FNAMESIZE, tbuf, NULL);
+    uii = GetFullPathName(bedname, FNAMESIZE, g_textbuf, NULL);
     if ((!uii) || (uii > FNAMESIZE))
 #else
-    if (!realpath(bedname, tbuf))
+    if (!realpath(bedname, g_textbuf))
 #endif
     {
       uii = strlen(bedname);
       if ((uii > 8) && ((!memcmp(&(bedname[uii - 8]), ".bed.bed", 8)) || (!memcmp(&(bedname[uii - 8]), ".bim.bed", 8)) || (!memcmp(&(bedname[uii - 8]), ".fam.bed", 8)))) {
 	LOGERRPRINTFWW("Error: Failed to open %s. (--bfile expects a filename *prefix*; '.bed', '.bim', and '.fam' are automatically appended.)\n", bedname);
       } else {
-        LOGERRPRINTFWW(errstr_fopen, bedname);
+        LOGERRPRINTFWW(g_errstr_fopen, bedname);
       }
       goto plink_ret_OPEN_FAIL;
     }
@@ -454,32 +455,32 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     // if file doesn't exist, realpath returns NULL on Linux instead of what
     // the path would be.
 #ifdef _WIN32
-    uii = GetFullPathName(outname, FNAMESIZE, &(tbuf[FNAMESIZE + 64]), NULL);
-    if (uii && (uii <= FNAMESIZE) && (!strcmp(tbuf, &(tbuf[FNAMESIZE + 64]))))
+    uii = GetFullPathName(outname, FNAMESIZE, &(g_textbuf[FNAMESIZE + 64]), NULL);
+    if (uii && (uii <= FNAMESIZE) && (!strcmp(g_textbuf, &(g_textbuf[FNAMESIZE + 64]))))
 #else
-    cptr = realpath(outname, &(tbuf[FNAMESIZE + 64]));
-    if (cptr && (!strcmp(tbuf, &(tbuf[FNAMESIZE + 64]))))
+    cptr = realpath(outname, &(g_textbuf[FNAMESIZE + 64]));
+    if (cptr && (!strcmp(g_textbuf, &(g_textbuf[FNAMESIZE + 64]))))
 #endif
     {
       logprint("Note: --make-bed input and output filenames match.  Appending '~' to input\nfilenames.\n");
       uii = strlen(bedname);
-      memcpy(tbuf, bedname, uii + 1);
+      memcpy(g_textbuf, bedname, uii + 1);
       memcpy(&(bedname[uii]), "~", 2);
-      if (rename(tbuf, bedname)) {
+      if (rename(g_textbuf, bedname)) {
 	logerrprint("Error: Failed to append '~' to input .bed filename.\n");
 	goto plink_ret_OPEN_FAIL;
       }
       uii = strlen(bimname);
-      memcpy(tbuf, bimname, uii + 1);
+      memcpy(g_textbuf, bimname, uii + 1);
       memcpy(&(bimname[uii]), "~", 2);
-      if (rename(tbuf, bimname)) {
+      if (rename(g_textbuf, bimname)) {
 	logerrprint("Error: Failed to append '~' to input .bim filename.\n");
 	goto plink_ret_OPEN_FAIL;
       }
       uii = strlen(famname);
-      memcpy(tbuf, famname, uii + 1);
+      memcpy(g_textbuf, famname, uii + 1);
       memcpy(&(famname[uii]), "~", 2);
-      if (rename(tbuf, famname)) {
+      if (rename(g_textbuf, famname)) {
 	logerrprint("Error: Failed to append '~' to input .fam filename.\n");
 	goto plink_ret_OPEN_FAIL;
       }
@@ -506,7 +507,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     memcpy(memcpya(famname, bedname, uljj), ".fam", 5);
     memcpy(memcpya(bimname, bedname, uljj), ".bim", 5);
     if ((calculation_type & CALC_MAKE_BED) && ulii) {
-      if (push_ll_str(file_delete_list_ptr, bedname) || push_ll_str(file_delete_list_ptr, famname) || push_ll_str(file_delete_list_ptr, bimname)) {
+      if (push_ll_str(bedname, file_delete_list_ptr) || push_ll_str(famname, file_delete_list_ptr) || push_ll_str(bimname, file_delete_list_ptr)) {
 	goto plink_ret_NOMEM;
       }
     }
@@ -515,13 +516,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   // don't use fopen_checked() here, since we want to customize the error
   // message.
   if (bedname[0]) {
-    bedfile = fopen(bedname, "rb");
+    bedfile = fopen(bedname, FOPEN_RB);
     if (!bedfile) {
       uii = strlen(bedname);
       if ((uii > 8) && ((!memcmp(&(bedname[uii - 8]), ".bed.bed", 8)) || (!memcmp(&(bedname[uii - 8]), ".bim.bed", 8)) || (!memcmp(&(bedname[uii - 8]), ".fam.bed", 8)))) {
 	LOGERRPRINTFWW("Error: Failed to open %s. (--bfile expects a filename *prefix*; '.bed', '.bim', and '.fam' are automatically appended.)\n", bedname);
       } else {
-	LOGERRPRINTFWW(errstr_fopen, bedname);
+	LOGERRPRINTFWW(g_errstr_fopen, bedname);
       }
       goto plink_ret_OPEN_FAIL;
     }
@@ -581,7 +582,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     }
 
     unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-    unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+    unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
 
     if (misc_flags & MISC_MAKE_FOUNDERS_FIRST) {
       if (make_founders(unfiltered_sample_ct, unfiltered_sample_ct, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, (misc_flags / MISC_MAKE_FOUNDERS_REQUIRE_2_MISSING) & 1, sample_exclude, founder_info)) {
@@ -590,12 +591,12 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     }
 
     if ((pheno_modifier & PHENO_MERGE) && pheno_all) {
-      if (aligned_malloc(&orig_pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+      if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), &orig_pheno_nm)) {
 	goto plink_ret_NOMEM;
       }
       memcpy(orig_pheno_nm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
       if (pheno_c) {
-	if (aligned_malloc(&orig_pheno_c, unfiltered_sample_ctl * sizeof(intptr_t))) {
+	if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), &orig_pheno_c)) {
 	  goto plink_ret_NOMEM;
 	}
 	memcpy(orig_pheno_c, pheno_c, unfiltered_sample_ctl * sizeof(intptr_t));
@@ -607,7 +608,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	memcpy(orig_pheno_d, pheno_d, unfiltered_sample_ct * sizeof(double));
       }
     }
-    count_genders(sex_nm, sex_male, unfiltered_sample_ct, sample_exclude, &uii, &ujj, &gender_unk_ct);
+    count_genders(sex_nm, sex_male, sample_exclude, unfiltered_sample_ct, &uii, &ujj, &gender_unk_ct);
     if (gender_unk_ct) {
       LOGPRINTF("%" PRIuPTR " %s (%u male%s, %u female%s, %u ambiguous) loaded from .fam.\n", unfiltered_sample_ct, species_str(unfiltered_sample_ct), uii, (uii == 1)? "" : "s", ujj, (ujj == 1)? "" : "s", gender_unk_ct);
       retval = write_nosex(outname, outname_end, unfiltered_sample_ct, sample_exclude, sex_nm, gender_unk_ct, sample_ids, max_sample_id_len);
@@ -623,13 +624,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	LOGPRINTF("%u phenotype value%s loaded from .fam.\n", uii, (uii == 1)? "" : "s");
       }
 
-      if (phenoname && fopen_checked(&phenofile, phenoname, "r")) {
+      if (phenoname && fopen_checked(phenoname, "r", &phenofile)) {
 	goto plink_ret_OPEN_FAIL;
       }
 
       if (phenofile || update_ids_fname || update_parents_fname || update_sex_fname || (filter_flags & FILTER_TAIL_PHENO)) {
-	wkspace_mark = wkspace_base;
-	retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+	bigstack_mark = g_bigstack_base;
+	retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, &cptr, &uiptr);
 	if (retval) {
 	  goto plink_ret_1;
 	}
@@ -645,7 +646,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	    if (retval == LOAD_PHENO_LAST_COL) {
 	      logerrprintb();
 	      retval = RET_INVALID_FORMAT;
-	      wkspace_reset(wkspace_mark);
+	      bigstack_reset(bigstack_mark);
 	    }
 	    goto plink_ret_1;
 	  }
@@ -656,13 +657,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	    goto plink_ret_1;
 	  }
 	}
-	wkspace_reset(wkspace_mark);
+	bigstack_reset(bigstack_mark);
       }
 
       if (pheno_c) {
 	/*
 	if (calculation_type & (CALC_REGRESS_PCS | CALC_REGRESS_PCS_DISTANCE)) {
-	  sprintf(logbuf, "Error: --regress-pcs%s requires a scalar phenotype.\n", (calculation_type & CALC_REGRESS_PCS_DISTANCE)? "-distance" : "");
+	  sprintf(g_logbuf, "Error: --regress-pcs%s requires a scalar phenotype.\n", (calculation_type & CALC_REGRESS_PCS_DISTANCE)? "-distance" : "");
 	  goto plink_ret_INVALID_CMDLINE_2;
 	*/
 	if (calculation_type & (CALC_REGRESS_REL | CALC_REGRESS_DISTANCE | CALC_UNRELATED_HERITABILITY | CALC_GXE)) {
@@ -759,8 +760,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     uii = update_cm || update_map || update_name || (marker_alleles_needed && (update_alleles_fname || (flip_fname && (!flip_subset_fname)))) || filter_attrib_fname || qual_filter;
     if (uii || extractname || excludename) {
       // only permit duplicate marker IDs for --extract/--exclude
-      wkspace_mark = wkspace_base;
-      retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, !uii, &marker_id_htable, &marker_id_htable_size);
+      bigstack_mark = g_bigstack_base;
+      retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, !uii, &marker_id_htable_size, &marker_id_htable);
       if (retval) {
 	goto plink_ret_1;
       }
@@ -781,8 +782,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	  goto plink_ret_1;
 	}
 	if (update_alleles_fname || (marker_alleles_needed && flip_fname && (!flip_subset_fname)) || extractname || excludename) {
-	  wkspace_reset(wkspace_mark);
-	  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+	  bigstack_reset(bigstack_mark);
+	  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
 	  if (retval) {
 	    goto plink_ret_1;
 	  }
@@ -852,7 +853,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	  goto plink_ret_1;
 	}
       }
-      wkspace_reset(wkspace_mark);
+      bigstack_reset(bigstack_mark);
     }
 
     if (allelexxxx) {
@@ -881,20 +882,20 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       goto plink_ret_INVALID_FORMAT;
     }
     rewind(bedfile);
-    uii = fread(tbuf, 1, 3, bedfile);
+    uii = fread(g_textbuf, 1, 3, bedfile);
     llyy = ((uint64_t)unfiltered_sample_ct4) * unfiltered_marker_ct;
     llzz = ((uint64_t)unfiltered_sample_ct) * ((unfiltered_marker_ct + 3) / 4);
-    if ((uii == 3) && (!memcmp(tbuf, "l\x1b\x01", 3))) {
+    if ((uii == 3) && (!memcmp(g_textbuf, "l\x1b\x01", 3))) {
       llyy += 3;
-    } else if ((uii == 3) && (!memcmp(tbuf, "l\x1b", 2))) {
+    } else if ((uii == 3) && (!memcmp(g_textbuf, "l\x1b", 3))) {
       // v1.00 sample-major
       llyy = llzz + 3;
       bed_offset = 2;
-    } else if (uii && (*tbuf == '\x01')) {
+    } else if (uii && (*g_textbuf == '\x01')) {
       // v0.99 SNP-major
       llyy += 1;
       bed_offset = 1;
-    } else if (uii && (!(*tbuf))) {
+    } else if (uii && (!(*g_textbuf))) {
       // v0.99 sample-major
       llyy = llzz + 1;
       bed_offset = 2;
@@ -910,11 +911,11 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       bed_offset = 2;
     }
     if (llxx != llyy) {
-      if ((*tbuf == '#') || ((uii == 3) && (!memcmp(tbuf, "chr", 3)))) {
+      if ((*g_textbuf == '#') || ((uii == 3) && (!memcmp(g_textbuf, "chr", 3)))) {
 	logerrprint("Error: Invalid header bytes in PLINK 1 .bed file.  (Is this a UCSC Genome\nBrowser BED file instead?)\n");
 	goto plink_ret_INVALID_FORMAT;
       } else {
-	sprintf(logbuf, "Error: Invalid .bed file size (expected %" PRId64 " bytes).\n", llyy);
+	sprintf(g_logbuf, "Error: Invalid .bed file size (expected %" PRId64 " bytes).\n", llyy);
 	goto plink_ret_INVALID_FORMAT_2;
       }
     }
@@ -929,7 +930,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       }
       LOGPRINTFWW("Variant-major .bed written to %s .\n", outname);
       strcpy(bedname, outname);
-      if (fopen_checked(&bedfile, bedname, "rb")) {
+      if (fopen_checked(bedname, FOPEN_RB, &bedfile)) {
 	goto plink_ret_OPEN_FAIL;
       }
       bed_offset = 3;
@@ -937,8 +938,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   }
 
   if (unfiltered_sample_ct && (update_ids_fname || update_parents_fname || update_sex_fname || keepname || keepfamname || removename || removefamname || filter_attrib_sample_fname || om_ip->marker_fname || filtername)) {
-    wkspace_mark = wkspace_base;
-    retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+    bigstack_mark = g_bigstack_base;
+    retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, &cptr, &uiptr);
     if (retval) {
       goto plink_ret_1;
     }
@@ -947,8 +948,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       if (retval) {
 	goto plink_ret_1;
       }
-      wkspace_reset(wkspace_base);
-      retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+      bigstack_reset(g_bigstack_base);
+      retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, &cptr, &uiptr);
       if (retval) {
 	goto plink_ret_1;
       }
@@ -1021,7 +1022,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	goto plink_ret_1;
       }
     }
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
 
   if (famname[0] && (unfiltered_sample_ct != sample_exclude_ct)) {
@@ -1036,7 +1037,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	} else {
 	  // either --must-have-sex without --allow-no-sex, or no data
 	  // generation command
-	  bitfield_and(pheno_nm, sex_nm, unfiltered_sample_ctl);
+	  bitvec_and(sex_nm, unfiltered_sample_ctl, pheno_nm);
 	}
       }
       if (uii || pheno_all || loop_assoc_fname) {
@@ -1044,8 +1045,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       }
     }
     if (filter_flags & FILTER_PRUNE) {
-      bitfield_ornot(sample_exclude, pheno_nm, unfiltered_sample_ctl);
-      zero_trailing_bits(sample_exclude, unfiltered_sample_ct);
+      bitvec_ornot(pheno_nm, unfiltered_sample_ctl, sample_exclude);
+      zero_trailing_bits(unfiltered_sample_ct, sample_exclude);
       sample_exclude_ct = popcount_longs(sample_exclude, unfiltered_sample_ctl);
       if ((sample_exclude_ct == unfiltered_sample_ct) && (!allow_no_samples)) {
 	LOGERRPRINTF("Error: All %s removed by --prune.\n", g_species_plural);
@@ -1110,7 +1111,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       }
     }
     if (cluster_ptr->fname || (misc_flags & MISC_FAMILY_CLUSTERS)) {
-      // could save off wkspace_mark here and free immediately after
+      // could save off bigstack_mark here and free immediately after
       // load_clusters(), if clusters are *only* used for filtering.  But not a
       // big deal.
       retval = load_clusters(cluster_ptr->fname, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, mwithin_col, (misc_flags / MISC_LOAD_CLUSTER_KEEP_NA) & 1, &cluster_ct, &cluster_map, &cluster_starts, &cluster_ids, &max_cluster_id_len, cluster_ptr->keep_fname, cluster_ptr->keep_flattened, cluster_ptr->remove_fname, cluster_ptr->remove_flattened, allow_no_samples);
@@ -1126,13 +1127,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       goto plink_ret_ALL_SAMPLES_EXCLUDED;
     }
 
-    if ((sample_ct < 2) && (relationship_or_ibc_req(calculation_type) || distance_req(calculation_type, read_dists_fname) || (calculation_type & (CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
-      sprintf(logbuf, "Error: At least 2 %s required for pairwise analysis.\n", g_species_plural);
+    if ((sample_ct < 2) && (relationship_or_ibc_req(calculation_type) || distance_req(read_dists_fname, calculation_type) || (calculation_type & (CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
+      sprintf(g_logbuf, "Error: At least 2 %s required for pairwise analysis.\n", g_species_plural);
       goto plink_ret_INVALID_CMDLINE_2;
     }
 
     if ((parallel_tot > 1) && (calculation_type & (CALC_DISTANCE | CALC_GENOME | CALC_RELATIONSHIP)) && (parallel_tot > sample_ct / 2)) {
-      sprintf(logbuf, "Error: Too many --parallel jobs (maximum %" PRIuPTR "/2 = %" PRIuPTR ").\n", sample_ct, sample_ct / 2);
+      sprintf(g_logbuf, "Error: Too many --parallel jobs (maximum %" PRIuPTR "/2 = %" PRIuPTR ").\n", sample_ct, sample_ct / 2);
       goto plink_ret_INVALID_CMDLINE_2;
     }
   }
@@ -1179,12 +1180,12 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     }
 
     if (sample_sort & (SAMPLE_SORT_NATURAL | SAMPLE_SORT_ASCII)) {
-      retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, (sample_sort & SAMPLE_SORT_NATURAL)? strcmp_natural_deref : strcmp_deref);
+      retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, (sample_sort & SAMPLE_SORT_NATURAL)? strcmp_natural_deref : strcmp_deref, &cptr, &uiptr);
       if (retval) {
 	goto plink_ret_1;
       }
       sample_sort_map = uiptr;
-      wkspace_reset(cptr);
+      bigstack_reset(cptr);
     } else if (sample_sort == SAMPLE_SORT_FILE) {
       retval = sample_sort_file_map(sample_sort_fname, unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_exclude_ct, sample_ids, max_sample_id_len, &sample_sort_map);
       if (retval) {
@@ -1221,16 +1222,16 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       }
     }
 
-    bitfield_andnot(pheno_nm, sample_exclude, unfiltered_sample_ctl);
+    bitvec_andnot(sample_exclude, unfiltered_sample_ctl, pheno_nm);
     if (pheno_c) {
-      bitfield_and(pheno_c, pheno_nm, unfiltered_sample_ctl);
+      bitvec_and(pheno_nm, unfiltered_sample_ctl, pheno_c);
     }
-    bitfield_andnot(founder_info, sample_exclude, unfiltered_sample_ctl);
-    bitfield_andnot(sex_nm, sample_exclude, unfiltered_sample_ctl);
+    bitvec_andnot(sample_exclude, unfiltered_sample_ctl, founder_info);
+    bitvec_andnot(sample_exclude, unfiltered_sample_ctl, sex_nm);
     if (gender_unk_ct) {
       gender_unk_ct = sample_ct - popcount_longs(sex_nm, unfiltered_sample_ctl);
     }
-    bitfield_and(sex_male, sex_nm, unfiltered_sample_ctl);
+    bitvec_and(sex_nm, unfiltered_sample_ctl, sex_male);
 
     pheno_nm_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
     if (!pheno_nm_ct) {
@@ -1250,13 +1251,12 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 
   if (bimname[0] && (unfiltered_marker_ct != marker_exclude_ct)) {
     plink_maxsnp = calc_plink_maxsnp(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len);
-    uii = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
-    if (wkspace_alloc_ul_checked(&marker_reverse, uii * sizeof(intptr_t))) {
+    uii = BITCT_TO_WORDCT(unfiltered_marker_ct);
+    if (bigstack_calloc_ul(uii, &marker_reverse)) {
       goto plink_ret_NOMEM;
     }
-    fill_ulong_zero(marker_reverse, uii);
     if (bedfile && sample_ct) {
-      retval = calc_freqs_and_hwe(bedfile, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, founder_info, nonfounders, (misc_flags / MISC_MAF_SUCC) & 1, set_allele_freqs, bed_offset, (hwe_thresh > 0.0) || (calculation_type & CALC_HARDY), hwe_modifier & HWE_THRESH_ALL, (pheno_nm_ct && pheno_c)? ((calculation_type / CALC [...]
+      retval = calc_freqs_and_hwe(bedfile, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, founder_info, nonfounders, (misc_flags / MISC_MAF_SUCC) & 1, set_allele_freqs, bed_offset, (hwe_thresh > 0.0) || (calculation_type & CALC_HARDY), hwe_modifier & HWE_THRESH_ALL, (pheno_nm_ct && pheno_c)? ((calculation_type / CALC [...]
       if (retval) {
 	goto plink_ret_1;
       }
@@ -1315,8 +1315,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 
       if (geno_excl_bitfield) {
 	ulii = marker_exclude_ct;
-	uljj = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
-	bitfield_or(marker_exclude, geno_excl_bitfield, uljj);
+	uljj = BITCT_TO_WORDCT(unfiltered_marker_ct);
+	bitvec_or(geno_excl_bitfield, uljj, marker_exclude);
 	marker_exclude_ct = popcount_longs(marker_exclude, uljj);
 	if ((marker_exclude_ct == unfiltered_marker_ct) && (!allow_no_variants)) {
 	  logerrprint("Error: All variants excluded due to missing genotype data (--geno).\n");
@@ -1362,20 +1362,20 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	if (fam_ip->mendel_modifier & MENDEL_FILTER) {
 	  // gah
 	  sample_ct = unfiltered_sample_ct - sample_exclude_ct;
-	  bitfield_andnot(founder_info, sample_exclude, unfiltered_sample_ctl);
-	  bitfield_andnot(sex_nm, sample_exclude, unfiltered_sample_ctl);
-	  bitfield_and(sex_male, sex_nm, unfiltered_sample_ctl);
+	  bitvec_andnot(sample_exclude, unfiltered_sample_ctl, founder_info);
+	  bitvec_andnot(sample_exclude, unfiltered_sample_ctl, sex_nm);
+	  bitvec_and(sex_nm, unfiltered_sample_ctl, sex_male);
 	  if (pheno_nm_ct) {
-	    bitfield_andnot(pheno_nm, sample_exclude, unfiltered_sample_ctl);
+	    bitvec_andnot(sample_exclude, unfiltered_sample_ctl, pheno_nm);
 	    pheno_nm_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
 	    if (pheno_c) {
-	      bitfield_and(pheno_c, pheno_nm, unfiltered_sample_ctl);
+	      bitvec_and(pheno_nm, unfiltered_sample_ctl, pheno_c);
 	      pheno_ctrl_ct = pheno_nm_ct - popcount_longs(pheno_c, unfiltered_sample_ctl);
 	    }
 	  }
 	}
       }
-      wkspace_reset(hwe_lls);
+      bigstack_reset(hwe_lls);
     }
     if (sip->fname) {
       if (map_is_unsorted & UNSORTED_BP) {
@@ -1413,11 +1413,11 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       logprint("Note: No phenotypes present.\n");
     } else if (pheno_c) {
       if (pheno_nm_ct != sample_ct) {
-	sprintf(logbuf, "Among remaining phenotypes, %u %s and %u %s.  (%" PRIuPTR " phenotype%s missing.)\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls", sample_ct - pheno_nm_ct, (sample_ct - pheno_nm_ct == 1)? " is" : "s are");
+	sprintf(g_logbuf, "Among remaining phenotypes, %u %s and %u %s.  (%" PRIuPTR " phenotype%s missing.)\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls", sample_ct - pheno_nm_ct, (sample_ct - pheno_nm_ct == 1)? " is" : "s are");
       } else {
-	sprintf(logbuf, "Among remaining phenotypes, %u %s and %u %s.\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls");
+	sprintf(g_logbuf, "Among remaining phenotypes, %u %s and %u %s.\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls");
       }
-      wordwrap(logbuf, 0);
+      wordwrapb(0);
       logprintb();
     } else {
       logprint("Phenotype data is quantitative.\n");
@@ -1450,22 +1450,22 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	}
 	if ((!pca_sample_exclude) && (sample_ct != unfiltered_sample_ct + sample_exclude_ct)) {
 	  sample_ct = unfiltered_sample_ct - sample_exclude_ct;
-	  if ((sample_ct < 2) && (distance_req(calculation_type, read_dists_fname) || (calculation_type & (CALC_REGRESS_REL | CALC_PCA | CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
+	  if ((sample_ct < 2) && (distance_req(read_dists_fname, calculation_type) || (calculation_type & (CALC_REGRESS_REL | CALC_PCA | CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
 	    // pathological case
-	    sprintf(logbuf, "Error: Too many %s pruned for additional pairwise analysis steps.\n", g_species_plural);
+	    sprintf(g_logbuf, "Error: Too many %s pruned for additional pairwise analysis steps.\n", g_species_plural);
 	    goto plink_ret_INVALID_CMDLINE_2;
 	  }
 	}
 	if (calculation_type & CALC_REL_CUTOFF) {
 	  // ugh, probably better to just stop supporting this
-	  bitfield_andnot(founder_info, sample_exclude, unfiltered_sample_ctl);
-	  bitfield_andnot(sex_nm, sample_exclude, unfiltered_sample_ctl);
-	  bitfield_and(sex_male, sex_nm, unfiltered_sample_ctl);
+	  bitvec_andnot(sample_exclude, unfiltered_sample_ctl, founder_info);
+	  bitvec_andnot(sample_exclude, unfiltered_sample_ctl, sex_nm);
+	  bitvec_and(sex_nm, unfiltered_sample_ctl, sex_male);
 	  if (pheno_nm_ct) {
-	    bitfield_andnot(pheno_nm, sample_exclude, unfiltered_sample_ctl);
+	    bitvec_andnot(sample_exclude, unfiltered_sample_ctl, pheno_nm);
 	    pheno_nm_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
 	    if (pheno_c) {
-	      bitfield_and(pheno_c, pheno_nm, unfiltered_sample_ctl);
+	      bitvec_and(pheno_nm, unfiltered_sample_ctl, pheno_c);
 	      pheno_ctrl_ct = pheno_nm_ct - popcount_longs(pheno_c, unfiltered_sample_ctl);
 	    }
 	  }
@@ -1488,7 +1488,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	  retval = calc_unrelated_herit(calculation_type, relip, unfiltered_sample_ct, sample_exclude, sample_ct, pheno_d, rel_ibc);
 	}
 #endif
-	wkspace_reset(g_sample_missing_unwt);
+	bigstack_reset(g_sample_missing_unwt);
 	if (retval) {
 	  goto plink_ret_1;
 	}
@@ -1564,11 +1564,11 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 
   if (calculation_type & (CALC_WRITE_COVAR | CALC_MAKE_BED | CALC_MAKE_BIM | CALC_MAKE_FAM | CALC_RECODE)) {
     if (gender_unk_ct && (sex_missing_pheno & MUST_HAVE_SEX)) {
-      if (aligned_malloc(&pheno_nm_datagen, unfiltered_sample_ctl * sizeof(intptr_t))) {
+      if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), &pheno_nm_datagen)) {
 	goto plink_ret_NOMEM;
       }
       memcpy(pheno_nm_datagen, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
-      bitfield_and(pheno_nm_datagen, sex_nm, unfiltered_sample_ctl);
+      bitvec_and(sex_nm, unfiltered_sample_ctl, pheno_nm_datagen);
     }
     if (covar_ct && (calculation_type & (CALC_WRITE_COVAR | CALC_MAKE_BED | CALC_MAKE_FAM | CALC_RECODE)) && sample_ct) {
       retval = write_covars(outname, outname_end, write_covar_modifier, write_covar_dummy_max_categories, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, sex_nm, sex_male, pheno_nm_datagen? pheno_nm_datagen : pheno_nm, pheno_c, pheno_d, missing_phenod, output_missing_pheno, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d);
@@ -1703,7 +1703,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   // load at far end of stack to make this workable...)
 
   if (calculation_type & (CALC_CLUSTER | CALC_NEIGHBOR)) {
-    wkspace_mark_postcluster = wkspace_base;
+    bigstack_mark_postcluster = g_bigstack_base;
     ulii = (sample_ct * (sample_ct - 1)) >> 1;
     if (cluster_ptr->mds_dim_ct) {
 #ifndef __LP64__
@@ -1714,12 +1714,12 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 #endif
       if (((!read_dists_fname) && (!read_genome_fname)) || (cluster_ptr->modifier & CLUSTER_MISSING)) {
 	if ((!(cluster_ptr->modifier & CLUSTER_MDS)) || (!cluster_ct)) {
-          if (wkspace_alloc_d_checked(&mds_plot_dmatrix_copy, ulii * sizeof(double))) {
+          if (bigstack_alloc_d(ulii, &mds_plot_dmatrix_copy)) {
             goto plink_ret_NOMEM;
           }
 	} else {
 	  ulii = cluster_ct + sample_ct - cluster_starts[cluster_ct];
-          if (wkspace_alloc_d_checked(&mds_plot_dmatrix_copy, (ulii * (ulii - 1)) * (sizeof(double) / 2))) {
+          if (bigstack_alloc_d((ulii * (ulii - 1)) / 2, &mds_plot_dmatrix_copy)) {
             goto plink_ret_NOMEM;
           }
 	}
@@ -1739,11 +1739,11 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       goto plink_ret_NOMEM;
 #endif
     }
-    if (wkspace_alloc_ul_checked(&cluster_merge_prevented, ((ulii + (BITCT - 1)) / BITCT) * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(BITCT_TO_WORDCT(ulii), &cluster_merge_prevented)) {
       goto plink_ret_NOMEM;
     }
     if (cluster_ct || (calculation_type & CALC_GENOME) || genome_skip_write) {
-      if (wkspace_alloc_d_checked(&cluster_sorted_ibs, ulii * sizeof(double))) {
+      if (bigstack_alloc_d(ulii, &cluster_sorted_ibs)) {
 	goto plink_ret_NOMEM;
       }
       if (cluster_ptr->modifier & CLUSTER_GROUP_AVG) {
@@ -1754,10 +1754,10 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	}
       }
     }
-    wkspace_mark_precluster = wkspace_base;
+    bigstack_mark_precluster = g_bigstack_base;
   }
 
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
 
   /*
   if (calculation_type & CALC_REGRESS_PCS_DISTANCE) {
@@ -1766,7 +1766,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     goto plink_ret_1;
   }
   */
-  if (distance_req(calculation_type, read_dists_fname)) {
+  if (distance_req(read_dists_fname, calculation_type)) {
     retval = calc_distance(threads, parallel_idx, parallel_tot, bedfile, bed_offset, outname, outname_end, read_dists_fname, distance_wts_fname, distance_exp, calculation_type, dist_calc_type, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr);
     if (retval) {
       goto plink_ret_1;
@@ -1777,8 +1777,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     // use delayed and specialized load for --cluster/--neighbour, since PPC
     // values may be needed, and user may want to process a distance matrix too
     // large to be loaded in memory by doing some pre-clustering
-    dists_alloc = (sample_ct * (sample_ct - 1)) * (sizeof(double) / 2);
-    if (wkspace_alloc_d_checked(&g_dists, dists_alloc)) {
+    if (bigstack_alloc_d((sample_ct * (sample_ct - 1)) / 2, &g_dists)) {
       goto plink_ret_NOMEM;
     }
     retval = read_dists(read_dists_fname, read_dists_id_fname, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, 0, NULL, NULL, 0, 0, g_dists, 0, NULL, NULL);
@@ -1816,12 +1815,12 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   }
 
   if (read_dists_fname && (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE))) {
-    wkspace_reset(g_dists);
+    bigstack_reset(g_dists);
     g_dists = NULL;
   }
 
   if ((calculation_type & CALC_GENOME) || genome_skip_write) {
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
     g_dists = NULL;
     retval = calc_genome(threads, bedfile, bed_offset, marker_ct, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_pos, set_allele_freqs, nchrobs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, founder_info, parallel_idx, parallel_tot, outname, outname_end, nonfounders, calculation_type, genome_modifier, ppc_gap, genome_min_pi_hat, genome_max_pi_h [...]
     if (retval) {
@@ -1844,7 +1843,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   }
 
   if (calculation_type & (CALC_CLUSTER | CALC_NEIGHBOR)) {
-    retval = calc_cluster_neighbor(threads, bedfile, bed_offset, marker_ct, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, read_dists_fname, read_dists_id_fname, read_genome_fname, outname, outname_end, calculation_type, cluster_ct, cluster_map, cluster_starts, cluster_ptr, missing_pheno, neighbor_n1, neighbor_n2, ppc_gap, pheno_c, mds_plot_dmatrix_copy, cl [...]
+    retval = calc_cluster_neighbor(threads, bedfile, bed_offset, marker_ct, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, read_dists_fname, read_dists_id_fname, read_genome_fname, outname, outname_end, calculation_type, cluster_ct, cluster_map, cluster_starts, cluster_ptr, missing_pheno, neighbor_n1, neighbor_n2, ppc_gap, pheno_c, mds_plot_dmatrix_copy, cl [...]
     if (retval) {
       goto plink_ret_1;
     }
@@ -1896,13 +1895,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	pheno_d = NULL;
       }
       if (!pheno_c) {
-	if (aligned_malloc(&pheno_c, unfiltered_sample_ctl * sizeof(intptr_t))) {
+	if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), &pheno_c)) {
 	  goto plink_ret_NOMEM;
 	}
       }
     } else {
-      wkspace_mark = wkspace_base;
-      retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+      bigstack_mark = g_bigstack_base;
+      retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, &cptr, &uiptr);
       if (retval) {
 	goto plink_ret_1;
       }
@@ -1916,7 +1915,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	fill_ulong_zero(pheno_c, unfiltered_sample_ctl);
 	ukk = cluster_starts[uii + 1];
 	for (ujj = cluster_starts[uii]; ujj < ukk; ujj++) {
-	  SET_BIT(pheno_c, cluster_map[ujj]);
+	  SET_BIT(cluster_map[ujj], pheno_c);
 	}
 	uii++;
       } else {
@@ -1927,7 +1926,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	    if (!pheno_c) {
 	      free(pheno_d);
 	      pheno_d = NULL;
-	      if (aligned_malloc(&pheno_c, unfiltered_sample_ctl * sizeof(intptr_t))) {
+	      if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), &pheno_c)) {
 		goto plink_ret_NOMEM;
 	      }
 	    }
@@ -1949,15 +1948,15 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	outname_end[1] = '\0';
 	retval = load_pheno(phenofile, unfiltered_sample_ct, sample_exclude_ct, cptr, max_sample_id_len, uiptr, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, uii, NULL, pheno_nm, &pheno_c, &pheno_d, &(outname_end[1]), (uintptr_t)((&(outname[FNAMESIZE - 32])) - outname_end));
 	if (retval == LOAD_PHENO_LAST_COL) {
-	  wkspace_reset(wkspace_mark);
+	  bigstack_reset(bigstack_mark);
 	  retval = 0; // exit code bugfix
 	  break;
 	} else if (retval) {
 	  goto plink_ret_1;
 	}
-	bitfield_andnot(pheno_nm, sample_exclude, unfiltered_sample_ctl);
+	bitvec_andnot(sample_exclude, unfiltered_sample_ctl, pheno_nm);
 	if (gender_unk_ct && (!(sex_missing_pheno & ALLOW_NO_SEX))) {
-	  bitfield_and(pheno_nm, sex_nm, unfiltered_sample_ctl);
+	  bitvec_and(sex_nm, unfiltered_sample_ctl, pheno_nm);
 	}
 	pheno_nm_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
 	if (!pheno_nm_ct) {
@@ -1965,14 +1964,14 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	}
 	if (!outname_end[1]) {
 	  outname_end[1] = 'P';
-	  outname_end2 = uint32_write(&(outname_end[2]), uii);
+	  outname_end2 = uint32toa(uii, &(outname_end[2]));
 	} else {
           outname_end2 = (char*)memchr(&(outname_end[1]), '\0', FNAMESIZE);
 	}
       }
       *outname_end2 = '\0';
       if (pheno_c) {
-	bitfield_and(pheno_c, pheno_nm, unfiltered_sample_ctl);
+	bitvec_and(pheno_nm, unfiltered_sample_ctl, pheno_c);
         ujj = popcount_longs(pheno_c, unfiltered_sample_ctl);
 	ukk = pheno_nm_ct - ujj;
 	ulii = unfiltered_sample_ct - sample_exclude_ct - pheno_nm_ct;
@@ -2126,9 +2125,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     break;
   }
  plink_ret_1:
-  if (topsize) {
-    wkspace_left += topsize;
-  }
   aligned_free_cond(pheno_nm_datagen);
   free_cond(orig_pheno_d);
   aligned_free_cond(orig_pheno_c);
@@ -2183,16 +2179,16 @@ uint32_t param_count(int32_t argc, char** argv, int32_t flag_idx) {
 int32_t enforce_param_ct_range(uint32_t param_ct, char* flag_name, uint32_t min_ct, uint32_t max_ct) {
   if (param_ct > max_ct) {
     if (max_ct > min_ct) {
-      sprintf(logbuf, "Error: %s accepts at most %u parameter%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
+      sprintf(g_logbuf, "Error: %s accepts at most %u parameter%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
     } else {
-      sprintf(logbuf, "Error: %s only accepts %u parameter%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
+      sprintf(g_logbuf, "Error: %s only accepts %u parameter%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
     }
     return -1;
   } else if (param_ct < min_ct) {
     if (min_ct == 1) {
-      sprintf(logbuf, "Error: Missing %s parameter.\n", flag_name);
+      sprintf(g_logbuf, "Error: Missing %s parameter.\n", flag_name);
     } else {
-      sprintf(logbuf, "Error: %s requires %s%u parameters.\n", flag_name, (min_ct < max_ct)? "at least " : "", min_ct);
+      sprintf(g_logbuf, "Error: %s requires %s%u parameters.\n", flag_name, (min_ct < max_ct)? "at least " : "", min_ct);
     }
     return -1;
   }
@@ -2273,7 +2269,7 @@ int32_t parse_chrom_ranges(uint32_t param_ct, char range_delim, char** argv, uin
     cur_arg_ptr = argv[1];
     while (1) {
       if (parse_next_range(param_ct, range_delim, argv, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len)) {
-	sprintf(logbuf, "Error: Invalid --%s parameter '%s'.\n", cur_flag_str, argv[cur_param_idx]);
+	sprintf(g_logbuf, "Error: Invalid --%s parameter '%s'.\n", cur_flag_str, argv[cur_param_idx]);
 	goto parse_chrom_ranges_ret_INVALID_CMDLINE_WWA;
       }
       if (!range_start) {
@@ -2283,12 +2279,12 @@ int32_t parse_chrom_ranges(uint32_t param_ct, char range_delim, char** argv, uin
       if (chrom_code_start < 0) {
 	range_start[rs_len] = '\0';
 	if (!allow_extra_chroms) {
-	  sprintf(logbuf, "Error: Invalid --%s chromosome code '%s'.\n", cur_flag_str, range_start);
+	  sprintf(g_logbuf, "Error: Invalid --%s chromosome code '%s'.\n", cur_flag_str, range_start);
 	  goto parse_chrom_ranges_ret_INVALID_CMDLINE_WWA;
 	} else if (range_end) {
 	  goto parse_chrom_ranges_ret_INVALID_CMDLINE_NONSTD;
 	}
-        if (push_ll_str(&(chrom_info_ptr->incl_excl_name_stack), range_start)) {
+        if (push_ll_str(range_start, &(chrom_info_ptr->incl_excl_name_stack))) {
 	  goto parse_chrom_ranges_ret_NOMEM;
 	}
       } else if (range_end) {
@@ -2296,7 +2292,7 @@ int32_t parse_chrom_ranges(uint32_t param_ct, char range_delim, char** argv, uin
 	if (chrom_code_end < 0) {
 	  if (!allow_extra_chroms) {
 	    range_end[re_len] = '\0';
-	    sprintf(logbuf, "Error: Invalid --%s chromosome code '%s'.\n", cur_flag_str, range_end);
+	    sprintf(g_logbuf, "Error: Invalid --%s chromosome code '%s'.\n", cur_flag_str, range_end);
 	    goto parse_chrom_ranges_ret_INVALID_CMDLINE_WWA;
 	  } else {
 	    goto parse_chrom_ranges_ret_INVALID_CMDLINE_NONSTD;
@@ -2305,12 +2301,12 @@ int32_t parse_chrom_ranges(uint32_t param_ct, char range_delim, char** argv, uin
         if (chrom_code_end <= chrom_code_start) {
 	  range_start[rs_len] = '\0';
 	  range_end[re_len] = '\0';
-	  sprintf(logbuf, "Error: --%s chromosome code '%s' is not greater than '%s'.\n", cur_flag_str, range_end, range_start);
+	  sprintf(g_logbuf, "Error: --%s chromosome code '%s' is not greater than '%s'.\n", cur_flag_str, range_end, range_start);
 	  goto parse_chrom_ranges_ret_INVALID_CMDLINE_WWA;
 	}
-	fill_bits(chrom_mask, chrom_code_start, chrom_code_end + 1 - chrom_code_start);
+	fill_bits(chrom_code_start, chrom_code_end + 1 - chrom_code_start, chrom_mask);
       } else {
-        set_bit(chrom_mask, chrom_code_start);
+        set_bit(chrom_code_start, chrom_mask);
       }
       argct++;
     }
@@ -2328,7 +2324,7 @@ int32_t parse_chrom_ranges(uint32_t param_ct, char range_delim, char** argv, uin
     retval = RET_INVALID_CMDLINE;
     break;
   parse_chrom_ranges_ret_INVALID_CMDLINE_WWA:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logerrprintb();
     logerrprint(errstr_append);
     retval = RET_INVALID_CMDLINE;
@@ -2490,59 +2486,59 @@ int32_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t
     print_ver();
     goto rerun_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  if (!fgets(tbuf, MAXLINELEN, rerunfile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  if (!fgets(g_textbuf, MAXLINELEN, rerunfile)) {
     print_ver();
     fflush(stdout);
     fputs("Error: Empty log file for --rerun.\n", stderr);
     goto rerun_ret_INVALID_FORMAT;
   }
-  if (!tbuf[MAXLINELEN - 1]) {
+  if (!g_textbuf[MAXLINELEN - 1]) {
     goto rerun_ret_LONG_LINE;
   }
-  if (!fgets(tbuf, MAXLINELEN, rerunfile)) {
+  if (!fgets(g_textbuf, MAXLINELEN, rerunfile)) {
     print_ver();
     fflush(stdout);
     fputs("Error: Only one line in --rerun log file.\n", stderr);
     goto rerun_ret_INVALID_FORMAT;
   }
   line_idx++;
-  if (!tbuf[MAXLINELEN - 1]) {
+  if (!g_textbuf[MAXLINELEN - 1]) {
     goto rerun_ret_LONG_LINE;
   }
-  if ((tbuf[0] >= '0') && (tbuf[0] <= '9')) {
+  if ((g_textbuf[0] >= '0') && (g_textbuf[0] <= '9')) {
     // Old "xx arguments: --aa bb --cc --dd" format
     fclose_null(&rerunfile);
-    if (scan_posint_capped(tbuf, &loaded_arg_ct, (MAXLINELEN / 2) / 10, (MAXLINELEN / 2) % 10)) {
+    if (scan_posint_capped(g_textbuf, (MAXLINELEN / 2) / 10, (MAXLINELEN / 2) % 10, &loaded_arg_ct)) {
       print_ver();
       fflush(stdout);
       fputs("Error: Invalid argument count on line 2 of --rerun log file.\n", stderr);
       goto rerun_ret_INVALID_FORMAT;
     }
-    line_byte_ct = strlen(tbuf) + 1;
+    line_byte_ct = strlen(g_textbuf) + 1;
     rerun_buf = (char*)malloc(line_byte_ct);
     if (!rerun_buf) {
       goto rerun_ret_NOMEM;
     }
     *rerun_buf_ptr = rerun_buf;
-    memcpy(rerun_buf, tbuf, line_byte_ct);
+    memcpy(rerun_buf, g_textbuf, line_byte_ct);
     // skip "xx arguments: ", to get to the first flag
     rerun_start_ptr = next_token_mult(rerun_buf, 2);
   } else {
     // Current, and also PLINK 1.07, "Options in effect:"
-    while (memcmp(tbuf, "Options in effect:", 18) || (tbuf[18] >= ' ')) {
+    while (memcmp(g_textbuf, "Options in effect:", 18) || (g_textbuf[18] >= ' ')) {
       line_idx++;
-      if (!fgets(tbuf, MAXLINELEN, rerunfile)) {
+      if (!fgets(g_textbuf, MAXLINELEN, rerunfile)) {
 	print_ver();
 	fflush(stdout);
 	fputs("Error: Invalid log file for --rerun.\n", stderr);
 	goto rerun_ret_INVALID_FORMAT;
       }
     }
-    load_ptr = tbuf;
+    load_ptr = g_textbuf;
     loaded_arg_ct = 0;
-    // We load each of the option lines in sequence into tbuf, always
-    // overwriting the previous line's newline.  (Note that tbuf[] has
+    // We load each of the option lines in sequence into g_textbuf, always
+    // overwriting the previous line's newline.  (Note that g_textbuf[] has
     // size > 2 * MAXLINELEN; this lets us avoid additional dynamic memory
     // allocation as long as we impose the constraint that all lines combined
     // add up to less than MAXLINELEN bytes.)
@@ -2552,7 +2548,7 @@ int32_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t
 	break;
       }
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
+      if (!g_textbuf[MAXLINELEN - 1]) {
 	goto rerun_ret_LONG_LINE;
       }
       sptr = skip_initial_spaces(load_ptr);
@@ -2566,7 +2562,7 @@ int32_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t
         sptr = skip_initial_spaces(argptr);
       } while (!is_eoln_kns(*sptr));
       load_ptr = argptr;
-      if (load_ptr >= &(tbuf[MAXLINELEN])) {
+      if (load_ptr >= &(g_textbuf[MAXLINELEN])) {
 	print_ver();
 	fflush(stdout);
 	fputs("Error: --rerun argument sequence too long.\n", stderr);
@@ -2574,19 +2570,19 @@ int32_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t
       }
     }
     fclose_null(&rerunfile);
-    line_byte_ct = 1 + (uintptr_t)(load_ptr - tbuf);
+    line_byte_ct = 1 + (uintptr_t)(load_ptr - g_textbuf);
     rerun_buf = (char*)malloc(line_byte_ct);
     if (!rerun_buf) {
       goto rerun_ret_NOMEM;
     }
-    rerun_buf = (char*)malloc(1 + ((uintptr_t)(load_ptr - tbuf)));
-    memcpy(rerun_buf, tbuf, line_byte_ct);
+    rerun_buf = (char*)malloc(1 + ((uintptr_t)(load_ptr - g_textbuf)));
+    memcpy(rerun_buf, g_textbuf, line_byte_ct);
     rerun_start_ptr = skip_initial_spaces(rerun_buf);
   }
   sptr = rerun_start_ptr;
 
-  // now use tbuf as a lame bitfield
-  memset(tbuf, 1, loaded_arg_ct);
+  // now use g_textbuf as a lame bitfield
+  memset(g_textbuf, 1, loaded_arg_ct);
   loaded_arg_idx = 0;
   duplicate_ct = 0;
   do {
@@ -2613,7 +2609,7 @@ int32_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t
 	// matching flag, override --rerun
 	do {
 	  duplicate_ct++;
-	  tbuf[loaded_arg_idx++] = 0;
+	  g_textbuf[loaded_arg_idx++] = 0;
 	  if (loaded_arg_idx == loaded_arg_ct) {
 	    break;
 	  }
@@ -2638,7 +2634,7 @@ int32_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t
   }
   sptr = rerun_start_ptr;
   for (loaded_arg_idx = 0; loaded_arg_idx < loaded_arg_ct; loaded_arg_idx++) {
-    if (tbuf[loaded_arg_idx]) {
+    if (g_textbuf[loaded_arg_idx]) {
       slen = strlen_se(sptr);
       subst_argv2[new_arg_idx++] = sptr;
       sptr[slen] = '\0';
@@ -2891,9 +2887,10 @@ uint32_t valid_varid_template_string(char* varid_str, const char* flag_name) {
   return 1;
 }
 
-// these need global scope to stay around on all systems
-const char species_singular_constants[][7] = {"person", "cow", "dog", "horse", "mouse", "plant", "sheep", "sample"};
-const char species_plural_constants[][8] = {"people", "cattle", "dogs", "horses", "mice", "plants", "sheep", "samples"};
+// Kept at file scope rather than inside init_delim_and_species(), so the
+// strings remain valid after that function returns.
+static const char species_singular_constants[][7] = {"person", "cow", "dog", "horse", "mouse", "plant", "sheep", "sample"};
+static const char species_plural_constants[][8] = {"people", "cattle", "dogs", "horses", "mice", "plants", "sheep", "samples"};
 
 int32_t init_delim_and_species(uint32_t flag_ct, char* flag_buf, uint32_t* flag_map, int32_t argc, char** argv, char* range_delim_ptr, Chrom_info* chrom_info_ptr) {
   // human: 22, X, Y, XY, MT
@@ -2926,8 +2923,8 @@ int32_t init_delim_and_species(uint32_t flag_ct, char* flag_buf, uint32_t* flag_
     if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
       goto init_delim_and_species_ret_INVALID_CMDLINE_2A;
     }
-    if (scan_posint_capped(argv[cur_arg + 1], (uint32_t*)(&ii), MAX_CHROM_TEXTNUM / 10, MAX_CHROM_TEXTNUM % 10)) {
-      sprintf(logbuf, "Error: Invalid --autosome-num parameter '%s'.\n", argv[cur_arg + 1]);
+    if (scan_posint_capped(argv[cur_arg + 1], MAX_CHROM_TEXTNUM / 10, MAX_CHROM_TEXTNUM % 10, (uint32_t*)(&ii))) {
+      sprintf(g_logbuf, "Error: Invalid --autosome-num parameter '%s'.\n", argv[cur_arg + 1]);
       goto init_delim_and_species_ret_INVALID_CMDLINE_WWA;
     }
     chrom_info_ptr->x_code = ii + 1;
@@ -2936,7 +2933,7 @@ int32_t init_delim_and_species(uint32_t flag_ct, char* flag_buf, uint32_t* flag_
     chrom_info_ptr->mt_code = -1;
     chrom_info_ptr->max_code = ii + 1;
     chrom_info_ptr->autosome_ct = ii;
-    set_bit(chrom_info_ptr->haploid_mask, ii + 1);
+    set_bit(ii + 1, chrom_info_ptr->haploid_mask);
   }
   if (flag_match("chr-set", &flag_idx, flag_ct, flag_buf)) {
     if (species_flag(&species_code, SPECIES_UNKNOWN)) {
@@ -2947,8 +2944,8 @@ int32_t init_delim_and_species(uint32_t flag_ct, char* flag_buf, uint32_t* flag_
     if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 5)) {
       goto init_delim_and_species_ret_INVALID_CMDLINE_2A;
     }
-    if (scan_int_abs_bounded(argv[cur_arg + 1], &ii, MAX_CHROM_TEXTNUM / 10, MAX_CHROM_TEXTNUM % 10) || (!ii)) {
-      sprintf(logbuf, "Error: Invalid --chr-set parameter '%s'.\n", argv[cur_arg + 1]);
+    if (scan_int_abs_bounded(argv[cur_arg + 1], MAX_CHROM_TEXTNUM / 10, MAX_CHROM_TEXTNUM % 10, &ii) || (!ii)) {
+      sprintf(g_logbuf, "Error: Invalid --chr-set parameter '%s'.\n", argv[cur_arg + 1]);
       goto init_delim_and_species_ret_INVALID_CMDLINE_WWA;
     }
     if (ii < 0) {
@@ -2963,28 +2960,28 @@ int32_t init_delim_and_species(uint32_t flag_ct, char* flag_buf, uint32_t* flag_
       chrom_info_ptr->xy_code = -1;
       chrom_info_ptr->mt_code = -1;
       chrom_info_ptr->max_code = ii;
-      fill_all_bits(chrom_info_ptr->haploid_mask, ((uint32_t)ii) + 1);
+      fill_all_bits(((uint32_t)ii) + 1, chrom_info_ptr->haploid_mask);
     } else {
       chrom_info_ptr->autosome_ct = ii;
       chrom_info_ptr->x_code = ii + 1;
       chrom_info_ptr->y_code = ii + 2;
       chrom_info_ptr->xy_code = ii + 3;
       chrom_info_ptr->mt_code = ii + 4;
-      set_bit(chrom_info_ptr->haploid_mask, ii + 1);
-      set_bit(chrom_info_ptr->haploid_mask, ii + 2);
+      set_bit(ii + 1, chrom_info_ptr->haploid_mask);
+      set_bit(ii + 2, chrom_info_ptr->haploid_mask);
       for (param_idx = 2; param_idx <= param_ct; param_idx++) {
 	if (!strcmp(argv[cur_arg + param_idx], "no-x")) {
 	  chrom_info_ptr->x_code = -1;
-	  clear_bit(chrom_info_ptr->haploid_mask, ii + 1);
+	  clear_bit(ii + 1, chrom_info_ptr->haploid_mask);
 	} else if (!strcmp(argv[cur_arg + param_idx], "no-y")) {
 	  chrom_info_ptr->y_code = -1;
-	  clear_bit(chrom_info_ptr->haploid_mask, ii + 2);
+	  clear_bit(ii + 2, chrom_info_ptr->haploid_mask);
 	} else if (!strcmp(argv[cur_arg + param_idx], "no-xy")) {
 	  chrom_info_ptr->xy_code = -1;
 	} else if (!strcmp(argv[cur_arg + param_idx], "no-mt")) {
 	  chrom_info_ptr->mt_code = -1;
 	} else {
-	  sprintf(logbuf, "Error: Invalid --chr-set parameter '%s'.\n", argv[cur_arg + param_idx]);
+	  sprintf(g_logbuf, "Error: Invalid --chr-set parameter '%s'.\n", argv[cur_arg + param_idx]);
 	  goto init_delim_and_species_ret_INVALID_CMDLINE_WWA;
 	}
       }
@@ -3122,7 +3119,7 @@ int32_t init_delim_and_species(uint32_t flag_ct, char* flag_buf, uint32_t* flag_
   }
   while (0) {
   init_delim_and_species_ret_INVALID_CMDLINE_WWA:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
   init_delim_and_species_ret_INVALID_CMDLINE_2A:
     logerrprintb();
   init_delim_and_species_ret_INVALID_CMDLINE_A:
@@ -3138,21 +3135,21 @@ int32_t init_delim_and_species(uint32_t flag_ct, char* flag_buf, uint32_t* flag_
 
 void fill_chrom_mask(Chrom_info* chrom_info_ptr) {
   if (chrom_info_ptr->species != SPECIES_UNKNOWN) {
-    fill_all_bits(chrom_info_ptr->chrom_mask, chrom_info_ptr->max_code + 1);
+    fill_all_bits(chrom_info_ptr->max_code + 1, chrom_info_ptr->chrom_mask);
   } else {
-    fill_all_bits(chrom_info_ptr->chrom_mask, chrom_info_ptr->autosome_ct + 1);
+    fill_all_bits(chrom_info_ptr->autosome_ct + 1, chrom_info_ptr->chrom_mask);
     // --chr-set support
     if (chrom_info_ptr->x_code != -1) {
-      set_bit(chrom_info_ptr->chrom_mask, chrom_info_ptr->x_code);
+      set_bit(chrom_info_ptr->x_code, chrom_info_ptr->chrom_mask);
     }
     if (chrom_info_ptr->y_code != -1) {
-      set_bit(chrom_info_ptr->chrom_mask, chrom_info_ptr->y_code);
+      set_bit(chrom_info_ptr->y_code, chrom_info_ptr->chrom_mask);
     }
     if (chrom_info_ptr->xy_code != -1) {
-      set_bit(chrom_info_ptr->chrom_mask, chrom_info_ptr->xy_code);
+      set_bit(chrom_info_ptr->xy_code, chrom_info_ptr->chrom_mask);
     }
     if (chrom_info_ptr->mt_code != -1) {
-      set_bit(chrom_info_ptr->chrom_mask, chrom_info_ptr->mt_code);
+      set_bit(chrom_info_ptr->mt_code, chrom_info_ptr->chrom_mask);
     }
   }
 }
@@ -3239,7 +3236,7 @@ int32_t main(int32_t argc, char** argv) {
   double vcf_min_gq = -1;
   double vcf_min_gp = -1;
   double qual_min_thresh = 0.0;
-  double qual_max_thresh = HUGE_DOUBLE;
+  double qual_max_thresh = DBL_MAX;
   char id_delim = '\0';
   char vcf_idspace_to = '\0';
   unsigned char vcf_half_call = 0;
@@ -3353,8 +3350,8 @@ int32_t main(int32_t argc, char** argv) {
   uint32_t cnv_enrichment_test_mperms = 0;
   uint32_t cnv_min_seglen = 0;
   uint32_t cnv_max_seglen = 0xffffffffU;
-  double cnv_min_score = -HUGE_DOUBLE;
-  double cnv_max_score = HUGE_DOUBLE;
+  double cnv_min_score = -DBL_MAX;
+  double cnv_max_score = DBL_MAX;
   uint32_t cnv_min_sites = 0;
   uint32_t cnv_max_sites = 0xffffffffU;
   uint32_t cnv_intersect_filter_type = 0;
@@ -3385,7 +3382,7 @@ int32_t main(int32_t argc, char** argv) {
   char* missing_code = NULL;
   char range_delim = '-';
   uint32_t modifier_23 = 0;
-  double pheno_23 = HUGE_DOUBLE;
+  double pheno_23 = DBL_MAX;
   char* fid_23 = NULL;
   char* iid_23 = NULL;
   char* paternal_id_23 = NULL;
@@ -3407,8 +3404,9 @@ int32_t main(int32_t argc, char** argv) {
   int32_t mib[2];
   size_t sztmp;
 #endif
-  unsigned char* wkspace_ua = NULL;
+  unsigned char* bigstack_ua = NULL; // ua = unaligned
   char* bubble = NULL;
+  unsigned char* bigstack_initial_base;
   uint32_t param_ct;
   time_t rawtime;
   char* argptr;
@@ -3482,7 +3480,7 @@ int32_t main(int32_t argc, char** argv) {
       ujj = param_count(argc, argv, uii);
       if (enforce_param_ct_range(ujj, argv[uii], 1, 1)) {
 	print_ver();
-	fputs(logbuf, stdout);
+	fputs(g_logbuf, stdout);
 	fputs(errstr_append, stdout);
 	goto main_ret_INVALID_CMDLINE;
       }
@@ -3496,10 +3494,10 @@ int32_t main(int32_t argc, char** argv) {
 	}
       }
       // logging not yet active, so don't use fopen_checked()
-      scriptfile = fopen(argv[uii + 1], "rb");
+      scriptfile = fopen(argv[uii + 1], FOPEN_RB);
       if (!scriptfile) {
 	print_ver();
-	printf(errstr_fopen, argv[uii + 1]);
+	printf(g_errstr_fopen, argv[uii + 1]);
 	goto main_ret_OPEN_FAIL;
       }
       if (fseeko(scriptfile, 0, SEEK_END)) {
@@ -3571,7 +3569,7 @@ int32_t main(int32_t argc, char** argv) {
       ujj = param_count(argc, argv, uii);
       if (enforce_param_ct_range(ujj, argv[uii], 0, 1)) {
 	print_ver();
-	fputs(logbuf, stdout);
+	fputs(g_logbuf, stdout);
 	fputs(errstr_append, stdout);
 	goto main_ret_INVALID_CMDLINE;
       }
@@ -3618,7 +3616,7 @@ int32_t main(int32_t argc, char** argv) {
 	if ((cur_arg != 1) || (uii != 1) || subst_argv) {
 	  printf("-%s present, ignoring other flags.\n", argptr);
 	}
-	fputs(cmdline_format_str, stdout);
+	fputs(g_cmdline_format_str, stdout);
 	fputs(notestr_null_calc2, stdout);
         retval = RET_HELP;
 	goto main_ret_1;
@@ -3631,7 +3629,7 @@ int32_t main(int32_t argc, char** argv) {
       if (strlen(argptr) >= MAX_FLAG_LEN) {
 	print_ver();
 	invalid_arg(argv[uii]);
-	fputs(logbuf, stdout);
+	fputs(g_logbuf, stdout);
 	fputs(errstr_append, stdout);
         goto main_ret_INVALID_CMDLINE;
       }
@@ -3640,7 +3638,7 @@ int32_t main(int32_t argc, char** argv) {
   }
   if (!flag_ct) {
     print_ver();
-    fputs(cmdline_format_str, stdout);
+    fputs(g_cmdline_format_str, stdout);
     fputs(notestr_null_calc2, stdout);
     retval = RET_NULL_CALC;
     goto main_ret_1;
@@ -3951,7 +3949,7 @@ int32_t main(int32_t argc, char** argv) {
       ujj = flag_map[cur_flag];
       ukk = param_count(argc, argv, ujj);
       if (enforce_param_ct_range(ukk, argv[ujj], 1, 1)) {
-	fputs(logbuf, stdout);
+	fputs(g_logbuf, stdout);
 	fputs(errstr_append, stdout);
 	goto main_ret_INVALID_CMDLINE;
       }
@@ -3969,8 +3967,8 @@ int32_t main(int32_t argc, char** argv) {
     }
   }
   memcpy(&(outname[uii]), ".log", 5);
-  logfile = fopen(outname, "w");
-  if (!logfile) {
+  g_logfile = fopen(outname, "w");
+  if (!g_logfile) {
     fflush(stdout);
     fprintf(stderr, "Error: Failed to open %s.  Try ", outname);
     if (!memcmp(outname, PROG_NAME_STR, 6)) {
@@ -3985,8 +3983,8 @@ int32_t main(int32_t argc, char** argv) {
 
   logstr(ver_str);
   /*
-  sprintf(logbuf, "\n%d argument%s:", argc + umm - cur_arg, (argc + umm - cur_arg == 1)? "" : "s");
-  logstr(logbuf);
+  sprintf(g_logbuf, "\n%d argument%s:", argc + umm - cur_arg, (argc + umm - cur_arg == 1)? "" : "s");
+  logstr(g_logbuf);
   for (cur_flag = 0; cur_flag < flag_ct; cur_flag++) {
     logstr(" --");
     logstr(&(flag_buf[cur_flag * MAX_FLAG_LEN]));
@@ -4012,18 +4010,18 @@ int32_t main(int32_t argc, char** argv) {
   logprint("\n");
 
 #ifdef _WIN32
-  windows_dw = TBUF_SIZE;
-  if (GetComputerName(tbuf, &windows_dw))
+  windows_dw = TEXTBUF_SIZE;
+  if (GetComputerName(g_textbuf, &windows_dw))
 #else
-  if (gethostname(tbuf, TBUF_SIZE) != -1)
+  if (gethostname(g_textbuf, TEXTBUF_SIZE) != -1)
 #endif
   {
     logstr("Hostname: ");
-    logstr(tbuf);
+    logstr(g_textbuf);
   }
   logstr("\nWorking directory: ");
-  getcwd(tbuf, FNAMESIZE);
-  logstr(tbuf);
+  getcwd(g_textbuf, FNAMESIZE);
+  logstr(g_textbuf);
   logstr("\nStart time: ");
   time(&rawtime);
   logstr(ctime(&rawtime));
@@ -4125,7 +4123,7 @@ int32_t main(int32_t argc, char** argv) {
 	      }
 	      if (param_ct > 4) {
 		if (scan_double(argv[cur_arg + 5], &pheno_23)) {
-		  sprintf(logbuf, "Error: Invalid --23file phenotype '%s'.\n", argv[cur_arg + 5]);
+		  sprintf(g_logbuf, "Error: Invalid --23file phenotype '%s'.\n", argv[cur_arg + 5]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 		if (param_ct > 5) {
@@ -4152,7 +4150,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	load_rare = LOAD_RARE_23;
       } else if ((!memcmp(argptr2, "3file-convert-xy", 17)) || (!memcmp(argptr2, "3file-make-xylist", 18))) {
-        sprintf(logbuf, "Error: --%s has been retired due to brain-damaged design.  Use\n--split-x instead.\n", argptr);
+        sprintf(g_logbuf, "Error: --%s has been retired due to brain-damaged design.  Use\n--split-x instead.\n", argptr);
         goto main_ret_INVALID_CMDLINE_2A;
       } else {
 	goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
@@ -4175,7 +4173,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &cluster.min_ct)) {
-          sprintf(logbuf, "Error: Invalid --K cluster count '%s'.\n", argv[cur_arg + 1]);
+          sprintf(g_logbuf, "Error: Invalid --K cluster count '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else {
@@ -4195,7 +4193,7 @@ int32_t main(int32_t argc, char** argv) {
 	  if (!strcmp(argv[cur_arg + 1], "debug")) {
 	    uii = 2;
 	  } else if (strcmp(argv[cur_arg + 2], "debug")) {
-	    sprintf(logbuf, "Error: Invalid --R modifier '%s'.\n", argv[cur_arg + 2]);
+	    sprintf(g_logbuf, "Error: Invalid --R modifier '%s'.\n", argv[cur_arg + 2]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
           misc_flags |= MISC_RPLUGIN_DEBUG;
@@ -4222,8 +4220,8 @@ int32_t main(int32_t argc, char** argv) {
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
-	if (scan_posint_capped(argv[cur_arg + 1], &rplugin_port, 65535 / 10, 65535 % 10)) {
-	  sprintf(logbuf, "Error: Invalid --R-port parameter '%s'.\n", argv[cur_arg + 1]);
+	if (scan_posint_capped(argv[cur_arg + 1], 65535 / 10, 65535 % 10, &rplugin_port)) {
+	  sprintf(g_logbuf, "Error: Invalid --R-port parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "-debug", 7)) {
@@ -4250,7 +4248,7 @@ int32_t main(int32_t argc, char** argv) {
 
     case 'a':
       if (!memcmp(argptr2, "utosome", 8)) {
-	fill_bits(chrom_info.chrom_mask, 1, chrom_info.autosome_ct);
+	fill_bits(1, chrom_info.autosome_ct, chrom_info.chrom_mask);
 	chrom_info.is_include_stack = 1;
 	chrom_flag_present = 1;
 	goto main_param_zero;
@@ -4263,8 +4261,8 @@ int32_t main(int32_t argc, char** argv) {
 	  logerrprint("Error: --autosome-xy used with a species lacking an XY region.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
-	fill_bits(chrom_info.chrom_mask, 1, chrom_info.autosome_ct);
-	set_bit(chrom_info.chrom_mask, chrom_info.xy_code);
+	fill_bits(1, chrom_info.autosome_ct, chrom_info.chrom_mask);
+	set_bit(chrom_info.xy_code, chrom_info.chrom_mask);
 	chrom_info.is_include_stack = 1;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "llow-extra-chr", 15)) {
@@ -4277,7 +4275,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
         if (param_ct) {
 	  if (memcmp("0", argv[cur_arg + 1], 2)) {
-            sprintf(logbuf, "Error: Invalid --allow-extra-chr parameter '%s'.\n", argv[cur_arg + 1]);
+            sprintf(g_logbuf, "Error: Invalid --allow-extra-chr parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  chrom_info.zero_extra_chroms = 1;
@@ -4307,7 +4305,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct == 1) {
 	  if (strcmp("multichar", argv[cur_arg + 1])) {
-	    sprintf(logbuf, "Error: Invalid --allele1234 parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --allele1234 parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  allelexxxx = ALLELE_RECODE_MULTICHAR;
@@ -4325,7 +4323,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct == 1) {
 	  if (strcmp("multichar", argv[cur_arg + 1])) {
-	    sprintf(logbuf, "Error: Invalid --alleleACGT parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --alleleACGT parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  allelexxxx = ALLELE_RECODE_ACGT | ALLELE_RECODE_MULTICHAR;
@@ -4385,7 +4383,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &model_mperm_val)) {
-	      sprintf(logbuf, "Error: Invalid --assoc mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --assoc mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    model_modifier |= MODEL_MPERM;
@@ -4407,7 +4405,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "set-test")) {
 	    model_modifier |= MODEL_SET_TEST;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --assoc parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --assoc parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -4426,7 +4424,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "qq-plot")) {
 	    mtest_adjust |= ADJUST_QQ;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --adjust parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --adjust parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -4435,13 +4433,13 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &aperm.min)) {
-	  sprintf(logbuf, "Error: Invalid --aperm min permutation count '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --aperm min permutation count '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	aperm.min++;
 	if (param_ct > 1) {
-	  if (scan_posint_capped(argv[cur_arg + 2], &aperm.max, APERM_MAX / 10, APERM_MAX % 10)) {
-	    sprintf(logbuf, "Error: Invalid --aperm max permutation count '%s'.\n", argv[cur_arg + 2]);
+	  if (scan_posint_capped(argv[cur_arg + 2], APERM_MAX / 10, APERM_MAX % 10, &aperm.max)) {
+	    sprintf(g_logbuf, "Error: Invalid --aperm max permutation count '%s'.\n", argv[cur_arg + 2]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -4453,30 +4451,30 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct > 2) {
 	  if (scan_double(argv[cur_arg + 3], &aperm.alpha)) {
-	    sprintf(logbuf, "Error: Invalid --aperm alpha threshold '%s'.\n", argv[cur_arg + 3]);
+	    sprintf(g_logbuf, "Error: Invalid --aperm alpha threshold '%s'.\n", argv[cur_arg + 3]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (param_ct > 3) {
 	    if (scan_double(argv[cur_arg + 4], &aperm.beta) || (aperm.beta <= 0)) {
-	      sprintf(logbuf, "Error: Invalid --aperm beta '%s'.\n", argv[cur_arg + 4]);
+	      sprintf(g_logbuf, "Error: Invalid --aperm beta '%s'.\n", argv[cur_arg + 4]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    if (param_ct > 4) {
 	      if (scan_double(argv[cur_arg + 5], &aperm.init_interval)) {
-		sprintf(logbuf, "Error: Invalid --aperm initial pruning interval '%s'.\n", argv[cur_arg + 5]);
+		sprintf(g_logbuf, "Error: Invalid --aperm initial pruning interval '%s'.\n", argv[cur_arg + 5]);
 		goto main_ret_INVALID_CMDLINE_WWA;
 	      }
 	      if ((aperm.init_interval < 1) || (aperm.init_interval > 1000000)) {
-		sprintf(logbuf, "Error: Invalid --aperm initial pruning interval '%s'.\n", argv[cur_arg + 5]);
+		sprintf(g_logbuf, "Error: Invalid --aperm initial pruning interval '%s'.\n", argv[cur_arg + 5]);
 		goto main_ret_INVALID_CMDLINE_WWA;
 	      }
 	      if (param_ct == 6) {
 		if (scan_double(argv[cur_arg + 6], &aperm.interval_slope)) {
-		  sprintf(logbuf, "Error: Invalid --aperm pruning interval slope '%s'.\n", argv[cur_arg + 6]);
+		  sprintf(g_logbuf, "Error: Invalid --aperm pruning interval slope '%s'.\n", argv[cur_arg + 6]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 		if ((aperm.interval_slope < 0) || (aperm.interval_slope > 1)) {
-		  sprintf(logbuf, "Error: Invalid --aperm pruning interval slope '%s'.\n", argv[cur_arg + 6]);
+		  sprintf(g_logbuf, "Error: Invalid --aperm pruning interval slope '%s'.\n", argv[cur_arg + 6]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 	      }
@@ -4560,7 +4558,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if ((ujj == 8) && (!memcmp(argv[cur_arg + uii], "distance", 8))) {
 	    annot_info.modifier |= ANNOT_DISTANCE;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --annotate parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --annotate parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -4578,7 +4576,7 @@ int32_t main(int32_t argc, char** argv) {
 	    logerrprint("Error: --annotate 'subset' modifier must be used with 'ranges'.\n");
 	    goto main_ret_INVALID_CMDLINE_A;
 	  } else if (annot_info.modifier & (ANNOT_MINIMAL | ANNOT_DISTANCE)) {
-	    sprintf(logbuf, "Error: --annotate '%s' modifier must be used with 'ranges'.\n", (annot_info.modifier & ANNOT_MINIMAL)? "minimal" : "distance");
+	    sprintf(g_logbuf, "Error: --annotate '%s' modifier must be used with 'ranges'.\n", (annot_info.modifier & ANNOT_MINIMAL)? "minimal" : "distance");
             goto main_ret_INVALID_CMDLINE_2A;
 	  }
 	}
@@ -4731,7 +4729,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &min_bp_space)) {
-	  sprintf(logbuf, "Error: Invalid --bp-space minimum bp distance '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --bp-space minimum bp distance '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	filter_flags |= FILTER_BIM_REQ | FILTER_DOSAGEMAP | FILTER_NOCNV;
@@ -4774,7 +4772,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(cluster.cmh_mperm_val))) {
-	      sprintf(logbuf, "Error: Invalid --bd mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --bd mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             cluster.modifier |= CLUSTER_CMH_MPERM;
@@ -4786,7 +4784,7 @@ int32_t main(int32_t argc, char** argv) {
             logerrprint("Error: Improper --bd mperm syntax.  (Use '--bd mperm=[value]'.)\n");
             goto main_ret_INVALID_CMDLINE_A;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --bd parameter '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --bd parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -4802,7 +4800,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "list")) {
 	    misc_flags |= MISC_BIALLELIC_ONLY_LIST;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --biallelic-only modifier '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --biallelic-only modifier '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -4836,7 +4834,7 @@ int32_t main(int32_t argc, char** argv) {
 	strcpy(pedname, argv[cur_arg + 1]);
 	if (param_ct == 2) {
 	  if (strcmp(argv[cur_arg + 2], "snpid-chr")) {
-	    sprintf(logbuf, "Error: Invalid --bgen modifier '%s'.\n", argv[cur_arg + 2]);
+	    sprintf(g_logbuf, "Error: Invalid --bgen modifier '%s'.\n", argv[cur_arg + 2]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
           misc_flags |= MISC_OXFORD_SNPID_CHR;
@@ -4851,7 +4849,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "no-small-max-span")) {
             ld_info.modifier |= LD_BLOCKS_NO_SMALL_MAX_SPAN;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --blocks parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --blocks parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -4865,7 +4863,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0) || (dxx > 1.0 - SMALL_EPSILON)) {
-	  sprintf(logbuf, "Error: Invalid --blocks-inform-frac parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --blocks-inform-frac parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	ld_info.blocks_inform_frac = dxx;
@@ -4878,7 +4876,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --blocks-max-kb parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --blocks-max-kb parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (dxx > 2147483.646) {
@@ -4895,7 +4893,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0) || (dxx > 0.5)) {
-	  sprintf(logbuf, "Error: Invalid --blocks-min-maf parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --blocks-min-maf parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         ld_info.blocks_min_maf = dxx;
@@ -4908,7 +4906,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0) || (dxx > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --blocks-recomb-highci parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --blocks-recomb-highci parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         ld_info.blocks_recomb_highci = ((int32_t)((dxx + SMALL_EPSILON) * 100));
@@ -4926,7 +4924,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < SMALL_EPSILON) || (dxx > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --blocks-strong-highci parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --blocks-strong-highci parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         ld_info.blocks_strong_highci = (int32_t)((dxx - SMALL_EPSILON) * 100);
@@ -4945,7 +4943,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < SMALL_EPSILON) || (dxx >= 1)) {
-	  sprintf(logbuf, "Error: Invalid --blocks-strong-lowci parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --blocks-strong-lowci parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	ld_info.blocks_strong_lowci_outer = 2 + (int32_t)((dxx - SMALL_EPSILON) * 100);
@@ -4966,7 +4964,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --border parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --border parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	if (dxx > 2147483.646) {
@@ -5019,7 +5017,7 @@ int32_t main(int32_t argc, char** argv) {
 	  if (!strcmp(argv[cur_arg + 1], "keep-pheno-on-missing-cov")) {
 	    uii = 2;
 	  } else if (strcmp(argv[cur_arg + 2], "keep-pheno-on-missing-cov")) {
-	    sprintf(logbuf, "Error: Invalid --covar parameter '%s'.\n", argv[cur_arg + 2]);
+	    sprintf(g_logbuf, "Error: Invalid --covar parameter '%s'.\n", argv[cur_arg + 2]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (rplugin_fname) {
@@ -5064,7 +5062,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], (uint32_t*)&model_cell_ct)) {
-	  sprintf(logbuf, "Error: Invalid --cell parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cell parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "i", 2)) {
@@ -5072,7 +5070,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx)) {
-	  sprintf(logbuf, "Error: Invalid --ci parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ci parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((dxx < 0.01) || (dxx >= 1.0)) {
@@ -5104,7 +5102,7 @@ int32_t main(int32_t argc, char** argv) {
 	    }
 	    cluster.modifier |= CLUSTER_OLD_TIEBREAKS;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --cluster parameter '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --cluster parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -5181,7 +5179,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posint_defcap(argv[cur_arg + 1], &cnv_enrichment_test_mperms)) {
-	    sprintf(logbuf, "Error: Invalid --cnv-enrichment-test permutation count '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --cnv-enrichment-test permutation count '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -5210,7 +5208,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &cnv_freq_val)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-freq-exclude-above parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-freq-exclude-above parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_freq_type = CNV_FREQ_EXCLUDE_ABOVE;
@@ -5224,7 +5222,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &cnv_freq_val) || (cnv_freq_val == 1)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-freq-exclude-below parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-freq-exclude-below parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_freq_type = CNV_FREQ_EXCLUDE_BELOW;
@@ -5238,7 +5236,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &cnv_freq_val)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-freq-exclude-exact parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-freq-exclude-exact parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_freq_type = CNV_FREQ_EXCLUDE_EXACT;
@@ -5252,7 +5250,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &cnv_freq_val)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-freq-include-exact parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-freq-include-exact parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_freq_type = CNV_FREQ_INCLUDE_EXACT;
@@ -5263,7 +5261,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_double(argv[cur_arg + 1], &cnv_freq_val2) || (cnv_freq_val2 < 0) || (cnv_freq_val2 > 1)) {
-	    sprintf(logbuf, "Error: Invalid --cnv-freq-method2 parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --cnv-freq-method2 parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -5286,7 +5284,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_double(argv[cur_arg + 1], &cnv_freq_val2) || (cnv_freq_val2 < 0) || (cnv_freq_val2 > 1)) {
-	    sprintf(logbuf, "Error: Invalid --cnv-freq-overlap parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --cnv-freq-overlap parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -5301,7 +5299,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posint_defcap(argv[cur_arg + 1], &cnv_sample_mperms)) {
-	    sprintf(logbuf, "Error: Invalid --cnv-indiv-perm permutation count '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --cnv-indiv-perm permutation count '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -5326,7 +5324,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0.001) || (dxx > 2147483.646)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-kb size '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-kb size '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_min_seglen = (int32_t)(dxx * 1000 * (1 + SMALL_EPSILON));
@@ -5355,7 +5353,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (strcmp(argv[cur_arg + 1], "short")) {
-            sprintf(logbuf, "Error: Invalid --cnv-make-map parameter '%s'.\n", argv[cur_arg + 1]);
+            sprintf(g_logbuf, "Error: Invalid --cnv-make-map parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  cnv_calc_type |= CNV_MAKE_MAP;
@@ -5372,7 +5370,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0.001) || (dxx > 2147483.646)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-max-kb size '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-max-kb size '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_max_seglen = (int32_t)(dxx * 1000 * (1 + SMALL_EPSILON));
@@ -5390,7 +5388,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &cnv_max_score)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-max-score value '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-max-score value '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "nv-max-sites", 13)) {
@@ -5403,7 +5401,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &cnv_max_sites)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-max-sites parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-max-sites parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "nv-overlap", 11)) {
@@ -5419,7 +5417,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &cnv_overlap_val) || (cnv_overlap_val < 0) || (cnv_overlap_val > 1))  {
-	  sprintf(logbuf, "Error: Invalid --cnv-overlap value '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-overlap value '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (cnv_overlap_val != 0) {
@@ -5446,7 +5444,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &cnv_overlap_val) || (cnv_overlap_val <= 0) || (cnv_overlap_val > 1))  {
-	  sprintf(logbuf, "Error: Invalid --cnv-region-overlap value '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-region-overlap value '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_overlap_type = CNV_OVERLAP_REGION;
@@ -5460,7 +5458,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &cnv_min_score)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-score value '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-score value '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (cnv_min_score > cnv_max_score) {
@@ -5477,7 +5475,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &cnv_min_sites)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-sites parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-sites parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (cnv_min_sites > cnv_max_sites) {
@@ -5531,7 +5529,7 @@ int32_t main(int32_t argc, char** argv) {
 	  uii = 1;
 	}
 	if (scan_posint_defcap(argv[cur_arg + uii], &cnv_test_mperms)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-test permutation count '%s'.\n", argv[cur_arg + uii]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-test permutation count '%s'.\n", argv[cur_arg + uii]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_calc_type |= CNV_TEST;
@@ -5562,7 +5560,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posint_defcap(argv[cur_arg + 1], &cnv_test_region_mperms)) {
-	    sprintf(logbuf, "Error: Invalid --cnv-test-region permutation count '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --cnv-test-region permutation count '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -5577,7 +5575,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0.001)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-test-window size '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-test-window size '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	dxx *= 1000;
@@ -5599,7 +5597,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &cnv_overlap_val) || (cnv_overlap_val <= 0) || (cnv_overlap_val > 1)) {
-	  sprintf(logbuf, "Error: Invalid --cnv-union-overlap value '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --cnv-union-overlap value '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	cnv_overlap_type = CNV_OVERLAP_UNION;
@@ -5614,7 +5612,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (strcmp(argv[cur_arg + 1], "freq")) {
-            sprintf(logbuf, "Error: Invalid --cnv-write parameter '%s'.\n", argv[cur_arg + 1]);
+            sprintf(g_logbuf, "Error: Invalid --cnv-write parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (!(cnv_freq_val & CNV_FREQ_METHOD2)) {
@@ -5762,12 +5760,12 @@ int32_t main(int32_t argc, char** argv) {
 	    } else {
 	      if (!ujj) {
 		if (scan_double(argv[cur_arg + uii], &check_sex_fthresh) || (check_sex_fthresh <= 0.0)) {
-		  sprintf(logbuf, "Error: Invalid --check-sex female F-statistic estimate ceiling '%s'.\n", argv[cur_arg + uii]);
+		  sprintf(g_logbuf, "Error: Invalid --check-sex female F-statistic estimate ceiling '%s'.\n", argv[cur_arg + uii]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 	      } else if (ujj == 1) {
 		if (scan_double(argv[cur_arg + uii], &check_sex_mthresh) || (check_sex_mthresh >= 1.0)) {
-		  sprintf(logbuf, "Error: Invalid --check-sex male F-statistic estimate floor '%s'.\n", argv[cur_arg + uii]);
+		  sprintf(g_logbuf, "Error: Invalid --check-sex male F-statistic estimate floor '%s'.\n", argv[cur_arg + uii]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 	      } else if (ujj == 2) {
@@ -5905,7 +5903,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0.001)) {
-	  sprintf(logbuf, "Error: Invalid --clump-kb parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --clump-kb parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	dxx *= 1000;
@@ -5923,7 +5921,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0) || (dxx > 1)) {
-	  sprintf(logbuf, "Error: Invalid --clump-p1 parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --clump-p1 parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	clump_info.p1 = dxx;
@@ -5936,7 +5934,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < clump_info.p1) || (dxx > 1)) {
-	  sprintf(logbuf, "Error: Invalid --clump-p2 parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --clump-p2 parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	clump_info.p2 = dxx;
@@ -5949,7 +5947,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx >= 1)) {
-	  sprintf(logbuf, "Error: Invalid --clump-r2 parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --clump-r2 parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	clump_info.r2 = dxx;
@@ -5974,7 +5972,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --clump-range-border parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --clump-range-border parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	if (dxx > 2147483.646) {
@@ -6120,7 +6118,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "flat-missing")) {
 	    dist_calc_type |= DISTANCE_FLAT_MISSING;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --distance parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --distance parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -6133,7 +6131,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &distance_exp)) {
-	  sprintf(logbuf, "Error: Invalid --distance-exp parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --distance-exp parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	fputs("Note: '--distance-exp [x]' deprecated.  Use '--distance-weights exp=[x]' instead.\n", stdout);
@@ -6150,7 +6148,7 @@ int32_t main(int32_t argc, char** argv) {
         }
 	if ((strlen(argv[cur_arg + 1]) > 4) && (!memcmp(argv[cur_arg + 1], "exp=", 4))) {
 	  if (scan_double(&(argv[cur_arg + 1][4]), &distance_exp)) {
-	    sprintf(logbuf, "Error: Invalid --distance-wts exponent '%s'.\n", &(argv[cur_arg + 1][4]));
+	    sprintf(g_logbuf, "Error: Invalid --distance-wts exponent '%s'.\n", &(argv[cur_arg + 1][4]));
 	    goto main_ret_INVALID_CMDLINE_WW;
 	  }
 	} else {
@@ -6160,7 +6158,7 @@ int32_t main(int32_t argc, char** argv) {
 	    if (!strcmp(argv[cur_arg + 1], "noheader")) {
 	      uii = 2;
 	    } else if (strcmp(argv[cur_arg + 2], "noheader")) {
-	      sprintf(logbuf, "Error: Invalid --distance-wts parameter '%s'.\n", argv[cur_arg + 2]);
+	      sprintf(g_logbuf, "Error: Invalid --distance-wts parameter '%s'.\n", argv[cur_arg + 2]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    dist_calc_type |= DISTANCE_WTS_NOHEADER;
@@ -6219,7 +6217,7 @@ int32_t main(int32_t argc, char** argv) {
 	    dummy_flags |= DUMMY_SCALAR_PHENO;
 	  } else {
 	    if ((dummy_flags & DUMMY_MISSING_PHENO) || scan_double(argv[cur_arg + uii], &dxx) || (dxx < 0.0) || (dxx > 1.0)) {
-	      sprintf(logbuf, "Error: Invalid --dummy parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: Invalid --dummy parameter '%s'.\n", argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    } else if (dummy_flags & DUMMY_MISSING_GENO) {
 	      dummy_missing_pheno = dxx;
@@ -6255,7 +6253,7 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	  if (uii <= param_ct) {
 	    if (scan_posint_defcap(argv[cur_arg + uii], &write_covar_dummy_max_categories) || (write_covar_dummy_max_categories < 3)) {
-	      sprintf(logbuf, "Error: Invalid --dummy-coding max categories parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: Invalid --dummy-coding max categories parameter '%s'.\n", argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  }
@@ -6337,23 +6335,23 @@ int32_t main(int32_t argc, char** argv) {
 	    dosage_info.modifier |= DOSAGE_NOHEADER;
 	  } else if (!memcmp(argv[cur_arg + uii], "skip0=", 6)) {
 	    if (scan_uint_defcap(&(argv[cur_arg + uii][6]), &(dosage_info.skip0))) {
-	      sprintf(logbuf, "Error: Invalid --dosage skip0 parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --dosage skip0 parameter '%s'.\n", &(argv[cur_arg + uii][6]));
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  } else if (!memcmp(argv[cur_arg + uii], "skip1=", 6)) {
 	    if (scan_uint_defcap(&(argv[cur_arg + uii][6]), &(dosage_info.skip1))) {
-	      sprintf(logbuf, "Error: Invalid --dosage skip1 parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --dosage skip1 parameter '%s'.\n", &(argv[cur_arg + uii][6]));
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  } else if (!memcmp(argv[cur_arg + uii], "skip2=", 6)) {
 	    if (scan_uint_defcap(&(argv[cur_arg + uii][6]), &(dosage_info.skip2))) {
-	      sprintf(logbuf, "Error: Invalid --dosage skip2 parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --dosage skip2 parameter '%s'.\n", &(argv[cur_arg + uii][6]));
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  } else if (!memcmp(argv[cur_arg + uii], "format=", 7)) {
 	    ujj = ((unsigned char)argv[cur_arg + uii][7]) - '1';
 	    if ((ujj > 2) || argv[cur_arg + uii][8]) {
-	      sprintf(logbuf, "Error: Invalid --dosage format parameter '%s'.\n", &(argv[cur_arg + uii][7]));
+	      sprintf(g_logbuf, "Error: Invalid --dosage format parameter '%s'.\n", &(argv[cur_arg + uii][7]));
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    dosage_info.format = ujj + 1;
@@ -6361,7 +6359,7 @@ int32_t main(int32_t argc, char** argv) {
 	    glm_modifier |= GLM_STANDARD_BETA;
 	  } else {
 	  main_dosage_invalid_param:
-	    sprintf(logbuf, "Error: Invalid --dosage modifier '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --dosage modifier '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -6407,7 +6405,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &family_info.dfam_mperm_val)) {
-	      sprintf(logbuf, "Error: Invalid --dfam mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --dfam mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    family_info.dfam_modifier |= DFAM_MPERM;
@@ -6417,7 +6415,7 @@ int32_t main(int32_t argc, char** argv) {
 	    logerrprint("Error: Improper --dfam mperm syntax.  (Use '--dfam mperm=[value]'.)\n");
 	    goto main_ret_INVALID_CMDLINE;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --dfam parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --dfam parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -6512,7 +6510,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + 1], "set-by-all")) {
 	    epi_info.modifier |= EPI_SET_BY_ALL;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --epistasis modifier '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --epistasis modifier '%s'.\n", argv[cur_arg + 1]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -6527,8 +6525,8 @@ int32_t main(int32_t argc, char** argv) {
 	if (retval) {
 	  goto main_ret_1;
 	}
-	if (scan_posint_capped(argv[cur_arg + 2], &epi_info.summary_merge_ct, PARALLEL_MAX / 10, PARALLEL_MAX % 10) || (epi_info.summary_merge_ct == 1)) {
-	  sprintf(logbuf, "Error: Invalid --epistasis-summary-merge job count '%s'.\n", argv[cur_arg + 2]);
+	if (scan_posint_capped(argv[cur_arg + 2], PARALLEL_MAX / 10, PARALLEL_MAX % 10, &epi_info.summary_merge_ct) || (epi_info.summary_merge_ct == 1)) {
+	  sprintf(g_logbuf, "Error: Invalid --epistasis-summary-merge job count '%s'.\n", argv[cur_arg + 2]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "pi1", 4)) {
@@ -6536,7 +6534,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0)) {
-	  sprintf(logbuf, "Error: Invalid --epi1 parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --epi1 parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	epi_info.epi1 = dxx;
@@ -6545,7 +6543,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0) || (dxx >= 1)) {
-	  sprintf(logbuf, "Error: Invalid --epi2 parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --epi2 parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	epi_info.epi2 = dxx;
@@ -6673,7 +6671,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "gz")) {
 	    misc_flags |= MISC_FREQ_GZ;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --freq parameter '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --freq parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -6694,7 +6692,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (strcmp(argv[cur_arg + 1], "gz")) {
-	    sprintf(logbuf, "Error: Invalid --freqx parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --freqx parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  misc_flags |= MISC_FREQ_GZ;
@@ -6724,7 +6722,7 @@ int32_t main(int32_t argc, char** argv) {
 	cc = argptr2[4];
 	if (cc == 'b') {
 	  if (scan_uint_defcap(argv[cur_arg + 1], (uint32_t*)&marker_pos_start)) {
-	    sprintf(logbuf, "Error: Invalid --from-bp parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --from-bp parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	} else {
@@ -6733,7 +6731,7 @@ int32_t main(int32_t argc, char** argv) {
 	    goto main_ret_INVALID_CMDLINE;
 	  }
 	  if (scan_double(argv[cur_arg + 1], &dxx)) {
-	    sprintf(logbuf, "Error: Invalid --from-kb/-mb parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --from-kb/-mb parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  dxx *= (cc == 'k')? 1000 : 1000000;
@@ -6795,13 +6793,13 @@ int32_t main(int32_t argc, char** argv) {
         for (uii = 1; uii <= param_ct; uii++) {
 	  if (!strcmp(argv[cur_arg + uii], "no-ueki")) {
 	    if (epi_info.modifier & (EPI_FAST_BOOST | EPI_FAST_JOINT_EFFECTS)) {
-	      sprintf(logbuf, "Error: --fast-epistasis 'no-ueki' modifier cannot be used with '%s'.\n", (epi_info.modifier & EPI_FAST_BOOST)? "boost" : "joint-effects");
+	      sprintf(g_logbuf, "Error: --fast-epistasis 'no-ueki' modifier cannot be used with '%s'.\n", (epi_info.modifier & EPI_FAST_BOOST)? "boost" : "joint-effects");
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    epi_info.modifier |= EPI_FAST_NO_UEKI;
 	  } else if (!strcmp(argv[cur_arg + uii], "boost")) {
 	    if (epi_info.modifier & (EPI_FAST_NO_UEKI | EPI_FAST_JOINT_EFFECTS)) {
-	      sprintf(logbuf, "Error: --fast-epistasis 'boost' modifier cannot be used with '%s'.\n", (epi_info.modifier & EPI_FAST_NO_UEKI)? "no-ueki" : "joint-effects");
+	      sprintf(g_logbuf, "Error: --fast-epistasis 'boost' modifier cannot be used with '%s'.\n", (epi_info.modifier & EPI_FAST_NO_UEKI)? "no-ueki" : "joint-effects");
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    if (epi_info.modifier & EPI_FAST_CASE_ONLY) {
@@ -6811,7 +6809,7 @@ int32_t main(int32_t argc, char** argv) {
 	    epi_info.modifier |= EPI_FAST_BOOST;
 	  } else if (!strcmp(argv[cur_arg + uii], "joint-effects")) {
 	    if (epi_info.modifier & (EPI_FAST_NO_UEKI | EPI_FAST_BOOST)) {
-	      sprintf(logbuf, "Error: --fast-epistasis 'joint-effects' modifier cannot be used with '%s'.\n", (epi_info.modifier & EPI_FAST_NO_UEKI)? "no-ueki" : "boost");
+	      sprintf(g_logbuf, "Error: --fast-epistasis 'joint-effects' modifier cannot be used with '%s'.\n", (epi_info.modifier & EPI_FAST_NO_UEKI)? "no-ueki" : "boost");
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    epi_info.modifier |= EPI_FAST_JOINT_EFFECTS;
@@ -6833,7 +6831,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "nop")) {
 	    epi_info.modifier |= EPI_FAST_NO_P_VALUE;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --fast-epistasis modifier '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --fast-epistasis modifier '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -6845,7 +6843,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
         if (param_ct) {
           if (strcmp(argv[cur_arg + 1], "verbose")) {
-	    sprintf(logbuf, "Error: Invalid --flip-scan parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --flip-scan parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
           ld_info.modifier |= LD_FLIPSCAN_VERBOSE;
@@ -6860,7 +6858,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &ld_info.flipscan_window_size) || (ld_info.flipscan_window_size == 1)) {
-	  sprintf(logbuf, "Error: Invalid --flip-scan-window size '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --flip-scan-window size '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "lip-scan-window-kb", 22)) {
@@ -6872,7 +6870,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --flip-scan-window-kb parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --flip-scan-window-kb parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (dxx > 2147483.646) {
@@ -6889,7 +6887,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0.0) || (dxx > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --flip-scan-threshold parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --flip-scan-threshold parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         ld_info.flipscan_thresh = dxx;
@@ -6926,7 +6924,7 @@ int32_t main(int32_t argc, char** argv) {
 	  // allow case/control status to represent just two subpopulations,
 	  // but force user to be explicit about this nonstandard usage
           if (strcmp(argv[cur_arg + 1], "case-control")) {
-	    sprintf(logbuf, "Error: Invalid --fst parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --fst parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  misc_flags |= MISC_FST_CC;
@@ -6944,11 +6942,11 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_double(argv[cur_arg + 1], &geno_thresh)) {
-	    sprintf(logbuf, "Error: Invalid --geno parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --geno parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if ((geno_thresh < 0.0) || (geno_thresh > 1.0)) {
-	    sprintf(logbuf, "Error: Invalid --geno parameter '%s' (must be between 0 and 1).\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --geno parameter '%s' (must be between 0 and 1).\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	} else {
@@ -6989,7 +6987,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "nudge")) {
             genome_modifier |= GENOME_NUDGE;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --genome parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --genome parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -7008,12 +7006,12 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posintptr(argv[cur_arg + 1], &groupdist_iters) || (groupdist_iters < 2) || (groupdist_iters > ((~ZEROLU) - MAX_THREADS))) {
-	    sprintf(logbuf, "Error: Invalid --groupdist jackknife iteration count '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --groupdist jackknife iteration count '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (param_ct == 2) {
 	    if (scan_posint_defcap(argv[cur_arg + 2], &groupdist_d)) {
-	      sprintf(logbuf, "Error: Invalid --groupdist jackknife delete parameter '%s'.\n", argv[cur_arg + 2]);
+	      sprintf(g_logbuf, "Error: Invalid --groupdist jackknife delete parameter '%s'.\n", argv[cur_arg + 2]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  }
@@ -7068,7 +7066,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posint_defcap(argv[cur_arg + 1], &gxe_mcovar)) {
-	    sprintf(logbuf, "Error: Invalid --gxe parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --gxe parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	} else {
@@ -7164,7 +7162,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --gap parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --gap parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (dxx > 2147483.646) {
@@ -7201,7 +7199,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --gene-list-border parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --gene-list-border parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
 	if (dxx > 2147483.646) {
@@ -7267,7 +7265,7 @@ int32_t main(int32_t argc, char** argv) {
 	    }
             ujj = 1;
             if ((hwe_thresh < 0.0) || (hwe_thresh >= 1.0)) {
-	      sprintf(logbuf, "Error: Invalid --hwe threshold '%s' (must be between 0 and 1).\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: Invalid --hwe threshold '%s' (must be between 0 and 1).\n", argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  }
@@ -7295,7 +7293,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "gz")) {
 	    misc_flags |= MISC_HET_GZ;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --het parameter '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --het parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -7313,7 +7311,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "gz")) {
 	    hwe_modifier |= HWE_GZ;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --hardy parameter '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --hardy parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -7342,7 +7340,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "subtract-1-from-lengths")) {
             homozyg.modifier |= HOMOZYG_OLD_LENGTHS;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --homozyg parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --homozyg parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -7352,7 +7350,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &homozyg.min_snp) || (homozyg.min_snp == 1)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-snp parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-snp parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	calculation_type |= CALC_HOMOZYG;
@@ -7361,7 +7359,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < SMALL_EPSILON) || (dxx >= (2147483.646 * (1 + SMALL_EPSILON)))) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-kb parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-kb parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	calculation_type |= CALC_HOMOZYG;
@@ -7372,7 +7370,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0.0) || (dxx >= 2147483.646)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-density parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-density parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         calculation_type |= CALC_HOMOZYG;
@@ -7382,7 +7380,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0.001) || (dxx >= 2147483.646)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-gap parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-gap parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         calculation_type |= CALC_HOMOZYG;
@@ -7392,7 +7390,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &homozyg.max_hets)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-het parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-het parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (homozyg.max_hets && (homozyg.modifier & HOMOZYG_EXTEND)) {
@@ -7405,7 +7403,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &homozyg.window_size) || (homozyg.window_size == 1)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-window-snp parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-window-snp parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         calculation_type |= CALC_HOMOZYG;
@@ -7417,7 +7415,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &homozyg.window_max_hets)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-window-het parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-window-het parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         calculation_type |= CALC_HOMOZYG;
@@ -7426,7 +7424,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &homozyg.window_max_missing)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-window-missing parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-window-missing parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         calculation_type |= CALC_HOMOZYG;
@@ -7435,7 +7433,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0.0) || (dxx > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-window-threshold parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-window-threshold parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         calculation_type |= CALC_HOMOZYG;
@@ -7448,7 +7446,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0.0) || (dxx > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --homozyg-match parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --homozyg-match parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	homozyg.overlap_min = dxx;
@@ -7497,10 +7495,10 @@ int32_t main(int32_t argc, char** argv) {
 	  hard_call_threshold = -1;
 	} else {
 	  if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0.0) || (dxx > 1.0)) {
-	    sprintf(logbuf, "Error: Invalid --hard-call-threshold parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --hard-call-threshold parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  } else if (dxx > (0.5 + SMALLISH_EPSILON)) {
-	    sprintf(logbuf, "Error: The --hard-call-threshold parameter must be smaller than 0.5.  (Did you\nmean '--hard-call-threshold %g'?)\n", 1.0 - dxx);
+	    sprintf(g_logbuf, "Error: The --hard-call-threshold parameter must be smaller than 0.5.  (Did you\nmean '--hard-call-threshold %g'?)\n", 1.0 - dxx);
 	    goto main_ret_INVALID_CMDLINE_2A; 
 	  } else if (dxx > (0.5 - SMALLISH_EPSILON)) {
 	    logerrprint("Error: The --hard-call-threshold parameter must be smaller than 0.5, to prevent\nties.\n");
@@ -7556,12 +7554,12 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	// may want to permit decimal here
 	if (scan_posint_defcap(argv[cur_arg + 1], &ld_info.prune_window_size) || ((ld_info.prune_window_size == 1) && (param_ct == 3))) {
-	  sprintf(logbuf, "Error: Invalid --%s window size '%s'.\n", argptr, argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --%s window size '%s'.\n", argptr, argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (param_ct == 4) {
 	  if (!match_upper(argv[cur_arg + 2], "KB")) {
-	    sprintf(logbuf, "Error: Invalid --%s parameter sequence.\n", argptr);
+	    sprintf(g_logbuf, "Error: Invalid --%s parameter sequence.\n", argptr);
 	    goto main_ret_INVALID_CMDLINE_2A;
 	  }
 	  ld_info.modifier |= LD_PRUNE_KB_WINDOW;
@@ -7572,11 +7570,11 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	}
 	if (scan_posint_defcap(argv[cur_arg + param_ct - 1], &ld_info.prune_window_incr)) {
-	  sprintf(logbuf, "Error: Invalid increment '%s' for --%s.\n", argv[cur_arg + param_ct - 1], argptr);
+	  sprintf(g_logbuf, "Error: Invalid increment '%s' for --%s.\n", argv[cur_arg + param_ct - 1], argptr);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (scan_double(argv[cur_arg + param_ct], &ld_info.prune_last_param) || (ld_info.prune_last_param < 0.0) || (ld_info.prune_last_param >= 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --%s r^2 threshold '%s'.\n", argptr, argv[cur_arg + param_ct]);
+	  sprintf(g_logbuf, "Error: Invalid --%s r^2 threshold '%s'.\n", argptr, argv[cur_arg + param_ct]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	calculation_type |= CALC_LD_PRUNE;
@@ -7590,7 +7588,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &ld_info.prune_window_size) || ((ld_info.prune_window_size == 1) && (param_ct == 3))) {
-	  sprintf(logbuf, "Error: Invalid --indep window size '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --indep window size '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (param_ct == 4) {
@@ -7606,15 +7604,15 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	}
 	if (scan_posint_defcap(argv[cur_arg + param_ct - 1], &ld_info.prune_window_incr)) {
-	  sprintf(logbuf, "Error: Invalid increment '%s' for --indep.\n", argv[cur_arg + param_ct - 1]);
+	  sprintf(g_logbuf, "Error: Invalid increment '%s' for --indep.\n", argv[cur_arg + param_ct - 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (scan_double(argv[cur_arg + param_ct], &ld_info.prune_last_param)) {
-	  sprintf(logbuf, "Error: Invalid --indep VIF threshold '%s'.\n", argv[cur_arg + param_ct]);
+	  sprintf(g_logbuf, "Error: Invalid --indep VIF threshold '%s'.\n", argv[cur_arg + param_ct]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (ld_info.prune_last_param < 1.0) {
-	  sprintf(logbuf, "Error: --indep VIF threshold '%s' too small (must be >= 1).\n", argv[cur_arg + param_ct]);
+	  sprintf(g_logbuf, "Error: --indep VIF threshold '%s' too small (must be >= 1).\n", argv[cur_arg + param_ct]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	calculation_type |= CALC_LD_PRUNE;
@@ -7631,7 +7629,7 @@ int32_t main(int32_t argc, char** argv) {
 	  sample_sort = SAMPLE_SORT_ASCII;
 	} else if ((!strcmp(argv[cur_arg + 1], "file")) || ((tolower(argv[cur_arg + 1][0]) == 'f') && jj)) {
 	  if (param_ct == 1) {
-	    sprintf(logbuf, "Error: Missing '--indiv-sort %s' filename.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Missing '--indiv-sort %s' filename.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_2A;
 	  }
 	  sample_sort = SAMPLE_SORT_FILE;
@@ -7640,11 +7638,11 @@ int32_t main(int32_t argc, char** argv) {
 	    goto main_ret_1;
 	  }
 	} else {
-	  sprintf(logbuf, "Error: '%s' is not a valid mode for --indiv-sort.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: '%s' is not a valid mode for --indiv-sort.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((param_ct == 2) && (sample_sort != SAMPLE_SORT_FILE)) {
-          sprintf(logbuf, "Error: '--indiv-sort %s' does not accept a second parameter.\n", argv[cur_arg + 1]);
+          sprintf(g_logbuf, "Error: '--indiv-sort %s' does not accept a second parameter.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
       } else if (!memcmp(argptr2, "bs-test", 8)) {
@@ -7653,11 +7651,11 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posintptr(argv[cur_arg + 1], &ibs_test_perms)) {
-	    sprintf(logbuf, "Error: Invalid --ibs-test permutation count '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --ibs-test permutation count '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
           if (ibs_test_perms < MAX_THREADS * 2) {
-	    sprintf(logbuf, "Error: --ibs-test permutation count '%s' too small (min %u).\n", argv[cur_arg + 1], MAX_THREADS * 2);
+	    sprintf(g_logbuf, "Error: --ibs-test permutation count '%s' too small (min %u).\n", argv[cur_arg + 1], MAX_THREADS * 2);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -7675,7 +7673,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx)) {
-	  sprintf(logbuf, "Error: Invalid --ibm parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ibm parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((dxx <= 0.0) || (dxx > 1.0)) {
@@ -7739,12 +7737,12 @@ int32_t main(int32_t argc, char** argv) {
 	    } else {
 	      if (!ujj) {
 		if (scan_double(argv[cur_arg + uii], &check_sex_fthresh) || (check_sex_fthresh <= 0.0)) {
-		  sprintf(logbuf, "Error: Invalid --impute-sex female F-statistic estimate ceiling '%s'.\n", argv[cur_arg + uii]);
+		  sprintf(g_logbuf, "Error: Invalid --impute-sex female F-statistic estimate ceiling '%s'.\n", argv[cur_arg + uii]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 	      } else if (ujj == 1) {
 		if (scan_double(argv[cur_arg + uii], &check_sex_mthresh) || (check_sex_mthresh >= 1.0)) {
-		  sprintf(logbuf, "Error: Invalid --impute-sex male F-statistic estimate floor '%s'.\n", argv[cur_arg + uii]);
+		  sprintf(g_logbuf, "Error: Invalid --impute-sex male F-statistic estimate floor '%s'.\n", argv[cur_arg + uii]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 	      } else if (ujj == 2) {
@@ -7826,8 +7824,8 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	// may as well enforce 2^29 / 18 limit...
-	if (scan_uint_capped(argv[cur_arg + 1], &epi_info.je_cellmin, 29826161 / 10, 29826161 % 10)) {
-	  sprintf(logbuf, "Error: Invalid --je-cellmin parameter '%s'.\n", argv[cur_arg + 1]);
+	if (scan_uint_capped(argv[cur_arg + 1], 29826161 / 10, 29826161 % 10, &epi_info.je_cellmin)) {
+	  sprintf(g_logbuf, "Error: Invalid --je-cellmin parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else {
@@ -7856,7 +7854,7 @@ int32_t main(int32_t argc, char** argv) {
 	filter_flags |= FILTER_FAM_REQ;
       } else if (!memcmp(argptr2, "eep-allele-order", 17)) {
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --keep-allele-order has no effect with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --keep-allele-order has no effect with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	misc_flags |= MISC_KEEP_ALLELE_ORDER;
@@ -7866,7 +7864,7 @@ int32_t main(int32_t argc, char** argv) {
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "eep-autoconv", 13)) {
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --keep-autoconv has no effect with %s.\n", (load_rare == LOAD_RARE_CNV)? "--cfile/--cnv-list" : "--dosage");
+	  sprintf(g_logbuf, "Error: --keep-autoconv has no effect with %s.\n", (load_rare == LOAD_RARE_CNV)? "--cfile/--cnv-list" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         misc_flags |= MISC_KEEP_AUTOCONV;
@@ -7975,7 +7973,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &adjust_lambda)) {
-	  sprintf(logbuf, "Error: Invalid --lambda parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --lambda parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (adjust_lambda < 1) {
@@ -8016,20 +8014,20 @@ int32_t main(int32_t argc, char** argv) {
 	  for (uii = 1; uii <= param_ct; uii++) {
 	    if (!strcmp(argv[cur_arg + uii], "perm")) {
 	      if (glm_modifier & GLM_MPERM) {
-		sprintf(logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
+		sprintf(g_logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2A;
 	      }
 	      glm_modifier |= GLM_PERM;
 	    } else if ((strlen(argv[cur_arg + uii]) > 6) && (!memcmp(argv[cur_arg + uii], "mperm=", 6))) {
 	      if (glm_modifier & GLM_PERM) {
-		sprintf(logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
+		sprintf(g_logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2A;
 	      } else if (glm_modifier & GLM_MPERM) {
-		sprintf(logbuf, "Error: Duplicate --%s 'mperm' modifier.\n", argptr);
+		sprintf(g_logbuf, "Error: Duplicate --%s 'mperm' modifier.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2;
 	      }
 	      if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &glm_mperm_val)) {
-		sprintf(logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, &(argv[cur_arg + uii][6]));
+		sprintf(g_logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, &(argv[cur_arg + uii][6]));
 		goto main_ret_INVALID_CMDLINE_WWA;
 	      }
 	      glm_modifier |= GLM_MPERM;
@@ -8039,35 +8037,35 @@ int32_t main(int32_t argc, char** argv) {
 	      glm_modifier |= GLM_PERM_COUNT;
 	    } else if (!strcmp(argv[cur_arg + uii], "genotypic")) {
 	      if (glm_modifier & (GLM_HETHOM | GLM_DOMINANT | GLM_RECESSIVE)) {
-		sprintf(logbuf, "Error: Conflicting --%s parameters.\n", argptr);
+		sprintf(g_logbuf, "Error: Conflicting --%s parameters.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2;
 	      }
 	      glm_modifier |= GLM_GENOTYPIC;
 	      glm_xchr_model = 0;
 	    } else if (!strcmp(argv[cur_arg + uii], "hethom")) {
 	      if (glm_modifier & (GLM_GENOTYPIC | GLM_DOMINANT | GLM_RECESSIVE)) {
-		sprintf(logbuf, "Error: Conflicting --%s parameters.\n", argptr);
+		sprintf(g_logbuf, "Error: Conflicting --%s parameters.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2;
 	      }
 	      glm_modifier |= GLM_HETHOM;
 	      glm_xchr_model = 0;
 	    } else if (!strcmp(argv[cur_arg + uii], "dominant")) {
 	      if (glm_modifier & (GLM_GENOTYPIC | GLM_HETHOM | GLM_RECESSIVE)) {
-		sprintf(logbuf, "Error: Conflicting --%s parameters.\n", argptr);
+		sprintf(g_logbuf, "Error: Conflicting --%s parameters.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2;
 	      }
 	      glm_modifier |= GLM_DOMINANT;
 	      glm_xchr_model = 0;
 	    } else if (!strcmp(argv[cur_arg + uii], "recessive")) {
 	      if (glm_modifier & (GLM_GENOTYPIC | GLM_HETHOM | GLM_DOMINANT)) {
-		sprintf(logbuf, "Error: Conflicting --%s parameters.\n", argptr);
+		sprintf(g_logbuf, "Error: Conflicting --%s parameters.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2;
 	      }
 	      glm_modifier |= GLM_RECESSIVE;
 	      glm_xchr_model = 0;
 	    } else if (!strcmp(argv[cur_arg + uii], "no-snp")) {
 	      if (mtest_adjust) {
-		sprintf(logbuf, "Error: --%s no-snp cannot be used with --adjust.\n", argptr);
+		sprintf(g_logbuf, "Error: --%s no-snp cannot be used with --adjust.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2A;
 	      }
 	      // defer the rest of the check
@@ -8076,13 +8074,13 @@ int32_t main(int32_t argc, char** argv) {
 	      glm_modifier |= GLM_HIDE_COVAR;
 	    } else if (!strcmp(argv[cur_arg + uii], "sex")) {
 	      if (glm_modifier & GLM_NO_X_SEX) {
-		sprintf(logbuf, "Error: --%s 'sex' and 'no-x-sex' cannot be used together.\n", argptr);
+		sprintf(g_logbuf, "Error: --%s 'sex' and 'no-x-sex' cannot be used together.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2A;
 	      }
 	      glm_modifier |= GLM_SEX;
 	    } else if (!strcmp(argv[cur_arg + uii], "no-x-sex")) {
 	      if (glm_modifier & GLM_SEX) {
-		sprintf(logbuf, "Error: --%s 'sex' and 'no-x-sex' cannot be used together.\n", argptr);
+		sprintf(g_logbuf, "Error: --%s 'sex' and 'no-x-sex' cannot be used together.\n", argptr);
 		goto main_ret_INVALID_CMDLINE_2A;
 	      }
 	      glm_modifier |= GLM_NO_X_SEX;
@@ -8101,15 +8099,15 @@ int32_t main(int32_t argc, char** argv) {
 	    } else if (!strcmp(argv[cur_arg + uii], "set-test")) {
 	      glm_modifier |= GLM_SET_TEST;
 	    } else if (!strcmp(argv[cur_arg + uii], "mperm")) {
-	      sprintf(logbuf, "Error: Improper --%s mperm syntax.  (Use '--%s mperm=[value]'.)\n", argptr, argptr);
+	      sprintf(g_logbuf, "Error: Improper --%s mperm syntax.  (Use '--%s mperm=[value]'.)\n", argptr, argptr);
 	      goto main_ret_INVALID_CMDLINE_2;
 	    } else {
-	      sprintf(logbuf, "Error: Invalid --%s parameter '%s'.\n", argptr, argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: Invalid --%s parameter '%s'.\n", argptr, argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  }
 	  if ((glm_modifier & GLM_NO_SNP) && (glm_modifier & GLM_NO_SNP_EXCL)) {
-	    sprintf(logbuf, "Error: --%s 'no-snp' modifier conflicts with another modifier.\n", argptr);
+	    sprintf(g_logbuf, "Error: --%s 'no-snp' modifier conflicts with another modifier.\n", argptr);
 	    goto main_ret_INVALID_CMDLINE_2A;
 	  }
 	  calculation_type |= CALC_GLM;
@@ -8120,7 +8118,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cc = argv[cur_arg + 1][0];
 	if ((cc < '1') || (cc > '3') || (argv[cur_arg + 1][1] != '\0')) {
-	  sprintf(logbuf, "Error: Invalid --ld-xchr parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ld-xchr parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         if (cc == '2') {
@@ -8133,7 +8131,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &lasso_h2) || (lasso_h2 > 1) || (lasso_h2 <= 0)) {
-	  sprintf(logbuf, "Error: Invalid --lasso heritability estimate '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --lasso heritability estimate '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	for (uii = 2; uii <= param_ct; uii++) {
@@ -8143,7 +8141,7 @@ int32_t main(int32_t argc, char** argv) {
             logerrprint("Error: Invalid --lasso parameter sequence.\n");
             goto main_ret_INVALID_CMDLINE_A;
 	  } else if (scan_double(argv[cur_arg + uii], &lasso_minlambda) || (lasso_minlambda <= 0)) {
-	    sprintf(logbuf, "Error: Invalid --lasso minimum lambda '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --lasso minimum lambda '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -8168,7 +8166,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &ld_info.window_size) || (ld_info.window_size == 1)) {
-	  sprintf(logbuf, "Error: Invalid --ld-window window size '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ld-window window size '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "d-window-kb", 12)) {
@@ -8176,7 +8174,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --ld-window-kb parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ld-window-kb parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (dxx > 2147483.646) {
@@ -8189,7 +8187,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0) || (dxx > 1)) {
-	  sprintf(logbuf, "Error: Invalid --ld-window-r2 parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ld-window-r2 parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         ld_info.window_r2 = dxx;
@@ -8229,7 +8227,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct == 3) {
 	  if (strcmp(argv[cur_arg + 3], "hwe-midp")) {
-	    sprintf(logbuf, "Error: Invalid --ld parameter '%s'.\n", argv[cur_arg + 3]);
+	    sprintf(g_logbuf, "Error: Invalid --ld parameter '%s'.\n", argv[cur_arg + 3]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
           epi_info.modifier |= EPI_HWE_MIDP;
@@ -8250,7 +8248,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "suppress-first")) {
 	    dupvar_modifier |= DUPVAR_SUPPRESS_FIRST;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --list-duplicate-vars parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --list-duplicate-vars parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -8294,7 +8292,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
         cc = extract_char_param(argv[cur_arg + 1]);
 	if (((unsigned char)cc <= ' ') || ((cc > '0') && (cc <= '4')) || (cc == 'A') || (cc == 'C') || (cc == 'G') || (cc == 'T')) {
-	  sprintf(logbuf, "Error: Invalid --missing-genotype parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --missing-genotype parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	g_missing_geno_ptr = &(g_one_char_strs[((unsigned char)cc) * 2]);
@@ -8307,7 +8305,7 @@ int32_t main(int32_t argc, char** argv) {
 	// if anyone is using a missing pheno value of -2^31, they should be
 	// flogged with wet noodles
 	if (scan_int32(argv[cur_arg + 1], &missing_pheno) || (!missing_pheno) || (missing_pheno == 1) || (jj > 31) || scan_double(argv[cur_arg + 1], &dxx) || (dxx != (double)missing_pheno)) {
-	  sprintf(logbuf, "Error: Invalid --missing-phenotype parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --missing-phenotype parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	memcpy(output_missing_pheno, argv[cur_arg + 1], jj + 1);
@@ -8340,7 +8338,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &mpheno_col)) {
-	  sprintf(logbuf, "Error: Invalid --mpheno parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mpheno parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "filter", 7)) {
@@ -8352,7 +8350,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &mfilter_col)) {
-	  sprintf(logbuf, "Error: Invalid --mfilter parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mfilter parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "emory", 6)) {
@@ -8361,11 +8359,11 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	// may as well support systems with >2 PB RAM...
 	if (scan_posintptr(argv[cur_arg + 1], (uintptr_t*)&malloc_size_mb)) {
-	  sprintf(logbuf, "Error: Invalid --memory parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --memory parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
-	if (malloc_size_mb < WKSPACE_MIN_MB) {
-	  sprintf(logbuf, "Error: Invalid --memory parameter '%s' (minimum %u).\n", argv[cur_arg + 1], WKSPACE_MIN_MB);
+	if (malloc_size_mb < BIGSTACK_MIN_MB) {
+	  sprintf(g_logbuf, "Error: Invalid --memory parameter '%s' (minimum %u).\n", argv[cur_arg + 1], BIGSTACK_MIN_MB);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 #ifndef __LP64__
@@ -8380,14 +8378,14 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_double(argv[cur_arg + 1], &min_maf)) {
-	    sprintf(logbuf, "Error: Invalid --maf parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --maf parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (min_maf <= 0.0) {
-	    sprintf(logbuf, "Error: --maf parameter '%s' too small (must be > 0).\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: --maf parameter '%s' too small (must be > 0).\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  } else if (min_maf > max_maf) {
-	    sprintf(logbuf, "Error: --maf parameter '%s' too large (must be <= %g).\n", argv[cur_arg + 1], max_maf);
+	    sprintf(g_logbuf, "Error: --maf parameter '%s' too large (must be <= %g).\n", argv[cur_arg + 1], max_maf);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	} else {
@@ -8399,14 +8397,14 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &max_maf)) {
-	  sprintf(logbuf, "Error: Invalid --max-maf parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --max-maf parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (max_maf < min_maf) {
-	  sprintf(logbuf, "Error: --max-maf parameter '%s' too small (must be >= %g).\n", argv[cur_arg + 1], min_maf);
+	  sprintf(g_logbuf, "Error: --max-maf parameter '%s' too small (must be >= %g).\n", argv[cur_arg + 1], min_maf);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	} else if (max_maf >= 0.5) {
-	  sprintf(logbuf, "Error: --max-maf parameter '%s' too large (must be < 0.5).\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: --max-maf parameter '%s' too large (must be < 0.5).\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	filter_flags |= FILTER_ALL_REQ | FILTER_NODOSAGE | FILTER_NOCNV;
@@ -8416,11 +8414,11 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_double(argv[cur_arg + 1], &mind_thresh)) {
-	    sprintf(logbuf, "Error: Invalid --mind parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --mind parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if ((mind_thresh < 0.0) || (mind_thresh > 1.0)) {
-	    sprintf(logbuf, "Error: Invalid --mind parameter '%s' (must be between 0 and 1).\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --mind parameter '%s' (must be between 0 and 1).\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	} else {
@@ -8462,7 +8460,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE_A;
 	    }
 	    if (rel_info.ibc_type) {
-	      sprintf(logbuf, "Error: --make-grm-gz '%s' modifier cannot coexist with another IBC modifier.\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: --make-grm-gz '%s' modifier cannot coexist with another IBC modifier.\n", argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    rel_info.ibc_type = argv[cur_arg + uii][3] - '0';
@@ -8470,7 +8468,7 @@ int32_t main(int32_t argc, char** argv) {
 	    logerrprint("Error: --make-grm-gz 'single-prec' modifier has been retired.\n");
 	    goto main_ret_INVALID_CMDLINE;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --make-grm-gz parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --make-grm-gz parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -8494,7 +8492,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if ((!strcmp(argv[cur_arg + 1], "ibc2")) || (!strcmp(argv[cur_arg + 1], "ibc3"))) {
 	    rel_info.ibc_type = argv[cur_arg + 1][3] - '0';
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --make-grm-bin parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --make-grm-bin parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -8573,7 +8571,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE_A;
 	    }
 	    if (rel_info.ibc_type) {
-	      sprintf(logbuf, "Error: --make-rel '%s' modifier cannot coexist with another IBC modifier.\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: --make-rel '%s' modifier cannot coexist with another IBC modifier.\n", argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    rel_info.ibc_type = argv[cur_arg + uii][3] - '0';
@@ -8581,7 +8579,7 @@ int32_t main(int32_t argc, char** argv) {
 	    logerrprint("Error: --make-rel 'single-prec' modifier has been retired.  Use 'bin4'.\n");
 	    goto main_ret_INVALID_CMDLINE;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --make-rel parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --make-rel parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -8612,13 +8610,13 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --make-bed cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --make-bed cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (param_ct) {
 	  // the missing --out mistake is so common--I must have made it over a
 	  // hundred times by now--that a custom error message is worthwhile.
-	  sprintf(logbuf, "Error: --make-bed doesn't accept parameters.%s\n", ((param_ct == 1) && (!outname_end))? "  (Did you forget '--out'?)" : "");
+	  sprintf(g_logbuf, "Error: --make-bed doesn't accept parameters.%s\n", ((param_ct == 1) && (!outname_end))? "  (Did you forget '--out'?)" : "");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	calculation_type |= CALC_MAKE_BED;
@@ -8628,7 +8626,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --make-just-bim cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --make-just-bim cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	calculation_type |= CALC_MAKE_BIM;
@@ -8639,7 +8637,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --make-just-fam cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --make-just-fam cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	calculation_type |= CALC_MAKE_FAM;
@@ -8649,7 +8647,7 @@ int32_t main(int32_t argc, char** argv) {
 	  logerrprint("Error: --merge cannot be used with --bmerge.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	} else if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --merge cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? ".cnv filesets" : "--dosage");
+	  sprintf(g_logbuf, "Error: --merge cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? ".cnv filesets" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 2)) {
@@ -8682,7 +8680,7 @@ int32_t main(int32_t argc, char** argv) {
 	  logerrprint("Error: --merge-list cannot be used with --merge or --bmerge.\n");
 	  goto main_ret_INVALID_CMDLINE;
 	} else if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --merge-list cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? ".cnv filesets" : "--dosage");
+	  sprintf(g_logbuf, "Error: --merge-list cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? ".cnv filesets" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
@@ -8706,7 +8704,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cc = argv[cur_arg + 1][0];
 	if ((cc < '1') || (cc > '7') || (argv[cur_arg + 1][1] != '\0')) {
-          sprintf(logbuf, "Error: Invalid --merge-mode parameter '%s'.\n", argv[cur_arg + 1]);
+          sprintf(g_logbuf, "Error: Invalid --merge-mode parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((merge_type & MERGE_LIST) && (cc > '5')) {
@@ -8732,7 +8730,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &gxe_mcovar)) {
-	  sprintf(logbuf, "Error: Invalid --mcovar parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mcovar parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         logprint("Note: --mcovar flag deprecated.  Use '--gxe [covariate index]'.\n");
@@ -8812,7 +8810,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &model_mperm_val)) {
-	      sprintf(logbuf, "Error: Invalid --model mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --model mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    model_modifier |= MODEL_MPERM;
@@ -8822,7 +8820,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "set-test")) {
 	    model_modifier |= MODEL_SET_TEST;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --model parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --model parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -8881,23 +8879,23 @@ int32_t main(int32_t argc, char** argv) {
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "perm", 5)) {
 	if (model_modifier & (MODEL_PERM | MODEL_MPERM)) {
-	  sprintf(logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model", (model_modifier & MODEL_PERM)? "" : "m");
+	  sprintf(g_logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model", (model_modifier & MODEL_PERM)? "" : "m");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	} else if (glm_modifier & (GLM_PERM | GLM_MPERM)) {
-	  sprintf(logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_PERM)? "" : "m");
+	  sprintf(g_logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_PERM)? "" : "m");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	} else if (family_info.dfam_modifier & (DFAM_PERM | DFAM_MPERM)) {
-	  sprintf(logbuf, "Error: --mperm cannot be used with --dfam %sperm.\n", (family_info.dfam_modifier & DFAM_PERM)? "" : "m");
+	  sprintf(g_logbuf, "Error: --mperm cannot be used with --dfam %sperm.\n", (family_info.dfam_modifier & DFAM_PERM)? "" : "m");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	} else if (cluster.modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_MPERM)) {
-	  sprintf(logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (cluster.modifier & CLUSTER_CMH_BD)? "bd" : "mh", (cluster.modifier & CLUSTER_CMH_PERM)? "" : "m");
+	  sprintf(g_logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (cluster.modifier & CLUSTER_CMH_BD)? "bd" : "mh", (cluster.modifier & CLUSTER_CMH_PERM)? "" : "m");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &mperm_val)) {
-	  sprintf(logbuf, "Error: Invalid --mperm parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mperm parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (load_rare & LOAD_RARE_CNV) {
@@ -8958,7 +8956,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &cluster.max_size) || (cluster.max_size == 1)) {
-	  sprintf(logbuf, "Error: Invalid --mc parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mc parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "cc", 2)) {
@@ -8970,7 +8968,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &cluster.max_cases)) {
-	  sprintf(logbuf, "Error: Invalid --mcc parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mcc parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (cluster.max_cases > cluster.max_size) {
@@ -8978,7 +8976,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 2], &cluster.max_ctrls)) {
-	  sprintf(logbuf, "Error: Invalid --mcc parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mcc parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (cluster.max_ctrls > cluster.max_size) {
@@ -9042,7 +9040,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE_A;
 	    }
 	    if (scan_posint_defcap(argv[cur_arg + uii], &cluster.mds_dim_ct)) {
-	      sprintf(logbuf, "Error: Invalid --mds-plot parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: Invalid --mds-plot parameter '%s'.\n", argv[cur_arg + uii]);
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  }
@@ -9064,7 +9062,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &mwithin_col)) {
-	  sprintf(logbuf, "Error: Invalid --mwithin parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mwithin parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "in", 3)) {
@@ -9076,7 +9074,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx)) {
-	  sprintf(logbuf, "Error: Invalid --min parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --min parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((dxx < -1.0) || (dxx > 1.0)) {
@@ -9097,7 +9095,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx)) {
-	  sprintf(logbuf, "Error: Invalid --max parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --max parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((dxx < -1.0) || (dxx > 1.0)) {
@@ -9116,7 +9114,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "first")) {
 	    misc_flags |= MISC_MAKE_FOUNDERS_FIRST;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --make-founders parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --make-founders parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -9127,7 +9125,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
         if (param_ct) {
 	  if (strcmp(argv[cur_arg + 1], "gz")) {
-	    sprintf(logbuf, "Error: Invalid --missing parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --missing parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  misc_flags |= MISC_MISSING_GZ;
@@ -9157,7 +9155,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(cluster.cmh_mperm_val))) {
-	      sprintf(logbuf, "Error: Invalid --mh mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --mh mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             cluster.modifier |= CLUSTER_CMH_MPERM;
@@ -9169,7 +9167,7 @@ int32_t main(int32_t argc, char** argv) {
             logerrprint("Error: Improper --mh mperm syntax.  (Use '--mh mperm=[value]'.)\n");
             goto main_ret_INVALID_CMDLINE_A;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --mh parameter '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --mh parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -9184,7 +9182,7 @@ int32_t main(int32_t argc, char** argv) {
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "ake-set", 8)) {
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --make-set cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --make-set cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
@@ -9205,7 +9203,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --make-set-border parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --make-set-border parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (dxx > 2147483.646) {
@@ -9252,7 +9250,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct == 1) {
 	  if (strcmp(argv[cur_arg + 1], "no-fail")) {
-	    sprintf(logbuf, "Error: Invalid --merge-x parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --merge-x parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_A;
 	  }
 	  misc_flags |= MISC_SPLIT_MERGE_NOFAIL;
@@ -9279,17 +9277,17 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + 2], "var-first")) {
 	    ujj = 3;
 	  } else if (strcmp(argv[cur_arg + 3], "var-first")) {
-	    sprintf(logbuf, "Error: Invalid --me parameter '%s'.\n", argv[cur_arg + 3]);
+	    sprintf(g_logbuf, "Error: Invalid --me parameter '%s'.\n", argv[cur_arg + 3]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  family_info.mendel_modifier |= MENDEL_FILTER_VAR_FIRST;
 	}
 	if (scan_double(argv[cur_arg + uii], &family_info.mendel_max_trio_error) || (family_info.mendel_max_trio_error < 0.0) || (family_info.mendel_max_trio_error > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --me maximum per-trio error rate '%s'.\n", argv[cur_arg + uii]);
+	  sprintf(g_logbuf, "Error: Invalid --me maximum per-trio error rate '%s'.\n", argv[cur_arg + uii]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (scan_double(argv[cur_arg + ujj], &family_info.mendel_max_var_error) || (family_info.mendel_max_var_error < 0.0) || (family_info.mendel_max_var_error > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --me maximum per-variant error rate '%s'.\n", argv[cur_arg + ujj]);
+	  sprintf(g_logbuf, "Error: Invalid --me maximum per-variant error rate '%s'.\n", argv[cur_arg + ujj]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((family_info.mendel_max_trio_error < 1.0) || (family_info.mendel_max_var_error < 1.0)) {
@@ -9313,7 +9311,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
         if (param_ct) {
 	  if (scan_double(argv[cur_arg + 1], &family_info.mendel_exclude_one_ratio) || (family_info.mendel_exclude_one_ratio < 1.0)) {
-	    sprintf(logbuf, "Error: Invalid --me-exclude-one ratio '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --me-exclude-one ratio '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	} else {
@@ -9329,7 +9327,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (strcmp(argv[cur_arg + 1], "summaries-only")) {
-	    sprintf(logbuf, "Error: Invalid --mendel parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --mendel parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE;
 	  }
 	  family_info.mendel_modifier |= MENDEL_SUMMARIES_ONLY;
@@ -9346,7 +9344,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &permphe_ct)) {
-	  sprintf(logbuf, "Error: Invalid --make-perm-pheno parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --make-perm-pheno parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	calculation_type |= CALC_MAKE_PERM_PHENO;
@@ -9385,7 +9383,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "weighted-z")) {
 	    metaanal_flags |= METAANAL_WEIGHTED_Z;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --meta-analysis parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --meta-analysis parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -9467,7 +9465,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &min_ac)) {
-	  sprintf(logbuf, "Error: Invalid --mac parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --mac parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "ax-mac", 7)) {
@@ -9476,7 +9474,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &max_ac)) {
-	  sprintf(logbuf, "Error: Invalid --max-mac parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --max-mac parameter '%s'.\n", argv[cur_arg + 1]);
 	}
         if (max_ac < min_ac) {
 	  logerrprint("Error: --max-mac parameter cannot be smaller than --mac parameter.\n");
@@ -9525,11 +9523,11 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &neighbor_n1)) {
-	  sprintf(logbuf, "Error: Invalid --neighbour parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --neighbour parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 2], &neighbor_n2)) {
-	  sprintf(logbuf, "Error: Invalid --neighbour parameter '%s'.\n", argv[cur_arg + 2]);
+	  sprintf(g_logbuf, "Error: Invalid --neighbour parameter '%s'.\n", argv[cur_arg + 2]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (neighbor_n2 < neighbor_n1) {
@@ -9578,8 +9576,8 @@ int32_t main(int32_t argc, char** argv) {
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
-	if (scan_posint_capped(argv[cur_arg + 1], &new_id_max_allele_len, (MAX_ID_LEN - 2) / 10, (MAX_ID_LEN - 2) % 10)) {
-	  sprintf(logbuf, "Error: Invalid --new-id-max-allele-len parameter '%s'.\n", argv[cur_arg + 1]);
+	if (scan_posint_capped(argv[cur_arg + 1], (MAX_ID_LEN - 2) / 10, (MAX_ID_LEN - 2) % 10, &new_id_max_allele_len)) {
+	  sprintf(g_logbuf, "Error: Invalid --new-id-max-allele-len parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "o-snp", 6)) {
@@ -9593,7 +9591,7 @@ int32_t main(int32_t argc, char** argv) {
 	  logerrprint("Error: --no-snp cannot be used with --mperm-save.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	} else if ((glm_modifier & (GLM_NO_SNP_EXCL - GLM_HETHOM - GLM_DOMINANT)) || ((glm_modifier & (GLM_HETHOM | GLM_DOMINANT)) && (!(glm_modifier & (GLM_CONDITION_DOMINANT | GLM_CONDITION_RECESSIVE))))) {
-	  sprintf(logbuf, "Error: --no-snp conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
+	  sprintf(g_logbuf, "Error: --no-snp conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	logprint("Note: --no-snp flag deprecated.  Use e.g. '--linear no-snp'.\n");
@@ -9604,7 +9602,7 @@ int32_t main(int32_t argc, char** argv) {
 	  logerrprint("Error: --no-x-sex must be used with --linear or --logistic.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	} else if (glm_modifier & (GLM_NO_SNP | GLM_SEX)) {
-	  sprintf(logbuf, "Error: --no-x-sex conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
+	  sprintf(g_logbuf, "Error: --no-x-sex conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	logprint("Note: --no-x-sex flag deprecated.  Use e.g. '--linear no-x-sex'.\n");
@@ -9651,7 +9649,7 @@ int32_t main(int32_t argc, char** argv) {
 	} else if (!strcmp(argv[cur_arg + 1], "chrMT")) {
           chrom_info.output_encoding = CHR_OUTPUT_PREFIX | CHR_OUTPUT_MT;
 	} else if (strcmp(argv[cur_arg + 1], "26")) {
-	  sprintf(logbuf, "Error: Invalid --output-chr parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --output-chr parameter '%s'.\n", argv[cur_arg + 1]);
           goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "utput-missing-genotype", 23)) {
@@ -9660,7 +9658,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cc = extract_char_param(argv[cur_arg + 1]);
 	if (((unsigned char)cc) <= ' ') {
-	  sprintf(logbuf, "Error: Invalid --output-missing-genotype parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --output-missing-genotype parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	g_output_missing_geno_ptr = &(g_one_char_strs[((unsigned char)cc) * 2]);
@@ -9724,7 +9722,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (!(misc_flags & MISC_ALLOW_EXTRA_CHROMS)) {
 	  if (get_chrom_code_raw(argv[cur_arg + 1]) < 0) {
-	    sprintf(logbuf, "Error: Invalid --oxford-single-chr chromosome code '%s'. (Did you forget --allow-extra-chr?)\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --oxford-single-chr chromosome code '%s'. (Did you forget --allow-extra-chr?)\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -9747,7 +9745,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &output_min_p) || (!(output_min_p >= 0.0)) || (output_min_p >= 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --output-min-p parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --output-min-p parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (memcmp(argptr2, "ut", 3)) {
@@ -9840,12 +9838,12 @@ int32_t main(int32_t argc, char** argv) {
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 2, 2)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
-	if (scan_posint_capped(argv[cur_arg + 1], &parallel_idx, PARALLEL_MAX / 10, PARALLEL_MAX % 10)) {
-	  sprintf(logbuf, "Error: Invalid --parallel job index '%s'.\n", argv[cur_arg + 1]);
+	if (scan_posint_capped(argv[cur_arg + 1], PARALLEL_MAX / 10, PARALLEL_MAX % 10, &parallel_idx)) {
+	  sprintf(g_logbuf, "Error: Invalid --parallel job index '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
-	if (scan_posint_capped(argv[cur_arg + 2], &parallel_tot, PARALLEL_MAX / 10, PARALLEL_MAX % 10) || (parallel_tot == 1) || (parallel_tot < parallel_idx)) {
-	  sprintf(logbuf, "Error: Invalid --parallel total job count '%s'.\n", argv[cur_arg + 2]);
+	if (scan_posint_capped(argv[cur_arg + 2], PARALLEL_MAX / 10, PARALLEL_MAX % 10, &parallel_tot) || (parallel_tot == 1) || (parallel_tot < parallel_idx)) {
+	  sprintf(g_logbuf, "Error: Invalid --parallel total job count '%s'.\n", argv[cur_arg + 2]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	parallel_idx--; // internal 0..(n-1) indexing
@@ -9854,7 +9852,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx)) {
-	  sprintf(logbuf, "Error: Invalid --ppc-gap parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ppc-gap parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	dxx *= 1000;
@@ -9868,21 +9866,21 @@ int32_t main(int32_t argc, char** argv) {
       } else if (!memcmp(argptr2, "erm", 4)) {
 	if (model_modifier & MODEL_MPERM) {
           if (calculation_type & CALC_MODEL) {
-	    sprintf(logbuf, "Error: --perm cannot be used with --%s mperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model");
+	    sprintf(g_logbuf, "Error: --perm cannot be used with --%s mperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model");
 	    goto main_ret_INVALID_CMDLINE_2A;
 	  } else {
 	    logerrprint("Error: --perm cannot be used with --mperm.\n");
 	    goto main_ret_INVALID_CMDLINE_A;
 	  }
 	} else if ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_MPERM | GLM_NO_SNP))) {
-	  sprintf(logbuf, "Error: --perm cannot be used with --%s %s.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_MPERM)? "mperm" : "no-snp");
+	  sprintf(g_logbuf, "Error: --perm cannot be used with --%s %s.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_MPERM)? "mperm" : "no-snp");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	} else if (family_info.dfam_modifier & DFAM_MPERM) {
 	  logerrprint("Error: --perm cannot be used with --dfam mperm.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	} else if (calculation_type & CALC_CMH) {
           if (cluster.modifier & CLUSTER_CMH_MPERM) {
-	    sprintf(logbuf, "Error: --perm cannot be used with --%s mperm.\n", (cluster.modifier & CLUSTER_CMH_BD)? "bd" : "mh");
+	    sprintf(g_logbuf, "Error: --perm cannot be used with --%s mperm.\n", (cluster.modifier & CLUSTER_CMH_BD)? "bd" : "mh");
 	    goto main_ret_INVALID_CMDLINE_2A;
 	  } else if (cluster.modifier & CLUSTER_CMH_PERM_BD) {
 	    logerrprint("Error: --perm cannot be used with --bd perm-bd.\n");
@@ -9920,7 +9918,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx)) {
-	  sprintf(logbuf, "Error: Invalid --pfilter parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --pfilter parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((dxx <= 0.0) || (dxx > 1.0)) {
@@ -9933,7 +9931,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &perm_batch_size)) {
-	  sprintf(logbuf, "Error: Invalid --perm-batch-size parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --perm-batch-size parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "pc", 3)) {
@@ -9945,7 +9943,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx)) {
-	  sprintf(logbuf, "Error: Invalid --ppc parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --ppc parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((dxx <= 0.0) || (dxx >= 1.0)) {
@@ -9962,7 +9960,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &homozyg.pool_size_min) || (homozyg.pool_size_min == 1)) {
-	  sprintf(logbuf, "Error: Invalid --pool-size parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --pool-size parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "arameters", 10)) {
@@ -10161,8 +10159,8 @@ int32_t main(int32_t argc, char** argv) {
               logerrprint("Error: --q-score-range takes at most two numeric parameters.\n");
               goto main_ret_INVALID_CMDLINE_A;
 	    } else {
-	      if (scan_posint_capped(argv[cur_arg + uii], (uint32_t*)&ii, (MAXLINEBUFLEN / 2) / 10, (MAXLINEBUFLEN / 2) % 10)) {
-                sprintf(logbuf, "Error: Invalid --q-score-range parameter '%s'.\n", argv[cur_arg + uii]);
+	      if (scan_posint_capped(argv[cur_arg + uii], (MAXLINEBUFLEN / 2) / 10, (MAXLINEBUFLEN / 2) % 10, (uint32_t*)&ii)) {
+                sprintf(g_logbuf, "Error: Invalid --q-score-range parameter '%s'.\n", argv[cur_arg + uii]);
                 goto main_ret_INVALID_CMDLINE_WWA;
 	      }
               if (!ujj) {
@@ -10197,7 +10195,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &qual_min_thresh)) {
-	  sprintf(logbuf, "Error: Invalid --qual-threshold parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --qual-threshold parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (qual_min_thresh > qual_max_thresh) {
@@ -10209,7 +10207,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &qual_max_thresh)) {
-	  sprintf(logbuf, "Error: Invalid --qual-max-threshold parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --qual-max-threshold parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if ((!memcmp(argptr2, "fam", 4)) || (!memcmp(argptr2, "fam-parents", 12)) || (!memcmp(argptr2, "fam-between", 12)) || (!memcmp(argptr2, "fam-total", 10))) {
@@ -10225,35 +10223,35 @@ int32_t main(int32_t argc, char** argv) {
 	    family_info.qfam_modifier |= QFAM_EMP_SE;
 	  } else if (!strcmp(argv[cur_arg + uii], "perm")) {
 	    if (family_info.qfam_modifier & QFAM_MPERM) {
-	      sprintf(logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
+	      sprintf(g_logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    family_info.qfam_modifier |= QFAM_PERM;
 	  } else if ((strlen(argv[cur_arg + uii]) > 6) && (!memcmp(argv[cur_arg + uii], "mperm=", 6))) {
 	    if (family_info.qfam_modifier & QFAM_PERM) {
-	      sprintf(logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
+	      sprintf(g_logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    } else if (family_info.qfam_modifier & QFAM_MPERM) {
-	      sprintf(logbuf, "Error: Duplicate --%s 'mperm' modifier.\n", argptr);
+	      sprintf(g_logbuf, "Error: Duplicate --%s 'mperm' modifier.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(family_info.qfam_mperm_val))) {
-	      sprintf(logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             family_info.qfam_modifier |= QFAM_MPERM;
 	  } else if (!strcmp(argv[cur_arg + uii], "perm-count")) {
             family_info.qfam_modifier |= QFAM_PERM_COUNT;
 	  } else if (!strcmp(argv[cur_arg + uii], "mperm")) {
-            sprintf(logbuf, "Error: Improper --%s mperm syntax.  (Use '--%s mperm=[value]'.)\n", argptr, argptr);
+            sprintf(g_logbuf, "Error: Improper --%s mperm syntax.  (Use '--%s mperm=[value]'.)\n", argptr, argptr);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --%s parameter '%s'.\n", argptr, argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --%s parameter '%s'.\n", argptr, argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
 	if (!(family_info.qfam_modifier & (QFAM_PERM | QFAM_MPERM))) {
-	  sprintf(logbuf, "Error: --%s requires permutation.\n", argptr);
+	  sprintf(g_logbuf, "Error: --%s requires permutation.\n", argptr);
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (!memcmp(argptr2, "fam", 4)) {
@@ -10332,7 +10330,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_double(argv[cur_arg + 1], &rel_info.cutoff) || (rel_info.cutoff <= 0.0) || (rel_info.cutoff >= 1.0)) {
-	    sprintf(logbuf, "Error: Invalid --rel-cutoff parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --rel-cutoff parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -10347,12 +10345,12 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posintptr(argv[cur_arg + 1], &regress_iters) || (regress_iters == 1)) {
-	    sprintf(logbuf, "Error: Invalid --regress-distance jackknife iteration count '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --regress-distance jackknife iteration count '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (param_ct == 2) {
 	    if (scan_posint_defcap(argv[cur_arg + 2], &regress_d)) {
-	      sprintf(logbuf, "Error: Invalid --regress-distance jackknife delete parameter '%s'.\n", argv[cur_arg + 2]);
+	      sprintf(g_logbuf, "Error: Invalid --regress-distance jackknife delete parameter '%s'.\n", argv[cur_arg + 2]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  }
@@ -10371,12 +10369,12 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (scan_posintptr(argv[cur_arg + 1], &rel_info.regress_rel_iters) || (rel_info.regress_rel_iters == 1)) {
-	    sprintf(logbuf, "Error: Invalid --regress-rel jackknife iteration count '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --regress-rel jackknife iteration count '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (param_ct == 2) {
 	    if (scan_posint_defcap(argv[cur_arg + 2], &rel_info.regress_rel_d)) {
-	      sprintf(logbuf, "Error: Invalid --regress-rel jackknife delete parameter '%s'.\n", argv[cur_arg + 2]);
+	      sprintf(g_logbuf, "Error: Invalid --regress-rel jackknife delete parameter '%s'.\n", argv[cur_arg + 2]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	  }
@@ -10402,11 +10400,11 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "clip")) {
 	    regress_pcs_modifier |= REGRESS_PCS_CLIP;
 	  } else if ((max_pcs != MAX_PCS_DEFAULT) || (argv[cur_arg + uii][0] < '0') || (argv[cur_arg + uii][0] > '9')) {
-	    sprintf(logbuf, "Error: Invalid --regress-pcs parameter '%s'.%s", argv[cur_arg + uii], errstr_append);
+	    sprintf(g_logbuf, "Error: Invalid --regress-pcs parameter '%s'.%s", argv[cur_arg + uii], errstr_append);
 	    goto main_ret_INVALID_CMDLINE_3;
 	  } else {
             if (scan_posint_defcap(argv[cur_arg + uii], &max_pcs)) {
-	      sprintf(logbuf, "Error: Invalid --regress-pcs maximum principal component count '%s'.%s", argv[cur_arg + uii], errstr_append);
+	      sprintf(g_logbuf, "Error: Invalid --regress-pcs maximum principal component count '%s'.%s", argv[cur_arg + uii], errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    }
 	  }
@@ -10414,10 +10412,10 @@ int32_t main(int32_t argc, char** argv) {
 	calculation_type |= CALC_REGRESS_PCS;
       } else if (!memcmp(argptr2, "egress-pcs-distance", 20)) {
 	if (calculation_type & CALC_REGRESS_PCS) {
-	  sprintf(logbuf, "Error: --regress-pcs-distance cannot be used with --regress-pcs.%s", errstr_append);
+	  sprintf(g_logbuf, "Error: --regress-pcs-distance cannot be used with --regress-pcs.%s", errstr_append);
 	  goto main_ret_INVALID_CMDLINE_3;
 	} else if (calculation_type & CALC_DISTANCE) {
-	  sprintf(logbuf, "Error: --regress-pcs-distance cannot be used with --distance.%s", errstr_append);
+	  sprintf(g_logbuf, "Error: --regress-pcs-distance cannot be used with --distance.%s", errstr_append);
 	  goto main_ret_INVALID_CMDLINE_3;
 	}
         if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 11)) {
@@ -10434,43 +10432,43 @@ int32_t main(int32_t argc, char** argv) {
 	    regress_pcs_modifier |= REGRESS_PCS_SEX_SPECIFIC;
 	  } else if (!strcmp(argv[cur_arg + uii], "square")) {
 	    if ((dist_calc_type & DISTANCE_SHAPEMASK) == DISTANCE_SQ0) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'square' and 'square0' modifiers cannot coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'square' and 'square0' modifiers cannot coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    } else if ((dist_calc_type & DISTANCE_SHAPEMASK) == DISTANCE_TRI) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'square' and 'triangle' modifiers cannot coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'square' and 'triangle' modifiers cannot coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    } else if (parallel_tot > 1) {
-	      sprintf(logbuf, "Error: --parallel cannot be used with '--regress-pcs-distance square'.  Use\nthe square0 or triangle shape instead.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --parallel cannot be used with '--regress-pcs-distance square'.  Use\nthe square0 or triangle shape instead.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    }
 	    dist_calc_type |= DISTANCE_SQ;
 	  } else if (!strcmp(argv[cur_arg + uii], "square0")) {
 	    if ((dist_calc_type & DISTANCE_SHAPEMASK) == DISTANCE_SQ) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'square' and 'square0' modifiers cannot coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'square' and 'square0' modifiers cannot coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    } else if ((dist_calc_type & DISTANCE_SHAPEMASK) == DISTANCE_TRI) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'square0' and 'triangle' modifiers can't coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'square0' and 'triangle' modifiers can't coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    }
 	    dist_calc_type |= DISTANCE_SQ0;
 	  } else if (!strcmp(argv[cur_arg + uii], "triangle")) {
 	    if ((dist_calc_type & DISTANCE_SHAPEMASK) == DISTANCE_SQ) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'square' and 'triangle' modifiers cannot coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'square' and 'triangle' modifiers cannot coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    } else if ((dist_calc_type & DISTANCE_SHAPEMASK) == DISTANCE_SQ0) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'square0' and 'triangle' modifiers can't coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'square0' and 'triangle' modifiers can't coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    }
 	    dist_calc_type |= DISTANCE_TRI;
 	  } else if (!strcmp(argv[cur_arg + uii], "gz")) {
 	    if (dist_calc_type & DISTANCE_BIN) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'gz' and 'bin' flags cannot coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'gz' and 'bin' flags cannot coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    }
 	    dist_calc_type |= DISTANCE_GZ;
 	  } else if (!strcmp(argv[cur_arg + uii], "bin")) {
 	    if (dist_calc_type & DISTANCE_GZ) {
-	      sprintf(logbuf, "Error: --regress-pcs-distance 'gz' and 'bin' flags cannot coexist.%s", errstr_append);
+	      sprintf(g_logbuf, "Error: --regress-pcs-distance 'gz' and 'bin' flags cannot coexist.%s", errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    }
 	    dist_calc_type |= DISTANCE_BIN;
@@ -10495,11 +10493,11 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "flat-missing")) {
 	    dist_calc_type |= DISTANCE_FLAT_MISSING;
 	  } else if ((max_pcs != MAX_PCS_DEFAULT) || (argv[cur_arg + uii][0] < '0') || (argv[cur_arg + uii][0] > '9')) {
-	    sprintf(logbuf, "Error: Invalid --regress-pcs-distance parameter '%s'.%s", argv[cur_arg + uii], errstr_append);
+	    sprintf(g_logbuf, "Error: Invalid --regress-pcs-distance parameter '%s'.%s", argv[cur_arg + uii], errstr_append);
 	    goto main_ret_INVALID_CMDLINE_3;
 	  } else {
             if (scan_posint_defcap(argv[cur_arg + uii], &max_pcs)) {
-	      sprintf(logbuf, "Error: Invalid --regress-pcs-distance maximum PC count '%s'.%s", argv[cur_arg + uii], errstr_append);
+	      sprintf(g_logbuf, "Error: Invalid --regress-pcs-distance maximum PC count '%s'.%s", argv[cur_arg + uii], errstr_append);
 	      goto main_ret_INVALID_CMDLINE_3;
 	    }
 	  }
@@ -10697,7 +10695,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "omit-nonmale-y")) {
 	    recode_modifier |= RECODE_OMIT_NONMALE_Y;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --recode parameter '%s'.%s\n", argv[cur_arg + uii], ((uii == param_ct) && (!outname_end))? "  (Did you forget '--out'?)" : "");
+	    sprintf(g_logbuf, "Error: Invalid --recode parameter '%s'.%s\n", argv[cur_arg + uii], ((uii == param_ct) && (!outname_end))? "  (Did you forget '--out'?)" : "");
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -10804,7 +10802,7 @@ int32_t main(int32_t argc, char** argv) {
 	  logerrprint("Error: --recessive must be used with --linear or --logistic.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	} else if (glm_modifier & (GLM_GENOTYPIC | GLM_HETHOM | GLM_DOMINANT)) {
-	  sprintf(logbuf, "Error: --recessive conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
+	  sprintf(g_logbuf, "Error: --recessive conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	logprint("Note: --recessive flag deprecated.  Use e.g. '--linear recessive' (and\n'--condition-list [filename] recessive' to change covariate coding).\n");
@@ -10840,7 +10838,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    } else if (ld_info.modifier & (LD_INTER_CHR | LD_INPHASE | LD_DPRIME | LD_WITH_FREQS)) {
 	    main_r2_matrix_conflict:
-              sprintf(logbuf, "Error: --r/--r2 '%s' cannot be used with matrix output.\n", (ld_info.modifier & LD_INTER_CHR)? "inter-chr" : ((ld_info.modifier & LD_INPHASE)? "in-phase" : ((ld_info.modifier & LD_DPRIME)? "dprime" : "with-freqs")));
+              sprintf(g_logbuf, "Error: --r/--r2 '%s' cannot be used with matrix output.\n", (ld_info.modifier & LD_INTER_CHR)? "inter-chr" : ((ld_info.modifier & LD_INPHASE)? "in-phase" : ((ld_info.modifier & LD_DPRIME)? "dprime" : "with-freqs")));
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    ld_info.modifier |= LD_MATRIX_SQ;
@@ -10922,7 +10920,7 @@ int32_t main(int32_t argc, char** argv) {
 	  } else if (!strcmp(argv[cur_arg + uii], "yes-really")) {
 	    ld_info.modifier |= LD_YES_REALLY;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --r/--r2 parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --r/--r2 parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -10943,7 +10941,7 @@ int32_t main(int32_t argc, char** argv) {
 	calculation_type |= CALC_LD;
       } else if (!memcmp(argptr2, "eal-ref-alleles", 16)) {
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --real-ref-alleles has no effect with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --real-ref-alleles has no effect with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         misc_flags |= MISC_REAL_REF_ALLELES | MISC_KEEP_ALLELE_ORDER;
@@ -10973,8 +10971,8 @@ int32_t main(int32_t argc, char** argv) {
 	rseed_ct = param_ct;
 	rseeds = (uint32_t*)malloc(param_ct * sizeof(int32_t));
 	for (uii = 1; uii <= param_ct; uii++) {
-	  if (scan_uint_capped(argv[cur_arg + uii], &(rseeds[uii - 1]), 0xffffffffU / 10, 0xffffffffU % 10)) {
-	    sprintf(logbuf, "Error: Invalid --seed parameter '%s'.\n", argv[cur_arg + uii]);
+	  if (scan_uint_capped(argv[cur_arg + uii], 0xffffffffU / 10, 0xffffffffU % 10, &(rseeds[uii - 1]))) {
+	    sprintf(g_logbuf, "Error: Invalid --seed parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -11047,7 +11045,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --set cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --set cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
@@ -11116,36 +11114,36 @@ int32_t main(int32_t argc, char** argv) {
 	for (uii = 2; uii <= param_ct; uii++) {
 	  if (!strcmp(argv[cur_arg + uii], "tags")) {
 	    if (simulate_flags & SIMULATE_HAPS) {
-	      sprintf(logbuf, "Error: --%s 'tags' and 'haps' modifiers cannot be used together.\n", argptr);
+	      sprintf(g_logbuf, "Error: --%s 'tags' and 'haps' modifiers cannot be used together.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    simulate_flags |= SIMULATE_TAGS;
 	  } else if (!strcmp(argv[cur_arg + uii], "haps")) {
 	    if (simulate_flags & SIMULATE_TAGS) {
-	      sprintf(logbuf, "Error: --%s 'tags' and 'haps' modifiers cannot be used together.\n", argptr);
+	      sprintf(g_logbuf, "Error: --%s 'tags' and 'haps' modifiers cannot be used together.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
 	    simulate_flags |= SIMULATE_HAPS;
 	  } else if (match_upper(argv[cur_arg + uii], "ACGT")) {
 	    if (simulate_flags & (SIMULATE_1234 | SIMULATE_12)) {
-	      sprintf(logbuf, "Error: --%s 'acgt' modifier cannot be used with '1234' or '12'.\n", argptr);
+	      sprintf(g_logbuf, "Error: --%s 'acgt' modifier cannot be used with '1234' or '12'.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
             simulate_flags |= SIMULATE_ACGT;
 	  } else if (!strcmp(argv[cur_arg + uii], "1234")) {
 	    if (simulate_flags & (SIMULATE_ACGT | SIMULATE_12)) {
-	      sprintf(logbuf, "Error: --%s '1234' modifier cannot be used with 'acgt' or '12'.\n", argptr);
+	      sprintf(g_logbuf, "Error: --%s '1234' modifier cannot be used with 'acgt' or '12'.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
             simulate_flags |= SIMULATE_1234;
 	  } else if (!strcmp(argv[cur_arg + uii], "12")) {
 	    if (simulate_flags & (SIMULATE_ACGT | SIMULATE_1234)) {
-	      sprintf(logbuf, "Error: --%s '12' modifier cannot be used with 'acgt' or '1234'.\n", argptr);
+	      sprintf(g_logbuf, "Error: --%s '12' modifier cannot be used with 'acgt' or '1234'.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
 	    }
             simulate_flags |= SIMULATE_12;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --%s parameter '%s'.\n", argptr, argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --%s parameter '%s'.\n", argptr, argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -11159,7 +11157,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &simulate_cases)) {
-	  sprintf(logbuf, "Error: Invalid --simulate-ncases parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --simulate-ncases parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "imulate-ncontrols", 18)) {
@@ -11167,11 +11165,11 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (load_rare != LOAD_RARE_SIMULATE) {
-	  sprintf(logbuf, "Error: --simulate-ncontrols must be used with --simulate.\n");
+	  sprintf(g_logbuf, "Error: --simulate-ncontrols must be used with --simulate.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &simulate_controls)) {
-	  sprintf(logbuf, "Error: Invalid --simulate-ncontrols parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --simulate-ncontrols parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if ((!simulate_controls) && (!simulate_cases)) {
@@ -11183,7 +11181,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &simulate_prevalence) || (simulate_prevalence < 0) || (simulate_prevalence > 1)) {
-	  sprintf(logbuf, "Error: Invalid --simulate-prevalence parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --simulate-prevalence parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "imulate-label", 14)) {
@@ -11198,7 +11196,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &simulate_missing) || (simulate_missing < 0) || (simulate_missing > 1)) {
-	  sprintf(logbuf, "Error: Invalid --simulate-missing parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --simulate-missing parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "imulate-n", 10)) {
@@ -11210,7 +11208,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &simulate_qt_samples)) {
-	  sprintf(logbuf, "Error: Invalid --simulate-n parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --simulate-n parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "imulate-haps", 13)) {
@@ -11237,7 +11235,7 @@ int32_t main(int32_t argc, char** argv) {
 	    logerrprint("Error: --sex must be used with --linear/--logistic/--dosage.\n");
 	    goto main_ret_INVALID_CMDLINE_A;
 	  } else if (glm_modifier & GLM_NO_X_SEX) {
-	    sprintf(logbuf, "Error: --sex conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
+	    sprintf(g_logbuf, "Error: --sex conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
 	    goto main_ret_INVALID_CMDLINE_2A;
 	  }
 	  logprint("Note: --sex flag deprecated.  Use e.g. '--linear sex'.\n");
@@ -11298,7 +11296,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx <= 0) || (dxx > 1)) {
-	  sprintf(logbuf, "Error: Invalid --set-p parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --set-p parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	set_info.set_p = dxx;
@@ -11318,13 +11316,13 @@ int32_t main(int32_t argc, char** argv) {
           if (!strcmp(argv[cur_arg + 2], "write")) {
             set_info.modifier |= SET_R2_WRITE;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --set-r2 parameter '%s'.\n", argv[cur_arg + 2]);
+            sprintf(g_logbuf, "Error: Invalid --set-r2 parameter '%s'.\n", argv[cur_arg + 2]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
         if (uii <= param_ct) {
 	  if (scan_double(argv[cur_arg + uii], &dxx) || (dxx < 0.0)) {
-	    sprintf(logbuf, "Error: Invalid --set-r2 parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --set-r2 parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (dxx > 0.0) {
@@ -11347,7 +11345,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &set_info.set_max)) {
-	  sprintf(logbuf, "Error: Invalid --set-max parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --set-max parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "et-test-lambda", 15)) {
@@ -11359,7 +11357,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &set_info.set_test_lambda)) {
-	  sprintf(logbuf, "Error: Invalid --set-test-lambda parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --set-test-lambda parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (set_info.set_test_lambda < 1) {
@@ -11384,7 +11382,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if ((strlen(argv[cur_arg + 1]) != 5) || (memcmp(argv[cur_arg + 1], "no-", 3)) || (!match_upper(&(argv[cur_arg + 1][3]), "DI"))) {
-	    sprintf(logbuf, "Error: Invalid --snps-only parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --snps-only parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
           misc_flags |= MISC_SNPS_ONLY_NO_DI;
@@ -11426,7 +11424,7 @@ int32_t main(int32_t argc, char** argv) {
             splitx_bound1 = 2781479;
             splitx_bound1 = 155701383;
 	  } else {
-            sprintf(logbuf, "Error: Unrecognized --split-x build code '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Unrecognized --split-x build code '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (chrom_info.species != SPECIES_HUMAN) {
@@ -11435,11 +11433,11 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	} else {
 	  if (scan_uint_defcap(argv[cur_arg + uii], &splitx_bound1)) {
-	    sprintf(logbuf, "Error: Invalid --split-x parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --split-x parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  if (scan_posint_defcap(argv[cur_arg + ujj], &splitx_bound2) || (splitx_bound2 <= splitx_bound1)) {
-	    sprintf(logbuf, "Error: Invalid --split-x parameter '%s'.\n", argv[cur_arg + ujj]);
+	    sprintf(g_logbuf, "Error: Invalid --split-x parameter '%s'.\n", argv[cur_arg + ujj]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -11451,7 +11449,7 @@ int32_t main(int32_t argc, char** argv) {
 	goto main_ret_INVALID_CMDLINE_A;
       } else if (!memcmp(argptr2, "et-missing-var-ids", 19)) {
 	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
-	  sprintf(logbuf, "Error: --set-missing-var-ids cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  sprintf(g_logbuf, "Error: --set-missing-var-ids cannot be used with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
         if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
@@ -11529,8 +11527,8 @@ int32_t main(int32_t argc, char** argv) {
             logerrprint("Error: --score takes at most three numeric parameters.\n");
             goto main_ret_INVALID_CMDLINE_A;
 	  } else {
-	    if (scan_posint_capped(argv[cur_arg + uii], (uint32_t*)&ii, (MAXLINEBUFLEN / 2) / 10, (MAXLINEBUFLEN / 2) % 10)) {
-              sprintf(logbuf, "Error: Invalid --score parameter '%s'.\n", argv[cur_arg + uii]);
+	    if (scan_posint_capped(argv[cur_arg + uii], (MAXLINEBUFLEN / 2) / 10, (MAXLINEBUFLEN / 2) % 10, (uint32_t*)&ii)) {
+              sprintf(g_logbuf, "Error: Invalid --score parameter '%s'.\n", argv[cur_arg + uii]);
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    if (!ujj) {
@@ -11617,14 +11615,14 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (scan_double(argv[cur_arg + 1], &tail_bottom)) {
-	  sprintf(logbuf, "Error: Invalid --tail-pheno parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --tail-pheno parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (param_ct == 1) {
 	  tail_top = tail_bottom;
 	} else {
 	  if (scan_double(argv[cur_arg + 2], &tail_top)) {
-	    sprintf(logbuf, "Error: Invalid --tail-pheno parameter '%s'.\n", argv[cur_arg + 2]);
+	    sprintf(g_logbuf, "Error: Invalid --tail-pheno parameter '%s'.\n", argv[cur_arg + 2]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -11638,7 +11636,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_posint_defcap(argv[cur_arg + 1], &g_thread_ct)) {
-	  sprintf(logbuf, "Error: Invalid --threads parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --threads parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (g_thread_ct > MAX_THREADS) {
@@ -11758,7 +11756,7 @@ int32_t main(int32_t argc, char** argv) {
 	cc = argptr2[2];
 	if (cc == 'b') {
 	  if (scan_uint_defcap(argv[cur_arg + 1], (uint32_t*)&ii)) {
-	    sprintf(logbuf, "Error: Invalid --to-bp parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --to-bp parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	} else {
@@ -11767,7 +11765,7 @@ int32_t main(int32_t argc, char** argv) {
 	    goto main_ret_INVALID_CMDLINE;
 	  }
 	  if (scan_double(argv[cur_arg + 1], &dxx)) {
-	    sprintf(logbuf, "Error: Invalid --to-kb/-mb parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --to-kb/-mb parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	  dxx *= (cc == 'k')? 1000 : 1000000;
@@ -11807,7 +11805,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &thin_keep_prob)) {
-	  sprintf(logbuf, "Error: Invalid --thin variant retention probability '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --thin variant retention probability '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         if (thin_keep_prob < (0.5 / 4294967296.0)) {
@@ -11832,7 +11830,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_uint_defcap(argv[cur_arg + 1], &thin_keep_ct) || ((!thin_keep_ct) && (!(misc_flags & MISC_ALLOW_NO_VARS)))) {
-	  sprintf(logbuf, "Error: Invalid --thin-count parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --thin-count parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	filter_flags |= FILTER_BIM_REQ | FILTER_DOSAGEMAP | FILTER_NOCNV;
@@ -11842,7 +11840,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
         }
         if (scan_double(argv[cur_arg + 1], &thin_keep_sample_prob)) {
-          sprintf(logbuf, "Error: Invalid --thin-indiv %s retention probability '%s'.\n", g_species_singular, argv[cur_arg + 1]);
+          sprintf(g_logbuf, "Error: Invalid --thin-indiv %s retention probability '%s'.\n", g_species_singular, argv[cur_arg + 1]);
           goto main_ret_INVALID_CMDLINE_WWA;
         }
         if (thin_keep_sample_prob < (0.5 / 4294967296.0)) {
@@ -11862,7 +11860,7 @@ int32_t main(int32_t argc, char** argv) {
           goto main_ret_INVALID_CMDLINE_2A;
         }
         if (scan_uint_defcap(argv[cur_arg + 1], &thin_keep_sample_ct) || ((!thin_keep_sample_ct) && (!(misc_flags & MISC_ALLOW_NO_SAMPLES)))) {
-          sprintf(logbuf, "Error: Invalid --thin-indiv-count parameter '%s'.\n", argv[cur_arg + 1]);
+          sprintf(g_logbuf, "Error: Invalid --thin-indiv-count parameter '%s'.\n", argv[cur_arg + 1]);
           goto main_ret_INVALID_CMDLINE_WWA;
         }
       } else if (!memcmp(argptr2, "ests", 5)) {
@@ -11926,7 +11924,7 @@ int32_t main(int32_t argc, char** argv) {
               goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &testmiss_mperm_val)) {
-	      sprintf(logbuf, "Error: Invalid --test-missing mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --test-missing mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             testmiss_modifier |= TESTMISS_MPERM;
@@ -11938,7 +11936,7 @@ int32_t main(int32_t argc, char** argv) {
             logerrprint("Error: Improper --test-missing mperm syntax.  (Use '--test-missing\nmperm=[value]'.)\n");
             goto main_ret_INVALID_CMDLINE;
 	  } else {
-            sprintf(logbuf, "Error: Invalid --test-missing parameter '%s'.\n", argv[cur_arg + uii]);
+            sprintf(g_logbuf, "Error: Invalid --test-missing parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -11992,7 +11990,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &family_info.tdt_mperm_val)) {
-	      sprintf(logbuf, "Error: Invalid --tdt mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      sprintf(g_logbuf, "Error: Invalid --tdt mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             family_info.tdt_modifier |= TDT_MPERM;
@@ -12026,7 +12024,7 @@ int32_t main(int32_t argc, char** argv) {
 	    logerrprint("Error: Improper --tdt mperm syntax.  (Use '--tdt mperm=[value]'.)\n");
 	    goto main_ret_INVALID_CMDLINE;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --tdt parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --tdt parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -12060,7 +12058,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --tag-kb parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --tag-kb parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (dxx > 2147483.646) {
@@ -12077,7 +12075,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0) || (dxx > 1)) {
-	  sprintf(logbuf, "Error: Invalid --tag-r2 threshold '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --tag-r2 threshold '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	ld_info.show_tags_r2 = dxx;
@@ -12134,29 +12132,29 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	  if (param_ct >= uii) {
 	    if (scan_double(argv[cur_arg + uii], &rel_info.unrelated_herit_tol)) {
-	      sprintf(logbuf, "Error: Invalid --unrelated-heritability EM tolerance parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: Invalid --unrelated-heritability EM tolerance parameter '%s'.\n", argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    if (rel_info.unrelated_herit_tol <= 0.0) {
-	      sprintf(logbuf, "Error: Invalid --unrelated-heritability EM tolerance parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(g_logbuf, "Error: Invalid --unrelated-heritability EM tolerance parameter '%s'.\n", argv[cur_arg + uii]);
 	      goto main_ret_INVALID_CMDLINE_WWA;
 	    }
 	    if (param_ct > uii) {
 	      if (scan_double(argv[cur_arg + uii + 1], &rel_info.unrelated_herit_covg)) {
-		sprintf(logbuf, "Error: Invalid --unrelated-heritability genomic covariance prior '%s'.\n", argv[cur_arg + uii + 1]);
+		sprintf(g_logbuf, "Error: Invalid --unrelated-heritability genomic covariance prior '%s'.\n", argv[cur_arg + uii + 1]);
 		goto main_ret_INVALID_CMDLINE_WWA;
 	      }
 	      if ((rel_info.unrelated_herit_covg <= 0.0) || (rel_info.unrelated_herit_covg > 1.0)) {
-		sprintf(logbuf, "Error: Invalid --unrelated-heritability genomic covariance prior '%s'.\n", argv[cur_arg + uii + 1]);
+		sprintf(g_logbuf, "Error: Invalid --unrelated-heritability genomic covariance prior '%s'.\n", argv[cur_arg + uii + 1]);
 		goto main_ret_INVALID_CMDLINE_WWA;
 	      }
 	      if (param_ct == uii + 2) {
 		if (scan_double(argv[cur_arg + uii + 2], &rel_info.unrelated_herit_covr)) {
-		  sprintf(logbuf, "Error: Invalid --unrelated-heritability residual covariance prior '%s'.\n", argv[cur_arg + uii + 2]);
+		  sprintf(g_logbuf, "Error: Invalid --unrelated-heritability residual covariance prior '%s'.\n", argv[cur_arg + uii + 2]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 		if ((rel_info.unrelated_herit_covr <= 0.0) || (rel_info.unrelated_herit_covr > 1.0)) {
-		  sprintf(logbuf, "Error: Invalid --unrelated-heritability residual covariance prior '%s'.\n", argv[cur_arg + uii + 2]);
+		  sprintf(g_logbuf, "Error: Invalid --unrelated-heritability residual covariance prior '%s'.\n", argv[cur_arg + uii + 2]);
 		  goto main_ret_INVALID_CMDLINE_WWA;
 		}
 	      } else {
@@ -12239,7 +12237,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (update_map_modifier) {
 	  if (param_ct != 1) {
-	    sprintf(logbuf, "Error: Multi-parameter --update-map cannot be used with deprecated\nparameter-free --update-%s.\n", (update_map_modifier == 1)? "chr" : "cm");
+	    sprintf(g_logbuf, "Error: Multi-parameter --update-map cannot be used with deprecated\nparameter-free --update-%s.\n", (update_map_modifier == 1)? "chr" : "cm");
 	    goto main_ret_INVALID_CMDLINE_2;
 	  }
 	  retval = alloc_2col((update_map_modifier == 1)? (&update_chr) : (&update_cm), &(argv[cur_arg + 1]), argptr, 1);
@@ -12322,7 +12320,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct == 2) {
 	  if (scan_posint_defcap(argv[cur_arg + 2], &update_sex_col)) {
-	    sprintf(logbuf, "Error: Invalid --update-sex column parameter '%s'.  (This must be a positive integer.)\n", argv[cur_arg + 2]);
+	    sprintf(g_logbuf, "Error: Invalid --update-sex column parameter '%s'.  (This must be a positive integer.)\n", argv[cur_arg + 2]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -12350,11 +12348,11 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &glm_vif_thresh)) {
-	  sprintf(logbuf, "Error: Invalid --linear/--epistasis VIF threshold '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --linear/--epistasis VIF threshold '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (glm_vif_thresh < 1.0) {
-	  sprintf(logbuf, "Error: --linear/--epistasis VIF threshold '%s' too small (must be >= 1).\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: --linear/--epistasis VIF threshold '%s' too small (must be >= 1).\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "egas", 5)) {
@@ -12389,7 +12387,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_double(argv[cur_arg + 1], &vcf_min_qual) || (vcf_min_qual < 0.0)) {
-	  sprintf(logbuf, "Error: Invalid --vcf-min-qual parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --vcf-min-qual parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	vcf_min_qual *= 1 - SMALL_EPSILON;
@@ -12415,7 +12413,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &vcf_min_gp) || (vcf_min_gp <= 0.0) || (vcf_min_gp > 1.0)) {
-	  sprintf(logbuf, "Error: Invalid --vcf-min-gp parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --vcf-min-gp parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	vcf_min_gp *= 1 - SMALL_EPSILON;
@@ -12429,7 +12427,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &vcf_min_gq) || (vcf_min_gq < 0.0)) {
-	  sprintf(logbuf, "Error: Invalid --vcf-min-gq parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --vcf-min-gq parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	vcf_min_gq *= 1 - SMALL_EPSILON;
@@ -12468,7 +12466,7 @@ int32_t main(int32_t argc, char** argv) {
 	} else if ((!strcmp(argv[cur_arg + 1], "e")) || (!strcmp(argv[cur_arg + 1], "error"))) {
 	  vcf_half_call = VCF_HALF_CALL_ERROR;
 	} else {
-	  sprintf(logbuf, "Error: '%s' is not a valid mode for --vcf-half-call.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: '%s' is not a valid mode for --vcf-half-call.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "cf-require-gt", 14)) {
@@ -12492,7 +12490,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
         if (scan_posint_defcap(argv[cur_arg + 1], &write_var_range_ct)) {
-	  sprintf(logbuf, "Error: Invalid --write-var-ranges parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --write-var-ranges parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	calculation_type |= CALC_WRITE_VAR_RANGES;
@@ -12505,7 +12503,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &dxx) || (dxx < 0)) {
-	  sprintf(logbuf, "Error: Invalid --window parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --window parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
         dxx *= 500;
@@ -12570,7 +12568,7 @@ int32_t main(int32_t argc, char** argv) {
 	    }
 	    write_covar_modifier |= WRITE_COVAR_FEMALE_2;
 	  } else {
-	    sprintf(logbuf, "Error: Invalid --with-phenotype parameter '%s'.\n", argv[cur_arg + uii]);
+	    sprintf(g_logbuf, "Error: Invalid --with-phenotype parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
 	}
@@ -12604,7 +12602,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	if (param_ct) {
 	  if (strcmp(argv[cur_arg + 1], "omit-unassigned")) {
-	    sprintf(logbuf, "Error: Invalid --write-cluster parameter '%s'.\n", argv[cur_arg + 1]);
+	    sprintf(g_logbuf, "Error: Invalid --write-cluster parameter '%s'.\n", argv[cur_arg + 1]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
           misc_flags |= MISC_WRITE_CLUSTER_OMIT_UNASSIGNED;
@@ -12666,14 +12664,14 @@ int32_t main(int32_t argc, char** argv) {
 	  logerrprint("Error: --xchr-model must be used with --linear or --logistic.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	} else if (glm_modifier & (GLM_GENOTYPIC | GLM_HETHOM | GLM_DOMINANT | GLM_RECESSIVE)) {
-	  sprintf(logbuf, "Error: --xchr-model cannot be used with --%s %s.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_GENOTYPIC)? "genotypic" : ((glm_modifier & GLM_HETHOM)? "hethom" : ((glm_modifier & GLM_DOMINANT)? "dominant" : "recessive")));
+	  sprintf(g_logbuf, "Error: --xchr-model cannot be used with --%s %s.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_GENOTYPIC)? "genotypic" : ((glm_modifier & GLM_HETHOM)? "hethom" : ((glm_modifier & GLM_DOMINANT)? "dominant" : "recessive")));
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if ((argv[cur_arg + 1][1] != '\0') || (argv[cur_arg + 1][0] < '0') || (argv[cur_arg + 1][0] > '3')) {
-	  sprintf(logbuf, "Error: Invalid --xchr-model parameter '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(g_logbuf, "Error: Invalid --xchr-model parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	glm_xchr_model = (uint32_t)(argv[cur_arg + 1][0] - '0');
@@ -12712,7 +12710,7 @@ int32_t main(int32_t argc, char** argv) {
 
     main_param_zero:
       if (param_ct) {
-        sprintf(logbuf, "Error: --%s doesn't accept parameters.\n", argptr);
+        sprintf(g_logbuf, "Error: --%s doesn't accept parameters.\n", argptr);
 	goto main_ret_INVALID_CMDLINE_2A;
       }
     }
@@ -12822,11 +12820,11 @@ int32_t main(int32_t argc, char** argv) {
     }
   }
   if (update_map_modifier) {
-    sprintf(logbuf, "Error: Deprecated parameter-free --update-%s cannot be used without\n--update-map.\n", (update_map_modifier == 1)? "chr" : "cm");
+    sprintf(g_logbuf, "Error: Deprecated parameter-free --update-%s cannot be used without\n--update-map.\n", (update_map_modifier == 1)? "chr" : "cm");
     goto main_ret_INVALID_CMDLINE_2A;
   }
   if (((misc_flags & (MISC_FILL_MISSING_A2 | MISC_MERGEX | MISC_SET_ME_MISSING)) || splitx_bound2 || update_chr) && (((load_rare == LOAD_RARE_CNV) && (cnv_calc_type != CNV_WRITE)) || ((load_rare != LOAD_RARE_CNV) && (calculation_type != CALC_MAKE_BED)))) {
-    sprintf(logbuf, "Error: --merge-x/--split-x/--update-chr/--set-me-missing/--fill-missing-a2\nmust be used with --%s and no other commands.\n", (load_rare == LOAD_RARE_CNV)? "cnv-write" : "make-bed");
+    sprintf(g_logbuf, "Error: --merge-x/--split-x/--update-chr/--set-me-missing/--fill-missing-a2\nmust be used with --%s and no other commands.\n", (load_rare == LOAD_RARE_CNV)? "cnv-write" : "make-bed");
     goto main_ret_INVALID_CMDLINE_2A;
   }
   if (load_rare == LOAD_RARE_CNV) {
@@ -13221,7 +13219,7 @@ int32_t main(int32_t argc, char** argv) {
       goto main_ret_INVALID_CMDLINE_A;
     }
   }
-  if (qual_max_thresh != HUGE_DOUBLE) {
+  if (qual_max_thresh != DBL_MAX) {
     if (!qual_filter) {
       logerrprint("Error: --qual-max-threshold must be used with --qual-scores.\n");
       goto main_ret_INVALID_CMDLINE;
@@ -13252,6 +13250,9 @@ int32_t main(int32_t argc, char** argv) {
     logerrprint("Error: --merge-equal-pos must be used with --merge/--bmerge/--merge-list.\n(Note that you are permitted to merge a fileset with itself.)\n");
     goto main_ret_INVALID_CMDLINE_A;
   }
+  if (calculation_type && (!(calculation_type & (~(CALC_FREQ | CALC_MISSING_REPORT)))) && ((geno_thresh != 1.0) || (hwe_thresh != 0.0) || (min_maf != 0.0) || (max_maf != 0.5) || min_ac || (max_ac != 0x7fffffff))) {
+    logerrprint("Warning: --freq and --missing complete BEFORE --geno, --hwe, and --maf in\nplink's order of operations.\n");
+  }
   // short batch job?
   uii = 0;
   if ((!calculation_type) && (!(load_rare & (LOAD_RARE_LGEN | LOAD_RARE_DUMMY | LOAD_RARE_SIMULATE | LOAD_RARE_TRANSPOSE_MASK | LOAD_RARE_23 | LOAD_RARE_CNV | LOAD_RARE_VCF | LOAD_RARE_BCF)))) {
@@ -13279,20 +13280,20 @@ int32_t main(int32_t argc, char** argv) {
   flag_map = NULL;
   if (!rseeds) {
     ujj = (uint32_t)time(NULL);
-    sprintf(logbuf, "Random number seed: %u\n", ujj);
-    logstr(logbuf);
-    sfmt_init_gen_rand(&sfmt, ujj);
+    sprintf(g_logbuf, "Random number seed: %u\n", ujj);
+    logstr(g_logbuf);
+    sfmt_init_gen_rand(&g_sfmt, ujj);
   } else {
     if (rseed_ct == 1) {
-      sfmt_init_gen_rand(&sfmt, rseeds[0]);
+      sfmt_init_gen_rand(&g_sfmt, rseeds[0]);
     } else {
-      sfmt_init_by_array(&sfmt, rseeds, rseed_ct);
+      sfmt_init_by_array(&g_sfmt, rseeds, rseed_ct);
     }
     free(rseeds);
     rseeds = NULL;
   }
   // guarantee contiguous malloc space outside of main workspace
-  bubble = (char*)malloc(NON_WKSPACE_MIN * sizeof(char));
+  bubble = (char*)malloc(NON_BIGSTACK_MIN * sizeof(char));
   if (!bubble) {
     goto main_ret_NOMEM;
   }
@@ -13315,16 +13316,16 @@ int32_t main(int32_t argc, char** argv) {
 #endif
 #endif
   if (!llxx) {
-    default_alloc_mb = WKSPACE_DEFAULT_MB;
-  } else if (llxx < (WKSPACE_MIN_MB * 2)) {
-    default_alloc_mb = WKSPACE_MIN_MB;
+    default_alloc_mb = BIGSTACK_DEFAULT_MB;
+  } else if (llxx < (BIGSTACK_MIN_MB * 2)) {
+    default_alloc_mb = BIGSTACK_MIN_MB;
   } else {
     default_alloc_mb = llxx / 2;
   }
   if (!malloc_size_mb) {
     malloc_size_mb = default_alloc_mb;
-  } else if (malloc_size_mb < WKSPACE_MIN_MB) {
-    malloc_size_mb = WKSPACE_MIN_MB;
+  } else if (malloc_size_mb < BIGSTACK_MIN_MB) {
+    malloc_size_mb = BIGSTACK_MIN_MB;
   }
 #ifndef __LP64__
   if (malloc_size_mb > 2047) {
@@ -13332,28 +13333,28 @@ int32_t main(int32_t argc, char** argv) {
   }
 #endif
   if (llxx) {
-    sprintf(logbuf, "%" PRId64 " MB RAM detected; reserving %" PRIdPTR " MB for main workspace.\n", llxx, malloc_size_mb);
+    sprintf(g_logbuf, "%" PRId64 " MB RAM detected; reserving %" PRIdPTR " MB for main workspace.\n", llxx, malloc_size_mb);
   } else {
-    sprintf(logbuf, "Failed to calculate system memory.  Attempting to reserve %" PRIdPTR " MB.\n", malloc_size_mb);
+    sprintf(g_logbuf, "Failed to calculate system memory.  Attempting to reserve %" PRIdPTR " MB.\n", malloc_size_mb);
   }
   logprintb();
-  wkspace_ua = (unsigned char*)malloc(malloc_size_mb * 1048576 * sizeof(char));
-  while (!wkspace_ua) {
+  bigstack_ua = (unsigned char*)malloc(malloc_size_mb * 1048576 * sizeof(char));
+  while (!bigstack_ua) {
     malloc_size_mb = (malloc_size_mb * 3) / 4;
-    if (malloc_size_mb < WKSPACE_MIN_MB) {
-      malloc_size_mb = WKSPACE_MIN_MB;
+    if (malloc_size_mb < BIGSTACK_MIN_MB) {
+      malloc_size_mb = BIGSTACK_MIN_MB;
     }
-    wkspace_ua = (unsigned char*)malloc(malloc_size_mb * 1048576 * sizeof(char));
-    if (wkspace_ua) {
+    bigstack_ua = (unsigned char*)malloc(malloc_size_mb * 1048576 * sizeof(char));
+    if (bigstack_ua) {
       LOGPRINTF("Allocated %" PRIdPTR " MB successfully, after larger attempt(s) failed.\n", malloc_size_mb);
-    } else if (malloc_size_mb == WKSPACE_MIN_MB) {
+    } else if (malloc_size_mb == BIGSTACK_MIN_MB) {
       goto main_ret_NOMEM;
     }
   }
   // force 64-byte align to make cache line sensitivity work
-  wkspace = (unsigned char*)CACHEALIGN((uintptr_t)wkspace_ua);
-  wkspace_base = wkspace;
-  wkspace_left = (malloc_size_mb * 1048576 - (uintptr_t)(wkspace - wkspace_ua)) & (~(CACHELINE - ONELU));
+  bigstack_initial_base = (unsigned char*)round_up_pow2((uintptr_t)bigstack_ua, CACHELINE);
+  g_bigstack_base = bigstack_initial_base;
+  g_bigstack_end = &(bigstack_initial_base[(malloc_size_mb * 1048576 - (uintptr_t)(bigstack_initial_base - bigstack_ua)) & (~(CACHELINE - ONELU))]);
   free(bubble);
   bubble = NULL;
 
@@ -13443,7 +13444,7 @@ int32_t main(int32_t argc, char** argv) {
       } else if (load_rare & LOAD_RARE_BCF) {
 	retval = bcf_to_bed(pedname, outname, sptr, missing_pheno, misc_flags, const_fid, id_delim, vcf_idspace_to, vcf_min_qual, vcf_filter_exceptions_flattened, &chrom_info);
       } else if (load_rare == LOAD_RARE_23) {
-        retval = bed_from_23(pedname, outname, sptr, modifier_23, fid_23, iid_23, (pheno_23 == HUGE_DOUBLE)? ((double)missing_pheno) : pheno_23, misc_flags, paternal_id_23, maternal_id_23, &chrom_info);
+        retval = bed_from_23(pedname, outname, sptr, modifier_23, fid_23, iid_23, (pheno_23 == DBL_MAX)? ((double)missing_pheno) : pheno_23, misc_flags, paternal_id_23, maternal_id_23, &chrom_info);
       } else if (load_rare & LOAD_RARE_DUMMY) {
 	retval = generate_dummy(outname, sptr, dummy_flags, dummy_marker_ct, dummy_sample_ct, dummy_missing_geno, dummy_missing_pheno, missing_pheno);
       } else if (load_rare & LOAD_RARE_SIMULATE) {
@@ -13473,7 +13474,7 @@ int32_t main(int32_t argc, char** argv) {
       memcpy(memcpya(mapname, outname, uii), ".bim", 5);
       memcpy(memcpya(famname, outname, uii), ".fam", 5);
       if (calculation_type && (!(misc_flags & MISC_KEEP_AUTOCONV))) {
-	if (push_ll_str(&file_delete_list, pedname) || push_ll_str(&file_delete_list, mapname) || push_ll_str(&file_delete_list, famname)) {
+	if (push_ll_str(pedname, &file_delete_list) || push_ll_str(mapname, &file_delete_list) || push_ll_str(famname, &file_delete_list)) {
 	  goto main_ret_NOMEM;
 	}
       }
@@ -13507,7 +13508,7 @@ int32_t main(int32_t argc, char** argv) {
     retval = RET_INVALID_CMDLINE;
     break;
   main_ret_INVALID_CMDLINE_WWA:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
   main_ret_INVALID_CMDLINE_2A:
     logerrprintb();
   main_ret_INVALID_CMDLINE_A:
@@ -13515,7 +13516,7 @@ int32_t main(int32_t argc, char** argv) {
     retval = RET_INVALID_CMDLINE;
     break;
   main_ret_INVALID_CMDLINE_WW:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
   main_ret_INVALID_CMDLINE_2:
     logerrprintb();
   main_ret_INVALID_CMDLINE:
@@ -13527,16 +13528,16 @@ int32_t main(int32_t argc, char** argv) {
     } else {
       logerrprint("Warning: No output requested.  Exiting.\n");
     }
-    fputs(cmdline_format_str, stdout);
+    fputs(g_cmdline_format_str, stdout);
     fputs(notestr_null_calc2, stdout);
     retval = RET_NULL_CALC;
 #ifdef STABLE_BUILD
     break;
   main_unstable_disabled:
     // see the UNSTABLE macro in plink_common.h
-    memcpy(logbuf, "Error: --", 9);
+    memcpy(g_logbuf, "Error: --", 9);
     strcpy(sptr, " is either unfinished or not yet well-tested. If you wish to help with testing, use the latest development build.\n");
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logerrprintb();
     retval = RET_CALC_NOT_YET_SUPPORTED;
 #endif
@@ -13545,7 +13546,7 @@ int32_t main(int32_t argc, char** argv) {
   fclose_cond(scriptfile);
   disp_exit_msg(retval);
   free_cond(bubble);
-  free_cond(wkspace_ua);
+  free_cond(bigstack_ua);
   free_cond(subst_argv);
   free_cond(script_buf);
   free_cond(rerun_buf);
@@ -13653,27 +13654,27 @@ int32_t main(int32_t argc, char** argv) {
       chrom_info.incl_excl_name_stack = ll_str_ptr;
     } while (chrom_info.incl_excl_name_stack);
   }
-  if (logfile) {
+  if (g_logfile) {
     if (!g_log_failed) {
       logstr("\nEnd time: ");
       time(&rawtime);
       logstr(ctime(&rawtime));
-      if (fclose(logfile)) {
+      if (fclose(g_logfile)) {
 	fflush(stdout);
 	fputs("Error: Failed to finish writing to log.\n", stderr);
       }
     } else {
-      fclose(logfile);
+      fclose(g_logfile);
     }
-    logfile = NULL;
+    g_logfile = NULL;
   }
   if (misc_flags & MISC_GPLINK) {
     memcpy(outname_end, ".gplink", 8);
-    logfile = fopen(outname, "w");
-    if (logfile) { // can't do much if an error occurs here...
-      putc(retval? '1' : '0', logfile);
-      putc('\n', logfile);
-      fclose(logfile);
+    g_logfile = fopen(outname, "w");
+    if (g_logfile) { // can't do much if an error occurs here...
+      putc(retval? '1' : '0', g_logfile);
+      putc('\n', g_logfile);
+      fclose(g_logfile);
     }
   }
 
diff --git a/plink_assoc.c b/plink_assoc.c
index 14030dc..dc8b556 100644
--- a/plink_assoc.c
+++ b/plink_assoc.c
@@ -42,7 +42,7 @@ void single_marker_cc_freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* c
   while (sample_ctl2 >= 60) {
   single_marker_cc_freqs_loop:
     lptr_6x_end = &(lptr[cur_decr]);
-    count_2freq_dbl_60v((__m128i*)lptr, (__m128i*)lptr_6x_end, (__m128i*)ctrl_include2, (__m128i*)case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
+    count_2freq_dbl_960b((__m128i*)lptr, (__m128i*)lptr_6x_end, (__m128i*)ctrl_include2, (__m128i*)case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
     lptr = lptr_6x_end;
     ctrl_include2 = &(ctrl_include2[cur_decr]);
     case_include2 = &(case_include2[cur_decr]);
@@ -55,7 +55,7 @@ void single_marker_cc_freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* c
 #else
   uintptr_t* lptr_six_end = &(lptr[sample_ctl2 - (sample_ctl2 % 6)]);
   while (lptr < lptr_six_end) {
-    count_2freq_dbl_6(lptr, ctrl_include2, case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
+    count_2freq_dbl_24b(lptr, ctrl_include2, case_include2, &tot_ctrl_ab, &tot_ctrl_c, &tot_case_ab, &tot_case_c);
     lptr = &(lptr[6]);
     ctrl_include2 = &(ctrl_include2[6]);
     case_include2 = &(case_include2[6]);
@@ -144,8 +144,8 @@ void single_marker_cc_3freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t*
   while (sample_ctl2 >= 120) {
   single_marker_cc_3freqs_loop:
     lptr_12x_end = &(lptr[cur_decr]);
-    count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
-    count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
+    count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
+    count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
     lptr = lptr_12x_end;
     ctrl_include2 = &(ctrl_include2[cur_decr]);
     case_include2 = &(case_include2[cur_decr]);
@@ -158,8 +158,8 @@ void single_marker_cc_3freqs(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t*
 #else
   uintptr_t* lptr_twelve_end = &(lptr[sample_ctl2 - (sample_ctl2 % 12)]);
   while (lptr < lptr_twelve_end) {
-    count_3freq_12(lptr, ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
-    count_3freq_12(lptr, case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
+    count_3freq_48b(lptr, ctrl_include2, &tot_ctrl_a, &tot_ctrl_b, &tot_ctrl_c);
+    count_3freq_48b(lptr, case_include2, &tot_case_a, &tot_case_b, &tot_case_c);
     lptr = &(lptr[12]);
     ctrl_include2 = &(ctrl_include2[12]);
     case_include2 = &(case_include2[12]);
@@ -200,7 +200,7 @@ static inline void adjust_print(double pval, double output_min_p, const char* ou
   } else if (pval <= output_min_p) {
     *bufpp = memcpya(*bufpp, output_min_p_str, output_min_p_strlen);
   } else {
-    *bufpp = double_g_writewx4x(*bufpp, pval, 10, ' ');
+    *bufpp = dtoa_g_wxp4x(pval, 10, ' ', *bufpp);
   }
 }
 
@@ -210,7 +210,7 @@ static inline void adjust_print_log10(double pval, double output_min_p, const ch
   } else if (pval <= output_min_p) {
     *bufpp = memcpya(*bufpp, output_min_logp_str, output_min_logp_strlen);
   } else if (pval < 1) {
-    *bufpp = double_g_writewx4x(*bufpp, -log10(pval), 10, ' ');
+    *bufpp = dtoa_g_wxp4x(-log10(pval), 10, ' ', *bufpp);
   } else {
     *bufpp = memcpya(*bufpp, "         0 ", 11);
   }
@@ -221,7 +221,7 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
   // 1. Just p-values (pvals[]).
   // 2. T statistics (in chi[]) and dfs (in tcnt[]).
   // 3. 1df chi-square stats (in chi[]).
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t is_log10 = mtest_adjust & ADJUST_LOG10;
   uint32_t qq_plot = mtest_adjust & ADJUST_QQ;
   FILE* outfile = NULL;
@@ -258,9 +258,9 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
   uint32_t ujj;
   uint32_t loop_end;
 
-  if (wkspace_alloc_d_checked(&sp, chi_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&schi, chi_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&new_order, chi_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_d(chi_ct, &sp) ||
+      bigstack_alloc_d(chi_ct, &schi) ||
+      bigstack_alloc_ui(chi_ct, &new_order)) {
     goto multcomp_ret_NOMEM;
   }
   if (pvals) {
@@ -275,7 +275,7 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
       }
     }
   } else if (tcnt) {
-    if (wkspace_alloc_ui_checked(&new_tcnt, chi_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(chi_ct, &new_tcnt)) {
       goto multcomp_ret_NOMEM;
     }
     for (cur_idx = 0; cur_idx < chi_ct; cur_idx++) {
@@ -350,9 +350,9 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
   }
 
   // handle reverse-order calculations
-  if (wkspace_alloc_d_checked(&pv_bh, chi_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&pv_by, chi_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&pv_gc, chi_ct * sizeof(double))) {
+  if (bigstack_alloc_d(chi_ct, &pv_bh) ||
+      bigstack_alloc_d(chi_ct, &pv_by) ||
+      bigstack_alloc_d(chi_ct, &pv_gc)) {
     goto multcomp_ret_NOMEM;
   }
   if (adjust_gc) {
@@ -403,19 +403,19 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
 
   uii = strlen(outname_end);
   memcpy(&(outname_end[uii]), ".adjusted", 10);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto multcomp_ret_OPEN_FAIL;
   }
   if (!is_set_test) {
-    sprintf(tbuf, " CHR %%%us      UNADJ %s", plink_maxsnp, skip_gc? "" : "        GC ");
-    fprintf(outfile, tbuf, "SNP");
+    sprintf(g_textbuf, " CHR %%%us      UNADJ %s", plink_maxsnp, skip_gc? "" : "        GC ");
+    fprintf(outfile, g_textbuf, "SNP");
   } else {
     plink_maxsnp = max_marker_id_len - 1;
     if (plink_maxsnp < 3) {
       plink_maxsnp = 3;
     }
-    sprintf(tbuf, " %%%us      UNADJ ", plink_maxsnp);
-    fprintf(outfile, tbuf, "SET");
+    sprintf(g_textbuf, " %%%us      UNADJ ", plink_maxsnp);
+    fprintf(outfile, g_textbuf, "SET");
   }
   if (qq_plot) {
     fputs("        QQ ", outfile);
@@ -430,14 +430,14 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
     if (output_min_p == 0.0) {
       memcpy(output_min_p_str, "       INF ", 11);
     } else {
-      bufptr = double_g_writewx4x(output_min_p_str, output_min_p, 10, ' ');
+      bufptr = dtoa_g_wxp4x(output_min_p, 10, ' ', output_min_p_str);
       output_min_p_strlen = (uintptr_t)(bufptr - output_min_p_str);
     }
   } else {
     if (output_min_p == 0.0) {
       memcpy(output_min_p_str, "       INF ", 11);
     } else {
-      bufptr = double_g_writewx4x(output_min_p_str, -log10(output_min_p), 10, ' ');
+      bufptr = dtoa_g_wxp4x(-log10(output_min_p), 10, ' ', output_min_p_str);
       output_min_p_strlen = (uintptr_t)(bufptr - output_min_p_str);
     }
   }
@@ -457,14 +457,14 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
       }
       marker_uidx = new_order[cur_idx];
       if (!is_set_test) {
-        bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
+        bufptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx), g_textbuf));
       } else {
-        bufptr = tbuf;
+        bufptr = g_textbuf;
       }
       *bufptr++ = ' ';
       bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
       *bufptr++ = ' ';
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto multcomp_ret_WRITE_FAIL;
       }
       bonf = pval * dct;
@@ -500,14 +500,14 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
 	pv_sidak_sd = dyy;
       }
 
-      bufptr = tbuf;
+      bufptr = g_textbuf;
       if (!is_log10) {
 	adjust_print(unadj_pval, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
 	if (!skip_gc) {
 	  adjust_print(pv_gc[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
 	}
 	if (qq_plot) {
-	  bufptr = double_g_writewx4x(bufptr, (((double)((int32_t)cur_idx)) + 0.5) * dzz, 10, ' ');
+	  bufptr = dtoa_g_wxp4x((((double)((int32_t)cur_idx)) + 0.5) * dzz, 10, ' ', bufptr);
 	}
 	adjust_print(bonf, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
 	adjust_print(pv_holm, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
@@ -521,7 +521,7 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
 	  adjust_print_log10(pv_gc[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
 	}
 	if (qq_plot) {
-	  bufptr = double_g_writewx4x(bufptr, (((double)((int32_t)cur_idx)) + 0.5) * dzz, 10, ' ');
+	  bufptr = dtoa_g_wxp4x((((double)((int32_t)cur_idx)) + 0.5) * dzz, 10, ' ', bufptr);
 	}
 	adjust_print_log10(bonf, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
 	adjust_print_log10(pv_holm, output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
@@ -531,7 +531,7 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
 	adjust_print_log10(pv_by[cur_idx], output_min_p, output_min_p_str, output_min_p_strlen, &bufptr);
       }
       *bufptr++ = '\n';
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto multcomp_ret_WRITE_FAIL;
       }
     }
@@ -559,7 +559,7 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
   }
  multcomp_ret_1:
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -589,7 +589,7 @@ void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict_
   //    32-bit ints, zeroed out, and the second loop restarts.
   // Note that results_bufs[] is assumed to be zeroed out before this function
   // is called.
-  uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
 #ifdef __LP64__
   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
@@ -697,7 +697,7 @@ void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict_
 }
 
 void calc_qgit(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
-  uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
 #ifdef __LP64__
   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
   uint32_t row_mult = perm_vec_ctcl8m / 4;
@@ -800,7 +800,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_per
 }
 
 void calc_qgit_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
-  uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
 #ifdef __LP64__
   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
   uint32_t row_mult = perm_vec_ctcl8m / 4;
@@ -898,7 +898,7 @@ uintptr_t rem_cost_60v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
   __m128i result_a;
   __m128i acc_a;
   __m128i acc_b;
-  __uni16 acc;
+  __univec acc;
   acc.vi = _mm_setzero_si128();
   do {
     loader = *vec1++;
@@ -944,7 +944,7 @@ uintptr_t qrem_cost2_40v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
   __m128i result_b;
   __m128i result_c;
   __m128i inner_acc;
-  __uni16 acc;
+  __univec acc;
   acc.vi = _mm_setzero_si128();
   do {
     loader = *vec1++;
@@ -1281,7 +1281,7 @@ static inline void calc_rem_merge32_minus(uint32_t perm_ct4, uintptr_t* __restri
 #endif
 
 void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, uint32_t* perm_vecst, uint32_t* results_bufs, uint32_t* thread_wkspace) {
-  uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
   // [cur_xor - 1][cur_raw]
   // low 8 bits give index of first remv[] array to increment; next 8 bits give
@@ -1452,8 +1452,8 @@ void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, u
 }
 
 void calc_qrem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
-  uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
+  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
 #ifdef __LP64__
   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
   uint32_t row_mult = perm_vec_ctcl8m / 4;
@@ -1717,8 +1717,8 @@ void calc_qrem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf,
 }
 
 void calc_qrem_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
-  uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
+  uint32_t pheno_nm_ctl2x = QUATERCT_TO_WORDCT(pheno_nm_ct);
 #ifdef __LP64__
   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
   uint32_t row_mult = perm_vec_ctcl8m / 4;
@@ -2105,7 +2105,7 @@ void check_for_better_rem_cost(uintptr_t best_cost, uint32_t maxt_block_base, ui
   //   2 * (<-> neither side homcom) + (<-> homcom) + constant
   // Simple lower bound:
   //   max(delta(homcom), delta(non-homcom)) + constant
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uint32_t marker_idx_tmp = maxt_block_base;
   uint32_t loop_ceil = maxt_block_base2;
   int32_t homrar_ct = pheno_nm_ct - missing_ct - het_ct - homcom_ct;
@@ -2152,17 +2152,13 @@ static double* g_mperm_save_all;
 // any better than the usual PLINK 2-bit format.
 static uintptr_t* g_loadbuf;
 
-static uintptr_t* g_perm_vecs;
-
 static uint32_t* g_perm_vecst; // genotype indexing support
 static uint32_t* g_thread_git_wkspace;
 static uint32_t* g_resultbuf;
 
 // always use genotype indexing for QT --assoc
-static double* g_perm_vecstd;
 static double* g_thread_git_qbufs;
 static double* g_qresultbuf;
-static double* g_pheno_d2;
 static double g_pheno_sum;
 static double g_pheno_ssq;
 static uint16_t* g_ldrefs;
@@ -2251,16 +2247,13 @@ static uintptr_t* g_is_invalid_bitfield;
 static uint32_t g_model_fisher;
 static uint32_t g_fisher_midp;
 static uint32_t g_assoc_thread_ct;
-static uintptr_t g_perm_vec_ct;
 static uint32_t g_maxt_block_base;
 static uint32_t g_block_start;
 static uint32_t g_qblock_start;
 static uint32_t g_block_diff;
 static uint32_t g_perms_done;
 static uint32_t g_first_adapt_check;
-static uint32_t g_pheno_nm_ct;
 static uint32_t g_male_ct;
-static uint32_t g_case_ct;
 static double g_adaptive_intercept;
 static double g_adaptive_slope;
 static double g_aperm_alpha;
@@ -2273,241 +2266,12 @@ static uint32_t g_min_ploidy_1;
 
 static int32_t g_is_model_prec;
 
-static uint32_t g_is_perm1;
-static uint32_t g_tot_quotient;
-static uint64_t g_totq_magic;
-static uint32_t g_totq_preshift;
-static uint32_t g_totq_postshift;
-static uint32_t g_totq_incr;
-
-static uint32_t g_cluster_ct;
-static uint32_t* g_cluster_map;
-static uint32_t* g_cluster_starts;
-static uint32_t* g_cluster_case_cts;
-
-// per-cluster magic number sets
-static uintptr_t* g_cluster_cc_perm_preimage;
-static uint32_t* g_tot_quotients;
-static uint64_t* g_totq_magics;
-static uint32_t* g_totq_preshifts;
-static uint32_t* g_totq_postshifts;
-static uint32_t* g_totq_incrs;
-
-static uint32_t* g_sample_to_cluster;
-static uint32_t* g_qassoc_cluster_thread_wkspace;
-
 static uint32_t* g_male_case_cts;
 
-THREAD_RET_TYPE model_assoc_gen_perms_thread(void* arg) {
-  intptr_t tidx = (intptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
-  uint32_t case_ct = g_case_ct;
-  uint32_t tot_quotient = g_tot_quotient;
-  uint64_t totq_magic = g_totq_magic;
-  uint32_t totq_preshift = g_totq_preshift;
-  uint32_t totq_postshift = g_totq_postshift;
-  uint32_t totq_incr = g_totq_incr;
-  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
-  sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
-  uintptr_t pheno_nm_ctv = (pheno_nm_ct + (BITCT - 1)) / BITCT;
-  uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_assoc_thread_ct;
-  uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_assoc_thread_ct;
-  if (!g_is_perm1) {
-    pheno_nm_ctv *= 2;
-    for (; pidx < pmax; pidx++) {
-      generate_cc_perm_vec(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
-    }
-  } else {
-    pheno_nm_ctv = (pheno_nm_ctv + 1) & (~1);
-    for (; pidx < pmax; pidx++) {
-      generate_cc_perm1(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
-    }
-  }
-  THREAD_RETURN;
-}
-
-THREAD_RET_TYPE model_assoc_gen_cluster_perms_thread(void* arg) {
-  intptr_t tidx = (intptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
-  sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
-  uintptr_t pheno_nm_ctv = (pheno_nm_ct + (BITCT - 1)) / BITCT;
-  uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_assoc_thread_ct;
-  uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_assoc_thread_ct;
-  uint32_t cluster_ct = g_cluster_ct;
-  uint32_t* cluster_map = g_cluster_map;
-  uint32_t* cluster_starts = g_cluster_starts;
-  uint32_t* cluster_case_cts = g_cluster_case_cts;
-  uintptr_t* cluster_cc_perm_preimage = g_cluster_cc_perm_preimage;
-  uint32_t* tot_quotients = g_tot_quotients;
-  uint64_t* totq_magics = g_totq_magics;
-  uint32_t* totq_preshifts = g_totq_preshifts;
-  uint32_t* totq_postshifts = g_totq_postshifts;
-  uint32_t* totq_incrs = g_totq_incrs;
-  if (!g_is_perm1) {
-    pheno_nm_ctv *= 2;
-    for (; pidx < pmax; pidx++) {
-      generate_cc_cluster_perm_vec(pheno_nm_ct, cluster_cc_perm_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
-    }
-  } else {
-    pheno_nm_ctv = (pheno_nm_ctv + 1) & (~1);
-    for (; pidx < pmax; pidx++) {
-      generate_cc_cluster_perm1(pheno_nm_ct, cluster_cc_perm_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
-    }
-  }
-  THREAD_RETURN;
-}
-
-THREAD_RET_TYPE qassoc_gen_perms_thread(void* arg) {
-  // Used by QT --assoc and --make-perm-pheno.
-  //
-  // Takes an array of phenotype values in g_pheno_d2 of length g_pheno_nm_ct,
-  // and populates g_perm_vecstd[] with permutations of those values.  Also
-  // requires g_sfmtp_arr[] and g_assoc_thread_ct to be initialized.
-  //
-  // g_perm_vecstd is sample-major.  The nth permutation is stored across
-  //   g_perm_vecstd[n]
-  //   g_perm_vecstd[n + perm_vec_ctcl8m]
-  //   g_perm_vecstd[n + 2 * perm_vec_ctcl8m]
-  //   ...
-  uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
-  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL;
-  double* pheno_d2 = g_pheno_d2;
-  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
-  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_assoc_thread_ct);
-  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_assoc_thread_ct);
-  double* perm_vecstd = &(g_perm_vecstd[pmin]);
-  uint32_t poffset = 0;
-  uint32_t sample_idx = 1;
-  uint32_t pdiff;
-  uint32_t tot_quotient;
-  uint32_t upper_bound;
-  uint64_t totq_magic;
-  uint32_t totq_preshift;
-  uint32_t totq_postshift;
-  uint32_t totq_incr;
-  uint32_t urand;
-  uint32_t uii;
-  double* wptr;
-  double* wptr2;
-  double* wptr3;
-  double cur_source;
-  if (tidx + 1 == g_assoc_thread_ct) {
-    pmax = g_perm_vec_ct;
-  }
-  pdiff = pmax - pmin;
-  cur_source = *pheno_d2++;
-  wptr = perm_vecstd;
-  for (; poffset < pdiff; poffset++) {
-    *wptr++ = cur_source;
-  }
-  for (; sample_idx < pheno_nm_ct; sample_idx++) {
-    tot_quotient = 0x100000000LLU / (sample_idx + 1);
-    upper_bound = (sample_idx + 1) * tot_quotient - 1;
-    magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
-    cur_source = *pheno_d2++;
-    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]);
-    wptr2 = perm_vecstd;
-    for (poffset = 0; poffset < pdiff; poffset++) {
-      do {
-	urand = sfmt_genrand_uint32(sfmtp);
-      } while (urand > upper_bound);
-      uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
-      wptr3 = &(wptr2[uii * perm_vec_ctcl8m]);
-      *wptr++ = *wptr3;
-      *wptr3 = cur_source;
-      wptr2++;
-    }
-  }
-  THREAD_RETURN;
-}
-
-THREAD_RET_TYPE qassoc_gen_cluster_perms_thread(void* arg) {
-  // Variant of qassoc_gen_perms_thread() which restricts permutations to be
-  // within-cluster.
-  // On top of the qassoc_gen_perms_thread requirements, this also needs
-  // g_cluster_ct, g_cluster_map, g_cluster_starts,
-  // g_qassoc_cluster_thread_wkspace, and g_sample_to_cluster to be
-  // initialized.
-  uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
-  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL;
-  double* pheno_d2 = g_pheno_d2;
-  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
-  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_assoc_thread_ct);
-  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_assoc_thread_ct);
-  double* perm_vecstd = &(g_perm_vecstd[pmin]);
-  uint32_t cluster_ct = g_cluster_ct;
-  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
-  uint32_t* cluster_map = g_cluster_map;
-  uint32_t* cluster_starts = g_cluster_starts;
-  uint32_t* in_cluster_positions = &(g_qassoc_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]);
-  uint32_t* sample_to_cluster = g_sample_to_cluster;
-  uint32_t poffset = 0;
-  uint32_t sample_idx = 0;
-  uint32_t* cur_map_start;
-  uint32_t pdiff;
-  uint32_t cluster_idx;
-  uint32_t cur_in_cluster_pos;
-  uint32_t tot_quotient;
-  uint32_t upper_bound;
-  uint64_t totq_magic;
-  uint32_t totq_preshift;
-  uint32_t totq_postshift;
-  uint32_t totq_incr;
-  uint32_t urand;
-  uint32_t uii;
-  double* wptr;
-  double* wptr2;
-  double* wptr3;
-  double cur_source;
-  if (tidx + 1 == g_assoc_thread_ct) {
-    pmax = g_perm_vec_ct;
-  }
-  pdiff = pmax - pmin;
-  fill_uint_zero(in_cluster_positions, cluster_ct);
-  for (; sample_idx < pheno_nm_ct; sample_idx++) {
-    cur_source = *pheno_d2++;
-    cluster_idx = sample_to_cluster[sample_idx];
-    if (cluster_idx == 0xffffffffU) {
-      cur_in_cluster_pos = 0;
-    } else {
-      cur_in_cluster_pos = in_cluster_positions[cluster_idx];
-      in_cluster_positions[cluster_idx] += 1;
-    }
-    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]);
-    if (!cur_in_cluster_pos) {
-      for (poffset = 0; poffset < pdiff; poffset++) {
-        *wptr++ = cur_source;
-      }
-    } else {
-      cur_map_start = &(cluster_map[cluster_starts[cluster_idx]]);
-      tot_quotient = 0x100000000LLU / (cur_in_cluster_pos + 1);
-      upper_bound = (cur_in_cluster_pos + 1) * tot_quotient - 1;
-      magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
-      wptr2 = perm_vecstd;
-      for (poffset = 0; poffset < pdiff; poffset++) {
-	do {
-	  urand = sfmt_genrand_uint32(sfmtp);
-	} while (urand > upper_bound);
-	uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
-	wptr3 = &(wptr2[cur_map_start[uii] * perm_vec_ctcl8m]);
-	*wptr++ = *wptr3;
-	*wptr3 = cur_source;
-	wptr2++;
-      }
-    }
-  }
-  THREAD_RETURN;
-}
-
 THREAD_RET_TYPE assoc_adapt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
@@ -2515,7 +2279,7 @@ THREAD_RET_TYPE assoc_adapt_thread(void* arg) {
   uint32_t fisher_midp = g_fisher_midp;
   uint32_t precomp_width = g_precomp_width;
   uint32_t first_adapt_check = g_first_adapt_check;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uintptr_t* __restrict__ male_vec = g_sample_male_include2;
   uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
@@ -2617,14 +2381,14 @@ THREAD_RET_TYPE assoc_adapt_thread(void* arg) {
       }
       for (pidx = 0; pidx < perm_vec_ct;) {
 	if (!min_ploidy_1) {
-	  vec_set_freq(pheno_nm_ctl2, &(loadbuf[marker_bidx * pheno_nm_ctl2]), &(perm_vecs[pidx * pheno_nm_ctl2]), &case_set_ct, &case_missing_ct);
+	  genovec_set_freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
 	} else if (is_x) {
-	  vec_set_freq_x(pheno_nm_ctl2, &(loadbuf[marker_bidx * pheno_nm_ctl2]), &(perm_vecs[pidx * pheno_nm_ctl2]), male_vec, &case_set_ct, &case_missing_ct);
+	  genovec_set_freq_x(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
 	} else if (!is_y) {
-	  vec_3freq(pheno_nm_ctl2, &(loadbuf[marker_bidx * pheno_nm_ctl2]), &(perm_vecs[pidx * pheno_nm_ctl2]), &case_missing_ct, &uii, &case_set_ct);
+	  genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &uii, &case_set_ct);
 	  case_missing_ct += uii;
 	} else {
-	  vec_set_freq_y(pheno_nm_ctl2, &(loadbuf[marker_bidx * pheno_nm_ctl2]), &(perm_vecs[pidx * pheno_nm_ctl2]), nonmale_vec, &case_set_ct, &case_missing_ct);
+	  genovec_set_freq_y(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
 	}
 	// deliberate underflow
 	uii = (uint32_t)(case_missing_ct - missing_start);
@@ -2690,28 +2454,25 @@ THREAD_RET_TYPE assoc_adapt_thread(void* arg) {
 
 THREAD_RET_TYPE assoc_maxt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t model_fisher = g_model_fisher;
   uint32_t fisher_midp = g_fisher_midp;
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+
+  // currently safe for this to be uint32_t since perm_vec_ct < 2^30
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   uint32_t precomp_width = g_precomp_width;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uintptr_t* __restrict__ male_vec = g_sample_male_include2;
   uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
@@ -2838,7 +2599,7 @@ THREAD_RET_TYPE assoc_maxt_thread(void* arg) {
       gpui = &(precomp_ui[6 * precomp_width * marker_bidx]);
       missing_start = precomp_start[marker_bidx];
       success_2incr = 0;
-      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctl2]);
+      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
       if (!is_x_or_y) {
 	ldref = ldrefs[marker_idx];
 	if (!min_ploidy_1) {
@@ -2859,20 +2620,12 @@ THREAD_RET_TYPE assoc_maxt_thread(void* arg) {
 	  ldrefs[marker_idx] = ldref;
 	}
 	if (ldref == marker_bidx) {
-#ifdef __LP64__
-	  fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-	  fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+	  fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
 	  calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-	  fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-	  fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+	  fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
 	} else {
 	  memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
-	  calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctl2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
+	  calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
 	}
       }
       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
@@ -2886,9 +2639,9 @@ THREAD_RET_TYPE assoc_maxt_thread(void* arg) {
 	  }
 	} else {
 	  if (is_x) {
-	    vec_set_freq_x(pheno_nm_ctl2, loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctl2]), male_vec, &case_set_ct, &case_missing_ct);
+	    genovec_set_freq_x(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
 	  } else {
-	    vec_set_freq_y(pheno_nm_ctl2, loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctl2]), nonmale_vec, &case_set_ct, &case_missing_ct);
+	    genovec_set_freq_y(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
 	  }
 	}
 	// deliberate underflow
@@ -2974,23 +2727,18 @@ THREAD_RET_TYPE assoc_set_thread(void* arg) {
   // (possible todo: permit Fisher test, converting p-values into equivalent
   // chi-square stats?)
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
   uint32_t* resultbuf = g_resultbuf;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uintptr_t* __restrict__ male_vec = g_sample_male_include2;
   uintptr_t* __restrict__ nonmale_vec = g_sample_nonmale_include2;
   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
@@ -3051,22 +2799,14 @@ THREAD_RET_TYPE assoc_set_thread(void* arg) {
 	row1x_sum = min_ploidy * case_ct;
 	tot_obs = min_ploidy * (pheno_nm_ct - missing_ct);
       }
-      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctl2]);
+      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
       if (!is_x_or_y) {
 	git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
 	git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
 	git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+	fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+	fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       }
       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
 	if (!is_x_or_y) {
@@ -3079,9 +2819,9 @@ THREAD_RET_TYPE assoc_set_thread(void* arg) {
 	  }
 	} else {
 	  if (is_x) {
-	    vec_set_freq_x(pheno_nm_ctl2, loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctl2]), male_vec, &case_set_ct, &case_missing_ct);
+	    genovec_set_freq_x(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), male_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
 	  } else {
-	    vec_set_freq_y(pheno_nm_ctl2, loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctl2]), nonmale_vec, &case_set_ct, &case_missing_ct);
+	    genovec_set_freq_y(loadbuf_cur, &(perm_vecs[pidx * pheno_nm_ctv2]), nonmale_vec, pheno_nm_ctv2, &case_set_ct, &case_missing_ct);
 	  }
 	}
 	// Fisher's exact test not supported since we are adding raw chi-square
@@ -3099,13 +2839,13 @@ THREAD_RET_TYPE assoc_set_thread(void* arg) {
 
 THREAD_RET_TYPE qassoc_adapt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t first_adapt_check = g_first_adapt_check;
   uint32_t max_thread_ct = g_assoc_thread_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* git_qt_g_prod = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 3]);
   double* git_qt_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 1)]);
   double* git_qt_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 2)]);
@@ -3223,11 +2963,11 @@ THREAD_RET_TYPE qassoc_adapt_thread(void* arg) {
 	    next_cqg = ulii + (ulii >> 2);
 	  }
 	  next_cqg -= pidx_offset;
-	  next_cqg = CACHEALIGN_DBL(next_cqg);
+	  next_cqg = round_up_pow2(next_cqg, CACHELINE_DBL);
 	  if (next_cqg > perm_vec_ct) {
 	    next_cqg = perm_vec_ct;
 	  }
-	  calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctl2]), &(perm_vecstd[pidx]), &(git_qt_g_prod[pidx]));
+	  calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecstd[pidx]), &(git_qt_g_prod[pidx]));
 	}
 	qt_sum = pheno_sum - git_qt_sum[pidx];
 	qt_ssq = pheno_ssq - git_qt_ssq[pidx];
@@ -3277,13 +3017,13 @@ THREAD_RET_TYPE qassoc_adapt_thread(void* arg) {
 
 THREAD_RET_TYPE qassoc_adapt_lin_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t first_adapt_check = g_first_adapt_check;
   uint32_t max_thread_ct = g_assoc_thread_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* git_qt_het_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 6]);
   double* git_qt_het_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 1)]);
   double* git_qt_homrar_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 6 + 2)]);
@@ -3397,11 +3137,11 @@ THREAD_RET_TYPE qassoc_adapt_lin_thread(void* arg) {
 	    next_cqg = ulii + (ulii >> 2);
 	  }
 	  next_cqg -= pidx_offset;
-	  next_cqg = CACHEALIGN_DBL(next_cqg);
+	  next_cqg = round_up_pow2(next_cqg, CACHELINE_DBL);
 	  if (next_cqg > perm_vec_ct) {
 	    next_cqg = perm_vec_ct;
 	  }
-	  calc_qgit_lin(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctl2]), &(perm_vecstd[pidx]), &(git_qt_het_sum[pidx]));
+	  calc_qgit_lin(pheno_nm_ct, perm_vec_ctcl8m, next_cqg - pidx, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecstd[pidx]), &(git_qt_het_sum[pidx]));
 	}
 	qt_sum = pheno_sum - git_qt_missing_sum[pidx];
 	qt_ssq = pheno_ssq - git_qt_missing_ssq[pidx];
@@ -3454,10 +3194,10 @@ THREAD_RET_TYPE qassoc_adapt_lin_thread(void* arg) {
 
 THREAD_RET_TYPE qassoc_maxt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   uint32_t max_thread_ct = g_assoc_thread_ct;
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   double* __restrict__ perm_vecstd = g_perm_vecstd;
@@ -3662,10 +3402,10 @@ THREAD_RET_TYPE qassoc_maxt_thread(void* arg) {
 
 THREAD_RET_TYPE qassoc_maxt_lin_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   uint32_t max_thread_ct = g_assoc_thread_ct;
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   double* __restrict__ perm_vecstd = g_perm_vecstd;
@@ -3848,11 +3588,11 @@ THREAD_RET_TYPE qassoc_set_thread(void* arg) {
   // Simplified version of qassoc_adapt/maxt_thread(), except we need to save
   // actual t-statistics.
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t max_thread_ct = g_assoc_thread_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* git_qt_g_prod = &(g_thread_git_qbufs[perm_vec_ctcl8m * tidx * 3]);
   double* git_qt_sum = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 1)]);
   double* git_qt_ssq = &(g_thread_git_qbufs[perm_vec_ctcl8m * (tidx * 3 + 2)]);
@@ -3920,7 +3660,7 @@ THREAD_RET_TYPE qassoc_set_thread(void* arg) {
       nanal_m2_recip = 1.0 / ((double)(((int32_t)nanal) - 2));
       geno_mean = ((double)geno_sum) * nanal_recip;
       geno_var_recip = 1.0 / ((((double)geno_ssq) - geno_sum * geno_mean) * nanal_m1_recip);
-      calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctl2]), perm_vecstd, git_qt_g_prod);
+      calc_qgit(pheno_nm_ct, perm_vec_ctcl8m, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecstd, git_qt_g_prod);
       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
 	qt_sum = pheno_sum - git_qt_sum[pidx];
 	qt_ssq = pheno_ssq - git_qt_ssq[pidx];
@@ -3944,8 +3684,8 @@ THREAD_RET_TYPE qassoc_set_thread(void* arg) {
 
 THREAD_RET_TYPE model_adapt_domrec_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
@@ -3953,7 +3693,7 @@ THREAD_RET_TYPE model_adapt_domrec_thread(void* arg) {
   uint32_t fisher_midp = g_fisher_midp;
   uint32_t precomp_width = g_precomp_width;
   uint32_t first_adapt_check = g_first_adapt_check;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   int32_t is_model_prec = g_is_model_prec;
   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
@@ -4044,7 +3784,7 @@ THREAD_RET_TYPE model_adapt_domrec_thread(void* arg) {
       success_2start = perm_2success_ct[marker_idx];
       success_2incr = 0;
       for (pidx = 0; pidx < perm_vec_ct;) {
-	vec_3freq(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), &case_missing_ct, &uii, &case_homx_ct);
+	genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &uii, &case_homx_ct);
 	if (is_model_prec) {
 	  case_homx_ct = case_ct - case_homx_ct - case_missing_ct - uii;
 	}
@@ -4112,28 +3852,23 @@ THREAD_RET_TYPE model_adapt_domrec_thread(void* arg) {
 
 THREAD_RET_TYPE model_maxt_domrec_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t model_fisher = g_model_fisher;
   uint32_t fisher_midp = g_fisher_midp;
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   uint32_t precomp_width = g_precomp_width;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   int32_t is_model_prec = g_is_model_prec;
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
@@ -4260,17 +3995,9 @@ THREAD_RET_TYPE model_maxt_domrec_thread(void* arg) {
 	ldrefs[marker_idx] = ldref;
       }
       if (ldref == marker_bidx) {
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+	fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
 	calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+	fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       } else {
 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
@@ -4358,23 +4085,18 @@ THREAD_RET_TYPE model_maxt_domrec_thread(void* arg) {
 THREAD_RET_TYPE model_set_domrec_thread(void* arg) {
   // Similar to assoc_set_thread().
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
   uint32_t* resultbuf = g_resultbuf;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   int32_t is_model_prec = g_is_model_prec;
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
   double* msa_ptr = NULL;
@@ -4425,21 +4147,13 @@ THREAD_RET_TYPE model_set_domrec_thread(void* arg) {
       } else {
         col1_sum = homcom_ct;
       }
-      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctl2]);
+      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-      fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+      fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-      fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+      fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
 	case_missing_ct = git_missing_cts[pidx];
         if (is_model_prec) {
@@ -4460,14 +4174,14 @@ THREAD_RET_TYPE model_set_domrec_thread(void* arg) {
 
 THREAD_RET_TYPE model_adapt_trend_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t precomp_width = g_precomp_width;
   uint32_t first_adapt_check = g_first_adapt_check;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
@@ -4543,7 +4257,7 @@ THREAD_RET_TYPE model_adapt_trend_thread(void* arg) {
       chisq_high = orig_chisq[marker_idx] + EPSILON;
       chisq_low = orig_chisq[marker_idx] - EPSILON;
       for (pidx = 0; pidx < perm_vec_ct;) {
-	vec_set_freq(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), &case_com_ct, &case_missing_ct);
+	genovec_set_freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_com_ct, &case_missing_ct);
 	// deliberate underflow
 	uii = (uint32_t)(case_missing_ct - missing_start);
 	if (uii < precomp_width) {
@@ -4599,26 +4313,21 @@ THREAD_RET_TYPE model_adapt_trend_thread(void* arg) {
 
 THREAD_RET_TYPE model_maxt_trend_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   uint32_t precomp_width = g_precomp_width;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
   double* __restrict__ mperm_save_all = g_mperm_save_all;
@@ -4721,17 +4430,9 @@ THREAD_RET_TYPE model_maxt_trend_thread(void* arg) {
 	ldrefs[marker_idx] = ldref;
       }
       if (ldref == marker_bidx) {
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+	fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+	fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       } else {
 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
@@ -4795,23 +4496,18 @@ THREAD_RET_TYPE model_set_trend_thread(void* arg) {
   // Similar to model_set_domrec_thread().  (In fact, it's so similar that it
   // may be appropriate to merge the functions.)
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
   uint32_t* resultbuf = g_resultbuf;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
   double* msa_ptr = NULL;
   uintptr_t* loadbuf;
@@ -4855,21 +4551,13 @@ THREAD_RET_TYPE model_set_trend_thread(void* arg) {
       tot_obs = pheno_nm_ct - missing_ct;
       het_ct = het_cts[marker_idx];
       homcom_ct = homcom_cts[marker_idx];
-      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctl2]);
+      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-      fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+      fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-      fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+      fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
 	case_missing_ct = git_missing_cts[pidx];
 	case_com_ct = 2 * (case_ct - case_missing_ct - git_homrar_cts[pidx]) - git_het_cts[pidx];
@@ -4886,15 +4574,15 @@ THREAD_RET_TYPE model_set_trend_thread(void* arg) {
 
 THREAD_RET_TYPE model_adapt_gen_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t model_fisher = g_model_fisher;
   uint32_t fisher_midp = g_fisher_midp;
   uint32_t first_adapt_check = g_first_adapt_check;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
@@ -4982,7 +4670,7 @@ THREAD_RET_TYPE model_adapt_gen_thread(void* arg) {
       success_2start = perm_2success_ct[marker_idx];
       success_2incr = 0;
       for (pidx = 0; pidx < perm_vec_ct;) {
-	vec_3freq(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), &case_missing_ct, &case_het_ct, &case_homcom_ct);
+	genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &case_het_ct, &case_homcom_ct);
 	if (model_fisher) {
 	  uii = case_ct - case_het_ct - case_homcom_ct - case_missing_ct;
 	  // this is very slow.  a precomputed 2-dimensional table could
@@ -5035,27 +4723,22 @@ THREAD_RET_TYPE model_adapt_gen_thread(void* arg) {
 
 THREAD_RET_TYPE model_maxt_gen_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t model_fisher = g_model_fisher;
   uint32_t fisher_midp = g_fisher_midp;
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
   double* __restrict__ mperm_save_all = g_mperm_save_all;
@@ -5165,17 +4848,9 @@ THREAD_RET_TYPE model_maxt_gen_thread(void* arg) {
 	ldrefs[marker_idx] = ldref;
       }
       if (ldref == marker_bidx) {
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+	fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+	fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       } else {
 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
@@ -5228,8 +4903,8 @@ THREAD_RET_TYPE model_maxt_gen_thread(void* arg) {
 
 THREAD_RET_TYPE model_adapt_best_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
@@ -5237,7 +4912,7 @@ THREAD_RET_TYPE model_adapt_best_thread(void* arg) {
   uint32_t fisher_midp = g_fisher_midp;
   uint32_t precomp_width = g_precomp_width;
   uint32_t first_adapt_check = g_first_adapt_check;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
   uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
@@ -5331,7 +5006,7 @@ THREAD_RET_TYPE model_adapt_best_thread(void* arg) {
       success_2start = perm_2success_ct[marker_idx];
       success_2incr = 0;
       for (pidx = 0; pidx < perm_vec_ct;) {
-	vec_3freq(pheno_nm_ctv2, &(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), &case_missing_ct, &case_het_ct, &case_homcom_ct);
+	genovec_3freq(&(loadbuf[marker_bidx * pheno_nm_ctv2]), &(perm_vecs[pidx * pheno_nm_ctv2]), pheno_nm_ctv2, &case_missing_ct, &case_het_ct, &case_homcom_ct);
 	case_homrar_ct = case_ct - case_missing_ct - case_het_ct - case_homcom_ct;
 	case_com_ct = case_het_ct + 2 * case_homcom_ct;
 	ujj = 0; // best increment so far
@@ -5464,28 +5139,23 @@ THREAD_RET_TYPE model_adapt_best_thread(void* arg) {
 
 THREAD_RET_TYPE model_maxt_best_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
   uint32_t model_fisher = g_model_fisher;
   uint32_t fisher_midp = g_fisher_midp;
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   uint32_t precomp_width = g_precomp_width;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
   uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
   double* __restrict__ mperm_save_all = g_mperm_save_all;
@@ -5612,17 +5282,9 @@ THREAD_RET_TYPE model_maxt_best_thread(void* arg) {
 	ldrefs[marker_idx] = ldref;
       }
       if (ldref == marker_bidx) {
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+	fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
 	calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+	fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       } else {
 	memcpy(git_homrar_cts, &(resultbuf[3 * ldref * perm_vec_ctcl4m]), 3 * perm_vec_ctcl4m * sizeof(int32_t));
 	calc_rem(pheno_nm_ct, perm_vec_ct, loadbuf_cur, &(loadbuf[ldref * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
@@ -5803,23 +5465,18 @@ THREAD_RET_TYPE model_maxt_best_thread(void* arg) {
 THREAD_RET_TYPE model_set_best_thread(void* arg) {
   // Similar to model_set_domrec_thread().
   uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uint32_t assoc_thread_ct = g_assoc_thread_ct;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-#ifdef __LP64__
-  uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
-#else
-  uint32_t perm_ct64 = (perm_vec_ct + 63) / 64;
-  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct64 * 144]);
-#endif
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+  uint32_t perm_ctvc = BITCT_TO_VECCT(perm_vec_ct);
+  uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ctvc * 144 * BYTECT4]);
   uint32_t* git_homrar_cts = NULL;
   uint32_t* git_missing_cts = NULL;
   uint32_t* git_het_cts = NULL;
-  uintptr_t perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
+  uintptr_t perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
   uint32_t* resultbuf = g_resultbuf;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
   double* msa_ptr = NULL;
   uintptr_t* loadbuf;
@@ -5877,21 +5534,13 @@ THREAD_RET_TYPE model_set_best_thread(void* arg) {
       com_ct = 2 * homcom_ct + het_ct;
       homrar_ct = tot_obs - homcom_ct - het_ct;
       skip_domrec = IS_SET(is_invalid, marker_idx);
-      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctl2]);
+      loadbuf_cur = &(loadbuf[marker_bidx * pheno_nm_ctv2]);
       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
-#else
-      fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
-#endif
+      fill_uint_zero(git_homrar_cts, 3 * perm_vec_ctcl4m);
       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
-#else
-      fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-#endif
+      fill_uint_zero(thread_git_wkspace, perm_ctvc * 72 * BYTECT4);
       for (pidx = 0; pidx < perm_vec_ct; pidx++) {
 	case_missing_ct = git_missing_cts[pidx];
 	case_het_ct = git_het_cts[pidx];
@@ -5940,14 +5589,14 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   //    refers to all markers contained in at least one *significant* set.
   //    orig_chisq is collapsed before permutation to be congruent to this
   //    marker_exclude.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t* marker_exclude = marker_exclude_mid;
   uintptr_t* unstopped_markers = NULL;
   uintptr_t* loadbuf = g_loadbuf;
   uintptr_t* sample_male_include2 = g_sample_male_include2;
   uintptr_t* perm_adapt_set_unstopped = NULL;
-  char* tbuf2 = &(tbuf[MAXLINELEN]);
+  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
   double* orig_chisq = g_orig_chisq;
   double* sorted_chisq_buf = NULL;
   uint32_t* marker_idx_to_uidx = NULL;
@@ -5955,7 +5604,7 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   uint32_t* proxy_arr = NULL;
   uint32_t* perm_2success_ct = NULL;
   uint32_t* perm_attempt_ct = NULL;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uintptr_t marker_ct = marker_ct_mid;
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
   uintptr_t ulii = 0;
@@ -5974,7 +5623,7 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   double* chisq_ptr;
   double* read_dptr;
   double* write_dptr;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uint32_t** setdefs;
   uint32_t** ld_map;
   uintptr_t marker_uidx;
@@ -6049,7 +5698,7 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   if (max_thread_ct > perms_total) {
     max_thread_ct = perms_total;
   }
-  if (wkspace_init_sfmtp(max_thread_ct)) {
+  if (bigstack_init_sfmtp(max_thread_ct)) {
     goto model_assoc_set_test_ret_NOMEM;
   }
   marker_unstopped_ct = marker_ct;
@@ -6058,7 +5707,7 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   // generate a permutation batch, efficiently compute chi-square stats for all
   // variants in at least one tested set, compute set score, compare to base
   // set score.
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
  model_assoc_set_test_more_perms:
   if (perms_done) {
     uii = apip->init_interval;
@@ -6077,7 +5726,7 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   //   perm_vec_ct * (9 * max_thread_ct + 20 * MODEL_BLOCKSIZE +
   //                    pheno_nm_ct / 8 + sizeof(intptr_t) * pheno_nm_ctv2
   //                    + marker_ct * sizeof(double))
-  perm_vec_ct = 128 * (wkspace_left / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 1152LL * max_thread_ct + 2560LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
+  perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 1152LL * max_thread_ct + 2560LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
   if (perm_vec_ct > perm_batch_size) {
     perm_vec_ct = perm_batch_size;
   }
@@ -6086,49 +5735,39 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   } else if (!perm_vec_ct) {
     goto model_assoc_set_test_ret_NOMEM;
   }
-  perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
+  perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
   perms_done += perm_vec_ct;
   g_perms_done = perms_done;
   g_perm_vec_ct = perm_vec_ct;
-  g_perm_vecs = (uintptr_t*)wkspace_alloc(perm_vec_ct * pheno_nm_ctv2 * sizeof(intptr_t));
-  if (perm_vec_ct > max_thread_ct) {
-    assoc_thread_ct = max_thread_ct;
-  } else {
-    assoc_thread_ct = perm_vec_ct;
-  }
-  g_assoc_thread_ct = assoc_thread_ct;
+  bigstack_alloc_ul(perm_vec_ct * pheno_nm_ctv2, &g_perm_vecs);
+  g_perm_generation_thread_ct = MINV(max_thread_ct, perm_vec_ct);
   ulii = 0;
-  if (!g_cluster_starts) {
-    if (spawn_threads(threads, &model_assoc_gen_perms_thread, assoc_thread_ct)) {
+  if (!g_perm_cluster_starts) {
+    if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
       goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    model_assoc_gen_perms_thread((void*)ulii);
+    generate_cc_perms_thread((void*)ulii);
   } else {
-    if (spawn_threads(threads, &model_assoc_gen_cluster_perms_thread, assoc_thread_ct)) {
+    if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
       goto model_assoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    model_assoc_gen_cluster_perms_thread((void*)ulii);
+    generate_cc_cluster_perms_thread((void*)ulii);
   }
-  join_threads(threads, assoc_thread_ct);
+  join_threads(threads, g_perm_generation_thread_ct);
   g_assoc_thread_ct = max_thread_ct;
-  g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
+  bigstack_alloc_ui(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE, &g_resultbuf);
 #ifdef __LP64__
-  ulii = ((perm_vec_ct + 127) / 128) * 16;
-  g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
+  ulii = ((perm_vec_ct + 127) / 128) * 4;
+  bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
 #else
-  ulii = ((perm_vec_ct + 31) / 32) * 4;
-  g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
-  ulii = ((perm_vec_ct + 63) / 64) * 8;
+  ulii = (perm_vec_ct + 31) / 32;
+  bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
+  ulii = ((perm_vec_ct + 63) / 64) * 2;
 #endif
-  g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
+  bigstack_calloc_ui(ulii * 72 * max_thread_ct, &g_thread_git_wkspace);
   transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
-#ifdef __LP64__
-  fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
-#else
-  fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
-#endif
-  g_mperm_save_all = (double*)wkspace_alloc(MODEL_BLOCKSIZE * perm_vec_ct * sizeof(double));
-  chisq_pmajor = (double*)wkspace_alloc(marker_ct * perm_vec_ct * sizeof(double));
+  bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all);
+  bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor);
   chrom_fo_idx = 0xffffffffU;
   marker_uidx = next_unset_unsafe(marker_exclude, 0);
   if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
@@ -6178,7 +5817,7 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
 	}
       }
       loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto model_assoc_set_test_ret_READ_FAIL;
       }
       g_adapt_m_table[block_size] = marker_idx2++;
@@ -6240,7 +5879,7 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
     marker_idx += block_size;
   } while (marker_idx < marker_unstopped_ct);
   compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
-  wkspace_reset(wkspace_mark2);
+  bigstack_reset(bigstack_mark2);
   if (perms_done < perms_total) {
     if (model_modifier & MODEL_PERM) {
       if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
@@ -6279,25 +5918,26 @@ int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
     break;
   }
  model_assoc_set_test_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 void get_model_assoc_precomp_bounds(uint32_t missing_ct, uint32_t is_model, uint32_t* minp, uint32_t* ctp) {
   // Estimate which case missing counts are most common.
-  // Expected value = (g_case_ct * missing_ct / g_pheno_nm_ct)
+  // Expected value = (g_perm_case_ct * missing_ct / g_perm_pheno_nm_ct)
   // If X-chromosome and (!is_model):
-  //   Lower bound = max(0, missing_ct - 2 * (g_pheno_nm_ct - g_case_ct))
-  //   Upper bound = min(g_case_ct * 2, missing_ct)
+  //   Lower bound = max(0, missing_ct - 2 * (g_perm_pheno_nm_ct -
+  //                 g_perm_case_ct))
+  //   Upper bound = min(g_perm_case_ct * 2, missing_ct)
   //   (Could be a bit more precise if we tracked missing male and female
   //    counts separately, but whatever)
   //   Each male automatically contributes 1 to initial missing_ct!
   // Otherwise:
-  //   Lower bound = max(0, missing_ct - (g_pheno_nm_ct - g_case_ct))
-  //   Upper bound = min(g_case_ct, missing_ct)
-  double xval = ((double)(g_case_ct * ((int64_t)missing_ct))) / ((double)((intptr_t)g_pheno_nm_ct));
+  //   Lower bound = max(0, missing_ct - (g_perm_pheno_nm_ct - g_perm_case_ct))
+  //   Upper bound = min(g_perm_case_ct, missing_ct)
+  double xval = ((double)(g_perm_case_ct * ((int64_t)missing_ct))) / ((double)((intptr_t)g_perm_pheno_nm_ct));
   intptr_t lbound = (intptr_t)(xval + EPSILON + 1 - ((double)((intptr_t)g_precomp_width)) * 0.5);
-  intptr_t ctrl_ct = g_pheno_nm_ct - g_case_ct;
+  intptr_t ctrl_ct = g_perm_pheno_nm_ct - g_perm_case_ct;
   intptr_t ubound = missing_ct;
   intptr_t lii;
   if (lbound < 0) {
@@ -6305,13 +5945,13 @@ void get_model_assoc_precomp_bounds(uint32_t missing_ct, uint32_t is_model, uint
   }
   if (g_is_x && (!is_model)) {
     lii = missing_ct - (2 * ctrl_ct);
-    if (((uintptr_t)ubound) > g_case_ct * 2) {
-      ubound = g_case_ct * 2;
+    if (((uintptr_t)ubound) > g_perm_case_ct * 2) {
+      ubound = g_perm_case_ct * 2;
     }
   } else {
     lii = missing_ct - ctrl_ct;
-    if (((uintptr_t)ubound) > g_case_ct) {
-      ubound = g_case_ct;
+    if (((uintptr_t)ubound) > g_perm_case_ct) {
+      ubound = g_perm_case_ct;
     }
   }
   if (lii > lbound) {
@@ -6326,11 +5966,11 @@ void get_model_assoc_precomp_bounds(uint32_t missing_ct, uint32_t is_model, uint
 }
 
 int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_cell_ct, uint32_t model_mperm_val, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marke [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   int32_t retval = 0;
   FILE* outfile = NULL;
   FILE* outfile_msa = NULL;
@@ -6400,7 +6040,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
   uint32_t model_fisher = model_modifier & MODEL_FISHER;
   uint32_t model_fisherx = model_fisher && (!(model_modifier & MODEL_PTREND));
   uint32_t fisher_midp = model_modifier & MODEL_FISHER_MIDP;
-  char* writebuf = tbuf;
+  char* writebuf = g_textbuf;
   char* chrom_name_ptr = NULL;
   uint32_t chrom_name_len = 0;
   char chrom_name_buf[3 + MAX_CHROM_TEXTNUM_LEN];
@@ -6475,31 +6115,31 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     goto model_assoc_ret_1;
   }
   if (max_marker_allele_len > MAXLINELEN) {
-    if (wkspace_alloc_c_checked(&writebuf, 2 * max_marker_allele_len + MAXLINELEN)) {
+    if (bigstack_alloc_c(2 * max_marker_allele_len + MAXLINELEN, &writebuf)) {
       goto model_assoc_ret_NOMEM;
     }
   }
   g_model_fisher = model_fisher;
   g_fisher_midp = fisher_midp;
-  g_pheno_nm_ct = pheno_nm_ct;
+  g_perm_pheno_nm_ct = pheno_nm_ct;
   perms_done = 0;
   g_is_model_prec = model_modifier / MODEL_PREC;
-  g_is_perm1 = 0;
+  g_perm_is_1bit = 0;
   g_mperm_save_all = NULL;
   g_sample_male_include2 = NULL;
   if (is_set_test) {
-    if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
       goto model_assoc_ret_NOMEM;
     }
     memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_and(founder_pnm, founder_info, unfiltered_sample_ctl);
+    bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
     if (extract_set_union_unfiltered(sip, NULL, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
       goto model_assoc_ret_NOMEM;
     }
   }
   if (model_maxt_nst) {
     perms_total = model_mperm_val;
-    if (wkspace_alloc_d_checked(&maxt_extreme_stat, sizeof(double) * perms_total)) {
+    if (bigstack_alloc_d(perms_total, &maxt_extreme_stat)) {
       goto model_assoc_ret_NOMEM;
     }
     g_maxt_extreme_stat = maxt_extreme_stat;
@@ -6512,7 +6152,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     }
     if (mperm_save & MPERM_DUMP_ALL) {
       memcpy(outname_end, ".mperm.dump.all", 16);
-      if (fopen_checked(&outfile_msa, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile_msa)) {
 	goto model_assoc_ret_OPEN_FAIL;
       }
       LOGPRINTFWW("Dumping all permutation %svalues to %s .\n", model_fisherx? "p-" : "chi-square ", outname);
@@ -6531,7 +6171,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       g_adaptive_slope = apip->interval_slope;
     }
   }
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
     goto model_assoc_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
@@ -6541,15 +6181,15 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     } else {
       outname_end2 = memcpyb(outname_end, ".assoc", 7);
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto model_assoc_ret_OPEN_FAIL;
     }
-    sprintf(logbuf, "Writing C/C --assoc report to %s ... ", outname);
-    wordwrap(logbuf, 25); // strlen("[generating permutations]")
+    sprintf(g_logbuf, "Writing C/C --assoc report to %s ... ", outname);
+    wordwrapb(25); // strlen("[generating permutations]")
     logprintb();
     fflush(stdout);
-    sprintf(tbuf, " CHR %%%us         BP   A1 ", plink_maxsnp);
-    fprintf(outfile, tbuf, "SNP");
+    sprintf(g_textbuf, " CHR %%%us         BP   A1 ", plink_maxsnp);
+    fprintf(outfile, g_textbuf, "SNP");
     if (assoc_counts) {
       fputs("     C_A      C_U   A2 ", outfile);
     } else {
@@ -6595,11 +6235,11 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       }
     }
     outname_end2 = memcpyb(outname_end, ".model", 7);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto model_assoc_ret_OPEN_FAIL;
     }
-    sprintf(logbuf, "Writing --model report to %s ... ", outname);
-    wordwrap(logbuf, 25);
+    sprintf(g_logbuf, "Writing --model report to %s ... ", outname);
+    wordwrapb(25);
     logprintb();
     fflush(stdout);
     if (model_perm_best && model_perms) {
@@ -6613,8 +6253,8 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     } else if (model_modifier & MODEL_PTREND) {
       outname_end2 = memcpyb(outname_end2, ".trend", 7);
     }
-    sprintf(tbuf, " CHR %%%us   A1   A2     TEST            AFF          UNAFF ", plink_maxsnp);
-    fprintf(outfile, tbuf, "SNP");
+    sprintf(g_textbuf, " CHR %%%us   A1   A2     TEST            AFF          UNAFF ", plink_maxsnp);
+    fprintf(outfile, g_textbuf, "SNP");
     if (!model_fisher) {
       fputs("       CHISQ   DF ", outfile);
     } else {
@@ -6624,26 +6264,26 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       goto model_assoc_ret_WRITE_FAIL;
     }
   }
-  marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  marker_ctl = BITCT_TO_WORDCT(marker_ct);
   g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
-  if (wkspace_alloc_ul_checked(&loadbuf, MODEL_BLOCKSIZE * pheno_nm_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&orig_pvals, marker_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&missing_cts, marker_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv2, &loadbuf) ||
+      bigstack_alloc_d(marker_ct, &orig_pvals) ||
+      bigstack_alloc_ui(marker_ct, &missing_cts)) {
     goto model_assoc_ret_NOMEM;
   }
   g_loadbuf = loadbuf;
   g_orig_pvals = orig_pvals;
   g_missing_cts = missing_cts;
   if (model_assoc) {
-    if (wkspace_alloc_d_checked(&orig_odds, marker_ct * sizeof(double)) ||
-        wkspace_alloc_ui_checked(&set_cts, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_d(marker_ct, &orig_odds) ||
+        bigstack_alloc_ui(marker_ct, &set_cts)) {
       goto model_assoc_ret_NOMEM;
     }
     g_set_cts = set_cts;
   }
   if ((!model_assoc) || model_maxt_nst) {
-    if (wkspace_alloc_ui_checked(&het_cts, marker_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&homcom_cts, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(marker_ct, &het_cts) ||
+        bigstack_alloc_ui(marker_ct, &homcom_cts)) {
       goto model_assoc_ret_NOMEM;
     }
     g_het_cts = het_cts;
@@ -6651,14 +6291,14 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
   }
   gender_req = ((x_code != -1) && is_set(chrom_info_ptr->chrom_mask, x_code)) || (model_assoc && (((y_code != -1) && is_set(chrom_info_ptr->chrom_mask, y_code))));
   if (gender_req) {
-    if (wkspace_alloc_ul_checked(&g_sample_nonmale_include2, pheno_nm_ctv2 * sizeof(intptr_t)) ||
-	wkspace_alloc_ul_checked(&sample_male_include2, pheno_nm_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(pheno_nm_ctv2, &g_sample_nonmale_include2) ||
+	bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_include2)) {
       goto model_assoc_ret_NOMEM;
     }
     g_sample_male_include2 = sample_male_include2;
-    vec_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_include2);
+    quaterarr_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_include2);
     male_ct = popcount01_longs(sample_male_include2, pheno_nm_ctv2);
-    vec_init_invert(pheno_nm_ct, g_sample_nonmale_include2, sample_male_include2);
+    quatervec_01_init_invert(sample_male_include2, pheno_nm_ct, g_sample_nonmale_include2);
     nonmale_ct = pheno_nm_ct - male_ct;
   }
   // Set test does not support Fisher stats, so currently guaranteed to be
@@ -6666,48 +6306,46 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
   // generation of synthetic chi-square stats from Fisher p-values.
   fill_orig_chisq = (!model_fisherx) || (mtest_adjust && (!model_fisher));
   if (fill_orig_chisq) {
-    if (wkspace_alloc_d_checked(&orig_chisq, marker_ct * sizeof(double))) {
+    if (bigstack_calloc_d(marker_ct, &orig_chisq)) {
       goto model_assoc_ret_NOMEM;
     }
-    fill_double_zero(orig_chisq, marker_ct);
   }
   g_orig_chisq = orig_chisq;
 
   if (model_perms) {
     if (cluster_starts) {
-      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, &g_cluster_case_cts, &g_cluster_cc_perm_preimage);
+      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
       if (retval) {
 	goto model_assoc_ret_1;
       }
-      if (!g_cluster_ct) {
+      if (!g_perm_cluster_ct) {
         logerrprint("Error: No size 2+ clusters for permutation test.\n");
 	goto model_assoc_ret_INVALID_CMDLINE;
       }
-      retval = cluster_alloc_and_populate_magic_nums(g_cluster_ct, g_cluster_map, g_cluster_starts, &g_tot_quotients, &g_totq_magics, &g_totq_preshifts, &g_totq_postshifts, &g_totq_incrs);
+      retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
       if (retval) {
         goto model_assoc_ret_1;
       }
     } else {
-      g_cluster_starts = NULL;
+      g_perm_cluster_starts = NULL;
     }
     if (!is_set_test) {
       if (max_thread_ct > perms_total) {
 	max_thread_ct = perms_total;
       }
-      if (wkspace_init_sfmtp(max_thread_ct)) {
+      if (bigstack_init_sfmtp(max_thread_ct)) {
 	goto model_assoc_ret_NOMEM;
       }
     }
     if (model_perm_best) {
-      if (wkspace_alloc_ul_checked(&is_invalid_bitfield, marker_ctl * sizeof(intptr_t))) {
+      if (bigstack_calloc_ul(marker_ctl, &is_invalid_bitfield)) {
 	goto model_assoc_ret_NOMEM;
       }
       g_is_invalid_bitfield = is_invalid_bitfield;
-      fill_ulong_zero(is_invalid_bitfield, marker_ctl);
     }
 
     if (!is_set_test) {
-      g_ldrefs = (uint16_t*)wkspace_alloc(marker_ct * sizeof(int16_t));
+      g_ldrefs = (uint16_t*)bigstack_alloc(marker_ct * sizeof(int16_t));
       if (!g_ldrefs) {
 	goto model_assoc_ret_NOMEM;
       }
@@ -6726,47 +6364,48 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	precomp_width = 0;
       }
       g_precomp_width = precomp_width;
-      if (wkspace_alloc_ui_checked(&perm_2success_ct, marker_ct * sizeof(int32_t))) {
+      if (bigstack_calloc_ui(marker_ct, &perm_2success_ct)) {
 	goto model_assoc_ret_NOMEM;
       }
       if (model_maxt_nst) {
 	if (model_fisherx) {
 	  if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC))) {
-	    if (wkspace_alloc_ui_checked(&precomp_ui, precomp_width * 6 * MODEL_BLOCKSIZE * sizeof(int32_t)) ||
-		wkspace_alloc_d_checked(&precomp_d, precomp_width * 2 * MODEL_BLOCKSIZE * sizeof(double))) {
+	    if (bigstack_alloc_ui(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_ui) ||
+		bigstack_alloc_d(precomp_width * 2 * MODEL_BLOCKSIZE, &precomp_d)) {
 	      goto model_assoc_ret_NOMEM;
 	    }
 	  } else if (model_perm_best) {
-	    if (wkspace_alloc_ui_checked(&precomp_ui, precomp_width * 18 * MODEL_BLOCKSIZE * sizeof(int32_t)) ||
-		wkspace_alloc_d_checked(&precomp_d, precomp_width * 6 * MODEL_BLOCKSIZE * sizeof(double))) {
+	    if (bigstack_alloc_ui(precomp_width * 18 * MODEL_BLOCKSIZE, &precomp_ui) ||
+		bigstack_alloc_d(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_d)) {
 	      goto model_assoc_ret_NOMEM;
 	    }
 	  }
 	} else if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC | MODEL_PTREND))) {
-	  if (wkspace_alloc_ui_checked(&precomp_ui, precomp_width * 6 * MODEL_BLOCKSIZE * sizeof(int32_t)) ||
-	      wkspace_alloc_d_checked(&precomp_d, precomp_width * 2 * MODEL_BLOCKSIZE * sizeof(double))) {
+	  if (bigstack_alloc_ui(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_ui) ||
+	      bigstack_alloc_d(precomp_width * 2 * MODEL_BLOCKSIZE, &precomp_d)) {
 	    goto model_assoc_ret_NOMEM;
 	  }
 	} else if (model_perm_best) {
-	  if (wkspace_alloc_ui_checked(&precomp_ui, precomp_width * 18 * MODEL_BLOCKSIZE * sizeof(int32_t)) ||
-	      wkspace_alloc_d_checked(&precomp_d, precomp_width * 6 * MODEL_BLOCKSIZE * sizeof(double))) {
+	  if (bigstack_alloc_ui(precomp_width * 18 * MODEL_BLOCKSIZE, &precomp_ui) ||
+	      bigstack_alloc_d(precomp_width * 6 * MODEL_BLOCKSIZE, &precomp_d)) {
 	    goto model_assoc_ret_NOMEM;
 	  }
 	}
       } else if (model_assoc || (model_modifier & (MODEL_PDOM | MODEL_PREC | MODEL_PTREND))) {
-	if (wkspace_alloc_ui_checked(&precomp_ui, precomp_width * 4 * MODEL_BLOCKSIZE * sizeof(int32_t))) {
+	if (bigstack_alloc_ui(precomp_width * 4 * MODEL_BLOCKSIZE, &precomp_ui)) {
 	  goto model_assoc_ret_NOMEM;
 	}
       } else if (model_perm_best) {
-	if (wkspace_alloc_ui_checked(&precomp_ui, precomp_width * 12 * MODEL_BLOCKSIZE * sizeof(int32_t))) {
+	if (bigstack_alloc_ui(precomp_width * 12 * MODEL_BLOCKSIZE, &precomp_ui)) {
 	  goto model_assoc_ret_NOMEM;
 	}
       }
       g_perm_2success_ct = perm_2success_ct;
-      fill_uint_zero(perm_2success_ct, marker_ct);
       if (model_adapt_nst) {
-	if (wkspace_alloc_ui_checked(&perm_attempt_ct, marker_ct * sizeof(int32_t)) ||
-	    wkspace_alloc_uc_checked(&perm_adapt_stop, marker_ct)) {
+	if (bigstack_alloc_ui(marker_ct, &perm_attempt_ct) ||
+
+	    // we need to zero out trailing bytes of the last word
+	    bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &perm_adapt_stop)) {
 	  goto model_assoc_ret_NOMEM;
 	}
 	g_perm_attempt_ct = perm_attempt_ct;
@@ -6775,40 +6414,39 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	for (uii = 0; uii < marker_ct; uii++) {
 	  perm_attempt_ct[uii] = ujj;
 	}
-	fill_ulong_zero((uintptr_t*)perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
       }
     }
     if (!cluster_starts) {
-      g_tot_quotient = 0x100000000LLU / pheno_nm_ct;
-      magic_num(g_tot_quotient, &g_totq_magic, &g_totq_preshift, &g_totq_postshift, &g_totq_incr);
+      g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
+      magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
     }
   }
   g_precomp_ui = precomp_ui;
   g_precomp_d = precomp_d;
-  if (wkspace_alloc_ul_checked(&sample_ctrl_include2, pheno_nm_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&sample_case_include2, pheno_nm_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(pheno_nm_ctv2, &sample_ctrl_include2) ||
+      bigstack_alloc_ul(pheno_nm_ctv2, &sample_case_include2)) {
     goto model_assoc_ret_NOMEM;
   }
-  vec_collapse_init(pheno_c, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_case_include2);
+  quaterarr_collapse_init(pheno_c, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_case_include2);
   case_ct = popcount01_longs(sample_case_include2, pheno_nm_ctv2);
-  g_case_ct = case_ct;
-  vec_init_invert(pheno_nm_ct, sample_ctrl_include2, sample_case_include2);
+  g_perm_case_ct = case_ct;
+  quatervec_01_init_invert(sample_case_include2, pheno_nm_ct, sample_ctrl_include2);
   ctrl_ct = pheno_nm_ct - case_ct;
   if (gender_req) {
     // todo: get rid of these and just use the functions called by the
     // permutation tests
-    if (wkspace_alloc_ul_checked(&sample_nonmale_ctrl_include2, pheno_nm_ctv2 * sizeof(intptr_t)) ||
-	wkspace_alloc_ul_checked(&sample_nonmale_case_include2, pheno_nm_ctv2 * sizeof(intptr_t)) ||
-	wkspace_alloc_ul_checked(&sample_male_ctrl_include2, pheno_nm_ctv2 * sizeof(intptr_t)) ||
-	wkspace_alloc_ul_checked(&sample_male_case_include2, pheno_nm_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(pheno_nm_ctv2, &sample_nonmale_ctrl_include2) ||
+	bigstack_alloc_ul(pheno_nm_ctv2, &sample_nonmale_case_include2) ||
+	bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_ctrl_include2) ||
+	bigstack_alloc_ul(pheno_nm_ctv2, &sample_male_case_include2)) {
       goto model_assoc_ret_NOMEM;
     }
-    vec_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_case_include2);
-    bitfield_and(sample_male_case_include2, sample_case_include2, pheno_nm_ctv2);
+    quaterarr_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_case_include2);
+    bitvec_and(sample_case_include2, pheno_nm_ctv2, sample_male_case_include2);
     case_male_ct = popcount01_longs(sample_male_case_include2, pheno_nm_ctv2);
-    bitfield_andnot_copy(pheno_nm_ctv2, sample_male_ctrl_include2, sample_male_include2, sample_male_case_include2);
-    bitfield_andnot_copy(pheno_nm_ctv2, sample_nonmale_case_include2, sample_case_include2, sample_male_case_include2);
-    bitfield_andnot_copy(pheno_nm_ctv2, sample_nonmale_ctrl_include2, sample_ctrl_include2, sample_male_ctrl_include2);
+    bitvec_andnot_copy(sample_male_include2, sample_male_case_include2, pheno_nm_ctv2, sample_male_ctrl_include2);
+    bitvec_andnot_copy(sample_case_include2, sample_male_case_include2, pheno_nm_ctv2, sample_nonmale_case_include2);
+    bitvec_andnot_copy(sample_ctrl_include2, sample_male_ctrl_include2, pheno_nm_ctv2, sample_nonmale_ctrl_include2);
     ctrl_male_ct = male_ct - case_male_ct;
     case_nonmale_ct = case_ct - case_male_ct;
     ctrl_nonmale_ct = ctrl_ct - ctrl_male_ct;
@@ -6819,7 +6457,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     loadbuf[uii * pheno_nm_ctv2 - 1] = 0;
   }
   if (model_perms) {
-    if (wkspace_left < pheno_nm_ctv2 * sizeof(intptr_t)) {
+    if (bigstack_left() < pheno_nm_ctv2 * sizeof(intptr_t)) {
       goto model_assoc_ret_NOMEM;
     }
   }
@@ -6842,7 +6480,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	}
 	g_first_adapt_check = uii;
       }
-      perm_vec_ct = wkspace_left / (pheno_nm_ctv2 * sizeof(intptr_t));
+      perm_vec_ct = bigstack_left() / (pheno_nm_ctv2 * sizeof(intptr_t));
     } else {
       // perm_vec_ct memory allocation dependencies:
       //   g_maxt_thread_results: (8 * perm_vec_ct, cacheline-aligned) *
@@ -6869,9 +6507,9 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       // 8 * perm_vec_ct bytes, multiplying by 128 yields 1024, and
       // 1152 + 1024 = 2176.
       if (mperm_save & MPERM_DUMP_ALL) {
-        perm_vec_ct = 128 * (wkspace_left / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
+        perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct + 128LL * sizeof(double) * marker_ct));
       } else {
-        perm_vec_ct = 128 * (wkspace_left / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct));
+        perm_vec_ct = 128 * (bigstack_left() / (128LL * sizeof(intptr_t) * pheno_nm_ctv2 + 2176LL * max_thread_ct + 1536LL * MODEL_BLOCKSIZE + 16LL * pheno_nm_ct));
       }
     }
     if (perm_vec_ct > perms_total - perms_done) {
@@ -6879,52 +6517,41 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     } else if (!perm_vec_ct) {
       goto model_assoc_ret_NOMEM;
     }
-    perm_vec_ctcl4m = CACHEALIGN32_INT32(perm_vec_ct);
+    perm_vec_ctcl4m = round_up_pow2(perm_vec_ct, CACHELINE_INT32);
     perms_done += perm_vec_ct;
     g_perms_done = perms_done;
     g_perm_vec_ct = perm_vec_ct;
-    g_perm_vecs = (uintptr_t*)wkspace_alloc(perm_vec_ct * pheno_nm_ctv2 * sizeof(intptr_t));
-    if (perm_vec_ct > max_thread_ct) {
-      assoc_thread_ct = max_thread_ct;
-    } else {
-      assoc_thread_ct = perm_vec_ct;
-    }
-    g_assoc_thread_ct = assoc_thread_ct;
+    bigstack_alloc_ul(perm_vec_ct * pheno_nm_ctv2, &g_perm_vecs);
+    g_perm_generation_thread_ct = MINV(max_thread_ct, perm_vec_ct);
     ulii = 0;
     if (!cluster_starts) {
-      if (spawn_threads(threads, &model_assoc_gen_perms_thread, assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
 	goto model_assoc_ret_THREAD_CREATE_FAIL;
       }
-      model_assoc_gen_perms_thread((void*)ulii);
+      generate_cc_perms_thread((void*)ulii);
     } else {
-      if (spawn_threads(threads, &model_assoc_gen_cluster_perms_thread, assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
 	goto model_assoc_ret_THREAD_CREATE_FAIL;
       }
-      model_assoc_gen_cluster_perms_thread((void*)ulii);
+      generate_cc_cluster_perms_thread((void*)ulii);
     }
-    join_threads(threads, assoc_thread_ct);
+    join_threads(threads, g_perm_generation_thread_ct);
     g_assoc_thread_ct = max_thread_ct;
     if (!model_adapt_nst) {
-      ulii = (perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
-      g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
-      g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
+      bigstack_alloc_d(max_thread_ct * round_up_pow2(perm_vec_ct, CACHELINE_DBL), &g_maxt_thread_results);
+      bigstack_alloc_ui(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE, &g_resultbuf);
 #ifdef __LP64__
-      ulii = ((perm_vec_ct + 127) / 128) * 16;
-      g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
+      ulii = ((perm_vec_ct + 127) / 128) * 4;
+      bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
 #else
-      ulii = ((perm_vec_ct + 31) / 32) * 4;
-      g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
-      ulii = ((perm_vec_ct + 63) / 64) * 8;
+      ulii = (perm_vec_ct + 31) / 32;
+      bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
+      ulii = ((perm_vec_ct + 63) / 64) * 2;
 #endif
-      g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
+      bigstack_calloc_ui(ulii * 72 * max_thread_ct, &g_thread_git_wkspace);
       transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
-#else
-      fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
-#endif
       if (mperm_save & MPERM_DUMP_ALL) {
-	g_mperm_save_all = (double*)wkspace_alloc(marker_ct * perm_vec_ct * sizeof(double));
+	bigstack_alloc_d(marker_ct * perm_vec_ct, &g_mperm_save_all);
       }
     }
     if (!perm_pass_idx) {
@@ -6986,7 +6613,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	}
       }
       g_is_x = is_x;
-      chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, uii, &chrom_name_len);
+      chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
     } else if (model_maxt_nst) {
       marker_idx -= MODEL_BLOCKKEEP;
       if (marker_idx) { // max(T) initial block special case, see below
@@ -7024,7 +6651,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	}
       }
       loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto model_assoc_ret_READ_FAIL;
       }
       if (model_adapt_nst) {
@@ -7132,15 +6759,14 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	    wptr = memcpyax(writebuf, chrom_name_ptr, chrom_name_len, ' ');
 	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
 	    *wptr++ = ' ';
-	    wptr = uint32_writew10(wptr, marker_pos[marker_uidx2]);
-	    *wptr = ' ';
-	    wptr = fw_strcpy(4, a1ptr, &(wptr[1]));
+	    wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
+	    wptr = fw_strcpy(4, a1ptr, wptr);
 	    *wptr++ = ' ';
 	    if (umm + ukk) {
 	      if (assoc_counts) {
-		wptr = uint32_writew8(wptr, umm);
+		wptr = uint32toa_w8(umm, wptr);
 	      } else {
-		wptr = double_g_writewx4(wptr, da1 / (da1 + da2), 8);
+		wptr = dtoa_g_wxp4(da1 / (da1 + da2), 8, wptr);
 	      }
 	      *wptr++ = ' ';
 	    } else {
@@ -7148,9 +6774,9 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	    }
 	    if (ujj + uii) {
 	      if (assoc_counts) {
-		wptr = uint32_writew8(wptr, ujj);
+		wptr = uint32toa_w8(ujj, wptr);
 	      } else {
-		wptr = double_g_writewx4(wptr, du1 / (du1 + du2), 8);
+		wptr = dtoa_g_wxp4(du1 / (du1 + du2), 8, wptr);
 	      }
 	    } else {
 	      wptr = memcpya(wptr, "      NA", 8);
@@ -7159,10 +6785,11 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	    wptr = fw_strcpy(4, a2ptr, &(wptr[1]));
 	    *wptr++ = ' ';
 	    if (model_fisher) {
-	      wptr = double_g_writewx4(wptr, MAXV(pval, output_min_p), 12);
+	      wptr = dtoa_g_wxp4(MAXV(pval, output_min_p), 12, wptr);
 	    } else {
 	      if (pval > -1) {
-		wptr = double_g_writewx4(double_g_writewx4x(wptr, dxx, 12, ' '), MAXV(pval, output_min_p), 12);
+		wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
+		wptr = dtoa_g_wxp4(MAXV(pval, output_min_p), 12, wptr);
 	      } else {
 		wptr = memcpya(wptr, "          NA           NA", 25);
 	      }
@@ -7174,7 +6801,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 		wptr = memcpya(wptr, "           NA           NA           NA", 39);
 	      }
 	    } else {
-	      wptr = double_g_writewx4(wptr, *ooptr, 12);
+	      wptr = dtoa_g_wxp4(*ooptr, 12, wptr);
 	      if (display_ci) {
 		dxx = log(*ooptr);
 		dyy = sqrt(1 / da1 + 1 / da2 + 1 / du1 + 1 / du2);
@@ -7182,7 +6809,9 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 		dww = exp(dxx - dzz);
 		dvv = exp(dxx + dzz);
 		*wptr++ = ' ';
-		wptr = double_g_writewx4(double_g_writewx4x(double_g_writewx4x(wptr, dyy, 12, ' '), dww, 12, ' '), dvv, 12);
+		wptr = dtoa_g_wxp4x(dyy, 12, ' ', wptr);
+		wptr = dtoa_g_wxp4x(dww, 12, ' ', wptr);
+		wptr = dtoa_g_wxp4(dvv, 12, wptr);
 	      }
 	    }
 	    wptr = memcpya(wptr, " \n", 2);
@@ -7222,10 +6851,14 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	  wptr_mid = wptr;
 	  if (!model_trendonly) {
 	    memcpy(wptr, "   GENO ", 8);
-	    wptr2 = uint32_write(uint32_writex(uint32_writex(wbuf, uoo, '/'), unn, '/'), umm);
+	    wptr2 = uint32toa_x(uoo, '/', wbuf);
+	    wptr2 = uint32toa_x(unn, '/', wptr2);
+	    wptr2 = uint32toa(umm, wptr2);
 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr[8]));
 	    *wptr++ = ' ';
-	    wptr2 = uint32_write(uint32_writex(uint32_writex(wbuf, ukk, '/'), ujj, '/'), uii);
+	    wptr2 = uint32toa_x(ukk, '/', wbuf);
+	    wptr2 = uint32toa_x(ujj, '/', wptr2);
+	    wptr2 = uint32toa(uii, wptr2);
 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
 	    *wptr++ = ' ';
 	    if (is_invalid) {
@@ -7252,21 +6885,24 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	      wptr = model_assoc_tna(model_fisher, wptr);
 	    } else {
 	      if (!model_fisher) {
-		wptr = memcpya(double_g_writewx4(wptr, dvv, 12), "    ", 4);
+		wptr = dtoa_g_wxp4(dvv, 12, wptr);
+		wptr = memcpya(wptr, "    ", 4);
 		*wptr++ = '0' + upp;
 		*wptr++ = ' ';
 	      }
-	      wptr = double_g_writewx4x(wptr, MAXV(gen_p, output_min_p), 12, '\n');
+	      wptr = dtoa_g_wxp4x(MAXV(gen_p, output_min_p), 12, '\n', wptr);
 	    }
 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
 	      goto model_assoc_ret_WRITE_FAIL;
 	    }
 	  }
 	  memcpy(wptr_mid, "  TREND ", 8);
-	  wptr2 = uint32_write(uint32_writex(wbuf, uoo * 2 + unn, '/'), umm * 2 + unn);
+	  wptr2 = uint32toa_x(uoo * 2 + unn, '/', wbuf);
+	  wptr2 = uint32toa(umm * 2 + unn, wptr2);
 	  wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
 	  *wptr++ = ' ';
-	  wptr2 = uint32_write(uint32_writex(wbuf, ukk * 2 + ujj, '/'), uii * 2 + ujj);
+	  wptr2 = uint32toa_x(ukk * 2 + ujj, '/', wbuf);
+	  wptr2 = uint32toa(uii * 2 + ujj, wptr2);
 	  wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
 	  *wptr++ = ' ';
 	  wptr_mid2 = wptr; // save this for next line
@@ -7281,9 +6917,10 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	  }
 	  if (ca_p > -1) {
 	    if (!model_fisher) {
-	      wptr = memcpya(double_g_writewx4(wptr, ca_chisq, 12), "    1 ", 6);
+	      wptr = dtoa_g_wxp4(ca_chisq, 12, wptr);
+	      wptr = memcpya(wptr, "    1 ", 6);
 	    }
-	    wptr = double_g_writewx4x(wptr, MAXV(ca_p, output_min_p), 12, '\n');
+	    wptr = dtoa_g_wxp4x(MAXV(ca_p, output_min_p), 12, '\n', wptr);
 	  } else {
 	    wptr = model_assoc_tna(model_fisher, wptr);
 	  }
@@ -7301,9 +6938,10 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	    }
 	    if (mult_p > -1) {
 	      if (!model_fisher) {
-		wptr = memcpya(double_g_writewx4(wptr, dww, 12), "    1 ", 6);
+		wptr = dtoa_g_wxp4(dww, 12, wptr);
+		wptr = memcpya(wptr, "    1 ", 6);
 	      }
-	      wptr = double_g_writewx4x(wptr, MAXV(mult_p, output_min_p), 12, '\n');
+	      wptr = dtoa_g_wxp4x(MAXV(mult_p, output_min_p), 12, '\n', wptr);
 	    } else {
 	      wptr = model_assoc_tna(model_fisher, wptr);
 	    }
@@ -7311,10 +6949,12 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	      goto model_assoc_ret_WRITE_FAIL;
 	    }
 	    memcpy(wptr_mid, "    DOM", 7);
-	    wptr2 = uint32_write(uint32_writex(wbuf, uoo + unn, '/'), umm);
+	    wptr2 = uint32toa_x(uoo + unn, '/', wbuf);
+	    wptr2 = uint32toa(umm, wptr2);
 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
 	    *wptr++ = ' ';
-	    wptr2 = uint32_write(uint32_writex(wbuf, ukk + ujj, '/'), uii);
+	    wptr2 = uint32toa_x(ukk + ujj, '/', wbuf);
+	    wptr2 = uint32toa(uii, wptr2);
 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
 	    *wptr++ = ' ';
 	    if (is_invalid) {
@@ -7341,18 +6981,21 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	      wptr = model_assoc_tna(model_fisher, wptr);
 	    } else {
 	      if (!model_fisher) {
-		wptr = memcpya(double_g_writewx4(wptr, dww, 12), "    1 ", 6);
+		wptr = dtoa_g_wxp4(dww, 12, wptr);
+		wptr = memcpya(wptr, "    1 ", 6);
 	      }
-	      wptr = double_g_writewx4x(wptr, MAXV(dom_p, output_min_p), 12, '\n');
+	      wptr = dtoa_g_wxp4x(MAXV(dom_p, output_min_p), 12, '\n', wptr);
 	    }
 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
 	      goto model_assoc_ret_WRITE_FAIL;
 	    }
 	    memcpy(&(wptr_mid[4]), "REC", 3);
-	    wptr2 = uint32_write(uint32_writex(wbuf, uoo, '/'), unn + umm);
+	    wptr2 = uint32toa_x(uoo, '/', wbuf);
+	    wptr2 = uint32toa(unn + umm, wptr2);
 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, &(wptr_mid[8]));
 	    *wptr++ = ' ';
-	    wptr2 = uint32_write(uint32_writex(wbuf, ukk, '/'), ujj + uii);
+	    wptr2 = uint32toa_x(ukk, '/', wbuf);
+	    wptr2 = uint32toa(ujj + uii, wptr2);
 	    wptr = fw_strcpyn(14, wptr2 - wbuf, wbuf, wptr);
 	    *wptr++ = ' ';
 	    if (is_invalid) {
@@ -7379,9 +7022,10 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	      wptr = model_assoc_tna(model_fisher, wptr);
 	    } else {
 	      if (!model_fisher) {
-		wptr = memcpya(double_g_writewx4(wptr, dww, 12), "    1 ", 6);
+		wptr = dtoa_g_wxp4(dww, 12, wptr);
+		wptr = memcpya(wptr, "    1 ", 6);
 	      }
-	      wptr = double_g_writewx4x(wptr, MAXV(rec_p, output_min_p), 12, '\n');
+	      wptr = dtoa_g_wxp4x(MAXV(rec_p, output_min_p), 12, '\n', wptr);
 	    }
 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
 	      goto model_assoc_ret_WRITE_FAIL;
@@ -7398,7 +7042,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	      }
 	    }
 	    if (model_perms && is_invalid) {
-	      set_bit_ul(is_invalid_bitfield, marker_idx + marker_bidx);
+	      set_bit_ul(marker_idx + marker_bidx, is_invalid_bitfield);
 	    }
 	    if (fill_orig_chisq) {
 	      if (dxx != -9) {
@@ -7743,7 +7387,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	  model_maxt_best_thread((void*)ulii);
 	}
 	join_threads2(threads, max_thread_ct, is_last_block);
-	ulii = CACHEALIGN32_DBL(perm_vec_ct);
+	ulii = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
         if (model_fisherx) {
 	  for (uii = 0; uii < assoc_thread_ct; uii++) {
 	    ooptr = &(g_maxt_thread_results[uii * ulii]);
@@ -7787,14 +7431,14 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     fputs("\b\b", stdout);
     logprint("done.\n");
     if (model_perms_nst) {
-      wkspace_reset(g_perm_vecs);
+      bigstack_reset(g_perm_vecs);
     }
     if (fclose_null(&outfile)) {
       goto model_assoc_ret_WRITE_FAIL;
     }
     if (!is_set_test) {
       if (mtest_adjust) {
-        if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t))) {
+        if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
 	  goto model_assoc_ret_NOMEM;
         }
         fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
@@ -7802,26 +7446,26 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
         if (retval) {
 	  goto model_assoc_ret_1;
         }
-        wkspace_reset(marker_idx_to_uidx);
+        bigstack_reset(marker_idx_to_uidx);
       }
       if (mperm_save & MPERM_DUMP_ALL) {
-	tbuf[0] = '0';
-	wptr = &(tbuf[1]);
-	a1ptr = &(tbuf[MAXLINELEN]);
+	g_textbuf[0] = '0';
+	wptr = &(g_textbuf[1]);
+	a1ptr = &(g_textbuf[MAXLINELEN]);
 	if (model_fisherx) {
 	  for (uii = 0; uii < marker_ct; uii++) {
 	    *wptr++ = ' ';
 	    dxx = orig_pvals[uii];
 	    if (dxx >= 0) {
-	      wptr = double_g_write(wptr, dxx);
+	      wptr = dtoa_g(dxx, wptr);
 	    } else {
 	      wptr = memcpya(wptr, "NA", 2);
 	    }
 	    if (wptr >= a1ptr) {
-	      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 		goto model_assoc_ret_WRITE_FAIL;
 	      }
-	      wptr = tbuf;
+	      wptr = g_textbuf;
 	    }
 	  }
 	} else {
@@ -7829,20 +7473,20 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	    *wptr++ = ' ';
 	    dxx = orig_chisq[uii];
 	    if (dxx >= 0) {
-	      wptr = double_g_write(wptr, dxx);
+	      wptr = dtoa_g(dxx, wptr);
 	    } else {
 	      wptr = memcpya(wptr, "NA", 2);
 	    }
 	    if (wptr >= a1ptr) {
-	      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 		goto model_assoc_ret_WRITE_FAIL;
 	      }
-	      wptr = tbuf;
+	      wptr = g_textbuf;
 	    }
 	  }
 	}
 	*wptr++ = '\n';
-	if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	  goto model_assoc_ret_WRITE_FAIL;
 	}
       }
@@ -7862,34 +7506,34 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       fflush(stdout);
       ulii = perm_vec_ct;
       ujj = 1 + perms_done - ulii;
-      wptr = tbuf;
-      a1ptr = &(tbuf[MAXLINELEN]);
+      wptr = g_textbuf;
+      a1ptr = &(g_textbuf[MAXLINELEN]);
       for (uii = 0; uii < ulii; uii++) {
-	wptr = uint32_write(wptr, uii + ujj);
+	wptr = uint32toa(uii + ujj, wptr);
         orig_pvals_ptr = &(g_mperm_save_all[uii]);
 	for (ukk = 0; ukk < marker_ct; ukk++) {
 	  *wptr++ = ' ';
 	  dxx = orig_pvals_ptr[ukk * ulii];
 	  if (dxx >= 0) {
-	    wptr = double_g_write(wptr, dxx);
+	    wptr = dtoa_g(dxx, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "NA", 2);
 	  }
 	  if (wptr >= a1ptr) {
-	    if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	      goto model_assoc_ret_WRITE_FAIL;
 	    }
-	    wptr = tbuf;
+	    wptr = g_textbuf;
 	  }
 	}
 	*wptr++ = '\n';
       }
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	goto model_assoc_ret_WRITE_FAIL;
       }
       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
     }
-    wkspace_reset(g_perm_vecs);
+    bigstack_reset(g_perm_vecs);
     if (perms_done < perms_total) {
       if (model_adapt_nst) {
 	marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
@@ -7923,14 +7567,14 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       memcpy(outname_end2, ".perm", 6);
     } else {
       if (mperm_save & MPERM_DUMP_BEST) {
-	if (wkspace_alloc_c_checked(&a1ptr, FNAMESIZE)) {
+	if (bigstack_alloc_c(FNAMESIZE, &a1ptr)) {
 	  goto model_assoc_ret_NOMEM;
 	}
 	ulii = outname_end - outname;
 	memcpy(a1ptr, outname, ulii);
 	memcpy(&(a1ptr[ulii]), ".mperm.dump.best", 17);
 	LOGPRINTFWW("Dumping best permutation %svalues to %s .\n", model_fisherx? "p-" : "chi-square ", a1ptr);
-	if (fopen_checked(&outfile, a1ptr, "w")) {
+	if (fopen_checked(a1ptr, "w", &outfile)) {
 	  goto model_assoc_ret_OPEN_FAIL;
 	}
 	dxx = 0;
@@ -7948,15 +7592,15 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	    }
 	  }
 	}
-        memcpy(tbuf, "0 ", 2);
-	wptr = double_g_writex(&(tbuf[2]), dxx, '\n');
-	if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+        memcpy(g_textbuf, "0 ", 2);
+	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
+	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	  goto model_assoc_ret_WRITE_FAIL;
 	}
 	for (uii = 0; uii < perms_total; uii++) {
-	  wptr = uint32_writex(tbuf, uii + 1, ' ');
-	  wptr = double_g_writex(wptr, maxt_extreme_stat[uii], '\n');
-	  if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+	  wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
+	  wptr = dtoa_gx(maxt_extreme_stat[uii], '\n', wptr);
+	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	    goto model_assoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -7966,13 +7610,13 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       }
       memcpy(outname_end2, ".mperm", 7);
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto model_assoc_ret_OPEN_FAIL;
     }
     if (model_adapt_nst) {
-      sprintf(tbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
     } else {
-      sprintf(tbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
 #ifdef __cplusplus
       std::sort(maxt_extreme_stat, &(maxt_extreme_stat[perms_total]));
 #else
@@ -7984,7 +7628,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
       printf("extreme stats: %g %g\n", maxt_extreme_stat[0], maxt_extreme_stat[perms_total - 1]);
     }
     */
-    fprintf(outfile, tbuf, "SNP");
+    fprintf(outfile, g_textbuf, "SNP");
     chrom_fo_idx = 0xffffffffU;
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
     marker_idx = 0;
@@ -8002,7 +7646,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	}
 	marker_uidx = next_unset_unsafe(marker_exclude, chrom_end);
       }
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start[plink_maxsnp] = ' ';
       for (; marker_uidx < chrom_end;) {
@@ -8019,13 +7663,13 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
             wptr = memcpya(wptr, "          NA           NA", 25);
 	  } else {
 	    if (!model_perm_count) {
-	      wptr = double_g_writewx4x(wptr, pval, 12, ' ');
+	      wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
 	    } else {
-	      wptr = double_g_writewx4x(wptr, ((double)perm_2success_ct[marker_idx]) * 0.5, 12, ' ');
+	      wptr = dtoa_g_wxp4x(((double)perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
 	    }
 	    if (model_adapt_nst) {
 	      wptr = memseta(wptr, 32, 2);
-	      wptr = uint32_writew10(wptr, perm_attempt_ct[marker_idx]);
+	      wptr = uint32toa_w10(perm_attempt_ct[marker_idx], wptr);
 	    } else {
 	      if (model_fisherx) {
 		// minimum p-value
@@ -8035,14 +7679,14 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 		dzz = (int32_t)(perms_total - doublearr_greater_than(maxt_extreme_stat, perms_total, orig_chisq[marker_idx] - EPSILON) + 1);
 	      }
 	      if (!model_perm_count) {
-		wptr = double_g_writewx4(wptr, dzz * dyy, 12);
+		wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
 	      } else {
-		wptr = double_g_writewx4(wptr, dzz - 1, 12);
+		wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
 	      }
 	    }
 	  }
 	  wptr = memcpya(wptr, " \n", 2);
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto model_assoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -8081,7 +7725,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
     break;
   }
  model_assoc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_msa);
   return retval;
@@ -8091,7 +7735,7 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   // Similar to glm_linear_assoc_set_test().
   // Side effect: t-statistics in g_orig_chisq[] are clobbered and replaced
   // with same-p-value 1df chi-square statistics.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t* marker_exclude = marker_exclude_mid;
   uintptr_t* unstopped_markers = NULL;
@@ -8108,13 +7752,13 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   uintptr_t marker_ct = marker_ct_mid;
   uintptr_t set_ct = 0;
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + BITCT - 1) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   double adaptive_ci_zt = 0.0;
   uint32_t max_thread_ct = g_thread_ct;
   uint32_t perm_count = model_modifier & MODEL_PERM_COUNT;
   uint32_t perms_done = 0;
   int32_t retval = 0;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uintptr_t* set_incl;
   uintptr_t* loadbuf_ptr;
   double* orig_set_scores;
@@ -8168,23 +7812,22 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   if (!set_ct) {
     goto qassoc_set_test_write;
   }
-  marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  marker_ctl = BITCT_TO_WORDCT(marker_ct);
   if (marker_ct_mid != marker_ct) {
     inplace_delta_collapse_arr((char*)tcnt, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
     inplace_delta_collapse_arr((char*)g_missing_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
     inplace_delta_collapse_arr((char*)g_het_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
     inplace_delta_collapse_arr((char*)g_homcom_cts, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
   }
-  if (wkspace_alloc_ul_checked(&regression_skip, marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(marker_ctl, &regression_skip)) {
     goto qassoc_set_test_ret_NOMEM;
   }
-  fill_ulong_zero(regression_skip, marker_ctl);
   for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
     // nanal
     uii = tcnt[marker_idx] + 2;
     if ((uii == 2) || (g_homcom_cts[marker_idx] == uii) || (g_het_cts[marker_idx] == uii) || (g_het_cts[marker_idx] + g_homcom_cts[marker_idx] == 0)) {
       // 0 df or no genotype variation, regression always fails
-      SET_BIT(regression_skip, marker_idx);
+      SET_BIT(marker_idx, regression_skip);
     }
   }
   if (model_modifier & MODEL_PERM) {
@@ -8201,14 +7844,14 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   if (max_thread_ct > perms_total) {
     max_thread_ct = perms_total;
   }
-  if (wkspace_init_sfmtp(max_thread_ct)) {
+  if (bigstack_init_sfmtp(max_thread_ct)) {
     goto qassoc_set_test_ret_NOMEM;
   }
 
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
  qassoc_set_test_more_perms:
-  bitfield_and(regression_skip, unstopped_markers, marker_ctl);
-  bitfield_andnot(unstopped_markers, regression_skip, marker_ctl);
+  bitvec_and(unstopped_markers, marker_ctl, regression_skip);
+  bitvec_andnot(regression_skip, marker_ctl, unstopped_markers);
   skip_ct = popcount_longs(regression_skip, marker_ctl);
   marker_unstopped_ct = popcount_longs(unstopped_markers, marker_ctl);
 
@@ -8225,35 +7868,31 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   }
   g_perm_vec_ct = perm_vec_ct;
   if (perm_vec_ct >= CACHELINE_INT32 * max_thread_ct) {
-    g_assoc_thread_ct = max_thread_ct;
+    g_perm_generation_thread_ct = max_thread_ct;
   } else {
-    g_assoc_thread_ct = perm_vec_ct / CACHELINE_INT32;
-    if (!g_assoc_thread_ct) {
-      g_assoc_thread_ct = 1;
-    }
+    g_perm_generation_thread_ct = MAXV(perm_vec_ct / CACHELINE_INT32, 1);
   }
-  perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
-  if (wkspace_alloc_d_checked(&g_perm_vecstd, perm_vec_ctcl8m * sizeof(double) * pheno_nm_ct) ||
-      wkspace_alloc_d_checked(&g_thread_git_qbufs, perm_vec_ctcl8m * sizeof(double) * 3 * max_thread_ct)) {
+  perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
+  if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd) ||
+      bigstack_calloc_d(perm_vec_ctcl8m * 3 * max_thread_ct, &g_thread_git_qbufs)) {
     goto qassoc_set_test_ret_NOMEM;
   }
-  fill_double_zero(g_thread_git_qbufs, 3 * max_thread_ct * perm_vec_ctcl8m);
 
   ulii = 0;
-  if (!g_cluster_ct) {
-    if (spawn_threads(threads, &qassoc_gen_perms_thread, g_assoc_thread_ct)) {
+  if (!g_perm_cluster_ct) {
+    if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
       goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    qassoc_gen_perms_thread((void*)ulii);
+    generate_qt_perms_smajor_thread((void*)ulii);
   } else {
-    if (spawn_threads(threads, &qassoc_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+    if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
       goto qassoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    qassoc_gen_cluster_perms_thread((void*)ulii);
+    generate_qt_cluster_perms_smajor_thread((void*)ulii);
   }
-  join_threads(threads, g_assoc_thread_ct);
-  if (wkspace_alloc_d_checked(&g_mperm_save_all, MODEL_BLOCKSIZE * perm_vec_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&chisq_pmajor, marker_ct * perm_vec_ct * sizeof(double))) {
+  join_threads(threads, g_perm_generation_thread_ct);
+  if (bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all) ||
+      bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor)) {
     goto qassoc_set_test_ret_NOMEM;
   }
   for (pidx = 0; pidx < perm_vec_ct; pidx++) {
@@ -8298,7 +7937,7 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
 	}
       }
       loadbuf_ptr = &(loadbuf[block_size * pheno_nm_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto qassoc_set_test_ret_READ_FAIL;
       }
       if (g_min_ploidy_1 && hh_or_mt_exists) {
@@ -8358,7 +7997,7 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   } while (marker_idx < marker_unstopped_ct);
   perms_done += perm_vec_ct;
   compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
-  wkspace_reset(wkspace_mark2);
+  bigstack_reset(bigstack_mark2);
   if (perms_done < perms_total) {
     if (model_modifier & MODEL_PERM) {
       if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
@@ -8397,17 +8036,17 @@ int32_t qassoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
     break;
   }
  qassoc_set_test_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t marker_ct = marker_ct_orig;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctv2 = 2 * unfiltered_sample_ctl;
-  uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
   uintptr_t perm_vec_ctcl8m = 0;
   FILE* outfile = NULL;
@@ -8513,27 +8152,27 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
     goto qassoc_ret_1;
   }
   if (is_set_test) {
-    if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
       goto qassoc_ret_NOMEM;
     }
     memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_and(founder_pnm, founder_info, unfiltered_sample_ctl);
+    bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
     if (extract_set_union_unfiltered(sip, NULL, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
       goto qassoc_ret_NOMEM;
     }
   }
   memset(spacebuf, 32, 8);
-  g_pheno_nm_ct = pheno_nm_ct;
+  g_perm_pheno_nm_ct = pheno_nm_ct;
   g_perms_done = 0;
   g_mperm_save_all = NULL;
   numbuf[0] = ' ';
   if (perm_maxt_nst) {
     perms_total = model_mperm_val;
-    if (wkspace_alloc_d_checked(&g_maxt_extreme_stat, sizeof(double) * perms_total)) {
+    // square of t-stat
+    if (bigstack_calloc_d(perms_total, &g_maxt_extreme_stat)) {
       goto qassoc_ret_NOMEM;
     }
-    fill_double_zero(g_maxt_extreme_stat, perms_total); // square of t-stat
-    g_ldrefs = (uint16_t*)wkspace_alloc(marker_ct * sizeof(int16_t));
+    g_ldrefs = (uint16_t*)bigstack_alloc(marker_ct * sizeof(int16_t));
     if (!g_ldrefs) {
       goto qassoc_ret_NOMEM;
     }
@@ -8544,7 +8183,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 #endif
     if (mperm_save & MPERM_DUMP_ALL) {
       memcpy(outname_end, ".mperm.dump.all", 16);
-      if (fopen_checked(&outfile_msa, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile_msa)) {
 	goto qassoc_ret_OPEN_FAIL;
       }
       if (putc_checked('0', outfile_msa)) {
@@ -8557,15 +8196,14 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
     if (perm_adapt_nst) {
       g_aperm_alpha = apip->alpha;
       perms_total = apip->max;
-      if (wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_ct * sizeof(int32_t)) ||
-	  wkspace_alloc_uc_checked(&g_perm_adapt_stop, marker_ct)) {
+      if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct) ||
+	  bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &g_perm_adapt_stop)) {
 	goto qassoc_ret_NOMEM;
       }
       ujj = apip->max;
       for (uii = 0; uii < marker_ct; uii++) {
 	g_perm_attempt_ct[uii] = ujj;
       }
-      fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
       g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
       if (apip->min < apip->init_interval) {
 	g_first_adapt_check = (int32_t)(apip->init_interval);
@@ -8577,31 +8215,31 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
     }
   }
   outname_end2 = memcpyb(outname_end, ".qassoc", 8);
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw)) {
     goto qassoc_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctv2 - 2] = 0;
   loadbuf_raw[unfiltered_sample_ctv2 - 1] = 0;
   if (fill_orig_chiabs) {
-    if (wkspace_alloc_d_checked(&g_orig_chisq, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ct, &g_orig_chisq)) {
       goto qassoc_ret_NOMEM;
     }
     if (mtest_adjust || is_set_test) {
-      if (wkspace_alloc_ui_checked(&tcnt, marker_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_ui(marker_ct, &tcnt)) {
 	goto qassoc_ret_NOMEM;
       }
     }
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto qassoc_ret_OPEN_FAIL;
   }
   if (qt_means) {
     memcpy(outname_end2, ".means", 7);
-    if (fopen_checked(&outfile_qtm, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile_qtm)) {
       goto qassoc_ret_OPEN_FAIL;
     }
-    sprintf(tbuf, " CHR %%%us  VALUE      G11      G12      G22\n", plink_maxsnp);
-    fprintf(outfile_qtm, tbuf, "SNP");
+    sprintf(g_textbuf, " CHR %%%us  VALUE      G11      G12      G22\n", plink_maxsnp);
+    fprintf(outfile_qtm, g_textbuf, "SNP");
     *outname_end2 = '\0';
   }
   if (haploid_chrom_present(chrom_info_ptr) || mt_exists) {
@@ -8609,8 +8247,8 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
   }
   LOGPRINTFWW5("Writing QT --assoc report to %s ... ", outname);
   fflush(stdout);
-  sprintf(tbuf, " CHR %%%us         BP    NMISS       BETA         SE         R2        T            P ", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us         BP    NMISS       BETA         SE         R2        T            P ", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   if (do_lin) {
     fputs("         LIN        LIN_P ", outfile);
   }
@@ -8632,58 +8270,57 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
       max_thread_ct = MAXV(uii, 1);
     }
     if (cluster_starts) {
-      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, NULL, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, NULL, NULL);
+      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, NULL, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, NULL, NULL);
       if (retval) {
 	goto qassoc_ret_1;
       }
-      if (!g_cluster_ct) {
+      if (!g_perm_cluster_ct) {
         logerrprint("Error: No size 2+ clusters for permutation test.\n");
         goto qassoc_ret_INVALID_CMDLINE;
       }
-      if (wkspace_alloc_ui_checked(&g_sample_to_cluster, pheno_nm_ct * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&g_qassoc_cluster_thread_wkspace, max_thread_ct * ((g_cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32) * CACHELINE)) {
+      if (bigstack_alloc_ui(pheno_nm_ct, &g_perm_sample_to_cluster) ||
+          bigstack_alloc_ui(max_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace)) {
 	goto qassoc_ret_NOMEM;
       }
-      fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_cluster_ct, g_cluster_map, g_cluster_starts, g_sample_to_cluster);
+      fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
     }
-    if (wkspace_alloc_ui_checked(&g_missing_cts, marker_ct * sizeof(int32_t)) ||
-	wkspace_alloc_ui_checked(&g_het_cts, marker_ct * sizeof(int32_t)) ||
-	wkspace_alloc_ui_checked(&g_homcom_cts, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(marker_ct, &g_missing_cts) ||
+	bigstack_alloc_ui(marker_ct, &g_het_cts) ||
+	bigstack_alloc_ui(marker_ct, &g_homcom_cts)) {
       goto qassoc_ret_NOMEM;
     }
     if (!is_set_test) {
-      if (wkspace_init_sfmtp(max_thread_ct)) {
+      if (bigstack_init_sfmtp(max_thread_ct)) {
 	goto qassoc_ret_NOMEM;
       }
-      if (wkspace_alloc_ui_checked(&g_perm_2success_ct, marker_ct * sizeof(int32_t))) {
+      if (bigstack_calloc_ui(marker_ct, &g_perm_2success_ct)) {
 	goto qassoc_ret_NOMEM;
       }
-      fill_uint_zero(g_perm_2success_ct, marker_ct);
     }
   }
   if (do_lin) {
-    if (wkspace_alloc_d_checked(&g_orig_linsq, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ct, &g_orig_linsq)) {
       goto qassoc_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * pheno_nm_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&sample_include2, pheno_nm_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv2, &g_loadbuf) ||
+      bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx) ||
+      bigstack_alloc_ul(pheno_nm_ctv2, &sample_include2)) {
     goto qassoc_ret_NOMEM;
   }
-  fill_vec_55(sample_include2, pheno_nm_ct);
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, pheno_nm_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_include2, &sample_male_include2)) {
+  fill_quatervec_55(pheno_nm_ct, sample_include2);
+  if (alloc_collapsed_haploid_filters(pheno_nm, sex_male, unfiltered_sample_ct, pheno_nm_ct, hh_or_mt_exists, 1, &sample_include2, &sample_male_include2)) {
     goto qassoc_ret_NOMEM;
   }
   marker_unstopped_ct = marker_ct;
-  if (wkspace_alloc_d_checked(&g_pheno_d2, pheno_nm_ct * sizeof(double))) {
+  if (bigstack_alloc_d(pheno_nm_ct, &g_perm_pheno_d2)) {
     goto qassoc_ret_NOMEM;
   }
   g_pheno_sum = 0;
   g_pheno_ssq = 0;
   sample_uidx = 0;
   sample_idx = 0;
-  dptr = g_pheno_d2;
+  dptr = g_perm_pheno_d2;
   do {
     sample_uidx = next_set_ul_unsafe(pheno_nm, sample_uidx);
     sample_uidx_stop = next_unset_ul(pheno_nm, sample_uidx, unfiltered_sample_ct);
@@ -8726,49 +8363,45 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
     if (g_perm_vec_ct > perms_total - g_perms_done) {
       g_perm_vec_ct = perms_total - g_perms_done;
     }
-    perm_vec_ctcl8m = CACHEALIGN32_DBL(g_perm_vec_ct);
-    if (wkspace_alloc_d_checked(&g_perm_vecstd, perm_vec_ctcl8m * sizeof(double) * pheno_nm_ct)) {
+    perm_vec_ctcl8m = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
+    if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd)) {
       goto qassoc_ret_NOMEM;
     }
     ulii = do_lin? 6 : 3;
     if (perm_maxt_nst) {
-      if (wkspace_alloc_d_checked(&g_maxt_thread_results, max_thread_ct * perm_vec_ctcl8m * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&g_qresultbuf, ulii * MODEL_BLOCKSIZE * perm_vec_ctcl8m * sizeof(double))) {
+      if (bigstack_alloc_d(max_thread_ct * perm_vec_ctcl8m, &g_maxt_thread_results) ||
+	  bigstack_alloc_d(ulii * MODEL_BLOCKSIZE * perm_vec_ctcl8m, &g_qresultbuf)) {
 	goto qassoc_ret_NOMEM;
       }
       if (mperm_save & MPERM_DUMP_ALL) {
-	if (wkspace_alloc_d_checked(&g_mperm_save_all, marker_ct * sizeof(double) * g_perm_vec_ct)) {
+	if (bigstack_alloc_d(marker_ct * g_perm_vec_ct, &g_mperm_save_all)) {
 	  goto qassoc_ret_NOMEM;
 	}
       }
     } else {
-      if (wkspace_alloc_d_checked(&g_thread_git_qbufs, perm_vec_ctcl8m * sizeof(double) * ulii * max_thread_ct)) {
+      if (bigstack_calloc_d(perm_vec_ctcl8m * ulii * max_thread_ct, &g_thread_git_qbufs)) {
 	goto qassoc_ret_NOMEM;
       }
-      fill_double_zero(g_thread_git_qbufs, ulii * max_thread_ct * perm_vec_ctcl8m);
     }
     g_perms_done += g_perm_vec_ct;
     if (g_perm_vec_ct >= CACHELINE_DBL * max_thread_ct) {
-      g_assoc_thread_ct = max_thread_ct;
+      g_perm_generation_thread_ct = max_thread_ct;
     } else {
-      g_assoc_thread_ct = g_perm_vec_ct / CACHELINE_DBL;
-      if (!g_assoc_thread_ct) {
-	g_assoc_thread_ct = 1;
-      }
+      g_perm_generation_thread_ct = MAXV(g_perm_vec_ct / CACHELINE_DBL, 1);
     }
     ulii = 0;
     if (!cluster_starts) {
-      if (spawn_threads(threads, &qassoc_gen_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
 	goto qassoc_ret_THREAD_CREATE_FAIL;
       }
-      qassoc_gen_perms_thread((void*)ulii);
+      generate_qt_perms_smajor_thread((void*)ulii);
     } else {
-      if (spawn_threads(threads, &qassoc_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
 	goto qassoc_ret_THREAD_CREATE_FAIL;
       }
-      qassoc_gen_cluster_perms_thread((void*)ulii);
+      generate_qt_cluster_perms_smajor_thread((void*)ulii);
     }
-    join_threads(threads, g_assoc_thread_ct);
+    join_threads(threads, g_perm_generation_thread_ct);
     g_assoc_thread_ct = max_thread_ct;
   }
   chrom_fo_idx = 0xffffffffU;
@@ -8788,7 +8421,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &g_min_ploidy_1);
       g_min_ploidy_1 |= uii; // treat MT as haploid
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, uii, &chrom_name_len);
+      chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
     } else if (perm_maxt_nst) {
       marker_idx -= MODEL_BLOCKKEEP;
       memcpy(g_loadbuf, &(g_loadbuf[(MODEL_BLOCKSIZE - MODEL_BLOCKKEEP) * pheno_nm_ctv2]), MODEL_BLOCKKEEP * pheno_nm_ctv2 * sizeof(intptr_t));
@@ -8821,7 +8454,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	}
       }
       loadbuf_ptr = &(g_loadbuf[block_size * pheno_nm_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto qassoc_ret_READ_FAIL;
       }
       if (g_min_ploidy_1 && hh_or_mt_exists) {
@@ -8850,16 +8483,14 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	marker_uidx2 = mu_table[marker_bidx];
         marker_idx_to_uidx[marker_idx + marker_bidx] = marker_uidx2;
 	loadbuf_ptr = &(g_loadbuf[marker_bidx * pheno_nm_ctv2]);
-	vec_3freq(pheno_nm_ctv2, loadbuf_ptr, sample_include2, &missing_ct, &het_ct, &homcom_ct);
+	genovec_3freq(loadbuf_ptr, sample_include2, pheno_nm_ctv2, &missing_ct, &het_ct, &homcom_ct);
 	nanal = pheno_nm_ct - missing_ct;
-	wptr = memcpya(tbuf, chrom_name_ptr, chrom_name_len);
+	wptr = memcpya(g_textbuf, chrom_name_ptr, chrom_name_len);
 	*wptr++ = ' ';
         wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
 	*wptr++ = ' ';
-	wptr = uint32_writew10(wptr, marker_pos[marker_uidx2]);
-	*wptr++ = ' ';
-	wptr = uint32_writew8(wptr, nanal);
-	*wptr++ = ' ';
+	wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
+	wptr = uint32toa_w8x(nanal, ' ', wptr);
 	homrar_ct = nanal - het_ct - homcom_ct;
 	if (do_perms) {
 	  g_missing_cts[marker_idx + marker_bidx] = missing_ct;
@@ -8886,7 +8517,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	    ujj = CTZLU(ulii) & (BITCT - 2);
 	    ukk = (ulii >> ujj) & 3;
 	    sample_idx = uii + (ujj / 2);
-	    dxx = g_pheno_d2[sample_idx];
+	    dxx = g_perm_pheno_d2[sample_idx];
 	    if (ukk == 1) {
 	      qt_g_prod += dxx;
 	      if (qt_means_or_lin) {
@@ -8944,7 +8575,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	  if (mperm_save & MPERM_DUMP_ALL) {
 	    if (!do_lin) {
 	      if (tp >= 0) {
-		double_g_writex(&(numbuf[1]), tstat * tstat, '\0');
+		dtoa_gx(tstat * tstat, '\0', &(numbuf[1]));
 		fputs(numbuf, outfile_msa);
 	      } else {
 		fputs(" NA", outfile_msa);
@@ -8952,7 +8583,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	    } else {
 	      dxx = g_orig_linsq[marker_idx + marker_bidx];
 	      if ((nanal > 2) && realnum(dxx)) {
-		double_g_writex(&(numbuf[1]), dxx, '\0');
+		dtoa_gx(dxx, '\0', &(numbuf[1]));
 		fputs(numbuf, outfile_msa);
 	      } else {
 		fputs(" NA", outfile_msa);
@@ -8965,13 +8596,13 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	  if (!realnum(beta)) {
 	    wptr = memcpya(wptr, "        NA         NA         NA ", 33);
 	  } else {
-	    wptr = double_g_writewx4x(wptr, beta, 10, ' ');
-	    wptr = double_g_writewx4x(wptr, vbeta_sqrt, 10, ' ');
-	    wptr = double_g_writewx4x(wptr, rsq, 10, ' ');
+	    wptr = dtoa_g_wxp4x(beta, 10, ' ', wptr);
+	    wptr = dtoa_g_wxp4x(vbeta_sqrt, 10, ' ', wptr);
+	    wptr = dtoa_g_wxp4x(rsq, 10, ' ', wptr);
 	  }
 	  if (tp >= 0) {
-	    wptr = double_g_writewx4x(wptr, tstat, 8, ' ');
-	    wptr = double_g_writewx4(wptr, MAXV(tp, output_min_p), 12);
+	    wptr = dtoa_g_wxp4x(tstat, 8, ' ', wptr);
+	    wptr = dtoa_g_wxp4(MAXV(tp, output_min_p), 12, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "      NA           NA", 21);
 	  }
@@ -8980,9 +8611,9 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	    if (realnum(dxx)) {
 	      *wptr++ = ' ';
 	      dxx = sqrt(dxx);
-	      wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
+	      wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
 	      dxx = calc_tprob(dxx, nanal - 2);
-	      wptr = double_g_writewx4(wptr, MAXV(dxx, output_min_p), 12);
+	      wptr = dtoa_g_wxp4(MAXV(dxx, output_min_p), 12, wptr);
 	    } else {
 	      wptr = memcpya(wptr, "           NA           NA", 26);
 	    }
@@ -9000,11 +8631,11 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	  }
 	  *wptr++ = '\n';
 	}
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto qassoc_ret_WRITE_FAIL;
 	}
 	if (qt_means) {
-	  wptr_restart = &(tbuf[2 + chrom_name_len + plink_maxsnp]);
+	  wptr_restart = &(g_textbuf[2 + chrom_name_len + plink_maxsnp]);
 	  wptr = memcpya(wptr_restart, "  GENO ", 7);
 	  a1ptr = marker_allele_ptrs[2 * marker_uidx2];
 	  a2ptr = marker_allele_ptrs[2 * marker_uidx2 + 1];
@@ -9013,7 +8644,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	  if (uii < 4) {
 	    wptr = memseta(wptr, 32, 7 - 2 * uii);
 	  }
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile_qtm)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
 	    goto qassoc_ret_WRITE_FAIL;
 	  }
 	  fputs(a1ptr, outfile_qtm);
@@ -9035,52 +8666,49 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
           fputs(a2ptr, outfile_qtm);
 	  putc('\n', outfile_qtm);
 	  wptr = memcpya(wptr_restart, "COUNTS ", 7);
-	  wptr = uint32_writew8(wptr, homrar_ct);
-	  *wptr++ = ' ';
-	  wptr = uint32_writew8(wptr, het_ct);
-	  *wptr++ = ' ';
-	  wptr = uint32_writew8(wptr, homcom_ct);
-	  *wptr++ = '\n';
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile_qtm)) {
+	  wptr = uint32toa_w8x(homrar_ct, ' ', wptr);
+	  wptr = uint32toa_w8x(het_ct, ' ', wptr);
+	  wptr = uint32toa_w8x(homcom_ct, '\n', wptr);
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
 	    goto qassoc_ret_WRITE_FAIL;
 	  }
 	  wptr = memcpya(wptr_restart, "  FREQ ", 7);
-	  wptr = double_g_writewx4x(wptr, nanal_recip * ((intptr_t)homrar_ct), 8, ' ');
-	  wptr = double_g_writewx4x(wptr, nanal_recip * ((intptr_t)het_ct), 8, ' ');
-	  wptr = double_g_writewx4x(wptr, nanal_recip * ((intptr_t)homcom_ct), 8, '\n');
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile_qtm)) {
+	  wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)homrar_ct), 8, ' ', wptr);
+	  wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)het_ct), 8, ' ', wptr);
+	  wptr = dtoa_g_wxp4x(nanal_recip * ((intptr_t)homcom_ct), 8, '\n', wptr);
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
 	    goto qassoc_ret_WRITE_FAIL;
 	  }
 	  wptr = memcpya(wptr_restart, "  MEAN ", 7);
 	  qt_homcom_sum = qt_sum - qt_homrar_sum - qt_het_sum;
 	  if (homrar_ct) {
 	    x11 = qt_homrar_sum / ((double)homrar_ct);
-	    wptr = double_g_writewx4(wptr, x11, 8);
+	    wptr = dtoa_g_wxp4(x11, 8, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "      NA", 8);
 	  }
 	  *wptr++ = ' ';
 	  if (het_ct) {
 	    x12 = qt_het_sum / ((double)het_ct);
-	    wptr = double_g_writewx4(wptr, x12, 8);
+	    wptr = dtoa_g_wxp4(x12, 8, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "      NA", 8);
 	  }
 	  *wptr++ = ' ';
 	  if (homcom_ct) {
 	    x22 = qt_homcom_sum / ((double)homcom_ct);
-	    wptr = double_g_writewx4(wptr, x22, 8);
+	    wptr = dtoa_g_wxp4(x22, 8, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "      NA", 8);
 	  }
 	  *wptr++ = '\n';
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile_qtm)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
 	    goto qassoc_ret_WRITE_FAIL;
 	  }
 	  wptr = memcpya(wptr_restart, "    SD ", 7);
 	  if (homrar_ct > 1) {
             dxx = sqrt((qt_homrar_ssq - qt_homrar_sum * x11) / ((double)((intptr_t)homrar_ct - 1)));
-	    wptr = double_g_writewx4(wptr, dxx, 8);
+	    wptr = dtoa_g_wxp4(dxx, 8, wptr);
 	  } else if (homrar_ct == 1) {
 	    wptr = memcpya(wptr, "       0", 8);
 	  } else {
@@ -9089,7 +8717,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	  *wptr++ = ' ';
 	  if (het_ct > 1) {
             dxx = sqrt((qt_het_ssq - qt_het_sum * x12) / ((double)((intptr_t)het_ct - 1)));
-	    wptr = double_g_writewx4(wptr, dxx, 8);
+	    wptr = dtoa_g_wxp4(dxx, 8, wptr);
 	  } else if (het_ct == 1) {
 	    wptr = memcpya(wptr, "       0", 8);
 	  } else {
@@ -9098,14 +8726,14 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	  *wptr++ = ' ';
 	  if (homcom_ct > 1) {
             dxx = sqrt((qt_ssq - qt_het_ssq - qt_homrar_ssq - qt_homcom_sum * x22) / ((double)((intptr_t)homcom_ct - 1)));
-	    wptr = double_g_writewx4(wptr, dxx, 8);
+	    wptr = dtoa_g_wxp4(dxx, 8, wptr);
 	  } else if (homcom_ct == 1) {
 	    wptr = memcpya(wptr, "       0", 8);
 	  } else {
 	    wptr = memcpya(wptr, "      NA", 8);
 	  }
 	  *wptr++ = '\n';
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile_qtm)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_qtm)) {
 	    goto qassoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -9136,7 +8764,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	} else if (!ukk) {
 	  ukk = 1;
 	}
-	ulii = CACHEALIGN32_DBL(g_perm_vec_ct);
+	ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
 	for (uii = 0; uii < ukk; uii++) {
 	  ooptr = &(g_maxt_thread_results[uii * ulii]);
 	  for (ujj = g_perms_done - g_perm_vec_ct; ujj < g_perms_done; ujj++) {
@@ -9191,7 +8819,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
     }
     if (!is_set_test) {
       if (do_perms_nst) {
-	wkspace_reset(g_perm_vecstd);
+	bigstack_reset(g_perm_vecstd);
       }
       if (mtest_adjust) {
 	if (do_lin) {
@@ -9225,37 +8853,37 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
       fflush(stdout);
       ulii = g_perm_vec_ct;
       ujj = 1 + g_perms_done - ulii;
-      wptr = tbuf;
-      a1ptr = &(tbuf[MAXLINELEN]);
+      wptr = g_textbuf;
+      a1ptr = &(g_textbuf[MAXLINELEN]);
       for (uii = 0; uii < ulii; uii++) {
-	wptr = uint32_write(wptr, uii + ujj);
+	wptr = uint32toa(uii + ujj, wptr);
 	ooptr = &(g_mperm_save_all[uii]);
 	for (ukk = 0; ukk < marker_ct; ukk++) {
 	  *wptr++ = ' ';
 	  dxx = ooptr[ukk * ulii];
 	  if (dxx >= 0) {
-	    wptr = double_g_write(wptr, dxx);
+	    wptr = dtoa_g(dxx, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "NA", 2);
 	  }
 	  if (wptr >= a1ptr) {
-	    if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	      goto qassoc_ret_WRITE_FAIL;
 	    }
-	    wptr = tbuf;
+	    wptr = g_textbuf;
 	  }
 	}
 	*wptr++ = '\n';
       }
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	goto qassoc_ret_WRITE_FAIL;
       }
       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
     }
-    wkspace_reset(g_perm_vecstd);
+    bigstack_reset(g_perm_vecstd);
     if (g_perms_done < perms_total) {
       if (perm_adapt_nst) {
-	marker_unstopped_ct = marker_ct - popcount_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
+	marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
 	if (!marker_unstopped_ct) {
 	  goto qassoc_adapt_perm_count;
 	}
@@ -9286,7 +8914,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
       if (mperm_save & MPERM_DUMP_BEST) {
 	memcpy(outname_end, ".mperm.dump.best", 17);
 	LOGPRINTFWW("Dumping best permutation squared %sstats to %s .\n", do_lin? "Lin " : "Wald t-", outname);
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto qassoc_ret_OPEN_FAIL;
 	}
 	dxx = 0;
@@ -9304,15 +8932,15 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	    }
 	  }
 	}
-        memcpy(tbuf, "0 ", 2);
-	wptr = double_g_writex(&(tbuf[2]), dxx, '\n');
-	if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+        memcpy(g_textbuf, "0 ", 2);
+	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
+	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	  goto qassoc_ret_WRITE_FAIL;
 	}
 	for (uii = 0; uii < perms_total; uii++) {
-	  wptr = uint32_writex(tbuf, uii + 1, ' ');
-	  wptr = double_g_writex(wptr, g_maxt_extreme_stat[uii], '\n');
-	  if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+	  wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
+	  wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
+	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	    goto qassoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -9323,13 +8951,13 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
       }
       memcpy(outname_end2, ".mperm", 7);
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto qassoc_ret_OPEN_FAIL;
     }
     if (perm_adapt_nst) {
-      sprintf(tbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
     } else {
-      sprintf(tbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
 #ifdef __cplusplus
       std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
 #else
@@ -9340,7 +8968,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
     // if (perm_maxt) {
     //   printf("extreme stats: %g %g %g\n", g_maxt_extreme_stat[0], g_maxt_extreme_stat[(perms_total - 1) / 2], g_maxt_extreme_stat[perms_total - 1]);
     // }
-    fprintf(outfile, tbuf, "SNP");
+    fprintf(outfile, g_textbuf, "SNP");
     chrom_fo_idx = 0xffffffffU;
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
     marker_idx = 0;
@@ -9351,7 +8979,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1U];
       } while (marker_uidx >= chrom_end);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start[plink_maxsnp] = ' ';
       for (; marker_uidx < chrom_end;) {
@@ -9368,13 +8996,13 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
             wptr = memcpya(wptr, "          NA           NA", 25);
 	  } else {
 	    if (!perm_count) {
-	      wptr = double_g_writewx4x(wptr, pval, 12, ' ');
+	      wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
 	    } else {
-	      wptr = double_g_writewx4x(wptr, ((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ');
+	      wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
 	    }
 	    if (perm_adapt_nst) {
 	      wptr = memseta(wptr, 32, 2);
-	      wptr = uint32_writew10(wptr, g_perm_attempt_ct[marker_idx]);
+	      wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
 	    } else {
 	      // maximum chisq
 	      // N.B. numbers in maxt_extreme_stat[] have been pre-squared
@@ -9386,14 +9014,14 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 	      }
 	      dzz = (int32_t)(perms_total - doublearr_greater_than(g_maxt_extreme_stat, perms_total, dzz - EPSILON) + 1);
 	      if (!perm_count) {
-		wptr = double_g_writewx4(wptr, dzz * dyy, 12);
+		wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
 	      } else {
-		wptr = double_g_writewx4(wptr, dzz - 1, 12);
+		wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
 	      }
 	    }
 	  }
 	  wptr = memcpya(wptr, " \n", 2);
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto qassoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -9432,7 +9060,7 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
     break;
   }
  qassoc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_qtm);
   fclose_cond(outfile_msa);
@@ -9440,13 +9068,13 @@ int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* ou
 }
 
 int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* gxe_covar_nm, uintptr_t* gxe_covar_c, uintptr_t* sex_male, uint32_t hh_or_mt_exists) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uintptr_t covar_nm_ct = popcount_longs(gxe_covar_nm, sample_ctl);
-  uintptr_t covar_nm_ctl = (covar_nm_ct + (BITCT - 1)) / BITCT;
+  uintptr_t covar_nm_ctl = BITCT_TO_WORDCT(covar_nm_ct);
   // gxe_covar_c has opposite truth value from ->bcovar in PLINK 1.07 gxe.cpp;
   // see lines 50-58 in gxe.cpp
   uintptr_t group2_size = popcount_longs(gxe_covar_c, sample_ctl);
@@ -9476,7 +9104,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
   double* cur_pheno_d = NULL;
   char* wptr_start = NULL;
   uintptr_t cur_sample_ct = 0;
-  uintptr_t cur_sample_ctl2 = 0;
+  uintptr_t cur_sample_ctv2 = 0;
   uintptr_t cur_group1_size = 0;
   uintptr_t cur_group2_size = 0;
   uint32_t y_exists = (chrom_info_ptr->y_code != -1) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->y_code);
@@ -9570,15 +9198,14 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
     logerrprint("Error: Second --gxe group has fewer than three members.\n");
     goto gxe_assoc_ret_INVALID_CMDLINE;
   }
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl * 2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, covar_nm_ctl * 2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&covar_nm_raw, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&pheno_d_collapsed, covar_nm_ct * sizeof(double))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl * 2, &loadbuf_raw) ||
+      bigstack_alloc_ul(covar_nm_ctl * 2, &loadbuf) ||
+      bigstack_calloc_ul(unfiltered_sample_ctl, &covar_nm_raw) ||
+      bigstack_alloc_d(covar_nm_ct, &pheno_d_collapsed)) {
     goto gxe_assoc_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl * 2 - 1] = 0;
 
-  fill_ulong_zero(covar_nm_raw, unfiltered_sample_ctl);
   sample_uidx = 0;
   sample_idx = 0;
   sample_idx2 = 0;
@@ -9587,7 +9214,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
     sample_uidx_stop = next_set_ul(sample_exclude, sample_uidx, unfiltered_sample_ct);
     do {
       if (IS_SET(gxe_covar_nm, sample_idx)) {
-        SET_BIT(covar_nm_raw, sample_uidx);
+        SET_BIT(sample_uidx, covar_nm_raw);
         dxx = pheno_d[sample_uidx];
         if (IS_SET(gxe_covar_c, sample_idx)) {
 	  pheno_sum_g2 += dxx;
@@ -9602,12 +9229,11 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
     } while (++sample_uidx < sample_uidx_stop);
   } while (sample_idx < sample_ct);
 
-  if (wkspace_alloc_ul_checked(&group1_include2, covar_nm_ctl * 2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&group2_include2, covar_nm_ctl * 2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(covar_nm_ctl * 2, &group1_include2) ||
+      bigstack_calloc_ul(covar_nm_ctl * 2, &group2_include2)) {
     goto gxe_assoc_ret_NOMEM;
   }
-  fill_vec_55(group1_include2, covar_nm_ct);
-  fill_ulong_zero(group2_include2, covar_nm_ctl * 2);
+  fill_quatervec_55(covar_nm_ct, group1_include2);
   sample_idx = 0;
   sample_idx2 = 0;
   do {
@@ -9615,25 +9241,24 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
     sample_uidx_stop = next_unset_ul(gxe_covar_nm, sample_idx, sample_ct);
     do {
       if (IS_SET(gxe_covar_c, sample_idx)) {
-	SET_BIT_DBL(group2_include2, sample_idx2);
+	SET_BIT_DBL(sample_idx2, group2_include2);
       }
       sample_idx2++;
     } while (++sample_idx < sample_uidx_stop);
   } while (sample_idx2 < covar_nm_ct);
-  bitfield_andnot(group1_include2, group2_include2, covar_nm_ctl * 2);
+  bitvec_andnot(group2_include2, covar_nm_ctl * 2, group1_include2);
 
   hh_or_mt_exists |= mt_exists * NXMHH_EXISTS;
   if ((hh_or_mt_exists & NXMHH_EXISTS) || y_exists) {
-    if (wkspace_alloc_ul_checked(&sample_include2, covar_nm_ctl * 2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(covar_nm_ctl * 2, &sample_include2)) {
       goto gxe_assoc_ret_NOMEM;
     }
-    fill_vec_55(sample_include2, covar_nm_ct);
+    fill_quatervec_55(covar_nm_ct, sample_include2);
   }
   if ((hh_or_mt_exists & XMHH_EXISTS) || y_exists) {
-    if (wkspace_alloc_ul_checked(&sample_male_include2, covar_nm_ctl * 2 * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(covar_nm_ctl * 2, &sample_male_include2)) {
       goto gxe_assoc_ret_NOMEM;
     }
-    fill_ulong_zero(sample_male_include2, covar_nm_ctl * 2);
     sample_uidx = 0;
     sample_idx = 0;
     sample_idx2 = 0;
@@ -9643,7 +9268,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
       do {
         if (IS_SET(gxe_covar_nm, sample_idx)) {
           if (IS_SET(sex_male, sample_uidx)) {
-	    SET_BIT_DBL(sample_male_include2, sample_idx2);
+	    SET_BIT_DBL(sample_idx2, sample_male_include2);
 	    male_ct++;
 	  }
 	  sample_idx2++;
@@ -9651,7 +9276,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
 	sample_idx++;
       } while (++sample_uidx < sample_uidx_stop);
     } while (sample_idx < sample_ct);
-    male_ctl = (male_ct + (BITCT - 1)) / BITCT;
+    male_ctl = BITCT_TO_WORDCT(male_ct);
     if (y_exists) {
       group1_size_male = popcount_longs_exclude(sample_male_include2, group2_include2, covar_nm_ctl * 2);
       group2_size_male = male_ct - group1_size_male;
@@ -9660,22 +9285,21 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
 	skip_y = 1;
       }
       // currently still need to initialize covar_nm_male_raw even on skip_y
-      if (wkspace_alloc_ul_checked(&sample_male_all_include2, male_ctl * 2 * sizeof(intptr_t)) ||
-          wkspace_alloc_ul_checked(&group1_male_include2, male_ctl * 2 * sizeof(intptr_t)) ||
-	  wkspace_alloc_ul_checked(&group2_male_include2, male_ctl * 2 * sizeof(intptr_t)) ||
-	  wkspace_alloc_d_checked(&pheno_d_male_collapsed, male_ct * sizeof(double)) ||
-	  wkspace_alloc_ul_checked(&covar_nm_male_raw, unfiltered_sample_ctl * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(male_ctl * 2, &sample_male_all_include2) ||
+          bigstack_alloc_ul(male_ctl * 2, &group1_male_include2) ||
+	  bigstack_calloc_ul(male_ctl * 2, &group2_male_include2) ||
+	  bigstack_alloc_d(male_ct, &pheno_d_male_collapsed) ||
+	  bigstack_alloc_ul(unfiltered_sample_ctl, &covar_nm_male_raw)) {
 	goto gxe_assoc_ret_NOMEM;
       }
-      fill_vec_55(sample_male_all_include2, male_ct);
-      fill_vec_55(group1_male_include2, male_ct);
-      fill_ulong_zero(group2_male_include2, male_ctl * 2);
+      fill_quatervec_55(male_ct, sample_male_all_include2);
+      fill_quatervec_55(male_ct, group1_male_include2);
       sample_idx = 0;
       for (sample_idx2 = 0; sample_idx2 < covar_nm_ct; sample_idx2++) {
 	if (IS_SET_DBL(sample_male_include2, sample_idx2)) {
 	  dxx = pheno_d_collapsed[sample_idx2];
 	  if (IS_SET_DBL(group2_include2, sample_idx2)) {
-	    SET_BIT_DBL(group2_male_include2, sample_idx);
+	    SET_BIT_DBL(sample_idx, group2_male_include2);
 	    pheno_sum_male_g2 += dxx;
 	    pheno_ssq_male_g2 += dxx * dxx;
 	  } else {
@@ -9685,7 +9309,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
 	  pheno_d_male_collapsed[sample_idx++] = dxx;
 	}
       }
-      bitfield_andnot(group1_male_include2, group2_male_include2, male_ctl * 2);
+      bitvec_andnot(group2_male_include2, male_ctl * 2, group1_male_include2);
       for (ulii = 0; ulii < unfiltered_sample_ctl; ulii++) {
 	covar_nm_male_raw[ulii] = covar_nm_raw[ulii] & sex_male[ulii];
       }
@@ -9693,7 +9317,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
   }
 
   memcpy(outname_end, ".qassoc.gxe", 12);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto gxe_assoc_ret_OPEN_FAIL;
   }
   if (haploid_chrom_present(chrom_info_ptr) || mt_exists) {
@@ -9702,8 +9326,8 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
   LOGPRINTFWW5("Writing --gxe report to %s ... ", outname);
   fputs("0%", stdout);
   fflush(stdout);
-  sprintf(tbuf, " CHR %%%us   NMISS1      BETA1        SE1   NMISS2      BETA2        SE2    Z_GXE        P_GXE \n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us   NMISS1      BETA1        SE1   NMISS2      BETA2        SE2    Z_GXE        P_GXE \n", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
 
   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
     goto gxe_assoc_ret_READ_FAIL;
@@ -9755,14 +9379,14 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
           cur_pheno_d = pheno_d_male_collapsed;
 	  cur_covar_nm_raw = covar_nm_male_raw;
 	}
-	wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]));
+	wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], g_textbuf));
 	*wptr_start++ = ' ';
-	cur_sample_ctl2 = ((cur_sample_ct + (BITCT - 1)) / BITCT) * 2;
-        loadbuf[cur_sample_ctl2 - 1] = 0;
+	cur_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(cur_sample_ct);
+        loadbuf[cur_sample_ctv2 - 1] = 0;
 	final_mask = get_final_mask(cur_sample_ct);
       }
 
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, cur_sample_ct, cur_covar_nm_raw, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, cur_sample_ct, cur_covar_nm_raw, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf)) {
 	goto gxe_assoc_ret_READ_FAIL;
       }
       if (is_y && skip_y) {
@@ -9781,7 +9405,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
       //   g_var{1,2}: (geno_ssq - (geno_sum^2 / N)) / (N-1)
       //   qt_g_covar{1,2}: (qt_g_prod - ((qt_sum * geno_sum) / N)) / (N-1)
 
-      single_marker_cc_3freqs(cur_sample_ctl2, loadbuf, cur_group1_i2, cur_group2_i2, &homcom_ct1, &het_ct1, &missing_ct1, &homcom_ct2, &het_ct2, &missing_ct2);
+      single_marker_cc_3freqs(cur_sample_ctv2, loadbuf, cur_group1_i2, cur_group2_i2, &homcom_ct1, &het_ct1, &missing_ct1, &homcom_ct2, &het_ct2, &missing_ct2);
       nanal1 = ((uint32_t)cur_group1_size) - missing_ct1;
       nanal2 = ((uint32_t)cur_group2_size) - missing_ct2;
       homrar_ct1 = nanal1 - (het_ct1 + homcom_ct1);
@@ -9871,20 +9495,20 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
 	  goto gxe_assoc_nan_line;
 	}
         zval = (beta1 - beta2) / sqrt(vbeta1 + vbeta2);
-        wptr = uint32_writew8x(wptr, nanal1, ' ');
-        wptr = double_g_writewx4x(wptr, beta1, 10, ' ');
-        wptr = double_g_writewx4x(wptr, sqrt(vbeta1), 10, ' ');
-        wptr = uint32_writew8x(wptr, nanal2, ' ');
-        wptr = double_g_writewx4x(wptr, beta2, 10, ' ');
-        wptr = double_g_writewx4x(wptr, sqrt(vbeta2), 10, ' ');
-        wptr = double_g_writewx4x(wptr, zval, 8, ' ');
+        wptr = uint32toa_w8x(nanal1, ' ', wptr);
+        wptr = dtoa_g_wxp4x(beta1, 10, ' ', wptr);
+        wptr = dtoa_g_wxp4x(sqrt(vbeta1), 10, ' ', wptr);
+        wptr = uint32toa_w8x(nanal2, ' ', wptr);
+        wptr = dtoa_g_wxp4x(beta2, 10, ' ', wptr);
+        wptr = dtoa_g_wxp4x(sqrt(vbeta2), 10, ' ', wptr);
+        wptr = dtoa_g_wxp4x(zval, 8, ' ', wptr);
 	dxx = chiprob_p(zval * zval, 1);
-        wptr = double_g_writewx4x(wptr, MAXV(dxx, output_min_p), 12, '\n');
+        wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, '\n', wptr);
       } else {
       gxe_assoc_nan_line:
         wptr = memcpya(wptr, "      NA         NA         NA       NA         NA         NA       NA           NA\n", 84);
       }
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto gxe_assoc_ret_WRITE_FAIL;
       }
       marker_uidx++;
@@ -9922,7 +9546,7 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
@@ -9932,7 +9556,7 @@ void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __r
   // and nonmissing.
   // thread_wkspace[] is assumed to be zeroed out before this function is
   // called.
-  uint32_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
+  uint32_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
 #ifdef __LP64__
   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
@@ -10018,9 +9642,9 @@ void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __r
 
 THREAD_RET_TYPE testmiss_adapt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
-  uintptr_t pheno_nm_ctv = (pheno_nm_ctl + 1) & (~1);
+  uintptr_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
+  uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t max_thread_ct = g_assoc_thread_ct;
   uint32_t pidx_offset = g_perms_done;
@@ -10156,12 +9780,12 @@ THREAD_RET_TYPE testmiss_adapt_thread(void* arg) {
 THREAD_RET_TYPE testmiss_maxt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
   uintptr_t perm_vec_ct = g_perm_vec_ct;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
   uint32_t is_midp = g_fisher_midp;
   uint32_t max_thread_ct = g_assoc_thread_ct;
-  uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
-  uintptr_t pheno_nm_ctv = (pheno_nm_ctl + 1) & (~1);
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
+  uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 176]);
   uint32_t* __restrict__ perm_vecst = g_perm_vecst;
@@ -10174,7 +9798,7 @@ THREAD_RET_TYPE testmiss_maxt_thread(void* arg) {
   double* gpd = NULL;
   double stat_high = 0;
   double stat_low = 0;
-  uint32_t case_ct = g_case_ct;
+  uint32_t case_ct = g_perm_case_ct;
   uint32_t cur_case_ct = case_ct;
   uintptr_t* loadbuf;
   uintptr_t* loadbuf_ptr;
@@ -10299,14 +9923,14 @@ THREAD_RET_TYPE testmiss_maxt_thread(void* arg) {
 
 int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t testmiss_mperm_val, uint32_t testmiss_modifier, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint [...]
   // Simple variant of model_assoc().
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
   uintptr_t cur_sample_ctl = pheno_nm_ctl;
-  uintptr_t pheno_nm_ctv = (pheno_nm_ctl + 1) & (~1);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS);
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t marker_uidx = next_unset_unsafe(marker_exclude_orig, 0);
   double maxt_cur_extreme_stat = 0;
   FILE* outfile = NULL;
@@ -10317,7 +9941,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   uintptr_t* pheno_c_collapsed_male = NULL;
   uintptr_t* sex_male_collapsed = NULL;
   char* wptr_start = NULL;
-  char* tbuf2 = &(tbuf[MAXLINELEN]);
+  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
   uint32_t perm_adapt = testmiss_modifier & TESTMISS_PERM;
   uint32_t perm_maxt = testmiss_modifier & TESTMISS_MPERM;
   uint32_t perm_count = testmiss_modifier & TESTMISS_PERM_COUNT;
@@ -10340,7 +9964,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   uint32_t chrom_end = 0;
   uint32_t mperm_dump_all = 0;
   uint32_t max_thread_ct = g_thread_ct;
-  uintptr_t pheno_male_nm_ctl = (male_ct + (BITCT - 1)) / BITCT;
+  uintptr_t pheno_male_nm_ctl = BITCT_TO_WORDCT(male_ct);
   int32_t y_code = chrom_info_ptr->y_code;
   int32_t retval = 0;
   uint32_t uibuf[4];
@@ -10395,7 +10019,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   if (perm_maxt) {
     mperm_dump_all = mperm_save & MPERM_DUMP_ALL;
     perms_total = testmiss_mperm_val;
-    if (wkspace_alloc_d_checked(&g_maxt_extreme_stat, sizeof(double) * perms_total)) {
+    if (bigstack_alloc_d(perms_total, &g_maxt_extreme_stat)) {
       goto testmiss_ret_NOMEM;
     }
     for (uii = 0; uii < perms_total; uii++) {
@@ -10403,7 +10027,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     }
     if (mperm_dump_all) {
       memcpy(outname_end, ".mperm.dump.all", 16);
-      if (fopen_checked(&outfile_msa, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile_msa)) {
         goto testmiss_ret_OPEN_FAIL;
       }
       LOGPRINTFWW("Dumping all permutation p-values to %s .\n", outname);
@@ -10419,39 +10043,39 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   // test.  Since it's likely that many such sites exist, we postpone the
   // associated memory allocations until after the basic .missing report is
   // generated.
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&pheno_nm2, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&pheno_c_collapsed, pheno_nm_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&missing_bitfield, pheno_nm_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &pheno_nm2) ||
+      bigstack_alloc_ul(pheno_nm_ctl, &pheno_c_collapsed) ||
+      bigstack_alloc_ul(pheno_nm_ctl, &missing_bitfield) ||
+      bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude)) {
     goto testmiss_ret_NOMEM;
   }
   memcpy(marker_exclude, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
-  vec_include_init(unfiltered_sample_ct, pheno_nm2, pheno_nm);
+  init_quaterarr_from_bitarr(pheno_nm, unfiltered_sample_ct, pheno_nm2);
   cur_pheno_nm2 = pheno_nm2;
-  collapse_copy_bitarr_incl(unfiltered_sample_ct, pheno_c, pheno_nm, pheno_nm_ct, pheno_c_collapsed);
+  copy_bitarr_subset(pheno_c, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, pheno_c_collapsed);
   cur_pheno_c_collapsed = pheno_c_collapsed;
   if (!skip_y) {
-    if (wkspace_alloc_ul_checked(&pheno_male_nm2, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-        wkspace_alloc_ul_checked(&pheno_c_collapsed_male, pheno_male_nm_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl2, &pheno_male_nm2) ||
+        bigstack_alloc_ul(pheno_male_nm_ctl, &pheno_c_collapsed_male)) {
       goto testmiss_ret_NOMEM;
     }
     // temporary non-excluded male bitfield
     memcpy(pheno_male_nm2, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_and(pheno_male_nm2, sex_male, unfiltered_sample_ctl);
-    collapse_copy_bitarr_incl(unfiltered_sample_ct, pheno_c, pheno_male_nm2, male_ct, pheno_c_collapsed_male);
+    bitvec_and(sex_male, unfiltered_sample_ctl, pheno_male_nm2);
+    copy_bitarr_subset(pheno_c, pheno_male_nm2, unfiltered_sample_ct, male_ct, pheno_c_collapsed_male);
     memcpy(pheno_male_nm2, pheno_nm2, unfiltered_sample_ctl2 * sizeof(intptr_t));
-    vec_include_mask_in(unfiltered_sample_ct, pheno_male_nm2, sex_male);
+    apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, pheno_male_nm2);
   }
   outname_end2 = memcpyb(outname_end, ".missing", 9);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto testmiss_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing --test-missing report to %s ... ", outname);
   fflush(stdout);
-  sprintf(tbuf, " CHR %%%us     F_MISS_A     F_MISS_U            P \n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us     F_MISS_A     F_MISS_U            P \n", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   if (ferror(outfile)) {
     goto testmiss_ret_WRITE_FAIL;
   }
@@ -10465,7 +10089,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   }
   chrom_end = 0;
   // must be last allocation
-  if (wkspace_alloc_d_checked(&g_orig_pvals, marker_ct_orig * sizeof(double))) {
+  if (bigstack_alloc_d(marker_ct_orig, &g_orig_pvals)) {
     goto testmiss_ret_NOMEM;
   }
   dptr = g_orig_pvals;
@@ -10499,16 +10123,16 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
         cur_case_ct_recip = 1.0 / ((double)((int32_t)cur_case_ct));
         cur_ctrl_ct_recip = 1.0 / ((double)((int32_t)cur_ctrl_ct));
       } else if (g_is_y) {
-        fill_bits(marker_exclude, marker_uidx, chrom_end - marker_uidx);
+        fill_bits(marker_uidx, chrom_end - marker_uidx, marker_exclude);
 	marker_idx += chrom_end - marker_uidx - 1 - popcount_bit_idx(marker_exclude_orig, marker_uidx, chrom_end);
 	marker_uidx = chrom_end - 1;
 	continue;
       }
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
       *wptr_start++ = ' ';
     }
-    if (load_raw(bedfile, loadbuf_raw, unfiltered_sample_ct4)) {
+    if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
       goto testmiss_ret_READ_FAIL;
     }
     if (is_haploid && hh_exists) {
@@ -10517,7 +10141,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     extract_collapsed_missing_bitfield(loadbuf_raw, unfiltered_sample_ct, cur_pheno_nm2, cur_pheno_nm_ct, missing_bitfield);
     missing_ct = popcount_longs(missing_bitfield, cur_sample_ctl);
     if ((!missing_ct) || (missing_ct == cur_pheno_nm_ct)) {
-      SET_BIT(marker_exclude, marker_uidx);
+      SET_BIT(marker_uidx, marker_exclude);
       continue;
     }
     uii = popcount_longs_intersect(missing_bitfield, cur_pheno_c_collapsed, cur_sample_ctl);
@@ -10529,10 +10153,10 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     }
     wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
     *wptr++ = ' ';
-    wptr = double_g_writewx4x(wptr, ((int32_t)uii) * cur_case_ct_recip, 12, ' ');
-    wptr = double_g_writewx4x(wptr, ((int32_t)ujj) * cur_ctrl_ct_recip, 12, ' ');
-    wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    wptr = dtoa_g_wxp4x(((int32_t)uii) * cur_case_ct_recip, 12, ' ', wptr);
+    wptr = dtoa_g_wxp4x(((int32_t)ujj) * cur_ctrl_ct_recip, 12, ' ', wptr);
+    wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto testmiss_ret_WRITE_FAIL;
     }
   }
@@ -10541,9 +10165,9 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   }
   logprint("done.\n");
   marker_ct = (uintptr_t)(dptr - g_orig_pvals);
-  wkspace_shrink_top(g_orig_pvals, marker_ct * sizeof(double));
+  bigstack_shrink_top(g_orig_pvals, marker_ct * sizeof(double));
   if (mtest_adjust) {
-    if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
       goto testmiss_ret_NOMEM;
     }
     fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
@@ -10559,25 +10183,25 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     }
     LOGPRINTF("Including %" PRIuPTR " loc%s in --test-missing permutation test.\n", marker_ct, (marker_ct == 1)? "us" : "i");
     if (mperm_dump_all) {
-      tbuf[0] = '0';
-      wptr = &(tbuf[1]);
+      g_textbuf[0] = '0';
+      wptr = &(g_textbuf[1]);
       for (uii = 0; uii < marker_ct; uii++) {
         *wptr++ = ' ';
         dxx = g_orig_pvals[uii];
 	if (dxx >= 0) {
-	  wptr = double_g_write(wptr, dxx);
+	  wptr = dtoa_g(dxx, wptr);
 	} else {
 	  wptr = memcpya(wptr, "NA", 2);
 	}
 	if (wptr >= tbuf2) {
-	  if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	    goto testmiss_ret_WRITE_FAIL;
 	  }
-	  wptr = tbuf;
+	  wptr = g_textbuf;
 	}
       }
       *wptr++ = '\n';
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	goto testmiss_ret_WRITE_FAIL;
       }
     }
@@ -10589,74 +10213,72 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
       if (popcount_bit_idx(marker_exclude, uii, ujj) == ujj - uii) {
 	skip_y = 1;
       } else {
-	if (wkspace_alloc_ul_checked(&sex_male_collapsed, pheno_nm_ctl * sizeof(intptr_t))) {
+	if (bigstack_alloc_ul(pheno_nm_ctl, &sex_male_collapsed)) {
 	  goto testmiss_ret_NOMEM;
 	}
-	collapse_copy_bitarr_incl(unfiltered_sample_ct, sex_male, pheno_nm, pheno_nm_ct, sex_male_collapsed);
+	copy_bitarr_subset(sex_male, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, sex_male_collapsed);
       }
     }
 
     if (cluster_starts) {
-      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, &g_cluster_case_cts, &g_cluster_cc_perm_preimage);
+      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
       if (retval) {
 	goto testmiss_ret_1;
       }
-      if (!g_cluster_ct) {
+      if (!g_perm_cluster_ct) {
 	logerrprint("Error: No size 2+ clusters for permutation test.\n");
 	goto testmiss_ret_INVALID_CMDLINE;
       }
-      retval = cluster_alloc_and_populate_magic_nums(g_cluster_ct, g_cluster_map, g_cluster_starts, &g_tot_quotients, &g_totq_magics, &g_totq_preshifts, &g_totq_postshifts, &g_totq_incrs);
+      retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
       if (retval) {
 	goto testmiss_ret_1;
       }
     } else {
-      g_cluster_starts = NULL;
+      g_perm_cluster_starts = NULL;
     }
     if (max_thread_ct > perms_total) {
       max_thread_ct = perms_total;
     }
-    if (wkspace_init_sfmtp(max_thread_ct)) {
+    if (bigstack_init_sfmtp(max_thread_ct)) {
       goto testmiss_ret_NOMEM;
     }
-    if (wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * pheno_nm_ctv * sizeof(intptr_t)) ||
-	wkspace_alloc_ui_checked(&g_perm_2success_ct, marker_ct * sizeof(int32_t)) ||
-	wkspace_alloc_ui_checked(&g_missing_cts, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ul(MODEL_BLOCKSIZE * pheno_nm_ctv, &g_loadbuf) ||
+	bigstack_calloc_ui(marker_ct, &g_perm_2success_ct) ||
+	bigstack_alloc_ui(marker_ct, &g_missing_cts)) {
       goto testmiss_ret_NOMEM;
     }
     for (uii = 1; uii <= MODEL_BLOCKSIZE; uii++) {
       g_loadbuf[uii * pheno_nm_ctv - 2] = 0;
       g_loadbuf[uii * pheno_nm_ctv - 1] = 0;
     }
-    fill_uint_zero(g_perm_2success_ct, marker_ct);
     uii = marker_ct;
     if (perm_maxt) {
       if (!mperm_dump_all) {
-	if (wkspace_alloc_ui_checked(&g_precomp_ui, 6 * MODEL_BLOCKSIZE * sizeof(int32_t)) ||
-	    wkspace_alloc_d_checked(&g_precomp_d, 2 * MODEL_BLOCKSIZE * sizeof(double))) {
+	if (bigstack_alloc_ui(6 * MODEL_BLOCKSIZE, &g_precomp_ui) ||
+	    bigstack_alloc_d(2 * MODEL_BLOCKSIZE, &g_precomp_d)) {
 	  goto testmiss_ret_NOMEM;
 	}
       }
     } else {
-      if (wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_ct * sizeof(int32_t)) ||
-	  wkspace_alloc_uc_checked(&g_perm_adapt_stop, marker_ct) ||
-	  wkspace_alloc_ui_checked(&g_precomp_ui, 4 * MODEL_BLOCKSIZE * sizeof(int32_t))) {
+      if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct) ||
+	  bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &g_perm_adapt_stop) ||
+	  bigstack_alloc_ui(4 * MODEL_BLOCKSIZE, &g_precomp_ui)) {
 	goto testmiss_ret_NOMEM;
       }
       for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
 	g_perm_attempt_ct[marker_idx] = perms_total;
       }
-      fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
       g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
     }
     if (!cluster_starts) {
-      g_tot_quotient = 0x100000000LLU / pheno_nm_ct;
-      magic_num(g_tot_quotient, &g_totq_magic, &g_totq_preshift, &g_totq_postshift, &g_totq_incr);
+      g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct;
+      magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
     }
     marker_unstopped_ct = marker_ct;
-    g_is_perm1 = 1;
+    g_perm_is_1bit = 1;
     g_perms_done = 0;
-    g_pheno_nm_ct = pheno_nm_ct;
-    g_case_ct = case_ct;
+    g_perm_pheno_nm_ct = pheno_nm_ct;
+    g_perm_case_ct = case_ct;
     g_male_ct = male_ct;
     g_fisher_midp = midp;
     g_mperm_save_all = NULL;
@@ -10676,7 +10298,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	g_adaptive_intercept = apip->init_interval;
 	g_adaptive_slope = apip->interval_slope;
       }
-      g_perm_vec_ct = (wkspace_left - CACHELINE + sizeof(int32_t)) / (pheno_nm_ctv * sizeof(intptr_t) + (1 - skip_y) * sizeof(int32_t));
+      g_perm_vec_ct = (bigstack_left() - CACHELINE + sizeof(int32_t)) / (pheno_nm_ctv * sizeof(intptr_t) + (1 - skip_y) * sizeof(int32_t));
     } else {
       // g_perm_vec_ct memory allocation dependencies:
       //   g_maxt_thread_results: (8 * g_perm_vec_ct, cacheline-aligned) *
@@ -10692,9 +10314,9 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
       //                    sizeof(intptr_t) * pheno_nm_ctv
       //                    [+ marker_ct * sizeof(double) * mperm_save_all])
       if (mperm_dump_all) {
-	g_perm_vec_ct = 128 * (wkspace_left / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y) + 128LL * sizeof(double) * marker_ct));
+	g_perm_vec_ct = 128 * (bigstack_left() / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y) + 128LL * sizeof(double) * marker_ct));
       } else {
-	g_perm_vec_ct = 128 * (wkspace_left / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y)));
+	g_perm_vec_ct = 128 * (bigstack_left() / (128 * sizeof(intptr_t) * pheno_nm_ctv + 1728LL * max_thread_ct + 16LL * pheno_nm_ct + 512 * (1 - skip_y)));
       }
     }
     if (g_perm_vec_ct > perms_total - g_perms_done) {
@@ -10702,50 +10324,40 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     } else if (!g_perm_vec_ct) {
       goto testmiss_ret_NOMEM;
     }
-    g_perm_vecs = (uintptr_t*)wkspace_alloc(g_perm_vec_ct * pheno_nm_ctv * sizeof(intptr_t));
-    if (g_perm_vec_ct > max_thread_ct) {
-      g_assoc_thread_ct = max_thread_ct;
-    } else {
-      g_assoc_thread_ct = g_perm_vec_ct;
-    }
+    bigstack_alloc_ul(g_perm_vec_ct * pheno_nm_ctv, &g_perm_vecs);
+    g_perm_generation_thread_ct = MINV(max_thread_ct, g_perm_vec_ct);
     ulii = 0;
     if (!cluster_starts) {
-      if (spawn_threads(threads, &model_assoc_gen_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
 	goto testmiss_ret_THREAD_CREATE_FAIL;
       }
-      model_assoc_gen_perms_thread((void*)ulii);
+      generate_cc_perms_thread((void*)ulii);
     } else {
-      if (spawn_threads(threads, &model_assoc_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
 	goto testmiss_ret_THREAD_CREATE_FAIL;
       }
-      model_assoc_gen_cluster_perms_thread((void*)ulii);
+      generate_cc_cluster_perms_thread((void*)ulii);
     }
-    join_threads(threads, g_assoc_thread_ct);
+    join_threads(threads, g_perm_generation_thread_ct);
     g_assoc_thread_ct = max_thread_ct;
     if (perm_maxt) {
-      ulii = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
-      g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
+      bigstack_alloc_d(max_thread_ct * round_up_pow2(g_perm_vec_ct, CACHELINE_DBL), &g_maxt_thread_results);
 #ifdef __LP64__
-      ulii = ((g_perm_vec_ct + 127) / 128) * 16;
-      g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
+      ulii = ((g_perm_vec_ct + 127) / 128) * 4;
+      bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
 #else
-      ulii = ((g_perm_vec_ct + 31) / 32) * 4;
-      g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
-      ulii = ((g_perm_vec_ct + 127) / 128) * 16; // force 64-byte align
+      ulii = (g_perm_vec_ct + 31) / 32;
+      bigstack_alloc_ui(ulii * pheno_nm_ct, &g_perm_vecst);
+      ulii = ((g_perm_vec_ct + 127) / 128) * 4; // force 64-byte align
 #endif
-      g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 44 * max_thread_ct);
+      bigstack_calloc_ui(ulii * 44 * max_thread_ct, &g_thread_git_wkspace);
       transpose_perm1s(g_perm_vecs, g_perm_vec_ct, pheno_nm_ct, g_perm_vecst);
-#ifdef __LP64__
-      fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, (ulii / 2) * 11 * max_thread_ct);
-#else
-      fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 11 * max_thread_ct);
-#endif
       if (mperm_dump_all) {
-	g_mperm_save_all = (double*)wkspace_alloc(marker_ct * g_perm_vec_ct * sizeof(double));
+	bigstack_alloc_d(marker_ct * g_perm_vec_ct, &g_mperm_save_all);
       }
     }
     if (!skip_y) {
-      g_male_case_cts = (uint32_t*)wkspace_alloc(g_perm_vec_ct * sizeof(int32_t));
+      bigstack_alloc_ui(g_perm_vec_ct, &g_male_case_cts);
       for (perm_idx = 0; perm_idx < g_perm_vec_ct; perm_idx++) {
 	g_male_case_cts[perm_idx] = popcount_longs_intersect(sex_male_collapsed, &(g_perm_vecs[perm_idx * pheno_nm_ctv]), pheno_nm_ctl);
       }
@@ -10761,7 +10373,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     // only forced to terminate block at Y chromosome boundaries
     if (!skip_y) {
       marker_uidx_end = chrom_info_ptr->chrom_start[(uint32_t)y_code];
-      pheno_male_nm_ctl = (pheno_male_nm_ctl + 1) & (~1);
+      pheno_male_nm_ctl = round_up_pow2(pheno_male_nm_ctl, 2);
     } else {
       marker_uidx_end = unfiltered_marker_ct;
     }
@@ -10800,12 +10412,12 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	  chrom_fo_idx++;
 	  refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &g_is_x, &g_is_y, &uii, &is_haploid);
 	  if (!g_is_y) {
-	    g_case_ct = case_ct;
+	    g_perm_case_ct = case_ct;
 	  } else {
-	    g_case_ct = case_ct_y;
+	    g_perm_case_ct = case_ct_y;
 	  }
 	}
-	if (load_raw(bedfile, loadbuf_raw, unfiltered_sample_ct4)) {
+	if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
 	  goto testmiss_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
@@ -10814,7 +10426,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	loadbuf_ptr = &(g_loadbuf[block_size * pheno_nm_ctv]);
 	extract_collapsed_missing_bitfield(loadbuf_raw, unfiltered_sample_ct, pheno_nm2, pheno_nm_ct, loadbuf_ptr);
 	if (g_is_y) {
-	  bitfield_and(loadbuf_ptr, sex_male_collapsed, pheno_nm_ctl);
+	  bitvec_and(sex_male_collapsed, pheno_nm_ctl, loadbuf_ptr);
 	}
 	if (!g_perms_done) {
 	  missing_ct = popcount_longs(loadbuf_ptr, pheno_nm_ctl);
@@ -10886,7 +10498,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	}
 	testmiss_maxt_thread((void*)ulii);
 	join_threads2(threads, max_thread_ct, is_last_block);
-	ulii = CACHEALIGN32_DBL(g_perm_vec_ct);
+	ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
 	umm = block_size;
 	if (umm > max_thread_ct) {
 	  umm = max_thread_ct;
@@ -10913,36 +10525,36 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
       fflush(stdout);
       ulii = g_perm_vec_ct;
       ujj = 1 + g_perms_done;
-      wptr = tbuf;
-      tbuf2 = &(tbuf[MAXLINELEN]);
+      wptr = g_textbuf;
+      tbuf2 = &(g_textbuf[MAXLINELEN]);
       for (uii = 0; uii < ulii; uii++) {
-	wptr = uint32_write(wptr, uii + ujj);
+	wptr = uint32toa(uii + ujj, wptr);
 	dptr = &(g_mperm_save_all[uii]);
 	for (ukk = 0; ukk < marker_ct; ukk++) {
 	  *wptr++ = ' ';
 	  dxx = dptr[ukk * ulii];
 	  if (dxx >= 0) {
-	    wptr = double_g_write(wptr, dxx);
+	    wptr = dtoa_g(dxx, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "NA", 2);
 	  }
 	  if (wptr >= tbuf2) {
-	    if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	      goto testmiss_ret_WRITE_FAIL;
 	    }
-	    wptr = tbuf;
+	    wptr = g_textbuf;
 	  }
 	}
 	*wptr++ = '\n';
       }
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	goto testmiss_ret_WRITE_FAIL;
       }
       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
     }
     // really should postpone this for --assoc/--model too
     g_perms_done += g_perm_vec_ct;
-    wkspace_reset(g_perm_vecs);
+    bigstack_reset(g_perm_vecs);
     if (g_perms_done < perms_total) {
       if (perm_adapt) {
 	marker_unstopped_ct = marker_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
@@ -10975,7 +10587,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	ulii = outname_end - outname;
 	memcpy(outname_end, ".mperm.dump.best", 17);
 	LOGPRINTFWW("Dumping best permutation p-values to %s .\n", outname);
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto testmiss_ret_OPEN_FAIL;
 	}
 	dxx = 1.0;
@@ -10984,15 +10596,15 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	    dxx = g_orig_pvals[marker_idx];
 	  }
 	}
-	memcpy(tbuf, "0 ", 2);
-	wptr = double_g_writex(&(tbuf[2]), dxx, '\n');
-	if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+	memcpy(g_textbuf, "0 ", 2);
+	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
+	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	  goto testmiss_ret_WRITE_FAIL;
 	}
 	for (uii = 0; uii < perms_total; uii++) {
-	  wptr = uint32_writex(tbuf, uii + 1, ' ');
-	  wptr = double_g_writex(wptr, g_maxt_extreme_stat[uii], '\n');
-	  if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+	  wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
+	  wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
+	  if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	    goto testmiss_ret_WRITE_FAIL;
 	  }
 	}
@@ -11003,13 +10615,13 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
       }
       memcpy(outname_end2, ".mperm", 7);
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto testmiss_ret_OPEN_FAIL;
     }
     if (perm_adapt) {
-      sprintf(tbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
     } else {
-      sprintf(tbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
 #ifdef __cplusplus
       std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
 #else
@@ -11021,7 +10633,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
       printf("extreme stats: %g %g\n", g_maxt_extreme_stat[0], g_maxt_extreme_stat[perms_total - 1]);
     }
     */
-    fprintf(outfile, tbuf, "SNP");
+    fprintf(outfile, g_textbuf, "SNP");
     chrom_fo_idx = 0xffffffffU;
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
     marker_idx = 0;
@@ -11032,7 +10644,7 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1U];
       } while (marker_uidx >= chrom_end);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start[plink_maxsnp] = ' ';
       for (; marker_uidx < chrom_end;) {
@@ -11045,24 +10657,24 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	  fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
 	  wptr = &(wptr_start[1 + plink_maxsnp]);
 	  if (!perm_count) {
-	    wptr = double_g_writewx4x(wptr, pval, 12, ' ');
+	    wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
 	  } else {
-	    wptr = double_g_writewx4x(wptr, ((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ');
+	    wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
 	  }
 	  if (perm_adapt) {
 	    wptr = memseta(wptr, 32, 2);
-	    wptr = uint32_writew10(wptr, g_perm_attempt_ct[marker_idx]);
+	    wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
 	  } else {
 	    // minimum p-value
 	    dzz = (int32_t)(doublearr_greater_than(g_maxt_extreme_stat, perms_total, g_orig_pvals[marker_idx] * (1.0 + EPSILON)) + 1);
 	    if (!perm_count) {
-	      wptr = double_g_writewx4(wptr, dzz * dyy, 12);
+	      wptr = dtoa_g_wxp4(dzz * dyy, 12, wptr);
 	    } else {
-	      wptr = double_g_writewx4(wptr, dzz - 1, 12);
+	      wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
 	    }
 	  }
 	  wptr = memcpya(wptr, " \n", 2);
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto testmiss_ret_WRITE_FAIL;
 	  }
 	}
@@ -11101,183 +10713,14 @@ int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     break;
   }
  testmiss_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_msa);
   return retval;
 }
 
-int32_t make_perm_pheno(pthread_t* threads, char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, char* output_missing_pheno, uint32_t permphe_ct) {
-  unsigned char* wkspace_mark = wkspace_base;
-  FILE* outfile = NULL;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
-  uintptr_t pheno_nm_ctv = (pheno_nm_ctl + 1) & (~1);
-  uintptr_t perm_vec_ctcl8m = 0;
-  char* writebuf = NULL;
-  int32_t retval = 0;
-  uintptr_t* ulptr;
-  double* dptr;
-  char* wptr;
-  uintptr_t sample_uidx;
-  uintptr_t sample_idx;
-  uintptr_t perm_idx;
-  uintptr_t ulii;
-  uint32_t sample_nmidx;
-  uint32_t rshift;
-  if (!pheno_nm_ct) {
-    logerrprint("Error: --make-perm-pheno requires phenotype data.\n");
-    goto make_perm_pheno_ret_INVALID_CMDLINE;
-  }
-  g_assoc_thread_ct = MINV(g_thread_ct, permphe_ct);
-  if (wkspace_init_sfmtp(g_assoc_thread_ct)) {
-    goto make_perm_pheno_ret_NOMEM;
-  }
-  g_pheno_nm_ct = pheno_nm_ct;
-  g_perm_vec_ct = permphe_ct;
-  ulii = 0;
-  if (pheno_c) {
-    g_is_perm1 = 1;
-    g_case_ct = popcount_longs(pheno_c, unfiltered_sample_ctl);
-    // could seamlessly support multipass by using different permutation logic,
-    // but pointless in practice; better to just generate multiple files
-    if (wkspace_alloc_ul_checked(&g_perm_vecs, permphe_ct * pheno_nm_ctv * sizeof(intptr_t))) {
-      goto make_perm_pheno_ret_NOMEM;
-    }
-    if (cluster_starts) {
-      // most similar to testmiss()
-      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, &g_cluster_case_cts, &g_cluster_cc_perm_preimage);
-      if (retval) {
-	goto make_perm_pheno_ret_1;
-      }
-      if (!g_cluster_ct) {
-        logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
-        goto make_perm_pheno_ret_INVALID_CMDLINE;
-      }
-      retval = cluster_alloc_and_populate_magic_nums(g_cluster_ct, g_cluster_map, g_cluster_starts, &g_tot_quotients, &g_totq_magics, &g_totq_preshifts, &g_totq_postshifts, &g_totq_incrs);
-      if (retval) {
-        goto make_perm_pheno_ret_1;
-      }
-      // not actually much of a point to multithreading since this is I/O
-      // bound, but what the hell, the permutation generators already support
-      // it
-      if (spawn_threads(threads, &model_assoc_gen_cluster_perms_thread, g_assoc_thread_ct)) {
-	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
-      }
-      model_assoc_gen_cluster_perms_thread((void*)ulii);
-    } else {
-      g_cluster_starts = NULL;
-      g_tot_quotient = 0x100000000LLU / pheno_nm_ct;
-      magic_num(g_tot_quotient, &g_totq_magic, &g_totq_preshift, &g_totq_postshift, &g_totq_incr);
-      if (spawn_threads(threads, &model_assoc_gen_perms_thread, g_assoc_thread_ct)) {
-	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
-      }
-      model_assoc_gen_perms_thread((void*)ulii);
-    }
-  } else {
-    g_pheno_d2 = (double*)alloc_and_init_collapsed_arr_incl((char*)pheno_d, sizeof(double), unfiltered_sample_ct, pheno_nm, pheno_nm_ct, 1);
-    if (!g_pheno_d2) {
-      goto make_perm_pheno_ret_NOMEM;
-    }
-    perm_vec_ctcl8m = CACHEALIGN32_DBL(permphe_ct);
-    if (wkspace_alloc_d_checked(&g_perm_vecstd, perm_vec_ctcl8m * sizeof(double) * pheno_nm_ct)) {
-      goto make_perm_pheno_ret_NOMEM;
-    }
-    if (cluster_starts) {
-      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, NULL, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, NULL, NULL);
-      if (retval) {
-	goto make_perm_pheno_ret_1;
-      }
-      if (!g_cluster_ct) {
-        logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
-        goto make_perm_pheno_ret_INVALID_CMDLINE;
-      }
-      if (wkspace_alloc_ui_checked(&g_sample_to_cluster, pheno_nm_ct * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&g_qassoc_cluster_thread_wkspace, g_assoc_thread_ct * ((g_cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32) * CACHELINE)) {
-	goto make_perm_pheno_ret_NOMEM;
-      }
-      fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_cluster_ct, g_cluster_map, g_cluster_starts, g_sample_to_cluster);
-      if (spawn_threads(threads, &qassoc_gen_cluster_perms_thread, g_assoc_thread_ct)) {
-	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
-      }
-      qassoc_gen_cluster_perms_thread((void*)ulii);
-    } else {
-      if (spawn_threads(threads, &qassoc_gen_perms_thread, g_assoc_thread_ct)) {
-	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
-      }
-      qassoc_gen_perms_thread((void*)ulii);
-    }
-    if (wkspace_alloc_c_checked(&writebuf, permphe_ct * 16)) {
-      goto make_perm_pheno_ret_NOMEM;
-    }
-  }
-  join_threads(threads, g_assoc_thread_ct);
-  memcpy(outname_end, ".pphe", 6);
-  if (fopen_checked(&outfile, outname, "w")) {
-    goto make_perm_pheno_ret_OPEN_FAIL;
-  }
-  sample_nmidx = 0;
-  for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
-    next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
-    fputs(&(sample_ids[sample_uidx * max_sample_id_len]), outfile);
-    if (!IS_SET(pheno_nm, sample_uidx)) {
-      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
-	putc('\t', outfile);
-	fputs(output_missing_pheno, outfile);
-      }
-    } else if (pheno_c) {
-      ulptr = &(g_perm_vecs[sample_nmidx / BITCT]);
-      rshift = sample_nmidx % BITCT;
-      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
-	putc('\t', outfile);
-        putc('1' + ((ulptr[perm_idx * pheno_nm_ctv] >> rshift) & 1), outfile);
-      }
-      sample_nmidx++;
-    } else {
-      wptr = writebuf;
-      dptr = &(g_perm_vecstd[sample_nmidx * perm_vec_ctcl8m]);
-      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
-	*wptr++ = '\t';
-        wptr = double_g_write(wptr, *dptr++);
-      }
-      if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
-	goto make_perm_pheno_ret_WRITE_FAIL;
-      }
-      sample_nmidx++;
-    }
-    if (putc_checked('\n', outfile)) {
-      goto make_perm_pheno_ret_WRITE_FAIL;
-    }
-  }
-  if (fclose_null(&outfile)) {
-    goto make_perm_pheno_ret_WRITE_FAIL;
-  }
-  LOGPRINTFWW("--make-perm-pheno: Permuted phenotypes written to %s .\n", outname);
-  while (0) {
-  make_perm_pheno_ret_NOMEM:
-    retval = RET_NOMEM;
-    break;
-  make_perm_pheno_ret_OPEN_FAIL:
-    retval = RET_OPEN_FAIL;
-    break;
-  make_perm_pheno_ret_WRITE_FAIL:
-    retval = RET_WRITE_FAIL;
-    break;
-  make_perm_pheno_ret_INVALID_CMDLINE:
-    retval = RET_INVALID_CMDLINE;
-    break;
-  make_perm_pheno_ret_THREAD_CREATE_FAIL:
-    retval = RET_THREAD_CREATE_FAIL;
-    break;
-  }
- make_perm_pheno_ret_1:
-  wkspace_reset(wkspace_mark);
-  fclose_cond(outfile);
-  return retval;
-}
-
 int32_t cluster_assoc_init(const char* flag_name, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uintptr_t* cluster_bitfield, uintptr_t** pheno_nm_11_ptr, uintptr_t** pheno_nm_nonmale_11_ptr, uintptr_t** pheno_nm_male_11_ptr, uint32_t** sample_to_cluster_pheno_ptr, uint32_t** cluster_pheno_gtots_ptr, uint32_t** cur_cluster_pheno_gtots_ptr, uint32_t** cluster_geno_cts_ptr,  [...]
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uint32_t cluster_ct2 = 0;
   uint32_t sample_ct = 0;
   uint32_t cluster_end = 0;
@@ -11307,11 +10750,11 @@ int32_t cluster_assoc_init(const char* flag_name, uintptr_t unfiltered_sample_ct
   //    samples not in a valid cluster via application of the pheno_nm_11
   //    bitmask.  sample_to_cluster_pheno[] maps sample_uidx to (valid) cluster
   //    index (high 31 bits) and case/control status (low bit).
-  if (wkspace_alloc_ul_checked(pheno_nm_11_ptr, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(pheno_nm_nonmale_11_ptr, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(pheno_nm_male_11_ptr, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(sample_to_cluster_pheno_ptr, unfiltered_sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(cluster_pheno_gtots_ptr, cluster_ct * 4 * sizeof(int32_t))) {
+  if (bigstack_calloc_ul(unfiltered_sample_ctl2, pheno_nm_11_ptr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, pheno_nm_nonmale_11_ptr) ||
+      bigstack_calloc_ul(unfiltered_sample_ctl2, pheno_nm_male_11_ptr) ||
+      bigstack_alloc_ui(unfiltered_sample_ct, sample_to_cluster_pheno_ptr) ||
+      bigstack_alloc_ui(cluster_ct * 4, cluster_pheno_gtots_ptr)) {
     return RET_NOMEM;
   }
   pheno_nm_11 = *pheno_nm_11_ptr;
@@ -11319,8 +10762,6 @@ int32_t cluster_assoc_init(const char* flag_name, uintptr_t unfiltered_sample_ct
   pheno_nm_male_11 = *pheno_nm_male_11_ptr;
   sample_to_cluster_pheno = *sample_to_cluster_pheno_ptr;
   cluster_pheno_gtots = *cluster_pheno_gtots_ptr;
-  fill_ulong_zero(pheno_nm_11, unfiltered_sample_ctl2);
-  fill_ulong_zero(pheno_nm_male_11, unfiltered_sample_ctl2);
   for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
     uii = cluster_end;
     cluster_end = cluster_starts[cluster_idx + 1];
@@ -11379,15 +10820,15 @@ int32_t cluster_assoc_init(const char* flag_name, uintptr_t unfiltered_sample_ct
     sample_ct += ctrl_ct + case_ct;
     case_ct_total += case_ct;
     if (cluster_bitfield) {
-      SET_BIT(cluster_bitfield, cluster_idx);
+      SET_BIT(cluster_idx, cluster_bitfield);
     }
     cluster_ct2++;
   }
-  bitfield_andnot_copy(unfiltered_sample_ctl2, pheno_nm_nonmale_11, pheno_nm_11, pheno_nm_male_11);
-  wkspace_shrink_top(cluster_pheno_gtots, cluster_ct2 * 4 * sizeof(int32_t));
-  if (wkspace_alloc_ui_checked(cur_cluster_pheno_gtots_ptr, cluster_ct2 * 2 * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(cluster_geno_cts_ptr, cluster_ct2 * 4 * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(loadbuf_raw_ptr, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  bitvec_andnot_copy(pheno_nm_11, pheno_nm_male_11, unfiltered_sample_ctl2, pheno_nm_nonmale_11);
+  bigstack_shrink_top(cluster_pheno_gtots, cluster_ct2 * 4 * sizeof(int32_t));
+  if (bigstack_alloc_ui(cluster_ct2 * 2, cur_cluster_pheno_gtots_ptr) ||
+      bigstack_alloc_ui(cluster_ct2 * 4, cluster_geno_cts_ptr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, loadbuf_raw_ptr)) {
     return RET_NOMEM;
   }
   if (cluster_ct2 < 2) {
@@ -11456,19 +10897,19 @@ int32_t cluster_assoc_load_one(FILE* bedfile, uintptr_t bed_offset, uintptr_t* m
       }
     }
     if (chrom_name_len_ptr) {
-      *chrom_name_pp = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, chrom_idx, chrom_name_len_ptr);
+      *chrom_name_pp = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, chrom_name_len_ptr, chrom_name_buf);
     } else {
       // --mh2
-      // chrom_name_buf = tbuf in this case, and we return wptr_start
-      *chrom_name_pp = chrom_name_write(chrom_name_buf, chrom_info_ptr, chrom_idx);
+      // chrom_name_buf = g_textbuf in this case, and we return wptr_start
+      *chrom_name_pp = chrom_name_write(chrom_info_ptr, chrom_idx, chrom_name_buf);
       *(*chrom_name_pp)++ = '\t';
     }
   }
-  if (load_raw(bedfile, loadbuf_raw, unfiltered_sample_ct4)) {
+  if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
     return RET_READ_FAIL;
   }
   if (IS_SET(marker_reverse, marker_uidx)) {
-    reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
+    reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf_raw);
   }
   if (min_ploidy_1 && hh_or_mt_exists) {
     haploid_fix(hh_or_mt_exists, sample_hh_include2, sample_hh_male_include2, unfiltered_sample_ct, *is_x_ptr, *is_y_ptr, (unsigned char*)loadbuf_raw);
@@ -11518,7 +10959,7 @@ int32_t cluster_assoc_load_one(FILE* bedfile, uintptr_t bed_offset, uintptr_t* m
 }
 
 int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t cmh_mperm_val, uint32_t cmh_modifier, double ci_size, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info*  [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   FILE* outfile_msa = NULL;
   uintptr_t* sample_hh_include2 = NULL;
@@ -11612,7 +11053,7 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   }
 
   memcpy(outname_end, ".cmh", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto cmh_assoc_ret_OPEN_FAIL;
   }
   if (ci_size == 0.0) {
@@ -11622,8 +11063,8 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   LOGPRINTFWW5("Writing report to %s ... ", outname);
   fputs("0%", stdout);
   fflush(stdout);
-  sprintf(tbuf, " CHR %%%us         BP   A1      MAF   A2      CHISQ          P         OR         SE        ", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us         BP   A1      MAF   A2      CHISQ          P         OR         SE        ", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   uii = (uint32_t)((int32_t)(ci_size * 100));
   if (uii >= 10) {
     fprintf(outfile, "L%u        U%u ", uii, uii);
@@ -11642,11 +11083,11 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_or_mt_exists, 1, pheno_nm, sex_male, &sample_hh_include2, &sample_hh_male_include2)) {
     goto cmh_assoc_ret_NOMEM;
   }
-  if (wkspace_alloc_d_checked(&orig_chisq, marker_ct * sizeof(double))) {
+  if (bigstack_alloc_d(marker_ct, &orig_chisq)) {
     goto cmh_assoc_ret_NOMEM;
   }
   if (perm_bd) {
-    if (wkspace_alloc_ui_checked(&orig_df, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(marker_ct, &orig_df)) {
       goto cmh_assoc_ret_NOMEM;
     }
   }
@@ -11706,44 +11147,44 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
       *dptr++ = -9;
     }
     if ((pfilter == 2.0) || ((pval <= pfilter) && (pval != -9))) {
-      wptr = memcpyax(tbuf, chrom_name_ptr, chrom_name_len, ' ');
+      wptr = memcpyax(g_textbuf, chrom_name_ptr, chrom_name_len, ' ');
       wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr);
       *wptr++ = ' ';
-      wptr = uint32_writew10x(wptr, marker_pos[marker_uidx], ' ');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = uint32toa_w10x(marker_pos[marker_uidx], ' ', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto cmh_assoc_ret_WRITE_FAIL;
       }
       fputs_w4(marker_allele_ptrs[marker_uidx * 2], outfile);
-      tbuf[0] = ' ';
-      wptr = double_g_writewx4x(&(tbuf[1]), 1.0 - set_allele_freqs[marker_uidx], 8, ' ');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      g_textbuf[0] = ' ';
+      wptr = dtoa_g_wxp4x(1.0 - set_allele_freqs[marker_uidx], 8, ' ', &(g_textbuf[1]));
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto cmh_assoc_ret_WRITE_FAIL;
       }
       fputs_w4(marker_allele_ptrs[marker_uidx * 2 + 1], outfile);
       if (realnum(cmh_stat)) {
-	tbuf[0] = ' ';
-	wptr = double_g_writewx4x(&(tbuf[1]), cmh_stat, 10, ' ');
-	wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 10, ' ');
+	g_textbuf[0] = ' ';
+	wptr = dtoa_g_wxp4x(cmh_stat, 10, ' ', &(g_textbuf[1]));
+	wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 10, ' ', wptr);
       } else {
-        wptr = memcpya(tbuf, "         NA         NA ", 23);
+        wptr = memcpya(g_textbuf, "         NA         NA ", 23);
       }
       if (realnum(odds_ratio)) {
-        wptr = double_g_writewx4x(wptr, odds_ratio, 10, ' ');
+        wptr = dtoa_g_wxp4x(odds_ratio, 10, ' ', wptr);
       } else {
 	wptr = memcpya(wptr, "        NA ", 11);
       }
       if (realnum(se)) {
-        wptr = double_g_writewx4x(wptr, se, 10, ' ');
+        wptr = dtoa_g_wxp4x(se, 10, ' ', wptr);
 	dxx = ci_zt * se;
 	dyy = exp(log_or - dxx);
 	if (realnum(dyy)) {
-          wptr = double_g_writewx4x(wptr, dyy, 10, ' ');
+          wptr = dtoa_g_wxp4x(dyy, 10, ' ', wptr);
 	} else {
 	  wptr = memcpya(wptr, "        NA ", 11);
 	}
 	dyy = exp(log_or + dxx);
         if (realnum(dyy)) {
-          wptr = double_g_writewx4x(wptr, dyy, 10, ' ');
+          wptr = dtoa_g_wxp4x(dyy, 10, ' ', wptr);
 	} else {
 	  wptr = memcpya(wptr, "        NA ", 11);
 	}
@@ -11785,8 +11226,8 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	  }
 	  pval = chiprob_p(bdx2, cur_df);
 	  if (pval > -1) {
-	    wptr = double_g_writewx4x(wptr, bdx2, 10, ' ');
-	    wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 10, ' ');
+	    wptr = dtoa_g_wxp4x(bdx2, 10, ' ', wptr);
+	    wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 10, ' ', wptr);
 	  } else {
 	    goto cmh_assoc_bd_fail;
 	  }
@@ -11796,7 +11237,7 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
 	}
       }
       *wptr++ = '\n';
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto cmh_assoc_ret_WRITE_FAIL;
       }
     }
@@ -11821,7 +11262,7 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
   fputs("\b\b", stdout);
   logprint("done.\n");
   if (mtest_adjust) {
-    if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
       goto cmh_assoc_ret_NOMEM;
     }
     fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
@@ -11874,14 +11315,14 @@ int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char*
     break;
   }
  cmh_assoc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_msa);
   return retval;
 }
 
 int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_o [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uintptr_t* sample_hh_include2 = NULL;
   uintptr_t* sample_hh_male_include2 = NULL;
@@ -11962,15 +11403,15 @@ int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     goto cmh2_assoc_ret_INVALID_CMDLINE;
   }
 #endif
-  if (wkspace_alloc_d_checked(&ty_ctrl, cluster_ct1 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&ty_case, cluster_ct1 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&n0, (cluster_ct1 - 1) * sizeof(double)) ||
-      wkspace_alloc_d_checked(&u0, (cluster_ct1 - 1) * sizeof(double)) ||
-      wkspace_alloc_d_checked(&v0, (cluster_ct1 - 1) * (cluster_ct1 - 1) * sizeof(double)) ||
-      wkspace_alloc_d_checked(&dbl_2d_buf, (cluster_ct1 - 1) * (cluster_ct1 - 1) * sizeof(double))) {
+  if (bigstack_alloc_d(cluster_ct1, &ty_ctrl) ||
+      bigstack_alloc_d(cluster_ct1, &ty_case) ||
+      bigstack_alloc_d(cluster_ct1 - 1, &n0) ||
+      bigstack_alloc_d(cluster_ct1 - 1, &u0) ||
+      bigstack_alloc_d((cluster_ct1 - 1) * (cluster_ct1 - 1), &v0) ||
+      bigstack_alloc_d((cluster_ct1 - 1) * (cluster_ct1 - 1), &dbl_2d_buf)) {
     goto cmh2_assoc_ret_NOMEM;
   }
-  mi_buf = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc((cluster_ct1 - 1) * sizeof(MATRIX_INVERT_BUF1_TYPE));
+  mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc((cluster_ct1 - 1) * sizeof(MATRIX_INVERT_BUF1_TYPE));
   if (!mi_buf) {
     goto cmh2_assoc_ret_NOMEM;
   }
@@ -11984,7 +11425,7 @@ int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     goto cmh2_assoc_ret_READ_FAIL;
   }
   memcpy(outname_end, ".cmh2", 6);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto cmh2_assoc_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing report to %s ... ", outname);
@@ -11995,7 +11436,7 @@ int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
   }
   loop_end = marker_ct / 100;
   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
-    if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, tbuf, cluster_ct1, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &wptr_start, NULL)) {
+    if (cluster_assoc_load_one(bedfile, bed_offset, marker_exclude, unfiltered_sample_ct, sample_hh_include2, sample_hh_male_include2, loadbuf_raw, pheno_nm_11, pheno_nm_nonmale_11, pheno_nm_male_11, marker_reverse, chrom_info_ptr, hh_or_mt_exists, g_textbuf, cluster_ct1, sample_to_cluster_pheno, cluster_pheno_gtots, cur_cluster_pheno_gtots, cluster_geno_cts, &marker_uidx, &chrom_end, &chrom_fo_idx, &min_ploidy_1, &is_x, &is_y, &wptr_start, NULL)) {
       goto cmh2_assoc_ret_READ_FAIL;
     }
     wptr = strcpyax(wptr_start, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
@@ -12078,16 +11519,16 @@ int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
 	}
 	chisq += dxx * (dbl_2d_buf[cluster_idx]);
       }
-      wptr = double_g_writex(wptr, chisq, '\t');
-      wptr = uint32_writex(wptr, cur_cluster_ctm1, '\t');
+      wptr = dtoa_gx(chisq, '\t', wptr);
+      wptr = uint32toa_x(cur_cluster_ctm1, '\t', wptr);
       dxx = chiprob_p(chisq, (int32_t)cur_cluster_ctm1);
-      wptr = double_g_writex(wptr, MAXV(dxx, output_min_p), '\n');
+      wptr = dtoa_gx(MAXV(dxx, output_min_p), '\n', wptr);
     } else {
     cmh2_assoc_fail:
       wptr = memcpya(wptr, "NA\tNA\tNA\n", 9);
     }
   cmh2_assoc_fail2:
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto cmh2_assoc_ret_WRITE_FAIL;
     }
     if (marker_idx >= loop_end) {
@@ -12127,19 +11568,19 @@ int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
 
 int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* clu [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* outfile = NULL;
   uintptr_t* sample_hh_include2 = NULL;
   uintptr_t* sample_hh_male_include2 = NULL;
-  char* writebuf = tbuf;
+  char* writebuf = g_textbuf;
   char* chrom_name_ptr = NULL;
-  uintptr_t topsize = 0;
   uint32_t cluster_ct2 = 0;
   uint32_t chrom_fo_idx = 0xffffffffU;
   uint32_t chrom_end = 0;
@@ -12191,25 +11632,22 @@ int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
   uint32_t loop_end;
   ulii = 2 * max_marker_allele_len + MAX_ID_LEN + max_marker_id_len + max_cluster_id_len + 256;
   if (ulii > MAXLINELEN) {
-    if (wkspace_alloc_c_checked(&writebuf, ulii)) {
+    if (bigstack_alloc_c(ulii, &writebuf)) {
       goto homog_assoc_ret_NOMEM;
     }
   }
-  ulii = ((cluster_ct + (BITCT - 1)) / BITCT);
-  cluster_bitfield = (uintptr_t*)top_alloc(&topsize, ulii * sizeof(intptr_t));
-  fill_ulong_zero(cluster_bitfield, ulii);
-  wkspace_left -= topsize;
+  if (bigstack_end_calloc_ul(BITCT_TO_WORDCT(cluster_ct), &cluster_bitfield)) {
+    goto homog_assoc_ret_NOMEM;
+  }
   // Factor out common initialization with cmh_assoc().
   retval = cluster_assoc_init("--homog", unfiltered_sample_ct, pheno_nm, pheno_c, sex_male, cluster_ct, cluster_map, cluster_starts, cluster_bitfield, &pheno_nm_11, &pheno_nm_nonmale_11, &pheno_nm_male_11, &sample_to_cluster_pheno, &cluster_pheno_gtots, &cur_cluster_pheno_gtots, &cluster_geno_cts, &loadbuf_raw, &cluster_ct2);
   if (retval) {
-    wkspace_left += topsize;
     goto homog_assoc_ret_1;
   }
   if (cluster_ct == cluster_ct2) {
     cluster_ids_collapsed = cluster_ids;
   } else {
-    if (wkspace_alloc_c_checked(&cluster_ids_collapsed, cluster_ct2 * max_cluster_id_len)) {
-      wkspace_left += topsize;
+    if (bigstack_alloc_c(cluster_ct2 * max_cluster_id_len, &cluster_ids_collapsed)) {
       goto homog_assoc_ret_NOMEM;
     }
     for (ulii = 0, cluster_idx = 0; cluster_idx < cluster_ct2; ulii++, cluster_idx++) {
@@ -12217,12 +11655,12 @@ int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
       memcpy(&(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), &(cluster_ids[ulii * max_cluster_id_len]), max_cluster_id_len);
     }
   }
-  wkspace_left += topsize;
+  bigstack_end_reset(bigstack_end_mark);
   cluster_ct2d = (double)((int32_t)cluster_ct2);
   cluster_ct2m1d = (double)((int32_t)cluster_ct2 - 1);
-  if (wkspace_alloc_d_checked(&cluster_tables, cluster_ct2 * 4 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&cluster_or, cluster_ct2 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&cluster_chisq, cluster_ct2 * sizeof(double))) {
+  if (bigstack_alloc_d(cluster_ct2 * 4, &cluster_tables) ||
+      bigstack_alloc_d(cluster_ct2, &cluster_or) ||
+      bigstack_alloc_d(cluster_ct2, &cluster_chisq)) {
     goto homog_assoc_ret_NOMEM;
   }
   if (cluster_ct2 > 10) {
@@ -12230,15 +11668,15 @@ int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
   }
 
   memcpy(outname_end, ".homog", 7);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto homog_assoc_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing report to %s ... ", outname);
   fputs("0%", stdout);
   fflush(stdout);
   // misaligned for backward compatibility
-  sprintf(tbuf, " CHR %%%us   A1   A2      F_A      F_U      N_A      N_U     TEST      CHISQ   DF          P         OR\n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us   A1   A2      F_A      F_U      N_A      N_U     TEST      CHISQ   DF          P         OR\n", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   if (chrom_info_ptr->mt_code != -1) {
     hh_or_mt_exists |= NXMHH_EXISTS;
   }
@@ -12291,26 +11729,26 @@ int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
     *wptr_start++ = ' ';
     wptr_start = memcpya(wptr_start, "      NA       NA       NA       NA ", 36);
     wptr = memcpya(wptr_start, " TOTAL ", 7);
-    wptr = double_g_writewx4x(wptr, x_total, 10, ' ');
-    wptr = uint32_writew4x(wptr, cluster_ct2, ' ');
-    wptr = double_g_writewx4x(wptr, chiprob_p(x_total, cluster_ct2d), 10, ' ');
+    wptr = dtoa_g_wxp4x(x_total, 10, ' ', wptr);
+    wptr = uint32toa_w4x(cluster_ct2, ' ', wptr);
+    wptr = dtoa_g_wxp4x(chiprob_p(x_total, cluster_ct2d), 10, ' ', wptr);
     wptr = memcpya(wptr, "        NA\n", 11);
     if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
       goto homog_assoc_ret_WRITE_FAIL;
     }
     wptr = memcpya(wptr_start, " ASSOC ", 7);
-    wptr = double_g_writewx4(wptr, x_assoc, 10);
+    wptr = dtoa_g_wxp4(x_assoc, 10, wptr);
     wptr = memcpya(wptr, "    1 ", 6);
-    wptr = double_g_writewx4x(wptr, chiprob_p(x_assoc, 1), 10, ' ');
+    wptr = dtoa_g_wxp4x(chiprob_p(x_assoc, 1), 10, ' ', wptr);
     wptr = memcpya(wptr, "        NA\n", 11);
     if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
       goto homog_assoc_ret_WRITE_FAIL;
     }
     dxx = x_total - x_assoc;
     wptr = memcpya(wptr_start, " HOMOG ", 7);
-    wptr = double_g_writewx4x(wptr, dxx, 10, ' ');
-    wptr = uint32_writew4x(wptr, cluster_ct2 - 1, ' ');
-    wptr = double_g_writewx4x(wptr, chiprob_p(dxx, cluster_ct2m1d), 10, ' ');
+    wptr = dtoa_g_wxp4x(dxx, 10, ' ', wptr);
+    wptr = uint32toa_w4x(cluster_ct2 - 1, ' ', wptr);
+    wptr = dtoa_g_wxp4x(chiprob_p(dxx, cluster_ct2m1d), 10, ' ', wptr);
     wptr = memcpya(wptr, "        NA\n", 11);
     if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
       goto homog_assoc_ret_WRITE_FAIL;
@@ -12321,15 +11759,15 @@ int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
       ctrl_ctd = dptr[2] + dptr[3];
       if ((case_ctd < 1.5) || (ctrl_ctd < 1.5)) {
 	wptr = memcpya(wptr_start, "      NA       NA ", 18);
-	wptr = double_g_writewx4x(wptr, case_ctd - 1, 8, ' ');
-	wptr = double_g_writewx4x(wptr, ctrl_ctd - 1, 8, ' ');
+	wptr = dtoa_g_wxp4x(case_ctd - 1, 8, ' ', wptr);
+	wptr = dtoa_g_wxp4x(ctrl_ctd - 1, 8, ' ', wptr);
 	wptr = fw_strcpy(6, &(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), wptr);
         wptr = memcpya(wptr, "         NA   NA         NA         NA\n", 39);
       } else {
-        wptr = double_g_writewx4x(wptr_start, dptr[0] / case_ctd, 8, ' ');
-        wptr = double_g_writewx4x(wptr, dptr[2] / ctrl_ctd, 8, ' ');
-	wptr = double_g_writewx4x(wptr, case_ctd - 1, 8, ' ');
-	wptr = double_g_writewx4x(wptr, ctrl_ctd - 1, 8, ' ');
+        wptr = dtoa_g_wxp4x(dptr[0] / case_ctd, 8, ' ', wptr_start);
+        wptr = dtoa_g_wxp4x(dptr[2] / ctrl_ctd, 8, ' ', wptr);
+	wptr = dtoa_g_wxp4x(case_ctd - 1, 8, ' ', wptr);
+	wptr = dtoa_g_wxp4x(ctrl_ctd - 1, 8, ' ', wptr);
 	wptr = fw_strcpy(6, &(cluster_ids_collapsed[cluster_idx * max_cluster_id_len]), wptr);
 	*wptr++ = ' ';
 	dxx = cluster_chisq[cluster_idx];
@@ -12337,12 +11775,12 @@ int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
 	  // probably rounding error
 	  dxx = 0;
 	}
-        wptr = double_g_writewx4(wptr, dxx, 10);
+        wptr = dtoa_g_wxp4(dxx, 10, wptr);
         wptr = memcpya(wptr, "    1 ", 6);
-	wptr = double_g_writewx4x(wptr, MAXV(chiprob_p(dxx, 1), output_min_p), 10, ' ');
+	wptr = dtoa_g_wxp4x(MAXV(chiprob_p(dxx, 1), output_min_p), 10, ' ', wptr);
 	dxx = cluster_or[cluster_idx];
         if (realnum(dxx)) {
-          wptr = double_g_writewx4x(wptr, dxx, 10, '\n');
+          wptr = dtoa_g_wxp4x(dxx, 10, '\n', wptr);
 	} else {
 	  wptr = memcpya(wptr, "        NA\n", 11);
 	}
@@ -12386,7 +11824,7 @@ int32_t homog_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
     break;
   }
  homog_assoc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(outfile);
   return retval;
 }
diff --git a/plink_assoc.h b/plink_assoc.h
index 97aeae3..2d81f4b 100644
--- a/plink_assoc.h
+++ b/plink_assoc.h
@@ -20,8 +20,6 @@ int32_t gxe_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outn
 
 int32_t testmiss(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t testmiss_mperm_val, uint32_t testmiss_modifier, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* clus [...]
 
-int32_t make_perm_pheno(pthread_t* threads, char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, char* output_missing_pheno, uint32_t permphe_ct);
-
 int32_t cmh_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t cmh_mperm_val, uint32_t cmh_modifier, double ci_size, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info*  [...]
 
 int32_t cmh2_assoc(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* sex_male, uint32_t hh_exists);
diff --git a/plink_calc.c b/plink_calc.c
index 3595665..ee7bb02 100644
--- a/plink_calc.c
+++ b/plink_calc.c
@@ -538,7 +538,7 @@ void exclude_multi(uintptr_t* exclude_arr, int32_t* new_excl, uint32_t unfiltere
     sample_uidx_stop = next_set(exclude_arr, sample_uidx, unfiltered_sample_ct);
     do {
       if (*new_excl++ == -1) {
-        SET_BIT(exclude_arr, sample_uidx);
+        SET_BIT(sample_uidx, exclude_arr);
 	exclude_ct++;
       }
     } while (++sample_uidx < sample_uidx_stop);
@@ -584,7 +584,7 @@ static inline uint32_t popcount_xor_1mask_multiword(__m128i** xor1p, __m128i* xo
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   __m128i count1, count2, half1, half2;
-  __uni16 acc;
+  __univec acc;
   __m128i* xor2_end = &(xor2[MULTIPLEX_2DIST / 128]);
 
   acc.vi = _mm_setzero_si128();
@@ -619,7 +619,7 @@ static inline uint32_t popcount_xor_2mask_multiword(__m128i** xor1p, __m128i* xo
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   __m128i count1, count2, half1, half2;
-  __uni16 acc;
+  __univec acc;
   __m128i* xor2_end = &(xor2[MULTIPLEX_2DIST / 128]);
 
   acc.vi = _mm_setzero_si128();
@@ -736,6 +736,7 @@ static double g_reg_tot_xx;
 static double g_reg_tot_yy;
 static uint32_t g_ctrl_ct;
 static uint32_t g_case_ct;
+static uintptr_t g_sample_ct;
 static uintptr_t g_jackknife_iters;
 static uint32_t g_jackknife_d;
 static double g_calc_result[MAX_THREADS_P1][9];
@@ -1123,8 +1124,8 @@ void incr_genome(uint32_t* genome_main, uintptr_t* geno, uintptr_t* masks, uintp
   __m128i count_ibs0;
   __m128i count2_ibs1;
   __m128i count2_ibs0;
-  __uni16 acc_ibs1;
-  __uni16 acc_ibs0;
+  __univec acc_ibs1;
+  __univec acc_ibs0;
   uintptr_t* lptr;
   __m128i* glptr;
   __m128i* glptr_fixed;
@@ -1453,7 +1454,7 @@ void incr_genome(uint32_t* genome_main, uintptr_t* geno, uintptr_t* masks, uintp
 	  do {
 	    offset = next_ppc_marker_hybrid / BITCT;
 	    marker_window_ptr = &(g_marker_window[offset * BITCT]);
-	    next_ppc_marker_hybrid = ~ZEROLU << (next_ppc_marker_hybrid & (BITCT - 1));
+	    next_ppc_marker_hybrid = (~ZEROLU) << (next_ppc_marker_hybrid & (BITCT - 1));
 	  incr_genome_1mask_loop:
 	    uland = glptr_back[offset] & (((uintptr_t*)glptr_fixed)[offset]);
 	    ulval = ((uland & (uland << 1)) & AAAAMASK) | (((uintptr_t*)xor_buf)[offset]);
@@ -1837,8 +1838,8 @@ THREAD_RET_TYPE groupdist_jack_thread(void* arg) {
   uint32_t case_ct = g_case_ct;
   uint32_t ctrl_ct = g_ctrl_ct;
   uint32_t jackknife_d = g_jackknife_d;
-  uint32_t* uibuf = (uint32_t*)(&(g_geno[tidx * CACHEALIGN(case_ct + ctrl_ct + (jackknife_d + 1) * sizeof(int32_t))]));
-  unsigned char* cbuf = &(g_geno[tidx * CACHEALIGN(case_ct + ctrl_ct + (jackknife_d + 1) * sizeof(int32_t)) + (jackknife_d + 1) * sizeof(int32_t)]);
+  uint32_t* uibuf = (uint32_t*)(&(g_geno[tidx * round_up_pow2(case_ct + ctrl_ct + (jackknife_d + 1) * sizeof(int32_t), CACHELINE)]));
+  unsigned char* cbuf = &(g_geno[tidx * round_up_pow2(case_ct + ctrl_ct + (jackknife_d + 1) * sizeof(int32_t), CACHELINE) + (jackknife_d + 1) * sizeof(int32_t)]);
   uintptr_t jackknife_iters = g_jackknife_iters;
   uintptr_t uljj = jackknife_iters / 100;
   double reg_tot_x = g_reg_tot_x;
@@ -1940,8 +1941,8 @@ THREAD_RET_TYPE regress_rel_jack_thread(void* arg) {
   double reg_tot_xx = g_reg_tot_xx;
   double reg_tot_yy = g_reg_tot_yy;
   uint32_t jackknife_d = g_jackknife_d;
-  uint32_t* uibuf = (uint32_t*)(&(g_geno[tidx * CACHEALIGN(sample_ct + (jackknife_d + 1) * sizeof(int32_t))]));
-  unsigned char* cbuf = &(g_geno[tidx * CACHEALIGN(sample_ct + (jackknife_d + 1) * sizeof(int32_t)) + (jackknife_d + 1) * sizeof(int32_t)]);
+  uint32_t* uibuf = (uint32_t*)(&(g_geno[tidx * round_up_pow2(sample_ct + (jackknife_d + 1) * sizeof(int32_t), CACHELINE)]));
+  unsigned char* cbuf = &(g_geno[tidx * round_up_pow2(sample_ct + (jackknife_d + 1) * sizeof(int32_t), CACHELINE) + (jackknife_d + 1) * sizeof(int32_t)]);
   double* jackknife_precomp = g_jackknife_precomp;
   double* rel_dists = g_rel_dists;
   double* pheno_packed = g_pheno_packed;
@@ -1995,7 +1996,7 @@ uint32_t set_default_jackknife_d(uint32_t ct) {
 }
 
 int32_t regress_rel_main(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Rel_info* relip, pthread_t* threads, double* pheno_d) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t regress_rel_iters = relip->regress_rel_iters;
   double reg_tot_xy = 0;
   double reg_tot_x = 0;
@@ -2019,7 +2020,7 @@ int32_t regress_rel_main(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   double dxxsq;
   double dyysq;
   g_sample_ct = sample_ct;
-  if (wkspace_alloc_d_checked(&g_pheno_packed, sample_ct * sizeof(double))) {
+  if (bigstack_alloc_d(sample_ct, &g_pheno_packed)) {
     return RET_NOMEM;
   }
   collapse_copy_phenod(g_pheno_packed, pheno_d, sample_exclude, unfiltered_sample_ct, sample_ct);
@@ -2027,10 +2028,9 @@ int32_t regress_rel_main(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   trimatrix_size = ((uintptr_t)sample_ct * (sample_ct - 1)) / 2;
   rel_ptr = g_rel_dists;
   pheno_ptr = g_pheno_packed;
-  if (wkspace_alloc_d_checked(&g_jackknife_precomp, sample_ct * JACKKNIFE_VALS_REL * sizeof(double))) {
+  if (bigstack_calloc_d(sample_ct * JACKKNIFE_VALS_REL, &g_jackknife_precomp)) {
     return RET_NOMEM;
   }
-  fill_double_zero(g_jackknife_precomp, sample_ct * JACKKNIFE_VALS_REL);
   for (uii = 1; uii < sample_ct; uii++) {
     half_avg_pheno = *(++pheno_ptr);
     pheno_ptr2 = g_pheno_packed;
@@ -2078,10 +2078,10 @@ int32_t regress_rel_main(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   } else {
     g_jackknife_d = set_default_jackknife_d(sample_ct);
   }
-  if (wkspace_alloc_uc_checked(&g_geno, g_thread_ct * CACHEALIGN(sample_ct * (g_jackknife_d + 1) * sizeof(int32_t)))) {
+  if (bigstack_alloc_uc(g_thread_ct * round_up_pow2(sample_ct * (g_jackknife_d + 1) * sizeof(int32_t), CACHELINE), &g_geno)) {
     return RET_NOMEM;
   }
-  if (wkspace_init_sfmtp(g_thread_ct)) {
+  if (bigstack_init_sfmtp(g_thread_ct)) {
     return RET_NOMEM;
   }
   if (spawn_threads(threads, &regress_rel_jack_thread, g_thread_ct)) {
@@ -2106,7 +2106,7 @@ int32_t regress_rel_main(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   putchar('\r');
   LOGPRINTF("Jackknife s.e. (y = genomic relationship): %g\n", sqrt(((sample_ct - g_jackknife_d) / (double)g_jackknife_d) * (dxxsq - dxx * dxx / (double)ulii) / ((double)ulii - 1)));
   LOGPRINTF("               (y = phenotype): %g\n", sqrt(((sample_ct - g_jackknife_d) / (double)g_jackknife_d) * (dyysq - dyy * dyy / (double)ulii) / ((double)ulii - 1)));
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return 0;
 }
 
@@ -2216,7 +2216,7 @@ void reml_em_one_trait(double* wkbase, double* pheno, double* covg_ref, double*
   char blas_char;
   int32_t sample_ct_i32 = sample_ct;
 #endif
-  mat_offset = CACHEALIGN_DBL(mat_offset * mat_offset);
+  mat_offset = round_up_pow2_ull(mat_offset * mat_offset, CACHELINE);
   rel_dists = &(wkbase[mat_offset]);
   row = &(wkbase[mat_offset * 3]);
   irow = (MATRIX_INVERT_BUF1_TYPE*)row;
@@ -2338,8 +2338,8 @@ void reml_em_one_trait(double* wkbase, double* pheno, double* covg_ref, double*
     fflush(stdout);
   } while (ll_change > tol);
   putchar('\n');
-  sprintf(logbuf, "covg: %g  covr: %g\n", *covg_ref, *covr_ref);
-  logstr(logbuf);
+  sprintf(g_logbuf, "covg: %g  covr: %g\n", *covg_ref, *covr_ref);
+  logstr(g_logbuf);
 }
 
 void mean_zero_var_one_in_place(uint32_t sample_ct, double* pheno_d) {
@@ -2382,15 +2382,15 @@ int32_t calc_unrelated_herit(uint64_t calculation_type, Rel_info* relip, uintptr
   g_sample_ct = sample_ct;
   g_missing_dbl_excluded = NULL;
   ulii = sample_ct;
-  ulii = CACHEALIGN_DBL(ulii * ulii);
+  ulii = round_up_pow2(ulii * ulii, CACHELINE_DBL);
   rel_base = &(g_rel_dists[ulii]);
-  ulii = ulii * 3 + CACHEALIGN_DBL(sample_ct) * 3;
-  // no wkspace_shrink_top here since this actually grows the allocation...
-  wkspace_reset(g_rel_dists);
-  if (wkspace_alloc_d_checked(&g_rel_dists, ulii * sizeof(double))) {
+  ulii = ulii * 3 + round_up_pow2(sample_ct, CACHELINE_DBL) * 3;
+  // no bigstack_shrink_top here since this actually grows the allocation...
+  bigstack_reset(g_rel_dists);
+  if (bigstack_alloc_d(ulii, &g_rel_dists)) {
     return RET_NOMEM;
   }
-  pheno_ptr = &(g_rel_dists[ulii - CACHEALIGN_DBL(sample_ct)]);
+  pheno_ptr = &(g_rel_dists[ulii - round_up_pow2(sample_ct, CACHELINE_DBL)]);
   collapse_copy_phenod(pheno_ptr, pheno_d, sample_exclude, unfiltered_sample_ct, sample_ct);
   mean_zero_var_one_in_place(sample_ct, pheno_ptr);
   if (calculation_type & CALC_IBC) {
@@ -2415,8 +2415,8 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
   FILE* infile = NULL;
   FILE* grm_binfile = NULL;
   gzFile grm_gzfile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t topsize = 0;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uintptr_t max_sample_id_len = 4;
   uintptr_t unfiltered_sample_ct = 0;
   uintptr_t sample_uidx = 0;
@@ -2451,17 +2451,17 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
   // 3. collapse phenotypes if necessary,  load (subset of) relationship matrix
   // 4. call reml_em_one_trait()
   memcpy(grmname_end, ".grm.id", 8);
-  if (fopen_checked(&infile, grmname, "r")) {
+  if (fopen_checked(grmname, "r", &infile)) {
     goto unrelated_herit_batch_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, grmname);
       goto unrelated_herit_batch_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -2486,21 +2486,14 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
     goto unrelated_herit_batch_ret_INVALID_FORMAT;
   }
   rewind(infile);
-  unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  if (wkspace_alloc_ul_checked(&pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t))) {
-    goto unrelated_herit_batch_ret_NOMEM;
-  }
-  sorted_ids = (char*)top_alloc(&topsize, unfiltered_sample_ct * max_sample_id_len);
-  if (!sorted_ids) {
+  unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  if (bigstack_calloc_ul(unfiltered_sample_ctl, &pheno_nm) ||
+      bigstack_end_alloc_c(unfiltered_sample_ct * max_sample_id_len, &sorted_ids) ||
+      bigstack_end_alloc_ui(unfiltered_sample_ct, &id_map)) {
     goto unrelated_herit_batch_ret_NOMEM;
   }
-  id_map = (uint32_t*)top_alloc(&topsize, unfiltered_sample_ct * sizeof(int32_t));
-  if (!id_map) {
-    goto unrelated_herit_batch_ret_NOMEM;
-  }
-  fill_ulong_zero(pheno_nm, unfiltered_sample_ctl);
-  while (fgets(tbuf, MAXLINELEN, infile)) {
-    bufptr = skip_initial_spaces(tbuf);
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -2519,20 +2512,16 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
   for (sample_uidx = 0; sample_uidx < unfiltered_sample_ct; sample_uidx++) {
     id_map[sample_uidx] = sample_uidx;
   }
-  wkspace_left -= topsize;
   if (qsort_ext(sorted_ids, unfiltered_sample_ct, max_sample_id_len, strcmp_deref, (char*)id_map, sizeof(int32_t))) {
-    goto unrelated_herit_batch_ret_NOMEM2;
+    goto unrelated_herit_batch_ret_NOMEM;
   }
-  wkspace_left += topsize;
 
   fclose_null(&infile);
-  if (fopen_checked(&infile, phenoname, "r")) {
+  if (fopen_checked(phenoname, "r", &infile)) {
     goto unrelated_herit_batch_ret_OPEN_FAIL;
   }
-  wkspace_left -= topsize;
   retval = load_pheno(infile, unfiltered_sample_ct, 0, sorted_ids, max_sample_id_len, id_map, missing_pheno, 0, mpheno_col, phenoname_str, pheno_nm, &pheno_c, &pheno_d, NULL, 0);
-  wkspace_left += topsize;
-  // topsize = 0; (sorted_ids and id_map no longer used)
+  bigstack_end_reset(bigstack_end_mark);
   fclose_null(&infile);
   if (retval) {
     goto unrelated_herit_batch_ret_1;
@@ -2546,20 +2535,20 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
     logerrprint("Error: Less than two phenotypes present.\n");
     goto unrelated_herit_batch_ret_INVALID_FORMAT;
   }
-  ulii = CACHEALIGN_DBL(pheno_nm_ct * pheno_nm_ct);
-  uljj = ulii * 3 + CACHEALIGN_DBL(pheno_nm_ct) * 3;
-  if (wkspace_alloc_d_checked(&matrix_wkbase, ulii * sizeof(double))) {
+  ulii = round_up_pow2(pheno_nm_ct * pheno_nm_ct, CACHELINE_DBL);
+  uljj = ulii * 3 + round_up_pow2(pheno_nm_ct, CACHELINE_DBL) * 3;
+  if (bigstack_alloc_d(ulii, &matrix_wkbase)) {
     goto unrelated_herit_batch_ret_NOMEM;
   }
   g_sample_ct = pheno_nm_ct;
-  pheno_ptr = &(matrix_wkbase[uljj - CACHEALIGN_DBL(pheno_nm_ct)]);
+  pheno_ptr = &(matrix_wkbase[uljj - round_up_pow2(pheno_nm_ct, CACHELINE_DBL)]);
   collapse_copy_phenod_incl(pheno_ptr, pheno_d, pheno_nm, unfiltered_sample_ct, pheno_nm_ct);
   rel_base = &(matrix_wkbase[ulii]);
   mean_zero_var_one_in_place(pheno_nm_ct, pheno_ptr);
   sample_uidx = 0;
   if (load_grm_bin) {
     memcpy(grmname_end, ".grm.bin", 9);
-    if (fopen_checked(&grm_binfile, grmname, "rb")) {
+    if (fopen_checked(grmname, FOPEN_RB, &grm_binfile)) {
       goto unrelated_herit_batch_ret_OPEN_FAIL;
     }
     if (fseeko(grm_binfile, 0, SEEK_END)) {
@@ -2600,31 +2589,32 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
     fclose_null(&grm_binfile);
   } else {
     memcpy(grmname_end, ".grm.gz", 8);
-    if (gzopen_checked(&grm_gzfile, grmname, "rb")) {
-      goto unrelated_herit_batch_ret_OPEN_FAIL;
+    retval = gzopen_read_checked(grmname, &grm_gzfile);
+    if (retval) {
+      goto unrelated_herit_batch_ret_1;
     }
     ulii = 0;
     for (sample_uidx = 0; sample_uidx < pheno_nm_ct; sample_uidx++) {
       if (!IS_SET(pheno_nm, sample_uidx)) {
         for (sample_uidx2 = 0; sample_uidx2 <= sample_uidx; sample_uidx2++) {
-          if (!gzgets(grm_gzfile, tbuf, MAXLINELEN)) {
+          if (!gzgets(grm_gzfile, g_textbuf, MAXLINELEN)) {
 	    goto unrelated_herit_batch_ret_READ_FAIL;
 	  }
-	  if (!tbuf[MAXLINELEN - 1]) {
+	  if (!g_textbuf[MAXLINELEN - 1]) {
 	    goto unrelated_herit_batch_ret_INVALID_FORMAT_3;
 	  }
 	}
       } else {
 	row_ptr = &(rel_base[ulii * pheno_nm_ct]);
 	for (sample_uidx2 = 0; sample_uidx2 <= sample_uidx; sample_uidx2++) {
-	  if (!gzgets(grm_gzfile, tbuf, MAXLINELEN)) {
+	  if (!gzgets(grm_gzfile, g_textbuf, MAXLINELEN)) {
 	    goto unrelated_herit_batch_ret_READ_FAIL;
 	  }
-	  if (!tbuf[MAXLINELEN - 1]) {
+	  if (!g_textbuf[MAXLINELEN - 1]) {
 	    goto unrelated_herit_batch_ret_INVALID_FORMAT_3;
 	  }
 	  if (IS_SET(pheno_nm, sample_uidx2)) {
-	    bufptr = next_token_mult(tbuf, 3);
+	    bufptr = next_token_mult(g_textbuf, 3);
 	    if (no_more_tokens_kns(bufptr)) {
 	      goto unrelated_herit_batch_ret_INVALID_FORMAT_3;
 	    }
@@ -2651,8 +2641,6 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
   reml_em_one_trait(matrix_wkbase, pheno_ptr, &unrelated_herit_covg, &unrelated_herit_covr, unrelated_herit_tol, is_strict);
   LOGPRINTF("h^2 estimate: %g\n", unrelated_herit_covg);
   while (0) {
-  unrelated_herit_batch_ret_NOMEM2:
-    wkspace_left += topsize;
   unrelated_herit_batch_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -2680,18 +2668,18 @@ int32_t unrelated_herit_batch(uint32_t load_grm_bin, char* grmname, char* phenon
   fclose_cond(infile);
   fclose_cond(grm_binfile);
   gzclose_cond(grm_gzfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return retval;
 }
 #endif
 
 int32_t ibs_test_calc(pthread_t* threads, char* read_dists_fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t perm_ct, uintptr_t pheno_nm_ct, uintptr_t pheno_ctrl_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c) {
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
-  uintptr_t perm_ctcl = (perm_ct + (CACHELINE * 8)) / (CACHELINE * 8);
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
+  uintptr_t perm_ctcl = 1 + (perm_ct / CACHELINE_BIT);
   uintptr_t perm_ctclm = perm_ctcl * (CACHELINE / sizeof(intptr_t));
-  uintptr_t perm_ctcld = (perm_ct + CACHELINE_DBL) / CACHELINE_DBL;
+  uintptr_t perm_ctcld = 1 + (perm_ct / CACHELINE_DBL);
   uintptr_t perm_ctcldm = perm_ctcld * CACHELINE_DBL;
   uintptr_t case_ct = pheno_nm_ct - pheno_ctrl_ct;
   uint32_t tidx = 1;
@@ -2754,24 +2742,23 @@ int32_t ibs_test_calc(pthread_t* threads, char* read_dists_fname, uintptr_t unfi
   case_case_ct = (case_ct * (case_ct - 1)) / 2;
   g_perm_ct = perm_ct;
   // g_pheno_nm and g_pheno_c should be NULL
-  if (wkspace_alloc_ul_checked(&g_pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&g_pheno_c, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &g_pheno_nm) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, &g_pheno_c)) {
     goto ibs_test_calc_ret_NOMEM;
   }
-  collapse_copy_bitarr(unfiltered_sample_ct, pheno_nm, sample_exclude, sample_ct, g_pheno_nm);
-  collapse_copy_bitarr(unfiltered_sample_ct, pheno_c, sample_exclude, sample_ct, g_pheno_c);
-  if (wkspace_alloc_d_checked(&g_ibs_test_partial_sums, g_thread_ct * 32 * BITCT * sizeof(double)) ||
-      wkspace_alloc_ul_checked(&perm_rows, perm_ct * pheno_nm_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&g_perm_col_buf, perm_ctclm * sizeof(intptr_t) * g_thread_ct) ||
-      wkspace_alloc_d_checked(&perm_results, 2 * perm_ctcldm * sizeof(double) * g_thread_ct)) {
+  copy_bitarr_subset_excl(pheno_nm, sample_exclude, unfiltered_sample_ct, sample_ct, g_pheno_nm);
+  copy_bitarr_subset_excl(pheno_c, sample_exclude, unfiltered_sample_ct, sample_ct, g_pheno_c);
+  if (bigstack_alloc_d(g_thread_ct * 32 * BITCT, &g_ibs_test_partial_sums) ||
+      bigstack_alloc_ul(perm_ct * pheno_nm_ctl, &perm_rows) ||
+      bigstack_alloc_ul(perm_ctclm * g_thread_ct, &g_perm_col_buf) ||
+      bigstack_calloc_d(2 * perm_ctcldm * g_thread_ct, &perm_results)) {
     goto ibs_test_calc_ret_NOMEM;
   }
   g_perm_results = perm_results;
-  fill_double_zero(perm_results, 2 * perm_ctcldm * g_thread_ct);
   g_perm_rows = perm_rows;
 
   // first permutation = original
-  collapse_copy_bitarr_incl(unfiltered_sample_ct, g_pheno_c, g_pheno_nm, pheno_nm_ct, perm_rows);
+  copy_bitarr_subset(g_pheno_c, g_pheno_nm, unfiltered_sample_ct, pheno_nm_ct, perm_rows);
   for (ulii = pheno_nm_ctl - 1; ulii; ulii--) {
     perm_rows[ulii * perm_ct] = perm_rows[ulii];
   }
@@ -2786,7 +2773,7 @@ int32_t ibs_test_calc(pthread_t* threads, char* read_dists_fname, uintptr_t unfi
   for (ulii = 0; ulii < pheno_nm_ct; ulii++) {
     uljj += ((perm_rows[((ulii / BITCT) * perm_ct)] >> (ulii & (BITCT - 1))) & 1);
   }
-  triangle_fill(g_thread_start, pheno_nm_ct, g_thread_ct, 0, 1, 1, 1);
+  triangle_fill(pheno_nm_ct, g_thread_ct, 0, 1, 1, 1, g_thread_start);
   if (spawn_threads(threads, &ibs_test_thread, g_thread_ct)) {
     goto ibs_test_calc_ret_THREAD_CREATE_FAIL;
   }
@@ -2894,15 +2881,15 @@ int32_t ibs_test_calc(pthread_t* threads, char* read_dists_fname, uintptr_t unfi
     break;
   }
  ibs_test_calc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   g_pheno_nm = NULL;
   g_pheno_c = NULL;
   return retval;
 }
 
 int32_t groupdist_calc(pthread_t* threads, uint32_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t groupdist_iters, uint32_t groupdist_d, uint32_t pheno_nm_ct, uint32_t pheno_ctrl_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c) {
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   double* dist_ptr = g_dists;
   double dhh_ssq = 0.0;
   double dhl_ssq = 0.0;
@@ -2947,14 +2934,14 @@ int32_t groupdist_calc(pthread_t* threads, uint32_t unfiltered_sample_ct, uintpt
   g_ctrl_ct = pheno_ctrl_ct;
   g_sample_ct = sample_ct;
   // g_pheno_nm and g_pheno_c should be NULL
-  if (wkspace_alloc_ul_checked(&pheno_nm_local, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&pheno_c_local, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &pheno_nm_local) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, &pheno_c_local)) {
     goto groupdist_calc_ret_NOMEM;
   }
   g_pheno_nm = pheno_nm_local;
   g_pheno_c = pheno_c_local;
-  collapse_copy_bitarr(unfiltered_sample_ct, pheno_nm, sample_exclude, sample_ct, pheno_nm_local);
-  collapse_copy_bitarr(unfiltered_sample_ct, pheno_c, sample_exclude, sample_ct, pheno_c_local);
+  copy_bitarr_subset_excl(pheno_nm, sample_exclude, unfiltered_sample_ct, sample_ct, pheno_nm_local);
+  copy_bitarr_subset_excl(pheno_c, sample_exclude, unfiltered_sample_ct, sample_ct, pheno_c_local);
   ll_size = ((uintptr_t)g_ctrl_ct * (g_ctrl_ct - 1)) / 2;
   lh_size = g_ctrl_ct * g_case_ct;
   hh_size = ((uintptr_t)g_case_ct * (g_case_ct - 1)) / 2;
@@ -2966,10 +2953,10 @@ int32_t groupdist_calc(pthread_t* threads, uint32_t unfiltered_sample_ct, uintpt
   } else {
     g_jackknife_d = set_default_jackknife_d(g_case_ct + g_ctrl_ct);
   }
-  if (wkspace_alloc_d_checked(&ll_pool, ll_size * sizeof(double)) ||
-      wkspace_alloc_d_checked(&lh_pool, lh_size * sizeof(double)) ||
-      wkspace_alloc_d_checked(&hh_pool, hh_size * sizeof(double)) ||
-      wkspace_alloc_uc_checked(&g_geno, g_thread_ct * CACHEALIGN(g_case_ct + g_ctrl_ct + (g_jackknife_d + 1) * sizeof(int32_t)))) {
+  if (bigstack_alloc_d(ll_size, &ll_pool) ||
+      bigstack_alloc_d(lh_size, &lh_pool) ||
+      bigstack_alloc_d(hh_size, &hh_pool) ||
+      bigstack_alloc_uc(g_thread_ct * round_up_pow2(g_case_ct + g_ctrl_ct + (g_jackknife_d + 1) * sizeof(int32_t), CACHELINE), &g_geno)) {
     goto groupdist_calc_ret_NOMEM;
   }
   ll_poolp = ll_pool;
@@ -3014,9 +3001,9 @@ int32_t groupdist_calc(pthread_t* threads, uint32_t unfiltered_sample_ct, uintpt
       dist_ptr += sample_idx;
     }
   }
-  ll_med = destructive_get_dmedian(ll_pool, ll_size);
-  lh_med = destructive_get_dmedian(lh_pool, lh_size);
-  hh_med = destructive_get_dmedian(hh_pool, hh_size);
+  ll_med = destructive_get_dmedian(ll_size, ll_pool);
+  lh_med = destructive_get_dmedian(lh_size, lh_pool);
+  hh_med = destructive_get_dmedian(hh_size, hh_pool);
   logprint("Case/control distance analysis:\n");
   if (g_case_ct < 2) {
     dxx = 0.0;
@@ -3048,12 +3035,11 @@ int32_t groupdist_calc(pthread_t* threads, uint32_t unfiltered_sample_ct, uintpt
   if (2 * g_jackknife_d >= (g_case_ct + g_ctrl_ct)) {
     logprint("Delete-d jackknife skipped because d is too large.\n");
   } else {
-    if (wkspace_alloc_d_checked(&jackknife_precomp, sample_ct * JACKKNIFE_VALS_GROUPDIST * sizeof(double))) {
+    if (bigstack_calloc_d(sample_ct * JACKKNIFE_VALS_GROUPDIST, &jackknife_precomp)) {
       goto groupdist_calc_ret_NOMEM;
     }
     g_jackknife_precomp = jackknife_precomp;
-    fill_double_zero(jackknife_precomp, sample_ct * JACKKNIFE_VALS_GROUPDIST);
-    if (wkspace_init_sfmtp(g_thread_ct)) {
+    if (bigstack_init_sfmtp(g_thread_ct)) {
       goto groupdist_calc_ret_NOMEM;
     }
     // to precompute:
@@ -3125,7 +3111,7 @@ int32_t groupdist_calc(pthread_t* threads, uint32_t unfiltered_sample_ct, uintpt
     break;
   }
  groupdist_calc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   g_pheno_nm = NULL;
   g_pheno_c = NULL;
   return retval;
@@ -3196,12 +3182,12 @@ void normalize_phenos(double* new_phenos, uint32_t sample_ct, uintptr_t* sample_
 int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t max_pcs, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, uintptr_t sample_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, char* sample_ids, uintptr_t max_sample_id_len, uintptr_ [...]
   FILE* outfile = NULL;
   FILE* evecfile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t* sample_include2 = NULL;
   uintptr_t* sample_male_include2 = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = 2 * ((unfiltered_sample_ct + BITCT - 1) / BITCT);
-  uintptr_t sample_ctl2 = 2 * ((sample_ct + BITCT - 1) / BITCT);
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   uintptr_t marker_uidx = 0;
   uint32_t pc_ct = 0;
   uint32_t pct = 1;
@@ -3245,13 +3231,13 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
   MATRIX_INVERT_BUF1_TYPE* inv_1d_buf;
   double* dbl_2d_buf;
   double dxx;
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&missing_cts, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_c_checked(&id_buf, max_sample_id_len)) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw) ||
+      bigstack_alloc_ul(sample_ctv2, &loadbuf) ||
+      bigstack_alloc_ui(sample_ct, &missing_cts) ||
+      bigstack_alloc_c(max_sample_id_len, &id_buf)) {
     goto calc_regress_pcs_ret_NOMEM;
   }
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_ct, hh_exists, 0, sample_exclude, sex_male, &sample_include2, &sample_male_include2)) {
+  if (alloc_collapsed_haploid_filters(sample_exclude, sex_male, unfiltered_sample_ct, sample_ct, hh_exists, 0, &sample_include2, &sample_male_include2)) {
     goto calc_regress_pcs_ret_NOMEM;
   }
   
@@ -3268,26 +3254,26 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
     evecfile = fopen(evecname, "r");
     if (!evecfile) {
       strcpy(&(evecname[ulii]), ".eigenvec");
-      if (fopen_checked(&evecfile, evecname, "r")) {
+      if (fopen_checked(evecname, "r", &evecfile)) {
         goto calc_regress_pcs_ret_OPEN_FAIL;
       }
     }
   }
 
-  tbuf[MAXLINELEN - 7] = ' ';
-  tbuf[MAXLINELEN - 1] = ' ';
-  if (!fgets(tbuf, MAXLINELEN - 6, evecfile)) {
+  g_textbuf[MAXLINELEN - 7] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  if (!fgets(g_textbuf, MAXLINELEN - 6, evecfile)) {
     if (feof(evecfile)) {
       goto calc_regress_pcs_ret_INVALID_FORMAT_2G;
     } else {
       goto calc_regress_pcs_ret_READ_FAIL;
     }
   }
-  if (!tbuf[MAXLINELEN - 7]) {
+  if (!g_textbuf[MAXLINELEN - 7]) {
     logerrprint("Error: Excessively long line in .evec/.eigenvec file.\n");
     goto calc_regress_pcs_ret_INVALID_FORMAT;
   }
-  bufptr = skip_initial_spaces(tbuf);
+  bufptr = skip_initial_spaces(g_textbuf);
   if (no_more_tokens_kns(bufptr)) {
     goto calc_regress_pcs_ret_INVALID_FORMAT_2G;
   }
@@ -3304,25 +3290,23 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
     goto calc_regress_pcs_ret_INVALID_FORMAT_2G;
   }
   if (pc_ct > max_pcs) {
-    sprintf(logbuf, "%svec format detected.  Regressing on %d PC%s (out of %d).\n", is_eigenvec? "GCTA .eigen" : "SMARTPCA .e", max_pcs, (max_pcs == 1)? "" : "s", pc_ct);
+    sprintf(g_logbuf, "%svec format detected.  Regressing on %d PC%s (out of %d).\n", is_eigenvec? "GCTA .eigen" : "SMARTPCA .e", max_pcs, (max_pcs == 1)? "" : "s", pc_ct);
     pc_ct = max_pcs;
   } else {
-    sprintf(logbuf, "%svec format detected.  Regressing on %d principal component%s.\n", is_eigenvec? "GCTA .eigen" : "SMARTPCA .e", pc_ct, (pc_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "%svec format detected.  Regressing on %d principal component%s.\n", is_eigenvec? "GCTA .eigen" : "SMARTPCA .e", pc_ct, (pc_ct == 1)? "" : "s");
   }
   logprintb();
   pc_ct_p1 = pc_ct + 1;
-  if (wkspace_alloc_d_checked(&pc_matrix, pc_ct_p1 * sample_ct * sizeof(double))) {
-    goto calc_regress_pcs_ret_NOMEM;
-  }
-  if (wkspace_alloc_d_checked(&pc_orig_prod_sums, pc_ct_p1 * pc_ct_p1 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&pc_prod_sums, pc_ct_p1 * pc_ct_p1 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&x_prime_y, pc_ct_p1 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&beta_vec, pc_ct_p1 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&residual_vec, pc_ct_p1 * sizeof(double)) ||
-      wkspace_alloc_d_checked(&dbl_2d_buf, pc_ct_p1 * pc_ct_p1 * sizeof(double))) {
+  if (bigstack_alloc_d(pc_ct_p1 * sample_ct, &pc_matrix) ||
+      bigstack_alloc_d(pc_ct_p1 * pc_ct_p1, &pc_orig_prod_sums) ||
+      bigstack_alloc_d(pc_ct_p1 * pc_ct_p1, &pc_prod_sums) ||
+      bigstack_alloc_d(pc_ct_p1, &x_prime_y) ||
+      bigstack_alloc_d(pc_ct_p1, &beta_vec) ||
+      bigstack_alloc_d(pc_ct_p1, &residual_vec) ||
+      bigstack_alloc_d(pc_ct_p1 * pc_ct_p1, &dbl_2d_buf)) {
     goto calc_regress_pcs_ret_NOMEM;
   }
-  inv_1d_buf = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc(pc_ct_p1 * sizeof(MATRIX_INVERT_BUF1_TYPE));
+  inv_1d_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(pc_ct_p1 * sizeof(MATRIX_INVERT_BUF1_TYPE));
   if (!inv_1d_buf) {
     goto calc_regress_pcs_ret_NOMEM;
   }
@@ -3331,7 +3315,7 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
     sample_idx = 0;
     while (1) {
       // todo: validate, and perhaps permute, sample IDs
-      bufptr = next_token_mult(skip_initial_spaces(tbuf), 2);
+      bufptr = next_token_mult(skip_initial_spaces(g_textbuf), 2);
       for (uii = 0; uii < pc_ct; uii++) {
 	if (no_more_tokens_kns(bufptr)) {
 	  goto calc_regress_pcs_ret_INVALID_FORMAT_2G;
@@ -3345,9 +3329,9 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
       if (++sample_idx >= sample_ct) {
 	break;
       }
-      if (!fgets(tbuf, MAXLINELEN, evecfile)) {
+      if (!fgets(g_textbuf, MAXLINELEN, evecfile)) {
 	if (feof(evecfile)) {
-	  sprintf(logbuf, "Error: Fewer %s in .eigenvec file than expected.\n", g_species_plural);
+	  sprintf(g_logbuf, "Error: Fewer %s in .eigenvec file than expected.\n", g_species_plural);
 	  goto calc_regress_pcs_ret_INVALID_FORMAT_3;
 	} else {
 	  goto calc_regress_pcs_ret_READ_FAIL;
@@ -3356,15 +3340,15 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
     }
   } else {
     for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
-      if (!fgets(tbuf, MAXLINELEN, evecfile)) {
+      if (!fgets(g_textbuf, MAXLINELEN, evecfile)) {
 	if (feof(evecfile)) {
-	  sprintf(logbuf, "Error: Fewer %s in .evec file than expected.\n", g_species_plural);
+	  sprintf(g_logbuf, "Error: Fewer %s in .evec file than expected.\n", g_species_plural);
 	  goto calc_regress_pcs_ret_INVALID_FORMAT_3;
 	} else {
 	  goto calc_regress_pcs_ret_READ_FAIL;
 	}
       }
-      bufptr = next_token(skip_initial_spaces(tbuf));
+      bufptr = next_token(skip_initial_spaces(g_textbuf));
       for (uii = 0; uii < pc_ct; uii++) {
 	if (no_more_tokens_kns(bufptr)) {
 	  goto calc_regress_pcs_ret_INVALID_FORMAT_2G;
@@ -3377,9 +3361,9 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
       pc_matrix[pc_ct * sample_ct + sample_idx] = 1.0;
     }
   }
-  if (fgets(tbuf, MAXLINELEN, evecfile)) {
-    if (!no_more_tokens_kns(skip_initial_spaces(tbuf))) {
-      sprintf(logbuf, "Error: More %s in .e%svec file than expected.\n", g_species_plural, is_eigenvec? "igen" : "");
+  if (fgets(g_textbuf, MAXLINELEN, evecfile)) {
+    if (!no_more_tokens_kns(skip_initial_spaces(g_textbuf))) {
+      sprintf(g_logbuf, "Error: More %s in .e%svec file than expected.\n", g_species_plural, is_eigenvec? "igen" : "");
       goto calc_regress_pcs_ret_INVALID_FORMAT_3;
     }
   }
@@ -3402,7 +3386,7 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
   // bits instead of the ~20 you get from printf("%g", dxx)), and there's no
   // need for repeated random access.
   strcpy(outname_end, ".gen");
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto calc_regress_pcs_ret_OPEN_FAIL;
   }
   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
@@ -3419,19 +3403,19 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
       chrom_fo_idx++;
       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_haploid);
     }
-    if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, sample_ct, sample_exclude, IS_SET(marker_reverse, marker_uidx))) {
+    if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf)) {
       goto calc_regress_pcs_ret_READ_FAIL;
     }
     if (is_haploid && hh_exists) {
       haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf);
     }
-    bufptr = chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx));
+    bufptr = chrom_name_write(chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx), g_textbuf);
     *bufptr++ = ' ';
-    fwrite(tbuf, 1, bufptr - tbuf, outfile);
+    fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
     fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
-    tbuf[0] = ' ';
-    bufptr = uint32_writex(&(tbuf[1]), marker_pos[marker_uidx], ' ');
-    fwrite(tbuf, 1, bufptr - tbuf, outfile);
+    g_textbuf[0] = ' ';
+    bufptr = uint32toa_x(marker_pos[marker_uidx], ' ', &(g_textbuf[1]));
+    fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
     fputs(marker_allele_ptrs[2 * marker_uidx], outfile);
     putc(' ', outfile);
     if (fputs_checked(marker_allele_ptrs[2 * marker_uidx + 1], outfile)) {
@@ -3510,11 +3494,15 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
 		if (regress_pcs_clip) {
 		  fputs(" 1 0 0", outfile);
 		} else {
-		  bufptr = double_g_write(memcpyl3a(double_g_write(&(wbuf[1]), 1.0 - dxx * 0.5), " 0 "), dxx * 0.5);
+		  bufptr = dtoa_g(1.0 - dxx * 0.5, &(wbuf[1]));
+                  bufptr = memcpyl3a(bufptr, " 0 ");
+		  bufptr = dtoa_g(dxx * 0.5, bufptr);
 		  fwrite(wbuf, 1, bufptr - wbuf, outfile);
 		}
 	      } else {
-		bufptr = memcpya(double_g_write(double_g_writex(&(wbuf[1]), 1.0 - dxx, ' '), dxx), " 0", 2);
+	        bufptr = dtoa_gx(1.0 - dxx, ' ', &(wbuf[1]));
+		bufptr = dtoa_g(dxx, bufptr);
+		bufptr = memcpya(bufptr, " 0", 2);
 		fwrite(wbuf, 1, bufptr - wbuf, outfile);
 	      }
 	    } else {
@@ -3522,11 +3510,15 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
 		if (regress_pcs_clip) {
 		  fputs(" 0 0 1", outfile);
 		} else {
-		  bufptr = double_g_write(memcpyl3a(double_g_write(&(wbuf[1]), 1.0 - dxx * 0.5), " 0 "), dxx * 0.5);
+		  bufptr = dtoa_g(1.0 - dxx * 0.5, &(wbuf[1]));
+		  bufptr = memcpyl3a(bufptr, " 0 ");
+		  bufptr = dtoa_g(dxx * 0.5, bufptr);
 		  fwrite(wbuf, 1, bufptr - wbuf, outfile);
 		}
 	      } else {
-		bufptr = double_g_write(double_g_writex(memcpya(&(wbuf[1]), "0 ", 2), 2.0 - dxx, ' '), dxx - 1.0);
+	        bufptr = memcpya(&(wbuf[1]), "0 ", 2);
+		bufptr = dtoa_gx(2.0 - dxx, ' ', bufptr);
+		bufptr = dtoa_g(dxx - 1.0, bufptr);
 		fwrite(wbuf, 1, bufptr - wbuf, outfile);
 	      }
 	    }
@@ -3554,7 +3546,7 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
     goto calc_regress_pcs_ret_WRITE_FAIL;
   }
   strcpy(outname_end, ".sample");
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto calc_regress_pcs_ret_OPEN_FAIL;
   }
   if (fputs_checked("ID_1 ID_2 missing sex phenotype\n0 0 0 D P\n", outfile)) {
@@ -3613,12 +3605,12 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
     fwrite(sample_id_ptr, 1, uii, outfile);
     putc(' ', outfile);
     fputs(&(sample_id_ptr[uii + 1]), outfile);
-    tbuf[0] = ' ';
-    bufptr = double_g_writex(&(tbuf[1]), ((double)missing_cts[sample_uidx]) / (double)marker_ct, ' ');
+    g_textbuf[0] = ' ';
+    bufptr = dtoa_gx(((double)missing_cts[sample_uidx]) / (double)marker_ct, ' ', &(g_textbuf[1]));
     *bufptr = sexchar(sex_nm, sex_male, sample_uidx);
     bufptr[1] = ' ';
-    bufptr = double_g_writex(&(bufptr[2]), residual_vec[sample_idx], '\n');
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    bufptr = dtoa_gx(residual_vec[sample_idx], '\n', &(bufptr[2]));
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto calc_regress_pcs_ret_WRITE_FAIL;
     }
   }
@@ -3628,7 +3620,7 @@ int32_t calc_regress_pcs(char* evecname, uint32_t regress_pcs_modifier, uint32_t
   *outname_end = '\0';
   putchar('\r');
   LOGPRINTF("Principal component regression residuals and %sphenotype Z-scores %s%s.gen and %s.sample.\n", regress_pcs_sex_specific? "sex-specific " : "", regress_pcs_sex_specific? "\nwritten to " : "written to\n", outname, outname);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   while (0) {
   calc_regress_pcs_ret_NOMEM:
     retval = RET_NOMEM;
@@ -3715,11 +3707,11 @@ static double g_dw_half_marker_ct_recip;
 int32_t write_ids(char* outname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, char* sample_ids, uintptr_t max_sample_id_len) {
   uintptr_t sample_uidx = 0;
   FILE* outfile;
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     return RET_OPEN_FAIL;
   }
   while (1) {
-    next_unset_ul_ck(sample_exclude, &sample_uidx, unfiltered_sample_ct);
+    next_unset_ul_ck(sample_exclude, unfiltered_sample_ct, &sample_uidx);
     if (sample_uidx == unfiltered_sample_ct) {
       break;
     }
@@ -3768,8 +3760,8 @@ int32_t distance_open(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile3_p
     } else {
       sprintf(outname_end, ".dist%s", varsuffix);
     }
-    strcpy(tbuf, outname_end);
-    if (fopen_checked(outfile_ptr, outname, mode)) {
+    strcpy(g_textbuf, outname_end);
+    if (fopen_checked(outname, mode, outfile_ptr)) {
       return 1;
     }
   }
@@ -3779,8 +3771,8 @@ int32_t distance_open(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile3_p
     } else {
       sprintf(outname_end, ".mibs%s", varsuffix);
     }
-    strcpy(&(tbuf[MAX_POST_EXT]), outname_end);
-    if (fopen_checked(outfile2_ptr, outname, mode)) {
+    strcpy(&(g_textbuf[MAX_POST_EXT]), outname_end);
+    if (fopen_checked(outname, mode, outfile2_ptr)) {
       return 1;
     }
   }
@@ -3790,8 +3782,8 @@ int32_t distance_open(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile3_p
     } else {
       sprintf(outname_end, ".mdist%s", varsuffix);
     }
-    strcpy(&(tbuf[MAX_POST_EXT * 2]), outname_end);
-    if (fopen_checked(outfile3_ptr, outname, mode)) {
+    strcpy(&(g_textbuf[MAX_POST_EXT * 2]), outname_end);
+    if (fopen_checked(outname, mode, outfile3_ptr)) {
       return 1;
     }
   }
@@ -3801,16 +3793,16 @@ int32_t distance_open(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile3_p
 void distance_print_done(int32_t format_code, char* outname, char* outname_end) {
   putchar('\r');
   if (!format_code) {
-    strcpy(outname_end, tbuf);
-    sprintf(logbuf, "Distances (allele counts) written to %s .\n", outname);
+    strcpy(outname_end, g_textbuf);
+    sprintf(g_logbuf, "Distances (allele counts) written to %s .\n", outname);
   } else if (format_code == 1) {
-    strcpy(outname_end, &(tbuf[MAX_POST_EXT]));
-    sprintf(logbuf, "IBS matrix written to %s .\n", outname);
+    strcpy(outname_end, &(g_textbuf[MAX_POST_EXT]));
+    sprintf(g_logbuf, "IBS matrix written to %s .\n", outname);
   } else if (format_code == 2) {
-    strcpy(outname_end, &(tbuf[MAX_POST_EXT * 2]));
-    sprintf(logbuf, "Distances (proportions) written to %s .\n", outname);
+    strcpy(outname_end, &(g_textbuf[MAX_POST_EXT * 2]));
+    sprintf(g_logbuf, "Distances (proportions) written to %s .\n", outname);
   }
-  wordwrap(logbuf, 0);
+  wordwrapb(0);
   logprintb();
 }
 
@@ -3826,14 +3818,14 @@ uint32_t distance_d_write_tri_emitn(uint32_t overflow_ct, unsigned char* readbuf
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx + 1 < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, *dist_ptr++, '\t');
+      sptr_cur = dtoa_gx(*dist_ptr++, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_tri_emitn_ret;
       }
     }
     if (sample2idx + 1 == sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, *dist_ptr++, '\n');
+      sptr_cur = dtoa_gx(*dist_ptr++, '\n', sptr_cur);
     }
     if ((((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) >= hundredth * pct) {
       pct = (((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) / hundredth;
@@ -3865,7 +3857,7 @@ uint32_t distance_d_write_sq0_emitn(uint32_t overflow_ct, unsigned char* readbuf
   uintptr_t ulii;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, *dist_ptr++, '\t');
+      sptr_cur = dtoa_gx(*dist_ptr++, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_sq0_emitn_ret;
@@ -3908,7 +3900,7 @@ uint32_t distance_d_write_sq_emitn(uint32_t overflow_ct, unsigned char* readbuf)
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, *dist_ptr++, '\t');
+      sptr_cur = dtoa_gx(*dist_ptr++, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_sq_emitn_ret;
@@ -3919,8 +3911,8 @@ uint32_t distance_d_write_sq_emitn(uint32_t overflow_ct, unsigned char* readbuf)
       sample2idx++;
     }
     while (sample2idx < sample_ct) {
-      *sptr_cur = '\t';
-      sptr_cur = double_g_write(&(sptr_cur[1]), dists[((sample2idx * (sample2idx - 1)) / 2) + sample1idx]);
+      *sptr_cur++ = '\t';
+      sptr_cur = dtoa_g(dists[((sample2idx * (sample2idx - 1)) / 2) + sample1idx], sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_sq_emitn_ret;
@@ -3956,7 +3948,7 @@ uint32_t distance_d_write_ibs_tri_emitn(uint32_t overflow_ct, unsigned char* rea
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, 1.0 - (*dist_ptr++) * half_marker_ct_recip, '\t');
+      sptr_cur = dtoa_gx(1.0 - (*dist_ptr++) * half_marker_ct_recip, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_ibs_tri_emitn_ret;
@@ -3996,7 +3988,7 @@ uint32_t distance_d_write_ibs_sq0_emitn(uint32_t overflow_ct, unsigned char* rea
   uintptr_t ulii;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, 1.0 - (*dist_ptr++) * half_marker_ct_recip, '\t');
+      sptr_cur = dtoa_gx(1.0 - (*dist_ptr++) * half_marker_ct_recip, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_ibs_sq0_emitn_ret;
@@ -4047,7 +4039,7 @@ uint32_t distance_d_write_ibs_sq_emitn(uint32_t overflow_ct, unsigned char* read
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, 1.0 - (*dist_ptr++) * half_marker_ct_recip, '\t');
+      sptr_cur = dtoa_gx(1.0 - (*dist_ptr++) * half_marker_ct_recip, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_ibs_sq_emitn_ret;
@@ -4058,8 +4050,8 @@ uint32_t distance_d_write_ibs_sq_emitn(uint32_t overflow_ct, unsigned char* read
       sample2idx++;
     }
     while (sample2idx < sample_ct) {
-      *sptr_cur = '\t';
-      sptr_cur = double_g_write(&(sptr_cur[1]), 1.0 - (dists[((sample2idx * (sample2idx - 1)) / 2) + sample1idx]) * half_marker_ct_recip);
+      *sptr_cur++ = '\t';
+      sptr_cur = dtoa_g(1.0 - (dists[((sample2idx * (sample2idx - 1)) / 2) + sample1idx]) * half_marker_ct_recip, sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_ibs_sq_emitn_ret;
@@ -4095,14 +4087,14 @@ uint32_t distance_d_write_1mibs_tri_emitn(uint32_t overflow_ct, unsigned char* r
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx + 1 < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, (*dist_ptr++) * half_marker_ct_recip, '\t');
+      sptr_cur = dtoa_gx((*dist_ptr++) * half_marker_ct_recip, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_1mibs_tri_emitn_ret;
       }
     }
     if (sample2idx + 1 == sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, (*dist_ptr++) * half_marker_ct_recip, '\n');
+      sptr_cur = dtoa_gx((*dist_ptr++) * half_marker_ct_recip, '\n', sptr_cur);
     }
     if ((((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) >= hundredth * pct) {
       pct = (((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) / hundredth;
@@ -4135,7 +4127,7 @@ uint32_t distance_d_write_1mibs_sq0_emitn(uint32_t overflow_ct, unsigned char* r
   uintptr_t ulii;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, (*dist_ptr++) * half_marker_ct_recip, '\t');
+      sptr_cur = dtoa_gx((*dist_ptr++) * half_marker_ct_recip, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_1mibs_sq0_emitn_ret;
@@ -4179,7 +4171,7 @@ uint32_t distance_d_write_1mibs_sq_emitn(uint32_t overflow_ct, unsigned char* re
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, (*dist_ptr++) * half_marker_ct_recip, '\t');
+      sptr_cur = dtoa_gx((*dist_ptr++) * half_marker_ct_recip, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_1mibs_sq_emitn_ret;
@@ -4190,8 +4182,8 @@ uint32_t distance_d_write_1mibs_sq_emitn(uint32_t overflow_ct, unsigned char* re
       sample2idx++;
     }
     while (sample2idx < sample_ct) {
-      *sptr_cur = '\t';
-      sptr_cur = double_g_write(&(sptr_cur[1]), (dists[((sample2idx * (sample2idx - 1)) / 2) + sample1idx]) * half_marker_ct_recip);
+      *sptr_cur++ = '\t';
+      sptr_cur = dtoa_g((dists[((sample2idx * (sample2idx - 1)) / 2) + sample1idx]) * half_marker_ct_recip, sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto distance_d_write_1mibs_sq_emitn_ret;
@@ -4244,7 +4236,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
   }
   g_pct = 1;
   if (dist_calc_type & (DISTANCE_BIN | DISTANCE_BIN4)) {
-    if (distance_open(outfile_ptr, outfile2_ptr, outfile3_ptr, outname, outname_end, ".bin", "wb", dist_calc_type, parallel_idx, parallel_tot)) {
+    if (distance_open(outfile_ptr, outfile2_ptr, outfile3_ptr, outname, outname_end, ".bin", FOPEN_WB, dist_calc_type, parallel_idx, parallel_tot)) {
       goto distance_d_write_ret_OPEN_FAIL;
     }
     if (shape == DISTANCE_TRI) {
@@ -4755,11 +4747,11 @@ uint32_t calc_genome_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   if (!sample2idx) {
     // first line, if not 2nd or later part of parallel write
     if (!sample1idx) {
-      sptr_cur += sprintf(sptr_cur, tbuf, " FID1", " IID1", " FID2", " IID2");
+      sptr_cur += sprintf(sptr_cur, g_textbuf, " FID1", " IID1", " FID2", " IID2");
     }
     sample1idx = g_thread_start[0];
     sample2idx = sample1idx + 1;
-    tbuf[0] = ' ';
+    g_textbuf[0] = ' ';
   }
   while (sample1idx < tstc) {
     if (sample2idx == sample1idx + 1) {
@@ -4774,7 +4766,7 @@ uint32_t calc_genome_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
       family_id_fixed = g_cg_pri.family_idxs[sample1uidx];
       founder_ct = g_cg_pri.family_founder_cts[family_id_fixed];
       llfct = ((int64_t)founder_ct * (founder_ct - 1)) - 2 * g_cg_pri.family_rel_space_offsets[family_id_fixed];
-      sptr_start = fw_strcpyn(max_sample_fid_len - 1, uii, fam1, &(tbuf[1]));
+      sptr_start = fw_strcpyn(max_sample_fid_len - 1, uii, fam1, &(g_textbuf[1]));
       *sptr_start++ = ' ';
       sptr_start = fw_strcpy(max_sample_iid_len - 1, sample1, sptr_start);
       *sptr_start++ = ' ';
@@ -4782,7 +4774,7 @@ uint32_t calc_genome_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
     while (sample2idx < sample_ct) {
       next_unset_ul_unsafe_ck(sample_exclude, &sample2uidx);
       sptr_cur_start = sptr_cur;
-      sptr_cur = memcpya(sptr_cur, tbuf, sptr_start - tbuf);
+      sptr_cur = memcpya(sptr_cur, g_textbuf, sptr_start - g_textbuf);
       cptr = &(sample_ids[sample2uidx * max_sample_id_len]);
       uii = strlen_se(cptr);
       memcpyx(fam2, cptr, uii, '\0');
@@ -4829,7 +4821,7 @@ uint32_t calc_genome_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
 	    dxx = g_cg_pri.rel_space[nn + ((int64_t)rel_space_id_fixed * (rel_space_id_fixed - 1) - llfct) / 2];
 	  }
 	}
-	sptr_cur = width_force(5, sptr_cur, double_g_write(sptr_cur, dxx));
+	sptr_cur = width_force(5, sptr_cur, dtoa_g(dxx, sptr_cur));
       } else if (!is_rel_check) {
 	sptr_cur = memcpya(sptr_cur, "UN    NA", 8);
       } else {
@@ -4884,7 +4876,10 @@ uint32_t calc_genome_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
 	dxx1 = dxx2 * dxx2;
       }
       *sptr_cur++ = ' ';
-      sptr_cur = double_f_writew74(double_f_writew74x(double_f_writew74x(double_f_writew74x(sptr_cur, dxx, ' '), dyy, ' '), dxx1, ' '), dxx2);
+      sptr_cur = dtoa_f_w7p4x(dxx, ' ', sptr_cur);
+      sptr_cur = dtoa_f_w7p4x(dyy, ' ', sptr_cur);
+      sptr_cur = dtoa_f_w7p4x(dxx1, ' ', sptr_cur);
+      sptr_cur = dtoa_f_w7p4(dxx2, sptr_cur);
 
       if (pheno_c) {
 	uii = IS_SET(g_cg_pheno_nm, sample1uidx);
@@ -4906,20 +4901,24 @@ uint32_t calc_genome_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
       dyy = (double)genome_main[gmcell + 3];
       dxx1 = 1.0 / ((double)(genome_main[gmcell + 4] + genome_main[gmcell + 3]));
       dxx2 = normdist((dxx * dxx1 - 0.666666) / (sqrt(0.2222222 * dxx1)));
-      sptr_cur = double_f_writew96x(sptr_cur, 1.0 - (genome_main[gmcell] + 2 * genome_main[gmcell + 1]) / ((double)(2 * nn)), ' ');
+      sptr_cur = dtoa_f_w9p6x(1.0 - (genome_main[gmcell] + 2 * genome_main[gmcell + 1]) / ((double)(2 * nn)), ' ', sptr_cur);
       if (dxx2 != dxx2) {
 	sptr_cur = memcpya(sptr_cur, "     NA ", 8);
       } else {
-	sptr_cur = double_f_writew74x(sptr_cur, dxx2, ' ');
+	sptr_cur = dtoa_f_w7p4x(dxx2, ' ', sptr_cur);
       }
       if (genome_main[gmcell + 3]) {
-	sptr_cur = double_f_writew74(sptr_cur, dxx / dyy);
+	sptr_cur = dtoa_f_w7p4(dxx / dyy, sptr_cur);
       } else {
 	sptr_cur = memcpya(sptr_cur, "     NA", 7);
       }
       if (output_full) {
-	*sptr_cur = ' ';
-	sptr_cur = double_f_writew74(double_f_writew74x(uint32_writew7x(uint32_writew7x(uint32_writew7x(&(sptr_cur[1]), genome_main[gmcell + 1], ' '), genome_main[gmcell], ' '), oo, ' '), dyy, ' '), dxx);
+	*sptr_cur++ = ' ';
+	sptr_cur = uint32toa_w7x(genome_main[gmcell + 1], ' ', sptr_cur);
+	sptr_cur = uint32toa_w7x(genome_main[gmcell], ' ', sptr_cur);
+	sptr_cur = uint32toa_w7x(oo, ' ', sptr_cur);
+	sptr_cur = dtoa_f_w7p4x(dyy, ' ', sptr_cur);
+	sptr_cur = dtoa_f_w7p4(dxx, sptr_cur);
       }
       *sptr_cur++ = '\n';
     calc_genome_emitn_skip_line:
@@ -4967,7 +4966,7 @@ uint32_t calc_genome_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
 int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, double* set_allele_freqs, uint32_t* nchrobs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_id [...]
   FILE* outfile = NULL;
   int32_t retval = 0;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   unsigned char* loadbuf = NULL; // from file
   int32_t ibd_prect = 0;
@@ -5056,7 +5055,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
   g_cg_min_pi_hat = min_pi_hat;
   g_cg_max_pi_hat = max_pi_hat;
 
-  triangle_fill(g_thread_start, sample_ct, dist_thread_ct, parallel_tot - parallel_idx - 1, parallel_tot, 1, 1);
+  triangle_fill(sample_ct, dist_thread_ct, parallel_tot - parallel_idx - 1, parallel_tot, 1, 1, g_thread_start);
   // invert order, for --genome --parallel to naturally work
   for (uii = 0; uii <= dist_thread_ct / 2; uii++) {
     ujj = g_thread_start[uii];
@@ -5076,16 +5075,16 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
   // f(n) = nsample_ct - n(n+1)/2
   tot_cells = (int64_t)sample_ct * (g_cg_tstc - g_thread_start[0]) - ((int64_t)g_cg_tstc * (g_cg_tstc + 1) - (int64_t)g_thread_start[0] * (g_thread_start[0] + 1)) / 2;
   g_cg_tot_lines = cur_line + tot_cells;
-  if (wkspace_alloc_ui_checked(&missing_dbl_excluded, tot_cells * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&sample_missing_unwt, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&genome_main, tot_cells * 5 * sizeof(int32_t)) ||
-      wkspace_alloc_uc_checked(&loadbuf, GENOME_MULTIPLEX * unfiltered_sample_ct4) ||
-      wkspace_alloc_ul_checked(&geno, sample_ct * (GENOME_MULTIPLEX / 4)) ||
-      wkspace_alloc_ul_checked(&masks, sample_ct * (GENOME_MULTIPLEX / 4)) ||
-      wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_c_checked(&g_cg_fam1, plink_maxfid + 1) ||
-      wkspace_alloc_c_checked(&g_cg_fam2, plink_maxfid + 1) ||
-      wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+  if (bigstack_alloc_ui(tot_cells, &missing_dbl_excluded) ||
+      bigstack_alloc_ui(sample_ct, &sample_missing_unwt) ||
+      bigstack_alloc_ui(tot_cells * 5, &genome_main) ||
+      bigstack_alloc_uc(GENOME_MULTIPLEX * unfiltered_sample_ct4, &loadbuf) ||
+      bigstack_alloc_ul(sample_ct * (GENOME_MULTIPLEX / BITCT2), &geno) ||
+      bigstack_alloc_ul(sample_ct * (GENOME_MULTIPLEX / BITCT2), &masks) ||
+      bigstack_alloc_ul(sample_ct, &mmasks) ||
+      bigstack_alloc_c(plink_maxfid + 1, &g_cg_fam1) ||
+      bigstack_alloc_c(plink_maxfid + 1, &g_cg_fam2) ||
+      bigstack_alloc_uc(262144, &overflow_buf)) {
     goto calc_genome_ret_NOMEM;
   }
 
@@ -5338,7 +5337,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
 
   if (calculation_type & CALC_PLINK1_IBS_MATRIX) {
     strcpy(outname_end, ".mibs");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto calc_genome_ret_OPEN_FAIL;
     }
     giptr = genome_main;
@@ -5349,7 +5348,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
       uii = marker_ct - giptr3[sample_idx];
       uljj = sample_idx - 1; // not referenced when sample_idx == 0
       for (ulii = 0; ulii < sample_idx; ulii++) {
-        cptr = double_g_writex(wbuf, 1.0 - ((double)(genome_main[uljj * 5] + 2 * genome_main[uljj * 5 + 1])) / ((double)(2 * (uii - (*giptr3++) + missing_dbl_excluded[uljj]))), ' ');
+        cptr = dtoa_gx(1.0 - ((double)(genome_main[uljj * 5] + 2 * genome_main[uljj * 5 + 1])) / ((double)(2 * (uii - (*giptr3++) + missing_dbl_excluded[uljj]))), ' ', wbuf);
         fwrite(wbuf, 1, cptr - wbuf, outfile);
 	uljj += sample_ct - ulii - 2;
       }
@@ -5357,7 +5356,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
       putc(' ', outfile);
       giptr3++;
       for (ujj = sample_idx + 1; ujj < sample_ct; ujj++) {
-	cptr = double_g_writex(wbuf, 1.0 - ((double)((*giptr) + 2 * giptr[1])) / ((double)(2 * (uii - (*giptr3++) + (*giptr2++)))), ' ');
+	cptr = dtoa_gx(1.0 - ((double)((*giptr) + 2 * giptr[1])) / ((double)(2 * (uii - (*giptr3++) + (*giptr2++)))), ' ', wbuf);
 	fwrite(wbuf, 1, cptr - wbuf, outfile);
 	giptr = &(giptr[5]);
       }
@@ -5385,7 +5384,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
 
   if (calculation_type & CALC_PLINK1_DISTANCE_MATRIX) {
     strcpy(outname_end, ".mdist");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto calc_genome_ret_OPEN_FAIL;
     }
     giptr = genome_main;
@@ -5396,7 +5395,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
       uii = marker_ct - giptr3[sample_idx];
       uljj = sample_idx - 1;
       for (ulii = 0; ulii < sample_idx; ulii++) {
-	cptr = double_g_writex(wbuf, ((double)(genome_main[uljj * 5] + 2 * genome_main[uljj * 5 + 1])) / ((double)(2 * (uii - (*giptr3++) + missing_dbl_excluded[uljj]))), ' ');
+	cptr = dtoa_gx(((double)(genome_main[uljj * 5] + 2 * genome_main[uljj * 5 + 1])) / ((double)(2 * (uii - (*giptr3++) + missing_dbl_excluded[uljj]))), ' ', wbuf);
 	fwrite(wbuf, 1, cptr - wbuf, outfile);
 	uljj += sample_ct - ulii - 2;
       }
@@ -5404,7 +5403,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
       putc(' ', outfile);
       giptr3++;
       for (ujj = sample_idx + 1; ujj < sample_ct; ujj++) {
-	cptr = double_g_writex(wbuf, ((double)((*giptr) + 2 * giptr[1])) / ((double)(2 * (uii - (*giptr3++) + (*giptr2++)))), ' ');
+	cptr = dtoa_gx(((double)((*giptr) + 2 * giptr[1])) / ((double)(2 * (uii - (*giptr3++) + (*giptr2++)))), ' ', wbuf);
         fwrite(wbuf, 1, cptr - wbuf, outfile);
 	giptr = &(giptr[5]);
       }
@@ -5432,9 +5431,9 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
 
   if (!parallel_idx) {
     if (genome_modifier & GENOME_OUTPUT_FULL) {
-      sprintf(tbuf, "%%%us%%%us%%%us%%%us RT    EZ      Z0      Z1      Z2  PI_HAT PHE       DST     PPC   RATIO    IBS0    IBS1    IBS2  HOMHOM  HETHET\n", plink_maxfid, plink_maxiid, plink_maxfid, plink_maxiid);
+      sprintf(g_textbuf, "%%%us%%%us%%%us%%%us RT    EZ      Z0      Z1      Z2  PI_HAT PHE       DST     PPC   RATIO    IBS0    IBS1    IBS2  HOMHOM  HETHET\n", plink_maxfid, plink_maxiid, plink_maxfid, plink_maxiid);
     } else {
-      sprintf(tbuf, "%%%us%%%us%%%us%%%us RT    EZ      Z0      Z1      Z2  PI_HAT PHE       DST     PPC   RATIO\n", plink_maxfid, plink_maxiid, plink_maxfid, plink_maxiid);
+      sprintf(g_textbuf, "%%%us%%%us%%%us%%%us RT    EZ      Z0      Z1      Z2  PI_HAT PHE       DST     PPC   RATIO\n", plink_maxfid, plink_maxiid, plink_maxfid, plink_maxiid);
     }
   }
   g_pct = 1;
@@ -5501,9 +5500,9 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
  calc_genome_ret_1:
   fclose_cond(outfile);
   if ((!retval) && (calculation_type & (CALC_CLUSTER | CALC_NEIGHBOR))) {
-    wkspace_reset(loadbuf);
+    bigstack_reset(loadbuf);
   } else {
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
   return retval;
 }
@@ -5524,7 +5523,7 @@ int32_t do_rel_cutoff(uint64_t calculation_type, double rel_cutoff, double* rel_
   uint32_t samples_excluded = 0;
   uint32_t exactly_one_rel_ct = 0;
   uintptr_t sample_ct = unfiltered_sample_ct - (*sample_exclude_ct_ptr);
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   double* rel_dists = g_rel_dists;
   double* dist_ptr = rel_dists;
   double* dptr2;
@@ -5549,10 +5548,9 @@ int32_t do_rel_cutoff(uint64_t calculation_type, double rel_cutoff, double* rel_
   //   != NP, anyway), so we use a simple heuristic: prune the first sample
   //   with the largest number of remaining too-close relationships.
 
-  if (wkspace_alloc_i_checked(&rel_ct_arr, sample_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_i(sample_ct, &rel_ct_arr)) {
     return RET_NOMEM;
   }
-  fill_int_zero(rel_ct_arr, sample_ct);
   for (sample_idx = 1; sample_idx < sample_ct; sample_idx++) {
     for (uii = 0; uii < sample_idx; uii++) {
       if (*dist_ptr++ > rel_cutoff) {
@@ -5696,7 +5694,7 @@ int32_t do_rel_cutoff(uint64_t calculation_type, double rel_cutoff, double* rel_
     }
     LOGPRINTFWW("Remaining sample IDs written to %s .\n", outname);
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return 0;
 }
 
@@ -5732,17 +5730,19 @@ uint32_t rel_cutoff_batch_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   while (row < sample_ct) {
     if (rel_ct_arr[row] == -1) {
       for (uii = 0; uii <= row; uii++) {
-	gzgets(cur_gzfile, tbuf, MAXLINELEN);
+	gzgets(cur_gzfile, g_textbuf, MAXLINELEN);
       }
     } else {
-      cptr = uint32_writex(wbuf, new_row, '\t');
+      cptr = uint32toa_x(new_row, '\t', wbuf);
       wbuf_ct = (uintptr_t)(cptr - wbuf);
       while (col <= row) {
-        gzgets(cur_gzfile, tbuf, MAXLINELEN);
+        gzgets(cur_gzfile, g_textbuf, MAXLINELEN);
 	if (rel_ct_arr[col++] != -1) {
-	  cptr = next_token_mult(tbuf, 2);
+	  cptr = next_token_mult(g_textbuf, 2);
           uii = strlen(cptr);
-          sptr_cur = memcpya(uint32_writex(memcpya(sptr_cur, wbuf, wbuf_ct), ++new_col, '\t'), cptr, uii);
+	  sptr_cur = memcpya(sptr_cur, wbuf, wbuf_ct);
+	  sptr_cur = uint32toa_x(++new_col, '\t', sptr_cur);
+          sptr_cur = memcpya(sptr_cur, cptr, uii);
 	  if (sptr_cur >= readbuf_end) {
 	    goto rel_cutoff_batch_emitn_ret;
 	  }
@@ -5798,7 +5798,7 @@ uint32_t rel_cutoff_batch_rbin_emitn(uint32_t overflow_ct, unsigned char* readbu
       fseeko(in_binfile, (row + 1) * sizeof(float), SEEK_CUR);
       fseeko(in_bin_nfile, (row + 1) * sizeof(float), SEEK_CUR);
     } else {
-      cptr = uint32_writex(wbuf, new_row, '\t');
+      cptr = uint32toa_x(new_row, '\t', wbuf);
       wbuf_ct = (uintptr_t)(cptr - wbuf);
       while (col <= row) {
 	if (rel_ct_arr[col] == -1) {
@@ -5814,11 +5814,12 @@ uint32_t rel_cutoff_batch_rbin_emitn(uint32_t overflow_ct, unsigned char* readbu
 	    break;
 	  }
 	}
-	sptr_cur = uint32_writex(memcpya(sptr_cur, wbuf, wbuf_ct), ++new_col, '\t');
+	sptr_cur = memcpya(sptr_cur, wbuf, wbuf_ct);
+	sptr_cur = uint32toa_x(++new_col, '\t', sptr_cur);
 	fread(&fxx, 4, 1, in_bin_nfile);
-	sptr_cur = uint32_writex(sptr_cur, (int32_t)fxx, '\t');
+	sptr_cur = uint32toa_x((int32_t)fxx, '\t', sptr_cur);
 	fread(&fxx, 4, 1, in_binfile);
-	sptr_cur = float_e_writex(sptr_cur, fxx, '\n');
+	sptr_cur = ftoa_ex(fxx, '\n', sptr_cur);
 	col++;
 	if (sptr_cur >= readbuf_end) {
 	  goto rel_cutoff_batch_rbin_emitn_ret;
@@ -5861,7 +5862,7 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
   FILE* in_binfile = NULL;
   FILE* in_bin_nfile = NULL;
   gzFile cur_gzfile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t samples_excluded = 0;
   uint32_t exactly_one_rel_ct = 0;
   uint32_t rel_calc_type = relip->modifier & REL_CALC_MASK;
@@ -5903,17 +5904,17 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
     goto rel_cutoff_batch_ret_INVALID_CMDLINE;
   }
   memcpy(grmname_end, ".grm.id", 8);
-  if (fopen_checked(&idfile, grmname, "r")) {
+  if (fopen_checked(grmname, "r", &idfile)) {
     goto rel_cutoff_batch_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, idfile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, idfile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, grmname);
       goto rel_cutoff_batch_ret_INVALID_FORMAT_2;
     }
-    if (is_eoln_kns(*(skip_initial_spaces(tbuf)))) {
+    if (is_eoln_kns(*(skip_initial_spaces(g_textbuf)))) {
       continue;
     }
     sample_ct++;
@@ -5923,22 +5924,18 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
   }
   fclose_null(&idfile);
   ullii = sample_ct;
-  ullii = ((ullii * (ullii - 1)) / 2 + BITCT - 1) / BITCT;
+  ullii = BITCT_TO_WORDCT((ullii * (ullii - 1)) / 2);
 #ifndef __LP64__
   if (ullii >= 0x20000000) {
     goto rel_cutoff_batch_ret_NOMEM;
   }
 #endif
   tot_words = ullii;
-  if (wkspace_alloc_ul_checked(&compact_rel_table, tot_words * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(tot_words, &compact_rel_table) ||
+      bigstack_calloc_i(sample_ct, &rel_ct_arr) ||
+      bigstack_alloc_uc(262144, &overflow_buf)) {
     goto rel_cutoff_batch_ret_NOMEM;
   }
-  fill_ulong_zero(compact_rel_table, tot_words);
-  if (wkspace_alloc_i_checked(&rel_ct_arr, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
-    goto rel_cutoff_batch_ret_NOMEM;
-  }
-  fill_int_zero(rel_ct_arr, sample_ct);
 
   fputs("Reading... 0%", stdout);
   fflush(stdout);
@@ -5948,7 +5945,7 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
   col = 0;
   if (load_grm_bin) {
     memcpy(grmname_end, ".grm.bin", 9);
-    if (fopen_checked(&in_binfile, grmname, "rb")) {
+    if (fopen_checked(grmname, FOPEN_RB, &in_binfile)) {
       goto rel_cutoff_batch_ret_OPEN_FAIL;
     }
     rel_cutoff_f = (float)rel_cutoff;
@@ -5997,11 +5994,9 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
     fclose_null(&in_binfile);
   } else {
     memcpy(grmname_end, ".grm.gz", 8);
-    if (gzopen_checked(&cur_gzfile, grmname, "rb")) {
-      goto rel_cutoff_batch_ret_OPEN_FAIL;
-    }
-    if (gzbuffer(cur_gzfile, 131072)) {
-      goto rel_cutoff_batch_ret_NOMEM;
+    retval = gzopen_read_checked(grmname, &cur_gzfile);
+    if (retval) {
+      goto rel_cutoff_batch_ret_1;
     }
     for (pct = 1; pct <= 100; pct++) {
       wl_floor = (((uint64_t)tot_words) * (100 - pct)) / 100;
@@ -6018,17 +6013,17 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
 	  }
 	}
 	for (inword_idx = 0; inword_idx < inword_bound; inword_idx++) {
-	  if (!gzgets(cur_gzfile, tbuf, MAXLINELEN)) {
+	  if (!gzgets(cur_gzfile, g_textbuf, MAXLINELEN)) {
 	    goto rel_cutoff_batch_ret_READ_FAIL;
 	  }
 	  if (row == col) {
 	    row++;
 	    col = 0;
-	    if (!gzgets(cur_gzfile, tbuf, MAXLINELEN)) {
+	    if (!gzgets(cur_gzfile, g_textbuf, MAXLINELEN)) {
 	      goto rel_cutoff_batch_ret_READ_FAIL;
 	    }
 	  }
-	  bufptr = next_token_mult(tbuf, 3);
+	  bufptr = next_token_mult(g_textbuf, 3);
 	  if (no_more_tokens_kns(bufptr)) {
 	    goto rel_cutoff_batch_ret_INVALID_FORMAT_GENERIC;
 	  }
@@ -6052,10 +6047,10 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
 	fflush(stdout);
       }
     }
-    if (!gzgets(cur_gzfile, tbuf, MAXLINELEN)) {
+    if (!gzgets(cur_gzfile, g_textbuf, MAXLINELEN)) {
       goto rel_cutoff_batch_ret_READ_FAIL;
     }
-    if (gzgets(cur_gzfile, tbuf, MAXLINELEN)) {
+    if (gzgets(cur_gzfile, g_textbuf, MAXLINELEN)) {
       goto rel_cutoff_batch_ret_INVALID_FORMAT_GENERIC;
     }
     gzclose(cur_gzfile);
@@ -6223,24 +6218,24 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
   }
 
   memcpy(grmname_end, ".grm.id", 8);
-  if (fopen_checked(&idfile, grmname, "r")) {
+  if (fopen_checked(grmname, "r", &idfile)) {
     goto rel_cutoff_batch_ret_OPEN_FAIL;
   }
 
   memcpy(outname_end, ".grm.id", 8);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto rel_cutoff_batch_ret_OPEN_FAIL;
   }
 
   for (sample_idx = 0; sample_idx < sample_ct;) {
-    if (fgets(tbuf, MAXLINELEN, idfile) == NULL) {
+    if (fgets(g_textbuf, MAXLINELEN, idfile) == NULL) {
       goto rel_cutoff_batch_ret_READ_FAIL;
     }
-    if (is_eoln_kns(*(skip_initial_spaces(tbuf)))) {
+    if (is_eoln_kns(*(skip_initial_spaces(g_textbuf)))) {
       continue;
     }
     if (rel_ct_arr[sample_idx] != -1) {
-      if (fputs_checked(tbuf, outfile)) {
+      if (fputs_checked(g_textbuf, outfile)) {
 	goto rel_cutoff_batch_ret_WRITE_FAIL;
       }
     }
@@ -6255,24 +6250,22 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
   if (rel_calc_type & (REL_CALC_GRM | REL_CALC_GRM_BIN)) {
     if (load_grm_bin) {
       memcpy(grmname_end, ".grm.bin", 9);
-      if (fopen_checked(&in_binfile, grmname, "rb")) {
+      if (fopen_checked(grmname, FOPEN_RB, &in_binfile)) {
 	goto rel_cutoff_batch_ret_OPEN_FAIL;
       }
       g_rcb_in_binfile = in_binfile;
       memcpy(grmname_end, ".grm.N.bin", 11);
-      if (fopen_checked(&in_bin_nfile, grmname, "rb")) {
+      if (fopen_checked(grmname, FOPEN_RB, &in_bin_nfile)) {
 	goto rel_cutoff_batch_ret_OPEN_FAIL;
       }
       g_rcb_in_bin_nfile = in_bin_nfile;
     } else {
       memcpy(grmname_end, ".grm.gz", 8);
-      if (gzopen_checked(&cur_gzfile, grmname, "rb")) {
-	goto rel_cutoff_batch_ret_OPEN_FAIL;
+      retval = gzopen_read_checked(grmname, &cur_gzfile);
+      if (retval) {
+	goto rel_cutoff_batch_ret_1;
       }
       g_rcb_cur_gzfile = cur_gzfile;
-      if (gzbuffer(cur_gzfile, 131072)) {
-	goto rel_cutoff_batch_ret_NOMEM;
-      }
     }
     fputs("Rewriting matrix... 0%", stdout);
     fflush(stdout);
@@ -6317,11 +6310,11 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
       progress = 0;
       hundredth = 1 + ((((uint64_t)sample_ct) * (sample_ct - 1)) / 200);
       memcpy(outname_end, ".grm.N.bin", 11);
-      if (fopen_checked(&out_bin_nfile, outname, "wb")) {
+      if (fopen_checked(outname, FOPEN_WB, &out_bin_nfile)) {
 	goto rel_cutoff_batch_ret_OPEN_FAIL;
       }
       memcpy(outname_end, ".grm.bin", 9);
-      if (fopen_checked(&outfile, outname, "wb")) {
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
 	goto rel_cutoff_batch_ret_OPEN_FAIL;
       }
       while (row < sample_ct) {
@@ -6331,7 +6324,7 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
 	    fseeko(in_bin_nfile, (row + 1) * sizeof(float), SEEK_CUR);
 	  } else {
 	    for (uii = 0; uii <= row; uii++) {
-	      gzgets(cur_gzfile, tbuf, MAXLINELEN);
+	      gzgets(cur_gzfile, g_textbuf, MAXLINELEN);
 	    }
 	  }
 	} else {
@@ -6358,9 +6351,9 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
 	    }
 	  } else {
 	    while (col <= row) {
-	      gzgets(cur_gzfile, tbuf, MAXLINELEN);
+	      gzgets(cur_gzfile, g_textbuf, MAXLINELEN);
 	      if (rel_ct_arr[col++] != -1) {
-		bufptr = next_token_mult(tbuf, 2);
+		bufptr = next_token_mult(g_textbuf, 2);
 		if (scan_float(bufptr, &fxx)) {
 		  goto rel_cutoff_batch_ret_INVALID_FORMAT_GENERIC;
 		}
@@ -6425,7 +6418,7 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
   fclose_cond(outfile);
   fclose_cond(out_bin_nfile);
   gzclose_cond(cur_gzfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -6453,13 +6446,13 @@ uint32_t calc_rel_tri_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, *dist_ptr++, '\t');
+      sptr_cur = dtoa_gx(*dist_ptr++, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto calc_rel_tri_emitn_ret;
       }
     }
-    sptr_cur = double_g_writex(sptr_cur, *ibc_ptr++, '\n');
+    sptr_cur = dtoa_gx(*ibc_ptr++, '\n', sptr_cur);
     sample1idx++;
     if ((((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) >= hundredth * pct) {
       pct = (((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) / hundredth;
@@ -6492,14 +6485,14 @@ uint32_t calc_rel_sq0_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   uintptr_t ulii;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, *dist_ptr++, '\t');
+      sptr_cur = dtoa_gx(*dist_ptr++, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto calc_rel_sq0_emitn_ret;
       }
     }
     if (sample2idx == sample1idx) {
-      sptr_cur = double_g_write(sptr_cur, *ibc_ptr++);
+      sptr_cur = dtoa_g(*ibc_ptr++, sptr_cur);
       sample2idx++;
     }
     if (sptr_cur >= readbuf_end) {
@@ -6546,19 +6539,19 @@ uint32_t calc_rel_sq_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   uint32_t pct = g_pct;
   while (sample1idx < max_sample1idx) {
     while (sample2idx < sample1idx) {
-      sptr_cur = double_g_writex(sptr_cur, *dist_ptr++, '\t');
+      sptr_cur = dtoa_gx(*dist_ptr++, '\t', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto calc_rel_sq_emitn_ret;
       }
     }
     if (sample2idx == sample1idx) {
-      sptr_cur = double_g_write(sptr_cur, *ibc_ptr++);
+      sptr_cur = dtoa_g(*ibc_ptr++, sptr_cur);
       sample2idx++;
     }
     while (sample2idx < sample_ct) {
-      *sptr_cur = '\t';
-      sptr_cur = double_g_write(&(sptr_cur[1]), rel_dists[tri_coord_no_diag(sample1idx, sample2idx)]);
+      *sptr_cur++ = '\t';
+      sptr_cur = dtoa_g(rel_dists[tri_coord_no_diag(sample1idx, sample2idx)], sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto calc_rel_sq_emitn_ret;
@@ -6602,16 +6595,22 @@ uint32_t calc_rel_grm_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   uint32_t uii;
   while (sample1idx < max_sample1idx) {
     uii = marker_ct - sample_missing_unwt[sample1idx];
-    wbuf_end = uint32_writex(wbuf, sample1idx + 1, '\t');
+    wbuf_end = uint32toa_x(sample1idx + 1, '\t', wbuf);
     wbuf_len = (uintptr_t)(wbuf_end - wbuf);
     while (sample2idx < sample1idx) {
-      sptr_cur = double_e_writex(uint32_writex(uint32_writex(memcpya(sptr_cur, wbuf, wbuf_len), sample2idx + 1, '\t'), (uii - sample_missing_unwt[sample2idx]) + (*mdeptr++), '\t'), *dist_ptr++, '\n');
+      sptr_cur = memcpya(sptr_cur, wbuf, wbuf_len);
+      sptr_cur = uint32toa_x(sample2idx + 1, '\t', sptr_cur);
+      sptr_cur = uint32toa_x((uii - sample_missing_unwt[sample2idx]) + (*mdeptr++), '\t', sptr_cur);
+      sptr_cur = dtoa_ex(*dist_ptr++, '\n', sptr_cur);
       sample2idx++;
       if (sptr_cur >= readbuf_end) {
 	goto calc_rel_grm_emitn_ret;
       }
     }
-    sptr_cur = double_e_writex(uint32_writex(uint32_writex(memcpya(sptr_cur, wbuf, wbuf_len), ++sample1idx, '\t'), uii, '\t'), *ibc_ptr++, '\n');
+    sptr_cur = memcpya(sptr_cur, wbuf, wbuf_len);
+    sptr_cur = uint32toa_x(++sample1idx, '\t', sptr_cur);
+    sptr_cur = uint32toa_x(uii, '\t', sptr_cur);
+    sptr_cur = dtoa_ex(*ibc_ptr++, '\n', sptr_cur);
     if ((((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) >= hundredth * pct) {
       pct = (((uint64_t)sample1idx) * (sample1idx + 1) / 2 - start_offset) / hundredth;
       printf("\rWriting... %u%%", pct++);
@@ -6676,17 +6675,17 @@ void copy_set_allele_freqs(uintptr_t marker_uidx, uintptr_t* marker_exclude, uin
 }
 
 int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t noheader, uint32_t conditional_alloc_exclude, uintptr_t** marker_exclude_ptr, uint32_t* marker_ct_ptr, double** main_weights_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t line_idx = 0;
-  uintptr_t topsize = 0;
 
   // special case: weight-0 assignment effectively doesn't exist, but we still
   // want to check for repeated IDs there.
   uint32_t zcount = 0;
 
   int32_t retval = 0;
-  unsigned char* wkspace_mark;
   uintptr_t* marker_include;
   double* main_weights_tmp;
   double* dptr;
@@ -6698,33 +6697,25 @@ int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_
   uint32_t marker_idx;
   uint32_t idlen;
   uint32_t marker_ct;
-  marker_include = (uintptr_t*)top_alloc(&topsize, unfiltered_marker_ctl * sizeof(intptr_t));
-  if (!marker_include) {
+  if (bigstack_end_calloc_ul(unfiltered_marker_ctl, &marker_include) ||
+      bigstack_end_alloc_d(unfiltered_marker_ct, &main_weights_tmp)) {
     goto load_distance_wts_ret_NOMEM;
   }
-  fill_ulong_zero(marker_include, unfiltered_marker_ctl);
-  main_weights_tmp = (double*)top_alloc(&topsize, unfiltered_marker_ct * sizeof(double));
-  if (!main_weights_tmp) {
-    goto load_distance_wts_ret_NOMEM;
-  }
-  wkspace_left -= topsize;
-  wkspace_mark = wkspace_base;
-  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, *marker_exclude_ptr, *marker_ct_ptr, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
-  wkspace_left += topsize;
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, *marker_exclude_ptr, *marker_ct_ptr, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
   if (retval) {
     goto load_distance_wts_ret_1;
   }
-  if (fopen_checked(&infile, distance_wts_fname, "r")) {
+  if (fopen_checked(distance_wts_fname, "r", &infile)) {
     goto load_distance_wts_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, distance_wts_fname);
       goto load_distance_wts_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -6743,10 +6734,10 @@ int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_
       LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --distance-wts file.\n", bufptr);
       goto load_distance_wts_ret_INVALID_FORMAT_2;
     }
-    set_bit(marker_include, marker_uidx);
+    set_bit(marker_uidx, marker_include);
     bufptr = skip_initial_spaces(&(bufptr[idlen]));
     if (is_eoln_kns(*bufptr)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --distance-wts file has fewer tokens than expected.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --distance-wts file has fewer tokens than expected.\n", line_idx);
       goto load_distance_wts_ret_INVALID_FORMAT_2;
     }
     if (scan_double(bufptr, &dxx)) {
@@ -6763,26 +6754,24 @@ int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_
   if (!feof(infile)) {
     goto load_distance_wts_ret_READ_FAIL;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   marker_ct = popcount_longs(marker_include, unfiltered_marker_ctl) - zcount;
   if (!marker_ct) {
     logerrprint("Error: No valid nonzero entries in --distance-wts file.\n");
     goto load_distance_wts_ret_INVALID_FORMAT;
   }
-  wkspace_left -= topsize;
   if ((marker_ct != (*marker_ct_ptr))) {
     if (conditional_alloc_exclude) {
-      if (wkspace_alloc_ul_checked(marker_exclude_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
-	goto load_distance_wts_ret_NOMEM2;
+      if (bigstack_alloc_ul(unfiltered_marker_ctl, marker_exclude_ptr)) {
+	goto load_distance_wts_ret_NOMEM;
       }
     }
-    bitfield_exclude_to_include(marker_include, *marker_exclude_ptr, unfiltered_marker_ct);
+    bitarr_invert_copy(marker_include, unfiltered_marker_ct, *marker_exclude_ptr);
     *marker_ct_ptr = marker_ct;
   }
-  if (wkspace_alloc_d_checked(main_weights_ptr, marker_ct * sizeof(double))) {
-    goto load_distance_wts_ret_NOMEM2;
+  if (bigstack_alloc_d(marker_ct, main_weights_ptr)) {
+    goto load_distance_wts_ret_NOMEM;
   }
-  wkspace_left += topsize;
   dptr = *main_weights_ptr;
   *marker_ct_ptr = marker_ct;
   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++) {
@@ -6793,10 +6782,7 @@ int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_
       marker_idx++;
     }
   }
-  // topsize = 0;
   while (0) {
-  load_distance_wts_ret_NOMEM2:
-    wkspace_left += topsize;
   load_distance_wts_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -6807,7 +6793,7 @@ int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_
     retval = RET_READ_FAIL;
     break;
   load_distance_wts_ret_INVALID_WEIGHT:
-    sprintf(logbuf, "Error: Invalid weight on line %" PRIuPTR " of --distance-wts file.\n", line_idx);
+    sprintf(g_logbuf, "Error: Invalid weight on line %" PRIuPTR " of --distance-wts file.\n", line_idx);
   load_distance_wts_ret_INVALID_FORMAT_2:
     logerrprintb();
   load_distance_wts_ret_INVALID_FORMAT:
@@ -6815,12 +6801,13 @@ int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_
     break;
   }
  load_distance_wts_ret_1:
+  bigstack_end_reset(bigstack_end_mark);
   fclose_cond(infile);
   return retval;
 }
 
 int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* distance_wts_fname, uint32_t distance_wts_noheader, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t* marker_reverse, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t sample_ct = unfiltered_sample_ct - (*sample_exclude_ct_ptr);
   uintptr_t marker_uidx = 0;
@@ -6894,26 +6881,24 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
     }
   }
   // currently must be bottom allocation, since plink() will free it
-  if (wkspace_alloc_ui_checked(&sample_missing_unwt, sample_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_ui(sample_ct, &sample_missing_unwt)) {
     goto calc_rel_ret_NOMEM;
   }
   g_sample_missing_unwt = sample_missing_unwt;
-  fill_int_zero((int32_t*)sample_missing_unwt, sample_ct);
   g_sample_ct = sample_ct;
   if (dist_thread_ct > sample_ct / 2) {
     dist_thread_ct = sample_ct / 2;
   }
-  triangle_fill(g_thread_start, sample_ct, dist_thread_ct, parallel_idx, parallel_tot, 1, 1);
+  triangle_fill(sample_ct, dist_thread_ct, parallel_idx, parallel_tot, 1, 1, g_thread_start);
   if (calculation_type & CALC_IBC) {
     uii = sample_ct * 3;
   } else {
     uii = sample_ct;
   }
-  if (wkspace_alloc_d_checked(rel_ibc_ptr, uii * sizeof(double))) {
+  if (bigstack_calloc_d(uii, rel_ibc_ptr)) {
     goto calc_rel_ret_NOMEM;
   }
   rel_ibc = *rel_ibc_ptr;
-  fill_double_zero(rel_ibc, uii);
   if (rel_req) {
     llxx = g_thread_start[dist_thread_ct];
     llxx = ((llxx * (llxx - 1)) - (int64_t)g_thread_start[0] * (g_thread_start[0] - 1)) / 2;
@@ -6922,34 +6907,31 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       // CALC_UNRELATED_HERITABILITY, positioning the missingness matrix here
       // will let us avoid recalculating it if --distance-matrix or --matrix is
       // requested
-      if (wkspace_alloc_ui_checked(&g_missing_dbl_excluded, llxx * sizeof(int32_t))) {
+      if (bigstack_calloc_ui(llxx, &g_missing_dbl_excluded)) {
 	goto calc_rel_ret_NOMEM;
       }
-      fill_int_zero((int32_t*)g_missing_dbl_excluded, llxx);
     }
-    if (wkspace_alloc_d_checked(&rel_dists, llxx * sizeof(double))) {
+    if (bigstack_calloc_d(llxx, &rel_dists)) {
       goto calc_rel_ret_NOMEM;
     }
     g_rel_dists = rel_dists;
-    fill_double_zero(rel_dists, llxx);
   }
-  wkspace_mark = wkspace_base;
-  // stack allocations after this point are freed normally
+  bigstack_mark = g_bigstack_base;
+  // bigstack allocations after this point are freed normally
   if (rel_req && (!g_missing_dbl_excluded)) {
-    if (wkspace_alloc_ui_checked(&g_missing_dbl_excluded, llxx * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(llxx, &g_missing_dbl_excluded)) {
       goto calc_rel_ret_NOMEM;
     }
-    fill_int_zero((int32_t*)g_missing_dbl_excluded, llxx);
   }
   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
     goto calc_rel_ret_READ_FAIL;
   }
-  if (wkspace_alloc_ul_checked(&geno, sample_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_uc_checked(&gptr, MULTIPLEX_REL * unfiltered_sample_ct4) ||
-      wkspace_alloc_ul_checked(&masks, sample_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&subset_weights, 2048 * BITCT * sizeof(double)) ||
-      wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+  if (bigstack_alloc_ul(sample_ct, &geno) ||
+      bigstack_alloc_ul(sample_ct, &mmasks) ||
+      bigstack_alloc_uc(MULTIPLEX_REL * unfiltered_sample_ct4, &gptr) ||
+      bigstack_alloc_ul(sample_ct, &masks) ||
+      bigstack_alloc_d(2048 * BITCT, &subset_weights) ||
+      bigstack_alloc_uc(262144, &overflow_buf)) {
     goto calc_rel_ret_NOMEM;
   }
   g_geno = (unsigned char*)geno;
@@ -7104,7 +7086,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
 
   if (calculation_type & CALC_IBC) {
     strcpy(outname_end, ".ibc");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto calc_rel_ret_OPEN_FAIL;
     }
     dptr2 = rel_ibc;
@@ -7120,10 +7102,10 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       sample_id = (char*)memchr(fam_id, '\t', max_sample_id_len);
       wptr = memcpyax(wbuf, fam_id, (uintptr_t)(sample_id - fam_id), '\t');
       wptr = strcpyax(wptr, &(sample_id[1]), '\t');
-      wptr = uint32_writex(wptr, marker_ct - sample_missing_unwt[sample_idx], '\t');
-      wptr = double_g_writex(wptr, *dptr3++ - 1.0, '\t');
-      wptr = double_g_writex(wptr, *dptr4++ - 1.0, '\t');
-      wptr = double_g_writex(wptr, *dptr2++ - 1.0, '\n');
+      wptr = uint32toa_x(marker_ct - sample_missing_unwt[sample_idx], '\t', wptr);
+      wptr = dtoa_gx(*dptr3++ - 1.0, '\t', wptr);
+      wptr = dtoa_gx(*dptr4++ - 1.0, '\t', wptr);
+      wptr = dtoa_gx(*dptr2++ - 1.0, '\n', wptr);
 
       if (fwrite_checked(wbuf, wptr - wbuf, outfile)) {
 	goto calc_rel_ret_WRITE_FAIL;
@@ -7163,7 +7145,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       if (parallel_tot > 1) {
 	sprintf(&(outname_end[8]), ".%u", parallel_idx + 1);
       }
-      if (fopen_checked(&outfile, outname, "wb")) {
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
 	goto calc_rel_ret_OPEN_FAIL;
       }
       for (sample_idx = min_sample; sample_idx < max_parallel_sample; sample_idx++) {
@@ -7206,17 +7188,17 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       memcpy(outname_end, ".grm.N.bin", 11);
       if (parallel_tot > 1) {
 	outname_end[10] = '.';
-	uint32_writex(&(outname_end[11]), parallel_idx + 1, '\0');
+	uint32toa_x(parallel_idx + 1, '\0', &(outname_end[11]));
       }
-      if (fopen_checked(&out_bin_nfile, outname, "wb")) {
+      if (fopen_checked(outname, FOPEN_WB, &out_bin_nfile)) {
 	goto calc_rel_ret_OPEN_FAIL;
       }
       memcpy(outname_end, ".grm.bin", 9);
       if (parallel_tot > 1) {
 	outname_end[8] = '.';
-	uint32_writex(&(outname_end[9]), parallel_idx + 1, '\0');
+	uint32toa_x(parallel_idx + 1, '\0', &(outname_end[9]));
       }
-      if (fopen_checked(&outfile, outname, "wb")) {
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
 	goto calc_rel_ret_OPEN_FAIL;
       }
       mdeptr = g_missing_dbl_excluded;
@@ -7256,7 +7238,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       if (parallel_tot > 1) {
 	sprintf(&(outname_end[8]), ".%u", parallel_idx + 1);
       }
-      if (fopen_checked(&outfile, outname, "wb")) {
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
 	goto calc_rel_ret_OPEN_FAIL;
       }
       for (sample_idx = min_sample; sample_idx < max_parallel_sample; sample_idx++) {
@@ -7388,7 +7370,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
     }
     putchar('\r');
     if (!parallel_idx) {
-      wptr = strcpya(logbuf, "Relationship matrix ");
+      wptr = strcpya(g_logbuf, "Relationship matrix ");
       if (parallel_tot > 1) {
 	wptr = strcpya(wptr, "component ");
       }
@@ -7401,9 +7383,9 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       }
       sprintf(wptr, " , and IDs written to %s .\n", outname);
     } else {
-      sprintf(logbuf, "Relationship matrix component written to %s .\n", outname);
+      sprintf(g_logbuf, "Relationship matrix component written to %s .\n", outname);
     }
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logprintb();
   }
   if (all_missing_warning) {
@@ -7435,7 +7417,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
     break;
   }
  calc_rel_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(out_bin_nfile);
   return retval;
@@ -7449,11 +7431,11 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
   // exist.  EIGENSOFT is not *that* hard to use.)
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t pca_sample_ctl2 = (pca_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t pca_sample_ctl2 = QUATERCT_TO_WORDCT(pca_sample_ct);
   uintptr_t proj_sample_ct = sample_ct - pca_sample_ct;
-  uintptr_t proj_sample_ctl2 = (proj_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t proj_sample_ctl2 = QUATERCT_TO_WORDCT(proj_sample_ct);
   uintptr_t final_mask = get_final_mask(pca_sample_ct);
   double nz = 0.0;
   double zz = -1.0;
@@ -7521,10 +7503,10 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
   if ((pc_ct > pca_sample_ct) || (pc_ct > marker_ct)) {
     if (pca_sample_ct <= marker_ct) {
       pc_ct = pca_sample_ct;
-      sprintf(logbuf, "Warning: calculating %u PCs, since there are only %u samples.\n", pc_ct, pc_ct);
+      sprintf(g_logbuf, "Warning: calculating %u PCs, since there are only %u samples.\n", pc_ct, pc_ct);
     } else {
       pc_ct = marker_ct;
-      sprintf(logbuf, "Warning: calculating %u PCs, since there are only %u autosomal markers.\n", pc_ct, pc_ct);
+      sprintf(g_logbuf, "Warning: calculating %u PCs, since there are only %u autosomal markers.\n", pc_ct, pc_ct);
     }
     logerrprintb();
   }
@@ -7533,9 +7515,9 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
     ulii = sample_ct * pc_ct;
   }
   rel_dists = g_rel_dists;
-  wkspace_reset(rel_dists);
-  if (wkspace_alloc_d_checked(&rel_dists, ulii * sizeof(double)) ||
-      wkspace_alloc_d_checked(&main_matrix, pca_sample_ct * pca_sample_ct * sizeof(double))) {
+  bigstack_reset(rel_dists);
+  if (bigstack_alloc_d(ulii, &rel_dists) ||
+      bigstack_alloc_d(pca_sample_ct * pca_sample_ct, &main_matrix)) {
     goto calc_pca_ret_NOMEM;
   }
   ulii = ((pca_sample_ct - 1) * (pca_sample_ct - 2)) >> 1;
@@ -7558,13 +7540,11 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
   mdim = ulii;
   i2 = mdim;
   i1 = i2 + 1 - pc_ct;
-  if (wkspace_alloc_d_checked(&out_w, pc_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&out_z, pc_ct * ulii * sizeof(double))) {
+  if (bigstack_calloc_d(pc_ct, &out_w) ||
+      bigstack_calloc_d(pc_ct * ulii, &out_z)) {
     goto calc_pca_ret_NOMEM;
   }
-  fill_double_zero(out_w, pc_ct);
-  fill_double_zero(out_z, pc_ct * ulii);
-  isuppz = (__CLPK_integer*)wkspace_alloc(2 * pc_ct * sizeof(__CLPK_integer));
+  isuppz = (__CLPK_integer*)bigstack_alloc(2 * pc_ct * sizeof(__CLPK_integer));
   if (!isuppz) {
     goto calc_pca_ret_NOMEM;
   }
@@ -7573,15 +7553,14 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 
   dsyevr_(&jobz, &range, &uplo, &mdim, main_matrix, &mdim, &nz, &nz, &i1, &i2, &zz, &out_m, out_w, out_z, &ldz, isuppz, &optim_lwork, &lwork, &optim_liwork, &liwork, &info);
   lwork = (int32_t)optim_lwork;
-  if (wkspace_alloc_d_checked(&work, lwork * sizeof(double))) {
+  if (bigstack_calloc_d(lwork, &work)) {
     goto calc_pca_ret_NOMEM;
   }
   liwork = optim_liwork;
-  iwork = (__CLPK_integer*)wkspace_alloc(liwork * sizeof(__CLPK_integer));
+  iwork = (__CLPK_integer*)bigstack_alloc(liwork * sizeof(__CLPK_integer));
   if (!iwork) {
     goto calc_pca_ret_NOMEM;
   }
-  fill_double_zero(work, lwork);
   fill_int_zero((int32_t*)iwork, liwork * (sizeof(__CLPK_integer) / sizeof(int32_t)));
   dsyevr_(&jobz, &range, &uplo, &mdim, main_matrix, &mdim, &nz, &nz, &i1, &i2, &zz, &out_m, out_w, out_z, &ldz, isuppz, work, &lwork, iwork, &liwork, &info);
 
@@ -7597,18 +7576,17 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
       *dptr++ = dptr2[pc_idx * pca_sample_ct];
     }
   }
-  wkspace_reset(out_z);
+  bigstack_reset(out_z);
   if (var_wts || proj_sample_ct) {
-    if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-        wkspace_alloc_ul_checked(&loadbuf, pca_sample_ctl2 * sizeof(intptr_t)) ||
-        wkspace_alloc_d_checked(&cur_var_wts, pc_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&pc_sums, pc_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&eigval_inv_sqrts, pc_ct * sizeof(double))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+        bigstack_alloc_ul(pca_sample_ctl2, &loadbuf) ||
+        bigstack_alloc_d(pc_ct, &cur_var_wts) ||
+        bigstack_calloc_d(pc_ct, &pc_sums) ||
+        bigstack_alloc_d(pc_ct, &eigval_inv_sqrts)) {
       goto calc_pca_ret_NOMEM;
     }
     loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
     loadbuf[pca_sample_ctl2 - 1] = 0;
-    fill_double_zero(pc_sums, pc_ct);
     dptr = sample_loadings;
     for (sample_idx = 0; sample_idx < pca_sample_ct; sample_idx++) {
       for (pc_idx = 0; pc_idx < pc_ct; pc_idx++) {
@@ -7619,37 +7597,34 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
       eigval_inv_sqrts[pc_idx] = sqrt(1 / out_w[pc_idx]);
     }
     if (proj_sample_ct) {
-      if (wkspace_alloc_ul_checked(&sample_exclude_proj, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-          wkspace_alloc_ul_checked(&loadbuf_proj, proj_sample_ctl2 * sizeof(intptr_t)) ||
-          wkspace_alloc_d_checked(&proj_sample_loadings, proj_sample_ct * pc_ct * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&proj_allhom_wts, pc_ct * sizeof(double)) ||
-          wkspace_alloc_ui_checked(&proj_missing_cts, proj_sample_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_ul(unfiltered_sample_ctl, &sample_exclude_proj) ||
+          bigstack_alloc_ul(proj_sample_ctl2, &loadbuf_proj) ||
+          bigstack_calloc_d(proj_sample_ct * pc_ct, &proj_sample_loadings) ||
+	  bigstack_calloc_d(pc_ct, &proj_allhom_wts) ||
+          bigstack_calloc_ui(proj_sample_ct, &proj_missing_cts)) {
 	goto calc_pca_ret_NOMEM;
       }
       memcpy(sample_exclude_proj, sample_exclude, unfiltered_sample_ctl * sizeof(intptr_t));
-      bitfield_ornot(sample_exclude_proj, pca_sample_exclude, unfiltered_sample_ctl);
-      zero_trailing_bits(sample_exclude_proj, unfiltered_sample_ct);
+      bitvec_ornot(pca_sample_exclude, unfiltered_sample_ctl, sample_exclude_proj);
+      zero_trailing_bits(unfiltered_sample_ct, sample_exclude_proj);
       loadbuf_proj[proj_sample_ctl2 - 1] = 0;
-      fill_double_zero(proj_sample_loadings, proj_sample_ct * pc_ct);
-      fill_double_zero(proj_allhom_wts, pc_ct);
-      fill_uint_zero(proj_missing_cts, proj_sample_ct);
     }
     if (var_wts) {
       memcpy(outname_end, ".eigenvec.var", 14);
-      if (fopen_checked(&outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile)) {
 	goto calc_pca_ret_OPEN_FAIL;
       }
       if (write_headers) {
-	wptr = memcpyl3a(tbuf, "CHR");
+	wptr = memcpyl3a(g_textbuf, "CHR");
 	*wptr++ = delimiter;
 	wptr = memcpyl3a(wptr, "VAR");
 	for (pc_idx = 1; pc_idx <= pc_ct; pc_idx++) {
 	  *wptr++ = delimiter;
 	  wptr = memcpya(wptr, "PC", 2);
-	  wptr = uint32_write(wptr, pc_idx);
+	  wptr = uint32toa(pc_idx, wptr);
 	}
 	*wptr++ = '\n';
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto calc_pca_ret_WRITE_FAIL;
 	}
       }
@@ -7662,7 +7637,7 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
       }
       marker_uidx = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx];
       chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
-      wptr_start = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+      wptr_start = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
       *wptr_start++ = delimiter;
       if (marker_uidx < chrom_end) {
 	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
@@ -7679,8 +7654,9 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 	    }
 	  }
 	  // if projecting, loadbuf_raw contains raw data, so we can just
-	  // follow up with collapse_copy_2bitarr() and reverse_loadbuf()
-	  if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pca_sample_ct, pca_sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	  // follow up with copy_quaterarr_nonempty_subset_excl() and
+	  // reverse_loadbuf()
+	  if (load_and_collapse(unfiltered_sample_ct, pca_sample_ct, pca_sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf)) {
 	    goto calc_pca_ret_READ_FAIL;
 	  }
 	  // Variant weight matrix = X^T * S * D^{-1/2}, where X^T is the
@@ -7720,9 +7696,9 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 	    cur_var_wts[pc_idx] *= eigval_inv_sqrts[pc_idx];
 	  }
 	  if (proj_sample_ct) {
-	    collapse_copy_2bitarr(loadbuf_raw, loadbuf_proj, unfiltered_sample_ct, proj_sample_ct, sample_exclude_proj);
+	    copy_quaterarr_nonempty_subset_excl(loadbuf_raw, sample_exclude_proj, unfiltered_sample_ct, proj_sample_ct, loadbuf_proj);
 	    if (IS_SET(marker_reverse, marker_uidx)) {
-	      reverse_loadbuf((unsigned char*)loadbuf_proj, proj_sample_ct);
+	      reverse_loadbuf(proj_sample_ct, (unsigned char*)loadbuf_proj);
 	    }
 	    ulptr = loadbuf_proj;
 	    sample_uidx = 0;
@@ -7762,10 +7738,10 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 	    dptr2 = cur_var_wts;
 	    do {
 	      *wptr++ = delimiter;
-	      wptr = double_g_write(wptr, *(--dptr));
+	      wptr = dtoa_g(*(--dptr), wptr);
 	    } while (dptr > dptr2);
 	    *wptr++ = '\n';
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto calc_pca_ret_WRITE_FAIL;
 	    }
 	  }
@@ -7809,36 +7785,36 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
   }
 
   memcpy(outname_end, ".eigenval", 10);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto calc_pca_ret_OPEN_FAIL;
   }
-  wptr = tbuf;
+  wptr = g_textbuf;
   pc_idx = pc_ct;
   do {
     pc_idx--;
-    wptr = double_g_writex(wptr, out_w[pc_idx], '\n');
+    wptr = dtoa_gx(out_w[pc_idx], '\n', wptr);
   } while (pc_idx);
-  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
     goto calc_pca_ret_WRITE_FAIL;
   }
   if (fclose_null(&outfile)) {
     goto calc_pca_ret_WRITE_FAIL;
   }
   memcpy(outname_end, ".eigenvec", 10);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto calc_pca_ret_OPEN_FAIL;
   }
   if (write_headers) {
-    wptr = memcpyl3a(tbuf, "FID");
+    wptr = memcpyl3a(g_textbuf, "FID");
     *wptr++ = delimiter;
     wptr = memcpyl3a(wptr, "IID");
     for (pc_idx = 1; pc_idx <= pc_ct; pc_idx++) {
       *wptr++ = delimiter;
       wptr = memcpya(wptr, "PC", 2);
-      wptr = uint32_write(wptr, pc_idx);
+      wptr = uint32toa(pc_idx, wptr);
     }
     *wptr++ = '\n';
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto calc_pca_ret_WRITE_FAIL;
     }
   }
@@ -7846,10 +7822,10 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
     next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
     cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
     if (delimiter == '\t') {
-      wptr = strcpya(tbuf, cptr);
+      wptr = strcpya(g_textbuf, cptr);
     } else {
       wptr_start = (char*)memchr(cptr, '\t', max_sample_id_len);
-      wptr = memcpya(tbuf, cptr, wptr_start - cptr);
+      wptr = memcpya(g_textbuf, cptr, wptr_start - cptr);
       *wptr++ = ' ';
       wptr = strcpya(wptr, &(wptr_start[1]));
     }
@@ -7857,10 +7833,10 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
     dptr2 = &(sample_loadings[sample_idx * pc_ct]);
     do {
       *wptr++ = delimiter;
-      wptr = double_g_write(wptr, *(--dptr));
+      wptr = dtoa_g(*(--dptr), wptr);
     } while (dptr > dptr2);
     *wptr++ = '\n';
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto calc_pca_ret_WRITE_FAIL;
     }
   }
@@ -7900,7 +7876,7 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
   int32_t retval = 0;
   uintptr_t* marker_exclude = marker_exclude_orig;
   uint32_t* giptr = NULL;
-  unsigned char* wkspace_mark;
+  unsigned char* bigstack_mark;
   unsigned char* bedbuf;
   unsigned char* gptr;
   uint32_t* sample_missing_unwt;
@@ -7922,20 +7898,18 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
       dist_thread_ct = 1;
     }
   }
-  triangle_fill(g_thread_start, sample_ct, dist_thread_ct, 0, 1, 1, 1);
+  triangle_fill(sample_ct, dist_thread_ct, 0, 1, 1, 1, g_thread_start);
   llxx = g_thread_start[dist_thread_ct];
   llxx = (llxx * (llxx - 1)) / 2;
-  if (wkspace_alloc_ui_checked(&g_missing_dbl_excluded, llxx * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&sample_missing_unwt, sample_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_ui(llxx, &g_missing_dbl_excluded) ||
+      bigstack_calloc_ui(sample_ct, &sample_missing_unwt)) {
     goto calc_ibm_ret_NOMEM;
   }
-  fill_uint_zero(g_missing_dbl_excluded, llxx);
   g_sample_missing_unwt = sample_missing_unwt;
-  fill_uint_zero(sample_missing_unwt, sample_ct);
-  wkspace_mark = wkspace_base;
+  bigstack_mark = g_bigstack_base;
 
-  if (wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_uc_checked(&bedbuf, MULTIPLEX_DIST * unfiltered_sample_ct4)) {
+  if (bigstack_alloc_ul(sample_ct, &mmasks) ||
+      bigstack_alloc_uc(MULTIPLEX_DIST * unfiltered_sample_ct4, &bedbuf)) {
     goto calc_ibm_ret_NOMEM;
   }
   g_mmasks = mmasks;
@@ -7995,7 +7969,7 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
     fflush(stdout);
   } while (!is_last_block);
   putchar('\r');
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   while (0) {
   calc_ibm_ret_NOMEM:
     retval = RET_NOMEM;
@@ -8037,7 +8011,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   uintptr_t* masks;
   uintptr_t* mmasks;
   char* wptr;
-  unsigned char* wkspace_mark;
+  unsigned char* bigstack_mark;
   unsigned char* bedbuf;
   unsigned char* gptr;
   uintptr_t sample_uidx;
@@ -8074,7 +8048,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
       dist_thread_ct = 1;
     }
   }
-  triangle_fill(g_thread_start, sample_ct, dist_thread_ct, parallel_idx, parallel_tot, 1, 1);
+  triangle_fill(sample_ct, dist_thread_ct, parallel_idx, parallel_tot, 1, 1, g_thread_start);
   llxx = g_thread_start[dist_thread_ct];
   llxx = ((llxx * (llxx - 1)) - (int64_t)g_thread_start[0] * (g_thread_start[0] - 1)) / 2;
   dists_alloc = llxx * sizeof(double);
@@ -8084,13 +8058,11 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   }
 #endif
   if ((calculation_type & (CALC_PLINK1_DISTANCE_MATRIX | CALC_PLINK1_IBS_MATRIX)) || (dist_calc_type & DISTANCE_FLAT_MISSING)) {
-    if (wkspace_alloc_ui_checked(&g_missing_dbl_excluded, llxx * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&sample_missing_unwt, sample_ct * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(llxx, &g_missing_dbl_excluded) ||
+        bigstack_calloc_ui(sample_ct, &sample_missing_unwt)) {
       goto calc_distance_ret_NOMEM;
     }
     g_sample_missing_unwt = sample_missing_unwt;
-    fill_uint_zero(g_missing_dbl_excluded, llxx);
-    fill_uint_zero(sample_missing_unwt, sample_ct);
     unwt_needed = 1;
   } else {
     // defensive
@@ -8098,26 +8070,24 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   }
   // Additional + CACHELINE is to fix aliasing bug that shows up with -O2 in
   // some cases.
-  if (wkspace_alloc_d_checked(&g_dists, dists_alloc + CACHELINE)) {
+  if (bigstack_alloc_d(dists_alloc + CACHELINE_DBL, &g_dists)) {
     goto calc_distance_ret_NOMEM;
   }
   // stack allocations before this point must be freed by the caller.
-  wkspace_mark = wkspace_base;
+  bigstack_mark = g_bigstack_base;
   if (missing_wt_needed) {
-    if (wkspace_alloc_ui_checked(&g_missing_tot_weights, llxx * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&sample_missing, sample_ct * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(llxx, &g_missing_tot_weights) ||
+        bigstack_calloc_ui(sample_ct, &sample_missing)) {
       goto calc_distance_ret_NOMEM;
     }
     g_sample_missing = sample_missing;
-    fill_uint_zero(g_missing_tot_weights, llxx);
-    fill_uint_zero(sample_missing, sample_ct);
   } else {
     g_missing_tot_weights = NULL;
   }
 
   ujj = distance_wts_fname || (distance_exp != 0.0); // special weights?
   if (!ujj) {
-    g_idists = (int32_t*)((char*)wkspace_mark - CACHEALIGN(llxx * sizeof(int32_t)));
+    g_idists = (int32_t*)((char*)bigstack_mark - round_up_pow2(llxx * sizeof(int32_t), CACHELINE));
     fill_int_zero(g_idists, llxx);
   } else {
     fill_double_zero(g_dists, llxx);
@@ -8137,21 +8107,14 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   }
 
   // stack allocations past this point are freed BEFORE results are written.
-  if (!ujj) {
-    masks = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
-  } else {
-    masks = (uintptr_t*)wkspace_alloc(sample_ct * sizeof(intptr_t));
-  }
-  if (!masks) {
-    goto calc_distance_ret_NOMEM;
-  }
-  if (wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(ujj? sample_ct : (sample_ct * (MULTIPLEX_2DIST / BITCT)), &masks) ||
+      bigstack_alloc_ul(sample_ct, &mmasks)) {
     goto calc_distance_ret_NOMEM;
   }
 
   // Load or compute nonuniform marker weighting scheme.
   if (distance_exp != 0.0) {
-    if (wkspace_alloc_d_checked(&main_weights, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ct, &main_weights)) {
       goto calc_distance_ret_NOMEM;
     }
     for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
@@ -8170,8 +8133,8 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   if (missing_wt_needed) {
     // hack: overwrite dist_missing_wts while populating dist_missing_wts_i.
     // CACHELINE padding added to reduce risk of an aliasing problem.
-    if (wkspace_alloc_ui_checked(&dist_missing_wts_i, CACHELINE) ||
-        wkspace_alloc_d_checked(&dist_missing_wts, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_ui(CACHELINE_INT32, &dist_missing_wts_i) ||
+        bigstack_alloc_d(marker_ct, &dist_missing_wts)) {
       goto calc_distance_ret_NOMEM;
     }
     dyy = 0.0; // raw weight sum
@@ -8221,10 +8184,10 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
 
   if (!main_weights) {
     multiplex = MULTIPLEX_DIST;
-    geno = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
+    bigstack_alloc_ul(sample_ct * (MULTIPLEX_2DIST / BITCT), &geno);
   } else {
     multiplex = MULTIPLEX_DIST_EXP;
-    geno = (uintptr_t*)wkspace_alloc(sample_ct * sizeof(intptr_t));
+    bigstack_alloc_ul(sample_ct, &geno);
   }
   if (!geno) {
     goto calc_distance_ret_NOMEM;
@@ -8233,16 +8196,16 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   g_masks = masks;
   g_mmasks = mmasks;
 
-  if (wkspace_alloc_uc_checked(&bedbuf, multiplex * unfiltered_sample_ct4)) {
+  if (bigstack_alloc_uc(multiplex * unfiltered_sample_ct4, &bedbuf)) {
     goto calc_distance_ret_NOMEM;
   }
   if (main_weights) {
 #ifdef __LP64__
-    if (wkspace_alloc_d_checked(&subset_weights, 45056 * sizeof(double))) {
+    if (bigstack_alloc_d(45056, &subset_weights)) {
       goto calc_distance_ret_NOMEM;
     }
 #else
-    if (wkspace_alloc_d_checked(&subset_weights, 32768 * sizeof(double))) {
+    if (bigstack_alloc_d(32768, &subset_weights)) {
       goto calc_distance_ret_NOMEM;
     }
     g_subset_weights_i = wtbuf;
@@ -8456,15 +8419,15 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   } while (!is_last_block);
   putchar('\r');
   logprint("Distance matrix calculation complete.\n");
-  wkspace_reset(masks);
+  bigstack_reset(masks);
   if (calculation_type & (CALC_PLINK1_DISTANCE_MATRIX | CALC_PLINK1_IBS_MATRIX)) {
-    if (wkspace_alloc_c_checked(&writebuf, 16 * sample_ct)) {
+    if (bigstack_alloc_c(16 * sample_ct, &writebuf)) {
       goto calc_distance_ret_NOMEM;
     }
   }
   if (calculation_type & CALC_PLINK1_DISTANCE_MATRIX) {
     strcpy(outname_end, ".mdist");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto calc_distance_ret_OPEN_FAIL;
     }
     iptr = g_idists;
@@ -8476,13 +8439,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
       uii = marker_ct - giptr2[sample_idx];
       wptr = writebuf;
       for (ujj = 0; ujj < sample_idx; ujj++) {
-	wptr = double_g_writex(wptr, ((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++))), ' ');
+	wptr = dtoa_gx(((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++))), ' ', wptr);
       }
       wptr = memcpya(wptr, "0 ", 2);
       giptr2++;
       for (ulii = sample_idx + 1; ulii < sample_ct; ulii++) {
 	uljj = tri_coord_no_diag(sample_idx, ulii);
-	wptr = double_g_writex(wptr, ((double)g_idists[uljj]) / (2 * (uii - (*giptr2++) + g_missing_dbl_excluded[uljj])), ' ');
+	wptr = dtoa_gx(((double)g_idists[uljj]) / (2 * (uii - (*giptr2++) + g_missing_dbl_excluded[uljj])), ' ', wptr);
       }
       *wptr++ = '\n';
       if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
@@ -8499,7 +8462,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     }
     putchar('\r');
     if (!parallel_idx) {
-      wptr = strcpya(logbuf, "Distances (proportions) written to ");
+      wptr = strcpya(g_logbuf, "Distances (proportions) written to ");
       wptr = strcpya(wptr, outname);
       strcpy(outname_end, ".mdist.id");
       retval = write_ids(outname, unfiltered_sample_ct, sample_exclude, sample_ids, max_sample_id_len);
@@ -8508,14 +8471,14 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
       }
       sprintf(wptr, " , and IDs to %s .\n", outname);
     } else {
-      sprintf(logbuf, "Distances (proportions) written to %s .\n", outname);
+      sprintf(g_logbuf, "Distances (proportions) written to %s .\n", outname);
     }
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logprintb();
   }
   if (calculation_type & CALC_PLINK1_IBS_MATRIX) {
     strcpy(outname_end, ".mibs");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto calc_distance_ret_OPEN_FAIL;
     }
     iptr = g_idists;
@@ -8526,13 +8489,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
       uii = marker_ct - giptr2[sample_idx];
       wptr = writebuf;
       for (ujj = 0; ujj < sample_idx; ujj++) {
-	wptr = double_g_writex(wptr, 1.0 - (((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++)))), ' ');
+	wptr = dtoa_gx(1.0 - (((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++)))), ' ', wptr);
       }
       wptr = memcpya(wptr, "1 ", 2);
       giptr2++;
       for (ulii = sample_idx + 1; ulii < sample_ct; ulii++) {
 	uljj = tri_coord_no_diag(sample_idx, ulii);
-	wptr = double_g_writex(wptr, 1.0 - (((double)g_idists[uljj]) / (2 * (uii - (*giptr2++) + g_missing_dbl_excluded[uljj]))), ' ');
+	wptr = dtoa_gx(1.0 - (((double)g_idists[uljj]) / (2 * (uii - (*giptr2++) + g_missing_dbl_excluded[uljj]))), ' ', wptr);
       }
       *wptr++ = '\n';
       if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
@@ -8645,7 +8608,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
       }
     }
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
 
   while (0) {
   calc_distance_ret_NOMEM:
@@ -8710,7 +8673,7 @@ void cluster_dist_multiply(uintptr_t sample_ct, uintptr_t cluster_ct, uint32_t*
   }
 }
 
-int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, char* read_dists_fname, char* read_dists_id_fname, char* read_genome_fname, char* outname, char* ou [...]
+int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, char* read_dists_fname, char* read_dists_id_fname, char* read_genome_fname, char* outname, char* ou [...]
   // --cluster and --neighbour.  They are handled by the same function because
   // they initially process the distance matrix/PPC test results in roughly the
   // same way, but we have removed the PLINK 1.07 requirement that --cluster be
@@ -8749,11 +8712,11 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
   double min_ibm = cp->min_ibm;
   double min_zx = 0.0;
   uint32_t ibm_constraint = (min_ibm != 0.0);
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   double sample_ct_recip = 1.0 / ((double)((intptr_t)sample_ct));
   uint32_t do_neighbor = (calculation_type / CALC_NEIGHBOR) & 1;
-  uint32_t is_group_avg = cp->modifier & CLUSTER_GROUP_AVG;
+  uint32_t is_group_avg = (cp->modifier / CLUSTER_GROUP_AVG) & 1;
   uint32_t is_mds_cluster = cp->modifier & CLUSTER_MDS;
   uint32_t is_old_tiebreaks = cp->modifier & CLUSTER_OLD_TIEBREAKS;
   uint32_t mds_fill_nonclust = (!is_mds_cluster) && mds_plot_dmatrix_copy;
@@ -8834,12 +8797,11 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     if (g_dists) {
       cluster_sorted_ibs = g_dists;
     } else {
-      if (wkspace_alloc_d_checked(&cluster_sorted_ibs, initial_triangle_size * sizeof(double))) {
+      if (bigstack_calloc_d(initial_triangle_size, &cluster_sorted_ibs)) {
         goto calc_cluster_neighbor_ret_NOMEM;
       }
-      fill_double_zero(cluster_sorted_ibs, initial_triangle_size);
     }
-    wkspace_mark_precluster = wkspace_base;
+    bigstack_mark_precluster = g_bigstack_base;
   }
   if (do_neighbor) {
     if (neighbor_n2 >= sample_ct) {
@@ -8865,7 +8827,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     }
     fill_double_zero(neighbor_quantiles, ulii);
   }
-  fill_ulong_zero(cluster_merge_prevented, (initial_triangle_size + (BITCT - 1)) / BITCT);
+  fill_ulong_zero(cluster_merge_prevented, BITCT_TO_WORDCT(initial_triangle_size));
   if ((min_ppc != 0.0) || genome_main || read_genome_fname) {
     if (do_neighbor && (min_ppc != 0.0)) {
       ppc_fail_counts = (uint32_t*)malloc(sample_ct * sizeof(int32_t));
@@ -8935,10 +8897,10 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 		ppc_fail_counts[sample_idx2] += 1;
 	      }
 	      if (!cluster_ct) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(sample_idx1, sample_idx2));
+		set_bit_ul(tri_coord_no_diag(sample_idx1, sample_idx2), cluster_merge_prevented);
 	      } else {
 		if (clidx1 != clidx2) {
-		  SET_BIT(cluster_merge_prevented, tcoord);
+		  SET_BIT(tcoord, cluster_merge_prevented);
 		} else if (!ppc_warning) {
 		  logerrprint("Warning: Initial cluster assignment violates PPC test constraint.\n");
 		  ppc_warning = 1;
@@ -8949,7 +8911,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	  ulii += 5;
 	}
       }
-      wkspace_reset(genome_main);
+      bigstack_reset(genome_main);
     }
   } else if (((!cluster_missing) || do_neighbor) && (!read_dists_fname)) {
     // calculate entire distance matrix, or use already-calculated matrix in
@@ -9011,7 +8973,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	  }
 	}
       }
-      wkspace_reset(g_dists);
+      bigstack_reset(g_dists);
     }
   }
   if (read_dists_fname) {
@@ -9027,7 +8989,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
   fill_idx_to_uidx(sample_exclude, unfiltered_sample_ct, sample_ct, sample_idx_to_uidx);
   if (do_neighbor) {
     memcpy(outname_end, ".nearest", 9);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto calc_cluster_neighbor_ret_OPEN_FAIL;
     }
     if (fputs_checked("         FID          IID     NN      MIN_DST            Z         FID2         IID2 ", outfile)) {
@@ -9054,7 +9016,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     for (sample_idx1 = 0; sample_idx1 < sample_ct; sample_idx1++) {
       fam_id = &(sample_ids[sample_idx_to_uidx[sample_idx1] * max_sample_id_len]);
       sample_id = (char*)memchr(fam_id, '\t', max_sample_id_len);
-      wptr_start = fw_strcpyn(12, (uint32_t)(sample_id - fam_id), fam_id, tbuf);
+      wptr_start = fw_strcpyn(12, (uint32_t)(sample_id - fam_id), fam_id, g_textbuf);
       *wptr_start++ = ' ';
       wptr_start = fw_strcpy(12, &(sample_id[1]), wptr_start);
       *wptr_start++ = ' ';
@@ -9062,11 +9024,11 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
         dyy = ((double)((int32_t)ppc_fail_counts[sample_idx1])) * dxx1;
       }
       for (ulii = 0; ulii < neighbor_row_ct; ulii++) {
-        wptr = uint32_writew6x(wptr_start, ulii + neighbor_n1, ' ');
+        wptr = uint32toa_w6x(ulii + neighbor_n1, ' ', wptr_start);
 	sample_idx2 = sample_idx1 + ulii * sample_ct;
         dxx = neighbor_quantiles[sample_idx2];
-	wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
-        wptr = double_g_writewx4x(wptr, (dxx - neighbor_quantile_means[ulii]) * neighbor_quantile_stdev_recips[ulii], 12, ' ');
+	wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
+        wptr = dtoa_g_wxp4x((dxx - neighbor_quantile_means[ulii]) * neighbor_quantile_stdev_recips[ulii], 12, ' ', wptr);
 	sample_idx2 = neighbor_qindices[sample_idx2];
         fam_id = &(sample_ids[sample_idx_to_uidx[sample_idx2] * max_sample_id_len]);
         sample_id = (char*)memchr(fam_id, '\t', max_sample_id_len);
@@ -9075,10 +9037,10 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
         wptr = fw_strcpy(12, &(sample_id[1]), wptr);
         *wptr++ = ' ';
 	if (min_ppc != 0.0) {
-          wptr = double_g_writewx4x(wptr, dyy, 12, ' ');
+          wptr = dtoa_g_wxp4x(dyy, 12, ' ', wptr);
 	}
         *wptr++ = '\n';
-        if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+        if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto calc_cluster_neighbor_ret_WRITE_FAIL;
 	}
       }
@@ -9103,12 +9065,12 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
         logprint("\n");
       }
       // N.B. this is still used until the end of the block
-      wkspace_reset(g_missing_dbl_excluded);
+      bigstack_reset(g_missing_dbl_excluded);
     }
     dxx1 = 1.0 / ((double)((int32_t)marker_ct));
     if (cluster_missing) {
       memcpy(outname_end, ".mdist.missing", 15);
-      if (fopen_checked(&outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile)) {
 	goto calc_cluster_neighbor_ret_OPEN_FAIL;
       }
       fputs("Writing IBM matrix... 0%    \b\b\b\b", stdout);
@@ -9132,11 +9094,11 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	    dxx = 1.0 - ((double)((int32_t)(uii + (*sample_missing_ptr++) - 2 * (*dbl_exclude_ptr++)))) * dxx1;
 	    if (cluster_missing) {
 	      *dptr++ = dxx;
-	      wptr = double_g_writex(tbuf, dxx, ' ');
-	      fwrite(tbuf, 1, wptr - tbuf, outfile);
+	      wptr = dtoa_gx(dxx, ' ', g_textbuf);
+	      fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	    }
 	    if (dxx < min_ibm) {
-	      set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(sample_idx2, sample_idx1));
+	      set_bit_ul(tri_coord_no_diag(sample_idx2, sample_idx1), cluster_merge_prevented);
 	    }
 	  }
 	} else {
@@ -9162,14 +9124,14 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 		  *dptr2 += dxx;
 		}
 	      }
-	      wptr = double_g_writex(tbuf, dxx, ' ');
-	      fwrite(tbuf, 1, wptr - tbuf, outfile);
+	      wptr = dtoa_gx(dxx, ' ', g_textbuf);
+	      fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	    }
 	    if (dxx < min_ibm) {
 	      if (clidx1 < clidx2) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	      } else if (clidx1 > clidx2) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx2, clidx1));
+		set_bit_ul(tri_coord_no_diag(clidx2, clidx1), cluster_merge_prevented);
 	      } else {
 		ibm_warning = 1;
 	      }
@@ -9184,11 +9146,11 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	    dbl_exclude_ptr = &(dbl_exclude_ptr[ulii - sample_idx2]);
 	    if (cluster_missing) {
 	      *dptr++ = dxx;
-	      wptr = double_g_writex(tbuf, dxx, ' ');
-	      fwrite(tbuf, 1, wptr - tbuf, outfile);
+	      wptr = dtoa_gx(dxx, ' ', g_textbuf);
+	      fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	    }
 	    if (dxx < min_ibm) {
-	      set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(sample_idx2, sample_idx1));
+	      set_bit_ul(tri_coord_no_diag(sample_idx2, sample_idx1), cluster_merge_prevented);
 	    }
 	  }
 	} else {
@@ -9215,14 +9177,14 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 		  *dptr2 += dxx;
 		}
 	      }
-	      wptr = double_g_writex(tbuf, dxx, ' ');
-	      fwrite(tbuf, 1, wptr - tbuf, outfile);
+	      wptr = dtoa_gx(dxx, ' ', g_textbuf);
+	      fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	    }
 	    if (dxx < min_ibm) {
 	      if (clidx1 < clidx2) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	      } else if (clidx1 > clidx2) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx2, clidx1));
+		set_bit_ul(tri_coord_no_diag(clidx2, clidx1), cluster_merge_prevented);
 	      } else {
 		ibm_warning = 1;
 	      }
@@ -9238,8 +9200,8 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	if (!genome_main) {
 	  for (sample_idx2 = sample_idx1 + 1; sample_idx2 < sample_ct; sample_idx2++) {
 	    dxx = 1.0 - ((double)((int32_t)(uii + (*(++sample_missing_ptr)) - 2 * missing_dbl_excluded[((sample_idx2 * (sample_idx2 - 1)) >> 1) + sample_idx1]))) * dxx1;
-	    wptr = double_g_writex(tbuf, dxx, ' ');
-	    fwrite(tbuf, 1, wptr - tbuf, outfile);
+	    wptr = dtoa_gx(dxx, ' ', g_textbuf);
+	    fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	  }
 	} else {
 	  // f(0) = 0
@@ -9250,8 +9212,8 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	  dbl_exclude_ptr = &(missing_dbl_excluded[sample_ct * sample_idx1 - ((sample_idx1 * (sample_idx1 + 1)) >> 1)]);
 	  for (sample_idx2 = sample_idx1 + 1; sample_idx2 < sample_ct; sample_idx2++) {
 	    dxx = 1.0 - ((double)((int32_t)(uii + (*(++sample_missing_ptr)) - 2 * (*dbl_exclude_ptr++)))) * dxx1;
-	    wptr = double_g_writex(tbuf, dxx, ' ');
-	    fwrite(tbuf, 1, wptr - tbuf, outfile);
+	    wptr = dtoa_gx(dxx, ' ', g_textbuf);
+	    fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	  }
 	}
 	if (putc_checked('\n', outfile)) {
@@ -9291,14 +9253,14 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     case_ct = popcount_longs(pheno_c, unfiltered_sample_ctl);
     ctrl_ct = sample_ct - case_ct;
     if ((cp->modifier & CLUSTER_CC) || (cp->max_cases < case_ct) || (cp->max_ctrls < ctrl_ct)) {
-      if (wkspace_alloc_ul_checked(&collapsed_pheno_c, sample_ctl * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(sample_ctl, &collapsed_pheno_c)) {
 	goto calc_cluster_neighbor_ret_NOMEM;
       }
       cur_cluster_case_cts = (uint32_t*)malloc(cur_cluster_ct * sizeof(int32_t));
       if (!cur_cluster_case_cts) {
 	goto calc_cluster_neighbor_ret_NOMEM;
       }
-      collapse_copy_bitarr(unfiltered_sample_ct, pheno_c, sample_exclude, sample_ct, collapsed_pheno_c);
+      copy_bitarr_subset_excl(pheno_c, sample_exclude, unfiltered_sample_ct, sample_ct, collapsed_pheno_c);
       fill_uint_zero(cur_cluster_case_cts, cur_cluster_ct);
       if (!cluster_ct) {
 	for (sample_idx1 = 0; sample_idx1 < sample_ct; sample_idx1++) {
@@ -9313,7 +9275,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	  }
 	}
       }
-      wkspace_reset(collapsed_pheno_c);
+      bigstack_reset(collapsed_pheno_c);
     }
   }
   if (cur_cluster_case_cts || is_group_avg || (cp->max_size < sample_ct)) {
@@ -9329,20 +9291,20 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	if (ujj != uii) {
 	  mc_warning = 1;
 	}
-	fill_bits(cluster_merge_prevented, (clidx1 * (clidx1 - 1)) >> 1, clidx1);
+	fill_bits((clidx1 * (clidx1 - 1)) >> 1, clidx1, cluster_merge_prevented);
 	for (clidx2 = clidx1 + 1; clidx2 < cur_cluster_ct; clidx2++) {
-	  set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+	  set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	}
       } else if (ujj > uii / 2) {
 	ujj = uii - ujj;
 	for (clidx2 = 0; clidx2 < clidx1; clidx2++) {
 	  if (cluster_starts[clidx2 + 1] - cluster_starts[clidx2] > ujj) {
-	    set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx2, clidx1));
+	    set_bit_ul(tri_coord_no_diag(clidx2, clidx1), cluster_merge_prevented);
 	  }
 	}
 	for (clidx2 = clidx1 + 1; clidx2 < cluster_ct; clidx2++) {
 	  if (cluster_starts[clidx2 + 1] - cluster_starts[clidx2] > ujj) {
-	    set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+	    set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	  }
 	}
       }
@@ -9360,26 +9322,26 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	  ujj = cur_cluster_case_cts[clidx1];
 	  if (ujj > uii) {
 	    mcc_warning = 1;
-	    fill_bits(cluster_merge_prevented, (clidx1 * (clidx1 - 1)) >> 1, clidx1);
+	    fill_bits((clidx1 * (clidx1 - 1)) >> 1, clidx1, cluster_merge_prevented);
 	    for (clidx2 = clidx1 + 1; clidx2 < cur_cluster_ct; clidx2++) {
-	      set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+	      set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	    }
 	  } else if (ujj > uii / 2) {
 	    ujj = uii - ujj;
 	    for (clidx2 = 0; clidx2 < clidx1; clidx2++) {
 	      if (cur_cluster_case_cts[clidx2] > ujj) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx2, clidx1));
+		set_bit_ul(tri_coord_no_diag(clidx2, clidx1), cluster_merge_prevented);
 	      }
 	    }
 	    for (clidx2 = clidx1 + 1; clidx2 < cluster_ct; clidx2++) {
 	      if (cur_cluster_case_cts[clidx2] > ujj) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	      }
 	    }
 	    if (!ujj) {
 	      for (clidx2 = cluster_ct; clidx2 < cur_cluster_ct; clidx2++) {
                 if (cur_cluster_case_cts[clidx2]) {
-		  set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		  set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 		}
 	      }
 	    }
@@ -9390,7 +9352,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	    if (cur_cluster_case_cts[clidx1]) {
 	      for (clidx2 = clidx1 + 1; clidx2 < cur_cluster_ct; clidx2++) {
 		if (cur_cluster_case_cts[clidx2]) {
-		  set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		  set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 		}
 	      }
 	    }
@@ -9403,26 +9365,26 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	  ujj = cur_cluster_sizes[clidx1] - cur_cluster_case_cts[clidx1];
 	  if (ujj > uii) {
 	    mcc_warning = 1;
-	    fill_bits(cluster_merge_prevented, (clidx1 * (clidx1 - 1)) >> 1, clidx1);
+	    fill_bits((clidx1 * (clidx1 - 1)) >> 1, clidx1, cluster_merge_prevented);
 	    for (clidx2 = clidx1 + 1; clidx2 < cur_cluster_ct; clidx2++) {
-	      set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+	      set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	    }
 	  } else if (ujj > uii / 2) {
 	    ujj = uii - ujj;
 	    for (clidx2 = 0; clidx2 < clidx1; clidx2++) {
 	      if (cur_cluster_sizes[clidx2] - cur_cluster_case_cts[clidx2] > ujj) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx2, clidx1));
+		set_bit_ul(tri_coord_no_diag(clidx2, clidx1), cluster_merge_prevented);
 	      }
 	    }
 	    for (clidx2 = clidx1 + 1; clidx2 < cluster_ct; clidx2++) {
 	      if (cur_cluster_sizes[clidx2] - cur_cluster_case_cts[clidx2] > ujj) {
-		set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 	      }
 	    }
 	    if (!ujj) {
 	      for (clidx2 = cluster_ct; clidx2 < cur_cluster_ct; clidx2++) {
                 if (!cur_cluster_case_cts[clidx2]) {
-		  set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		  set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 		}
 	      }
 	    }
@@ -9433,7 +9395,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	    if (!cur_cluster_case_cts[clidx1]) {
 	      for (clidx2 = clidx1 + 1; clidx2 < cur_cluster_ct; clidx2++) {
 		if (!cur_cluster_case_cts[clidx2]) {
-		  set_bit_ul(cluster_merge_prevented, tri_coord_no_diag(clidx1, clidx2));
+		  set_bit_ul(tri_coord_no_diag(clidx1, clidx2), cluster_merge_prevented);
 		}
 	      }
 	    }
@@ -9446,7 +9408,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     }
   }
 
-  wkspace_reset(wkspace_mark_precluster);
+  bigstack_reset(bigstack_mark_precluster);
   if (cp->match_fname || cp->qmatch_fname) {
     retval = cluster_enforce_match(cp, missing_pheno, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_starts, sample_to_cluster, cluster_merge_prevented);
     if (retval) {
@@ -9472,7 +9434,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     //
     // So we perform a special in-place qsort_ext():
     // 0. Ensure cluster_sorted_ibs is on top of the stack.  (This is why we
-    //    use malloc more than wkspace_alloc here.)
+    //    use malloc more than bigstack_alloc here.)
     // 1. Convert it to an array of 12-byte structs (first 8 bytes = original
     //    IBS value, last 4 bytes = cluster indices) by allocating ~50% more
     //    memory, and copying values back-to-front.  (If there are >65536
@@ -9497,14 +9459,14 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
       if (tcoord >= (umm * (umm + 1)) / 2) {
 	umm++;
       }
-      heap_size -= popcount_longs_nzbase(cluster_merge_prevented, tcoord / BITCT, (initial_triangle_size + (BITCT - 1)) / BITCT);
+      heap_size -= popcount_longs_nzbase(cluster_merge_prevented, tcoord / BITCT, BITCT_TO_WORDCT(initial_triangle_size));
     }
     if (!heap_size) {
       logerrprint("Error: No cluster merges possible.\n");
       goto calc_cluster_neighbor_ret_INVALID_CMDLINE;
     }
-    wkspace_reset(cluster_sorted_ibs);
-    if (wkspace_alloc_ui_checked(&cluster_sorted_ibs_indices, ((is_group_avg? 4 : 3) * heap_size * sizeof(int32_t)) + CACHELINE)) {
+    bigstack_reset(cluster_sorted_ibs);
+    if (bigstack_alloc_ui((3 + is_group_avg) * heap_size + CACHELINE_INT32, &cluster_sorted_ibs_indices)) {
       goto calc_cluster_neighbor_ret_NOMEM;
     }
     ulii = initial_triangle_size - 1;
@@ -9548,15 +9510,14 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     if (!is_group_avg) {
       if (is_old_tiebreaks) {
 	ulii = 1 + (heap_size / BITCT);
-	if (wkspace_alloc_ul_checked(&ibs_ties, ulii * sizeof(intptr_t))) {
+	if (bigstack_calloc_ul(ulii, &ibs_ties)) {
 	  goto calc_cluster_neighbor_ret_NOMEM;
 	}
 	// copy this earlier after cluster_index allocated?
-	fill_ulong_zero(ibs_ties, ulii);
 	uljj = heap_size - 1;
 	for (ulii = 0; ulii < uljj; ulii++) {
 	  if ((cluster_sorted_ibs_indices[ulii * 3 + CACHELINE_INT32] == cluster_sorted_ibs_indices[ulii * 3 + 3 + CACHELINE_INT32]) && (cluster_sorted_ibs_indices[ulii * 3 + 1 + CACHELINE_INT32] == cluster_sorted_ibs_indices[ulii * 3 + 4 + CACHELINE_INT32])) {
-	    SET_BIT(ibs_ties, ulii);
+	    SET_BIT(ulii, ibs_ties);
 	  }
 	}
       }
@@ -9567,7 +9528,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	// this guarantees write_cluster_solution() has enough space
 	ulii = cur_cluster_ct;
       }
-      wkspace_reset((&(cluster_sorted_ibs_indices[CACHEALIGN_INT32(ulii)])));
+      bigstack_reset((&(cluster_sorted_ibs_indices[round_up_pow2(ulii, CACHELINE_INT32)])));
     } else {
       uiptr = &(cluster_sorted_ibs_indices[CACHELINE_INT32 + 3 * heap_size]);
       for (ulii = 0; ulii < heap_size; ulii++) {
@@ -9578,18 +9539,17 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
         *uiptr++ = cluster_sorted_ibs_indices[CACHELINE_INT32 + ulii * 3];
         *uiptr++ = cluster_sorted_ibs_indices[CACHELINE_INT32 + ulii * 3 + 1];
       }
-      wkspace_shrink_top(cluster_sorted_ibs, heap_size * sizeof(double));
-      memcpy(wkspace_base, &(cluster_sorted_ibs_indices[CACHELINE_INT32 + 3 * heap_size]), heap_size * sizeof(int32_t));
+      bigstack_shrink_top(cluster_sorted_ibs, heap_size * sizeof(double));
+      memcpy(g_bigstack_base, &(cluster_sorted_ibs_indices[CACHELINE_INT32 + 3 * heap_size]), heap_size * sizeof(int32_t));
       ulii = heap_size;
       if (ulii < cur_cluster_ct) {
         ulii = cur_cluster_ct;
       }
-      cluster_sorted_ibs_indices = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+      bigstack_alloc_ui(ulii, &cluster_sorted_ibs_indices);
     }
-    if (wkspace_alloc_ui_checked(&cluster_index, initial_triangle_size * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(initial_triangle_size, &cluster_index)) {
       goto calc_cluster_neighbor_ret_NOMEM;
     }
-    fill_uint_zero(cluster_index, initial_triangle_size);
     ujj = heap_size;
     if (!is_group_avg) {
       for (uii = 0; uii < ujj; uii++) {
@@ -9610,7 +9570,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
   }
 #endif
 
-  if (wkspace_alloc_ui_checked(&merge_sequence, 2 * (sample_ct - cp->min_ct) * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(2 * (sample_ct - cp->min_ct), &merge_sequence)) {
     goto calc_cluster_neighbor_ret_NOMEM;
   }
   cur_cluster_remap = (uint32_t*)malloc(cur_cluster_ct * sizeof(int32_t));
@@ -9639,13 +9599,13 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
   if (cp->mds_dim_ct) {
     if (!mds_plot_dmatrix_copy) {
       // --read-dists or --read-genome, and not cluster_missing
-      wkspace_reset(wkspace_mark_postcluster);
+      bigstack_reset(bigstack_mark_postcluster);
       if (!is_mds_cluster) {
 	ulii = sample_ct;
       } else {
 	ulii = cur_cluster_ct;
       }
-      if (wkspace_alloc_d_checked(&mds_plot_dmatrix_copy, ulii * (ulii - 1) * (sizeof(double) / 2))) {
+      if (bigstack_alloc_d((ulii * (ulii - 1)) / 2, &mds_plot_dmatrix_copy)) {
 	goto calc_cluster_neighbor_ret_NOMEM;
       }
       if (is_mds_cluster || (!read_dists_fname)) {
@@ -9660,7 +9620,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
 	goto calc_cluster_neighbor_ret_1;
       }
     } else {
-      wkspace_reset(cluster_merge_prevented);
+      bigstack_reset(cluster_merge_prevented);
       if (is_mds_cluster) {
         cluster_dist_multiply(sample_ct, cluster_ct, cluster_starts, mds_plot_dmatrix_copy);
       }
@@ -9691,7 +9651,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     break;
   }
  calc_cluster_neighbor_ret_1:
-  wkspace_reset(wkspace_mark_postcluster);
+  bigstack_reset(bigstack_mark_postcluster);
   free_cond(sample_to_cluster);
   free_cond(neighbor_quantiles);
   free_cond(neighbor_qindices);
@@ -9761,8 +9721,8 @@ THREAD_RET_TYPE regress_jack_thread(void* arg) {
   uintptr_t jackknife_iters = g_jackknife_iters;
   uintptr_t sample_ct = g_sample_ct;
   uint32_t jackknife_d = g_jackknife_d;
-  uint32_t* uibuf = (uint32_t*)(&(g_generic_buf[tidx * CACHEALIGN(sample_ct + (jackknife_d + 1) * sizeof(int32_t))]));
-  unsigned char* cbuf = &(g_generic_buf[tidx * CACHEALIGN(sample_ct + (jackknife_d + 1) * sizeof(int32_t)) + (jackknife_d + 1) * sizeof(int32_t)]);
+  uint32_t* uibuf = (uint32_t*)(&(g_generic_buf[tidx * round_up_pow2(sample_ct + (jackknife_d + 1) * sizeof(int32_t), CACHELINE)]));
+  unsigned char* cbuf = &(g_generic_buf[tidx * round_up_pow2(sample_ct + (jackknife_d + 1) * sizeof(int32_t), CACHELINE) + (jackknife_d + 1) * sizeof(int32_t)]);
   uintptr_t uljj = jackknife_iters / 100;
   double* jackknife_precomp = g_jackknife_precomp;
   double* dists = g_dists;
@@ -9802,7 +9762,7 @@ THREAD_RET_TYPE regress_jack_thread(void* arg) {
 }
 
 int32_t regress_distance(pthread_t* threads, uint64_t calculation_type, double* pheno_d, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uint32_t thread_ct, uintptr_t regress_iters, uint32_t regress_d) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   double reg_tot_xy = 0;
   double reg_tot_x = 0;
   double reg_tot_y = 0;
@@ -9844,12 +9804,11 @@ int32_t regress_distance(pthread_t* threads, uint64_t calculation_type, double*
   // Then for each delete-d jackknife iteration, we take the global sums,
   // subtract the partial row sums corresponding to the deleted samples, and
   // then add back the elements in the intersection of two deletions.
-  if (wkspace_alloc_d_checked(&jackknife_precomp, sample_ct * JACKKNIFE_VALS_DIST * sizeof(double))) {
+  if (bigstack_calloc_d(sample_ct * JACKKNIFE_VALS_DIST, &jackknife_precomp)) {
     goto regress_distance_ret_NOMEM;
   }
   g_jackknife_precomp = jackknife_precomp;
-  fill_double_zero(jackknife_precomp, sample_ct * JACKKNIFE_VALS_DIST);
-  if (wkspace_init_sfmtp(g_thread_ct)) {
+  if (bigstack_init_sfmtp(g_thread_ct)) {
     goto regress_distance_ret_NOMEM;
   }
   for (uii = 1; uii < sample_ct; uii++) {
@@ -9902,7 +9861,7 @@ int32_t regress_distance(pthread_t* threads, uint64_t calculation_type, double*
   } else {
     g_jackknife_d = set_default_jackknife_d(sample_ct);
   }
-  if (wkspace_alloc_uc_checked(&g_generic_buf, thread_ct * CACHEALIGN(sample_ct + (g_jackknife_d + 1) * sizeof(int32_t)))) {
+  if (bigstack_alloc_uc(thread_ct * round_up_pow2(sample_ct + (g_jackknife_d + 1) * sizeof(int32_t), CACHELINE), &g_generic_buf)) {
     goto regress_distance_ret_NOMEM;
   }
   if (spawn_threads(threads, &regress_jack_thread, thread_ct)) {
@@ -9932,6 +9891,6 @@ int32_t regress_distance(pthread_t* threads, uint64_t calculation_type, double*
   regress_distance_ret_THREAD_CREATE_FAIL:
     retval = RET_THREAD_CREATE_FAIL;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
diff --git a/plink_calc.h b/plink_calc.h
index 6e86302..e525283 100644
--- a/plink_calc.h
+++ b/plink_calc.h
@@ -69,7 +69,7 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 
 int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* read_dists_fname, char* distance_wts_fname, double distance_exp, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exc [...]
 
-int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, char* read_dists_fname, char* read_dists_id_fname, char* read_genome_fname, char* outname, char* ou [...]
+int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, char* read_dists_fname, char* read_dists_id_fname, char* read_genome_fname, char* outname, char* ou [...]
 
 int32_t regress_distance(pthread_t* threads, uint64_t calculation_type, double* pheno_d, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uint32_t thread_ct, uintptr_t regress_iters, uint32_t regress_d);
 
diff --git a/plink_cluster.c b/plink_cluster.c
index f0741be..75850a9 100644
--- a/plink_cluster.c
+++ b/plink_cluster.c
@@ -43,19 +43,19 @@ void cluster_cleanup(Cluster_info* cluster_ptr) {
 }
 
 int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uint32_t mwithin_col, uint32_t keep_na, uintptr_t* cluster_ct_ptr, uint32_t** cluster_map_ptr, uint32_t** cluster_starts_ptr, char** cluster_ids_ptr, uintptr_t* max_cluster_id_len_ptr, char* keep_fname, char* keep_flattened, char* remove_fname, char* remove_flattened, uint32_t allow_no_samples) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
   uintptr_t* sample_exclude_new = NULL;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t sample_exclude_ct = *sample_exclude_ct_ptr;
   uintptr_t sample_ct = unfiltered_sample_ct - sample_exclude_ct;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t topsize = 0;
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uintptr_t max_cluster_kr_len = 0;
   uint32_t cluster_filter = (keep_fname || keep_flattened || remove_fname || remove_flattened);
   uint32_t cluster_kr_ct = 0;
   int32_t retval = 0;
-  char* idbuf = &(tbuf[MAXLINELEN]);
+  char* idbuf = &(g_textbuf[MAXLINELEN]);
   // even if both --keep-clusters and --remove-clusters were specified, only
   // one is effectively active (i.e. any names in both lists are deleted from
   // the keep list, and then the function proceeds as if --remove-clusters
@@ -71,12 +71,12 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
   uintptr_t* already_seen;
   uintptr_t* ulptr;
   char* cluster_ids;
+  unsigned char* bigstack_end_mark2;
   uint32_t* cluster_map;
   uint32_t* cluster_starts;
   uint32_t* tmp_cluster_starts;
-  uintptr_t topsize_bak;
   uintptr_t line_idx;
-  Ll_str* llptr;
+  Ll_str* ll_ptr;
   char* sorted_ids;
   uint32_t* id_map;
   char* fam_id;
@@ -88,25 +88,27 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
   uint32_t sample_idx;
   uint32_t slen;
   uint32_t uii;
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
   if (cluster_filter) {
-    sample_exclude_new = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctl * sizeof(intptr_t));
+    if (bigstack_end_alloc_ul(unfiltered_sample_ctl, &sample_exclude_new)) {
+      goto load_clusters_ret_NOMEM;
+    }
     if (keep_flattened || keep_fname) {
       if (keep_flattened) {
 	cluster_kr_ct = count_and_measure_multistr(keep_flattened, &max_cluster_kr_len);
       }
       if (keep_fname) {
-	if (fopen_checked(&infile, keep_fname, "r")) {
+	if (fopen_checked(keep_fname, "r", &infile)) {
 	  goto load_clusters_ret_OPEN_FAIL;
 	}
 	line_idx = 0;
-	while (fgets(tbuf, MAXLINELEN, infile)) {
+	while (fgets(g_textbuf, MAXLINELEN, infile)) {
 	  line_idx++;
-	  if (!tbuf[MAXLINELEN - 1]) {
-	    sprintf(logbuf, "Error: Line %" PRIuPTR " of --keep-clusters file is pathologically long.\n", line_idx);
+	  if (!g_textbuf[MAXLINELEN - 1]) {
+	    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --keep-clusters file is pathologically long.\n", line_idx);
 	    goto load_clusters_ret_INVALID_FORMAT_2;
 	  }
-	  cluster_name_ptr = skip_initial_spaces(tbuf);
+	  cluster_name_ptr = skip_initial_spaces(g_textbuf);
 	  if (is_eoln_kns(*cluster_name_ptr)) {
 	    continue;
 	  }
@@ -121,9 +123,8 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
 	  goto load_clusters_ret_INVALID_FORMAT;
 	}
       }
-      fill_all_bits(sample_exclude_new, unfiltered_sample_ct);
-      sorted_keep_ids = (char*)top_alloc(&topsize, cluster_kr_ct * max_cluster_kr_len);
-      if (!sorted_keep_ids) {
+      fill_all_bits(unfiltered_sample_ct, sample_exclude_new);
+      if (bigstack_end_alloc_c(cluster_kr_ct * max_cluster_kr_len, &sorted_keep_ids)) {
 	goto load_clusters_ret_NOMEM;
       }
       ulii = 0;
@@ -138,8 +139,8 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
       }
       if (keep_fname) {
 	rewind(infile);
-	while (fgets(tbuf, MAXLINELEN, infile)) {
-	  cluster_name_ptr = skip_initial_spaces(tbuf);
+	while (fgets(g_textbuf, MAXLINELEN, infile)) {
+	  cluster_name_ptr = skip_initial_spaces(g_textbuf);
 	  if (is_eoln_kns(*cluster_name_ptr)) {
 	    continue;
 	  }
@@ -154,43 +155,40 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
       qsort(sorted_keep_ids, cluster_kr_ct, max_cluster_kr_len, strcmp_casted);
       cluster_kr_ct = collapse_duplicate_ids(sorted_keep_ids, cluster_kr_ct, max_cluster_kr_len, NULL);
       if (remove_flattened || remove_fname) {
-	topsize_bak = topsize;
-	ulii = (cluster_kr_ct + (BITCT - 1)) / BITCT;
+	bigstack_end_mark2 = g_bigstack_end;
 	// track deletions
-	already_seen = (uintptr_t*)top_alloc(&topsize, ulii * sizeof(intptr_t));
-	if (!already_seen) {
+	if (bigstack_end_calloc_ul(BITCT_TO_WORDCT(cluster_kr_ct), &already_seen)) {
 	  goto load_clusters_ret_NOMEM;
 	}
-	fill_ulong_zero(already_seen, ulii);
 	if (remove_flattened) {
 	  cluster_name_ptr = remove_flattened;
 	  do {
 	    slen = strlen(cluster_name_ptr);
 	    sorted_idx = bsearch_str(cluster_name_ptr, slen, sorted_keep_ids, max_cluster_kr_len, cluster_kr_ct);
 	    if (sorted_idx != -1) {
-	      set_bit(already_seen, sorted_idx);
+	      set_bit(sorted_idx, already_seen);
 	    }
 	    cluster_name_ptr = &(cluster_name_ptr[slen + 1]);
 	  } while (*cluster_name_ptr);
 	}
 	if (remove_fname) {
-	  if (fopen_checked(&infile, remove_fname, "r")) {
+	  if (fopen_checked(remove_fname, "r", &infile)) {
             goto load_clusters_ret_OPEN_FAIL;
 	  }
 	  line_idx = 0;
-          while (fgets(tbuf, MAXLINELEN, infile)) {
+          while (fgets(g_textbuf, MAXLINELEN, infile)) {
 	    line_idx++;
-            if (!tbuf[MAXLINELEN - 1]) {
-	      sprintf(logbuf, "Error: Line %" PRIuPTR " of --remove-clusters file is pathologically long.\n", line_idx);
+            if (!g_textbuf[MAXLINELEN - 1]) {
+	      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --remove-clusters file is pathologically long.\n", line_idx);
 	      goto load_clusters_ret_INVALID_FORMAT_2;
 	    }
-	    cluster_name_ptr = skip_initial_spaces(tbuf);
+	    cluster_name_ptr = skip_initial_spaces(g_textbuf);
 	    if (is_eoln_kns(*cluster_name_ptr)) {
 	      continue;
 	    }
 	    sorted_idx = bsearch_str_nl(cluster_name_ptr, sorted_keep_ids, max_cluster_kr_len, cluster_kr_ct);
 	    if (sorted_idx != -1) {
-	      set_bit(already_seen, sorted_idx);
+	      set_bit(sorted_idx, already_seen);
 	    }
 	  }
 	}
@@ -204,7 +202,7 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
 	  }
           cluster_kr_ct = uii;
 	}
-	topsize = topsize_bak;
+	bigstack_end_reset(bigstack_end_mark2);
       }
     } else {
       memcpy(sample_exclude_new, sample_exclude, unfiltered_sample_ctl * sizeof(intptr_t));
@@ -212,17 +210,17 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
         cluster_kr_ct += count_and_measure_multistr(remove_flattened, &max_cluster_kr_len);
       }
       if (remove_fname) {
-	if (fopen_checked(&infile, remove_fname, "r")) {
+	if (fopen_checked(remove_fname, "r", &infile)) {
 	  goto load_clusters_ret_OPEN_FAIL;
 	}
 	line_idx = 0;
-	while (fgets(tbuf, MAXLINELEN, infile)) {
+	while (fgets(g_textbuf, MAXLINELEN, infile)) {
 	  line_idx++;
-	  if (!tbuf[MAXLINELEN - 1]) {
-	    sprintf(logbuf, "Error: Line %" PRIuPTR " of --remove-clusters file is pathologically long.\n", line_idx);
+	  if (!g_textbuf[MAXLINELEN - 1]) {
+	    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --remove-clusters file is pathologically long.\n", line_idx);
 	    goto load_clusters_ret_INVALID_FORMAT_2;
 	  }
-	  cluster_name_ptr = skip_initial_spaces(tbuf);
+	  cluster_name_ptr = skip_initial_spaces(g_textbuf);
 	  if (is_eoln_kns(*cluster_name_ptr)) {
 	    continue;
 	  }
@@ -234,8 +232,7 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
 	}
       }
       if (cluster_kr_ct) {
-	sorted_remove_ids = (char*)top_alloc(&topsize, cluster_kr_ct * max_cluster_kr_len);
-	if (!sorted_remove_ids) {
+	if (bigstack_end_alloc_c(cluster_kr_ct * max_cluster_kr_len, &sorted_remove_ids)) {
 	  goto load_clusters_ret_NOMEM;
 	}
 	ulii = 0;
@@ -250,8 +247,8 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
 	}
 	if (remove_fname) {
 	  rewind(infile);
-	  while (fgets(tbuf, MAXLINELEN, infile)) {
-	    cluster_name_ptr = skip_initial_spaces(tbuf);
+	  while (fgets(g_textbuf, MAXLINELEN, infile)) {
+	    cluster_name_ptr = skip_initial_spaces(g_textbuf);
 	    if (is_eoln_kns(*cluster_name_ptr)) {
 	      continue;
 	    }
@@ -272,23 +269,13 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
   }
 
   if (fname) {
-    sorted_ids = (char*)top_alloc(&topsize, sample_ct * max_sample_id_len);
-    if (!sorted_ids) {
-      goto load_clusters_ret_NOMEM;
-    }
-    id_map = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-    if (!id_map) {
-      goto load_clusters_ret_NOMEM;
-    }
-    topsize_bak = topsize;
-    already_seen = (uintptr_t*)top_alloc(&topsize, sample_ctl * sizeof(intptr_t));
-    if (!already_seen) {
+    if (bigstack_end_alloc_c(sample_ct * max_sample_id_len, &sorted_ids) ||
+        bigstack_end_alloc_ui(sample_ct, &id_map) ||
+        bigstack_end_calloc_ul(sample_ctl, &already_seen)) {
       goto load_clusters_ret_NOMEM;
     }
-    fill_ulong_zero(already_seen, sample_ctl);
-    wkspace_left -= topsize;
-    retval = sort_item_ids_noalloc(sorted_ids, id_map, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
-    wkspace_left += topsize;
+    bigstack_end_mark2 = (unsigned char*)id_map;
+    retval = sort_item_ids_noalloc(unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, sorted_ids, id_map);
     if (retval) {
       goto load_clusters_ret_1;
     }
@@ -299,24 +286,24 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
     // intermission. sort cluster names, purge duplicates, allocate memory for
     //               return values
     // 2. populate return arrays
-    if (fopen_checked(&infile, fname, "r")) {
+    if (fopen_checked(fname, "r", &infile)) {
       goto load_clusters_ret_OPEN_FAIL;
     }
     if (!mwithin_col) {
       mwithin_col = 1;
     }
     line_idx = 0;
-    while (fgets(tbuf, MAXLINELEN, infile)) {
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --within file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --within file is pathologically long.\n", line_idx);
 	goto load_clusters_ret_INVALID_FORMAT_2;
       }
-      fam_id = skip_initial_spaces(tbuf);
+      fam_id = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*fam_id)) {
 	continue;
       }
-      if (bsearch_read_fam_indiv(idbuf, sorted_ids, max_sample_id_len, sample_ct, fam_id, &cluster_name_ptr, &sorted_idx)) {
+      if (bsearch_read_fam_indiv(fam_id, sorted_ids, max_sample_id_len, sample_ct, &cluster_name_ptr, &sorted_idx, idbuf)) {
 	goto load_clusters_ret_MISSING_TOKENS;
       }
       if (sorted_idx == -1) {
@@ -333,7 +320,7 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
       if (no_more_tokens_kns(cluster_name_ptr)) {
 	goto load_clusters_ret_MISSING_TOKENS;
       }
-      set_bit(already_seen, sorted_idx);
+      set_bit(sorted_idx, already_seen);
       slen = strlen_se(cluster_name_ptr);
       if ((!keep_na) && (slen == 2) && (!memcmp(cluster_name_ptr, "NA", 2))) {
 	// postponed to here because, even without 'keep-NA', we do not want to
@@ -351,13 +338,12 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
       cluster_name_ptr[slen] = '\0';
       // do NOT optimize common case because current logic uses
       // collapse_duplicate_ids() last parameter to determine cluster sizes
-      llptr = top_alloc_llstr(&topsize, slen + 1);
-      if (!llptr) {
+      if (bigstack_end_alloc_llstr(slen + 1, &ll_ptr)) {
 	goto load_clusters_ret_NOMEM;
       }
-      llptr->next = cluster_names;
-      memcpy(llptr->ss, cluster_name_ptr, slen + 1);
-      cluster_names = llptr;
+      ll_ptr->next = cluster_names;
+      memcpy(ll_ptr->ss, cluster_name_ptr, slen + 1);
+      cluster_names = ll_ptr;
       assigned_ct++;
     }
     if (!feof(infile)) {
@@ -369,9 +355,8 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
 	goto load_clusters_ret_INVALID_FORMAT;
       }
       *max_cluster_id_len_ptr = max_cluster_id_len;
-      wkspace_left -= topsize;
-      if (wkspace_alloc_c_checked(cluster_ids_ptr, assigned_ct * max_cluster_id_len)) {
-	goto load_clusters_ret_NOMEM2;
+      if (bigstack_alloc_c(assigned_ct * max_cluster_id_len, cluster_ids_ptr)) {
+	goto load_clusters_ret_NOMEM;
       }
       cluster_ids = *cluster_ids_ptr;
       for (ulii = 0; ulii < assigned_ct; ulii++) {
@@ -380,35 +365,31 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
       }
       // deallocate cluster ID linked list and duplicate sample ID detector
       // from top of stack, allocate cluster size tracker
-      wkspace_left += topsize;
-      topsize = topsize_bak;
-      tmp_cluster_starts = (uint32_t*)top_alloc(&topsize, assigned_ct * sizeof(int32_t));
-      if (!tmp_cluster_starts) {
+      bigstack_end_reset(bigstack_end_mark2);
+      if (bigstack_end_alloc_ui(assigned_ct, &tmp_cluster_starts)) {
 	goto load_clusters_ret_NOMEM;
       }
-      wkspace_left -= topsize;
       // may as well use natural sort of cluster names
       qsort(cluster_ids, assigned_ct, max_cluster_id_len, strcmp_natural);
       cluster_ct = collapse_duplicate_ids(cluster_ids, assigned_ct, max_cluster_id_len, tmp_cluster_starts);
       *cluster_ct_ptr = cluster_ct;
-      wkspace_shrink_top(cluster_ids, cluster_ct * max_cluster_id_len);
-      if (wkspace_alloc_ui_checked(cluster_map_ptr, assigned_ct * sizeof(int32_t)) ||
-	  wkspace_alloc_ui_checked(cluster_starts_ptr, (cluster_ct + 1) * sizeof(int32_t))) {
-	goto load_clusters_ret_NOMEM2;
+      bigstack_shrink_top(cluster_ids, cluster_ct * max_cluster_id_len);
+      if (bigstack_alloc_ui(assigned_ct, cluster_map_ptr) ||
+	  bigstack_alloc_ui(cluster_ct + 1, cluster_starts_ptr)) {
+	goto load_clusters_ret_NOMEM;
       }
-      wkspace_left += topsize;
       cluster_map = *cluster_map_ptr;
       cluster_starts = *cluster_starts_ptr;
       memcpy(cluster_starts, tmp_cluster_starts, cluster_ct * sizeof(int32_t));
       cluster_starts[cluster_ct] = assigned_ct;
       rewind(infile);
       // second pass
-      while (fgets(tbuf, MAXLINELEN, infile)) {
-	fam_id = skip_initial_spaces(tbuf);
+      while (fgets(g_textbuf, MAXLINELEN, infile)) {
+	fam_id = skip_initial_spaces(g_textbuf);
 	if (is_eoln_kns(*fam_id)) {
 	  continue;
 	}
-	bsearch_read_fam_indiv(idbuf, sorted_ids, max_sample_id_len, sample_ct, fam_id, &cluster_name_ptr, &sorted_idx);
+	bsearch_read_fam_indiv(fam_id, sorted_ids, max_sample_id_len, sample_ct, &cluster_name_ptr, &sorted_idx, idbuf);
 	if (sorted_idx == -1) {
 	  continue;
 	}
@@ -425,11 +406,11 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
 	  if (sorted_idx == -1) {
 	    continue;
 	  }
-	  clear_bit(sample_exclude_new, sample_uidx);
+	  clear_bit(sample_uidx, sample_exclude_new);
 	} else if (sorted_remove_ids) {
 	  sorted_idx = bsearch_str(cluster_name_ptr, slen, sorted_remove_ids, max_cluster_kr_len, cluster_kr_ct);
 	  if (sorted_idx != -1) {
-	    set_bit(sample_exclude_new, sample_uidx);
+	    set_bit(sample_uidx, sample_exclude_new);
 	    continue;
 	  }
 	}
@@ -487,11 +468,11 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
 	if (sorted_idx == -1) {
 	  continue;
 	}
-	clear_bit(sample_exclude_new, sample_uidx);
+	clear_bit(sample_uidx, sample_exclude_new);
       } else if (sorted_remove_ids) {
 	sorted_idx = bsearch_str(cluster_name_ptr, slen, sorted_remove_ids, max_cluster_kr_len, cluster_kr_ct);
 	if (sorted_idx != -1) {
-	  set_bit(sample_exclude_new, sample_uidx);
+	  set_bit(sample_uidx, sample_exclude_new);
 	}
       }
       if (slen >= max_cluster_id_len) {
@@ -504,9 +485,8 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
       goto load_clusters_ret_INVALID_FORMAT;
     }
     *max_cluster_id_len_ptr = max_cluster_id_len;
-    wkspace_left -= topsize;
-    if (wkspace_alloc_c_checked(cluster_ids_ptr, assigned_ct * max_cluster_id_len)) {
-      goto load_clusters_ret_NOMEM2;
+    if (bigstack_alloc_c(assigned_ct * max_cluster_id_len, cluster_ids_ptr)) {
+      goto load_clusters_ret_NOMEM;
     }
     cluster_ids = *cluster_ids_ptr;
     if (sample_exclude_new) {
@@ -520,21 +500,17 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
       slen = (uintptr_t)((char*)memchr(cluster_name_ptr, '\t', max_sample_id_len) - cluster_name_ptr);
       memcpyx(&(cluster_ids[sample_idx * max_cluster_id_len]), cluster_name_ptr, slen, '\0');
     }
-    wkspace_left += topsize;
-    tmp_cluster_starts = (uint32_t*)top_alloc(&topsize, assigned_ct * sizeof(int32_t));
-    if (!tmp_cluster_starts) {
+    if (bigstack_end_alloc_ui(assigned_ct, &tmp_cluster_starts)) {
       goto load_clusters_ret_NOMEM;
     }
-    wkspace_left -= topsize;
     qsort(cluster_ids, assigned_ct, max_cluster_id_len, strcmp_natural);
     cluster_ct = collapse_duplicate_ids(cluster_ids, assigned_ct, max_cluster_id_len, tmp_cluster_starts);
     *cluster_ct_ptr = cluster_ct;
-    wkspace_shrink_top(cluster_ids, cluster_ct * max_cluster_id_len);
-    if (wkspace_alloc_ui_checked(cluster_map_ptr, assigned_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(cluster_starts_ptr, (cluster_ct + 1) * sizeof(int32_t))) {
-      goto load_clusters_ret_NOMEM2;
+    bigstack_shrink_top(cluster_ids, cluster_ct * max_cluster_id_len);
+    if (bigstack_alloc_ui(assigned_ct, cluster_map_ptr) ||
+        bigstack_alloc_ui(cluster_ct + 1, cluster_starts_ptr)) {
+      goto load_clusters_ret_NOMEM;
     }
-    wkspace_left += topsize;
     cluster_map = *cluster_map_ptr;
     cluster_starts = *cluster_starts_ptr;
     memcpy(cluster_starts, tmp_cluster_starts, cluster_ct * sizeof(int32_t));
@@ -542,8 +518,8 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
     for (sample_uidx = 0, sample_idx = 0; sample_idx < assigned_ct; sample_uidx++, sample_idx++) {
       next_unset_unsafe_ck(ulptr, &sample_uidx);
       cluster_name_ptr = &(sample_ids[sample_uidx * max_sample_id_len]);
-      memcpyx(tbuf, cluster_name_ptr, (uintptr_t)((char*)memchr(cluster_name_ptr, '\t', max_cluster_id_len) - cluster_name_ptr), '\0');
-      sorted_idx = bsearch_str_natural(tbuf, cluster_ids, max_cluster_id_len, cluster_ct);
+      memcpyx(g_textbuf, cluster_name_ptr, (uintptr_t)((char*)memchr(cluster_name_ptr, '\t', max_cluster_id_len) - cluster_name_ptr), '\0');
+      sorted_idx = bsearch_str_natural(g_textbuf, cluster_ids, max_cluster_id_len, cluster_ct);
       uii = tmp_cluster_starts[(uint32_t)sorted_idx];
       tmp_cluster_starts[(uint32_t)sorted_idx] += 1;
       cluster_map[uii] = sample_uidx;
@@ -561,8 +537,6 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
   }
 
   while (0) {
-  load_clusters_ret_NOMEM2:
-    wkspace_left += topsize;
   load_clusters_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -573,7 +547,7 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
     retval = RET_READ_FAIL;
     break;
   load_clusters_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --within file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --within file has fewer tokens than expected.\n", line_idx);
   load_clusters_ret_INVALID_FORMAT_2:
     logerrprintb();
   load_clusters_ret_INVALID_FORMAT:
@@ -582,8 +556,9 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
   }
  load_clusters_ret_1:
   if (retval) {
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
+  bigstack_end_reset(bigstack_end_mark);
   fclose_cond(infile);
   return retval;
 }
@@ -608,7 +583,7 @@ int32_t fill_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t* sample
   // filled with the cluster index -> sample uidx mapping.
   // (Yes, this is a strange interface; it may be switched to filtered sample
   // indexes later.)
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t* cluster_map_pos = cluster_map;
   int32_t retval = 0;
   uint32_t* uidx_to_idx;
@@ -616,7 +591,7 @@ int32_t fill_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t* sample
   uint32_t cluster_idx;
   uint32_t sample_uidx;
   uint32_t sample_idx;
-  if (wkspace_alloc_ui_checked(&uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(unfiltered_sample_ct, &uidx_to_idx)) {
     goto fill_sample_to_cluster_ret_NOMEM;
   }
   fill_uidx_to_idx(sample_exclude, unfiltered_sample_ct, sample_ct, uidx_to_idx);
@@ -642,12 +617,12 @@ int32_t fill_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t* sample
     retval = RET_NOMEM;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t write_clusters(char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t omit_unassigned, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, char* cluster_ids, uintptr_t max_cluster_id_len) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uintptr_t sample_uidx = 0;
   int32_t retval = 0;
@@ -657,12 +632,12 @@ int32_t write_clusters(char* outname, char* outname_end, uintptr_t unfiltered_sa
   uintptr_t sample_idx;
   uint32_t cluster_idx;
   uint32_t slen;
-  if (wkspace_alloc_ui_checked(&sample_to_cluster, unfiltered_sample_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(unfiltered_sample_ct, &sample_to_cluster)) {
     goto write_cluster_ret_NOMEM;
   }
   fill_unfiltered_sample_to_cluster(unfiltered_sample_ct, cluster_ct, cluster_map, cluster_starts, sample_to_cluster);
   memcpy(outname_end, ".clst", 6);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_cluster_ret_OPEN_FAIL;
   }
   for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
@@ -671,14 +646,14 @@ int32_t write_clusters(char* outname, char* outname_end, uintptr_t unfiltered_sa
     if ((!omit_unassigned) || (cluster_idx != 0xffffffffU)) {
       sample_id_ptr = &(sample_ids[sample_uidx * max_sample_id_len]);
       slen = strlen_se(sample_id_ptr);
-      bufptr = memcpyax(tbuf, sample_id_ptr, slen, ' ');
+      bufptr = memcpyax(g_textbuf, sample_id_ptr, slen, ' ');
       bufptr = strcpyax(bufptr, &(sample_id_ptr[slen + 1]), ' ');
       if (cluster_idx != 0xffffffffU) {
         bufptr = strcpyax(bufptr, &(cluster_ids[cluster_idx * max_cluster_id_len]), '\n');
       } else {
         bufptr = memcpyl3a(bufptr, "NA\n");
       }
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto write_cluster_ret_WRITE_FAIL;
       }
     }
@@ -699,14 +674,14 @@ int32_t write_clusters(char* outname, char* outname_end, uintptr_t unfiltered_sa
     break;
   }
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t extract_clusters(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, char* cluster_ids, uintptr_t max_cluster_id_len, char* cluster_names_flattened, char* clusters_fname, uintptr_t** new_sample_exclude_ptr, uintptr_t* new_sample_ct_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t line_idx = 0;
   int32_t retval = 0;
   char* bufptr;
@@ -716,12 +691,12 @@ int32_t extract_clusters(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   uint32_t slen;
   uint32_t sample_uidx;
   int32_t ii;
-  if (wkspace_alloc_ul_checked(new_sample_exclude_ptr, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, new_sample_exclude_ptr)) {
     goto extract_clusters_ret_NOMEM;
   }
   new_sample_exclude = *new_sample_exclude_ptr;
-  wkspace_mark = wkspace_base;
-  fill_all_bits(new_sample_exclude, unfiltered_sample_ct);
+  bigstack_mark = g_bigstack_base;
+  fill_all_bits(unfiltered_sample_ct, new_sample_exclude);
   if (cluster_names_flattened) {
     bufptr = cluster_names_flattened;
     do {
@@ -734,7 +709,7 @@ int32_t extract_clusters(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
 	  while (uiptr < cluster_end) {
 	    sample_uidx = *uiptr++;
 	    if (!is_set(sample_exclude, sample_uidx)) {
-	      clear_bit(new_sample_exclude, sample_uidx);
+	      clear_bit(sample_uidx, new_sample_exclude);
 	    }
 	  }
 	}
@@ -743,17 +718,17 @@ int32_t extract_clusters(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
     } while (*bufptr);
   }
   if (clusters_fname) {
-    if (fopen_checked(&infile, clusters_fname, "r")) {
+    if (fopen_checked(clusters_fname, "r", &infile)) {
       goto extract_clusters_ret_OPEN_FAIL;
     }
-    tbuf[MAXLINELEN - 1] = ' ';
-    while (fgets(tbuf, MAXLINELEN, infile)) {
+    g_textbuf[MAXLINELEN - 1] = ' ';
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-	LOGPREPRINTFWW(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, clusters_fname);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	LOGPREPRINTFWW(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, clusters_fname);
         goto extract_clusters_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
         continue;
       }
@@ -769,7 +744,7 @@ int32_t extract_clusters(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
 	while (uiptr < cluster_end) {
 	  sample_uidx = *uiptr++;
 	  if (!is_set(sample_exclude, sample_uidx)) {
-	    clear_bit(new_sample_exclude, sample_uidx);
+	    clear_bit(sample_uidx, new_sample_exclude);
 	  }
 	}
       }
@@ -795,7 +770,7 @@ int32_t extract_clusters(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
     break;
   }
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -834,11 +809,11 @@ void adjust_cc_perm_preimage(uint32_t cluster_ct, uint32_t* cluster_map, uint32_
       cluster_end = cluster_starts[cluster_idx + 1];
       if (cluster_case_cts[cluster_idx] * 2 < cluster_end - map_idx) {
 	do {
-	  CLEAR_BIT_DBL(cluster_cc_perm_preimage, cluster_map[map_idx]);
+	  CLEAR_BIT_DBL(cluster_map[map_idx], cluster_cc_perm_preimage);
 	} while (++map_idx < cluster_end);
       } else {
 	do {
-	  SET_BIT_DBL(cluster_cc_perm_preimage, cluster_map[map_idx]);
+	  SET_BIT_DBL(cluster_map[map_idx], cluster_cc_perm_preimage);
 	} while (++map_idx < cluster_end);
       }
     }
@@ -848,11 +823,11 @@ void adjust_cc_perm_preimage(uint32_t cluster_ct, uint32_t* cluster_map, uint32_
       cluster_end = cluster_starts[cluster_idx + 1];
       if (cluster_case_cts[cluster_idx] * 2 < cluster_end - map_idx) {
 	do {
-	  CLEAR_BIT(cluster_cc_perm_preimage, cluster_map[map_idx]);
+	  CLEAR_BIT(cluster_map[map_idx], cluster_cc_perm_preimage);
 	} while (++map_idx < cluster_end);
       } else {
 	do {
-	  SET_BIT(cluster_cc_perm_preimage, cluster_map[map_idx]);
+	  SET_BIT(cluster_map[map_idx], cluster_cc_perm_preimage);
 	} while (++map_idx < cluster_end);
       }
     }
@@ -870,7 +845,7 @@ int32_t cluster_include_and_reindex(uintptr_t unfiltered_sample_ct, uintptr_t* s
   //
   // If pheno_c is set, this also allocates and populates an array of
   // per-cluster case counts, and a permutation preimage.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t old_assigned_ct = cluster_starts[cluster_ct];
   uint32_t new_cluster_ct = 0;
   uint32_t sample_uidx = 0;
@@ -890,14 +865,14 @@ int32_t cluster_include_and_reindex(uintptr_t unfiltered_sample_ct, uintptr_t* s
   uint32_t last_case_ct_incr;
   uint32_t shrink_map;
   if (pheno_c) {
-    if (wkspace_alloc_ul_checked(cluster_cc_perm_preimage_ptr, (2 - is_perm1) * ((sample_ct + (BITCT - 1)) / BITCT) * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul((2 - is_perm1) * BITCT_TO_WORDCT(sample_ct), cluster_cc_perm_preimage_ptr)) {
       goto cluster_include_and_reindex_ret_NOMEM;
     }
     cluster_cc_perm_preimage = *cluster_cc_perm_preimage_ptr;
     if (!is_perm1) {
-      vec_collapse_init(pheno_c, unfiltered_sample_ct, sample_include, sample_ct, cluster_cc_perm_preimage);
+      quaterarr_collapse_init(pheno_c, unfiltered_sample_ct, sample_include, sample_ct, cluster_cc_perm_preimage);
     } else {
-      collapse_copy_bitarr_incl(unfiltered_sample_ct, pheno_c, sample_include, sample_ct, cluster_cc_perm_preimage);
+      copy_bitarr_subset(pheno_c, sample_include, unfiltered_sample_ct, sample_ct, cluster_cc_perm_preimage);
     }
   }
   if ((sample_ct == unfiltered_sample_ct) && ((!remove_size1) || no_size1(cluster_ct, cluster_starts))) {
@@ -905,7 +880,7 @@ int32_t cluster_include_and_reindex(uintptr_t unfiltered_sample_ct, uintptr_t* s
     *new_cluster_ct_ptr = cluster_ct;
     *new_cluster_starts_ptr = cluster_starts;
     if (pheno_c) {
-      if (wkspace_alloc_ui_checked(cluster_case_cts_ptr, cluster_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_ui(cluster_ct, cluster_case_cts_ptr)) {
 	goto cluster_include_and_reindex_ret_NOMEM;
       }
       populate_cluster_case_cts(pheno_c, cluster_ct, cluster_map, cluster_starts, *cluster_case_cts_ptr);
@@ -926,13 +901,13 @@ int32_t cluster_include_and_reindex(uintptr_t unfiltered_sample_ct, uintptr_t* s
     }
   }
   // possibly +1 to simplify remove_size1 logic
-  if (wkspace_alloc_ui_checked(new_cluster_map_ptr, (assigned_ct + remove_size1) * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(assigned_ct + remove_size1, new_cluster_map_ptr)) {
     goto cluster_include_and_reindex_ret_NOMEM;
   }
   new_cluster_map = *new_cluster_map_ptr;
   shrink_map = (assigned_ct < old_assigned_ct);
   if (shrink_map) {
-    if (wkspace_alloc_ui_checked(new_cluster_starts_ptr, (new_cluster_ct + 1) * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(new_cluster_ct + 1, new_cluster_starts_ptr)) {
       goto cluster_include_and_reindex_ret_NOMEM;
     }
     new_cluster_starts = *new_cluster_starts_ptr;
@@ -942,12 +917,12 @@ int32_t cluster_include_and_reindex(uintptr_t unfiltered_sample_ct, uintptr_t* s
     *new_cluster_starts_ptr = cluster_starts;
   }
   if (pheno_c) {
-    if (wkspace_alloc_ui_checked(cluster_case_cts_ptr, new_cluster_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(new_cluster_ct, cluster_case_cts_ptr)) {
       goto cluster_include_and_reindex_ret_NOMEM;
     }
     cluster_case_cts = *cluster_case_cts_ptr;
   }
-  if (wkspace_alloc_ui_checked(&uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(unfiltered_sample_ct, &uidx_to_idx)) {
     goto cluster_include_and_reindex_ret_NOMEM;
   }
   fill_uidx_to_idx_incl(sample_include, unfiltered_sample_ct, sample_ct, uidx_to_idx);
@@ -1008,28 +983,28 @@ int32_t cluster_include_and_reindex(uintptr_t unfiltered_sample_ct, uintptr_t* s
   if (pheno_c && new_cluster_ct) {
     adjust_cc_perm_preimage(new_cluster_ct, new_cluster_map, new_cluster_starts, cluster_case_cts, cluster_cc_perm_preimage, is_perm1);
   }
-  wkspace_reset(uidx_to_idx);
+  bigstack_reset(uidx_to_idx);
   return 0;
  cluster_include_and_reindex_ret_NOMEM:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return RET_NOMEM;
 }
 
 int32_t cluster_alloc_and_populate_magic_nums(uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t** tot_quotients_ptr, uint64_t** totq_magics_ptr, uint32_t** totq_preshifts_ptr, uint32_t** totq_postshifts_ptr, uint32_t** totq_incrs_ptr) {
   // assumes all clusters are of size > 1
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t* tot_quotients;
   uint64_t* totq_magics;
   uint32_t* totq_preshifts;
   uint32_t* totq_postshifts;
   uint32_t* totq_incrs;
   uint32_t cluster_idx;
-  if (wkspace_alloc_ui_checked(tot_quotients_ptr, cluster_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ull_checked(totq_magics_ptr, cluster_ct * sizeof(int64_t)) ||
-      wkspace_alloc_ui_checked(totq_preshifts_ptr, cluster_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(totq_postshifts_ptr, cluster_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(totq_incrs_ptr, cluster_ct * sizeof(int32_t))) {
-    wkspace_reset(wkspace_mark);
+  if (bigstack_alloc_ui(cluster_ct, tot_quotients_ptr) ||
+      bigstack_alloc_ull(cluster_ct, totq_magics_ptr) ||
+      bigstack_alloc_ui(cluster_ct, totq_preshifts_ptr) ||
+      bigstack_alloc_ui(cluster_ct, totq_postshifts_ptr) ||
+      bigstack_alloc_ui(cluster_ct, totq_incrs_ptr)) {
+    bigstack_reset(bigstack_mark);
     return RET_NOMEM;
   }
   tot_quotients = *tot_quotients_ptr;
@@ -1045,13 +1020,13 @@ int32_t cluster_alloc_and_populate_magic_nums(uint32_t cluster_ct, uint32_t* clu
 }
 
 int32_t read_dists(char* dist_fname, char* id_fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t cluster_ct, uint32_t* cluster_starts, uint32_t* sample_to_cluster, uint32_t for_cluster_flag, uint32_t is_max_dist, double* dists, uint32_t neighbor_n2, double* neighbor_quantiles, uint32_t* neighbor_qindices) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* dist_file = NULL;
   FILE* id_file = NULL;
   uintptr_t id_entry_ct = sample_ct;
   uintptr_t matching_entry_ct = sample_ct;
   uintptr_t line_idx = 0;
-  char* id_buf = &(tbuf[MAXLINELEN]);
+  char* id_buf = &(g_textbuf[MAXLINELEN]);
   uint64_t* fidx_to_memidx = NULL; // high 32 bits = fidx, low 32 = memidx
   uint32_t is_presorted = cluster_ct? 0 : 1;
   int32_t retval = 0;
@@ -1075,38 +1050,38 @@ int32_t read_dists(char* dist_fname, char* id_fname, uintptr_t unfiltered_sample
   double cur_ibs;
   uint32_t uii;
   int32_t ii;
-  if (fopen_checked(&dist_file, dist_fname, "rb")) {
+  if (fopen_checked(dist_fname, FOPEN_RB, &dist_file)) {
     goto read_dists_ret_OPEN_FAIL;
   }
   if (fseeko(dist_file, 0, SEEK_END)) {
     goto read_dists_ret_READ_FAIL;
   }
   if (id_fname) {
-    if (wkspace_alloc_ull_checked(&fidx_to_memidx, sample_ct * sizeof(int64_t))) {
+    if (bigstack_alloc_ull(sample_ct, &fidx_to_memidx)) {
       goto read_dists_ret_NOMEM;
     }
     fill_ull_one(fidx_to_memidx, sample_ct);
-    if (fopen_checked(&id_file, id_fname, "r")) {
+    if (fopen_checked(id_fname, "r", &id_file)) {
       goto read_dists_ret_OPEN_FAIL;
     }
-    retval = sort_item_ids(&sorted_ids, &id_map, unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref);
+    retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref, &sorted_ids, &id_map);
     if (retval) {
       goto read_dists_ret_1;
     }
     id_entry_ct = 0;
     matching_entry_ct = 0;
-    tbuf[MAXLINELEN - 1] = ' ';
-    while (fgets(tbuf, MAXLINELEN, id_file)) {
+    g_textbuf[MAXLINELEN - 1] = ' ';
+    while (fgets(g_textbuf, MAXLINELEN, id_file)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
+      if (!g_textbuf[MAXLINELEN - 1]) {
 	LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, id_fname);
         goto read_dists_ret_INVALID_FORMAT_2;
       }
-      fam_id = skip_initial_spaces(tbuf);
+      fam_id = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*fam_id)) {
         continue;
       }
-      if (bsearch_read_fam_indiv(id_buf, sorted_ids, max_sample_id_len, sample_ct, fam_id, NULL, &ii)) {
+      if (bsearch_read_fam_indiv(fam_id, sorted_ids, max_sample_id_len, sample_ct, NULL, &ii, id_buf)) {
 	LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, id_fname);
         goto read_dists_ret_INVALID_FORMAT_2;
       }
@@ -1143,7 +1118,7 @@ int32_t read_dists(char* dist_fname, char* id_fname, uintptr_t unfiltered_sample
       goto read_dists_ret_INVALID_FORMAT;
     }
   } else if (cluster_ct) {
-    if (wkspace_alloc_ull_checked(&fidx_to_memidx, sample_ct * sizeof(int64_t))) {
+    if (bigstack_alloc_ull(sample_ct, &fidx_to_memidx)) {
       goto read_dists_ret_NOMEM;
     }
     if (neighbor_n2) {
@@ -1287,7 +1262,7 @@ int32_t read_dists(char* dist_fname, char* id_fname, uintptr_t unfiltered_sample
     break;
   }
  read_dists_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(dist_file);
   fclose_cond(id_file);
   return retval;
@@ -1317,14 +1292,14 @@ void update_neighbor(uintptr_t sample_ct, uint32_t neighbor_n2, uintptr_t sample
 }
 
 int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* cluster_merge_prevented, double* cluster_sorted_ibs, uint32_t neighbor_n2, double* neighbor_quantiles, uint32_t* neighbor_qindices, uint32_t* ppc_fail_counts, double min_ppc, uint32_t is_max_dist, uintptr_t cluster_ct, uint32_t* cluster_starts, uint32_t* sample_to_cluster) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   gzFile gz_infile = NULL;
   uint32_t neighbor_load_quantiles = neighbor_quantiles && cluster_sorted_ibs;
   uint32_t ppc_warning = cluster_merge_prevented? 0 : 1;
   uintptr_t loaded_entry_ct = 0;
   uintptr_t line_idx = 1;
   uint32_t ppc_fail = 0;
-  char* idbuf = &(tbuf[MAXLINELEN]);
+  char* idbuf = &(g_textbuf[MAXLINELEN]);
   char* sorted_ids;
   uint32_t* id_map;
   char* bufptr;
@@ -1337,46 +1312,47 @@ int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uin
   uint32_t uii;
   int32_t ii;
   int32_t retval;
-  retval = sort_item_ids(&sorted_ids, &id_map, unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref);
+  retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref, &sorted_ids, &id_map);
   if (retval) {
     goto read_genome_ret_1;
   }
-  if (gzopen_checked(&gz_infile, read_genome_fname, "rb")) {
-    goto read_genome_ret_OPEN_FAIL;
+  retval = gzopen_read_checked(read_genome_fname, &gz_infile);
+  if (retval) {
+    goto read_genome_ret_1;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
   // header line
   do {
-    if (!gzgets(gz_infile, tbuf, MAXLINELEN)) {
+    if (!gzgets(gz_infile, g_textbuf, MAXLINELEN)) {
       goto read_genome_ret_READ_FAIL;
     }
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto read_genome_ret_LONG_LINE;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
   } while (is_eoln_kns(*bufptr));
   // a little bit of input validation
   if (memcmp(bufptr, "FID1", 4)) {
     logerrprint("Error: Invalid --read-genome file header line.\n");
     goto read_genome_ret_INVALID_FORMAT;
   }
-  while (gzgets(gz_infile, tbuf, MAXLINELEN)) {
+  while (gzgets(gz_infile, g_textbuf, MAXLINELEN)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto read_genome_ret_LONG_LINE;
     }
-    fam_id = skip_initial_spaces(tbuf);
+    fam_id = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*fam_id)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(idbuf, sorted_ids, max_sample_id_len, sample_ct, fam_id, &fam_id, &ii)) {
+    if (bsearch_read_fam_indiv(fam_id, sorted_ids, max_sample_id_len, sample_ct, &fam_id, &ii, idbuf)) {
       goto read_genome_ret_MISSING_TOKENS;
     }
     if (ii == -1) {
       continue;
     }
     sample_idx1 = id_map[(uint32_t)ii];
-    if (bsearch_read_fam_indiv(idbuf, sorted_ids, max_sample_id_len, sample_ct, fam_id, &bufptr, &ii)) {
+    if (bsearch_read_fam_indiv(fam_id, sorted_ids, max_sample_id_len, sample_ct, &bufptr, &ii, idbuf)) {
       goto read_genome_ret_MISSING_TOKENS;
     }
     if (ii == -1) {
@@ -1384,17 +1360,17 @@ int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uin
     }
     sample_idx2 = id_map[(uint32_t)ii];
     if (sample_idx2 == sample_idx1) {
-      sprintf(logbuf, "Error: FID1/IID1 matches FID2/IID2 on line %" PRIuPTR " of --read-genome file.\n", line_idx);
+      sprintf(g_logbuf, "Error: FID1/IID1 matches FID2/IID2 on line %" PRIuPTR " of --read-genome file.\n", line_idx);
       goto read_genome_ret_INVALID_FORMAT_2;
     }
     bufptr = next_token_mult(bufptr, 7); // distance
     fam_id = next_token(bufptr); // repurposed to PPC test value
-    if (no_more_tokens(fam_id)) {
+    if (no_more_tokens_kns(fam_id)) {
       goto read_genome_ret_MISSING_TOKENS;
     }
     if (min_ppc != 0.0) {
       if (scan_double(fam_id, &cur_ppc)) {
-	sprintf(logbuf, "Error: Invalid PPC test value on line %" PRIuPTR " of --read-genome file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid PPC test value on line %" PRIuPTR " of --read-genome file.\n", line_idx);
 	goto read_genome_ret_INVALID_FORMAT_2;
       }
       ppc_fail = (cur_ppc < min_ppc);
@@ -1404,7 +1380,7 @@ int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uin
       }
     }
     if (scan_double(bufptr, &cur_ibs)) {
-      sprintf(logbuf, "Error: Invalid IBS value on line %" PRIuPTR " of --read-genome file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid IBS value on line %" PRIuPTR " of --read-genome file.\n", line_idx);
       goto read_genome_ret_INVALID_FORMAT_2;
     }
     if (neighbor_load_quantiles) {
@@ -1429,7 +1405,7 @@ int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uin
     }
     tcoord = tri_coord_no_diag(sample_idx1, sample_idx2);
     if (ppc_fail) {
-      SET_BIT(cluster_merge_prevented, tcoord);
+      SET_BIT(tcoord, cluster_merge_prevented);
     }
     if (cluster_sorted_ibs) {
       if (!is_max_dist) {
@@ -1449,9 +1425,6 @@ int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uin
     goto read_genome_ret_INVALID_FORMAT_2;
   }
   while (0) {
-  read_genome_ret_OPEN_FAIL:
-    retval = RET_OPEN_FAIL;
-    break;
   read_genome_ret_READ_FAIL:
     retval = RET_READ_FAIL;
     break;
@@ -1460,7 +1433,7 @@ int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uin
     retval = RET_INVALID_FORMAT;
     break;
   read_genome_ret_LONG_LINE:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --read-genome file is pathologically long.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --read-genome file is pathologically long.\n", line_idx);
   read_genome_ret_INVALID_FORMAT_2:
     logerrprintb();
   read_genome_ret_INVALID_FORMAT:
@@ -1468,17 +1441,18 @@ int32_t read_genome(char* read_genome_fname, uintptr_t unfiltered_sample_ct, uin
     break;
   }
  read_genome_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   gzclose_cond(gz_infile);
   return retval;
 }
 
 int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t cluster_ct, uint32_t* cluster_starts, uint32_t* sample_to_cluster, uintptr_t* merge_prevented) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* matchfile = NULL;
   FILE* typefile = NULL;
-  char* id_buf = &(tbuf[MAXLINELEN]);
+  char* id_buf = &(g_textbuf[MAXLINELEN]);
   char* missing_str = NULL;
+  uintptr_t bigstack_pre_end_address = ((uintptr_t)g_bigstack_end) - MAXLINELEN;
   uintptr_t cur_coord = 0;
   uint32_t cluster_mismatch_warning = 0;
   uint32_t cov_ct = 0;
@@ -1490,7 +1464,7 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
   uint32_t* id_map;
   char** sample_idx_to_match_str;
   double** sample_idx_to_dvals;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   unsigned char* cov_type_arr;
   double* tol_arr;
   char* bufptr;
@@ -1513,23 +1487,23 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
   uint32_t uii;
   int32_t ii;
   char cc;
-  retval = sort_item_ids(&sorted_ids, &id_map, unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref);
+  retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref, &sorted_ids, &id_map);
   if (retval) {
     goto cluster_enforce_match_ret_1;
   }
   
-  wkspace_mark2 = wkspace_base;
-  tbuf[MAXLINELEN - 1] = ' ';
+  bigstack_mark2 = g_bigstack_base;
+  g_textbuf[MAXLINELEN - 1] = ' ';
   if (cp->match_fname) {
-    sample_idx_to_match_str = (char**)wkspace_alloc(sample_ct * sizeof(intptr_t));
+    sample_idx_to_match_str = (char**)bigstack_alloc(sample_ct * sizeof(intptr_t));
     if (!sample_idx_to_match_str) {
       goto cluster_enforce_match_ret_NOMEM;
     }
     for (sample_idx1 = 0; sample_idx1 < sample_ct; sample_idx1++) {
       sample_idx_to_match_str[sample_idx1] = NULL;
     }
-    cov_type_arr = wkspace_base;
-    if (wkspace_left < MAXLINELEN) {
+    cov_type_arr = g_bigstack_base;
+    if (((uintptr_t)cov_type_arr) > bigstack_pre_end_address) {
       goto cluster_enforce_match_ret_NOMEM;
     }
     if (cp->match_missing_str) {
@@ -1537,18 +1511,18 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
       missing_len = strlen(missing_str);
     }
     if (cp->match_type_fname) {
-      if (fopen_checked(&typefile, cp->match_type_fname, "r")) {
+      if (fopen_checked(cp->match_type_fname, "r", &typefile)) {
 	goto cluster_enforce_match_ret_OPEN_FAIL;
       }
       cov_idx = 0;
       line_idx = 0;
-      while (fgets(tbuf, MAXLINELEN, typefile)) {
+      while (fgets(g_textbuf, MAXLINELEN, typefile)) {
 	line_idx++;
-        if (!tbuf[MAXLINELEN - 1]) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of --match-type file is pathologically long.\n", line_idx);
+        if (!g_textbuf[MAXLINELEN - 1]) {
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --match-type file is pathologically long.\n", line_idx);
           goto cluster_enforce_match_ret_INVALID_FORMAT_2;
 	}
-        bufptr = skip_initial_spaces(tbuf);
+        bufptr = skip_initial_spaces(g_textbuf);
 	cc = *bufptr;
         while (!is_eoln_kns(cc)) {
 	  slen = strlen_se(bufptr);
@@ -1561,7 +1535,7 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
             cov_type_arr[cov_ct] = 0;
 	    cov_idx++;
 	  } else {
-            sprintf(logbuf, "Error: Line %" PRIuPTR " of --match-type file has an invalid token\n(0/1/-1/-/+/* expected).\n", line_idx);
+            sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --match-type file has an invalid token\n(0/1/-1/-/+/* expected).\n", line_idx);
 	    goto cluster_enforce_match_ret_INVALID_FORMAT_2;
 	  }
 	  cov_ct++;
@@ -1589,9 +1563,9 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
       while (!cov_type_arr[cov_ct - 1]) {
 	cov_ct--;
       }
-      wkspace_alloc(cov_ct * sizeof(char)); // cov_type_arr
+      bigstack_alloc(cov_ct * sizeof(char)); // cov_type_arr
     }
-    retval = open_and_load_to_first_token(&matchfile, cp->match_fname, MAXLINELEN, '\0', "--match file", tbuf, &bufptr, &line_idx);
+    retval = open_and_load_to_first_token(&matchfile, cp->match_fname, MAXLINELEN, '\0', "--match file", g_textbuf, &bufptr, &line_idx);
     if (retval) {
       goto cluster_enforce_match_ret_1;
     }
@@ -1602,22 +1576,22 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
 	goto cluster_enforce_match_ret_MISSING_TOKENS;
       }
       cov_ct -= 2;
-      wkspace_alloc(cov_ct * sizeof(char)); // cov_type_arr
+      bigstack_alloc(cov_ct * sizeof(char)); // cov_type_arr
       memset(cov_type_arr, 2, cov_ct);
       non_null_cov_ct = cov_ct;
     }
-    wptr = (char*)wkspace_base;
+    wptr = (char*)g_bigstack_base;
     do {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --match file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --match file is pathologically long.\n", line_idx);
 	goto cluster_enforce_match_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
-      if (bsearch_read_fam_indiv(id_buf, sorted_ids, max_sample_id_len, sample_ct, bufptr, &bufptr2, &ii)) {
+      if (bsearch_read_fam_indiv(bufptr, sorted_ids, max_sample_id_len, sample_ct, &bufptr2, &ii, id_buf)) {
 	goto cluster_enforce_match_ret_MISSING_TOKENS;
       }
       if (ii == -1) {
@@ -1645,10 +1619,10 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
 	  }
 	}
       }
-      if ((uintptr_t)(((unsigned char*)wptr) - wkspace_base) > wkspace_left - MAXLINELEN) {
+      if (((uintptr_t)wptr) > bigstack_pre_end_address) {
 	goto cluster_enforce_match_ret_NOMEM;
       }
-    } while (fgets(tbuf, MAXLINELEN, matchfile));
+    } while (fgets(g_textbuf, MAXLINELEN, matchfile));
     if (!feof(matchfile)) {
       goto cluster_enforce_match_ret_READ_FAIL;
     }
@@ -1731,7 +1705,7 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
 	}
 	if (cov_idx < non_null_cov_ct) {
 	  if (clidx1 != clidx2) {
-	    SET_BIT(merge_prevented, cur_coord);
+	    SET_BIT(cur_coord, merge_prevented);
 	  } else {
 	    cluster_mismatch_warning = 1;
 	  }
@@ -1744,18 +1718,18 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
     }
     cov_ct = 0;
     non_null_cov_ct = 0;
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
   }
   if (cp->qmatch_fname) {
-    sample_idx_to_dvals = (double**)wkspace_alloc(sample_ct * sizeof(intptr_t));
+    sample_idx_to_dvals = (double**)bigstack_alloc(sample_ct * sizeof(intptr_t));
     if (!sample_idx_to_dvals) {
       goto cluster_enforce_match_ret_NOMEM;
     }
     for (sample_idx1 = 0; sample_idx1 < sample_ct; sample_idx1++) {
       sample_idx_to_dvals[sample_idx1] = NULL;
     }
-    tol_arr = (double*)wkspace_base;
-    if (wkspace_left <= MAXLINELEN * 4) {
+    tol_arr = (double*)g_bigstack_base;
+    if (bigstack_left() <= MAXLINELEN * 4) {
       goto cluster_enforce_match_ret_NOMEM;
     }
     if (cp->qmatch_missing_str) {
@@ -1766,30 +1740,30 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
 	*bufptr++ = '-';
 	missing_pheno = -missing_pheno;
       }
-      bufptr = uint32_write(bufptr, (uint32_t)missing_pheno);
+      bufptr = uint32toa((uint32_t)missing_pheno, bufptr);
       *bufptr = '\0';
       missing_str = intbuf;
     }
     missing_len = strlen(missing_str);
-    if (fopen_checked(&typefile, cp->qt_fname, "r")) {
+    if (fopen_checked(cp->qt_fname, "r", &typefile)) {
       goto cluster_enforce_match_ret_OPEN_FAIL;
     }
     line_idx = 0;
-    while (fgets(tbuf, MAXLINELEN, typefile)) {
+    while (fgets(g_textbuf, MAXLINELEN, typefile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --qt file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --qt file is pathologically long.\n", line_idx);
 	goto cluster_enforce_match_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       while (!is_eoln_kns(*bufptr)) {
         if (scan_double(bufptr, &dxx)) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of --qt file has a non-numeric value.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --qt file has a non-numeric value.\n", line_idx);
 	  goto cluster_enforce_match_ret_INVALID_FORMAT_2;
 	}
 	if (dxx < 0) {
 	  if (dxx != -1) {
-	    sprintf(logbuf, "Error: Line %" PRIuPTR " of --qt file has an invalid tolerance (-1 = ignore,\nother values must be nonnegative).\n", line_idx);
+	    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --qt file has an invalid tolerance (-1 = ignore,\nother values must be nonnegative).\n", line_idx);
             goto cluster_enforce_match_ret_INVALID_FORMAT_2;
 	  }
 	} else {
@@ -1811,26 +1785,26 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
       logerrprint("Error: Empty --qt file.\n");
       goto cluster_enforce_match_ret_INVALID_FORMAT;
     }
-    wkspace_alloc(cov_ct * sizeof(double)); // tol_arr
-    if (wkspace_left < non_null_cov_ct * sizeof(double)) {
+    bigstack_alloc(cov_ct * sizeof(double)); // tol_arr
+    if (bigstack_left() < non_null_cov_ct * sizeof(double)) {
       goto cluster_enforce_match_ret_NOMEM;
     }
-    dptr = (double*)wkspace_base;
-    if (fopen_checked(&matchfile, cp->qmatch_fname, "r")) {
+    dptr = (double*)g_bigstack_base;
+    if (fopen_checked(cp->qmatch_fname, "r", &matchfile)) {
       goto cluster_enforce_match_ret_OPEN_FAIL;
     }
     line_idx = 0;
-    while (fgets(tbuf, MAXLINELEN, matchfile)) {
+    while (fgets(g_textbuf, MAXLINELEN, matchfile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --qmatch file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --qmatch file is pathologically long.\n", line_idx);
 	goto cluster_enforce_match_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
-      if (bsearch_read_fam_indiv(id_buf, sorted_ids, max_sample_id_len, sample_ct, bufptr, &bufptr, &ii)) {
+      if (bsearch_read_fam_indiv(bufptr, sorted_ids, max_sample_id_len, sample_ct, &bufptr, &ii, id_buf)) {
         goto cluster_enforce_match_ret_MISSING_TOKENS_Q;
       }
       if (ii == -1) {
@@ -1850,10 +1824,10 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
 	}
         if (tol_arr[cov_idx] != -1) {
 	  if ((!memcmp(bufptr, missing_str, missing_len)) && (((unsigned char)bufptr[missing_len]) <= ' ')) {
-	    *dptr++ = -HUGE_DOUBLE;
+	    *dptr++ = -DBL_MAX;
 	  } else {
             if (scan_double(bufptr, dptr++)) {
-	      sprintf(logbuf, "Error: Line %" PRIuPTR " of --qmatch file has a non-numeric covariate.\n", line_idx);
+	      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --qmatch file has a non-numeric covariate.\n", line_idx);
 	      goto cluster_enforce_match_ret_INVALID_FORMAT_2;
 	    }
 	  }
@@ -1861,7 +1835,7 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
 	bufptr = token_endnn(bufptr);
       }
 
-      if (wkspace_left < (uintptr_t)(((unsigned char*)(&(dptr[non_null_cov_ct]))) - wkspace_base)) {
+      if (((uintptr_t)g_bigstack_end) < (uintptr_t)(&(dptr[non_null_cov_ct]))) {
 	goto cluster_enforce_match_ret_NOMEM;
       }
     }
@@ -1927,13 +1901,13 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
 	for (cov_idx = 0; cov_idx < non_null_cov_ct; cov_idx++) {
 	  dxx = *dptr++;
 	  dyy = *dptr2++;
-	  if ((dxx != -HUGE_DOUBLE) && (dyy != -HUGE_DOUBLE) && (tol_arr[cov_idx] < fabs(dxx - dyy))) {
+	  if ((dxx != -DBL_MAX) && (dyy != -DBL_MAX) && (tol_arr[cov_idx] < fabs(dxx - dyy))) {
             break;
 	  }
 	}
 	if (cov_idx < non_null_cov_ct) {
 	  if (clidx1 != clidx2) {
-	    SET_BIT(merge_prevented, cur_coord);
+	    SET_BIT(cur_coord, merge_prevented);
 	  } else {
 	    cluster_mismatch_warning = 1;
 	  }
@@ -1960,7 +1934,7 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
     retval = RET_INVALID_FORMAT;
     break;
   cluster_enforce_match_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --match file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --match file has fewer tokens than expected.\n", line_idx);
   cluster_enforce_match_ret_INVALID_FORMAT_2:
     logerrprintb();
   cluster_enforce_match_ret_INVALID_FORMAT:
@@ -1968,7 +1942,7 @@ int32_t cluster_enforce_match(Cluster_info* cp, int32_t missing_pheno, uintptr_t
     break;
   }
  cluster_enforce_match_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(matchfile);
   fclose_cond(typefile);
   return retval;
@@ -2180,7 +2154,7 @@ uint32_t cluster_main(uintptr_t cluster_ct, uintptr_t* merge_prevented, uintptr_
       for (uii = 0; uii < clidx_small; uii++) {
 	if ((cur_cluster_remap[uii] == uii) && (!is_set(merge_prevented, tcoord2 + uii))) {
 	  if (is_set(merge_prevented, tcoord1 + uii)) {
-	    set_bit(merge_prevented, tcoord2 + uii);
+	    set_bit(tcoord2 + uii, merge_prevented);
 	  } else {
 	    ujj = cluster_index[tcoord1 + uii];
 	    ukk = cluster_index[tcoord2 + uii];
@@ -2197,7 +2171,7 @@ uint32_t cluster_main(uintptr_t cluster_ct, uintptr_t* merge_prevented, uintptr_
 	umm = tri_coord_no_diag_32(clidx_small, uii);
 	if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, umm))) {
 	  if (is_set(merge_prevented, tcoord1 + uii)) {
-	    SET_BIT(merge_prevented, umm);
+	    SET_BIT(umm, merge_prevented);
 	  } else {
 	    ujj = cluster_index[tcoord1 + uii];
 	    ukk = cluster_index[umm];
@@ -2214,7 +2188,7 @@ uint32_t cluster_main(uintptr_t cluster_ct, uintptr_t* merge_prevented, uintptr_
 	umm = tri_coord_no_diag_32(clidx_small, uii);
 	if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, umm))) {
 	  if (is_set(merge_prevented, tri_coord_no_diag_32(clidx_large, uii))) {
-	    SET_BIT(merge_prevented, umm);
+	    SET_BIT(umm, merge_prevented);
 	  } else {
 	    ujj = cluster_index[tri_coord_no_diag_32(clidx_large, uii)];
 	    ukk = cluster_index[umm];
@@ -2231,7 +2205,7 @@ uint32_t cluster_main(uintptr_t cluster_ct, uintptr_t* merge_prevented, uintptr_
       for (uii = 0; uii < clidx_small; uii++) {
 	if ((cur_cluster_remap[uii] == uii) && (!is_set(merge_prevented, tcoord2 + uii))) {
 	  if (is_set(merge_prevented, tcoord1 + uii) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	    set_bit(merge_prevented, tcoord2 + uii);
+	    set_bit(tcoord2 + uii, merge_prevented);
 	  } else {
 	    ujj = cluster_index[tcoord1 + uii];
 	    ukk = cluster_index[tcoord2 + uii];
@@ -2248,7 +2222,7 @@ uint32_t cluster_main(uintptr_t cluster_ct, uintptr_t* merge_prevented, uintptr_
 	umm = tri_coord_no_diag_32(clidx_small, uii);
 	if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, umm))) {
 	  if (is_set(merge_prevented, tcoord1 + uii) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	    SET_BIT(merge_prevented, umm);
+	    SET_BIT(umm, merge_prevented);
 	  } else {
 	    ujj = cluster_index[tcoord1 + uii];
 	    ukk = cluster_index[umm];
@@ -2265,7 +2239,7 @@ uint32_t cluster_main(uintptr_t cluster_ct, uintptr_t* merge_prevented, uintptr_
 	umm = tri_coord_no_diag_32(clidx_small, uii);
 	if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, umm))) {
 	  if (is_set(merge_prevented, tri_coord_no_diag_32(clidx_large, uii)) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	    SET_BIT(merge_prevented, umm);
+	    SET_BIT(umm, merge_prevented);
 	  } else {
 	    ujj = cluster_index[tri_coord_no_diag_32(clidx_large, uii)];
 	    ukk = cluster_index[umm];
@@ -2526,7 +2500,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	for (uii = 0; uii < clidx_small; uii++) {
 	  if ((cur_cluster_remap[uii] == uii) && (!is_set(merge_prevented, tcoord2 + uii))) {
 	    if (is_set(merge_prevented, tcoord1 + uii)) {
-	      set_bit(merge_prevented, tcoord2 + uii);
+	      set_bit(tcoord2 + uii, merge_prevented);
 	    } else {
 	      heap_merge_two(tcoord1 + uii, tcoord2 + uii, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2536,7 +2510,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tcoord1 + uii)) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two(tcoord1 + uii, ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2546,7 +2520,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tri_coord_no_diag_32(clidx_large, uii))) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two(tri_coord_no_diag_32(clidx_large, uii), ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2556,7 +2530,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	for (uii = 0; uii < clidx_small; uii++) {
 	  if ((cur_cluster_remap[uii] == uii) && (!is_set(merge_prevented, tcoord2 + uii))) {
 	    if (is_set(merge_prevented, tcoord1 + uii) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	      set_bit(merge_prevented, tcoord2 + uii);
+	      set_bit(tcoord2 + uii, merge_prevented);
 	    } else {
 	      heap_merge_two(tcoord1 + uii, tcoord2 + uii, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2566,7 +2540,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tcoord1 + uii) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two(tcoord1 + uii, ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2576,7 +2550,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tri_coord_no_diag_32(clidx_large, uii)) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two(tri_coord_no_diag_32(clidx_large, uii), ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2588,7 +2562,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	for (uii = 0; uii < clidx_small; uii++) {
 	  if ((cur_cluster_remap[uii] == uii) && (!is_set(merge_prevented, tcoord2 + uii))) {
 	    if (is_set(merge_prevented, tcoord1 + uii)) {
-	      set_bit(merge_prevented, tcoord2 + uii);
+	      set_bit(tcoord2 + uii, merge_prevented);
 	    } else {
 	      heap_merge_two_cc(tcoord1 + uii, tcoord2 + uii, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2598,7 +2572,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tcoord1 + uii)) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two_cc(tcoord1 + uii, ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2608,7 +2582,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tri_coord_no_diag_32(clidx_large, uii))) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two_cc(tri_coord_no_diag_32(clidx_large, uii), ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2618,7 +2592,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	for (uii = 0; uii < clidx_small; uii++) {
 	  if ((cur_cluster_remap[uii] == uii) && (!is_set(merge_prevented, tcoord2 + uii))) {
 	    if (is_set(merge_prevented, tcoord1 + uii) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	      set_bit(merge_prevented, tcoord2 + uii);
+	      set_bit(tcoord2 + uii, merge_prevented);
 	    } else {
 	      heap_merge_two_cc(tcoord1 + uii, tcoord2 + uii, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2628,7 +2602,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tcoord1 + uii) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two_cc(tcoord1 + uii, ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2638,7 +2612,7 @@ uint32_t cluster_group_avg_main(uint32_t cluster_ct, uintptr_t* merge_prevented,
 	  ujj = tri_coord_no_diag_32(clidx_small, uii);
 	  if ((cur_cluster_remap[uii] == uii) && (!IS_SET(merge_prevented, ujj))) {
 	    if (is_set(merge_prevented, tri_coord_no_diag_32(clidx_large, uii)) || (size_restriction && (cur_cluster_sizes[uii] > cur_size)) || (case_restriction && (cur_cluster_case_cts[uii] > cur_cases)) || (ctrl_restriction && (cur_cluster_sizes[uii] - cur_cluster_case_cts[uii] > cur_ctrls))) {
-	      SET_BIT(merge_prevented, ujj);
+	      SET_BIT(ujj, merge_prevented);
 	    } else {
 	      heap_merge_two_cc(tri_coord_no_diag_32(clidx_large, uii), ujj, dsize2, dsize1, dsize_recip, &heap_size, heap_vals, val_to_cindices, cluster_index);
 	    }
@@ -2734,7 +2708,7 @@ void write_cluster1_oitc(FILE* outfile, uint32_t clidx, char* sample_ids, uintpt
 }
 
 int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_sample_to_cluster, uintptr_t sample_ct, uint32_t* orig_cluster_map, uint32_t* orig_cluster_starts, uint32_t* late_clidx_to_sample_uidx, uint32_t orig_within_ct, uint32_t orig_cluster_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* pheno_c, uint32_t* sample_idx_to_uidx, Cluster_info* cp, uint32_t* cluster_remap, uint32_t* clidx_table_space, uint32_t merge_ct, uint32_t* merge_sequence) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uint32_t only2 = cp->modifier & CLUSTER_ONLY2;
   uint32_t report_pheno = (cp->modifier & CLUSTER_CC) || (cp->max_ctrls != 0xffffffffU);
@@ -2758,7 +2732,7 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
     }
   }
   memcpy(outname_end, ".cluster2", 10);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_cluster_solution_ret_OPEN_FAIL;
   }
   fputs("Writing cluster solution...", stdout);
@@ -2771,10 +2745,10 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
     }
     sptr = &(sample_ids[sample_idx_to_uidx[sample_idx] * max_sample_id_len]);
     sptr2 = (char*)memchr(sptr, '\t', max_sample_id_len);
-    wptr = memcpyax(tbuf, sptr, (sptr2 - sptr), ' ');
+    wptr = memcpyax(g_textbuf, sptr, (sptr2 - sptr), ' ');
     wptr = strcpyax(wptr, &(sptr2[1]), '\t');
-    wptr = uint32_writex(wptr, clidx_remap[clidx], '\n');
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    wptr = uint32toa_x(clidx_remap[clidx], '\n', wptr);
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto write_cluster_solution_ret_WRITE_FAIL;
     }
   }
@@ -2783,14 +2757,14 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
   }
   if (!only2) {
     outname_end[8] = '1';
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto write_cluster_solution_ret_OPEN_FAIL;
     }
-    memcpy(tbuf, "SOL-", 4);
+    memcpy(g_textbuf, "SOL-", 4);
     for (clidx = 0; clidx < orig_cluster_ct; clidx++) {
       if (cluster_remap[clidx] == clidx) {
-        wptr = uint32_writex(&(tbuf[4]), clidx_remap[clidx], '\t');
-        if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+        wptr = uint32toa_x(clidx_remap[clidx], '\t', &(g_textbuf[4]));
+        if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto write_cluster_solution_ret_WRITE_FAIL;
 	}
         if (!orig_sample_to_cluster) {
@@ -2811,7 +2785,7 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
     } else {
       outname_end[8] = '3';
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto write_cluster_solution_ret_WRITE_FAIL;
     }
     clidx = 0;
@@ -2858,7 +2832,7 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
       } else {
 	clidx = sample_idx;
       }
-      wptr = uint32_writex(wbuf, clidx, ' ');
+      wptr = uint32toa_x(clidx, ' ', wbuf);
       fwrite(wbuf, 1, wptr - wbuf, outfile);
       uii = 0;
       if (merge_ct) {
@@ -2869,7 +2843,7 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
 	    ujj = merge_ct;
 	  }
 	  for (; uii < ujj; uii++) {
-	    wptr = uint32_writex(wbuf, *cur_remap++, ' ');
+	    wptr = uint32toa_x(*cur_remap++, ' ', wbuf);
 	    fwrite(wbuf, 1, wptr - wbuf, outfile);
 	  }
 	  if (ujj == merge_ct) {
@@ -2916,7 +2890,7 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
     break;
   }
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -2969,12 +2943,10 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
     }
   }
   if (is_mds_cluster) {
-    if (wkspace_alloc_d_checked(&main_matrix, final_cluster_ct * final_cluster_ct * sizeof(double)) ||
-        wkspace_alloc_ui_checked(&final_cluster_sizes, final_cluster_ct * sizeof(int32_t))) {
+    if (bigstack_calloc_d(final_cluster_ct * final_cluster_ct, &main_matrix) ||
+        bigstack_alloc_ui(final_cluster_ct, &final_cluster_sizes)) {
       goto mds_plot_ret_NOMEM;
     }
-    fill_double_zero(main_matrix, final_cluster_ct * final_cluster_ct);
-    fill_uint_zero(final_cluster_sizes, final_cluster_ct);
     dptr = dists;
     final_cluster_sizes[final_cluster_remap[0]] = 1;
     for (uii = 1; uii < cur_cluster_ct; uii++) {
@@ -3001,8 +2973,8 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
     }
     ulii = final_cluster_ct;
   } else {
-    wkspace_reset(dists);
-    if (wkspace_alloc_d_checked(&main_matrix, sample_ct * sample_ct * sizeof(double))) {
+    bigstack_reset(dists);
+    if (bigstack_alloc_d(sample_ct * sample_ct, &main_matrix)) {
       goto mds_plot_ret_NOMEM;
     }
     // expand triangular diagonal-free matrix to bottom-left of square matrix
@@ -3017,10 +2989,9 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
     }
     ulii = sample_ct;
   }
-  if (wkspace_alloc_d_checked(&column_means, ulii * sizeof(double))) {
+  if (bigstack_calloc_d(ulii, &column_means)) {
     goto mds_plot_ret_NOMEM;
   }
-  fill_double_zero(column_means, ulii);
   // bottom left filled with IBS values.  Now subtract them from 1 and square
   // them, and extract column means...
   for (clidx1 = 0; clidx1 < ulii; clidx1++) {
@@ -3070,16 +3041,16 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
   fflush(stdout);
 
   mdim = ulii;
-  if (wkspace_alloc_d_checked(&sqrt_eigvals, ulii * sizeof(double)) ||
-      wkspace_alloc_d_checked(&out_u, ulii * ulii * sizeof(double)) ||
-      wkspace_alloc_d_checked(&out_v, ulii * ulii * sizeof(double))) {
+  if (bigstack_alloc_d(ulii, &sqrt_eigvals) ||
+      bigstack_alloc_d(ulii * ulii, &out_u) ||
+      bigstack_alloc_d(ulii * ulii, &out_v)) {
     goto mds_plot_ret_NOMEM;
   }
   // fill_double_zero(sqrt_eigvals, ulii);
   // fill_double_zero(out_u, ulii * ulii);
   // fill_double_zero(out_v, ulii * ulii);
 
-  iwork = (__CLPK_integer*)wkspace_alloc(8 * ulii * sizeof(__CLPK_integer));
+  iwork = (__CLPK_integer*)bigstack_alloc(8 * ulii * sizeof(__CLPK_integer));
   if (!iwork) {
     goto mds_plot_ret_NOMEM;
   }
@@ -3088,7 +3059,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
   // workspace query
   dgesdd_(&jobz, &mdim, &mdim, main_matrix, &mdim, sqrt_eigvals, out_u, &mdim, out_v, &mdim, &optim_lwork, &lwork, iwork, &info);
   lwork = (int32_t)optim_lwork;
-  if (wkspace_alloc_d_checked(&work, lwork * sizeof(double))) {
+  if (bigstack_alloc_d(lwork, &work)) {
     goto mds_plot_ret_NOMEM;
   }
   // fill_double_zero(work, lwork);
@@ -3118,19 +3089,19 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
   logprint(" done.\n");
 
   memcpy(outname_end, ".mds", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto mds_plot_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, "%%%us %%%us    SOL ", plink_maxfid, plink_maxiid);
-  fprintf(outfile, tbuf, "FID", "IID");
-  tbuf[22] = ' ';
+  sprintf(g_textbuf, "%%%us %%%us    SOL ", plink_maxfid, plink_maxiid);
+  fprintf(outfile, g_textbuf, "FID", "IID");
+  g_textbuf[22] = ' ';
   for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
-    wptr = uint32_write(tbuf, dim_idx + 1);
-    uii = wptr - tbuf;
-    wptr2 = memseta(&(tbuf[10]), 32, 11 - uii);
+    wptr = uint32toa(dim_idx + 1, g_textbuf);
+    uii = wptr - g_textbuf;
+    wptr2 = memseta(&(g_textbuf[10]), 32, 11 - uii);
     *wptr2++ = 'C';
-    memcpy(wptr2, tbuf, uii);
-    fwrite(&(tbuf[10]), 1, 13, outfile);
+    memcpy(wptr2, g_textbuf, uii);
+    fwrite(&(g_textbuf[10]), 1, 13, outfile);
   }
   if (putc_checked('\n', outfile)) {
     goto mds_plot_ret_WRITE_FAIL;
@@ -3138,7 +3109,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
   for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
     wptr2 = &(sample_ids[sample_idx_to_uidx[sample_idx] * max_sample_id_len]);
     uii = strlen_se(wptr2);
-    wptr = fw_strcpyn(plink_maxfid, uii, wptr2, tbuf);
+    wptr = fw_strcpyn(plink_maxfid, uii, wptr2, g_textbuf);
     *wptr++ = ' ';
     wptr = fw_strcpy(plink_maxiid, &(wptr2[uii + 1]), wptr);
     *wptr++ = ' ';
@@ -3148,33 +3119,33 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
       uii = sample_idx;
     }
     uii = final_cluster_remap[uii];
-    wptr = uint32_writew6x(wptr, uii, ' ');
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    wptr = uint32toa_w6x(uii, ' ', wptr);
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto mds_plot_ret_WRITE_FAIL;
     }
     if (!is_mds_cluster) {
       dptr = &(main_matrix[sample_idx * dim_ct]);
       for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
-        wptr = double_g_writex(&(tbuf[11]), *(dptr++), ' ');
-	uii = wptr - (&(tbuf[11]));
+        wptr = dtoa_gx(*(dptr++), ' ', &(g_textbuf[11]));
+	uii = wptr - (&(g_textbuf[11]));
 	if (uii < 13) {
 	  wptr2 = &(wptr[-13]);
 	  memset(wptr2, 32, 13 - uii);
 	} else {
-	  wptr2 = &(tbuf[11]);
+	  wptr2 = &(g_textbuf[11]);
 	}
 	fwrite(wptr2, 1, wptr - wptr2, outfile);
       }
     } else {
       dptr = &(main_matrix[uii * dim_ct]);
       for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
-        wptr = double_g_writex(&(tbuf[11]), *(dptr++), ' ');
-	uii = wptr - (&(tbuf[11]));
+        wptr = dtoa_gx(*(dptr++), ' ', &(g_textbuf[11]));
+	uii = wptr - (&(g_textbuf[11]));
 	if (uii < 13) {
 	  wptr2 = &(wptr[-13]);
 	  memset(wptr2, 32, 13 - uii);
 	} else {
-	  wptr2 = &(tbuf[11]);
+	  wptr2 = &(g_textbuf[11]);
 	}
 	fwrite(wptr2, 1, wptr - wptr2, outfile);
       }
@@ -3191,13 +3162,13 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
   } else {
     LOGPREPRINTFWW("MDS solution written to %s (eigenvalues in %s.eigvals ).\n", outname, outname);
     memcpy(&(outname_end[4]), ".eigvals", 9);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto mds_plot_ret_OPEN_FAIL;
     }
     for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
-      wptr = double_g_writex(tbuf, sqrt_eigvals[dim_idx] * sqrt_eigvals[dim_idx], '\n');
+      wptr = dtoa_gx(sqrt_eigvals[dim_idx] * sqrt_eigvals[dim_idx], '\n', g_textbuf);
       *wptr = '\0';
-      fputs(tbuf, outfile);
+      fputs(g_textbuf, outfile);
     }
     if (fclose_null(&outfile)) {
       goto mds_plot_ret_WRITE_FAIL;
@@ -3217,7 +3188,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
   }
   fclose_cond(outfile);
   free_cond(final_cluster_remap);
-  wkspace_reset(dists);
+  bigstack_reset(dists);
   return retval;
 }
 
@@ -3281,12 +3252,10 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
     }
   }
   if (is_mds_cluster) {
-    if (wkspace_alloc_d_checked(&main_matrix, final_cluster_ct * final_cluster_ct * sizeof(double)) ||
-        wkspace_alloc_ui_checked(&final_cluster_sizes, final_cluster_ct * sizeof(int32_t))) {
+    if (bigstack_calloc_d(final_cluster_ct * final_cluster_ct, &main_matrix) ||
+        bigstack_calloc_ui(final_cluster_ct, &final_cluster_sizes)) {
       goto mds_plot_eigendecomp_ret_NOMEM;
     }
-    fill_double_zero(main_matrix, final_cluster_ct * final_cluster_ct);
-    fill_uint_zero(final_cluster_sizes, final_cluster_ct);
     dptr = dists;
     final_cluster_sizes[final_cluster_remap[0]] = 1;
     for (uii = 1; uii < cur_cluster_ct; uii++) {
@@ -3313,8 +3282,8 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
     }
     ulii = final_cluster_ct;
   } else {
-    wkspace_reset(dists);
-    if (wkspace_alloc_d_checked(&main_matrix, sample_ct * sample_ct * sizeof(double))) {
+    bigstack_reset(dists);
+    if (bigstack_alloc_d(sample_ct * sample_ct, &main_matrix)) {
       goto mds_plot_eigendecomp_ret_NOMEM;
     }
     // expand triangular diagonal-free matrix to square matrix
@@ -3329,10 +3298,9 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
     }
     ulii = sample_ct;
   }
-  if (wkspace_alloc_d_checked(&column_means, ulii * sizeof(double))) {
+  if (bigstack_calloc_d(ulii, &column_means)) {
     goto mds_plot_eigendecomp_ret_NOMEM;
   }
-  fill_double_zero(column_means, ulii);
   // bottom left filled with IBS values.  Now subtract them from 1 and square
   // them, and extract column means...
   for (clidx1 = 1; clidx1 < ulii; clidx1++) {
@@ -3381,13 +3349,11 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
   mdim = ulii;
   i2 = mdim;
   i1 = i2 + 1 - dim_ct;
-  if (wkspace_alloc_d_checked(&out_w, dim_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&out_z, dim_ct * ulii * sizeof(double))) {
+  if (bigstack_calloc_d(dim_ct, &out_w) ||
+      bigstack_calloc_d(dim_ct * ulii, &out_z)) {
     goto mds_plot_eigendecomp_ret_NOMEM;
   }
-  fill_double_zero(out_w, dim_ct);
-  fill_double_zero(out_z, dim_ct * ulii);
-  isuppz = (__CLPK_integer*)wkspace_alloc(2 * dim_ct * sizeof(__CLPK_integer));
+  isuppz = (__CLPK_integer*)bigstack_alloc(2 * dim_ct * sizeof(__CLPK_integer));
   if (!isuppz) {
     goto mds_plot_eigendecomp_ret_NOMEM;
   }
@@ -3396,23 +3362,22 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
 
   dsyevr_(&jobz, &range, &uplo, &mdim, main_matrix, &mdim, &nz, &nz, &i1, &i2, &zz, &out_m, out_w, out_z, &ldz, isuppz, &optim_lwork, &lwork, &optim_liwork, &liwork, &info);
   lwork = (int32_t)optim_lwork;
-  if (wkspace_alloc_d_checked(&work, lwork * sizeof(double))) {
+  if (bigstack_calloc_d(lwork, &work)) {
     goto mds_plot_eigendecomp_ret_NOMEM;
   }
   liwork = optim_liwork;
-  iwork = (__CLPK_integer*)wkspace_alloc(liwork * sizeof(__CLPK_integer));
+  iwork = (__CLPK_integer*)bigstack_alloc(liwork * sizeof(__CLPK_integer));
   if (!iwork) {
     goto mds_plot_eigendecomp_ret_NOMEM;
   }
-  fill_double_zero(work, lwork);
   fill_int_zero((int32_t*)iwork, liwork * (sizeof(__CLPK_integer) / sizeof(int32_t)));
   dsyevr_(&jobz, &range, &uplo, &mdim, main_matrix, &mdim, &nz, &nz, &i1, &i2, &zz, &out_m, out_w, out_z, &ldz, isuppz, work, &lwork, iwork, &liwork, &info);
 
   // * out_w[0..(dim_ct-1)] contains eigenvalues
   // * out_z[(ii*ulii)..(ii*ulii + ulii - 1)] is eigenvector corresponding to
   //   out_w[ii]
-  wkspace_reset(isuppz);
-  if (wkspace_alloc_d_checked(&sqrt_eigvals, dim_ct * sizeof(double))) {
+  bigstack_reset(isuppz);
+  if (bigstack_alloc_d(dim_ct, &sqrt_eigvals)) {
     goto mds_plot_eigendecomp_ret_NOMEM;
   }
   for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
@@ -3433,19 +3398,19 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
   logprint(" done.\n");
 
   memcpy(outname_end, ".mds", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto mds_plot_eigendecomp_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, "%%%us %%%us    SOL ", plink_maxfid, plink_maxiid);
-  fprintf(outfile, tbuf, "FID", "IID");
-  tbuf[22] = ' ';
+  sprintf(g_textbuf, "%%%us %%%us    SOL ", plink_maxfid, plink_maxiid);
+  fprintf(outfile, g_textbuf, "FID", "IID");
+  g_textbuf[22] = ' ';
   for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
-    wptr = uint32_write(tbuf, dim_idx + 1);
-    uii = wptr - tbuf;
-    wptr2 = memseta(&(tbuf[10]), 32, 11 - uii);
+    wptr = uint32toa(dim_idx + 1, g_textbuf);
+    uii = wptr - g_textbuf;
+    wptr2 = memseta(&(g_textbuf[10]), 32, 11 - uii);
     *wptr2++ = 'C';
-    memcpy(wptr2, tbuf, uii);
-    fwrite(&(tbuf[10]), 1, 13, outfile);
+    memcpy(wptr2, g_textbuf, uii);
+    fwrite(&(g_textbuf[10]), 1, 13, outfile);
   }
   if (putc_checked('\n', outfile)) {
     goto mds_plot_eigendecomp_ret_WRITE_FAIL;
@@ -3453,7 +3418,7 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
   for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
     wptr2 = &(sample_ids[sample_idx_to_uidx[sample_idx] * max_sample_id_len]);
     uii = strlen_se(wptr2);
-    wptr = fw_strcpyn(plink_maxfid, uii, wptr2, tbuf);
+    wptr = fw_strcpyn(plink_maxfid, uii, wptr2, g_textbuf);
     *wptr++ = ' ';
     wptr = fw_strcpy(plink_maxiid, &(wptr2[uii + 1]), wptr);
     *wptr++ = ' ';
@@ -3463,33 +3428,33 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
       uii = sample_idx;
     }
     uii = final_cluster_remap[uii];
-    wptr = uint32_writew6x(wptr, uii, ' ');
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    wptr = uint32toa_w6x(uii, ' ', wptr);
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto mds_plot_eigendecomp_ret_WRITE_FAIL;
     }
     if (!is_mds_cluster) {
       dptr = &(main_matrix[(sample_idx + 1) * dim_ct]);
       for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
-        wptr = double_g_writex(&(tbuf[11]), *(--dptr), ' ');
-	uii = wptr - (&(tbuf[11]));
+        wptr = dtoa_gx(*(--dptr), ' ', &(g_textbuf[11]));
+	uii = wptr - (&(g_textbuf[11]));
 	if (uii < 13) {
 	  wptr2 = &(wptr[-13]);
 	  memset(wptr2, 32, 13 - uii);
 	} else {
-	  wptr2 = &(tbuf[11]);
+	  wptr2 = &(g_textbuf[11]);
 	}
 	fwrite(wptr2, 1, wptr - wptr2, outfile);
       }
     } else {
       dptr = &(main_matrix[(uii + 1) * dim_ct]);
       for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
-        wptr = double_g_writex(&(tbuf[11]), *(--dptr), ' ');
-	uii = wptr - (&(tbuf[11]));
+        wptr = dtoa_gx(*(--dptr), ' ', &(g_textbuf[11]));
+	uii = wptr - (&(g_textbuf[11]));
 	if (uii < 13) {
 	  wptr2 = &(wptr[-13]);
 	  memset(wptr2, 32, 13 - uii);
 	} else {
-	  wptr2 = &(tbuf[11]);
+	  wptr2 = &(g_textbuf[11]);
 	}
 	fwrite(wptr2, 1, wptr - wptr2, outfile);
       }
@@ -3506,13 +3471,13 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
   } else {
     LOGPREPRINTFWW("MDS solution written to %s (eigenvalues in %s.eigvals ).\n", outname, outname);
     memcpy(&(outname_end[4]), ".eigvals", 9);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto mds_plot_eigendecomp_ret_OPEN_FAIL;
     }
     for (dim_idx = dim_ct; dim_idx; dim_idx--) {
-      wptr = double_g_writex(tbuf, sqrt_eigvals[dim_idx - 1] * sqrt_eigvals[dim_idx - 1], '\n');
+      wptr = dtoa_gx(sqrt_eigvals[dim_idx - 1] * sqrt_eigvals[dim_idx - 1], '\n', g_textbuf);
       *wptr = '\0';
-      fputs(tbuf, outfile);
+      fputs(g_textbuf, outfile);
     }
     if (fclose_null(&outfile)) {
       goto mds_plot_eigendecomp_ret_WRITE_FAIL;
@@ -3532,7 +3497,7 @@ int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample
   }
   fclose_cond(outfile);
   free_cond(final_cluster_remap);
-  wkspace_reset(dists);
+  bigstack_reset(dists);
   return retval;
 }
 #endif
diff --git a/plink_cnv.c b/plink_cnv.c
index 95e5c3b..95f5ccc 100644
--- a/plink_cnv.c
+++ b/plink_cnv.c
@@ -18,7 +18,7 @@ int32_t cnv_subset_load(char* subset_fname, char** subset_list_ptr, uintptr_t* s
     goto cnv_subset_load_ret_NOMEM;
   }
 #endif
-  if (wkspace_alloc_c_checked(subset_list_ptr, subset_ct * max_subset_name_len)) {
+  if (bigstack_alloc_c(subset_ct * max_subset_name_len, subset_list_ptr)) {
     goto cnv_subset_load_ret_NOMEM;
   }
   retval = load_string_list(&subset_file, max_subset_name_len, *subset_list_ptr);
@@ -52,7 +52,7 @@ const char* cnv_intersect_filter_type_to_str(uint32_t intersect_filter_type) {
 #define SMALL_INTERVAL_BITS 18
 #define SMALL_INTERVAL_MAX_SIZE ((1 << SMALL_INTERVAL_BITS) - 1)
 
-int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filter_fname, char* subset_list, uintptr_t subset_ct, uintptr_t max_subset_name_len, uintptr_t* il_chrom_start_small, uintptr_t* il_chrom_start_large, uint32_t* il_chrom_max_width_small, uint32_t* il_chrom_max_width_large, uint64_t** il_small_ptr, uint64_t** il_large_ptr, int32_t marker_pos_start, int32_t marker_pos_end, uint32_t allow_extra_chroms, Chrom_info* chrom_info_ptr, uintptr_t* topsize_ptr) {
+int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filter_fname, char* subset_list, uintptr_t subset_ct, uintptr_t max_subset_name_len, uintptr_t* il_chrom_start_small, uintptr_t* il_chrom_start_large, uint32_t* il_chrom_max_width_small, uint32_t* il_chrom_max_width_large, uint64_t** il_small_ptr, uint64_t** il_large_ptr, int32_t marker_pos_start, int32_t marker_pos_end, uint32_t allow_extra_chroms, Chrom_info* chrom_info_ptr) {
   // We store intervals in sorted order, with the center of each interval in
   // the high-order bits, and the size (without adding 1) in the low-order
   // bits.  (Chromosome beginnings and endings are stored in small external
@@ -72,7 +72,7 @@ int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filte
   // almost all the "small tier" intervals regardless of the largest interval
   // size.
   FILE* intersect_file = NULL;
-  uintptr_t max_interval_ct = wkspace_left / 9;
+  uintptr_t max_interval_ct = bigstack_left() / 9;
   uintptr_t small_interval_ct = 0;
   uintptr_t large_interval_ct = 0;
   uintptr_t reverse_warning_ct = 0;
@@ -82,9 +82,9 @@ int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filte
   //        bottom 16 bits = size
   // Large: top bit = zero, next 32 bits = center pos * 2, bottom 31 = size
   //        [chrom information stored separately, initially in reverse order]
-  unsigned char* tmp_il_large_chroms = wkspace_base; // grows up
-  uint64_t* tmp_il_small = (uint64_t*)(&(tmp_il_large_chroms[(max_interval_ct + 7) & (~(7 * ONELU))])); // grows up
-  uint64_t* il_large = (uint64_t*)(&(wkspace_base[wkspace_left])); // grows down
+  unsigned char* tmp_il_large_chroms = g_bigstack_base; // grows up
+  uint64_t* tmp_il_small = (uint64_t*)(&(tmp_il_large_chroms[round_up_pow2(max_interval_ct, sizeof(int64_t))])); // grows up
+  uint64_t* il_large = (uint64_t*)g_bigstack_end; // grows down
   uintptr_t* chrom_mask = chrom_info_ptr->chrom_mask;
   const char* cift_str = cnv_intersect_filter_type_to_str(intersect_filter_type);
   int32_t retval = 0;
@@ -104,30 +104,30 @@ int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filte
   uint32_t cur_chrom;
   uint32_t uii;
   unsigned char ucc;
-  if (fopen_checked(&intersect_file, intersect_filter_fname, "r")) {
+  if (fopen_checked(intersect_filter_fname, "r", &intersect_file)) {
     goto cnv_intersect_load_ret_OPEN_FAIL;
   }
-  while (fgets(tbuf, MAXLINELEN, intersect_file)) {
+  while (fgets(g_textbuf, MAXLINELEN, intersect_file)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, cift_str);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, cift_str);
       goto cnv_intersect_load_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (!is_eoln_kns(*bufptr)) {
       // CHR, BP1, BP2, subset name
       bufptr2 = next_token_mult(bufptr, 2);
       if (no_more_tokens_kns(bufptr2)) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, cift_str);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, cift_str);
 	goto cnv_intersect_load_ret_INVALID_FORMAT_2;
       }
       ii = get_chrom_code(chrom_info_ptr, bufptr);
       if (ii < 0) {
 	if ((!allow_extra_chroms) || (ii == -1)) {
-	  sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, cift_str);
+	  sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, cift_str);
 	  goto cnv_intersect_load_ret_INVALID_FORMAT_2;
 	}
-        retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &ii, line_idx, cift_str);
+        retval = resolve_or_add_chrom_name(bufptr, cift_str, line_idx, chrom_info_ptr, &ii);
 	if (retval) {
 	  goto cnv_intersect_load_ret_1;
 	}
@@ -138,11 +138,11 @@ int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filte
       }
       bufptr = next_token(bufptr);
       if (scan_uint_defcap(bufptr, (uint32_t*)&jj)) {
-	sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, cift_str);
+	sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, cift_str);
 	goto cnv_intersect_load_ret_INVALID_FORMAT_2;
       }
       if (scan_uint_defcap(bufptr2, (uint32_t*)&ii)) {
-	sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, cift_str);
+	sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, cift_str);
 	goto cnv_intersect_load_ret_INVALID_FORMAT_2;
       }
       if (ii < jj) {
@@ -196,7 +196,7 @@ int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filte
       logerrprint("Warning: All intervals filtered out by --cnv-subset.\n");
       goto cnv_intersect_load_ret_1;
     }
-    sprintf(logbuf, "Error: Empty %s.\n", cift_str);
+    sprintf(g_logbuf, "Error: Empty %s.\n", cift_str);
     goto cnv_intersect_load_ret_INVALID_FORMAT_2;
   }
   if (small_interval_ct) {
@@ -288,8 +288,7 @@ int32_t cnv_intersect_load(uint32_t intersect_filter_type, char* intersect_filte
   } else {
     fill_ulong_zero(il_chrom_start_large, chrom_code_end + 1);
   }
-  *topsize_ptr = CACHELINE * ((small_interval_ct + large_interval_ct + CACHELINE_INT64 - 1) / CACHELINE_INT64);
-  wkspace_left -= (*topsize_ptr);
+  bigstack_end_alloc_presized(round_up_pow2(small_interval_ct + large_interval_ct, CACHELINE_INT64) * sizeof(int64_t));
   while (0) {
   cnv_intersect_load_ret_NOMEM:
     retval = RET_NOMEM;
@@ -315,22 +314,22 @@ int32_t cnv_first_nonheader_line(FILE* cnvfile, uintptr_t* line_idx_ptr) {
   int32_t retval = 0;
   char* bufptr;
   rewind(cnvfile);
-  // assumes tbuf[MAXLINELEN - 1] is initialized to space
+  // assumes g_textbuf[MAXLINELEN - 1] is initialized to space
   do {
     line_idx++;
-    if (!fgets(tbuf, MAXLINELEN, cnvfile)) {
+    if (!fgets(g_textbuf, MAXLINELEN, cnvfile)) {
       goto cnv_first_nonheader_line_fgets_fail;
     }
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       logprint("\n");
       LOGERRPRINTF("Error: Line %" PRIuPTR " of .cnv file is pathologically long.\n", line_idx);
       goto cnv_first_nonheader_line_ret_INVALID_FORMAT;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
   } while (is_eoln_kns(*bufptr));
   if ((strlen_se(bufptr) == 3) && (!memcmp(bufptr, "FID", 3))) {
     line_idx++;
-    if (!fgets(tbuf, MAXLINELEN, cnvfile)) {
+    if (!fgets(g_textbuf, MAXLINELEN, cnvfile)) {
       goto cnv_first_nonheader_line_fgets_fail;
     }
   }
@@ -432,13 +431,13 @@ uint32_t is_cnv_overlap(uint32_t start_pos, uint32_t end_pos, uint32_t overlap_t
 }
 
 int32_t cnv_make_map_write(FILE* new_mapfile, Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t bp_pos, uintptr_t* max_marker_id_len_ptr) {
-  char* wptr = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+  char* wptr = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
   char* wptr2 = memcpya(wptr, "\tp", 2);
   uintptr_t cur_marker_id_len;
   // this just needs to be an arbitrary unique name, so it's fine if we don't
   // use chrom_name_write() here
-  wptr2 = uint32_writex(wptr2, chrom_idx, '-');
-  wptr2 = uint32_write(wptr2, bp_pos);
+  wptr2 = uint32toa_x(chrom_idx, '-', wptr2);
+  wptr2 = uint32toa(bp_pos, wptr2);
   cur_marker_id_len = (uintptr_t)(wptr2 - wptr);
   if (cur_marker_id_len > (*max_marker_id_len_ptr)) {
     // includes an extra character at the start, to compensate for missing
@@ -446,19 +445,19 @@ int32_t cnv_make_map_write(FILE* new_mapfile, Chrom_info* chrom_info_ptr, uint32
     *max_marker_id_len_ptr = cur_marker_id_len;
   }
   wptr2 = memcpyl3a(wptr2, "\t0\t");
-  wptr2 = uint32_writex(wptr2, bp_pos, '\n');
-  return fwrite_checked(tbuf, wptr2 - tbuf, new_mapfile);
+  wptr2 = uint32toa_x(bp_pos, '\n', wptr2);
+  return fwrite_checked(g_textbuf, wptr2 - g_textbuf, new_mapfile);
 }
 
 int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, uint32_t min_seglen, uint32_t max_seglen, double min_score, double max_score, uint32_t min_sites, uint32_t max_sites, uintptr_t* il_chrom_start_small, uintptr_t* il_chrom_start_large, uint32_t* il_chrom_max_width_small, uint32_t* il_chrom_max_width_large, uint64_t* il_small, uint64_t* il_large, uint32_t intersect_filter_type, uint32_t overlap_type, double overlap_val, int32_t marker_pos_start, int32_t marker_p [...]
-  int64_t* marker_pos_arr = (int64_t*)wkspace_base;
+  int64_t* marker_pos_arr = (int64_t*)g_bigstack_base;
   FILE* new_mapfile = NULL;
   uintptr_t raw_marker_ct = 0;
   uint32_t distinct_marker_ct = 1;
   uint32_t req_fields = 3;
   uint32_t filter_seglen = min_seglen || (max_seglen < 0xffffffffU);
   uint32_t cnv_del = cnv_calc_type & CNV_DEL;
-  uint32_t filter_score = (min_score > -HUGE_DOUBLE) || (max_score < HUGE_DOUBLE);
+  uint32_t filter_score = (min_score > -DBL_MAX) || (max_score < DBL_MAX);
   uint32_t filter_sites = min_sites || (max_sites < 0xffffffffU);
   uint32_t make_map_long = cnv_calc_type & CNV_MAKE_MAP_LONG;
   uint32_t is_autogen = (!il_chrom_start_small)? 1 : 0;
@@ -480,14 +479,14 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
   int32_t ii;
   double dxx;
   logprintb();
-  if (fopen_checked(&new_mapfile, new_mapname, "w")) {
+  if (fopen_checked(new_mapname, "w", &new_mapfile)) {
     goto cnv_make_map_ret_OPEN_FAIL;
   }
   retval = cnv_first_nonheader_line(cnvfile, &line_idx);
   if (retval) {
     goto cnv_make_map_ret_1;
   }
-  max_marker_ct = wkspace_left / sizeof(int64_t);
+  max_marker_ct = bigstack_left() / sizeof(int64_t);
   // allow SCORE/SITES to be missing if they aren't being filtered on
   if (filter_sites) {
     req_fields = 5;
@@ -497,26 +496,26 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
   line_idx--;
   do {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of .cnv file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .cnv file is pathologically long.\n", line_idx);
       goto cnv_make_map_ret_INVALID_FORMAT_2N;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (!is_eoln_kns(*bufptr)) {
       // FID, IID, CHR, BP1, BP2, TYPE, SCORE, SITES
       bufptr = next_token_mult(bufptr, 2);
       bufptr2 = next_token_mult(bufptr, req_fields);
       if (no_more_tokens_kns(bufptr2)) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of .cnv file has fewer tokens than expected.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .cnv file has fewer tokens than expected.\n", line_idx);
 	goto cnv_make_map_ret_INVALID_FORMAT_2N;
       }
       ii = get_chrom_code(chrom_info_ptr, bufptr);
       if (ii < 0) {
 	if ((!allow_extra_chroms) || (ii == -1)) {
-	  sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of .cnv file.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of .cnv file.\n", line_idx);
           goto cnv_make_map_ret_INVALID_FORMAT_2N;
 	}
-        retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &ii, line_idx, ".cnv file");
+        retval = resolve_or_add_chrom_name(bufptr, ".cnv file", line_idx, chrom_info_ptr, &ii);
 	if (retval) {
 	  goto cnv_make_map_ret_1;
 	}
@@ -529,11 +528,11 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
       bufptr2 = next_token(bufptr);
       bufptr = next_token(bufptr2);
       if (scan_uint_defcap(bufptr2, &seg_start) || scan_uint_defcap(bufptr, &seg_end)) {
-	sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .cnv file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .cnv file.\n", line_idx);
         goto cnv_make_map_ret_INVALID_FORMAT_2N;
       }
       if (seg_end < seg_start) {
-	sprintf(logbuf, "Error: Segment end coordinate smaller than segment start on line %" PRIuPTR " of\n.cnv file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Segment end coordinate smaller than segment start on line %" PRIuPTR " of\n.cnv file.\n", line_idx);
 	goto cnv_make_map_ret_INVALID_FORMAT_2N;
       }
       if ((marker_pos_start > (int32_t)seg_start) || ((marker_pos_end != -1) && (marker_pos_end < (int32_t)seg_end))) {
@@ -550,7 +549,7 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
       if (cnv_calc_type & (CNV_DEL | CNV_DUP)) {
 	bufptr2 = next_token(bufptr);
 	if (scan_uint_defcap(bufptr2, (uint32_t*)&ii)) {
-	  sprintf(logbuf, "Error: Invalid variant copy count on line %" PRIuPTR " of .cnv file.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Invalid variant copy count on line %" PRIuPTR " of .cnv file.\n", line_idx);
 	  goto cnv_make_map_ret_INVALID_FORMAT_2N;
 	}
 	if (cnv_del) {
@@ -564,7 +563,7 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
       if (filter_score) {
 	bufptr2 = next_token_mult(bufptr, 2);
 	if (scan_double(bufptr2, &dxx)) {
-          sprintf(logbuf, "Error: Invalid confidence score on line %" PRIuPTR " of .cnv file.\n", line_idx);
+          sprintf(g_logbuf, "Error: Invalid confidence score on line %" PRIuPTR " of .cnv file.\n", line_idx);
 	  goto cnv_make_map_ret_INVALID_FORMAT_2N;
 	}
 	if ((dxx < min_score) || (dxx > max_score)) {
@@ -574,7 +573,7 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
       if (filter_sites) {
 	bufptr2 = next_token_mult(bufptr, 3);
 	if (scan_posint_defcap(bufptr2, (uint32_t*)&ii)) {
-	  sprintf(logbuf, "Error: Invalid probe count on line %" PRIuPTR " of .cnv file.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Invalid probe count on line %" PRIuPTR " of .cnv file.\n", line_idx);
 	  goto cnv_make_map_ret_INVALID_FORMAT_2N;
 	}
 	if ((((uint32_t)ii) < min_sites) || (((uint32_t)ii) > max_sites)) {
@@ -601,7 +600,7 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
       }
       marker_pos_arr[raw_marker_ct++] = 1 + (int64_t)ullii;
     }
-  } while (fgets(tbuf, MAXLINELEN, cnvfile));
+  } while (fgets(g_textbuf, MAXLINELEN, cnvfile));
   if (!feof(cnvfile)) {
     goto cnv_make_map_ret_READ_FAIL;
   }
@@ -674,7 +673,7 @@ int32_t cnv_make_map(FILE* cnvfile, char* new_mapname, uint32_t cnv_calc_type, u
   }
  cnv_make_map_ret_1:
   fclose_cond(new_mapfile);
-  wkspace_reset(marker_pos_arr);
+  bigstack_reset(marker_pos_arr);
   return retval;
 }
 
@@ -698,13 +697,13 @@ int32_t validate_cnv_map(FILE** mapfile_ptr, char* mapname, int32_t* marker_pos_
   if ((*marker_pos_end_ptr) != -1) {
     marker_pos_end = *marker_pos_end_ptr;
   }
-  if (fopen_checked(mapfile_ptr, mapname, "r")) {
+  if (fopen_checked(mapname, "r", mapfile_ptr)) {
     goto validate_cnv_map_ret_OPEN_FAIL;
   }
   marker_chrom_start[0] = 0;
   do {
     line_idx++;
-    if (!fgets(tbuf, MAXLINELEN, *mapfile_ptr)) {
+    if (!fgets(g_textbuf, MAXLINELEN, *mapfile_ptr)) {
       if (feof(*mapfile_ptr)) {
 	logerrprint("Error: Empty .cnv.map file.\n");
 	goto validate_cnv_map_ret_INVALID_FORMAT;
@@ -712,10 +711,10 @@ int32_t validate_cnv_map(FILE** mapfile_ptr, char* mapname, int32_t* marker_pos_
 	goto validate_cnv_map_ret_READ_FAIL;
       }
     }
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto validate_cnv_map_ret_LONG_LINE;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
   } while (is_eoln_kns(*bufptr));
   bufptr2 = next_token_mult(bufptr, 2);
   if (is_eoln_kns(*bufptr2)) {
@@ -731,20 +730,20 @@ int32_t validate_cnv_map(FILE** mapfile_ptr, char* mapname, int32_t* marker_pos_
   line_idx--;
   do {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto validate_cnv_map_ret_LONG_LINE;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
     ii = get_chrom_code(chrom_info_ptr, bufptr);
     if (ii < 0) {
       if ((!allow_extra_chroms) || (ii == -1)) {
-	sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of .cnv.map file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of .cnv.map file.\n", line_idx);
 	goto validate_cnv_map_ret_INVALID_FORMAT_2;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &ii, line_idx, ".cnv.map file");
+      retval = resolve_or_add_chrom_name(bufptr, ".cnv.map file", line_idx, chrom_info_ptr, &ii);
       if (retval) {
 	goto validate_cnv_map_ret_1;
       }
@@ -767,7 +766,7 @@ int32_t validate_cnv_map(FILE** mapfile_ptr, char* mapname, int32_t* marker_pos_
       continue;
     }
     if (scan_uint_defcap(bufptr2, (uint32_t*)&ii)) {
-      sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .cnv.map file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .cnv.map file.\n", line_idx);
       goto validate_cnv_map_ret_INVALID_FORMAT_2;
     }
     if (ii <= last_pos) {
@@ -789,7 +788,7 @@ int32_t validate_cnv_map(FILE** mapfile_ptr, char* mapname, int32_t* marker_pos_
       logerrprint("Error: Too many entries in .cnv.map file (max 2147483647).\n");
       goto validate_cnv_map_ret_INVALID_FORMAT;
     }
-  } while (fgets(tbuf, MAXLINELEN, *mapfile_ptr));
+  } while (fgets(g_textbuf, MAXLINELEN, *mapfile_ptr));
   if (!feof(*mapfile_ptr)) {
     goto validate_cnv_map_ret_READ_FAIL;
   }
@@ -814,7 +813,7 @@ int32_t validate_cnv_map(FILE** mapfile_ptr, char* mapname, int32_t* marker_pos_
     retval = RET_INVALID_FORMAT;
     break;
   validate_cnv_map_ret_LONG_LINE:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .cnv.map is pathologically long.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .cnv.map is pathologically long.\n", line_idx);
   validate_cnv_map_ret_INVALID_FORMAT_2:
     logerrprintb();
   validate_cnv_map_ret_INVALID_FORMAT:
@@ -839,10 +838,10 @@ int32_t load_cnv_map(FILE* mapfile, int32_t marker_pos_start, int32_t marker_pos
     marker_pos_end = 0x7fffffff;
   }
   do {
-    if (!fgets(tbuf, MAXLINELEN, mapfile)) {
+    if (!fgets(g_textbuf, MAXLINELEN, mapfile)) {
       goto load_cnv_map_ret_READ_FAIL;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
   } while (is_eoln_kns(*bufptr));
   bufptr = next_token_mult(bufptr, 3);
   if (is_eoln_kns(*bufptr)) {
@@ -851,7 +850,7 @@ int32_t load_cnv_map(FILE* mapfile, int32_t marker_pos_start, int32_t marker_pos
     colskip = 2;
   }
   do {
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -873,7 +872,7 @@ int32_t load_cnv_map(FILE* mapfile, int32_t marker_pos_start, int32_t marker_pos
     memcpy(marker_ids, bufptr, cur_marker_id_len + 1);
     marker_ids = &(marker_ids[max_marker_id_len]);
     *marker_pos++ = (uint32_t)cur_pos;
-  } while (fgets(tbuf, MAXLINELEN, mapfile));
+  } while (fgets(g_textbuf, MAXLINELEN, mapfile));
   if (!feof(mapfile)) {
     goto load_cnv_map_ret_READ_FAIL;
   }
@@ -886,7 +885,8 @@ int32_t load_cnv_map(FILE* mapfile, int32_t marker_pos_start, int32_t marker_pos
 }
 
 int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname, char* famname, char* phenoname, char* keepname, char* removename, char* filtername, uint64_t misc_flags, Two_col_params* update_chr, Two_col_params* update_cm, Two_col_params* update_map, Two_col_params* update_name, char* update_ids_fname, char* update_parents_fname, char* update_sex_fname, char* filtervals_flattened, uint64_t filter_flags, uint32_t cnv_calc_type, uint32_t min_seglen, uint32_t max_seglen,  [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* cnvfile = NULL;
   FILE* famfile = NULL;
   FILE* mapfile = NULL;
@@ -894,14 +894,13 @@ int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname
   char* subset_list = NULL;
   uintptr_t subset_ct = 0;
   uintptr_t max_subset_name_len = 0;
-  uintptr_t topsize = 0;
   uint32_t allow_extra_chroms = (misc_flags / MISC_ALLOW_EXTRA_CHROMS) & 1;
   uint64_t* il_small = NULL; // high-order 32 bits = 2x center pos,
                              // low-order 32 bits = interval end - start
   uint64_t* il_large = NULL;
   uintptr_t* il_chrom_start_small;
   uintptr_t* il_chrom_start_large;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uint32_t* il_chrom_max_width_small;
   uint32_t* il_chrom_max_width_large;
   uint32_t* marker_chrom_start;
@@ -912,23 +911,23 @@ int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname
   char* sptr;
   uintptr_t ulii;
   uint32_t uii;
-  if (wkspace_alloc_ul_checked(&il_chrom_start_small, (MAX_POSSIBLE_CHROM + 1) * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&il_chrom_start_large, (MAX_POSSIBLE_CHROM + 1) * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&il_chrom_max_width_small, MAX_POSSIBLE_CHROM * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&il_chrom_max_width_large, MAX_POSSIBLE_CHROM * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&marker_chrom_start, (MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t))) {
+  if (bigstack_alloc_ul(MAX_POSSIBLE_CHROM + 1, &il_chrom_start_small) ||
+      bigstack_alloc_ul(MAX_POSSIBLE_CHROM + 1, &il_chrom_start_large) ||
+      bigstack_alloc_ui(MAX_POSSIBLE_CHROM, &il_chrom_max_width_small) ||
+      bigstack_alloc_ui(MAX_POSSIBLE_CHROM, &il_chrom_max_width_large) ||
+      bigstack_alloc_ui(MAX_POSSIBLE_CHROM + 1, &marker_chrom_start)) {
     goto plink_cnv_ret_NOMEM;
   }
-  wkspace_mark2 = wkspace_base;
-  if (fopen_checked(&cnvfile, cnvname, "r")) {
+  bigstack_mark2 = g_bigstack_base;
+  if (fopen_checked(cnvname, "r", &cnvfile)) {
     goto plink_cnv_ret_OPEN_FAIL;
   }
   if (cnv_calc_type & (~CNV_MAKE_MAP)) {
-    if (fopen_checked(&famfile, famname, "r")) {
+    if (fopen_checked(famname, "r", &famfile)) {
       goto plink_cnv_ret_OPEN_FAIL;
     }
   }
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
   if (intersect_filter_type) {
     if (subset_fname) {
       retval = cnv_subset_load(subset_fname, &subset_list, &subset_ct, &max_subset_name_len);
@@ -936,15 +935,15 @@ int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname
 	goto plink_cnv_ret_1;
       }
     }
-    retval = cnv_intersect_load(intersect_filter_type, intersect_filter_fname, subset_list, subset_ct, max_subset_name_len, il_chrom_start_small, il_chrom_start_large, il_chrom_max_width_small, il_chrom_max_width_large, &il_small, &il_large, marker_pos_start, marker_pos_end, allow_extra_chroms, chrom_info_ptr, &topsize);
+    retval = cnv_intersect_load(intersect_filter_type, intersect_filter_fname, subset_list, subset_ct, max_subset_name_len, il_chrom_start_small, il_chrom_start_large, il_chrom_max_width_small, il_chrom_max_width_large, &il_small, &il_large, marker_pos_start, marker_pos_end, allow_extra_chroms, chrom_info_ptr);
     if (retval) {
       goto plink_cnv_ret_1;
     }
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
   }
   if (!(cnv_calc_type & CNV_MAKE_MAP)) {
     sptr = (char*)memchr(mapname, 0, FNAMESIZE);
-    if ((mapname[0] == '\0') || (!filename_exists(mapname, sptr, ""))) {
+    if ((mapname[0] == '\0') || (!filename_exists("", mapname, sptr))) {
       if (mapname[0] == '\0') {
         uii = strlen(cnvname);
         if ((uii < 5) || (cnvname[uii - 4] != '.') || (!match_upper_nt(&(cnvname[uii - 3]), "CNV", 3))) {
@@ -954,14 +953,14 @@ int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname
 	memcpy(mapname, cnvname, uii);
 	memcpy(&(mapname[uii]), ".map", 5);
 	sptr = &(mapname[uii + 4]);
-	if (filename_exists(mapname, sptr, "")) {
+	if (filename_exists("", mapname, sptr)) {
 	  LOGERRPRINTFWW("Error: No .cnv.map filename specified, and natural autogeneration target\n(%s) already exists.\n", mapname);
 	  goto plink_cnv_ret_INVALID_CMDLINE;
 	}
       }
-      sprintf(logbuf, "Autogenerating missing %s ... ", mapname);
-      wordwrap(logbuf, 5);
-      retval = cnv_make_map(cnvfile, mapname, 0, 0, 0xffffffffU, -HUGE_DOUBLE, HUGE_DOUBLE, 0, 0xffffffffU, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0.0, -1, -1, allow_extra_chroms, 0, chrom_info_ptr, &max_marker_id_len, marker_chrom_start);
+      sprintf(g_logbuf, "Autogenerating missing %s ... ", mapname);
+      wordwrapb(5);
+      retval = cnv_make_map(cnvfile, mapname, 0, 0, 0xffffffffU, -DBL_MAX, DBL_MAX, 0, 0xffffffffU, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0.0, -1, -1, allow_extra_chroms, 0, chrom_info_ptr, &max_marker_id_len, marker_chrom_start);
     } else {
       retval = validate_cnv_map(&mapfile, mapname, &marker_pos_start, &marker_pos_end, allow_extra_chroms, chrom_info_ptr, &max_marker_id_len, marker_chrom_start);
     }
@@ -970,15 +969,15 @@ int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname
     }
   } else {
     memcpy(outname_end, ".cnv.map", 9);
-    sprintf(logbuf, "Generating %s ... ", outname);
-    wordwrap(logbuf, 5);
+    sprintf(g_logbuf, "Generating %s ... ", outname);
+    wordwrapb(5);
     retval = cnv_make_map(cnvfile, outname, cnv_calc_type, min_seglen, max_seglen, min_score, max_score, min_sites, max_sites, il_chrom_start_small, il_chrom_start_large, il_chrom_max_width_small, il_chrom_max_width_large, il_small, il_large, intersect_filter_type, overlap_type, overlap_val, marker_pos_start, marker_pos_end, allow_extra_chroms, 0, chrom_info_ptr, &max_marker_id_len, marker_chrom_start);
     if (retval || (!(cnv_calc_type & (CNV_MAKE_MAP | CNV_DEL | CNV_DUP)))) {
       goto plink_cnv_ret_1;
     }
   }
   if (!mapfile) {
-    if (fopen_checked(&mapfile, mapname, "r")) {
+    if (fopen_checked(mapname, "r", &mapfile)) {
       goto plink_cnv_ret_OPEN_FAIL;
     }
   }
@@ -988,8 +987,8 @@ int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname
     goto plink_cnv_ret_NOMEM;
   }
 #endif
-  if (wkspace_alloc_ui_checked(&marker_pos, ulii * sizeof(int32_t)) ||
-      wkspace_alloc_c_checked(&marker_ids, ulii * max_marker_id_len)) {
+  if (bigstack_alloc_ui(ulii, &marker_pos) ||
+      bigstack_alloc_c(ulii * max_marker_id_len, &marker_ids)) {
     goto plink_cnv_ret_NOMEM;
   }
   retval = load_cnv_map(mapfile, marker_pos_start, marker_pos_end, chrom_info_ptr, max_marker_id_len, marker_pos, marker_ids);
@@ -1009,14 +1008,11 @@ int32_t plink_cnv(char* outname, char* outname_end, char* cnvname, char* mapname
     break;
   }
  plink_cnv_ret_1:
-  if (topsize) {
-    wkspace_left += topsize;
-  }
   fclose_cond(cnvfile);
   fclose_cond(famfile);
   fclose_cond(mapfile);
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return 0;
 }
 #endif // HIGH_MAX_CHROM
diff --git a/plink_common.c b/plink_common.c
index 9bbc9f9..ad9a165 100644
--- a/plink_common.c
+++ b/plink_common.c
@@ -1,13 +1,13 @@
 #include "plink_common.h"
 
-#include "pigz.h"
+// #include "pigz.h"
 
 // no leading \n since this is used in LOGPRINTFWW expressions
-const char errstr_fopen[] = "Error: Failed to open %s.\n";
+const char g_errstr_fopen[] = "Error: Failed to open %s.\n";
 
-const char cmdline_format_str[] = "\n  " PROG_NAME_STR " [input flag(s)...] {command flag(s)...} {other flag(s)...}\n  " PROG_NAME_STR " --help {flag name(s)...}\n\n";
+const char g_cmdline_format_str[] = "\n  " PROG_NAME_STR " [input flag(s)...] {command flag(s)...} {other flag(s)...}\n  " PROG_NAME_STR " --help {flag name(s)...}\n\n";
 
-char tbuf[TBUF_SIZE];
+char g_textbuf[TEXTBUF_SIZE];
 
 // note that \xxx character constants are interpreted in octal.
 // technically no need to represent 0-31, but 64 extra bytes of data is
@@ -16,29 +16,26 @@ const char g_one_char_strs[] = "\0\0\1\0\2\0\3\0\4\0\5\0\6\0\7\0\10\0\11\0\12\0\
 const char* g_missing_geno_ptr = &(g_one_char_strs[96]);
 const char* g_output_missing_geno_ptr = &(g_one_char_strs[96]);
 
-sfmt_t sfmt;
+sfmt_t g_sfmt;
 
-FILE* logfile = NULL;
+FILE* g_logfile = NULL;
 
-// mostly-safe sprintf buffer.  warning: do NOT put allele codes or
-// arbitrary-length lists in here.
-char logbuf[MAXLINELEN * 2];
+char g_logbuf[MAXLINELEN * 2];
 
 uint32_t g_debug_on = 0;
 uint32_t g_log_failed = 0;
-uintptr_t g_sample_ct;
 uint32_t g_thread_ct;
 
-uint32_t aligned_malloc(uintptr_t** aligned_pp, uintptr_t size) {
-#ifdef __LP64__
+uint32_t aligned_malloc(uintptr_t size, uintptr_t** aligned_pp) {
+#if defined __LP64__ && !defined __APPLE__
   // Avoid random segfaults on 64-bit machines which have 8-byte- instead of
   // 16-byte-aligned malloc().  (Slightly different code is needed if malloc()
   // does not even guarantee 8-byte alignment.)
-  uintptr_t* malloc_ptr = (uintptr_t*)malloc(size + 16);
+  uintptr_t* malloc_ptr = (uintptr_t*)malloc(size + VEC_BYTES);
   if (!malloc_ptr) {
     return 1;
   }
-  *aligned_pp = (uintptr_t*)((((uintptr_t)malloc_ptr) + 16) & (~(15 * ONELU)));
+  *aligned_pp = (uintptr_t*)((((uintptr_t)malloc_ptr) + VEC_BYTES) & (~(VEC_BYTES_M1 * ONELU)));
   (*aligned_pp)[-1] = (uintptr_t)malloc_ptr;
 #else
   // no SSE2 concerns here
@@ -51,29 +48,29 @@ uint32_t aligned_malloc(uintptr_t** aligned_pp, uintptr_t size) {
 }
 
 void aligned_free(uintptr_t* aligned_pp) {
-#ifdef __LP64__
+#if defined __LP64__ && !defined __APPLE__
   free((uintptr_t*)(aligned_pp[-1]));
 #else
   free(aligned_pp);
 #endif
 }
 
-uint32_t push_ll_str(Ll_str** ll_stack_ptr, const char* ss) {
-  uint32_t slen = strlen(ss);
-  Ll_str* new_ll_str = (Ll_str*)malloc(sizeof(Ll_str) + slen + 1);
+uint32_t push_ll_str(const char* ss, Ll_str** ll_stack_ptr) {
+  uintptr_t str_bytes = strlen(ss) + 1;
+  Ll_str* new_ll_str = (Ll_str*)malloc(sizeof(Ll_str) + str_bytes);
   if (!new_ll_str) {
     return 1;
   }
   new_ll_str->next = *ll_stack_ptr;
-  memcpy(new_ll_str->ss, ss, slen + 1);
+  memcpy(new_ll_str->ss, ss, str_bytes);
   *ll_stack_ptr = new_ll_str;
   return 0;
 }
 
 void logstr(const char* ss) {
   if (!g_debug_on) {
-    fputs(ss, logfile);
-    if (ferror(logfile)) {
+    fputs(ss, g_logfile);
+    if (ferror(g_logfile)) {
       putchar('\n');
       fflush(stdout);
       fprintf(stderr, "Warning: Logging failure on:\n%s\nFurther logging will not be attempted in this run.\n", ss);
@@ -84,14 +81,14 @@ void logstr(const char* ss) {
       fflush(stdout);
       fputs(ss, stderr);
     } else {
-      fputs(ss, logfile);
-      if (ferror(logfile)) {
+      fputs(ss, g_logfile);
+      if (ferror(g_logfile)) {
 	putchar('\n');
 	fflush(stdout);
         fprintf(stderr, "Error: Debug logging failure.  Dumping to stderr:\n%s", ss);
 	g_log_failed = 1;
       } else {
-	fflush(logfile);
+	fflush(g_logfile);
       }
     }
   }
@@ -109,19 +106,17 @@ void logerrprint(const char* ss) {
 }
 
 void logprintb() {
-  logstr(logbuf);
-  fputs(logbuf, stdout);
+  logstr(g_logbuf);
+  fputs(g_logbuf, stdout);
 }
 
 void logerrprintb() {
-  logstr(logbuf);
+  logstr(g_logbuf);
   fflush(stdout);
-  fputs(logbuf, stderr);
+  fputs(g_logbuf, stderr);
 }
 
-void wordwrap(char* ss, uint32_t suffix_len) {
-  // This should have been written eons ago.
-
+void wordwrap(uint32_t suffix_len, char* ss) {
   // Input: A null-terminated string with no intermediate newlines.  If
   //        suffix_len is zero, there should be a terminating \n; otherwise,
   //        the last character should be a space.
@@ -150,8 +145,8 @@ void wordwrap(char* ss, uint32_t suffix_len) {
       if (!suffix_len) {
 	if (token_end <= &(line_end[1])) {
 	  // okay if end-of-string is one past the end, because function
-	  // assumes last character is \n in suffix_len == 0 case (might want
-	  // to add a debug option to enforce that)
+	  // assumes last character is \n in suffix_len == 0 case
+	  assert(token_end[-1] == '\n');
 	  return;
 	}
       } else {
@@ -160,6 +155,7 @@ void wordwrap(char* ss, uint32_t suffix_len) {
 	}
 	// because of terminal space assumption, token_start actually points
 	// to the end of the string
+	assert(token_start[-1] == ' ');
       }
       token_start[-1] = '\n';
       return;
@@ -185,66 +181,76 @@ void wordwrap(char* ss, uint32_t suffix_len) {
   }
 }
 
-int32_t fopen_checked(FILE** target_ptr, const char* fname, const char* mode) {
+void wordwrapb(uint32_t suffix_len) {
+  wordwrap(suffix_len, g_logbuf);
+}
+
+int32_t fopen_checked(const char* fname, const char* mode, FILE** target_ptr) {
   *target_ptr = fopen(fname, mode);
   if (!(*target_ptr)) {
-    LOGPRINTFWW(errstr_fopen, fname);
+    LOGPRINTFWW(g_errstr_fopen, fname);
     return -1;
   }
   return 0;
 }
 
 int32_t fwrite_checked(const void* buf, size_t len, FILE* outfile) {
-  while (len > 0x7ffe0000) {
-    // OS X can't perform >2GB writes
-    fwrite(buf, 1, 0x7ffe0000, outfile);
-    buf = &(((unsigned char*)buf)[0x7ffe0000]);
-    len -= 0x7ffe0000;
+  while (len > 0x7ffff000) {
+    // OS X can't perform 2GB+ writes
+    // typical disk block size is 4kb, so 0x7ffff000 is the largest sensible
+    // write size
+    fwrite(buf, 1, 0x7ffff000, outfile);
+    buf = &(((unsigned char*)buf)[0x7ffff000]);
+    len -= 0x7ffff000;
   }
   fwrite(buf, 1, len, outfile);
   return ferror(outfile);
 }
 
-int32_t gzopen_checked(gzFile* target_ptr, const char* fname, const char* mode) {
-  *target_ptr = gzopen(fname, mode);
-  if (!(*target_ptr)) {
-    LOGPRINTFWW(errstr_fopen, fname);
-    return -1;
+int32_t gzopen_read_checked(const char* fname, gzFile* gzf_ptr) {
+  *gzf_ptr = gzopen(fname, FOPEN_RB);
+  if (!(*gzf_ptr)) {
+    LOGPRINTFWW(g_errstr_fopen, fname);
+    return RET_OPEN_FAIL;
+  }
+  if (gzbuffer(*gzf_ptr, 131072)) {
+    return RET_NOMEM;
   }
   return 0;
 }
 
 // manually managed, very large stack
-unsigned char* wkspace_base;
-uintptr_t wkspace_left;
+unsigned char* g_bigstack_base;
+unsigned char* g_bigstack_end;
 
-unsigned char* wkspace_alloc(uintptr_t size) {
-  unsigned char* retval;
-  if (wkspace_left < size) {
+unsigned char* bigstack_alloc(uintptr_t size) {
+  unsigned char* alloc_ptr;
+  size = round_up_pow2(size, CACHELINE);
+  if (bigstack_left() < size) {
     return NULL;
   }
-  size = CACHEALIGN(size);
-  retval = wkspace_base;
-  wkspace_base += size;
-  wkspace_left -= size;
-  return retval;
+  alloc_ptr = g_bigstack_base;
+  g_bigstack_base += size;
+  return alloc_ptr;
 }
 
-void wkspace_reset(void* new_base) {
-  uintptr_t freed_bytes = wkspace_base - (unsigned char*)new_base;
-  wkspace_base = (unsigned char*)new_base;
-  wkspace_left += freed_bytes;
+void bigstack_shrink_top(const void* rebase, uintptr_t new_size) {
+  uintptr_t freed_bytes = ((uintptr_t)(g_bigstack_base - ((unsigned char*)rebase))) - round_up_pow2(new_size, CACHELINE);
+  g_bigstack_base -= freed_bytes;
 }
 
-void wkspace_shrink_top(void* rebase, uintptr_t new_size) {
-  uintptr_t freed_bytes = ((uintptr_t)(wkspace_base - ((unsigned char*)rebase))) - CACHEALIGN(new_size);
-  wkspace_base -= freed_bytes;
-  wkspace_left += freed_bytes;
+unsigned char* bigstack_end_alloc_presized(uintptr_t size) {
+  assert(!(size & END_ALLOC_CHUNK_M1));
+  uintptr_t cur_bigstack_left = bigstack_left();
+  if (size > cur_bigstack_left) {
+    return NULL;
+  } else {
+    g_bigstack_end -= size;
+    return g_bigstack_end;
+  }  
 }
 
-uint32_t match_upper(char* ss, const char* fixed_str) {
-  // Returns whether uppercased ss matches nonempty fixed_str.  Assumes
-  // fixed_str contains nothing but letters and a null terminator.
+uint32_t match_upper(const char* ss, const char* fixed_str) {
   char cc = *fixed_str++;
   do {
     if ((((unsigned char)(*ss++)) & 0xdf) != ((unsigned char)cc)) {
@@ -255,7 +261,7 @@ uint32_t match_upper(char* ss, const char* fixed_str) {
   return !(*ss);
 }
 
-uint32_t match_upper_nt(char* ss, const char* fixed_str, uint32_t ct) {
+uint32_t match_upper_nt(const char* ss, const char* fixed_str, uint32_t ct) {
   do {
     if ((((unsigned char)(*ss++)) & 0xdf) != ((unsigned char)(*fixed_str++))) {
       return 0;
@@ -264,13 +270,7 @@ uint32_t match_upper_nt(char* ss, const char* fixed_str, uint32_t ct) {
   return 1;
 }
 
-uint32_t scan_posint_capped(char* ss, uint32_t* valp, uint32_t cap_div_10, uint32_t cap_mod_10) {
-  // Reads an integer in [1, cap].  Assumes first character is nonspace.  Has
-  // the overflow detection atoi() lacks.
-  // A funny-looking div_10/mod_10 interface is used since the cap will usually
-  // be a constant, and we want the integer division/modulus to occur at
-  // compile time.
-
+uint32_t scan_posint_capped(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
   // '0' has ascii code 48
   uint32_t val = (uint32_t)((unsigned char)*ss) - 48;
   uint32_t cur_digit;
@@ -301,7 +301,7 @@ uint32_t scan_posint_capped(char* ss, uint32_t* valp, uint32_t cap_div_10, uint3
   return 1;
 }
 
-uint32_t scan_uint_capped(char* ss, uint32_t* valp, uint32_t cap_div_10, uint32_t cap_mod_10) {
+uint32_t scan_uint_capped(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
   // Reads an integer in [0, cap].  Assumes first character is nonspace. 
   uint32_t val = (uint32_t)((unsigned char)*ss) - 48;
   uint32_t cur_digit;
@@ -339,7 +339,7 @@ uint32_t scan_uint_capped(char* ss, uint32_t* valp, uint32_t cap_div_10, uint32_
   return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
 }
 
-uint32_t scan_int_abs_bounded(char* ss, int32_t* valp, uint32_t bound_div_10, uint32_t bound_mod_10) {
+uint32_t scan_int_abs_bounded(const char* ss, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
   // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
   uint32_t val = (uint32_t)((unsigned char)*ss) - 48;
   int32_t sign = 1;
@@ -370,7 +370,7 @@ uint32_t scan_int_abs_bounded(char* ss, int32_t* valp, uint32_t bound_div_10, ui
   return 1;
 }
 
-uint32_t scan_posintptr(char* ss, uintptr_t* valp) {
+uint32_t scan_posintptr(const char* ss, uintptr_t* valp) {
   // Reads an integer in [1, 2^BITCT - 1].  Assumes first character is
   // nonspace. 
   uintptr_t val = (uint32_t)((unsigned char)*ss) - 48;
@@ -438,7 +438,7 @@ uint32_t scan_uintptr(char* ss, uintptr_t* valp) {
 }
 */
 
-uint32_t scan_two_doubles(char* ss, double* val1p, double* val2p) {
+uint32_t scan_two_doubles(char* ss, double* __restrict val1p, double* __restrict val2p) {
   char* ss2;
   *val1p = strtod(ss, &ss2);
   if (ss == ss2) {
@@ -449,7 +449,7 @@ uint32_t scan_two_doubles(char* ss, double* val1p, double* val2p) {
   return (ss == ss2)? 1 : 0;
 }
 
-int32_t scan_token_ct_len(FILE* infile, char* buf, uintptr_t half_bufsize, uintptr_t* token_ct_ptr, uintptr_t* max_token_len_ptr) {
+int32_t scan_token_ct_len(uintptr_t half_bufsize, FILE* infile, char* buf, uintptr_t* __restrict token_ct_ptr, uintptr_t* __restrict max_token_len_ptr) {
   // buf must be of size >= (2 * half_bufsize + 2)
   // max_token_len includes trailing null
   uintptr_t full_bufsize = half_bufsize * 2;
@@ -521,7 +521,7 @@ int32_t scan_token_ct_len(FILE* infile, char* buf, uintptr_t half_bufsize, uintp
   return 0;
 }
 
-int32_t read_tokens(FILE* infile, char* buf, uintptr_t half_bufsize, uintptr_t token_ct, uintptr_t max_token_len, char* token_name_buf) {
+int32_t read_tokens(uintptr_t half_bufsize, uintptr_t token_ct, uintptr_t max_token_len, FILE* infile, char* __restrict buf, char* __restrict token_name_buf) {
   // buf must be of size >= (2 * half_bufsize + 2).
   // max_token_len includes trailing null
   uintptr_t full_bufsize = half_bufsize * 2;
@@ -606,17 +606,17 @@ int32_t gzputs_w4(gzFile gz_outfile, const char* ss) {
 int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr) {
   char* lptr;
   do {
-    if (!fgets(tbuf, MAXLINELEN, fptr)) {
+    if (!fgets(g_textbuf, MAXLINELEN, fptr)) {
       return -1;
     }
     *line_idx_ptr += 1;
-    lptr = skip_initial_spaces(tbuf);
-  } while (is_eoln_or_comment(*lptr));
+    lptr = skip_initial_spaces(g_textbuf);
+  } while (is_eoln_or_comment_kns(*lptr));
   *lptr_ptr = lptr;
   return 0;
 }
 
-int32_t get_next_noncomment_excl(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr, uintptr_t* marker_exclude, uintptr_t* marker_uidx_ptr) {
+int32_t get_next_noncomment_excl(const uintptr_t* __restrict marker_exclude, FILE* fptr, char** lptr_ptr, uintptr_t* __restrict line_idx_ptr, uintptr_t* __restrict marker_uidx_ptr) {
   while (!get_next_noncomment(fptr, lptr_ptr, line_idx_ptr)) {
     if (!is_set_ul(marker_exclude, *marker_uidx_ptr)) {
       return 0;
@@ -626,44 +626,15 @@ int32_t get_next_noncomment_excl(FILE* fptr, char** lptr_ptr, uintptr_t* line_id
   return -1;
 }
 
-char* token_end(char* sptr) {
-  char cc;
-  if (!sptr) {
-    return NULL;
-  }
-  cc = *sptr;
-  while (!is_space_or_eoln(cc)) {
-    cc = *(++sptr);
-  }
-  return cc? sptr : NULL;
-}
-
-char* token_endl(char* sptr) {
-  if (!sptr) {
-    return NULL;
-  }
-  while (!is_space_or_eoln(*sptr)) {
-    sptr++;
-  }
-  return sptr;
-}
-
-void get_top_two(uint32_t* uint_arr, uintptr_t uia_size, uintptr_t* top_idx_ptr, uintptr_t* second_idx_ptr) {
-  uintptr_t cur_idx = 2;
-  uintptr_t top_idx;
-  uint32_t top_val;
-  uintptr_t second_idx;
-  uint32_t second_val;
+void get_top_two_ui(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr) {
+  assert(uia_size > 1);
+  uintptr_t top_idx = (uint_arr[1] > uint_arr[0])? 1 : 0;
+  uintptr_t second_idx = 1 ^ top_idx;
+  uint32_t top_val = uint_arr[top_idx];
+  uint32_t second_val = uint_arr[second_idx];
+  uintptr_t cur_idx;
   uintptr_t cur_val;
-  if (uint_arr[1] > uint_arr[0]) {
-    top_idx = 1;
-  } else {
-    top_idx = 0;
-  }
-  second_idx = 1 ^ top_idx;
-  top_val = uint_arr[top_idx];
-  second_val = uint_arr[second_idx];
-  do {
+  for (cur_idx = 2; cur_idx < uia_size; ++cur_idx) {
     cur_val = uint_arr[cur_idx];
     if (cur_val > second_val) {
       if (cur_val > top_val) {
@@ -676,55 +647,65 @@ void get_top_two(uint32_t* uint_arr, uintptr_t uia_size, uintptr_t* top_idx_ptr,
 	second_idx = cur_idx;
       }
     }
-  } while (++cur_idx < uia_size);
+  }
   *top_idx_ptr = top_idx;
   *second_idx_ptr = second_idx;
 }
 
-int32_t intlen(int32_t num) {
-  int32_t retval;
+uint32_t intlen(int32_t num) {
+  int32_t retval = 1;
+  uint32_t absnum;
   if (num < 0) {
-    num = -num;
-    retval = 2;
+    absnum = -((uint32_t)num);  // negate as unsigned; -num is UB for INT32_MIN
+    retval++;
   } else {
-    retval = 1;
+    absnum = num;
+  }
+  while (absnum > 99) {
+    // division by a constant is faster for unsigned ints
+    absnum /= 100;
+    retval += 2;
   }
-  while (num > 9) {
-    num /= 10;
+  if (absnum > 9) {
     retval++;
   }
   return retval;
 }
 
-int32_t strcmp_se(char* s_read, const char* s_const, uint32_t len) {
-  return memcmp(s_read, s_const, len) || (!is_space_or_eoln(s_read[len]));
+int32_t strcmp_se(const char* s_read, const char* s_const, uint32_t s_const_len) {
+  return memcmp(s_read, s_const, s_const_len) || (!is_space_or_eoln(s_read[s_const_len]));
 }
 
 char* next_token(char* sptr) {
   if (!sptr) {
     return NULL;
   }
-  while ((*sptr != ' ') && (*sptr != '\t')) {
-    if (!(*sptr)) {
-      return NULL;
-    }
-    sptr++;
+  unsigned char ucc = *sptr;
+  while (ucc > 32) {
+    ucc = *(++sptr);
   }
-  return skip_initial_spaces(sptr);
+  while ((ucc == ' ') || (ucc == '\t')) {
+    ucc = *(++sptr);
+  }
+  return (ucc > 32)? sptr : NULL;
 }
 
 char* next_token_mult(char* sptr, uint32_t ct) {
+  assert(ct);
   if (!sptr) {
     return NULL;
   }
+  unsigned char ucc = *sptr;
   do {
-    while ((*sptr != ' ') && (*sptr != '\t')) {
-      if (!(*sptr)) {
-	return NULL;
-      }
-      sptr++;
+    while (ucc > 32) {
+      ucc = *(++sptr);
+    }
+    while ((ucc == ' ') || (ucc == '\t')) {
+      ucc = *(++sptr);
+    }
+    if (ucc <= 32) {
+      return NULL;
     }
-    sptr = skip_initial_spaces(sptr);
   } while (--ct);
   return sptr;
 }
@@ -764,391 +745,428 @@ uint32_t count_and_measure_multistr(const char* multistr, uintptr_t* max_slen_pt
 
 // number-to-string encoders
 
-static const char digit2_table[] = {
-  "0001020304050607080910111213141516171819"
-  "2021222324252627282930313233343536373839"
-  "4041424344454647484950515253545556575859"
-  "6061626364656667686970717273747576777879"
-  "8081828384858687888990919293949596979899"};
-
-char* uint32_write(char* start, uint32_t uii) {
+static const char digit2_table[200] = {
+  '0', '0', '0', '1', '0', '2', '0', '3', '0', '4',
+  '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
+  '1', '0', '1', '1', '1', '2', '1', '3', '1', '4',
+  '1', '5', '1', '6', '1', '7', '1', '8', '1', '9',
+  '2', '0', '2', '1', '2', '2', '2', '3', '2', '4',
+  '2', '5', '2', '6', '2', '7', '2', '8', '2', '9',
+  '3', '0', '3', '1', '3', '2', '3', '3', '3', '4',
+  '3', '5', '3', '6', '3', '7', '3', '8', '3', '9',
+  '4', '0', '4', '1', '4', '2', '4', '3', '4', '4',
+  '4', '5', '4', '6', '4', '7', '4', '8', '4', '9',
+  '5', '0', '5', '1', '5', '2', '5', '3', '5', '4',
+  '5', '5', '5', '6', '5', '7', '5', '8', '5', '9',
+  '6', '0', '6', '1', '6', '2', '6', '3', '6', '4',
+  '6', '5', '6', '6', '6', '7', '6', '8', '6', '9',
+  '7', '0', '7', '1', '7', '2', '7', '3', '7', '4',
+  '7', '5', '7', '6', '7', '7', '7', '8', '7', '9',
+  '8', '0', '8', '1', '8', '2', '8', '3', '8', '4',
+  '8', '5', '8', '6', '8', '7', '8', '8', '8', '9',
+  '9', '0', '9', '1', '9', '2', '9', '3', '9', '4',
+  '9', '5', '9', '6', '9', '7', '9', '8', '9', '9'};
+
+char* uint32toa(uint32_t uii, char* start) {
   // Memory-efficient fast integer writer.  (You can do a bit better sometimes
   // by using a larger lookup table, but on average I doubt that pays off.)
-  //
-  // Originally the arguments were in the other order (was trying to follow
-  // Google's "inputs first, than outputs" coding style guidelines), but then I
-  // realized that chained invocation of this function is much easier to read
-  // if I make the target buffer the first argument.
+  // Returns a pointer to the end of the integer (not null-terminated).
   uint32_t quotient;
   if (uii < 1000) {
     if (uii < 10) {
-      *start = '0' + uii;
-      return &(start[1]);
-    } else if (uii >= 100) {
-      quotient = uii / 100;
-      *start++ = '0' + quotient;
-      uii -= quotient * 100;
+      *start++ = '0' + uii;
+      return start;
     }
-    return memcpya(start, &(digit2_table[uii * 2]), 2);
-  } else if (uii < 10000000) {
-    if (uii >= 100000) {
-      if (uii < 1000000) {
-	goto uint32_write_6;
-      }
-      quotient = uii / 1000000;
-      *start++ = '0' + quotient;
-      goto uint32_write_6b;
-    } else if (uii < 10000) {
-      goto uint32_write_4;
+    if (uii < 100) {
+      goto uint32toa_2;
     }
-    quotient = uii / 10000;
+    quotient = uii / 100;
     *start++ = '0' + quotient;
   } else {
-    if (uii >= 100000000) {
-      quotient = uii / 100000000;
-      if (uii >= 1000000000) {
-	start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-      } else {
+    if (uii < 10000000) {
+      if (uii >= 100000) {
+	if (uii < 1000000) {
+	  goto uint32toa_6;
+	}
+	quotient = uii / 1000000;
 	*start++ = '0' + quotient;
+	goto uint32toa_6b;
+      }
+      if (uii < 10000) {
+	goto uint32toa_4;
       }
-      uii -= 100000000 * quotient;
+      quotient = uii / 10000;
+      *start++ = '0' + quotient;
+    } else {
+      if (uii >= 100000000) {
+	quotient = uii / 100000000;
+	if (uii >= 1000000000) {
+	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+	} else {
+	  *start++ = '0' + quotient;
+	}
+	uii -= 100000000 * quotient;
+      }
+      quotient = uii / 1000000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+    uint32toa_6b:
+      uii -= 1000000 * quotient;
+    uint32toa_6:
+      quotient = uii / 10000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
     }
-    quotient = uii / 1000000;
-    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-  uint32_write_6b:
-    uii -= 1000000 * quotient;
-  uint32_write_6:
-    quotient = uii / 10000;
+    uii -= 10000 * quotient;
+  uint32toa_4:
+    // could make a uitoa_z4() call here, but that's slightly slower
+    quotient = uii / 100;
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
   }
-  uii -= 10000 * quotient;
- uint32_write_4:
-  quotient = uii / 100;
   uii -= 100 * quotient;
-  return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[uii * 2]), 2);
+ uint32toa_2:
+  return memcpya(start, &(digit2_table[uii * 2]), 2);
 }
 
-char* int32_write(char* start, int32_t ii) {
+char* int32toa(int32_t ii, char* start) {
+  uint32_t uii = ii;
   if (ii < 0) {
-    if (ii < -2147483647) {
-      return memcpya(start, "-2147483648", 11);
-    }
+    // -INT_MIN is undefined, but negating the unsigned int equivalent works
     *start++ = '-';
-    ii = -ii;
+    uii = -uii;
   }
-  return uint32_write(start, (uint32_t)ii);
+  return uint32toa(uii, start);
 }
 
-void uint32_write4(char* start, uint32_t uii) {
-  // Write exactly four digits (padding with zeroes if necessary); useful for
-  // e.g. floating point encoders.
+char* uitoa_z4(uint32_t uii, char* start) {
   uint32_t quotient = uii / 100;
+  assert(quotient < 100);
   uii -= 100 * quotient;
-  memcpy(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[uii * 2]), 2);
+  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+  return memcpya(start, &(digit2_table[uii * 2]), 2);
 }
 
-static inline void uint32_write6(char* start, uint32_t uii) {
+char* uitoa_z6(uint32_t uii, char* start) {
   uint32_t quotient = uii / 10000;
-  uint32_write4(memcpya(start, &(digit2_table[quotient * 2]), 2), uii - 10000 * quotient);
+  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+  return uitoa_z4(uii - 10000 * quotient, start);
 }
 
-static inline void uint32_write8(char* start, uint32_t uii) {
+char* uitoa_z8(uint32_t uii, char* start) {
   uint32_t quotient = uii / 1000000;
-  uint32_write6(memcpya(start, &(digit2_table[quotient * 2]), 2), uii - 1000000 * quotient);
+  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+  return uitoa_z6(uii - 1000000 * quotient, start);
 }
 
-char* int64_write(char* start, int64_t llii) {
-  int64_t top_digits;
+char* int64toa(int64_t llii, char* start) {
+  uint64_t ullii = llii;
+  uint64_t top_digits;
   uint32_t bottom_eight;
   uint32_t middle_eight;
   if (llii < 0) {
-    if (llii < -9223372036854775807LL) {
-      // special case, can't be represented positive
-      return memcpya(start, "-9223372036854775808", 20);
-    }
     *start++ = '-';
-    llii = -llii;
+    ullii = -ullii;
   }
-  if (llii <= 0xffffffffLL) {
-    return uint32_write(start, (uint32_t)llii);
+  if (ullii <= 0xffffffffLLU) {
+    return uint32toa((uint32_t)ullii, start);
   }
-  top_digits = llii / 100000000LL;
-  bottom_eight = (uint32_t)(llii - (top_digits * 100000000));
-  if (top_digits <= 0xffffffffLL) {
-    start = uint32_write(start, (uint32_t)top_digits);
-    uint32_write8(start, bottom_eight);
-    return &(start[8]);
+  top_digits = ullii / 100000000;
+  bottom_eight = (uint32_t)(ullii - (top_digits * 100000000));
+  if (top_digits <= 0xffffffffLLU) {
+    start = uint32toa((uint32_t)top_digits, start);
+    return uitoa_z8(bottom_eight, start);
   }
-  llii = top_digits / 100000000LL;
-  middle_eight = (uint32_t)(top_digits - (llii * 100000000));
-  start = uint32_write(start, (uint32_t)llii);
-  uint32_write8(start, middle_eight);
-  uint32_write8(&(start[8]), bottom_eight);
-  return &(start[16]);
+  ullii = top_digits / 100000000;
+  middle_eight = (uint32_t)(top_digits - (ullii * 100000000));
+  start = uint32toa((uint32_t)ullii, start);
+  start = uitoa_z8(middle_eight, start);
+  return uitoa_z8(bottom_eight, start);
 }
 
-char* uint32_writew4(char* start, uint32_t uii) {
-  // Minimum field width 4.
+char* uint32toa_w4(uint32_t uii, char* start) {
   uint32_t quotient;
   if (uii < 1000) {
     if (uii < 10) {
       memset(start, 32, 3);
       start[3] = '0' + uii;
       return &(start[4]);
-    } else if (uii < 100) {
-      memset(start, 32, 2);
+    }
+    if (uii < 100) {
+      start = memseta(start, 32, 2);  // advance past the padding, as the >= 100 branch does
     } else {
       quotient = uii / 100;
-      *start = ' ';
-      start[1] = '0' + quotient;
+      *start++ = ' ';
+      *start++ = '0' + quotient;
       uii -= quotient * 100;
     }
-    return memcpya(&(start[2]), &(digit2_table[uii * 2]), 2);
+    return memcpya(start, &(digit2_table[uii * 2]), 2);
   } else {
-    return uint32_write(start, uii);
+    // presumably the field width is 4 for a reason; don't bother optimizing
+    // this
+    return uint32toa(uii, start);
   }
 }
 
-char* uint32_writew6(char* start, uint32_t uii) {
+char* uint32toa_w6(uint32_t uii, char* start) {
   uint32_t quotient;
   if (uii < 1000) {
     if (uii < 10) {
-      memset(start, 32, 5);
-      start[5] = '0' + uii;
-      return &(start[6]);
-    } else if (uii < 100) {
-      memset(start, 32, 4);
-    } else {
-      memset(start, 32, 3);
-      quotient = uii / 100;
-      start[3] = '0' + quotient;
-      uii -= quotient * 100;
+      start = memseta(start, 32, 5);
+      *start++ = '0' + uii;
+      return start;
     }
-    return memcpya(&(start[4]), &(digit2_table[uii * 2]), 2);
-  } else if (uii < 10000000) {
-    if (uii >= 100000) {
-      if (uii >= 1000000) {
+    if (uii < 100) {
+      start = memseta(start, 32, 4);
+      goto uint32toa_w6_2;
+    }
+    quotient = uii / 100;
+    // the little-endian trick doesn't seem to help here.  possibly relevant
+    // differences from uint32toa_w4() and _w8(): sequential dependence on
+    // quotient, need to interpret pointer as a char* again
+    start = memseta(start, 32, 3);
+    *start++ = '0' + quotient;
+  } else {
+    if (uii < 10000000) {
+      if (uii >= 100000) {
+	if (uii < 1000000) {
+	  goto uint32toa_w6_6;
+	}
 	quotient = uii / 1000000;
 	*start++ = '0' + quotient;
-	goto uint32_writew6_6b;
+	goto uint32toa_w6_6b;
+      } else if (uii >= 10000) {
+	*start++ = ' ';
+	quotient = uii / 10000;
+	*start++ = '0' + quotient;
+      } else {
+	start = memseta(start, 32, 2);
+	goto uint32toa_w6_4;
       }
-      goto uint32_writew6_6;
-    } else if (uii >= 10000) {
-      *start++ = ' ';
-      quotient = uii / 10000;
-      *start++ = '0' + quotient;
     } else {
-      start = memseta(start, 32, 2);
-      goto uint32_writew6_4;
-    }
-  } else {
-    if (uii >= 100000000) {
-      quotient = uii / 100000000;
-      if (uii >= 1000000000) {
-	start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-      } else {
-	*start++ = '0' + quotient;
+      if (uii >= 100000000) {
+	quotient = uii / 100000000;
+	if (uii >= 1000000000) {
+	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+	} else {
+	  *start++ = '0' + quotient;
+	}
+	uii -= 100000000 * quotient;
       }
-      uii -= 100000000 * quotient;
+      quotient = uii / 1000000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+    uint32toa_w6_6b:
+      uii -= 1000000 * quotient;
+    uint32toa_w6_6:
+      quotient = uii / 10000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
     }
-    quotient = uii / 1000000;
-    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-  uint32_writew6_6b:
-    uii -= 1000000 * quotient;
-  uint32_writew6_6:
-    quotient = uii / 10000;
+    uii -= 10000 * quotient;
+  uint32toa_w6_4:
+    quotient = uii / 100;
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
   }
-  uii -= 10000 * quotient;
- uint32_writew6_4:
-  quotient = uii / 100;
   uii -= 100 * quotient;
-  return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[uii * 2]), 2);
+ uint32toa_w6_2:
+  return memcpya(start, &(digit2_table[uii * 2]), 2);
 }
 
-char* uint32_writew7(char* start, uint32_t uii) {
+char* uint32toa_w7(uint32_t uii, char* start) {
   uint32_t quotient;
   if (uii < 1000) {
     if (uii < 10) {
-      memset(start, 32, 6);
-      start[6] = '0' + uii;
-      return &(start[7]);
-    } else if (uii < 100) {
-      memset(start, 32, 5);
-    } else {
-      memset(start, 32, 4);
-      quotient = uii / 100;
-      start[4] = '0' + quotient;
-      uii -= quotient * 100;
+      start = memseta(start, 32, 6);
+      *start++ = '0' + uii;
+      return start;
     }
-    return memcpya(&(start[5]), &(digit2_table[uii * 2]), 2);
-  } else if (uii < 10000000) {
-    if (uii >= 100000) {
-      if (uii >= 1000000) {
-	quotient = uii / 1000000;
-	*start++ = '0' + quotient;
-	goto uint32_writew7_6b;
-      }
-      *start++ = ' ';
-      goto uint32_writew7_6;
-    } else if (uii >= 10000) {
-      start = memseta(start, 32, 2);
-      quotient = uii / 10000;
-      *start++ = '0' + quotient;
-    } else {
-      start = memseta(start, 32, 3);
-      goto uint32_writew7_4;
+    if (uii < 100) {
+      start = memseta(start, 32, 5);
+      goto uint32toa_w7_2;
     }
+    quotient = uii / 100;
+    start = memseta(start, 32, 4);
+    *start++ = '0' + quotient;
   } else {
-    if (uii >= 100000000) {
-      quotient = uii / 100000000;
-      if (uii >= 1000000000) {
-	start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-      } else {
+    if (uii < 10000000) {
+      if (uii >= 100000) {
+	if (uii >= 1000000) {
+	  quotient = uii / 1000000;
+	  *start++ = '0' + quotient;
+	  goto uint32toa_w7_6b;
+	}
+	*start++ = ' ';
+	goto uint32toa_w7_6;
+      } else if (uii >= 10000) {
+	start = memseta(start, 32, 2);
+	quotient = uii / 10000;
 	*start++ = '0' + quotient;
+      } else {
+	start = memseta(start, 32, 3);
+	goto uint32toa_w7_4;
+      }
+    } else {
+      if (uii >= 100000000) {
+	quotient = uii / 100000000;
+	if (uii >= 1000000000) {
+	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+	} else {
+	  *start++ = '0' + quotient;
+	}
+	uii -= 100000000 * quotient;
       }
-      uii -= 100000000 * quotient;
+      quotient = uii / 1000000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+    uint32toa_w7_6b:
+      uii -= 1000000 * quotient;
+    uint32toa_w7_6:
+      quotient = uii / 10000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
     }
-    quotient = uii / 1000000;
-    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-  uint32_writew7_6b:
-    uii -= 1000000 * quotient;
-  uint32_writew7_6:
-    quotient = uii / 10000;
+    uii -= 10000 * quotient;
+  uint32toa_w7_4:
+    quotient = uii / 100;
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
   }
-  uii -= 10000 * quotient;
- uint32_writew7_4:
-  quotient = uii / 100;
   uii -= 100 * quotient;
-  return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[uii * 2]), 2);
+ uint32toa_w7_2:
+  return memcpya(start, &(digit2_table[uii * 2]), 2);
 }
 
-char* uint32_writew8(char* start, uint32_t uii) {
+char* uint32toa_w8(uint32_t uii, char* start) {
   uint32_t quotient;
   if (uii < 1000) {
     if (uii < 10) {
-      memset(start, 32, 7);
-      start[7] = '0' + uii;
+#ifdef __LP64__
+      *((uintptr_t*)start) = 0x3020202020202020LLU + (((uintptr_t)uii) << 56);
       return &(start[8]);
-    } else if (uii < 100) {
-      memset(start, 32, 6);
-    } else {
-      memset(start, 32, 5);
-      quotient = uii / 100;
-      start[5] = '0' + quotient;
-      uii -= quotient * 100;
+#else
+      start = memseta(start, 32, 7);
+      *start++ = '0' + uii;
+      return start;
+#endif
     }
-    return memcpya(&(start[6]), &(digit2_table[uii * 2]), 2);
-  } else if (uii < 10000000) {
-    if (uii >= 100000) {
-      if (uii < 1000000) {
-	start = memseta(start, 32, 2);
-	goto uint32_writew8_6;
-      }
-      quotient = uii / 1000000;
-      *start = ' ';
-      start[1] = '0' + quotient;
-      start += 2;
-      goto uint32_writew8_6b;
-    } else if (uii < 10000) {
-      start = memseta(start, 32, 4);
-      goto uint32_writew8_4;
+    if (uii < 100) {
+      start = memseta(start, 32, 6);
+      goto uint32toa_w8_2;
     }
-    memset(start, 32, 3);
-    quotient = uii / 10000;
-    start[3] = '0' + quotient;
-    start += 4;
+    quotient = uii / 100;
+    start = memseta(start, 32, 5);
+    *start++ = '0' + quotient;
   } else {
-    if (uii >= 100000000) {
-      quotient = uii / 100000000;
-      if (uii >= 1000000000) {
-	start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-      } else {
-	*start++ = '0' + quotient;
+    if (uii < 10000000) {
+      if (uii >= 100000) {
+	if (uii < 1000000) {
+	  start = memseta(start, 32, 2);
+	  goto uint32toa_w8_6;
+	}
+	quotient = uii / 1000000;
+	*start = ' ';
+	start[1] = '0' + quotient;
+	start += 2;
+	goto uint32toa_w8_6b;
+      } else if (uii < 10000) {
+	start = memseta(start, 32, 4);
+	goto uint32toa_w8_4;
+      }
+      memset(start, 32, 3);
+      quotient = uii / 10000;
+      start[3] = '0' + quotient;
+      start += 4;
+    } else {
+      if (uii >= 100000000) {
+	quotient = uii / 100000000;
+	if (uii >= 1000000000) {
+	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+	} else {
+	  *start++ = '0' + quotient;
+	}
+	uii -= 100000000 * quotient;
       }
-      uii -= 100000000 * quotient;
+      quotient = uii / 1000000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
+    uint32toa_w8_6b:
+      uii -= 1000000 * quotient;
+    uint32toa_w8_6:
+      quotient = uii / 10000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
     }
-    quotient = uii / 1000000;
-    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
-  uint32_writew8_6b:
-    uii -= 1000000 * quotient;
-  uint32_writew8_6:
-    quotient = uii / 10000;
+    uii -= 10000 * quotient;
+  uint32toa_w8_4:
+    quotient = uii / 100;
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
   }
-  uii -= 10000 * quotient;
- uint32_writew8_4:
-  quotient = uii / 100;
   uii -= 100 * quotient;
-  return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[uii * 2]), 2);
+ uint32toa_w8_2:
+  return memcpya(start, &(digit2_table[uii * 2]), 2);
 }
 
-char* uint32_writew10(char* start, uint32_t uii) {
+char* uint32toa_w10(uint32_t uii, char* start) {
+  // if we decide to reduce code size and optimize only one field width, this
+  // should be it
   uint32_t quotient;
   if (uii < 1000) {
     if (uii < 10) {
-      memset(start, 32, 9);
-      start[9] = '0' + uii;
-      return &(start[10]);
-    } else if (uii < 100) {
-      memset(start, 32, 8);
-    } else {
-      memset(start, 32, 7);
-      quotient = uii / 100;
-      start[7] = '0' + quotient;
-      uii -= quotient * 100;
+      start = memseta(start, 32, 9);
+      *start++ = '0' + uii;
+      return start;
     }
-    return memcpya(&(start[8]), &(digit2_table[uii * 2]), 2);
-  } else if (uii < 10000000) {
-    if (uii >= 100000) {
-      if (uii < 1000000) {
-	start = memseta(start, 32, 4);
-	goto uint32_writew10_6;
-      }
-      quotient = uii / 1000000;
-      memset(start, 32, 3);
-      start[3] = '0' + quotient;
-      start += 4;
-      goto uint32_writew10_6b;
-    } else if (uii < 10000) {
-      start = memseta(start, 32, 6);
-      goto uint32_writew10_4;
+    if (uii < 100) {
+      start = memseta(start, 32, 8);
+      goto uint32toa_w10_2;
     }
-    memset(start, 32, 5);
-    quotient = uii / 10000;
-    start[5] = '0' + quotient;
-    start += 6;
+    quotient = uii / 100;
+    start = memseta(start, 32, 7);
+    *start++ = '0' + quotient;
   } else {
-    if (uii >= 100000000) {
-      quotient = uii / 100000000;
-      if (uii >= 1000000000) {
-	memcpy(start, &(digit2_table[quotient * 2]), 2);
-      } else {
-	*start = ' ';
-	start[1] = '0' + quotient;
+    if (uii < 10000000) {
+      if (uii >= 100000) {
+	if (uii < 1000000) {
+	  start = memseta(start, 32, 4);
+	  goto uint32toa_w10_6;
+	}
+	quotient = uii / 1000000;
+	memset(start, 32, 3);
+	start[3] = '0' + quotient;
+	start += 4;
+	goto uint32toa_w10_6b;
+      } else if (uii < 10000) {
+	start = memseta(start, 32, 6);
+	goto uint32toa_w10_4;
       }
-      uii -= 100000000 * quotient;
+      memset(start, 32, 5);
+      quotient = uii / 10000;
+      start[5] = '0' + quotient;
+      start += 6;
     } else {
-      memset(start, 32, 2);
+      if (uii >= 100000000) {
+	quotient = uii / 100000000;
+	if (uii >= 1000000000) {
+	  memcpy(start, &(digit2_table[quotient * 2]), 2);
+	} else {
+	  *start = ' ';
+	  start[1] = '0' + quotient;
+	}
+	uii -= 100000000 * quotient;
+      } else {
+	memset(start, 32, 2);
+      }
+      quotient = uii / 1000000;
+      memcpy(&(start[2]), &(digit2_table[quotient * 2]), 2);
+      start += 4;
+    uint32toa_w10_6b:
+      uii -= 1000000 * quotient;
+    uint32toa_w10_6:
+      quotient = uii / 10000;
+      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
     }
-    quotient = uii / 1000000;
-    memcpy(&(start[2]), &(digit2_table[quotient * 2]), 2);
-    start += 4;
-  uint32_writew10_6b:
-    uii -= 1000000 * quotient;
-  uint32_writew10_6:
-    quotient = uii / 10000;
+    uii -= 10000 * quotient;
+  uint32toa_w10_4:
+    quotient = uii / 100;
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
   }
-  uii -= 10000 * quotient;
- uint32_writew10_4:
-  quotient = uii / 100;
   uii -= 100 * quotient;
-  return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[uii * 2]), 2);
+ uint32toa_w10_2:
+  return memcpya(start, &(digit2_table[uii * 2]), 2);
 }
 
-static inline char* uint32_write2trunc(char* start, uint32_t uii) {
+static inline char* uitoa_trunc2(uint32_t uii, char* start) {
   // Given 0 < uii < 100, writes uii without *trailing* zeroes.  (I.e. this is
   // for floating-point encoder use.)
   memcpy(start, &(digit2_table[uii * 2]), 2);
@@ -1158,7 +1176,7 @@ static inline char* uint32_write2trunc(char* start, uint32_t uii) {
   return &(start[1]);
 }
 
-static inline char* uint32_write3trunc(char* start, uint32_t uii) {
+static inline char* uitoa_trunc3(uint32_t uii, char* start) {
   *start++ = '0' + (uii / 100);
   uii %= 100;
   if (!uii) {
@@ -1171,7 +1189,7 @@ static inline char* uint32_write3trunc(char* start, uint32_t uii) {
   return &(start[1]);
 }
 
-static inline char* uint32_write4trunc(char* start, uint32_t uii) {
+static inline char* uitoa_trunc4(uint32_t uii, char* start) {
   uint32_t quotient = uii / 100;
   memcpy(start, &(digit2_table[quotient * 2]), 2);
   uii -= 100 * quotient;
@@ -1185,7 +1203,7 @@ static inline char* uint32_write4trunc(char* start, uint32_t uii) {
   return &(start[1]);
 }
 
-static inline char* uint32_write6trunc(char* start, uint32_t uii) {
+static inline char* uitoa_trunc6(uint32_t uii, char* start) {
   uint32_t quotient = uii / 10000;
   memcpy(start, &(digit2_table[quotient * 2]), 2);
   uii -= 10000 * quotient;
@@ -1205,7 +1223,7 @@ static inline char* uint32_write6trunc(char* start, uint32_t uii) {
   return &(start[1]);
 }
 
-static inline char* uint32_write8trunc(char* start, uint32_t uii) {
+static inline char* uitoa_trunc8(uint32_t uii, char* start) {
   uint32_t quotient = uii / 1000000;
   memcpy(start, &(digit2_table[quotient * 2]), 2);
   uii -= 1000000 * quotient;
@@ -1231,7 +1249,7 @@ static inline char* uint32_write8trunc(char* start, uint32_t uii) {
   return &(start[1]);
 }
 
-static inline char* uint32_write1p1(char* start, uint32_t quotient, uint32_t remainder) {
+static inline char* qrtoa_1p1(uint32_t quotient, uint32_t remainder, char* start) {
   start[0] = '0' + quotient;
   if (!remainder) {
     return &(start[1]);
@@ -1241,7 +1259,7 @@ static inline char* uint32_write1p1(char* start, uint32_t quotient, uint32_t rem
   return &(start[3]);
 }
 
-static inline char* uint32_write1p2(char* start, uint32_t quotient, uint32_t remainder) {
+static inline char* qrtoa_1p2(uint32_t quotient, uint32_t remainder, char* start) {
   *start++ = '0' + quotient;
   if (!remainder) {
     return start;
@@ -1254,7 +1272,7 @@ static inline char* uint32_write1p2(char* start, uint32_t quotient, uint32_t rem
   return &(start[1]);
 }
 
-static inline char* uint32_write1p3(char* start, uint32_t quotient, uint32_t remainder) {
+static inline char* qrtoa_1p3(uint32_t quotient, uint32_t remainder, char* start) {
   // quotient = (int32_t)dxx;
   // remainder = ((int32_t)(dxx * 1000)) - (quotient * 1000);
   *start++ = '0' + quotient;
@@ -1275,7 +1293,7 @@ static inline char* uint32_write1p3(char* start, uint32_t quotient, uint32_t rem
   return &(start[1]);
 }
 
-static inline char* uint32_write1p5(char* start, uint32_t quotient, uint32_t remainder) {
+static inline char* qrtoa_1p5(uint32_t quotient, uint32_t remainder, char* start) {
   *start++ = '0' + quotient;
   if (!remainder) {
     return start;
@@ -1300,7 +1318,7 @@ static inline char* uint32_write1p5(char* start, uint32_t quotient, uint32_t rem
   return &(start[1]);
 }
 
-static inline char* uint32_write1p7(char* start, uint32_t quotient, uint32_t remainder) {
+static inline char* qrtoa_1p7(uint32_t quotient, uint32_t remainder, char* start) {
   *start++ = '0' + quotient;
   if (!remainder) {
     return start;
@@ -1412,8 +1430,9 @@ static inline void double_bround7(double dxx, const double* banker_round, uint32
   *remainderp = remainder - (*quotientp) * 10000000; 
 }
 
-char* double_write6(char* start, double dxx) {
+char* dtoa_so6(double dxx, char* start) {
   // 6 sig fig number, 0.999995 <= dxx < 999999.5
+  // 'so' = "significand only"
   // Just hardcoding all six cases, in the absence of a better approach...
   uint32_t uii;
   uint32_t quotient;
@@ -1421,7 +1440,7 @@ char* double_write6(char* start, double dxx) {
   if (dxx < 99.999949999999) {
     if (dxx < 9.9999949999999) {
       double_bround5(dxx, banker_round8, &quotient, &remainder);
-      return uint32_write1p5(start, quotient, remainder);
+      return qrtoa_1p5(quotient, remainder, start);
     }
     double_bround4(dxx, banker_round8, &quotient, &remainder);
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
@@ -1434,10 +1453,10 @@ char* double_write6(char* start, double dxx) {
     remainder -= 100 * quotient;
     if (remainder) {
       start += 2;
-    double_write6_pretail:
+    dtoa_so6_pretail:
       memcpy(start, &(digit2_table[remainder * 2]), 2);
     }
-  double_write6_tail:
+  dtoa_so6_tail:
     if (start[1] != '0') {
       return &(start[2]);
     }
@@ -1457,7 +1476,7 @@ char* double_write6(char* start, double dxx) {
       memcpy(start, &(digit2_table[quotient * 2]), 2);
       remainder -= quotient * 10;
       if (!remainder) {
-        goto double_write6_tail;
+        goto dtoa_so6_tail;
       }
       start[2] = '0' + remainder;
       return &(start[3]);
@@ -1471,7 +1490,7 @@ char* double_write6(char* start, double dxx) {
       return start;
     }
     *start++ = '.';
-    goto double_write6_pretail;
+    goto dtoa_so6_pretail;
   } else if (dxx < 99999.949999999) {
     double_bround1(dxx, banker_round8, &uii, &remainder);
     quotient = uii / 10000;
@@ -1488,8 +1507,7 @@ char* double_write6(char* start, double dxx) {
     *start = '0' + remainder;
     return &(start[1]);
   } else {
-    uint32_write6(start, double_bround(dxx, banker_round8));
-    return &(start[6]);
+    return uitoa_z6(double_bround(dxx, banker_round8), start);
   }
 }
 
@@ -1539,7 +1557,7 @@ static inline void float_round6(float fxx, uint32_t* quotientp, uint32_t* remain
   *remainderp = remainder - (*quotientp) * 1000000;
 }
 
-char* float_write6(char* start, float fxx) {
+char* ftoa_so6(float fxx, char* start) {
   uint32_t uii;
   uint32_t quotient;
   uint32_t remainder;
@@ -1552,7 +1570,7 @@ char* float_write6(char* start, float fxx) {
   if (fxx < 99.999944) {
     if (fxx < 9.9999944) {
       float_round5(fxx, &quotient, &remainder);
-      return uint32_write1p5(start, quotient, remainder);
+      return qrtoa_1p5(quotient, remainder, start);
     }
     float_round4(fxx, &quotient, &remainder);
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
@@ -1565,10 +1583,10 @@ char* float_write6(char* start, float fxx) {
     remainder -= 100 * quotient;
     if (remainder) {
       start += 2;
-    float_write6_pretail:
+    ftoa_so6_pretail:
       memcpy(start, &(digit2_table[remainder * 2]), 2);
     }
-  float_write6_tail:
+  ftoa_so6_tail:
     if (start[1] != '0') {
       return &(start[2]);
     }
@@ -1588,7 +1606,7 @@ char* float_write6(char* start, float fxx) {
       memcpy(start, &(digit2_table[quotient * 2]), 2);
       remainder -= quotient * 10;
       if (!remainder) {
-        goto float_write6_tail;
+        goto ftoa_so6_tail;
       }
       start[2] = '0' + remainder;
       return &(start[3]);
@@ -1602,7 +1620,7 @@ char* float_write6(char* start, float fxx) {
       return start;
     }
     *start++ = '.';
-    goto float_write6_pretail;
+    goto ftoa_so6_pretail;
   } else if (fxx < 99999.944) {
     float_round1(fxx, &uii, &remainder);
     quotient = uii / 10000;
@@ -1619,30 +1637,29 @@ char* float_write6(char* start, float fxx) {
     start[1] = '0' + remainder;
     return &(start[2]);
   } else {
-    uint32_write6(start, float_round(fxx));
-    return &(start[6]);
+    return uitoa_z6(float_round(fxx), start);
   }
 }
 
-char* double_write2(char* start, double dxx) {
+char* dtoa_so2(double dxx, char* start) {
   // 2 sig fig number, 0.95 <= dxx < 99.5
   uint32_t quotient;
   uint32_t remainder;
   if (dxx < 9.9499999999999) {
     double_bround1(dxx, banker_round12, &quotient, &remainder);
-    return uint32_write1p1(start, quotient, remainder);
+    return qrtoa_1p1(quotient, remainder, start);
   }
   return memcpya(start, &(digit2_table[(double_bround(dxx, banker_round12)) * 2]), 2);
 }
 
-char* double_write3(char* start, double dxx) {
+char* dtoa_so3(double dxx, char* start) {
   // 3 sig fig number, 0.995 <= dxx < 999.5
   uint32_t quotient;
   uint32_t remainder;
   if (dxx < 99.949999999999) {
     if (dxx < 9.9949999999999) {
       double_bround2(dxx, banker_round11, &quotient, &remainder);
-      return uint32_write1p2(start, quotient, remainder);
+      return qrtoa_1p2(quotient, remainder, start);
     }
     double_bround1(dxx, banker_round11, &quotient, &remainder);
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
@@ -1659,7 +1676,7 @@ char* double_write3(char* start, double dxx) {
   return &(start[1]);
 }
 
-char* double_write4(char* start, double dxx) {
+char* dtoa_so4(double dxx, char* start) {
   // 4 sig fig number, 0.9995 <= dxx < 9999.5
   uint32_t uii;
   uint32_t quotient;
@@ -1667,7 +1684,7 @@ char* double_write4(char* start, double dxx) {
   if (dxx < 99.994999999999) {
     if (dxx < 9.9994999999999) {
       double_bround3(dxx, banker_round10, &quotient, &remainder);
-      return uint32_write1p3(start, quotient, remainder);
+      return qrtoa_1p3(quotient, remainder, start);
     }
     double_bround2(dxx, banker_round10, &quotient, &remainder);
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
@@ -1693,12 +1710,12 @@ char* double_write4(char* start, double dxx) {
     start[1] = '0' + remainder;
     return &(start[2]);
   } else {
-    uint32_write4(start, double_bround(dxx, banker_round10));
+    uitoa_z4(double_bround(dxx, banker_round10), start);
     return &(start[4]);
   }
 }
 
-char* double_write8(char* start, double dxx) {
+char* dtoa_so8(double dxx, char* start) {
   // 8 sig fig number, 0.99999995 <= dxx < 99999999.5
   uint32_t uii;
   uint32_t quotient;
@@ -1706,7 +1723,7 @@ char* double_write8(char* start, double dxx) {
   if (dxx < 99.999999499999) {
     if (dxx < 9.9999999499999) {
       double_bround7(dxx, banker_round6, &quotient, &remainder);
-      return uint32_write1p7(start, quotient, remainder);
+      return qrtoa_1p7(quotient, remainder, start);
     }
     double_bround6(dxx, banker_round6, &quotient, &remainder);
     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
@@ -1719,17 +1736,17 @@ char* double_write8(char* start, double dxx) {
     remainder -= 10000 * quotient;
     if (remainder) {
       start += 2;
-    double_write8_pretail4:
+    dtoa_so8_pretail4:
       quotient = remainder / 100;
       memcpy(start, &(digit2_table[quotient * 2]), 2);
       remainder -= 100 * quotient;
       if (remainder) {
 	start += 2;
-      double_write8_pretail2:
+      dtoa_so8_pretail2:
         memcpy(start, &(digit2_table[remainder * 2]), 2);
       }
     }
-  double_write8_tail:
+  dtoa_so8_tail:
     if (start[1] != '0') {
       return &(start[2]);
     }
@@ -1749,15 +1766,15 @@ char* double_write8(char* start, double dxx) {
       memcpy(start, &(digit2_table[quotient * 2]), 2);
       remainder -= quotient * 1000;
       if (!remainder) {
-        goto double_write8_tail;
+        goto dtoa_so8_tail;
       }
       start += 2;
-    double_write8_pretail3:
+    dtoa_so8_pretail3:
       quotient = remainder / 10;
       memcpy(start, &(digit2_table[quotient * 2]), 2);
       remainder -= quotient * 10;
       if (!remainder) {
-	goto double_write8_tail;
+	goto dtoa_so8_tail;
       }
       start[2] = '0' + remainder;
       return &(start[3]);
@@ -1771,7 +1788,7 @@ char* double_write8(char* start, double dxx) {
       return start;
     }
     *start++ = '.';
-    goto double_write8_pretail4;
+    goto dtoa_so8_pretail4;
   } else if (dxx < 999999.99499999) {
     if (dxx < 99999.999499999) {
       double_bround3(dxx, banker_round6, &uii, &remainder);
@@ -1786,7 +1803,7 @@ char* double_write8(char* start, double dxx) {
 	return start;
       }
       *start++ = '.';
-      goto double_write8_pretail3;
+      goto dtoa_so8_pretail3;
     }
     double_bround2(dxx, banker_round6, &uii, &remainder);
     quotient = uii / 10000;
@@ -1800,7 +1817,7 @@ char* double_write8(char* start, double dxx) {
       return start;
     }
     *start++ = '.';
-    goto double_write8_pretail2;
+    goto dtoa_so8_pretail2;
   } else if (dxx < 9999999.9499999) {
     double_bround1(dxx, banker_round6, &uii, &remainder);
     quotient = uii / 1000000;
@@ -1816,24 +1833,22 @@ char* double_write8(char* start, double dxx) {
     if (!remainder) {
       return start;
     }
-    *start++ = '.';
+    *start = '.';
     start[1] = '0' + remainder;
     return &(start[2]);
   } else {
-    uint32_write8(start, double_bround(dxx, banker_round6));
-    return &(start[8]);
+    return uitoa_z8(double_bround(dxx, banker_round6), start);
   }
 }
 
-char* double_e_write(char* start, double dxx) {
+char* dtoa_e(double dxx, char* start) {
   uint32_t xp10 = 0;
   uint32_t quotient;
   uint32_t remainder;
   char sign;
   if (dxx != dxx) {
     // do this first to avoid generating exception
-    *((uint32_t*)start) = *((uint32_t*)"nan");
-    return &(start[3]);
+    return memcpyl3a(start, "nan");
   } else if (dxx < 0) {
     *start++ = '-';
     dxx = -dxx;
@@ -1842,8 +1857,7 @@ char* double_e_write(char* start, double dxx) {
     if (dxx >= 9.9999994999999e7) {
       if (dxx >= 9.9999994999999e127) {
 	if (dxx == INFINITY) {
-	  *((uint32_t*)start) = *((uint32_t*)"inf");
-	  return &(start[3]);
+	  return memcpyl3a(start, "inf");
 	} else if (dxx >= 9.9999994999999e255) {
 	  dxx *= 1.0e-256;
 	  xp10 |= 256;
@@ -1931,8 +1945,7 @@ char* double_e_write(char* start, double dxx) {
   double_bround6(dxx, banker_round7, &quotient, &remainder);
   *start++ = '0' + quotient;
   *start++ = '.';
-  uint32_write6(start, remainder);
-  start += 6;
+  start = uitoa_z6(remainder, start);
   *start++ = 'e';
   *start++ = sign;
   if (xp10 >= 100) {
@@ -1943,15 +1956,14 @@ char* double_e_write(char* start, double dxx) {
   return memcpya(start, &(digit2_table[xp10 * 2]), 2);
 }
 
-char* float_e_write(char* start, float fxx) {
+char* ftoa_e(float fxx, char* start) {
   uint32_t xp10 = 0;
   uint32_t quotient;
   uint32_t remainder;
   char sign;
   if (fxx != fxx) {
     // do this first to avoid generating exception
-    *((uint32_t*)start) = *((uint32_t*)"nan");
-    return &(start[3]);
+    return memcpyl3a(start, "nan");
   } else if (fxx < 0) {
     *start++ = '-';
     fxx = -fxx;
@@ -1959,8 +1971,7 @@ char* float_e_write(char* start, float fxx) {
   if (fxx >= 9.9999995e-1) {
     if (fxx >= 9.9999995e15) {
       if (fxx == INFINITY) {
-	*((uint32_t*)start) = *((uint32_t*)"inf");
-	return &(start[3]);
+	return memcpyl3a(start, "inf");
       } else if (fxx >= 9.9999995e31) {
 	fxx *= 1.0e-32;
 	xp10 |= 32;
@@ -2019,35 +2030,33 @@ char* float_e_write(char* start, float fxx) {
   float_round6(fxx, &quotient, &remainder);
   *start++ = '0' + quotient;
   *start++ = '.';
-  uint32_write6(start, remainder);
-  start += 6;
+  start = uitoa_z6(remainder, start);
   *start++ = 'e';
   *start++ = sign;
   return memcpya(start, &(digit2_table[xp10 * 2]), 2);
 }
 
-char* double_f_writew2(char* start, double dxx) {
+char* dtoa_f_p2(double dxx, char* start) {
   const double* br_ptr;
   uint32_t quotient;
   uint32_t remainder;
   if (dxx != dxx) {
-    *((uint32_t*)start) = *((uint32_t*)"nan");
-    return &(start[3]);
+    return memcpyl3a(start, "nan");
   } else if (dxx < 9.9949999999999) {
     if (dxx < 0) {
       *start++ = '-';
       dxx = -dxx;
       if (dxx >= 9.9949999999999) {
-        goto double_f_writew2_10;
+        goto dtoa_f_p2_10;
       }
     }
     double_bround2(dxx, banker_round11, &quotient, &remainder);
     *start++ = '0' + quotient;
-  double_f_writew2_dec:
+  dtoa_f_p2_dec:
     *start++ = '.';
     return memcpya(start, &(digit2_table[remainder * 2]), 2);
   }
- double_f_writew2_10:
+ dtoa_f_p2_10:
   if (dxx < 9999999.9949999) {
     if (dxx < 999.99499999999) {
       if (dxx < 99.994999999999) {
@@ -2067,43 +2076,41 @@ char* double_f_writew2(char* start, double dxx) {
       br_ptr = banker_round5;
     }
     double_bround2(dxx, br_ptr, &quotient, &remainder);
-    start = uint32_write(start, quotient);
-    goto double_f_writew2_dec;
+    start = uint32toa(quotient, start);
+    goto dtoa_f_p2_dec;
   }
   if (dxx == INFINITY) {
-    *((uint32_t*)start) = *((uint32_t*)"inf");
-    return &(start[3]);
+    return memcpyl3a(start, "inf");
   }
   // just punt larger numbers to glibc for now, this isn't a bottleneck
   start += sprintf(start, "%.2f", dxx);
   return start;
 }
 
-char* double_f_writew3(char* start, double dxx) {
+char* dtoa_f_p3(double dxx, char* start) {
   const double* br_ptr;
   uint32_t quotient;
   uint32_t remainder;
   if (dxx != dxx) {
-    *((uint32_t*)start) = *((uint32_t*)"nan");
-    return &(start[3]);
+    return memcpyl3a(start, "nan");
   } else if (dxx < 9.9994999999999) {
     if (dxx < 0) {
       *start++ = '-';
       dxx = -dxx;
       if (dxx >= 9.9994999999999) {
-        goto double_f_writew3_10;
+        goto dtoa_f_p3_10;
       }
     }
     double_bround3(dxx, banker_round10, &quotient, &remainder);
     *start++ = '0' + quotient;
-  double_f_writew3_dec:
+  dtoa_f_p3_dec:
     *start++ = '.';
     quotient = remainder / 100;
     remainder -= 100 * quotient;
     *start++ = '0' + quotient;
     return memcpya(start, &(digit2_table[remainder * 2]), 2);
   }
- double_f_writew3_10:
+ dtoa_f_p3_10:
   if (dxx < 999999.99949999) {
     if (dxx < 999.99949999999) {
       if (dxx < 99.999499999999) {
@@ -2121,18 +2128,17 @@ char* double_f_writew3(char* start, double dxx) {
       br_ptr = banker_round5;
     }
     double_bround3(dxx, br_ptr, &quotient, &remainder);
-    start = uint32_write(start, quotient);
-    goto double_f_writew3_dec;
+    start = uint32toa(quotient, start);
+    goto dtoa_f_p3_dec;
   }
   if (dxx == INFINITY) {
-    *((uint32_t*)start) = *((uint32_t*)"inf");
-    return &(start[3]);
+    return memcpyl3a(start, "inf");
   }
   start += sprintf(start, "%.3f", dxx);
   return start;
 }
 
-char* double_f_writew96(char* start, double dxx) {
+char* dtoa_f_w9p6(double dxx, char* start) {
   uint32_t quotient;
   uint32_t remainder;
   if (dxx != dxx) {
@@ -2142,23 +2148,22 @@ char* double_f_writew96(char* start, double dxx) {
       *start++ = '-';
       dxx = -dxx;
       if (dxx >= 9.9999994999999) {
-	goto double_f_writew96_10;
+	goto dtoa_f_w9p6_10;
       }
     } else {
       *start++ = ' ';
     }
     double_bround6(dxx, banker_round7, &quotient, &remainder);
     *start++ = '0' + quotient;
-  double_f_writew96_dec:
+  dtoa_f_w9p6_dec:
     *start++ = '.';
-    uint32_write6(start, remainder);
-    return &(start[6]);
+    return uitoa_z6(remainder, start);
   }
- double_f_writew96_10:
+ dtoa_f_w9p6_10:
   if (dxx < 999.99999949999) {
     double_bround6(dxx, (dxx < 99.999999499999)? banker_round6 : banker_round5, &quotient, &remainder);
-    start = uint32_write(start, quotient);
-    goto double_f_writew96_dec;
+    start = uint32toa(quotient, start);
+    goto dtoa_f_w9p6_dec;
   }
   if (dxx == INFINITY) {
     return memcpya(start, "      inf", 9);
@@ -2167,7 +2172,7 @@ char* double_f_writew96(char* start, double dxx) {
   return start;
 }
 
-char* double_f_writew74(char* start, double dxx) {
+char* dtoa_f_w7p4(double dxx, char* start) {
   const double* br_ptr;
   uint32_t quotient;
   uint32_t remainder;
@@ -2178,20 +2183,20 @@ char* double_f_writew74(char* start, double dxx) {
       *start++ = '-';
       dxx = -dxx;
       if (dxx >= 9.9999499999999) {
-	goto double_f_writew74_10;
+	goto dtoa_f_w7p4_10;
       }
     } else {
       *start++ = ' ';
     }
     double_bround4(dxx, banker_round9, &quotient, &remainder);
     *start++ = '0' + quotient;
-  double_f_writew74_dec:
+  dtoa_f_w7p4_dec:
     *start++ = '.';
     quotient = remainder / 100;
     remainder -= 100 * quotient;
     return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[remainder * 2]), 2);
   }
- double_f_writew74_10:
+ dtoa_f_w7p4_10:
   if (dxx < 99999.999949999) {
     if (dxx < 999.99994999999) {
       if (dxx < 99.999949999999) {
@@ -2205,8 +2210,8 @@ char* double_f_writew74(char* start, double dxx) {
       br_ptr = banker_round5;
     }
     double_bround4(dxx, br_ptr, &quotient, &remainder);
-    start = uint32_write(start, quotient);
-    goto double_f_writew74_dec;
+    start = uint32toa(quotient, start);
+    goto dtoa_f_w7p4_dec;
   }
   if (dxx == INFINITY) {
     return memcpya(start, "    inf", 7);
@@ -2215,36 +2220,35 @@ char* double_f_writew74(char* start, double dxx) {
   return start;
 }
 
-char* double_f_writew96_spaced(char* start, double dxx) {
+char* dtoa_f_w9p6_spaced(double dxx, char* start) {
   // Prettier fixed-width decimal: removes trailing zero(es) if and only if the
   // match appears to be exact.
   // Does not detect exact matches when abs(dxx) > 2^31 / 10^5.
   double dyy = dxx * 100000 + 0.00000005;
-  start = double_f_writew96(start, dxx);
+  start = dtoa_f_w9p6(dxx, start);
   if (dyy - ((double)((int32_t)dyy)) >= 0.0000001) {
     return start;
   }
-  zeroes_to_spaces(start);
+  trailing_zeroes_to_spaces(start);
   return start;
 }
 
-char* double_f_writew96_clipped(char* start, double dxx) {
+char* dtoa_f_w9p6_clipped(double dxx, char* start) {
   // same conditions as _spaced()
   double dyy = dxx * 100000 + 0.00000005;
-  start = double_f_writew96(start, dxx);
+  start = dtoa_f_w9p6(dxx, start);
   if (dyy - ((double)((int32_t)dyy)) >= 0.0000001) {
     return start;
   }
-  return clip_zeroes(start);
+  return clip_trailing_zeroes(start);
 }
 
-char* double_g_write(char* start, double dxx) {
+char* dtoa_g(double dxx, char* start) {
   uint32_t xp10 = 0;
   uint32_t quotient;
   uint32_t remainder;
   if (dxx != dxx) {
-    *((uint32_t*)start) = *((uint32_t*)"nan");
-    return &(start[3]);
+    return memcpyl3a(start, "nan");
   } else if (dxx < 0) {
     *start++ = '-';
     dxx = -dxx;
@@ -2294,7 +2298,7 @@ char* double_g_write(char* start, double dxx) {
       xp10++;
     }
     double_bround5(dxx, banker_round8, &quotient, &remainder);
-    start = memcpya(uint32_write1p5(start, quotient, remainder), "e-", 2);
+    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2);
     if (xp10 >= 100) {
       quotient = xp10 / 100;
       *start++ = '0' + quotient;
@@ -2306,8 +2310,7 @@ char* double_g_write(char* start, double dxx) {
     if (dxx >= 9.9999949999999e15) {
       if (dxx >= 9.9999949999999e127) {
 	if (dxx == INFINITY) {
-	  *((uint32_t*)start) = *((uint32_t*)"inf");
-	  return &(start[3]);
+	  return memcpyl3a(start, "inf");
 	} else if (dxx >= 9.9999949999999e255) {
 	  dxx *= 1.0e-256;
 	  xp10 |= 256;
@@ -2346,7 +2349,7 @@ char* double_g_write(char* start, double dxx) {
       xp10++;
     }
     double_bround5(dxx, banker_round8, &quotient, &remainder);
-    start = memcpya(uint32_write1p5(start, quotient, remainder), "e+", 2);
+    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2);
     if (xp10 >= 100) {
       quotient = xp10 / 100;
       *start++ = '0' + quotient;
@@ -2354,7 +2357,7 @@ char* double_g_write(char* start, double dxx) {
     }
     return memcpya(start, &(digit2_table[xp10 * 2]), 2);
   } else if (dxx >= 0.99999949999999) {
-    return double_write6(start, dxx);
+    return dtoa_so6(dxx, start);
   } else {
     // 6 sig fig decimal, no less than ~0.0001
     start = memcpya(start, "0.", 2);
@@ -2366,17 +2369,16 @@ char* double_g_write(char* start, double dxx) {
       dxx *= 10;
       *start++ = '0';
     }
-    return uint32_write6trunc(start, double_bround(dxx * 1000000, banker_round8));
+    return uitoa_trunc6(double_bround(dxx * 1000000, banker_round8), start);
   }
 }
 
-char* float_g_write(char* start, float fxx) {
+char* ftoa_g(float fxx, char* start) {
   uint32_t xp10 = 0;
   uint32_t quotient;
   uint32_t remainder;
   if (fxx != fxx) {
-    *((uint32_t*)start) = *((uint32_t*)"nan");
-    return &(start[3]);
+    return memcpyl3a(start, "nan");
   } else if (fxx < 0) {
     *start++ = '-';
     fxx = -fxx;
@@ -2411,12 +2413,11 @@ char* float_g_write(char* start, float fxx) {
       xp10++;
     }
     float_round5(fxx, &quotient, &remainder);
-    return memcpya(memcpya(uint32_write1p5(start, quotient, remainder), "e-", 2), &(digit2_table[xp10 * 2]), 2);
+    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2), &(digit2_table[xp10 * 2]), 2);
   } else if (fxx >= 999999.44) {
     if (fxx >= 9.9999944e15) {
       if (fxx == INFINITY) {
-	*((uint32_t*)start) = *((uint32_t*)"inf");
-	return &(start[3]);
+	return memcpyl3a(start, "inf");
       } else if (fxx >= 9.9999944e31) {
 	fxx *= 1.0e-32;
 	xp10 |= 32;
@@ -2442,9 +2443,9 @@ char* float_g_write(char* start, float fxx) {
       xp10++;
     }
     float_round5(fxx, &quotient, &remainder);
-    return memcpya(memcpya(uint32_write1p5(start, quotient, remainder), "e+", 2), &(digit2_table[xp10 * 2]), 2);
+    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2), &(digit2_table[xp10 * 2]), 2);
   } else if (fxx >= 0.99999944) {
-    return float_write6(start, fxx);
+    return ftoa_so6(fxx, start);
   } else {
     // 6 sig fig decimal, no less than ~0.0001
     start = memcpya(start, "0.", 2);
@@ -2456,12 +2457,12 @@ char* float_g_write(char* start, float fxx) {
       fxx *= 10;
       *start++ = '0';
     }
-    return uint32_write6trunc(start, float_round(fxx * 1000000));
+    return uitoa_trunc6(float_round(fxx * 1000000), start);
   }
 }
 
-char* double_g_writewx2(char* start, double dxx, uint32_t min_width) {
-  // assumes min_width >= 5.
+char* dtoa_g_wxp2(double dxx, uint32_t min_width, char* start) {
+  assert(min_width >= 5);
   uint32_t xp10 = 0;
   char wbuf[16];
   char* wpos = wbuf;
@@ -2520,7 +2521,7 @@ char* double_g_writewx2(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround1(dxx, banker_round12, &quotient, &remainder);
-    wpos = uint32_write1p1(wpos, quotient, remainder);
+    wpos = qrtoa_1p1(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder < min_width - 5) {
@@ -2591,7 +2592,7 @@ char* double_g_writewx2(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround1(dxx, banker_round12, &quotient, &remainder);
-    wpos = uint32_write1p1(wpos, quotient, remainder);
+    wpos = qrtoa_1p1(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder < min_width - 5) {
@@ -2615,7 +2616,7 @@ char* double_g_writewx2(char* start, double dxx, uint32_t min_width) {
     return memcpya(start, &(digit2_table[xp10 * 2]), 2);
   } else {
     if (dxx >= 0.99499999999999) {
-      wpos = double_write2(wpos, dxx);
+      wpos = dtoa_so2(dxx, wpos);
     } else {
       // 2 sig fig decimal, no less than ~0.0001
       wpos = memcpya(wpos, "0.", 2);
@@ -2627,7 +2628,7 @@ char* double_g_writewx2(char* start, double dxx, uint32_t min_width) {
 	dxx *= 10;
 	*wpos++ = '0';
       }
-      wpos = uint32_write2trunc(wpos, double_bround(dxx * 100, banker_round12));
+      wpos = uitoa_trunc2(double_bround(dxx * 100, banker_round12), wpos);
     }
     remainder = wpos - wbuf;
     if (remainder < min_width) {
@@ -2639,8 +2640,8 @@ char* double_g_writewx2(char* start, double dxx, uint32_t min_width) {
   }
 }
 
-char* double_g_writewx3(char* start, double dxx, uint32_t min_width) {
-  // assumes min_width >= 5.
+char* dtoa_g_wxp3(double dxx, uint32_t min_width, char* start) {
+  assert(min_width >= 5);
   uint32_t xp10 = 0;
   char wbuf[16];
   char* wpos = wbuf;
@@ -2699,7 +2700,7 @@ char* double_g_writewx3(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround2(dxx, banker_round11, &quotient, &remainder);
-    wpos = uint32_write1p2(wpos, quotient, remainder);
+    wpos = qrtoa_1p2(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder < min_width - 5) {
@@ -2770,7 +2771,7 @@ char* double_g_writewx3(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround2(dxx, banker_round11, &quotient, &remainder);
-    wpos = uint32_write1p2(wpos, quotient, remainder);
+    wpos = qrtoa_1p2(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder < min_width - 5) {
@@ -2794,7 +2795,7 @@ char* double_g_writewx3(char* start, double dxx, uint32_t min_width) {
     return memcpya(start, &(digit2_table[xp10 * 2]), 2);
   } else {
     if (dxx >= 0.99949999999999) {
-      wpos = double_write3(wpos, dxx);
+      wpos = dtoa_so3(dxx, wpos);
     } else {
       // 3 sig fig decimal, no less than ~0.001
       wpos = memcpya(wpos, "0.", 2);
@@ -2806,7 +2807,7 @@ char* double_g_writewx3(char* start, double dxx, uint32_t min_width) {
 	dxx *= 10;
 	*wpos++ = '0';
       }
-      wpos = uint32_write3trunc(wpos, double_bround(dxx * 1000, banker_round11));
+      wpos = uitoa_trunc3(double_bround(dxx * 1000, banker_round11), wpos);
     }
     remainder = wpos - wbuf;
     if (remainder < min_width) {
@@ -2818,8 +2819,7 @@ char* double_g_writewx3(char* start, double dxx, uint32_t min_width) {
   }
 }
 
-char* double_g_writewx4(char* start, double dxx, uint32_t min_width) {
-  // only requires min_width to be positive; less than 5 is ok
+char* dtoa_g_wxp4(double dxx, uint32_t min_width, char* start) {
   uint32_t xp10 = 0;
   char wbuf[16];
   char* wpos = wbuf;
@@ -2880,7 +2880,7 @@ char* double_g_writewx4(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround3(dxx, banker_round10, &quotient, &remainder);
-    wpos = uint32_write1p3(wpos, quotient, remainder);
+    wpos = qrtoa_1p3(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder + 5 < min_width) {
@@ -2953,7 +2953,7 @@ char* double_g_writewx4(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround3(dxx, banker_round10, &quotient, &remainder);
-    wpos = uint32_write1p3(wpos, quotient, remainder);
+    wpos = qrtoa_1p3(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder + 5 < min_width) {
@@ -2977,7 +2977,7 @@ char* double_g_writewx4(char* start, double dxx, uint32_t min_width) {
     return memcpya(start, &(digit2_table[xp10 * 2]), 2);
   } else {
     if (dxx >= 0.99994999999999) {
-      wpos = double_write4(wpos, dxx);
+      wpos = dtoa_so4(dxx, wpos);
     } else {
       // 4 sig fig decimal, no less than ~0.0001
       wpos = memcpya(wpos, "0.", 2);
@@ -2989,7 +2989,7 @@ char* double_g_writewx4(char* start, double dxx, uint32_t min_width) {
 	dxx *= 10;
 	*wpos++ = '0';
       }
-      wpos = uint32_write4trunc(wpos, double_bround(dxx * 10000, banker_round10));
+      wpos = uitoa_trunc4(double_bround(dxx * 10000, banker_round10), wpos);
     }
     remainder = wpos - wbuf;
     if (remainder < min_width) {
@@ -3001,8 +3001,7 @@ char* double_g_writewx4(char* start, double dxx, uint32_t min_width) {
   }
 }
 
-char* double_g_writewx8(char* start, double dxx, uint32_t min_width) {
-  // only requires min_width to be positive; less than 8 is ok
+char* dtoa_g_wxp8(double dxx, uint32_t min_width, char* start) {
   uint32_t xp10 = 0;
   char wbuf[16];
   char* wpos = wbuf;
@@ -3063,7 +3062,7 @@ char* double_g_writewx8(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround7(dxx, banker_round6, &quotient, &remainder);
-    wpos = uint32_write1p7(wpos, quotient, remainder);
+    wpos = qrtoa_1p7(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder + 5 < min_width) {
@@ -3136,7 +3135,7 @@ char* double_g_writewx8(char* start, double dxx, uint32_t min_width) {
       xp10++;
     }
     double_bround7(dxx, banker_round6, &quotient, &remainder);
-    wpos = uint32_write1p7(wpos, quotient, remainder);
+    wpos = qrtoa_1p7(quotient, remainder, wpos);
     remainder = wpos - wbuf;
     if (xp10 >= 100) {
       if (remainder + 5 < min_width) {
@@ -3160,7 +3159,7 @@ char* double_g_writewx8(char* start, double dxx, uint32_t min_width) {
     return memcpya(start, &(digit2_table[xp10 * 2]), 2);
   } else {
     if (dxx >= 0.99999999499999) {
-      wpos = double_write8(wpos, dxx);
+      wpos = dtoa_so8(dxx, wpos);
     } else {
       // 8 sig fig decimal, no less than ~0.0001
       wpos = memcpya(wpos, "0.", 2);
@@ -3172,7 +3171,7 @@ char* double_g_writewx8(char* start, double dxx, uint32_t min_width) {
 	dxx *= 10;
 	*wpos++ = '0';
       }
-      wpos = uint32_write8trunc(wpos, double_bround(dxx * 100000000, banker_round6));
+      wpos = uitoa_trunc8(double_bround(dxx * 100000000, banker_round6), wpos);
     }
     remainder = wpos - wbuf;
     if (remainder < min_width) {
@@ -3184,7 +3183,7 @@ char* double_g_writewx8(char* start, double dxx, uint32_t min_width) {
   }
 }
 
-char* chrom_print_human(char* buf, uint32_t num) {
+char* chrom_print_human(uint32_t num, char* buf) {
   uint32_t n10;
   if (num < 10) {
     *buf = '0' + num;
@@ -3211,9 +3210,7 @@ char* chrom_print_human(char* buf, uint32_t num) {
   }
 }
 
-uint32_t allele_set(char** allele_ptr, const char* newval, uint32_t slen) {
-  // newval does not need to be null-terminated, and slen does not include
-  // terminator
+uint32_t allele_set(const char* newval, uint32_t slen, char** allele_ptr) {
   char* newptr;
   if (slen == 1) {
     newptr = (char*)(&(g_one_char_strs[((unsigned char)*newval) * 2]));
@@ -3228,7 +3225,7 @@ uint32_t allele_set(char** allele_ptr, const char* newval, uint32_t slen) {
   return 0;
 }
 
-uint32_t allele_reset(char** allele_ptr, const char* newval, uint32_t slen) {
+uint32_t allele_reset(const char* newval, uint32_t slen, char** allele_ptr) {
   char* newptr;
   if (slen == 1) {
     newptr = (char*)(&(g_one_char_strs[((unsigned char)*newval) * 2]));
@@ -3246,10 +3243,11 @@ uint32_t allele_reset(char** allele_ptr, const char* newval, uint32_t slen) {
   return 0;
 }
 
-void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* pre_shiftp, uint32_t* post_shiftp, uint32_t* incrp) {
+void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp) {
   // Enables fast integer division by a constant not known until runtime.  See
   // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html .
   // Assumes divisor is not zero, of course.
+  // May want to populate a struct instead.
   uint32_t down_multiplier = 0;
   uint32_t down_exponent = 0;
   uint32_t has_magic_down = 0;
@@ -3305,248 +3303,359 @@ void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* pre_shiftp, uint32_t
   }
 }
 
-void fill_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len) {
-  // requires bit_arr to be nonempty
+void fill_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr) {
+  assert(len);
   uintptr_t maj_start = loc_start / BITCT;
   uintptr_t maj_end = (loc_start + len) / BITCT;
   uintptr_t minor;
   if (maj_start == maj_end) {
-    bit_arr[maj_start] |= (ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT));
+    bitarr[maj_start] |= (ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT));
   } else {
-    bit_arr[maj_start] |= ~((ONELU << (loc_start % BITCT)) - ONELU);
-    fill_ulong_one(&(bit_arr[maj_start + 1]), maj_end - maj_start - 1);
+    bitarr[maj_start] |= ~((ONELU << (loc_start % BITCT)) - ONELU);
+    fill_ulong_one(&(bitarr[maj_start + 1]), maj_end - maj_start - 1);
     minor = (loc_start + len) % BITCT;
     if (minor) {
-      bit_arr[maj_end] |= (ONELU << minor) - ONELU;
+      bitarr[maj_end] |= (ONELU << minor) - ONELU;
     }
   }
 }
 
-void clear_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len) {
-  // requires bit_arr to be nonempty
+void clear_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr) {
+  assert(len);
   uintptr_t maj_start = loc_start / BITCT;
   uintptr_t maj_end = (loc_start + len) / BITCT;
   uintptr_t minor;
   if (maj_start == maj_end) {
-    bit_arr[maj_start] &= ~((ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT)));
+    bitarr[maj_start] &= ~((ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT)));
   } else {
-    bit_arr[maj_start] &= ((ONELU << (loc_start % BITCT)) - ONELU);
-    fill_ulong_zero(&(bit_arr[maj_start + 1]), maj_end - maj_start - 1);
+    bitarr[maj_start] &= ((ONELU << (loc_start % BITCT)) - ONELU);
+    fill_ulong_zero(&(bitarr[maj_start + 1]), maj_end - maj_start - 1);
     minor = (loc_start + len) % BITCT;
     if (minor) {
-      bit_arr[maj_end] &= ~((ONELU << minor) - ONELU);
+      bitarr[maj_end] &= ~((ONELU << minor) - ONELU);
     }
   }
 }
 
-uint32_t next_unset_unsafe(uintptr_t* bit_arr, uint32_t loc) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
+uint32_t next_unset_unsafe(const uintptr_t* bitarr, uint32_t loc) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
   if (ulii) {
     return loc + CTZLU(ulii);
   }
   do {
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (ulii == ~ZEROLU);
-  return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(~ulii);
+  return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii);
 }
 
 #ifdef __LP64__
-uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
+uintptr_t next_unset_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
   if (ulii) {
     return loc + CTZLU(ulii);
   }
   do {
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (ulii == ~ZEROLU);
-  return (((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(~ulii));
+  return (((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii));
 }
 #endif
 
-uint32_t next_unset(uintptr_t* bit_arr, uint32_t loc, uint32_t ceil) {
-  // safe version.  ceil >= 1.
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
-  uintptr_t* bit_arr_last;
+uint32_t next_unset(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
+  // safe version.
+  assert(ceil >= 1);
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
+  const uintptr_t* bitarr_last;
   if (ulii) {
     loc += CTZLU(ulii);
     return MINV(loc, ceil);
   }
-  bit_arr_last = &(bit_arr[(ceil - 1) / BITCT]);
+  bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
   do {
-    if (bit_arr_ptr >= bit_arr_last) {
+    if (bitarr_ptr >= bitarr_last) {
       return ceil;
     }
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (ulii == ~ZEROLU);
-  loc = ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(~ulii);
+  loc = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii);
   return MINV(loc, ceil);
 }
 
 #ifdef __LP64__
-uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
-  uintptr_t* bit_arr_last;
+uintptr_t next_unset_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
+  const uintptr_t* bitarr_last;
   if (ulii) {
     ulii = loc + CTZLU(ulii);
     return MINV(ulii, ceil);
   }
-  bit_arr_last = &(bit_arr[(ceil - 1) / BITCT]);
+  bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
   do {
-    if (bit_arr_ptr >= bit_arr_last) {
+    if (bitarr_ptr >= bitarr_last) {
       return ceil;
     }
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (ulii == ~ZEROLU);
-  ulii = ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(~ulii);
+  ulii = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii);
   return MINV(ulii, ceil);
 }
 #endif
 
-uint32_t next_set_unsafe(uintptr_t* bit_arr, uint32_t loc) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
+uint32_t next_set_unsafe(const uintptr_t* bitarr, uint32_t loc) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
   if (ulii) {
     return loc + CTZLU(ulii);
   }
   do {
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (!ulii);
-  return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(ulii);
+  return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
 }
 
 #ifdef __LP64__
-uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
+uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
   if (ulii) {
     return loc + CTZLU(ulii);
   }
   do {
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (!ulii);
-  return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(ulii);
+  return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
 }
 #endif
 
-uint32_t next_set(uintptr_t* bit_arr, uint32_t loc, uint32_t ceil) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
-  uintptr_t* bit_arr_last;
+uint32_t next_set(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
+  const uintptr_t* bitarr_last;
   uint32_t rval;
   if (ulii) {
     rval = loc + CTZLU(ulii);
     return MINV(rval, ceil);
   }
-  bit_arr_last = &(bit_arr[(ceil - 1) / BITCT]);
+  bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
   do {
-    if (bit_arr_ptr >= bit_arr_last) {
+    if (bitarr_ptr >= bitarr_last) {
       return ceil;
     }
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (!ulii);
-  rval = ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(ulii);
+  rval = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
   return MINV(rval, ceil);
 }
 
 #ifdef __LP64__
-uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
-  uintptr_t* bit_arr_last;
+uintptr_t next_set_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
+  const uintptr_t* bitarr_last;
   if (ulii) {
     ulii = loc + CTZLU(ulii);
     return MINV(ulii, ceil);
   }
-  bit_arr_last = &(bit_arr[(ceil - 1) / BITCT]);
+  bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
   do {
-    if (bit_arr_ptr >= bit_arr_last) {
+    if (bitarr_ptr >= bitarr_last) {
       return ceil;
     }
-    ulii = *(++bit_arr_ptr);
+    ulii = *(++bitarr_ptr);
   } while (!ulii);
-  ulii = ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(ulii);
+  ulii = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
   return MINV(ulii, ceil);
 }
 #endif
 
-int32_t last_set_bit(uintptr_t* bit_arr, uint32_t word_ct) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[word_ct]);
+int32_t last_set_bit(const uintptr_t* bitarr, uint32_t word_ct) {
+  const uintptr_t* bitarr_ptr = &(bitarr[word_ct]);
   uintptr_t ulii;
   do {
-    ulii = *(--bit_arr_ptr);
+    ulii = *(--bitarr_ptr);
     if (ulii) {
-      return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + BITCT - 1 - CLZLU(ulii);
+      return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
     }
-  } while (bit_arr_ptr > bit_arr);
+  } while (bitarr_ptr > bitarr);
   return -1;
 }
 
-int32_t last_clear_bit(uintptr_t* bit_arr, uint32_t ceil) {
-  // can return ceil or any lower number
-  uintptr_t* bit_arr_ptr = &(bit_arr[ceil / BITCT]);
-  uint32_t remainder = ceil % BITCT;
-  uintptr_t ulii;
-  if (remainder) {
-    ulii = (~(*bit_arr_ptr)) & ((ONELU << remainder) - ONELU);
-    if (ulii) {
-      return (ceil | (BITCT - 1)) - CLZLU(ulii);
-    }
+int32_t last_clear_bit(const uintptr_t* bitarr, uint32_t ceil) {
+  // can return ceil or any lower number
+  const uintptr_t* bitarr_ptr = &(bitarr[ceil / BITCT]);
+  uint32_t remainder = ceil % BITCT;
+  uintptr_t ulii;
+  if (remainder) {
+    ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
+    if (ulii) {
+      return (ceil | (BITCT - 1)) - CLZLU(ulii);
+    }
+  }
+  while (bitarr_ptr > bitarr) {
+    ulii = ~(*(--bitarr_ptr));
+    if (ulii) {
+      return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
+    }
+  }
+  return -1;
+}
+
+uint32_t prev_unset_unsafe(const uintptr_t* bitarr, uint32_t loc) {
+  // unlike the next_{un}set family, this always returns a STRICTLY earlier
+  // position
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uint32_t remainder = loc % BITCT;
+  uintptr_t ulii;
+  if (remainder) {
+    ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
+    if (ulii) {
+      return (loc | (BITCT - 1)) - CLZLU(ulii);
+    }
+  }
+  do {
+    ulii = ~(*(--bitarr_ptr));
+  } while (!ulii);
+  return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
+}
+
+/*
+uint32_t prev_unset(uintptr_t* bitarr, uint32_t loc, uint32_t floor) {
+  uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
+  uint32_t remainder = loc % BITCT;
+  uintptr_t* bitarr_first;
+  uintptr_t ulii;
+  if (remainder) {
+    ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
+    if (ulii) {
+      loc = (loc | (BITCT - 1)) - CLZLU(ulii);
+      return MAXV(loc, floor);
+    }
+  }
+  bitarr_first = &(bitarr[floor / BITCT]);
+  do {
+    if (bitarr_ptr == bitarr_first) {
+      return floor;
+    }
+    ulii = ~(*(--bitarr_ptr));
+  } while (!ulii);
+  loc = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
+  return MAXV(loc, floor);
+}
+*/
+
+
+int32_t bigstack_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
+  *ucp_ptr = (unsigned char*)bigstack_alloc(ct);
+  if (!(*ucp_ptr)) {
+    return 1;
+  }
+  memset(*ucp_ptr, 0, ct);
+  return 0;
+}
+
+int32_t bigstack_calloc_d(uintptr_t ct, double** dp_ptr) {
+  *dp_ptr = (double*)bigstack_alloc(ct * sizeof(double));
+  if (!(*dp_ptr)) {
+    return 1;
+  }
+  fill_double_zero(*dp_ptr, ct);
+  return 0;
+}
+
+int32_t bigstack_calloc_f(uintptr_t ct, float** fp_ptr) {
+  *fp_ptr = (float*)bigstack_alloc(ct * sizeof(float));
+  if (!(*fp_ptr)) {
+    return 1;
+  }
+  fill_float_zero(*fp_ptr, ct);
+  return 0;
+}
+
+int32_t bigstack_calloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
+  *uip_ptr = (uint32_t*)bigstack_alloc(ct * sizeof(int32_t));
+  if (!(*uip_ptr)) {
+    return 1;
+  }
+  fill_uint_zero(*uip_ptr, ct);
+  return 0;
+}
+
+int32_t bigstack_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
+  *ulp_ptr = (uintptr_t*)bigstack_alloc(ct * sizeof(intptr_t));
+  if (!(*ulp_ptr)) {
+    return 1;
+  }
+  fill_ulong_zero(*ulp_ptr, ct);
+  return 0;
+}
+
+int32_t bigstack_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
+  *ullp_ptr = (uint64_t*)bigstack_alloc(ct * sizeof(int64_t));
+  if (!(*ullp_ptr)) {
+    return 1;
+  }
+  fill_ull_zero(*ullp_ptr, ct);
+  return 0;
+}
+
+int32_t bigstack_end_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
+  *ucp_ptr = (unsigned char*)bigstack_end_alloc(ct);
+  if (!(*ucp_ptr)) {
+    return 1;
+  }
+  memset(*ucp_ptr, 0, ct);
+  return 0;
+}
+
+int32_t bigstack_end_calloc_d(uintptr_t ct, double** dp_ptr) {
+  *dp_ptr = (double*)bigstack_end_alloc(ct * sizeof(double));
+  if (!(*dp_ptr)) {
+    return 1;
   }
-  while (bit_arr_ptr > bit_arr) {
-    ulii = ~(*(--bit_arr_ptr));
-    if (ulii) {
-      return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + BITCT - 1 - CLZLU(ulii);
-    }
+  fill_double_zero(*dp_ptr, ct);
+  return 0;
+}
+
+int32_t bigstack_end_calloc_f(uintptr_t ct, float** fp_ptr) {
+  *fp_ptr = (float*)bigstack_end_alloc(ct * sizeof(float));
+  if (!(*fp_ptr)) {
+    return 1;
   }
-  return -1;
+  fill_float_zero(*fp_ptr, ct);
+  return 0;
 }
 
-uint32_t prev_unset_unsafe(uintptr_t* bit_arr, uint32_t loc) {
-// unlike the next_{un}set family, this always returns a STRICTLY earlier
-// position
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uint32_t remainder = loc % BITCT;
-  uintptr_t ulii;
-  if (remainder) {
-    ulii = (~(*bit_arr_ptr)) & ((ONELU << remainder) - ONELU);
-    if (ulii) {
-      return (loc | (BITCT - 1)) - CLZLU(ulii);
-    }
+int32_t bigstack_end_calloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
+  *uip_ptr = (uint32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
+  if (!(*uip_ptr)) {
+    return 1;
   }
-  do {
-    ulii = ~(*(--bit_arr_ptr));
-  } while (!ulii);
-  return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + BITCT - 1 - CLZLU(ulii);
+  fill_uint_zero(*uip_ptr, ct);
+  return 0;
 }
 
-/*
-uint32_t prev_unset(uintptr_t* bit_arr, uint32_t loc, uint32_t floor) {
-  uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
-  uint32_t remainder = loc % BITCT;
-  uintptr_t* bit_arr_first;
-  uintptr_t ulii;
-  if (remainder) {
-    ulii = (~(*bit_arr_ptr)) & ((ONELU << remainder) - ONELU);
-    if (ulii) {
-      loc = (loc | (BITCT - 1)) - CLZLU(ulii);
-      return MAXV(loc, floor);
-    }
+int32_t bigstack_end_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
+  *ulp_ptr = (uintptr_t*)bigstack_end_alloc(ct * sizeof(intptr_t));
+  if (!(*ulp_ptr)) {
+    return 1;
   }
-  bit_arr_first = &(bit_arr[floor / BITCT]);
-  do {
-    if (bit_arr_ptr == bit_arr_first) {
-      return floor;
-    }
-    ulii = ~(*(--bit_arr_ptr));
-  } while (!ulii);
-  loc = ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + BITCT - 1 - CLZLU(ulii);
-  return MAXV(loc, floor);
+  fill_ulong_zero(*ulp_ptr, ct);
+  return 0;
 }
-*/
+
+int32_t bigstack_end_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
+  *ullp_ptr = (uint64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
+  if (!(*ullp_ptr)) {
+    return 1;
+  }
+  fill_ull_zero(*ullp_ptr, ct);
+  return 0;
+}
+
 
 // MurmurHash3, from
 // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
@@ -3669,22 +3778,24 @@ uintptr_t geqprime(uintptr_t floor) {
   return floor;
 }
 
-int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t* id_htable, uint32_t id_htable_size) {
+int32_t populate_id_htable(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t id_htable_size, uint32_t* id_htable) {
   // While unique IDs are normally assumed (and enforced) here, --extract and
   // --exclude are an exception, since we want to be able to e.g. exclude all
   // variants named '.'.  Since there could be millions of them, ordinary
   // O(n^2) hash table duplicate resolution is unacceptably slow; instead, we
   // allocate additional linked lists past the end of id_htable to track all
   // unfiltered indexes of duplicate names.  (This requires the
-  // alloc_and_populate_id_htable interface; top_alloc doesn't work there.)
+  // alloc_and_populate_id_htable interface; bigstack_end_alloc doesn't work
+  // there.)
   uintptr_t item_uidx = 0;
   uint32_t extra_alloc = 0;
   uint32_t prev_llidx = 0;
   // needs to be synced with extract_exclude_flag_norange()
-  uint32_t* extra_alloc_base = (uint32_t*)wkspace_base;
+  uint32_t* extra_alloc_base = (uint32_t*)g_bigstack_base;
   uint32_t item_idx = 0;
   const char* sptr;
   uintptr_t prev_uidx;
+  uintptr_t cur_bigstack_left;
   uint32_t max_extra_alloc;
   uint32_t slen;
   uint32_t hashval;
@@ -3722,14 +3833,15 @@ int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uint
       }
     }
   } else {
+    cur_bigstack_left = bigstack_left();
 #ifdef __LP64__
-    if (wkspace_left >= 0x400000000LLU) {
+    if (cur_bigstack_left >= 0x400000000LLU) {
       max_extra_alloc = 0xfffffffeU;
     } else {
-      max_extra_alloc = wkspace_left / sizeof(int32_t);
+      max_extra_alloc = cur_bigstack_left / sizeof(int32_t);
     }
 #else
-    max_extra_alloc = wkspace_left / sizeof(int32_t);
+    max_extra_alloc = cur_bigstack_left / sizeof(int32_t);
 #endif
     for (; item_idx < item_ct; item_uidx++, item_idx++) {
       next_unset_ul_unsafe_ck(exclude_arr, &item_uidx);
@@ -3778,7 +3890,7 @@ int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uint
       }
     }
     if (extra_alloc) {
-      wkspace_alloc(extra_alloc * sizeof(int32_t));
+      bigstack_alloc(extra_alloc * sizeof(int32_t));
     }
   }
   return 0;
@@ -3814,7 +3926,7 @@ uint32_t id_htable_find(const char* id_buf, uintptr_t cur_id_len, const uint32_t
   }
 }
 
-void fill_idx_to_uidx(uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
+void fill_idx_to_uidx(const uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
   uint32_t* idx_to_uidx_end = &(idx_to_uidx[item_ct]);
   uint32_t item_uidx = 0;
   uint32_t item_uidx_stop;
@@ -3827,7 +3939,7 @@ void fill_idx_to_uidx(uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uint
   }
 }
 
-void fill_idx_to_uidx_incl(uintptr_t* include_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
+void fill_idx_to_uidx_incl(const uintptr_t* include_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
   uint32_t* idx_to_uidx_end = &(idx_to_uidx[item_ct]);
   uint32_t item_uidx = 0;
   uint32_t item_uidx_stop;
@@ -3840,7 +3952,7 @@ void fill_idx_to_uidx_incl(uintptr_t* include_arr, uintptr_t unfiltered_item_ct,
   }
 }
 
-void fill_uidx_to_idx(uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
+void fill_uidx_to_idx(const uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
   uint32_t item_uidx = 0;
   uint32_t item_idx = 0;
   uint32_t* uidx_to_idx_ptr;
@@ -3856,7 +3968,7 @@ void fill_uidx_to_idx(uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint3
   }
 }
 
-void fill_uidx_to_idx_incl(uintptr_t* include_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
+void fill_uidx_to_idx_incl(const uintptr_t* include_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
   uint32_t item_uidx = 0;
   uint32_t item_idx = 0;
   uint32_t* uidx_to_idx_ptr;
@@ -3872,7 +3984,7 @@ void fill_uidx_to_idx_incl(uintptr_t* include_arr, uint32_t unfiltered_item_ct,
   }
 }
 
-void fill_midx_to_idx(uintptr_t* exclude_arr_orig, uintptr_t* exclude_arr, uint32_t item_ct, uint32_t* midx_to_idx) {
+void fill_midx_to_idx(const uintptr_t* exclude_arr_orig, const uintptr_t* exclude_arr, uint32_t item_ct, uint32_t* midx_to_idx) {
   // Assumes item_ct is nonzero.
 
   // May want to switch to alternate behavior: when current midx is excluded,
@@ -3888,12 +4000,12 @@ void fill_midx_to_idx(uintptr_t* exclude_arr_orig, uintptr_t* exclude_arr, uint3
   }
 }
 
-void fill_vec_55(uintptr_t* vec, uint32_t ct) {
+void fill_quatervec_55(uint32_t ct, uintptr_t* quatervec) {
   uint32_t rem = ct & (BITCT - 1);
 #ifdef __LP64__
   const __m128i m1 = {FIVEMASK, FIVEMASK};
-  __m128i* vecp = (__m128i*)vec;
-  __m128i* vec_end = (__m128i*)(&(vec[2 * (ct / BITCT)]));
+  __m128i* vecp = (__m128i*)quatervec;
+  __m128i* vec_end = (__m128i*)(&(quatervec[2 * (ct / BITCT)]));
   uintptr_t* second_to_last;
   while (vecp < vec_end) {
     *vecp++ = m1;
@@ -3909,26 +4021,26 @@ void fill_vec_55(uintptr_t* vec, uint32_t ct) {
     }
   }
 #else
-  uintptr_t* vec_end = &(vec[2 * (ct / BITCT)]);
-  while (vec < vec_end) {
-    *vec++ = FIVEMASK;
+  uintptr_t* vec_end = &(quatervec[2 * (ct / BITCT)]);
+  while (quatervec < vec_end) {
+    *quatervec++ = FIVEMASK;
   }
   if (rem) {
     if (rem > BITCT2) {
-      vec[0] = FIVEMASK;
-      vec[1] = FIVEMASK >> ((BITCT - rem) * 2);
+      quatervec[0] = FIVEMASK;
+      quatervec[1] = FIVEMASK >> ((BITCT - rem) * 2);
     } else {
-      vec[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
-      vec[1] = 0;
+      quatervec[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
+      quatervec[1] = 0;
     }
   }
 #endif
 }
 
-void vec_collapse_init(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uintptr_t* filter_bitarr, uint32_t filtered_ct, uintptr_t* output_vec) {
-  // Used to unpack e.g. unfiltered sex_male to a filtered 2-bit vector usable
-  // as a raw input bitmask.
-  // Assumes output_vec is sized to a multiple of 16 bytes.
+void quaterarr_collapse_init(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr) {
+  // Used to unpack e.g. unfiltered sex_male to a filtered quaterarr usable as
+  // a raw input bitmask.
+  // Assumes output_quaterarr is sized to a multiple of 16 bytes.
   uintptr_t cur_write = 0;
   uint32_t item_uidx = 0;
   uint32_t write_bit = 0;
@@ -3941,21 +4053,21 @@ void vec_collapse_init(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uin
     do {
       cur_write |= ((unfiltered_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << (write_bit * 2);
       if (++write_bit == BITCT2) {
-	*output_vec++ = cur_write;
+	*output_quaterarr++ = cur_write;
         cur_write = 0;
 	write_bit = 0;
       }
     } while (++item_uidx < item_uidx_stop);
   }
   if (write_bit) {
-    *output_vec++ = cur_write;
+    *output_quaterarr++ = cur_write;
   }
   if ((filtered_ct + (BITCT2 - 1)) & BITCT2) {
-    *output_vec = 0;
+    *output_quaterarr = 0;
   }
 }
 
-void vec_collapse_init_exclude(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uintptr_t* filter_exclude_bitarr, uint32_t filtered_ct, uintptr_t* output_vec) {
+void quaterarr_collapse_init_exclude(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_exclude_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr) {
   uintptr_t cur_write = 0;
   uint32_t item_uidx = 0;
   uint32_t write_bit = 0;
@@ -3968,49 +4080,49 @@ void vec_collapse_init_exclude(uintptr_t* unfiltered_bitarr, uint32_t unfiltered
     do {
       cur_write |= ((unfiltered_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << (write_bit * 2);
       if (++write_bit == BITCT2) {
-	*output_vec++ = cur_write;
+	*output_quaterarr++ = cur_write;
         cur_write = 0;
 	write_bit = 0;
       }
     } while (++item_uidx < item_uidx_stop);
   }
   if (write_bit) {
-    *output_vec++ = cur_write;
+    *output_quaterarr++ = cur_write;
   }
   if ((filtered_ct + (BITCT2 - 1)) & BITCT2) {
-    *output_vec = 0;
+    *output_quaterarr = 0;
   }
 }
 
-uint32_t alloc_collapsed_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_include2_ptr, uintptr_t** sample_male_include2_ptr) {
-  uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
+uint32_t alloc_collapsed_haploid_filters(const uintptr_t* __restrict sample_bitarr, const uintptr_t* __restrict sex_male, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t** sample_include_quatervec_ptr, uintptr_t** sample_male_include_quatervec_ptr) {
+  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
     // if already allocated, we assume this is fully initialized
-    if (!(*sample_include2_ptr)) {
-      if (wkspace_alloc_ul_checked(sample_include2_ptr, sample_ctv2 * sizeof(intptr_t))) {
+    if (!(*sample_include_quatervec_ptr)) {
+      if (bigstack_alloc_ul(sample_ctv2, sample_include_quatervec_ptr)) {
 	return 1;
       }
-      fill_vec_55(*sample_include2_ptr, sample_ct);
+      fill_quatervec_55(sample_ct, *sample_include_quatervec_ptr);
     }
   }
   if (hh_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
-    // if already allocated, we assume it's been top_alloc'd but not
+    // if already allocated, we assume it's been bigstack_end_alloc'd but not
     // initialized
-    if (!(*sample_male_include2_ptr)) {
-      if (wkspace_alloc_ul_checked(sample_male_include2_ptr, sample_ctv2 * sizeof(intptr_t))) {
+    if (!(*sample_male_include_quatervec_ptr)) {
+      if (bigstack_alloc_ul(sample_ctv2, sample_male_include_quatervec_ptr)) {
 	return 1;
       }
     }
     if (is_include) {
-      vec_collapse_init(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include2_ptr);
+      quaterarr_collapse_init(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include_quatervec_ptr);
     } else {
-      vec_collapse_init_exclude(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include2_ptr);
+      quaterarr_collapse_init_exclude(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include_quatervec_ptr);
     }
   }
   return 0;
 }
 
-void sample_delim_convert(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uint32_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, char oldc, char newc) {
+void sample_delim_convert(uintptr_t unfiltered_sample_ct, const uintptr_t* sample_exclude, uint32_t sample_ct, uintptr_t max_sample_id_len, char oldc, char newc, char* sample_ids) {
   // assumes there is exactly one delimiter to convert per name
   uintptr_t sample_uidx = 0;
   uint32_t sample_idx;
@@ -4022,34 +4134,34 @@ void sample_delim_convert(uintptr_t unfiltered_sample_ct, uintptr_t* sample_excl
   }
 }
 
-void get_set_wrange_align(uintptr_t* bitfield, uintptr_t word_ct, uintptr_t* firstw_ptr, uintptr_t* wlen_ptr) {
-  uintptr_t* bitfield_ptr = bitfield;
-  uintptr_t* bitfield_end = &(bitfield[word_ct]);
+void get_set_wrange_align(const uintptr_t* __restrict bitarr, uintptr_t word_ct, uintptr_t* __restrict firstw_ptr, uintptr_t* __restrict wlen_ptr) {
+  const uintptr_t* bitarr_ptr = bitarr;
+  const uintptr_t* bitarr_end = &(bitarr[word_ct]);
 #ifdef __LP64__
-  uintptr_t* bitfield_end2 = &(bitfield[word_ct & (~ONELU)]);
-  while (bitfield_ptr < bitfield_end2) {
-    if (bitfield_ptr[0] || bitfield_ptr[1]) {
-      *firstw_ptr = (uintptr_t)(bitfield_ptr - bitfield);
-      while (!(*(--bitfield_end)));
-      *wlen_ptr = 1 + (uintptr_t)(bitfield_end - bitfield_ptr);
+  const uintptr_t* bitarr_end2 = &(bitarr[word_ct & (~ONELU)]);
+  while (bitarr_ptr < bitarr_end2) {
+    if (bitarr_ptr[0] || bitarr_ptr[1]) {
+      *firstw_ptr = (uintptr_t)(bitarr_ptr - bitarr);
+      while (!(*(--bitarr_end)));
+      *wlen_ptr = 1 + (uintptr_t)(bitarr_end - bitarr_ptr);
       return;
     }
-    bitfield_ptr = &(bitfield_ptr[2]);
+    bitarr_ptr = &(bitarr_ptr[2]);
   }
-  if ((bitfield_end2 != bitfield_end) && (*bitfield_end2)) {
+  if ((bitarr_end2 != bitarr_end) && (*bitarr_end2)) {
     *firstw_ptr = word_ct - 1;
     *wlen_ptr = 1;
     return;
   }
 #else
-  while (bitfield_ptr < bitfield_end) {
-    if (*bitfield_ptr) {
-      *firstw_ptr = (uintptr_t)(bitfield_ptr - bitfield);
-      while (!(*(--bitfield_end)));
-      *wlen_ptr = 1 + (uintptr_t)(bitfield_end - bitfield_ptr);
+  while (bitarr_ptr < bitarr_end) {
+    if (*bitarr_ptr) {
+      *firstw_ptr = (uintptr_t)(bitarr_ptr - bitarr);
+      while (!(*(--bitarr_end)));
+      *wlen_ptr = 1 + (uintptr_t)(bitarr_end - bitarr_ptr);
       return;
     }
-    bitfield_ptr++;
+    bitarr_ptr++;
   }
 #endif
   *firstw_ptr = 0;
@@ -4061,7 +4173,7 @@ void get_set_wrange_align(uintptr_t* bitfield, uintptr_t word_ct, uintptr_t* fir
 const char* g_species_singular = NULL;
 const char* g_species_plural = NULL;
 
-char* chrom_name_std(char* buf, Chrom_info* chrom_info_ptr, uint32_t chrom_idx) {
+char* chrom_name_std(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, char* buf) {
   uint32_t output_encoding = chrom_info_ptr->output_encoding;
   if (output_encoding & (CHR_OUTPUT_PREFIX | CHR_OUTPUT_0M)) {
     if (output_encoding == CHR_OUTPUT_0M) {
@@ -4083,7 +4195,7 @@ char* chrom_name_std(char* buf, Chrom_info* chrom_info_ptr, uint32_t chrom_idx)
     buf = memcpyl3a(buf, "chr");
   }
   if ((!(output_encoding & (CHR_OUTPUT_M | CHR_OUTPUT_MT))) || (chrom_idx <= chrom_info_ptr->autosome_ct)) {
-    return uint32_write(buf, chrom_idx);
+    return uint32toa(chrom_idx, buf);
   } else if ((int32_t)chrom_idx == chrom_info_ptr->x_code) {
     *buf++ = 'X';
   } else if ((int32_t)chrom_idx == chrom_info_ptr->y_code) {
@@ -4099,13 +4211,13 @@ char* chrom_name_std(char* buf, Chrom_info* chrom_info_ptr, uint32_t chrom_idx)
   return buf;
 }
 
-char* chrom_name_write(char* buf, Chrom_info* chrom_info_ptr, uint32_t chrom_idx) {
+char* chrom_name_write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, char* buf) {
   // assumes chrom_idx is valid
   if (!chrom_idx) {
     *buf++ = '0';
     return buf;
   } else if (chrom_idx <= chrom_info_ptr->max_code) {
-    return chrom_name_std(buf, chrom_info_ptr, chrom_idx);
+    return chrom_name_std(chrom_info_ptr, chrom_idx, buf);
   } else if (chrom_info_ptr->zero_extra_chroms) {
     *buf++ = '0';
     return buf;
@@ -4114,16 +4226,16 @@ char* chrom_name_write(char* buf, Chrom_info* chrom_info_ptr, uint32_t chrom_idx
   }
 }
 
-char* chrom_name_buf5w4write(char* buf5, Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t* chrom_name_len_ptr) {
+char* chrom_name_buf5w4write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t* chrom_name_len_ptr, char* buf5) {
   uint32_t slen;
   *chrom_name_len_ptr = 4;
   if (!chrom_idx) {
     memcpy(buf5, "   0", 4);
   } else if (chrom_idx <= chrom_info_ptr->max_code) {
     if (chrom_info_ptr->output_encoding & CHR_OUTPUT_PREFIX) {
-      *chrom_name_len_ptr = (uintptr_t)(chrom_name_std(buf5, chrom_info_ptr, chrom_idx) - buf5);
+      *chrom_name_len_ptr = (uintptr_t)(chrom_name_std(chrom_info_ptr, chrom_idx, buf5) - buf5);
     } else {
-      width_force(4, buf5, chrom_name_std(buf5, chrom_info_ptr, chrom_idx));
+      width_force(4, buf5, chrom_name_std(chrom_info_ptr, chrom_idx, buf5));
     }
   } else if (chrom_info_ptr->zero_extra_chroms) {
     memcpy(buf5, "   0", 4);
@@ -4139,7 +4251,7 @@ char* chrom_name_buf5w4write(char* buf5, Chrom_info* chrom_info_ptr, uint32_t ch
   return buf5;
 }
 
-uint32_t get_max_chrom_len(Chrom_info* chrom_info_ptr) {
+uint32_t get_max_chrom_len(const Chrom_info* chrom_info_ptr) {
   // does not include trailing null
   // can be overestimate
   // if more functions start calling this, it should just be built into
@@ -4183,9 +4295,9 @@ void forget_extra_chrom_names(Chrom_info* chrom_info_ptr) {
   }
 }
 
-uint32_t haploid_chrom_present(Chrom_info* chrom_info_ptr) {
-  uintptr_t* chrom_mask = chrom_info_ptr->chrom_mask;
-  uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
+uint32_t haploid_chrom_present(const Chrom_info* chrom_info_ptr) {
+  const uintptr_t* chrom_mask = chrom_info_ptr->chrom_mask;
+  const uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
   uint32_t uii;
   for (uii = 0; uii < CHROM_MASK_INITIAL_WORDS; uii++) {
     if (chrom_mask[uii] & haploid_mask[uii]) {
@@ -4195,13 +4307,13 @@ uint32_t haploid_chrom_present(Chrom_info* chrom_info_ptr) {
   return 0;
 }
 
-uint32_t bsearch_str_idx(const char* sptr, uint32_t slen, char** str_array, uint32_t* str_sorted_idxs, uint32_t end_idx, uint32_t* gt_ptr) {
+uint32_t bsearch_str_idx(const char* sptr, uint32_t slen, char* const* str_array, const uint32_t* __restrict str_sorted_idxs, uint32_t end_idx, uint32_t* __restrict gt_ptr) {
   // return 0 on success, 1 on failure
   // *gt_ptr is number of strings current string is lexicographically after
   // (so, on success, it's the correct index, and on failure, it's the
   // insertion point)
   uint32_t start_idx = 0;
-  char* sptr2;
+  const char* sptr2;
   uint32_t mid_idx;
   uint32_t slen2;
   int32_t ii;
@@ -4245,7 +4357,7 @@ static inline int32_t single_letter_chrom(uint32_t letter) {
   }
 }
 
-int32_t get_chrom_code_raw(char* sptr) {
+int32_t get_chrom_code_raw(const char* sptr) {
   // any character <= ' ' is considered a terminator
   // note that char arithmetic tends to be compiled to int32 operations, so we
   // mostly work with ints here
@@ -4299,7 +4411,7 @@ int32_t get_chrom_code_raw(char* sptr) {
   return -1;
 }
 
-int32_t get_chrom_code(Chrom_info* chrom_info_ptr, char* sptr) {
+int32_t get_chrom_code(const Chrom_info* chrom_info_ptr, const char* sptr) {
   // does not require string to be null-terminated, and does not perform
   // exhaustive error-checking
   // -1 = total fail, -2 = --allow-extra-chr ok
@@ -4334,7 +4446,7 @@ int32_t get_chrom_code(Chrom_info* chrom_info_ptr, char* sptr) {
   return ii;
 }
 
-int32_t get_chrom_code2(Chrom_info* chrom_info_ptr, char* sptr, uint32_t slen) {
+int32_t get_chrom_code2(const Chrom_info* chrom_info_ptr, char* sptr, uint32_t slen) {
   // when the chromosome name doesn't end with a space
   char* s_end = &(sptr[slen]);
   char tmpc = *s_end;
@@ -4345,8 +4457,8 @@ int32_t get_chrom_code2(Chrom_info* chrom_info_ptr, char* sptr, uint32_t slen) {
   return retval;
 }
 
-uint32_t get_marker_chrom_fo_idx(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx) {
-  uint32_t* marker_binsearch = chrom_info_ptr->chrom_file_order_marker_idx;
+uint32_t get_marker_chrom_fo_idx(const Chrom_info* chrom_info_ptr, uintptr_t marker_uidx) {
+  const uint32_t* marker_binsearch = chrom_info_ptr->chrom_file_order_marker_idx;
   uint32_t chrom_fo_min = 0;
   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
   uint32_t chrom_fo_cur;
@@ -4361,22 +4473,22 @@ uint32_t get_marker_chrom_fo_idx(Chrom_info* chrom_info_ptr, uintptr_t marker_ui
   return chrom_fo_min;
 }
 
-int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int32_t* chrom_idx_ptr, uintptr_t line_idx, const char* file_descrip) {
+int32_t resolve_or_add_chrom_name(const char* cur_chrom_name, const char* file_descrip, uintptr_t line_idx, Chrom_info* chrom_info_ptr, int32_t* chrom_idx_ptr) {
   char** nonstd_names = chrom_info_ptr->nonstd_names;
   uint32_t* nonstd_name_order = chrom_info_ptr->nonstd_name_order;
   uint32_t max_code_p1 = chrom_info_ptr->max_code + 1;
   uint32_t name_ct = chrom_info_ptr->name_ct;
   uint32_t chrom_code_end = max_code_p1 + name_ct;
-  uint32_t slen = strlen_se(bufptr);
+  uint32_t slen = strlen_se(cur_chrom_name);
   Ll_str* name_stack_ptr = chrom_info_ptr->incl_excl_name_stack;
   uint32_t in_name_stack = 0;
   uint32_t chrom_idx;
   uint32_t slen2;
-  if (!bsearch_str_idx(bufptr, slen, &(nonstd_names[max_code_p1]), nonstd_name_order, chrom_info_ptr->name_ct, &chrom_idx)) {
+  if (!bsearch_str_idx(cur_chrom_name, slen, &(nonstd_names[max_code_p1]), nonstd_name_order, chrom_info_ptr->name_ct, &chrom_idx)) {
     *chrom_idx_ptr = (int32_t)(chrom_idx + max_code_p1);
     return 0;
   }
-  if (*bufptr == '#') {
+  if (*cur_chrom_name == '#') {
     // this breaks VCF and PLINK 2 binary
     logerrprint("Error: Chromosome/contig names may not begin with '#'.\n");
     return RET_INVALID_FORMAT;
@@ -4400,16 +4512,16 @@ int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int3
   while (name_stack_ptr) {
     // there shouldn't be many of these, so sorting is unimportant
     slen2 = strlen(name_stack_ptr->ss);
-    if ((slen == slen2) && (!memcmp(bufptr, name_stack_ptr->ss, slen))) {
+    if ((slen == slen2) && (!memcmp(cur_chrom_name, name_stack_ptr->ss, slen))) {
       in_name_stack = 1;
       break;
     }
     name_stack_ptr = name_stack_ptr->next;
   }
   if ((in_name_stack && chrom_info_ptr->is_include_stack) || ((!in_name_stack) && (!chrom_info_ptr->is_include_stack))) {
-    SET_BIT(chrom_info_ptr->chrom_mask, chrom_code_end);
+    SET_BIT(chrom_code_end, chrom_info_ptr->chrom_mask);
   }
-  memcpy(nonstd_names[chrom_code_end], bufptr, slen);
+  memcpy(nonstd_names[chrom_code_end], cur_chrom_name, slen);
   nonstd_names[chrom_code_end][slen] = '\0';
   *chrom_idx_ptr = (int32_t)chrom_code_end;
   for (slen2 = name_ct; slen2 > chrom_idx; slen2--) {
@@ -4420,7 +4532,7 @@ int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int3
   return 0;
 }
 
-void refresh_chrom_info(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, uint32_t* is_mt_ptr, uint32_t* is_haploid_ptr) {
+void refresh_chrom_info(const Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* __restrict chrom_end_ptr, uint32_t* __restrict chrom_fo_idx_ptr, uint32_t* __restrict is_x_ptr, uint32_t* __restrict is_y_ptr, uint32_t* __restrict is_mt_ptr, uint32_t* __restrict is_haploid_ptr) {
   // assumes marker_uidx < unfiltered_marker_ct
   int32_t chrom_idx;
   *chrom_end_ptr = chrom_info_ptr->chrom_file_order_marker_idx[(*chrom_fo_idx_ptr) + 1];
@@ -4434,7 +4546,7 @@ void refresh_chrom_info(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint3
   *is_haploid_ptr = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
 }
 
-int32_t single_chrom_start(Chrom_info* chrom_info_ptr, uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude) {
+int32_t single_chrom_start(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t unfiltered_marker_ct) {
   // Assumes there is at least one marker, and there are no split chromosomes.
   // Returns first marker_uidx in chromosome if there is only one, or -1 if
   // there's more than one chromosome.
@@ -4447,7 +4559,7 @@ int32_t single_chrom_start(Chrom_info* chrom_info_ptr, uint32_t unfiltered_marke
 }
 
 #ifdef __cplusplus
-double destructive_get_dmedian(double* unsorted_arr, uintptr_t len) {
+double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
   if (!len) {
     return 0.0;
   }
@@ -4461,7 +4573,7 @@ double destructive_get_dmedian(double* unsorted_arr, uintptr_t len) {
   }
 }
 #else
-double get_dmedian(double* sorted_arr, uintptr_t len) {
+double get_dmedian(const double* sorted_arr, uintptr_t len) {
   if (len) {
     if (len % 2) {
       return sorted_arr[len / 2];
@@ -4473,7 +4585,7 @@ double get_dmedian(double* sorted_arr, uintptr_t len) {
   }
 }
 
-double destructive_get_dmedian(double* unsorted_arr, uintptr_t len) {
+double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
   // no, I'm not gonna bother reimplementing introselect just for folks who
   // insist on using gcc over g++
   qsort(unsorted_arr, len, sizeof(double), double_cmp);
@@ -4639,7 +4751,7 @@ int32_t strcmp_natural_deref(const void* s1, const void* s2) {
   return strcmp_natural_uncasted(*(unsigned char**)s1, *(unsigned char**)s2);
 }
 
-int32_t get_uidx_from_unsorted(char* idstr, uintptr_t* exclude_arr, uint32_t id_ct, char* unsorted_ids, uintptr_t max_id_len) {
+int32_t get_uidx_from_unsorted(const char* idstr, const uintptr_t* exclude_arr, uint32_t id_ct, const char* unsorted_ids, uintptr_t max_id_len) {
   uintptr_t id_uidx = 0;
   uintptr_t slen_p1 = strlen(idstr) + 1;
   uint32_t id_idx;
@@ -4666,7 +4778,7 @@ char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id
   return NULL;
 }
 
-char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len) {
+char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, const char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len) {
   // extended scan_for_duplicate_ids() which also verifies that no entry in
   // sorted_ids matches any entry in sorted_nonoverlap_ids.
   // nonoverlap_id_ct == 0 and sorted_nonoverlap_ids == NULL ok.  id_ct cannot
@@ -4674,7 +4786,7 @@ char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintp
   uintptr_t nonoverlap_id_idx = 0;
   uintptr_t id_idx = 0;
   char* cur_id_ptr = sorted_ids;
-  char* nonoverlap_id_ptr;
+  const char* nonoverlap_id_ptr;
   char* other_id_ptr;
   int32_t ii;
   while (1) {
@@ -4700,17 +4812,7 @@ char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintp
   }
 }
 
-int32_t is_missing_pheno_cc(char* bufptr, double missing_phenod, uint32_t affection_01) {
-  char* ss;
-  double dxx;
-  dxx = strtod(bufptr, &ss);
-  if ((ss == bufptr) || (dxx == missing_phenod)) {
-    return 1;
-  }
-  return (!affection_01) && (bufptr[0] == '0') && is_space_or_eoln(bufptr[1]);
-}
-
-int32_t eval_affection(char* bufptr, double missing_phenod) {
+int32_t eval_affection(const char* bufptr, double missing_phenod) {
   // turns out --1 had the side-effect of *forcing* case/control
   // interpretation in 1.07.  We replicate that for backward compatibility, and
   // no longer call this function in that context.
@@ -4748,15 +4850,16 @@ uint32_t triangle_divide(int64_t cur_prod, int32_t modif) {
   return vv;
 }
 
-void parallel_bounds(uint32_t ct, int32_t start, uint32_t parallel_idx, uint32_t parallel_tot, int32_t* bound_start_ptr, int32_t* bound_end_ptr) {
+void parallel_bounds(uint32_t ct, int32_t start, uint32_t parallel_idx, uint32_t parallel_tot, int32_t* __restrict bound_start_ptr, int32_t* __restrict bound_end_ptr) {
   int32_t modif = 1 - start * 2;
   int64_t ct_tot = ((int64_t)ct) * (ct + modif);
   *bound_start_ptr = triangle_divide((ct_tot * parallel_idx) / parallel_tot, modif);
   *bound_end_ptr = triangle_divide((ct_tot * (parallel_idx + 1)) / parallel_tot, modif);
 }
 
+// this might belong in plink_calc instead, not being used anywhere else
 // set align to 1 for no alignment
-void triangle_fill(uint32_t* target_arr, uint32_t ct, uint32_t pieces, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align) {
+void triangle_fill(uint32_t ct, uint32_t pieces, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align, uint32_t* target_arr) {
   int32_t modif = 1 - start * 2;
   uint32_t cur_piece = 1;
   int64_t ct_tr;
@@ -4792,7 +4895,7 @@ int32_t relationship_req(uint64_t calculation_type) {
   return (calculation_type & (CALC_RELATIONSHIP | CALC_UNRELATED_HERITABILITY | CALC_REL_CUTOFF | CALC_REGRESS_REL | CALC_PCA))? 1 : 0;
 }
 
-int32_t distance_req(uint64_t calculation_type, char* read_dists_fname) {
+int32_t distance_req(const char* read_dists_fname, uint64_t calculation_type) {
   return ((calculation_type & CALC_DISTANCE) || ((calculation_type & (CALC_PLINK1_DISTANCE_MATRIX | CALC_PLINK1_IBS_MATRIX)) && (!(calculation_type & CALC_GENOME))) || ((!read_dists_fname) && (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE))));
 }
 
@@ -4880,8 +4983,8 @@ int32_t llcmp(const void* aa, const void* bb) {
 
 // alas, qsort_r not available on some Linux distributions
 
-// Normally use qsort_ext(), but this version is necessary before wkspace has
-// been allocated.
+// Normally use qsort_ext(), but this version is necessary before g_bigstack
+// has been allocated.
 void qsort_ext2(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, uintptr_t secondary_item_len, char* proxy_arr, uintptr_t proxy_len) {
   uintptr_t ulii;
   for (ulii = 0; ulii < arr_length; ulii++) {
@@ -4900,7 +5003,7 @@ void qsort_ext2(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int
 
 // This actually tends to be faster than just sorting an array of indices,
 // because of memory locality issues.
-int32_t qsort_ext(char* main_arr, intptr_t arr_length, intptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, intptr_t secondary_item_len) {
+int32_t qsort_ext(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, intptr_t secondary_item_len) {
   // main_arr = packed array of equal-length items to sort
   // arr_length = number of items
   // item_length = byte count of each main_arr item
@@ -4914,8 +5017,8 @@ int32_t qsort_ext(char* main_arr, intptr_t arr_length, intptr_t item_length, int
   //                 be a lookup table for the original position of each
   //                 main_arr item.)
   // secondary_item_len = byte count of each secondary_arr item
-  intptr_t proxy_len = secondary_item_len + sizeof(void*);
-  unsigned char* wkspace_mark = wkspace_base;
+  uintptr_t proxy_len = secondary_item_len + sizeof(void*);
+  unsigned char* bigstack_mark = g_bigstack_base;
   char* proxy_arr;
   if (!arr_length) {
     return 0;
@@ -4923,15 +5026,15 @@ int32_t qsort_ext(char* main_arr, intptr_t arr_length, intptr_t item_length, int
   if (proxy_len < item_length) {
     proxy_len = item_length;
   }
-  if (wkspace_alloc_c_checked(&proxy_arr, arr_length * proxy_len)) {
+  if (bigstack_alloc_c(arr_length * proxy_len, &proxy_arr)) {
     return -1;
   }
   qsort_ext2(main_arr, arr_length, item_length, comparator_deref, secondary_arr, secondary_item_len, proxy_arr, proxy_len);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return 0;
 }
 
-int32_t sort_item_ids_noalloc(char* sorted_ids, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*)) {
+int32_t sort_item_ids_noalloc(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char* __restrict sorted_ids, uint32_t* id_map) {
   // Stores a lexicographically sorted list of IDs in sorted_ids and the raw
   // positions of the corresponding markers/samples in *id_map_ptr.  Does not
   // include excluded markers/samples in the list.
@@ -4976,17 +5079,17 @@ int32_t sort_item_ids_noalloc(char* sorted_ids, uint32_t* id_map, uintptr_t unfi
   return 0;
 }
 
-int32_t sort_item_ids(char** sorted_ids_ptr, uint32_t** id_map_ptr, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t exclude_ct, char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*)) {
+int32_t sort_item_ids(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t exclude_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char** sorted_ids_ptr, uint32_t** id_map_ptr) {
   uintptr_t item_ct = unfiltered_ct - exclude_ct;
   // id_map on bottom because --indiv-sort frees *sorted_ids_ptr
-  if (wkspace_alloc_ui_checked(id_map_ptr, item_ct * sizeof(int32_t)) ||
-      wkspace_alloc_c_checked(sorted_ids_ptr, item_ct * max_id_len)) {
+  if (bigstack_alloc_ui(item_ct, id_map_ptr) ||
+      bigstack_alloc_c(item_ct * max_id_len, sorted_ids_ptr)) {
     return RET_NOMEM;
   }
-  return sort_item_ids_noalloc(*sorted_ids_ptr, *id_map_ptr, unfiltered_ct, exclude_arr, item_ct, item_ids, max_id_len, allow_dups, collapse_idxs, comparator_deref);
+  return sort_item_ids_noalloc(unfiltered_ct, exclude_arr, item_ct, item_ids, max_id_len, allow_dups, collapse_idxs, comparator_deref, *sorted_ids_ptr, *id_map_ptr);
 }
 
-uint32_t uint32arr_greater_than(uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii) {
+uint32_t uint32arr_greater_than(const uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii) {
   // assumes arr_length is nonzero, and sorted_uint32_arr is in nondecreasing
   // order.  (useful for searching marker_pos.)
   // uii guaranteed to be larger than sorted_uint32_arr[min_idx - 1] if it
@@ -5012,7 +5115,7 @@ uint32_t uint32arr_greater_than(uint32_t* sorted_uint32_arr, uint32_t arr_length
   }
 }
 
-uint32_t int32arr_greater_than(int32_t* sorted_int32_arr, uint32_t arr_length, int32_t ii) {
+uint32_t int32arr_greater_than(const int32_t* sorted_int32_arr, uint32_t arr_length, int32_t ii) {
   int32_t min_idx = 0;
   int32_t max_idx = arr_length - 1;
   uint32_t mid_idx;
@@ -5031,7 +5134,7 @@ uint32_t int32arr_greater_than(int32_t* sorted_int32_arr, uint32_t arr_length, i
   }
 }
 
-uintptr_t uint64arr_greater_than(uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii) {
+uintptr_t uint64arr_greater_than(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii) {
   intptr_t min_idx = 0;
   intptr_t max_idx = arr_length - 1;
   uintptr_t mid_idx;
@@ -5050,7 +5153,7 @@ uintptr_t uint64arr_greater_than(uint64_t* sorted_uint64_arr, uintptr_t arr_leng
   }
 }
 
-uintptr_t doublearr_greater_than(double* sorted_dbl_arr, uintptr_t arr_length, double dxx) {
+uintptr_t doublearr_greater_than(const double* sorted_dbl_arr, uintptr_t arr_length, double dxx) {
   // returns number of items in sorted_dbl_arr which dxx is greater than.
   // assumes array is nonempty and sorted in nondecreasing order
   intptr_t min_idx = 0;
@@ -5071,7 +5174,7 @@ uintptr_t doublearr_greater_than(double* sorted_dbl_arr, uintptr_t arr_length, d
   }
 }
 
-uintptr_t nonincr_doublearr_leq_stride(double* nonincr_dbl_arr, uintptr_t arr_length, uintptr_t stride, double dxx) {
+uintptr_t nonincr_doublearr_leq_stride(const double* nonincr_dbl_arr, uintptr_t arr_length, uintptr_t stride, double dxx) {
   // assumes relevant elements of array are sorted in nonincreasing order
   // instead, and they are spaced stride units apart
   intptr_t min_idx = 0;
@@ -5092,7 +5195,7 @@ uintptr_t nonincr_doublearr_leq_stride(double* nonincr_dbl_arr, uintptr_t arr_le
   }
 }
 
-int32_t bsearch_str(const char* id_buf, uintptr_t cur_id_len, char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
+int32_t bsearch_str(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
   // does not assume null-terminated id_buf, or nonempty array.
   // N.B. max_id_len includes null terminator as usual, while cur_id_len does
   // NOT.
@@ -5116,7 +5219,7 @@ int32_t bsearch_str(const char* id_buf, uintptr_t cur_id_len, char* lptr, uintpt
   return -1;
 }
 
-int32_t bsearch_str_natural(char* id_buf, char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
+int32_t bsearch_str_natural(const char* id_buf, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
   // unlike bsearch_str(), caller is responsible for slen > max_id_len check
   // if appropriate here
   uintptr_t start_idx = 0;
@@ -5136,7 +5239,7 @@ int32_t bsearch_str_natural(char* id_buf, char* lptr, uintptr_t max_id_len, uint
   return -1;
 }
 
-uintptr_t bsearch_str_lb(const char* id_buf, uintptr_t cur_id_len, char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
+uintptr_t bsearch_str_lb(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
   // returns number of elements in lptr[] less than id_buf.
   uintptr_t start_idx = 0;
   uintptr_t mid_idx;
@@ -5154,7 +5257,7 @@ uintptr_t bsearch_str_lb(const char* id_buf, uintptr_t cur_id_len, char* lptr, u
   return start_idx;
 }
 
-uint32_t bsearch_read_fam_indiv(char* id_buf, char* lptr, uintptr_t max_id_len, uintptr_t filter_line_ct, char* read_ptr, char** read_pp_new, int32_t* retval_ptr) {
+uint32_t bsearch_read_fam_indiv(char* __restrict read_ptr, const char* __restrict lptr, uintptr_t max_id_len, uintptr_t filter_line_ct, char** read_pp_new, int32_t* retval_ptr, char* __restrict id_buf) {
   // id_buf = workspace
   // lptr = packed, sorted list of ID strings to search over
   // read_ptr is assumed to point to beginning of FID.  FID is terminated by
@@ -5185,7 +5288,7 @@ uint32_t bsearch_read_fam_indiv(char* id_buf, char* lptr, uintptr_t max_id_len,
   return 0;
 }
 
-void bsearch_fam(char* id_buf, char* lptr, uintptr_t max_id_len, uint32_t filter_line_ct, char* fam_id, uint32_t* first_idx_ptr, uint32_t* last_idx_ptr) {
+void bsearch_fam(const char* __restrict fam_id, const char* __restrict lptr, uintptr_t max_id_len, uint32_t filter_line_ct, uint32_t* __restrict first_idx_ptr, uint32_t* __restrict last_idx_ptr, char* __restrict id_buf) {
   uint32_t slen;
   uint32_t fidx;
   uint32_t loff;
@@ -5215,34 +5318,32 @@ void bsearch_fam(char* id_buf, char* lptr, uintptr_t max_id_len, uint32_t filter
   *last_idx_ptr = 0;
 }
 
-void bitfield_invert(uintptr_t* bit_arr, uintptr_t bit_ct) {
-  uintptr_t* bit_arr_stop = &(bit_arr[bit_ct / BITCT]);
-  while (bit_arr < bit_arr_stop) {
-    *bit_arr = ~(*bit_arr);
-    bit_arr++;
+void bitarr_invert(uintptr_t bit_ct, uintptr_t* bitarr) {
+  uintptr_t* bitarr_stop = &(bitarr[bit_ct / BITCT]);
+  while (bitarr < bitarr_stop) {
+    *bitarr = ~(*bitarr);
+    bitarr++;
   }
   if (bit_ct % BITCT) {
-    *bit_arr = (~(*bit_arr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
+    *bitarr = (~(*bitarr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
   }
 }
 
-void bitfield_exclude_to_include(uintptr_t* exclude_arr, uintptr_t* include_arr, uintptr_t bit_ct) {
-  // works the other way around too
-  uintptr_t* exclude_stop = &(exclude_arr[bit_ct / BITCT]);
-  while (exclude_arr < exclude_stop) {
-    *include_arr++ = ~(*exclude_arr++);
+void bitarr_invert_copy(const uintptr_t* input_bitarr, uintptr_t bit_ct, uintptr_t* output_bitarr) {
+  const uintptr_t* input_stop = &(input_bitarr[bit_ct / BITCT]);
+  while (input_bitarr < input_stop) {
+    *output_bitarr++ = ~(*input_bitarr++);
   }
   if (bit_ct % BITCT) {
-    *include_arr = (~(*exclude_arr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
+    *output_bitarr = (~(*input_bitarr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
   }
 }
 
-void bitfield_and(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
-  // vv := vv AND include_vec
-  // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
+void bitvec_and(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := main_bitvec AND arg_bitvec
 #ifdef __LP64__
-  __m128i* vv128 = (__m128i*)vv;
-  __m128i* iv128 = (__m128i*)include_vec;
+  __m128i* vv128 = (__m128i*)main_bitvec;
+  const __m128i* iv128 = (const __m128i*)arg_bitvec;
   __m128i* vv128_end = &(vv128[word_ct / 2]);
   while (vv128 < vv128_end) {
     *vv128 = _mm_and_si128(*iv128++, *vv128);
@@ -5250,23 +5351,22 @@ void bitfield_and(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
   }
   if (word_ct & 1) {
     word_ct--;
-    vv[word_ct] &= include_vec[word_ct];
+    main_bitvec[word_ct] &= arg_bitvec[word_ct];
   }
 #else
-  uintptr_t* vec_end = &(vv[word_ct]);
-  while (vv < vec_end) {
-    *vv++ &= *include_vec++;
+  uintptr_t* bitvec_end = &(main_bitvec[word_ct]);
+  while (main_bitvec < bitvec_end) {
+    *main_bitvec++ &= *arg_bitvec++;
   }
 #endif
 }
 
-void bitfield_andnot(uintptr_t* vv, uintptr_t* exclude_vec, uintptr_t word_ct) {
-  // vv := vv ANDNOT exclude_vec
-  // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
+void bitvec_andnot(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := main_bitvec ANDNOT exclude_bitvec
   // note that this is the reverse of the _mm_andnot() operand order
 #ifdef __LP64__
-  __m128i* vv128 = (__m128i*)vv;
-  __m128i* ev128 = (__m128i*)exclude_vec;
+  __m128i* vv128 = (__m128i*)main_bitvec;
+  const __m128i* ev128 = (const __m128i*)exclude_bitvec;
   __m128i* vv128_end = &(vv128[word_ct / 2]);
   while (vv128 < vv128_end) {
     *vv128 = _mm_andnot_si128(*ev128++, *vv128);
@@ -5274,23 +5374,21 @@ void bitfield_andnot(uintptr_t* vv, uintptr_t* exclude_vec, uintptr_t word_ct) {
   }
   if (word_ct & 1) {
     word_ct--;
-    vv[word_ct] &= ~(exclude_vec[word_ct]);
+    main_bitvec[word_ct] &= ~(exclude_bitvec[word_ct]);
   }
 #else
-  uintptr_t* vec_end = &(vv[word_ct]);
-  while (vv < vec_end) {
-    *vv++ &= ~(*exclude_vec++);
+  uintptr_t* bitvec_end = &(main_bitvec[word_ct]);
+  while (main_bitvec < bitvec_end) {
+    *main_bitvec++ &= ~(*exclude_bitvec++);
   }
 #endif
 }
 
-void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
-  // vv := (~vv) AND include_vec
-  // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
-  // assumes word_ct is nonzero
+void bitvec_andnot_reversed_args(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := (~main_bitvec) AND include_bitvec
 #ifdef __LP64__
-  __m128i* vv128 = (__m128i*)vv;
-  __m128i* iv128 = (__m128i*)include_vec;
+  __m128i* vv128 = (__m128i*)main_bitvec;
+  const __m128i* iv128 = (const __m128i*)include_bitvec;
   __m128i* vv128_end = &(vv128[word_ct / 2]);
   while (vv128 < vv128_end) {
     *vv128 = _mm_andnot_si128(*vv128, *iv128++);
@@ -5298,23 +5396,22 @@ void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintpt
   }
   if (word_ct & 1) {
     word_ct--;
-    vv[word_ct] = (~vv[word_ct]) & include_vec[word_ct];
+    main_bitvec[word_ct] = (~main_bitvec[word_ct]) & include_bitvec[word_ct];
   }
 #else
-  uintptr_t* vec_end = &(vv[word_ct]);
-  while (vv < vec_end) {
-    *vv = (~(*vv)) & (*include_vec++);
-    vv++;
+  uintptr_t* bitvec_end = &(main_bitvec[word_ct]);
+  while (main_bitvec < bitvec_end) {
+    *main_bitvec = (~(*main_bitvec)) & (*include_bitvec++);
+    main_bitvec++;
   }
 #endif
 }
 
-void bitfield_or(uintptr_t* vv, uintptr_t* or_vec, uintptr_t word_ct) {
-  // vv := vv OR include_vec
-  // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
+void bitvec_or(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
+  // main_bitvec := main_bitvec OR arg_bitvec
 #ifdef __LP64__
-  __m128i* vv128 = (__m128i*)vv;
-  __m128i* ov128 = (__m128i*)or_vec;
+  __m128i* vv128 = (__m128i*)main_bitvec;
+  const __m128i* ov128 = (const __m128i*)arg_bitvec;
   __m128i* vv128_end = &(vv128[word_ct / 2]);
   while (vv128 < vv128_end) {
     *vv128 = _mm_or_si128(*ov128++, *vv128);
@@ -5322,27 +5419,26 @@ void bitfield_or(uintptr_t* vv, uintptr_t* or_vec, uintptr_t word_ct) {
   }
   if (word_ct & 1) {
     word_ct--;
-    vv[word_ct] |= or_vec[word_ct];
+    main_bitvec[word_ct] |= arg_bitvec[word_ct];
   }
 #else
-  uintptr_t* vec_end = &(vv[word_ct]);
-  while (vv < vec_end) {
-    *vv++ |= *or_vec++;
+  uintptr_t* vec_end = &(main_bitvec[word_ct]);
+  while (main_bitvec < vec_end) {
+    *main_bitvec++ |= *arg_bitvec++;
   }
 #endif
 }
 
-void bitfield_ornot(uintptr_t* vv, uintptr_t* inverted_or_vec, uintptr_t word_ct) {
-  // vv := vv OR (~inverted_or_vec)
-  // on 64-bit systems, assumes vv and inverted_or_vec are 16-byte aligned
+void bitvec_ornot(const uintptr_t* __restrict inverted_or_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := main_bitvec OR (~inverted_or_bitvec)
 #ifdef __LP64__
 #ifdef __APPLE__
   const __m128i all1 = {0xffffffffffffffffLLU, 0xffffffffffffffffLLU};
 #else
   const __m128i all1 = {-1LL, -1LL};
 #endif
-  __m128i* vv128 = (__m128i*)vv;
-  __m128i* ev128 = (__m128i*)inverted_or_vec;
+  __m128i* vv128 = (__m128i*)main_bitvec;
+  const __m128i* ev128 = (const __m128i*)inverted_or_bitvec;
   __m128i* vv128_end = &(vv128[word_ct / 2]);
   while (vv128 < vv128_end) {
     *vv128 = _mm_or_si128(_mm_xor_si128(*ev128++, all1), *vv128);
@@ -5350,22 +5446,21 @@ void bitfield_ornot(uintptr_t* vv, uintptr_t* inverted_or_vec, uintptr_t word_ct
   }
   if (word_ct & 1) {
     word_ct--;
-    vv[word_ct] |= ~(inverted_or_vec[word_ct]);
+    main_bitvec[word_ct] |= ~(inverted_or_bitvec[word_ct]);
   }
 #else
-  uintptr_t* vec_end = &(vv[word_ct]);
-  while (vv < vec_end) {
-    *vv++ |= ~(*inverted_or_vec++);
+  uintptr_t* vec_end = &(main_bitvec[word_ct]);
+  while (main_bitvec < vec_end) {
+    *main_bitvec++ |= ~(*inverted_or_bitvec++);
   }
 #endif
 }
 
-void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct) {
-  // bit_arr := bit_arr XOR xor_arr
-  // on 64-bit systems, assumes bit_arr and xor_arr are 16-byte aligned
+void bitvec_xor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := main_bitvec XOR xor_bitvec
 #ifdef __LP64__
-  __m128i* bitv128 = (__m128i*)bit_arr;
-  __m128i* xorv128 = (__m128i*)xor_arr;
+  __m128i* bitv128 = (__m128i*)main_bitvec;
+  __m128i* xorv128 = (__m128i*)arg_bitvec;
   __m128i* bitv128_end = &(bitv128[word_ct / 2]);
   while (bitv128 < bitv128_end) {
     *bitv128 = _mm_xor_si128(*xorv128++, *bitv128);
@@ -5373,34 +5468,34 @@ void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct) {
   }
   if (word_ct & 1) {
     word_ct--;
-    bit_arr[word_ct] ^= xor_arr[word_ct];
+    main_bitvec[word_ct] ^= arg_bitvec[word_ct];
   }
 #else
-  uintptr_t* bit_arr_end = &(bit_arr[word_ct]);
-  while (bit_arr < bit_arr_end) {
-    *bit_arr++ ^= *xor_arr++;
+  uintptr_t* main_bitvec_end = &(main_bitvec[word_ct]);
+  while (main_bitvec < main_bitvec_end) {
+    *main_bitvec++ ^= *arg_bitvec++;
   }
 #endif
 }
 
-uint32_t is_monomorphic_a2(uintptr_t* lptr, uint32_t sample_ct) {
-  uintptr_t* loop_end = &(lptr[sample_ct / BITCT2]);
+uint32_t is_monomorphic_a2(const uintptr_t* geno_arr, uint32_t sample_ct) {
+  const uintptr_t* loop_end = &(geno_arr[sample_ct / BITCT2]);
   uint32_t sample_rem = sample_ct % BITCT2;
-  for (; lptr < loop_end; lptr++) {
-    if ((~(*lptr)) & FIVEMASK) {
+  for (; geno_arr < loop_end; geno_arr++) {
+    if ((~(*geno_arr)) & FIVEMASK) {
       return 0;
     }
   }
-  return (sample_rem && ((~(*lptr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
+  return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
 }
 
-uint32_t is_monomorphic(uintptr_t* lptr, uint32_t sample_ct) {
+uint32_t is_monomorphic(const uintptr_t* geno_arr, uint32_t sample_ct) {
   uint32_t sample_ctd2 = sample_ct / BITCT2;
   uint32_t sample_rem = sample_ct % BITCT2;
   uintptr_t ulii;
   uintptr_t uljj;
   while (sample_ctd2) {
-    ulii = *lptr++;
+    ulii = *geno_arr++;
     uljj = (ulii >> 1) & FIVEMASK;
     ulii = ~ulii;
     // now ulii & FIVEMASK = low bit zero, uljj = high bit one
@@ -5416,23 +5511,23 @@ uint32_t is_monomorphic(uintptr_t* lptr, uint32_t sample_ct) {
 	  return 0;
 	}
 	if (!(--sample_ctd2)) {
-	  return (sample_rem && ((~(*lptr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
+	  return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
 	}
-	ulii = ~(*lptr++);
+	ulii = ~(*geno_arr++);
       }
     } else if (ulii & FIVEMASK) {
       do {
         if (!(--sample_ctd2)) {
-          return (sample_rem && ((*lptr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
+          return (sample_rem && ((*geno_arr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
 	}
-	ulii = *lptr++;
+	ulii = *geno_arr++;
       } while (!(ulii & AAAAMASK));
       return 0;
     }
     sample_ctd2--;
   }
   if (sample_rem) {
-    ulii = *lptr;
+    ulii = *geno_arr;
     uljj = (ulii >> 1) & FIVEMASK;
     ulii = ~ulii;
     if ((uljj & ulii) || (uljj && (ulii & (~uljj) & (FIVEMASK >> (BITCT - sample_rem * 2))))) {
@@ -5442,7 +5537,7 @@ uint32_t is_monomorphic(uintptr_t* lptr, uint32_t sample_ct) {
   return 1;
 }
 
-uint32_t less_than_two_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
+uint32_t less_than_two_genotypes(const uintptr_t* geno_arr, uint32_t sample_ct) {
   uint32_t sample_ctd2 = sample_ct / BITCT2;
   uint32_t sample_rem = sample_ct % BITCT2;
   uintptr_t ulii;
@@ -5450,7 +5545,7 @@ uint32_t less_than_two_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
   uintptr_t ulkk;
   uint32_t distinct_genotype_ct;
   while (sample_ctd2) {
-    ulii = *lptr++;
+    ulii = *geno_arr++;
     uljj = (ulii >> 1) & FIVEMASK;
     ulkk = ~ulii;
     if (uljj) {
@@ -5462,15 +5557,15 @@ uint32_t less_than_two_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
 	    return 0;
 	  }
 	  if (!(--sample_ctd2)) {
-	    return (sample_rem && ((~(*lptr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
+	    return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
 	  }
-	  ulkk = ~(*lptr++);
+	  ulkk = ~(*geno_arr++);
 	}
       } else {
         // heterozygote observed; either 00 or 11 now means we have 2+
 	// genotypes
 	while (1) {
-	  ulii = ~(*lptr++);
+	  ulii = ~(*geno_arr++);
 	  if (!(--sample_ctd2)) {
 	    return (sample_rem && (((~ulii) ^ (ulii >> 1)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
 	  }
@@ -5484,16 +5579,16 @@ uint32_t less_than_two_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
       // polymorphic
       do {
         if (!(--sample_ctd2)) {
-          return (sample_rem && ((*lptr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
+          return (sample_rem && ((*geno_arr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
 	}
-	ulii = *lptr++;
+	ulii = *geno_arr++;
       } while (!(ulii & AAAAMASK));
       return 0;
     }
     sample_ctd2--;
   }
   if (sample_rem) {
-    ulii = *lptr;
+    ulii = *geno_arr;
     uljj = (ulii >> 1) & FIVEMASK;
     ulkk = ~ulii;
     // homozygous minor present?
@@ -5534,7 +5629,7 @@ uint32_t has_three_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
   }
   cur_lptr = lptr;
   // zero-padding is benign for het and hom A2 checks
-  lptr_end = &(lptr[(sample_ct + (BITCT2 - 1)) / BITCT2]);
+  lptr_end = &(lptr[QUATERCT_TO_WORDCT(sample_ct)]);
   while (1) {
     ulii = *cur_lptr;
     uljj = (ulii >> 1) & FIVEMASK;
@@ -5559,19 +5654,19 @@ uint32_t has_three_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
 
 #ifdef __LP64__
 // Basic SSE2 implementation of Lauradoux/Walisch popcount.
-static inline uintptr_t popcount_vecs(__m128i* vptr, uintptr_t ct) {
+static inline uintptr_t popcount_vecs(const __m128i* vptr, uintptr_t ct) {
   // popcounts vptr[0..(ct-1)].  Assumes ct is a multiple of 3 (0 ok).
   const __m128i m1 = {FIVEMASK, FIVEMASK};
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   uintptr_t tot = 0;
-  __m128i* vend;
+  const __m128i* vend;
   __m128i count1;
   __m128i count2;
   __m128i half1;
   __m128i half2;
-  __uni16 acc;
+  __univec acc;
 
   while (ct >= 30) {
     ct -= 30;
@@ -5609,18 +5704,18 @@ static inline uintptr_t popcount_vecs(__m128i* vptr, uintptr_t ct) {
   return tot;
 }
 
-static inline uintptr_t popcount2_vecs(__m128i* vptr, uintptr_t ct) {
+static inline uintptr_t popcount2_vecs(const __m128i* vptr, uintptr_t ct) {
   // assumes ct is a multiple of 6.
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   uintptr_t tot = 0;
-  __m128i* vend;
+  const __m128i* vend;
   __m128i loader1;
   __m128i loader2;
   __m128i count1;
   __m128i count2;
-  __uni16 acc;
+  __univec acc;
 
   while (ct >= 30) {
     ct -= 30;
@@ -5657,16 +5752,16 @@ static inline uintptr_t popcount2_vecs(__m128i* vptr, uintptr_t ct) {
   return tot;
 }
 
-static inline uintptr_t popcount_vecs_exclude(__m128i* vptr, __m128i* exclude_ptr, uintptr_t ct) {
+static inline uintptr_t popcount_vecs_exclude(const __m128i* __restrict vptr, const __m128i* __restrict exclude_ptr, uintptr_t ct) {
   // popcounts vptr ANDNOT exclude_ptr[0..(ct-1)].  ct is a multiple of 3.
   const __m128i m1 = {FIVEMASK, FIVEMASK};
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   uintptr_t tot = 0;
-  __m128i* vend;
+  const __m128i* vend;
   __m128i count1, count2, half1, half2;
-  __uni16 acc;
+  __univec acc;
 
   while (ct >= 30) {
     ct -= 30;
@@ -5699,16 +5794,16 @@ static inline uintptr_t popcount_vecs_exclude(__m128i* vptr, __m128i* exclude_pt
   return tot;
 }
 
-static inline uintptr_t popcount_vecs_intersect(__m128i* vptr1, __m128i* vptr2, uintptr_t ct) {
+static inline uintptr_t popcount_vecs_intersect(const __m128i* __restrict vptr1, const __m128i* __restrict vptr2, uintptr_t ct) {
   // popcounts vptr1 AND vptr2[0..(ct-1)].  ct is a multiple of 3.
   const __m128i m1 = {FIVEMASK, FIVEMASK};
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   uintptr_t tot = 0;
-  __m128i* vend1;
+  const __m128i* vend1;
   __m128i count1, count2, half1, half2;
-  __uni16 acc;
+  __univec acc;
 
   while (ct >= 30) {
     ct -= 30;
@@ -5741,17 +5836,17 @@ static inline uintptr_t popcount_vecs_intersect(__m128i* vptr1, __m128i* vptr2,
 }
 #endif
 
-uintptr_t popcount_longs(uintptr_t* lptr, uintptr_t word_ct) {
+uintptr_t popcount_longs(const uintptr_t* lptr, uintptr_t word_ct) {
   // Efficiently popcounts lptr[0..(word_ct - 1)].  In the 64-bit case, lptr[]
   // must be 16-byte aligned.
   // The popcount_longs_nzbase() wrapper takes care of starting from a later
   // index.
   uintptr_t tot = 0;
-  uintptr_t* lptr_end = &(lptr[word_ct]);
+  const uintptr_t* lptr_end = &(lptr[word_ct]);
 #ifdef __LP64__
   uintptr_t six_ct;
-  __m128i* vptr;
-  vptr = (__m128i*)lptr;
+  const __m128i* vptr;
+  vptr = (const __m128i*)lptr;
   six_ct = word_ct / 6;
   tot += popcount_vecs(vptr, six_ct * 3);
   lptr = &(lptr[six_ct * 6]);
@@ -5761,7 +5856,7 @@ uintptr_t popcount_longs(uintptr_t* lptr, uintptr_t word_ct) {
   // on my development machine by a hair.
   // However, if we take the hint from Lauradoux/Walisch and postpone the
   // multiply and right shift, this is no longer true.  Ah well.
-  uintptr_t* lptr_six_end;
+  const uintptr_t* lptr_six_end;
   uintptr_t tmp_stor;
   uintptr_t loader;
   uintptr_t ulii;
@@ -5802,19 +5897,19 @@ uintptr_t popcount_longs(uintptr_t* lptr, uintptr_t word_ct) {
   return tot;
 }
 
-uintptr_t popcount2_longs(uintptr_t* lptr, uintptr_t word_ct) {
+uintptr_t popcount2_longs(const uintptr_t* lptr, uintptr_t word_ct) {
   // treats lptr[] as an array of two-bit instead of one-bit numbers
   uintptr_t tot = 0;
-  uintptr_t* lptr_end = &(lptr[word_ct]);
+  const uintptr_t* lptr_end = &(lptr[word_ct]);
 #ifdef __LP64__
   uintptr_t twelve_ct;
-  __m128i* vptr;
-  vptr = (__m128i*)lptr;
+  const __m128i* vptr;
+  vptr = (const __m128i*)lptr;
   twelve_ct = word_ct / 12;
   tot += popcount2_vecs(vptr, twelve_ct * 6);
   lptr = &(lptr[twelve_ct * 12]);
 #else
-  uintptr_t* lptr_six_end;
+  const uintptr_t* lptr_six_end;
   uintptr_t loader1;
   uintptr_t loader2;
   uintptr_t ulii;
@@ -5848,7 +5943,7 @@ uintptr_t popcount2_longs(uintptr_t* lptr, uintptr_t word_ct) {
   return tot;
 }
 
-uintptr_t popcount_bit_idx(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
+uintptr_t popcount_bit_idx(const uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
   uintptr_t start_idxl = start_idx / BITCT;
   uintptr_t start_idxlr = start_idx & (BITCT - 1);
   uintptr_t end_idxl = end_idx / BITCT;
@@ -5869,7 +5964,7 @@ uintptr_t popcount_bit_idx(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_i
   return ct;
 }
 
-uint32_t chrom_window_max(uint32_t* marker_pos, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max) {
+uint32_t chrom_window_max(const uint32_t* marker_pos, const uintptr_t* marker_exclude, const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max) {
   // okay, it's absurd to keep rewriting this from scratch, especially given
   // that makes it likely that some reimplementations suck (--indep{-pairwise}
   // version was O(n^2) instead of O(n); sure, it didn't really matter because
@@ -5914,7 +6009,7 @@ uint32_t chrom_window_max(uint32_t* marker_pos, uintptr_t* marker_exclude, Chrom
   return cur_window_max;
 }
 
-uint32_t window_back(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_t marker_uidx_min, uint32_t marker_uidx_start, uint32_t count_max, uint32_t bp_max, uint32_t* window_trail_ct_ptr) {
+uint32_t window_back(const uint32_t* __restrict marker_pos, const uintptr_t* marker_exclude, uint32_t marker_uidx_min, uint32_t marker_uidx_start, uint32_t count_max, uint32_t bp_max, uint32_t* __restrict window_trail_ct_ptr) {
   // finds the earliest location which is within count_max sites and bp_max bps
   // count_max must be positive
   if (marker_uidx_min == marker_uidx_start) {
@@ -5927,7 +6022,7 @@ uint32_t window_back(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_t m
   uint32_t uii = marker_uidx_start % BITCT;
   uint32_t marker_uidx_last = marker_uidx_start;
   uint32_t remaining_count = count_max;
-  uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
+  const uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
   uintptr_t cur_word;
   marker_uwidx_cur *= BITCT;
   if (bp_max <= marker_pos[marker_uidx_start]) {
@@ -5983,7 +6078,7 @@ uint32_t window_back(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_t m
   }
 }
 
-uint32_t window_forward(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_t marker_uidx_start, uint32_t marker_uidx_last, uint32_t count_max, uint32_t bp_max, uint32_t* window_lead_ct_ptr) {
+uint32_t window_forward(const uint32_t* __restrict marker_pos, const uintptr_t* marker_exclude, uint32_t marker_uidx_start, uint32_t marker_uidx_last, uint32_t count_max, uint32_t bp_max, uint32_t* __restrict window_lead_ct_ptr) {
   // window_lead_ct_ptr currently cannot be NULL
   if (marker_uidx_start == marker_uidx_last) {
     *window_lead_ct_ptr = 0;
@@ -5994,7 +6089,7 @@ uint32_t window_forward(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_
   uint32_t marker_uwidx_cur = (marker_uidx_start + 1) / BITCT;
   uint32_t uii = (marker_uidx_start + 1) % BITCT;
   uint32_t remaining_count = count_max;
-  uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
+  const uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
   uintptr_t cur_word;
   marker_uwidx_cur *= BITCT;
   cur_word = ~((*marker_exclude_cur) | ((ONELU << uii) - ONELU));
@@ -6042,19 +6137,19 @@ uint32_t window_forward(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_
   return marker_uwidx_prev;
 }
 
-uintptr_t jump_forward_unset_unsafe(uintptr_t* bit_arr, uintptr_t cur_pos, uintptr_t forward_ct) {
+uintptr_t jump_forward_unset_unsafe(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) {
   // advances forward_ct unset bits; forward_ct must be positive.  (stays put
   // if forward_ct == 1 and current bit is unset.  may want to tweak this
   // interface, easy to introduce off-by-one bugs...)
-  // In usual 64-bit case, also assumes bit_arr is 16-byte aligned and the end
+  // In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end
   // of the trailing 16-byte block can be safely read from.
   uintptr_t widx = cur_pos / BITCT;
   uintptr_t ulii = cur_pos % BITCT;
-  uintptr_t* bptr = &(bit_arr[widx]);
+  const uintptr_t* bptr = &(bitvec[widx]);
   uintptr_t uljj;
   uintptr_t ulkk;
 #ifdef __LP64__
-  __m128i* vptr;
+  const __m128i* vptr;
 #endif
   if (ulii) {
     uljj = (~(*bptr)) >> ulii;
@@ -6083,14 +6178,14 @@ uintptr_t jump_forward_unset_unsafe(uintptr_t* bit_arr, uintptr_t cur_pos, uintp
     forward_ct -= ulkk;
     bptr++;
   }
-  vptr = (__m128i*)bptr;
+  vptr = (const __m128i*)bptr;
   while (forward_ct > BITCT * 6) {
     uljj = ((forward_ct - 1) / (BITCT * 6)) * 3;
     ulkk = popcount_vecs(vptr, uljj);
     vptr = &(vptr[uljj]);
     forward_ct -= uljj * BITCT * 2 - ulkk;
   }
-  bptr = (uintptr_t*)vptr;
+  bptr = (const uintptr_t*)vptr;
   while (forward_ct > BITCT) {
     forward_ct -= popcount_long(~(*bptr++));
   }
@@ -6106,7 +6201,7 @@ uintptr_t jump_forward_unset_unsafe(uintptr_t* bit_arr, uintptr_t cur_pos, uintp
     uljj = ~(*bptr);
     ulkk = popcount_long(uljj);
     if (ulkk >= forward_ct) {
-      widx = (uintptr_t)(bptr - bit_arr);
+      widx = (uintptr_t)(bptr - bitvec);
       goto jump_forward_unset_unsafe_finish;
     }
     forward_ct -= ulkk;
@@ -6114,18 +6209,18 @@ uintptr_t jump_forward_unset_unsafe(uintptr_t* bit_arr, uintptr_t cur_pos, uintp
   }
 }
 
-uintptr_t popcount_longs_exclude(uintptr_t* lptr, uintptr_t* exclude_arr, uintptr_t end_idx) {
+uintptr_t popcount_longs_exclude(const uintptr_t* __restrict lptr, const uintptr_t* __restrict exclude_arr, uintptr_t end_idx) {
   // popcounts lptr ANDNOT exclude_arr[0..(end_idx-1)].
   // N.B. on 64-bit systems, assumes lptr and exclude_arr are 16-byte aligned.
   uintptr_t tot = 0;
-  uintptr_t* lptr_end = &(lptr[end_idx]);
+  const uintptr_t* lptr_end = &(lptr[end_idx]);
 #ifdef __LP64__
   uintptr_t six_ct = end_idx / 6;
-  tot += popcount_vecs_exclude((__m128i*)lptr, (__m128i*)exclude_arr, six_ct * 3);
+  tot += popcount_vecs_exclude((const __m128i*)lptr, (const __m128i*)exclude_arr, six_ct * 3);
   lptr = &(lptr[six_ct * 6]);
   exclude_arr = &(exclude_arr[six_ct * 6]);
 #else
-  uintptr_t* lptr_six_end;
+  const uintptr_t* lptr_six_end;
   uintptr_t tmp_stor;
   uintptr_t loader;
   uintptr_t ulii;
@@ -6166,16 +6261,16 @@ uintptr_t popcount_longs_exclude(uintptr_t* lptr, uintptr_t* exclude_arr, uintpt
   return tot;
 }
 
-uintptr_t popcount_longs_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t word_ct) {
+uintptr_t popcount_longs_intersect(const uintptr_t* __restrict lptr1, const uintptr_t* __restrict lptr2, uintptr_t word_ct) {
   uintptr_t tot = 0;
-  uintptr_t* lptr1_end = &(lptr1[word_ct]);
+  const uintptr_t* lptr1_end = &(lptr1[word_ct]);
 #ifdef __LP64__
   uintptr_t six_ct = word_ct / 6;
-  tot += popcount_vecs_intersect((__m128i*)lptr1, (__m128i*)lptr2, six_ct * 3);
+  tot += popcount_vecs_intersect((const __m128i*)lptr1, (const __m128i*)lptr2, six_ct * 3);
   lptr1 = &(lptr1[six_ct * 6]);
   lptr2 = &(lptr2[six_ct * 6]);
 #else
-  uintptr_t* lptr1_six_end;
+  const uintptr_t* lptr1_six_end;
   uintptr_t tmp_stor;
   uintptr_t loader;
   uintptr_t ulii;
@@ -6216,13 +6311,13 @@ uintptr_t popcount_longs_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t
   return tot;
 }
 
-void vertical_bitct_subtract(uintptr_t* bit_arr, uint32_t item_ct, uint32_t* sum_arr) {
+void vertical_bitct_subtract(const uintptr_t* bitarr, uint32_t item_ct, uint32_t* sum_arr) {
   // assumes trailing bits are zeroed out
   uintptr_t cur_word;
   uint32_t idx_offset;
   uint32_t last_set_bit;
   for (idx_offset = 0; idx_offset < item_ct; idx_offset += BITCT) {
-    cur_word = *bit_arr++;
+    cur_word = *bitarr++;
     while (cur_word) {
       last_set_bit = CTZLU(cur_word);
       sum_arr[idx_offset + last_set_bit] -= 1;
@@ -6232,7 +6327,7 @@ void vertical_bitct_subtract(uintptr_t* bit_arr, uint32_t item_ct, uint32_t* sum
 }
 
 #ifdef __LP64__
-void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp) {
+void count_2freq_dbl_960b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict mask1vp, const VECITYPE* __restrict mask2vp, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp) {
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   __m128i loader;
@@ -6243,17 +6338,17 @@ void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i
   __m128i to_ct1_c;
   __m128i to_ct2_ab;
   __m128i to_ct2_c;
-  __uni16 acc1_ab;
-  __uni16 acc1_c;
-  __uni16 acc2_ab;
-  __uni16 acc2_c;
+  __univec acc1_ab;
+  __univec acc1_c;
+  __univec acc2_ab;
+  __univec acc2_c;
 
   acc1_ab.vi = _mm_setzero_si128();
   acc1_c.vi = _mm_setzero_si128();
   acc2_ab.vi = _mm_setzero_si128();
   acc2_c.vi = _mm_setzero_si128();
   do {
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *mask1vp++;
     loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     loader2 = _mm_and_si128(loader2, loader);
@@ -6267,7 +6362,7 @@ void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i
     to_ct1_ab = _mm_add_epi64(_mm_and_si128(to_ct1_ab, m2), _mm_and_si128(_mm_srli_epi64(to_ct1_ab, 2), m2));
     to_ct2_ab = _mm_add_epi64(_mm_and_si128(to_ct2_ab, m2), _mm_and_si128(_mm_srli_epi64(to_ct2_ab, 2), m2));
 
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *mask1vp++;
     loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     loader2 = _mm_and_si128(loader2, loader);
@@ -6281,7 +6376,7 @@ void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i
     to_ct2_c = _mm_add_epi64(to_ct2_c, _mm_andnot_si128(loader3, loader2));
     to_ct2_ab = _mm_add_epi64(to_ct2_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));
 
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *mask1vp++;
     loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     loader2 = _mm_and_si128(loader2, loader);
@@ -6302,7 +6397,7 @@ void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i
     acc1_c.vi = _mm_add_epi64(acc1_c.vi, _mm_add_epi64(_mm_and_si128(to_ct1_c, m4), _mm_and_si128(_mm_srli_epi64(to_ct1_c, 4), m4)));
     acc2_ab.vi = _mm_add_epi64(acc2_ab.vi, _mm_add_epi64(_mm_and_si128(to_ct2_ab, m4), _mm_and_si128(_mm_srli_epi64(to_ct2_ab, 4), m4)));
     acc2_c.vi = _mm_add_epi64(acc2_c.vi, _mm_add_epi64(_mm_and_si128(to_ct2_c, m4), _mm_and_si128(_mm_srli_epi64(to_ct2_c, 4), m4)));
-  } while (vptr < vend);
+  } while (geno_vvec < geno_vvec_end);
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   acc1_ab.vi = _mm_add_epi64(_mm_and_si128(acc1_ab.vi, m8), _mm_and_si128(_mm_srli_epi64(acc1_ab.vi, 8), m8));
   acc1_c.vi = _mm_and_si128(_mm_add_epi64(acc1_c.vi, _mm_srli_epi64(acc1_c.vi, 8)), m8);
@@ -6314,7 +6409,7 @@ void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i
   *ct2cp += ((acc2_c.u8[0] + acc2_c.u8[1]) * 0x1000100010001LLU) >> 48;
 }
 
-void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* even_ctp, uint32_t* odd_ctp, uint32_t* homset_ctp) {
+void count_3freq_1920b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict maskvp, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict homset_ctp) {
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   __m128i loader;
@@ -6326,26 +6421,26 @@ void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* e
   __m128i even2;
   __m128i odd2;
   __m128i homset2;
-  __uni16 acc_even;
-  __uni16 acc_odd;
-  __uni16 acc_homset;
+  __univec acc_even;
+  __univec acc_odd;
+  __univec acc_homset;
 
   acc_even.vi = _mm_setzero_si128();
   acc_odd.vi = _mm_setzero_si128();
   acc_homset.vi = _mm_setzero_si128();
   do {
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *maskvp++;
     odd1 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     even1 = _mm_and_si128(loader2, loader);
     homset1 = _mm_and_si128(odd1, loader);
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *maskvp++;
     loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     even1 = _mm_add_epi64(even1, _mm_and_si128(loader2, loader));
     odd1 = _mm_add_epi64(odd1, loader3);
     homset1 = _mm_add_epi64(homset1, _mm_and_si128(loader3, loader));
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *maskvp++;
     loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     even1 = _mm_add_epi64(even1, _mm_and_si128(loader2, loader));
@@ -6356,18 +6451,18 @@ void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* e
     odd1 = _mm_add_epi64(_mm_and_si128(odd1, m2), _mm_and_si128(_mm_srli_epi64(odd1, 2), m2));
     homset1 = _mm_add_epi64(_mm_and_si128(homset1, m2), _mm_and_si128(_mm_srli_epi64(homset1, 2), m2));
 
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *maskvp++;
     odd2 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     even2 = _mm_and_si128(loader2, loader);
     homset2 = _mm_and_si128(odd2, loader);
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *maskvp++;
     loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     even2 = _mm_add_epi64(even2, _mm_and_si128(loader2, loader));
     odd2 = _mm_add_epi64(odd2, loader3);
     homset2 = _mm_add_epi64(homset2, _mm_and_si128(loader3, loader));
-    loader = *vptr++;
+    loader = *geno_vvec++;
     loader2 = *maskvp++;
     loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
     even2 = _mm_add_epi64(even2, _mm_and_si128(loader2, loader));
@@ -6381,7 +6476,7 @@ void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* e
     acc_even.vi = _mm_add_epi64(acc_even.vi, _mm_add_epi64(_mm_and_si128(even1, m4), _mm_and_si128(_mm_srli_epi64(even1, 4), m4)));
     acc_odd.vi = _mm_add_epi64(acc_odd.vi, _mm_add_epi64(_mm_and_si128(odd1, m4), _mm_and_si128(_mm_srli_epi64(odd1, 4), m4)));
     acc_homset.vi = _mm_add_epi64(acc_homset.vi, _mm_add_epi64(_mm_and_si128(homset1, m4), _mm_and_si128(_mm_srli_epi64(homset1, 4), m4)));
-  } while (vptr < vend);
+  } while (geno_vvec < geno_vvec_end);
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   acc_even.vi = _mm_add_epi64(_mm_and_si128(acc_even.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_even.vi, 8), m8));
   acc_odd.vi = _mm_add_epi64(_mm_and_si128(acc_odd.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_odd.vi, 8), m8));
@@ -6391,8 +6486,8 @@ void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* e
   *homset_ctp += ((acc_homset.u8[0] + acc_homset.u8[1]) * 0x1000100010001LLU) >> 48;
 }
 #else
-void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp) {
-  uintptr_t loader = *lptr++;
+void count_2freq_dbl_24b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict mask1p, const uintptr_t* __restrict mask2p, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp) {
+  uintptr_t loader = *geno_vec++;
   uintptr_t loader2 = *mask1p++;
   uintptr_t loader3 = (loader >> 1) & loader2;
   uintptr_t to_ct1_ab;
@@ -6416,7 +6511,7 @@ void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, ui
   to_ct1_ab = (to_ct1_ab & 0x33333333) + ((to_ct1_ab >> 2) & 0x33333333);
   to_ct2_ab = (to_ct2_ab & 0x33333333) + ((to_ct2_ab >> 2) & 0x33333333);
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *mask1p++;
   loader3 = (loader >> 1) & loader2;
   loader2 &= loader;
@@ -6430,7 +6525,7 @@ void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, ui
   to_ct2_c += loader2 & (~loader3);
   to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *mask1p++;
   loader3 = (loader >> 1) & loader2;
   loader2 &= loader;
@@ -6449,7 +6544,7 @@ void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, ui
   partial2_ab = (to_ct2_ab & 0x0f0f0f0f) + ((to_ct2_ab >> 4) & 0x0f0f0f0f);
   partial2_c = (to_ct2_c & 0x33333333) + ((to_ct2_c >> 2) & 0x33333333);
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *mask1p++;
   loader3 = (loader >> 1) & loader2;
   loader2 &= loader;
@@ -6464,7 +6559,7 @@ void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, ui
   to_ct1_ab = (to_ct1_ab & 0x33333333) + ((to_ct1_ab >> 2) & 0x33333333);
   to_ct2_ab = (to_ct2_ab & 0x33333333) + ((to_ct2_ab >> 2) & 0x33333333);
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *mask1p++;
   loader3 = (loader >> 1) & loader2;
   loader2 &= loader;
@@ -6478,7 +6573,7 @@ void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, ui
   to_ct2_c += loader2 & (~loader3);
   to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *mask1p++;
   loader3 = (loader >> 1) & loader2;
   loader2 &= loader;
@@ -6506,8 +6601,8 @@ void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, ui
   *ct2cp += (partial2_c * 0x01010101) >> 24;
 }
 
-void count_3freq_12(uintptr_t* lptr, uintptr_t* maskp, uint32_t* ctap, uint32_t* ctbp, uint32_t* ctcp) {
-  uintptr_t loader = *lptr++;
+void count_3freq_48b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict maskp, uint32_t* __restrict ctap, uint32_t* __restrict ctbp, uint32_t* __restrict ctcp) {
+  uintptr_t loader = *geno_vec++;
   uintptr_t loader2 = *maskp++;
   uint32_t to_ct_a1 = loader & loader2;
   uint32_t to_ct_b1 = (loader >> 1) & loader2;
@@ -6519,31 +6614,31 @@ void count_3freq_12(uintptr_t* lptr, uintptr_t* maskp, uint32_t* ctap, uint32_t*
   uintptr_t partial_a;
   uintptr_t partial_b;
   uintptr_t partial_c;
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   loader3 = (loader >> 1) & loader2;
   to_ct_a1 += loader & loader2;
   to_ct_b1 += loader3;
   to_ct_c1 += loader & loader3;
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   loader3 = (loader >> 1) & loader2;
   to_ct_a1 += loader & loader2;
   to_ct_b1 += loader3;
   to_ct_c1 += loader & loader3;
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   to_ct_a2 = loader & loader2;
   to_ct_b2 = (loader >> 1) & loader2;
   to_ct_c2 = loader & to_ct_b2;
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   loader3 = (loader >> 1) & loader2;
   to_ct_a2 += loader & loader2;
   to_ct_b2 += loader3;
   to_ct_c2 += loader & loader3;
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   loader3 = (loader >> 1) & loader2;
   to_ct_a2 += loader & loader2;
@@ -6560,36 +6655,36 @@ void count_3freq_12(uintptr_t* lptr, uintptr_t* maskp, uint32_t* ctap, uint32_t*
   to_ct_c1 += (to_ct_c2 & 0x33333333) + ((to_ct_c2 >> 2) & 0x33333333);
   partial_c = (to_ct_c1 & 0x0f0f0f0f) + ((to_ct_c1 >> 4) & 0x0f0f0f0f);
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   to_ct_a1 = loader & loader2;
   to_ct_b1 = (loader >> 1) & loader2;
   to_ct_c1 = loader & to_ct_b1;
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   loader3 = (loader >> 1) & loader2;
   to_ct_a1 += loader & loader2;
   to_ct_b1 += loader3;
   to_ct_c1 += loader & loader3;
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   loader3 = (loader >> 1) & loader2;
   to_ct_a1 += loader & loader2;
   to_ct_b1 += loader3;
   to_ct_c1 += loader & loader3;
 
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   to_ct_a2 = loader & loader2;
   to_ct_b2 = (loader >> 1) & loader2;
   to_ct_c2 = loader & to_ct_b2;
-  loader = *lptr++;
+  loader = *geno_vec++;
   loader2 = *maskp++;
   loader3 = (loader >> 1) & loader2;
   to_ct_a2 += loader & loader2;
   to_ct_b2 += loader3;
   to_ct_c2 += loader & loader3;
-  loader = *lptr;
+  loader = *geno_vec;
   loader2 = *maskp;
   loader3 = (loader >> 1) & loader2;
   to_ct_a2 += loader & loader2;
@@ -6613,7 +6708,7 @@ void count_3freq_12(uintptr_t* lptr, uintptr_t* maskp, uint32_t* ctap, uint32_t*
 #endif
 
 #ifdef __LP64__
-void count_set_freq_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_60v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
@@ -6623,8 +6718,8 @@ void count_set_freq_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, uint
   __m128i odds;
   __m128i evens;
   __m128i missings;
-  __uni16 acc;
-  __uni16 accm;
+  __univec acc;
+  __univec accm;
   acc.vi = _mm_setzero_si128();
   accm.vi = _mm_setzero_si128();
   do {
@@ -6668,12 +6763,12 @@ void count_set_freq_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, uint
   *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
 }
 
-void count_set_freq_hap_120v(__m128i* vptr, __m128i* vend, __m128i* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_hap_120v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
-  __uni16 acc;
-  __uni16 accm;
+  __univec acc;
+  __univec accm;
   __m128i loader;
   __m128i loader2;
   __m128i loader3;
@@ -6728,7 +6823,7 @@ void count_set_freq_hap_120v(__m128i* vptr, __m128i* vend, __m128i* include_vec,
   *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
 }
 
-void count_set_freq_x_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, __m128i* male_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_x_60v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, const __m128i* __restrict male_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
@@ -6741,8 +6836,8 @@ void count_set_freq_x_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, __
   __m128i missings_nm;
   __m128i missings_m;
   __m128i males;
-  __uni16 acc;
-  __uni16 accm;
+  __univec acc;
+  __univec accm;
   acc.vi = _mm_setzero_si128();
   accm.vi = _mm_setzero_si128();
   do {
@@ -6798,7 +6893,7 @@ void count_set_freq_x_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, __
   *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
 }
 
-void count_set_freq_y_120v(__m128i* vptr, __m128i* vend, __m128i* include_vec, __m128i* nonmale_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_y_120v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, const __m128i* __restrict nonmale_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
@@ -6810,8 +6905,8 @@ void count_set_freq_y_120v(__m128i* vptr, __m128i* vend, __m128i* include_vec, _
   __m128i missings1;
   __m128i sets2;
   __m128i missings2;
-  __uni16 acc;
-  __uni16 accm;
+  __univec acc;
+  __univec accm;
   acc.vi = _mm_setzero_si128();
   accm.vi = _mm_setzero_si128();
   do {
@@ -6869,7 +6964,7 @@ void count_set_freq_y_120v(__m128i* vptr, __m128i* vend, __m128i* include_vec, _
   *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
 }
 
-uintptr_t count_01_vecs(__m128i* vptr, uintptr_t vct) {
+uintptr_t count_01_vecs(const __m128i* vptr, uintptr_t vct) {
   // counts number of aligned 01s (i.e. PLINK missing genotypes) in
   // [vptr, vend).  Assumes number of words in interval is a multiple of 12.
   const __m128i m1 = {FIVEMASK, FIVEMASK};
@@ -6877,12 +6972,12 @@ uintptr_t count_01_vecs(__m128i* vptr, uintptr_t vct) {
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
   uintptr_t tot = 0;
-  __m128i* vend;
+  const __m128i* vend;
   __m128i loader1;
   __m128i loader2;
   __m128i count1;
   __m128i count2;
-  __uni16 acc;
+  __univec acc;
 
   while (vct >= 60) {
     vct -= 60;
@@ -6918,7 +7013,7 @@ uintptr_t count_01_vecs(__m128i* vptr, uintptr_t vct) {
 }
 
 #else
-void count_set_freq_6(uintptr_t* lptr, uintptr_t* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_6(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   uintptr_t loader = *lptr++;
   uintptr_t loader2 = loader >> 1;
   uintptr_t loader3 = *include_vec++;
@@ -6981,7 +7076,7 @@ void count_set_freq_6(uintptr_t* lptr, uintptr_t* include_vec, uint32_t* set_ctp
   *missing_ctp += (accm * 0x01010101) >> 24;
 }
 
-void count_set_freq_hap_12(uintptr_t* lptr, uintptr_t* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_hap_12(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   uintptr_t loader = *lptr++;
   uintptr_t loader2 = loader >> 1;
   uintptr_t loader3 = *include_vec++;
@@ -7072,7 +7167,7 @@ void count_set_freq_hap_12(uintptr_t* lptr, uintptr_t* include_vec, uint32_t* se
   *missing_ctp += (accm * 0x01010101) >> 24;
 }
 
-void count_set_freq_x_6(uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* male_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_x_6(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, const uintptr_t* __restrict male_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   uintptr_t loader = *lptr++;
   uintptr_t loader2 = loader >> 1;
   uintptr_t loader3 = *include_vec++;
@@ -7172,7 +7267,7 @@ void count_set_freq_x_6(uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* male
   *missing_ctp += (accm * 0x01010101) >> 24;
 }
 
-void count_set_freq_y_12(uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* nonmale_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void count_set_freq_y_12(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, const uintptr_t* __restrict nonmale_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   uintptr_t loader = *lptr++;
   uintptr_t loader2 = loader >> 1;
   uintptr_t loader3 = *include_vec++;
@@ -7276,7 +7371,7 @@ void count_set_freq_y_12(uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* non
   *missing_ctp += (accm * 0x01010101) >> 24;
 }
 
-uintptr_t count_01_12(uintptr_t* lptr) {
+uintptr_t count_01_12(const uintptr_t* lptr) {
   uintptr_t loader1 = *lptr++;
   uintptr_t loader2 = *lptr++;
   uintptr_t count1 = loader1 & (~(loader1 >> 1)) & FIVEMASK;
@@ -7315,14 +7410,14 @@ uintptr_t count_01_12(uintptr_t* lptr) {
 }
 #endif
 
-void vec_set_freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
-  // Assuming include_vec describes e.g. cases, and an autosomal marker, this
-  // counts the number of case set alleles loaded in lptr[], as well as the
-  // number of cases with missing genotype info.
+void genovec_set_freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
+  // Assuming include_quatervec describes e.g. cases, and an autosomal marker,
+  // this counts the number of case set alleles loaded in geno_vec[], as well
+  // as the number of cases with missing genotype info.
   // See single_marker_freqs_and_hwe() for discussion.
   // missing count: popcount2(genotype & (~(genotype >> 1)) & 0x5555...)
   // set allele count: popcount(genotype) - missing count
-  uintptr_t* lptr_end = &(lptr[sample_ctl2]);
+  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
   uintptr_t loader;
   uintptr_t loader2;
   uintptr_t missing_incr;
@@ -7330,31 +7425,31 @@ void vec_set_freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec
   uint32_t accm = 0;
 #ifdef __LP64__
   uintptr_t cur_decr = 60;
-  uintptr_t* lptr_6x_end;
+  const uintptr_t* geno_vec_6x_end;
   sample_ctl2 -= sample_ctl2 % 6;
   while (sample_ctl2 >= 60) {
-  vec_set_freq_loop:
-    lptr_6x_end = &(lptr[cur_decr]);
-    count_set_freq_60v((__m128i*)lptr, (__m128i*)lptr_6x_end, (__m128i*)include_vec, &acc, &accm);
-    lptr = lptr_6x_end;
-    include_vec = &(include_vec[cur_decr]);
+  genovec_set_freq_loop:
+    geno_vec_6x_end = &(geno_vec[cur_decr]);
+    count_set_freq_60v((const __m128i*)geno_vec, (const __m128i*)geno_vec_6x_end, (const __m128i*)include_quatervec, &acc, &accm);
+    geno_vec = geno_vec_6x_end;
+    include_quatervec = &(include_quatervec[cur_decr]);
     sample_ctl2 -= cur_decr;
   }
   if (sample_ctl2) {
     cur_decr = sample_ctl2;
-    goto vec_set_freq_loop;
+    goto genovec_set_freq_loop;
   }
 #else
-  uintptr_t* lptr_six_end = &(lptr[sample_ctl2 - (sample_ctl2 % 6)]);
-  while (lptr < lptr_six_end) {
-    count_set_freq_6(lptr, include_vec, &acc, &accm);
-    lptr = &(lptr[6]);
-    include_vec = &(include_vec[6]);
+  const uintptr_t* geno_vec_six_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 6)]);
+  while (geno_vec < geno_vec_six_end) {
+    count_set_freq_6(geno_vec, include_quatervec, &acc, &accm);
+    geno_vec = &(geno_vec[6]);
+    include_quatervec = &(include_quatervec[6]);
   }
 #endif
-  while (lptr < lptr_end) {
-    loader = *lptr++;
-    loader2 = *include_vec++;
+  while (geno_vec < geno_vec_end) {
+    loader = *geno_vec++;
+    loader2 = *include_quatervec++;
     missing_incr = popcount2_long(loader & (~(loader >> 1)) & loader2);
     accm += missing_incr;
     acc += popcount_long(loader & (loader2 * 3)) - missing_incr;
@@ -7363,10 +7458,10 @@ void vec_set_freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec
   *missing_ctp = accm;
 }
 
-void vec_set_freq_x(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* male_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void genovec_set_freq_x(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict male_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   // diploid counting for nonmales, haploid counting for males
   // missing_ct := male_obs + male_missing + 2 * female_missing
-  uintptr_t* lptr_end = &(lptr[sample_ctl2]);
+  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
   uintptr_t loader;
   uintptr_t loader2;
   uintptr_t loader3;
@@ -7376,40 +7471,40 @@ void vec_set_freq_x(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_v
   uint32_t accm = 0;
 #ifdef __LP64__
   uintptr_t cur_decr = 60;
-  uintptr_t* lptr_6x_end;
+  const uintptr_t* geno_vec_6x_end;
   sample_ctl2 -= sample_ctl2 % 6;
   while (sample_ctl2 >= 60) {
-  vec_set_freq_x_loop:
-    lptr_6x_end = &(lptr[cur_decr]);
-    count_set_freq_x_60v((__m128i*)lptr, (__m128i*)lptr_6x_end, (__m128i*)include_vec, (__m128i*)male_vec, &acc, &accm);
-    lptr = lptr_6x_end;
-    include_vec = &(include_vec[cur_decr]);
-    male_vec = &(male_vec[cur_decr]);
+  genovec_set_freq_x_loop:
+    geno_vec_6x_end = &(geno_vec[cur_decr]);
+    count_set_freq_x_60v((const __m128i*)geno_vec, (const __m128i*)geno_vec_6x_end, (const __m128i*)include_quatervec, (const __m128i*)male_quatervec, &acc, &accm);
+    geno_vec = geno_vec_6x_end;
+    include_quatervec = &(include_quatervec[cur_decr]);
+    male_quatervec = &(male_quatervec[cur_decr]);
     sample_ctl2 -= cur_decr;
   }
   if (sample_ctl2) {
     cur_decr = sample_ctl2;
-    goto vec_set_freq_x_loop;
+    goto genovec_set_freq_x_loop;
   }
 #else
-  uintptr_t* lptr_six_end = &(lptr[sample_ctl2 - (sample_ctl2 % 6)]);
-  while (lptr < lptr_six_end) {
-    count_set_freq_x_6(lptr, include_vec, male_vec, &acc, &accm);
-    lptr = &(lptr[6]);
-    include_vec = &(include_vec[6]);
-    male_vec = &(male_vec[6]);
+  const uintptr_t* geno_vec_six_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 6)]);
+  while (geno_vec < geno_vec_six_end) {
+    count_set_freq_x_6(geno_vec, include_quatervec, male_quatervec, &acc, &accm);
+    geno_vec = &(geno_vec[6]);
+    include_quatervec = &(include_quatervec[6]);
+    male_quatervec = &(male_quatervec[6]);
   }
 #endif
-  while (lptr < lptr_end) {
-    loader = *lptr++;
+  while (geno_vec < geno_vec_end) {
+    loader = *geno_vec++;
     loader2 = loader >> 1;
-    loader3 = *include_vec++;
-    loader4 = loader3 & (~(*male_vec));
+    loader3 = *include_quatervec++;
+    loader4 = loader3 & (~(*male_quatervec));
     missing_incr = popcount2_long(loader & (~loader2) & loader4);
     accm += 2 * missing_incr;
     acc += popcount_long(loader & (loader4 * 3)) - missing_incr;
 
-    loader4 = loader3 & (*male_vec++);
+    loader4 = loader3 & (*male_quatervec++);
     acc += popcount2_long(loader & loader2 & loader4);
     accm += popcount_long(((loader ^ loader2) & loader4) | (loader4 << 1));
   }
@@ -7417,9 +7512,9 @@ void vec_set_freq_x(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_v
   *missing_ctp = accm;
 }
 
-void vec_set_freq_y(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* nonmale_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+void genovec_set_freq_y(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict nonmale_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
   // all nonmales contribute to missing_ct here
-  uintptr_t* lptr_end = &(lptr[sample_ctl2]);
+  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
   uintptr_t loader;
   uintptr_t loader2;
   uintptr_t loader3;
@@ -7428,35 +7523,35 @@ void vec_set_freq_y(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_v
   uint32_t accm = 0;
 #ifdef __LP64__
   uintptr_t cur_decr = 120;
-  uintptr_t* lptr_12x_end;
+  const uintptr_t* geno_vec_12x_end;
   sample_ctl2 -= sample_ctl2 % 12;
   while (sample_ctl2 >= 120) {
-  vec_set_freq_y_loop:
-    lptr_12x_end = &(lptr[cur_decr]);
-    count_set_freq_y_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)include_vec, (__m128i*)nonmale_vec, &acc, &accm);
-    lptr = lptr_12x_end;
-    include_vec = &(include_vec[cur_decr]);
-    nonmale_vec = &(nonmale_vec[cur_decr]);
+  genovec_set_freq_y_loop:
+    geno_vec_12x_end = &(geno_vec[cur_decr]);
+    count_set_freq_y_120v((__m128i*)geno_vec, (__m128i*)geno_vec_12x_end, (__m128i*)include_quatervec, (__m128i*)nonmale_quatervec, &acc, &accm);
+    geno_vec = geno_vec_12x_end;
+    include_quatervec = &(include_quatervec[cur_decr]);
+    nonmale_quatervec = &(nonmale_quatervec[cur_decr]);
     sample_ctl2 -= cur_decr;
   }
   if (sample_ctl2) {
     cur_decr = sample_ctl2;
-    goto vec_set_freq_y_loop;
+    goto genovec_set_freq_y_loop;
   }
 #else
-  uintptr_t* lptr_twelve_end = &(lptr[sample_ctl2 - (sample_ctl2 % 12)]);
-  while (lptr < lptr_twelve_end) {
-    count_set_freq_y_12(lptr, include_vec, nonmale_vec, &acc, &accm);
-    lptr = &(lptr[12]);
-    include_vec = &(include_vec[12]);
-    nonmale_vec = &(nonmale_vec[12]);
+  const uintptr_t* geno_vec_twelve_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 12)]);
+  while (geno_vec < geno_vec_twelve_end) {
+    count_set_freq_y_12(geno_vec, include_quatervec, nonmale_quatervec, &acc, &accm);
+    geno_vec = &(geno_vec[12]);
+    include_quatervec = &(include_quatervec[12]);
+    nonmale_quatervec = &(nonmale_quatervec[12]);
   }
 #endif
-  while (lptr < lptr_end) {
-    loader = *lptr++;
+  while (geno_vec < geno_vec_end) {
+    loader = *geno_vec++;
     loader2 = loader >> 1;
-    loader3 = *include_vec++;
-    loader4 = *nonmale_vec++;
+    loader3 = *include_quatervec++;
+    loader4 = *nonmale_quatervec++;
     acc += popcount2_long(loader & loader2 & loader3 & (~loader4));
     accm += popcount2_long(loader3 & ((loader ^ loader2) | loader4));
   }
@@ -7464,9 +7559,9 @@ void vec_set_freq_y(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_v
   *missing_ctp = accm;
 }
 
-void vec_3freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uint32_t* missing_ctp, uint32_t* het_ctp, uint32_t* homset_ctp) {
+void genovec_3freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict missing_ctp, uint32_t* __restrict het_ctp, uint32_t* __restrict homset_ctp) {
   // generic routine for getting all counts.
-  uintptr_t* lptr_end = &(lptr[sample_ctl2]);
+  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
   uintptr_t loader;
   uintptr_t loader2;
   uintptr_t loader3;
@@ -7475,31 +7570,31 @@ void vec_3freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, u
   uint32_t acc_and = 0;
 #ifdef __LP64__
   uintptr_t cur_decr = 120;
-  uintptr_t* lptr_12x_end;
+  const uintptr_t* geno_vec_12x_end;
   sample_ctl2 -= sample_ctl2 % 12;
   while (sample_ctl2 >= 120) {
-  vec_3freq_loop:
-    lptr_12x_end = &(lptr[cur_decr]);
-    count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)include_vec, &acc_even, &acc_odd, &acc_and);
-    lptr = lptr_12x_end;
-    include_vec = &(include_vec[cur_decr]);
+  genovec_3freq_loop:
+    geno_vec_12x_end = &(geno_vec[cur_decr]);
+    count_3freq_1920b((const __m128i*)geno_vec, (const __m128i*)geno_vec_12x_end, (const __m128i*)include_quatervec, &acc_even, &acc_odd, &acc_and);
+    geno_vec = geno_vec_12x_end;
+    include_quatervec = &(include_quatervec[cur_decr]);
     sample_ctl2 -= cur_decr;
   }
   if (sample_ctl2) {
     cur_decr = sample_ctl2;
-    goto vec_3freq_loop;
+    goto genovec_3freq_loop;
   }
 #else
-  uintptr_t* lptr_twelve_end = &(lptr[sample_ctl2 - (sample_ctl2 % 12)]);
-  while (lptr < lptr_twelve_end) {
-    count_3freq_12(lptr, include_vec, &acc_even, &acc_odd, &acc_and);
-    lptr = &(lptr[12]);
-    include_vec = &(include_vec[12]);
+  const uintptr_t* geno_vec_twelve_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 12)]);
+  while (geno_vec < geno_vec_twelve_end) {
+    count_3freq_48b(geno_vec, include_quatervec, &acc_even, &acc_odd, &acc_and);
+    geno_vec = &(geno_vec[12]);
+    include_quatervec = &(include_quatervec[12]);
   }
 #endif
-  while (lptr < lptr_end) {
-    loader = *lptr++;
-    loader2 = *include_vec++;
+  while (geno_vec < geno_vec_end) {
+    loader = *geno_vec++;
+    loader2 = *include_quatervec++;
     loader3 = loader2 & (loader >> 1);
     acc_even += popcount2_long(loader & loader2);
     acc_odd += popcount2_long(loader3);
@@ -7510,45 +7605,46 @@ void vec_3freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, u
   *homset_ctp = acc_and;
 }
 
-uintptr_t count_01(uintptr_t* lptr, uintptr_t word_ct) {
+uintptr_t count_01(const uintptr_t* quatervec, uintptr_t word_ct) {
   // really just for getting a missing count
-  // unlike popcount01_longs, this does not assume lptr[] has no 11s
-  uintptr_t* lptr_end = &(lptr[word_ct]);
+  // unlike popcount01_longs, this does not assume quatervec[] has no 11s
+  const uintptr_t* quatervec_end = &(quatervec[word_ct]);
   uintptr_t loader;
 #ifdef __LP64__
   uintptr_t acc;
   word_ct -= word_ct % 12;
-  acc = count_01_vecs((__m128i*)lptr, word_ct / 2);
-  lptr = &(lptr[word_ct]);
+  acc = count_01_vecs((__m128i*)quatervec, word_ct / 2);
+  quatervec = &(quatervec[word_ct]);
 #else
-  uintptr_t* lptr_twelve_end = &(lptr[word_ct - (word_ct % 12)]);
+  const uintptr_t* quatervec_twelve_end = &(quatervec[word_ct - (word_ct % 12)]);
   uintptr_t acc = 0;
-  while (lptr < lptr_twelve_end) {
-    acc += count_01_12(lptr);
-    lptr = &(lptr[12]);
+  while (quatervec < quatervec_twelve_end) {
+    acc += count_01_12(quatervec);
+    quatervec = &(quatervec[12]);
   }
 #endif
-  while (lptr < lptr_end) {
-    loader = *lptr++;
+  while (quatervec < quatervec_end) {
+    loader = *quatervec++;
     acc += popcount2_long(loader & (~(loader >> 1)) & FIVEMASK);
   }
   return acc;
 }
 
-void fill_all_bits(uintptr_t* bit_arr, uintptr_t ct) {
+void fill_all_bits(uintptr_t ct, uintptr_t* bitarr) {
   // leaves bits beyond the end unset
   // ok for ct == 0
   uintptr_t quotient = ct / BITCT;
   uintptr_t remainder = ct % BITCT;
-  fill_ulong_one(bit_arr, quotient);
+  fill_ulong_one(bitarr, quotient);
   if (remainder) {
-    bit_arr[quotient] = (ONELU << remainder) - ONELU;
+    bitarr[quotient] = (ONELU << remainder) - ONELU;
   }
 }
 
-uint32_t numeric_range_list_to_bitfield(Range_list* range_list_ptr, uint32_t item_ct, uintptr_t* bitfield, uint32_t offset, uint32_t ignore_overflow) {
-  char* names = range_list_ptr->names;
-  unsigned char* starts_range = range_list_ptr->starts_range;
+uint32_t numeric_range_list_to_bitarr(const Range_list* range_list_ptr, uint32_t item_ct, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr) {
+  // bitarr assumed to be initialized
+  const char* names = range_list_ptr->names;
+  const unsigned char* starts_range = range_list_ptr->starts_range;
   uint32_t name_ct = range_list_ptr->name_ct;
   uint32_t name_max_len = range_list_ptr->name_max_len;
   uint32_t idx_max = item_ct + offset;
@@ -7556,7 +7652,7 @@ uint32_t numeric_range_list_to_bitfield(Range_list* range_list_ptr, uint32_t ite
   uint32_t idx1;
   uint32_t idx2;
   for (name_idx = 0; name_idx < name_ct; name_idx++) {
-    if (scan_uint_capped(&(names[name_idx * name_max_len]), &idx1, idx_max / 10, idx_max % 10)) {
+    if (scan_uint_capped(&(names[name_idx * name_max_len]), idx_max / 10, idx_max % 10, &idx1)) {
       if (ignore_overflow) {
 	continue;
       }
@@ -7564,21 +7660,22 @@ uint32_t numeric_range_list_to_bitfield(Range_list* range_list_ptr, uint32_t ite
     }
     if (starts_range[name_idx]) {
       name_idx++;
-      if (scan_uint_capped(&(names[name_idx * name_max_len]), &idx2, idx_max / 10, idx_max % 10)) {
+      if (scan_uint_capped(&(names[name_idx * name_max_len]), idx_max / 10, idx_max % 10, &idx2)) {
 	if (!ignore_overflow) {
 	  return 1;
 	}
         idx2 = idx_max - 1;
       }
-      fill_bits(bitfield, idx1 - offset, (idx2 - idx1) + 1);
+      fill_bits(idx1 - offset, (idx2 - idx1) + 1, bitarr);
     } else {
-      set_bit(bitfield, idx1 - offset);
+      set_bit(idx1 - offset, bitarr);
     }
   }
   return 0;
 }
 
-int32_t string_range_list_to_bitfield(char* header_line, uint32_t item_ct, uint32_t fixed_len, Range_list* range_list_ptr, char* sorted_ids, uint32_t* id_map, int32_t* seen_idxs, const char* range_list_flag, const char* file_descrip, uintptr_t* bitfield) {
+int32_t string_range_list_to_bitarr(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t* bitarr, int32_t* __restrict seen_idxs) {
+  // bitarr assumed to be initialized
   // if fixed_len is zero, header_line is assumed to be a list of
   // space-delimited unequal-length names
   uintptr_t max_id_len = range_list_ptr->name_max_len;
@@ -7594,18 +7691,18 @@ int32_t string_range_list_to_bitfield(char* header_line, uint32_t item_ct, uint3
     if (ii != -1) {
       cmdline_pos = id_map[(uint32_t)ii];
       if (seen_idxs[cmdline_pos] != -1) {
-	sprintf(logbuf, "Error: Duplicate --%s token in %s.\n", range_list_flag, file_descrip);
-        goto string_range_list_to_bitfield_ret_INVALID_FORMAT_2;
+	sprintf(g_logbuf, "Error: Duplicate --%s token in %s.\n", range_list_flag, file_descrip);
+        goto string_range_list_to_bitarr_ret_INVALID_FORMAT_2;
       }
       seen_idxs[cmdline_pos] = item_idx;
       if (cmdline_pos && range_list_ptr->starts_range[cmdline_pos - 1]) {
         if (seen_idxs[cmdline_pos - 1] == -1) {
           LOGPREPRINTFWW("Error: Second element of --%s range appears before first element in %s.\n", range_list_flag, file_descrip);
-          goto string_range_list_to_bitfield_ret_INVALID_CMDLINE_2;
+          goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_2;
 	}
-	fill_bits(bitfield, seen_idxs[cmdline_pos - 1], (item_idx - seen_idxs[cmdline_pos - 1]) + 1);
+	fill_bits(seen_idxs[cmdline_pos - 1], (item_idx - seen_idxs[cmdline_pos - 1]) + 1, bitarr);
       } else if (!(range_list_ptr->starts_range[cmdline_pos])) {
-	SET_BIT(bitfield, item_idx);
+	SET_BIT(item_idx, bitarr);
       }
     }
     if (++item_idx == item_ct) {
@@ -7619,17 +7716,17 @@ int32_t string_range_list_to_bitfield(char* header_line, uint32_t item_ct, uint3
   }
   for (cmdline_pos = 0; cmdline_pos < name_ct; cmdline_pos++) {
     if (seen_idxs[cmdline_pos] == -1) {
-      goto string_range_list_to_bitfield_ret_INVALID_CMDLINE_3;
+      goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_3;
     }
   }
   while (0) {
-  string_range_list_to_bitfield_ret_INVALID_CMDLINE_3:
-    sprintf(logbuf, "Error: Missing --%s token in %s.\n", range_list_flag, file_descrip);
-  string_range_list_to_bitfield_ret_INVALID_CMDLINE_2:
+  string_range_list_to_bitarr_ret_INVALID_CMDLINE_3:
+    sprintf(g_logbuf, "Error: Missing --%s token in %s.\n", range_list_flag, file_descrip);
+  string_range_list_to_bitarr_ret_INVALID_CMDLINE_2:
     logerrprintb();
     retval = RET_INVALID_CMDLINE;
     break;
-  string_range_list_to_bitfield_ret_INVALID_FORMAT_2:
+  string_range_list_to_bitarr_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
     break;
@@ -7637,37 +7734,36 @@ int32_t string_range_list_to_bitfield(char* header_line, uint32_t item_ct, uint3
   return retval;
 }
 
-int32_t string_range_list_to_bitfield_alloc(char* header_line, uint32_t item_ct, uint32_t fixed_len, Range_list* range_list_ptr, uintptr_t** bitfield_ptr, const char* range_list_flag, const char* file_descrip) {
-  // wrapper for string_range_list_to_bitfield which allocates the bitfield and
+int32_t string_range_list_to_bitarr_alloc(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t** bitarr_ptr) {
+  // wrapper for string_range_list_to_bitarr which allocates the bitfield and
   // temporary buffers on the heap
-  uintptr_t item_ctl = (item_ct + (BITCT - 1)) / BITCT;
+  uintptr_t item_ctl = BITCT_TO_WORDCT(item_ct);
   uintptr_t name_ct = range_list_ptr->name_ct;
   int32_t retval = 0;
   int32_t* seen_idxs;
   char* sorted_ids;
   uint32_t* id_map;
-  if (wkspace_alloc_ul_checked(bitfield_ptr, item_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_i_checked(&seen_idxs, name_ct)) {
+  if (bigstack_calloc_ul(item_ctl, bitarr_ptr) ||
+      bigstack_alloc_i(name_ct, &seen_idxs)) {
     return RET_NOMEM;
   }
-  fill_ulong_zero(*bitfield_ptr, item_ctl);
   // kludge to use sort_item_ids()
-  fill_ulong_zero((uintptr_t*)seen_idxs, (name_ct + (BITCT - 1)) / BITCT);
-  if (sort_item_ids(&sorted_ids, &id_map, name_ct, (uintptr_t*)seen_idxs, 0, range_list_ptr->names, range_list_ptr->name_max_len, 0, 0, strcmp_deref)) {
+  fill_ulong_zero((uintptr_t*)seen_idxs, BITCT_TO_WORDCT(name_ct));
+  if (sort_item_ids(name_ct, (uintptr_t*)seen_idxs, 0, range_list_ptr->names, range_list_ptr->name_max_len, 0, 0, strcmp_deref, &sorted_ids, &id_map)) {
     return RET_NOMEM;
   }
   fill_int_one(seen_idxs, name_ct);
-  retval = string_range_list_to_bitfield(header_line, item_ct, fixed_len, range_list_ptr, sorted_ids, id_map, seen_idxs, range_list_flag, file_descrip, *bitfield_ptr);
-  wkspace_reset(seen_idxs);
+  retval = string_range_list_to_bitarr(header_line, item_ct, fixed_len, range_list_ptr, sorted_ids, id_map, range_list_flag, file_descrip, *bitarr_ptr, seen_idxs);
+  bigstack_reset(seen_idxs);
   return retval;
 }
 
-int32_t string_range_list_to_bitfield2(char* sorted_ids, uint32_t* id_map, uintptr_t item_ct, uintptr_t max_id_len, Range_list* range_list_ptr, const char* range_list_flag, uintptr_t* bitfield_excl) {
+int32_t string_range_list_to_bitarr2(const char* __restrict sorted_ids, const uint32_t* id_map, uintptr_t item_ct, uintptr_t max_id_len, const Range_list* __restrict range_list_ptr, const char* __restrict range_list_flag, uintptr_t* bitfield_excl) {
   // sorted_ids/id_map is for e.g. marker IDs instead of command line
   // parameters.  bitfield_excl is assumed to be initialized (since its length
   // is not known by this function).
   char* names = range_list_ptr->names;
-  unsigned char* starts_range = range_list_ptr->starts_range;
+  const unsigned char* starts_range = range_list_ptr->starts_range;
   uintptr_t name_max_len = range_list_ptr->name_max_len;
   uint32_t name_ct = range_list_ptr->name_ct;
   int32_t retval = 0;
@@ -7680,7 +7776,7 @@ int32_t string_range_list_to_bitfield2(char* sorted_ids, uint32_t* id_map, uintp
     bufptr = &(names[param_idx * name_max_len]);
     ii = bsearch_str_nl(bufptr, sorted_ids, max_id_len, item_ct);
     if (ii == -1) {
-      goto string_range_list_to_bitfield2_ret_INVALID_CMDLINE_3;
+      goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3;
     }
     item_uidx = id_map[(uint32_t)ii];
     if (starts_range[param_idx]) {
@@ -7688,22 +7784,22 @@ int32_t string_range_list_to_bitfield2(char* sorted_ids, uint32_t* id_map, uintp
       bufptr = &(names[param_idx * name_max_len]);
       ii = bsearch_str_nl(bufptr, sorted_ids, max_id_len, item_ct);
       if (ii == -1) {
-        goto string_range_list_to_bitfield2_ret_INVALID_CMDLINE_3;
+        goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3;
       }
       item_uidx2 = id_map[(uint32_t)ii];
       if (item_uidx2 < item_uidx) {
-	sprintf(logbuf, "Error: Second element of --%s range appears before first.\n", range_list_flag);
-	goto string_range_list_to_bitfield2_ret_INVALID_CMDLINE_2;
+	sprintf(g_logbuf, "Error: Second element of --%s range appears before first.\n", range_list_flag);
+	goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_2;
       }
-      clear_bits(bitfield_excl, item_uidx, item_uidx2 - item_uidx + 1);
+      clear_bits(item_uidx, item_uidx2 - item_uidx + 1, bitfield_excl);
     } else {
-      clear_bit(bitfield_excl, item_uidx);
+      clear_bit(item_uidx, bitfield_excl);
     }
   }
   while (0) {
-  string_range_list_to_bitfield2_ret_INVALID_CMDLINE_3:
-    sprintf(logbuf, "Error: --%s ID not found.\n", range_list_flag);
-  string_range_list_to_bitfield2_ret_INVALID_CMDLINE_2:
+  string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3:
+    sprintf(g_logbuf, "Error: --%s ID not found.\n", range_list_flag);
+  string_range_list_to_bitarr2_ret_INVALID_CMDLINE_2:
     logerrprintb();
     retval = RET_INVALID_CMDLINE;
     break;
@@ -7711,7 +7807,7 @@ int32_t string_range_list_to_bitfield2(char* sorted_ids, uint32_t* id_map, uintp
   return retval;
 }
 
-uint32_t count_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt) {
+uint32_t count_non_autosomal_markers(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt) {
   // for backward compatibility, unplaced markers are considered to be
   // autosomal here
   uint32_t ct = 0;
@@ -7719,19 +7815,19 @@ uint32_t count_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t* mark
   int32_t y_code = chrom_info_ptr->y_code;
   int32_t mt_code = chrom_info_ptr->mt_code;
   if (count_x && (x_code != -1)) {
-    ct += count_chrom_markers(chrom_info_ptr, x_code, marker_exclude);
+    ct += count_chrom_markers(chrom_info_ptr, marker_exclude, x_code);
   }
   if (y_code != -1) {
-    ct += count_chrom_markers(chrom_info_ptr, y_code, marker_exclude);
+    ct += count_chrom_markers(chrom_info_ptr, marker_exclude, y_code);
   }
   if (count_mt && (mt_code != -1)) {
-    ct += count_chrom_markers(chrom_info_ptr, mt_code, marker_exclude);
+    ct += count_chrom_markers(chrom_info_ptr, marker_exclude, mt_code);
   }
   return ct;
 }
 
-int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr) {
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+int32_t conditional_allocate_non_autosomal_markers(const Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, const uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr) {
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   int32_t x_code = chrom_info_ptr->x_code;
   int32_t y_code = chrom_info_ptr->y_code;
   int32_t mt_code = chrom_info_ptr->mt_code;
@@ -7742,13 +7838,13 @@ int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, u
     *newly_excluded_ct_ptr = marker_ct;
   } else {
     if (count_x && (x_code != -1)) {
-      x_ct = count_chrom_markers(chrom_info_ptr, x_code, marker_exclude_orig);
+      x_ct = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, x_code);
     }
     if (y_code != -1) {
-      y_ct = count_chrom_markers(chrom_info_ptr, y_code, marker_exclude_orig);
+      y_ct = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, y_code);
     }
     if (count_mt && (mt_code != -1)) {
-      mt_ct = count_chrom_markers(chrom_info_ptr, mt_code, marker_exclude_orig);
+      mt_ct = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, mt_code);
     }
     *newly_excluded_ct_ptr = x_ct + y_ct + mt_ct;
   }
@@ -7762,30 +7858,30 @@ int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, u
   if (!(*newly_excluded_ct_ptr)) {
     return 0;
   }
-  if (wkspace_alloc_ul_checked(marker_exclude_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_marker_ctl, marker_exclude_ptr)) {
     return RET_NOMEM;
   }
   memcpy(*marker_exclude_ptr, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
   if (x_ct) {
-    fill_bits(*marker_exclude_ptr, chrom_info_ptr->chrom_start[(uint32_t)x_code], chrom_info_ptr->chrom_end[(uint32_t)x_code] - chrom_info_ptr->chrom_start[(uint32_t)x_code]);
+    fill_bits(chrom_info_ptr->chrom_start[(uint32_t)x_code], chrom_info_ptr->chrom_end[(uint32_t)x_code] - chrom_info_ptr->chrom_start[(uint32_t)x_code], *marker_exclude_ptr);
   }
   if (y_ct) {
-    fill_bits(*marker_exclude_ptr, chrom_info_ptr->chrom_start[(uint32_t)y_code], chrom_info_ptr->chrom_end[(uint32_t)y_code] - chrom_info_ptr->chrom_start[(uint32_t)y_code]);
+    fill_bits(chrom_info_ptr->chrom_start[(uint32_t)y_code], chrom_info_ptr->chrom_end[(uint32_t)y_code] - chrom_info_ptr->chrom_start[(uint32_t)y_code], *marker_exclude_ptr);
   }
   if (mt_ct) {
-    fill_bits(*marker_exclude_ptr, chrom_info_ptr->chrom_start[(uint32_t)mt_code], chrom_info_ptr->chrom_end[(uint32_t)mt_code] - chrom_info_ptr->chrom_start[(uint32_t)mt_code]);
+    fill_bits(chrom_info_ptr->chrom_start[(uint32_t)mt_code], chrom_info_ptr->chrom_end[(uint32_t)mt_code] - chrom_info_ptr->chrom_start[(uint32_t)mt_code], *marker_exclude_ptr);
   }
   return 0;
 }
 
-uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr) {
+uint32_t get_max_chrom_size(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr) {
   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
   uint32_t max_chrom_size = 0;
   uint32_t last_chrom_fo_idx = 0;
   uint32_t chrom_fo_idx;
   uint32_t cur_chrom_size;
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
-    cur_chrom_size = count_chrom_markers(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], marker_exclude);
+    cur_chrom_size = count_chrom_markers(chrom_info_ptr, marker_exclude, chrom_info_ptr->chrom_file_order[chrom_fo_idx]);
     if (cur_chrom_size) {
       last_chrom_fo_idx = chrom_fo_idx;
       if (cur_chrom_size > max_chrom_size) {
@@ -7799,7 +7895,7 @@ uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclud
   return max_chrom_size;
 }
 
-void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uint32_t* male_ct_ptr, uint32_t* female_ct_ptr, uint32_t* unk_ct_ptr) {
+void count_genders(const uintptr_t* __restrict sex_nm, const uintptr_t* __restrict sex_male, const uintptr_t* __restrict sample_exclude, uintptr_t unfiltered_sample_ct, uint32_t* __restrict male_ct_ptr, uint32_t* __restrict female_ct_ptr, uint32_t* __restrict unk_ct_ptr) {
   // unfiltered_sample_ct can be zero
   uint32_t male_ct = 0;
   uint32_t female_ct = 0;
@@ -7829,7 +7925,7 @@ void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_
   *unk_ct_ptr = unk_ct;
 }
 
-void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct) {
+void reverse_loadbuf(uintptr_t unfiltered_sample_ct, unsigned char* loadbuf) {
   // unfiltered_sample_ct can be zero
   uintptr_t sample_bidx = 0;
   unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
@@ -7894,109 +7990,423 @@ void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct) {
   }
 }
 
-void collapse_copy_2bitarr(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t* sample_exclude) {
-  uintptr_t cur_write = 0;
-  uint32_t sample_uidx = 0;
-  uint32_t sample_idx = 0;
-  uint32_t ii_rem = 0;
-  uint32_t sample_uidx_stop;
-  // just copy first words when possible
-  if (!sample_exclude[0]) {
-    sample_uidx = next_set(sample_exclude, 0, unfiltered_sample_ct & (~(BITCT2 - 1))) & (~(BITCT2 - 1));
-    memcpy(mainbuf, rawbuf, sample_uidx / 4);
-    sample_idx = sample_uidx;
-    mainbuf = &(mainbuf[sample_uidx / BITCT2]);
-  }
-  while (sample_idx < sample_ct) {
-    sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx);
-    sample_uidx_stop = next_set(sample_exclude, sample_uidx, unfiltered_sample_ct);
-    sample_idx += sample_uidx_stop - sample_uidx;
-    do {
-      // er, this can totally be sped up
-      cur_write |= EXTRACT_2BIT_GENO(rawbuf, sample_uidx) << (ii_rem * 2);
-      if (++ii_rem == BITCT2) {
-        *mainbuf++ = cur_write;
-        cur_write = 0;
-        ii_rem = 0;
+// deprecated, try to just use copy_quaterarr_nonempty_subset()
+void copy_quaterarr_nonempty_subset_excl(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_excl, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr) {
+  assert(subset_size);
+  assert(raw_quaterarr_size >= subset_size);
+  uintptr_t cur_output_word = 0;
+  uintptr_t* output_quaterarr_last = &(output_quaterarr[subset_size / BITCT2]);
+  const uint32_t word_write_halfshift_end = subset_size % BITCT2;
+  uint32_t word_write_halfshift = 0;
+  // if < 2/3-filled, use sparse copy algorithm
+  if (subset_size * (3 * ONELU) < raw_quaterarr_size * (2 * ONELU)) {
+    const uint32_t subset_excl_widx_last = raw_quaterarr_size / BITCT;
+    uint32_t subset_excl_widx = 0;
+    while (1) {
+      uintptr_t cur_include_word = ~subset_excl[subset_excl_widx];
+
+      // this, kiddies, is why exclude masks were a mistake.
+      if (subset_excl_widx == subset_excl_widx_last) {
+	cur_include_word &= (ONELU << (raw_quaterarr_size % BITCT)) - ONELU;
+      }
+      
+      if (cur_include_word) {
+	uint32_t wordhalf_idx = 0;
+#ifdef __LP64__
+	uint32_t cur_include_halfword = (uint32_t)cur_include_word;
+#else
+	uint32_t cur_include_halfword = (uint16_t)cur_include_word;
+#endif
+	while (1) {
+	  if (cur_include_halfword) {
+	    uintptr_t raw_quaterarr_word = raw_quaterarr[subset_excl_widx * 2 + wordhalf_idx];
+	    do {
+	      uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
+	      cur_output_word |= ((raw_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
+	      if (++word_write_halfshift == BITCT2) {
+		*output_quaterarr++ = cur_output_word;
+		word_write_halfshift = 0;
+		cur_output_word = 0;
+	      }
+	      cur_include_halfword &= cur_include_halfword - 1;
+	    } while (cur_include_halfword);
+	  }
+	  if (wordhalf_idx) {
+	    break;
+	  }
+	  wordhalf_idx++;
+#ifdef __LP64__
+	  cur_include_halfword = cur_include_word >> 32;
+#else
+	  cur_include_halfword = cur_include_word >> 16;
+#endif
+	}
+	if (output_quaterarr == output_quaterarr_last) {
+	  if (word_write_halfshift == word_write_halfshift_end) {
+            if (word_write_halfshift_end) {
+	      *output_quaterarr_last = cur_output_word;
+	    }
+	    return;
+	  }
+	}
       }
-    } while (++sample_uidx < sample_uidx_stop);
+      subset_excl_widx++;	
+    }
   }
-  if (ii_rem) {
-    *mainbuf = cur_write;
+  // blocked copy
+  const uintptr_t* subset_excl_last = &(subset_excl[raw_quaterarr_size / BITCT]);
+  while (1) {
+    uintptr_t cur_include_word = ~(*subset_excl);
+    if (subset_excl == subset_excl_last) {
+      cur_include_word &= (ONELU << (raw_quaterarr_size % BITCT)) - ONELU;
+    }
+    subset_excl++;
+    uint32_t wordhalf_idx = 0;
+#ifdef __LP64__
+    uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
+#else
+    uint32_t cur_include_halfword = (uint16_t)cur_include_word;
+#endif
+    while (1) {
+      uintptr_t raw_quaterarr_word = *raw_quaterarr++;
+      while (cur_include_halfword) {
+	uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
+	uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
+	uintptr_t raw_quaterarr_curblock_unmasked = raw_quaterarr_word >> (rqa_idx_lowbits * 2);
+	uint32_t rqa_block_len = CTZLU(halfword_invshifted);
+	uint32_t block_len_limit = BITCT2 - word_write_halfshift;
+	cur_output_word |= raw_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
+	if (rqa_block_len < block_len_limit) {
+	  word_write_halfshift += rqa_block_len;
+	  cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
+	} else {
+	  // no need to mask, extra bits vanish off the high end
+	  *output_quaterarr++ = cur_output_word;
+	  word_write_halfshift = rqa_block_len - block_len_limit;
+	  cur_output_word = (raw_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
+	}
+	cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
+      }
+      if (wordhalf_idx) {
+	break;
+      }
+      wordhalf_idx++;
+#ifdef __LP64__
+      cur_include_halfword = cur_include_word >> 32;
+#else
+      cur_include_halfword = cur_include_word >> 16;
+#endif
+    }
+    if (output_quaterarr == output_quaterarr_last) {
+      if (word_write_halfshift == word_write_halfshift_end) {
+	if (word_write_halfshift_end) {
+	  *output_quaterarr_last = cur_output_word;
+	}
+	return;
+      }
+    }
   }
 }
 
-uint32_t load_and_collapse(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* mainbuf, uint32_t sample_ct, uintptr_t* sample_exclude, uintptr_t final_mask, uint32_t do_reverse) {
-  // assumes unfiltered_sample_ct is positive
+uint32_t load_and_collapse(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_exclude, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf) {
+  assert(unfiltered_sample_ct);
   uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   if (unfiltered_sample_ct == sample_ct) {
     rawbuf = mainbuf;
   }
-  if (load_raw(bedfile, rawbuf, unfiltered_sample_ct4)) {
+  if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
     return RET_READ_FAIL;
   }
   if (unfiltered_sample_ct != sample_ct) {
-    collapse_copy_2bitarr(rawbuf, mainbuf, unfiltered_sample_ct, sample_ct, sample_exclude);
+    copy_quaterarr_nonempty_subset_excl(rawbuf, sample_exclude, unfiltered_sample_ct, sample_ct, mainbuf);
   } else {
     rawbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
   }
   if (do_reverse) {
-    reverse_loadbuf((unsigned char*)mainbuf, sample_ct);
+    reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
   }
   return 0;
 }
 
-void collapse_copy_2bitarr_incl(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t* sample_include) {
-  // mirror image of collapse_copy_2bitarr()
-  uintptr_t cur_write = 0;
-  uint32_t sample_uidx = 0;
-  uint32_t sample_idx = 0;
-  uint32_t ii_rem = 0;
-  uint32_t sample_uidx_stop;
-  if (!(~sample_include[0])) {
-    sample_uidx = next_unset(sample_include, 0, unfiltered_sample_ct & (~(BITCT2 - 1))) & (~(BITCT2 - 1));
-    memcpy(mainbuf, rawbuf, sample_uidx / 4);
-    sample_idx = sample_uidx;
-    mainbuf = &(mainbuf[sample_uidx / BITCT2]);
-  }
-  while (sample_idx < sample_ct) {
-    sample_uidx = next_set_unsafe(sample_include, sample_uidx);
-    sample_uidx_stop = next_unset(sample_include, sample_uidx, unfiltered_sample_ct);
-    sample_idx += sample_uidx_stop - sample_uidx;
+void copy_quaterarr_nonempty_subset(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_mask, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr) {
+  // in plink 2.0, we probably want (0-based) bit raw_quaterarr_size of
+  // subset_mask to be always allocated and unset.  This removes a few special
+  // cases re: iterating past the end of arrays.
+  assert(subset_size);
+  assert(raw_quaterarr_size >= subset_size);
+  uintptr_t cur_output_word = 0;
+  uintptr_t* output_quaterarr_last = &(output_quaterarr[subset_size / BITCT2]);
+  const uint32_t word_write_halfshift_end = subset_size % BITCT2;
+  uint32_t word_write_halfshift = 0;
+  // if < 2/3-filled, use sparse copy algorithm
+  if (subset_size * (3 * ONELU) < raw_quaterarr_size * (2 * ONELU)) {
+    uint32_t subset_mask_widx = 0;
+    while (1) {
+      const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
+      if (cur_include_word) {
+	uint32_t wordhalf_idx = 0;
+#ifdef __LP64__
+	uint32_t cur_include_halfword = (uint32_t)cur_include_word;
+#else
+	uint32_t cur_include_halfword = (uint16_t)cur_include_word;
+#endif
+	while (1) {
+	  if (cur_include_halfword) {
+	    uintptr_t raw_quaterarr_word = raw_quaterarr[subset_mask_widx * 2 + wordhalf_idx];
+	    do {
+	      uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
+	      cur_output_word |= ((raw_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
+	      if (++word_write_halfshift == BITCT2) {
+		*output_quaterarr++ = cur_output_word;
+		word_write_halfshift = 0;
+		cur_output_word = 0;
+	      }
+	      cur_include_halfword &= cur_include_halfword - 1;
+	    } while (cur_include_halfword);
+	  }
+	  if (wordhalf_idx) {
+	    break;
+	  }
+	  wordhalf_idx++;
+#ifdef __LP64__
+	  cur_include_halfword = cur_include_word >> 32;
+#else
+	  cur_include_halfword = cur_include_word >> 16;
+#endif
+	}
+	if (output_quaterarr == output_quaterarr_last) {
+	  if (word_write_halfshift == word_write_halfshift_end) {
+            if (word_write_halfshift_end) {
+	      *output_quaterarr_last = cur_output_word;
+	    }
+	    return;
+	  }
+	}
+      }
+      subset_mask_widx++;	
+    }
+  }
+  // blocked copy
+  while (1) {
+    const uintptr_t cur_include_word = *subset_mask++;
+    uint32_t wordhalf_idx = 0;
+#ifdef __LP64__
+    uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
+#else
+    uint32_t cur_include_halfword = (uint16_t)cur_include_word;
+#endif
+    while (1) {
+      uintptr_t raw_quaterarr_word = *raw_quaterarr++;
+      while (cur_include_halfword) {
+	uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
+	uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
+	uintptr_t raw_quaterarr_curblock_unmasked = raw_quaterarr_word >> (rqa_idx_lowbits * 2);
+	uint32_t rqa_block_len = CTZLU(halfword_invshifted);
+	uint32_t block_len_limit = BITCT2 - word_write_halfshift;
+	cur_output_word |= raw_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
+	if (rqa_block_len < block_len_limit) {
+	  word_write_halfshift += rqa_block_len;
+	  cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
+	} else {
+	  // no need to mask, extra bits vanish off the high end
+	  *output_quaterarr++ = cur_output_word;
+	  word_write_halfshift = rqa_block_len - block_len_limit;
+	  if (word_write_halfshift) {
+	    cur_output_word = (raw_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
+	  } else {
+	    // avoid potential right-shift-64
+	    cur_output_word = 0;
+	  }
+	}
+	cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
+      }
+      if (wordhalf_idx) {
+	break;
+      }
+      wordhalf_idx++;
+#ifdef __LP64__
+      cur_include_halfword = cur_include_word >> 32;
+#else
+      cur_include_halfword = cur_include_word >> 16;
+#endif
+    }
+    if (output_quaterarr == output_quaterarr_last) {
+      if (word_write_halfshift == word_write_halfshift_end) {
+	if (word_write_halfshift_end) {
+	  *output_quaterarr_last = cur_output_word;
+	}
+	return;
+      }
+    }
+  }
+}
+
+/*
+void inplace_quaterarr_proper_subset(const uintptr_t* __restrict subset_mask, uint32_t orig_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict main_quaterarr) {
+  assert(orig_quaterarr_size > subset_size);
+  // worthwhile to special-case this since we get to entirely skip
+  // reading/writing these words
+  if (!(~subset_mask[0])) {
+    const uintptr_t* subset_mask_initial = subset_mask;
+    // guaranteed to terminate since orig_quaterarr_size > subset_size.
     do {
-      cur_write |= EXTRACT_2BIT_GENO(rawbuf, sample_uidx) << (ii_rem * 2);
-      if (++ii_rem == BITCT2) {
-        *mainbuf++ = cur_write;
-        cur_write = 0;
-        ii_rem = 0;
+      subset_mask++;
+    } while (!(~subset_mask[0]));
+    const uint32_t quaterarr_word_skip_ct = 2 * ((uintptr_t)(subset_mask - subset_mask_initial));
+    main_quaterarr = &(main_quaterarr[quaterarr_word_skip_ct]);
+    const uint32_t item_skip_ct = quaterarr_word_skip_ct * BITCT2;
+    orig_quaterarr_size -= item_skip_ct;
+    subset_size -= item_skip_ct;
+  }
+  uintptr_t cur_output_word = 0;
+  uintptr_t* main_quaterarr_writer = main_quaterarr;
+  uintptr_t* main_quaterarr_write_last = &(main_quaterarr[subset_size / BITCT2]);
+  const uint32_t word_write_halfshift_end = subset_size % BITCT2;
+  uint32_t word_write_halfshift = 0;
+  // if <= 2/3-filled, use sparse copy algorithm
+  if (subset_size * (3 * ONELU) <= orig_quaterarr_size * (2 * ONELU)) {
+    uint32_t subset_mask_widx = 0;
+    while (1) {
+      const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
+      if (cur_include_word) {
+	uint32_t wordhalf_idx = 0;
+#ifdef __LP64__
+	uint32_t cur_include_halfword = (uint32_t)cur_include_word;
+#else
+	uint32_t cur_include_halfword = (uint16_t)cur_include_word;
+#endif
+	while (1) {
+	  if (cur_include_halfword) {
+	    uintptr_t orig_quaterarr_word = main_quaterarr[subset_mask_widx * 2 + wordhalf_idx];
+	    do {
+	      uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
+	      cur_output_word |= ((orig_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
+	      if (++word_write_halfshift == BITCT2) {
+		*main_quaterarr_writer++ = cur_output_word;
+		word_write_halfshift = 0;
+		cur_output_word = 0;
+	      }
+	      cur_include_halfword &= cur_include_halfword - 1;
+	    } while (cur_include_halfword);
+	  }
+	  if (wordhalf_idx) {
+	    break;
+	  }
+	  wordhalf_idx++;
+#ifdef __LP64__
+	  cur_include_halfword = cur_include_word >> 32;
+#else
+	  cur_include_halfword = cur_include_word >> 16;
+#endif
+	}
+	if (main_quaterarr_writer == main_quaterarr_write_last) {
+	  if (word_write_halfshift == word_write_halfshift_end) {
+            if (word_write_halfshift_end) {
+	      *main_quaterarr_writer = cur_output_word;
+	    }
+	    return;
+	  }
+	}
       }
-    } while (++sample_uidx < sample_uidx_stop);
+      subset_mask_widx++;	
+    }
   }
-  if (ii_rem) {
-    *mainbuf = cur_write;
+  // blocked copy
+  while (1) {
+    const uintptr_t cur_include_word = *subset_mask++;
+    uint32_t wordhalf_idx = 0;
+#ifdef __LP64__
+    uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
+#else
+    uint32_t cur_include_halfword = (uint16_t)cur_include_word;
+#endif
+    while (1) {
+      uintptr_t orig_quaterarr_word = *main_quaterarr++;
+      while (cur_include_halfword) {
+	uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
+	uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
+	uintptr_t orig_quaterarr_curblock_unmasked = orig_quaterarr_word >> (rqa_idx_lowbits * 2);
+	uint32_t rqa_block_len = CTZLU(halfword_invshifted);
+	uint32_t block_len_limit = BITCT2 - word_write_halfshift;
+	cur_output_word |= orig_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
+	if (rqa_block_len < block_len_limit) {
+	  word_write_halfshift += rqa_block_len;
+	  cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
+	} else {
+	  // no need to mask, extra bits vanish off the high end
+
+	  *main_quaterarr_writer++ = cur_output_word;
+	  word_write_halfshift = rqa_block_len - block_len_limit;
+	  if (word_write_halfshift) {
+	    cur_output_word = (orig_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
+	  } else {
+	    cur_output_word = 0;
+	  }
+	}
+	cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
+      }
+      if (wordhalf_idx) {
+	break;
+      }
+      wordhalf_idx++;
+#ifdef __LP64__
+      cur_include_halfword = cur_include_word >> 32;
+#else
+      cur_include_halfword = cur_include_word >> 16;
+#endif
+    }
+    if (main_quaterarr_writer == main_quaterarr_write_last) {
+      if (word_write_halfshift == word_write_halfshift_end) {
+	if (word_write_halfshift_end) {
+	  *main_quaterarr_writer = cur_output_word;
+	}
+	return;
+      }
+    }
   }
 }
+*/
 
-uint32_t load_and_collapse_incl(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* mainbuf, uint32_t sample_ct, uintptr_t* sample_include, uintptr_t final_mask, uint32_t do_reverse) {
+uint32_t load_and_collapse_incl(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_include, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf) {
+  assert(unfiltered_sample_ct);
   uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   if (unfiltered_sample_ct == sample_ct) {
     rawbuf = mainbuf;
   }
-  if (load_raw(bedfile, rawbuf, unfiltered_sample_ct4)) {
+  if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
     return RET_READ_FAIL;
   }
   if (unfiltered_sample_ct != sample_ct) {
-    collapse_copy_2bitarr_incl(rawbuf, mainbuf, unfiltered_sample_ct, sample_ct, sample_include);
+    copy_quaterarr_nonempty_subset(rawbuf, sample_include, unfiltered_sample_ct, sample_ct, mainbuf);
   } else {
     mainbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
   }
   if (do_reverse) {
-    reverse_loadbuf((unsigned char*)mainbuf, sample_ct);
+    reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
+  }
+  return 0;
+}
+
+/*
+uint32_t load_and_collapse_incl_inplace(const uintptr_t* __restrict sample_include, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict mainbuf) {
+  // mainbuf must be large enough to store unfiltered data
+  uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
+  if (load_raw(unfiltered_sample_ct4, bedfile, mainbuf)) {
+    return RET_READ_FAIL;
+  }
+  if (unfiltered_sample_ct == sample_ct) {
+    mainbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
+  } else {
+    inplace_quaterarr_proper_subset(sample_include, unfiltered_sample_ct, sample_ct, mainbuf);
+  }
+  if (do_reverse) {
+    reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
   }
   return 0;
 }
+*/
 
-uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* casebuf, uintptr_t* ctrlbuf, uintptr_t* pheno_nm, uintptr_t* pheno_c) {
+uint32_t load_and_split(uint32_t unfiltered_sample_ct, const uintptr_t* __restrict pheno_nm, const uintptr_t* __restrict pheno_c, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict casebuf, uintptr_t* __restrict ctrlbuf) {
   // add do_reverse later if needed
   uintptr_t* rawbuf_end = &(rawbuf[unfiltered_sample_ct / BITCT2]);
   uintptr_t case_word = 0;
@@ -8009,7 +8419,7 @@ uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sa
   uint32_t read_shift;
   uintptr_t read_word;
   uintptr_t ulii;
-  if (load_raw(bedfile, rawbuf, unfiltered_sample_ct4)) {
+  if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
     return RET_READ_FAIL;
   }
   while (1) {
@@ -8053,16 +8463,16 @@ uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sa
   }
 }
 
-void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, uintptr_t* old_include) {
+void init_quaterarr_from_bitarr(const uintptr_t* __restrict bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr) {
   // allows unfiltered_sample_ct == 0
-  uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t ulii;
   uintptr_t uljj;
   uintptr_t ulkk;
   uintptr_t ulmm;
   uint32_t bit_idx;
   while (unfiltered_sample_ctl) {
-    ulii = ~(*old_include++);
+    ulii = ~(*bitarr++);
     ulkk = FIVEMASK;
     ulmm = FIVEMASK;
     if (ulii) {
@@ -8087,32 +8497,32 @@ void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, u
 	} while (uljj);
       }
     }
-    *new_include2++ = ulkk;
-    *new_include2++ = ulmm;
+    *new_quaterarr++ = ulkk;
+    *new_quaterarr++ = ulmm;
     --unfiltered_sample_ctl;
   }
   ulii = unfiltered_sample_ct & (BITCT - 1);
   if (ulii) {
-    new_include2--;
+    new_quaterarr--;
     if (ulii < BITCT2) {
-      *new_include2-- = 0;
+      *new_quaterarr-- = 0;
     } else {
       ulii -= BITCT2;
     }
-    *new_include2 &= (ONELU << (ulii * 2)) - ONELU;
+    *new_quaterarr &= (ONELU << (ulii * 2)) - ONELU;
   }
 }
 
-void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_vec, uintptr_t* exclude_arr) {
+void init_quaterarr_from_inverted_bitarr(const uintptr_t* __restrict inverted_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr) {
   // allows unfiltered_sample_ct == 0
-  uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t ulii;
   uintptr_t uljj;
   uintptr_t ulkk;
   uintptr_t ulmm;
   uint32_t bit_idx;
   while (unfiltered_sample_ctl) {
-    ulii = *exclude_arr++;
+    ulii = *inverted_bitarr++;
     ulkk = FIVEMASK;
     ulmm = FIVEMASK;
     if (ulii) {
@@ -8137,32 +8547,32 @@ void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_v
 	} while (uljj);
       }
     }
-    *include_vec++ = ulkk;
-    *include_vec++ = ulmm;
+    *new_quaterarr++ = ulkk;
+    *new_quaterarr++ = ulmm;
     --unfiltered_sample_ctl;
   }
   ulii = unfiltered_sample_ct & (BITCT - 1);
   if (ulii) {
-    include_vec--;
+    new_quaterarr--;
     if (ulii < BITCT2) {
-      *include_vec-- = 0;
+      *new_quaterarr-- = 0;
     } else {
       ulii -= BITCT2;
     }
-    *include_vec &= (ONELU << (ulii * 2)) - ONELU;
+    *new_quaterarr &= (ONELU << (ulii * 2)) - ONELU;
   }
 }
 
-void vec_init_invert(uintptr_t entry_ct, uintptr_t* target_arr, uintptr_t* source_arr) {
-  // Initializes a half-bitfield as the inverse of another.  Assumes target_arr
-  // and source_arr are doubleword-aligned.
-  uint32_t vec_wsize = 2 * ((entry_ct + (BITCT - 1)) / BITCT);
+void quatervec_01_init_invert(const uintptr_t* __restrict source_quatervec, uintptr_t entry_ct, uintptr_t* __restrict target_quatervec) {
+  // Initializes a quatervec as the inverse of another.
+  // Some modifications needed for AVX2.
+  uint32_t vec_wsize = QUATERCT_TO_ALIGNED_WORDCT(entry_ct);
   uint32_t rem = entry_ct & (BITCT - 1);
 #ifdef __LP64__
   const __m128i m1 = {FIVEMASK, FIVEMASK};
-  __m128i* tptr = (__m128i*)target_arr;
-  __m128i* sptr = (__m128i*)source_arr;
-  __m128i* tptr_end = (__m128i*)(&(target_arr[vec_wsize]));
+  __m128i* tptr = (__m128i*)target_quatervec;
+  __m128i* sptr = (__m128i*)source_quatervec;
+  __m128i* tptr_end = (__m128i*)(&(target_quatervec[vec_wsize]));
   uintptr_t* second_to_last;
   while (tptr < tptr_end) {
     *tptr++ = _mm_andnot_si128(*sptr++, m1);
@@ -8177,54 +8587,54 @@ void vec_init_invert(uintptr_t entry_ct, uintptr_t* target_arr, uintptr_t* sourc
     }
   }
 #else
-  uintptr_t* tptr_end = &(target_arr[vec_wsize]);
-  while (target_arr < tptr_end) {
-    *target_arr++ = FIVEMASK & (~(*source_arr++));
+  uintptr_t* tptr_end = &(target_quatervec[vec_wsize]);
+  while (target_quatervec < tptr_end) {
+    *target_quatervec++ = FIVEMASK & (~(*source_quatervec++));
   }
   if (rem) {
     if (rem > BITCT2) {
-      target_arr[-1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
+      target_quatervec[-1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
     } else {
-      target_arr[-2] &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
-      target_arr[-1] = 0;
+      target_quatervec[-2] &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
+      target_quatervec[-1] = 0;
     }
   }
 
 #endif
 }
 
-void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr) {
-  // assumes word_ct is positive
-  // target_arr := source_arr ANDNOT exclude_arr
+void bitvec_andnot_copy(const uintptr_t* __restrict source_vec, const uintptr_t* __restrict exclude_vec, uintptr_t word_ct, uintptr_t* __restrict target_vec) {
+  // target_vec := source_vec ANDNOT exclude_vec
   // may write an extra word
+  assert(word_ct);
 #ifdef __LP64__
-  __m128i* tptr = (__m128i*)target_arr;
-  __m128i* sptr = (__m128i*)source_arr;
-  __m128i* xptr = (__m128i*)exclude_arr;
-  __m128i* tptr_end = (__m128i*)(&(target_arr[word_ct]));
+  __m128i* tptr = (__m128i*)target_vec;
+  __m128i* sptr = (__m128i*)source_vec;
+  __m128i* xptr = (__m128i*)exclude_vec;
+  __m128i* tptr_end = (__m128i*)(&(target_vec[round_up_pow2(word_ct, VEC_WORDS)]));
   do {
     *tptr++ = _mm_andnot_si128(*xptr++, *sptr++);
   } while (tptr < tptr_end);
 #else
-  uintptr_t* tptr_end = &(target_arr[word_ct]);
+  uintptr_t* tptr_end = &(target_vec[word_ct]);
   do {
-    *target_arr++ = (*source_arr++) & (~(*exclude_arr++));
-  } while (target_arr < tptr_end);
+    *target_vec++ = (*source_vec++) & (~(*exclude_vec++));
+  } while (target_vec < tptr_end);
 #endif
 }
 
-void vec_include_mask_in(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr) {
+void apply_bitarr_mask_to_quaterarr_01(const uintptr_t* __restrict mask_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* main_quaterarr) {
   // allows unfiltered_sample_ct == 0
-  uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t ulii;
   uintptr_t uljj;
   uintptr_t ulkk;
   uintptr_t ulmm;
   uint32_t bit_idx;
   while (unfiltered_sample_ctl) {
-    ulii = ~(*mask_arr++);
-    ulkk = *include_arr;
-    ulmm = include_arr[1];
+    ulii = ~(*mask_bitarr++);
+    ulkk = *main_quaterarr;
+    ulmm = main_quaterarr[1];
     if (ulii) {
       uljj = ulii >> BITCT2;
 #ifdef __LP64__
@@ -8247,24 +8657,24 @@ void vec_include_mask_in(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr,
 	} while (uljj);
       }
     }
-    *include_arr++ = ulkk;
-    *include_arr++ = ulmm;
+    *main_quaterarr++ = ulkk;
+    *main_quaterarr++ = ulmm;
     --unfiltered_sample_ctl;
   }
 }
 
-void vec_include_mask_out(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr) {
-  // assumes unfiltered_sample_ct is positive
-  uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+void apply_bitarr_excl_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr) {
+  assert(unfiltered_sample_ct);
+  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t ulii;
   uintptr_t uljj;
   uintptr_t ulkk;
   uintptr_t ulmm;
   uint32_t bit_idx;
   do {
-    ulii = *mask_arr++;
-    ulkk = *include_arr;
-    ulmm = include_arr[1];
+    ulii = *excl_bitarr++;
+    ulkk = *main_quaterarr;
+    ulmm = main_quaterarr[1];
     if (ulii) {
       uljj = ulii >> BITCT2;
 #ifdef __LP64__
@@ -8287,23 +8697,23 @@ void vec_include_mask_out(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr
 	} while (uljj);
       }
     }
-    *include_arr++ = ulkk;
-    *include_arr++ = ulmm;
+    *main_quaterarr++ = ulkk;
+    *main_quaterarr++ = ulmm;
   } while (--unfiltered_sample_ctl);
 }
 
-void vec_include_mask_out_intersect(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr, uintptr_t* mask2_arr) {
-  // assumes unfiltered_sample_ct is positive
-  uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+void apply_excl_intersect_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr_1, const uintptr_t* __restrict excl_bitarr_2, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr) {
+  assert(unfiltered_sample_ct);
+  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t ulii;
   uintptr_t uljj;
   uintptr_t ulkk;
   uintptr_t ulmm;
   uint32_t bit_idx;
   do {
-    ulii = (*mask_arr++) & (*mask2_arr++);
-    ulkk = *include_arr;
-    ulmm = include_arr[1];
+    ulii = (*excl_bitarr_1++) & (*excl_bitarr_2++);
+    ulkk = *main_quaterarr;
+    ulmm = main_quaterarr[1];
     if (ulii) {
       uljj = ulii >> BITCT2;
 #ifdef __LP64__
@@ -8326,54 +8736,54 @@ void vec_include_mask_out_intersect(uintptr_t unfiltered_sample_ct, uintptr_t* i
 	} while (uljj);
       }
     }
-    *include_arr++ = ulkk;
-    *include_arr++ = ulmm;
+    *main_quaterarr++ = ulkk;
+    *main_quaterarr++ = ulmm;
   } while (--unfiltered_sample_ctl);
 }
 
-void vec_init_01(uintptr_t unfiltered_sample_ct, uintptr_t* data_ptr, uintptr_t* result_ptr) {
-  // assumes unfiltered_sample_ct is positive
-  // initializes result_ptr bits 01 iff data_ptr bits are 01
+void quatervec_copy_only_01(const uintptr_t* __restrict input_quatervec, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict output_quatervec) {
+  // initializes output_quatervec bits 01 iff input_quatervec bits are 01
+  assert(unfiltered_sample_ct);
 #ifdef __LP64__
   const __m128i m1 = {FIVEMASK, FIVEMASK};
-  __m128i* vec2_read = (__m128i*)data_ptr;
-  __m128i* read_end = &(vec2_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
-  __m128i* vec2_write = (__m128i*)result_ptr;
+  __m128i* vec2_read = (__m128i*)input_quatervec;
+  __m128i* read_end = &(vec2_read[QUATERCT_TO_VECCT(unfiltered_sample_ct)]);
+  __m128i* vec2_write = (__m128i*)output_quatervec;
   __m128i loader;
   do {
     loader = *vec2_read++;
     *vec2_write++ = _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader, 1), loader), m1);
   } while (vec2_read < read_end);
 #else
-  uintptr_t* read_end = &(data_ptr[2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT)]);
+  const uintptr_t* read_end = &(input_quatervec[QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct)]);
   uintptr_t loader;
   do {
-    loader = *data_ptr++;
-    *result_ptr++ = loader & (~(loader >> 1)) & FIVEMASK;
-  } while (data_ptr < read_end);
+    loader = *input_quatervec++;
+    *output_quatervec++ = loader & (~(loader >> 1)) & FIVEMASK;
+  } while (input_quatervec < read_end);
 #endif
 }
 
-void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2) {
-  uintptr_t* vec2_last = &(vec2[unfiltered_sample_ct / BITCT2]);
+void quatervec_01_invert(uintptr_t unfiltered_sample_ct, uintptr_t* main_quatervec) {
+  uintptr_t* vec2_last = &(main_quatervec[unfiltered_sample_ct / BITCT2]);
   uint32_t remainder = unfiltered_sample_ct & (BITCT2 - 1);
 #ifdef __LP64__
   const __m128i m1 = {FIVEMASK, FIVEMASK};
-  __m128i* vec2_128 = (__m128i*)vec2;
+  __m128i* vec2_128 = (__m128i*)main_quatervec;
   __m128i* vec2_last128 = &(vec2_128[unfiltered_sample_ct / BITCT]);
   while (vec2_128 < vec2_last128) {
     *vec2_128 = _mm_xor_si128(*vec2_128, m1);
     vec2_128++;
   }
-  vec2 = (uintptr_t*)vec2_128;
-  if (vec2 != vec2_last) {
-    *vec2 = (*vec2) ^ FIVEMASK;
-    vec2++;
+  main_quatervec = (uintptr_t*)vec2_128;
+  if (main_quatervec != vec2_last) {
+    *main_quatervec = (*main_quatervec) ^ FIVEMASK;
+    main_quatervec++;
   }
 #else
-  while (vec2 != vec2_last) {
-    *vec2 = (*vec2) ^ FIVEMASK;
-    vec2++;
+  while (main_quatervec != vec2_last) {
+    *main_quatervec = (*main_quatervec) ^ FIVEMASK;
+    main_quatervec++;
   }
 #endif
   if (remainder) {
@@ -8382,19 +8792,19 @@ void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2) {
 }
 
 void vec_datamask(uintptr_t unfiltered_sample_ct, uint32_t matchval, uintptr_t* data_ptr, uintptr_t* mask_ptr, uintptr_t* result_ptr) {
-  // assumes unfiltered_sample_ct is positive
   // vec_ptr assumed to be standard 00/01 bit vector
   // sets result_vec bits to 01 iff data_ptr bits are equal to matchval and
   // vec_ptr bit is set, 00 otherwise.
   // currently assumes matchval is not 1.
+  assert(unfiltered_sample_ct);
 #ifdef __LP64__
   __m128i* data_read = (__m128i*)data_ptr;
   __m128i* mask_read = (__m128i*)mask_ptr;
-  __m128i* data_read_end = &(data_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
+  __m128i* data_read_end = &(data_read[QUATERCT_TO_VECCT(unfiltered_sample_ct)]);
   __m128i* writer = (__m128i*)result_ptr;
   __m128i loader;
 #else
-  uintptr_t* data_read_end = &(data_ptr[2 * (unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
+  uintptr_t* data_read_end = &(data_ptr[QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct)]);
   uintptr_t loader;
 #endif
   if (matchval) {
@@ -8468,7 +8878,9 @@ void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
 }
 */
 
-void rotate_plink1_to_plink2_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct) {
+// this was "rotate_plink1_to_plink2_...", until I noticed that the plink2
+// format should store alt allele counts instead of ref allele counts.
+void rotate_plink1_to_a2ct_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct) {
   // assumes positive word_ct
   uintptr_t* loadbuf_end = &(loadbuf[word_ct]);
   uintptr_t ulii;
@@ -8481,8 +8893,8 @@ void rotate_plink1_to_plink2_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, u
   } while (loadbuf < loadbuf_end);
 }
 
-void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include2, uintptr_t sample_ct, uintptr_t* missing_bitfield) {
-  uint32_t word_ct = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include_quaterarr, uintptr_t sample_ct, uintptr_t* missing_bitfield) {
+  uint32_t word_ct = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t sample_idx;
   uintptr_t cur_word;
   uintptr_t cur_mask;
@@ -8495,7 +8907,7 @@ void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sa
     woffset = 0;
     for (widx = 0; widx < word_ct; widx++) {
       cur_word = *lptr++;
-      cur_word = cur_word & ((~cur_word) >> 1) & (*sample_include2++);
+      cur_word = cur_word & ((~cur_word) >> 1) & (*sample_include_quaterarr++);
       while (cur_word) {
         uii = CTZLU(cur_word) / 2;
         cur_write |= ONELU << (woffset + uii);
@@ -8513,10 +8925,10 @@ void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sa
       *missing_bitfield++ = cur_write;
     }
   } else {
-    fill_ulong_zero(missing_bitfield, (sample_ct + (BITCT - 1)) / BITCT);
+    fill_ulong_zero(missing_bitfield, BITCT_TO_WORDCT(sample_ct));
     sample_idx = 0;
     for (widx = 0; sample_idx < sample_ct; widx++, lptr++) {
-      cur_mask = *sample_include2++;
+      cur_mask = *sample_include_quaterarr++;
       if (cur_mask) {
         cur_word = *lptr;
         cur_word = cur_word & ((~cur_word) >> 1) & cur_mask;
@@ -8524,7 +8936,7 @@ void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sa
           if (cur_word) {
 	    uii = sample_idx;
             do {
-              set_bit(missing_bitfield, (CTZLU(cur_word) / 2) + uii);
+              set_bit((CTZLU(cur_word) / 2) + uii, missing_bitfield);
               cur_word &= cur_word - 1;
 	    } while (cur_word);
 	  }
@@ -8534,7 +8946,7 @@ void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sa
 	    do {
 	      uii = CTZLU(cur_mask);
 	      if ((cur_word >> uii) & 1) {
-                set_bit_ul(missing_bitfield, sample_idx);
+                set_bit_ul(sample_idx, missing_bitfield);
 	      }
 	      sample_idx++;
 	      cur_mask &= cur_mask - 1;
@@ -8548,7 +8960,7 @@ void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sa
   }
 }
 
-void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t unfiltered_sample_ct) {
+void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t unfiltered_sample_ct) {
   uintptr_t sample_bidx = 0;
   unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
   unsigned char* iicp;
@@ -8559,14 +8971,14 @@ void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t unfi
   uint32_t uii;
   uint32_t ujj;
 #ifdef __LP64__
-  uint32_t* sample_include2_alias32;
+  uint32_t* sample_include_quaterarr_alias32;
   __m128i* loadbuf_alias;
   __m128i* iivp;
   __m128i vii;
   __m128i vjj;
   if (!(((uintptr_t)loadbuf) & 15)) {
     loadbuf_alias = (__m128i*)loadbuf;
-    iivp = (__m128i*)sample_include2;
+    iivp = (__m128i*)sample_include_quaterarr;
     unfiltered_sample_ctd = unfiltered_sample_ct / 64;
     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
       vii = *loadbuf_alias;
@@ -8577,17 +8989,17 @@ void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t unfi
     iicp = (unsigned char*)iivp;
   } else if (!(((uintptr_t)loadbuf) & 3)) {
     loadbuf_alias32 = (uint32_t*)loadbuf;
-    sample_include2_alias32 = (uint32_t*)sample_include2;
+    sample_include_quaterarr_alias32 = (uint32_t*)sample_include_quaterarr;
     unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
       uii = *loadbuf_alias32;
-      ujj = ((uii >> 1) & (~uii)) & (*sample_include2_alias32++);
+      ujj = ((uii >> 1) & (~uii)) & (*sample_include_quaterarr_alias32++);
       *loadbuf_alias32++ = uii - ujj;
     }
     loadbuf = (unsigned char*)loadbuf_alias32;
-    iicp = (unsigned char*)sample_include2_alias32;
+    iicp = (unsigned char*)sample_include_quaterarr_alias32;
   } else {
-    iicp = (unsigned char*)sample_include2;
+    iicp = (unsigned char*)sample_include_quaterarr;
   }
 #else
   if (!(((uintptr_t)loadbuf) & 3)) {
@@ -8595,12 +9007,12 @@ void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t unfi
     unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
       uii = *loadbuf_alias32;
-      ujj = ((uii >> 1) & (~uii)) & (*sample_include2++);
+      ujj = ((uii >> 1) & (~uii)) & (*sample_include_quaterarr++);
       *loadbuf_alias32++ = uii - ujj;
     }
     loadbuf = (unsigned char*)loadbuf_alias32;
   }
-  iicp = (unsigned char*)sample_include2;
+  iicp = (unsigned char*)sample_include_quaterarr;
 #endif
   for (; loadbuf < loadbuf_end;) {
     ucc = *loadbuf;
@@ -8609,7 +9021,7 @@ void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t unfi
   }
 }
 
-void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t* sample_male_include2, uintptr_t unfiltered_sample_ct) {
+void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t* sample_male_include_quaterarr, uintptr_t unfiltered_sample_ct) {
   uintptr_t sample_bidx = 0;
   unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
   unsigned char* iicp;
@@ -8624,8 +9036,8 @@ void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t* s
   uint32_t ukk;
 #ifdef __LP64__
   const __m128i m1 = {FIVEMASK, FIVEMASK};
-  uint32_t* sample_include2_alias32;
-  uint32_t* sample_male_include2_alias32;
+  uint32_t* sample_include_quaterarr_alias32;
+  uint32_t* sample_male_include_quaterarr_alias32;
   __m128i* loadbuf_alias;
   __m128i* iivp;
   __m128i* imivp;
@@ -8634,12 +9046,12 @@ void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t* s
   __m128i vkk;
   if (!(((uintptr_t)loadbuf) & 15)) {
     loadbuf_alias = (__m128i*)loadbuf;
-    iivp = (__m128i*)sample_include2;
-    imivp = (__m128i*)sample_male_include2;
+    iivp = (__m128i*)sample_include_quaterarr;
+    imivp = (__m128i*)sample_male_include_quaterarr;
     unfiltered_sample_ctd = unfiltered_sample_ct / 64;
     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
-      // sample_include2 & ~sample_male_include2: force to 01
-      // sample_male_include2: convert 10 to 01, keep everything else
+      // sample_include_quaterarr & ~sample_male_include_quaterarr: force to 01
+      // sample_male_include_quaterarr: convert 10 to 01, keep everything else
       vii = *imivp++;
       vjj = *iivp++;
       vkk = _mm_and_si128(*loadbuf_alias, _mm_or_si128(vii, _mm_slli_epi64(vii, 1)));
@@ -8650,36 +9062,36 @@ void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t* s
     imicp = (unsigned char*)imivp;
   } else if (!(((uintptr_t)loadbuf) & 3)) {
     loadbuf_alias32 = (uint32_t*)loadbuf;
-    sample_include2_alias32 = (uint32_t*)sample_include2;
-    sample_male_include2_alias32 = (uint32_t*)sample_male_include2;
+    sample_include_quaterarr_alias32 = (uint32_t*)sample_include_quaterarr;
+    sample_male_include_quaterarr_alias32 = (uint32_t*)sample_male_include_quaterarr;
     unfiltered_sample_ctd = unfiltered_sample_ct / 16;
     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
-      uii = *sample_male_include2_alias32++;
-      ujj = *sample_include2_alias32++;
+      uii = *sample_male_include_quaterarr_alias32++;
+      ujj = *sample_include_quaterarr_alias32++;
       ukk = (*loadbuf_alias32) & (uii * 3);
       *loadbuf_alias32++ = ((~uii) & ujj) | (ukk - ((~ukk) & (ukk >> 1) & 0x55555555));
     }
     loadbuf = (unsigned char*)loadbuf_alias32;
-    iicp = (unsigned char*)sample_include2_alias32;
-    imicp = (unsigned char*)sample_male_include2_alias32;
+    iicp = (unsigned char*)sample_include_quaterarr_alias32;
+    imicp = (unsigned char*)sample_male_include_quaterarr_alias32;
   } else {
-    iicp = (unsigned char*)sample_include2;
-    imicp = (unsigned char*)sample_male_include2;
+    iicp = (unsigned char*)sample_include_quaterarr;
+    imicp = (unsigned char*)sample_male_include_quaterarr;
   }
 #else
   if (!(((uintptr_t)loadbuf) & 3)) {
     loadbuf_alias32 = (uint32_t*)loadbuf;
     unfiltered_sample_ctd = unfiltered_sample_ct / 16;
     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
-      uii = *sample_male_include2++;
-      ujj = *sample_include2++;
+      uii = *sample_male_include_quaterarr++;
+      ujj = *sample_include_quaterarr++;
       ukk = (*loadbuf_alias32) & (uii * 3);
       *loadbuf_alias32++ = ((~uii) & ujj) | (ukk - ((~ukk) & (ukk >> 1) & 0x55555555));
     }
     loadbuf = (unsigned char*)loadbuf_alias32;
   }
-  iicp = (unsigned char*)sample_include2;
-  imicp = (unsigned char*)sample_male_include2;
+  iicp = (unsigned char*)sample_include_quaterarr;
+  imicp = (unsigned char*)sample_male_include_quaterarr;
 #endif
   for (; loadbuf < loadbuf_end;) {
     ucc = *imicp++;
@@ -8689,34 +9101,34 @@ void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t* s
   }
 }
 
-uint32_t alloc_raw_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_raw_include2_ptr, uintptr_t** sample_raw_male_include2_ptr) {
-  uintptr_t unfiltered_sample_ctv2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
-  uintptr_t* sample_raw_male_include2;
+uint32_t alloc_raw_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_raw_include_quatervec_ptr, uintptr_t** sample_raw_male_include_quatervec_ptr) {
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
+  uintptr_t* sample_raw_male_include_quatervec;
   if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
-    if (wkspace_alloc_ul_checked(sample_raw_include2_ptr, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctv2, sample_raw_include_quatervec_ptr)) {
       return 1;
     }
     if (is_include) {
-      vec_include_init(unfiltered_sample_ct, *sample_raw_include2_ptr, sample_bitarr);
+      init_quaterarr_from_bitarr(sample_bitarr, unfiltered_sample_ct, *sample_raw_include_quatervec_ptr);
     } else {
-      exclude_to_vec_include(unfiltered_sample_ct, *sample_raw_include2_ptr, sample_bitarr);
+      init_quaterarr_from_inverted_bitarr(sample_bitarr, unfiltered_sample_ct, *sample_raw_include_quatervec_ptr);
     }
   }
   if (hh_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
-    if (wkspace_alloc_ul_checked(sample_raw_male_include2_ptr, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctv2, sample_raw_male_include_quatervec_ptr)) {
       return 1;
     }
-    sample_raw_male_include2 = *sample_raw_male_include2_ptr;
+    sample_raw_male_include_quatervec = *sample_raw_male_include_quatervec_ptr;
     if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
-      memcpy(sample_raw_male_include2, *sample_raw_include2_ptr, unfiltered_sample_ctv2 * sizeof(intptr_t));
+      memcpy(sample_raw_male_include_quatervec, *sample_raw_include_quatervec_ptr, unfiltered_sample_ctv2 * sizeof(intptr_t));
     } else {
       if (is_include) {
-	vec_include_init(unfiltered_sample_ct, sample_raw_male_include2, sample_bitarr);
+	init_quaterarr_from_bitarr(sample_bitarr, unfiltered_sample_ct, sample_raw_male_include_quatervec);
       } else {
-	exclude_to_vec_include(unfiltered_sample_ct, sample_raw_male_include2, sample_bitarr);
+	init_quaterarr_from_inverted_bitarr(sample_bitarr, unfiltered_sample_ct, sample_raw_male_include_quatervec);
       }
     }
-    vec_include_mask_in(unfiltered_sample_ct, sample_raw_male_include2, sex_male);
+    apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, sample_raw_male_include_quatervec);
   }
   return 0;
 }
@@ -8836,24 +9248,24 @@ void force_missing(unsigned char* loadbuf, uintptr_t* force_missing_include2, ui
 }
 
 int32_t open_and_size_string_list(char* fname, FILE** infile_ptr, uintptr_t* list_len_ptr, uintptr_t* max_str_len_ptr) {
-  // assumes file is not open yet, and tbuf is safe to clobber
+  // assumes file is not open yet, and g_textbuf is safe to clobber
   uint32_t max_len = 0;
   uintptr_t line_idx = 0;
   uintptr_t list_len = 0;
   int32_t retval = 0;
   char* bufptr;
   uint32_t cur_len;
-  if (fopen_checked(infile_ptr, fname, "r")) {
+  if (fopen_checked(fname, "r", infile_ptr)) {
     goto open_and_size_string_list_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, *infile_ptr)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, *infile_ptr)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
       goto open_and_size_string_list_ret_INVALID_FORMAT;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -8884,14 +9296,14 @@ int32_t open_and_size_string_list(char* fname, FILE** infile_ptr, uintptr_t* lis
 }
 
 int32_t load_string_list(FILE** infile_ptr, uintptr_t max_str_len, char* str_list) {
-  // assumes file is open (probably by open_and_size_string_list), and tbuf is
-  // safe to clobber
+  // assumes file is open (probably by open_and_size_string_list), and
+  // g_textbuf is safe to clobber
   int32_t retval = 0;
   char* bufptr;
   uint32_t cur_len;
   rewind(*infile_ptr);
-  while (fgets(tbuf, MAXLINELEN, *infile_ptr)) {
-    bufptr = skip_initial_spaces(tbuf);
+  while (fgets(g_textbuf, MAXLINELEN, *infile_ptr)) {
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -8914,7 +9326,7 @@ int32_t load_string_list(FILE** infile_ptr, uintptr_t max_str_len, char* str_lis
 int32_t open_and_skip_first_lines(FILE** infile_ptr, char* fname, char* loadbuf, uintptr_t loadbuf_size, uint32_t lines_to_skip) {
   uint32_t line_idx;
   loadbuf[loadbuf_size - 1] = ' ';
-  if (fopen_checked(infile_ptr, fname, "r")) {
+  if (fopen_checked(fname, "r", infile_ptr)) {
     return RET_OPEN_FAIL;
   }
   for (line_idx = 1; line_idx <= lines_to_skip; line_idx++) {
@@ -8976,7 +9388,7 @@ int32_t load_to_first_token(FILE* infile, uintptr_t loadbuf_size, char comment_c
 
 int32_t open_and_load_to_first_token(FILE** infile_ptr, char* fname, uintptr_t loadbuf_size, char comment_char, const char* file_descrip, char* loadbuf, char** bufptr_ptr, uintptr_t* line_idx_ptr) {
   loadbuf[loadbuf_size - 1] = ' ';
-  if (fopen_checked(infile_ptr, fname, "r")) {
+  if (fopen_checked(fname, "r", infile_ptr)) {
     return RET_OPEN_FAIL;
   }
   return load_to_first_token(*infile_ptr, loadbuf_size, comment_char, file_descrip, loadbuf, bufptr_ptr, line_idx_ptr);
@@ -8987,10 +9399,10 @@ int32_t scan_max_strlen(char* fname, uint32_t colnum, uint32_t colnum2, uint32_t
   // is scanned.
   // Includes terminating null in lengths.
   FILE* infile = NULL;
-  uintptr_t loadbuf_size = wkspace_left;
+  uintptr_t loadbuf_size = bigstack_left();
   uintptr_t max_str_len = *max_str_len_ptr;
   uintptr_t max_str2_len = 0;
-  char* loadbuf = (char*)wkspace_base;
+  char* loadbuf = (char*)g_bigstack_base;
   uint32_t colmin;
   uint32_t coldiff;
   char* str1_ptr;
@@ -9090,10 +9502,10 @@ int32_t scan_max_fam_indiv_strlen(char* fname, uint32_t colnum, uintptr_t* max_s
   // assumed to follow.
   // Includes terminating null in lengths.
   FILE* infile = NULL;
-  uintptr_t loadbuf_size = wkspace_left;
+  uintptr_t loadbuf_size = bigstack_left();
   uintptr_t max_sample_id_len = *max_sample_id_len_ptr;
   uintptr_t line_idx = 0;
-  char* loadbuf = (char*)wkspace_base;
+  char* loadbuf = (char*)g_bigstack_base;
   char* bufptr;
   char* bufptr2;
   uintptr_t cur_sample_id_len;
@@ -9190,7 +9602,7 @@ char* alloc_and_init_collapsed_arr(char* item_arr, uintptr_t item_len, uintptr_t
   if (read_only && (unfiltered_ct == filtered_ct)) {
     return item_arr;
   }
-  if (wkspace_alloc_c_checked(&new_arr, filtered_ct * item_len)) {
+  if (bigstack_alloc_c(filtered_ct * item_len, &new_arr)) {
     return NULL;
   }
   wptr = new_arr;
@@ -9216,7 +9628,7 @@ char* alloc_and_init_collapsed_arr_incl(char* item_arr, uintptr_t item_len, uint
   if (read_only && (unfiltered_ct == filtered_ct)) {
     return item_arr;
   }
-  if (wkspace_alloc_c_checked(&new_arr, filtered_ct * item_len)) {
+  if (bigstack_alloc_c(filtered_ct * item_len, &new_arr)) {
     return NULL;
   }
   wptr = new_arr;
@@ -9311,68 +9723,73 @@ void inplace_delta_collapse_bitfield(uintptr_t* read_ptr, uint32_t filtered_ct_n
   }
 }
 
-void collapse_copy_bitarr(uint32_t orig_ct, uintptr_t* bit_arr, uintptr_t* exclude_arr, uint32_t filtered_ct, uintptr_t* output_arr) {
+void copy_bitarr_subset_excl(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_excl, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr) {
   uintptr_t cur_write = 0;
   uint32_t item_uidx = 0;
   uint32_t write_bit = 0;
   uint32_t item_idx = 0;
   uint32_t item_uidx_stop;
-  if (!exclude_arr[0]) {
-    item_uidx = next_set(exclude_arr, 0, orig_ct & (~(BITCT - 1))) & (~(BITCT - 1));
-    memcpy(output_arr, bit_arr, item_uidx / 8);
+  if (!subset_excl[0]) {
+    item_uidx = next_set(subset_excl, 0, raw_bitarr_size & (~(BITCT - 1))) & (~(BITCT - 1));
+    memcpy(output_bitarr, raw_bitarr, item_uidx / 8);
     item_idx = item_uidx;
-    output_arr = &(output_arr[item_uidx / BITCT]);
+    output_bitarr = &(output_bitarr[item_uidx / BITCT]);
   }
-  while (item_idx < filtered_ct) {
-    item_uidx = next_unset_unsafe(exclude_arr, item_uidx);
-    item_uidx_stop = next_set(exclude_arr, item_uidx, orig_ct);
+  while (item_idx < subset_size) {
+    item_uidx = next_unset_unsafe(subset_excl, item_uidx);
+    item_uidx_stop = next_set(subset_excl, item_uidx, raw_bitarr_size);
     item_idx += item_uidx_stop - item_uidx;
     do {
-      cur_write |= ((bit_arr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << write_bit;
+      cur_write |= ((raw_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << write_bit;
       if (++write_bit == BITCT) {
-	*output_arr++ = cur_write;
+	*output_bitarr++ = cur_write;
         cur_write = 0;
 	write_bit = 0;
       }
     } while (++item_uidx < item_uidx_stop);
   }
   if (write_bit) {
-    *output_arr = cur_write;
+    *output_bitarr = cur_write;
   }
 }
 
-void collapse_copy_bitarr_incl(uint32_t orig_ct, uintptr_t* bit_arr, uintptr_t* include_arr, uint32_t filtered_ct, uintptr_t* output_arr) {
-  uintptr_t cur_write = 0;
+void copy_bitarr_subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr) {
+  // full-blown blocked copy not worth it due to undefined CTZLU(0), >> 64,
+  // << 64
+  uintptr_t cur_output_word = 0;
   uint32_t item_uidx = 0;
-  uint32_t write_bit = 0;
+  uint32_t word_write_shift = 0;
   uint32_t item_idx = 0;
   uint32_t item_uidx_stop;
-  if (!(~include_arr[0])) {
-    item_uidx = next_unset(include_arr, 0, orig_ct & (~(BITCT - 1))) & (~(BITCT - 1));
-    memcpy(output_arr, bit_arr, item_uidx / 8);
+  if (!(~subset_mask[0])) {
+    item_uidx = next_unset(subset_mask, 0, raw_bitarr_size & (~(BITCT - 1))) & (~(BITCT - 1));
+    memcpy(output_bitarr, raw_bitarr, item_uidx / 8);
     item_idx = item_uidx;
-    output_arr = &(output_arr[item_uidx / BITCT]);
-  }
-  while (item_idx < filtered_ct) {
-    item_uidx = next_set_unsafe(include_arr, item_uidx);
-    item_uidx_stop = next_unset(include_arr, item_uidx, orig_ct);
+    output_bitarr = &(output_bitarr[item_uidx / BITCT]);
+  }
+  while (item_idx < subset_size) {
+    item_uidx = next_set_unsafe(subset_mask, item_uidx);
+    
+    // can speed this up a bit once we have a guaranteed unset bit at the end
+    item_uidx_stop = next_unset(subset_mask, item_uidx, raw_bitarr_size);
+    
     item_idx += item_uidx_stop - item_uidx;
     do {
-      cur_write |= ((bit_arr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << write_bit;
-      if (++write_bit == BITCT) {
-	*output_arr++ = cur_write;
-        cur_write = 0;
-	write_bit = 0;
+      cur_output_word |= ((raw_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << word_write_shift;
+      if (++word_write_shift == BITCT) {
+	*output_bitarr++ = cur_output_word;
+        cur_output_word = 0;
+	word_write_shift = 0;
       }
     } while (++item_uidx < item_uidx_stop);
   }
-  if (write_bit) {
-    *output_arr = cur_write;
+  if (word_write_shift) {
+    *output_bitarr = cur_output_word;
   }
 }
 
 void uncollapse_copy_flip_include_arr(uintptr_t* collapsed_include_arr, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* output_exclude_arr) {
-  uintptr_t unfiltered_ctl = (unfiltered_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_ctl = BITCT_TO_WORDCT(unfiltered_ct);
   uintptr_t* output_exclude_true_end = &(output_exclude_arr[unfiltered_ctl]);
   uintptr_t* output_exclude_end = &(output_exclude_arr[unfiltered_ct / BITCT]);
   uintptr_t cea_read = 0;
@@ -9428,7 +9845,7 @@ void uncollapse_copy_flip_include_arr(uintptr_t* collapsed_include_arr, uintptr_
 }
 
 void copy_when_nonmissing(uintptr_t* loadbuf, char* source, uintptr_t elem_size, uintptr_t unfiltered_sample_ct, uintptr_t missing_ct, char* dest) {
-  uintptr_t* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2]);
+  uintptr_t* loadbuf_end = &(loadbuf[QUATERCT_TO_WORDCT(unfiltered_sample_ct)]);
   uintptr_t last_missing_p1 = 0;
   uintptr_t sample_idx_offset = 0;
   uintptr_t cur_word;
@@ -9549,7 +9966,7 @@ void init_sfmt64_from_sfmt32(sfmt_t* sfmt32, sfmt_t* sfmt64) {
 }
 
 void generate_perm1_interleaved(uint32_t tot_ct, uint32_t set_ct, uintptr_t perm_idx, uintptr_t perm_ct, uintptr_t* perm_buf) {
-  uintptr_t tot_ctl = (tot_ct + (BITCT - 1)) / BITCT;
+  uintptr_t tot_ctl = BITCT_TO_WORDCT(tot_ct);
   uintptr_t tot_rem = tot_ct & (BITCT - 1);
   uint32_t tot_quotient = (uint32_t)(0x100000000LLU / tot_ct);
   uint32_t upper_bound = tot_ct * tot_quotient - 1;
@@ -9574,7 +9991,7 @@ void generate_perm1_interleaved(uint32_t tot_ct, uint32_t set_ct, uintptr_t perm
       for (num_set = 0; num_set < set_ct; num_set++) {
 	do {
 	  do {
-	    urand = sfmt_genrand_uint32(&sfmt);
+	    urand = sfmt_genrand_uint32(&g_sfmt);
 	  } while (urand > upper_bound);
 	  // this is identical to ulii = urand / tot_quotient
 	  ulii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
@@ -9595,7 +10012,7 @@ void generate_perm1_interleaved(uint32_t tot_ct, uint32_t set_ct, uintptr_t perm
       for (num_set = 0; num_set < set_ct; num_set++) {
 	do {
 	  do {
-	    urand = sfmt_genrand_uint32(&sfmt);
+	    urand = sfmt_genrand_uint32(&g_sfmt);
 	  } while (urand > upper_bound);
 	  ulii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
 	  uljj = ulii / BITCT;
@@ -9934,23 +10351,23 @@ int32_t spawn_threads2(pthread_t* threads, void* (*start_routine)(void*), uintpt
 
 sfmt_t** g_sfmtp_arr;
 
-uint32_t wkspace_init_sfmtp(uint32_t thread_ct) {
+uint32_t bigstack_init_sfmtp(uint32_t thread_ct) {
   uint32_t uibuf[4];
   uint32_t tidx;
   uint32_t uii;
-  g_sfmtp_arr = (sfmt_t**)wkspace_alloc(thread_ct * sizeof(intptr_t));
+  g_sfmtp_arr = (sfmt_t**)bigstack_alloc(thread_ct * sizeof(intptr_t));
   if (!g_sfmtp_arr) {
     return 1;
   }
-  g_sfmtp_arr[0] = &sfmt;
+  g_sfmtp_arr[0] = &g_sfmt;
   if (thread_ct > 1) {
     for (tidx = 1; tidx < thread_ct; tidx++) {
-      g_sfmtp_arr[tidx] = (sfmt_t*)wkspace_alloc(sizeof(sfmt_t));
+      g_sfmtp_arr[tidx] = (sfmt_t*)bigstack_alloc(sizeof(sfmt_t));
       if (!g_sfmtp_arr[tidx]) {
 	return 1;
       }
       for (uii = 0; uii < 4; uii++) {
-	uibuf[uii] = sfmt_genrand_uint32(&sfmt);
+	uibuf[uii] = sfmt_genrand_uint32(&g_sfmt);
       }
       sfmt_init_by_array(g_sfmtp_arr[tidx], uibuf, 4);
     }
diff --git a/plink_common.h b/plink_common.h
index c9794f4..84eb929 100644
--- a/plink_common.h
+++ b/plink_common.h
@@ -10,6 +10,9 @@
 #include <stdint.h>
 #include <inttypes.h>
 
+#define NDEBUG
+#include <assert.h>
+
 // Uncomment this to build this without CBLAS/CLAPACK.
 // #define NOLAPACK
 
@@ -40,6 +43,8 @@
   #define THREAD_RET_TYPE unsigned __stdcall
   #define THREAD_RETURN return 0
   #define EOLN_STR "\r\n"
+  #define FOPEN_RB "rb"
+  #define FOPEN_WB "wb"
 #else
   #include <pthread.h>
   #define THREAD_RET_TYPE void*
@@ -50,6 +55,8 @@
     #endif
   #endif
   #define EOLN_STR "\n"
+  #define FOPEN_RB "r"
+  #define FOPEN_WB "w"
 #endif
 
 #ifdef __APPLE__
@@ -58,8 +65,13 @@
   typedef unsigned long long uint64_t;
   typedef long long int64_t;
 #else
-  #define uint64_t unsigned long long
-  #define int64_t long long
+  #if __GNUC__ < 6
+    // not sure how the f*** this inconsistency between GCC 6 and earlier
+    // versions is supposed to be handled, hopefully there will be official
+    // guidance...
+    #define uint64_t unsigned long long
+    #define int64_t long long
+  #endif
 #endif
 
 #ifdef _WIN64
@@ -81,6 +93,9 @@
 
 #ifdef __cplusplus
   #include <algorithm>
+  #define HEADER_INLINE inline
+#else
+  #define HEADER_INLINE static inline
 #endif
 
 // It would be useful to disable compilation on big-endian platforms, but I
@@ -96,17 +111,20 @@
     #error "64-bit builds currently require SSE2.  Try producing a 32-bit build instead."
   #endif
   #include <emmintrin.h>
-  #define FIVEMASK 0x5555555555555555LLU
-  typedef union {
-    __m128 vf;
-    __m128i vi;
-    __m128d vd;
-    uintptr_t u8[2];
-    double d8[2];
-    float f4[4];
-    uint32_t u4[4];
-  } __uni16;
+
+  #define VECFTYPE __m128
+  #define VECITYPE __m128i
+  #define VECDTYPE __m128d
+
+  // useful because of its bitwise complement: ~ZEROLU is a word with all 1
+  // bits, while ~0 is always 32 1 bits.
   #define ZEROLU 0LLU
+
+  // mainly useful for bitshifts: (ONELU << 32) works in 64-bit builds, while
+  // (1 << 32) is undefined.  also used to cast some numbers/expressions to
+  // uintptr_t (e.g. multiplying an int constant by ONELU widens it to 64 bits
+  // only in 64-bit builds; note that 1LU fails on Win64 while 1LLU doesn't do
+  // the right thing for 32-bit builds).
   #define ONELU 1LLU
 
   #ifdef _WIN32 // i.e. Win64
@@ -131,9 +149,10 @@
 
   #endif // Win64
 
+  #define VEC_BYTES 16
+
 #else // not __LP64__
 
-  #define FIVEMASK 0x55555555
   #define ZEROLU 0LU
   #define ONELU 1LU
   #ifndef PRIuPTR
@@ -144,13 +163,26 @@
   #endif
   #define PRIxPTR2 "08lx"
 
+  // todo: update code so this still works when reduced to 4
+  #define VEC_BYTES 8
+
 #endif // __LP64__
 
+// use constexpr for these as soon as compiler support is available on all
+// platforms
+#define FIVEMASK ((~ZEROLU) / 3)
+#define AAAAMASK (FIVEMASK * 2)
+
+#define VEC_BYTES_M1 (VEC_BYTES - 1)
+#define VEC_BITS (VEC_BYTES * 8)
+#define VEC_BITS_M1 (VEC_BITS - 1)
+
 #include "zlib-1.2.8/zlib.h"
 #include "SFMT.h"
 
-// 64MB of non-workspace memory guaranteed for now
-#define NON_WKSPACE_MIN 67108864
+// 64MB of non-workspace memory guaranteed for now.
+// Currently also serves as the maximum allele length.
+#define NON_BIGSTACK_MIN 67108864
 
 #define PI 3.1415926535897932
 #define RECIP_2_32 0.00000000023283064365386962890625
@@ -172,9 +204,15 @@
 // allow tail sum to be up to 2^30.)
 #define EXACT_TEST_BIAS 0.00000000000000000000000010339757656912845935892608650874535669572651386260986328125
 
-// sometimes an infinity substitute that avoids the 32-bit Windows performance
-// penalty
-#define HUGE_DOUBLE 1.7976931348623157e308
+// occasionally used as an infinity substitute that avoids the 32-bit Windows
+// performance penalty
+// can import from float.h, we don't bother to include that for now
+#ifndef DBL_MAX
+  #define DBL_MAX 1.7976931348623157e308
+#endif
+
+// not quite the same as FLT_MAX since it's a double-precision constant
+#define FLT_MAXD 3.4028234663852886e38
 
 #define RET_SUCCESS 0
 #define RET_NOMEM 1
@@ -579,37 +617,78 @@
 #define ITERS_DEFAULT 100000
 #define MAX_PCS_DEFAULT 20
 
-#define WKSPACE_MIN_MB 64
-#define WKSPACE_DEFAULT_MB 2048
+#define BIGSTACK_MIN_MB 64
+#define BIGSTACK_DEFAULT_MB 2048
 
 #ifdef __LP64__
   #define BITCT 64
+
+  // unions generally shouldn't be used for reinterpret_cast's job (memcpy is
+  // the right C-compatible way), but vectors are an exception to this rule.
+  typedef union {
+    VECFTYPE vf;
+    VECITYPE vi;
+    VECDTYPE vd;
+    uintptr_t u8[VEC_BITS / BITCT];
+    double d8[VEC_BYTES / sizeof(double)];
+    float f4[VEC_BYTES / sizeof(float)];
+    uint32_t u4[VEC_BYTES / sizeof(int32_t)];
+  } __univec;
 #else
   #define BITCT 32
 #endif
 
 #define BITCT2 (BITCT / 2)
 #define BYTECT (BITCT / 8)
+#define BYTECT4 (BITCT / 32)
+#define VEC_WORDS (VEC_BITS / BITCT)
 
-#define CACHELINE 64 // assumed number of bytes per cache line, for alignment
+// assumed number of bytes per cache line, for alignment
+#define CACHELINE 64
+
+#define CACHELINE_BIT (CACHELINE * 8)
 #define CACHELINE_INT32 (CACHELINE / 4)
 #define CACHELINE_INT64 (CACHELINE / 8)
 #define CACHELINE_WORD (CACHELINE / BYTECT)
 #define CACHELINE_DBL (CACHELINE / 8)
 
-#define CACHEALIGN(val) (((val) + (CACHELINE - 1)) & (~(CACHELINE - ONELU)))
-#define CACHEALIGN_INT32(val) (((val) + (CACHELINE_INT32 - 1)) & (~(CACHELINE_INT32 - ONELU)))
-#define CACHEALIGN_WORD(val) (((val) + (CACHELINE_WORD - 1)) & (~(CACHELINE_WORD - ONELU)))
-#define CACHEALIGN_DBL(val) (((val) + (CACHELINE_DBL - 1)) & (~(CACHELINE_DBL - ONELU)))
+// Rounds val up to the nearest multiple of alignment, which must be a power of 2.
+HEADER_INLINE uintptr_t round_up_pow2(uintptr_t val, uintptr_t alignment) {
+  uintptr_t alignment_m1 = alignment - 1;
+  assert(!(alignment & alignment_m1));
+  return (val + alignment_m1) & (~alignment_m1);
+}
+
+#define BITCT_TO_VECCT(val) (((val) + (VEC_BITS - 1)) / VEC_BITS)
+#define BITCT_TO_WORDCT(val) (((val) + (BITCT - 1)) / BITCT)
+#define BITCT_TO_ALIGNED_WORDCT(val) (VEC_WORDS * BITCT_TO_VECCT(val))
+
+#define QUATERCT_TO_VECCT(val) (((val) + ((VEC_BITS / 2) - 1)) / (VEC_BITS / 2))
+#define QUATERCT_TO_WORDCT(val) (((val) + (BITCT2 - 1)) / BITCT2)
+#define QUATERCT_TO_ALIGNED_WORDCT(val) (VEC_WORDS * QUATERCT_TO_VECCT(val))
+
+// todo: get rid of (BITCT_TO_WORDCT(x) == QUATERCT_TO_VECCT(x)) and similar
+// assumptions, in preparation for AVX2
+
+#ifdef __LP64__
+#define round_up_pow2_ull round_up_pow2
+#else
+HEADER_INLINE uint64_t round_up_pow2_ull(uint64_t val, uint64_t alignment) {
+  uint64_t alignment_m1 = alignment - 1;
+  assert(!(alignment & alignment_m1));
+  return (val + alignment_m1) & (~alignment_m1);
+}
+#endif
 
 // 32-bit instead of word-length bitwise not here, when val can be assumed to
 // be 32-bit.
 // (note that the sizeof operator "returns" an uintptr_t, not a uint32_t; hence
 // the lack of sizeof in the CACHELINE_INT32, etc. definitions.)
-#define CACHEALIGN32(val) (((val) + (CACHELINE - 1)) & (~(CACHELINE - 1)))
-#define CACHEALIGN32_INT32(val) (((val) + (CACHELINE_INT32 - 1)) & (~(CACHELINE_INT32 - 1)))
-#define CACHEALIGN32_WORD(val) (((val) + (CACHELINE_WORD - 1)) & (~(CACHELINE_WORD - 1)))
-#define CACHEALIGN32_DBL(val) (((val) + (CACHELINE_DBL - 1)) & (~(CACHELINE_DBL - 1)))
+HEADER_INLINE uint32_t round_up_pow2_ui(uint32_t val, uint32_t alignment) {
+  uint32_t alignment_m1 = alignment - 1;
+  assert(!(alignment & alignment_m1));
+  return (val + alignment_m1) & (~alignment_m1);
+}
 
 #define MAXV(aa, bb) (((bb) > (aa))? (bb) : (aa))
 #define MINV(aa, bb) (((aa) > (bb))? (bb) : (aa))
@@ -634,7 +713,7 @@
 #define MAXLINELEN 131072
 
 // must be at least 2 * MAXLINELEN + 2 to support generic token loader.
-#define TBUF_SIZE (2 * MAXLINELEN + 256)
+#define TEXTBUF_SIZE (2 * MAXLINELEN + 256)
 
 // Maximum length of chromosome, variant, FID, IID, cluster, and set IDs (not
 // including terminating null, that's what _P1 is for).  This value supports up
@@ -644,14 +723,15 @@
 #define MAX_ID_LEN_P1 (MAX_ID_LEN + 1)
 #define MAX_ID_LEN_STR "16000"
 
-// maximum size of "dynamically" allocated line load buffer.  (this is the
-// limit that applies to .vcf and similar files)
+// Maximum size of "dynamically" allocated line load buffer.  (This is the
+// limit that applies to .vcf and similar files.)  Inconvenient to go higher
+// since fgets() takes an int size argument.
 #define MAXLINEBUFLEN 0x7fffffc0
 
 // Default --perm-batch-size value in most contexts.  It may actually be better
 // to *avoid* a power of two due to the need for transpositions involving this
-// stride; see e.g. http://danluu.com/3c-conflict/ .  This should be tested
-// during PLINK 2.0 development.
+// stride; see e.g. http://danluu.com/3c-conflict/ ; try 448 instead?  This
+// should be tested during PLINK 2.0 development.
 #define DEFAULT_PERM_BATCH_SIZE 512
 
 // note that this is NOT foolproof: see e.g.
@@ -668,7 +748,6 @@
 #define JACKKNIFE_VALS_GROUPDIST 3
 
 #ifdef __LP64__
-  #define AAAAMASK 0xaaaaaaaaaaaaaaaaLLU
   // number of snp-major .bed lines to read at once for distance calc if
   // exponent is nonzero.
   #define MULTIPLEX_DIST_EXP 64
@@ -677,7 +756,6 @@
 #else
   // N.B. 32-bit version not as carefully tested or optimized, but I'll try to
   // make sure it works properly
-  #define AAAAMASK 0xaaaaaaaaU
   #define MULTIPLEX_DIST_EXP 28
   #define MULTIPLEX_REL 30
 #endif
@@ -701,10 +779,8 @@
 
 #ifdef __LP64__
 #define HASHMEM 4194304
-#define HASHMEM_S 4194304
 #else
 #define HASHMEM 2097152
-#define HASHMEM_S 2097152
 #endif
 
 typedef struct {
@@ -716,54 +792,59 @@ typedef struct {
   double interval_slope;
 } Aperm_info;
 
-// fit 4 pathologically long IDs plus a bit extra
-extern char tbuf[];
+// Generic text I/O buffer: any function which reads from/writes to a text file
+// or the console may clobber it.  Sized to fit two MAXLINELEN-length lines
+// plus a bit extra.
+extern char g_textbuf[];
 
 extern const char g_one_char_strs[];
 extern const char* g_missing_geno_ptr;
 extern const char* g_output_missing_geno_ptr;
 
-static inline const char* cond_replace(const char* ss, const char* match_str, const char* replace_str) {
-  if (ss != match_str) {
-    return ss;
-  } else {
-    return replace_str;
-  }
+HEADER_INLINE const char* cond_replace(const char* ss, const char* match_str, const char* replace_str) {
+  return (ss != match_str)? ss : replace_str;
 }
 
-uint32_t aligned_malloc(uintptr_t** aligned_pp, uintptr_t size);
+uint32_t aligned_malloc(uintptr_t size, uintptr_t** aligned_pp);
 
 void aligned_free(uintptr_t* aligned_pp);
 
-static inline void aligned_free_cond(uintptr_t* aligned_ptr) {
+HEADER_INLINE void aligned_free_cond(uintptr_t* aligned_ptr) {
   if (aligned_ptr) {
     aligned_free(aligned_ptr);
   }
 }
 
-static inline void aligned_free_null(uintptr_t** aligned_pp) {
+HEADER_INLINE void aligned_free_null(uintptr_t** aligned_pp) {
   aligned_free(*aligned_pp);
   *aligned_pp = NULL;
 }
 
-static inline void aligned_free_cond_null(uintptr_t** aligned_pp) {
+HEADER_INLINE void aligned_free_cond_null(uintptr_t** aligned_pp) {
   if (*aligned_pp) {
     aligned_free(*aligned_pp);
     *aligned_pp = NULL;
   }
 }
 
-extern sfmt_t sfmt;
+extern sfmt_t g_sfmt;
+
+// file-scope string constants don't always have the g_ prefix, but multi-file
+// constants are always tagged.
+extern const char g_errstr_fopen[];
+extern const char g_cmdline_format_str[];
+
+extern FILE* g_logfile;
 
-extern const char errstr_fopen[];
-extern const char cmdline_format_str[];
+// mostly-safe sprintf buffer.  warning: do NOT put allele codes or
+// arbitrary-length lists in here.
+extern char g_logbuf[];
 
-extern FILE* logfile;
-extern char logbuf[];
 extern uint32_t g_debug_on;
 extern uint32_t g_log_failed;
 
-extern uintptr_t g_sample_ct;
+// should remove this global: multithreaded functions should use a file-local
+// thread_ct which will occasionally be smaller due to job size.
 extern uint32_t g_thread_ct;
 
 typedef struct ll_str_struct {
@@ -792,21 +873,8 @@ typedef struct range_list_struct {
   uint32_t name_max_len;
 } Range_list;
 
-typedef union {
-  float ff;
-  int32_t ii;
-} __floatint32;
-
-typedef union {
-  double dd;
-#ifdef __LP64__
-  uintptr_t uu[1];
-#else
-  uintptr_t uu[2];
-#endif
-} __double_ulong;
-
-uint32_t push_ll_str(Ll_str** ll_stack_ptr, const char* ss);
+// Pushes a copy of ss (allocated via malloc) onto ll_stack.
+uint32_t push_ll_str(const char* ss, Ll_str** ll_stack_ptr);
 
 // warning: do NOT include allele codes (unless they're guaranteed to be SNPs)
 // in log strings; they can overflow the buffer.
@@ -820,76 +888,83 @@ void logprintb();
 
 void logerrprintb();
 
-#define LOGPRINTF(...) sprintf(logbuf, __VA_ARGS__); logprintb();
+#define LOGPRINTF(...) sprintf(g_logbuf, __VA_ARGS__); logprintb();
 
-#define LOGERRPRINTF(...) sprintf(logbuf, __VA_ARGS__); logerrprintb();
+#define LOGERRPRINTF(...) sprintf(g_logbuf, __VA_ARGS__); logerrprintb();
 
 // input for wordwrap/LOGPRINTFWW should have no intermediate '\n's.  If
 // suffix_len is 0, there should be a terminating \n.
-void wordwrap(char* ss, uint32_t suffix_len);
+// void wordwrap(uint32_t suffix_len, char* ss);
 
-#define LOGPREPRINTFWW(...) sprintf(logbuf, __VA_ARGS__); wordwrap(logbuf, 0);
+void wordwrapb(uint32_t suffix_len);
 
-#define LOGPRINTFWW(...) sprintf(logbuf, __VA_ARGS__); wordwrap(logbuf, 0); logprintb();
+#define LOGPREPRINTFWW(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(0);
 
-#define LOGERRPRINTFWW(...) sprintf(logbuf, __VA_ARGS__); wordwrap(logbuf, 0); logerrprintb();
+#define LOGPRINTFWW(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(0); logprintb();
+
+#define LOGERRPRINTFWW(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(0); logerrprintb();
 
 // 5 = length of "done." suffix, which is commonly used
-#define LOGPRINTFWW5(...) sprintf(logbuf, __VA_ARGS__); wordwrap(logbuf, 5); logprintb();
+#define LOGPRINTFWW5(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(5); logprintb();
 
 #ifdef STABLE_BUILD
-  #define UNSTABLE(val) sptr = strcpya(&(logbuf[9]), val); goto main_unstable_disabled
+  #define UNSTABLE(val) sptr = strcpya(&(g_logbuf[9]), val); goto main_unstable_disabled
 #else
   #define UNSTABLE(val)
 #endif
 
-int32_t fopen_checked(FILE** target_ptr, const char* fname, const char* mode);
+int32_t fopen_checked(const char* fname, const char* mode, FILE** target_ptr);
 
-static inline int32_t putc_checked(int32_t ii, FILE* outfile) {
+HEADER_INLINE int32_t putc_checked(int32_t ii, FILE* outfile) {
   putc(ii, outfile);
   return ferror(outfile);
 }
 
-static inline int32_t fputs_checked(const char* ss, FILE* outfile) {
+HEADER_INLINE int32_t fputs_checked(const char* ss, FILE* outfile) {
   fputs(ss, outfile);
   return ferror(outfile);
 }
 
+// This must be used for all fwrite() calls where len could be >= 2^31, since
+// OS X raw fwrite() doesn't work in that case.
 int32_t fwrite_checked(const void* buf, size_t len, FILE* outfile);
 
-static inline int32_t fread_checked(char* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
+HEADER_INLINE int32_t fread_checked(char* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
   *bytes_read_ptr = fread(buf, 1, len, infile);
   return ferror(infile);
 }
 
-static inline void fclose_cond(FILE* fptr) {
+HEADER_INLINE void fclose_cond(FILE* fptr) {
   if (fptr) {
     fclose(fptr);
   }
 }
 
-static inline int32_t fclose_null(FILE** fptr_ptr) {
+HEADER_INLINE int32_t fclose_null(FILE** fptr_ptr) {
   int32_t ii = ferror(*fptr_ptr);
   int32_t jj = fclose(*fptr_ptr);
   *fptr_ptr = NULL;
   return ii || jj;
 }
 
-int32_t gzopen_checked(gzFile* target_ptr, const char* fname, const char* mode);
+// Also sets 128k read buffer.  Can return RET_OPEN_FAIL or RET_NOMEM.
+int32_t gzopen_read_checked(const char* fname, gzFile* gzf_ptr);
+
+// pigz interface should be used for writing .gz files.
 
-static inline int32_t gzclose_null(gzFile* gzf_ptr) {
+HEADER_INLINE int32_t gzclose_null(gzFile* gzf_ptr) {
   int32_t ii = gzclose(*gzf_ptr);
   *gzf_ptr = NULL;
   return (ii != Z_OK);
 }
 
-static inline void gzclose_cond(gzFile gz_infile) {
+HEADER_INLINE void gzclose_cond(gzFile gz_infile) {
   if (gz_infile) {
     gzclose(gz_infile);
   }
 }
 
-static inline int32_t flexwrite_checked(const void* buf, size_t len, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+HEADER_INLINE int32_t flexwrite_checked(const void* buf, size_t len, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
   if (!output_gz) {
     return fwrite_checked(buf, len, outfile);
   } else {
@@ -897,7 +972,7 @@ static inline int32_t flexwrite_checked(const void* buf, size_t len, uint32_t ou
   }
 }
 
-static inline int32_t flexputc_checked(int32_t ii, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+HEADER_INLINE int32_t flexputc_checked(int32_t ii, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
   if (!output_gz) {
     putc(ii, outfile);
     return ferror(outfile);
@@ -906,7 +981,7 @@ static inline int32_t flexputc_checked(int32_t ii, uint32_t output_gz, FILE* out
   }
 }
 
-static inline int32_t flexputs_checked(const char* ss, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+HEADER_INLINE int32_t flexputs_checked(const char* ss, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
   if (!output_gz) {
     return fputs_checked(ss, outfile);
   } else {
@@ -914,7 +989,7 @@ static inline int32_t flexputs_checked(const char* ss, uint32_t output_gz, FILE*
   }
 }
 
-static inline int32_t flexclose_null(uint32_t output_gz, FILE** fptr_ptr, gzFile* gzf_ptr) {
+HEADER_INLINE int32_t flexclose_null(uint32_t output_gz, FILE** fptr_ptr, gzFile* gzf_ptr) {
   if (!output_gz) {
     return fclose_null(fptr_ptr);
   } else {
@@ -922,119 +997,192 @@ static inline int32_t flexclose_null(uint32_t output_gz, FILE** fptr_ptr, gzFile
   }
 }
 
-static inline int32_t bed_suffix_conflict(uint64_t calculation_type, uint32_t recode_modifier) {
-  return (calculation_type & CALC_MAKE_BED) || ((calculation_type & CALC_RECODE) && (recode_modifier & (RECODE_LGEN | RECODE_LGEN_REF | RECODE_RLIST)));
+// manually managed, very large double-ended stack
+extern unsigned char* g_bigstack_base;
+extern unsigned char* g_bigstack_end;
+
+HEADER_INLINE uintptr_t bigstack_left() {
+  return (((uintptr_t)g_bigstack_end) - ((uintptr_t)g_bigstack_base));
 }
 
-// manually managed, very large stack
-extern unsigned char* wkspace_base;
-extern uintptr_t wkspace_left;
+// Basic 64-byte-aligned allocation at bottom of stack.
+unsigned char* bigstack_alloc(uintptr_t size);
 
-unsigned char* wkspace_alloc(uintptr_t size);
 
-static inline int32_t wkspace_alloc_c_checked(char** dc_ptr, uintptr_t size) {
-  *dc_ptr = (char*)wkspace_alloc(size);
-  return !(*dc_ptr);
+// Typesafe, return-0-iff-success interfaces.  (See also bigstack_calloc_...
+// further below.)
+HEADER_INLINE int32_t bigstack_alloc_c(uintptr_t ct, char** cp_ptr) {
+  *cp_ptr = (char*)bigstack_alloc(ct);
+  return !(*cp_ptr);
 }
 
-static inline int32_t wkspace_alloc_d_checked(double** dp_ptr, uintptr_t size) {
-  *dp_ptr = (double*)wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_d(uintptr_t ct, double** dp_ptr) {
+  *dp_ptr = (double*)bigstack_alloc(ct * sizeof(double));
   return !(*dp_ptr);
 }
 
-static inline int32_t wkspace_alloc_f_checked(float** fp_ptr, uintptr_t size) {
-  *fp_ptr = (float*)wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_f(uintptr_t ct, float** fp_ptr) {
+  *fp_ptr = (float*)bigstack_alloc(ct * sizeof(float));
   return !(*fp_ptr);
 }
 
-static inline int32_t wkspace_alloc_i_checked(int32_t** ip_ptr, uintptr_t size) {
-  *ip_ptr = (int32_t*)wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_i(uintptr_t ct, int32_t** ip_ptr) {
+  *ip_ptr = (int32_t*)bigstack_alloc(ct * sizeof(int32_t));
   return !(*ip_ptr);
 }
 
-static inline int32_t wkspace_alloc_uc_checked(unsigned char** ucp_ptr, uintptr_t size) {
-  *ucp_ptr = wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
+  *ucp_ptr = bigstack_alloc(ct);
   return !(*ucp_ptr);
 }
 
-static inline int32_t wkspace_alloc_ui_checked(uint32_t** uip_ptr, uintptr_t size) {
-  *uip_ptr = (uint32_t*)wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
+  *uip_ptr = (uint32_t*)bigstack_alloc(ct * sizeof(int32_t));
   return !(*uip_ptr);
 }
 
-static inline int32_t wkspace_alloc_ul_checked(uintptr_t** ulp_ptr, uintptr_t size) {
-  *ulp_ptr = (uintptr_t*)wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
+  *ulp_ptr = (uintptr_t*)bigstack_alloc(ct * sizeof(intptr_t));
   return !(*ulp_ptr);
 }
 
-static inline int32_t wkspace_alloc_ll_checked(int64_t** llp_ptr, uintptr_t size) {
-  *llp_ptr = (int64_t*)wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_ll(uintptr_t ct, int64_t** llp_ptr) {
+  *llp_ptr = (int64_t*)bigstack_alloc(ct * sizeof(int64_t));
   return !(*llp_ptr);
 }
 
-static inline int32_t wkspace_alloc_ull_checked(uint64_t** ullp_ptr, uintptr_t size) {
-  *ullp_ptr = (uint64_t*)wkspace_alloc(size);
+HEADER_INLINE int32_t bigstack_alloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
+  *ullp_ptr = (uint64_t*)bigstack_alloc(ct * sizeof(int64_t));
   return !(*ullp_ptr);
 }
 
-void wkspace_reset(void* new_base);
+HEADER_INLINE void bigstack_reset(const void* new_base) {
+  g_bigstack_base = (unsigned char*)new_base;
+}
 
-void wkspace_shrink_top(void* rebase, uintptr_t new_size);
+HEADER_INLINE void bigstack_end_reset(const void* new_end) {
+  g_bigstack_end = (unsigned char*)new_end;
+}
 
-static inline unsigned char* top_alloc(uintptr_t* topsize_ptr, uintptr_t size) {
-  uintptr_t ts = *topsize_ptr + ((size + 15) & (~(15 * ONELU)));
-  if (ts > wkspace_left) {
-    return NULL;
-  } else {
-    *topsize_ptr = ts;
-    return &(wkspace_base[wkspace_left - ts]);
-  }
+HEADER_INLINE void bigstack_double_reset(const void* new_base, const void* new_end) {
+  bigstack_reset(new_base);
+  bigstack_end_reset(new_end);
+}
+
+void bigstack_shrink_top(const void* rebase, uintptr_t new_size);
+
+#define END_ALLOC_CHUNK 16
+#define END_ALLOC_CHUNK_M1 (END_ALLOC_CHUNK - 1)
+
+HEADER_INLINE void bigstack_end_set(const void* unaligned_end) {
+  g_bigstack_end = (unsigned char*)(((uintptr_t)unaligned_end) & (~(END_ALLOC_CHUNK_M1 * ONELU)));
+}
+
+// assumes size is divisible by END_ALLOC_CHUNK
+// (no value in directly calling this with a constant size parameter: the
+// compiler will properly optimize a bigstack_end_alloc() call)
+unsigned char* bigstack_end_alloc_presized(uintptr_t size);
+
+HEADER_INLINE unsigned char* bigstack_end_alloc(uintptr_t size) {
+  // round up to a multiple of END_ALLOC_CHUNK, as bigstack_end_alloc_presized() assumes
+  size = round_up_pow2(size, END_ALLOC_CHUNK);
+  return bigstack_end_alloc_presized(size);
+}
+
+#define bigstack_end_aligned_alloc bigstack_end_alloc
+
+HEADER_INLINE int32_t bigstack_end_alloc_c(uintptr_t ct, char** cp_ptr) {
+  *cp_ptr = (char*)bigstack_end_alloc(ct);
+  return !(*cp_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_alloc_d(uintptr_t ct, double** dp_ptr) {
+  *dp_ptr = (double*)bigstack_end_alloc(ct * sizeof(double));
+  return !(*dp_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_alloc_f(uintptr_t ct, float** fp_ptr) {
+  *fp_ptr = (float*)bigstack_end_alloc(ct * sizeof(float));
+  return !(*fp_ptr);
 }
 
-static inline Ll_str* top_alloc_llstr(uintptr_t* topsize_ptr, uint32_t size) {
-  return (Ll_str*)top_alloc(topsize_ptr, size + sizeof(Ll_str));
+HEADER_INLINE int32_t bigstack_end_alloc_i(uintptr_t ct, int32_t** ip_ptr) {
+  *ip_ptr = (int32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
+  return !(*ip_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_alloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
+  *ucp_ptr = bigstack_end_alloc(ct);
+  return !(*ucp_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_alloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
+  *uip_ptr = (uint32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
+  return !(*uip_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_alloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
+  *ulp_ptr = (uintptr_t*)bigstack_end_alloc(ct * sizeof(intptr_t));
+  return !(*ulp_ptr);
 }
 
-static inline int32_t is_letter(unsigned char ucc) {
+HEADER_INLINE int32_t bigstack_end_alloc_ll(uintptr_t ct, int64_t** llp_ptr) {
+  *llp_ptr = (int64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
+  return !(*llp_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_alloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
+  *ullp_ptr = (uint64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
+  return !(*ullp_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_alloc_llstr(uintptr_t str_bytes, Ll_str** llstrp_ptr) {
+  *llstrp_ptr = (Ll_str*)bigstack_end_alloc(str_bytes + sizeof(Ll_str));
+  return !(*llstrp_ptr);
+}
+
+
+HEADER_INLINE int32_t is_letter(unsigned char ucc) {
   return (((ucc & 192) == 64) && (((ucc - 1) & 31) < 26));
 }
 
 // if we need the digit value, better to use (unsigned char)cc - '0'...
-static inline int32_t is_digit(unsigned char ucc) {
+HEADER_INLINE int32_t is_digit(unsigned char ucc) {
   return (ucc <= '9') && (ucc >= '0');
 }
 
-static inline int32_t is_not_digit(unsigned char ucc) {
+HEADER_INLINE int32_t is_not_digit(unsigned char ucc) {
   return (ucc > '9') || (ucc < '0');
 }
 
-static inline int32_t is_not_nzdigit(unsigned char ucc) {
+HEADER_INLINE int32_t is_not_nzdigit(unsigned char ucc) {
   return (ucc > '9') || (ucc <= '0');
 }
 
 // may as well treat all chars < 32, except tab, as eoln...
-static inline int32_t is_eoln(unsigned char ucc) {
-  return (ucc < 32) && (ucc != '\t');
-}
-
 // kns = "known non-space" (where tab counts as a space)
-static inline int32_t is_eoln_kns(unsigned char ucc) {
+/*
+HEADER_INLINE int32_t is_eoln_kns(unsigned char ucc) {
   return (ucc < 32);
 }
+*/
 
-static inline int32_t is_eoln_or_comment(unsigned char ucc) {
-  return (ucc < 32) || (ucc == '#');
+HEADER_INLINE int32_t is_space_or_eoln(unsigned char ucc) {
+  return (ucc <= 32);
 }
 
-static inline int32_t no_more_tokens(char* sptr) {
-  return ((!sptr) || is_eoln(*sptr));
+// could assert ucc is not a space/tab
+#define is_eoln_kns is_space_or_eoln
+
+HEADER_INLINE int32_t is_eoln_or_comment_kns(unsigned char ucc) {
+  return (ucc < 32) || (ucc == '#');
 }
 
-static inline int32_t no_more_tokens_kns(char* sptr) {
+HEADER_INLINE int32_t no_more_tokens_kns(const char* sptr) {
   return ((!sptr) || is_eoln_kns(*sptr));
 }
 
-static inline char* skip_initial_spaces(char* sptr) {
+HEADER_INLINE char* skip_initial_spaces(char* sptr) {
   while ((*sptr == ' ') || (*sptr == '\t')) {
     sptr++;
   }
@@ -1042,7 +1190,7 @@ static inline char* skip_initial_spaces(char* sptr) {
 }
 
 /*
-static inline int32_t is_space_or_eoln(unsigned char cc) {
+HEADER_INLINE int32_t is_space_or_eoln(unsigned char cc) {
   // ' ', \t, \n, \0, \r
 #ifdef __LP64__
   return (ucc <= 32) && (0x100002601LLU & (1LLU << ucc));
@@ -1051,88 +1199,80 @@ static inline int32_t is_space_or_eoln(unsigned char cc) {
 #endif
 }
 */
-static inline int32_t is_space_or_eoln(unsigned char ucc) {
-  return (ucc <= 32);
-}
 
-uint32_t match_upper(char* ss, const char* fixed_str);
+// Returns whether uppercased ss matches nonempty fixed_str.  Assumes fixed_str
+// contains nothing but letters and a null terminator.
+uint32_t match_upper(const char* ss, const char* fixed_str);
 
-uint32_t match_upper_nt(char* ss, const char* fixed_str, uint32_t ct);
+uint32_t match_upper_nt(const char* ss, const char* fixed_str, uint32_t ct);
 
-uint32_t scan_posint_capped(char* ss, uint32_t* valp, uint32_t cap_div_10, uint32_t cap_mod_10);
+// Reads an integer in [1, cap].  Assumes first character is nonspace.  Has the
+// overflow detection atoi() lacks.
+// A funny-looking div_10/mod_10 interface is used since the cap will usually
+// be a constant, and we want the integer division/modulus to occur at compile
+// time.
+uint32_t scan_posint_capped(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);
 
-uint32_t scan_uint_capped(char* ss, uint32_t* valp, uint32_t cap_div_10, uint32_t cap_mod_10);
+uint32_t scan_uint_capped(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);
 
-uint32_t scan_int_abs_bounded(char* ss, int32_t* valp, uint32_t bound_div_10, uint32_t bound_mod_10);
+uint32_t scan_int_abs_bounded(const char* ss, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp);
 
 // intentionally rejects -2^31 for now
-static inline uint32_t scan_int32(char* ss, int32_t* valp) {
-  return scan_int_abs_bounded(ss, valp, 0x7fffffff / 10, 0x7fffffff % 10);
+HEADER_INLINE uint32_t scan_int32(const char* ss, int32_t* valp) {
+  return scan_int_abs_bounded(ss, 0x7fffffff / 10, 0x7fffffff % 10, valp);
 }
 
 // default cap = 0x7ffffffe
-static inline uint32_t scan_posint_defcap(char* ss, uint32_t* valp) {
-  return scan_posint_capped(ss, valp, 0x7ffffffe / 10, 0x7ffffffe % 10);
+HEADER_INLINE uint32_t scan_posint_defcap(const char* ss, uint32_t* valp) {
+  return scan_posint_capped(ss, 0x7ffffffe / 10, 0x7ffffffe % 10, valp);
 }
 
-static inline uint32_t scan_uint_defcap(char* ss, uint32_t* valp) {
-  return scan_uint_capped(ss, valp, 0x7ffffffe / 10, 0x7ffffffe % 10);
+HEADER_INLINE uint32_t scan_uint_defcap(const char* ss, uint32_t* valp) {
+  return scan_uint_capped(ss, 0x7ffffffe / 10, 0x7ffffffe % 10, valp);
 }
 
-static inline uint32_t scan_int_abs_defcap(char* ss, int32_t* valp) {
-  return scan_int_abs_bounded(ss, valp, 0x7ffffffe / 10, 0x7ffffffe % 10);
+HEADER_INLINE uint32_t scan_int_abs_defcap(const char* ss, int32_t* valp) {
+  return scan_int_abs_bounded(ss, 0x7ffffffe / 10, 0x7ffffffe % 10, valp);
 }
 
-static inline uint32_t scan_uint_icap(char* ss, uint32_t* valp) {
-  return scan_uint_capped(ss, valp, 0x7fffffff / 10, 0x7fffffff % 10);
+HEADER_INLINE uint32_t scan_uint_icap(const char* ss, uint32_t* valp) {
+  return scan_uint_capped(ss, 0x7fffffff / 10, 0x7fffffff % 10, valp);
 }
 
-uint32_t scan_posintptr(char* ss, uintptr_t* valp);
+uint32_t scan_posintptr(const char* ss, uintptr_t* valp);
 
-static inline char replace_if_zero(char cc, char replacement) {
-  if (cc != '0') {
-    return cc;
-  } else {
-    return replacement;
-  }
-}
-
-static inline const char* replace_if_zstr(char* ss, const char* replacement) {
-  if ((ss[0] != '0') || ss[1]) {
-    return ss;
-  }
-  return replacement;
-}
-
-static inline uint32_t scan_double(char* ss, double* valp) {
+HEADER_INLINE uint32_t scan_double(const char* ss, double* valp) {
   char* ss2;
   *valp = strtod(ss, &ss2);
   return (ss == ss2);
 }
 
-static inline uint32_t scan_float(char* ss, float* valp) {
+HEADER_INLINE uint32_t scan_float(const char* ss, float* valp) {
   char* ss2;
   *valp = strtof(ss, &ss2);
   return (ss == ss2);
 }
 
-uint32_t scan_two_doubles(char* ss, double* val1p, double* val2p);
+// __restrict isn't very important for newer x86 processors since loads/stores
+// tend to be automatically reordered, but may as well use it properly in
+// plink_common.
+uint32_t scan_two_doubles(char* ss, double* __restrict val1p, double* __restrict val2p);
 
-int32_t scan_token_ct_len(FILE* infile, char* buf, uintptr_t half_bufsize, uintptr_t* token_ct_ptr, uintptr_t* max_token_len_ptr);
+int32_t scan_token_ct_len(uintptr_t half_bufsize, FILE* infile, char* buf, uintptr_t* __restrict token_ct_ptr, uintptr_t* __restrict max_token_len_ptr);
 
-int32_t read_tokens(FILE* infile, char* buf, uintptr_t half_bufsize, uintptr_t token_ct, uintptr_t max_token_len, char* token_name_buf);
+int32_t read_tokens(uintptr_t half_bufsize, uintptr_t token_ct, uintptr_t max_token_len, FILE* infile, char* __restrict buf, char* __restrict token_name_buf);
 
-static inline char* memseta(char* target, unsigned char val, uintptr_t ct) {
+HEADER_INLINE char* memseta(char* target, unsigned char val, uintptr_t ct) {
   memset(target, val, ct);
   return &(target[ct]);
 }
 
-static inline char* memcpya(char* target, const void* source, uintptr_t ct) {
+HEADER_INLINE char* memcpya(char* __restrict target, const void* __restrict source, uintptr_t ct) {
   memcpy(target, source, ct);
   return &(target[ct]);
 }
 
-static inline char* memcpyb(char* target, const void* source, uint32_t ct) {
+HEADER_INLINE char* memcpyb(char* __restrict target, const void* __restrict source, uint32_t ct) {
   // Same as memcpya, except the return value is one byte earlier.  Generally
   // used when source is a null-terminated string of known length and we want
   // to copy the null, but sometimes we need to append later.
@@ -1140,41 +1280,41 @@ static inline char* memcpyb(char* target, const void* source, uint32_t ct) {
   return &(target[ct - 1]);
 }
 
-static inline char* memcpyax(char* target, const void* source, uint32_t ct, char extra_char) {
+HEADER_INLINE char* memcpyax(char* __restrict target, const void* __restrict source, uint32_t ct, char extra_char) {
   memcpy(target, source, ct);
   target[ct] = extra_char;
   return &(target[ct + 1]);
 }
 
-static inline void memcpyx(char* target, const void* source, uint32_t ct, char extra_char) {
+HEADER_INLINE void memcpyx(char* __restrict target, const void* __restrict source, uint32_t ct, char extra_char) {
   memcpy(target, source, ct);
   target[ct] = extra_char;
 }
 
-static inline void memcpyl3(char* target, const void* source) {
+HEADER_INLINE void memcpyl3(char* __restrict target, const void* __restrict source) {
   // when it's safe to clobber the fourth character, this is faster
-  *((uint32_t*)target) = *((uint32_t*)source);
+  *((uint32_t*)target) = *((const uint32_t*)source);
 }
 
-static inline char* memcpyl3a(char* target, const void* source) {
+HEADER_INLINE char* memcpyl3a(char* __restrict target, const void* __restrict source) {
   memcpyl3(target, source);
   return &(target[3]);
 }
 
-static inline char* strcpya(char* target, const void* source) {
+HEADER_INLINE char* strcpya(char* __restrict target, const void* __restrict source) {
   uintptr_t slen = strlen((char*)source);
   memcpy(target, source, slen);
   return &(target[slen]);
 }
 
-static inline char* strcpyax(char* target, const void* source, char extra_char) {
+HEADER_INLINE char* strcpyax(char* __restrict target, const void* __restrict source, char extra_char) {
   uintptr_t slen = strlen((char*)source);
   memcpy(target, source, slen);
   target[slen] = extra_char;
   return &(target[slen + 1]);
 }
 
-static inline void append_binary_eoln(char** target_ptr) {
+HEADER_INLINE void append_binary_eoln(char** target_ptr) {
 #ifdef _WIN32
   (*target_ptr)[0] = '\r';
   (*target_ptr)[1] = '\n';
@@ -1185,7 +1325,7 @@ static inline void append_binary_eoln(char** target_ptr) {
 #endif
 }
 
-static inline void fputs_w4(char* ss, FILE* outfile) {
+HEADER_INLINE void fputs_w4(const char* ss, FILE* outfile) {
   // for efficient handling of width-4 allele columns; don't want to call
   // strlen() since that's redundant with fputs
   if (!ss[1]) {
@@ -1206,69 +1346,47 @@ int32_t gzputs_w4(gzFile gz_outfile, const char* ss);
 
 int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr);
 
-int32_t get_next_noncomment_excl(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr, uintptr_t* marker_exclude, uintptr_t* marker_uidx_ptr);
-
-char* token_end(char* sptr);
-
-// does not return NULL if token ends with null char
-char* token_endl(char* sptr);
+int32_t get_next_noncomment_excl(const uintptr_t* __restrict marker_exclude, FILE* fptr, char** lptr_ptr, uintptr_t* __restrict line_idx_ptr, uintptr_t* __restrict marker_uidx_ptr);
 
-// token_endl without checking if sptr == NULL
-// also assumes we are currently in a token -- UNSAFE OTHERWISE
-static inline char* token_endnn(char* sptr) {
+// assumes we are currently in a token -- UNSAFE OTHERWISE
+HEADER_INLINE char* token_endnn(char* sptr) {
   while (!is_space_or_eoln(*(++sptr)));
   return sptr;
 }
 
-void get_top_two(uint32_t* uint_arr, uintptr_t uia_size, uintptr_t* top_idx_ptr, uintptr_t* second_idx_ptr);
+void get_top_two_ui(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr);
 
-static inline char* uint32_encode_5_hi_uchar(char* start, uint32_t uii) {
-  // tried a few bit hacks here, but turns out nothing really beats this
-  *start++ = (unsigned char)((uii >> 28) | 0x80);
-  *start++ = (unsigned char)((uii >> 21) | 0x80);
-  *start++ = (unsigned char)((uii >> 14) | 0x80);
-  *start++ = (unsigned char)((uii >> 7) | 0x80);
-  *start++ = (unsigned char)(uii | 0x80);
-  return start;
-}
-
-static inline uint32_t uint32_decode_5_hi_uchar(char* start) {
-  uint32_t uii = ((uint32_t)((unsigned char)(*start++))) << 28;
-  uii |= (((uint32_t)((unsigned char)(*start++))) & 0x7f) << 21;
-  uii |= (((uint32_t)((unsigned char)(*start++))) & 0x7f) << 14;
-  uii |= (((uint32_t)((unsigned char)(*start++))) & 0x7f) << 7;
-  uii |= ((uint32_t)((unsigned char)(*start))) & 0x7f;
-  return uii;
-}
-
-int32_t intlen(int32_t num);
+uint32_t intlen(int32_t num);
 
 // safer than token_endnn(), since it handles length zero
-static inline uintptr_t strlen_se(char* ss) {
-  char* ss2 = ss;
+HEADER_INLINE uintptr_t strlen_se(const char* ss) {
+  const char* ss2 = ss;
   while (!is_space_or_eoln(*ss2)) {
     ss2++;
   }
   return (uintptr_t)(ss2 - ss);
 }
 
-int32_t strcmp_se(char* s_read, const char* s_const, uint32_t len);
+int32_t strcmp_se(const char* s_read, const char* s_const, uint32_t s_const_len);
 
 char* next_token(char* sptr);
 
 char* next_token_mult(char* sptr, uint32_t ct);
 
-static inline char* next_token_multz(char* sptr, uint32_t ct) {
-  if (!ct) {
-    return sptr;
-  } else {
+HEADER_INLINE char* next_token_multz(char* sptr, uint32_t ct) {
+  // tried replacing this with ternary operator, but that actually seemed to
+  // slow things down a bit under gcc 4.2.1 (tail call optimization issue?).
+  // todo: recheck this under newer gcc/clang.
+  if (ct) {
     return next_token_mult(sptr, ct);
+  } else {
+    return sptr;
   }
 }
 
 uint32_t count_tokens(const char* bufptr);
 
-static inline char* fw_strcpyn(uint32_t min_width, uint32_t source_len, const char* source, char* dest) {
+HEADER_INLINE char* fw_strcpyn(uint32_t min_width, uint32_t source_len, const char* source, char* dest) {
   // right-justified strcpy with known source length
   if (source_len < min_width) {
     memcpy(memseta(dest, 32, min_width - source_len), source, source_len);
@@ -1278,43 +1396,52 @@ static inline char* fw_strcpyn(uint32_t min_width, uint32_t source_len, const ch
   }
 }
 
-static inline char* fw_strcpy(uint32_t min_width, const char* source, char* dest) {
+HEADER_INLINE char* fw_strcpy(uint32_t min_width, const char* source, char* dest) {
   return fw_strcpyn(min_width, strlen(source), source, dest);
 }
 
 uint32_t count_and_measure_multistr(const char* multistr, uintptr_t* max_slen_ptr);
 
-char* uint32_write(char* start, uint32_t uii);
+char* uint32toa(uint32_t uii, char* start);
 
-char* int32_write(char* start, int32_t ii);
+char* int32toa(int32_t ii, char* start);
 
-void uint32_write4(char* start, uint32_t uii);
+// Write exactly four digits (padding with zeroes if necessary); useful for
+// e.g. floating point encoders.  uii must not be >= 10^4.
+char* uitoa_z4(uint32_t uii, char* start);
 
-char* int64_write(char* start, int64_t llii);
+char* int64toa(int64_t llii, char* start);
 
-char* uint32_writew4(char* start, uint32_t uii);
+// Minimum field width 4 (padding with spaces on left).
+char* uint32toa_w4(uint32_t uii, char* start);
 
-char* uint32_writew6(char* start, uint32_t uii);
+char* uint32toa_w6(uint32_t uii, char* start);
 
-char* uint32_writew7(char* start, uint32_t uii);
+char* uint32toa_w7(uint32_t uii, char* start);
 
-char* uint32_writew8(char* start, uint32_t uii);
+char* uint32toa_w8(uint32_t uii, char* start);
 
-char* uint32_writew10(char* start, uint32_t uii);
+char* uint32toa_w10(uint32_t uii, char* start);
 
-char* double_e_write(char* start, double dxx);
+// These limited-precision converters are usually several times as fast as
+// grisu2's descendants; and let's not even speak of sprintf.  (I'm guessing
+// that the algorithm cannot be made round-trip-safe without throwing away its
+// performance advantage, since we currently multiply by numbers like 1.0e256
+// which don't have an exact representation.  But these functions are very,
+// very good at what they do.)
+char* dtoa_e(double dxx, char* start);
 
-char* float_e_write(char* start, float dxx);
+char* ftoa_e(float dxx, char* start);
 
-char* double_f_writew2(char* start, double dxx);
+char* dtoa_f_p2(double dxx, char* start);
 
-char* double_f_writew3(char* start, double dxx);
+char* dtoa_f_p3(double dxx, char* start);
 
-char* double_f_writew96(char* start, double dxx);
+char* dtoa_f_w9p6(double dxx, char* start);
 
-char* double_f_writew74(char* start, double dxx);
+char* dtoa_f_w7p4(double dxx, char* start);
 
-static inline void zeroes_to_spaces(char* start) {
+HEADER_INLINE void trailing_zeroes_to_spaces(char* start) {
   // removes trailing zeroes
   start--;
   while (*start == '0') {
@@ -1325,26 +1452,23 @@ static inline void zeroes_to_spaces(char* start) {
   }
 }
 
-static inline char* clip_zeroes(char* start) {
-  start--;
-  while (*start == '0') {
-    *(start--) = ' ';
-  }
-  if (*start == '.') {
-    start--;
-  }
-  return &(start[1]);
+HEADER_INLINE char* clip_trailing_zeroes(char* start) {
+  char cc;
+  do {
+    cc = *(--start);
+  } while (cc == '0');
+  return &(start[(cc != '.')]);
 }
 
-char* double_f_writew96_spaced(char* start, double dxx);
+char* dtoa_f_w9p6_spaced(double dxx, char* start);
 
-char* double_f_writew96_clipped(char* start, double dxx);
+char* dtoa_f_w9p6_clipped(double dxx, char* start);
 
-char* double_g_write(char* start, double dxx);
+char* dtoa_g(double dxx, char* start);
 
-char* float_g_write(char* start, float dxx);
+char* ftoa_g(float dxx, char* start);
 
-static inline char* width_force(uint32_t min_width, char* startp, char* endp) {
+HEADER_INLINE char* width_force(uint32_t min_width, char* startp, char* endp) {
   uintptr_t diff = (endp - startp);
   if (diff >= min_width) {
     return endp;
@@ -1359,293 +1483,308 @@ static inline char* width_force(uint32_t min_width, char* startp, char* endp) {
   }
 }
 
-char* double_g_writewx2(char* start, double dxx, uint32_t min_width);
+// assumes min_width >= 5.
+char* dtoa_g_wxp2(double dxx, uint32_t min_width, char* start);
 
-char* double_g_writewx3(char* start, double dxx, uint32_t min_width);
+// assumes min_width >= 5.
+char* dtoa_g_wxp3(double dxx, uint32_t min_width, char* start);
 
-char* double_g_writewx4(char* start, double dxx, uint32_t min_width);
+// only requires min_width to be positive; less than 5 is ok
+char* dtoa_g_wxp4(double dxx, uint32_t min_width, char* start);
 
-char* double_g_writewx8(char* start, double dxx, uint32_t min_width);
+// only requires min_width to be positive; less than 8 is ok
+char* dtoa_g_wxp8(double dxx, uint32_t min_width, char* start);
 
-static inline char* uint32_writex(char* start, uint32_t uii, char extra_char) {
-  char* penult = uint32_write(start, uii);
+HEADER_INLINE char* uint32toa_x(uint32_t uii, char extra_char, char* start) {
+  char* penult = uint32toa(uii, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* int32_writex(char* start, int32_t ii, char extra_char) {
-  char* penult = int32_write(start, ii);
+HEADER_INLINE char* int32toa_x(int32_t ii, char extra_char, char* start) {
+  char* penult = int32toa(ii, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* uint32_writew4x(char* start, uint32_t uii, char extra_char) {
-  char* penult = uint32_writew4(start, uii);
+HEADER_INLINE char* uint32toa_w4x(uint32_t uii, char extra_char, char* start) {
+  char* penult = uint32toa_w4(uii, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* uint32_writew6x(char* start, uint32_t uii, char extra_char) {
-  char* penult = uint32_writew6(start, uii);
+HEADER_INLINE char* uint32toa_w6x(uint32_t uii, char extra_char, char* start) {
+  char* penult = uint32toa_w6(uii, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* uint32_writew7x(char* start, uint32_t uii, char extra_char) {
-  char* penult = uint32_writew7(start, uii);
+HEADER_INLINE char* uint32toa_w7x(uint32_t uii, char extra_char, char* start) {
+  char* penult = uint32toa_w7(uii, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* uint32_writew8x(char* start, uint32_t uii, char extra_char) {
-  char* penult = uint32_writew8(start, uii);
+HEADER_INLINE char* uint32toa_w8x(uint32_t uii, char extra_char, char* start) {
+  char* penult = uint32toa_w8(uii, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* uint32_writew10x(char* start, uint32_t uii, char extra_char) {
-  char* penult = uint32_writew10(start, uii);
+HEADER_INLINE char* uint32toa_w10x(uint32_t uii, char extra_char, char* start) {
+  char* penult = uint32toa_w10(uii, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* double_e_writex(char* start, double dxx, char extra_char) {
-  char* penult = double_e_write(start, dxx);
+HEADER_INLINE char* dtoa_ex(double dxx, char extra_char, char* start) {
+  char* penult = dtoa_e(dxx, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* float_e_writex(char* start, float dxx, char extra_char) {
-  char* penult = float_e_write(start, dxx);
+HEADER_INLINE char* ftoa_ex(float fxx, char extra_char, char* start) {
+  char* penult = ftoa_e(fxx, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* double_f_writew96x(char* start, double dxx, char extra_char) {
-  char* penult = double_f_writew96(start, dxx);
+HEADER_INLINE char* dtoa_f_w9p6x(double dxx, char extra_char, char* start) {
+  char* penult = dtoa_f_w9p6(dxx, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* double_f_writew74x(char* start, double dxx, char extra_char) {
-  char* penult = double_f_writew74(start, dxx);
+HEADER_INLINE char* dtoa_f_w7p4x(double dxx, char extra_char, char* start) {
+  char* penult = dtoa_f_w7p4(dxx, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* double_g_writex(char* start, double dxx, char extra_char) {
-  char* penult = double_g_write(start, dxx);
+HEADER_INLINE char* dtoa_gx(double dxx, char extra_char, char* start) {
+  char* penult = dtoa_g(dxx, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* float_g_writex(char* start, float dxx, char extra_char) {
-  char* penult = float_g_write(start, dxx);
+/*
+HEADER_INLINE char* ftoa_gx(float dxx, char extra_char, char* start) {
+  char* penult = ftoa_g(dxx, start);
   *penult = extra_char;
   return &(penult[1]);
 }
+*/
 
-static inline char* double_g_writewx3x(char* start, double dxx, uint32_t min_width, char extra_char) {
-  char* penult = double_g_writewx3(start, dxx, min_width);
+HEADER_INLINE char* dtoa_g_wxp3x(double dxx, uint32_t min_width, char extra_char, char* start) {
+  char* penult = dtoa_g_wxp3(dxx, min_width, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* double_g_writewx4x(char* start, double dxx, uint32_t min_width, char extra_char) {
-  char* penult = double_g_writewx4(start, dxx, min_width);
+HEADER_INLINE char* dtoa_g_wxp4x(double dxx, uint32_t min_width, char extra_char, char* start) {
+  char* penult = dtoa_g_wxp4(dxx, min_width, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline char* double_g_writewx8x(char* start, double dxx, uint32_t min_width, char extra_char) {
-  char* penult = double_g_writewx8(start, dxx, min_width);
+HEADER_INLINE char* dtoa_g_wxp8x(double dxx, uint32_t min_width, char extra_char, char* start) {
+  char* penult = dtoa_g_wxp8(dxx, min_width, start);
   *penult = extra_char;
   return &(penult[1]);
 }
 
-static inline void read_next_terminate(char* target, char* source) {
+HEADER_INLINE void read_next_terminate(char* __restrict target, const char* __restrict source) {
   while (!is_space_or_eoln(*source)) {
     *target++ = *source++;
   }
   *target = '\0';
 }
 
-char* chrom_print_human(char* buf, uint32_t num);
+char* chrom_print_human(uint32_t num, char* buf);
 
-uint32_t allele_set(char** allele_ptr, const char* newval, uint32_t slen);
+// newval does not need to be null-terminated, and slen does not include
+// terminator
+// assumes *allele_ptr is not initialized
+uint32_t allele_set(const char* newval, uint32_t slen, char** allele_ptr);
 
-uint32_t allele_reset(char** allele_ptr, const char* newval, uint32_t slen);
+// *allele_ptr must be initialized; frees *allele_ptr if necessary
+uint32_t allele_reset(const char* newval, uint32_t slen, char** allele_ptr);
 
-void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* pre_shiftp, uint32_t* post_shiftp, uint32_t* incrp);
+void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp);
 
-static inline uintptr_t tri_coord_no_diag(uintptr_t small_coord, uintptr_t big_coord) {
+HEADER_INLINE uintptr_t tri_coord_no_diag(uintptr_t small_coord, uintptr_t big_coord) {
   // small_coord and big_coord are 0-based indices, small_coord < big_coord
   return ((big_coord * (big_coord - 1)) / 2) + small_coord;
 }
 
-static inline uint32_t tri_coord_no_diag_32(uint32_t small_coord, uint32_t big_coord) {
+HEADER_INLINE uint32_t tri_coord_no_diag_32(uint32_t small_coord, uint32_t big_coord) {
   return ((big_coord * (big_coord - 1)) / 2) + small_coord;
 }
 
 // let the compiler worry about the second argument's bit width here
-#define SET_BIT(aa, bb) ((aa)[(bb) / BITCT] |= ONELU << ((bb) % BITCT))
+#define SET_BIT(idx, arr) ((arr)[(idx) / BITCT] |= ONELU << ((idx) % BITCT))
 
-#define SET_BIT_DBL(aa, bb) ((aa)[(bb) / BITCT2] |= ONELU << (2 * ((bb) % BITCT2)))
+#define SET_BIT_DBL(idx, arr) ((arr)[(idx) / BITCT2] |= ONELU << (2 * ((idx) % BITCT2)))
 
-static inline void set_bit(uintptr_t* bit_arr, uint32_t loc) {
-  bit_arr[loc / BITCT] |= (ONELU << (loc % BITCT));
+// useful for coercing int32_t loc to unsigned
+HEADER_INLINE void set_bit(uint32_t loc, uintptr_t* bitarr) {
+  bitarr[loc / BITCT] |= (ONELU << (loc % BITCT));
 }
 
-static inline void set_bit_ul(uintptr_t* bit_arr, uintptr_t loc) {
-  bit_arr[loc / BITCT] |= (ONELU << (loc % BITCT));
+HEADER_INLINE void set_bit_ul(uintptr_t loc, uintptr_t* bitarr) {
+  bitarr[loc / BITCT] |= (ONELU << (loc % BITCT));
 }
 
-void fill_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len);
+// requires positive len
+void fill_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr);
 
-void clear_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len);
+// requires positive len
+void clear_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr);
 
-#define CLEAR_BIT(aa, bb) ((aa)[(bb) / BITCT] &= ~(ONELU << ((bb) % BITCT)))
+#define CLEAR_BIT(idx, arr) ((arr)[(idx) / BITCT] &= ~(ONELU << ((idx) % BITCT)))
 
-#define CLEAR_BIT_DBL(aa, bb) ((aa)[(bb) / BITCT2] &= ~(ONELU << (2 * ((bb) % BITCT2))))
+#define CLEAR_BIT_DBL(idx, arr) ((arr)[(idx) / BITCT2] &= ~(ONELU << (2 * ((idx) % BITCT2))))
 
-static inline void clear_bit(uintptr_t* bit_arr, uint32_t loc) {
-  bit_arr[loc / BITCT] &= ~(ONELU << (loc % BITCT));
+HEADER_INLINE void clear_bit(uint32_t loc, uintptr_t* bitarr) {
+  bitarr[loc / BITCT] &= ~(ONELU << (loc % BITCT));
 }
 
-static inline void clear_bit_ul(uintptr_t* bit_arr, uintptr_t loc) {
-  bit_arr[loc / BITCT] &= ~(ONELU << (loc % BITCT));
+HEADER_INLINE void clear_bit_ul(uintptr_t loc, uintptr_t* bitarr) {
+  bitarr[loc / BITCT] &= ~(ONELU << (loc % BITCT));
 }
 
-#define IS_SET(aa, bb) (((aa)[(bb) / BITCT] >> ((bb) % BITCT)) & 1)
+#define IS_SET(arr, idx) (((arr)[(idx) / BITCT] >> ((idx) % BITCT)) & 1)
 
-#define IS_SET_DBL(aa, bb) (((aa)[(bb) / BITCT2] >> (2 * ((bb) % BITCT2))) & 1)
+#define IS_SET_DBL(arr, idx) (((arr)[(idx) / BITCT2] >> (2 * ((idx) % BITCT2))) & 1)
 
 // use this instead of IS_SET() for signed 32-bit integers
-static inline uint32_t is_set(const uintptr_t* exclude_arr, uint32_t loc) {
-  return (exclude_arr[loc / BITCT] >> (loc % BITCT)) & 1;
+HEADER_INLINE uint32_t is_set(const uintptr_t* bitarr, uint32_t loc) {
+  return (bitarr[loc / BITCT] >> (loc % BITCT)) & 1;
 }
 
-static inline uint32_t is_set_ul(const uintptr_t* exclude_arr, uintptr_t loc) {
-  return (exclude_arr[loc / BITCT] >> (loc % BITCT)) & 1;
+HEADER_INLINE uint32_t is_set_ul(const uintptr_t* bitarr, uintptr_t loc) {
+  return (bitarr[loc / BITCT] >> (loc % BITCT)) & 1;
 }
 
-#define IS_NONNULL_AND_SET(aa, bb) ((aa) && IS_SET(aa, bb))
+#define IS_NONNULL_AND_SET(arr, idx) ((arr) && IS_SET(arr, idx))
 
-uint32_t next_unset_unsafe(uintptr_t* bit_arr, uint32_t loc);
+uint32_t next_unset_unsafe(const uintptr_t* bitarr, uint32_t loc);
 
-static inline void next_unset_unsafe_ck(uintptr_t* bit_arr, uint32_t* loc_ptr) {
-  if (IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_unset_unsafe(bit_arr, *loc_ptr);
+HEADER_INLINE void next_unset_unsafe_ck(const uintptr_t* __restrict bitarr, uint32_t* __restrict loc_ptr) {
+  if (IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_unset_unsafe(bitarr, *loc_ptr);
   }
 }
 
 #ifdef __LP64__
-uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
+uintptr_t next_unset_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc);
 #else
-static inline uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-  return (uintptr_t)next_unset_unsafe(bit_arr, loc);
+HEADER_INLINE uintptr_t next_unset_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
+  return (uintptr_t)next_unset_unsafe(bitarr, loc);
 }
 #endif
 
-static inline void next_unset_ul_unsafe_ck(uintptr_t* bit_arr, uintptr_t* loc_ptr) {
-  if (IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_unset_ul_unsafe(bit_arr, *loc_ptr);
+HEADER_INLINE void next_unset_ul_unsafe_ck(const uintptr_t* __restrict bitarr, uintptr_t* __restrict loc_ptr) {
+  if (IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_unset_ul_unsafe(bitarr, *loc_ptr);
   }
 }
 
-uint32_t next_unset(uintptr_t* bit_arr, uint32_t loc, uint32_t ceil);
+uint32_t next_unset(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil);
 
-static inline void next_unset_ck(uintptr_t* bit_arr, uint32_t* loc_ptr, uint32_t ceil) {
-  if (IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_unset(bit_arr, *loc_ptr, ceil);
+HEADER_INLINE void next_unset_ck(const uintptr_t* __restrict bitarr, uint32_t ceil, uint32_t* __restrict loc_ptr) {
+  if (IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_unset(bitarr, *loc_ptr, ceil);
   }
 }
 
 #ifdef __LP64__
-uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
+uintptr_t next_unset_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil);
 #else
-static inline uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-  return (uintptr_t)next_unset(bit_arr, loc, ceil);
+HEADER_INLINE uintptr_t next_unset_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
+  return (uintptr_t)next_unset(bitarr, loc, ceil);
 }
 #endif
 
-static inline void next_unset_ul_ck(uintptr_t* bit_arr, uintptr_t* loc_ptr, uintptr_t ceil) {
-  if (IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_unset_ul(bit_arr, *loc_ptr, ceil);
+HEADER_INLINE void next_unset_ul_ck(const uintptr_t* __restrict bitarr, uintptr_t ceil, uintptr_t* __restrict loc_ptr) {
+  if (IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_unset_ul(bitarr, *loc_ptr, ceil);
   }
 }
 
-uint32_t next_set_unsafe(uintptr_t* bit_arr, uint32_t loc);
+uint32_t next_set_unsafe(const uintptr_t* bitarr, uint32_t loc);
 
-static inline void next_set_unsafe_ck(uintptr_t* bit_arr, uint32_t* loc_ptr) {
-  if (!IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_set_unsafe(bit_arr, *loc_ptr);
+HEADER_INLINE void next_set_unsafe_ck(const uintptr_t* __restrict bitarr, uint32_t* __restrict loc_ptr) {
+  if (!IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_set_unsafe(bitarr, *loc_ptr);
   }
 }
 
 #ifdef __LP64__
-uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
+uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc);
 #else
-static inline uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-  return (uintptr_t)next_set_unsafe(bit_arr, loc);
+HEADER_INLINE uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
+  return (uintptr_t)next_set_unsafe(bitarr, loc);
 }
 #endif
 
-static inline void next_set_ul_unsafe_ck(uintptr_t* bit_arr, uintptr_t* loc_ptr) {
-  if (!IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_set_ul_unsafe(bit_arr, *loc_ptr);
+HEADER_INLINE void next_set_ul_unsafe_ck(const uintptr_t* __restrict bitarr, uintptr_t* __restrict loc_ptr) {
+  if (!IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_set_ul_unsafe(bitarr, *loc_ptr);
   }
 }
 
-uint32_t next_set(uintptr_t* bit_arr, uint32_t loc, uint32_t ceil);
+uint32_t next_set(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil);
 
-static inline void next_set_ck(uintptr_t* bit_arr, uint32_t* loc_ptr, uint32_t ceil) {
-  if (!IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_set(bit_arr, *loc_ptr, ceil);
+HEADER_INLINE void next_set_ck(const uintptr_t* __restrict bitarr, uint32_t ceil, uint32_t* __restrict loc_ptr) {
+  if (!IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_set(bitarr, *loc_ptr, ceil);
   }
 }
 
 #ifdef __LP64__
-uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
+uintptr_t next_set_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil);
 #else
-static inline uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-  return (uintptr_t)next_set(bit_arr, loc, ceil);
+HEADER_INLINE uintptr_t next_set_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
+  return (uintptr_t)next_set(bitarr, loc, ceil);
 }
 #endif
 
-static inline void next_set_ul_ck(uintptr_t* bit_arr, uintptr_t* loc_ptr, uintptr_t ceil) {
-  if (!IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = next_set_ul(bit_arr, *loc_ptr, ceil);
+HEADER_INLINE void next_set_ul_ck(const uintptr_t* __restrict bitarr, uintptr_t ceil, uintptr_t* loc_ptr) {
+  if (!IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_set_ul(bitarr, *loc_ptr, ceil);
   }
 }
 
-int32_t last_set_bit(uintptr_t* bit_arr, uint32_t word_ct);
+int32_t last_set_bit(const uintptr_t* bitarr, uint32_t word_ct);
 
 // note different interface from last_set_bit()
-// int32_t last_clear_bit(uintptr_t* bit_arr, uint32_t ceil);
+// int32_t last_clear_bit(uintptr_t* bitarr, uint32_t ceil);
 
 // unlike the next_[un]set family, this always returns a STRICTLY earlier
 // position
-uint32_t prev_unset_unsafe(uintptr_t* bit_arr, uint32_t loc);
+uint32_t prev_unset_unsafe(const uintptr_t* bitarr, uint32_t loc);
 
-// uint32_t prev_unset(uintptr_t* bit_arr, uint32_t loc, uint32_t floor);
+// uint32_t prev_unset(uintptr_t* bitarr, uint32_t loc, uint32_t floor);
 
-static inline void prev_unset_unsafe_ck(uintptr_t* bit_arr, uint32_t* loc_ptr) {
+HEADER_INLINE void prev_unset_unsafe_ck(const uintptr_t* bitarr, uint32_t* loc_ptr) {
   *loc_ptr -= 1;
-  if (IS_SET(bit_arr, *loc_ptr)) {
-    *loc_ptr = prev_unset_unsafe(bit_arr, *loc_ptr);
+  if (IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = prev_unset_unsafe(bitarr, *loc_ptr);
   }
 }
 
-// These functions seem to optimize better than memset(arr, 0, x) under gcc.
-static inline void fill_long_zero(intptr_t* larr, size_t size) {
+// These functions seem to optimize better than memset(arr, 0, x) under OS X
+// <10.9's gcc, and they should be equivalent for later versions (looks like
+// memcpy/memset were redone in gcc 4.3).
+HEADER_INLINE void fill_long_zero(intptr_t* larr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *larr++ = 0;
   }
 }
 
-static inline void fill_ulong_zero(uintptr_t* ularr, size_t size) {
+HEADER_INLINE void fill_ulong_zero(uintptr_t* ularr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *ularr++ = 0;
@@ -1653,30 +1792,31 @@ static inline void fill_ulong_zero(uintptr_t* ularr, size_t size) {
 }
 
 #ifdef __LP64__
-static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
+HEADER_INLINE void fill_ull_zero(uint64_t* ullarr, size_t size) {
   fill_ulong_zero((uintptr_t*)ullarr, size);
 }
 
-static inline void fill_v128_zero(__m128i* v128arr, size_t size) {
+// Note: size is a vector (VECITYPE) count, not a word count; a double v in a name is meant to flag this.
+HEADER_INLINE void fill_vec_zero(VECITYPE* vec, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
-    *v128arr++ = _mm_setzero_si128();
+    *vec++ = _mm_setzero_si128();
   }
 }
 #else
-static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
+HEADER_INLINE void fill_ull_zero(uint64_t* ullarr, size_t size) {
   fill_ulong_zero((uintptr_t*)ullarr, size * 2);
 }
 #endif
 
-static inline void fill_long_one(intptr_t* larr, size_t size) {
+HEADER_INLINE void fill_long_one(intptr_t* larr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *larr++ = -1;
   }
 }
 
-static inline void fill_ulong_one(uintptr_t* ularr, size_t size) {
+HEADER_INLINE void fill_ulong_one(uintptr_t* ularr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *ularr++ = ~ZEROLU;
@@ -1684,119 +1824,176 @@ static inline void fill_ulong_one(uintptr_t* ularr, size_t size) {
 }
 
 #ifdef __LP64__
-static inline void fill_ull_one(uint64_t* ullarr, size_t size) {
+HEADER_INLINE void fill_ull_one(uint64_t* ullarr, size_t size) {
   fill_ulong_one((uintptr_t*)ullarr, size);
 }
 #else
-static inline void fill_ull_one(uint64_t* ullarr, size_t size) {
+HEADER_INLINE void fill_ull_one(uint64_t* ullarr, size_t size) {
   fill_ulong_one((uintptr_t*)ullarr, size * 2);
 }
 #endif
 
-static inline void fill_int_zero(int32_t* iarr, size_t size) {
+HEADER_INLINE void fill_int_zero(int32_t* iarr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *iarr++ = 0;
   }
 }
 
-static inline void fill_int_one(int32_t* iarr, size_t size) {
+HEADER_INLINE void fill_int_one(int32_t* iarr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *iarr++ = -1;
   }
 }
 
-static inline void fill_uint_zero(uint32_t* uiarr, size_t size) {
+HEADER_INLINE void fill_uint_zero(uint32_t* uiarr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *uiarr++ = 0;
   }
 }
 
-static inline void fill_uint_one(uint32_t* uiarr, size_t size) {
+HEADER_INLINE void fill_uint_one(uint32_t* uiarr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *uiarr++ = ~0U;
   }
 }
 
-static inline void fill_float_zero(float* farr, size_t size) {
+HEADER_INLINE void fill_float_zero(float* farr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *farr++ = 0.0;
   }
 }
 
-static inline void fill_double_zero(double* darr, size_t size) {
+HEADER_INLINE void fill_double_zero(double* darr, size_t size) {
   size_t ulii;
   for (ulii = 0; ulii < size; ulii++) {
     *darr++ = 0.0;
   }
 }
 
+
+int32_t bigstack_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr);
+
+int32_t bigstack_calloc_d(uintptr_t ct, double** dp_ptr);
+
+int32_t bigstack_calloc_f(uintptr_t ct, float** fp_ptr);
+
+int32_t bigstack_calloc_ui(uintptr_t ct, uint32_t** uip_ptr);
+
+int32_t bigstack_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr);
+
+int32_t bigstack_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr);
+
+HEADER_INLINE int32_t bigstack_calloc_c(uintptr_t ct, char** cp_ptr) {
+  return bigstack_calloc_uc(ct, (unsigned char**)cp_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_calloc_i(uintptr_t ct, int32_t** ip_ptr) {
+  return bigstack_calloc_ui(ct, (uint32_t**)ip_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_calloc_ll(uintptr_t ct, int64_t** llp_ptr) {
+  return bigstack_calloc_ull(ct, (uint64_t**)llp_ptr);
+}
+
+int32_t bigstack_end_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr);
+
+int32_t bigstack_end_calloc_d(uintptr_t ct, double** dp_ptr);
+
+int32_t bigstack_end_calloc_f(uintptr_t ct, float** fp_ptr);
+
+int32_t bigstack_end_calloc_ui(uintptr_t ct, uint32_t** uip_ptr);
+
+int32_t bigstack_end_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr);
+
+int32_t bigstack_end_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr);
+
+HEADER_INLINE int32_t bigstack_end_calloc_c(uintptr_t ct, char** cp_ptr) {
+  return bigstack_end_calloc_uc(ct, (unsigned char**)cp_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_calloc_i(uintptr_t ct, int32_t** ip_ptr) {
+  return bigstack_end_calloc_ui(ct, (uint32_t**)ip_ptr);
+}
+
+HEADER_INLINE int32_t bigstack_end_calloc_ll(uintptr_t ct, int64_t** llp_ptr) {
+  return bigstack_end_calloc_ull(ct, (uint64_t**)llp_ptr);
+}
+
+
 uint32_t murmurhash3_32(const void* key, uint32_t len);
 
-static inline uint32_t hashval2(char* idstr, uint32_t idlen) {
+HEADER_INLINE uint32_t hashval2(const char* idstr, uint32_t idlen) {
   return murmurhash3_32(idstr, idlen) % HASHSIZE;
 }
 
 uintptr_t geqprime(uintptr_t floor);
 
-static inline uint32_t get_id_htable_size(uintptr_t item_ct) {
-  return (item_ct < 32761)? 65521 : geqprime(item_ct * 2 + 1);
+HEADER_INLINE uint32_t get_id_htable_size(uintptr_t item_ct) {
+  if (item_ct < 32761) {
+    return 65521;
+  } else {
+    return geqprime(item_ct * 2 + 1);
+  }
 }
 
-int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t* id_htable, uint32_t id_htable_size);
+int32_t populate_id_htable(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t id_htable_size, uint32_t* id_htable);
 
-static inline int32_t alloc_and_populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t** id_htable_ptr, uint32_t* id_htable_size_ptr) {
+HEADER_INLINE int32_t alloc_and_populate_id_htable(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t* id_htable_size_ptr, uint32_t** id_htable_ptr) {
   uint32_t id_htable_size = get_id_htable_size(item_ct);
-  if (wkspace_alloc_ui_checked(id_htable_ptr, id_htable_size * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(id_htable_size, id_htable_ptr)) {
     return RET_NOMEM;
   }
   *id_htable_size_ptr = id_htable_size;
-  return populate_id_htable(unfiltered_ct, exclude_arr, item_ct, item_ids, max_id_len, allow_dups, *id_htable_ptr, id_htable_size);
+  return populate_id_htable(unfiltered_ct, exclude_arr, item_ct, item_ids, max_id_len, allow_dups, id_htable_size, *id_htable_ptr);
 }
 
 uint32_t id_htable_find(const char* id_buf, uintptr_t cur_id_len, const uint32_t* id_htable, uint32_t id_htable_size, const char* item_ids, uintptr_t max_id_len);
 
-void fill_idx_to_uidx(uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx);
+void fill_idx_to_uidx(const uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx);
 
-void fill_idx_to_uidx_incl(uintptr_t* include_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx);
+void fill_idx_to_uidx_incl(const uintptr_t* include_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx);
 
-void fill_uidx_to_idx(uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx);
+void fill_uidx_to_idx(const uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx);
 
-void fill_uidx_to_idx_incl(uintptr_t* include_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx);
+void fill_uidx_to_idx_incl(const uintptr_t* include_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx);
 
-void fill_midx_to_idx(uintptr_t* exclude_arr_orig, uintptr_t* exclude_arr, uint32_t item_ct, uint32_t* midx_to_idx);
+void fill_midx_to_idx(const uintptr_t* exclude_arr_orig, const uintptr_t* exclude_arr, uint32_t item_ct, uint32_t* midx_to_idx);
 
-void fill_vec_55(uintptr_t* vec, uint32_t ct);
 
-void vec_collapse_init(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uintptr_t* filter_bitarr, uint32_t filtered_ct, uintptr_t* output_vec);
+// "quaterarr" refers to a packed group of base-4 (2-bit) elements, analogous
+// to "bitarr".  (Based on "quaternary", not "quarter".)  "quatervec"
+// indicates that vector-alignment is also required.
+void fill_quatervec_55(uint32_t ct, uintptr_t* quatervec);
 
-void vec_collapse_init_exclude(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uintptr_t* filter_exclude_bitarr, uint32_t filtered_ct, uintptr_t* output_vec);
+// Used to unpack e.g. unfiltered sex_male to a filtered quaterarr usable as a
+// raw input bitmask.
+// Assumes output_quaterarr is sized to a multiple of 16 bytes.
+void quaterarr_collapse_init(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr);
 
-uint32_t alloc_collapsed_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_include2_ptr, uintptr_t** sample_male_include2_ptr);
+void quaterarr_collapse_init_exclude(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_exclude_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr);
 
-static inline void free_cond(void* memptr) {
+uint32_t alloc_collapsed_haploid_filters(const uintptr_t* __restrict sample_bitarr, const uintptr_t* __restrict sex_male, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t** sample_include_quatervec_ptr, uintptr_t** sample_male_include_quatervec_ptr);
+
+HEADER_INLINE void free_cond(void* memptr) {
   if (memptr) {
     free(memptr);
   }
 }
 
-static inline uint32_t realnum(double dd) {
+HEADER_INLINE uint32_t realnum(double dd) {
   return (dd == dd) && (dd != INFINITY) && (dd != -INFINITY);
 }
 
-static inline double get_maf(double allele_freq) {
-  if (allele_freq < 0.5) {
-    return allele_freq;
-  } else {
-    return (1.0 - allele_freq);
-  }
+HEADER_INLINE double get_maf(double allele_freq) {
+  return (allele_freq <= 0.5)? allele_freq : (1.0 - allele_freq);
 }
 
-static inline int32_t filename_exists(char* fname, char* fname_end, const char* fname_append) {
+HEADER_INLINE int32_t filename_exists(const char* __restrict fname_append, char* fname, char* fname_end) {
 #ifdef _WIN32
   DWORD file_attr;
   strcpy(fname_end, fname_append);
@@ -1809,13 +2006,13 @@ static inline int32_t filename_exists(char* fname, char* fname_end, const char*
 #endif
 }
 
-void sample_delim_convert(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uint32_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, char oldc, char newc);
+void sample_delim_convert(uintptr_t unfiltered_sample_ct, const uintptr_t* sample_exclude, uint32_t sample_ct, uintptr_t max_sample_id_len, char oldc, char newc, char* sample_ids);
 
-void get_set_wrange_align(uintptr_t* bitfield, uintptr_t word_ct, uintptr_t* firstw_ptr, uintptr_t* wlen_ptr);
+void get_set_wrange_align(const uintptr_t* __restrict bitarr, uintptr_t word_ct, uintptr_t* __restrict firstw_ptr, uintptr_t* __restrict wlen_ptr);
 
 // Maximum accepted chromosome index is this minus 1.  Currently cannot exceed
 // 2^14 due to SMALL_INTERVAL_BITS setting in plink_cnv.c...
-#define MAX_POSSIBLE_CHROM 64000
+#define MAX_POSSIBLE_CHROM 5120
 // ...unless this is uncommented (it removes the entire CNV module).
 // #define HIGH_MAX_CHROM
 
@@ -1865,12 +2062,13 @@ typedef struct {
   uint32_t max_code;
 
   uint32_t autosome_ct;
+
   // this is a misnomer--it includes X and excludes MT.  Underlying concept is
   // "are some calls guaranteed to be homozygous (assuming >= 1 male)", which
   // is no longer true for MT since heteroplasmy is a thing.
   uintptr_t haploid_mask[CHROM_MASK_WORDS];
 
-  // --allow-extra-chroms support
+  // --allow-extra-chr support
   uint32_t zero_extra_chroms;
   uint32_t name_ct;
   Ll_str* incl_excl_name_stack;
@@ -1892,7 +2090,7 @@ typedef struct {
 extern const char* g_species_singular;
 extern const char* g_species_plural;
 
-static inline const char* species_str(uintptr_t ct) {
+HEADER_INLINE const char* species_str(uintptr_t ct) {
   return (ct == ONELU)? g_species_singular : g_species_plural;
 }
 
@@ -1901,7 +2099,7 @@ static inline const char* species_str(uintptr_t ct) {
 #define CHR_OUTPUT_MT 4
 #define CHR_OUTPUT_0M 8
 
-static inline uint32_t all_words_zero(uintptr_t* word_arr, uintptr_t word_ct) {
+HEADER_INLINE uint32_t all_words_zero(const uintptr_t* word_arr, uintptr_t word_ct) {
   while (word_ct--) {
     if (*word_arr++) {
       return 0;
@@ -1910,38 +2108,40 @@ static inline uint32_t all_words_zero(uintptr_t* word_arr, uintptr_t word_ct) {
   return 1;
 }
 
-char* chrom_name_write(char* buf, Chrom_info* chrom_info_ptr, uint32_t chrom_idx);
+char* chrom_name_write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, char* buf);
 
-char* chrom_name_buf5w4write(char* buf5, Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t* chrom_name_len_ptr);
+char* chrom_name_buf5w4write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t* chrom_name_len_ptr, char* buf5);
 
-uint32_t get_max_chrom_len(Chrom_info* chrom_info_ptr);
+uint32_t get_max_chrom_len(const Chrom_info* chrom_info_ptr);
 
 void forget_extra_chrom_names(Chrom_info* chrom_info_ptr);
 
-uint32_t haploid_chrom_present(Chrom_info* chrom_info_ptr);
+uint32_t haploid_chrom_present(const Chrom_info* chrom_info_ptr);
 
-int32_t get_chrom_code_raw(char* sptr);
+int32_t get_chrom_code_raw(const char* sptr);
 
-int32_t get_chrom_code(Chrom_info* chrom_info_ptr, char* sptr);
+int32_t get_chrom_code(const Chrom_info* chrom_info_ptr, const char* sptr);
 
-int32_t get_chrom_code2(Chrom_info* chrom_info_ptr, char* sptr, uint32_t slen);
+// when the chromosome name doesn't end with a space
+// currently requires sptr[slen] to be mutable
+int32_t get_chrom_code2(const Chrom_info* chrom_info_ptr, char* sptr, uint32_t slen);
 
-uint32_t get_marker_chrom_fo_idx(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx);
+uint32_t get_marker_chrom_fo_idx(const Chrom_info* chrom_info_ptr, uintptr_t marker_uidx);
 
-static inline uint32_t get_marker_chrom(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx) {
+HEADER_INLINE uint32_t get_marker_chrom(const Chrom_info* chrom_info_ptr, uintptr_t marker_uidx) {
   return chrom_info_ptr->chrom_file_order[get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx)];
 }
 
-static inline int32_t chrom_exists(Chrom_info* chrom_info_ptr, uint32_t chrom_idx) {
+HEADER_INLINE int32_t chrom_exists(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx) {
   return is_set(chrom_info_ptr->chrom_mask, chrom_idx);
 }
 
-int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int32_t* chrom_idx_ptr, uintptr_t line_idx, const char* file_descrip);
+int32_t resolve_or_add_chrom_name(const char* cur_chrom_name, const char* file_descrip, uintptr_t line_idx, Chrom_info* chrom_info_ptr, int32_t* chrom_idx_ptr);
 
 // no need for this; code is simpler if we just create a copy of marker_exclude
 // with all non-autosomal loci removed
 /*
-static inline uintptr_t next_autosomal_unsafe(uintptr_t* marker_exclude, uintptr_t marker_uidx, Chrom_info* chrom_info_ptr, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr) {
+HEADER_INLINE uintptr_t next_autosomal_unsafe(uintptr_t* marker_exclude, uintptr_t marker_uidx, Chrom_info* chrom_info_ptr, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr) {
   // assumes we are at an autosomal marker if marker_uidx < *chrom_end_ptr
   next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
   if (marker_uidx < (*chrom_end_ptr)) {
@@ -1963,11 +2163,11 @@ static inline uintptr_t next_autosomal_unsafe(uintptr_t* marker_exclude, uintptr
 }
 */
 
-void refresh_chrom_info(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, uint32_t* is_mt_ptr, uint32_t* is_haploid_ptr);
+void refresh_chrom_info(const Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* __restrict chrom_end_ptr, uint32_t* __restrict chrom_fo_idx_ptr, uint32_t* __restrict is_x_ptr, uint32_t* __restrict is_y_ptr, uint32_t* __restrict is_mt_ptr, uint32_t* __restrict is_haploid_ptr);
 
-int32_t single_chrom_start(Chrom_info* chrom_info_ptr, uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude);
+int32_t single_chrom_start(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t unfiltered_marker_ct);
 
-double destructive_get_dmedian(double* unsorted_arr, uintptr_t len);
+double destructive_get_dmedian(uintptr_t len, double* unsorted_arr);
 
 int32_t strcmp_casted(const void* s1, const void* s2);
 
@@ -1977,23 +2177,23 @@ int32_t strcmp_deref(const void* s1, const void* s2);
 
 int32_t strcmp_natural_deref(const void* s1, const void* s2);
 
-int32_t get_uidx_from_unsorted(char* idstr, uintptr_t* exclude_arr, uint32_t id_ct, char* unsorted_ids, uintptr_t max_id_len);
+int32_t get_uidx_from_unsorted(const char* idstr, const uintptr_t* exclude_arr, uint32_t id_ct, const char* unsorted_ids, uintptr_t max_id_len);
 
+// sorted_ids contents not changed, but not worth the trouble of returning a
+// const char*
 char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len);
 
-char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len);
-
-int32_t is_missing_pheno_cc(char* bufptr, double missing_phenod, uint32_t affection_01);
+char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, const char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len);
 
-int32_t eval_affection(char* bufptr, double missing_phenod);
+int32_t eval_affection(const char* bufptr, double missing_phenod);
 
 uint32_t triangle_divide(int64_t cur_prod, int32_t modif);
 
-void triangle_fill(uint32_t* target_arr, uint32_t ct, uint32_t pieces, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align);
+void triangle_fill(uint32_t ct, uint32_t pieces, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align, uint32_t* target_arr);
 
 int32_t relationship_req(uint64_t calculation_type);
 
-int32_t distance_req(uint64_t calculation_type, char* read_dists_fname);
+int32_t distance_req(const char* read_dists_fname, uint64_t calculation_type);
 
 int32_t double_cmp(const void* aa, const void* bb);
 
@@ -2019,53 +2219,56 @@ int32_t llcmp(const void* aa, const void* bb);
 
 void qsort_ext2(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, uintptr_t secondary_item_len, char* proxy_arr, uintptr_t proxy_len);
 
-int32_t qsort_ext(char* main_arr, intptr_t arr_length, intptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, intptr_t secondary_item_len);
+int32_t qsort_ext(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, intptr_t secondary_item_len);
 
-int32_t sort_item_ids_noalloc(char* sorted_ids, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*));
+int32_t sort_item_ids_noalloc(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char* __restrict sorted_ids, uint32_t* id_map);
 
-int32_t sort_item_ids(char** sorted_ids_ptr, uint32_t** id_map_ptr, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t exclude_ct, char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*));
+int32_t sort_item_ids(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t exclude_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char** sorted_ids_ptr, uint32_t** id_map_ptr);
 
-uint32_t uint32arr_greater_than(uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii);
+uint32_t uint32arr_greater_than(const uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii);
 
-uint32_t int32arr_greater_than(int32_t* sorted_int32_arr, uint32_t arr_length, int32_t ii);
+uint32_t int32arr_greater_than(const int32_t* sorted_int32_arr, uint32_t arr_length, int32_t ii);
 
-uintptr_t uint64arr_greater_than(uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii);
+uintptr_t uint64arr_greater_than(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii);
 
-uintptr_t doublearr_greater_than(double* sorted_dbl_arr, uintptr_t arr_length, double dxx);
+uintptr_t doublearr_greater_than(const double* sorted_dbl_arr, uintptr_t arr_length, double dxx);
 
-uintptr_t nonincr_doublearr_leq_stride(double* nonincr_dbl_arr, uintptr_t arr_length, uintptr_t stride, double dxx);
+uintptr_t nonincr_doublearr_leq_stride(const double* nonincr_dbl_arr, uintptr_t arr_length, uintptr_t stride, double dxx);
 
-int32_t bsearch_str(const char* id_buf, uintptr_t cur_id_len, char* lptr, uintptr_t max_id_len, uintptr_t end_idx);
+int32_t bsearch_str(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx);
 
-static inline int32_t bsearch_str_nl(const char* id_buf, char* lptr, uintptr_t max_id_len, intptr_t end_idx) {
+HEADER_INLINE int32_t bsearch_str_nl(const char* id_buf, const char* lptr, uintptr_t max_id_len, intptr_t end_idx) {
   return bsearch_str(id_buf, strlen(id_buf), lptr, max_id_len, end_idx);
 }
 
-int32_t bsearch_str_natural(char* id_buf, char* lptr, uintptr_t max_id_len, uintptr_t end_idx);
+int32_t bsearch_str_natural(const char* id_buf, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx);
 
-uintptr_t bsearch_str_lb(const char* id_buf, uintptr_t cur_id_len, char* lptr, uintptr_t max_id_len, uintptr_t end_idx);
+uintptr_t bsearch_str_lb(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx);
 
-uint32_t bsearch_read_fam_indiv(char* id_buf, char* lptr, uintptr_t max_id_len, uintptr_t filter_line_ct, char* read_ptr, char** read_pp_new, int32_t* retval_ptr);
+uint32_t bsearch_read_fam_indiv(char* __restrict read_ptr, const char* __restrict lptr, uintptr_t max_id_len, uintptr_t filter_line_ct, char** read_pp_new, int32_t* retval_ptr, char* __restrict id_buf);
 
-void bsearch_fam(char* id_buf, char* lptr, uintptr_t max_id_len, uint32_t filter_line_ct, char* fam_id, uint32_t* first_idx_ptr, uint32_t* last_idx_ptr);
+void bsearch_fam(const char* __restrict fam_id, const char* __restrict lptr, uintptr_t max_id_len, uint32_t filter_line_ct, uint32_t* __restrict first_idx_ptr, uint32_t* __restrict last_idx_ptr, char* __restrict id_buf);
 
-void bitfield_invert(uintptr_t* bit_arr, uintptr_t bit_ct);
+// These ensure the trailing bits are zeroed out.
+void bitarr_invert(uintptr_t bit_ct, uintptr_t* bitarr);
 
-void bitfield_exclude_to_include(uintptr_t* exclude_arr, uintptr_t* include_arr, uintptr_t bit_ct);
+void bitarr_invert_copy(const uintptr_t* input_bitarr, uintptr_t bit_ct, uintptr_t* output_bitarr);
 
-void bitfield_and(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct);
 
-void bitfield_andnot(uintptr_t* vv, uintptr_t* exclude_vec, uintptr_t word_ct);
+// "bitvec" indicates that word count is used instead of vector count.
+void bitvec_and(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
 
-void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct);
+void bitvec_andnot(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
 
-void bitfield_or(uintptr_t* vv, uintptr_t* or_vec, uintptr_t word_ct);
+void bitvec_andnot_reversed_args(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
 
-void bitfield_ornot(uintptr_t* vv, uintptr_t* inverted_or_vec, uintptr_t word_ct);
+void bitvec_or(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);
 
-void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct);
+void bitvec_ornot(const uintptr_t* __restrict inverted_or_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);
 
-static inline uint32_t popcount2_long(uintptr_t val) {
+void bitvec_xor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
+
+HEADER_INLINE uint32_t popcount2_long(uintptr_t val) {
 #ifdef __LP64__
   val = (val & 0x3333333333333333LLU) + ((val >> 2) & 0x3333333333333333LLU);
   return (((val + (val >> 4)) & 0x0f0f0f0f0f0f0f0fLLU) * 0x0101010101010101LLU) >> 56;
@@ -2075,24 +2278,24 @@ static inline uint32_t popcount2_long(uintptr_t val) {
 #endif
 }
 
-static inline uint32_t popcount_long(uintptr_t val) {
+HEADER_INLINE uint32_t popcount_long(uintptr_t val) {
   // the simple version, good enough for all non-time-critical stuff
   return popcount2_long(val - ((val >> 1) & FIVEMASK));
 }
 
-uint32_t is_monomorphic_a2(uintptr_t* lptr, uint32_t sample_ct);
+uint32_t is_monomorphic_a2(const uintptr_t* geno_arr, uint32_t sample_ct);
 
-uint32_t is_monomorphic(uintptr_t* lptr, uint32_t sample_ct);
+uint32_t is_monomorphic(const uintptr_t* geno_arr, uint32_t sample_ct);
 
 // same as is_monomorphic, except it also flags the all-heterozygote case
-uint32_t less_than_two_genotypes(uintptr_t* lptr, uint32_t sample_ct);
+uint32_t less_than_two_genotypes(const uintptr_t* geno_arr, uint32_t sample_ct);
 
 // uint32_t has_three_genotypes(uintptr_t* lptr, uint32_t sample_ct);
 
-uintptr_t popcount_longs(uintptr_t* lptr, uintptr_t word_ct);
+uintptr_t popcount_longs(const uintptr_t* lptr, uintptr_t word_ct);
 
 #ifdef __LP64__
-static inline uintptr_t popcount_longs_nzbase(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
+HEADER_INLINE uintptr_t popcount_longs_nzbase(const uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
   uintptr_t prefix_ct = 0;
   if (start_idx & 1) {
     if (end_idx == start_idx) {
@@ -2103,73 +2306,73 @@ static inline uintptr_t popcount_longs_nzbase(uintptr_t* lptr, uintptr_t start_i
   return prefix_ct + popcount_longs(&(lptr[start_idx]), end_idx - start_idx);
 }
 #else
-static inline uintptr_t popcount_longs_nzbase(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
+HEADER_INLINE uintptr_t popcount_longs_nzbase(const uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
   return popcount_longs(&(lptr[start_idx]), end_idx - start_idx);
 }
 #endif
 
-uintptr_t popcount2_longs(uintptr_t* lptr, uintptr_t word_ct);
+uintptr_t popcount2_longs(const uintptr_t* lptr, uintptr_t word_ct);
 
 #define popcount01_longs popcount2_longs
 
-uintptr_t popcount_bit_idx(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx);
+uintptr_t popcount_bit_idx(const uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx);
 
-uint32_t chrom_window_max(uint32_t* marker_pos, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max);
+uint32_t chrom_window_max(const uint32_t* marker_pos, const uintptr_t* marker_exclude, const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max);
 
-uint32_t window_back(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_t marker_uidx_min, uint32_t marker_uidx, uint32_t count_max, uint32_t bp_max, uint32_t* window_trail_ct_ptr);
+uint32_t window_back(const uint32_t* __restrict marker_pos, const uintptr_t* marker_exclude, uint32_t marker_uidx_min, uint32_t marker_uidx, uint32_t count_max, uint32_t bp_max, uint32_t* __restrict window_trail_ct_ptr);
 
-uint32_t window_forward(uint32_t* marker_pos, uintptr_t* marker_exclude, uint32_t marker_uidx_start, uint32_t marker_uidx_last, uint32_t count_max, uint32_t bp_max, uint32_t* window_lead_ct_ptr);
+uint32_t window_forward(const uint32_t* __restrict marker_pos, const uintptr_t* marker_exclude, uint32_t marker_uidx_start, uint32_t marker_uidx_last, uint32_t count_max, uint32_t bp_max, uint32_t* __restrict window_lead_ct_ptr);
 
-uintptr_t jump_forward_unset_unsafe(uintptr_t* bit_arr, uintptr_t cur_pos, uintptr_t forward_ct);
+uintptr_t jump_forward_unset_unsafe(const uintptr_t* bitarr, uintptr_t cur_pos, uintptr_t forward_ct);
 
-static inline uintptr_t popcount_chars(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
+HEADER_INLINE uintptr_t popcount_chars(const uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
   return popcount_bit_idx(lptr, start_idx * 8, end_idx * 8);
 }
 
-uintptr_t popcount_longs_exclude(uintptr_t* lptr, uintptr_t* exclude_arr, uintptr_t end_idx);
+uintptr_t popcount_longs_exclude(const uintptr_t* __restrict lptr, const uintptr_t* __restrict exclude_arr, uintptr_t end_idx);
 
-uintptr_t popcount_longs_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t word_ct);
+uintptr_t popcount_longs_intersect(const uintptr_t* __restrict lptr1, const uintptr_t* __restrict lptr2, uintptr_t word_ct);
 
-void vertical_bitct_subtract(uintptr_t* bit_arr, uint32_t item_ct, uint32_t* sum_arr);
+void vertical_bitct_subtract(const uintptr_t* bitarr, uint32_t item_ct, uint32_t* sum_arr);
 
 #ifdef __LP64__
-void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp);
+void count_2freq_dbl_960b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict mask1vp, const VECITYPE* __restrict mask2vp, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp);
 
-void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* ctap, uint32_t* ctbp, uint32_t* ctcp);
+void count_3freq_1920b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict maskvp, uint32_t* __restrict ctap, uint32_t* __restrict ctbp, uint32_t* __restrict ctcp);
 #else
-void count_2freq_dbl_6(uintptr_t* lptr, uintptr_t* mask1p, uintptr_t* mask2p, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp);
+void count_2freq_dbl_24b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict mask1p, const uintptr_t* __restrict mask2p, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp);
 
-void count_3freq_12(uintptr_t* lptr, uintptr_t* maskp, uint32_t* ctap, uint32_t* ctbp, uint32_t* ctcp);
+void count_3freq_48b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict maskp, uint32_t* __restrict ctap, uint32_t* __restrict ctbp, uint32_t* __restrict ctcp);
 #endif
 
-void vec_set_freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp);
+void genovec_set_freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp);
 
-void vec_set_freq_x(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* male_vec, uint32_t* set_ctp, uint32_t* missing_ctp);
+void genovec_set_freq_x(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict male_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp);
 
-void vec_set_freq_y(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* nonmale_vec, uint32_t* set_ctp, uint32_t* missing_ctp);
+void genovec_set_freq_y(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict nonmale_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp);
 
-void vec_3freq(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uint32_t* missing_ctp, uint32_t* het_ctp, uint32_t* homa2_ctp);
+void genovec_3freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict missing_ctp, uint32_t* __restrict het_ctp, uint32_t* __restrict homset_ctp);
 
-uintptr_t count_01(uintptr_t* lptr, uintptr_t word_ct);
+uintptr_t count_01(const uintptr_t* quatervec, uintptr_t word_ct);
 
-static inline void zero_trailing_bits(uintptr_t* bitfield, uintptr_t unfiltered_ct) {
+HEADER_INLINE void zero_trailing_bits(uintptr_t unfiltered_ct, uintptr_t* bitarr) {
   uintptr_t trail_ct = unfiltered_ct & (BITCT - 1);
   if (trail_ct) {
-    bitfield[unfiltered_ct / BITCT] &= (ONELU << trail_ct) - ONELU;
+    bitarr[unfiltered_ct / BITCT] &= (ONELU << trail_ct) - ONELU;
   }
 }
 
-void fill_all_bits(uintptr_t* bit_arr, uintptr_t ct);
+void fill_all_bits(uintptr_t ct, uintptr_t* bitarr);
 
-uint32_t numeric_range_list_to_bitfield(Range_list* range_list_ptr, uint32_t item_ct, uintptr_t* bitfield, uint32_t offset, uint32_t ignore_overflow);
+uint32_t numeric_range_list_to_bitarr(const Range_list* range_list_ptr, uint32_t item_ct, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr);
 
-int32_t string_range_list_to_bitfield(char* header_line, uint32_t item_ct, uint32_t fixed_len, Range_list* range_list_ptr, char* sorted_ids, uint32_t* id_map, int32_t* seen_idxs, const char* range_list_flag, const char* file_descrip, uintptr_t* bitfield);
+int32_t string_range_list_to_bitarr(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t* bitarr, int32_t* __restrict seen_idxs);
 
-int32_t string_range_list_to_bitfield_alloc(char* header_line, uint32_t item_ct, uint32_t fixed_len, Range_list* range_list_ptr, uintptr_t** bitfield_ptr, const char* range_list_flag, const char* file_descrip);
+int32_t string_range_list_to_bitarr_alloc(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t** bitarr_ptr);
 
-int32_t string_range_list_to_bitfield2(char* sorted_ids, uint32_t* id_map, uintptr_t item_ct, uintptr_t max_id_len, Range_list* range_list_ptr, const char* range_list_flag, uintptr_t* bitfield_excl);
+int32_t string_range_list_to_bitarr2(const char* __restrict sorted_ids, const uint32_t* id_map, uintptr_t item_ct, uintptr_t max_id_len, const Range_list* __restrict range_list_ptr, const char* __restrict range_list_flag, uintptr_t* bitarr_excl);
 
-static inline uint32_t count_chrom_markers(Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uintptr_t* marker_exclude) {
+HEADER_INLINE uint32_t count_chrom_markers(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t chrom_idx) {
   uint32_t min_idx;
   uint32_t max_idx;
   if (!is_set(chrom_info_ptr->chrom_mask, chrom_idx)) {
@@ -2180,19 +2383,20 @@ static inline uint32_t count_chrom_markers(Chrom_info* chrom_info_ptr, uint32_t
   return (max_idx - min_idx) - ((uint32_t)popcount_bit_idx(marker_exclude, min_idx, max_idx));
 }
 
-uint32_t count_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt);
+uint32_t count_non_autosomal_markers(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt);
 
-int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr);
+int32_t conditional_allocate_non_autosomal_markers(const Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, const uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr);
 
-uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr);
+uint32_t get_max_chrom_size(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr);
 
-void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uint32_t* male_ct_ptr, uint32_t* female_ct_ptr, uint32_t* unk_ct_ptr);
+void count_genders(const uintptr_t* __restrict sex_nm, const uintptr_t* __restrict sex_male, const uintptr_t* __restrict sample_exclude, uintptr_t unfiltered_sample_ct, uint32_t* __restrict male_ct_ptr, uint32_t* __restrict female_ct_ptr, uint32_t* __restrict unk_ct_ptr);
 
-void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct);
+void reverse_loadbuf(uintptr_t unfiltered_sample_ct, unsigned char* loadbuf);
 
-void collapse_copy_2bitarr(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t* sample_exclude);
+// deprecated, try to just use copy_quaterarr_nonempty_subset()
+void copy_quaterarr_nonempty_subset_excl(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_excl, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr);
 
-static inline uint32_t load_raw(FILE* bedfile, uintptr_t* rawbuf, uintptr_t unfiltered_sample_ct4) {
+HEADER_INLINE uint32_t load_raw(uintptr_t unfiltered_sample_ct4, FILE* bedfile, uintptr_t* rawbuf) {
   // only use this if all accesses to the data involve
   // 1. some sort of mask, or
   // 2. explicit iteration from 0..(unfiltered_sample_ct-1).
@@ -2201,7 +2405,7 @@ static inline uint32_t load_raw(FILE* bedfile, uintptr_t* rawbuf, uintptr_t unfi
   return (fread(rawbuf, 1, unfiltered_sample_ct4, bedfile) < unfiltered_sample_ct4);
 }
 
-static inline uintptr_t get_final_mask(uint32_t sample_ct) {
+HEADER_INLINE uintptr_t get_final_mask(uint32_t sample_ct) {
   uint32_t uii = sample_ct % BITCT2;
   if (uii) {
     return (ONELU << (2 * uii)) - ONELU;
@@ -2210,7 +2414,7 @@ static inline uintptr_t get_final_mask(uint32_t sample_ct) {
   }
 }
 
-static inline uint32_t load_raw2(FILE* bedfile, uintptr_t* rawbuf, uintptr_t unfiltered_sample_ct4, uintptr_t unfiltered_sample_ctl2m1, uintptr_t final_mask) {
+HEADER_INLINE uint32_t load_raw2(uintptr_t unfiltered_sample_ct4, uintptr_t unfiltered_sample_ctl2m1, uintptr_t final_mask, FILE* bedfile, uintptr_t* rawbuf) {
   if (fread(rawbuf, 1, unfiltered_sample_ct4, bedfile) < unfiltered_sample_ct4) {
     return 1;
   }
@@ -2218,66 +2422,96 @@ static inline uint32_t load_raw2(FILE* bedfile, uintptr_t* rawbuf, uintptr_t unf
   return 0;
 }
 
-uint32_t load_and_collapse(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* mainbuf, uint32_t sample_ct, uintptr_t* sample_exclude, uintptr_t final_mask, uint32_t do_reverse);
+uint32_t load_and_collapse(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_exclude, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf);
 
-void collapse_copy_2bitarr_incl(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t* sample_include);
+// was "collapse_copy_2bitarr_incl", but this should be a better way to think
+// about it
+void copy_quaterarr_nonempty_subset(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_mask, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr);
 
-uint32_t load_and_collapse_incl(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* mainbuf, uint32_t sample_ct, uintptr_t* sample_include, uintptr_t final_mask, uint32_t do_reverse);
+/*
+// in-place version of copy_quaterarr_nonempty_subset (usually destroying
+// original data).
+// this doesn't seem to provide a meaningful advantage over
+// copy_quaterarr_nonempty_subset in practice, and the latter is more
+// versatile without requiring much more memory.
+void inplace_quaterarr_proper_subset(const uintptr_t* __restrict subset_mask, uint32_t orig_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict main_quaterarr);
+
+HEADER_INLINE void inplace_quaterarr_subset(const uintptr_t* __restrict subset_mask, uint32_t orig_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict main_quaterarr) {
+  if (orig_quaterarr_size == subset_size) {
+    return;
+  }
+  inplace_quaterarr_proper_subset(subset_mask, orig_quaterarr_size, subset_size, main_quaterarr);
+}
+*/
 
-uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* casebuf, uintptr_t* ctrlbuf, uintptr_t* pheno_nm, uintptr_t* pheno_c);
+uint32_t load_and_collapse_incl(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_include, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf);
 
-void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, uintptr_t* old_include);
+// uint32_t load_and_collapse_incl_inplace(const uintptr_t* __restrict sample_include, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict mainbuf);
 
-void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_vec, uintptr_t* exclude_arr);
+uint32_t load_and_split(uint32_t unfiltered_sample_ct, const uintptr_t* __restrict pheno_nm, const uintptr_t* __restrict pheno_c, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict casebuf, uintptr_t* __restrict ctrlbuf);
 
-void vec_init_invert(uintptr_t entry_ct, uintptr_t* target_arr, uintptr_t* source_arr);
+void init_quaterarr_from_bitarr(const uintptr_t* __restrict bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr);
 
-void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr);
+void init_quaterarr_from_inverted_bitarr(const uintptr_t* __restrict inverted_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr);
 
-void vec_include_mask_in(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr);
+void quatervec_01_init_invert(const uintptr_t* __restrict source_quatervec, uintptr_t entry_ct, uintptr_t* __restrict target_quatervec);
 
-void vec_include_mask_out(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr);
+// target_vec := source_vec ANDNOT exclude_vec
+// may write an extra word
+void bitvec_andnot_copy(const uintptr_t* __restrict source_vec, const uintptr_t* __restrict exclude_vec, uintptr_t word_ct, uintptr_t* __restrict target_vec);
 
-void vec_include_mask_out_intersect(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr, uintptr_t* mask2_arr);
+void apply_bitarr_mask_to_quaterarr_01(const uintptr_t* __restrict mask_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* main_quaterarr);
 
-void vec_init_01(uintptr_t unfiltered_sample_ct, uintptr_t* data_ptr, uintptr_t* result_ptr);
+void apply_bitarr_excl_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr);
 
-void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2);
+// excludes (excl_bitarr_1 & excl_bitarr_2).  (union can be excluded by calling
+// apply_bitarr_excl_to_quaterarr_01() twice.)
+void apply_excl_intersect_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr_1, const uintptr_t* __restrict excl_bitarr_2, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr);
+
+// initializes output_quatervec bits to 01 iff input_quatervec bits are 01,
+// everything else zeroed out
+void quatervec_copy_only_01(const uintptr_t* __restrict input_quatervec, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict output_quatervec);
+
+void quatervec_01_invert(uintptr_t unfiltered_sample_ct, uintptr_t* main_quatervec);
 
 void vec_datamask(uintptr_t unfiltered_sample_ct, uint32_t matchval, uintptr_t* data_ptr, uintptr_t* mask_ptr, uintptr_t* result_ptr);
 
 // void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct);
 
-void rotate_plink1_to_plink2_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct);
+void rotate_plink1_to_a2ct_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct);
 
-void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include2, uintptr_t sample_ct, uintptr_t* missing_bitfield);
+void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include_quaterarr, uintptr_t sample_ct, uintptr_t* missing_bitfield);
 
-void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t unfiltered_sample_ct);
+void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t unfiltered_sample_ct);
 
-void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t* sample_male_include2, uintptr_t unfiltered_sample_ct);
+void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t* sample_male_include_quaterarr, uintptr_t unfiltered_sample_ct);
 
-static inline void haploid_fix(uint32_t hh_exists, uintptr_t* sample_include2, uintptr_t* sample_male_include2, uintptr_t sample_ct, uint32_t is_x, uint32_t is_y, unsigned char* loadbuf) {
+HEADER_INLINE void haploid_fix(uint32_t hh_exists, uintptr_t* sample_include_quaterarr, uintptr_t* sample_male_include_quaterarr, uintptr_t sample_ct, uint32_t is_x, uint32_t is_y, unsigned char* loadbuf) {
   if (is_x) {
     if (hh_exists & XMHH_EXISTS) {
-      hh_reset(loadbuf, sample_male_include2, sample_ct);
+      hh_reset(loadbuf, sample_male_include_quaterarr, sample_ct);
     }
   } else if (is_y) {
     if (hh_exists & Y_FIX_NEEDED) {
-      hh_reset_y(loadbuf, sample_include2, sample_male_include2, sample_ct);
+      hh_reset_y(loadbuf, sample_include_quaterarr, sample_male_include_quaterarr, sample_ct);
     }
   } else if (hh_exists & NXMHH_EXISTS) {
-    hh_reset(loadbuf, sample_include2, sample_ct);
+    hh_reset(loadbuf, sample_include_quaterarr, sample_ct);
   }
 }
 
-uint32_t alloc_raw_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_raw_include2_ptr, uintptr_t** sample_raw_male_include2_ptr);
+uint32_t alloc_raw_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_raw_include_quatervec_ptr, uintptr_t** sample_raw_male_quatervec_ptr);
 
 void haploid_fix_multiple(uintptr_t* marker_exclude, uintptr_t marker_uidx_start, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, uint32_t hh_exists, uintptr_t* sample_raw_include2, uintptr_t* sample_raw_male_include2, uintptr_t unfiltered_sample_ct, uintptr_t byte_ct_per_marker, unsigned char* loadbuf);
 
 void force_missing(unsigned char* loadbuf, uintptr_t* force_missing_include2, uintptr_t unfiltered_sample_ct);
 
-static inline char sexchar(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t sample_uidx) {
-  return is_set(sex_nm, sample_uidx)? (is_set(sex_male, sample_uidx)? '1' : '2') : '0';
+HEADER_INLINE char sexchar(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t sample_uidx) {
+  if (is_set(sex_nm, sample_uidx)) {
+    return '2' - is_set(sex_male, sample_uidx);
+  } else {
+    return '0';
+  }
 }
 
 int32_t open_and_size_string_list(char* fname, FILE** infile_ptr, uintptr_t* list_len_ptr, uintptr_t* max_str_len_ptr);
@@ -2306,9 +2540,10 @@ void inplace_delta_collapse_arr(char* item_arr, uintptr_t item_len, uintptr_t fi
 
 void inplace_delta_collapse_bitfield(uintptr_t* read_ptr, uint32_t filtered_ct_new, uintptr_t* exclude_orig, uintptr_t* exclude_new);
 
-void collapse_copy_bitarr(uint32_t orig_ct, uintptr_t* bit_arr, uintptr_t* exclude_arr, uint32_t filtered_ct, uintptr_t* output_arr);
+// deprecated, migrate to copy_bitarr_subset()
+void copy_bitarr_subset_excl(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_excl, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr);
 
-void collapse_copy_bitarr_incl(uint32_t orig_ct, uintptr_t* bit_arr, uintptr_t* include_arr, uint32_t filtered_ct, uintptr_t* output_arr);
+void copy_bitarr_subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr);
 
 void uncollapse_copy_flip_include_arr(uintptr_t* collapsed_include_arr, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* output_exclude_arr);
 
@@ -2316,8 +2551,8 @@ void copy_when_nonmissing(uintptr_t* loadbuf, char* source, uintptr_t elem_size,
 
 uint32_t collapse_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, uint32_t* id_starts);
 
-static inline double rand_unif(void) {
-  return (sfmt_genrand_uint32(&sfmt) + 0.5) * RECIP_2_32;
+HEADER_INLINE double rand_unif(void) {
+  return (sfmt_genrand_uint32(&g_sfmt) + 0.5) * RECIP_2_32;
 }
 
 void range_list_init(Range_list* range_list_ptr);
@@ -2330,7 +2565,7 @@ double rand_normal(double* secondval_ptr);
 
 void init_sfmt64_from_sfmt32(sfmt_t* sfmt32, sfmt_t* sfmt64);
 
-static inline void precompute_mods(uintptr_t sample_ct, uint32_t* precomputed_mods) {
+HEADER_INLINE void precompute_mods(uintptr_t sample_ct, uint32_t* precomputed_mods) {
   // sets precomputed_mods[n] = 2^32 mod (n-2)
   uintptr_t sample_idx;
   for (sample_idx = 2; sample_idx <= sample_ct; sample_idx++) {
@@ -2357,7 +2592,7 @@ extern uint32_t g_is_last_thread_block;
 extern HANDLE g_thread_start_next_event[];
 extern HANDLE g_thread_cur_block_done_events[];
 
-static inline void THREAD_BLOCK_FINISH(uintptr_t tidx) {
+HEADER_INLINE void THREAD_BLOCK_FINISH(uintptr_t tidx) {
   SetEvent(g_thread_cur_block_done_events[tidx - 1]);
   WaitForSingleObject(g_thread_start_next_event[tidx - 1], INFINITE);
 }
@@ -2375,6 +2610,6 @@ int32_t spawn_threads2(pthread_t* threads, void* (*start_routine)(void*), uintpt
 
 extern sfmt_t** g_sfmtp_arr;
 
-uint32_t wkspace_init_sfmtp(uint32_t thread_ct);
+uint32_t bigstack_init_sfmtp(uint32_t thread_ct);
 
 #endif // __PLINK_COMMON_H__
diff --git a/plink_data.c b/plink_data.c
index 6bcc946..bd1577f 100644
--- a/plink_data.c
+++ b/plink_data.c
@@ -27,8 +27,8 @@ int32_t sort_item_ids_nx(char** sorted_ids_ptr, uint32_t** id_map_ptr, uintptr_t
   char* sorted_ids;
   char* dup_id;
   char* tptr;
-  if (wkspace_alloc_c_checked(sorted_ids_ptr, item_ct * max_id_len) ||
-      wkspace_alloc_ui_checked(id_map_ptr, item_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_c(item_ct * max_id_len, sorted_ids_ptr) ||
+      bigstack_alloc_ui(item_ct, id_map_ptr)) {
     return RET_NOMEM;
   }
   sorted_ids = *sorted_ids_ptr;
@@ -57,11 +57,11 @@ int32_t sort_item_ids_nx(char** sorted_ids_ptr, uint32_t** id_map_ptr, uintptr_t
 int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintptr_t unfiltered_marker_ct, uintptr_t unfiltered_sample_ct, uint64_t fsize) {
   // previously used mmap(); turns out this is more portable without being
   // noticeably slower.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   FILE* outfile = NULL;
   uintptr_t unfiltered_marker_ct4 = (unfiltered_marker_ct + 3) / 4;
-  uintptr_t unfiltered_marker_ctl2 = (unfiltered_marker_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_marker_ctl2 = QUATERCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t marker_idx_end = 0;
   uint32_t bed_offset = fsize - unfiltered_sample_ct * ((uint64_t)unfiltered_marker_ct4);
@@ -70,6 +70,7 @@ int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintp
   uintptr_t* lptr;
   unsigned char* writebuf;
   unsigned char* ucptr;
+  uintptr_t cur_bigstack_left;
   uintptr_t write_marker_ct;
   uintptr_t marker_idx_base;
   uintptr_t marker_idx_block_end;
@@ -81,7 +82,7 @@ int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintp
   uintptr_t cur_word1;
   uintptr_t cur_word2;
   uintptr_t cur_word3;
-  if (fopen_checked(&outfile, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
     goto sample_major_to_snp_major_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, outfile)) {
@@ -89,15 +90,16 @@ int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintp
   }
   if (unfiltered_marker_ct && unfiltered_sample_ct) {
     // could make this allocation a bit smaller in multipass case, but whatever
-    if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_marker_ctl2 * 4 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_marker_ctl2 * 4, &loadbuf)) {
       goto sample_major_to_snp_major_ret_NOMEM;
     }
-    if (wkspace_left < unfiltered_sample_ct4) {
+    cur_bigstack_left = bigstack_left();
+    if (cur_bigstack_left < unfiltered_sample_ct4) {
       goto sample_major_to_snp_major_ret_NOMEM;
     }
-    writebuf = (unsigned char*)wkspace_base;
-    write_marker_ct = BITCT2 * (wkspace_left / (unfiltered_sample_ct4 * BITCT2));
-    if (fopen_checked(&infile, sample_major_fname, "rb")) {
+    writebuf = (unsigned char*)g_bigstack_base;
+    write_marker_ct = BITCT2 * (cur_bigstack_left / (unfiltered_sample_ct4 * BITCT2));
+    if (fopen_checked(sample_major_fname, FOPEN_RB, &infile)) {
       goto sample_major_to_snp_major_ret_OPEN_FAIL;
     }
     loadbuf[unfiltered_marker_ctl2 - 1] = 0;
@@ -122,7 +124,7 @@ int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintp
 	}
 	lptr = loadbuf;
 	for (sample_idx = sample_idx_base; sample_idx < sample_idx_end; sample_idx++) {
-	  if (load_raw(infile, lptr, unfiltered_marker_ct4)) {
+	  if (load_raw(unfiltered_marker_ct4, infile, lptr)) {
 	    goto sample_major_to_snp_major_ret_READ_FAIL;
 	  }
 	  lptr = &(lptr[unfiltered_marker_ctl2]);
@@ -175,7 +177,7 @@ int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintp
     retval = RET_WRITE_FAIL;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   fclose_cond(outfile);
   return retval;
@@ -232,20 +234,20 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
   int32_t ii;
   int32_t jj;
   fill_ulong_zero(loaded_chrom_mask, CHROM_MASK_WORDS);
-  if (fopen_checked(mapfile_ptr, mapname, "r")) {
+  if (fopen_checked(mapname, "r", mapfile_ptr)) {
     goto load_map_ret_OPEN_FAIL;
   }
   // first pass: count columns, determine raw marker count, determine maximum
   // marker ID length if necessary.
-  tbuf[MAXLINELEN - 6] = ' ';
-  while (fgets(tbuf, MAXLINELEN - 5, *mapfile_ptr)) {
+  g_textbuf[MAXLINELEN - 6] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN - 5, *mapfile_ptr)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 6]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of .map file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 6]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .map file is pathologically long.\n", line_idx);
       goto load_map_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
-    if (is_eoln_or_comment(*bufptr)) {
+    bufptr = skip_initial_spaces(g_textbuf);
+    if (is_eoln_or_comment_kns(*bufptr)) {
       continue;
     }
     bufptr = next_token(bufptr);
@@ -277,28 +279,27 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
   *unfiltered_marker_ct_ptr = unfiltered_marker_ct;
   *max_marker_id_len_ptr = max_marker_id_len;
   rewind(*mapfile_ptr);
-  unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
 
-  // unfiltered_marker_ct can be very large, so use wkspace for all allocations
-  // that are a multiple of it
+  // unfiltered_marker_ct can be very large, so use bigstack for all
+  // allocations that are a multiple of it
 
-  // permanent stack allocation #1: marker_exclude
-  if (wkspace_alloc_ul_checked(marker_exclude_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  // permanent bigstack allocation #1: marker_exclude
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, marker_exclude_ptr)) {
     goto load_map_ret_NOMEM;
   }
   marker_exclude = *marker_exclude_ptr;
-  fill_ulong_zero(marker_exclude, unfiltered_marker_ctl);
   fill_uint_zero(chrom_info_ptr->chrom_file_order, MAX_POSSIBLE_CHROM);
   fill_uint_zero(chrom_info_ptr->chrom_file_order_marker_idx, MAX_POSSIBLE_CHROM + 1);
   fill_uint_zero(chrom_info_ptr->chrom_start, MAX_POSSIBLE_CHROM);
   fill_uint_zero(chrom_info_ptr->chrom_end, MAX_POSSIBLE_CHROM);
-  // permanent stack allocation #3, if needed: marker_pos
+  // permanent bigstack allocation #2, if needed: marker_pos
   if (marker_pos_needed) {
-    if (wkspace_alloc_ui_checked(marker_pos_ptr, unfiltered_marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(unfiltered_marker_ct, marker_pos_ptr)) {
       goto load_map_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_c_checked(marker_ids_ptr, unfiltered_marker_ct * max_marker_id_len)) {
+  if (bigstack_alloc_c(unfiltered_marker_ct * max_marker_id_len, marker_ids_ptr)) {
     goto load_map_ret_NOMEM;
   }
 
@@ -313,7 +314,7 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
       if (chrom_error(".map file", chrom_info_ptr, bufptr, line_idx, jj, allow_extra_chroms)) {
         goto load_map_ret_INVALID_FORMAT;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &jj, line_idx, ".map file");
+      retval = resolve_or_add_chrom_name(bufptr, ".map file", line_idx, chrom_info_ptr, &jj);
       if (retval) {
 	goto load_map_ret_1;
       }
@@ -329,7 +330,7 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
       if (is_set(loaded_chrom_mask, jj)) {
 	*map_is_unsorted_ptr |= UNSORTED_SPLIT_CHROM | UNSORTED_BP;
       } else {
-	set_bit(loaded_chrom_mask, jj);
+	set_bit(jj, loaded_chrom_mask);
 	chrom_info_ptr->chrom_start[(uint32_t)jj] = marker_uidx;
 	chrom_info_ptr->chrom_file_order[++chroms_encountered_m1] = jj;
 	chrom_info_ptr->chrom_file_order_marker_idx[chroms_encountered_m1] = marker_uidx;
@@ -338,7 +339,7 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
     }
 
     if (!is_set(chrom_info_ptr->chrom_mask, jj)) {
-      SET_BIT(marker_exclude, marker_uidx);
+      SET_BIT(marker_uidx, marker_exclude);
       marker_exclude_ct++;
     } else {
       bufptr = next_token(bufptr);
@@ -351,11 +352,11 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
 	goto load_map_ret_MISSING_TOKENS;
       }
       if (scan_int_abs_defcap(bufptr, &ii)) {
-	sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .map file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .map file.\n", line_idx);
 	goto load_map_ret_INVALID_FORMAT_2;
       }
       if (ii < 0) {
-	SET_BIT(marker_exclude, marker_uidx);
+	SET_BIT(marker_uidx, marker_exclude);
 	marker_exclude_ct++;
       } else {
 	cur_pos = ii;
@@ -391,7 +392,7 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
     retval = RET_READ_FAIL;
     break;
   load_map_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .map file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .map file has fewer tokens than expected.\n", line_idx);
   load_map_ret_INVALID_FORMAT_2:
     logerrprintb();
   load_map_ret_INVALID_FORMAT:
@@ -504,7 +505,7 @@ static inline uint32_t sf_out_of_range(uint32_t cur_pos, uint32_t chrom_idx, uin
 
 int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_marker_ct_ptr, uintptr_t* marker_exclude_ct_ptr, uintptr_t* max_marker_id_len_ptr, uintptr_t** marker_exclude_ptr, double** set_allele_freqs_ptr, uint32_t** nchrobs_ptr, char*** marker_allele_pp, uintptr_t* max_marker_allele_len_ptr, char** marker_ids_ptr, char* missing_mid_template, uint32_t new_id_max_allele_len, const char* missing_marker_id_match, Chrom_info* chrom_info_ptr, double** marker_cms_ptr, uint32_ [...]
   // supports .map now too, to make e.g. --snps + --dosage work
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* bimfile = NULL;
   uintptr_t unfiltered_marker_ct = 0;
   uintptr_t marker_exclude_ct = *marker_exclude_ct_ptr;
@@ -521,7 +522,10 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
   uint32_t from_slen = markername_from? strlen(markername_from) : 0;
   uint32_t to_slen = markername_to? strlen(markername_to) : 0;
   uint32_t snp_slen = markername_snp? strlen(markername_snp) : 0;
+  // "sf" = "snp filter" (could rename to "vf"...)
   uint32_t sf_ct = sf_range_list_ptr->name_ct;
+  // assume for now that sf_ct * sf_max_len < 2^32, since these are based on
+  // command-line parameters
   uint32_t sf_max_len = sf_range_list_ptr->name_max_len;
   uint32_t slen_check = from_slen || to_slen || snp_slen || sf_ct;
   uint32_t from_chrom = MAX_POSSIBLE_CHROM;
@@ -589,10 +593,10 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     if (!sf_start_idxs) {
       goto load_bim_ret_NOMEM;
     }
-    if (wkspace_alloc_ui_checked(&sf_str_chroms, sf_ct * sizeof(int32_t)) ||
-	wkspace_alloc_ui_checked(&sf_str_pos, sf_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&sf_str_lens, sf_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&sf_llbuf, 3 * (MAX_POSSIBLE_CHROM + sf_ct) * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sf_ct, &sf_str_chroms) ||
+	bigstack_alloc_ui(sf_ct, &sf_str_pos) ||
+        bigstack_alloc_ui(sf_ct, &sf_str_lens) ||
+        bigstack_alloc_ui(3 * (MAX_POSSIBLE_CHROM + sf_ct), &sf_llbuf)) {
       goto load_bim_ret_NOMEM;
     }
     for (uii = 0; uii < sf_ct; uii++) {
@@ -647,25 +651,25 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
       }
     }
   }
-  if (fopen_checked(&bimfile, bimname, "r")) {
+  if (fopen_checked(bimname, "r", &bimfile)) {
     goto load_bim_ret_OPEN_FAIL;
   }
   // first pass: count columns, determine raw marker count, determine maximum
   // marker ID length and/or marker allele length if necessary, save
   // nonstandard chromosome names.
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto load_bim_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (fgets(loadbuf, loadbuf_size, bimfile)) {
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, ftype_str);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, ftype_str);
         goto load_bim_ret_INVALID_FORMAT_2;
       } else {
 	goto load_bim_ret_NOMEM;
@@ -677,7 +681,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     }
     // bufptr3 = col 1 start
     bufptr3 = skip_initial_spaces(loadbuf);
-    if (is_eoln_or_comment(*bufptr3)) {
+    if (is_eoln_or_comment_kns(*bufptr3)) {
       continue;
     }
     jj = get_chrom_code(chrom_info_ptr, bufptr3);
@@ -685,7 +689,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
       if (chrom_error(ftype_str, chrom_info_ptr, bufptr3, line_idx, jj, allow_extra_chroms)) {
         goto load_bim_ret_INVALID_FORMAT;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr3, &jj, line_idx, ftype_str);
+      retval = resolve_or_add_chrom_name(bufptr3, ftype_str, line_idx, chrom_info_ptr, &jj);
       if (retval) {
 	goto load_bim_ret_1;
       }
@@ -829,7 +833,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	    }
 	  }
 	  fill_ulong_zero(chrom_info_ptr->chrom_mask, CHROM_MASK_WORDS);
-	  SET_BIT(chrom_info_ptr->chrom_mask, from_chrom);
+	  SET_BIT(from_chrom, chrom_info_ptr->chrom_mask);
 	}
 	if ((ulii == to_slen) && (!memcmp(bufptr, markername_to, ulii))) {
 	  if (to_chrom != MAX_POSSIBLE_CHROM) {
@@ -845,7 +849,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	    }
 	  }
 	  fill_ulong_zero(chrom_info_ptr->chrom_mask, CHROM_MASK_WORDS);
-	  SET_BIT(chrom_info_ptr->chrom_mask, to_chrom);
+	  SET_BIT(to_chrom, chrom_info_ptr->chrom_mask);
 	}
 	if ((ulii == snp_slen) && (!memcmp(bufptr, markername_snp, ulii))) {
 	  if (snp_chrom != MAX_POSSIBLE_CHROM) {
@@ -857,7 +861,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	  }
 	  if (!exclude_snp) {
 	    fill_ulong_zero(chrom_info_ptr->chrom_mask, CHROM_MASK_WORDS);
-	    SET_BIT(chrom_info_ptr->chrom_mask, snp_chrom);
+	    SET_BIT(snp_chrom, chrom_info_ptr->chrom_mask);
 	  }
 	}
       }
@@ -933,7 +937,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     ukk = 0;
     for (uii = 0; uii <= ujj; uii++) {
       if (sf_start_idxs[uii] == 1) {
-	CLEAR_BIT(sf_mask, uii);
+	CLEAR_BIT(uii, sf_mask);
 	sf_start_idxs[uii] = ukk;
 	continue;
       }
@@ -949,13 +953,13 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     if (!exclude_snp) {
       memcpy(chrom_info_ptr->chrom_mask, sf_mask, CHROM_MASK_WORDS * sizeof(intptr_t));
     }
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
   if (!feof(bimfile)) {
     goto load_bim_ret_READ_FAIL;
   }
   if ((!unfiltered_marker_ct) && (!allow_no_variants)) {
-    sprintf(logbuf, "Error: No variants in %s.\n", ftype_str);
+    sprintf(g_logbuf, "Error: No variants in %s.\n", ftype_str);
     goto load_bim_ret_INVALID_FORMAT_2;
   } else if (unfiltered_marker_ct > 2147483645) {
     // maximum prime < 2^32 is 4294967291; quadratic hashing guarantee breaks
@@ -1027,25 +1031,24 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
   *unfiltered_marker_ct_ptr = unfiltered_marker_ct;
   *max_marker_id_len_ptr = max_marker_id_len;
   rewind(bimfile);
-  unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
 
-  // unfiltered_marker_ct can be very large, so use wkspace for all allocations
-  // that are a multiple of it
+  // unfiltered_marker_ct can be very large, so use bigstack for all
+  // allocations that are a multiple of it
 
-  // permanent stack allocation #1: marker_exclude
-  // permanent stack allocation #2: set_allele_freqs
-  if (wkspace_alloc_ul_checked(marker_exclude_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  // permanent bigstack allocation #1: marker_exclude
+  // permanent bigstack allocation #2: set_allele_freqs
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, marker_exclude_ptr)) {
     goto load_bim_ret_NOMEM;
   }
   marker_exclude = *marker_exclude_ptr;
-  fill_ulong_zero(marker_exclude, unfiltered_marker_ctl);
   if (set_allele_freqs_ptr) {
-    if (wkspace_alloc_d_checked(set_allele_freqs_ptr, unfiltered_marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(unfiltered_marker_ct, set_allele_freqs_ptr)) {
       goto load_bim_ret_NOMEM;
     }
     // leave set_allele_freqs uninitialized
     if (nchrobs_ptr) {
-      if (wkspace_alloc_ui_checked(nchrobs_ptr, unfiltered_marker_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_ui(unfiltered_marker_ct, nchrobs_ptr)) {
 	goto load_bim_ret_NOMEM;
       }
       // on the other hand, this is not autocomputed
@@ -1056,9 +1059,9 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
   fill_uint_zero(chrom_info_ptr->chrom_file_order_marker_idx, MAX_POSSIBLE_CHROM + 1);
   fill_uint_zero(chrom_info_ptr->chrom_start, MAX_POSSIBLE_CHROM);
   fill_uint_zero(chrom_info_ptr->chrom_end, MAX_POSSIBLE_CHROM);
-  // permanent stack allocation #3, if needed: marker_pos
+  // permanent bigstack allocation #3, if needed: marker_pos
   if (marker_pos_needed) {
-    if (wkspace_alloc_ui_checked(marker_pos_ptr, unfiltered_marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(unfiltered_marker_ct, marker_pos_ptr)) {
       goto load_bim_ret_NOMEM;
     }
   }
@@ -1066,13 +1069,13 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     if (snps_only) {
       max_marker_allele_len = 2;
     }
-    if (max_marker_allele_len > 500000000) {
+    if (max_marker_allele_len > NON_BIGSTACK_MIN - 1) {
       // guard against overflows
-      logerrprint("Error: Alleles are limited to 500 million characters.\n");
+      LOGERRPRINTF("Error: Alleles are limited to %u characters.\n", NON_BIGSTACK_MIN - 1);
       goto load_bim_ret_INVALID_FORMAT;
     }
     *max_marker_allele_len_ptr = max_marker_allele_len;
-    marker_allele_ptrs = (char**)wkspace_alloc(unfiltered_marker_ct * 2 * sizeof(intptr_t));
+    marker_allele_ptrs = (char**)bigstack_alloc(unfiltered_marker_ct * 2 * sizeof(intptr_t));
     if (!marker_allele_ptrs) {
       goto load_bim_ret_NOMEM;
     }
@@ -1082,16 +1085,15 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
       marker_allele_ptrs[uii] = missing_geno_ptr;
     }
   }
-  if (wkspace_alloc_c_checked(marker_ids_ptr, unfiltered_marker_ct * max_marker_id_len)) {
+  if (bigstack_alloc_c(unfiltered_marker_ct * max_marker_id_len, marker_ids_ptr)) {
     goto load_bim_ret_NOMEM;
   }
   // todo: check whether marker_cms can be unloaded before
   // marker_ids/marker_alleles, or vice versa
   if (marker_cms_needed & MARKER_CMS_FORCED) {
-    if (wkspace_alloc_d_checked(marker_cms_ptr, unfiltered_marker_ct * sizeof(double))) {
+    if (bigstack_calloc_d(unfiltered_marker_ct, marker_cms_ptr)) {
       goto load_bim_ret_NOMEM;
     }
-    fill_double_zero(*marker_cms_ptr, unfiltered_marker_ct);
   }
   if (filter_flags & FILTER_ZERO_CMS) {
     marker_cms_needed = 0;
@@ -1117,7 +1119,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	goto load_bim_ret_READ_FAIL;
       }
       bufptr3 = skip_initial_spaces(loadbuf2);
-    } while (is_eoln_or_comment(*bufptr3));
+    } while (is_eoln_or_comment_kns(*bufptr3));
     jj = get_chrom_code(chrom_info_ptr, bufptr3);
     if (jj != prev_chrom) {
       if (!split_chrom) {
@@ -1130,7 +1132,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	prev_chrom = jj;
 	if (is_set(loaded_chrom_mask, jj)) {
 	  if (split_chrom_cmd) {
-	    sprintf(logbuf, "Error: %s has a split chromosome.  Use --%s by itself to\nremedy this.\n", ftype_str, split_chrom_cmd);
+	    sprintf(g_logbuf, "Error: %s has a split chromosome.  Use --%s by itself to\nremedy this.\n", ftype_str, split_chrom_cmd);
 	    goto load_bim_ret_INVALID_FORMAT_2;
 	  }
 	  split_chrom = 1;
@@ -1142,7 +1144,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	}
         last_pos = 0;
       }
-      set_bit(loaded_chrom_mask, jj);
+      set_bit(jj, loaded_chrom_mask);
     }
 
     if (is_set(chrom_info_ptr->chrom_mask, jj)) {
@@ -1162,13 +1164,12 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	}
 	if ((*bufptr != '0') || (bufptr[1] > ' ')) {
 	  if (!(*marker_cms_ptr)) {
-	    if (wkspace_alloc_d_checked(marker_cms_ptr, unfiltered_marker_ct * sizeof(double))) {
+	    if (bigstack_calloc_d(unfiltered_marker_ct, marker_cms_ptr)) {
 	      goto load_bim_ret_NOMEM;
 	    }
-	    fill_double_zero(*marker_cms_ptr, unfiltered_marker_ct);
 	  }
 	  if (scan_double(bufptr, &((*marker_cms_ptr)[marker_uidx]))) {
-	    sprintf(logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of %s.\n", line_idx, ftype_str);
+	    sprintf(g_logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of %s.\n", line_idx, ftype_str);
 	    goto load_bim_ret_INVALID_FORMAT_2;
 	  }
 	}
@@ -1225,11 +1226,13 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
 	    }
 	  }
 	  ulii = marker_uidx * 2;
-	  if (allele_set(&(marker_allele_ptrs[ulii]), bufptr4, ukk)) {
+	  // possible todo: warn if a comma is present (could use memchr),
+	  // since that breaks VCF and PLINK 2.0.
+	  if (allele_set(bufptr4, ukk, &(marker_allele_ptrs[ulii]))) {
 	    goto load_bim_ret_NOMEM;
 	  }
 	  ulii++;
-	  if (allele_set(&(marker_allele_ptrs[ulii]), bufptr5, umm)) {
+	  if (allele_set(bufptr5, umm, &(marker_allele_ptrs[ulii]))) {
 	    goto load_bim_ret_NOMEM;
 	  }
 	}
@@ -1279,7 +1282,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
       }
     } else {
     load_bim_skip_marker:
-      SET_BIT(marker_exclude, marker_uidx);
+      SET_BIT(marker_uidx, marker_exclude);
       marker_exclude_ct++;
       if (marker_pos_needed) {
         // support unfiltered marker_pos search
@@ -1292,7 +1295,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     goto load_bim_ret_ALL_MARKERS_EXCLUDED;
   }
   if (missing_mid_template && ((*map_is_unsorted_ptr) & UNSORTED_BP)) {
-    sprintf(logbuf, "Error: --set-missing-var-ids requires a sorted %s.  Retry this command\nafter using --make-bed to sort your data.\n", ftype_str);
+    sprintf(g_logbuf, "Error: --set-missing-var-ids requires a sorted %s.  Retry this command\nafter using --make-bed to sort your data.\n", ftype_str);
     goto load_bim_ret_INVALID_FORMAT_2;
   }
   for (uii = 0; uii < CHROM_MASK_WORDS; uii++) {
@@ -1364,11 +1367,11 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   // sex_nm and sex_male should be NULL unless sex is supposed to be added as
   // an extra covariate
   // covar_range_list_ptr is NULL iff --gxe was specified
-  unsigned char* wkspace_mark = wkspace_base;
-  unsigned char* wkspace_mark2 = NULL;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  unsigned char* bigstack_mark2 = NULL;
   FILE* covar_file = NULL;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t topsize = 0;
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uintptr_t covar_raw_ct = 0;
   uintptr_t loaded_sample_ct = 0;
   uintptr_t missing_cov_ct = 0;
@@ -1410,46 +1413,28 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   int32_t ii;
 
   if ((!keep_pheno_on_missing_cov) || gxe_mcovar || sex_nm) {
-    sample_idx_to_uidx = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-    if (!sample_idx_to_uidx) {
+    if (bigstack_end_alloc_ui(sample_ct, &sample_idx_to_uidx)) {
       goto load_covars_ret_NOMEM;
     }
     fill_idx_to_uidx(sample_exclude, unfiltered_sample_ct, sample_ct, sample_idx_to_uidx);
   }
-  sorted_ids = (char*)top_alloc(&topsize, sample_ct * max_sample_id_len);
-  if (!sorted_ids) {
+  if (bigstack_end_alloc_c(sample_ct * max_sample_id_len, &sorted_ids) ||
+      bigstack_end_alloc_ui(sample_ct, &id_map) ||
+      bigstack_end_calloc_ul(sample_ctl, &already_seen)) {
     goto load_covars_ret_NOMEM;
   }
-  id_map = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!id_map) {
-    goto load_covars_ret_NOMEM;
-  }
-  already_seen = (uintptr_t*)top_alloc(&topsize, sample_ctl * sizeof(intptr_t));
-  if (!already_seen) {
-    goto load_covars_ret_NOMEM;
-  }
-  fill_ulong_zero(already_seen, sample_ctl);
   if (covar_modifier & COVAR_NAME) {
     ulii = covar_range_list_ptr->name_ct;
-    sorted_covar_name_flag_ids = (char*)top_alloc(&topsize, ulii * covar_range_list_ptr->name_max_len);
-    if (!sorted_covar_name_flag_ids) {
-      goto load_covars_ret_NOMEM;
-    }
-    covar_name_flag_id_map = (uint32_t*)top_alloc(&topsize, ulii * sizeof(int32_t));
-    if (!covar_name_flag_id_map) {
-      goto load_covars_ret_NOMEM;
-    }
-    covar_name_flag_seen_idxs = (int32_t*)top_alloc(&topsize, ulii * sizeof(int32_t));
-    if (!covar_name_flag_seen_idxs) {
+    if (bigstack_end_alloc_c(ulii * covar_range_list_ptr->name_max_len, &sorted_covar_name_flag_ids) ||
+	bigstack_end_alloc_ui(ulii, &covar_name_flag_id_map) ||
+        bigstack_end_alloc_i(ulii, &covar_name_flag_seen_idxs)) {
       goto load_covars_ret_NOMEM;
     }
 
-    wkspace_left -= topsize;
     // kludge to use sort_item_ids_noalloc()
-    fill_ulong_zero((uintptr_t*)covar_name_flag_seen_idxs, (ulii + (BITCT - 1)) / BITCT);
-    retval = sort_item_ids_noalloc(sorted_covar_name_flag_ids, covar_name_flag_id_map, ulii, (uintptr_t*)covar_name_flag_seen_idxs, ulii, covar_range_list_ptr->names, covar_range_list_ptr->name_max_len, 0, 0, strcmp_deref);
+    fill_ulong_zero((uintptr_t*)covar_name_flag_seen_idxs, BITCT_TO_WORDCT(ulii));
+    retval = sort_item_ids_noalloc(ulii, (const uintptr_t*)covar_name_flag_seen_idxs, ulii, covar_range_list_ptr->names, covar_range_list_ptr->name_max_len, 0, 0, strcmp_deref, sorted_covar_name_flag_ids, covar_name_flag_id_map);
     if (retval) {
-      wkspace_left += topsize;
       if (retval == RET_INVALID_FORMAT) {
 	logprint("(in --covar-name parameter sequence)\n");
 	retval = RET_INVALID_CMDLINE;
@@ -1457,11 +1442,8 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
       goto load_covars_ret_1;
     }
     fill_int_one(covar_name_flag_seen_idxs, ulii);
-  } else {
-    wkspace_left -= topsize;
   }
-  retval = sort_item_ids_noalloc(sorted_ids, id_map, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref);
-  wkspace_left += topsize;
+  retval = sort_item_ids_noalloc(unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref, sorted_ids, id_map);
   if (retval) {
     goto load_covars_ret_1;
   }
@@ -1471,17 +1453,17 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   // the first nonempty line (a value and a space = 2 bytes), so reserving the
   // last 1/17 (rounded up) always works.  (Minor memory leak fix:
   // covars_active no longer remains allocated on function exit.)
-  loadbuf_size = ((wkspace_left - topsize) / 68) * 64;
+  loadbuf_size = (bigstack_left() / 68) * 64;
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto load_covars_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   // was using open_and_load_to_first_token(), but we now don't want to
   // automatically print an error message on an empty file.
   loadbuf[loadbuf_size - 1] = ' ';
-  if (fopen_checked(&covar_file, covar_fname, "r")) {
+  if (fopen_checked(covar_fname, "r", &covar_file)) {
     goto load_covars_ret_OPEN_FAIL;
   }
   line_idx = 0;
@@ -1490,7 +1472,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
       if (!feof(covar_file)) {
 	goto load_covars_ret_READ_FAIL;
       }
-      strcpy(tbuf, "Empty --covar file.\n");
+      strcpy(g_textbuf, "Empty --covar file.\n");
       goto load_covars_none;
     }
     line_idx++;
@@ -1510,11 +1492,13 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   }
   covar_raw_ct -= 2;
   if ((!covar_raw_ct) && (!sex_nm)) {
-    strcpy(tbuf, "No covariate columns in --covar file.\n");
+    strcpy(g_textbuf, "No covariate columns in --covar file.\n");
     goto load_covars_none;
   }
-  covar_raw_ctl = (covar_raw_ct + (BITCT - 1)) / BITCT;
-  covars_active = (uintptr_t*)top_alloc(&topsize, covar_raw_ctl * sizeof(intptr_t));
+  covar_raw_ctl = BITCT_TO_WORDCT(covar_raw_ct);
+  if (bigstack_end_alloc_ul(covar_raw_ctl, &covars_active)) {
+    goto load_covars_ret_NOMEM;
+  }
 
   // no header line present?
   bufptr2 = next_token(bufptr);
@@ -1524,7 +1508,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   if ((covar_modifier & (COVAR_NAME | COVAR_NUMBER)) && covar_raw_ct) {
     fill_ulong_zero(covars_active, covar_raw_ctl);
     if (covar_modifier & COVAR_NUMBER) {
-      if (numeric_range_list_to_bitfield(covar_range_list_ptr, covar_raw_ct, covars_active, 1, 0)) {
+      if (numeric_range_list_to_bitarr(covar_range_list_ptr, covar_raw_ct, 1, 0, covars_active)) {
 	goto load_covars_ret_MISSING_TOKENS;
       }
     } else if (covar_modifier & COVAR_NAME) {
@@ -1532,17 +1516,16 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
 	logerrprint("Error: --covar file doesn't have a header line for --covar-name.\n");
 	goto load_covars_ret_INVALID_FORMAT;
       }
-      retval = string_range_list_to_bitfield(bufptr, covar_raw_ct, 0, covar_range_list_ptr, sorted_covar_name_flag_ids, covar_name_flag_id_map, covar_name_flag_seen_idxs, "covar-name", "--covar file header line", covars_active);
+      retval = string_range_list_to_bitarr(bufptr, covar_raw_ct, 0, covar_range_list_ptr, sorted_covar_name_flag_ids, covar_name_flag_id_map, "covar-name", "--covar file header line", covars_active, covar_name_flag_seen_idxs);
       if (retval) {
 	goto load_covars_ret_1;
       }
       // can't deallocate --covar-name support here due to covars_active
       // repositioning
-      // topsize -= (uintptr_t)(((unsigned char*)already_seen) - ((unsigned char*)covar_name_flag_seen_idxs));
     }
     covar_ct = popcount_longs(covars_active, covar_raw_ctl);
   } else if (covar_range_list_ptr) {
-    fill_all_bits(covars_active, covar_raw_ct);
+    fill_all_bits(covar_raw_ct, covars_active);
     covar_ct = covar_raw_ct;
   } else {
     // --gxe only
@@ -1551,7 +1534,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   }
   covar_ctx = covar_ct + (sex_nm? 1 : 0);
   if ((!covar_ctx) && (!gxe_mcovar)) {
-    strcpy(tbuf, "No --covar values loaded.\n");
+    strcpy(g_textbuf, "No --covar values loaded.\n");
     goto load_covars_none;
   }
   min_covar_col_ct = covar_ct? (last_set_bit(covars_active, covar_raw_ctl) + 1) : 0;
@@ -1576,7 +1559,6 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     }
   }
 
-  wkspace_left -= topsize;
   // * covar_nm does NOT have a separate entry per covariate; instead,
   //   if a single covariate is missing for a person, that person's covar_nm
   //   bit is zero.
@@ -1593,10 +1575,10 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     *covar_ctx_ptr = covar_ctx;
     *max_covar_name_len_ptr = max_covar_name_len;
     ulii = covar_ctx * sample_ct;
-    if (wkspace_alloc_c_checked(covar_names_ptr, covar_ctx * max_covar_name_len) ||
-        wkspace_alloc_ul_checked(covar_nm_ptr, sample_ctl * sizeof(intptr_t)) ||
-        wkspace_alloc_d_checked(covar_d_ptr, ulii * sizeof(double))) {
-      goto load_covars_ret_NOMEM2;
+    if (bigstack_alloc_c(covar_ctx * max_covar_name_len, covar_names_ptr) ||
+        bigstack_alloc_ul(sample_ctl, covar_nm_ptr) ||
+        bigstack_alloc_d(ulii, covar_d_ptr)) {
+      goto load_covars_ret_NOMEM;
     }
     covar_names = *covar_names_ptr;
     covar_nm = *covar_nm_ptr;
@@ -1607,25 +1589,22 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     }
   }
   if (gxe_mcovar) {
-    if (wkspace_alloc_ul_checked(gxe_covar_nm_ptr, sample_ctl * sizeof(intptr_t)) ||
-        wkspace_alloc_ul_checked(gxe_covar_c_ptr, sample_ctl * sizeof(intptr_t))) {
-      goto load_covars_ret_NOMEM2;
+    if (bigstack_calloc_ul(sample_ctl, gxe_covar_nm_ptr) ||
+        bigstack_calloc_ul(sample_ctl, gxe_covar_c_ptr)) {
+      goto load_covars_ret_NOMEM;
     }
     gxe_covar_nm = *gxe_covar_nm_ptr;
     gxe_covar_c = *gxe_covar_c_ptr;
-    fill_ulong_zero(gxe_covar_nm, sample_ctl);
-    fill_ulong_zero(gxe_covar_c, sample_ctl);
   }
-  if (wkspace_left <= MAXLINELEN) {
-    goto load_covars_ret_NOMEM2;
+  loadbuf_size = bigstack_left();
+  if (loadbuf_size <= MAXLINELEN) {
+    goto load_covars_ret_NOMEM;
   }
-  wkspace_mark2 = wkspace_base;
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
+  bigstack_mark2 = g_bigstack_base;
+  loadbuf = (char*)g_bigstack_base;
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   }
-  wkspace_left += topsize;
   loadbuf[loadbuf_size - 1] = ' ';
 
   rewind(covar_file);
@@ -1633,7 +1612,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     if (covar_range_list_ptr) {
       for (covar_uidx = 0, covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
 	covar_uidx = next_set_ul_unsafe(covars_active, covar_uidx);
-	uint32_writex(memcpyl3a(&(covar_names[covar_idx * max_covar_name_len]), "COV"), ++covar_uidx, '\0');
+	uint32toa_x(++covar_uidx, '\0', memcpyl3a(&(covar_names[covar_idx * max_covar_name_len]), "COV"));
       }
     }
     line_idx = 0;
@@ -1662,7 +1641,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --covar file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --covar file is pathologically long.\n", line_idx);
 	goto load_covars_ret_INVALID_FORMAT_2;
       } else {
 	goto load_covars_ret_NOMEM;
@@ -1672,7 +1651,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(tbuf, sorted_ids, max_sample_id_len, sample_ct, bufptr, &bufptr2, &ii)) {
+    if (bsearch_read_fam_indiv(bufptr, sorted_ids, max_sample_id_len, sample_ct, &bufptr2, &ii, g_textbuf)) {
       goto load_covars_ret_MISSING_TOKENS;
     }
     if (ii == -1) {
@@ -1682,7 +1661,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
       logerrprint("Error: Duplicate sample ID in --covar file.\n");
       goto load_covars_ret_INVALID_FORMAT;
     }
-    set_bit(already_seen, ii);
+    set_bit(ii, already_seen);
     sample_idx = id_map[(uint32_t)ii];
     bufptr = bufptr2;
     if (min_covar_col_ct) {
@@ -1717,12 +1696,12 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
 	    // --write-covar does not, so handle 0 separately for backward
 	    // compatibility
 	    if (!keep_pheno_on_missing_cov) {
-	      CLEAR_BIT(pheno_nm, sample_uidx);
+	      CLEAR_BIT(sample_uidx, pheno_nm);
 	    }
 	  } else if (dxx != 0.0) {
-	    SET_BIT(gxe_covar_nm, sample_idx);
+	    SET_BIT(sample_idx, gxe_covar_nm);
 	    if (dxx == 2.0) {
-	      SET_BIT(gxe_covar_c, sample_idx);
+	      SET_BIT(sample_idx, gxe_covar_c);
 	    }
 	  }
 	}
@@ -1739,13 +1718,13 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
 	}
       }
       if (!covar_missing) {
-	SET_BIT(covar_nm, sample_idx);
+	SET_BIT(sample_idx, covar_nm);
       } else {
 	missing_cov_ct++;
 	if (!keep_pheno_on_missing_cov) {
 	  sample_uidx = sample_idx_to_uidx[sample_idx];
 	  if (IS_SET(pheno_nm, sample_uidx)) {
-	    CLEAR_BIT(pheno_nm, sample_uidx);
+	    CLEAR_BIT(sample_uidx, pheno_nm);
 	  }
 	}
       }
@@ -1758,12 +1737,12 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   if (covar_range_list_ptr) {
     if ((covar_ct + 1 < covar_raw_ct) || ((covar_ct + 1 == covar_raw_ct) && ((!gxe_mcovar) || is_set(covars_active, gxe_mcovar - 1)))) {
       if (gxe_mcovar && (!is_set(covars_active, gxe_mcovar - 1))) {
-        sprintf(logbuf, "--covar: 1 C/C cov. loaded for --gxe, %" PRIuPTR "/%" PRIuPTR " for other operations.\n", covar_ct, covar_raw_ct);
+        sprintf(g_logbuf, "--covar: 1 C/C cov. loaded for --gxe, %" PRIuPTR "/%" PRIuPTR " for other operations.\n", covar_ct, covar_raw_ct);
       } else {
-        sprintf(logbuf, "--covar: %" PRIuPTR " out of %" PRIuPTR " covariates loaded.\n", covar_ct, covar_raw_ct);
+        sprintf(g_logbuf, "--covar: %" PRIuPTR " out of %" PRIuPTR " covariates loaded.\n", covar_ct, covar_raw_ct);
       }
     } else {
-      sprintf(logbuf, "--covar: %" PRIuPTR " covariate%s loaded.\n", covar_ct, (covar_ct == 1)? "" : "s");
+      sprintf(g_logbuf, "--covar: %" PRIuPTR " covariate%s loaded.\n", covar_ct, (covar_ct == 1)? "" : "s");
     }
     logprintb();
   } else {
@@ -1787,12 +1766,11 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     }
     if (covar_range_list_ptr) {
       // redefinition
-      covar_raw_ctl = (covar_ctx + BITCT - 1) / BITCT;
-      if (wkspace_alloc_ul_checked(&already_seen, covar_raw_ctl * sizeof(intptr_t))) {
+      covar_raw_ctl = BITCT_TO_WORDCT(covar_ctx);
+      if (bigstack_calloc_ul(covar_raw_ctl, &already_seen)) {
 	goto load_covars_ret_NOMEM;
       }
       // is covariate nonconstant?
-      fill_ulong_zero(already_seen, covar_raw_ctl);
       for (covar_idx = 0; covar_idx < covar_ctx; covar_idx++) {
 	dptr = &(covar_d[covar_idx]);
 	dxx = missing_phenod;
@@ -1808,12 +1786,12 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
 	  }
 	}
 	if (sample_idx < sample_ct) {
-	  SET_BIT(already_seen, covar_idx);
+	  SET_BIT(covar_idx, already_seen);
 	}
       }
       uii = popcount_longs(already_seen, covar_raw_ctl);
       if (!uii) {
-	strcpy(tbuf, "All covariates are constant.\n");
+	strcpy(g_textbuf, "All covariates are constant.\n");
 	goto load_covars_none;
       } else if (uii < covar_ctx) {
 	LOGPRINTF("--no-const-covar: %" PRIuPTR " constant covariate%s excluded.\n", covar_ctx - uii, (covar_ctx - uii == 1)? "" : "s");
@@ -1833,7 +1811,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
 	  if (!uii) {
 	    // if this sample had some missing covariate values, but all those
 	    // covariates were excluded by --no-const-covar, set covar_nm bit
-	    SET_BIT(covar_nm, sample_idx);
+	    SET_BIT(sample_idx, covar_nm);
 	  }
 	}
 	covar_idx = next_unset_unsafe(already_seen, 0);
@@ -1849,10 +1827,8 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     }
   }
 
-  wkspace_reset(wkspace_mark2);
+  bigstack_reset(bigstack_mark2);
   while (0) {
-  load_covars_ret_NOMEM2:
-    wkspace_left += topsize;
   load_covars_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -1863,7 +1839,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
     retval = RET_READ_FAIL;
     break;
   load_covars_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Fewer tokens than expected on line %" PRIuPTR " of --covar file.\n", line_idx);
+    sprintf(g_logbuf, "Error: Fewer tokens than expected on line %" PRIuPTR " of --covar file.\n", line_idx);
   load_covars_ret_INVALID_FORMAT_2:
     logerrprintb();
   load_covars_ret_INVALID_FORMAT:
@@ -1877,23 +1853,26 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
       *covar_nm_ptr = NULL;
       *covar_d_ptr = NULL;
       // --gxe not possible
-      wkspace_reset(wkspace_mark);
+      bigstack_reset(bigstack_mark);
       logerrprint("Warning: ");
     } else {
       retval = RET_INVALID_FORMAT;
       logerrprint("Error: ");
     }
-    logerrprint(tbuf);
+    logerrprint(g_textbuf);
   }
  load_covars_ret_1:
   if (retval) {
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
+  bigstack_end_reset(bigstack_end_mark);
   fclose_cond(covar_file);
   return retval;
 }
 
 int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modifier, uint32_t write_covar_dummy_max_categories, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, double missing_phenod, char* output_ [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* outfile = NULL;
   uint32_t write_pheno = write_covar_modifier & WRITE_COVAR_PHENO;
   uint32_t exclude_parents = write_covar_modifier & WRITE_COVAR_NO_PARENTS;
@@ -1909,7 +1888,6 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
   uint32_t downcoding_no_round = (write_covar_modifier & WRITE_COVAR_DUMMY_NO_ROUND);
   uintptr_t downcoding_covar_ct = 0;
   uintptr_t covar_nm_ct = 0;
-  uintptr_t topsize = 0;
   int32_t retval = 0;
   uint32_t* downcoding_buf_idxs;
   int64_t* sorted_downcoding_intbuf;
@@ -1936,7 +1914,7 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
   uint32_t uii;
   uint32_t ujj;
   memcpy(outname_end, ".cov", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_covars_ret_OPEN_FAIL;
   }
   if (fputs_checked("FID IID ", outfile)) {
@@ -1953,32 +1931,24 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
     }
   }
   if (do_downcoding) {
-    // could make downcoding_values allocation incremental (top_alloc() calls
-    // have been arranged to make this a simple change; would just need to
-    // wrap the qsort_ext() calls)
-    if (wkspace_alloc_ui_checked(&downcoding_level, covar_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&downcoding_values, covar_ct * sample_ct * sizeof(int32_t))) {
+    // could make downcoding_values allocation incremental
+    // (bigstack_end_alloc() calls have been arranged to make this a simple
+    // change; would just need to wrap the qsort_ext() calls)
+    if (bigstack_alloc_ui(covar_ct, &downcoding_level) ||
+        bigstack_alloc_ui(covar_ct * sample_ct, &downcoding_values)) {
       goto write_covars_ret_NOMEM;
     }
     if (write_covar_dummy_max_categories > sample_ct) {
       write_covar_dummy_max_categories = sample_ct;
     }
-    downcoding_string_buf = (char*)top_alloc(&topsize, 16 * (write_covar_dummy_max_categories + 1));
-    if (!downcoding_string_buf) {
-      goto write_covars_ret_NOMEM;
-    }
-    category_idx_sort_buf = (int64_t*)top_alloc(&topsize, write_covar_dummy_max_categories * sizeof(int64_t));
-    if (!category_idx_sort_buf) {
-      goto write_covars_ret_NOMEM;
-    }
-    category_remap = (uint32_t*)top_alloc(&topsize, write_covar_dummy_max_categories * sizeof(int32_t));
-    if (!category_idx_sort_buf) {
+    if (bigstack_end_alloc_c(16 * (write_covar_dummy_max_categories + 1), &downcoding_string_buf) ||
+        bigstack_end_alloc_ll(write_covar_dummy_max_categories, &category_idx_sort_buf) ||
+	bigstack_end_alloc_ui(write_covar_dummy_max_categories, &category_remap)) {
       goto write_covars_ret_NOMEM;
     }
     uiptr = downcoding_values;
     if (!downcoding_no_round) {
-      sorted_downcoding_intbuf = (int64_t*)top_alloc(&topsize, sample_ct * sizeof(int64_t));
-      if (!sorted_downcoding_intbuf) {
+      if (bigstack_end_alloc_ll(sample_ct, &sorted_downcoding_intbuf)) {
         goto write_covars_ret_NOMEM;
       }
       for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
@@ -2017,7 +1987,7 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 		break;
 	      }
 	      // save phenotype string
-	      int32_writex(&(downcoding_string_buf[16 * downcode_category_ct]), (int32_t)(((uint32_t)ullii) ^ 0x80000000U), '\0');
+	      int32toa_x((int32_t)(((uint32_t)ullii) ^ 0x80000000U), '\0', &(downcoding_string_buf[16 * downcode_category_ct]));
 
 	      // bits 0-31: initial category assignment
 	      // bits 32-63: smallest sample_idx2
@@ -2037,7 +2007,7 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 	// probably want to make this part its own function since it's
 	// practically identical when downcoding_no_round is set
         if (downcode_category_ct > 1) {
-	  int32_writex(&(downcoding_string_buf[16 * downcode_category_ct]), (int32_t)(((uint32_t)ullii) ^ 0x80000000U), '\0');
+	  int32toa_x((int32_t)(((uint32_t)ullii) ^ 0x80000000U), '\0', &(downcoding_string_buf[16 * downcode_category_ct]));
 	  category_idx_sort_buf[downcode_category_ct] = (int64_t)((((uint64_t)ulii) << 32) | ((uint64_t)downcode_category_ct));
 	  downcode_category_ct++;
           // now recover PLINK 1.07 category order
@@ -2048,12 +2018,12 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 #endif
 
 	  downcoding_level[covar_idx] = downcode_category_ct;
-          wptr_start = strcpyax(tbuf, &(covar_names[covar_idx * max_covar_name_len]), '_');
+          wptr_start = strcpyax(g_textbuf, &(covar_names[covar_idx * max_covar_name_len]), '_');
 	  for (downcode_idx = 0; downcode_idx < downcode_category_ct; downcode_idx++) {
 	    uii = (uint32_t)(category_idx_sort_buf[downcode_idx]);
 	    if (downcode_idx) {
 	      wptr = strcpyax(wptr_start, &(downcoding_string_buf[16 * uii]), ' ');
-	      fwrite(tbuf, 1, wptr - tbuf, outfile);
+	      fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	    }
             category_remap[uii] = downcode_idx;
 	  }
@@ -2071,12 +2041,8 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 	}
       }
     } else {
-      downcoding_buf_idxs = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-      if (!downcoding_buf_idxs) {
-	goto write_covars_ret_NOMEM;
-      }
-      sorted_downcoding_buf = (double*)top_alloc(&topsize, sample_ct * sizeof(double));
-      if (!sorted_downcoding_buf) {
+      if (bigstack_end_alloc_ui(sample_ct, &downcoding_buf_idxs) ||
+          bigstack_end_alloc_d(sample_ct, &sorted_downcoding_buf)) {
 	goto write_covars_ret_NOMEM;
       }
       for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
@@ -2096,7 +2062,7 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 	  if (qsort_ext((char*)sorted_downcoding_buf, covar_nm_ct, sizeof(double), double_cmp_deref, (char*)downcoding_buf_idxs, sizeof(int32_t))) {
 	    goto write_covars_ret_NOMEM;
 	  }
-	  wptr_start = double_g_write(downcoding_string_buf, sorted_downcoding_buf[0]);
+	  wptr_start = dtoa_g(sorted_downcoding_buf[0], downcoding_string_buf);
 	  slen = (uintptr_t)(wptr_start - downcoding_string_buf);
 	  *wptr_start = '\0';
 	  wptr_start = downcoding_string_buf;
@@ -2107,7 +2073,7 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 	    // a bit inefficient, but this is a safe way to achieve "two
 	    // doubles are equal if they yield the same printf %g output"
 	    // behavior
-	    wptr = double_g_write(bufptr, sorted_downcoding_buf[sample_idx2]);
+	    wptr = dtoa_g(sorted_downcoding_buf[sample_idx2], bufptr);
 	    ujj = downcoding_buf_idxs[sample_idx2];
 	    if (((uintptr_t)(wptr - bufptr) != slen) || memcmp(wptr_start, bufptr, slen)) {
 	      *wptr = '\0';
@@ -2142,12 +2108,12 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 #endif
 
 	  downcoding_level[covar_idx] = downcode_category_ct;
-          wptr_start = strcpyax(tbuf, &(covar_names[covar_idx * max_covar_name_len]), '_');
+          wptr_start = strcpyax(g_textbuf, &(covar_names[covar_idx * max_covar_name_len]), '_');
           for (downcode_idx = 0; downcode_idx < downcode_category_ct; downcode_idx++) {
 	    uii = (uint32_t)(category_idx_sort_buf[downcode_idx]);
 	    if (downcode_idx) {
 	      wptr = strcpyax(wptr_start, &(downcoding_string_buf[16 * uii]), ' ');
-	      fwrite(tbuf, 1, wptr - tbuf, outfile);
+	      fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	    }
 	    category_remap[uii] = downcode_idx;
           }
@@ -2165,14 +2131,14 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 	}
       }
     }
-    wkspace_shrink_top(downcoding_values, downcoding_covar_ct * sample_ct * sizeof(int32_t));
-    // topsize = 0;
+    bigstack_shrink_top(downcoding_values, downcoding_covar_ct * sample_ct * sizeof(int32_t));
+    bigstack_end_reset(bigstack_end_mark);
 
     // (write_covar_dummy_max_categories - 1) columns, then divide by two
     // rounding up; the -1 and +1 cancel
     ujj = write_covar_dummy_max_categories / 2;
-    if (wkspace_alloc_c_checked(&zbuf, ujj * sizeof(int32_t)) ||
-        wkspace_alloc_c_checked(&out_missing_buf, (write_covar_dummy_max_categories - 1) * omplen_p1)) {
+    if (bigstack_alloc_c(ujj * sizeof(int32_t), &zbuf) ||
+        bigstack_alloc_c((write_covar_dummy_max_categories - 1) * omplen_p1, &out_missing_buf)) {
       goto write_covars_ret_NOMEM;
     }
     uiptr = (uint32_t*)zbuf;
@@ -2226,8 +2192,8 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
       } else if (pheno_c) {
         putc('1' + IS_SET(pheno_c, sample_uidx), outfile);
       } else {
-        wptr = double_g_write(tbuf, pheno_d[sample_uidx]);
-	fwrite(tbuf, 1, wptr - tbuf, outfile);
+        wptr = dtoa_g(pheno_d[sample_uidx], g_textbuf);
+	fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
       }
       putc(' ', outfile);
     }
@@ -2260,8 +2226,8 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
 	    }
 	    downcode_idx++;
 	  } else {
-	    wptr = double_g_writex(tbuf, dxx, ' ');
-	    fwrite(tbuf, 1, wptr - tbuf, outfile);
+	    wptr = dtoa_gx(dxx, ' ', g_textbuf);
+	    fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	  }
 	} else {
 	  if (ujj) {
@@ -2277,8 +2243,8 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
     } else {
       dptr = &(covar_d[sample_idx * covar_ct]);
       for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
-	wptr = double_g_writex(tbuf, dptr[covar_idx], ' ');
-        fwrite(tbuf, 1, wptr - tbuf, outfile);
+	wptr = dtoa_gx(dptr[covar_idx], ' ', g_textbuf);
+        fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
       }
     }
     if (putc_checked('\n', outfile)) {
@@ -2297,6 +2263,7 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
     retval = RET_WRITE_FAIL;
     break;
   }
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(outfile);
   return retval;
 }
@@ -2309,15 +2276,15 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
   //    allocation
   // 2. load .zero file, converting to internal indices.  (lines with
   //    unrecognized IDs are skipped; we don't want a header line to cause this
-  //    to error out.)  this is top_alloc'd.
+  //    to error out.)  this is bigstack_end_alloc'd.
   // 3. free marker ID/cluster ID lists, sort loaded .zero contents
   // 4. assemble one block bitfield at a time, use save_set_bitfield() to
   //    compress each
   // 5. allocate and initialize cluster_zc_masks
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* zcfile = NULL;
   uintptr_t marker_ctp2l = (marker_ct + (BITCT + 1)) / BITCT;
-  uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
-  uintptr_t topsize = 0;
+  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   uintptr_t zc_item_ct = 0;
   uintptr_t line_idx = 0;
   uint32_t range_first = marker_ct;
@@ -2333,7 +2300,6 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
   int64_t* zc_entries;
   int64_t* zc_entries_end;
   uint64_t ullii;
-  uintptr_t topsize_base;
   uintptr_t max_zc_item_ct;
   uintptr_t marker_uidx;
   uint32_t marker_id_htable_size;
@@ -2344,42 +2310,41 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
   uint32_t sample_idx;
   uint32_t uii;
   int32_t ii;
-  marker_bitfield_tmp = (uintptr_t*)top_alloc(&topsize, marker_ctp2l * sizeof(intptr_t));
-  if (!marker_bitfield_tmp) {
+  if (bigstack_end_alloc_ul(marker_ctp2l, &marker_bitfield_tmp) ||
+      (!bigstack_end_alloc(sizeof(int64_t)))) {
     goto zero_cluster_init_ret_NOMEM;
   }
 #ifdef __LP64__
-  fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
+  fill_ulong_zero(marker_bitfield_tmp, round_up_pow2(marker_ctp2l, 2));
 #else
-  fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
+  fill_ulong_zero(marker_bitfield_tmp, round_up_pow2(marker_ctp2l, 4));
 #endif
   zc_entries_end = (int64_t*)marker_bitfield_tmp;
   zc_entries = &(zc_entries_end[-1]);
-  wkspace_left -= topsize + 16;
-  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
   if (retval) {
     goto zero_cluster_init_ret_1;
   }
-  if (wkspace_alloc_ui_checked(&marker_uidx_to_idx, unfiltered_marker_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(unfiltered_marker_ct, &marker_uidx_to_idx)) {
     goto zero_cluster_init_ret_NOMEM;
   }
   fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_uidx_to_idx);
   // cluster IDs are already natural-sorted
 
-  if (fopen_checked(&zcfile, zerofname, "r")) {
+  if (fopen_checked(zerofname, "r", &zcfile)) {
     goto zero_cluster_init_ret_OPEN_FAIL;
   }
   // simplify cluster_idx loop
   *zc_entries = (int64_t)(((uint64_t)cluster_ct) << 32);
-  max_zc_item_ct = (wkspace_left + 8) / sizeof(int64_t);
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, zcfile)) {
+  max_zc_item_ct = (((uintptr_t)zc_entries) - ((uintptr_t)g_bigstack_base)) / sizeof(int64_t);
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, zcfile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --zero-cluster file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --zero-cluster file is pathologically long.\n", line_idx);
       goto zero_cluster_init_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -2408,11 +2373,8 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
   if (fclose_null(&zcfile)) {
     goto zero_cluster_init_ret_READ_FAIL;
   }
-  wkspace_left += topsize;
-  topsize_base = topsize;
-  topsize += ((zc_item_ct + 1) / 2) * 16;
-  wkspace_reset(marker_id_htable);
-  wkspace_left -= topsize;
+  bigstack_double_reset(marker_id_htable, marker_bitfield_tmp);
+  bigstack_end_alloc(zc_item_ct * sizeof(int64_t));
 #ifdef __cplusplus
   std::sort(zc_entries, &(zc_entries[zc_item_ct]));
 #else
@@ -2427,15 +2389,13 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
       fill_ulong_zero(marker_bitfield_tmp, marker_ctp2l);
       range_first = marker_ct;
       range_last = 0;
-      wkspace_left += topsize;
-      topsize = topsize_base + ((((uintptr_t)(marker_bitfield_tmp - ((uintptr_t*)zc_entries))) / 2) + 1) * 16;
-      wkspace_left -= topsize;
+      bigstack_end_set(zc_entries);
     }
     if (cur_cluster == cluster_idx) {
       range_first = (uint32_t)ullii;
       do {
 	range_last = (uint32_t)ullii;
-        SET_BIT(marker_bitfield_tmp, range_last);
+        SET_BIT(range_last, marker_bitfield_tmp);
         ullii = (uint64_t)(*zc_entries++);
         cur_cluster = (uint32_t)(ullii >> 32);
       } while (cur_cluster == cluster_idx);
@@ -2444,14 +2404,12 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
       goto zero_cluster_init_ret_NOMEM;
     }
   }
-  wkspace_left += topsize;
-  topsize = 0;
-  if (wkspace_alloc_ul_checked(cluster_zc_masks_ptr, sample_ctv2 * cluster_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&sample_uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+  bigstack_end_reset(bigstack_end_mark);
+  if (bigstack_calloc_ul(sample_ctv2 * cluster_ct, cluster_zc_masks_ptr) ||
+      bigstack_alloc_ui(unfiltered_sample_ct, &sample_uidx_to_idx)) {
     goto zero_cluster_init_ret_NOMEM;
   }
   cluster_zc_mask = *cluster_zc_masks_ptr;
-  fill_ulong_zero(cluster_zc_mask, sample_ctv2 * cluster_ct);
   if (!sample_sort_map) {
     fill_uidx_to_idx(sample_exclude, unfiltered_sample_ct, sample_ct, sample_uidx_to_idx);
   } else {
@@ -2466,12 +2424,12 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
       sample_uidx = *cluster_map++;
       if (!IS_SET(sample_exclude, sample_uidx)) {
 	sample_idx = sample_uidx_to_idx[sample_uidx];
-        SET_BIT_DBL(cluster_zc_mask, sample_idx);
+        SET_BIT_DBL(sample_idx, cluster_zc_mask);
       }
     }
     cluster_zc_mask = &(cluster_zc_mask[sample_ctv2]);
   }
-  wkspace_reset(sample_uidx_to_idx);
+  bigstack_reset(sample_uidx_to_idx);
   LOGPRINTF("--zero-cluster: %" PRIuPTR " line%s processed.\n", zc_item_ct, (zc_item_ct == 1)? "" : "s");
   while (0) {
   zero_cluster_init_ret_NOMEM:
@@ -2484,14 +2442,14 @@ int32_t zero_cluster_init(char* zerofname, uintptr_t unfiltered_marker_ct, uintp
     retval = RET_READ_FAIL;
     break;
   zero_cluster_init_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --zero-cluster file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --zero-cluster file has fewer tokens than expected.\n", line_idx);
   zero_cluster_init_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
     break;
   }
  zero_cluster_init_ret_1:
-  wkspace_left += topsize;
+  bigstack_end_reset(bigstack_end_mark);
   fclose_cond(zcfile);
   return retval;
 }
@@ -2506,7 +2464,7 @@ int32_t write_fam(char* outname, uintptr_t unfiltered_sample_ct, uintptr_t* samp
   char* bufptr;
   uintptr_t sample_idx;
   uintptr_t clen;
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_fam_ret_OPEN_FAIL;
   }
   for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
@@ -2519,7 +2477,7 @@ int32_t write_fam(char* outname, uintptr_t unfiltered_sample_ct, uintptr_t* samp
     }
     cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
     clen = strlen_se(cptr);
-    bufptr = memcpyax(tbuf, cptr, clen, delim);
+    bufptr = memcpyax(g_textbuf, cptr, clen, delim);
     bufptr = strcpyax(bufptr, &(cptr[clen + 1]), delim);
     bufptr = strcpya(bufptr, &(paternal_ids[sample_uidx * max_paternal_id_len]));
     *bufptr++ = delim;
@@ -2532,10 +2490,10 @@ int32_t write_fam(char* outname, uintptr_t unfiltered_sample_ct, uintptr_t* samp
     } else if (pheno_c) {
       *bufptr++ = '1' + IS_SET(pheno_c, sample_uidx);
     } else {
-      bufptr = double_g_write(bufptr, pheno_d[sample_uidx]);
+      bufptr = dtoa_g(pheno_d[sample_uidx], bufptr);
     }
     *bufptr++ = '\n';
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto write_fam_ret_WRITE_FAIL;
     }
     if (!sample_sort_map) {
@@ -2567,7 +2525,7 @@ int32_t write_map_or_bim(char* outname, uintptr_t* marker_exclude, uintptr_t mar
   char* buf_start = NULL;
   uintptr_t marker_idx;
   char* bufptr;
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_map_or_bim_ret_OPEN_FAIL;
   }
   for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
@@ -2575,18 +2533,18 @@ int32_t write_map_or_bim(char* outname, uintptr_t* marker_exclude, uintptr_t mar
     while (marker_uidx >= chrom_end) {
       chrom_idx = chrom_info_ptr->chrom_file_order[++chrom_fo_idx];
       chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
-      buf_start = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+      buf_start = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
       *buf_start++ = delim;
     }
     bufptr = strcpyax(buf_start, &(marker_ids[marker_uidx * max_marker_id_len]), delim);
     if (!marker_cms) {
       *bufptr++ = '0';
     } else {
-      bufptr = double_g_writewx8(bufptr, marker_cms[marker_uidx], 1);
+      bufptr = dtoa_g_wxp8(marker_cms[marker_uidx], 1, bufptr);
     }
     *bufptr++ = delim;
-    bufptr = uint32_write(bufptr, marker_pos[marker_uidx]);
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    bufptr = uint32toa(marker_pos[marker_uidx], bufptr);
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto write_map_or_bim_ret_WRITE_FAIL;
     }
     if (marker_allele_ptrs) {
@@ -2613,20 +2571,20 @@ int32_t write_map_or_bim(char* outname, uintptr_t* marker_exclude, uintptr_t mar
 }
 
 int32_t load_bim_split_chrom(char* bimname, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, int64_t* ll_buf, uint32_t max_bim_linelen) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
-  char* loadbuf = tbuf;
+  char* loadbuf = g_textbuf;
   uint32_t marker_uidx = 0xffffffffU; // deliberate overflow
   int32_t retval = 0;
   uintptr_t marker_idx;
   char* bufptr;
   uint64_t chrom_idx;
   if (max_bim_linelen > MAXLINELEN) {
-    if (wkspace_alloc_c_checked(&loadbuf, max_bim_linelen)) {
+    if (bigstack_alloc_c(max_bim_linelen, &loadbuf)) {
       goto load_bim_split_chrom_ret_NOMEM;
     }
   }
-  if (fopen_checked(&infile, bimname, "r")) {
+  if (fopen_checked(bimname, "r", &infile)) {
     goto load_bim_split_chrom_ret_OPEN_FAIL;
   }
   for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
@@ -2635,7 +2593,7 @@ int32_t load_bim_split_chrom(char* bimname, uintptr_t* marker_exclude, uintptr_t
       goto load_bim_split_chrom_ret_READ_FAIL;
     }
     bufptr = skip_initial_spaces(loadbuf);
-    if (is_eoln_or_comment(*bufptr)) {
+    if (is_eoln_or_comment_kns(*bufptr)) {
       goto load_bim_split_chrom_reread;
     }
     marker_uidx++;
@@ -2658,7 +2616,7 @@ int32_t load_bim_split_chrom(char* bimname, uintptr_t* marker_exclude, uintptr_t
     break;
   }
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -2681,11 +2639,11 @@ void fill_ll_buf(uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chr
 }
 
 int32_t update_marker_chroms(Two_col_params* update_chr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t allow_extra_chroms, Chrom_info* chrom_info_ptr, int64_t* ll_buf) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   char skipchar = update_chr->skipchar;
   uint32_t colid_first = (update_chr->colid < update_chr->colx);
-  uint32_t marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  uint32_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
   uintptr_t marker_uidx = 0;
@@ -2705,18 +2663,17 @@ int32_t update_marker_chroms(Two_col_params* update_chr, uintptr_t unfiltered_ma
   int32_t sorted_idx;
   int32_t retval;
   char cc;
-  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
   if (retval) {
     goto update_marker_chroms_ret_1;
   }
-  if (wkspace_alloc_ul_checked(&already_seen, marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&marker_uidx_to_idx, unfiltered_marker_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_ul(marker_ctl, &already_seen) ||
+      bigstack_alloc_ui(unfiltered_marker_ct, &marker_uidx_to_idx)) {
     goto update_marker_chroms_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, marker_ctl);
   fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_uidx_to_idx);
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
+  loadbuf = (char*)g_bigstack_base;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   }
@@ -2738,7 +2695,7 @@ int32_t update_marker_chroms(Two_col_params* update_chr, uintptr_t unfiltered_ma
     line_idx++;
     if (!(loadbuf[loadbuf_size - 1])) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-chr file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-chr file is pathologically long.\n", line_idx);
 	goto update_marker_chroms_ret_INVALID_FORMAT_2;
       } else {
         goto update_marker_chroms_ret_NOMEM;
@@ -2774,14 +2731,14 @@ int32_t update_marker_chroms(Two_col_params* update_chr, uintptr_t unfiltered_ma
       LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --update-chr file.\n", colid_ptr);
       goto update_marker_chroms_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, marker_idx);
+    set_bit(marker_idx, already_seen);
     sorted_idx = get_chrom_code(chrom_info_ptr, colx_ptr);
     if (sorted_idx < 0) {
       if ((!allow_extra_chroms) || (sorted_idx == -1)) {
-	sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of --update-chr file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of --update-chr file.\n", line_idx);
 	goto update_marker_chroms_ret_INVALID_FORMAT_2;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, colx_ptr, &sorted_idx, line_idx, "--update-chr file");
+      retval = resolve_or_add_chrom_name(colx_ptr, "--update-chr file", line_idx, chrom_info_ptr, &sorted_idx);
       if (retval) {
 	goto update_marker_chroms_ret_1;
       }
@@ -2793,9 +2750,9 @@ int32_t update_marker_chroms(Two_col_params* update_chr, uintptr_t unfiltered_ma
     goto update_marker_chroms_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--update-chr: %" PRIuPTR " value%s updated, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-chr: %" PRIuPTR " value%s updated, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-chr: %" PRIuPTR " value%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-chr: %" PRIuPTR " value%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
   }
   logprintb();
   while (0) {
@@ -2806,7 +2763,7 @@ int32_t update_marker_chroms(Two_col_params* update_chr, uintptr_t unfiltered_ma
     retval = RET_READ_FAIL;
     break;
   update_marker_chroms_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-chr file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-chr file has fewer tokens than expected.\n", line_idx);
   update_marker_chroms_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
@@ -2814,7 +2771,7 @@ int32_t update_marker_chroms(Two_col_params* update_chr, uintptr_t unfiltered_ma
   }
  update_marker_chroms_ret_1:
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -2910,14 +2867,14 @@ int32_t sort_and_write_bim(uint32_t* map_reverse, uint32_t map_cols, char* outna
   // super-common case where all three numbers can be squeezed together in 64
   // bits.  But we care most about performance when this can't be done, so I
   // haven't bothered with that optimization.
-  if (wkspace_alloc_ui_checked(&chrom_start, (chrom_code_end + 1) * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&chrom_id, chrom_code_end * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&unpack_map, marker_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(chrom_code_end + 1, &chrom_start) ||
+      bigstack_alloc_ui(chrom_code_end, &chrom_id) ||
+      bigstack_alloc_ui(marker_ct, &unpack_map)) {
     goto sort_and_write_bim_ret_NOMEM;
   }
   fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, unpack_map);
   sort_marker_chrom_pos(ll_buf, marker_ct, marker_pos, chrom_start, chrom_id, unpack_map, &chrom_ct);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto sort_and_write_bim_ret_OPEN_FAIL;
   }
 
@@ -2925,7 +2882,7 @@ int32_t sort_and_write_bim(uint32_t* map_reverse, uint32_t map_cols, char* outna
   for (uii = 0; uii < chrom_ct; uii++) {
     cur_chrom = chrom_id[uii];
     ujj = chrom_start[uii + 1];
-    chrom_name_end = chrom_name_write(tbuf, chrom_info_ptr, cur_chrom);
+    chrom_name_end = chrom_name_write(chrom_info_ptr, cur_chrom, g_textbuf);
     *chrom_name_end++ = '\t';
     for (; marker_idx < ujj; marker_idx++) {
       marker_uidx = unpack_map[(uint32_t)ll_buf[marker_idx]];
@@ -2933,11 +2890,11 @@ int32_t sort_and_write_bim(uint32_t* map_reverse, uint32_t map_cols, char* outna
       if (!marker_cms) {
 	*bufptr++ = '0';
       } else {
-        bufptr = double_g_writewx8(bufptr, marker_cms[marker_uidx], 1);
+        bufptr = dtoa_g_wxp8(marker_cms[marker_uidx], 1, bufptr);
       }
       *bufptr++ = '\t';
-      bufptr = uint32_writex(bufptr, (uint32_t)(ll_buf[marker_idx] >> 32), '\t');
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      bufptr = uint32toa_x((uint32_t)(ll_buf[marker_idx] >> 32), '\t', bufptr);
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto sort_and_write_bim_ret_WRITE_FAIL;
       }
       fputs(cond_replace(marker_allele_ptrs[2 * marker_uidx], missing_geno_ptr, output_missing_geno_ptr), outfile);
@@ -2988,14 +2945,14 @@ int32_t load_sort_and_write_map(uint32_t** map_reverse_ptr, FILE* mapfile, uint3
   uint32_t chrom_ct;
   // See sort_and_write_bim() for discussion.  Note that marker_ids and
   // marker_cms use filtered instead of unfiltered coordinates, though.
-  if (wkspace_alloc_ui_checked(map_reverse_ptr, (compact_map_reverse? marker_ct : unfiltered_marker_ct) * sizeof(int32_t)) ||
-      wkspace_alloc_ll_checked(&ll_buf, marker_ct * sizeof(int64_t)) ||
-      wkspace_alloc_c_checked(&marker_ids, marker_ct * max_marker_id_len) ||
-      wkspace_alloc_d_checked(&marker_cms, marker_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&pos_buf, marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&unpack_map, marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&chrom_start, (MAX_POSSIBLE_CHROM + 2) * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&chrom_id, (MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(compact_map_reverse? marker_ct : unfiltered_marker_ct, map_reverse_ptr) ||
+      bigstack_alloc_ll(marker_ct, &ll_buf) ||
+      bigstack_alloc_c(marker_ct * max_marker_id_len, &marker_ids) ||
+      bigstack_alloc_d(marker_ct, &marker_cms) ||
+      bigstack_alloc_ui(marker_ct, &pos_buf) ||
+      bigstack_alloc_ui(marker_ct, &unpack_map) ||
+      bigstack_alloc_ui(MAX_POSSIBLE_CHROM + 2, &chrom_start) ||
+      bigstack_alloc_ui(MAX_POSSIBLE_CHROM + 1, &chrom_id)) {
     goto load_sort_and_write_map_ret_NOMEM;
   }
   rewind(mapfile);
@@ -3027,7 +2984,7 @@ int32_t load_sort_and_write_map(uint32_t** map_reverse_ptr, FILE* mapfile, uint3
   sort_marker_chrom_pos(ll_buf, marker_ct, pos_buf, chrom_start, chrom_id, NULL, &chrom_ct);
 
   strcpy(outname_end, ".map.tmp");
-  if (fopen_checked(&map_outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &map_outfile)) {
     goto load_sort_and_write_map_ret_OPEN_FAIL;
   }
 
@@ -3038,15 +2995,15 @@ int32_t load_sort_and_write_map(uint32_t** map_reverse_ptr, FILE* mapfile, uint3
   for (uii = 0; uii < chrom_ct; uii++) {
     cur_chrom = chrom_id[uii];
     ujj = chrom_start[uii + 1];
-    bufptr0 = chrom_name_write(tbuf, chrom_info_ptr, cur_chrom);
+    bufptr0 = chrom_name_write(chrom_info_ptr, cur_chrom, g_textbuf);
     *bufptr0++ = '\t';
     for (; marker_idx < ujj; marker_idx++) {
       marker_idx2 = (uint32_t)ll_buf[marker_idx];
       marker_uidx = unpack_map[marker_idx2];
       bufptr = strcpyax(bufptr0, &(marker_ids[marker_idx2 * max_marker_id_len]), '\t');
-      bufptr = double_g_writewx8x(bufptr, marker_cms[marker_idx2], 1, '\t');
-      bufptr = uint32_writex(bufptr, (uint32_t)(ll_buf[marker_idx] >> 32), '\n');
-      if (fwrite_checked(tbuf, bufptr - tbuf, map_outfile)) {
+      bufptr = dtoa_g_wxp8x(marker_cms[marker_idx2], 1, '\t', bufptr);
+      bufptr = uint32toa_x((uint32_t)(ll_buf[marker_idx] >> 32), '\n', bufptr);
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, map_outfile)) {
 	goto load_sort_and_write_map_ret_WRITE_FAIL;
       }
       (*map_reverse_ptr)[compact_map_reverse? marker_idx2 : marker_uidx] = marker_idx;
@@ -3071,16 +3028,16 @@ int32_t load_sort_and_write_map(uint32_t** map_reverse_ptr, FILE* mapfile, uint3
   }
   chrom_info_ptr->zero_extra_chroms = orig_zec;
   if (ll_buf) {
-    wkspace_reset(ll_buf);
+    bigstack_reset(ll_buf);
   }
   return retval;
 }
 
 int32_t flip_subset_init(char* flip_fname, char* flip_subset_fname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uint32_t* sample_sort_map, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* flip_subset_markers, uintptr_t* flip_subset_vec2) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
-  uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
+  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   uintptr_t miss_ct = 0;
   uintptr_t line_idx = 0;
   uint32_t* sample_uidx_to_idx = NULL;
@@ -3103,21 +3060,21 @@ int32_t flip_subset_init(char* flip_fname, char* flip_subset_fname, uintptr_t un
   unsigned char ucc;
   // load --flip file, then --flip-subset
   fill_ulong_zero(flip_subset_markers, unfiltered_marker_ctl);
-  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
   if (retval) {
     goto flip_subset_init_ret_1;
   }
-  if (fopen_checked(&infile, flip_fname, "r")) {
+  if (fopen_checked(flip_fname, "r", &infile)) {
     goto flip_subset_init_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --flip file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --flip file is pathologically long.\n", line_idx);
       goto flip_subset_init_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -3130,7 +3087,7 @@ int32_t flip_subset_init(char* flip_fname, char* flip_subset_fname, uintptr_t un
     a2ptr = marker_allele_ptrs[2 * marker_uidx + 1];
     ucc = ((unsigned char)a1ptr[0]) - 'A';
     if (a1ptr[1] || a2ptr[1] || (ucc > 19) || (reverse_complements[ucc] != a2ptr[0])) {
-      sprintf(logbuf, "Error: Invalid alleles (not reverse complement single bases) on line\n%" PRIuPTR " of --flip file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid alleles (not reverse complement single bases) on line\n%" PRIuPTR " of --flip file.\n", line_idx);
       goto flip_subset_init_ret_INVALID_FORMAT_2;
     }
     if (is_set(flip_subset_markers, marker_uidx)) {
@@ -3138,42 +3095,42 @@ int32_t flip_subset_init(char* flip_fname, char* flip_subset_fname, uintptr_t un
       LOGPREPRINTFWW("Error: Duplicate marker ID '%s' in --flip file.\n", bufptr);
       goto flip_subset_init_ret_INVALID_FORMAT_2;
     }
-    set_bit(flip_subset_markers, marker_uidx);
+    set_bit(marker_uidx, flip_subset_markers);
     flip_marker_ct++;
   }
   if (fclose_null(&infile)) {
     goto flip_subset_init_ret_READ_FAIL;
   }
-  wkspace_reset(wkspace_mark);
-  retval = sort_item_ids(&sorted_sample_ids, &sample_id_map, unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref);
+  bigstack_reset(bigstack_mark);
+  retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref, &sorted_sample_ids, &sample_id_map);
   if (retval) {
     goto flip_subset_init_ret_1;
   }
-  if (wkspace_alloc_c_checked(&id_buf, max_sample_id_len)) {
+  if (bigstack_alloc_c(max_sample_id_len, &id_buf)) {
     goto flip_subset_init_ret_NOMEM;
   }
   if (sample_sort_map) {
-    if (wkspace_alloc_ui_checked(&sample_uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(unfiltered_sample_ct, &sample_uidx_to_idx)) {
       goto flip_subset_init_ret_NOMEM;
     }
     fill_uidx_to_idx(sample_exclude, unfiltered_sample_ct, sample_ct, sample_uidx_to_idx);
   }
-  if (fopen_checked(&infile, flip_subset_fname, "r")) {
+  if (fopen_checked(flip_subset_fname, "r", &infile)) {
     goto flip_subset_init_ret_OPEN_FAIL;
   }
   fill_ulong_zero(flip_subset_vec2, sample_ctv2);
   line_idx = 0;
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --flip-subset file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --flip-subset file is pathologically long.\n", line_idx);
       goto flip_subset_init_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(id_buf, sorted_sample_ids, max_sample_id_len, sample_ct, bufptr, NULL, &sorted_idx) || (sorted_idx == -1)) {
+    if (bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sample_ct, NULL, &sorted_idx, id_buf) || (sorted_idx == -1)) {
       miss_ct++;
       continue;
     }
@@ -3187,7 +3144,7 @@ int32_t flip_subset_init(char* flip_fname, char* flip_subset_fname, uintptr_t un
       LOGPREPRINTFWW("Error: Duplicate sample ID '%s' in --flip-subset file.\n", id_buf);
       goto flip_subset_init_ret_INVALID_FORMAT_2;
     }
-    SET_BIT_DBL(flip_subset_vec2, sample_idx_write);
+    SET_BIT_DBL(sample_idx_write, flip_subset_vec2);
     flip_sample_ct++;
   }
   if (fclose_null(&infile)) {
@@ -3213,7 +3170,7 @@ int32_t flip_subset_init(char* flip_fname, char* flip_subset_fname, uintptr_t un
     break;
   }
  flip_subset_init_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   return retval;
 }
@@ -3263,7 +3220,7 @@ int32_t make_bed_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32_t unfilter
   uint32_t sample_idx = 0;
   uint32_t sample_uidx2;
   if (sample_sort_map) {
-    if (load_raw(bedfile, loadbuf, unfiltered_sample_ct4)) {
+    if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf)) {
       return RET_READ_FAIL;
     }
     for (; sample_idx < sample_ct; sample_idx++) {
@@ -3281,10 +3238,10 @@ int32_t make_bed_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32_t unfilter
       *writeptr = cur_word;
     }
     if (is_reverse) {
-      reverse_loadbuf((unsigned char*)writebuf, sample_ct);
+      reverse_loadbuf(sample_ct, (unsigned char*)writebuf);
     }
   } else {
-    if (load_and_collapse(bedfile, loadbuf, unfiltered_sample_ct, writeptr, sample_ct, sample_exclude, final_mask, is_reverse)) {
+    if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, is_reverse, bedfile, loadbuf, writeptr)) {
       return RET_READ_FAIL;
     }
   }
@@ -3302,14 +3259,14 @@ int32_t make_bed_me_missing_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32
   if ((!sample_sort_map) && (unfiltered_sample_ct == sample_ct)) {
     loadbuf = writebuf;
   }
-  if (load_raw2(bedfile, loadbuf, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+  if (load_raw2(unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask, bedfile, loadbuf)) {
     return RET_READ_FAIL;
   }
   if (set_hh_missing && is_x) {
     hh_reset((unsigned char*)loadbuf, sample_raw_male_include2, unfiltered_sample_ct);
   }
   if (is_reverse) {
-    reverse_loadbuf((unsigned char*)loadbuf, unfiltered_sample_ct);
+    reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf);
   }
   *error_ct_ptr += erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, sex_male, trio_lookup, trio_ct, is_x, multigen);
   if (sample_sort_map) {
@@ -3328,7 +3285,7 @@ int32_t make_bed_me_missing_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32
       *writeptr = cur_word;
     }
   } else if (unfiltered_sample_ct != sample_ct) {
-    collapse_copy_2bitarr(loadbuf, writebuf, unfiltered_sample_ct, sample_ct, sample_exclude);
+    copy_quaterarr_nonempty_subset_excl(loadbuf, sample_exclude, unfiltered_sample_ct, sample_ct, writebuf);
   }
   return 0;
 }
@@ -3352,7 +3309,7 @@ void zeropatch(uintptr_t sample_ctv2, uintptr_t cluster_ct, uintptr_t* cluster_z
 	at_least_one_cluster = 1;
 	fill_ulong_zero(patchbuf, sample_ctv2);
       }
-      bitfield_or(patchbuf, &(cluster_zc_masks[cluster_idx * sample_ctv2]), sample_ctv2);
+      bitvec_or(&(cluster_zc_masks[cluster_idx * sample_ctv2]), sample_ctv2, patchbuf);
     }
   }
   if (!at_least_one_cluster) {
@@ -3428,13 +3385,13 @@ void replace_missing_a2(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t w
 }
 
 int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t map_cols, char* outname, char* outname_end, uint64_t calculation_type, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, double* marker_cms, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_i [...]
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + BITCT2 - 1) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctp1l2 = 1 + (unfiltered_sample_ct / BITCT2);
   uintptr_t sample_ct4 = (sample_ct + 3) / 4;
-  uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   uintptr_t marker_uidx = 0;
   uintptr_t marker_idx = 0;
   uintptr_t trio_ct = 0;
@@ -3474,6 +3431,7 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
   uintptr_t* writebuf_ptr;
   uint32_t* map_reverse;
   const char* errptr;
+  uintptr_t cur_bigstack_left;
   uintptr_t pass_size;
   uint32_t is_haploid;
   uint32_t is_x;
@@ -3487,8 +3445,8 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
   uint32_t seek_needed;
   uint32_t markers_done;
   if (flip_subset_fname) {
-    if (wkspace_alloc_ul_checked(&flip_subset_markers, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-        wkspace_alloc_ul_checked(&flip_subset_vec2, sample_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_marker_ctl, &flip_subset_markers) ||
+        bigstack_alloc_ul(sample_ctv2, &flip_subset_vec2)) {
       goto make_bed_ret_NOMEM;
     }
     retval = flip_subset_init(flip_fname, flip_subset_fname, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_allele_ptrs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_sort_map, sample_ids, max_sample_id_len, flip_subset_markers, flip_subset_vec2);
@@ -3497,12 +3455,12 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
     }
   }
   if (calculation_type & CALC_MAKE_BED) {
-    if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf)) {
       goto make_bed_ret_NOMEM;
     }
 
     if (zerofname && cluster_ct) {
-      zcdefs = (uint32_t**)wkspace_alloc(cluster_ct * sizeof(intptr_t));
+      zcdefs = (uint32_t**)bigstack_alloc(cluster_ct * sizeof(intptr_t));
       if (!zcdefs) {
 	goto make_bed_ret_NOMEM;
       }
@@ -3510,12 +3468,12 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       if (retval) {
 	goto make_bed_ret_1;
       }
-      if (wkspace_alloc_ul_checked(&patchbuf, sample_ctv2 * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(sample_ctv2, &patchbuf)) {
 	goto make_bed_ret_NOMEM;
       }
     }
     memcpy(outname_end, ".bed", 5);
-    if (fopen_checked(&bedoutfile, outname, "wb")) {
+    if (fopen_checked(outname, FOPEN_WB, &bedoutfile)) {
       goto make_bed_ret_OPEN_FAIL;
     }
 
@@ -3536,8 +3494,8 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
 	retval = RET_CALC_NOT_YET_SUPPORTED;
 	goto make_bed_ret_1;
       }
-      if (wkspace_alloc_ui_checked(&map_reverse, unfiltered_marker_ct * sizeof(int32_t)) ||
-	  wkspace_alloc_ll_checked(&ll_buf, marker_ct * sizeof(int64_t))) {
+      if (bigstack_alloc_ui(unfiltered_marker_ct, &map_reverse) ||
+	  bigstack_alloc_ll(marker_ct, &ll_buf)) {
 	goto make_bed_ret_NOMEM;
       }
       if ((map_is_unsorted & UNSORTED_SPLIT_CHROM) || mergex || splitx_bound2 || update_chr) {
@@ -3582,16 +3540,17 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       if (retval) {
 	goto make_bed_ret_1;
       }
-      wkspace_reset(ll_buf);
+      bigstack_reset(ll_buf);
 
       // oops, forgot to multiply by sizeof(intptr_t)!  fortunately, this
       // segfaulted instead of corrupting any data.
       // anyway, it's now time to implement multipass.
-      if (wkspace_left < sample_ctv2 * sizeof(intptr_t)) {
+      cur_bigstack_left = bigstack_left();
+      if (cur_bigstack_left < sample_ctv2 * sizeof(intptr_t)) {
         goto make_bed_ret_NOMEM;
       }
-      writebuf = (uintptr_t*)wkspace_base;
-      pass_ct = 1 + ((sample_ctv2 * marker_ct * sizeof(intptr_t) - 1) / wkspace_left);
+      writebuf = (uintptr_t*)g_bigstack_base;
+      pass_ct = 1 + ((sample_ctv2 * marker_ct * sizeof(intptr_t) - 1) / cur_bigstack_left);
       pass_size = 1 + ((marker_ct - 1) / pass_ct);
       *outname_end = '\0';
       LOGPRINTFWW5("--make-bed to %s.bed + %s.bim + %s.fam ... ", outname, outname, outname);
@@ -3659,7 +3618,7 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       } else if (!set_hh_missing) {
 	hh_exists = 0;
       }
-      if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_ct, fill_missing_a2? Y_FIX_NEEDED : hh_exists, 0, sample_exclude, sex_male, &sample_include2, &sample_male_include2)) {
+      if (alloc_collapsed_haploid_filters(sample_exclude, sex_male, unfiltered_sample_ct, sample_ct, fill_missing_a2? Y_FIX_NEEDED : hh_exists, 0, &sample_include2, &sample_male_include2)) {
 	goto make_bed_ret_NOMEM;
       }
       if (set_me_missing) {
@@ -3668,22 +3627,22 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
 	  goto make_bed_ret_1;
 	}
 	if (trio_ct) {
-	  if (wkspace_alloc_ul_checked(&workbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t))) {
+	  if (bigstack_alloc_ul(unfiltered_sample_ctp1l2, &workbuf)) {
 	    goto make_bed_ret_NOMEM;
 	  }
 	  workbuf[unfiltered_sample_ctp1l2 - 1] = 0;
 	  if (set_hh_missing) {
-	    if (wkspace_alloc_ul_checked(&sample_raw_male_include2, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+	    if (bigstack_alloc_ul(unfiltered_sample_ctl2, &sample_raw_male_include2)) {
 	      goto make_bed_ret_NOMEM;
 	    }
-	    exclude_to_vec_include(unfiltered_sample_ct, sample_raw_male_include2, sex_male);
+	    init_quaterarr_from_inverted_bitarr(sex_male, unfiltered_sample_ct, sample_raw_male_include2);
 	  }
 	} else {
 	  set_me_missing = 0;
 	}
       }
 
-      if (wkspace_alloc_ul_checked(&writebuf, sample_ctv2)) {
+      if (bigstack_alloc_ul(sample_ctv2, &writebuf)) {
 	goto make_bed_ret_NOMEM;
       }
       if (fseeko(bedfile, bed_offset, SEEK_SET)) {
@@ -3692,51 +3651,53 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       *outname_end = '\0';
       LOGPRINTFWW5("--make-bed to %s.bed + %s.bim + %s.fam ... ", outname, outname, outname);
       fputs("0%", stdout);
-      marker_uidx = 0;
-      for (pct = 1; pct <= 100; pct++) {
-	loop_end = (pct * ((uint64_t)marker_ct)) / 100;
-	for (; marker_idx < loop_end; marker_uidx++, marker_idx++) {
-	  if (IS_SET(marker_exclude, marker_uidx)) {
-	    marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
-	    if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
-	      goto make_bed_ret_READ_FAIL;
+      if (sample_ct) {
+	marker_uidx = 0;
+	for (pct = 1; pct <= 100; pct++) {
+	  loop_end = (pct * ((uint64_t)marker_ct)) / 100;
+	  for (; marker_idx < loop_end; marker_uidx++, marker_idx++) {
+	    if (IS_SET(marker_exclude, marker_uidx)) {
+	      marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
+	      if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+		goto make_bed_ret_READ_FAIL;
+	      }
 	    }
-	  }
-	  if (marker_uidx >= chrom_end) {
-	    chrom_fo_idx++;
-	    refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
-	  }
-	  if ((!set_me_missing) || (is_haploid && (!is_x))) {
-	    retval = make_bed_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, IS_SET(marker_reverse, marker_uidx), writebuf);
-	    if (is_haploid && set_hh_missing) {
-	      haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)writebuf);
+	    if (marker_uidx >= chrom_end) {
+	      chrom_fo_idx++;
+	      refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
+	    }
+	    if ((!set_me_missing) || (is_haploid && (!is_x))) {
+	      retval = make_bed_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, IS_SET(marker_reverse, marker_uidx), writebuf);
+	      if (is_haploid && set_hh_missing) {
+		haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)writebuf);
+	      }
+	    } else {
+	      retval = make_bed_me_missing_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, unfiltered_sample_ctl2m1, IS_SET(marker_reverse, marker_uidx), writebuf, workbuf, sex_male, sample_raw_male_include2, trio_lookup, trio_ct, set_hh_missing, is_x, mendel_multigen, &mendel_error_ct);
+	    }
+	    if (retval) {
+	      goto make_bed_ret_1;
+	    }
+	    if (zcdefs) {
+	      zeropatch(sample_ctv2, cluster_ct, cluster_zc_masks, zcdefs, patchbuf, marker_idx, writebuf);
+	    }
+	    if (flip_subset_markers && is_set(flip_subset_markers, marker_uidx)) {
+	      reverse_subset(writebuf, flip_subset_vec2, sample_ctv2);
+	    }
+	    if (fill_missing_a2) {
+	      replace_missing_a2(writebuf, is_y? sample_male_include2 : sample_include2, sample_ctv2);
 	    }
-	  } else {
-	    retval = make_bed_me_missing_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, unfiltered_sample_ctl2m1, IS_SET(marker_reverse, marker_uidx), writebuf, workbuf, sex_male, sample_raw_male_include2, trio_lookup, trio_ct, set_hh_missing, is_x, mendel_multigen, &mendel_error_ct);
-	  }
-	  if (retval) {
-	    goto make_bed_ret_1;
-	  }
-	  if (zcdefs) {
-	    zeropatch(sample_ctv2, cluster_ct, cluster_zc_masks, zcdefs, patchbuf, marker_idx, writebuf);
-	  }
-	  if (flip_subset_markers && is_set(flip_subset_markers, marker_uidx)) {
-	    reverse_subset(writebuf, flip_subset_vec2, sample_ctv2);
-	  }
-	  if (fill_missing_a2) {
-	    replace_missing_a2(writebuf, is_y? sample_male_include2 : sample_include2, sample_ctv2);
-	  }
 
-	  if (fwrite_checked(writebuf, sample_ct4, bedoutfile)) {
-	    goto make_bed_ret_WRITE_FAIL;
+	    if (fwrite_checked(writebuf, sample_ct4, bedoutfile)) {
+	      goto make_bed_ret_WRITE_FAIL;
+	    }
 	  }
-	}
-	if (pct < 100) {
-	  if (pct > 10) {
-	    putchar('\b');
+	  if (pct < 100) {
+	    if (pct > 10) {
+	      putchar('\b');
+	    }
+	    printf("\b\b%u%%", pct);
+	    fflush(stdout);
 	  }
-	  printf("\b\b%u%%", pct);
-	  fflush(stdout);
 	}
       }
     }
@@ -3749,8 +3710,8 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       LOGPRINTFWW5("--make-just-bim to %s ... ", outname);
       fflush(stdout);
     }
-    if (wkspace_alloc_ui_checked(&map_reverse, unfiltered_marker_ct * sizeof(int32_t)) ||
-	wkspace_alloc_ll_checked(&ll_buf, marker_ct * sizeof(int64_t))) {
+    if (bigstack_alloc_ui(unfiltered_marker_ct, &map_reverse) ||
+	bigstack_alloc_ll(marker_ct, &ll_buf)) {
       goto make_bed_ret_NOMEM;
     }
     if (map_is_unsorted & UNSORTED_SPLIT_CHROM) {
@@ -3765,7 +3726,7 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
     if (retval) {
       goto make_bed_ret_1;
     }
-    wkspace_reset(map_reverse);
+    bigstack_reset(map_reverse);
     if (calculation_type & CALC_MAKE_BIM) {
       logprint("done.\n");
     }    
@@ -3830,12 +3791,12 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
   }
  make_bed_ret_1:
   fclose_cond(bedoutfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32_t missing_pheno, uint32_t affection_01, uintptr_t* unfiltered_sample_ct_ptr, char** sample_ids_ptr, uintptr_t* max_sample_id_len_ptr, char** paternal_ids_ptr, uintptr_t* max_paternal_id_len_ptr, char** maternal_ids_ptr, uintptr_t* max_maternal_id_len_ptr, uintptr_t** sex_nm_ptr, uintptr_t** sex_male_ptr, uint32_t* affection_ptr, uintptr_t** pheno_nm_ptr, uintptr_t** pheno_c_ptr, double** pheno_d_ptr, uintptr [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   double missing_phenod = (double)missing_pheno;
   uintptr_t* pheno_c = NULL;
   double* pheno_d = NULL;
@@ -3847,7 +3808,8 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
   uintptr_t line_idx = 0;
   uint32_t affection = 1;
   int32_t retval = 0;
-  char case_char = affection_01? '1' : '2';
+  double pheno_ctrld = (double)((int32_t)(1 - affection_01));
+  double pheno_cased = pheno_ctrld + 1.0;
   uintptr_t* sex_nm;
   uintptr_t* sex_male;
   uintptr_t* pheno_nm;
@@ -3867,16 +3829,15 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
   double dxx;
 
   // we want this to work when the file is actually a .ped
-  if (wkspace_left > MAXLINEBUFLEN) {
+  loadbuf_size = bigstack_left();
+  if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
-  } else if (wkspace_left <= MAXLINELEN) {
+  } else if (loadbuf_size <= MAXLINELEN) {
     goto load_fam_ret_NOMEM;
-  } else {
-    loadbuf_size = wkspace_left;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
-  if (fopen_checked(&famfile, famname, "r")) {
+  if (fopen_checked(famname, "r", &famfile)) {
     goto load_fam_ret_OPEN_FAIL;
   }
   // ----- .fam read, first pass -----
@@ -3887,7 +3848,7 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of .fam file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .fam file is pathologically long.\n", line_idx);
 	goto load_fam_ret_INVALID_FORMAT_2;
       } else {
 	goto load_fam_ret_NOMEM;
@@ -3905,7 +3866,7 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
       }
       tmp_len = strlen_se(bufptr);
       if ((tmp_len == 1) && (*bufptr == '0')) {
-	sprintf(logbuf, "Error: Invalid IID '0' on line %" PRIuPTR " of .fam file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid IID '0' on line %" PRIuPTR " of .fam file.\n", line_idx);
 	goto load_fam_ret_INVALID_FORMAT_2;
       }
       tmp_len = strlen_se(bufptr0) + strlen_se(bufptr) + 2;
@@ -3935,6 +3896,8 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
 	if (no_more_tokens_kns(bufptr)) {
 	  goto load_fam_ret_MISSING_TOKENS;
 	}
+	// --1 forces case/control phenotype in plink 1.07, keep that for
+	// backward compatibility
 	if (affection && (!affection_01)) {
 	  affection = eval_affection(bufptr, missing_phenod);
 	}
@@ -3955,25 +3918,25 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
     logerrprint("Error: FIDs and IIDs are limited to " MAX_ID_LEN_STR " characters.\n");
     goto load_fam_ret_INVALID_FORMAT;
   }
-  wkspace_reset(wkspace_mark);
-  unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  bigstack_reset(bigstack_mark);
+  unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   // could make paternal_ids/maternal_ids conditional, but memory footprint is
   // typically negligible
-  if (wkspace_alloc_c_checked(sample_ids_ptr, unfiltered_sample_ct * max_sample_id_len) ||
-      wkspace_alloc_c_checked(paternal_ids_ptr, unfiltered_sample_ct * max_paternal_id_len) ||
-      wkspace_alloc_c_checked(maternal_ids_ptr, unfiltered_sample_ct * max_maternal_id_len) ||
-      wkspace_alloc_ul_checked(sex_nm_ptr, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(sex_male_ptr, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(founder_info_ptr, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(sample_exclude_ptr, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(pheno_nm_ptr, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_c(unfiltered_sample_ct * max_sample_id_len, sample_ids_ptr) ||
+      bigstack_alloc_c(unfiltered_sample_ct * max_paternal_id_len, paternal_ids_ptr) ||
+      bigstack_alloc_c(unfiltered_sample_ct * max_maternal_id_len, maternal_ids_ptr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, sex_nm_ptr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, sex_male_ptr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, founder_info_ptr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, sample_exclude_ptr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, pheno_nm_ptr)) {
     goto load_fam_ret_NOMEM;
   }
 
   // force either pheno_c or pheno_d to be allocated even if there is no
   // phenotype data
   if ((!tmp_fam_col_6) || affection) {
-    if (aligned_malloc(pheno_c_ptr, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), pheno_c_ptr)) {
       goto load_fam_ret_NOMEM;
     }
     pheno_c = *pheno_c_ptr;
@@ -3986,15 +3949,14 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
     fill_double_zero(pheno_d, unfiltered_sample_ct);
     *pheno_d_ptr = pheno_d;
   }
-  wkspace_mark = wkspace_base;
-  if (wkspace_left > MAXLINEBUFLEN) {
+  bigstack_mark = g_bigstack_base;
+  loadbuf_size = bigstack_left();
+  if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
-  } else if (wkspace_left <= MAXLINELEN) {
+  } else if (loadbuf_size <= MAXLINELEN) {
     goto load_fam_ret_NOMEM;
-  } else {
-    loadbuf_size = wkspace_left;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   sample_ids = *sample_ids_ptr;
   paternal_ids = *paternal_ids_ptr;
@@ -4014,7 +3976,7 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
   if (fam_cols & FAM_COL_34) {
     fill_ulong_zero(founder_info, unfiltered_sample_ctl);
   } else {
-    fill_all_bits(founder_info, unfiltered_sample_ct);
+    fill_all_bits(unfiltered_sample_ct, founder_info);
   }
   fill_ulong_zero(sex_nm, unfiltered_sample_ctl);
   fill_ulong_zero(sex_male, unfiltered_sample_ctl);
@@ -4050,33 +4012,33 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
       tmp_len2 = strlen_se(bufptr);
       memcpyx(&(maternal_ids[sample_uidx * max_maternal_id_len]), bufptr, tmp_len2, '\0');
       if ((tmp_len == 1) && (tmp_len2 == 1) && (cc == '0') && (*bufptr == '0')) {
-	SET_BIT(founder_info, sample_uidx);
+	SET_BIT(sample_uidx, founder_info);
       }
     }
     if (fam_cols & FAM_COL_5) {
       bufptr = next_token(bufptr);
       if (strlen_se(bufptr) == 1) {
 	if (*bufptr == '1') {
-	  SET_BIT(sex_nm, sample_uidx);
-	  SET_BIT(sex_male, sample_uidx);
+	  SET_BIT(sample_uidx, sex_nm);
+	  SET_BIT(sample_uidx, sex_male);
 	} else if (*bufptr == '2') {
-	  SET_BIT(sex_nm, sample_uidx);
+	  SET_BIT(sample_uidx, sex_nm);
 	}
       }
     }
     if (tmp_fam_col_6) {
       bufptr = next_token(bufptr);
-      if (affection) {
-	if (!is_missing_pheno_cc(bufptr, missing_phenod, affection_01)) {
-	  SET_BIT(pheno_nm, sample_uidx);
-	  if (*bufptr == case_char) {
-	    SET_BIT(pheno_c, sample_uidx);
-	  }
-	}
-      } else {
-	if ((!scan_double(bufptr, &dxx)) && (dxx != missing_phenod)) {
+      if (!scan_double(bufptr, &dxx)) {
+	if (affection) {
+	  if (dxx == pheno_ctrld) {
+	    SET_BIT(sample_uidx, pheno_nm);
+	  } else if (dxx == pheno_cased) {
+	    SET_BIT(sample_uidx, pheno_nm);
+	    SET_BIT(sample_uidx, pheno_c);
+	  }
+	} else if (dxx != missing_phenod) {
 	  pheno_d[sample_uidx] = dxx;
-	  SET_BIT(pheno_nm, sample_uidx);
+	  SET_BIT(sample_uidx, pheno_nm);
 	}
       }
     }
@@ -4102,14 +4064,14 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
     retval = RET_READ_FAIL;
     break;
   load_fam_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .fam file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .fam file has fewer tokens than expected.\n", line_idx);
   load_fam_ret_INVALID_FORMAT_2:
     logerrprintb();
   load_fam_ret_INVALID_FORMAT:
     retval = RET_INVALID_FORMAT;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(famfile);
   return retval;
 }
@@ -4117,7 +4079,7 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
 #define D_EPSILON 0.000244140625
 
 int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outname_end, char* single_chr, char* pheno_name, double hard_call_threshold, char* missing_code, int32_t missing_pheno, uint64_t misc_flags, uint32_t is_bgen, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   gzFile gz_infile = NULL;
   FILE* outfile = NULL;
@@ -4128,7 +4090,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
   double hard_call_floor = 1.0 - hard_call_threshold;
   char* loadbuf = NULL;
   char* sorted_mc = NULL;
-  char* tbuf2 = &(tbuf[MAXLINELEN]); // .fam write
+  char* tbuf2 = &(g_textbuf[MAXLINELEN]); // .fam write
 
   // 0 = not present, otherwise zero-based index (this is fine since first
   //     column has to be FID)
@@ -4197,12 +4159,12 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       goto oxford_to_bed_ret_INVALID_CMDLINE;
     }
   }
-  bufptr = int32_write(missing_pheno_str, missing_pheno);
+  bufptr = int32toa(missing_pheno, missing_pheno_str);
   missing_pheno_len = (uintptr_t)(bufptr - missing_pheno_str);
   if (!missing_code) {
     mc_ct = 1;
     max_mc_len = 3;
-    if (wkspace_alloc_c_checked(&sorted_mc, 3)) {
+    if (bigstack_alloc_c(3, &sorted_mc)) {
       goto oxford_to_bed_ret_NOMEM;
     }
     memcpy(sorted_mc, "NA", 3);
@@ -4227,7 +4189,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       bufptr = bufptr2;
     }
     if (mc_ct) {
-      if (wkspace_alloc_c_checked(&sorted_mc, mc_ct * max_mc_len)) {
+      if (bigstack_alloc_c(mc_ct * max_mc_len, &sorted_mc)) {
 	goto oxford_to_bed_ret_NOMEM;
       }
       bufptr = missing_code;
@@ -4251,27 +4213,27 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       qsort(sorted_mc, mc_ct, max_mc_len, strcmp_casted);
     }
   }
-  if (fopen_checked(&infile, samplename, "r")) {
+  if (fopen_checked(samplename, "r", &infile)) {
     goto oxford_to_bed_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto oxford_to_bed_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
   do {
     line_idx++;
-    if (!fgets(tbuf, MAXLINELEN, infile)) {
+    if (!fgets(g_textbuf, MAXLINELEN, infile)) {
       if (ferror(infile)) {
 	goto oxford_to_bed_ret_READ_FAIL;
       }
       logerrprint("Error: Empty --data/--sample file.\n");
       goto oxford_to_bed_ret_INVALID_FORMAT;
     }
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto oxford_to_bed_ret_SAMPLE_LONG_LINE;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
   } while (is_eoln_kns(*bufptr));
   bufptr2 = token_endnn(bufptr);
   if ((((uintptr_t)(bufptr2 - bufptr)) != 4) || memcmp(bufptr, "ID_1", 4)) {
@@ -4320,17 +4282,17 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
   }
   do {
     line_idx++;
-    if (!fgets(tbuf, MAXLINELEN, infile)) {
+    if (!fgets(g_textbuf, MAXLINELEN, infile)) {
       if (ferror(infile)) {
 	goto oxford_to_bed_ret_READ_FAIL;
       }
       logerrprint("Error: Only one nonempty line in .sample file.\n");
       goto oxford_to_bed_ret_INVALID_FORMAT;
     }
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto oxford_to_bed_ret_SAMPLE_LONG_LINE;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
   } while (is_eoln_kns(*bufptr));
   bufptr2 = token_endnn(bufptr);
   if ((((uintptr_t)(bufptr2 - bufptr)) != 1) || (*bufptr != '0')) {
@@ -4390,12 +4352,12 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       goto oxford_to_bed_ret_INVALID_CMDLINE;
     }
   }
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto oxford_to_bed_ret_SAMPLE_LONG_LINE;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -4424,7 +4386,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       }
       cc = *bufptr++;
       if ((cc < '0') || (cc > '2') || ((*bufptr) > ' ')) {
-	sprintf(logbuf, "Error: Invalid sex code on line %" PRIuPTR " of .sample file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid sex code on line %" PRIuPTR " of .sample file.\n", line_idx);
         goto oxford_to_bed_ret_INVALID_FORMAT_2;
       }
       *wptr++ = cc;
@@ -4485,34 +4447,32 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
     goto oxford_to_bed_ret_WRITE_FAIL;
   }
   sample_ct4 = (sample_ct + 3) / 4;
-  sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
-  if (wkspace_alloc_ul_checked(&writebuf, sample_ctl2 * sizeof(intptr_t))) {
+  sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  if (bigstack_alloc_ul(sample_ctl2, &writebuf)) {
     goto oxford_to_bed_ret_NOMEM;
   }
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(&outfile_bim, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_bim)) {
     goto oxford_to_bed_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".bed", 5);
-  if (fopen_checked(&outfile, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
     goto oxford_to_bed_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, outfile)) {
     goto oxford_to_bed_ret_WRITE_FAIL;
   }
   if (!is_bgen) {
-    loadbuf_size = wkspace_left;
+    loadbuf_size = bigstack_left();
     if (loadbuf_size > MAXLINEBUFLEN) {
       loadbuf_size = MAXLINEBUFLEN;
     } else if (loadbuf_size <= MAXLINELEN) {
       goto oxford_to_bed_ret_NOMEM;
     }
-    loadbuf = (char*)wkspace_base;
-    if (gzopen_checked(&gz_infile, genname, "rb")) {
-      goto oxford_to_bed_ret_OPEN_FAIL;
-    }
-    if (gzbuffer(gz_infile, 131072)) {
-      goto oxford_to_bed_ret_NOMEM;
+    loadbuf = (char*)g_bigstack_base;
+    retval = gzopen_read_checked(genname, &gz_infile);
+    if (retval) {
+      goto oxford_to_bed_ret_1;
     }
     loadbuf[loadbuf_size - 1] = ' ';
     line_idx = 0;
@@ -4544,7 +4504,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	    }
 	    goto oxford_to_bed_ret_INVALID_FORMAT;
 	  }
-	  retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &ii, line_idx, ".gen file");
+	  retval = resolve_or_add_chrom_name(bufptr, ".gen file", line_idx, chrom_info_ptr, &ii);
 	  if (retval) {
 	    goto oxford_to_bed_ret_1;
 	  }
@@ -4750,16 +4710,16 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       goto oxford_to_bed_ret_INVALID_FORMAT;
     }
   } else {
-    if (fopen_checked(&infile, genname, "rb")) {
+    if (fopen_checked(genname, FOPEN_RB, &infile)) {
       goto oxford_to_bed_ret_OPEN_FAIL;
     }
     // supports BGEN v1.0 and v1.1.
-    bgen_probs = (uint16_t*)wkspace_alloc(6 * sample_ct);
+    bgen_probs = (uint16_t*)bigstack_alloc(6LU * sample_ct);
     if (!bgen_probs) {
       goto oxford_to_bed_ret_NOMEM;
     }
-    loadbuf = (char*)wkspace_base;
-    loadbuf_size = wkspace_left;
+    loadbuf = (char*)g_bigstack_base;
+    loadbuf_size = bigstack_left();
     if (loadbuf_size > MAXLINEBUFLEN) {
       loadbuf_size = MAXLINEBUFLEN;
     } else if (loadbuf_size < 3 * 65536) {
@@ -4814,7 +4774,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
     if (!is_randomized) {
       bgen_hardthresh = 32768 - (int32_t)(hard_call_threshold * 32768);
     }
-    memcpyl3(tbuf, " 0 ");
+    memcpyl3(g_textbuf, " 0 ");
     for (marker_uidx = 0; marker_uidx < raw_marker_ct; marker_uidx++) {
       if (fread(&uii, 1, 4, infile) < 4) {
 	goto oxford_to_bed_ret_READ_FAIL;
@@ -4892,7 +4852,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	  if (chrom_error(".bgen file", chrom_info_ptr, bufptr2, 0, ii, allow_extra_chroms)) {
             goto oxford_to_bed_ret_INVALID_FORMAT;
 	  }
-	  retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr2, &ii, 0, ".bgen file");
+	  retval = resolve_or_add_chrom_name(bufptr2, ".bgen file", 0, chrom_info_ptr, &ii);
           if (retval) {
 	    goto oxford_to_bed_ret_1;
 	  }
@@ -4927,11 +4887,11 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	  goto oxford_to_bed_ret_WRITE_FAIL;
 	}
         fwrite(bufptr, 1, usjj, outfile_bim);
-	bufptr = uint32_writex(&(tbuf[3]), uint_arr[0], ' ');
-	fwrite(tbuf, 1, bufptr - tbuf, outfile_bim);
+	bufptr = uint32toa_x(uint_arr[0], ' ', &(g_textbuf[3]));
+	fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile_bim);
 
         // halve the limit since there are two alleles
-	// (may want to enforce NON_WKSPACE_MIN allele length limit?)
+	// (may want to enforce NON_BIGSTACK_MIN allele length limit?)
         if (uint_arr[1] >= loadbuf_size / 2) {
 	  if (loadbuf_size < MAXLINEBUFLEN) {
 	    goto oxford_to_bed_ret_NOMEM;
@@ -4994,7 +4954,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	      goto oxford_to_bed_ret_INVALID_FORMAT;
 	    }
 	  }
-	  uint32_writex(loadbuf, (uint32_t)ii, '\0');
+	  uint32toa_x((uint32_t)ii, '\0', loadbuf);
 	  bufptr = loadbuf;
 	} else {
 	  ujj = (unsigned char)loadbuf[0];
@@ -5009,7 +4969,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	    if (chrom_error(".bgen file", chrom_info_ptr, bufptr, 0, ii, allow_extra_chroms)) {
 	      goto oxford_to_bed_ret_INVALID_FORMAT;
 	    }
-	    retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &ii, 0, ".bgen file");
+	    retval = resolve_or_add_chrom_name(bufptr, ".bgen file", 0, chrom_info_ptr, &ii);
 	    if (retval) {
 	      goto oxford_to_bed_ret_1;
 	    }
@@ -5036,7 +4996,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	}
 	fwrite(&(loadbuf[uii + 2]), 1, ukk, outfile_bim);
 	memcpy(&ujj, &(loadbuf[2 * uii + 3]), 4);
-	bufptr = uint32_writex(&(tbuf[3]), ujj, ' ');
+	bufptr = uint32toa_x(ujj, ' ', &(g_textbuf[3]));
 	identical_alleles = (loadbuf[2 * uii + 7] == loadbuf[2 * uii + 8]);
 	if (!identical_alleles) {
 	  *bufptr++ = loadbuf[2 * uii + 7];
@@ -5046,7 +5006,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	*bufptr++ = ' ';
 	*bufptr++ = loadbuf[2 * uii + 8];
 	*bufptr++ = '\n';
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile_bim)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_bim)) {
 	  goto oxford_to_bed_ret_WRITE_FAIL;
 	}
       }
@@ -5112,7 +5072,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	    while (1) {
 	      uii >>= 16;
 	      if (!uii) {
-		uii = sfmt_genrand_uint32(&sfmt) | 0x80000000U;
+		uii = sfmt_genrand_uint32(&g_sfmt) | 0x80000000U;
 	      }
 	      ujj = uii & 32767;
 	      if (ujj < ukk) {
@@ -5236,18 +5196,18 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
   gzclose_cond(gz_infile);
   fclose_cond(outfile);
   fclose_cond(outfile_bim);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
-// side effect: initializes tbuf to first nonempty line of .map/.bim
-int32_t check_cm_col(FILE* bimfile, char* tbuf, uint32_t is_binary, uint32_t allow_no_variants, uint32_t bufsize, uint32_t* cm_col_exists_ptr, uintptr_t* line_idx_ptr) {
+// side effect: initializes textbuf to first nonempty line of .map/.bim
+int32_t check_cm_col(FILE* bimfile, char* textbuf, uint32_t is_binary, uint32_t allow_no_variants, uint32_t bufsize, uint32_t* cm_col_exists_ptr, uintptr_t* line_idx_ptr) {
   uintptr_t line_idx = 0;
   char* bufptr;
-  while (fgets(tbuf, bufsize, bimfile)) {
+  while (fgets(textbuf, bufsize, bimfile)) {
     line_idx++;
-    bufptr = skip_initial_spaces(tbuf);
-    if (is_eoln_or_comment(*bufptr)) {
+    bufptr = skip_initial_spaces(textbuf);
+    if (is_eoln_or_comment_kns(*bufptr)) {
       continue;
     }
     bufptr = next_token_mult(bufptr, 2 + 2 * is_binary);
@@ -5290,7 +5250,7 @@ typedef struct ll_str_fixed_struct {
 #endif
 } Ll_str_fixed;
 
-int32_t incr_text_allele_str(uintptr_t* topsize_ptr, char* allele_name, uint32_t an_len, Ll_str* allele_list_start, uint32_t* marker_allele_cts) {
+int32_t incr_text_allele_str(char* allele_name, uint32_t an_len, Ll_str* allele_list_start, uint32_t* marker_allele_cts) {
   // Start with preallocated array of 16-byte Ll_strs.
   // Ll_str.ss is a null-terminated sequence of ordered, tab-delimited allele
   // names.  If the starting 8 (or 12 bytes, on 32-bit systems) is adequate,
@@ -5303,19 +5263,18 @@ int32_t incr_text_allele_str(uintptr_t* topsize_ptr, char* allele_name, uint32_t
   // .vcf files, etc.
   uint32_t allele_num = 0;
   char* cur_allele_name_start = allele_list_start->ss;
-  Ll_str* llptr;
+  Ll_str* ll_ptr;
   uint32_t slen;
   uintptr_t chars_left;
   if (!(*cur_allele_name_start)) {
     if (!(allele_list_start->next)) {
       if (an_len >= (16 - sizeof(intptr_t))) {
-	llptr = top_alloc_llstr(topsize_ptr, an_len + 1);
-	if (!llptr) {
+	if (bigstack_end_alloc_llstr(an_len + 1, &ll_ptr)) {
 	  return RET_NOMEM;
 	}
-        allele_list_start->next = llptr;
-	llptr->next = NULL;
-	cur_allele_name_start = llptr->ss;
+        allele_list_start->next = ll_ptr;
+	ll_ptr->next = NULL;
+	cur_allele_name_start = ll_ptr->ss;
       }
       memcpyx(cur_allele_name_start, allele_name, an_len, '\0');
       marker_allele_cts[0] = 1;
@@ -5342,13 +5301,12 @@ int32_t incr_text_allele_str(uintptr_t* topsize_ptr, char* allele_name, uint32_t
 	  cur_allele_name_start[slen] = '\t';
 	  memcpyx(&(cur_allele_name_start[slen + 1]), allele_name, an_len, '\0');
 	} else {
-	  llptr = top_alloc_llstr(topsize_ptr, an_len + 1);
-	  if (!llptr) {
+	  if (bigstack_end_alloc_llstr(an_len + 1, &ll_ptr)) {
 	    return RET_NOMEM;
 	  }
-	  allele_list_start->next = llptr;
-	  llptr->next = NULL;
-	  cur_allele_name_start = llptr->ss;
+	  allele_list_start->next = ll_ptr;
+	  ll_ptr->next = NULL;
+	  cur_allele_name_start = ll_ptr->ss;
 	  memcpyx(cur_allele_name_start, allele_name, an_len, '\0');
 	}
 	marker_allele_cts[allele_num] = 1;
@@ -5359,14 +5317,14 @@ int32_t incr_text_allele_str(uintptr_t* topsize_ptr, char* allele_name, uint32_t
   return RET_INVALID_FORMAT;
 }
 
-char* get_llstr(Ll_str* llptr, uint32_t allele_idx) {
-  char* cptr = llptr->ss;
+char* get_llstr(Ll_str* ll_ptr, uint32_t allele_idx) {
+  char* cptr = ll_ptr->ss;
   if (*cptr == '\0') {
-    llptr = llptr->next;
-    if (!llptr) {
+    ll_ptr = ll_ptr->next;
+    if (!ll_ptr) {
       return NULL;
     }
-    cptr = llptr->ss;
+    cptr = ll_ptr->ss;
   }
   while (allele_idx) {
     cptr = token_endnn(cptr);
@@ -5374,11 +5332,11 @@ char* get_llstr(Ll_str* llptr, uint32_t allele_idx) {
     if (*cptr) {
       cptr++;
     } else {
-      llptr = llptr->next;
-      if (!llptr) {
+      ll_ptr = ll_ptr->next;
+      if (!ll_ptr) {
 	return NULL;
       }
-      cptr = llptr->ss;
+      cptr = ll_ptr->ss;
     }
   }
   return cptr;
@@ -5401,9 +5359,9 @@ static inline char* write_token(char* read_ptr, FILE* outfile) {
 
 int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char* outname, char* outname_end, FILE** mapfile_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_alleles_f, uint32_t map_is_unsorted, uint32_t fam_cols, uint32_t ped_col_skip_iid, uint32_t ped_col_skip, uint32_t cm_col_exists, uint32_t* map_reverse, int64_t ped_size, char* missing_pheno_str) {
   // maintain allele counts and linked lists of observed alleles at FAR end of
-  // wkspace.
+  // bigstack.
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   int32_t retval = 0;
-  uintptr_t topsize = marker_ct * (4LU * sizeof(int32_t) + 16);
   uint32_t ped_buflen = 0;
   uintptr_t sample_ct = 0;
   uintptr_t line_idx = 0;
@@ -5450,20 +5408,22 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
   uint32_t ii_shift;
   unsigned char* writebuf;
   unsigned char* wbufptr;
-  wkspace_reset(marker_alleles_f);
-  if ((wkspace_left / (4LU * sizeof(int32_t) + 16)) <= marker_ct) {
+  bigstack_reset(marker_alleles_f);
+  loadbuf = (char*)g_bigstack_base;
+  if (bigstack_end_calloc_ui(marker_ct * 4, &marker_allele_cts)) {
     goto ped_to_bed_multichar_allele_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
-  marker_allele_cts = (uint32_t*)(&(wkspace_base[wkspace_left - marker_ct * 4LU * sizeof(int32_t)]));
-  marker_alleles_tmp = (Ll_str_fixed*)(&(wkspace_base[wkspace_left - marker_ct * (4LU * sizeof(int32_t) + 16)]));
-  memset(marker_alleles_tmp, 0, marker_ct * (4LU * sizeof(int32_t) + 16));
+  marker_alleles_tmp = (Ll_str_fixed*)bigstack_end_alloc(marker_ct * sizeof(Ll_str_fixed));
+  if (!marker_alleles_tmp) {
+    goto ped_to_bed_multichar_allele_ret_NOMEM;
+  }
+  memset(marker_alleles_tmp, 0, marker_ct * sizeof(Ll_str_fixed));
 
   if (fclose_null(outfile_ptr)) {
     goto ped_to_bed_multichar_allele_ret_WRITE_FAIL;
   }
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(outfile_ptr, outname, "w")) {
+  if (fopen_checked(outname, "w", outfile_ptr)) {
     goto ped_to_bed_multichar_allele_ret_OPEN_FAIL;
   }
   outfile = *outfile_ptr;
@@ -5471,7 +5431,7 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
   fputs("Rescanning .ped file... 0%", stdout);
   fflush(stdout);
   while (1) {
-    loadbuf_size = wkspace_left - topsize;
+    loadbuf_size = bigstack_left();
     if (loadbuf_size > MAXLINEBUFLEN) {
       loadbuf_size = MAXLINEBUFLEN;
     }
@@ -5484,7 +5444,7 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
     if (!loadbuf[loadbuf_size - 1]) {
       logprint("\n");
       if (loadbuf_size == MAXLINEBUFLEN) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file is pathologically long.\n", line_idx);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ped file is pathologically long.\n", line_idx);
 	goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_2;
       } else {
         goto ped_to_bed_multichar_allele_ret_NOMEM;
@@ -5496,7 +5456,7 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
       ped_buflen = ulii;
     }
     col1_ptr = skip_initial_spaces(loadbuf);
-    if (is_eoln_or_comment(*col1_ptr)) {
+    if (is_eoln_or_comment_kns(*col1_ptr)) {
       goto ped_to_bed_multichar_allele_loop_1_start;
     }
     // check for top-of-stack allocations colliding with load buffer
@@ -5536,16 +5496,14 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
     if (putc_checked('\n', outfile)) {
       goto ped_to_bed_multichar_allele_ret_WRITE_FAIL;
     }
-    wkspace_base += cur_slen_rdup;
-    wkspace_left -= cur_slen_rdup;
+    g_bigstack_base += cur_slen_rdup;
     for (marker_uidx = 0, marker_idx = 0; marker_uidx < unfiltered_marker_ct; marker_uidx++) {
       alen1 = strlen_se(bufptr);
       aptr1 = bufptr;
       bufptr = skip_initial_spaces(&(bufptr[alen1]));
       alen2 = strlen_se(bufptr);
       if (!alen2) {
-	wkspace_base -= cur_slen_rdup;
-	wkspace_left += cur_slen_rdup;
+	g_bigstack_base -= cur_slen_rdup;
 	goto ped_to_bed_multichar_allele_ret_MISSING_TOKENS;
       }
       aptr2 = bufptr;
@@ -5563,21 +5521,20 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
 	goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_4;
       }
       uii = map_is_unsorted? map_reverse[marker_idx] : marker_idx;
-      retval = incr_text_allele_str(&topsize, aptr1, alen1, (Ll_str*)(&(marker_alleles_tmp[uii])), &(marker_allele_cts[4 * uii]));
+      retval = incr_text_allele_str(aptr1, alen1, (Ll_str*)(&(marker_alleles_tmp[uii])), &(marker_allele_cts[4 * uii]));
       if (retval) {
 	goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_6;
       }
-      retval = incr_text_allele_str(&topsize, aptr2, alen2, (Ll_str*)(&(marker_alleles_tmp[uii])), &(marker_allele_cts[4 * uii]));
+      retval = incr_text_allele_str(aptr2, alen2, (Ll_str*)(&(marker_alleles_tmp[uii])), &(marker_allele_cts[4 * uii]));
       if (retval) {
 	goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_6;
       }
       marker_idx++;
     }
-    wkspace_base -= cur_slen_rdup;
-    wkspace_left += cur_slen_rdup;
+    g_bigstack_base -= cur_slen_rdup;
     if (!is_eoln_kns(*bufptr)) {
       logprint("\n");
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file has more tokens than expected.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ped file has more tokens than expected.\n", line_idx);
       goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_2;
     }
     sample_ct++;
@@ -5600,18 +5557,18 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
   if (fclose_null(outfile_ptr)) {
     goto ped_to_bed_multichar_allele_ret_WRITE_FAIL;
   }
-  if (marker_ct * 2 * sizeof(intptr_t) + topsize > wkspace_left) {
+  marker_allele_ptrs = (char**)bigstack_alloc(marker_ct * 2 * sizeof(intptr_t));
+  if (!marker_allele_ptrs) {
     goto ped_to_bed_multichar_allele_ret_NOMEM;
   }
-  marker_allele_ptrs = (char**)wkspace_alloc(marker_ct * 2 * sizeof(intptr_t));
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(outfile_ptr, outname, "w")) {
+  if (fopen_checked(outname, "w", outfile_ptr)) {
     goto ped_to_bed_multichar_allele_ret_OPEN_FAIL;
   }
   outfile = *outfile_ptr;
   if (map_is_unsorted) {
     memcpy(outname_end, ".map.tmp", 9);
-    if (fopen_checked(mapfile_ptr, outname, "r")) {
+    if (fopen_checked(outname, "r", mapfile_ptr)) {
       goto ped_to_bed_multichar_allele_ret_OPEN_FAIL;
     }
   } else {
@@ -5620,23 +5577,23 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
   marker_uidx = 0;
   for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
     if (map_is_unsorted) {
-      if (!fgets(tbuf, MAXLINELEN, *mapfile_ptr)) {
+      if (!fgets(g_textbuf, MAXLINELEN, *mapfile_ptr)) {
 	goto ped_to_bed_multichar_allele_ret_READ_FAIL;
       }
     } else {
-      if (get_next_noncomment_excl(*mapfile_ptr, &bufptr, &line_idx, marker_exclude, &marker_uidx)) {
+      if (get_next_noncomment_excl(marker_exclude, *mapfile_ptr, &bufptr, &line_idx, &marker_uidx)) {
 	goto ped_to_bed_multichar_allele_ret_READ_FAIL;
       }
     }
     if (marker_allele_cts[4 * marker_idx + 2]) {
       uii = marker_allele_cts[4 * marker_idx + 3];
       if (map_is_unsorted) {
-        sprintf(logbuf, "Warning: Variant %u (post-sort/filter) %sallelic; setting rarest missing.\n", map_reverse[marker_idx] + 1, (uii? "quad" : "tri"));
+        sprintf(g_logbuf, "Warning: Variant %u (post-sort/filter) %sallelic; setting rarest missing.\n", map_reverse[marker_idx] + 1, (uii? "quad" : "tri"));
       } else {
-        sprintf(logbuf, "Warning: Variant %" PRIuPTR " %sallelic; setting rarest alleles missing.\n", marker_idx + 1, (uii? "quad" : "tri"));
+        sprintf(g_logbuf, "Warning: Variant %" PRIuPTR " %sallelic; setting rarest alleles missing.\n", marker_idx + 1, (uii? "quad" : "tri"));
       }
       logerrprintb();
-      get_top_two(&(marker_allele_cts[4 * marker_idx]), uii? 4 : 3, &ulii, &uljj);
+      get_top_two_ui(&(marker_allele_cts[4 * marker_idx]), uii? 4 : 3, &ulii, &uljj);
       uii = map_is_unsorted? map_reverse[marker_idx] : marker_idx;
     } else {
       ulii = (marker_allele_cts[4 * marker_idx] < marker_allele_cts[4 * marker_idx + 1])? 1 : 0;
@@ -5665,8 +5622,8 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
     marker_allele_ptrs[2 * marker_idx] = aptr1;
     marker_allele_ptrs[2 * marker_idx + 1] = aptr2;
     if (map_is_unsorted) {
-      bufptr = (char*)memchr(tbuf, '\n', MAXLINELEN);
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      bufptr = (char*)memchr(g_textbuf, '\n', MAXLINELEN);
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
         goto ped_to_bed_multichar_allele_ret_WRITE_FAIL;
       }
     } else {
@@ -5708,30 +5665,30 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
     unlink(outname);
   }
   fclose_null(outfile_ptr);
-  if (wkspace_alloc_c_checked(&loadbuf, ped_buflen)) {
+  if (bigstack_alloc_c(ped_buflen, &loadbuf)) {
     goto ped_to_bed_multichar_allele_ret_NOMEM;
   }
-  if (wkspace_left >= marker_ct * sample_ct4) {
+  if (bigstack_left() >= marker_ct * sample_ct4) {
     markers_per_pass = marker_ct;
-    sprintf(logbuf, "Performing single-pass .bed write (%" PRIuPTR " variant%s, %" PRIuPTR " %s).\n", marker_ct, (marker_ct == 1)? "" : "s", sample_ct, species_str(sample_ct));
+    sprintf(g_logbuf, "Performing single-pass .bed write (%" PRIuPTR " variant%s, %" PRIuPTR " %s).\n", marker_ct, (marker_ct == 1)? "" : "s", sample_ct, species_str(sample_ct));
     pass_ct = 1;
   } else {
     if (!map_is_unsorted) {
-      if (wkspace_alloc_ll_checked(&line_starts, sample_ct * sizeof(int64_t))) {
+      if (bigstack_alloc_ll(sample_ct, &line_starts)) {
 	goto ped_to_bed_multichar_allele_ret_NOMEM;
       }
     }
-    markers_per_pass = wkspace_left / sample_ct4;
+    markers_per_pass = bigstack_left() / sample_ct4;
     if (!markers_per_pass) {
       goto ped_to_bed_multichar_allele_ret_NOMEM;
     }
     pass_ct = (marker_ct + markers_per_pass - 1) / markers_per_pass;
-    sprintf(logbuf, "Performing %u-pass .bed write (%u/%" PRIuPTR " variant%s/pass, %" PRIuPTR " %s).\n", pass_ct, markers_per_pass, marker_ct, (markers_per_pass == 1)? "" : "s", sample_ct, species_str(sample_ct));
+    sprintf(g_logbuf, "Performing %u-pass .bed write (%u/%" PRIuPTR " variant%s/pass, %" PRIuPTR " %s).\n", pass_ct, markers_per_pass, marker_ct, (markers_per_pass == 1)? "" : "s", sample_ct, species_str(sample_ct));
   }
   logprintb();
-  writebuf = wkspace_base;
+  writebuf = g_bigstack_base;
   memcpy(outname_end, ".bed", 5);
-  if (fopen_checked(outfile_ptr, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, outfile_ptr)) {
     goto ped_to_bed_multichar_allele_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, *outfile_ptr)) {
@@ -5764,7 +5721,7 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
 	      goto ped_to_bed_multichar_allele_ret_READ_FAIL;
 	    }
 	    col1_ptr = skip_initial_spaces(loadbuf);
-	  } while (is_eoln_or_comment(*col1_ptr));
+	  } while (is_eoln_or_comment_kns(*col1_ptr));
 	  bufptr = next_token_mult(col1_ptr, ped_col_skip);
 	} else {
 	  ped_next_thresh = line_starts[sample_idx];
@@ -5891,35 +5848,34 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
     retval = RET_WRITE_FAIL;
     break;
   ped_to_bed_multichar_allele_ret_INVALID_FORMAT_6:
-    wkspace_base -= cur_slen_rdup;
-    wkspace_left += cur_slen_rdup;
+    g_bigstack_base -= cur_slen_rdup;
     logprint("\n");
     if (retval != RET_NOMEM) {
       LOGERRPRINTF("Error: More than 4 different alleles at variant %u%s.\n", uii + 1, map_is_unsorted? " (post-sort/filter)" : "");
     }
     break;
   ped_to_bed_multichar_allele_ret_INVALID_FORMAT_4:
-    wkspace_base -= cur_slen_rdup;
-    wkspace_left += cur_slen_rdup;
+    g_bigstack_base -= cur_slen_rdup;
     logprint("\n");
     LOGERRPRINTF("Error: Half-missing call in .ped file at variant %" PRIuPTR ", line %" PRIuPTR ".\n", marker_uidx + 1, line_idx);
     retval = RET_INVALID_FORMAT;
     break;
   ped_to_bed_multichar_allele_ret_MISSING_TOKENS:
     logprint("\n");
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ped file has fewer tokens than expected.\n", line_idx);
   ped_to_bed_multichar_allele_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
     break;
   }
+  bigstack_end_reset(bigstack_end_mark);
   // no marker_allele_ptrs free since all strings were allocated on top of
   // stack
   return retval;
 }
 
 int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_end, uint32_t fam_cols, uint64_t misc_flags, int32_t missing_pheno, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* mapfile = NULL;
   FILE* pedfile = NULL;
   FILE* outfile = NULL;
@@ -5971,6 +5927,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
   int32_t jj;
   char* loadbuf;
   uintptr_t loadbuf_size;
+  uintptr_t unfiltered_marker_ct_limit;
   char* col1_ptr;
   char* col2_ptr;
   char* bufptr;
@@ -5983,8 +5940,8 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
   unsigned char* wbufptr;
   int64_t ped_size;
   int64_t ped_next_thresh;
-  int32_writex(missing_pheno_str, missing_pheno, '\0');
-  marker_exclude = (uintptr_t*)wkspace_base;
+  int32toa_x(missing_pheno, '\0', missing_pheno_str);
+  marker_exclude = (uintptr_t*)g_bigstack_base;
   marker_exclude[0] = 0;
   // don't use fopen_checked() here, since we want to customize the error
   // message.
@@ -5994,12 +5951,12 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     if ((uii > 8) && ((!memcmp(&(mapname[uii - 8]), ".ped.map", 8)) || (!memcmp(&(mapname[uii - 8]), ".map.map", 8)))) {
       LOGERRPRINTFWW("Error: Failed to open %s. (--file expects a filename *prefix*; '.ped' and '.map' are automatically appended.)\n", mapname);
     } else {
-      LOGERRPRINTFWW(errstr_fopen, mapname);
+      LOGERRPRINTFWW(g_errstr_fopen, mapname);
     }
     goto ped_to_bed_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 6] = ' ';
-  if (check_cm_col(mapfile, tbuf, 0, allow_no_variants, MAXLINELEN - 5, &cm_col_exists, &line_idx)) {
+  g_textbuf[MAXLINELEN - 6] = ' ';
+  if (check_cm_col(mapfile, g_textbuf, 0, allow_no_variants, MAXLINELEN - 5, &cm_col_exists, &line_idx)) {
     if (line_idx) {
       goto ped_to_bed_ret_MISSING_TOKENS_MAP;
     } else {
@@ -6012,14 +5969,20 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     goto ped_to_bed_empty_map_with_allow_no_vars;
   }
   line_idx--;
+  unfiltered_marker_ct_limit = bigstack_left();
+  if (unfiltered_marker_ct_limit > 0xfffffff) {
+    unfiltered_marker_ct_limit = 0x80000000U;
+  } else {
+    unfiltered_marker_ct_limit *= 8;
+  }
   do {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 6]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of .map file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 6]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .map file is pathologically long.\n", line_idx);
       goto ped_to_bed_ret_INVALID_FORMAT_2;
     }
-    col1_ptr = skip_initial_spaces(tbuf);
-    if (is_eoln_or_comment(*col1_ptr)) {
+    col1_ptr = skip_initial_spaces(g_textbuf);
+    if (is_eoln_or_comment_kns(*col1_ptr)) {
       continue;
     }
     col2_ptr = next_token(col1_ptr);
@@ -6033,17 +5996,17 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
       if (chrom_error(".map file", chrom_info_ptr, col1_ptr, line_idx, ii, allow_extra_chroms)) {
 	goto ped_to_bed_ret_INVALID_FORMAT;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, col1_ptr, &ii, line_idx, ".map file");
+      retval = resolve_or_add_chrom_name(col1_ptr, ".map file", line_idx, chrom_info_ptr, &ii);
       if (retval) {
 	goto ped_to_bed_ret_1;
       }
     }
     if (!is_set(chrom_info_ptr->chrom_mask, ii)) {
-      SET_BIT(marker_exclude, unfiltered_marker_ct);
+      SET_BIT(unfiltered_marker_ct, marker_exclude);
       marker_exclude_ct++;
     } else {
       if (scan_int_abs_defcap(bufptr, &jj)) {
-	sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .map file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of .map file.\n", line_idx);
 	goto ped_to_bed_ret_INVALID_FORMAT_2;
       }
       if (jj >= 0) {
@@ -6059,7 +6022,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
 	  max_marker_id_len = uii;
 	}
       } else {
-	SET_BIT(marker_exclude, unfiltered_marker_ct);
+	SET_BIT(unfiltered_marker_ct, marker_exclude);
 	marker_exclude_ct++;
       }
     }
@@ -6069,12 +6032,12 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
       goto ped_to_bed_ret_INVALID_FORMAT;
     }
     if (!(unfiltered_marker_ct & (BITCT - 1))) {
-      if ((unfiltered_marker_ct / 8) == wkspace_left) {
+      if (unfiltered_marker_ct == unfiltered_marker_ct_limit) {
 	goto ped_to_bed_ret_NOMEM;
       }
       marker_exclude[unfiltered_marker_ct / BITCT] = 0;
     }
-  } while (fgets(tbuf, MAXLINELEN - 5, mapfile));
+  } while (fgets(g_textbuf, MAXLINELEN - 5, mapfile));
   if (!feof(mapfile)) {
     goto ped_to_bed_ret_READ_FAIL;
   }
@@ -6084,7 +6047,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     goto ped_to_bed_ret_ALL_MARKERS_EXCLUDED;
   }
  ped_to_bed_empty_map_with_allow_no_vars:
-  marker_exclude = (uintptr_t*)wkspace_alloc(((unfiltered_marker_ct + (BITCT - 1)) / BITCT) * sizeof(intptr_t));
+  bigstack_alloc_ul(BITCT_TO_WORDCT(unfiltered_marker_ct), &marker_exclude);
 
   if (map_is_unsorted) {
     retval = load_sort_and_write_map(&map_reverse, mapfile, 3 + cm_col_exists, outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, max_marker_id_len, 1, chrom_info_ptr);
@@ -6096,23 +6059,22 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
   }
   // provisionally assume max_marker_allele_len == 1
   // bugfix: allocate this after map_reverse
-  if (wkspace_alloc_c_checked(&marker_alleles_f, marker_ct * 2) ||
-      wkspace_alloc_c_checked(&marker_alleles, marker_ct * 4) ||
-      wkspace_alloc_ui_checked(&marker_allele_cts, marker_ct * 4 * sizeof(int32_t))) {
+  if (bigstack_alloc_c(marker_ct * 2, &marker_alleles_f) ||
+      bigstack_calloc_c(marker_ct * 4, &marker_alleles) ||
+      bigstack_alloc_ui(marker_ct * 4, &marker_allele_cts)) {
     goto ped_to_bed_ret_NOMEM;
   }
-  memset(marker_alleles, 0, marker_ct * 4);
 
   // first .ped scan: count samples, write .fam, note alleles at each locus
-  if (fopen_checked(&pedfile, pedname, "rb")) {
+  if (fopen_checked(pedname, FOPEN_RB, &pedfile)) {
     goto ped_to_bed_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto ped_to_bed_ret_OPEN_FAIL;
   }
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
+  loadbuf = (char*)g_bigstack_base;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
@@ -6135,14 +6097,14 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
 	logprint("\n");
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ped file is pathologically long.\n", line_idx);
 	goto ped_to_bed_ret_INVALID_FORMAT_2;
       } else {
         goto ped_to_bed_ret_NOMEM;
       }
     }
     col1_ptr = skip_initial_spaces(loadbuf);
-    if (is_eoln_or_comment(*col1_ptr)) {
+    if (is_eoln_or_comment_kns(*col1_ptr)) {
       ulii = strlen(loadbuf) + 1;
       if (ulii > ped_buflen) {
 	ped_buflen = ulii;
@@ -6161,7 +6123,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     bufptr = token_endnn(bufptr);
     if ((bufptr - col1_ptr) > (MAXLINELEN / 2) - 4) {
       logprint("\n");
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file has a pathologically long token.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ped file has a pathologically long token.\n", line_idx);
       goto ped_to_bed_ret_INVALID_FORMAT_2;
     }
     if (fwrite_checked(col1_ptr, strlen_se(col1_ptr), outfile)) {
@@ -6229,7 +6191,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
       putchar('\r');
       logstr("\n");
       if (!marker_ct) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file has more tokens than expected.\n", line_idx);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ped file has more tokens than expected.\n", line_idx);
         goto ped_to_bed_ret_INVALID_FORMAT_2;
       }
       logprint("Possibly irregular .ped line.  Restarting scan, assuming multichar alleles.\n");
@@ -6257,19 +6219,19 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     }
     if ((!sample_ct) && (!allow_no_samples)) {
       logprint("\n");
-      sprintf(logbuf, "Error: No %s in .ped file.\n", g_species_plural);
+      sprintf(g_logbuf, "Error: No %s in .ped file.\n", g_species_plural);
       goto ped_to_bed_ret_INVALID_FORMAT_2;
     }
     if (fclose_null(&outfile)) {
       goto ped_to_bed_ret_WRITE_FAIL;
     }
     memcpy(outname_end, ".bim", 5);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto ped_to_bed_ret_OPEN_FAIL;
     }
     if (map_is_unsorted) {
       memcpy(outname_end, ".map.tmp", 9);
-      if (fopen_checked(&mapfile, outname, "r")) {
+      if (fopen_checked(outname, "r", &mapfile)) {
 	goto ped_to_bed_ret_OPEN_FAIL;
       }
     } else {
@@ -6281,20 +6243,20 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     line_idx = 0;
     for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
       if (map_is_unsorted) {
-	if (!fgets(tbuf, MAXLINELEN, mapfile)) {
+	if (!fgets(g_textbuf, MAXLINELEN, mapfile)) {
 	  goto ped_to_bed_ret_READ_FAIL;
 	}
       } else {
-	if (get_next_noncomment_excl(mapfile, &bufptr, &line_idx, marker_exclude, &marker_uidx)) {
+	if (get_next_noncomment_excl(marker_exclude, mapfile, &bufptr, &line_idx, &marker_uidx)) {
 	  goto ped_to_bed_ret_READ_FAIL;
 	}
       }
       if (marker_alleles[marker_idx * 4 + 2]) {
 	cc = marker_alleles[marker_idx * 4 + 3];
 	if (map_is_unsorted) {
-	  sprintf(logbuf, "Warning: Variant %u (post-sort/filter) %sallelic; setting rarest missing.\n", map_reverse[marker_idx] + 1, (cc? "quad" : "tri"));
+	  sprintf(g_logbuf, "Warning: Variant %u (post-sort/filter) %sallelic; setting rarest missing.\n", map_reverse[marker_idx] + 1, (cc? "quad" : "tri"));
 	} else {
-	  sprintf(logbuf, "Warning: Variant %" PRIuPTR " %sallelic; setting rarest alleles missing.\n", marker_idx + 1, (cc? "quad" : "tri"));
+	  sprintf(g_logbuf, "Warning: Variant %" PRIuPTR " %sallelic; setting rarest alleles missing.\n", marker_idx + 1, (cc? "quad" : "tri"));
 	}
 	logerrprintb();
 	ujj = (cc? 4 : 3);
@@ -6333,8 +6295,8 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
 	cc2 = '0';
       }
       if (map_is_unsorted) {
-	bufptr = (char*)memchr(tbuf, '\n', MAXLINELEN);
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	bufptr = (char*)memchr(g_textbuf, '\n', MAXLINELEN);
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto ped_to_bed_ret_WRITE_FAIL;
 	}
       } else {
@@ -6364,36 +6326,36 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
       marker_uidx++;
     }
     sample_ct4 = (sample_ct + 3) / 4;
-    wkspace_reset(marker_alleles);
+    bigstack_reset(marker_alleles);
     fclose_null(&mapfile);
     if (map_is_unsorted) {
       unlink(outname);
     }
     fclose_null(&outfile);
-    if (wkspace_alloc_c_checked(&loadbuf, ped_buflen)) {
+    if (bigstack_alloc_c(ped_buflen, &loadbuf)) {
       goto ped_to_bed_ret_NOMEM;
     }
-    if (wkspace_left >= marker_ct * sample_ct4) {
+    if (bigstack_left() >= marker_ct * sample_ct4) {
       markers_per_pass = marker_ct;
-      sprintf(logbuf, "Performing single-pass .bed write (%" PRIuPTR " variant%s, %" PRIuPTR " %s).\n", marker_ct, (marker_ct == 1)? "" : "s", sample_ct, species_str(sample_ct));
+      sprintf(g_logbuf, "Performing single-pass .bed write (%" PRIuPTR " variant%s, %" PRIuPTR " %s).\n", marker_ct, (marker_ct == 1)? "" : "s", sample_ct, species_str(sample_ct));
       pass_ct = (marker_ct * sample_ct4)? 1 : 0;
     } else {
       if (!map_is_unsorted) {
-	if (wkspace_alloc_ll_checked(&line_starts, sample_ct * sizeof(int64_t))) {
+	if (bigstack_alloc_ll(sample_ct, &line_starts)) {
 	  goto ped_to_bed_ret_NOMEM;
 	}
       }
-      markers_per_pass = wkspace_left / sample_ct4;
+      markers_per_pass = bigstack_left() / sample_ct4;
       if (!markers_per_pass) {
 	goto ped_to_bed_ret_NOMEM;
       }
       pass_ct = (marker_ct + markers_per_pass - 1) / markers_per_pass;
-      sprintf(logbuf, "Performing %u-pass .bed write (%u/%" PRIuPTR " variant%s/pass, %" PRIuPTR " %s).\n", pass_ct, markers_per_pass, marker_ct, (markers_per_pass == 1)? "" : "s", sample_ct, species_str(sample_ct));
+      sprintf(g_logbuf, "Performing %u-pass .bed write (%u/%" PRIuPTR " variant%s/pass, %" PRIuPTR " %s).\n", pass_ct, markers_per_pass, marker_ct, (markers_per_pass == 1)? "" : "s", sample_ct, species_str(sample_ct));
     }
     logprintb();
-    writebuf = wkspace_base;
+    writebuf = g_bigstack_base;
     memcpy(outname_end, ".bed", 5);
-    if (fopen_checked(&outfile, outname, "wb")) {
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
       goto ped_to_bed_ret_OPEN_FAIL;
     }
     if (fwrite_checked("l\x1b\x01", 3, outfile)) {
@@ -6426,7 +6388,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
 		goto ped_to_bed_ret_READ_FAIL;
 	      }
 	      col1_ptr = skip_initial_spaces(loadbuf);
-	    } while (is_eoln_or_comment(*col1_ptr));
+	    } while (is_eoln_or_comment_kns(*col1_ptr));
 	    bufptr = next_token_mult(col1_ptr, ped_col_skip);
 	  } else {
 	    ped_next_thresh = line_starts[sample_idx];
@@ -6559,7 +6521,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
     break;
   ped_to_bed_ret_MISSING_TOKENS_PED:
     logprint("\n");
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ped file has fewer tokens than expected.\n", line_idx);
   ped_to_bed_ret_INVALID_FORMAT_2:
     logerrprintb();
   ped_to_bed_ret_INVALID_FORMAT:
@@ -6573,12 +6535,12 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
   fclose_cond(pedfile);
   fclose_cond(mapfile);
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, uint32_t lgen_modifier, char* lgen_reference_fname, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   FILE* outfile = NULL;
   uint32_t lgen_allele_count = lgen_modifier & LGEN_ALLELE_COUNT;
@@ -6657,7 +6619,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
     goto lgen_to_bed_ret_1;
   }
   marker_ct = unfiltered_marker_ct - marker_exclude_ct;
-  retval = sort_item_ids(&sorted_marker_ids, &marker_id_map, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref);
+  retval = sort_item_ids(unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, &sorted_marker_ids, &marker_id_map);
   if (retval) {
     goto lgen_to_bed_ret_1;
   }
@@ -6672,7 +6634,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
     }
   }
   // collapse
-  if (wkspace_alloc_ui_checked(&sample_id_map, unfiltered_marker_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(unfiltered_marker_ct, &sample_id_map)) {
     goto lgen_to_bed_ret_NOMEM;
   }
   if (marker_ct) {
@@ -6683,7 +6645,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   }
   fclose_null(&infile);
   memcpy(marker_ids, sorted_marker_ids, marker_ct * max_marker_id_len);
-  wkspace_reset(sorted_marker_ids);
+  bigstack_reset(sorted_marker_ids);
 
   retval = load_fam(famname, FAM_COL_13456, 1, missing_pheno, affection_01, &sample_ct, &sample_ids, &max_sample_id_len, &paternal_ids, &max_paternal_id_len, &maternal_ids, &max_maternal_id_len, &sex_nm, &sex_male, &affection, &pheno_nm, &pheno_c, &pheno_d, &founder_info, &sample_exclude, allow_no_samples);
   if (retval) {
@@ -6693,16 +6655,16 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   if (retval) {
     goto lgen_to_bed_ret_1;
   }
-  if (wkspace_alloc_c_checked(&id_buf, MAXV(max_marker_id_len, max_sample_id_len))) {
+  if (bigstack_alloc_c(MAXV(max_marker_id_len, max_sample_id_len), &id_buf)) {
     goto lgen_to_bed_ret_NOMEM;
   }
-  marker_allele_ptrs = (char**)wkspace_alloc(2 * marker_ct * sizeof(char*));
+  marker_allele_ptrs = (char**)bigstack_alloc(2 * marker_ct * sizeof(char*));
   if (!marker_allele_ptrs) {
     goto lgen_to_bed_ret_NOMEM;
   }
   memset(marker_allele_ptrs, 0, 2 * marker_ct * sizeof(char*));
   sample_ct4 = (sample_ct + 3) / 4;
-  if (wkspace_alloc_uc_checked(&writebuf, ((uintptr_t)marker_ct) * sample_ct4)) {
+  if (bigstack_alloc_uc(((uintptr_t)marker_ct) * sample_ct4, &writebuf)) {
     logerrprint("Error: Multipass .lgen -> .bed autoconversions are not yet supported.  Try\nusing --chr and/or --memory (perhaps with a better machine).\n");
     goto lgen_to_bed_ret_CALC_NOT_YET_SUPPORTED;
   }
@@ -6715,16 +6677,16 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   } else {
     memset(writebuf, 0x55, marker_ct * sample_ct4);
   }
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto lgen_to_bed_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   if (lgen_modifier & LGEN_REFERENCE) {
-    if (fopen_checked(&infile, lgen_reference_fname, "r")) {
+    if (fopen_checked(lgen_reference_fname, "r", &infile)) {
       goto lgen_to_bed_ret_OPEN_FAIL;
     }
     line_idx = 0;
@@ -6732,7 +6694,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
       line_idx++;
       if (!loadbuf[loadbuf_size - 1]) {
 	if (loadbuf_size == MAXLINEBUFLEN) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of .ref file is pathologically long.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ref file is pathologically long.\n", line_idx);
 	  goto lgen_to_bed_ret_INVALID_FORMAT_2;
 	}
 	goto lgen_to_bed_ret_NOMEM;
@@ -6744,7 +6706,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
       cptr2 = token_endnn(cptr);
       a1ptr = skip_initial_spaces(cptr2);
       if (no_more_tokens_kns(a1ptr)) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of .ref file has fewer tokens than expected.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .ref file has fewer tokens than expected.\n", line_idx);
 	goto lgen_to_bed_ret_INVALID_FORMAT_2;
       }
       a1len = strlen_se(cptr);
@@ -6760,21 +6722,21 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
 	a2ptr = skip_initial_spaces(sptr);
 	a1len = (uintptr_t)(sptr - a1ptr);
 	a1ptr[a1len] = '\0';
-	if (allele_set(&(marker_allele_ptrs[2 * marker_idx + 1]), a1ptr, a1len)) {
+	if (allele_set(a1ptr, a1len, &(marker_allele_ptrs[2 * marker_idx + 1]))) {
 	  goto lgen_to_bed_ret_NOMEM;
 	}
 	if (no_more_tokens_kns(a2ptr)) {
 	  if (lgen_allele_count) {
 	    a1ptr[a1len++] = 'v';
 	    a1ptr[a1len] = '\0';
-	    if (allele_set(&(marker_allele_ptrs[2 * marker_idx]), a1ptr, a1len)) {
+	    if (allele_set(a1ptr, a1len, &(marker_allele_ptrs[2 * marker_idx]))) {
 	      goto lgen_to_bed_ret_NOMEM;
 	    }
 	  }
 	} else {
 	  a2len = strlen_se(a2ptr);
 	  a2ptr[a2len] = '\0';
-	  if (allele_set(&(marker_allele_ptrs[2 * marker_idx]), a2ptr, a2len)) {
+	  if (allele_set(a2ptr, a2len, &(marker_allele_ptrs[2 * marker_idx]))) {
 	    goto lgen_to_bed_ret_NOMEM;
 	  }
 	}
@@ -6794,13 +6756,13 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   // Thus we just use the obvious one-pass load, and save proper handling of
   // triallelic sites, etc. for the future .pgen engine.
   memcpy(outname_end, ".bed", 5);
-  if (fopen_checked(&outfile, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
     goto lgen_to_bed_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, outfile)) {
     goto lgen_to_bed_ret_WRITE_FAIL;
   }
-  if (fopen_checked(&infile, lgenname, "r")) {
+  if (fopen_checked(lgenname, "r", &infile)) {
     goto lgen_to_bed_ret_OPEN_FAIL;
   }
   if (fseeko(infile, 0, SEEK_END)) {
@@ -6827,39 +6789,36 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
       if (is_eoln_kns(*cptr)) {
 	continue;
       }
-      if (bsearch_read_fam_indiv(id_buf, sorted_sample_ids, max_sample_id_len, sample_ct, cptr, &cptr3, &ii)) {
+      if (bsearch_read_fam_indiv(cptr, sorted_sample_ids, max_sample_id_len, sample_ct, &cptr3, &ii, id_buf) || is_eoln_kns(*cptr3)) {
 	goto lgen_to_bed_ret_MISSING_TOKENS;
       }
       if (ii == -1) {
 	goto lgen_to_bed_ret_MISSING_IID;
       }
       sample_idx = sample_id_map[(uint32_t)ii];
-      cptr4 = token_end(cptr3);
-      if (!cptr4) {
+      cptr4 = token_endnn(cptr3);
+      a1ptr = skip_initial_spaces(cptr4);
+      if (is_eoln_kns(*a1ptr)) {
 	goto lgen_to_bed_ret_MISSING_TOKENS;
       }
-      a1ptr = skip_initial_spaces(cptr4);
-      sptr = token_end(a1ptr);
-      a2ptr = next_token(sptr);
+      sptr = token_endnn(a1ptr);
+      a2ptr = skip_initial_spaces(sptr);
       if (compound_genotypes == 1) {
-	if (no_more_tokens_kns(a2ptr)) {
+	if (is_eoln_kns(*a2ptr)) {
 	  compound_genotypes = 2;
 	} else {
 	  compound_genotypes = 0;
 	}
       }
       if (!compound_genotypes) {
-	if (no_more_tokens_kns(a2ptr)) {
+	if (is_eoln_kns(*a2ptr)) {
 	  goto lgen_to_bed_ret_MISSING_TOKENS;
 	}
         a1len = (uintptr_t)(sptr - a1ptr);
 	a2len = strlen_se(a2ptr);
       } else {
-	if (!sptr) {
-	  goto lgen_to_bed_ret_MISSING_TOKENS;
-	}
 	if ((uintptr_t)(sptr - a1ptr) != 2) {
-	  sprintf(logbuf, "Error: Invalid compound genotype on line %" PRIuPTR " of .lgen file.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Invalid compound genotype on line %" PRIuPTR " of .lgen file.\n", line_idx);
 	  goto lgen_to_bed_ret_INVALID_FORMAT_2;
 	}
 	a1len = 1;
@@ -6883,14 +6842,14 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
 	  goto lgen_to_bed_ret_HALF_MISSING;
         } else {
           if (!sptr) {
-	    if (allele_set(&(marker_allele_ptrs[2 * marker_idx + 1]), a1ptr, a1len)) {
+	    if (allele_set(a1ptr, a1len, &(marker_allele_ptrs[2 * marker_idx + 1]))) {
 	      goto lgen_to_bed_ret_NOMEM;
 	    }
 	    if (!strcmp(a1ptr, a2ptr)) {
 	      uii = 2;
 	    } else {
 	      uii = 1;
-	      if (allele_set(&(marker_allele_ptrs[2 * marker_idx]), a2ptr, a2len)) {
+	      if (allele_set(a2ptr, a2len, &(marker_allele_ptrs[2 * marker_idx]))) {
 		goto lgen_to_bed_ret_NOMEM;
 	      }
 	    }
@@ -6902,12 +6861,12 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
 		  uii = 2;
 		} else {
 		  uii = 1;
-		  if (allele_set(&(marker_allele_ptrs[2 * marker_idx]), a2ptr, a2len)) {
+		  if (allele_set(a2ptr, a2len, &(marker_allele_ptrs[2 * marker_idx]))) {
 		    goto lgen_to_bed_ret_NOMEM;
 		  }
 		}
 	      } else {
-		if (allele_set(&(marker_allele_ptrs[2 * marker_idx]), a1ptr, a1len)) {
+		if (allele_set(a1ptr, a1len, &(marker_allele_ptrs[2 * marker_idx]))) {
 		  goto lgen_to_bed_ret_NOMEM;
 		}
 		if (!strcmp(a2ptr, sptr)) {
@@ -6966,26 +6925,24 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
       if (is_eoln_kns(*cptr)) {
 	continue;
       }
-      if (bsearch_read_fam_indiv(id_buf, sorted_sample_ids, max_sample_id_len, sample_ct, cptr, &cptr3, &ii)) {
+      if (bsearch_read_fam_indiv(cptr, sorted_sample_ids, max_sample_id_len, sample_ct, &cptr3, &ii, id_buf) || is_eoln_kns(*cptr3)) {
 	goto lgen_to_bed_ret_MISSING_TOKENS;
       }
       if (ii == -1) {
 	goto lgen_to_bed_ret_MISSING_IID;
       }
       sample_idx = sample_id_map[(uint32_t)ii];
-      cptr4 = token_end(cptr3);
-      if (!cptr4) {
-	goto lgen_to_bed_ret_MISSING_TOKENS;
-      }
+      cptr4 = token_endnn(cptr3);
       a1ptr = skip_initial_spaces(cptr4);
-      if (no_more_tokens_kns(a1ptr)) {
+      ucc = *a1ptr;
+      if (is_eoln_kns(ucc)) {
 	goto lgen_to_bed_ret_MISSING_TOKENS;
       }
       ii = bsearch_str(cptr3, (uintptr_t)(cptr4 - cptr3), marker_ids, max_marker_id_len, marker_ct);
       if (ii != -1) {
 	marker_idx = marker_id_map[(uint32_t)ii];
 	a1len = strlen_se(a1ptr);
-	uii = ((uint32_t)((unsigned char)(*a1ptr))) - 48;
+	uii = ((uint32_t)ucc) - 48;
 	if ((a1len != 1) || (uii > 2)) {
 	  uii = 1;
 	} else if (uii) {
@@ -7021,7 +6978,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   logprint("done.\n");
   for (uii = 0; uii < marker_ct; uii++) {
     if (popcount_chars((uintptr_t*)writebuf, uii * sample_ct4, (uii + 1) * sample_ct4) < sample_ct) {
-      reverse_loadbuf(&(writebuf[uii * sample_ct4]), sample_ct);
+      reverse_loadbuf(sample_ct, &(writebuf[uii * sample_ct4]));
       cptr = marker_allele_ptrs[uii * 2];
       marker_allele_ptrs[uii * 2] = marker_allele_ptrs[uii * 2 + 1];
       marker_allele_ptrs[uii * 2 + 1] = cptr;
@@ -7035,16 +6992,16 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   }
   if (map_is_unsorted) {
     memcpy(outname_end, ".map.tmp", 9);
-    if (fopen_checked(&infile, outname, "r")) {
+    if (fopen_checked(outname, "r", &infile)) {
       goto lgen_to_bed_ret_OPEN_FAIL;
     }
   } else {
-    if (fopen_checked(&infile, mapname, "r")) {
+    if (fopen_checked(mapname, "r", &infile)) {
       goto lgen_to_bed_ret_OPEN_FAIL;
     }
   }
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto lgen_to_bed_ret_OPEN_FAIL;
   }
   uii = 2 * marker_ct;
@@ -7055,15 +7012,15 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   }
   uii = 0;
   marker_idx = 0;
-  while (fgets(tbuf, MAXLINELEN, infile)) {
-    if (is_eoln_or_comment(*(skip_initial_spaces(tbuf)))) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
+    if (is_eoln_or_comment_kns(*(skip_initial_spaces(g_textbuf)))) {
       continue;
     }
     if (IS_SET(marker_exclude, uii)) {
       uii++;
       continue;
     }
-    cptr = (char*)memchr(tbuf, 0, MAXLINELEN);
+    cptr = (char*)memchr(g_textbuf, 0, MAXLINELEN);
     if (cptr[-1] == '\n') {
       cptr--;
       if (cptr[-1] == '\r') {
@@ -7071,7 +7028,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
       }
     }
     *cptr++ = '\t';
-    fwrite(tbuf, 1, cptr - tbuf, outfile);
+    fwrite(g_textbuf, 1, cptr - g_textbuf, outfile);
     fputs(marker_allele_ptrs[marker_idx * 2], outfile);
     putc('\t', outfile);
     fputs(marker_allele_ptrs[marker_idx * 2 + 1], outfile);
@@ -7094,31 +7051,31 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
   }
   memcpy(outname_end, ".fam", 5);
 #ifdef _WIN32
-  uii = GetFullPathName(famname, FNAMESIZE, tbuf, NULL);
+  uii = GetFullPathName(famname, FNAMESIZE, g_textbuf, NULL);
   if ((!uii) || (uii > FNAMESIZE))
 #else
-  if (!realpath(famname, tbuf))
+  if (!realpath(famname, g_textbuf))
 #endif
   {
     LOGERRPRINTFWW("Error: Failed to open %s.\n", outname);
     goto lgen_to_bed_ret_OPEN_FAIL;
   }
 #ifdef _WIN32
-  uii = GetFullPathName(outname, FNAMESIZE, &(tbuf[FNAMESIZE + 64]), NULL);
-  if (!(uii && (uii <= FNAMESIZE) && (!strcmp(tbuf, &(tbuf[FNAMESIZE + 64])))))
+  uii = GetFullPathName(outname, FNAMESIZE, &(g_textbuf[FNAMESIZE + 64]), NULL);
+  if (!(uii && (uii <= FNAMESIZE) && (!strcmp(g_textbuf, &(g_textbuf[FNAMESIZE + 64])))))
 #else
-  cptr = realpath(outname, &(tbuf[FNAMESIZE + 64]));
-  if (!(cptr && (!strcmp(tbuf, &(tbuf[FNAMESIZE + 64])))))
+  cptr = realpath(outname, &(g_textbuf[FNAMESIZE + 64]));
+  if (!(cptr && (!strcmp(g_textbuf, &(g_textbuf[FNAMESIZE + 64])))))
 #endif
   {
-    if (fopen_checked(&infile, famname, "r")) {
+    if (fopen_checked(famname, "r", &infile)) {
       goto lgen_to_bed_ret_OPEN_FAIL;
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto lgen_to_bed_ret_OPEN_FAIL;
     }
-    while (fgets(tbuf, MAXLINELEN, infile)) {
-      cptr = skip_initial_spaces(tbuf);
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
+      cptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*cptr)) {
 	continue;
       }
@@ -7166,7 +7123,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
     retval = RET_INVALID_FORMAT;
     break;
   lgen_to_bed_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .lgen file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .lgen file has fewer tokens than expected.\n", line_idx);
   lgen_to_bed_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
@@ -7190,7 +7147,7 @@ int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname,
       }
     }
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   aligned_free_cond(pheno_c);
   if (infile) {
     fclose(infile);
@@ -7236,12 +7193,12 @@ void transposed_to_bed_print_pct(uint32_t pct) {
 }
 
 int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* outname_end, uint64_t misc_flags, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
   FILE* bimfile = NULL;
   FILE* outfile = NULL;
   char** marker_allele_ptrs = NULL;
-  uintptr_t topsize = 0;
   uintptr_t sample_ct = 0;
   uintptr_t line_idx = 0;
   uint32_t no_extra_cols = 1;
@@ -7307,26 +7264,26 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
   uint32_t cur_chrom;
   uint32_t chrom_ct;
   double* marker_cms;
-  if (wkspace_alloc_ui_checked(&chrom_start, (MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&chrom_id, MAX_POSSIBLE_CHROM * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(MAX_POSSIBLE_CHROM + 1, &chrom_start) ||
+      bigstack_alloc_ui(MAX_POSSIBLE_CHROM, &chrom_id)) {
     goto transposed_to_bed_ret_NOMEM;
   }
 
-  if (fopen_checked(&infile, tfamname, "r")) {
+  if (fopen_checked(tfamname, "r", &infile)) {
     goto transposed_to_bed_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto transposed_to_bed_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of .tfam file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tfam file is pathologically long.\n", line_idx);
       goto transposed_to_bed_ret_INVALID_FORMAT_2R;
     }
-    cptr = skip_initial_spaces(tbuf);
+    cptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*cptr)) {
       continue;
     }
@@ -7343,7 +7300,7 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     goto transposed_to_bed_ret_READ_FAIL;
   }
   if ((!sample_ct) && (!allow_no_samples)) {
-    sprintf(logbuf, "Error: No %s in .tfam file.\n", g_species_plural);
+    sprintf(g_logbuf, "Error: No %s in .tfam file.\n", g_species_plural);
     goto transposed_to_bed_ret_INVALID_FORMAT_2R;
   }
   sample_ct4 = (sample_ct + 3) / 4;
@@ -7351,31 +7308,28 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
   fclose_null(&outfile);
 
   memcpy(outname_end, ".bim.tmp", 9);
-  if (fopen_checked(&bimfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &bimfile)) {
     goto transposed_to_bed_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".bed.tmp", 9);
-  if (fopen_checked(&outfile, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
     goto transposed_to_bed_ret_OPEN_FAIL;
   }
-  if (wkspace_alloc_uc_checked(&writebuf, sample_ct4) ||
-      wkspace_alloc_uc_checked(&prewritebuf, sample_ct)) {
+  if (bigstack_alloc_uc(sample_ct4, &writebuf) ||
+      bigstack_alloc_uc(sample_ct, &prewritebuf)) {
     goto transposed_to_bed_ret_NOMEM;
   }
-  // long allele names are allocated outside workspace anyway, so it makes
-  // sense for max allele length to be related to reserved non-workspace memory
-  allele_buf = (char*)top_alloc(&topsize, NON_WKSPACE_MIN);
-  if (!allele_buf) {
+  if (bigstack_end_alloc_c(NON_BIGSTACK_MIN, &allele_buf)) {
     goto transposed_to_bed_ret_NOMEM;
   }
-  max_markers = (wkspace_left - topsize) / sizeof(int64_t);
-  mapvals = (int64_t*)wkspace_base;
+  max_markers = bigstack_left() / sizeof(int64_t);
+  mapvals = (int64_t*)g_bigstack_base;
   writemap[16] = 1;
   if (fwrite_checked("l\x1b\x01", 3, outfile)) {
     goto transposed_to_bed_ret_WRITE_FAIL;
   }
 
-  if (fopen_checked(&infile, tpedname, "r")) {
+  if (fopen_checked(tpedname, "r", &infile)) {
     goto transposed_to_bed_ret_OPEN_FAIL;
   }
   if (fseeko(infile, 0, SEEK_END)) {
@@ -7391,18 +7345,18 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
   line_idx = 0;
   while (1) {
     line_idx++;
-    tbuf[MAXLINELEN - 1] = ' ';
-    if (!fgets(tbuf, MAXLINELEN, infile)) {
+    g_textbuf[MAXLINELEN - 1] = ' ';
+    if (!fgets(g_textbuf, MAXLINELEN, infile)) {
       break;
     }
     // assume first four fields are within MAXLINELEN characters, but after
     // that, anything goes.  given e.g. 6MB indels in real datasets, there's
     // legitimate reason for a .tped line to be even longer than 2GB, so we use
     // a custom loading loop.
-    cptr = skip_initial_spaces(tbuf);
+    cptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*cptr)) {
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has excessive whitespace.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has excessive whitespace.\n", line_idx);
         goto transposed_to_bed_ret_INVALID_FORMAT_2R;
       }
       continue;
@@ -7411,20 +7365,20 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     cptr3 = next_token_mult(cptr2, 2);
     cptr4 = next_token(cptr3);
     if (no_more_tokens_kns(cptr4)) {
-      if (!tbuf[MAXLINELEN - 1]) {
+      if (!g_textbuf[MAXLINELEN - 1]) {
 	if (strlen_se(cptr) > MAX_ID_LEN) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long\nchromosome/contig name.  (The " PROG_NAME_CAPS " limit is " MAX_ID_LEN_STR " characters.)\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long\nchromosome/contig name.  (The " PROG_NAME_CAPS " limit is " MAX_ID_LEN_STR " characters.)\n", line_idx);
 	} else if (cptr2 && (strlen_se(cptr2) > MAX_ID_LEN)) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long variant ID.\n(The " PROG_NAME_CAPS " limit is " MAX_ID_LEN_STR " characters.)\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long variant ID.\n(The " PROG_NAME_CAPS " limit is " MAX_ID_LEN_STR " characters.)\n", line_idx);
 	} else if (next_token(cptr2) && (strlen_se(next_token(cptr2)) > MAX_ID_LEN)) {
 	  // far higher bound than necessary; main point is to ensure that if
 	  // we fall through to the "excessive whitespace" error message, that
 	  // complaint is justified.
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long centimorgan\nposition.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long centimorgan\nposition.\n", line_idx);
 	} else if (cptr3 && (strlen_se(cptr3) > MAX_ID_LEN)) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long bp coordinate.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has an excessively long bp coordinate.\n", line_idx);
 	} else {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has excessive whitespace.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has excessive whitespace.\n", line_idx);
 	}
         goto transposed_to_bed_ret_INVALID_FORMAT_2R;
       } else {
@@ -7446,14 +7400,14 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
       if (chrom_error(".tped file", chrom_info_ptr, cptr, line_idx, ii, allow_extra_chroms)) {
 	goto transposed_to_bed_ret_INVALID_FORMAT;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, cptr, &ii, line_idx, ".tped file");
+      retval = resolve_or_add_chrom_name(cptr, ".tped file", line_idx, chrom_info_ptr, &ii);
       if (retval) {
 	goto transposed_to_bed_ret_1;
       }
     }
 
     if (scan_int_abs_defcap(cptr3, &jj)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has an invalid bp coordinate.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has an invalid bp coordinate.\n", line_idx);
       goto transposed_to_bed_ret_INVALID_FORMAT_2R;
     }
     if ((!is_set(chrom_info_ptr->chrom_mask, ii)) || (jj < 0)) {
@@ -7493,17 +7447,17 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     fill_uint_zero(allele_cts, 4);
     for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
       cptr2 = skip_initial_spaces(cptr2);
-      while (cptr2 == &(tbuf[MAXLINELEN - 1])) {
+      while (cptr2 == &(g_textbuf[MAXLINELEN - 1])) {
 	if (cptr2[-1] == '\n') {
 	  goto transposed_to_bed_ret_MISSING_TOKENS;
 	}
-        if (!fgets(tbuf, MAXLINELEN, infile)) {
+        if (!fgets(g_textbuf, MAXLINELEN, infile)) {
           if (ferror(infile)) {
 	    goto transposed_to_bed_ret_READ_FAIL;
 	  }
 	  goto transposed_to_bed_ret_MISSING_TOKENS;
 	}
-	cptr2 = skip_initial_spaces(tbuf);
+	cptr2 = skip_initial_spaces(g_textbuf);
       }
       axptr = cptr2;
       axlen = strlen_se(cptr2);
@@ -7517,20 +7471,20 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
 	cptr3 = memcpya(allele_buf, axptr, axlen);
         axptr = allele_buf;
 	do {
-	  if (!fgets(tbuf, MAXLINELEN, infile)) {
+	  if (!fgets(g_textbuf, MAXLINELEN, infile)) {
 	    if (ferror(infile)) {
 	      goto transposed_to_bed_ret_READ_FAIL;
 	    }
 	    goto transposed_to_bed_ret_MISSING_TOKENS;
 	  }
-	  cptr2 = tbuf;
+	  cptr2 = g_textbuf;
           if (!is_space_or_eoln(*cptr2)) {
 	    cptr2 = token_endnn(cptr2);
 	  }
-	  if ((((uintptr_t)(cptr3 - allele_buf)) + ((uintptr_t)(cptr2 - tbuf))) >= NON_WKSPACE_MIN) {
+	  if ((((uintptr_t)(cptr3 - allele_buf)) + ((uintptr_t)(cptr2 - g_textbuf))) >= NON_BIGSTACK_MIN) {
 	    goto transposed_to_bed_ret_NOMEM;
 	  }
-	  cptr3 = memcpya(cptr3, tbuf, cptr2 - tbuf);
+	  cptr3 = memcpya(cptr3, g_textbuf, cptr2 - g_textbuf);
 	} while (!(*cptr2));
 	axlen = (uintptr_t)(cptr3 - allele_buf);
       }
@@ -7546,17 +7500,17 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
 	uii = 4;
       }
       cptr2 = skip_initial_spaces(cptr2);
-      while (cptr2 == &(tbuf[MAXLINELEN - 1])) {
+      while (cptr2 == &(g_textbuf[MAXLINELEN - 1])) {
 	if (cptr2[-1] == '\n') {
 	  goto transposed_to_bed_ret_MISSING_TOKENS;
 	}
-        if (!fgets(tbuf, MAXLINELEN, infile)) {
+        if (!fgets(g_textbuf, MAXLINELEN, infile)) {
           if (ferror(infile)) {
 	    goto transposed_to_bed_ret_READ_FAIL;
 	  }
 	  goto transposed_to_bed_ret_MISSING_TOKENS;
 	}
-	cptr2 = skip_initial_spaces(tbuf);
+	cptr2 = skip_initial_spaces(g_textbuf);
       }
       axptr = cptr2;
       axlen = strlen_se(cptr2);
@@ -7568,24 +7522,24 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
 	cptr3 = memcpya(allele_buf, axptr, axlen);
         axptr = allele_buf;
 	do {
-	  cptr2 = tbuf;
-	  if (!fgets(tbuf, MAXLINELEN, infile)) {
+	  cptr2 = g_textbuf;
+	  if (!fgets(g_textbuf, MAXLINELEN, infile)) {
 	    if (ferror(infile)) {
 	      goto transposed_to_bed_ret_READ_FAIL;
 	    } else if (sample_idx != sample_ct - 1) {
 	      goto transposed_to_bed_ret_MISSING_TOKENS;
 	    } else {
-	      tbuf[0] = '\0';
+	      g_textbuf[0] = '\0';
 	      break;
 	    }
 	  }
           if (!is_space_or_eoln(*cptr2)) {
 	    cptr2 = token_endnn(cptr2);
 	  }
-	  if ((((uintptr_t)(cptr3 - allele_buf)) + ((uintptr_t)(cptr2 - tbuf))) >= NON_WKSPACE_MIN) {
+	  if ((((uintptr_t)(cptr3 - allele_buf)) + ((uintptr_t)(cptr2 - g_textbuf))) >= NON_BIGSTACK_MIN) {
 	    goto transposed_to_bed_ret_NOMEM;
 	  }
-	  cptr3 = memcpya(cptr3, tbuf, cptr2 - tbuf);
+	  cptr3 = memcpya(cptr3, g_textbuf, cptr2 - g_textbuf);
 	} while (!(*cptr2));
 	axlen = (uintptr_t)(cptr3 - allele_buf);
       }
@@ -7709,16 +7663,16 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     }
     if (no_extra_cols) {
       cptr2 = skip_initial_spaces(cptr2);
-      while (cptr2 == &(tbuf[MAXLINELEN - 1])) {
+      while (cptr2 == &(g_textbuf[MAXLINELEN - 1])) {
 	if (cptr2[-1] == '\n') {
 	  break;
 	}
-	cptr2 = tbuf;
-	if (!fgets(tbuf, MAXLINELEN, infile)) {
+	cptr2 = g_textbuf;
+	if (!fgets(g_textbuf, MAXLINELEN, infile)) {
 	  if (ferror(infile)) {
 	    goto transposed_to_bed_ret_READ_FAIL;
 	  }
-	  tbuf[0] = '\0';
+	  g_textbuf[0] = '\0';
 	  break;
 	}
         cptr2 = skip_initial_spaces(cptr2);
@@ -7732,22 +7686,22 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
       }
     } else {
     transposed_to_bed_nextline:
-      cptr2 = (char*)memchr(cptr2, 0, MAXLINELEN - ((uintptr_t)(cptr2 - tbuf)));
-      while (cptr2 == &(tbuf[MAXLINELEN - 1])) {
+      cptr2 = (char*)memchr(cptr2, 0, MAXLINELEN - ((uintptr_t)(cptr2 - g_textbuf)));
+      while (cptr2 == &(g_textbuf[MAXLINELEN - 1])) {
 	if (cptr2[-1] == '\n') {
 	  break;
 	}
-        if (!fgets(tbuf, MAXLINELEN, infile)) {
+        if (!fgets(g_textbuf, MAXLINELEN, infile)) {
           if (ferror(infile)) {
 	    goto transposed_to_bed_ret_READ_FAIL;
 	  }
           break;
 	}
-	cptr2 = (char*)memchr(tbuf, 0, MAXLINELEN);
+	cptr2 = (char*)memchr(g_textbuf, 0, MAXLINELEN);
       }
     }
   }
-  // topsize = 0;
+  bigstack_end_reset(bigstack_end_mark);
   if (fclose_null(&infile)) {
     goto transposed_to_bed_ret_READ_FAIL;
   }
@@ -7766,16 +7720,16 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
   chrom_info_ptr->zero_extra_chroms = 0;
   if (map_is_unsorted) {
     loadbuf_size = 2 * max_marker_allele_len + MAXLINELEN;
-    wkspace_alloc(marker_ct * sizeof(int64_t)); // mapvals
+    bigstack_alloc(marker_ct * sizeof(int64_t)); // mapvals
 
-    if (wkspace_alloc_ll_checked(&ll_buf, marker_ct * sizeof(int64_t)) ||
-        wkspace_alloc_ui_checked(&pos_buf, marker_ct * sizeof(int32_t)) ||
-        wkspace_alloc_c_checked(&marker_ids, marker_ct * max_marker_id_len) ||
-	wkspace_alloc_d_checked(&marker_cms, marker_ct * sizeof(double)) ||
-        wkspace_alloc_c_checked(&loadbuf, loadbuf_size)) {
+    if (bigstack_alloc_ll(marker_ct, &ll_buf) ||
+        bigstack_alloc_ui(marker_ct, &pos_buf) ||
+        bigstack_alloc_c(marker_ct * max_marker_id_len, &marker_ids) ||
+	bigstack_alloc_d(marker_ct, &marker_cms) ||
+        bigstack_alloc_c(loadbuf_size, &loadbuf)) {
       goto transposed_to_bed_ret_NOMEM;
     }
-    marker_allele_ptrs = (char**)wkspace_alloc(marker_ct * 2 * sizeof(intptr_t));
+    marker_allele_ptrs = (char**)bigstack_alloc(marker_ct * 2 * sizeof(intptr_t));
     if (!marker_allele_ptrs) {
       goto transposed_to_bed_ret_NOMEM;
     }
@@ -7789,11 +7743,11 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     sort_marker_chrom_pos(ll_buf, marker_ct, pos_buf, chrom_start, chrom_id, NULL, &chrom_ct);
 
     memcpy(outname_end, ".bim.tmp", 9);
-    if (fopen_checked(&infile, outname, "r")) {
+    if (fopen_checked(outname, "r", &infile)) {
       goto transposed_to_bed_ret_OPEN_FAIL;
     }
     outname_end[4] = '\0';
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto transposed_to_bed_ret_OPEN_FAIL;
     }
     marker_idx = 0;
@@ -7801,23 +7755,23 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     while (fgets(loadbuf, loadbuf_size, infile)) {
       line_idx++;
       // .tmp file, guaranteed to be no spaces in front
-      cptr = next_token(loadbuf);
-      cptr2 = token_endl(cptr);
+      cptr = skip_initial_spaces(token_endnn(loadbuf));
+      cptr2 = token_endnn(cptr);
       cptr3 = skip_initial_spaces(cptr2);
       cptr4 = next_token_mult(cptr3, 2);
       uii = cptr2 - cptr;
       memcpyx(&(marker_ids[marker_idx * max_marker_id_len]), cptr, uii, '\0');
       if (scan_double(cptr3, &(marker_cms[marker_idx]))) {
-	sprintf(logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of .tped file\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of .tped file\n", line_idx);
 	goto transposed_to_bed_ret_INVALID_FORMAT_2R;
       }
       uii = strlen_se(cptr4);
-      if (allele_set(&(marker_allele_ptrs[2 * marker_idx]), cptr4, uii)) {
+      if (allele_set(cptr4, uii, &(marker_allele_ptrs[2 * marker_idx]))) {
 	goto transposed_to_bed_ret_NOMEM;
       }
       cptr4 = skip_initial_spaces(&(cptr4[uii + 1]));
       uii = strlen_se(cptr4);
-      if (allele_set(&(marker_allele_ptrs[2 * marker_idx + 1]), cptr4, uii)) {
+      if (allele_set(cptr4, uii, &(marker_allele_ptrs[2 * marker_idx + 1]))) {
 	goto transposed_to_bed_ret_NOMEM;
       }
       marker_idx++;
@@ -7830,15 +7784,16 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     for (uii = 0; uii < chrom_ct; uii++) {
       cur_chrom = chrom_id[uii];
       ujj = chrom_start[uii + 1];
-      cptr2 = chrom_name_write(&(tbuf[MAXLINELEN]), chrom_info_ptr, cur_chrom);
+      cptr2 = chrom_name_write(chrom_info_ptr, cur_chrom, &(g_textbuf[MAXLINELEN]));
       *cptr2++ = '\t';
       for (; marker_idx < ujj; marker_idx++) {
 	marker_uidx = (uint32_t)ll_buf[marker_idx];
-	fwrite(&(tbuf[MAXLINELEN]), 1, cptr2 - (&(tbuf[MAXLINELEN])), outfile);
+	fwrite(&(g_textbuf[MAXLINELEN]), 1, cptr2 - (&(g_textbuf[MAXLINELEN])), outfile);
 	fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
-	tbuf[0] = '\t';
-	cptr = uint32_writex(double_g_writex(&(tbuf[1]), marker_cms[marker_uidx], '\t'), (uint32_t)(ll_buf[marker_idx] >> 32), '\t');
-	if (fwrite_checked(tbuf, (uintptr_t)(cptr - tbuf), outfile)) {
+	g_textbuf[0] = '\t';
+	cptr = dtoa_gx(marker_cms[marker_uidx], '\t', &(g_textbuf[1]));
+	cptr = uint32toa_x((uint32_t)(ll_buf[marker_idx] >> 32), '\t', cptr);
+	if (fwrite_checked(g_textbuf, (uintptr_t)(cptr - g_textbuf), outfile)) {
 	  goto transposed_to_bed_ret_WRITE_FAIL;
 	}
         fputs(marker_allele_ptrs[2 * marker_uidx], outfile);
@@ -7858,11 +7813,11 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
 
     outname_end[2] = 'e';
     outname_end[3] = 'd';
-    if (fopen_checked(&infile, outname, "rb")) {
+    if (fopen_checked(outname, FOPEN_RB, &infile)) {
       goto transposed_to_bed_ret_OPEN_FAIL;
     }
     outname_end[4] = '\0';
-    if (fopen_checked(&outfile, outname, "wb")) {
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
       goto transposed_to_bed_ret_OPEN_FAIL;
     }
     if (fwrite_checked("l\x1b\x01", 3, outfile)) {
@@ -7876,7 +7831,7 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
 	  goto transposed_to_bed_ret_READ_FAIL;
 	}
       }
-      if (load_raw(infile, (uintptr_t*)writebuf, sample_ct4)) {
+      if (load_raw(sample_ct4, infile, (uintptr_t*)writebuf)) {
 	goto transposed_to_bed_ret_READ_FAIL;
       }
       if (fwrite_checked(writebuf, sample_ct4, outfile)) {
@@ -7891,16 +7846,16 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
   } else {
     uii = (outname_end - outname);
     memcpy(outname_end, ".bim.tmp", 9);
-    memcpy(tbuf, outname, 9 + uii);
+    memcpy(g_textbuf, outname, 9 + uii);
     outname_end[4] = '\0';
-    if (rename(tbuf, outname)) {
+    if (rename(g_textbuf, outname)) {
       goto transposed_to_bed_ret_WRITE_FAIL;
     }
-    tbuf[uii + 2] = 'e';
-    tbuf[uii + 3] = 'd';
+    g_textbuf[uii + 2] = 'e';
+    g_textbuf[uii + 3] = 'd';
     outname_end[2] = 'e';
     outname_end[3] = 'd';
-    if (rename(tbuf, outname)) {
+    if (rename(g_textbuf, outname)) {
       goto transposed_to_bed_ret_WRITE_FAIL;
     }
   }
@@ -7930,7 +7885,7 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
     retval = RET_INVALID_FORMAT;
     break;
   transposed_to_bed_ret_HALF_MISSING:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .tped file has a half-missing call.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .tped file has a half-missing call.\n", line_idx);
   transposed_to_bed_ret_INVALID_FORMAT_2R:
     putchar('\r');
     logerrprintb();
@@ -7960,7 +7915,7 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
   fclose_cond(infile);
   fclose_cond(bimfile);
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return retval;
 }
 
@@ -7976,11 +7931,11 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
   char* wptr;
   uintptr_t slen;
   bufptr2 = memcpya(fam_trailer, "\t0\t0\t0\t", 7);
-  bufptr2 = int32_writex(bufptr2, missing_pheno, '\n');
+  bufptr2 = int32toa_x(missing_pheno, '\n', bufptr2);
   fam_trailer_len = (uintptr_t)(bufptr2 - fam_trailer);
 
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto vcf_sample_line_ret_OPEN_FAIL;
   }
   if (const_fid) {
@@ -8013,7 +7968,7 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
       bufptr2 = &(bufptr[slen]);
     }
     if (slen > MAX_ID_LEN) {
-      sprintf(logbuf, "Error: --%ccf does not support sample IDs longer than " MAX_ID_LEN_STR " characters.\n", flag_char);
+      sprintf(g_logbuf, "Error: --%ccf does not support sample IDs longer than " MAX_ID_LEN_STR " characters.\n", flag_char);
       goto vcf_sample_line_ret_INVALID_FORMAT_2;
     }
     if ((*bufptr == '0') && (slen == 1)) {
@@ -8022,10 +7977,10 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
     }
     if (id_delim) {
       if (*bufptr == id_delim) {
-	sprintf(logbuf, "Error: '%c' at beginning of sample ID.\n", id_delim);
+	sprintf(g_logbuf, "Error: '%c' at beginning of sample ID.\n", id_delim);
 	goto vcf_sample_line_ret_INVALID_FORMAT_2;
       } else if (bufptr[slen - 1] == id_delim) {
-	sprintf(logbuf, "Error: '%c' at end of sample ID.\n", id_delim);
+	sprintf(g_logbuf, "Error: '%c' at end of sample ID.\n", id_delim);
 	goto vcf_sample_line_ret_INVALID_FORMAT_2;
       }
       bufptr3 = (char*)memchr(bufptr, (unsigned char)id_delim, slen);
@@ -8035,7 +7990,7 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
 	} else if (const_fid) {
 	  goto vcf_sample_line_const_id;
 	} else {
-	  sprintf(logbuf, "Error: No '%c' in sample ID.\n", id_delim);
+	  sprintf(g_logbuf, "Error: No '%c' in sample ID.\n", id_delim);
 	  goto vcf_sample_line_ret_INVALID_FORMAT_2;
 	}
       }
@@ -8046,25 +8001,25 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
 	}
         goto vcf_sample_line_ret_INVALID_FORMAT;
       }
-      wptr = memcpyax(tbuf, bufptr, (uintptr_t)(bufptr3 - bufptr), '\t');
+      wptr = memcpyax(g_textbuf, bufptr, (uintptr_t)(bufptr3 - bufptr), '\t');
       bufptr3++;
       if ((*bufptr3 == '0') && (bufptr2 == &(bufptr3[1]))) {
-        sprintf(logbuf, "Error: Sample ID ends with \"%c0\", which induces an invalid IID of '0'.\n", id_delim);
+        sprintf(g_logbuf, "Error: Sample ID ends with \"%c0\", which induces an invalid IID of '0'.\n", id_delim);
         goto vcf_sample_line_ret_INVALID_FORMAT_2;
       }
       wptr = memcpya(wptr, bufptr3, (uintptr_t)(bufptr2 - bufptr3));
     } else {
       if (double_id) {
       vcf_sample_line_double_id:
-	wptr = memcpyax(tbuf, bufptr, (uintptr_t)(bufptr2 - bufptr), '\t');
+	wptr = memcpyax(g_textbuf, bufptr, (uintptr_t)(bufptr2 - bufptr), '\t');
       } else {
       vcf_sample_line_const_id:
-        wptr = memcpyax(tbuf, const_fid, const_fid_len, '\t');
+        wptr = memcpyax(g_textbuf, const_fid, const_fid_len, '\t');
       }
       wptr = memcpya(wptr, bufptr, (uintptr_t)(bufptr2 - bufptr));
     }
     wptr = memcpya(wptr, fam_trailer, fam_trailer_len);
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto vcf_sample_line_ret_WRITE_FAIL;
     }
     if (*bufptr2 != '\t') {
@@ -8140,7 +8095,7 @@ uint32_t vcf_gp_diploid_invalid(char* bufptr, char* bufptr2, double vcf_min_gp,
 #define MAX_VCF_ALT 65534
 
 int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, char* const_fid, char id_delim, char vcf_idspace_to, double vcf_min_qual, char* vcf_filter_exceptions_flattened, double vcf_min_gq, double vcf_min_gp, uint32_t vcf_half_call, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   gzFile gz_infile = NULL;
   FILE* outfile = NULL;
   FILE* bimfile = NULL;
@@ -8208,11 +8163,9 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
   if (vcf_half_call_explicit_error) {
     vcf_half_call = 0;
   }
-  if (gzopen_checked(&gz_infile, vcfname, "rb")) {
-    goto vcf_to_bed_ret_OPEN_FAIL;
-  }
-  if (gzbuffer(gz_infile, 131072)) {
-    goto vcf_to_bed_ret_NOMEM;
+  retval = gzopen_read_checked(vcfname, &gz_infile);
+  if (retval) {
+    goto vcf_to_bed_ret_1;
   }
   if (misc_flags & MISC_VCF_FILTER) {
     // automatically include "." and "PASS"
@@ -8220,7 +8173,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     if (vcf_filter_exceptions_flattened) {
       fexcept_ct += count_and_measure_multistr(vcf_filter_exceptions_flattened, &max_fexcept_len);
     }
-    if (wkspace_alloc_c_checked(&sorted_fexcepts, fexcept_ct * max_fexcept_len)) {
+    if (bigstack_alloc_c(fexcept_ct * max_fexcept_len, &sorted_fexcepts)) {
       goto vcf_to_bed_ret_NOMEM;
     }
     memcpy(sorted_fexcepts, ".", 2);
@@ -8239,14 +8192,14 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     }
   }
 
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto vcf_to_bed_ret_NOMEM;
   }
   
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (1) {
     line_idx++;
@@ -8283,7 +8236,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     }
   } else if (allow_no_samples) {
     memcpy(outname_end, ".fam", 5);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto vcf_to_bed_ret_OPEN_FAIL;
     }
     if (fclose_null(&outfile)) {
@@ -8295,32 +8248,32 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     goto vcf_to_bed_ret_INVALID_FORMAT;
   }
   sample_ct4 = (sample_ct + 3) / 4;
-  sample_ctl2 = (sample_ct + BITCT2 - 1) / BITCT2;
-  sample_ctv2 = 2 * ((sample_ct + BITCT - 1) / BITCT);
+  sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   final_mask = (~ZEROLU) >> (2 * ((0x7fffffe0 - sample_ct) % BITCT2));
-  if (wkspace_alloc_ul_checked(&base_bitfields, sample_ctv2 * 10 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&vcf_alt_cts, MAX_VCF_ALT * sizeof(int32_t))) {
+  if (bigstack_alloc_ul(sample_ctv2 * 10, &base_bitfields) ||
+      bigstack_alloc_ui(MAX_VCF_ALT, &vcf_alt_cts)) {
     goto vcf_to_bed_ret_NOMEM;
   }
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(&bimfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &bimfile)) {
     goto vcf_to_bed_ret_OPEN_FAIL;
   }
   memcpyl3(&(outname_end[2]), "ed");
-  if (fopen_checked(&outfile, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
     goto vcf_to_bed_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, outfile)) {
     goto vcf_to_bed_ret_WRITE_FAIL;
   }
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto vcf_to_bed_ret_NOMEM;
   }
   
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (1) {
     line_idx++;
@@ -8351,7 +8304,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
       if (chrom_error(".vcf file", chrom_info_ptr, bufptr, line_idx, ii, allow_extra_chroms)) {
 	goto vcf_to_bed_ret_INVALID_FORMAT;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &ii, line_idx, ".vcf file");
+      retval = resolve_or_add_chrom_name(bufptr, ".vcf file", line_idx, chrom_info_ptr, &ii);
       if (retval) {
 	logprint("\n");
         goto vcf_to_bed_ret_1;
@@ -8369,7 +8322,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
       goto vcf_to_bed_ret_MISSING_TOKENS;
     }
     if ((((unsigned char)(*pos_str)) - '0') >= 10) {
-      sprintf(logbuf, "Error: Invalid variant bp coordinate on line %" PRIuPTR " of .vcf file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid variant bp coordinate on line %" PRIuPTR " of .vcf file.\n", line_idx);
       goto vcf_to_bed_ret_INVALID_FORMAT_2N;
     }
     ref_allele_ptr = strchr(++marker_id, '\t');
@@ -8389,7 +8342,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     // ',' < '.'
     while (1) {
       if ((unsigned char)cc <= ',' && (unsigned char)cc != '*') {
-	sprintf(logbuf, "Error: Invalid alternate allele on line %" PRIuPTR  " of .vcf file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid alternate allele on line %" PRIuPTR  " of .vcf file.\n", line_idx);
 	goto vcf_to_bed_ret_INVALID_FORMAT_2N;
       }
       bufptr2 = bufptr;
@@ -8399,7 +8352,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
       } while (((unsigned char)cc > ',') || (cc == '*'));
       if (((uintptr_t)(bufptr - bufptr2) == ref_allele_len) && (!memcmp(ref_allele_ptr, bufptr2, ref_allele_len))) {
 	if ((alt_ct != 1) || (cc == ',')) {
-	  sprintf(logbuf, "Error: ALT allele duplicates REF allele on line %" PRIuPTR " of .vcf file.\n", line_idx);
+	  sprintf(g_logbuf, "Error: ALT allele duplicates REF allele on line %" PRIuPTR " of .vcf file.\n", line_idx);
 	  goto vcf_to_bed_ret_INVALID_FORMAT_2N;
 	}
         *alt_alleles = '.'; // tolerate SHAPEIT output
@@ -8411,7 +8364,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
       alt_ct++;      
     }
     if (cc != '\t') {
-      sprintf(logbuf, "Error: Malformed ALT field on line %" PRIuPTR " of .vcf file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Malformed ALT field on line %" PRIuPTR " of .vcf file.\n", line_idx);
       goto vcf_to_bed_ret_INVALID_FORMAT_2N;
     }
     if (biallelic_strict && (alt_ct > 1)) {
@@ -8428,7 +8381,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	continue;
       }
       if (scan_double(bufptr, &dxx)) {
-        sprintf(logbuf, "Error: Invalid QUAL value on line %" PRIuPTR " of .vcf file.\n", line_idx);
+        sprintf(g_logbuf, "Error: Invalid QUAL value on line %" PRIuPTR " of .vcf file.\n", line_idx);
 	goto vcf_to_bed_ret_INVALID_FORMAT_2N;
       }
       if (dxx < vcf_min_qual) {
@@ -8480,7 +8433,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	marker_skip_ct++;
 	continue;
       }
-      fill_vec_55(base_bitfields, sample_ct);
+      fill_quatervec_55(sample_ct, base_bitfields);
       missing_gt_ct++;
       alt_allele_idx = 1;
       goto vcf_to_bed_genotype_write;
@@ -8566,7 +8519,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 		  continue;
 		}
 	      }
-	      set_bit_ul(&(base_bitfields[uii * sample_ctv2]), sample_idx * 2 + 1);
+	      set_bit_ul(sample_idx * 2 + 1, &(base_bitfields[uii * sample_ctv2]));
 	    } else {
 	      cc = bufptr[3];
 	      if (((cc != '/') && (cc != '|')) || (bufptr[4] == '.')) {
@@ -8592,7 +8545,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 		      continue;
 		    }
 		  }
-		  set_bit_ul(&(base_bitfields[uii * sample_ctv2]), sample_idx * 2);
+		  set_bit_ul(sample_idx * 2, &(base_bitfields[uii * sample_ctv2]));
 		  base_bitfields[ujj * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 		}
 	      }
@@ -8659,7 +8612,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	        continue;
 	      }
 	    }
-	    set_bit_ul(&(base_bitfields[uii * sample_ctv2]), sample_idx * 2 + 1);
+	    set_bit_ul(sample_idx * 2 + 1, &(base_bitfields[uii * sample_ctv2]));
 	  } else {
 	    cc = bufptr[3];
 	    if (((cc != '/') && (cc != '|')) || (bufptr[4] == '.')) {
@@ -8687,7 +8640,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 		  continue;
 		}
 	      }
-	      set_bit_ul(&(base_bitfields[uii * sample_ctv2]), sample_idx * 2);
+	      set_bit_ul(sample_idx * 2, &(base_bitfields[uii * sample_ctv2]));
 	      base_bitfields[ujj * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 	    }
 	  }
@@ -8748,7 +8701,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      }
 	    }
 	    if (!uii) {
-	      set_bit_ul(base_bitfields, sample_idx * 2 + 1);
+	      set_bit_ul(sample_idx * 2 + 1, base_bitfields);
 	    } else {
 	      vcf_alt_cts[uii - 1] += 2;
 	    }
@@ -8784,7 +8737,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 		}
 	      }
 	      if (!uii) {
-		set_bit_ul(base_bitfields, sample_idx * 2);
+		set_bit_ul(sample_idx * 2, base_bitfields);
 	      } else {
 		vcf_alt_cts[uii - 1] += 1;
 	      }
@@ -8847,7 +8800,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      // no need for ukk check since already validated
 	      continue;
 	    }
-            set_bit_ul(alt_bitfield, sample_idx * 2 + 1);
+            set_bit_ul(sample_idx * 2 + 1, alt_bitfield);
 	  }
 	} else if (*(++bufptr) == '.') {
 	  if ((vcf_half_call == VCF_HALF_CALL_HAPLOID) && (uii == alt_allele_idx)) {
@@ -8867,7 +8820,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      continue;
 	    }
 	    if (uii == alt_allele_idx) {
-	      set_bit_ul(alt_bitfield, sample_idx * 2);
+	      set_bit_ul(sample_idx * 2, alt_bitfield);
 	    }
 	    if (ujj == alt_allele_idx) {
               alt_bitfield[sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -8934,7 +8887,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     if (skip3_list) {
       if (!marker_skip_ct) {
 	memcpy(outname_end, ".skip.3allele", 14);
-	if (fopen_checked(&skip3file, outname, "w")) {
+	if (fopen_checked(outname, "w", &skip3file)) {
 	  goto vcf_to_bed_ret_OPEN_FAIL;
 	}
 	memcpy(outname_end, ".bed", 5);
@@ -9011,7 +8964,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     retval = RET_INVALID_FORMAT;
     break;
   vcf_to_bed_ret_LONG_LINE:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of .vcf file is pathologically long.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of .vcf file is pathologically long.\n", line_idx);
   vcf_to_bed_ret_INVALID_FORMAT_2N:
     logprint("\n");
     logerrprintb();
@@ -9024,7 +8977,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
   fclose_cond(outfile);
   fclose_cond(bimfile);
   fclose_cond(skip3file);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -9123,7 +9076,8 @@ int32_t read_bcf_typed_string(gzFile gz_infile, char* readbuf, uint32_t maxlen,
 }
 
 int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, char* const_fid, char id_delim, char vcf_idspace_to, double vcf_min_qual, char* vcf_filter_exceptions_flattened, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   gzFile gz_infile = NULL;
   FILE* outfile = NULL;
   FILE* bimfile = NULL;
@@ -9132,14 +9086,13 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   uintptr_t* fexcept_bitfield = NULL;
   uint32_t* fexcept_idxs = NULL;
   Ll_str* contig_list = NULL;
-  char* tbuf2 = &(tbuf[MAXLINELEN]);
+  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
   uintptr_t contig_ct = 0;
   uintptr_t max_contig_len = 0;
   uintptr_t max_fexcept_len = 0;
   uintptr_t fexcept_ct = 0;
   uintptr_t marker_skip_ct = 0;
   uintptr_t missing_gt_ct = 0;
-  uintptr_t topsize = 0;
   uint32_t double_id = (misc_flags / MISC_DOUBLE_ID) & 1;
   uint32_t check_qual = (vcf_min_qual != -1);
   uint32_t allow_extra_chroms = (misc_flags / MISC_ALLOW_EXTRA_CHROMS) & 1;
@@ -9155,8 +9108,8 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   uint32_t gt_idx = 0;
   uint32_t marker_ct = 0;
   uint32_t umm = 0;
+  uint32_t vcf_min_qualf_compare_bits = 0;
   int32_t retval = 0;
-  float vcf_min_qualf = vcf_min_qual;
   char missing_geno = *g_missing_geno_ptr;
   uint32_t bcf_var_header[8];
   Ll_str* ll_ptr;
@@ -9190,6 +9143,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   uint64_t lastloc;
   uint64_t ullii;
   uint64_t ulljj;
+  float vcf_min_qualf;
   uint32_t sample_ct4;
   uint32_t sample_ctl2;
   uint32_t header_size;
@@ -9200,30 +9154,38 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   uint32_t ujj;
   uint32_t ukk;
   int32_t ii;
-  __floatint32 fi;
+  if (check_qual) {
+    if (vcf_min_qual > FLT_MAXD) {
+      logerrprint("Error: --vcf-min-qual parameter too large.\n");
+      goto bcf_to_bed_ret_INVALID_CMDLINE;
+    }
+    vcf_min_qualf = (float)vcf_min_qual;
+    memcpy(&vcf_min_qualf_compare_bits, &vcf_min_qualf, 4);
+    // +infinity = 0x7f800000; this should pass the comparison
+    // quiet nan = 0x7f800001; this (and other nans) should fail
+    vcf_min_qualf_compare_bits += 0x807fffffU;
+  }
   // todo: check if a specialized bgzf reader can do faster forward seeks when
   // we don't have precomputed virtual offsets
-  if (gzopen_checked(&gz_infile, bcfname, "rb")) {
-    goto bcf_to_bed_ret_OPEN_FAIL;
-  }
-  if (gzbuffer(gz_infile, 131072)) {
-    goto bcf_to_bed_ret_NOMEM;
+  retval = gzopen_read_checked(bcfname, &gz_infile);
+  if (retval) {
+    goto bcf_to_bed_ret_1;
   }
-  if (gzread(gz_infile, tbuf, 5) < 5) {
+  if (gzread(gz_infile, g_textbuf, 5) < 5) {
     goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
   }
-  if (memcmp(tbuf, "BCF\2", 4)) {
-    if (memcmp(tbuf, "BCF\4", 4)) {
+  if (memcmp(g_textbuf, "BCF\2", 4)) {
+    if (memcmp(g_textbuf, "BCF\4", 4)) {
       LOGPREPRINTFWW("Error: %s is not a BCF2 file.\n", bcfname);
     } else {
       LOGPREPRINTFWW("Error: %s appears to be a BCF1 file; --bcf only supports BCF2. Use 'bcftools view' to convert it to a PLINK-readable VCF.\n", bcfname);
     }
     goto bcf_to_bed_ret_INVALID_FORMAT_2;
   }
-  if (((unsigned char)(tbuf[4])) > 2) {
+  if (((unsigned char)(g_textbuf[4])) > 2) {
     // defend against 0x82-0x87 being given a meaning in 8-bit int vectors,
     // etc.
-    LOGPREPRINTFWW("Error: %s appears to be formatted as BCFv2.%u; this PLINK build only supports v2.0-2.2. You may need to obtain an updated version of PLINK.\n", bcfname, ((unsigned char)(tbuf[4])));
+    LOGPREPRINTFWW("Error: %s appears to be formatted as BCFv2.%u; this PLINK build only supports v2.0-2.2. You may need to obtain an updated version of PLINK.\n", bcfname, ((unsigned char)(g_textbuf[4])));
     goto bcf_to_bed_ret_INVALID_FORMAT_2;
   }
   if (gzread(gz_infile, &header_size, 4) < 4) {
@@ -9238,7 +9200,9 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   if (vcf_filter_exceptions_flattened) {
     // vcf_filter guaranteed to be true
     fexcept_ct = count_and_measure_multistr(vcf_filter_exceptions_flattened, &max_fexcept_len);
-    sorted_fexcepts = (char*)top_alloc(&topsize, fexcept_ct * max_fexcept_len);
+    if (bigstack_end_alloc_c(fexcept_ct * max_fexcept_len, &sorted_fexcepts)) {
+      goto bcf_to_bed_ret_NOMEM;
+    }
     bufptr = vcf_filter_exceptions_flattened;
     for (ulii = 0; ulii < fexcept_ct; ulii++) {
       slen = strlen(bufptr) + 1;
@@ -9247,13 +9211,14 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     }
     qsort(sorted_fexcepts, fexcept_ct, max_fexcept_len, strcmp_casted);
     fexcept_ct = collapse_duplicate_ids(sorted_fexcepts, fexcept_ct, max_fexcept_len, NULL);
-    fexcept_idxs = (uint32_t*)top_alloc(&topsize, fexcept_ct * sizeof(int32_t));
-    fill_uint_zero(fexcept_idxs, fexcept_ct);
+    if (bigstack_end_calloc_ui(fexcept_ct, &fexcept_idxs)) {
+      goto bcf_to_bed_ret_NOMEM;
+    }
   }
-  if (wkspace_left - topsize <= header_size) {
+  if (bigstack_left() <= header_size) {
     goto bcf_to_bed_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_alloc(header_size + 1);
+  loadbuf = (char*)bigstack_alloc(header_size + 1);
   if ((uint32_t)((uint64_t)gzread(gz_infile, loadbuf, header_size)) != header_size) {
     goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
   }
@@ -9329,8 +9294,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
       if (slen >= max_contig_len) {
 	max_contig_len = slen + 1;
       }
-      ll_ptr = top_alloc_llstr(&topsize, slen + 1);
-      if (!ll_ptr) {
+      if (bigstack_end_alloc_llstr(slen + 1, &ll_ptr)) {
 	goto bcf_to_bed_ret_NOMEM;
       }
       ll_ptr->next = contig_list;
@@ -9365,7 +9329,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   } else if (allow_no_samples) {
     gt_idx = 0;
     memcpy(outname_end, ".fam", 5);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto bcf_to_bed_ret_OPEN_FAIL;
     }
     if (fclose_null(&outfile)) {
@@ -9377,16 +9341,14 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     goto bcf_to_bed_ret_INVALID_FORMAT;
   }
   sample_ct4 = (sample_ct + 3) / 4;
-  sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
-  sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
-  wkspace_reset(loadbuf);
-  wkspace_left -= topsize;
-  ulii = (contig_ct + (BITCT - 1)) / BITCT;
-  if (wkspace_alloc_ul_checked(&contig_bitfield, ulii * sizeof(intptr_t)) ||
-      wkspace_alloc_c_checked(&contigdict, contig_ct * max_contig_len)) {
-    goto bcf_to_bed_ret_NOMEM2;
-  }
-  fill_ulong_zero(contig_bitfield, ulii);
+  sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+  bigstack_reset(loadbuf);
+  ulii = BITCT_TO_WORDCT(contig_ct);
+  if (bigstack_calloc_ul(ulii, &contig_bitfield) ||
+      bigstack_alloc_c(contig_ct * max_contig_len, &contigdict)) {
+    goto bcf_to_bed_ret_NOMEM;
+  }
   ulii = contig_ct;
   while (ulii) {
     ulii--;
@@ -9395,57 +9357,55 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
       if (chrom_error(".bcf file", chrom_info_ptr, contig_list->ss, 0, ii, allow_extra_chroms)) {
 	goto bcf_to_bed_ret_INVALID_FORMAT;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, contig_list->ss, &ii, 0, ".bcf file");
+      retval = resolve_or_add_chrom_name(contig_list->ss, ".bcf file", 0, chrom_info_ptr, &ii);
       if (retval) {
         goto bcf_to_bed_ret_1;
       }
     }
     if (is_set(chrom_info_ptr->chrom_mask, ii)) {
-      set_bit_ul(contig_bitfield, ulii);
+      set_bit_ul(ulii, contig_bitfield);
       strcpy(&(contigdict[ulii * max_contig_len]), contig_list->ss);
     }
     contig_list = contig_list->next;
   }
   if (vcf_filter) {
-    uii = (stringdict_ct + (BITCT - 1)) / BITCT;
-    if (wkspace_alloc_ul_checked(&fexcept_bitfield, uii * sizeof(intptr_t))) {
-      goto bcf_to_bed_ret_NOMEM2;
+    uii = BITCT_TO_WORDCT(stringdict_ct);
+    if (bigstack_calloc_ul(uii, &fexcept_bitfield)) {
+      goto bcf_to_bed_ret_NOMEM;
     }
-    fill_ulong_zero(fexcept_bitfield, uii);
     fexcept_bitfield[0] = 1; // 'PASS'
     for (ulii = 0; ulii < fexcept_ct; ulii++) {
       // fexcept_idxs[] not dereferenced if --vcf-filter had no parameters
-      SET_BIT(fexcept_bitfield, fexcept_idxs[ulii]);
+      SET_BIT(fexcept_idxs[ulii], fexcept_bitfield);
     }
   }
-  wkspace_left += topsize;
-  // topsize = 0;
+  bigstack_end_reset(bigstack_end_mark);
 
   final_mask = (~ZEROLU) >> (2 * ((0x7fffffe0 - sample_ct) % BITCT2));
-  if (wkspace_alloc_c_checked(&loadbuf, sample_ct * 12) ||
-      wkspace_alloc_c_checked(&marker_id, 65536) ||
-      wkspace_alloc_c_checked(&allele_buf, NON_WKSPACE_MIN) ||
-      wkspace_alloc_ui_checked(&allele_lens, 65535 * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&vcf_alt_cts, MAX_VCF_ALT * sizeof(int32_t))) {
+  if (bigstack_alloc_c(sample_ct * 12, &loadbuf) ||
+      bigstack_alloc_c(65536, &marker_id) ||
+      bigstack_alloc_c(NON_BIGSTACK_MIN, &allele_buf) ||
+      bigstack_alloc_ui(65535, &allele_lens) ||
+      bigstack_alloc_ui(MAX_VCF_ALT, &vcf_alt_cts)) {
     goto bcf_to_bed_ret_NOMEM;
   }
-  allele_ptrs = (char**)wkspace_alloc(65535 * sizeof(intptr_t));
+  allele_ptrs = (char**)bigstack_alloc(65535 * sizeof(intptr_t));
   if (!allele_ptrs) {
     goto bcf_to_bed_ret_NOMEM;
   }
-  max_allele_ct = wkspace_left / (sample_ctv2 * sizeof(intptr_t));
+  max_allele_ct = bigstack_left() / (sample_ctv2 * sizeof(intptr_t));
   if (max_allele_ct < 3) {
     goto bcf_to_bed_ret_NOMEM;
   } else if (max_allele_ct > 65535) {
     max_allele_ct = 65535;
   }
-  base_bitfields = (uintptr_t*)wkspace_alloc(sample_ctv2 * max_allele_ct * sizeof(intptr_t));
+  bigstack_alloc_ul(sample_ctv2 * max_allele_ct, &base_bitfields);
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(&bimfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &bimfile)) {
     goto bcf_to_bed_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".bed", 5);
-  if (fopen_checked(&outfile, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
     goto bcf_to_bed_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, outfile)) {
@@ -9476,13 +9436,9 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
       goto bcf_to_bed_marker_skip;
     }
     if (check_qual) {
-      if (bcf_var_header[5] == 0x7f800001) {
+      if (bcf_var_header[5] + 0x807fffffU < vcf_min_qualf_compare_bits) {
         goto bcf_to_bed_marker_skip;
       }
-      fi.ii = bcf_var_header[5];
-      if (fi.ii < vcf_min_qualf) {
-	goto bcf_to_bed_marker_skip;
-      }
     }
     retval = read_bcf_typed_string(gz_infile, marker_id, 65535, &marker_id_len);
     if (retval) {
@@ -9495,7 +9451,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     if (n_allele > max_allele_ct) {
       goto bcf_to_bed_ret_NOMEM;
     }
-    ujj = NON_WKSPACE_MIN; // remaining allele name buffer space
+    ujj = NON_BIGSTACK_MIN; // remaining allele name buffer space
     bufptr = allele_buf;
     if (n_allele) {
       for (uii = 0; uii < n_allele; uii++) {
@@ -9543,7 +9499,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	    if (ujj > 256) {
 	      goto bcf_to_bed_ret_INVALID_FORMAT_GENERIC;
 	    }
-	    ucptr = (unsigned char*)tbuf;
+	    ucptr = (unsigned char*)g_textbuf;
 	    if ((uint32_t)((uint64_t)gzread(gz_infile, ucptr, ujj)) < ujj) {
 	      goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
 	    }
@@ -9559,7 +9515,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	    if (ujj > 65536) {
 	      goto bcf_to_bed_ret_INVALID_FORMAT_GENERIC;
 	    }
-            ui16ptr = (uint16_t*)tbuf;
+            ui16ptr = (uint16_t*)g_textbuf;
 	    if ((uint32_t)((uint64_t)gzread(gz_infile, ui16ptr, ujj * sizeof(int16_t))) < ujj * sizeof(int16_t)) {
 	      goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
 	    }
@@ -9574,7 +9530,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	  } else {
 	    // a bit more care required to avoid buffer overflow, if for some
 	    // reason there are more than 32k filters...
-            uiptr = (uint32_t*)tbuf;
+            uiptr = (uint32_t*)g_textbuf;
 	    do {
 	      if (ujj > (MAXLINELEN / sizeof(int32_t))) {
 		ukk = MAXLINELEN / sizeof(int32_t);
@@ -9609,7 +9565,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	goto bcf_to_bed_skip_genotype_write;
       }
       missing_gt_ct++;
-      fill_vec_55(base_bitfields, sample_ct);
+      fill_quatervec_55(sample_ct, base_bitfields);
       goto bcf_to_bed_genotype_write;
     }
 
@@ -9667,7 +9623,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	    goto bcf_to_bed_marker_skip2;
 	  } else {
 	    missing_gt_ct++;
-	    fill_vec_55(base_bitfields, sample_ct);
+	    fill_quatervec_55(sample_ct, base_bitfields);
 	    goto bcf_to_bed_genotype_write;
 	  }
 	}
@@ -9675,7 +9631,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     }
     if (!ujj) {
       // ploidy zero previously caused the variant to be skipped
-      fill_vec_55(base_bitfields, sample_ct);
+      fill_quatervec_55(sample_ct, base_bitfields);
       goto bcf_to_bed_genotype_write;
     }
     if (ukk == 5) {
@@ -9708,11 +9664,11 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	    ulii = ((ulii / 2) - 1) * sample_ctv2;
 	    uljj = (*ucptr) & 0x7e;
 	    if (uljj) {
-	      set_bit(&(base_bitfields[ulii]), sample_idx * 2);
+	      set_bit(sample_idx * 2, &(base_bitfields[ulii]));
 	      base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 	    } else {
 	      // could be MT or male X.  don't validate for now
-	      set_bit(&(base_bitfields[ulii]), sample_idx * 2 + 1);
+	      set_bit(sample_idx * 2 + 1, &(base_bitfields[ulii]));
 	    }
 	  }
 	}
@@ -9720,7 +9676,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
 	  ulii = (*ucptr++) & 0x7e;
 	  if (ulii) {
-	    set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
+	    set_bit(sample_idx * 2 + 1, &(base_bitfields[((ulii / 2) - 1) * sample_ctv2]));
 	  }
 	}
       } else {
@@ -9733,10 +9689,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	      ulii = ((ulii / 2) - 1) * sample_ctv2;
 	      uljj = (*ucptr) & 0x7e;
 	      if (uljj) {
-		set_bit(&(base_bitfields[ulii]), sample_idx * 2);
+		set_bit(sample_idx * 2, &(base_bitfields[ulii]));
 		base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 	      } else {
-		set_bit(&(base_bitfields[ulii]), sample_idx * 2 + 1);
+		set_bit(sample_idx * 2 + 1, &(base_bitfields[ulii]));
 	      }
 	    }
 	    ucptr = &(ucptr[ujj - 1]);
@@ -9753,10 +9709,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	    ulii = ((ulii / 2) - 1) * sample_ctv2;
             uljj = (*ui16ptr) & 0x7ffe;
 	    if (uljj) {
-	      set_bit(&(base_bitfields[ulii]), sample_idx * 2);
+	      set_bit(sample_idx * 2, &(base_bitfields[ulii]));
 	      base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 	    } else {
-	      set_bit(&(base_bitfields[ulii]), sample_idx * 2 + 1);
+	      set_bit(sample_idx * 2 + 1, &(base_bitfields[ulii]));
 	    }
 	  }
 	}
@@ -9764,7 +9720,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
 	  ulii = (*ui16ptr++) & 0x7ffe;
 	  if (ulii) {
-	    set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
+	    set_bit(sample_idx * 2 + 1, &(base_bitfields[((ulii / 2) - 1) * sample_ctv2]));
 	  }
 	}
       } else {
@@ -9777,10 +9733,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	      ulii = ((ulii / 2) - 1) * sample_ctv2;
               uljj = (*ui16ptr) & 0x7ffe;
 	      if (uljj) {
-		set_bit(&(base_bitfields[ulii]), sample_idx * 2);
+		set_bit(sample_idx * 2, &(base_bitfields[ulii]));
 		base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 	      } else {
-		set_bit(&(base_bitfields[ulii]), sample_idx * 2 + 1);
+		set_bit(sample_idx * 2 + 1, &(base_bitfields[ulii]));
 	      }
 	    }
 	    ui16ptr = &(ui16ptr[ujj - 1]);
@@ -9796,10 +9752,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	    ulii = ((ulii / 2) - 1) * sample_ctv2;
             uljj = (*uiptr) & 0x7ffffffe;
 	    if (uljj) {
-	      set_bit(&(base_bitfields[ulii]), sample_idx * 2);
+	      set_bit(sample_idx * 2, &(base_bitfields[ulii]));
 	      base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 	    } else {
-	      set_bit(&(base_bitfields[ulii]), sample_idx * 2 + 1);
+	      set_bit(sample_idx * 2 + 1, &(base_bitfields[ulii]));
 	    }
 	  }
 	}
@@ -9807,7 +9763,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
 	  ulii = (*uiptr++) & 0x7ffffffe;
 	  if (ulii) {
-	    set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
+	    set_bit(sample_idx * 2 + 1, &(base_bitfields[((ulii / 2) - 1) * sample_ctv2]));
 	  }
 	}
       } else {
@@ -9820,10 +9776,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	      ulii = ((ulii / 2) - 1) * sample_ctv2;
               uljj = (*uiptr) & 0x7ffffffe;
 	      if (uljj) {
-		set_bit(&(base_bitfields[ulii]), sample_idx * 2);
+		set_bit(sample_idx * 2, &(base_bitfields[ulii]));
 		base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
 	      } else {
-		set_bit(&(base_bitfields[ulii]), sample_idx * 2 + 1);
+		set_bit(sample_idx * 2 + 1, &(base_bitfields[ulii]));
 	      }
 	    }
 	    uiptr = &(uiptr[ujj - 1]);
@@ -9877,7 +9833,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     // bcf2 coordinates are 0-based while vcf is 1-based... (seriously, whose
     // idea was this?  this is basically a bug in the spec due to how e.g.
     // telomeres are supposed to be encoded, but we have to play along)
-    bufptr = uint32_writex(&(tbuf2[3]), bcf_var_header[3] + 1, '\t');
+    bufptr = uint32toa_x(bcf_var_header[3] + 1, '\t', &(tbuf2[3]));
     if (fwrite_checked(tbuf2, bufptr - tbuf2, bimfile)) {
       goto bcf_to_bed_ret_WRITE_FAIL;
     }
@@ -9906,7 +9862,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     if (skip3_list) {
       if (!marker_skip_ct) {
 	memcpy(outname_end, ".skip.3allele", 14);
-	if (fopen_checked(&skip3file, outname, "w")) {
+	if (fopen_checked(outname, "w", &skip3file)) {
 	  goto bcf_to_bed_ret_OPEN_FAIL;
 	}
 	memcpy(outname_end, ".bed", 5);
@@ -9960,8 +9916,6 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     LOGERRPRINTF("Warning: %" PRIuPTR " variant record%s had no GT field.\n", missing_gt_ct, (missing_gt_ct == 1)? "" : "s");
   }
   while (0) {
-  bcf_to_bed_ret_NOMEM2:
-    wkspace_left += topsize;
   bcf_to_bed_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -9978,6 +9932,9 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   bcf_to_bed_ret_WRITE_FAIL:
     retval = RET_WRITE_FAIL;
     break;
+  bcf_to_bed_ret_INVALID_CMDLINE:
+    retval = RET_INVALID_CMDLINE;
+    break;
   bcf_to_bed_ret_INVALID_FORMAT_GENERIC:
     logerrprint("Error: Improperly formatted .bcf file.\n");
     retval = RET_INVALID_FORMAT;
@@ -9993,7 +9950,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
   fclose_cond(outfile);
   fclose_cond(bimfile);
   fclose_cond(skip3file);
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return retval;
 }
 
@@ -10018,7 +9975,7 @@ uint32_t write_23_cached_chrom(char* write_cache, uint32_t markers_left, char ch
 }
 
 int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_t modifier_23, char* fid_23, char* iid_23, double pheno_23, uint64_t misc_flags, char* paternal_id_23, char* maternal_id_23, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile_23 = NULL;
   FILE* outfile_bed = NULL;
   FILE* outfile_txt = NULL;
@@ -10030,7 +9987,7 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
   uint32_t haploid_x_present = 0;
   uint32_t y_present = 0;
   uint32_t nonmissing_y_present = 0;
-  unsigned char* writebuf = (unsigned char*)(&(tbuf[MAXLINELEN]));
+  unsigned char* writebuf = (unsigned char*)(&(g_textbuf[MAXLINELEN]));
   int32_t retval = 0;
   uint32_t cur_chrom = 0;
   uint32_t chrom_mask_23 = (uint32_t)(chrom_info_ptr->chrom_mask[0]);
@@ -10051,33 +10008,33 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
   char cc;
   char cc2;
   unsigned char ucc;
-  if (wkspace_alloc_c_checked(&writebuf2, MAXLINELEN)) {
+  if (bigstack_alloc_c(MAXLINELEN, &writebuf2)) {
     goto bed_from_23_ret_NOMEM;
   }
-  if (fopen_checked(&infile_23, infile_name, "r")) {
+  if (fopen_checked(infile_name, "r", &infile_23)) {
     goto bed_from_23_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(&outfile_txt, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_txt)) {
     goto bed_from_23_ret_OPEN_FAIL;
   }
   memcpy(&(outname_end[2]), "ed", 2);
-  if (fopen_checked(&outfile_bed, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile_bed)) {
     goto bed_from_23_ret_OPEN_FAIL;
   }
-  if (wkspace_left < MAXLINELEN) {
+  if (bigstack_left() < MAXLINELEN) {
     goto bed_from_23_ret_NOMEM;
   }
   writebuf_cur = (unsigned char*)memcpyl3a((char*)writebuf, "l\x1b\x01");
   writebuf_end = &(writebuf[MAXLINELEN]);
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile_23)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile_23)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, infile_name);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, infile_name);
       goto bed_from_23_ret_INVALID_FORMAT_2;
     }
-    id_start = skip_initial_spaces(tbuf);
+    id_start = skip_initial_spaces(g_textbuf);
     cc = *id_start;
     if (is_eoln_kns(cc) || (cc == '#')) {
       continue;
@@ -10097,7 +10054,7 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
     }
     ii = get_chrom_code(chrom_info_ptr, chrom_start);
     if (ii < 0) {
-      sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, infile_name);
+      sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s.\n", line_idx, infile_name);
       goto bed_from_23_ret_INVALID_FORMAT_2;
     }
     uii = (uint32_t)ii;
@@ -10169,7 +10126,7 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
       goto bed_from_23_ret_MISSING_ALLELE_CALLS;
     }
     if (!null_chrom) {
-      writebuf2_cur = uint32_write(writebuf2, cur_chrom);
+      writebuf2_cur = uint32toa(cur_chrom, writebuf2);
     } else {
       writebuf2[0] = '0';
       writebuf2_cur = &(writebuf2[1]);
@@ -10212,7 +10169,7 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
     goto bed_from_23_ret_WRITE_FAIL;
   }
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(&outfile_txt, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_txt)) {
     goto bed_from_23_ret_OPEN_FAIL;
   }
   if (fid_23) {
@@ -10299,13 +10256,13 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
   fclose_cond(infile_23);
   fclose_cond(outfile_bed);
   fclose_cond(outfile_txt);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr_t marker_ct, uintptr_t sample_ct, double geno_mrate, double pheno_mrate, int32_t missing_pheno) {
   FILE* outfile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t sample_ct4 = (sample_ct + 3) / 4;
   uintptr_t urand = 0;
   double missing_phenod = (double)missing_pheno;
@@ -10335,7 +10292,7 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
   unsigned char ucc2;
   uint64_t ullii;
   double dxx;
-  wptr2 = int32_write(missing_pheno_str, missing_pheno);
+  wptr2 = int32toa(missing_pheno, missing_pheno_str);
   missing_pheno_len = (uintptr_t)(wptr2 - missing_pheno_str);
   *wptr2 = '\0';
   if (flags & DUMMY_ACGT) {
@@ -10349,11 +10306,11 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
   } else {
     memcpyl3(alleles, "ABA");
   }
-  if (wkspace_alloc_uc_checked(&writebuf, sample_ct4)) {
+  if (bigstack_alloc_uc(sample_ct4, &writebuf)) {
     goto generate_dummy_ret_NOMEM;
   }
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto generate_dummy_ret_OPEN_FAIL;
   }
   memcpy(wbuf, "1\tsnp", 5);
@@ -10361,13 +10318,15 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
     for (uii = 0; uii < marker_ct; uii++) {
       if (!(uii % 8)) {
 	do {
-	  urand = sfmt_genrand_uint32(&sfmt);
+	  urand = sfmt_genrand_uint32(&g_sfmt);
 	} while (urand < 425132032LU); // 2^32 - 12^8.  heck, why not
       }
       ukk = urand / 12U;
       ujj = urand - (ukk * 12U);
       urand = ukk;
-      wptr2 = uint32_writex(memcpyl3a(uint32_write(wptr, uii), "\t0\t"), uii, '\t');
+      wptr2 = uint32toa(uii, wptr);
+      wptr2 = memcpyl3a(wptr2, "\t0\t");
+      wptr2 = uint32toa_x(uii, '\t', wptr2);
       wptr2[0] = alleles[ujj];
       wptr2[1] = '\t';
       wptr2[2] = alleles[ujj + 1];
@@ -10379,11 +10338,13 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
   } else {
     for (uii = 0; uii < marker_ct; uii++) {
       if (!(uii % 32)) {
-	urand = sfmt_genrand_uint32(&sfmt);
+	urand = sfmt_genrand_uint32(&g_sfmt);
       }
       ujj = urand & 1;
       urand >>= 1;
-      wptr2 = uint32_writex(memcpyl3a(uint32_write(wptr, uii), "\t0\t"), uii, '\t');
+      wptr2 = uint32toa(uii, wptr);
+      wptr2 = memcpyl3a(wptr2, "\t0\t");
+      wptr2 = uint32toa_x(uii, '\t', wptr2);
       wptr2[0] = alleles[ujj];
       wptr2[1] = '\t';
       wptr2[2] = alleles[ujj + 1];
@@ -10397,13 +10358,13 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
     goto generate_dummy_ret_WRITE_FAIL;
   }
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto generate_dummy_ret_OPEN_FAIL;
   }
   wptr = memcpyl3a(wbuf, "per");
   if (flags & DUMMY_SCALAR_PHENO) {
     for (uii = 0; uii < sample_ct; uii++) {
-      if (pheno_m_check && (sfmt_genrand_uint32(&sfmt) <= pheno_m32)) {
+      if (pheno_m_check && (sfmt_genrand_uint32(&g_sfmt) <= pheno_m32)) {
 	dxx = missing_phenod;
       } else {
 	if (saved_rnormal) {
@@ -10414,7 +10375,11 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
 	  saved_rnormal = 1;
 	}
       }
-      wptr2 = double_g_writex(memcpya(uint32_write(memcpya(uint32_write(wptr, uii), " per", 4), uii), " 0 0 2 ", 7), dxx, '\n');
+      wptr2 = uint32toa(uii, wptr);
+      wptr2 = memcpya(wptr2, " per", 4);
+      wptr2 = uint32toa(uii, wptr2);
+      wptr2 = memcpya(wptr2, " 0 0 2 ", 7);
+      wptr2 = dtoa_gx(dxx, '\n', wptr2);
       if (fwrite_checked(wbuf, wptr2 - wbuf, outfile)) {
 	goto generate_dummy_ret_WRITE_FAIL;
       }
@@ -10422,11 +10387,13 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
   } else {
     for (uii = 0; uii < sample_ct; uii++) {
       if (!(uii % 32)) {
-	urand = sfmt_genrand_uint32(&sfmt);
+	urand = sfmt_genrand_uint32(&g_sfmt);
       }
-      wptr2 = uint32_write(memcpya(uint32_write(wptr, uii), " per", 4), uii);
+      wptr2 = uint32toa(uii, wptr);
+      wptr2 = memcpya(wptr2, " per", 4);
+      wptr2 = uint32toa(uii, wptr2);
       wptr2 = memcpya(wptr2, " 0 0 2 ", 7);
-      if (pheno_m_check && (sfmt_genrand_uint32(&sfmt) <= pheno_m32)) {
+      if (pheno_m_check && (sfmt_genrand_uint32(&g_sfmt) <= pheno_m32)) {
 	wptr2 = memcpya(wptr2, missing_pheno_str, missing_pheno_len);
       } else {
 	*wptr2++ = (char)((urand & 1) + '1');
@@ -10442,7 +10409,7 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
     goto generate_dummy_ret_WRITE_FAIL;
   }
   memcpy(outname_end, ".bed", 5);
-  if (fopen_checked(&outfile, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
     goto generate_dummy_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, outfile)) {
@@ -10462,11 +10429,11 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
       ucptr = writebuf;
       for (ujj = 0; ujj < sample_ct4; ujj++) {
 	if (!(ujj % 4)) {
-	  urand = sfmt_genrand_uint32(&sfmt);
+	  urand = sfmt_genrand_uint32(&g_sfmt);
 	}
 	ucc = 0;
 	for (ukk = 0; ukk < 8; ukk += 2) {
-	  if (geno_m_check && (sfmt_genrand_uint32(&sfmt) < geno_m32)) {
+	  if (geno_m_check && (sfmt_genrand_uint32(&g_sfmt) < geno_m32)) {
 	    ucc2 = 1;
 	  } else {
 	    ucc2 = urand & 3;
@@ -10486,7 +10453,7 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
 
       ujj = popcount_chars((uintptr_t*)writebuf, 0, sample_ct4);
       if (ujj < sample_ct) {
-	reverse_loadbuf(writebuf, sample_ct);
+	reverse_loadbuf(sample_ct, writebuf);
       }
       if (fwrite_checked(writebuf, sample_ct4, outfile)) {
 	putchar('\n');
@@ -10515,7 +10482,7 @@ int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr
     retval = RET_WRITE_FAIL;
   }
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -10856,9 +10823,9 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
   FILE* outfile_txt = NULL;
   FILE* outfile_simfreq = NULL;
   FILE* outfile_bed = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   double* qt_vals = NULL;
-  char* cur_snp_label = &(tbuf[MAXLINELEN]);
+  char* cur_snp_label = &(g_textbuf[MAXLINELEN]);
   char* marker_freq_lb_ptr = NULL;
   char* marker_ld_ptr = NULL;
   uintptr_t* writebuf2 = NULL;
@@ -10925,13 +10892,12 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
   if (!is_qt) {
     sample_ct = case_ct + ctrl_ct;
   } else {
-    if (wkspace_alloc_d_checked(&qt_vals, sample_ct * sizeof(double))) {
+    if (bigstack_calloc_d(sample_ct, &qt_vals)) {
       goto simulate_ret_NOMEM;
     }
-    fill_double_zero(qt_vals, sample_ct);
   }
   sample_ct4 = (sample_ct + 3) / 4;
-  sample_ctl2 = (sample_ct + BITCT2 - 1) / BITCT2;
+  sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   if (randomize_alleles) {
     if (flags & SIMULATE_ACGT) {
       memcpy(alleles, "ACAGATCGCTGTA", 13);
@@ -10947,27 +10913,27 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
       memcpy(alleles, "DdAB", 4);
     }
   }
-  if (wkspace_alloc_ul_checked(&writebuf, sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(sample_ctl2, &writebuf)) {
     goto simulate_ret_NOMEM;
   }
   if (do_haps) {
-    if (wkspace_alloc_ul_checked(&writebuf2, sample_ctl2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(sample_ctl2, &writebuf2)) {
       goto simulate_ret_NOMEM;
     }
   }
-  if (fopen_checked(&infile, simulate_fname, "r")) {
+  if (fopen_checked(simulate_fname, "r", &infile)) {
     goto simulate_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".bim", 5);
-  if (fopen_checked(&outfile_txt, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_txt)) {
     goto simulate_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".simfreq", 9);
-  if (fopen_checked(&outfile_simfreq, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_simfreq)) {
     goto simulate_ret_OPEN_FAIL;
   }
   memcpy(outname_end, ".bed", 5);
-  if (fopen_checked(&outfile_bed, outname, "wb")) {
+  if (fopen_checked(outname, FOPEN_WB, &outfile_bed)) {
     goto simulate_ret_OPEN_FAIL;
   }
   if (fwrite_checked("l\x1b\x01", 3, outfile_bed)) {
@@ -10977,26 +10943,26 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
   LOGPRINTFWW5("Writing --simulate%s dataset to %s.bed + %s.bim + %s.fam ... ", is_qt? "-qt" : "", outname, outname, outname);
   fputs("0%", stdout);
   fflush(stdout);
-  sfmt64p = (sfmt_t*)wkspace_alloc(sizeof(sfmt_t));
+  sfmt64p = (sfmt_t*)bigstack_alloc(sizeof(sfmt_t));
   if (!sfmt64p) {
     goto simulate_ret_NOMEM;
   }
-  init_sfmt64_from_sfmt32(&sfmt, sfmt64p);
-  tbuf[MAXLINELEN - 1] = ' ';
+  init_sfmt64_from_sfmt32(&g_sfmt, sfmt64p);
+  g_textbuf[MAXLINELEN - 1] = ' ';
   // just determine total marker ct in initial scan, for progress indicator
   ullii = 0;
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --simulate%s file is pathologically long.\n", line_idx, is_qt? "-qt" : "");
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --simulate%s file is pathologically long.\n", line_idx, is_qt? "-qt" : "");
       goto simulate_ret_INVALID_FORMAT_2N;
     }
-    cptr = skip_initial_spaces(tbuf);
+    cptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*cptr)) {
       continue;
     }
     if (scan_uint_icap(cptr, &uii)) {
-      sprintf(logbuf, "Error: Invalid SNP count on line %" PRIuPTR " of --simulate%s input file.\n", line_idx, is_qt? "-qt" : "");
+      sprintf(g_logbuf, "Error: Invalid SNP count on line %" PRIuPTR " of --simulate%s input file.\n", line_idx, is_qt? "-qt" : "");
       goto simulate_ret_INVALID_FORMAT_2N;
     }
     ullii += uii;
@@ -11005,20 +10971,20 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
     goto simulate_ret_READ_FAIL;
   }
   if (!ullii) {
-    sprintf(logbuf, "Error: --simulate%s input file specifies zero SNPs.\n", is_qt? "-qt" : "");
+    sprintf(g_logbuf, "Error: --simulate%s input file specifies zero SNPs.\n", is_qt? "-qt" : "");
     goto simulate_ret_INVALID_FORMAT_2N;
   } else if (ullii > (do_haps? 0x3fffffff : 0x7fffffff)) {
-    sprintf(logbuf, "Error: --simulate%s input file specifies too many SNPs.\n", is_qt? "-qt" : "");
+    sprintf(g_logbuf, "Error: --simulate%s input file specifies too many SNPs.\n", is_qt? "-qt" : "");
     goto simulate_ret_INVALID_FORMAT_2N;
   }
   marker_ct = ullii;
   loop_end = (marker_ct + 99) / 100;
   rewind(infile);
   line_idx = 0;
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
     // already checked for long lines, don't need to repeat
-    cptr = skip_initial_spaces(tbuf);
+    cptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*cptr)) {
       continue;
     }
@@ -11032,12 +10998,12 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
       penult_ptr = next_token_mult(freq_lb_ptr, 2);
     }
     last_ptr = next_token(penult_ptr);
-    if (no_more_tokens(last_ptr)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --simulate%s file has fewer tokens than expected.\n", line_idx, is_qt? "-qt" : "");
+    if (no_more_tokens_kns(last_ptr)) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --simulate%s file has fewer tokens than expected.\n", line_idx, is_qt? "-qt" : "");
       goto simulate_ret_INVALID_FORMAT_2N;
     }
-    if (!no_more_tokens(next_token(last_ptr))) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --simulate%s file has more tokens than expected.\n", line_idx, is_qt? "-qt" : "");
+    if (!no_more_tokens_kns(next_token(last_ptr))) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --simulate%s file has more tokens than expected.\n", line_idx, is_qt? "-qt" : "");
       goto simulate_ret_INVALID_FORMAT_2N;
     }
     scan_uint_icap(cptr, &uii);
@@ -11049,17 +11015,17 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
     memcpy(cur_snp_label, snp_label_ptr, snp_label_len);
     cur_snp_label[snp_label_len++] = '_';
     if (scan_two_doubles(freq_lb_ptr, &freq_lb, &freq_delta) || (freq_lb < 0) || (freq_delta < freq_lb) || (freq_delta > 1)) {
-      sprintf(logbuf, "Error: Invalid allele frequency bound on line %" PRIuPTR " of --simulate%s\nfile.\n", line_idx, is_qt? "-qt" : "");
+      sprintf(g_logbuf, "Error: Invalid allele frequency bound on line %" PRIuPTR " of --simulate%s\nfile.\n", line_idx, is_qt? "-qt" : "");
       goto simulate_ret_INVALID_FORMAT_2N;
     }
     freq_delta -= freq_lb;
     if (tags_or_haps) {
       if (scan_two_doubles(marker_freq_lb_ptr, &marker_freq_lb, &marker_freq_ub) || (marker_freq_lb < 0) || (marker_freq_ub < marker_freq_lb) || (marker_freq_ub > 1)) {
-	sprintf(logbuf, "Error: Invalid marker allele frequency bound on line %" PRIuPTR " of\n--simulate%s file.\n", line_idx, is_qt? "-qt" : "");
+	sprintf(g_logbuf, "Error: Invalid marker allele frequency bound on line %" PRIuPTR " of\n--simulate%s file.\n", line_idx, is_qt? "-qt" : "");
 	goto simulate_ret_INVALID_FORMAT_2N;
       }
       if (scan_double(marker_ld_ptr, &dprime) || (dprime < 0) || (dprime > 1)) {
-	sprintf(logbuf, "Error: Invalid d-prime on line %" PRIuPTR " of --simulate%s input file.\n", line_idx, is_qt? "-qt" : "");
+	sprintf(g_logbuf, "Error: Invalid d-prime on line %" PRIuPTR " of --simulate%s input file.\n", line_idx, is_qt? "-qt" : "");
 	goto simulate_ret_INVALID_FORMAT_2N;
       }
     } else {
@@ -11067,11 +11033,11 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
     }
     if (is_qt) {
       if (scan_double(penult_ptr, &qt_var) || (qt_var < 0) || (qt_var > 1)) {
-	sprintf(logbuf, "Error: Invalid variance value on line %" PRIuPTR " of --simulate-qt file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid variance value on line %" PRIuPTR " of --simulate-qt file.\n", line_idx);
 	goto simulate_ret_INVALID_FORMAT_2N;
       }
       if ((qt_var > 0) && (((freq_delta == 0) && ((freq_lb == 0) || (freq_lb == 1))) || (tags_or_haps && (marker_freq_lb == marker_freq_ub) && ((marker_freq_lb == 0) || (marker_freq_lb == 1))))) {
-	sprintf(logbuf, "Error: Nonzero variance with fixed 0/1 allele frequency on line %" PRIuPTR " of\n--simulate-qt file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Nonzero variance with fixed 0/1 allele frequency on line %" PRIuPTR " of\n--simulate-qt file.\n", line_idx);
 	goto simulate_ret_INVALID_FORMAT_2N;
       }
       qt_totvar += ((intptr_t)cur_marker_ct) * qt_var;
@@ -11081,18 +11047,18 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
 	goto simulate_ret_INVALID_FORMAT;
       }
       if (scan_double(last_ptr, &qt_dom)) {
-	sprintf(logbuf, "Error: Invalid dominance deviation value on line %" PRIuPTR " of --simulate-qt\nfile.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid dominance deviation value on line %" PRIuPTR " of --simulate-qt\nfile.\n", line_idx);
 	goto simulate_ret_INVALID_FORMAT_2N;
       }
     } else {
       if (scan_double(penult_ptr, &het_odds) || (het_odds < 0)) {
-	sprintf(logbuf, "Error: Invalid heterozygote disease odds ratio on line %" PRIuPTR " of\n--simulate file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid heterozygote disease odds ratio on line %" PRIuPTR " of\n--simulate file.\n", line_idx);
 	goto simulate_ret_INVALID_FORMAT_2N;
       }
       if ((strlen_se(last_ptr) == 4) && match_upper_nt(last_ptr, "MULT", 4)) {
 	hom0_odds = het_odds * het_odds;
       } else if (scan_double(last_ptr, &hom0_odds) || (hom0_odds < 0)) {
-	sprintf(logbuf, "Error: Invalid homozygote disease odds ratio on line %" PRIuPTR " of --simulate\nfile.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid homozygote disease odds ratio on line %" PRIuPTR " of --simulate\nfile.\n", line_idx);
 	goto simulate_ret_INVALID_FORMAT_2N;
       }
       if ((!zero_odds_ratio_warning_given) && ((het_odds == 0) || (hom0_odds == 0))) {
@@ -11103,7 +11069,7 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
         printf("%u%%", pct);
       }
     }
-    tbuf[0] = '1';
+    g_textbuf[0] = '1';
     for (cur_marker_idx = 0; cur_marker_idx < cur_marker_ct; cur_marker_idx++) {
       freqs[0] = freq_lb + rand_unif() * freq_delta;
       if (tags_or_haps) {
@@ -11146,44 +11112,44 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
       } else {
 	simulate_init_freqs_cc(do_haps, dprime, freqs, prevalence, het_odds, hom0_odds, missing_freq, thresholds, case_thresholds);
       }
-      wptr = &(tbuf[1]);
+      wptr = &(g_textbuf[1]);
       *wptr++ = ' ';
       if (cur_marker_ct > 1) {
 	wptr = memcpya(wptr, cur_snp_label, snp_label_len);
-	wptr = uint32_write(wptr, cur_marker_idx);
+	wptr = uint32toa(cur_marker_idx, wptr);
       } else {
 	wptr = memcpya(wptr, cur_snp_label, snp_label_len - 1);
       }
       *wptr++ = '\t';
       dxx = freqs[0];
-      wptr = double_g_writex(wptr, dxx, ' ');
-      wptr = double_g_writex(wptr, dxx, '\t');
+      wptr = dtoa_gx(dxx, ' ', wptr);
+      wptr = dtoa_gx(dxx, '\t', wptr);
       if (tags_or_haps) {
 	dxx = freqs[1];
-	wptr = double_g_writex(wptr, dxx, ' ');
-	wptr = double_g_writex(wptr, dxx, '\t');
-	wptr = double_g_writex(wptr, dprime, '\t');
+	wptr = dtoa_gx(dxx, ' ', wptr);
+	wptr = dtoa_gx(dxx, '\t', wptr);
+	wptr = dtoa_gx(dprime, '\t', wptr);
       }
       if (is_qt) {
-	wptr = double_g_writex(wptr, qt_var, '\t');
-	wptr = double_g_writex(wptr, qt_dom, '\n');
+	wptr = dtoa_gx(qt_var, '\t', wptr);
+	wptr = dtoa_gx(qt_dom, '\n', wptr);
       } else {
-	wptr = double_g_writex(wptr, het_odds, '\t');
-	wptr = double_g_writex(wptr, hom0_odds, '\n');
+	wptr = dtoa_gx(het_odds, '\t', wptr);
+	wptr = dtoa_gx(hom0_odds, '\n', wptr);
       }
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_simfreq)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_simfreq)) {
 	goto simulate_ret_WRITE_FAIL;
       }
       if (randomize_alleles) {
 	if (!simulate_12) {
 	  do {
-	    uii = sfmt_genrand_uint32(&sfmt);
+	    uii = sfmt_genrand_uint32(&g_sfmt);
 	  } while (uii >= 4294967184U); // largest multiple of 144 < 2^32
 	  uii = uii % 144U;
 	  ujj = uii / 12;
 	  uii -= ujj * 12;
 	} else {
-	  uii = sfmt_genrand_uint32(&sfmt) & 3;
+	  uii = sfmt_genrand_uint32(&g_sfmt) & 3;
 	  ujj = uii >> 1;
 	  uii &= 1;
 	}
@@ -11207,7 +11173,7 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
 	      ulkk = 0;
 	    }
 	    qt_vals[sample_idx] += qt_adj[ulkk];
-	    if (sfmt_genrand_uint32(&sfmt) < missing_thresh) {
+	    if (sfmt_genrand_uint32(&g_sfmt) < missing_thresh) {
 	      ulkk = 1;
 	    }
 	    ulii |= ulkk << ukk;
@@ -11269,7 +11235,7 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
 	    ulkk /= 4;
 	    ulkk += (ulkk + 1) >> 1;
 	    qt_vals[sample_idx] += qt_adj[ulkk];
-	    if (sfmt_genrand_uint32(&sfmt) < missing_thresh) {
+	    if (sfmt_genrand_uint32(&g_sfmt) < missing_thresh) {
 	      ulkk = 1;
 	    }
 	    ulii |= ulkk << ukk;
@@ -11311,16 +11277,16 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
 	}
       }
       if (popcount_longs(writebuf, sample_ctl2) < sample_ct) {
-	reverse_loadbuf((unsigned char*)writebuf, sample_ct);
+	reverse_loadbuf(sample_ct, (unsigned char*)writebuf);
 	cc = cur_alleles[0];
 	cur_alleles[0] = cur_alleles[1];
 	cur_alleles[1] = cc;
       }
-      wptr = &(tbuf[1]);
+      wptr = &(g_textbuf[1]);
       *wptr++ = '\t';
       if (cur_marker_ct > 1) {
 	wptr = memcpya(wptr, cur_snp_label, snp_label_len);
-	wptr = uint32_write(wptr, cur_marker_idx);
+	wptr = uint32toa(cur_marker_idx, wptr);
       } else {
 	wptr = memcpya(wptr, cur_snp_label, snp_label_len - 1);
       }
@@ -11328,12 +11294,12 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
 	wptr = memcpya(wptr, "_M", 2);
       }
       wptr = memcpyl3a(wptr, "\t0\t");
-      wptr = uint32_writex(wptr, marker_pos++, '\t');
+      wptr = uint32toa_x(marker_pos++, '\t', wptr);
       *wptr++ = cur_alleles[0];
       *wptr++ = '\t';
       *wptr++ = cur_alleles[1];
       *wptr++ = '\n';
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_txt)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_txt)) {
 	goto simulate_ret_WRITE_FAIL;
       }
       if (fwrite_checked((unsigned char*)writebuf, sample_ct4, outfile_bed)) {
@@ -11341,20 +11307,20 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
       }
       if (do_haps) {
 	if (popcount_longs(writebuf2, sample_ctl2) < sample_ct) {
-	  reverse_loadbuf((unsigned char*)writebuf2, sample_ct);
+	  reverse_loadbuf(sample_ct, (unsigned char*)writebuf2);
 	  cc = cur_alleles[2];
 	  cur_alleles[2] = cur_alleles[3];
 	  cur_alleles[3] = cc;
 	}
-	wptr = &(tbuf[2 + snp_label_len]);
-	wptr = uint32_write(wptr, cur_marker_idx);
+	wptr = &(g_textbuf[2 + snp_label_len]);
+	wptr = uint32toa(cur_marker_idx, wptr);
 	wptr = memcpya(wptr, "_M\t0\t", 5);
-	wptr = uint32_writex(wptr, marker_pos++, '\t');
+	wptr = uint32toa_x(marker_pos++, '\t', wptr);
 	*wptr++ = cur_alleles[2];
 	*wptr++ = '\t';
 	*wptr++ = cur_alleles[3];
 	*wptr++ = '\n';
-	if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_txt)) {
+	if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_txt)) {
 	  goto simulate_ret_WRITE_FAIL;
 	}
 	if (fwrite_checked((unsigned char*)writebuf2, sample_ct4, outfile_bed)) {
@@ -11381,10 +11347,10 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
     goto simulate_ret_WRITE_FAIL;
   }
   memcpy(outname_end, ".fam", 5);
-  if (fopen_checked(&outfile_txt, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_txt)) {
     goto simulate_ret_OPEN_FAIL;
   }
-  wptr = tbuf;
+  wptr = g_textbuf;
   if (name_prefix) {
     name_prefix_len = strlen(name_prefix);
     wptr = memcpyax(wptr, name_prefix, name_prefix_len, '-');
@@ -11401,12 +11367,12 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
     }
   }
   for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
-    wptr = uint32_writex(&(tbuf[uii]), sample_idx, ' ');
+    wptr = uint32toa_x(sample_idx, ' ', &(g_textbuf[uii]));
     if (name_prefix_len) {
       wptr = memcpyax(wptr, name_prefix, name_prefix_len, '-');
     }
     wptr = memcpyl3a(wptr, "per");
-    wptr = uint32_write(wptr, sample_idx);
+    wptr = uint32toa(sample_idx, wptr);
     wptr = memcpya(wptr, " 0 0 2 ", 7);
     if (is_qt) {
       if (sample_idx & 1) {
@@ -11414,7 +11380,7 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
       } else {
 	dzz = qt_vals[sample_idx] + dyy * rand_normal(&dxx);
       }
-      wptr = double_g_write(wptr, dzz);
+      wptr = dtoa_g(dzz, wptr);
     } else {
       if (sample_idx < case_ct) {
 	*wptr++ = '2';
@@ -11423,7 +11389,7 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
       }
     }
     *wptr++ = '\n';
-    if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_txt)) {
+    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_txt)) {
       goto simulate_ret_WRITE_FAIL;
     }
   }
@@ -11458,17 +11424,18 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
   fclose_cond(outfile_txt);
   fclose_cond(outfile_simfreq);
   fclose_cond(outfile_bed);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_allele_name, char*** allele_missing_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* recode_allele_reverse, char* recode_allele_extra) {
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* rafile = NULL;
   uint32_t missing_allele = 0;
   uint32_t marker_id_htable_size = get_id_htable_size(marker_ct);
   uintptr_t rae_size = 0;
   uintptr_t line_idx = 0;
-  uintptr_t topsize = 0;
+  uintptr_t cur_bigstack_left;
   uint32_t* marker_id_htable;
   char* bufptr;
   char* bufptr2;
@@ -11476,24 +11443,22 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
   uint32_t slen;
   uint32_t alen;
   uintptr_t marker_uidx;
-  if (fopen_checked(&rafile, recode_allele_name, "r")) {
+  if (fopen_checked(recode_allele_name, "r", &rafile)) {
     goto recode_allele_load_ret_OPEN_FAIL;
   }
-  marker_id_htable = (uint32_t*)top_alloc(&topsize, marker_id_htable_size * sizeof(int32_t));
-  if (!marker_id_htable) {
+  if (bigstack_end_alloc_ui(marker_id_htable_size, &marker_id_htable)) {
     goto recode_allele_load_ret_NOMEM;
   }
-  wkspace_left -= topsize;
-  retval = populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, marker_id_htable, marker_id_htable_size);
+  retval = populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, marker_id_htable_size, marker_id_htable);
   if (retval) {
-    wkspace_left += topsize;
     goto recode_allele_load_ret_1;
   }
   loadbuf[loadbuf_size - 1] = ' ';
+  cur_bigstack_left = bigstack_left();
   while (fgets(loadbuf, loadbuf_size, rafile)) {
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --recode-allele file is pathologically long.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --recode-allele file is pathologically long.\n", line_idx);
       goto recode_allele_load_ret_INVALID_FORMAT_3;
     }
     bufptr = skip_initial_spaces(loadbuf);
@@ -11503,7 +11468,7 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
     slen = strlen_se(bufptr);
     bufptr2 = skip_initial_spaces(&(bufptr[slen]));
     if (is_eoln_kns(*bufptr2)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --recode-allele file has fewer tokens than expected.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --recode-allele file has fewer tokens than expected.\n", line_idx);
       goto recode_allele_load_ret_INVALID_FORMAT_3;
     }
     alen = strlen_se(bufptr2);
@@ -11511,12 +11476,12 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
     if (marker_uidx != 0xffffffffU) {
       bufptr2[alen++] = '\0';
       if (!strcmp(bufptr2, marker_allele_ptrs[2 * marker_uidx])) {
-	CLEAR_BIT(recode_allele_reverse, marker_uidx);
+	CLEAR_BIT(marker_uidx, recode_allele_reverse);
       } else if (!strcmp(bufptr2, marker_allele_ptrs[2 * marker_uidx + 1])) {
-	SET_BIT(recode_allele_reverse, marker_uidx);
+	SET_BIT(marker_uidx, recode_allele_reverse);
       } else {
-	if (rae_size + alen > wkspace_left) {
-	  goto recode_allele_load_ret_NOMEM2;
+	if (rae_size + alen > cur_bigstack_left) {
+	  goto recode_allele_load_ret_NOMEM;
 	}
 	missing_allele = 1;
 	(*allele_missing_ptr)[marker_uidx] = &(recode_allele_extra[rae_size]);
@@ -11525,13 +11490,10 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
       }
     }
   }
-  wkspace_left += topsize;
   if (!feof(rafile)) {
     goto recode_allele_load_ret_READ_FAIL;
   }
   while (0) {
-  recode_allele_load_ret_NOMEM2:
-    wkspace_left += topsize;
   recode_allele_load_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -11542,16 +11504,16 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
     retval = RET_READ_FAIL;
     break;
   recode_allele_load_ret_INVALID_FORMAT_3:
-    wkspace_left += topsize;
     logerrprintb();
     retval = RET_INVALID_FORMAT;
   }
  recode_allele_load_ret_1:
   fclose_cond(rafile);
+  bigstack_end_reset(bigstack_end_mark);
   if (missing_allele) {
-    recode_allele_extra = (char*)wkspace_alloc(rae_size);
+    recode_allele_extra = (char*)bigstack_alloc(rae_size);
   } else {
-    wkspace_reset(*allele_missing_ptr);
+    bigstack_reset(*allele_missing_ptr);
     *allele_missing_ptr = NULL;
   }
   return retval;
@@ -11586,11 +11548,11 @@ uint32_t recode_load_to(unsigned char* loadbuf, FILE* bedfile, uintptr_t bed_off
       return 1;
     }
     while (1) {
-      next_set_ul_ck(marker_reverse, &marker_uidx, marker_uidx_stop);
+      next_set_ul_ck(marker_reverse, marker_uidx_stop, &marker_uidx);
       if (marker_uidx == marker_uidx_stop) {
 	break;
       }
-      reverse_loadbuf(&(loadbuf[(marker_uidx - marker_uidx_start) * unfiltered_sample_ct4]), unfiltered_sample_ct);
+      reverse_loadbuf(unfiltered_sample_ct, &(loadbuf[(marker_uidx - marker_uidx_start) * unfiltered_sample_ct4]));
       marker_uidx++;
     }
     loadbuf = &(loadbuf[ulii]);
@@ -11618,7 +11580,7 @@ static inline int32_t recode_write_first_cols(FILE* outfile, uintptr_t sample_ui
   } else if (pheno_c) {
     putc('1' + IS_SET(pheno_c, sample_uidx), outfile);
   } else {
-    cptr = double_g_write(wbuf, pheno_d[sample_uidx]);
+    cptr = dtoa_g(pheno_d[sample_uidx], wbuf);
     fwrite(wbuf, 1, cptr - wbuf, outfile);
   }
   if (putc_checked(delimiter, outfile)) {
@@ -11740,13 +11702,13 @@ int32_t recode_beagle_new_chrom(char* outname, char* outname_end2, uintptr_t* ma
     return 0;
   }
 
-  wbufptr = chrom_name_write(outname_end2, chrom_info_ptr, chrom_idx);
+  wbufptr = chrom_name_write(chrom_info_ptr, chrom_idx, outname_end2);
   memcpy(wbufptr, ".dat", 5);
-  if (fopen_checked(datfile_ptr, outname, "w")) {
+  if (fopen_checked(outname, "w", datfile_ptr)) {
     goto recode_beagle_new_chrom_ret_OPEN_FAIL;
   }
   memcpy(wbufptr, ".map", 5);
-  if (fopen_checked(mapfile_ptr, outname, "w")) {
+  if (fopen_checked(outname, "w", mapfile_ptr)) {
     goto recode_beagle_new_chrom_ret_OPEN_FAIL;
   }
   if (fwrite_checked(dat_header, dat_header_len, *datfile_ptr)) {
@@ -11754,7 +11716,7 @@ int32_t recode_beagle_new_chrom(char* outname, char* outname_end2, uintptr_t* ma
   }
   *wbufptr = '\0';
   LOGPREPRINTFWW("%s.dat + %s.map created.\n", outname, outname);
-  logstr(logbuf);
+  logstr(g_logbuf);
   while (0) {
   recode_beagle_new_chrom_ret_OPEN_FAIL:
     retval = RET_OPEN_FAIL;
@@ -11773,26 +11735,26 @@ int32_t open_and_write_fastphase_header(FILE** outfile_ptr, char* outname, uintp
   char wbuf[16];
   char* wptr;
   uint32_t marker_idx;
-  if (fopen_checked(outfile_ptr, outname, "w")) {
+  if (fopen_checked(outname, "w", outfile_ptr)) {
     return RET_OPEN_FAIL;
   }
-  wptr = uint32_writex(wbuf, sample_ct, '\n');
+  wptr = uint32toa_x(sample_ct, '\n', wbuf);
   if (fwrite_checked(wbuf, wptr - wbuf, *outfile_ptr)) {
     return RET_WRITE_FAIL;
   }
-  wptr = uint32_write(wbuf, chrom_size);
+  wptr = uint32toa(chrom_size, wbuf);
   fwrite(wbuf, 1, wptr - wbuf, *outfile_ptr);
   fputs("\nP ", *outfile_ptr);
   for (marker_idx = 0; marker_idx < chrom_size; marker_uidx++, marker_idx++) {
     next_unset_unsafe_ck(marker_exclude, &marker_uidx);
-    wptr = uint32_writex(wbuf, marker_pos[marker_uidx], ' ');
+    wptr = uint32toa_x(marker_pos[marker_uidx], ' ', wbuf);
     fwrite(wbuf, 1, wptr - wbuf, *outfile_ptr);
   }
   if (putc_checked('\n', *outfile_ptr)) {
     return RET_WRITE_FAIL;
   }
   LOGPREPRINTFWW("%s created.\n", outname);
-  logstr(logbuf);
+  logstr(g_logbuf);
   return 0;
 }
 
@@ -11898,7 +11860,7 @@ uint32_t write_haploview_map(FILE* outfile, uintptr_t* marker_exclude, uintptr_t
     next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx_start);
     fputs(&(marker_ids[marker_uidx_start * max_marker_id_len]), outfile);
     putc('\t', outfile);
-    wptr = uint32_writex(wbuf, marker_pos[marker_uidx_start], '\n');
+    wptr = uint32toa_x(marker_pos[marker_uidx_start], '\n', wbuf);
     fwrite(wbuf, 1, wptr - wbuf, outfile);
   }
   if (ferror(outfile)) {
@@ -11962,16 +11924,16 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   FILE* outfile2 = NULL;
   BGZF* bgz_outfile = NULL;
   char* pzwritep = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   uintptr_t final_mask = get_final_mask(sample_ct);
   uintptr_t cur_final_mask = 0;
   uintptr_t sample_ct_y = 0;
   uintptr_t cur_sample_ct = 0;
-  uintptr_t topsize = 0;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   char delimiter = (recode_modifier & RECODE_TAB)? '\t' : ' ';
   uintptr_t* recode_allele_reverse = NULL;
   uintptr_t* sample_exclude_y = NULL;
@@ -12081,7 +12043,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     if (recode_modifier & (RECODE_23 | RECODE_A_TRANSPOSE | RECODE_BEAGLE | RECODE_BEAGLE_NOMAP | RECODE_BIMBAM | RECODE_BIMBAM_1CHR | RECODE_LGEN | RECODE_LGEN_REF | RECODE_LIST | RECODE_OXFORD | RECODE_RLIST | RECODE_TRANSPOSE | RECODE_VCF)) {
       // SNP-major and no need for sample_uidx in inner loop, so we can use
       // collapsed representation
-      if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_ct, hh_exists | ((recode_modifier & RECODE_VCF)? XMHH_EXISTS : 0), 0, sample_exclude, sex_male, &sample_include2, &sample_male_include2)) {
+      if (alloc_collapsed_haploid_filters(sample_exclude, sex_male, unfiltered_sample_ct, sample_ct, hh_exists | ((recode_modifier & RECODE_VCF)? XMHH_EXISTS : 0), 0, &sample_include2, &sample_male_include2)) {
 	goto recode_ret_NOMEM;
       }
     } else {
@@ -12098,7 +12060,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       logerrprint("Error: The --recode '01' modifier normally has to be used with a nonzero\n--output-missing-genotype setting.\n");
       goto recode_ret_INVALID_CMDLINE;
     }
-    mk_allele_ptrs = (char**)wkspace_alloc(unfiltered_marker_ct * 2 * sizeof(intptr_t));
+    mk_allele_ptrs = (char**)bigstack_alloc(unfiltered_marker_ct * 2 * sizeof(intptr_t));
     if (!mk_allele_ptrs) {
       goto recode_ret_NOMEM;
     }
@@ -12110,10 +12072,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     max_marker_allele_len = 2;
   }
   if (recode_modifier & (RECODE_A_TRANSPOSE | RECODE_BEAGLE | RECODE_BEAGLE_NOMAP | RECODE_BIMBAM | RECODE_BIMBAM_1CHR | RECODE_LGEN | RECODE_LGEN_REF | RECODE_LIST | RECODE_OXFORD | RECODE_RLIST | RECODE_TRANSPOSE | RECODE_VCF)) {
-    if (wkspace_alloc_ul_checked(&loadbuf_collapsed, sample_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(sample_ctv2, &loadbuf_collapsed)) {
       goto recode_ret_NOMEM;
     }
-    loadbuf_collapsed_end = &(loadbuf_collapsed[(sample_ct + (BITCT2 - 1)) / BITCT2]);
+    loadbuf_collapsed_end = &(loadbuf_collapsed[QUATERCT_TO_WORDCT(sample_ct)]);
     if (recode_modifier & (RECODE_LGEN | RECODE_LGEN_REF | RECODE_LIST | RECODE_RLIST)) {
       // need to collapse sample_ids to remove need for sample_uidx in inner
       // loop
@@ -12122,34 +12084,33 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
         goto recode_ret_NOMEM;
       }
       if (omit_nonmale_y) {
-	if (wkspace_alloc_ul_checked(&sample_exclude_y, unfiltered_sample_ctl * sizeof(intptr_t))) {
+	if (bigstack_alloc_ul(unfiltered_sample_ctl, &sample_exclude_y)) {
 	  goto recode_ret_NOMEM;
 	}
 	memcpy(sample_exclude_y, sample_exclude, unfiltered_sample_ctl * sizeof(intptr_t));
-	bitfield_ornot(sample_exclude_y, sex_male, unfiltered_sample_ctl);
-	zero_trailing_bits(sample_exclude_y, unfiltered_sample_ct);
+	bitvec_ornot(sex_male, unfiltered_sample_ctl, sample_exclude_y);
+	zero_trailing_bits(unfiltered_sample_ct, sample_exclude_y);
 	sample_ct_y = unfiltered_sample_ct - popcount_longs(sample_exclude_y, unfiltered_sample_ctl);
-        uii = 2 * ((sample_ct_y + (BITCT - 1)) / BITCT);
-	if (wkspace_alloc_ul_checked(&sample_include2_y, uii * sizeof(intptr_t)) ||
-            wkspace_alloc_ul_checked(&sample_male_include2_y, uii * sizeof(intptr_t))) {
+        uii = QUATERCT_TO_ALIGNED_WORDCT(sample_ct_y);
+	if (bigstack_alloc_ul(uii, &sample_include2_y) ||
+            bigstack_alloc_ul(uii, &sample_male_include2_y)) {
 	  goto recode_ret_NOMEM;
 	}
-	fill_vec_55(sample_include2_y, sample_ct_y);
-	fill_vec_55(sample_male_include2_y, sample_ct_y);
+	fill_quatervec_55(sample_ct_y, sample_include2_y);
+	fill_quatervec_55(sample_ct_y, sample_male_include2_y);
 	sample_ids_collapsed_y = alloc_and_init_collapsed_arr(sample_ids, max_sample_id_len, unfiltered_sample_ct, sample_exclude_y, sample_ct_y, (delimiter == '\t'));
       }
     }
   }
   if (recode_modifier & RECODE_VCF) {
-    if (wkspace_alloc_c_checked(&writebuf, sample_ct * 4)) {
+    if (bigstack_alloc_c(sample_ct * 4, &writebuf)) {
       goto recode_ret_NOMEM;
     }
   } else if (recode_modifier & RECODE_OXFORD) {
-    if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + sample_ct * 6 + 2 * max_marker_allele_len + MAXLINELEN) ||
-        wkspace_alloc_ui_checked(&missing_cts, sample_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_uc(PIGZ_BLOCK_SIZE + sample_ct * 6 + 2 * max_marker_allele_len + MAXLINELEN, &overflow_buf) ||
+        bigstack_calloc_ui(sample_ct, &missing_cts)) {
       goto recode_ret_NOMEM;
     }
-    fill_uint_zero(missing_cts, sample_ct);
   } else if (recode_modifier & (RECODE_BEAGLE | RECODE_BEAGLE_NOMAP)) {
     // common header:
     // "P FID " + ... + "\n"
@@ -12161,8 +12122,8 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     // per-marker:
     //   "M " + [marker name] + " " + ... + "\n"
     ulii = strlen(output_missing_pheno);
-    if (wkspace_alloc_c_checked(&writebuf, 2 * ulii + 2) ||
-        wkspace_alloc_c_checked(&writebuf2, 21 + sample_ct * (2 * max_sample_id_len + 64))) {
+    if (bigstack_alloc_c(2 * ulii + 2, &writebuf) ||
+        bigstack_alloc_c(21 + sample_ct * (2 * max_sample_id_len + 64), &writebuf2)) {
       goto recode_ret_NOMEM;
     }
     wbufptr = memcpyax(writebuf, output_missing_pheno, ulii, ' ');
@@ -12209,7 +12170,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
 	next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
         if (IS_SET(pheno_nm, sample_uidx)) {
-	  cptr = double_g_writex(wbufptr, pheno_d[sample_uidx], ' ');
+	  cptr = dtoa_gx(pheno_d[sample_uidx], ' ', wbufptr);
           wbufptr = memcpya(cptr, wbufptr, (uintptr_t)(cptr - wbufptr));
 	} else {
 	  wbufptr = memcpya(wbufptr, writebuf, ulii);
@@ -12219,10 +12180,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     *wbufptr++ = '\n';
     // free unused space, and save header length
     header_len = (uintptr_t)(wbufptr - writebuf2);
-    wkspace_shrink_top(writebuf2, header_len);
+    bigstack_shrink_top(writebuf2, header_len);
     cmalen[1] = 4;
     ulii = 2 * max_marker_allele_len;
-    if (wkspace_alloc_c_checked(&cur_mk_allelesx_buf, 4 * max_marker_allele_len)) {
+    if (bigstack_alloc_c(4 * max_marker_allele_len, &cur_mk_allelesx_buf)) {
       goto recode_ret_NOMEM;
     }
     cur_mk_allelesx[0] = cur_mk_allelesx_buf;
@@ -12255,8 +12216,8 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       }
     }
     // +1 because memcpyl3a() copies an extra character
-    if (wkspace_alloc_c_checked(&writebuf, 3 * sample_ct + 1) ||
-        wkspace_alloc_c_checked(&writebuf2, 32)) {
+    if (bigstack_alloc_c(3 * sample_ct + 1, &writebuf) ||
+        bigstack_alloc_c(32, &writebuf2)) {
       goto recode_ret_NOMEM;
     }
   } else if (recode_modifier & (RECODE_FASTPHASE | RECODE_FASTPHASE_1CHR)) {
@@ -12270,7 +12231,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_INVALID_CMDLINE;
     }
     if (recode_012) {
-      if (wkspace_alloc_c_checked(&writebuf3, 8)) {
+      if (bigstack_alloc_c(8, &writebuf3)) {
         goto recode_ret_NOMEM;
       }
       if (recode_modifier & RECODE_01) {
@@ -12279,20 +12240,20 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	memcpy(writebuf3, "1?121?22", 8);
       }
     } else {
-      if (wkspace_alloc_c_checked(&writebuf3, max_chrom_size * 2)) {
+      if (bigstack_alloc_c(max_chrom_size * 2, &writebuf3)) {
 	goto recode_ret_NOMEM;
       }
     }
-    if (wkspace_left < ((uint64_t)unfiltered_sample_ct4) * max_chrom_size + 2 * ((max_chrom_size + 63) & (~(63 * ONELU)))) {
+    if (bigstack_left() < ((uint64_t)unfiltered_sample_ct4) * max_chrom_size + 2 * round_up_pow2(max_chrom_size, CACHELINE)) {
       goto recode_ret_NO_MULTIPASS_YET;
     }
-    if (wkspace_alloc_c_checked(&writebuf, max_chrom_size) ||
-        wkspace_alloc_c_checked(&writebuf2, max_chrom_size)) {
+    if (bigstack_alloc_c(max_chrom_size, &writebuf) ||
+        bigstack_alloc_c(max_chrom_size, &writebuf2)) {
       goto recode_ret_NOMEM;
     }
   } else if (recode_modifier & (RECODE_LGEN | RECODE_LGEN_REF)) {
     ulii = 1 + 2 * max_marker_allele_len + max_marker_id_len + max_sample_id_len;
-    if (wkspace_alloc_c_checked(&writebuf, 4 * ulii)) {
+    if (bigstack_alloc_c(4 * ulii, &writebuf)) {
       goto recode_ret_NOMEM;
     }
   } else if (recode_modifier & (RECODE_LIST | RECODE_RLIST)) {
@@ -12315,7 +12276,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     if (rlist) {
       ulii += 2;
     }
-    if (wkspace_alloc_c_checked(&writebuf, ulii * 4)) {
+    if (bigstack_alloc_c(ulii * 4, &writebuf)) {
       goto recode_ret_NOMEM;
     }
     writebufl[0] = writebuf;
@@ -12331,7 +12292,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_INVALID_FORMAT;
     }
     // chromosome code, marker position, single-char alleles
-    if (wkspace_alloc_c_checked(&writebuf, 32)) {
+    if (bigstack_alloc_c(32, &writebuf)) {
       goto recode_ret_NOMEM;
     }
   } else if (recode_modifier & RECODE_STRUCTURE) {
@@ -12347,7 +12308,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	}
       }
     }
-    if (wkspace_alloc_c_checked(&writebuf3, max_fid_len * sample_ct)) {
+    if (bigstack_alloc_c(max_fid_len * sample_ct, &writebuf3)) {
       goto recode_ret_NOMEM;
     }
     for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
@@ -12372,10 +12333,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	fid_ct++;
       }
     }
-    wkspace_shrink_top(writebuf3, fid_ct * max_fid_len);
-    if (wkspace_alloc_ui_checked(&fid_map, fid_ct * sizeof(int32_t)) ||
-        wkspace_alloc_c_checked(&writebuf, 4 * marker_ct) ||
-        wkspace_alloc_c_checked(&writebuf2, 16)) {
+    bigstack_shrink_top(writebuf3, fid_ct * max_fid_len);
+    if (bigstack_calloc_ui(fid_ct, &fid_map) ||
+        bigstack_alloc_c(4 * marker_ct, &writebuf) ||
+        bigstack_alloc_c(16, &writebuf2)) {
       goto recode_ret_NOMEM;
     }
     fill_uint_zero(fid_map, fid_ct);
@@ -12384,12 +12345,12 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       // format is new to PLINK 1.9, so use tab delimiter unless 'spacex'
       // modifier present
       delimiter = ((recode_modifier & (RECODE_TAB | RECODE_DELIMX)) == RECODE_DELIMX)? ' ' : '\t';
-      if (wkspace_alloc_c_checked(&writebuf, sample_ct * 3 + 1)) {
+      if (bigstack_alloc_c(sample_ct * 3 + 1, &writebuf)) {
         goto recode_ret_NOMEM;
       }
     } else {
       if (recode_modifier & RECODE_AD) {
-	if (wkspace_alloc_c_checked(&writebuf2, 32)) {
+	if (bigstack_alloc_c(32, &writebuf2)) {
 	  goto recode_ret_NOMEM;
 	}
 	memcpy(writebuf2, "2 0     1 1 0 0 NA NA", 21);
@@ -12433,7 +12394,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 		alen2 = strlen(mk_allele_ptrs[marker_uidx * 2 + 1]);
 		uljj += MAXV(alen, alen2) + 1;
 		marker_uidx++;
-		next_unset_ul_ck(marker_exclude, &marker_uidx, chrom_end);
+		next_unset_ul_ck(marker_exclude, chrom_end, &marker_uidx);
 	      }
 	      if (uljj > ulii) {
 		ulii = uljj;
@@ -12458,7 +12419,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  }
 	}
 	if (recode_modifier & (RECODE_A | RECODE_AD | RECODE_COMPOUND)) {
-	  if (wkspace_alloc_c_checked(&writebuf, max_chrom_size * ulii)) {
+	  if (bigstack_alloc_c(max_chrom_size * ulii, &writebuf)) {
 	    goto recode_ret_NOMEM;
 	  }
 	  if ((recode_modifier & RECODE_COMPOUND) && max_chrom_size) {
@@ -12467,31 +12428,29 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  }
 	} else {
 	  // --recode, --recode HV
-	  if (wkspace_alloc_c_checked(&writebuf, ulii)) {
+	  if (bigstack_alloc_c(ulii, &writebuf)) {
 	    goto recode_ret_NOMEM;
 	  }
 	}
       }
     }
     if (recode_allele_name) {
-      if (wkspace_alloc_ul_checked(&recode_allele_reverse, unfiltered_marker_ctl * sizeof(intptr_t))) {
+      if (bigstack_calloc_ul(unfiltered_marker_ctl, &recode_allele_reverse)) {
 	goto recode_ret_NOMEM;
       }
       // this indicates when we want to report the A2 allele instead of the
       // A1.  (potential double negatives, bleah)
-      fill_ulong_zero(recode_allele_reverse, unfiltered_marker_ctl);
-      allele_missing = (char**)wkspace_alloc(unfiltered_marker_ct * sizeof(char**));
+      allele_missing = (char**)bigstack_alloc(unfiltered_marker_ct * sizeof(char**));
       if (!allele_missing) {
 	goto recode_ret_NOMEM;
       }
-      recode_allele_extra = (char*)wkspace_base;
+      recode_allele_extra = (char*)g_bigstack_base;
       fill_ulong_zero((uintptr_t*)allele_missing, unfiltered_marker_ct);
-      ulii = (max_marker_allele_len + MAXLINELEN + 15) & (~(15 * ONELU));
-      loadbuf = (unsigned char*)top_alloc(&topsize, ulii);
+      ulii = round_up_pow2(max_marker_allele_len + MAXLINELEN, END_ALLOC_CHUNK);
+      loadbuf = (unsigned char*)bigstack_end_alloc_presized(ulii);
       if (!loadbuf) {
 	goto recode_ret_NOMEM;
       }
-      wkspace_left -= topsize;
       // When '12' and 'A'/'AD' are simultaneously present, most sensible
       // behavior is to match against real allele IDs and just apply '12'
       // to the output header line.  If that's not what the user wants,
@@ -12499,8 +12458,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       // (--recode12 simply overrode --recodeA/--recodeAD in PLINK 1.07; no
       // need to replicate that.) 
       retval = recode_allele_load((char*)loadbuf, ulii, recode_allele_name, &allele_missing, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, recode_allele_reverse, recode_allele_extra);
-      wkspace_left += topsize;
-      topsize = 0;
+      bigstack_end_reset(bigstack_end_mark);
       if (retval) {
 	goto recode_ret_1;
       }
@@ -12508,7 +12466,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   }
 
   if (!(recode_modifier & (RECODE_A | RECODE_AD | RECODE_BEAGLE | RECODE_BEAGLE_NOMAP | RECODE_FASTPHASE | RECODE_FASTPHASE_1CHR | RECODE_LGEN | RECODE_LGEN_REF | RECODE_OXFORD | RECODE_VCF))) {
-    if (wkspace_alloc_c_checked(&cur_mk_allelesx_buf, 8 * max_marker_allele_len)) {
+    if (bigstack_alloc_c(8 * max_marker_allele_len, &cur_mk_allelesx_buf)) {
       goto recode_ret_NOMEM;
     }
     cur_mk_allelesx[0] = cur_mk_allelesx_buf;
@@ -12516,12 +12474,12 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     cur_mk_allelesx[2] = &(cur_mk_allelesx_buf[max_marker_allele_len * 4]);
     cur_mk_allelesx[3] = &(cur_mk_allelesx_buf[max_marker_allele_len * 6]);
   } else if (recode_modifier & RECODE_VCF) {
-    if (wkspace_alloc_c_checked(&cur_mk_allelesx_buf, 16)) {
+    if (bigstack_alloc_c(16, &cur_mk_allelesx_buf)) {
       goto recode_ret_NOMEM;
     }
     memcpy(cur_mk_allelesx_buf, "\t1/1\t./.\t0/1\t0/0", 16);
   } else if (recode_modifier & RECODE_OXFORD) {
-    if (wkspace_alloc_c_checked(&cur_mk_allelesx_buf, 32)) {
+    if (bigstack_alloc_c(32, &cur_mk_allelesx_buf)) {
       goto recode_ret_NOMEM;
     }
     memcpy(cur_mk_allelesx_buf, " 1 0 0   0 0 0   0 1 0   0 0 1", 30);
@@ -12531,10 +12489,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   }
   marker_uidx = 0;
   marker_idx = 0;
-  if (wkspace_left < unfiltered_sample_ct4) {
+  if (bigstack_left() < unfiltered_sample_ct4) {
     goto recode_ret_NOMEM;
   }
-  loadbuf = wkspace_base;
+  loadbuf = g_bigstack_base;
   chrom_fo_idx = 0;
   if (unfiltered_marker_ct) {
     refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
@@ -12548,7 +12506,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   }
   if (recode_modifier & RECODE_TRANSPOSE) {
     strcpy(outname_end, ".tped");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     *outname_end = '\0';
@@ -12561,7 +12519,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     cur_mk_allelesx[1][3] = output_missing_geno;
     cmalen[1] = 4;
 
-    cptr = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+    cptr = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
     *cptr++ = delimiter;
     for (pct = 1; pct <= 100; pct++) {
       loop_end = (((uint64_t)pct) * marker_ct) / 100;
@@ -12576,23 +12534,23 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  chrom_fo_idx++;
 	  refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
 	  chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-	  cptr = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+	  cptr = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
 	  *cptr++ = delimiter;
 	}
 	wbufptr = strcpyax(cptr, &(marker_ids[marker_uidx * max_marker_id_len]), delimiter);
 	if (!marker_cms) {
 	  *wbufptr++ = '0';
 	} else {
-	  wbufptr = double_g_write(wbufptr, marker_cms[marker_uidx]);
+	  wbufptr = dtoa_g(marker_cms[marker_uidx], wbufptr);
 	}
 	*wbufptr++ = delimiter;
-	wbufptr = uint32_write(wbufptr, marker_pos[marker_uidx]);
-        if (fwrite_checked(tbuf, wbufptr - tbuf, outfile)) {
+	wbufptr = uint32toa(marker_pos[marker_uidx], wbufptr);
+        if (fwrite_checked(g_textbuf, wbufptr - g_textbuf, outfile)) {
 	  goto recode_ret_WRITE_FAIL;
 	}
 
 	if (sample_ct) {
-	  if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	  if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	    goto recode_ret_READ_FAIL;
 	  }
 	  if (is_haploid && set_hh_missing) {
@@ -12632,7 +12590,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     }
   } else if (recode_modifier & RECODE_A_TRANSPOSE) {
     strcpy(outname_end, ".traw");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     fputs((delimiter == '\t')? "CHR\tSNP\t(C)M\tPOS\tCOUNTED\tALT" : "CHR SNP (C)M POS COUNTED ALT", outfile);
@@ -12658,7 +12616,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     LOGPRINTFWW5("--recode A-transpose to %s ... ", outname);
     fputs("0%", stdout);
     fflush(stdout);
-    cptr = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+    cptr = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
     *cptr++ = delimiter;
     for (pct = 1; pct <= 100; pct++) {
       loop_end = (((uint64_t)pct) * marker_ct) / 100;
@@ -12673,18 +12631,18 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  chrom_fo_idx++;
 	  refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
 	  chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-	  cptr = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+	  cptr = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
 	  *cptr++ = delimiter;
 	}
 	wbufptr = strcpyax(cptr, &(marker_ids[marker_uidx * max_marker_id_len]), delimiter);
 	if (!marker_cms) {
 	  *wbufptr++ = '0';
 	} else {
-	  wbufptr = double_g_write(wbufptr, marker_cms[marker_uidx]);
+	  wbufptr = dtoa_g(marker_cms[marker_uidx], wbufptr);
 	}
 	*wbufptr++ = delimiter;
-	wbufptr = uint32_writex(wbufptr, marker_pos[marker_uidx], delimiter);
-        if (fwrite_checked(tbuf, wbufptr - tbuf, outfile)) {
+	wbufptr = uint32toa_x(marker_pos[marker_uidx], delimiter, wbufptr);
+        if (fwrite_checked(g_textbuf, wbufptr - g_textbuf, outfile)) {
 	  goto recode_ret_WRITE_FAIL;
 	}
 	uii = IS_NONNULL_AND_SET(recode_allele_reverse, marker_uidx);
@@ -12700,7 +12658,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	fputs(mk_allele_ptrs[2 * marker_uidx + 1 - uii], outfile);
 	wbufptr = writebuf;
 	if (sample_ct) {
-	  if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, uii ^ IS_SET(marker_reverse, marker_uidx))) {
+	  if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, uii ^ IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	    goto recode_ret_READ_FAIL;
 	  }
 	  if (is_haploid && set_hh_missing) {
@@ -12751,7 +12709,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   } else if (recode_modifier & RECODE_VCF) {
     if (!output_bgz) {
       memcpy(outname_end, ".vcf", 5);
-      if (fopen_checked(&outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile)) {
 	goto recode_ret_OPEN_FAIL;
       }
     } else {
@@ -12766,23 +12724,23 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       }
 #endif
     }
-    wbufptr = memcpya(tbuf, "##fileformat=VCFv4.2\n##fileDate=", 32);
+    wbufptr = memcpya(g_textbuf, "##fileformat=VCFv4.2\n##fileDate=", 32);
     time(&rawtime);
     loctime = localtime(&rawtime);
     wbufptr += strftime(wbufptr, MAXLINELEN, "%Y%m%d", loctime);
     wbufptr = memcpya(wbufptr, "\n##source=PLINKv1.90\n", 21);
     uii = 0; // '0' written already?
-    if (flexbwrite_checked(tbuf, wbufptr - tbuf, output_bgz, outfile, bgz_outfile)) {
+    if (flexbwrite_checked(g_textbuf, wbufptr - g_textbuf, output_bgz, outfile, bgz_outfile)) {
       goto recode_ret_WRITE_FAIL;
     }
-    memcpy(tbuf, "##contig=<ID=", 13);
+    memcpy(g_textbuf, "##contig=<ID=", 13);
     for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
       chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
       if (!IS_SET(chrom_info_ptr->chrom_mask, chrom_idx)) {
 	continue;
       }
-      cptr = chrom_name_write(&(tbuf[13]), chrom_info_ptr, chrom_idx);
-      if ((tbuf[13] == '0') && (cptr == &(tbuf[14]))) {
+      cptr = chrom_name_write(chrom_info_ptr, chrom_idx, &(g_textbuf[13]));
+      if ((g_textbuf[13] == '0') && (cptr == &(g_textbuf[14]))) {
 	if (uii) {
 	  continue;
 	}
@@ -12790,19 +12748,19 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	cptr = memcpya(cptr, ",length=2147483645", 18);
       } else {
 	*cptr = '\0';
-	if (strchr(&(tbuf[13]), ':')) {
+	if (strchr(&(g_textbuf[13]), ':')) {
 	  logerrprint("Error: VCF chromosome codes may not include the ':' character.\n");
 	  goto recode_ret_INVALID_FORMAT;
 	}
         cptr = memcpya(cptr, ",length=", 8);
 	if (!(map_is_unsorted & UNSORTED_BP)) {
-	  cptr = uint32_write(cptr, marker_pos[chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - 1] + 1);
+	  cptr = uint32toa(marker_pos[chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - 1] + 1, cptr);
 	} else {
 	  cptr = memcpya(cptr, "2147483645", 10); // unknown
 	}
       }
       cptr = memcpya(cptr, ">\n", 2);
-      if (flexbwrite_checked(tbuf, cptr - tbuf, output_bgz, outfile, bgz_outfile)) {
+      if (flexbwrite_checked(g_textbuf, cptr - g_textbuf, output_bgz, outfile, bgz_outfile)) {
 	goto recode_ret_WRITE_FAIL;
       }
     }
@@ -12855,7 +12813,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     LOGPRINTFWW5("--recode vcf%s%s to %s ... ", vcf_not_iid? (vcf_not_fid? "" : "-fid") : "-iid", output_bgz? " bgz" : "", outname);
     fputs("0%", stdout);
     fflush(stdout);
-    tbuf[0] = '\n';
+    g_textbuf[0] = '\n';
     if (((!hh_exists) || set_hh_missing) && is_haploid && (!is_x)) {
       uii = 2;
     } else {
@@ -12880,11 +12838,11 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	    uii = 4;
 	  }
 	}
-	wbufptr = chrom_name_write(&(tbuf[1]), chrom_info_ptr, chrom_idx);
+	wbufptr = chrom_name_write(chrom_info_ptr, chrom_idx, &(g_textbuf[1]));
 	*wbufptr++ = '\t';
-	wbufptr = uint32_writex(wbufptr, marker_pos[marker_uidx], '\t');
+	wbufptr = uint32toa_x(marker_pos[marker_uidx], '\t', wbufptr);
 	wbufptr = strcpyax(wbufptr, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
-	if (flexbwrite_checked(tbuf, wbufptr - tbuf, output_bgz, outfile, bgz_outfile)) {
+	if (flexbwrite_checked(g_textbuf, wbufptr - g_textbuf, output_bgz, outfile, bgz_outfile)) {
 	  goto recode_ret_WRITE_FAIL;
 	}
 	cptr = mk_allele_ptrs[2 * marker_uidx + 1];
@@ -12905,7 +12863,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	}
 
 	if (sample_ct) {
-	  if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	  if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	    goto recode_ret_READ_FAIL;
 	  }
 	  if (is_haploid && set_hh_missing) {
@@ -13026,14 +12984,14 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  // not clear from documentation whether anything special should be
 	  // done for Y/haploid chromosomes
 	}
-	pzwritep = chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx);
+	pzwritep = chrom_name_write(chrom_info_ptr, chrom_idx, pzwritep);
 	*pzwritep++ = ' ';
 	pzwritep = strcpyax(pzwritep, &(marker_ids[marker_uidx * max_marker_id_len]), ' ');
-	pzwritep = uint32_writex(pzwritep, marker_pos[marker_uidx], ' ');
+	pzwritep = uint32toa_x(marker_pos[marker_uidx], ' ', pzwritep);
 	pzwritep = strcpyax(pzwritep, mk_allele_ptrs[2 * marker_uidx], ' ');
 	pzwritep = strcpya(pzwritep, mk_allele_ptrs[2 * marker_uidx + 1]);
 	if (sample_ct) {
-	  if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	  if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	    goto recode_ret_READ_FAIL;
 	  }
 	  if (is_haploid && set_hh_missing) {
@@ -13079,7 +13037,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_WRITE_FAIL;
     }
     memcpy(outname_end, ".sample", 8);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     if (fputs_checked(
@@ -13095,14 +13053,14 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
       cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
       aptr = (char*)memchr(cptr, '\t', max_sample_id_len);
-      wbufptr = memcpyax(tbuf, cptr, aptr - cptr, ' ');
+      wbufptr = memcpyax(g_textbuf, cptr, aptr - cptr, ' ');
       wbufptr = strcpyax(wbufptr, &(aptr[1]), ' ');
-      wbufptr = double_g_writex(wbufptr, ((double)((int32_t)missing_cts[sample_idx])) * dxx, ' ');
+      wbufptr = dtoa_gx(((double)((int32_t)missing_cts[sample_idx])) * dxx, ' ', wbufptr);
       *wbufptr++ = sexchar(sex_nm, sex_male, sample_uidx);
       *wbufptr++ = ' ';
       if (IS_SET(pheno_nm, sample_uidx)) {
         if (pheno_d) {
-          wbufptr = double_g_write(wbufptr, pheno_d[sample_uidx]);
+          wbufptr = dtoa_g(pheno_d[sample_uidx], wbufptr);
         } else {
           *wbufptr++ = '0' + IS_SET(pheno_c, sample_uidx);
 	}
@@ -13110,13 +13068,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	wbufptr = memcpya(wbufptr, "NA", 2);
       }
       *wbufptr++ = '\n';
-      if (fwrite_checked(tbuf, wbufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wbufptr - g_textbuf, outfile)) {
 	goto recode_ret_WRITE_FAIL;
       }
     }
   } else if (recode_modifier & RECODE_23) {
     memcpy(outname_end, ".txt", 5);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     LOGPRINTFWW5("--recode 23 to %s ... ", outname);
@@ -13135,7 +13093,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_WRITE_FAIL;
     }
     writebuf[0] = '\t';
-    cptr = chrom_print_human(&(writebuf[1]), chrom_idx);
+    cptr = chrom_print_human(chrom_idx, &(writebuf[1]));
     *cptr++ = '\t';
     sample_uidx = next_unset_unsafe(sample_exclude, 0);
     ucc = IS_SET(sex_male, sample_uidx);
@@ -13148,7 +13106,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	chrom_fo_idx++;
 	refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
 	chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-        cptr = chrom_print_human(&(writebuf[1]), chrom_idx);
+        cptr = chrom_print_human(chrom_idx, &(writebuf[1]));
 	*cptr++ = '\t';
 	ucc2 = ((chrom_idx == 24) || (chrom_idx == 26) || (ucc && (chrom_idx == 23) && (!xmhh_exists_orig)))? 1 : 0;
       }
@@ -13167,7 +13125,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       if (IS_SET(marker_reverse, marker_uidx)) {
 	cur_word = cur_word ^ (((~(cur_word ^ (cur_word >> 1))) & 1) * 3);
       }
-      aptr = uint32_writex(cptr, marker_pos[marker_uidx], '\t');
+      aptr = uint32toa_x(marker_pos[marker_uidx], '\t', cptr);
       if (cur_word) {
 	if (cur_word == 3) {
 	  *aptr++ = mk_allele_ptrs[2 * marker_uidx + 1][0];
@@ -13196,7 +13154,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     // chromosomes, though, since chromosome 0 was actually processed
     autosomal_marker_ct = marker_ct - count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
     if (chrom_info_ptr->xy_code != -1) {
-      autosomal_marker_ct -= count_chrom_markers(chrom_info_ptr, chrom_info_ptr->xy_code, marker_exclude);
+      autosomal_marker_ct -= count_chrom_markers(chrom_info_ptr, marker_exclude, chrom_info_ptr->xy_code);
     }
     if (!autosomal_marker_ct) {
       // could allow this?
@@ -13205,9 +13163,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     }
     if (!beagle_nomap) {
       memcpy(outname_end, ".chr-", 6);
-      sprintf(logbuf, "--recode beagle to %s*.dat + %s*.map... ", outname, outname);
-      wordwrap(logbuf, 5);
-      fputs(logbuf, stdout);
+      sprintf(g_logbuf, "--recode beagle to %s*.dat + %s*.map... ", outname, outname);
+      wordwrapb(5);
+      fputs(g_logbuf, stdout);
     } else {
       memcpy(outname_end, ".beagle.dat", 12);
       LOGPRINTFWW5("--recode beagle to %s ... ", outname);
@@ -13220,7 +13178,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_1;
     }
     if (beagle_nomap) {
-      if (fopen_checked(&outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile)) {
 	goto recode_ret_OPEN_FAIL;
       }
       if (fwrite_checked(writebuf2, header_len, outfile)) {
@@ -13249,7 +13207,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	    goto recode_ret_1;
 	  }
 	}
-	if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	if (sample_ct && load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	  goto recode_ret_READ_FAIL;
 	}
 	cptr = &(marker_ids[marker_uidx * max_marker_id_len]);
@@ -13263,8 +13221,8 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	    goto recode_ret_WRITE_FAIL;
 	  }
 	  putc('\t', outfile2);
-	  wbufptr = uint32_writex(tbuf, marker_pos[marker_uidx], '\t');
-	  fwrite(tbuf, 1, wbufptr - tbuf, outfile2);
+	  wbufptr = uint32toa_x(marker_pos[marker_uidx], '\t', g_textbuf);
+	  fwrite(g_textbuf, 1, wbufptr - g_textbuf, outfile2);
 	}
 	aptr = mk_allele_ptrs[2 * marker_uidx];
 	aptr2 = mk_allele_ptrs[2 * marker_uidx + 1];
@@ -13276,9 +13234,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	wbufptr = memcpyax(&(cur_mk_allelesx[2][1]), aptr, alen, ' ');
 	memcpy(wbufptr, aptr2, alen2);
 	if (outfile2) {
-	  fputs(replace_if_zstr(aptr, "X"), outfile2);
+	  fputs((aptr != missing_geno_ptr)? aptr : "X", outfile2);
 	  putc('\t', outfile2);
-	  fputs(replace_if_zstr(aptr2, "X"), outfile2);
+	  fputs((aptr2 != missing_geno_ptr)? aptr2 : "X", outfile2);
 	}
 	cmalen[2] = alen + alen2 + 2;
 	wbufptr = memcpyax(&(cur_mk_allelesx[3][1]), aptr2, alen2, ' ');
@@ -13330,7 +13288,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	logerrprint("Error: No variants for --recode bimbam-1chr.\n");
 	goto recode_ret_ALL_MARKERS_EXCLUDED;
       }
-      ii = single_chrom_start(chrom_info_ptr, unfiltered_marker_ct, marker_exclude);
+      ii = single_chrom_start(chrom_info_ptr, marker_exclude, unfiltered_marker_ct);
       if (ii == -1) {
         logerrprint("Error: --recode bimbam-1chr requires a single-chromosome dataset.  Did you mean\n'--recode bimbam'?  (Note the lack of a dash in the middle.)\n");
         goto recode_ret_INVALID_CMDLINE;
@@ -13345,7 +13303,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     fputs("0%", stdout);
     fflush(stdout);
     memcpy(&(outname_end[8]), "pos.txt", 8);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     writebuf2[0] = ' ';
@@ -13353,14 +13311,14 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
       next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
       fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
-      wbufptr = uint32_write(&(writebuf2[1]), marker_pos[marker_uidx]);
+      wbufptr = uint32toa(marker_pos[marker_uidx], &(writebuf2[1]));
       if (ulii) {
 	if (marker_uidx >= chrom_end) {
           chrom_idx = get_marker_chrom(chrom_info_ptr, marker_uidx);
           chrom_end = chrom_info_ptr->chrom_end[chrom_idx];
 	}
         *wbufptr++ = ' ';
-        wbufptr = chrom_name_write(wbufptr, chrom_info_ptr, chrom_idx);
+        wbufptr = chrom_name_write(chrom_info_ptr, chrom_idx, wbufptr);
       }
       *wbufptr++ = '\n';
       if (fwrite_checked(writebuf2, wbufptr - writebuf2, outfile)) {
@@ -13371,7 +13329,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_WRITE_FAIL;
     }
     memcpy(&(outname_end[8]), "pheno.txt", 10);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     sample_uidx = 0;
@@ -13391,7 +13349,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
 	next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
 	if (IS_SET(pheno_nm, sample_uidx)) {
-          wbufptr = double_g_write(writebuf2, pheno_d[sample_uidx]);
+          wbufptr = dtoa_g(pheno_d[sample_uidx], writebuf2);
 	  fwrite(writebuf2, 1, (uintptr_t)(wbufptr - writebuf2), outfile);
 	} else {
           fputs(output_missing_pheno, outfile);
@@ -13405,11 +13363,11 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_WRITE_FAIL;
     }
     memcpy(&(outname_end[8]), "geno.txt", 9);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
-    wbufptr = uint32_writex(writebuf2, sample_ct, '\n');
-    wbufptr = uint32_write(wbufptr, marker_ct);
+    wbufptr = uint32toa_x(sample_ct, '\n', writebuf2);
+    wbufptr = uint32toa(marker_ct, wbufptr);
     wbufptr = memcpya(wbufptr, "\nIND", 4);
     if (fwrite_checked(writebuf2, wbufptr - writebuf2, outfile)) {
       goto recode_ret_WRITE_FAIL;
@@ -13453,7 +13411,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  goto recode_ret_WRITE_FAIL;
 	}
 	if (sample_ct) {
-	  if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	  if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	    goto recode_ret_READ_FAIL;
 	  }
 	  if (is_haploid && set_hh_missing) {
@@ -13514,9 +13472,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     } else {
       *outname_end = '\0';
     }
-    sprintf(logbuf, "--recode fastphase%s to %s.recode.phase.inp ... ", (recode_modifier & RECODE_FASTPHASE)? "" : "-1chr", outname);
-    wordwrap(logbuf, 15); // strlen("[chromosome 10]")
-    fputs(logbuf, stdout);
+    sprintf(g_logbuf, "--recode fastphase%s to %s.recode.phase.inp ... ", (recode_modifier & RECODE_FASTPHASE)? "" : "-1chr", outname);
+    wordwrapb(15); // strlen("[chromosome 10]")
+    fputs(g_logbuf, stdout);
     chrom_fo_idx = 0xffffffffU; // exploit overflow for initialization
     if (recode_modifier & RECODE_FASTPHASE) {
       fputs("[chromosome   ", stdout);
@@ -13528,9 +13486,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
       chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      ulii = count_chrom_markers(chrom_info_ptr, chrom_idx, marker_exclude);
+      ulii = count_chrom_markers(chrom_info_ptr, marker_exclude, chrom_idx);
       if (recode_modifier & RECODE_FASTPHASE) {
-        wbufptr = chrom_name_write(&(outname_end[5]), chrom_info_ptr, chrom_idx);
+        wbufptr = chrom_name_write(chrom_info_ptr, chrom_idx, &(outname_end[5]));
         if (chrom_idx <= chrom_info_ptr->max_code) {
           printf("\b\b%u] \b", chrom_idx);
 	} else {
@@ -13629,16 +13587,16 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   } else if (recode_modifier & (RECODE_LGEN | RECODE_LGEN_REF)) {
     if (lgen_ref) {
       strcpy(outname_end, ".ref");
-      if (fopen_checked(&outfile2, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile2)) {
 	goto recode_ret_OPEN_FAIL;
       }
     }
     strcpy(outname_end, ".lgen");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     if (delimiter == ' ') {
-      sample_delim_convert(unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, '\t', ' ');
+      sample_delim_convert(unfiltered_sample_ct, sample_exclude, sample_ct, max_sample_id_len, '\t', ' ', sample_ids);
     } else {
       if (!(recode_modifier & RECODE_DELIMX)) {
 	delim2 = ' ';
@@ -13668,7 +13626,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
 	}
 	if (sample_ct) {
-	  if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	  if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	    goto recode_ret_READ_FAIL;
 	  }
 	  if (is_haploid && set_hh_missing) {
@@ -13748,13 +13706,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	fflush(stdout);
       }
     }
-    sample_delim_convert(unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, ' ', '\t');
+    sample_delim_convert(unfiltered_sample_ct, sample_exclude, sample_ct, max_sample_id_len, ' ', '\t', sample_ids);
   } else if (recode_modifier & (RECODE_A | RECODE_AD)) {
     memcpy(outname_end, ".raw", 5);
-    if (wkspace_left < ((uint64_t)unfiltered_sample_ct4) * marker_ct) {
+    if (bigstack_left() < ((uint64_t)unfiltered_sample_ct4) * marker_ct) {
       goto recode_ret_NO_MULTIPASS_YET;
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     if (fputs_checked((delimiter == ' ')? "FID IID PAT MAT SEX PHENOTYPE" : "FID\tIID\tPAT\tMAT\tSEX\tPHENOTYPE", outfile)) {
@@ -13800,7 +13758,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     if (!recode_allele_reverse) {
       recode_allele_reverse = marker_reverse;
     } else {
-      bitfield_xor(recode_allele_reverse, marker_reverse, unfiltered_marker_ctl);
+      bitvec_xor(marker_reverse, unfiltered_marker_ctl, recode_allele_reverse);
     }
     if (recode_load_to(loadbuf, bedfile, bed_offset, unfiltered_marker_ct, 0, marker_ct, marker_exclude, recode_allele_reverse, &marker_uidx, unfiltered_sample_ct)) {
       goto recode_ret_READ_FAIL;
@@ -13878,26 +13836,25 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     }
   } else if (recode_modifier & (RECODE_LIST | RECODE_RLIST)) {
     strcpy(outname_end, rlist? ".rlist" : ".list");
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     if (delimiter != '\t') {
-      if (wkspace_alloc_ul_checked(&ulptr, (sample_ctv2 / 2) * sizeof(intptr_t))) {
+      if (bigstack_calloc_ul(sample_ctv2 / 2, &ulptr)) {
 	goto recode_ret_NOMEM;
       }
-      fill_ulong_zero(ulptr, sample_ctv2 / 2);
-      sample_delim_convert(sample_ct, ulptr, sample_ct, sample_ids_collapsed, max_sample_id_len, '\t', ' ');
+      sample_delim_convert(sample_ct, ulptr, sample_ct, max_sample_id_len, '\t', ' ', sample_ids_collapsed);
       if (omit_nonmale_y) {
-        sample_delim_convert(sample_ct_y, ulptr, sample_ct_y, sample_ids_collapsed_y, max_sample_id_len, '\t', ' ');
+        sample_delim_convert(sample_ct_y, ulptr, sample_ct_y, max_sample_id_len, '\t', ' ', sample_ids_collapsed_y);
       }
     }
     if (rlist) {
       *outname_end = '\0';
-      sprintf(logbuf, "--recode rlist to %s.rlist + %s.map + %s.fam ... ", outname, outname, outname);
+      sprintf(g_logbuf, "--recode rlist to %s.rlist + %s.map + %s.fam ... ", outname, outname, outname);
     } else {
-      sprintf(logbuf, "--recode list to %s ... ", outname);
+      sprintf(g_logbuf, "--recode list to %s ... ", outname);
     }
-    wordwrap(logbuf, 5);
+    wordwrapb(5);
     logprintb();
     fputs("0%", stdout);
     fflush(stdout);
@@ -13936,8 +13893,8 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  }
 	  chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
 	}
-	if (unfiltered_sample_ct) {
-	  if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, cur_sample_ct, cur_sample_exclude, cur_final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	if (cur_sample_ct) {
+	  if (load_and_collapse(unfiltered_sample_ct, cur_sample_ct, cur_sample_exclude, cur_final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, (uintptr_t*)loadbuf, loadbuf_collapsed)) {
 	    goto recode_ret_READ_FAIL;
 	  }
 	  if (is_haploid && set_hh_missing) {
@@ -13953,7 +13910,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	for (ulii = 0; ulii < 4; ulii++) {
 	  wbufptr = writebufl[ulii];
 	  if (!rlist) {
-	    wbufptr = chrom_name_write(wbufptr, chrom_info_ptr, chrom_idx);
+	    wbufptr = chrom_name_write(chrom_info_ptr, chrom_idx, wbufptr);
 	    *wbufptr++ = delimiter;
 	  }
 	  wbufptr = memcpyax(wbufptr, aptr, alen, delimiter);
@@ -14066,7 +14023,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       }
     }
   } else if (recode_modifier & (RECODE_HV | RECODE_HV_1CHR)) {
-    if (wkspace_left < ((uint64_t)unfiltered_sample_ct4) * max_chrom_size) {
+    if (bigstack_left() < ((uint64_t)unfiltered_sample_ct4) * max_chrom_size) {
       goto recode_ret_NO_MULTIPASS_YET;
     }
     if (!marker_ct) {
@@ -14075,9 +14032,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     }
     if (recode_modifier & RECODE_HV) {
       memcpy(outname_end, ".chr-", 5);
-      sprintf(logbuf, "--recode HV to %s*.ped + .info... ", outname);
-      wordwrap(logbuf, 15); // strlen("[chromosome 10]");
-      fputs(logbuf, stdout);
+      sprintf(g_logbuf, "--recode HV to %s*.ped + .info... ", outname);
+      wordwrapb(15); // strlen("[chromosome 10]");
+      fputs(g_logbuf, stdout);
       fputs("[chromosome   ", stdout);
     } else {
       *outname_end = '\0';
@@ -14090,9 +14047,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
       chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      ulii = count_chrom_markers(chrom_info_ptr, chrom_idx, marker_exclude);
+      ulii = count_chrom_markers(chrom_info_ptr, marker_exclude, chrom_idx);
       if (recode_modifier & RECODE_HV) {
-        wbufptr = chrom_name_write(&(outname_end[5]), chrom_info_ptr, chrom_idx);
+        wbufptr = chrom_name_write(chrom_info_ptr, chrom_idx, &(outname_end[5]));
         if (chrom_idx <= chrom_info_ptr->max_code) {
           printf("\b\b%u] \b", chrom_idx);
         } else {
@@ -14104,7 +14061,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
         wbufptr = outname_end;
       }
       memcpy(wbufptr, ".ped", 5);
-      if (fopen_checked(&outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile)) {
 	goto recode_ret_OPEN_FAIL;
       }
       marker_uidx_start = marker_uidx;
@@ -14123,7 +14080,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	goto recode_ret_WRITE_FAIL;
       }
       memcpy(wbufptr, ".info", 6);
-      if (fopen_checked(&outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile)) {
 	goto recode_ret_OPEN_FAIL;
       }
       if (write_haploview_map(outfile, marker_exclude, marker_uidx_start, ulii, marker_ids, max_marker_id_len, marker_pos)) {
@@ -14138,7 +14095,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	}
 	*wbufptr = '\0';
 	LOGPREPRINTFWW("%s.ped + %s.info created.\n", outname, outname);
-        logstr(logbuf);
+        logstr(g_logbuf);
       }
     } while (chrom_fo_idx < last_chrom_fo_idx);
     if (recode_modifier & RECODE_HV) {
@@ -14146,10 +14103,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     }
   } else if (recode_modifier & RECODE_STRUCTURE) {
     memcpy(outname_end, ".recode.strct_in", 17);
-    if (wkspace_left < ((uint64_t)unfiltered_sample_ct4) * marker_ct) {
+    if (bigstack_left() < ((uint64_t)unfiltered_sample_ct4) * marker_ct) {
       goto recode_ret_NO_MULTIPASS_YET;
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     LOGPRINTFWW5("--recode structure to %s ... ", outname);
@@ -14172,7 +14129,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	} while (marker_uidx >= chrom_end);
 	fputs("-1 ", outfile);
       } else {
-        wbufptr = uint32_writex(writebuf2, marker_pos[marker_uidx] - last_pos, ' ');
+        wbufptr = uint32toa_x(marker_pos[marker_uidx] - last_pos, ' ', writebuf2);
         fwrite(writebuf2, 1, wbufptr - writebuf2, outfile);
       }
       last_pos = marker_pos[marker_uidx];
@@ -14207,9 +14164,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  cur_fid = ++last_pos;
           fid_map[(uint32_t)ii] = last_pos;
 	}
-	tbuf[0] = ' ';
-        wbufptr = uint32_write(&(tbuf[1]), cur_fid);
-        fwrite(tbuf, 1, wbufptr - tbuf, outfile);
+	g_textbuf[0] = ' ';
+        wbufptr = uint32toa(cur_fid, &(g_textbuf[1]));
+        fwrite(g_textbuf, 1, wbufptr - g_textbuf, outfile);
 	marker_uidx = 0;
 	wbufptr = writebuf;
         for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
@@ -14233,10 +14190,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     }
   } else {
     memcpy(outname_end, ".ped", 5);
-    if (wkspace_left < ((uint64_t)unfiltered_sample_ct4) * marker_ct) {
+    if (bigstack_left() < ((uint64_t)unfiltered_sample_ct4) * marker_ct) {
       goto recode_ret_NO_MULTIPASS_YET;
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto recode_ret_OPEN_FAIL;
     }
     *outname_end = '\0';
@@ -14308,8 +14265,8 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     } else if (recode_modifier & RECODE_HV) {
       logstr("--recode HV complete.\n");
     } else {
-      sprintf(logbuf, "--recode fastphase%s complete.\n", (recode_modifier & RECODE_FASTPHASE_1CHR)? "-1chr" : "");
-      logstr(logbuf);
+      sprintf(g_logbuf, "--recode fastphase%s complete.\n", (recode_modifier & RECODE_FASTPHASE_1CHR)? "-1chr" : "");
+      logstr(g_logbuf);
     }
   }
   while (0) {
@@ -14344,7 +14301,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
     break;
   }
  recode_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(outfile2);
   fclose_cond(outfile);
   if (bgz_outfile) {
@@ -14355,8 +14312,8 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 }
 
 int32_t sample_sort_file_map(char* sample_sort_fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t** sample_sort_map_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   FILE* infile = NULL;
   // temporary: sample_id_map[ascii-sorted idx] = uidx in input fileset
   uint32_t* sample_id_map = NULL;
@@ -14374,7 +14331,7 @@ int32_t sample_sort_file_map(char* sample_sort_fname, uintptr_t unfiltered_sampl
   int32_t ii;
   if (sample_exclude) {
     // called from plink()
-    if (wkspace_alloc_ui_checked(&sample_sort_map, sample_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_ct, &sample_sort_map)) {
       goto sample_sort_file_map_ret_NOMEM;
     }
   } else {
@@ -14382,33 +14339,32 @@ int32_t sample_sort_file_map(char* sample_sort_fname, uintptr_t unfiltered_sampl
     sample_sort_map = *sample_sort_map_ptr;
     sorted_sample_ids = sample_ids;
   }
-  if (wkspace_alloc_c_checked(&idbuf, max_sample_id_len) ||
-      wkspace_alloc_ul_checked(&already_seen, sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_c(max_sample_id_len, &idbuf) ||
+      bigstack_calloc_ul(sample_ctl, &already_seen)) {
     goto sample_sort_file_map_ret_NOMEM;
   }
   if (sample_exclude) {
-    retval = sort_item_ids(&sorted_sample_ids, &sample_id_map, unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+    retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, unfiltered_sample_ct - sample_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, &sorted_sample_ids, &sample_id_map);
     if (retval) {
       goto sample_sort_file_map_ret_1;
     }
   }
-  fill_ulong_zero(already_seen, sample_ctl);
-  if (fopen_checked(&infile, sample_sort_fname, "r")) {
+  if (fopen_checked(sample_sort_fname, "r", &infile)) {
     goto sample_sort_file_map_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --indiv-sort file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --indiv-sort file is pathologically long.\n", line_idx);
       goto sample_sort_file_map_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(idbuf, sorted_sample_ids, max_sample_id_len, sample_ct, bufptr, NULL, &ii)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --indiv-sort file has fewer tokens than expected.\n", line_idx);
+    if (bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sample_ct, NULL, &ii, idbuf)) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --indiv-sort file has fewer tokens than expected.\n", line_idx);
       goto sample_sort_file_map_ret_INVALID_FORMAT_2;
     }
     if (ii != -1) {
@@ -14417,7 +14373,7 @@ int32_t sample_sort_file_map(char* sample_sort_fname, uintptr_t unfiltered_sampl
         LOGPREPRINTFWW("Error: Duplicate ID '%s' in --indiv-sort file.\n", idbuf);
         goto sample_sort_file_map_ret_INVALID_FORMAT_2;
       }
-      set_bit(already_seen, ii);
+      set_bit(ii, already_seen);
       if (sample_id_map) {
         sample_sort_map[cur_seq] = sample_id_map[(uint32_t)ii];
       } else {
@@ -14434,7 +14390,7 @@ int32_t sample_sort_file_map(char* sample_sort_fname, uintptr_t unfiltered_sampl
     goto sample_sort_file_map_ret_INVALID_CMDLINE;
   }
   *sample_sort_map_ptr = sample_sort_map;
-  wkspace_mark = (unsigned char*)idbuf;
+  bigstack_mark = (unsigned char*)idbuf;
   while (0) {
   sample_sort_file_map_ret_NOMEM:
     retval = RET_NOMEM;
@@ -14454,7 +14410,7 @@ int32_t sample_sort_file_map(char* sample_sort_fname, uintptr_t unfiltered_sampl
     break;
   }
  sample_sort_file_map_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   return retval;
 }
@@ -14465,7 +14421,7 @@ typedef struct ll_entry_struct {
   double pheno;
   uint32_t orig_order;
   char idstr[];
-} Ll_entry;
+} Ll_fam;
 
 // .bim
 typedef struct ll_entry2_struct {
@@ -14474,7 +14430,7 @@ typedef struct ll_entry2_struct {
   double cm;
   char* allele[2];
   char idstr[];
-} Ll_entry2;
+} Ll_bim;
 
 static inline int32_t idmatch(char* idtab, char* id0, uint32_t id0_len_p1, char* id1, uint32_t id1_len_p1) {
   return (!(memcmp(idtab, id0, id0_len_p1) || memcmp(&(idtab[id0_len_p1]), id1, id1_len_p1)));
@@ -14498,18 +14454,19 @@ static inline uint32_t hashval(char* id1, uint32_t id1_len, char* id2, uint32_t
   return vv;
 }
 
-static inline Ll_entry* top_alloc_ll(uintptr_t* topsize_ptr, uint32_t size) {
-  return (Ll_entry*)top_alloc(topsize_ptr, size + sizeof(Ll_entry));
+static inline int32_t bigstack_end_alloc_llfam(uintptr_t idstr_bytes, Ll_fam** llfamp_ptr) {
+  *llfamp_ptr = (Ll_fam*)bigstack_end_alloc(idstr_bytes + sizeof(Ll_fam));
+  return !(*llfamp_ptr);
 }
 
-static inline Ll_entry2* top_alloc_ll2(uintptr_t* topsize_ptr, uint32_t size) {
-  return (Ll_entry2*)top_alloc(topsize_ptr, size + sizeof(Ll_entry2));
+static inline int32_t bigstack_end_alloc_llbim(uintptr_t idstr_bytes, Ll_bim** llbimp_ptr) {
+  *llbimp_ptr = (Ll_bim*)bigstack_end_alloc(idstr_bytes + sizeof(Ll_bim));
+  return !(*llbimp_ptr);
 }
 
-int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_samples, uintptr_t* max_sample_id_len_ptr, uint32_t* max_sample_full_len_ptr, uint32_t* is_dichot_pheno_ptr, Ll_entry** htable, uintptr_t* topsize_ptr, uint64_t* tot_sample_ct_ptr, uint32_t* ped_buflen_ptr, uint32_t* cur_sample_ct_ptr, uint32_t* orig_idx_ptr) {
+int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_samples, uintptr_t* max_sample_id_len_ptr, uint32_t* max_sample_full_len_ptr, uint32_t* is_dichot_pheno_ptr, Ll_fam** htable_fam, uint64_t* tot_sample_ct_ptr, uint32_t* ped_buflen_ptr, uint32_t* cur_sample_ct_ptr, uint32_t* orig_idx_ptr) {
   uint64_t tot_sample_ct = *tot_sample_ct_ptr;
   uintptr_t max_sample_id_len = *max_sample_id_len_ptr;
-  uintptr_t topsize = *topsize_ptr;
   uintptr_t line_idx = 0;
   FILE* infile = NULL;
   uint32_t max_sample_full_len = *max_sample_full_len_ptr;
@@ -14525,8 +14482,8 @@ int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_sample
   uint32_t tot_len;
   uintptr_t ulii;
   uint32_t uii;
-  Ll_entry** ll_pptr;
-  Ll_entry* ll_ptr;
+  Ll_fam** llfam_pptr;
+  Ll_fam* llfam_ptr;
   char* col2_start_ptr;
   char* col3_start_ptr;
   char* col4_start_ptr;
@@ -14550,16 +14507,16 @@ int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_sample
       // with a specialized error message for that case.
       LOGERRPRINTFWW("Error: Failed to open %s. (--bfile expects a filename *prefix*; '.bed', '.bim', and '.fam' are automatically appended.)\n", famname);
     } else {
-      LOGERRPRINTFWW(errstr_fopen, famname);
+      LOGERRPRINTFWW(g_errstr_fopen, famname);
     }
     goto merge_fam_id_scan_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    col1_start_ptr = skip_initial_spaces(tbuf);
+    col1_start_ptr = skip_initial_spaces(g_textbuf);
     cc = *col1_start_ptr;
-    if (!is_eoln_or_comment(cc)) {
+    if (!is_eoln_or_comment_kns(cc)) {
       col1_end_ptr = token_endnn(col1_start_ptr);
       col1_len = col1_end_ptr - col1_start_ptr;
       col2_start_ptr = skip_initial_spaces(col1_end_ptr);
@@ -14596,8 +14553,8 @@ int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_sample
       }
       tot_len = uii + col3_len + col4_len + 4;
       uii = hashval(col1_start_ptr, col1_len, col2_start_ptr, col2_len);
-      ll_pptr = &(htable[uii]);
-      ll_ptr = *ll_pptr;
+      llfam_pptr = &(htable_fam[uii]);
+      llfam_ptr = *llfam_pptr;
       uii = 1;
       if (is_dichot_pheno) {
 	is_dichot_pheno = eval_affection(col6_start_ptr, -9);
@@ -14605,63 +14562,65 @@ int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_sample
       if (scan_double(col6_start_ptr, &pheno)) {
 	pheno = -9;
       }
-      while (ll_ptr) {
-	if (idmatch(ll_ptr->idstr, col1_start_ptr, col1_len + 1, col2_start_ptr, col2_len + 1)) {
+      while (llfam_ptr) {
+	if (idmatch(llfam_ptr->idstr, col1_start_ptr, col1_len + 1, col2_start_ptr, col2_len + 1)) {
 	  uii = 0;
 	  /*
 	  // possibly for future: add parental ID/sex merge (not in PLINK 1.07)
 	  if (merge_mode == 1) {
-	    if (fabs(pheno - ll_ptr->pheno) > PHENO_EPSILON) {
-	      ll_ptr->pheno = -9;
+	    if (fabs(pheno - llfam_ptr->pheno) > PHENO_EPSILON) {
+	      llfam_ptr->pheno = -9;
 	    }
 	  } else if (merge_mode == 2) {
-	    if (ll_ptr->pheno == -9) {
-	      ll_ptr->pheno = pheno;
+	    if (llfam_ptr->pheno == -9) {
+	      llfam_ptr->pheno = pheno;
 	    }
 	  } else if ((merge_mode == 5) || ((merge_mode == 3) && (pheno != -9))) {
-	    ll_ptr->pheno = pheno;
+	    llfam_ptr->pheno = pheno;
 	  }
 	  */
 	  break;
 	}
-        ll_pptr = &(ll_ptr->next);
-	ll_ptr = *ll_pptr;
+        llfam_pptr = &(llfam_ptr->next);
+	llfam_ptr = *llfam_pptr;
       }
       if (uii) {
 	if (tot_len > max_sample_full_len) {
 	  max_sample_full_len = tot_len;
 	}
-	ll_ptr = top_alloc_ll(&topsize, tot_len);
-	ll_ptr->next = NULL;
-	ll_ptr->pheno = pheno;
-	ll_ptr->orig_order = orig_idx++;
-	wptr = memcpyax(memcpyax(memcpyax(memcpyax(ll_ptr->idstr, col1_start_ptr, col1_len, '\t'), col2_start_ptr, col2_len, '\t'), col3_start_ptr, col3_len, '\t'), col4_start_ptr, col4_len, '\t');
+	if (bigstack_end_alloc_llfam(tot_len, &llfam_ptr)) {
+	  goto merge_fam_id_scan_ret_NOMEM;
+	}
+	llfam_ptr->next = NULL;
+	llfam_ptr->pheno = pheno;
+	llfam_ptr->orig_order = orig_idx++;
+	wptr = memcpyax(memcpyax(memcpyax(memcpyax(llfam_ptr->idstr, col1_start_ptr, col1_len, '\t'), col2_start_ptr, col2_len, '\t'), col3_start_ptr, col3_len, '\t'), col4_start_ptr, col4_len, '\t');
 	*wptr = *col5_start_ptr;
 	wptr[1] = '\0';
-	*ll_pptr = ll_ptr;
+	*llfam_pptr = llfam_ptr;
 	tot_sample_ct++;
       }
       cur_sample_ct++;
     }
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       if (!text_file) {
 	goto merge_fam_id_scan_ret_LONG_LINE;
       }
       ulii = 0;
       do {
-	tbuf[MAXLINELEN - 1] = ' ';
-	if (tbuf[MAXLINELEN - 2] == '\n') {
+	g_textbuf[MAXLINELEN - 1] = ' ';
+	if (g_textbuf[MAXLINELEN - 2] == '\n') {
 	  break;
 	}
 	ulii += MAXLINELEN - 1;
 	if (ulii >= MAXLINEBUFLEN) {
 	  goto merge_fam_id_scan_ret_LONG_LINE;
 	}
-        if (!fgets(tbuf, MAXLINELEN, infile)) {
+        if (!fgets(g_textbuf, MAXLINELEN, infile)) {
 	  goto merge_fam_id_scan_ret_READ_FAIL;
 	}
-      } while (!tbuf[MAXLINELEN - 1]);
-      ulii += strlen(tbuf) + 1;
+      } while (!g_textbuf[MAXLINELEN - 1]);
+      ulii += strlen(g_textbuf) + 1;
       if (ulii > (*ped_buflen_ptr)) {
 	*ped_buflen_ptr = ulii;
       }
@@ -14677,11 +14636,13 @@ int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_sample
   *max_sample_id_len_ptr = max_sample_id_len;
   *max_sample_full_len_ptr = max_sample_full_len;
   *is_dichot_pheno_ptr = is_dichot_pheno;
-  *topsize_ptr = topsize;
   *tot_sample_ct_ptr = tot_sample_ct;
   *cur_sample_ct_ptr = cur_sample_ct;
   *orig_idx_ptr = orig_idx;
   while (0) {
+  merge_fam_id_scan_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
   merge_fam_id_scan_ret_OPEN_FAIL:
     retval = RET_OPEN_FAIL;
     break;
@@ -14701,26 +14662,25 @@ int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_sample
 
 int32_t merge_sample_sortf(char* sample_sort_fname, char* sample_fids, uintptr_t tot_sample_ct, uintptr_t max_sample_full_len, char* sample_ids, uintptr_t max_sample_id_len, uint32_t* map_reverse) {
   // sample_fids[] is already sorted
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   int32_t retval = 0;
   uintptr_t sample_uidx;
   for (sample_uidx = 0; sample_uidx < tot_sample_ct; sample_uidx++) {
     strcpy(&(sample_ids[sample_uidx * max_sample_id_len]), &(sample_fids[sample_uidx * max_sample_full_len]));
   }
   retval = sample_sort_file_map(sample_sort_fname, tot_sample_ct, NULL, tot_sample_ct, sample_ids, max_sample_id_len, &map_reverse);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
-int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_variants, uintptr_t* max_marker_id_len_ptr, Ll_entry2** htable2, uintptr_t* topsize_ptr, uint32_t* max_bim_linelen_ptr, uint64_t* tot_marker_ct_ptr, uint32_t* cur_marker_ct_ptr, uint64_t* position_warning_ct_ptr, Ll_str** non_biallelics_ptr, uint32_t allow_extra_chroms, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_variants, uintptr_t* max_marker_id_len_ptr, Ll_bim** htable_bim, uint32_t* max_bim_linelen_ptr, uint64_t* tot_marker_ct_ptr, uint32_t* cur_marker_ct_ptr, uint64_t* position_warning_ct_ptr, Ll_str** non_biallelics_ptr, uint32_t allow_extra_chroms, Chrom_info* chrom_info_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t max_marker_id_len = *max_marker_id_len_ptr;
-  uintptr_t topsize = *topsize_ptr;
+  uintptr_t loadbuf_size = MAXLINELEN;
   uint32_t max_bim_linelen = *max_bim_linelen_ptr;
   uint64_t tot_marker_ct = *tot_marker_ct_ptr;
   uint64_t position_warning_ct = *position_warning_ct_ptr;
   uint32_t cur_marker_ct = 0;
-  uint32_t loadbuf_size = MAXLINELEN;
   double cm = 0.0;
   FILE* infile = NULL;
   int32_t retval = 0;
@@ -14734,9 +14694,9 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
   char* aptr1;
   char* aptr2;
   char* new_aptr;
-  Ll_entry2** ll_pptr;
-  Ll_entry2* ll_ptr;
-  Ll_str* ll_string_new;
+  Ll_bim** llbim_pptr;
+  Ll_bim* llbim_ptr;
+  Ll_str* llstr_new_ptr;
   int64_t llxx;
   uintptr_t line_idx;
   uint32_t cm_col_exists;
@@ -14747,19 +14707,18 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
   uint32_t ukk;
   int32_t ii;
   int32_t jj;
-  if (fopen_checked(&infile, bimname, "r")) {
+  if (fopen_checked(bimname, "r", &infile)) {
     goto merge_bim_scan_ret_OPEN_FAIL;
   }
   if (is_binary) {
-    if (wkspace_left - topsize > 0x7fffff7f) {
+    loadbuf_size = (bigstack_left() / 2) & (~(CACHELINE - ONELU));
+    if (bigstack_left() > 0x3fffffc0) {
       loadbuf_size = 0x3fffffc0;
-    } else if (wkspace_left - topsize >= MAXLINELEN * 2) {
-      loadbuf_size = ((wkspace_left - topsize) / 2) & (~(CACHELINE - ONELU));
-    } else {
+    } else if (loadbuf_size <= MAXLINELEN) {
       goto merge_bim_scan_ret_NOMEM;
     }
   }
-  loadbuf = (char*)wkspace_alloc(loadbuf_size);
+  bigstack_alloc_c(loadbuf_size, &loadbuf);
   loadbuf[loadbuf_size - 1] = ' ';
   if (check_cm_col(infile, loadbuf, is_binary, allow_no_variants, loadbuf_size, &cm_col_exists, &line_idx)) {
     goto merge_bim_scan_ret_MISSING_TOKENS;
@@ -14785,7 +14744,7 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
       max_bim_linelen = uii + 1;
     }
     bufptr = skip_initial_spaces(loadbuf);
-    if (is_eoln_or_comment(*bufptr)) {
+    if (is_eoln_or_comment_kns(*bufptr)) {
       continue;
     }
     ii = get_chrom_code(chrom_info_ptr, bufptr);
@@ -14793,17 +14752,20 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
       if (chrom_error(bimname, chrom_info_ptr, bufptr, line_idx, ii, allow_extra_chroms)) {
 	goto merge_bim_scan_ret_INVALID_FORMAT;
       }
-      retval = resolve_or_add_chrom_name(chrom_info_ptr, bufptr, &ii, line_idx, bimname);
+      retval = resolve_or_add_chrom_name(bufptr, bimname, line_idx, chrom_info_ptr, &ii);
       if (retval) {
 	goto merge_bim_scan_ret_1;
       }
     }
     // do not filter on chrom_mask here, since that happens later
-    bufptr = next_token(bufptr);
-    bufptr2 = token_endl(bufptr);
+    bufptr = skip_initial_spaces(token_endnn(bufptr));
+    if (is_eoln_kns(*bufptr)) {
+      goto merge_bim_scan_ret_MISSING_TOKENS;
+    }
+    bufptr2 = token_endnn(bufptr);
     uii = bufptr2 - bufptr;
     bufptr2 = skip_initial_spaces(bufptr2);
-    if (no_more_tokens_kns(bufptr2)) {
+    if (is_eoln_kns(*bufptr2)) {
       goto merge_bim_scan_ret_MISSING_TOKENS;
     }
     if (cm_col_exists) {
@@ -14846,20 +14808,20 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
       }
       llxx = (((uint64_t)((uint32_t)ii)) << 32) + ((uint32_t)jj);
       ujj = hashval2(bufptr, uii);
-      ll_pptr = &(htable2[ujj]);
-      ll_ptr = *ll_pptr;
+      llbim_pptr = &(htable_bim[ujj]);
+      llbim_ptr = *llbim_pptr;
       name_match = 0;
       bufptr[uii++] = '\0';
-      while (ll_ptr) {
-	if (!strcmp(ll_ptr->idstr, bufptr)) {
+      while (llbim_ptr) {
+	if (!strcmp(llbim_ptr->idstr, bufptr)) {
 	  if (is_binary) {
-	    bufptr2 = ll_ptr->allele[0];
+	    bufptr2 = llbim_ptr->allele[0];
 	    allele_ct = 0;
 	    if (bufptr2) {
 	      cur_alleles[0] = bufptr2;
 	      allele_ct = 1;
 	    }
-	    bufptr3 = ll_ptr->allele[1];
+	    bufptr3 = llbim_ptr->allele[1];
 	    if (bufptr3) {
 	      cur_alleles[allele_ct++] = bufptr3;
 	    }
@@ -14871,21 +14833,20 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
 	      }
 	      if (ukk == allele_ct) {
 		if (allele_ct == 2) {
-		  ll_string_new = top_alloc_llstr(&topsize, uii);
-		  if (!ll_string_new) {
+		  if (bigstack_end_alloc_llstr(uii, &llstr_new_ptr)) {
 		    goto merge_bim_scan_ret_NOMEM;
 		  }
-		  ll_string_new->next = *non_biallelics_ptr;
-		  memcpy(ll_string_new->ss, bufptr, uii);
-		  *non_biallelics_ptr = ll_string_new;
+		  llstr_new_ptr->next = *non_biallelics_ptr;
+		  memcpy(llstr_new_ptr->ss, bufptr, uii);
+		  *non_biallelics_ptr = llstr_new_ptr;
 		} else {
-		  if (allele_set(&new_aptr, aptr2, alen2)) {
+		  if (allele_set(aptr2, alen2, &new_aptr)) {
 		    goto merge_bim_scan_ret_NOMEM;
 		  }
-		  if (!ll_ptr->allele[1]) {
-		    ll_ptr->allele[1] = new_aptr;
+		  if (!llbim_ptr->allele[1]) {
+		    llbim_ptr->allele[1] = new_aptr;
 		  } else {
-		    ll_ptr->allele[0] = new_aptr;
+		    llbim_ptr->allele[0] = new_aptr;
 		  }
 		  cur_alleles[allele_ct++] = new_aptr;
 		}
@@ -14899,34 +14860,33 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
 	      }
 	      if (ukk == allele_ct) {
 		if (allele_ct == 2) {
-		  ll_string_new = top_alloc_llstr(&topsize, uii);
-		  if (!ll_string_new) {
+		  if (bigstack_end_alloc_llstr(uii, &llstr_new_ptr)) {
 		    goto merge_bim_scan_ret_NOMEM;
 		  }
-		  ll_string_new->next = *non_biallelics_ptr;
-		  memcpy(ll_string_new->ss, bufptr, uii);
-		  *non_biallelics_ptr = ll_string_new;
+		  llstr_new_ptr->next = *non_biallelics_ptr;
+		  memcpy(llstr_new_ptr->ss, bufptr, uii);
+		  *non_biallelics_ptr = llstr_new_ptr;
 		} else {
-		  if (allele_set(&new_aptr, aptr1, alen1)) {
+		  if (allele_set(aptr1, alen1, &new_aptr)) {
 		    goto merge_bim_scan_ret_NOMEM;
 		  }
-		  if (!ll_ptr->allele[1]) {
-		    ll_ptr->allele[1] = new_aptr;
+		  if (!llbim_ptr->allele[1]) {
+		    llbim_ptr->allele[1] = new_aptr;
 		  } else {
-		    ll_ptr->allele[0] = new_aptr;
+		    llbim_ptr->allele[0] = new_aptr;
 		  }
 		  cur_alleles[allele_ct++] = new_aptr;
 		}
 	      }
 	    }
 	  }
-	  if (ll_ptr->pos != llxx) {
-	    if ((((uint64_t)ll_ptr->pos) >> 32) == (((uint64_t)llxx) >> 32)) {
+	  if (llbim_ptr->pos != llxx) {
+	    if ((((uint64_t)llbim_ptr->pos) >> 32) == (((uint64_t)llxx) >> 32)) {
 	      LOGPREPRINTFWW("Warning: Multiple positions seen for variant '%s'.\n", bufptr);
 	      if (position_warning_ct < 3) {
 		logerrprintb();
 	      } else {
-		logstr(logbuf);
+		logstr(g_logbuf);
 	      }
 	      position_warning_ct++;
 	    } else {
@@ -14936,36 +14896,35 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
 	  name_match = 1;
 	  break;
 	}
-        ll_pptr = &(ll_ptr->next);
-	ll_ptr = *ll_pptr;
+        llbim_pptr = &(llbim_ptr->next);
+	llbim_ptr = *llbim_pptr;
       }
       if (!name_match) {
         if (uii > max_marker_id_len) {
 	  max_marker_id_len = uii;
 	}
-	ll_ptr = top_alloc_ll2(&topsize, uii);
-	if (!ll_ptr) {
+	if (bigstack_end_alloc_llbim(uii, &llbim_ptr)) {
 	  goto merge_bim_scan_ret_NOMEM;
 	}
-	ll_ptr->next = NULL;
-	ll_ptr->pos = llxx;
-	ll_ptr->cm = cm;
+	llbim_ptr->next = NULL;
+	llbim_ptr->pos = llxx;
+	llbim_ptr->cm = cm;
 	if (aptr1) {
-	  if (allele_set(&(ll_ptr->allele[0]), aptr1, alen1)) {
+	  if (allele_set(aptr1, alen1, &(llbim_ptr->allele[0]))) {
 	    goto merge_bim_scan_ret_NOMEM;
 	  }
 	} else {
-	  ll_ptr->allele[0] = NULL;
+	  llbim_ptr->allele[0] = NULL;
 	}
 	if (aptr2) {
-	  if (allele_set(&(ll_ptr->allele[1]), aptr2, alen2)) {
+	  if (allele_set(aptr2, alen2, &(llbim_ptr->allele[1]))) {
 	    goto merge_bim_scan_ret_NOMEM;
 	  }
 	} else {
-	  ll_ptr->allele[1] = NULL;
+	  llbim_ptr->allele[1] = NULL;
 	}
-	memcpy(ll_ptr->idstr, bufptr, uii);
-	*ll_pptr = ll_ptr;
+	memcpy(llbim_ptr->idstr, bufptr, uii);
+	*llbim_pptr = llbim_ptr;
 	tot_marker_ct++;
       }
       cur_marker_ct++;
@@ -14979,7 +14938,6 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
     goto merge_bim_scan_ret_INVALID_FORMAT_2;
   }
   *max_marker_id_len_ptr = max_marker_id_len;
-  *topsize_ptr = topsize;
   *max_bim_linelen_ptr = max_bim_linelen;
   *tot_marker_ct_ptr = tot_marker_ct;
   *cur_marker_ct_ptr = cur_marker_ct;
@@ -15004,7 +14962,7 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_vari
   }
  merge_bim_scan_ret_1:
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -15028,7 +14986,7 @@ int32_t report_non_biallelics(char* outname, char* outname_end, Ll_str* non_bial
     nbmarker_ct_dup++;
     cur_ptr = cur_ptr->next;
   } while (cur_ptr);
-  if (wkspace_alloc_c_checked(&id_arr, nbmarker_ct_dup * max_nbmarker_id_len)) {
+  if (bigstack_alloc_c(nbmarker_ct_dup * max_nbmarker_id_len, &id_arr)) {
     goto report_non_biallelics_ret_NOMEM;
   }
   cur_ptr = non_biallelics;
@@ -15040,7 +14998,7 @@ int32_t report_non_biallelics(char* outname, char* outname_end, Ll_str* non_bial
   } while (cur_ptr);
   qsort(id_arr, nbmarker_ct_dup, max_nbmarker_id_len, strcmp_casted);
   memcpy(outname_end, ".missnp", 8);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto report_non_biallelics_ret_OPEN_FAIL;
   }
   id_arr_ptr = id_arr;
@@ -15176,7 +15134,7 @@ static inline uint32_t merge_post_msort_update_maps(char* marker_ids, uintptr_t
 	if (position_warning_ct < 3) {
 	  logerrprintb();
 	} else {
-	  logstr(logbuf);
+	  logstr(g_logbuf);
 	}
 	position_warning_ct++;
 	if (merge_equal_pos) {
@@ -15266,7 +15224,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
   // bugfix: there was a potential integer overflow back when these were
   // uint32_t
   uintptr_t tot_sample_ct4 = (tot_sample_ct + 3) / 4;
-  uintptr_t tot_sample_ctl = (tot_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t tot_sample_ctl = BITCT_TO_WORDCT(tot_sample_ct);
   uint32_t end_marker_idx = start_marker_idx + marker_window_size;
   uint32_t marker_in_idx = 0xffffffffU; // overflow to zero on first add
   uint32_t last_marker_in_idx = 0xfffffffeU;
@@ -15313,11 +15271,11 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
   unsigned char ucc4;
   char cc;
   if (is_binary) {
-    if (fopen_checked(&infile2, famname, "r")) {
+    if (fopen_checked(famname, "r", &infile2)) {
       goto merge_main_ret_OPEN_FAIL;
     }
-    while (fgets(tbuf, MAXLINELEN, infile2)) {
-      bufptr = skip_initial_spaces(tbuf);
+    while (fgets(g_textbuf, MAXLINELEN, infile2)) {
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
@@ -15347,12 +15305,12 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
     }
     fclose_null(&infile2);
     cur_sample_ct4 = (cur_sample_ct + 3) / 4;
-    cur_sample_ctl2 = (cur_sample_ct + (BITCT2 - 1)) / BITCT2;
+    cur_sample_ctl2 = QUATERCT_TO_WORDCT(cur_sample_ct);
   } else {
-    bim_loadbuf = tbuf;
+    bim_loadbuf = g_textbuf;
     max_bim_linelen = MAXLINELEN;
   }
-  if (fopen_checked(&infile2, bimname, "r")) {
+  if (fopen_checked(bimname, "r", &infile2)) {
     goto merge_main_ret_OPEN_FAIL;
   }
   if (check_cm_col(infile2, bim_loadbuf, is_binary, 1, max_bim_linelen, &cm_col_exists, &ulii)) {
@@ -15361,7 +15319,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
   if (!ulii) {
     bim_loadbuf[0] = '\0';
   }
-  if (fopen_checked(&bedfile, bedname, is_binary? "rb" : "r")) {
+  if (fopen_checked(bedname, is_binary? FOPEN_RB : "r", &bedfile)) {
     goto merge_main_ret_OPEN_FAIL;
   }
   if (is_binary) {
@@ -15383,7 +15341,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
   }
   do {
     bufptr = skip_initial_spaces(bim_loadbuf);
-    if (is_eoln_or_comment(*bufptr)) {
+    if (is_eoln_or_comment_kns(*bufptr)) {
       continue;
     }
     ++marker_in_idx;
@@ -15432,7 +15390,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
       if (!cur_sample_ct) {
 	continue;
       }
-      if (load_raw(bedfile, readbuf_w, cur_sample_ct4)) {
+      if (load_raw(cur_sample_ct4, bedfile, readbuf_w)) {
 	goto merge_main_ret_READ_FAIL;
       }
       if ((((*bufptr2 != '0') || (alen1 != 1)) && (!strcmp(bufptr2, bufptr5)))  || (((*bufptr3 != '0') || (alen2 != 1)) && (!strcmp(bufptr3, bufptr4)))) {
@@ -15441,7 +15399,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
 	// (since cur_sample_ct % 4 is not necessarily the same as
 	// tot_sample_ct % 4).  And while I'm at it, may as well switch
 	// the main loops to be word-based.
-	reverse_loadbuf((unsigned char*)readbuf_w, cur_sample_ct);
+	reverse_loadbuf(cur_sample_ct, (unsigned char*)readbuf_w);
       }
       rbufptr = readbuf_w;
       wbufptr = &(writebuf[(marker_out_idx - start_marker_idx) * tot_sample_ct4]);
@@ -15641,7 +15599,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
       line_idx++;
       bufptr = skip_initial_spaces((char*)readbuf);
       cc = *bufptr;
-      if (is_eoln_or_comment(cc)) {
+      if (is_eoln_or_comment_kns(cc)) {
 	continue;
       }
       // only possible to get here if sample_ct and marker_ct are positive
@@ -15750,7 +15708,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
 	    } else {
 	      goto merge_main_ret_NOT_BIALLELIC;
 	    }
-	    if (allele_set(&(marker_allele_ptrs[ukk]), aptr1, alen1)) {
+	    if (allele_set(aptr1, alen1, &(marker_allele_ptrs[ukk]))) {
 	      goto merge_main_ret_NOMEM;
 	    }
 	  }
@@ -15767,7 +15725,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
 	    } else {
 	      goto merge_main_ret_NOT_BIALLELIC;
 	    }
-	    if (allele_set(&(marker_allele_ptrs[ukk]), aptr2, alen2)) {
+	    if (allele_set(aptr2, alen2, &(marker_allele_ptrs[ukk]))) {
 	      goto merge_main_ret_NOMEM;
 	    }
 	  }
@@ -15902,7 +15860,8 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
 int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outname, char* outname_end, char* mergename1, char* mergename2, char* mergename3, char* sample_sort_fname, uint64_t calculation_type, uint32_t merge_type, uint32_t sample_sort, uint64_t misc_flags, Chrom_info* chrom_info_ptr) {
   FILE* mergelistfile = NULL;
   FILE* outfile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uintptr_t max_sample_id_len = 0;
   uintptr_t max_marker_id_len = 0;
   uint32_t max_sample_full_len = 0;
@@ -15912,11 +15871,9 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   uint32_t merge_list = merge_type & MERGE_LIST;
   uint32_t merge_mode = merge_type & MERGE_MODE_MASK;
   uint32_t merge_nsort = ((!sample_sort) || (sample_sort == SAMPLE_SORT_NATURAL))? 1 : 0;
-  uint32_t merge_equal_pos = (merge_type & MERGE_EQUAL_POS)? 1 : 0;
+  uint32_t merge_equal_pos = (merge_type / MERGE_EQUAL_POS) & 1;
   uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
   uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
-  Ll_entry** htable = (Ll_entry**)(&(wkspace_base[wkspace_left - HASHMEM_S]));
-  Ll_entry2** htable2 = (Ll_entry2**)(&(wkspace_base[wkspace_left - HASHMEM]));
   Ll_str* non_biallelics = NULL;
   uint32_t ped_buflen = MAXLINELEN;
   uint32_t max_bim_linelen = 0;
@@ -15940,10 +15897,11 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   char* bim_loadbuf = NULL;
   // N.B. marker_allele_ptrs are ordered by marker_id instead of position
   char** marker_allele_ptrs = NULL;
+  Ll_fam** htable_fam;
+  Ll_bim** htable_bim;
   uintptr_t* pcptr;
   uintptr_t markers_per_pass;
   uint32_t pass_ct;
-  uintptr_t topsize;
   char* sample_ids;
   char* sample_fids;
   char* marker_ids;
@@ -15978,8 +15936,8 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   char* bufptr2;
   char* bufptr3;
   char* bufptr4;
-  Ll_entry* ll_ptr;
-  Ll_entry2* ll_ptr2;
+  Ll_fam* llfam_ptr;
+  Ll_bim* llbim_ptr;
   uint32_t* chrom_start;
   uint32_t* chrom_id;
   uint32_t chrom_ct;
@@ -15988,8 +15946,8 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   unsigned char* ubufptr;
   char cc;
   unsigned char ucc;
-  if (wkspace_alloc_ui_checked(&chrom_start, (MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&chrom_id, MAX_POSSIBLE_CHROM * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(MAX_POSSIBLE_CHROM + 1, &chrom_start) ||
+      bigstack_alloc_ui(MAX_POSSIBLE_CHROM, &chrom_id)) {
     goto merge_datasets_ret_NOMEM;
   }
 
@@ -15997,35 +15955,35 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
     merge_mode = 1;
   }
   if (merge_list) {
-    if (fopen_checked(&mergelistfile, mergename1, "r")) {
+    if (fopen_checked(mergename1, "r", &mergelistfile)) {
       goto merge_datasets_ret_READ_FAIL;
     }
     merge_ct = (famname[0] != '\0');
     ullxx = 0;
     // first pass: determine merge_ct, mergelist_buf size, verify no lines have
     // > 3 entries
-    tbuf[MAXLINELEN - 1] = ' ';
+    g_textbuf[MAXLINELEN - 1] = ' ';
     line_idx = 0;
-    while (fgets(tbuf, MAXLINELEN, mergelistfile)) {
+    while (fgets(g_textbuf, MAXLINELEN, mergelistfile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --merge-list file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --merge-list file is pathologically long.\n", line_idx);
 	goto merge_datasets_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (no_more_tokens_kns(bufptr)) {
 	continue;
       }
       bufptr2 = next_token_mult(bufptr, 3);
       if (!no_more_tokens_kns(bufptr2)) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --merge-list file has more tokens than expected.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --merge-list file has more tokens than expected.\n", line_idx);
         goto merge_datasets_ret_INVALID_FORMAT_2;
       }
       if (no_more_tokens_kns(next_token(bufptr))) {
 	bufptr2 = token_endnn(bufptr);
 	ulii = bufptr2 - bufptr;
 	if (ulii > FNAMESIZE - 5) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of --merge-list file has an excessively long fileset\nprefix.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --merge-list file has an excessively long fileset\nprefix.\n", line_idx);
 	  goto merge_datasets_ret_INVALID_FORMAT_2;
 	}
 	ullxx += 3 * ulii + 15;
@@ -16034,7 +15992,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
 	  bufptr2 = token_endnn(bufptr);
 	  ulii = bufptr2 - bufptr;
 	  if (ulii > FNAMESIZE - 1) {
-	    sprintf(logbuf, "Error: Line %" PRIuPTR " of --merge-list file has an excessively long filename.\n", line_idx);
+	    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --merge-list file has an excessively long filename.\n", line_idx);
 	    goto merge_datasets_ret_INVALID_FORMAT_2;
 	  }
 	  ullxx += ulii + 1;
@@ -16061,17 +16019,17 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
       goto merge_datasets_ret_NOMEM;
     }
 #endif
-    mergelist_bed = (char**)wkspace_alloc(merge_ct * sizeof(intptr_t));
-    mergelist_bim = (char**)wkspace_alloc(merge_ct * sizeof(intptr_t));
-    mergelist_fam = (char**)wkspace_alloc(merge_ct * sizeof(intptr_t));
-    if (wkspace_alloc_c_checked(&mergelist_buf, (uintptr_t)ullxx)) {
+    mergelist_bed = (char**)bigstack_alloc(merge_ct * sizeof(intptr_t));
+    mergelist_bim = (char**)bigstack_alloc(merge_ct * sizeof(intptr_t));
+    mergelist_fam = (char**)bigstack_alloc(merge_ct * sizeof(intptr_t));
+    if (bigstack_alloc_c((uintptr_t)ullxx, &mergelist_buf)) {
       goto merge_datasets_ret_NOMEM;
     }
     rewind(mergelistfile);
     bufptr4 = mergelist_buf;
     mlpos = (famname[0] != '\0');
-    while (fgets(tbuf, MAXLINELEN, mergelistfile)) {
-      bufptr = skip_initial_spaces(tbuf);
+    while (fgets(g_textbuf, MAXLINELEN, mergelistfile)) {
+      bufptr = skip_initial_spaces(g_textbuf);
       if (no_more_tokens_kns(bufptr)) {
 	continue;
       }
@@ -16112,9 +16070,9 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
     fclose_null(&mergelistfile);
   } else {
     merge_ct = 2;
-    mergelist_bed = (char**)wkspace_alloc(2 * sizeof(intptr_t));
-    mergelist_bim = (char**)wkspace_alloc(2 * sizeof(intptr_t));
-    mergelist_fam = (char**)wkspace_alloc(2 * sizeof(intptr_t));
+    mergelist_bed = (char**)bigstack_alloc(2 * sizeof(intptr_t));
+    mergelist_bim = (char**)bigstack_alloc(2 * sizeof(intptr_t));
+    mergelist_fam = (char**)bigstack_alloc(2 * sizeof(intptr_t));
     mergelist_bed[1] = mergename1;
     mergelist_bim[1] = mergename2;
     mergelist_fam[1] = (merge_type & MERGE_BINARY)? mergename3 : NULL;
@@ -16128,21 +16086,21 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   // ID counting/duplicate detection strategy:
   // - We do NOT want to scan through .ped files any more times than absolutely
   // necessary.  So we actually use *gasp* a hash table here.
-  // - The hash table is positioned at the FAR end of wkspace, automatically
+  // - The hash table is positioned at the FAR end of bigstack, automatically
   // sized to ~4MB (or ~2MB on 32-bit systems).  IDs are then stored
   // backwards from there.  This simplifies copying into a sorted list.
-  if (wkspace_left < HASHSIZE_S * sizeof(intptr_t)) {
+  htable_fam = (Ll_fam**)bigstack_end_alloc(HASHSIZE_S * sizeof(intptr_t));
+  if (!htable_fam) {
     goto merge_datasets_ret_NOMEM;
   }
   for (uii = 0; uii < HASHSIZE_S; uii++) {
-    htable[uii] = NULL;
+    htable_fam[uii] = NULL;
   }
-  topsize = HASHMEM_S;
 
   ullxx = 0;
   mlpos = 0;
   for (mlpos = 0; mlpos < merge_ct; mlpos++) {
-    retval = merge_fam_id_scan(mergelist_bed[mlpos], mergelist_fam[mlpos], allow_no_samples, &max_sample_id_len, &max_sample_full_len, &is_dichot_pheno, htable, &topsize, &ullxx, &ped_buflen, &cur_sample_ct, &orig_idx);
+    retval = merge_fam_id_scan(mergelist_bed[mlpos], mergelist_fam[mlpos], allow_no_samples, &max_sample_id_len, &max_sample_full_len, &is_dichot_pheno, htable_fam, &ullxx, &ped_buflen, &cur_sample_ct, &orig_idx);
     if (retval) {
       goto merge_datasets_ret_1;
     }
@@ -16158,61 +16116,58 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   }
 #ifdef __LP64__
   if (ullxx > 0x7fffffff) {
-    sprintf(logbuf, "Error: Too many %s (max 2147483647).\n", g_species_plural);
+    sprintf(g_logbuf, "Error: Too many %s (max 2147483647).\n", g_species_plural);
     goto merge_datasets_ret_INVALID_FORMAT_2;
   }
 #else
-  // avoid integer overflow in wkspace_alloc calls
+  // avoid integer overflow in bigstack_alloc calls
   if (ullxx * max_sample_full_len > 0x7fffffff) {
-    sprintf(logbuf, "Error: Too many %s for 32-bit " PROG_NAME_CAPS ".\n", g_species_plural);
+    sprintf(g_logbuf, "Error: Too many %s for 32-bit " PROG_NAME_CAPS ".\n", g_species_plural);
     goto merge_datasets_ret_INVALID_FORMAT_2;
   }
 #endif
   tot_sample_ct = ullxx;
-  // "allocate" first hash table off far side of stack before making regular
-  // stack allocations
-  wkspace_left -= topsize;
   if (sample_sort & (SAMPLE_SORT_NONE | SAMPLE_SORT_FILE)) {
-    if (wkspace_alloc_ui_checked(&sample_nsmap, tot_sample_ct * sizeof(int32_t))) {
-      goto merge_datasets_ret_NOMEM2;
+    if (bigstack_alloc_ui(tot_sample_ct, &sample_nsmap)) {
+      goto merge_datasets_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_c_checked(&sample_ids, max_sample_id_len * tot_sample_ct) ||
-      wkspace_alloc_c_checked(&sample_fids, max_sample_full_len * tot_sample_ct)) {
-    goto merge_datasets_ret_NOMEM2;
+  if (bigstack_alloc_c(max_sample_id_len * tot_sample_ct, &sample_ids) ||
+      bigstack_alloc_c(max_sample_full_len * tot_sample_ct, &sample_fids)) {
+    goto merge_datasets_ret_NOMEM;
   }
   if (is_dichot_pheno) {
-    if (wkspace_alloc_c_checked(&pheno_c_char, tot_sample_ct)) {
-      goto merge_datasets_ret_NOMEM2;
+    if (bigstack_alloc_c(tot_sample_ct, &pheno_c_char)) {
+      goto merge_datasets_ret_NOMEM;
     }
   } else {
-    if (wkspace_alloc_d_checked(&pheno_d, tot_sample_ct * sizeof(double))) {
-      goto merge_datasets_ret_NOMEM2;
+    if (bigstack_alloc_d(tot_sample_ct, &pheno_d)) {
+      goto merge_datasets_ret_NOMEM;
     }
   }
   if (sample_sort & (SAMPLE_SORT_NONE | SAMPLE_SORT_FILE)) {
-    if (wkspace_alloc_ui_checked(&map_reverse, tot_sample_ct * sizeof(int32_t))) {
-      goto merge_datasets_ret_NOMEM2;
+    if (bigstack_alloc_ui(tot_sample_ct, &map_reverse)) {
+      goto merge_datasets_ret_NOMEM;
     }
   }
   if (sample_sort == SAMPLE_SORT_NONE) {
     for (uii = 0; uii < HASHSIZE_S; uii++) {
-      if (htable[uii]) {
-	ll_ptr = htable[uii];
+      if (htable_fam[uii]) {
+	llfam_ptr = htable_fam[uii];
 	do {
-	  ujj = ll_ptr->orig_order;
-	  strcpy(&(sample_fids[ujj * max_sample_full_len]), ll_ptr->idstr);
+	  ujj = llfam_ptr->orig_order;
+	  strcpy(&(sample_fids[ujj * max_sample_full_len]), llfam_ptr->idstr);
 	  if (is_dichot_pheno) {
-	    if (ll_ptr->pheno == -9) {
+	    if (llfam_ptr->pheno == -9) {
 	      pheno_c_char[ujj] = -1;
 	    } else {
-	      pheno_c_char[ujj] = ll_ptr->pheno - 1;
+	      pheno_c_char[ujj] = llfam_ptr->pheno - 1;
 	    }
 	  } else {
-	    pheno_d[ujj] = ll_ptr->pheno;
+	    pheno_d[ujj] = llfam_ptr->pheno;
 	  }
-	  ll_ptr = ll_ptr->next;
-	} while (ll_ptr);
+	  llfam_ptr = llfam_ptr->next;
+	} while (llfam_ptr);
       }
     }
     for (ulii = 0; ulii < tot_sample_ct; ulii++) {
@@ -16222,29 +16177,29 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
       *bufptr = '\0';
     }
     if (qsort_ext(sample_fids, tot_sample_ct, max_sample_full_len, strcmp_deref, (char*)sample_nsmap, sizeof(int32_t))) {
-      goto merge_datasets_ret_NOMEM2;
+      goto merge_datasets_ret_NOMEM;
     }
   } else {
     ulii = 0;
     bufptr = sample_fids;
     for (uii = 0; uii < HASHSIZE_S; uii++) {
-      if (htable[uii]) {
-	ll_ptr = htable[uii];
+      if (htable_fam[uii]) {
+	llfam_ptr = htable_fam[uii];
 	do {
-	  strcpy(bufptr, ll_ptr->idstr);
+	  strcpy(bufptr, llfam_ptr->idstr);
 	  bufptr = &(bufptr[max_sample_full_len]);
 	  if (is_dichot_pheno) {
-	    if (ll_ptr->pheno == -9) {
+	    if (llfam_ptr->pheno == -9) {
 	      pheno_c_char[ulii] = -1;
 	    } else {
-	      pheno_c_char[ulii] = ll_ptr->pheno - 1;
+	      pheno_c_char[ulii] = llfam_ptr->pheno - 1;
 	    }
 	  } else {
-	    pheno_d[ulii] = ll_ptr->pheno;
+	    pheno_d[ulii] = llfam_ptr->pheno;
 	  }
 	  ulii++;
-	  ll_ptr = ll_ptr->next;
-	} while (ll_ptr);
+	  llfam_ptr = llfam_ptr->next;
+	} while (llfam_ptr);
       }
     }
     // bugfix: parental IDs and phenotype were being used to break sorting
@@ -16258,25 +16213,24 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
     }
     if (is_dichot_pheno) {
       if (qsort_ext(sample_fids, tot_sample_ct, max_sample_full_len, merge_nsort? strcmp_natural_deref : strcmp_deref, pheno_c_char, 1)) {
-	goto merge_datasets_ret_NOMEM2;
+	goto merge_datasets_ret_NOMEM;
       }
     } else {
       if (qsort_ext(sample_fids, tot_sample_ct, max_sample_full_len, merge_nsort? strcmp_natural_deref : strcmp_deref, (char*)pheno_d, sizeof(double))) {
-	goto merge_datasets_ret_NOMEM2;
+	goto merge_datasets_ret_NOMEM;
       }
     }
     if (sample_sort == SAMPLE_SORT_FILE) {
       retval = merge_sample_sortf(sample_sort_fname, sample_fids, tot_sample_ct, max_sample_full_len, sample_ids, max_sample_id_len, map_reverse);
       if (retval) {
-        wkspace_left += topsize;
         goto merge_datasets_ret_1;
       }
     }
   }
-  wkspace_left += topsize; // deallocate first hash table
+  bigstack_end_reset(bigstack_end_mark); // deallocate first hash table
   if (merge_mode < 6) {
     memcpy(outname_end, ".fam", 5);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto merge_datasets_ret_OPEN_FAIL;
     }
   }
@@ -16359,15 +16313,18 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
       goto merge_datasets_ret_WRITE_FAIL;
     }
   }
-  wkspace_reset(sample_fids);
+  bigstack_reset(sample_fids);
+  htable_bim = (Ll_bim**)bigstack_end_alloc(HASHSIZE * sizeof(intptr_t));
+  if (!htable_bim) {
+    goto merge_datasets_ret_NOMEM;
+  }
   for (uii = 0; uii < HASHSIZE; uii++) {
-    htable2[uii] = NULL;
+    htable_bim[uii] = NULL;
   }
-  topsize = HASHMEM;
 
   ullxx = 0;
   for (mlpos = 0; mlpos < merge_ct; ++mlpos) {
-    retval = merge_bim_scan(mergelist_bim[mlpos], (mergelist_fam[mlpos])? 1 : 0, allow_no_variants, &max_marker_id_len, htable2, &topsize, &max_bim_linelen, &ullxx, &cur_marker_ct, &position_warning_ct, &non_biallelics, allow_extra_chroms, chrom_info_ptr);
+    retval = merge_bim_scan(mergelist_bim[mlpos], (mergelist_fam[mlpos])? 1 : 0, allow_no_variants, &max_marker_id_len, htable_bim, &max_bim_linelen, &ullxx, &cur_marker_ct, &position_warning_ct, &non_biallelics, allow_extra_chroms, chrom_info_ptr);
     if (retval) {
       goto merge_datasets_ret_1;
     }
@@ -16406,7 +16363,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   }
 #endif
   if (non_biallelics) {
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
     retval = report_non_biallelics(outname, outname_end, non_biallelics);
     if (retval) {
       goto merge_datasets_ret_1;
@@ -16414,80 +16371,77 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
     goto merge_datasets_ret_INVALID_FORMAT;
   }
   tot_marker_ct = ullxx;
-  // "allocate" second hash table off far side of stack before making regular
-  // stack allocations
-  wkspace_left -= topsize;
-  marker_allele_ptrs = (char**)wkspace_alloc(tot_marker_ct * 2 * sizeof(intptr_t));
+  marker_allele_ptrs = (char**)bigstack_alloc(tot_marker_ct * 2 * sizeof(intptr_t));
   if (!marker_allele_ptrs) {
-    goto merge_datasets_ret_NOMEM2;
+    goto merge_datasets_ret_NOMEM;
   }
   for (uii = 0; uii < tot_marker_ct * 2; uii++) {
     marker_allele_ptrs[uii] = NULL;
   }
   if (max_bim_linelen) {
     max_bim_linelen++;
-    if (wkspace_alloc_c_checked(&bim_loadbuf, max_bim_linelen)) {
-      goto merge_datasets_ret_NOMEM2;
+    if (bigstack_alloc_c(max_bim_linelen, &bim_loadbuf)) {
+      goto merge_datasets_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_c_checked(&marker_ids, max_marker_id_len * tot_marker_ct) ||
-      wkspace_alloc_ui_checked(&marker_map, tot_marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_d_checked(&marker_cms, tot_marker_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&pos_buf, tot_marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_d_checked(&marker_cms_tmp, tot_marker_ct * sizeof(double)) ||
-      wkspace_alloc_ll_checked(&ll_buf, tot_marker_ct * sizeof(int64_t))) {
-    goto merge_datasets_ret_NOMEM2;
+  if (bigstack_alloc_c(max_marker_id_len * tot_marker_ct, &marker_ids) ||
+      bigstack_alloc_ui(tot_marker_ct, &marker_map) ||
+      bigstack_alloc_d(tot_marker_ct, &marker_cms) ||
+      bigstack_alloc_ui(tot_marker_ct, &pos_buf) ||
+      bigstack_alloc_d(tot_marker_ct, &marker_cms_tmp) ||
+      bigstack_alloc_ll(tot_marker_ct, &ll_buf)) {
+    goto merge_datasets_ret_NOMEM;
   }
   for (uii = 0; uii < tot_marker_ct; uii++) {
     pos_buf[uii] = uii;
   }
   ulii = 0;
   for (uii = 0; uii < HASHSIZE; uii++) {
-    if (htable2[uii]) {
-      ll_ptr2 = htable2[uii];
+    if (htable_bim[uii]) {
+      llbim_ptr = htable_bim[uii];
       do {
-	strcpy(&(marker_ids[ulii * max_marker_id_len]), ll_ptr2->idstr);
+	strcpy(&(marker_ids[ulii * max_marker_id_len]), llbim_ptr->idstr);
         ulii++;
-	ll_ptr2 = ll_ptr2->next;
-      } while (ll_ptr2);
+	llbim_ptr = llbim_ptr->next;
+      } while (llbim_ptr);
     }
   }
   // todo: reimplement this in a manner that never performs a variant ID sort.
   // chrom/pos-based sort is of course still needed, but that involves cheaper
   // int64 comparisons.
   if (qsort_ext(marker_ids, tot_marker_ct, max_marker_id_len, strcmp_deref, (char*)pos_buf, sizeof(int32_t))) {
-    goto merge_datasets_ret_NOMEM2;
+    goto merge_datasets_ret_NOMEM;
   }
   // pos_buf[n] contains the position of lexicographic marker #n in the hash
   // table.  invert this map, then traverse the hash table.
   for (uii = 0; uii < tot_marker_ct; uii++) {
     marker_map[pos_buf[uii]] = uii;
   }
-  wkspace_left += topsize; // deallocate second hash table
+  bigstack_end_reset(bigstack_end_mark); // deallocate second hash table
   ulii = 0;
   for (uii = 0; uii < HASHSIZE; uii++) {
-    if (htable2[uii]) {
-      ll_ptr2 = htable2[uii];
+    if (htable_bim[uii]) {
+      llbim_ptr = htable_bim[uii];
       do {
 	ujj = marker_map[ulii++];
-	llxx = ll_ptr2->pos;
+	llxx = llbim_ptr->pos;
 	pos_buf[ujj] = (uint32_t)llxx;
-	bufptr = ll_ptr2->allele[0];
+	bufptr = llbim_ptr->allele[0];
 	if (bufptr) {
           marker_allele_ptrs[ujj * 2] = bufptr;
 	} else {
 	  marker_allele_ptrs[ujj * 2] = missing_geno_ptr;
 	}
-	bufptr = ll_ptr2->allele[1];
+	bufptr = llbim_ptr->allele[1];
 	if (bufptr) {
 	  marker_allele_ptrs[ujj * 2 + 1] = bufptr;
 	} else {
 	  marker_allele_ptrs[ujj * 2 + 1] = missing_geno_ptr;
 	}
-	marker_cms_tmp[ujj] = ll_ptr2->cm;
+	marker_cms_tmp[ujj] = llbim_ptr->cm;
 	ll_buf[ujj] = (((uint64_t)llxx) & 0xffffffff00000000LL) | ujj;
-	ll_ptr2 = ll_ptr2->next;
-      } while (ll_ptr2);
+	llbim_ptr = llbim_ptr->next;
+      } while (llbim_ptr);
     }
   }
   sort_marker_chrom_pos(ll_buf, tot_marker_ct, pos_buf, chrom_start, chrom_id, NULL, &chrom_ct);
@@ -16501,43 +16455,37 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
     logerrprint("Error: No variants in merged file.\n");
     goto merge_datasets_ret_INVALID_FORMAT;
   }
-  wkspace_reset((char*)marker_cms_tmp);
+  bigstack_reset((char*)marker_cms_tmp);
 
   tot_sample_ct4 = (tot_sample_ct + 3) / 4;
 
   if (!keep_allele_order) {
-    ulii = (tot_marker_ct + (BITCT - 1)) / BITCT;
-    if (wkspace_alloc_ul_checked(&reversed, ulii * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(BITCT_TO_WORDCT(tot_marker_ct), &reversed)) {
       goto merge_datasets_ret_NOMEM;
     }
-    fill_ulong_zero(reversed, ulii);
   }
-  if (wkspace_alloc_ui_checked(&flex_map, MAXV(max_cur_sample_ct, max_cur_marker_text_ct) * sizeof(int32_t)) ||
-      wkspace_alloc_c_checked(&idbuf, MAXV(max_marker_id_len, max_sample_id_len))) {
+  if (bigstack_alloc_ui(MAXV(max_cur_sample_ct, max_cur_marker_text_ct), &flex_map) ||
+      bigstack_alloc_c(MAXV(max_marker_id_len, max_sample_id_len), &idbuf)) {
     goto merge_datasets_ret_NOMEM;
   }
 
-  if (tot_sample_ct4 > ped_buflen) {
-    ulii = tot_sample_ct4;
-  } else {
-    ulii = ped_buflen;
-  }
-  if (wkspace_alloc_uc_checked(&readbuf, MAXV(ulii, 3))) {
+  ulii = MAXV(tot_sample_ct4, ped_buflen);
+  if (bigstack_alloc_uc(MAXV(ulii, 3), &readbuf)) {
     goto merge_datasets_ret_NOMEM;
   }
   if (merge_must_track_write(merge_mode)) {
-    ulii = (tot_sample_ct + (BITCT - 1)) / BITCT;
+    ulii = BITCT_TO_WORDCT(tot_sample_ct);
     if (ulii) {
-      markers_per_pass = wkspace_left / (3 * sizeof(intptr_t) * ulii);
+      markers_per_pass = bigstack_left() / (3 * sizeof(intptr_t) * ulii);
       if (markers_per_pass > dedup_marker_ct) {
 	markers_per_pass = dedup_marker_ct;
       }
     } else {
       markers_per_pass = dedup_marker_ct;
     }
-    markbuf = (uintptr_t*)wkspace_alloc(markers_per_pass * ulii * sizeof(intptr_t));
+    bigstack_alloc_ul(markers_per_pass * ulii, &markbuf);
   } else if (tot_sample_ct4) {
-    markers_per_pass = wkspace_left / tot_sample_ct4;
+    markers_per_pass = bigstack_left() / tot_sample_ct4;
     if (markers_per_pass > dedup_marker_ct) {
       markers_per_pass = dedup_marker_ct;
     }
@@ -16553,24 +16501,24 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
     pass_ct = 0;
   }
 
-  writebuf = wkspace_base;
-  pcptr = (uintptr_t*)wkspace_base;
+  writebuf = g_bigstack_base;
+  pcptr = (uintptr_t*)g_bigstack_base;
   if (merge_mode < 6) {
     memcpy(outname_end, ".bed", 5);
-    if (fopen_checked(&outfile, outname, "wb")) {
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
       goto merge_datasets_ret_OPEN_FAIL;
     }
     if (fwrite_checked("l\x1b\x01", 3, outfile)) {
       goto merge_datasets_ret_WRITE_FAIL;
     }
     if (pass_ct == 1) {
-      sprintf(logbuf, "Performing single-pass merge (%u %s, %u variant%s).\n", tot_sample_ct, species_str(tot_sample_ct), dedup_marker_ct, (dedup_marker_ct == 1)? "" : "s");
+      sprintf(g_logbuf, "Performing single-pass merge (%u %s, %u variant%s).\n", tot_sample_ct, species_str(tot_sample_ct), dedup_marker_ct, (dedup_marker_ct == 1)? "" : "s");
     } else {
-      sprintf(logbuf, "Performing %u-pass merge (%u %s, %" PRIuPTR "/%u variant%s per pass).\n", pass_ct, tot_sample_ct, species_str(tot_sample_ct), markers_per_pass, dedup_marker_ct, (dedup_marker_ct == 1)? "" : "s");
+      sprintf(g_logbuf, "Performing %u-pass merge (%u %s, %" PRIuPTR "/%u variant%s per pass).\n", pass_ct, tot_sample_ct, species_str(tot_sample_ct), markers_per_pass, dedup_marker_ct, (dedup_marker_ct == 1)? "" : "s");
     }
   } else {
     memcpy(outname_end, ".diff", 6);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto merge_datasets_ret_OPEN_FAIL;
     }
     if (fputs_checked("                 SNP                  FID                  IID      NEW      OLD \n", outfile)) {
@@ -16618,7 +16566,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
 	  if (umm < tot_sample_ct) {
 	    ulkk = (uii * markers_per_pass) + ukk;
 	    reversed[ulkk / BITCT] |= (ONELU << (ulkk % BITCT));
-	    reverse_loadbuf(&(writebuf[uljj]), tot_sample_ct);
+	    reverse_loadbuf(tot_sample_ct, &(writebuf[uljj]));
 	  }
 	}
       }
@@ -16634,13 +16582,13 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   if (fclose_null(&outfile)) {
     goto merge_datasets_ret_WRITE_FAIL;
   }
-  wkspace_reset(flex_map);
-  if (wkspace_alloc_ui_checked(&map_reverse, dedup_marker_ct * sizeof(int32_t))) {
+  bigstack_reset(flex_map);
+  if (bigstack_alloc_ui(dedup_marker_ct, &map_reverse)) {
     goto merge_datasets_ret_NOMEM;
   }
   if (merge_mode < 6) {
     memcpy(outname_end, ".bim", 5);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto merge_datasets_ret_OPEN_FAIL;
     }
     uii = tot_marker_ct;
@@ -16656,8 +16604,8 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
       ukk = chrom_id[ulii];
       for (; ujj < uii; ujj++) {
 	umm = map_reverse[ujj];
-	bufptr = chrom_name_write(tbuf, chrom_info_ptr, ukk);
-	fwrite(tbuf, 1, bufptr - tbuf, outfile);
+	bufptr = chrom_name_write(chrom_info_ptr, ukk, g_textbuf);
+	fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 	if (keep_allele_order || (!IS_SET(reversed, ujj))) {
 	  bufptr = marker_allele_ptrs[2 * umm];
 	  bufptr2 = marker_allele_ptrs[2 * umm + 1];
@@ -16684,8 +16632,6 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
 
   forget_extra_chrom_names(chrom_info_ptr);
   while (0) {
-  merge_datasets_ret_NOMEM2:
-    wkspace_left += topsize;
   merge_datasets_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -16715,6 +16661,6 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   }
   fclose_cond(mergelistfile);
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return retval;
 }
diff --git a/plink_dosage.c b/plink_dosage.c
index 71b0c3a..bf7ceb1 100644
--- a/plink_dosage.c
+++ b/plink_dosage.c
@@ -70,13 +70,13 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
   uint32_t rangename_len_limit;
   uint32_t slen;
   int32_t ii;
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto dosage_load_score_files_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   retval = open_and_load_to_first_token(&infile, sc_ip->fname, loadbuf_size, '\0', "--score file", loadbuf, &bufptr, &line_idx);
   if (retval) {
     goto dosage_load_score_files_ret_1;
@@ -159,7 +159,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
     }
     if (!(loadbuf[loadbuf_size - 1])) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --score file is pathologically long.\n", line_idx);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --score file is pathologically long.\n", line_idx);
         goto dosage_load_score_files_ret_INVALID_FORMAT_2;
       }
       goto dosage_load_score_files_ret_NOMEM;
@@ -182,36 +182,35 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
     goto dosage_load_score_files_ret_NOMEM;
   }
 #endif
-  if (wkspace_alloc_c_checked(score_marker_ids_ptr, score_marker_ct * max_score_marker_id_len) ||
-      wkspace_alloc_d_checked(score_effect_sizes_ptr, score_marker_ct * sizeof(double)) ||
-      wkspace_alloc_c_checked(&allele_code_buf, (uintptr_t)allele_code_buf_len)) {
+  if (bigstack_alloc_c(score_marker_ct * max_score_marker_id_len, score_marker_ids_ptr) ||
+      bigstack_alloc_d(score_marker_ct, score_effect_sizes_ptr) ||
+      bigstack_alloc_c((uintptr_t)allele_code_buf_len, &allele_code_buf)) {
     goto dosage_load_score_files_ret_NOMEM;
   }
   score_marker_ids = *score_marker_ids_ptr;
   score_effect_sizes = *score_effect_sizes_ptr;
-  score_marker_ctl = (score_marker_ct + (BITCT - 1)) / BITCT;
+  score_marker_ctl = BITCT_TO_WORDCT(score_marker_ct);
   if (sc_ip->data_fname) {
-    if (wkspace_alloc_ul_checked(score_qrange_key_exists_ptr, score_marker_ctl * sizeof(intptr_t)) ||
-        wkspace_alloc_d_checked(score_qrange_keys_ptr, score_marker_ct * sizeof(double))) {
+    if (bigstack_calloc_ul(score_marker_ctl, score_qrange_key_exists_ptr) ||
+        bigstack_alloc_d(score_marker_ct, score_qrange_keys_ptr)) {
       goto dosage_load_score_files_ret_NOMEM;
     }
     score_qrange_key_exists = *score_qrange_key_exists_ptr;
     score_qrange_keys = *score_qrange_keys_ptr;
-    fill_ulong_zero(score_qrange_key_exists, score_marker_ctl);
   }
-  score_allele_codes = (char**)wkspace_alloc(score_marker_ct * sizeof(intptr_t));
+  score_allele_codes = (char**)bigstack_alloc(score_marker_ct * sizeof(intptr_t));
   if (!score_allele_codes) {
     goto dosage_load_score_files_ret_NOMEM;
   }
   *score_allele_codes_ptr = score_allele_codes;
   rewind(infile);
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto dosage_load_score_files_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   // pass 2: load and sort variant IDs
   retval = load_to_first_token(infile, loadbuf_size, '\0', "--score file", loadbuf, &bufptr, &line_idx);
@@ -236,7 +235,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
     }
     if (!(loadbuf[loadbuf_size - 1])) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --score file is pathologically long.\n", line_idx);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --score file is pathologically long.\n", line_idx);
         goto dosage_load_score_files_ret_INVALID_FORMAT_2;
       }
       goto dosage_load_score_files_ret_NOMEM;
@@ -336,7 +335,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
 	    goto dosage_load_score_files_ret_INVALID_FORMAT_2;
 	  }
 	  score_qrange_keys[(uint32_t)ii] = dxx;
-	  set_bit(score_qrange_key_exists, ii);
+	  set_bit(ii, score_qrange_key_exists);
 	}
       } else {
 	miss_ct++;
@@ -348,7 +347,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
       }
       if (!(loadbuf[loadbuf_size - 1])) {
 	if (loadbuf_size == MAXLINEBUFLEN) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of --q-score-range data file is pathologically long.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --q-score-range data file is pathologically long.\n", line_idx);
 	  goto dosage_load_score_files_ret_INVALID_FORMAT_2;
 	}
 	goto dosage_load_score_files_ret_NOMEM;
@@ -364,18 +363,18 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
     if (miss_ct) {
       LOGERRPRINTF("Warning: %" PRIuPTR " line%s skipped in --q-score-range data file.\n", miss_ct, (miss_ct == 1)? "" : "s");
     }
-    if (fopen_checked(&infile, sc_ip->range_fname, "r")) {
+    if (fopen_checked(sc_ip->range_fname, "r", &infile)) {
       goto dosage_load_score_files_ret_OPEN_FAIL;
     }
     rangename_len_limit = (FNAMESIZE - 10) - ((uintptr_t)(outname_end - outname));
-    tbuf[MAXLINELEN - 1] = ' ';
-    while (fgets(tbuf, MAXLINELEN, infile)) {
+    g_textbuf[MAXLINELEN - 1] = ' ';
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --q-score-range range file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --q-score-range range file is pathologically long.\n", line_idx);
         goto dosage_load_score_files_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
         continue;
       }
@@ -387,7 +386,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
 	continue;
       }
       if (slen > rangename_len_limit) {
-        sprintf(logbuf, "Error: Excessively long range name on line %" PRIuPTR " of --q-score-range range\nfile.\n", line_idx);
+        sprintf(g_logbuf, "Error: Excessively long range name on line %" PRIuPTR " of --q-score-range range\nfile.\n", line_idx);
         goto dosage_load_score_files_ret_INVALID_FORMAT_2;
       }
       qrange_ct++;
@@ -402,16 +401,16 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
       logerrprint("Error: No valid entries in --q-score-range range file.\n");
       goto dosage_load_score_files_ret_INVALID_FORMAT;
     }
-    if (wkspace_alloc_c_checked(score_qrange_names_ptr, qrange_ct * max_qrange_name_len) ||
-        wkspace_alloc_d_checked(score_qrange_bounds_ptr, qrange_ct * 2 * sizeof(double))) {
+    if (bigstack_alloc_c(qrange_ct * max_qrange_name_len, score_qrange_names_ptr) ||
+        bigstack_alloc_d(qrange_ct * 2, score_qrange_bounds_ptr)) {
       goto dosage_load_score_files_ret_NOMEM;
     }
     score_qrange_names = *score_qrange_names_ptr;
     score_qrange_bounds = *score_qrange_bounds_ptr;
     rewind(infile);
     ulii = 0; // range index
-    while (fgets(tbuf, MAXLINELEN, infile)) {
-      bufptr = skip_initial_spaces(tbuf);
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
         continue;
       }
@@ -452,7 +451,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
     retval = RET_INVALID_FORMAT;
     break;
   dosage_load_score_files_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --score file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --score file has fewer tokens than expected.\n", line_idx);
   dosage_load_score_files_ret_INVALID_FORMAT_2:
     logerrprintb();
   dosage_load_score_files_ret_INVALID_FORMAT:
@@ -467,6 +466,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
 int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* outname, char* outname_end, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* makepheno_str, char* phenoname_str, char* covar_fname, Two_col_params* qual_filter, Two_col_params* update_map, Two_col_params* update_name, char* update_ids_fname, char* update_parents_fname, char* update_sex_fname, char* filtervals_ [...]
   // sucks to duplicate so much, but this code will be thrown out later so
   // there's no long-term maintenance problem
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* phenofile = NULL;
   FILE* infile = NULL;
   FILE* profile_outfile = NULL;
@@ -543,7 +543,6 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   uint32_t* uiptr = NULL;
   uint32_t* uiptr2 = NULL;
   uint32_t* uiptr3 = NULL;
-  uintptr_t topsize = 0;
   uintptr_t unfiltered_marker_ct = 0;
   uintptr_t marker_exclude_ct = 0;
   uintptr_t max_marker_id_len = 0;
@@ -616,7 +615,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 #endif
   char missing_pheno_str[32];
   Pigz_state ps;
-  unsigned char* wkspace_mark;
+  unsigned char* bigstack_mark;
   unsigned char* overflow_buf;
   char* fnames;
   char* loadbuf;
@@ -708,13 +707,13 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   if (retval) {
     goto plink1_dosage_ret_1;
   }
-  unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   if (misc_flags & MISC_MAKE_FOUNDERS_FIRST) {
     if (make_founders(unfiltered_sample_ct, unfiltered_sample_ct, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, (misc_flags / MISC_MAKE_FOUNDERS_REQUIRE_2_MISSING) & 1, sample_exclude, founder_info)) {
       goto plink1_dosage_ret_NOMEM;
     }
   }
-  count_genders(sex_nm, sex_male, unfiltered_sample_ct, sample_exclude, &uii, &ujj, &gender_unk_ct);
+  count_genders(sex_nm, sex_male, sample_exclude, unfiltered_sample_ct, &uii, &ujj, &gender_unk_ct);
   marker_ct = unfiltered_marker_ct - marker_exclude_ct;
   if (gender_unk_ct) {
     LOGPRINTF("%" PRIuPTR " %s (%u male%s, %u female%s, %u ambiguous) loaded from .fam.\n", unfiltered_sample_ct, species_str(unfiltered_sample_ct), uii, (uii == 1)? "" : "s", ujj, (ujj == 1)? "" : "s", gender_unk_ct);
@@ -729,12 +728,12 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   if (uii) {
     LOGPRINTF("%u phenotype value%s loaded from .fam.\n", uii, (uii == 1)? "" : "s");
   }
-  if (phenoname && fopen_checked(&phenofile, phenoname, "r")) {
+  if (phenoname && fopen_checked(phenoname, "r", &phenofile)) {
     goto plink1_dosage_ret_OPEN_FAIL;
   }
   if (phenofile || update_ids_fname || update_parents_fname || update_sex_fname || (filter_flags & FILTER_TAIL_PHENO)) {
-    wkspace_mark = wkspace_base;
-    retval = sort_item_ids(&sorted_sample_ids, &sample_id_map, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+    bigstack_mark = g_bigstack_base;
+    retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, &sorted_sample_ids, &sample_id_map);
     if (retval) {
       goto plink1_dosage_ret_1;
     }
@@ -750,7 +749,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	if (retval == LOAD_PHENO_LAST_COL) {
 	  logprintb();
 	  retval = RET_INVALID_FORMAT;
-	  wkspace_reset(wkspace_mark);
+	  bigstack_reset(bigstack_mark);
 	}
         goto plink1_dosage_ret_1;
       }
@@ -761,13 +760,13 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
         goto plink1_dosage_ret_1;
       }
     }
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
   if (load_map) {
     uii = update_map || update_name || filter_attrib_fname || qual_filter;
     if (uii || extractname || excludename) {
-      wkspace_mark = wkspace_base;
-      retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, !uii, &marker_id_htable, &marker_id_htable_size);
+      bigstack_mark = g_bigstack_base;
+      retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, !uii, &marker_id_htable_size, &marker_id_htable);
       if (retval) {
 	goto plink1_dosage_ret_1;
       }
@@ -780,8 +779,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  goto plink1_dosage_ret_1;
 	}
 	if (extractname || excludename) {
-	  wkspace_reset(wkspace_mark);
-	  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+	  bigstack_reset(bigstack_mark);
+	  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
 	  if (retval) {
 	    goto plink1_dosage_ret_1;
 	  }
@@ -837,7 +836,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  goto plink1_dosage_ret_1;
 	}
       }
-      wkspace_reset(wkspace_mark);
+      bigstack_reset(bigstack_mark);
     }
     if (thin_keep_prob != 1.0) {
       if (random_thin_markers(thin_keep_prob, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0)) {
@@ -851,8 +850,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     }
   }
   if (update_ids_fname || update_parents_fname || update_sex_fname || keepname || keepfamname || removename || removefamname || filter_attrib_sample_fname || filtername) {
-    wkspace_mark = wkspace_base;
-    retval = sort_item_ids(&sorted_sample_ids, &sample_id_map, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+    bigstack_mark = g_bigstack_base;
+    retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, &sorted_sample_ids, &sample_id_map);
     if (retval) {
       goto plink1_dosage_ret_1;
     }
@@ -915,19 +914,19 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	goto plink1_dosage_ret_1;
       }
     }
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
   if (gender_unk_ct && (!(sex_missing_pheno & ALLOW_NO_SEX))) {
     uii = popcount_longs_exclude(pheno_nm, sex_nm, unfiltered_sample_ctl);
     if (uii) {
-      bitfield_and(pheno_nm, sex_nm, unfiltered_sample_ctl);
+      bitvec_and(sex_nm, unfiltered_sample_ctl, pheno_nm);
       logerrprint("Warning: Ignoring phenotypes of missing-sex samples.  If you don't want those\nphenotypes to be ignored, use the --allow-no-sex flag.\n");
     }
   }
   if (do_glm || (filter_flags & FILTER_PRUNE)) {
     ulii = sample_exclude_ct;
-    bitfield_ornot(sample_exclude, pheno_nm, unfiltered_sample_ctl);
-    zero_trailing_bits(sample_exclude, unfiltered_sample_ct);
+    bitvec_ornot(pheno_nm, unfiltered_sample_ctl, sample_exclude);
+    zero_trailing_bits(unfiltered_sample_ct, sample_exclude);
     sample_exclude_ct = popcount_longs(sample_exclude, unfiltered_sample_ctl);
     uii = do_glm && (!(filter_flags & FILTER_PRUNE));
     if (sample_exclude_ct == unfiltered_sample_ct) {
@@ -988,8 +987,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     LOGERRPRINTF("Error: No %s pass QC.\n", g_species_plural);
     goto plink1_dosage_ret_ALL_SAMPLES_EXCLUDED;
   }
-  sample_cta4 = (sample_ct + 3) & (~3);
-  sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  sample_cta4 = round_up_pow2(sample_ct, 4);
+  sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uii = do_glm && pheno_d;
   if (g_thread_ct > 1) {
     if (output_gz) {
@@ -1025,37 +1024,37 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       }
     }
   } else if (sex_covar) {
-    if (wkspace_alloc_c_checked(&covar_names, 4) ||
-        wkspace_alloc_ul_checked(&covar_nm, sample_ctl * sizeof(intptr_t)) ||
-        wkspace_alloc_d_checked(&covar_d, sample_ct * sizeof(double))) {
+    if (bigstack_alloc_c(4, &covar_names) ||
+        bigstack_alloc_ul(sample_ctl, &covar_nm) ||
+        bigstack_alloc_d(sample_ct, &covar_d)) {
       goto plink1_dosage_ret_NOMEM;
     }
     covar_ct = 1;
     max_covar_name_len = 4;
     memcpy(covar_names, "SEX", 4);
-    fill_all_bits(covar_nm, sample_ct);
+    fill_all_bits(sample_ct, covar_nm);
     for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
       next_unset_unsafe_ck(sample_exclude, &sample_uidx);
       if (is_set(sex_nm, sample_uidx)) {
         covar_d[sample_idx] = (double)((int32_t)is_set(sex_male, sample_idx));
       } else {
-	CLEAR_BIT(covar_nm, sample_idx);
+	CLEAR_BIT(sample_idx, covar_nm);
         covar_d[sample_idx] = missing_phenod;
       }
     }
   }
   param_ct = covar_ct + 2;
-  param_cta4 = (param_ct + 3) & (~3);
-  bitfield_andnot(pheno_nm, sample_exclude, unfiltered_sample_ctl);
+  param_cta4 = round_up_pow2(param_ct, 4);
+  bitvec_andnot(sample_exclude, unfiltered_sample_ctl, pheno_nm);
   if (pheno_c) {
-    bitfield_and(pheno_c, pheno_nm, unfiltered_sample_ctl);
+    bitvec_and(pheno_nm, unfiltered_sample_ctl, pheno_c);
   }
-  bitfield_andnot(founder_info, sample_exclude, unfiltered_sample_ctl);
-  bitfield_andnot(sex_nm, sample_exclude, unfiltered_sample_ctl);
+  bitvec_andnot(sample_exclude, unfiltered_sample_ctl, founder_info);
+  bitvec_andnot(sample_exclude, unfiltered_sample_ctl, sex_nm);
   if (gender_unk_ct) {
     gender_unk_ct = sample_ct - popcount_longs(sex_nm, unfiltered_sample_ctl);
   }
-  bitfield_and(sex_male, sex_nm, unfiltered_sample_ctl);
+  bitvec_and(sex_nm, unfiltered_sample_ctl, sex_male);
 
   pheno_nm_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
 
@@ -1072,7 +1071,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       enforce_min_bp_space(min_bp_space, unfiltered_marker_ct, marker_exclude, marker_pos, &marker_exclude_ct, chrom_info_ptr);
     }
     marker_ct = unfiltered_marker_ct - marker_exclude_ct;
-    retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+    retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
     if (retval) {
       goto plink1_dosage_ret_1;
     }
@@ -1089,11 +1088,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   } else if (pheno_c) {
     pheno_ctrl_ct = pheno_nm_ct - popcount_longs(pheno_c, unfiltered_sample_ctl);
     if (pheno_nm_ct != sample_ct) {
-      sprintf(logbuf, "Among remaining phenotypes, %u %s and %u %s.  (%" PRIuPTR " phenotype%s missing.)\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls", sample_ct - pheno_nm_ct, (sample_ct - pheno_nm_ct == 1)? " is" : "s are");
+      sprintf(g_logbuf, "Among remaining phenotypes, %u %s and %u %s.  (%" PRIuPTR " phenotype%s missing.)\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls", sample_ct - pheno_nm_ct, (sample_ct - pheno_nm_ct == 1)? " is" : "s are");
     } else {
-      sprintf(logbuf, "Among remaining phenotypes, %u %s and %u %s.\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls");
+      sprintf(g_logbuf, "Among remaining phenotypes, %u %s and %u %s.\n", pheno_nm_ct - pheno_ctrl_ct, (pheno_nm_ct - pheno_ctrl_ct == 1)? "is a case" : "are cases", pheno_ctrl_ct, (pheno_ctrl_ct == 1)? "is a control" : "are controls");
     }
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logprintb();
     if (standard_beta) {
       logerrprint("Error: --dosage 'standard-beta' modifier cannot be used with a case/control\nphenotype.\n");
@@ -1114,28 +1113,20 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       goto plink1_dosage_ret_1;
     }
     if (qrange_ct) {
-      if (wkspace_alloc_d_checked(&cur_scores, sample_ct * qrange_ct * sizeof(double)) ||
-          wkspace_alloc_d_checked(&score_bases, qrange_ct * sizeof(double)) ||
-          wkspace_alloc_ui_checked(&score_range_obs_cts, qrange_ct * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&score_miss_cts, sample_ct * qrange_ct * sizeof(int32_t))) {
+      if (bigstack_calloc_d(sample_ct * qrange_ct, &cur_scores) ||
+          bigstack_calloc_d(qrange_ct, &score_bases) ||
+          bigstack_alloc_ui(qrange_ct, &score_range_obs_cts) ||
+          bigstack_alloc_ui(sample_ct * qrange_ct, &score_miss_cts)) {
 	goto plink1_dosage_ret_NOMEM;
       }
-      fill_double_zero(cur_scores, sample_ct * qrange_ct);
-      fill_double_zero(score_bases, qrange_ct);
-      fill_uint_zero(score_range_obs_cts, qrange_ct);
-      fill_uint_zero(score_miss_cts, sample_ct * qrange_ct);
       *outname_end = '.';
     } else {
-      if (wkspace_alloc_d_checked(&cur_scores, sample_ct * sizeof(double)) ||
-          wkspace_alloc_d_checked(&score_bases, sizeof(double)) ||
-          wkspace_alloc_ui_checked(&score_range_obs_cts, sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&score_miss_cts, sample_ct * sizeof(int32_t))) {
+      if (bigstack_calloc_d(sample_ct, &cur_scores) ||
+          bigstack_calloc_d(1, &score_bases) ||
+          bigstack_calloc_ui(1, &score_range_obs_cts) ||
+          bigstack_calloc_ui(sample_ct, &score_miss_cts)) {
 	goto plink1_dosage_ret_NOMEM;
       }
-      fill_double_zero(cur_scores, sample_ct);
-      score_bases[0] = 0.0;
-      score_range_obs_cts[0] = 0;
-      fill_uint_zero(score_miss_cts, sample_ct);
     }
     calc_plink_maxfid(unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, &plink_maxfid, &plink_maxiid);
     missing_pheno_len = strlen(output_missing_pheno);
@@ -1170,7 +1161,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   //       know how large all our other memory allocations are.
   // 4. final write loop if necessary
   if (doip->modifier & DOSAGE_LIST) {
-    retval = open_and_load_to_first_token(&infile, doip->fname, MAXLINELEN, '\0', "--dosage list file", tbuf, &bufptr, &line_idx);
+    retval = open_and_load_to_first_token(&infile, doip->fname, MAXLINELEN, '\0', "--dosage list file", g_textbuf, &bufptr, &line_idx);
     if (retval) {
       goto plink1_dosage_ret_1;
     }
@@ -1180,11 +1171,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
         logerrprint("Error: Unexpected number of columns in --dosage list file.\n");
 	goto plink1_dosage_ret_INVALID_FORMAT;
       }
-      batch_sizes = (uint32_t*)wkspace_base;
+      batch_sizes = (uint32_t*)g_bigstack_base;
       uiptr = batch_sizes;
-      uiptr2 = (uint32_t*)(&(wkspace_base[wkspace_left / 2]));
+      uiptr2 = (uint32_t*)(&(g_bigstack_base[bigstack_left() / 2]));
     } else {
-      if (wkspace_alloc_ui_checked(&batch_sizes, sizeof(int32_t))) {
+      if (bigstack_alloc_ui(1, &batch_sizes)) {
 	goto plink1_dosage_ret_NOMEM;
       }
     }
@@ -1196,13 +1187,13 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	}
 	bufptr2 = token_endnn(bufptr);
         if (scan_int32(bufptr, (int32_t*)uiptr)) {
-	  sprintf(logbuf, "Error: Invalid batch number on line %" PRIuPTR " of --dosage list file.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Invalid batch number on line %" PRIuPTR " of --dosage list file.\n", line_idx);
           goto plink1_dosage_ret_INVALID_FORMAT_2;
 	}
 	uiptr++;
 	bufptr = skip_initial_spaces(bufptr2);
 	if (is_eoln_kns(*bufptr)) {
-          sprintf(logbuf, "Error: Line %" PRIuPTR " of --dosage list file has fewer tokens than expected.\n", line_idx);
+          sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --dosage list file has fewer tokens than expected.\n", line_idx);
 	  goto plink1_dosage_ret_INVALID_FORMAT_2;
 	}
       }
@@ -1214,7 +1205,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       if (sepheader) {
 	bufptr = skip_initial_spaces(bufptr2);
 	if (!bufptr) {
-          sprintf(logbuf, "Error: Line %" PRIuPTR " of --dosage list file has fewer tokens than expected.\n", line_idx);
+          sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --dosage list file has fewer tokens than expected.\n", line_idx);
 	  goto plink1_dosage_ret_INVALID_FORMAT_2;
 	}
 	bufptr2 = token_endnn(bufptr);
@@ -1225,23 +1216,23 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       }
       bufptr = skip_initial_spaces(bufptr2);
       if (!is_eoln_kns(*bufptr)) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --dosage list file has more tokens than expected.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --dosage list file has more tokens than expected.\n", line_idx);
 	goto plink1_dosage_ret_INVALID_FORMAT_2;
       }
       infile_ct++;
     plink1_dosage_next_list_line:
-      if (!fgets(tbuf, MAXLINELEN, infile)) {
+      if (!fgets(g_textbuf, MAXLINELEN, infile)) {
 	if (ferror(infile)) {
 	  goto plink1_dosage_ret_READ_FAIL;
 	}
 	break;
       }
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --dosage list file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --dosage list file is pathologically long.\n", line_idx);
         goto plink1_dosage_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	goto plink1_dosage_next_list_line;
       }
@@ -1253,9 +1244,10 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 #else
       qsort(batch_sizes, infile_ct, sizeof(int32_t), intcmp2);
 #endif
-      batch_sizes = (uint32_t*)wkspace_alloc(infile_ct * sizeof(int32_t));
+      bigstack_alloc_ui(infile_ct, &batch_sizes);
+
       // temporary batch size buffer
-      uiptr3 = (uint32_t*)top_alloc(&topsize, infile_ct * sizeof(int32_t));
+      bigstack_end_alloc_ui(infile_ct, &uiptr3);
 
       uii = batch_sizes[0];
       uiptr2 = &(batch_sizes[1]);
@@ -1283,12 +1275,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       }
 
       // batch numbers
-      uiptr = (uint32_t*)top_alloc(&topsize, batch_ct * sizeof(int32_t));
-      if (!uiptr) {
+      if (bigstack_end_alloc_ui(batch_ct, &uiptr)) {
 	goto plink1_dosage_ret_NOMEM;
       }
       memcpy(uiptr, batch_sizes, batch_ct * sizeof(int32_t));
-      wkspace_shrink_top(batch_sizes, batch_ct * sizeof(int32_t));
+      bigstack_shrink_top(batch_sizes, batch_ct * sizeof(int32_t));
       memcpy(batch_sizes, uiptr3, batch_ct * sizeof(int32_t));
       // convert uiptr3 to write offset array
       uii = uiptr3[0];
@@ -1298,24 +1289,22 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	uii += uiptr3[ujj];
         uiptr3[ujj] = ukk;
       }
-      wkspace_left -= topsize;
     } else {
       uiptr3 = batch_sizes;
       uiptr3[0] = 0;
     }
-    if (wkspace_alloc_c_checked(&fnames, infile_ct * max_fn_len)) {
-      goto plink1_dosage_ret_NOMEM2;
+    if (bigstack_alloc_c(infile_ct * max_fn_len, &fnames)) {
+      goto plink1_dosage_ret_NOMEM;
     }
     if (sepheader) {
-      if (wkspace_alloc_c_checked(&sep_fnames, infile_ct * max_sepheader_len)) {
-	goto plink1_dosage_ret_NOMEM2;
+      if (bigstack_alloc_c(infile_ct * max_sepheader_len, &sep_fnames)) {
+	goto plink1_dosage_ret_NOMEM;
       }
     }
-    wkspace_left += topsize;
-    topsize = 0;
+    bigstack_end_reset(bigstack_end_mark);
     rewind(infile);
-    while (fgets(tbuf, MAXLINELEN, infile)) {
-      bufptr = skip_initial_spaces(tbuf);
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
@@ -1352,26 +1341,26 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     }
   } else {
     uii = strlen(doip->fname) + 1;
-    if (wkspace_alloc_ui_checked(&batch_sizes, sizeof(int32_t)) ||
-        wkspace_alloc_c_checked(&fnames, uii + 1)) {
+    if (bigstack_alloc_ui(1, &batch_sizes) ||
+        bigstack_alloc_c(uii + 1, &fnames)) {
       goto plink1_dosage_ret_NOMEM;
     }
     batch_sizes[0] = 1;
     memcpy(fnames, doip->fname, uii);
     infile_ct = 1;
   }
-  if (wkspace_alloc_uc_checked(&overflow_buf, 2 * PIGZ_BLOCK_SIZE) ||
-      wkspace_alloc_ui_checked(&file_icts, max_batch_size * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&line_idx_arr, max_batch_size * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&batch_samples, sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&cur_samples, sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&read_idx_to_sample_idx, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&skip_vals, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_d_checked(&cur_dosages, sample_ct * sizeof(double)) ||
-      wkspace_alloc_c_checked(&cur_marker_id_buf, MAX_ID_LEN)) {
+  if (bigstack_alloc_uc(2 * PIGZ_BLOCK_SIZE, &overflow_buf) ||
+      bigstack_alloc_ui(max_batch_size, &file_icts) ||
+      bigstack_alloc_ul(max_batch_size, &line_idx_arr) ||
+      bigstack_alloc_ul(sample_ctl, &batch_samples) ||
+      bigstack_alloc_ul(sample_ctl, &cur_samples) ||
+      bigstack_alloc_ui(sample_ct, &read_idx_to_sample_idx) ||
+      bigstack_alloc_ui(sample_ct, &skip_vals) ||
+      bigstack_alloc_d(sample_ct, &cur_dosages) ||
+      bigstack_alloc_c(MAX_ID_LEN, &cur_marker_id_buf)) {
     goto plink1_dosage_ret_NOMEM;
   }
-  gz_infiles = (gzFile*)wkspace_alloc(infile_ct * sizeof(gzFile));
+  gz_infiles = (gzFile*)bigstack_alloc(infile_ct * sizeof(gzFile));
   if (!gz_infiles) {
     infile_ct = 0;
     goto plink1_dosage_ret_NOMEM;
@@ -1390,7 +1379,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     }
     // sorted_sample_ids = NULL;
   } else {
-    retval = sort_item_ids(&sorted_sample_ids, &sample_id_map, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref);
+    retval = sort_item_ids(unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 1, strcmp_deref, &sorted_sample_ids, &sample_id_map);
     if (retval) {
       goto plink1_dosage_ret_1;
     }
@@ -1411,10 +1400,10 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 #endif
       pheno_c_collapsed = pheno_c;
     } else {
-      if (wkspace_alloc_ul_checked(&pheno_nm_collapsed, sample_ctl * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(sample_ctl, &pheno_nm_collapsed)) {
 	goto plink1_dosage_ret_NOMEM;
       }
-      collapse_copy_bitarr(unfiltered_sample_ct, pheno_nm, sample_exclude, sample_ct, pheno_nm_collapsed);
+      copy_bitarr_subset_excl(pheno_nm, sample_exclude, unfiltered_sample_ct, sample_ct, pheno_nm_collapsed);
 #ifndef NOLAPACK
       if (pheno_d) {
 	pheno_d_collapsed = (double*)alloc_and_init_collapsed_arr((char*)pheno_d, sizeof(double), unfiltered_sample_ct, sample_exclude, sample_ct, 0);
@@ -1423,27 +1412,27 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	}
       } else {
 #endif
-	if (wkspace_alloc_ul_checked(&pheno_c_collapsed, sample_ctl * sizeof(intptr_t))) {
+	if (bigstack_alloc_ul(sample_ctl, &pheno_c_collapsed)) {
 	  goto plink1_dosage_ret_NOMEM;
 	}
-	collapse_copy_bitarr(unfiltered_sample_ct, pheno_c, sample_exclude, sample_ct, pheno_c_collapsed);
+	copy_bitarr_subset_excl(pheno_c, sample_exclude, unfiltered_sample_ct, sample_ct, pheno_c_collapsed);
 #ifndef NOLAPACK
       }
 #endif
     }
 #ifndef NOLAPACK
     if (pheno_d) {
-      if (wkspace_alloc_d_checked(&pheno_d2, sample_ct * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&covars_cov_major_buf, param_ct * sample_ct * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&covars_sample_major_buf, param_ct * sample_ct * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&param_2d_buf, param_ct * param_ct * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&param_2d_buf2, param_ct * param_ct * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&regression_results, (param_ct - 1) * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&dgels_a, param_ct * sample_ct * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&dgels_b, sample_ct * sizeof(double))) {
+      if (bigstack_alloc_d(sample_ct, &pheno_d2) ||
+	  bigstack_alloc_d(param_ct * sample_ct, &covars_cov_major_buf) ||
+	  bigstack_alloc_d(param_ct * sample_ct, &covars_sample_major_buf) ||
+	  bigstack_alloc_d(param_ct * param_ct, &param_2d_buf) ||
+	  bigstack_alloc_d(param_ct * param_ct, &param_2d_buf2) ||
+	  bigstack_alloc_d(param_ct - 1, &regression_results) ||
+	  bigstack_alloc_d(param_ct * sample_ct, &dgels_a) ||
+	  bigstack_alloc_d(sample_ct, &dgels_b)) {
 	goto plink1_dosage_ret_NOMEM;
       }
-      mi_buf = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc(param_ct * sizeof(MATRIX_INVERT_BUF1_TYPE));
+      mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(param_ct * sizeof(MATRIX_INVERT_BUF1_TYPE));
       if (!mi_buf) {
 	goto plink1_dosage_ret_NOMEM;
       }
@@ -1460,14 +1449,14 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	goto plink1_dosage_ret_1;
       }
       dgels_lwork = (int32_t)dxx;
-      if (wkspace_alloc_d_checked(&dgels_work, dgels_lwork * sizeof(double))) {
+      if (bigstack_alloc_d(dgels_lwork, &dgels_work)) {
 	goto plink1_dosage_ret_NOMEM;
       }
     } else {
 #endif
       if (covar_ct) {
 	ulii = covar_ct * sample_ct;
-	if (wkspace_alloc_f_checked(&covar_f, ulii * sizeof(float))) {
+	if (bigstack_alloc_f(ulii, &covar_f)) {
 	  goto plink1_dosage_ret_NOMEM;
 	}
 	fptr = covar_f;
@@ -1476,29 +1465,29 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  *fptr++ = (float)(*dptr++);
 	}
       }
-      if (wkspace_alloc_ul_checked(&perm_vec, sample_ctl * 2 * sizeof(intptr_t)) ||
-	  wkspace_alloc_f_checked(&covars_cov_major_f_buf, param_ct * sample_cta4 * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&coef_f, param_cta4 * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&pp_f, sample_cta4 * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&sample_1d_buf_f, sample_ct * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&pheno_buf_f, sample_ct * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&param_1d_buf_f, param_ct * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&param_1d_buf2_f, param_ct * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&param_2d_buf_f, param_ct * param_cta4 * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&param_2d_buf2_f, param_ct * param_cta4 * sizeof(float)) ||
-	  wkspace_alloc_f_checked(&regression_results_f, (param_ct - 1) * sizeof(float))) {
+      if (bigstack_alloc_ul(BITCT_TO_ALIGNED_WORDCT(sample_ct), &perm_vec) ||
+	  bigstack_alloc_f(param_ct * sample_cta4, &covars_cov_major_f_buf) ||
+	  bigstack_alloc_f(param_cta4, &coef_f) ||
+	  bigstack_alloc_f(sample_cta4, &pp_f) ||
+	  bigstack_alloc_f(sample_ct, &sample_1d_buf_f) ||
+	  bigstack_alloc_f(sample_ct, &pheno_buf_f) ||
+	  bigstack_alloc_f(param_ct, &param_1d_buf_f) ||
+	  bigstack_alloc_f(param_ct, &param_1d_buf2_f) ||
+	  bigstack_alloc_f(param_ct * param_cta4, &param_2d_buf_f) ||
+	  bigstack_alloc_f(param_ct * param_cta4, &param_2d_buf2_f) ||
+	  bigstack_alloc_f(param_ct - 1, &regression_results_f)) {
 	goto plink1_dosage_ret_NOMEM;
       }
 #ifndef NOLAPACK
     }
 #endif
-    if (wkspace_alloc_ul_checked(&perm_fails, sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(1, &perm_fails)) {
       goto plink1_dosage_ret_NOMEM;
     }
     if (load_map) {
-      bufptr = memcpya(tbuf, " CHR         SNP          BP", 28);
+      bufptr = memcpya(g_textbuf, " CHR         SNP          BP", 28);
     } else {
-      bufptr = memcpya(tbuf, "         SNP", 12);
+      bufptr = memcpya(g_textbuf, "         SNP", 12);
     }
     bufptr = memcpya(bufptr, freq_cc? "  A1  A2   FRQ_A   FRQ_U    INFO    " : "  A1  A2     FRQ    INFO    ", freq_cc? 36 : 28);
     bufptr = memcpya(bufptr, pheno_c? "  OR" : "BETA", 4);
@@ -1507,7 +1496,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     bufptr2 = memcpyb(outname_end, ".assoc.dosage", 14);
   } else if (count_occur) {
     // could just use a uint32_t array if .map provided
-    htable = (Ll_ctstr_entry**)wkspace_alloc(HASHSIZE * sizeof(intptr_t));
+    htable = (Ll_ctstr_entry**)bigstack_alloc(HASHSIZE * sizeof(intptr_t));
     if (!htable) {
       goto plink1_dosage_ret_NOMEM;
     }
@@ -1517,7 +1506,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     bufptr2 = memcpyb(outname_end, ".occur.dosage", 14);
   } else if (!do_score) {
     if (format_val != 1) {
-      if (wkspace_alloc_d_checked(&cur_dosages2, sample_ct * sizeof(double))) {
+      if (bigstack_alloc_d(sample_ct, &cur_dosages2)) {
 	goto plink1_dosage_ret_NOMEM;
       }
     }
@@ -1533,7 +1522,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     }
     pzwritep = (char*)overflow_buf;
     if (do_glm) {
-      pzwritep = memcpya(pzwritep, tbuf, bufptr - tbuf);
+      pzwritep = memcpya(pzwritep, g_textbuf, bufptr - g_textbuf);
     } else if (!count_occur) {
       pzwritep = memcpya(pzwritep, "SNP A1 A2 ", 10);
       for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
@@ -1550,20 +1539,20 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       append_binary_eoln(&pzwritep);
     }
   }
-  wkspace_mark = wkspace_base;
+  bigstack_mark = g_bigstack_base;
   for (batch_idx = 0; batch_idx < batch_ct; batch_idx++, file_idx_start += cur_batch_size) {
     cur_batch_size = batch_sizes[batch_idx];
     read_idx = 0;
-    loadbuf_size = wkspace_left;
+    loadbuf_size = bigstack_left();
     if (loadbuf_size > MAXLINEBUFLEN) {
       loadbuf_size = MAXLINEBUFLEN;
     } else if (loadbuf_size <= MAXLINELEN) {
       goto plink1_dosage_ret_NOMEM;
     }
-    loadbuf = (char*)wkspace_base;
+    loadbuf = (char*)g_bigstack_base;
     loadbuf[loadbuf_size - 1] = ' ';
     fill_ulong_zero(batch_samples, sample_ctl);
-    bufptr = memcpya(logbuf, "--dosage: Reading from ", 23);
+    bufptr = memcpya(g_logbuf, "--dosage: Reading from ", 23);
     if (cur_batch_size == 1) {
       bufptr = strcpya(bufptr, &(fnames[file_idx_start * max_fn_len]));
     } else if (cur_batch_size == 2) {
@@ -1579,28 +1568,29 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       bufptr = strcpya(bufptr, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
     }
     memcpyl3(bufptr, ".\n");
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logprintb();
     for (file_idx = 0; file_idx < cur_batch_size; file_idx++) {
       read_idx_start = read_idx;
       if (sepheader) {
-	if (gzopen_checked(&(gz_infiles[file_idx]), &(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]), "rb")) {
-	  goto plink1_dosage_ret_OPEN_FAIL;
+	retval = gzopen_read_checked(&(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]), &(gz_infiles[file_idx]));
+	if (retval) {
+	  goto plink1_dosage_ret_1;
 	}
 	line_idx = 0;
 	uii = 1 + skip2; // current skip value
-	while (gzgets(gz_infiles[file_idx], tbuf, MAXLINELEN)) {
+	while (gzgets(gz_infiles[file_idx], g_textbuf, MAXLINELEN)) {
 	  line_idx++;
-	  if (!tbuf[MAXLINELEN - 1]) {
-	    sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, &(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]));
+	  if (!g_textbuf[MAXLINELEN - 1]) {
+	    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, &(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]));
 	    goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	  }
-          bufptr = skip_initial_spaces(tbuf);
+          bufptr = skip_initial_spaces(g_textbuf);
           if (is_eoln_kns(*bufptr)) {
 	    continue;
 	  }
-          if (bsearch_read_fam_indiv(&(tbuf[MAXLINELEN]), sorted_sample_ids, max_sample_id_len, sample_ct, bufptr, &bufptr2, &ii)) {
-            sprintf(logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, &(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]));
+          if (bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sample_ct, &bufptr2, &ii, &(g_textbuf[MAXLINELEN]))) {
+            sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, &(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]));
 	    goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	  }
 	  if (ii == -1) {
@@ -1610,10 +1600,10 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    if (is_set(batch_samples, ii)) {
 	      bufptr = &(sorted_sample_ids[((uint32_t)ii) * max_sample_id_len]);
 	      *strchr(bufptr, '\t') = ' ';
-	      sprintf(logbuf, "Error: '%s' appears multiple times.\n", bufptr);
+	      sprintf(g_logbuf, "Error: '%s' appears multiple times.\n", bufptr);
 	      goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	    }
-	    set_bit(batch_samples, ii);
+	    set_bit(ii, batch_samples);
 	    read_idx_to_sample_idx[read_idx] = (uint32_t)ii;
             skip_vals[read_idx++] = uii;
 	    uii = 1 + (format_val == 3);
@@ -1625,11 +1615,12 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	}
 	gz_infiles[file_idx] = NULL;
 	if (read_idx_start == read_idx) {
-          sprintf(logbuf, "Error: %s is empty.\n", &(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]));
+          sprintf(g_logbuf, "Error: %s is empty.\n", &(sep_fnames[(file_idx + file_idx_start) * max_sepheader_len]));
           goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	}
       }
-      if (gzopen_checked(&(gz_infiles[file_idx]), &(fnames[(file_idx + file_idx_start) * max_fn_len]), "rb")) {
+      retval = gzopen_read_checked(&(fnames[(file_idx + file_idx_start) * max_fn_len]), &(gz_infiles[file_idx]));
+      if (retval) {
 	goto plink1_dosage_ret_OPEN_FAIL;
       }
       line_idx = 0;
@@ -1648,11 +1639,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	for (read_idx = 0; read_idx < sample_ct; read_idx++) {
 	  read_idx_to_sample_idx[read_idx] = read_idx;
 	}
-	fill_all_bits(batch_samples, sample_ct);
+	fill_all_bits(sample_ct, batch_samples);
       } else if (!sepheader) {
 	do {
 	  if (!gzgets(gz_infiles[file_idx], loadbuf, loadbuf_size)) {
-            sprintf(logbuf, "Error: %s is empty.\n", &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+            sprintf(g_logbuf, "Error: %s is empty.\n", &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	    goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	  }
 	  line_idx++;
@@ -1663,30 +1654,30 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	} while (is_eoln_kns(*bufptr));
 	bufptr = next_token_multz(bufptr, skip0);
 	bufptr2 = next_token_mult(bufptr, skip1p1);
-	if (no_more_tokens(bufptr2)) {
+	if (no_more_tokens_kns(bufptr2)) {
 	  goto plink1_dosage_ret_MISSING_TOKENS;
 	}
 	if (strcmp_se(bufptr, "SNP", 3)) {
-	  sprintf(logbuf, "Error: Column %u of %s's header isn't 'SNP'.\n", skip0 + 1, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+	  sprintf(g_logbuf, "Error: Column %u of %s's header isn't 'SNP'.\n", skip0 + 1, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	  goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	} else if (strcmp_se(bufptr2, "A1", 2)) {
-	  sprintf(logbuf, "Error: Column %u of %s's header isn't 'A1'.\n", skip0 + skip1p1 + 1, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+	  sprintf(g_logbuf, "Error: Column %u of %s's header isn't 'A1'.\n", skip0 + skip1p1 + 1, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	  goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	}
 	bufptr = next_token(bufptr2);
 	bufptr2 = next_token_multz(bufptr, skip2);
-	if (no_more_tokens(bufptr2)) {
+	if (no_more_tokens_kns(bufptr2)) {
 	  goto plink1_dosage_ret_MISSING_TOKENS;
 	}
 	if (strcmp_se(bufptr, "A2", 2)) {
-	  sprintf(logbuf, "Error: Column %u of %s's header isn't 'A2'.\n", skip0 + skip1p1 + 2, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+	  sprintf(g_logbuf, "Error: Column %u of %s's header isn't 'A2'.\n", skip0 + skip1p1 + 2, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	  goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	}
 	uii = 1 + skip2;
 	bufptr = skip_initial_spaces(token_endnn(bufptr2));
 	while (!is_eoln_kns(*bufptr)) {
-          if (bsearch_read_fam_indiv(tbuf, sorted_sample_ids, max_sample_id_len, sample_ct, bufptr, &bufptr2, &ii)) {
-	    sprintf(logbuf, "Error: Header of %s has an odd number of tokens in the FID/IID section.\n", &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+          if (bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sample_ct, &bufptr2, &ii, g_textbuf)) {
+	    sprintf(g_logbuf, "Error: Header of %s has an odd number of tokens in the FID/IID section.\n", &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	    goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	  }
 	  if (ii == -1) {
@@ -1696,10 +1687,10 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    if (is_set(batch_samples, ii)) {
 	      bufptr = &(sorted_sample_ids[((uint32_t)ii) * max_sample_id_len]);
 	      *strchr(bufptr, '\t') = ' ';
-	      sprintf(logbuf, "Error: '%s' appears multiple times.\n", bufptr);
+	      sprintf(g_logbuf, "Error: '%s' appears multiple times.\n", bufptr);
 	      goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	    }
-	    set_bit(batch_samples, ii);
+	    set_bit(ii, batch_samples);
 	    read_idx_to_sample_idx[read_idx] = (uint32_t)ii;
             skip_vals[read_idx++] = uii;
 	    uii = 1 + (format_val == 3);
@@ -1707,7 +1698,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  bufptr = bufptr2;
 	}
 	if (read_idx_start == read_idx) {
-	  sprintf(logbuf, "Error: Header of %s has no tokens in the FID/IID section.\n", &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+	  sprintf(g_logbuf, "Error: Header of %s has no tokens in the FID/IID section.\n", &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	  goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	}
       }
@@ -1744,7 +1735,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	bufptr = next_token_multz(bufptr, skip0);
 	bufptr3 = next_token_mult(bufptr, skip1p1);
 	bufptr5 = next_token(bufptr3);
-	if (no_more_tokens(bufptr5)) {
+	if (no_more_tokens_kns(bufptr5)) {
 	  goto plink1_dosage_ret_MISSING_TOKENS;
 	}
 	bufptr2 = token_endnn(bufptr);
@@ -1752,18 +1743,18 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	bufptr6 = token_endnn(bufptr5);
         slen = (uintptr_t)(bufptr2 - bufptr);
 	if (slen > MAX_ID_LEN) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of %s has an excessively long variant ID.\n", line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has an excessively long variant ID.\n", line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	  goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	}
 	if (!file_idx) {
 	  memcpyx(cur_marker_id_buf, bufptr, slen, '\0');
 	  cur_marker_id_len = slen;
 	  a1_len = (uintptr_t)(bufptr4 - bufptr3);
-	  if (allele_set(&a1_ptr, bufptr3, a1_len)) {
+	  if (allele_set(bufptr3, a1_len, &a1_ptr)) {
 	    goto plink1_dosage_ret_NOMEM;
 	  }
 	  a2_len = (uintptr_t)(bufptr6 - bufptr5);
-	  if (allele_set(&a2_ptr, bufptr5, a2_len)) {
+	  if (allele_set(bufptr5, a2_len, &a2_ptr)) {
 	    goto plink1_dosage_ret_NOMEM;
 	  }
 	  if (load_map) {
@@ -1797,11 +1788,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  }
 	} else {
 	  if ((slen != cur_marker_id_len) || memcmp(bufptr, cur_marker_id_buf, slen)) {
-	    sprintf(logbuf, "Error: Variant ID mismatch between line %" PRIuPTR " of %s and line %" PRIuPTR " of %s.\n", line_idx_arr[0], &(fnames[file_idx_start * max_fn_len]), line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+	    sprintf(g_logbuf, "Error: Variant ID mismatch between line %" PRIuPTR " of %s and line %" PRIuPTR " of %s.\n", line_idx_arr[0], &(fnames[file_idx_start * max_fn_len]), line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	    goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	  }
 	  if (((uintptr_t)(bufptr4 - bufptr3) != a1_len) || memcmp(bufptr3, a1_ptr, a1_len) || ((uintptr_t)(bufptr6 - bufptr5) != a2_len) || memcmp(bufptr5, a2_ptr, a2_len)) {
-	    sprintf(logbuf, "Error: Allele code mismatch between line %" PRIuPTR " of %s and line %" PRIuPTR " of %s.\n", line_idx_arr[0], &(fnames[file_idx_start * max_fn_len]), line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+	    sprintf(g_logbuf, "Error: Allele code mismatch between line %" PRIuPTR " of %s and line %" PRIuPTR " of %s.\n", line_idx_arr[0], &(fnames[file_idx_start * max_fn_len]), line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
 	    goto plink1_dosage_ret_INVALID_FORMAT_WW;
 	  }
 	  if ((marker_idx == ~ZEROLU) || (score_marker_idx == 0xffffffffU)) {
@@ -1816,20 +1807,19 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    ll_ptr = *ll_pptr;
 	    if (!ll_ptr) {
 	      distinct_id_ct++;
-	      topsize += ((slen + sizeof(Ll_ctstr_entry) + 15) & (~(15 * ONELU)));
-	      loadbuf_size = wkspace_left - topsize;
-              ll_ptr = (Ll_ctstr_entry*)(&(wkspace_base[loadbuf_size]));
-	      ll_ptr->next = NULL;
-	      memcpy(ll_ptr->ss, cur_marker_id_buf, slen);
-	      if (slen > max_occur_id_len) {
-		max_occur_id_len = slen;
-	      }
+	      ll_ptr = (Ll_ctstr_entry*)bigstack_end_alloc(slen + sizeof(Ll_ctstr_entry));
+	      loadbuf_size = bigstack_left();
 	      if (loadbuf_size >= MAXLINEBUFLEN) {
 		loadbuf_size = MAXLINEBUFLEN;
 	      } else if (loadbuf_size > MAXLINELEN) {
                 loadbuf[loadbuf_size - 1] = ' ';
 	      } else {
-		goto plink1_dosage_ret_NOMEM2;
+		goto plink1_dosage_ret_NOMEM;
+	      }
+	      ll_ptr->next = NULL;
+	      memcpy(ll_ptr->ss, cur_marker_id_buf, slen);
+	      if (slen > max_occur_id_len) {
+		max_occur_id_len = slen;
 	      }
 	      ll_ptr->ct = 1;
 	      *ll_pptr = ll_ptr;
@@ -1852,14 +1842,14 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 		goto plink1_dosage_ret_MISSING_TOKENS;
 	      }
 	      if (scan_double(bufptr, &dxx)) {
-		clear_bit(cur_samples, read_idx_to_sample_idx[read_idx_start]);
+		clear_bit(read_idx_to_sample_idx[read_idx_start], cur_samples);
 		continue;
 	      }
 	      if (!dose1) {
 		dxx *= 0.5;
 	      }
 	      if ((dxx > 1.0 + DOSAGE_EPSILON) || (dxx < 0.0)) {
-		clear_bit(cur_samples, read_idx_to_sample_idx[read_idx_start]);
+		clear_bit(read_idx_to_sample_idx[read_idx_start], cur_samples);
 		continue;
 	      } else if (dxx > 1.0) {
 		dxx = 1.0;
@@ -1870,16 +1860,16 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    for (; read_idx_start < read_idx; read_idx_start++) {
 	      bufptr2 = next_token_mult(bufptr, skip_vals[read_idx_start]);
 	      bufptr = next_token(bufptr2);
-	      if (no_more_tokens(bufptr)) {
+	      if (no_more_tokens_kns(bufptr)) {
 		goto plink1_dosage_ret_MISSING_TOKENS;
 	      }
 	      if (scan_double(bufptr2, &dxx) || scan_double(bufptr, &dyy)) {
-		clear_bit(cur_samples, read_idx_to_sample_idx[read_idx_start]);
+		clear_bit(read_idx_to_sample_idx[read_idx_start], cur_samples);
 		continue;
 	      }
 	      dzz = dxx + dyy;
 	      if ((dyy < 0.0) || (dxx < 0.0) || (dzz > 1.0 + DOSAGE_EPSILON)) {
-		clear_bit(cur_samples, read_idx_to_sample_idx[read_idx_start]);
+		clear_bit(read_idx_to_sample_idx[read_idx_start], cur_samples);
 		continue;
 	      } else if (dzz > 1.0) {
 		dzz = 1.0 / dzz;
@@ -1904,7 +1894,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  if (covar_nm) {
 	    // it would be more efficient to make covar_nm act as a mask on
 	    // sample_exclude earlier, but this is throwaway code
-	    bitfield_and(cur_samples, covar_nm, sample_ctl);
+	    bitvec_and(covar_nm, sample_ctl, cur_samples);
 	  }
 	  sample_valid_ct = popcount_longs(cur_samples, sample_ctl);
 	  dxx = 0.0;
@@ -1950,11 +1940,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    }
 	  }
 	  if (load_map) {
-	    pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_idx)));
+	    pzwritep = width_force(4, pzwritep, chrom_name_write(chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_idx), pzwritep));
 	    *pzwritep++ = ' ';
 	    pzwritep = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, pzwritep);
             pzwritep = memseta(pzwritep, 32, 2);
-            pzwritep = uint32_writew10(pzwritep, marker_pos[marker_idx]);
+            pzwritep = uint32toa_w10(marker_pos[marker_idx], pzwritep);
 	  } else {
 	    *pzwritep++ = ' ';
 	    pzwritep = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, pzwritep);
@@ -1996,30 +1986,30 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	      }
 	    }
 	    if (uii) {
-	      pzwritep = double_f_writew74x(pzwritep, dxx / ((double)((int32_t)uii)), ' ');
+	      pzwritep = dtoa_f_w7p4x(dxx / ((double)((int32_t)uii)), ' ', pzwritep);
 	    } else {
 	      pzwritep = memcpya(pzwritep, "     NA ", 8);
 	    }
 	    uii = sample_valid_ct - uii;
 	    if (uii) {
-	      pzwritep = double_f_writew74(pzwritep, dyy / ((double)((int32_t)uii)));
+	      pzwritep = dtoa_f_w7p4(dyy / ((double)((int32_t)uii)), pzwritep);
 	    } else {
 	      pzwritep = memcpya(pzwritep, "     NA", 7);
 	    }
 	  } else {
-            pzwritep = double_f_writew74(pzwritep, dzz);
+            pzwritep = dtoa_f_w7p4(dzz, pzwritep);
 	    // remove this kludge once scripts stop depending on it
 	    if (freq_cc) {
 	      *pzwritep++ = ' ';
-	      pzwritep = double_f_writew74(pzwritep, dzz);
+	      pzwritep = dtoa_f_w7p4(dzz, pzwritep);
 	    }
 	  }
 	  *pzwritep++ = ' ';
-	  pzwritep = double_f_writew74x(pzwritep, rsq, ' ');
+	  pzwritep = dtoa_f_w7p4x(rsq, ' ', pzwritep);
 	  if (is_valid) {
-	    pzwritep = double_f_writew74x(pzwritep, pheno_c? exp(beta * 0.5) : (beta * 0.5), ' ');
-	    pzwritep = double_f_writew74x(pzwritep, se * 0.5, ' ');
-	    pzwritep = double_g_writewx4(pzwritep, MAXV(pval, output_min_p), 7);
+	    pzwritep = dtoa_f_w7p4x(pheno_c? exp(beta * 0.5) : (beta * 0.5), ' ', pzwritep);
+	    pzwritep = dtoa_f_w7p4x(se * 0.5, ' ', pzwritep);
+	    pzwritep = dtoa_g_wxp4(MAXV(pval, output_min_p), 7, pzwritep);
 	  } else {
 	    pzwritep = memcpya(pzwritep, "     NA      NA      NA", 23);
 	  }
@@ -2072,6 +2062,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    }
 	  } while (++qrange_idx < qrange_ct);
 	} else if (!count_occur) {
+	  // --write-dosage
 	  pzwritep = strcpyax(pzwritep, cur_marker_id_buf, ' ');
 	  if (flex_pzputs_allele(&ps, &pzwritep, a1_ptr, a1_len)) {
 	    goto plink1_dosage_ret_WRITE_FAIL;
@@ -2100,7 +2091,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 		if (!is_set(cur_samples, sample_idx)) {
 		  pzwritep = memcpyl3a(pzwritep, "NA ");
 		} else {
-		  pzwritep = double_g_writex(pzwritep, 2 * cur_dosages[sample_idx], ' ');
+		  pzwritep = dtoa_gx(2 * cur_dosages[sample_idx], ' ', pzwritep);
 		}
 	      }
 	      if (flex_pzwrite(&ps, &pzwritep)) {
@@ -2118,8 +2109,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 		if (!is_set(cur_samples, sample_idx)) {
 		  pzwritep = memcpya(pzwritep, "NA NA ", 6);
 		} else {
-		  pzwritep = double_g_writex(pzwritep, cur_dosages[sample_idx], ' ');
-		  pzwritep = double_g_writex(pzwritep, cur_dosages2[sample_idx], ' ');
+		  pzwritep = dtoa_gx(cur_dosages[sample_idx], ' ', pzwritep);
+		  pzwritep = dtoa_gx(cur_dosages2[sample_idx], ' ', pzwritep);
 		}
 	      }
 	      if (flex_pzwrite(&ps, &pzwritep)) {
@@ -2138,14 +2129,14 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 		  pzwritep = memcpya(pzwritep, "NA NA NA ", 9);
 		} else {
 		  dxx = cur_dosages[sample_idx];
-		  pzwritep = double_g_writex(pzwritep, dxx, ' ');
+		  pzwritep = dtoa_gx(dxx, ' ', pzwritep);
 		  dyy = cur_dosages2[sample_idx];
-		  pzwritep = double_g_writex(pzwritep, dyy, ' ');
+		  pzwritep = dtoa_gx(dyy, ' ', pzwritep);
 		  dxx = 1.0 - dxx - dyy;
 		  if (fabs(dxx) < SMALL_EPSILON) {
 		    dxx = 0.0;
 		  }
-		  pzwritep = double_g_writex(pzwritep, dxx, ' ');
+		  pzwritep = dtoa_gx(dxx, ' ', pzwritep);
 		}
 	      }
 	      if (flex_pzwrite(&ps, &pzwritep)) {
@@ -2175,7 +2166,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       }
       gz_infiles[file_idx] = NULL;
     }
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
   if (do_score) {
     qrange_idx = 0;
@@ -2188,11 +2179,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       }
       // this is not affected by 'gz' in PLINK 1.07; retain that for backward
       // compatibility.
-      if (fopen_checked(&profile_outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &profile_outfile)) {
 	goto plink1_dosage_ret_OPEN_FAIL;
       }
-      sprintf(tbuf, "%%%us %%%us  PHENO%s %s\n", plink_maxfid, plink_maxiid, dosage_score_cnt? "    CNT" : "", score_report_average? "   SCORE" : "SCORESUM");
-      fprintf(profile_outfile, tbuf, "FID", "IID");
+      sprintf(g_textbuf, "%%%us %%%us  PHENO%s %s\n", plink_maxfid, plink_maxiid, dosage_score_cnt? "    CNT" : "", score_report_average? "   SCORE" : "SCORESUM");
+      fprintf(profile_outfile, g_textbuf, "FID", "IID");
       uii = score_range_obs_cts[qrange_idx];
       uiptr = &(score_miss_cts[sample_ct * qrange_idx]);
       dxx = score_bases[qrange_idx];
@@ -2201,7 +2192,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	next_unset_unsafe_ck(sample_exclude, &sample_uidx);
 	bufptr = &(sample_ids[sample_uidx * max_sample_id_len]);
 	bufptr2 = strchr(bufptr, '\t');
-	bufptr = fw_strcpyn(plink_maxfid, (uintptr_t)(bufptr2 - bufptr), bufptr, tbuf);
+	bufptr = fw_strcpyn(plink_maxfid, (uintptr_t)(bufptr2 - bufptr), bufptr, g_textbuf);
 	*bufptr++ = ' ';
 	bufptr = fw_strcpy(plink_maxiid, &(bufptr2[1]), bufptr);
 	*bufptr++ = ' ';
@@ -2210,7 +2201,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    bufptr = memseta(bufptr, 32, 5);
 	    *bufptr++ = '1' + IS_SET(pheno_c, sample_uidx);
 	  } else {
-	    bufptr = width_force(6, bufptr, double_g_write(bufptr, pheno_d[sample_uidx]));
+	    bufptr = width_force(6, bufptr, dtoa_g(pheno_d[sample_uidx], bufptr));
 	  }
 	} else {
 	  bufptr = memcpya(bufptr, missing_pheno_str, missing_pheno_len);
@@ -2218,7 +2209,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
         *bufptr++ = ' ';
 	ujj = uii - score_miss_cts[sample_idx];
 	if (dosage_score_cnt) {
-	  bufptr = uint32_writew6x(bufptr, uii, ' ');
+	  bufptr = uint32toa_w6x(uii, ' ', bufptr);
 	}
         if (score_mean_impute) {
 	  ujj = uii;
@@ -2229,9 +2220,9 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	} else if (score_report_average) {
 	  dyy /= ((double)((int32_t)ujj));
 	}
-	bufptr = width_force(8, bufptr, double_g_write(bufptr, dyy));
+	bufptr = width_force(8, bufptr, dtoa_g(dyy, bufptr));
 	*bufptr++ = '\n';
-        if (fwrite_checked(tbuf, bufptr - tbuf, profile_outfile)) {
+        if (fwrite_checked(g_textbuf, bufptr - g_textbuf, profile_outfile)) {
 	  goto plink1_dosage_ret_WRITE_FAIL;
 	}
       }
@@ -2243,9 +2234,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   } else {
     if (count_occur) {
       max_occur_id_len += sizeof(int32_t) + 1; // null, uint32_t
-      wkspace_left -= topsize;
-      if (wkspace_alloc_c_checked(&bufptr, max_occur_id_len * distinct_id_ct)) {
-	goto plink1_dosage_ret_NOMEM2;
+      if (bigstack_alloc_c(max_occur_id_len * distinct_id_ct, &bufptr)) {
+	goto plink1_dosage_ret_NOMEM;
       }
       ulii = 0; // write idx
       ujj = 0; // number of counts > 1
@@ -2262,13 +2252,12 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  ll_ptr = ll_ptr->next;
 	}
       }
-      wkspace_left += topsize;
       qsort(bufptr, distinct_id_ct, max_occur_id_len, strcmp_natural);
       for (ulii = 0; ulii < distinct_id_ct; ulii++) {
 	bufptr2 = &(bufptr[ulii * max_occur_id_len]);
 	slen = strlen(bufptr2);
 	pzwritep = memcpyax(pzwritep, bufptr2, slen, ' ');
-	pzwritep = uint32_write(pzwritep, *((uint32_t*)(&(bufptr2[slen + 1]))));
+	pzwritep = uint32toa(*((uint32_t*)(&(bufptr2[slen + 1]))), pzwritep);
 	append_binary_eoln(&pzwritep);
 	if (flex_pzwrite(&ps, &pzwritep)) {
 	  goto plink1_dosage_ret_WRITE_FAIL;
@@ -2289,8 +2278,6 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   }
 
   while (0) {
-  plink1_dosage_ret_NOMEM2:
-    wkspace_left += topsize;
   plink1_dosage_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -2315,9 +2302,9 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     retval = RET_NOMEM;
     break;
   plink1_dosage_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, &(fnames[(file_idx + file_idx_start) * max_fn_len]));
   plink1_dosage_ret_INVALID_FORMAT_WW:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
   plink1_dosage_ret_INVALID_FORMAT_2:
     logerrprintb();
   plink1_dosage_ret_INVALID_FORMAT:
@@ -2348,6 +2335,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       gzclose_cond(gz_infiles[uii]);
     }
   }
+  bigstack_end_reset(bigstack_end_mark);
 
   return retval;
 }
diff --git a/plink_family.c b/plink_family.c
index 12cee9e..d18a3ac 100644
--- a/plink_family.c
+++ b/plink_family.c
@@ -77,17 +77,17 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
   // with no adverse consequences.  For backward compatibility, we replicate
   // this.  (Possible todo: report a warning exactly once when this happens.)
   // It won't be replicated in PLINK 2.0.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uint64_t* edge_list = NULL;
   uint32_t* toposort_queue = NULL;
   char* fids = NULL;
   char* iids = NULL;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctp1l = 1 + (unfiltered_sample_ct / BITCT);
   uintptr_t sample_uidx = next_unset_unsafe(sample_exclude, 0);
   // does *not* use populate_id_htable
   uintptr_t htable_size = geqprime(2 * unfiltered_sample_ct + 1);
-  uintptr_t topsize = 0;
   uintptr_t max_fid_len = 2;
   uintptr_t max_iid_len = 2;
   uint64_t family_code = 0;
@@ -126,8 +126,6 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
   uint64_t trio_code;
   uint64_t edge_code;
   uint64_t ullii;
-  uintptr_t topsize_bak;
-  uintptr_t topsize_bak2;
   uintptr_t family_idx;
   uintptr_t trio_ct;
   uintptr_t trio_idx;
@@ -141,48 +139,29 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
   uint32_t first_sex;
   uint32_t uii;
   int32_t sorted_idx;
-  founder_info2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctp1l * sizeof(intptr_t));
-  if (!founder_info2) {
+  if (bigstack_end_alloc_ul(unfiltered_sample_ctp1l, &founder_info2) ||
+      bigstack_end_alloc_ull(sample_ct, &trio_list_tmp) ||
+      bigstack_end_alloc_c(sample_ct * max_sample_id_len, &sorted_sample_ids) ||
+      bigstack_end_alloc_ui(sample_ct, &sample_id_map) ||
+      bigstack_end_alloc_c(max_sample_id_len, &idbuf)) {
     goto get_trios_and_families_ret_NOMEM;
   }
   memcpy(founder_info2, founder_info, unfiltered_sample_ctl * sizeof(intptr_t));
   if (unfiltered_sample_ct & (BITCT - 1)) {
-    SET_BIT(founder_info2, unfiltered_sample_ct);
+    SET_BIT(unfiltered_sample_ct, founder_info2);
   } else {
     founder_info2[unfiltered_sample_ctl] = 1;
   }
-  topsize_bak = topsize;
-  trio_list_tmp = (uint64_t*)top_alloc(&topsize, sample_ct * sizeof(int64_t));
-  if (!trio_list_tmp) {
-    goto get_trios_and_families_ret_NOMEM;
-  }
-  topsize_bak2 = topsize;
-  sorted_sample_ids = (char*)top_alloc(&topsize, sample_ct * max_sample_id_len);
-  if (!sorted_sample_ids) {
-    goto get_trios_and_families_ret_NOMEM;
-  }
-  sample_id_map = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!sample_id_map) {
-    goto get_trios_and_families_ret_NOMEM;
-  }
-  idbuf = (char*)top_alloc(&topsize, max_sample_id_len);
-  if (!idbuf) {
-    goto get_trios_and_families_ret_NOMEM;
-  }
-  wkspace_left -= topsize;
-  if (sort_item_ids_noalloc(sorted_sample_ids, sample_id_map, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref)) {
-    wkspace_left += topsize;
+  if (sort_item_ids_noalloc(unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref, sorted_sample_ids, sample_id_map)) {
     goto get_trios_and_families_ret_1;
   }
   // over-allocate here, we shrink family_list later when we know how many
   // families there are
-  if (wkspace_alloc_ull_checked(&family_list, sample_ct * sizeof(int64_t)) ||
-      wkspace_alloc_ull_checked(&family_htable, htable_size * sizeof(int64_t)) ||
-      wkspace_alloc_ui_checked(&family_idxs, htable_size * sizeof(int32_t))) {
-    goto get_trios_and_families_ret_NOMEM2;
+  if (bigstack_alloc_ull(sample_ct, &family_list) ||
+      bigstack_calloc_ull(htable_size, &family_htable) ||
+      bigstack_alloc_ui(htable_size, &family_idxs)) {
+    goto get_trios_and_families_ret_NOMEM;
   }
-  fill_ull_zero(family_htable, htable_size);
-  wkspace_left += topsize;
   // 1. populate family_list (while using family_htable to track duplicates),
   //    determine max_iid_len, count qualifying trios
   trio_write = trio_list_tmp;
@@ -214,7 +193,7 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
     first_sex = 0;
     if (sorted_idx == -1) {
       if (!include_duos) {
-	SET_BIT(founder_info2, sample_uidx);
+	SET_BIT(sample_uidx, founder_info2);
 	continue;
       }
       uidx1 = unfiltered_sample_ct;
@@ -240,7 +219,7 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
     }
     if (sorted_idx == -1) {
       if ((!include_duos) || (uidx1 == unfiltered_sample_ct)) {
-	SET_BIT(founder_info2, sample_uidx);
+	SET_BIT(sample_uidx, founder_info2);
 	continue;
       }
       if (first_sex == 1) {
@@ -304,34 +283,31 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
     *trio_write++ = (((uint64_t)family_idx) << 32) | ((uint64_t)sample_uidx);
   }
   trio_ct = (uintptr_t)(trio_write - trio_list_tmp);
-  wkspace_reset(wkspace_mark);
-  wkspace_alloc(family_ct * sizeof(int64_t)); // family_list
-  topsize = topsize_bak2;
-  wkspace_left -= topsize;
-  if (wkspace_alloc_ull_checked(&trio_write, trio_ct * sizeof(int64_t))) {
-    goto get_trios_and_families_ret_NOMEM2;
-  }
-  wkspace_left += topsize;
-  topsize = topsize_bak;
+  bigstack_reset(bigstack_mark);
+  bigstack_alloc(family_ct * sizeof(int64_t)); // family_list
+  bigstack_end_reset(trio_list_tmp);
+  if (bigstack_alloc_ull(trio_ct, &trio_write)) {
+    goto get_trios_and_families_ret_NOMEM;
+  }
   memcpy(trio_write, trio_list_tmp, trio_ct * sizeof(int64_t));
 #ifdef __cplusplus
   std::sort((int64_t*)trio_write, (int64_t*)(&(trio_write[trio_ct])));
 #else
   qsort(trio_write, trio_ct, sizeof(int64_t), llcmp);
 #endif
-  wkspace_left -= topsize;
-  if (wkspace_alloc_ui_checked(&trio_lookup, trio_ct * (3 + toposort) * sizeof(int32_t))) {
-    goto get_trios_and_families_ret_NOMEM2;
+  bigstack_end_reset(founder_info2);
+  if (bigstack_alloc_ui(trio_ct * (3 + toposort), &trio_lookup)) {
+    goto get_trios_and_families_ret_NOMEM;
   }
   if (fids_ptr) {
-    if (wkspace_alloc_c_checked(&fids, trio_ct * max_fid_len) ||
-	wkspace_alloc_c_checked(&iids, (unfiltered_sample_ct + include_duos) * max_iid_len)) {
-      goto get_trios_and_families_ret_NOMEM2;
+    if (bigstack_alloc_c(trio_ct * max_fid_len, &fids) ||
+	bigstack_alloc_c((unfiltered_sample_ct + include_duos) * max_iid_len, &iids)) {
+      goto get_trios_and_families_ret_NOMEM;
     }
   }
   if (toposort) {
-    if (wkspace_alloc_ull_checked(&edge_list, trio_ct * 2 * sizeof(int64_t))) {
-      goto get_trios_and_families_ret_NOMEM2;
+    if (bigstack_alloc_ull(trio_ct * 2, &edge_list)) {
+      goto get_trios_and_families_ret_NOMEM;
     }
     // Edge list excludes founder parents; edge codes have parental uidx in
     // high 32 bits, trio idx in low 32.
@@ -353,13 +329,12 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
       qsort(edge_list, edge_ct, sizeof(int64_t), llcmp);
 #endif
     }
-    wkspace_shrink_top(edge_list, edge_ct * sizeof(int64_t));
-    if (wkspace_alloc_ui_checked(&toposort_queue, trio_ct * sizeof(int32_t))) {
-      goto get_trios_and_families_ret_NOMEM2;
+    bigstack_shrink_top(edge_list, edge_ct * sizeof(int64_t));
+    if (bigstack_alloc_ui(trio_ct, &toposort_queue)) {
+      goto get_trios_and_families_ret_NOMEM;
     }
     remaining_edge_ct = edge_ct;
   }
-  wkspace_left += topsize;
   *family_list_ptr = family_list;
   *family_ct_ptr = family_ct;
   *trio_list_ptr = trio_write;
@@ -417,7 +392,7 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
       *uiptr++ = (uint32_t)(family_code >> 32);
       *uiptr++ = trio_idx;
       if (remaining_edge_ct) {
-        SET_BIT(founder_info2, sample_uidx);
+        SET_BIT(sample_uidx, founder_info2);
         ullii = ((uint64_t)sample_uidx) << 32;
         uii = uint64arr_greater_than(edge_list, edge_ct, ullii);
         ullii |= 0xffffffffLLU;
@@ -441,11 +416,9 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
       logerrprint("Error: Pedigree graph is cyclic.  Check for evidence of time travel abuse in\nyour cohort.\n");
       goto get_trios_and_families_ret_INVALID_FORMAT;
     }
-    wkspace_reset(edge_list);
+    bigstack_reset(edge_list);
   }
   while (0) {
-  get_trios_and_families_ret_NOMEM2:
-    wkspace_left += topsize;
   get_trios_and_families_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -457,8 +430,9 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
   }
  get_trios_and_families_ret_1:
   if (retval) {
-    wkspace_reset(wkspace_mark);
+    bigstack_reset(bigstack_mark);
   }
+  bigstack_end_reset(bigstack_end_mark);
   return retval;
 }
 
@@ -477,7 +451,7 @@ uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf,
   uint32_t uoo;
   uint32_t upp;
   memcpy(workbuf, loadbuf, (unfiltered_sample_ct + 3) / 4);
-  SET_BIT_DBL(workbuf, unfiltered_sample_ct);
+  SET_BIT_DBL(unfiltered_sample_ct, workbuf);
   if (!multigen) {
     for (trio_idx = 0; trio_idx < trio_ct; trio_idx++) {
       uii = *uiptr++;
@@ -686,7 +660,7 @@ void fill_mendel_errstr(uint32_t error_code, char** allele_ptrs, uint32_t* alens
 }
 
 int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uint32_t allow_no_variants, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_ [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   FILE* outfile_l = NULL;
   uintptr_t* sample_male_include2 = NULL;
@@ -772,25 +746,28 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
     LOGERRPRINTF("Warning: Skipping --me/--mendel since there are no %strios.\n", include_duos? "duos or " : "");
     goto mendel_error_scan_ret_1;
   }
+  if (family_ct > 0x55555555U) {
+    // may as well document this limit
+    logerrprint("Error: Too many families for --me/--mendel.\n");
+    goto mendel_error_scan_ret_INVALID_CMDLINE;
+  }
+
   trio_ct4 = (trio_ct + 3) / 4;
-  trio_ctl = (trio_ct + (BITCT - 1)) / BITCT;
+  trio_ctl = BITCT_TO_WORDCT(trio_ct);
   var_error_max = (int32_t)(fam_ip->mendel_max_var_error * (1 + SMALL_EPSILON) * ((intptr_t)trio_ct));
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&error_cts, trio_ct * 3 * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&error_cts_tmp, trio_ct4 * 4 * sizeof(int32_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctp1l2, &loadbuf) ||
+      bigstack_calloc_ui(trio_ct * 3, &error_cts) ||
+      bigstack_calloc_ui(trio_ct4 * 4, &error_cts_tmp)) {
     goto mendel_error_scan_ret_NOMEM;
   }
   if (!var_first) {
     error_cts_tmp2 = error_cts_tmp;
   } else {
-    if (wkspace_alloc_ui_checked(&error_cts_tmp2, trio_ct4 * 4 * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(trio_ct4 * 4, &error_cts_tmp2)) {
       goto mendel_error_scan_ret_NOMEM;
     }
-    fill_uint_zero(error_cts_tmp2, trio_ct4 * 4);
   }
   loadbuf[unfiltered_sample_ctp1l2 - 1] = 0;
-  fill_uint_zero(error_cts, trio_ct * 3);
-  fill_uint_zero(error_cts_tmp, trio_ct4 * 4);
   hh_exists &= XMHH_EXISTS;
   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_exists, 0, sample_exclude, sex_male, NULL, &sample_male_include2)) {
     goto mendel_error_scan_ret_NOMEM;
@@ -808,36 +785,35 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
     if (ulii < 25) {
       ulii = 25;
     }
-    if (wkspace_alloc_ull_checked(&family_error_cts, family_ct * 3 * sizeof(int64_t)) ||
-        wkspace_alloc_ui_checked(&child_cts, family_ct * sizeof(int32_t)) ||
-        wkspace_alloc_c_checked(&(errstrs[0]), ulii * 10)) {
+    if (bigstack_alloc_ull(family_ct * 3, &family_error_cts) ||
+        bigstack_alloc_ui(family_ct, &child_cts) ||
+        bigstack_alloc_c(ulii * 10, &(errstrs[0]))) {
       goto mendel_error_scan_ret_NOMEM;
     }
     for (uii = 1; uii < 10; uii++) {
       errstrs[uii] = &(errstrs[0][uii * ulii]);
     }
     if (multigen && full_error_list) {
-      if (wkspace_alloc_ul_checked(&error_locs, trio_ctl * sizeof(intptr_t)) ||
-	  wkspace_alloc_uc_checked(&cur_errors, trio_ct)) {
+      if (bigstack_calloc_ul(trio_ctl, &error_locs) ||
+	  bigstack_alloc_uc(trio_ct, &cur_errors)) {
 	goto mendel_error_scan_ret_NOMEM;
       }
-      fill_ulong_zero(error_locs, trio_ctl);
     }
     if (full_error_list) {
       memcpy(outname_end, ".mendel", 8);
-      if (fopen_checked(&outfile, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile)) {
 	goto mendel_error_scan_ret_OPEN_FAIL;
       }
-      sprintf(tbuf, "%%%us %%%us  CHR %%%us   CODE                 ERROR\n", plink_maxfid, plink_maxiid, plink_maxsnp);
-      fprintf(outfile, tbuf, "FID", "KID", "SNP");
+      sprintf(g_textbuf, "%%%us %%%us  CHR %%%us   CODE                 ERROR\n", plink_maxfid, plink_maxiid, plink_maxsnp);
+      fprintf(outfile, g_textbuf, "FID", "KID", "SNP");
     }
     memcpy(outname_end, ".lmendel", 9);
-    if (fopen_checked(&outfile_l, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile_l)) {
       goto mendel_error_scan_ret_OPEN_FAIL;
     }
     // replicate harmless 'N' misalignment bug
-    sprintf(tbuf, " CHR %%%us   N\n", plink_maxsnp);
-    fprintf(outfile_l, tbuf, "SNP");
+    sprintf(g_textbuf, " CHR %%%us   N\n", plink_maxsnp);
+    fprintf(outfile_l, g_textbuf, "SNP");
   } else {
     // suppress warning
     fill_ulong_zero((uintptr_t*)errstrs, 10);
@@ -854,18 +830,18 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
       continue;
     }
     if (calc_mendel) {
-      chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, chrom_idx, &chrom_name_len);
+      chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, &chrom_name_len, chrom_name_buf);
     }
     if (uii != marker_uidx) {
       marker_uidx = uii;
       goto mendel_error_scan_seek;
     }
     while (1) {
-      if (load_raw2(bedfile, loadbuf, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+      if (load_raw2(unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask, bedfile, loadbuf)) {
 	goto mendel_error_scan_ret_READ_FAIL;
       }
       if (IS_SET(marker_reverse, marker_uidx)) {
-        reverse_loadbuf((unsigned char*)loadbuf, unfiltered_sample_ct);
+        reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf);
       }
       if (hh_exists && is_x) {
 	hh_reset((unsigned char*)loadbuf, sample_male_include2, unfiltered_sample_ct);
@@ -873,7 +849,7 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
       // missing parents are treated as having uidx equal to
       // unfiltered_sample_ct, and we set the corresponding genotype to always
       // be missing.  This lets us avoid special-casing duos.
-      SET_BIT_DBL(loadbuf, unfiltered_sample_ct);
+      SET_BIT_DBL(unfiltered_sample_ct, loadbuf);
       uiptr = trio_lookup;
       cur_error_ct = 0;
       if (calc_mendel) {
@@ -900,14 +876,14 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
 	    cur_error_ct++;
 	    if (full_error_list) {
 	      umm >>= 24;
-	      wptr = fw_strcpy(plink_maxfid, &(fids[trio_idx * max_fid_len]), tbuf);
+	      wptr = fw_strcpy(plink_maxfid, &(fids[trio_idx * max_fid_len]), g_textbuf);
 	      *wptr++ = ' ';
 	      wptr = fw_strcpy(plink_maxiid, &(iids[uii * max_iid_len]), wptr);
 	      *wptr++ = ' ';
 	      wptr = memcpyax(wptr, chrom_name_ptr, chrom_name_len, ' ');
 	      wptr = fw_strcpyn(plink_maxsnp, varlen, varptr, wptr);
 	      wptr = memseta(wptr, 32, 5);
-	      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 		goto mendel_error_scan_ret_WRITE_FAIL;
 	      }
 	      if (!errstr_lens[umm]) {
@@ -939,7 +915,7 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
 	      error_cts_tmp2[trio_idx] += umm & 0xffffff;
 	      cur_error_ct++;
 	      if (full_error_list) {
-	        set_bit(error_locs, trio_idx);
+	        set_bit(trio_idx, error_locs);
 		umm >>= 24;
                 cur_errors[trio_idx] = (unsigned char)umm;
 	      }
@@ -954,14 +930,14 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
           trio_idx = 0;
 	  for (uii = 0; uii < cur_error_ct; trio_idx++, uii++) {
             next_set_ul_unsafe_ck(error_locs, &trio_idx);
-	    wptr = fw_strcpy(plink_maxfid, &(fids[trio_idx * max_fid_len]), tbuf);
+	    wptr = fw_strcpy(plink_maxfid, &(fids[trio_idx * max_fid_len]), g_textbuf);
 	    *wptr++ = ' ';
 	    wptr = fw_strcpy(plink_maxiid, &(iids[((uint32_t)trio_list[trio_idx]) * max_iid_len]), wptr);
 	    *wptr++ = ' ';
 	    wptr = memcpyax(wptr, chrom_name_ptr, chrom_name_len, ' ');
 	    wptr = fw_strcpyn(plink_maxsnp, varlen, varptr, wptr);
 	    wptr = memseta(wptr, 32, 5);
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto mendel_error_scan_ret_WRITE_FAIL;
 	    }
 	    umm = cur_errors[trio_idx];
@@ -979,17 +955,17 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
 	if (fwrite_checked(chrom_name_ptr, chrom_name_len, outfile_l)) {
 	  goto mendel_error_scan_ret_WRITE_FAIL;
 	}
-	tbuf[0] = ' ';
-	wptr = fw_strcpyn(plink_maxsnp, varlen, varptr, &(tbuf[1]));
+	g_textbuf[0] = ' ';
+	wptr = fw_strcpyn(plink_maxsnp, varlen, varptr, &(g_textbuf[1]));
         *wptr++ = ' ';
-        wptr = uint32_writew4x(wptr, cur_error_ct, '\n');
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile_l)) {
+        wptr = uint32toa_w4x(cur_error_ct, '\n', wptr);
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_l)) {
 	  goto mendel_error_scan_ret_WRITE_FAIL;
 	}
       }
       if (cur_error_ct) {
 	if (cur_error_ct > var_error_max) {
-	  SET_BIT(marker_exclude, marker_uidx);
+	  SET_BIT(marker_uidx, marker_exclude);
 	  new_marker_exclude_ct++;
 	}
 	if ((cur_error_ct <= var_error_max) || (!var_first)) {
@@ -1064,11 +1040,11 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
       goto mendel_error_scan_ret_WRITE_FAIL;
     }
     outname_end[1] = 'f';
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto mendel_error_scan_ret_OPEN_FAIL;
     }
-    sprintf(tbuf, "%%%us %%%us %%%us   CHLD    N\n", plink_maxfid, plink_maxiid, plink_maxiid);
-    fprintf(outfile, tbuf, "FID", "PAT", "MAT");
+    sprintf(g_textbuf, "%%%us %%%us %%%us   CHLD    N\n", plink_maxfid, plink_maxiid, plink_maxiid);
+    fprintf(outfile, g_textbuf, "FID", "PAT", "MAT");
     fill_ull_zero(family_error_cts, family_ct * 3);
     fill_uint_zero(child_cts, family_ct);
     for (trio_idx = 0; trio_idx < trio_ct; trio_idx++) {
@@ -1085,11 +1061,11 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
       if (ujj < unfiltered_sample_ct) {
 	// bleah, fids[] isn't in right order for this lookup
 	cptr = &(sample_ids[ujj * max_sample_id_len]);
-	wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, tbuf);
+	wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, g_textbuf);
       } else {
 	cptr = &(sample_ids[ukk * max_sample_id_len]);
-	wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, tbuf);
-	// wptr = memseta(tbuf, 32, plink_maxfid - 1);
+	wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, g_textbuf);
+	// wptr = memseta(g_textbuf, 32, plink_maxfid - 1);
 	// *wptr++ = '0';
       }
       *wptr++ = ' ';
@@ -1107,14 +1083,14 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
 	*wptr++ = '0';
       }
       *wptr++ = ' ';
-      wptr = uint32_writew6x(wptr, child_cts[uii], ' ');
+      wptr = uint32toa_w6x(child_cts[uii], ' ', wptr);
       if (family_error_cts[uii * 3] < 10000) {
-	wptr = uint32_writew4(wptr, (uint32_t)family_error_cts[uii * 3]);
+	wptr = uint32toa_w4((uint32_t)family_error_cts[uii * 3], wptr);
       } else {
-        wptr = int64_write(wptr, family_error_cts[uii * 3]);
+        wptr = int64toa(family_error_cts[uii * 3], wptr);
       }
       *wptr++ = '\n';
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto mendel_error_scan_ret_WRITE_FAIL;
       }
     }
@@ -1122,11 +1098,11 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
       goto mendel_error_scan_ret_WRITE_FAIL;
     }
     outname_end[1] = 'i';
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto mendel_error_scan_ret_OPEN_FAIL;
     }
-    sprintf(tbuf, "%%%us %%%us   N\n", plink_maxfid, plink_maxiid);
-    fprintf(outfile, tbuf, "FID", "IID");
+    sprintf(g_textbuf, "%%%us %%%us   N\n", plink_maxfid, plink_maxiid);
+    fprintf(outfile, g_textbuf, "FID", "IID");
     uii = 0xffffffffU; // family idx
     for (trio_idx = 0; trio_idx < trio_ct; trio_idx++) {
       trio_code = trio_list[trio_idx];
@@ -1134,18 +1110,18 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
       if (ujj != uii) {
 	uii = ujj;
         family_code = family_list[uii];
-	wptr = fw_strcpy(plink_maxfid, &(fids[trio_idx * max_fid_len]), tbuf);
+	wptr = fw_strcpy(plink_maxfid, &(fids[trio_idx * max_fid_len]), g_textbuf);
 	*wptr++ = ' ';
 	ujj = (uint32_t)family_code;
 	if (ujj != unfiltered_sample_ct) {
 	  wptr = fw_strcpy(plink_maxiid, &(iids[ujj * max_iid_len]), wptr);
 	  *wptr++ = ' ';
 	  if (family_error_cts[3 * uii + 1] < 10000) {
-	    wptr = uint32_writew4(wptr, (uint32_t)family_error_cts[3 * uii + 1]);
+	    wptr = uint32toa_w4((uint32_t)family_error_cts[3 * uii + 1], wptr);
 	  } else {
-	    wptr = int64_write(wptr, family_error_cts[3 * uii + 1]);
+	    wptr = int64toa(family_error_cts[3 * uii + 1], wptr);
 	  }
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto mendel_error_scan_ret_WRITE_FAIL;
 	  }
 	}
@@ -1154,24 +1130,24 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
 	  if (ujj != unfiltered_sample_ct) {
 	    putc('\n', outfile);
 	  }
-	  wptr = fw_strcpy(plink_maxiid, &(iids[ukk * max_iid_len]), &(tbuf[plink_maxfid + 1]));
+	  wptr = fw_strcpy(plink_maxiid, &(iids[ukk * max_iid_len]), &(g_textbuf[plink_maxfid + 1]));
 	  *wptr++ = ' ';
 	  if (family_error_cts[3 * uii + 2] < 10000) {
-	    wptr = uint32_writew4(wptr, (uint32_t)family_error_cts[3 * uii + 2]);
+	    wptr = uint32toa_w4((uint32_t)family_error_cts[3 * uii + 2], wptr);
 	  } else {
-	    wptr = int64_write(wptr, family_error_cts[3 * uii + 2]);
+	    wptr = int64toa(family_error_cts[3 * uii + 2], wptr);
 	  }
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto mendel_error_scan_ret_WRITE_FAIL;
 	  }
 	}
 	putc(' ', outfile); // PLINK 1.07 formatting quirk
 	putc('\n', outfile);
       }
-      wptr = fw_strcpy(plink_maxiid, &(iids[((uint32_t)trio_code) * max_iid_len]), &(tbuf[plink_maxfid + 1]));
+      wptr = fw_strcpy(plink_maxiid, &(iids[((uint32_t)trio_code) * max_iid_len]), &(g_textbuf[plink_maxfid + 1]));
       *wptr++ = ' ';
-      wptr = uint32_writew4x(wptr, error_cts[trio_idx * 3], '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = uint32toa_w4x(error_cts[trio_idx * 3], '\n', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto mendel_error_scan_ret_WRITE_FAIL;
       }
     }
@@ -1197,30 +1173,30 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
 	  ujj = (uint32_t)family_code;
 	  ukk = (uint32_t)(family_code >> 32);
           if (exclude_one_ratio == 0.0) {
-	    set_bit(sample_exclude, (uint32_t)trio_code);
+	    set_bit((uint32_t)trio_code, sample_exclude);
 	    if (ujj < unfiltered_sample_ct) {
-	      set_bit(sample_exclude, ujj);
+	      set_bit(ujj, sample_exclude);
 	    }
 	    if (ukk < unfiltered_sample_ct) {
-	      set_bit(sample_exclude, ukk);
+	      set_bit(ukk, sample_exclude);
 	    }
 	  } else if ((exclude_one_ratio == -1) || (ujj == unfiltered_sample_ct) || (ukk == unfiltered_sample_ct)) {
-            set_bit(sample_exclude, (uint32_t)trio_code);
+            set_bit((uint32_t)trio_code, sample_exclude);
 	  } else {
 	    dxx = (double)((int32_t)trio_list[trio_idx * 3 + 1]);
 	    dyy = (double)((int32_t)trio_list[trio_idx * 3 + 2]);
 	    if (dxx > exclude_one_ratio * dyy) {
-	      set_bit(sample_exclude, ujj);
+	      set_bit(ujj, sample_exclude);
 	    } else if (dyy > exclude_one_ratio * dxx) {
-	      set_bit(sample_exclude, ukk);
+	      set_bit(ukk, sample_exclude);
 	    } else {
-	      set_bit(sample_exclude, (uint32_t)trio_code);
+	      set_bit((uint32_t)trio_code, sample_exclude);
 	    }
 	  }
 	}
       }
     }
-    ulii = popcount_longs(sample_exclude, (unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+    ulii = popcount_longs(sample_exclude, BITCT_TO_WORDCT(unfiltered_sample_ct));
     if (unfiltered_sample_ct == ulii) {
       LOGERRPRINTF("Error: All %s excluded by --me.\n", g_species_plural);
       goto mendel_error_scan_ret_ALL_SAMPLES_EXCLUDED;
@@ -1242,6 +1218,9 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
   mendel_error_scan_ret_WRITE_FAIL:
     retval = RET_WRITE_FAIL;
     break;
+  mendel_error_scan_ret_INVALID_CMDLINE:
+    retval = RET_INVALID_CMDLINE;
+    break;
   mendel_error_scan_ret_ALL_MARKERS_EXCLUDED:
     retval = RET_ALL_MARKERS_EXCLUDED;
     break;
@@ -1250,7 +1229,7 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
     break;
   }
  mendel_error_scan_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_l);
   return retval;
@@ -1259,8 +1238,18 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
 int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfiltered_sample_ct, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uintptr_t* founder_info) {
   // possible todo: if any families have been entirely filtered out, don't
   // construct pedigree for them
-  unsigned char* wkspace_mark;
-  unsigned char* wkspace_mark2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctlm = unfiltered_sample_ctl * BITCT;
+  uintptr_t max_family_id_len = 0;
+  uintptr_t max_indiv_id_len = 0;
+  uintptr_t max_pm_id_len = MAXV(max_paternal_id_len, max_maternal_id_len);
+  char* last_family_id = NULL;
+  double* tmp_rel_space = NULL;
+  double* tmp_rel_writer = NULL;
+  uint32_t* uiptr2 = NULL;
+  int32_t max_family_nf = 0;
+  unsigned char* bigstack_mark;
+  unsigned char* bigstack_mark2;
   int32_t ii;
   int32_t jj;
   int32_t kk;
@@ -1274,39 +1263,29 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
   uint64_t ullii;
   char* family_ids;
   char* cur_sample_id;
-  char* last_family_id = NULL;
   char* cur_family_id;
   char* id_ptr;
   uint32_t* family_sizes;
   uint32_t* uiptr;
-  uint32_t* uiptr2 = NULL;
   uint32_t fidx;
   int32_t family_size;
   uint32_t* remaining_sample_idxs;
   int32_t* remaining_sample_parent_idxs; // -1 = no parent (or nonshared)
   uint32_t remaining_sample_ct;
   uint32_t sample_idx_write;
-  uintptr_t max_family_id_len = 0;
-  char* indiv_ids;
+  char* indiv_ids; // within a single family
   uint32_t* sample_id_lookup;
-  uintptr_t max_indiv_id_len = 0;
-  uintptr_t max_pm_id_len;
   uint32_t family_id_ct;
   uint32_t* fis_ptr;
   char* stray_parent_ids;
   intptr_t stray_parent_ct;
   uintptr_t* processed_samples;
   uint32_t founder_ct;
-  int32_t max_family_nf = 0;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctlm = unfiltered_sample_ctl * BITCT;
   uint32_t* complete_sample_idxs;
   uintptr_t complete_sample_idx_ct;
   double* rs_ptr;
   double* rel_writer;
   double dxx;
-  double* tmp_rel_space = NULL;
-  double* tmp_rel_writer = NULL;
 
   for (sample_uidx = 0; sample_uidx < unfiltered_sample_ct; sample_uidx++) {
     ujj = strlen_se(&(sample_ids[sample_uidx * max_sample_id_len])) + 1;
@@ -1318,16 +1297,11 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
       max_indiv_id_len = ujj + 1;
     }
   }
-  if (max_paternal_id_len > max_maternal_id_len) {
-    max_pm_id_len = max_paternal_id_len;
-  } else {
-    max_pm_id_len = max_maternal_id_len;
-  }
-  if (wkspace_alloc_ui_checked(&(pri_ptr->family_info_space), unfiltered_sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&(pri_ptr->family_rel_nf_idxs), unfiltered_sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&(pri_ptr->family_idxs), unfiltered_sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_c_checked(&family_ids, unfiltered_sample_ct * max_family_id_len) ||
-      wkspace_alloc_ui_checked(&family_sizes, unfiltered_sample_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(unfiltered_sample_ct, &(pri_ptr->family_info_space)) ||
+      bigstack_alloc_ui(unfiltered_sample_ct, &(pri_ptr->family_rel_nf_idxs)) ||
+      bigstack_alloc_ui(unfiltered_sample_ct, &(pri_ptr->family_idxs)) ||
+      bigstack_alloc_c(unfiltered_sample_ct * max_family_id_len, &family_ids) ||
+      bigstack_alloc_ui(unfiltered_sample_ct, &family_sizes)) {
     return RET_NOMEM;
   }
 
@@ -1400,8 +1374,8 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
 
   if (family_id_ct < unfiltered_sample_ct) {
     uiptr = family_sizes;
-    wkspace_shrink_top(family_ids, family_id_ct * max_family_id_len);
-    family_sizes = (uint32_t*)wkspace_alloc(family_id_ct * sizeof(int32_t));
+    bigstack_shrink_top(family_ids, family_id_ct * max_family_id_len);
+    bigstack_alloc_ui(family_id_ct, &family_sizes);
     if (family_sizes < uiptr) {
       // copy back
       for (uii = 0; uii < family_id_ct; uii++) {
@@ -1414,12 +1388,11 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
   pri_ptr->max_family_id_len = max_family_id_len;
   pri_ptr->family_sizes = family_sizes;
 
-  if (wkspace_alloc_ui_checked(&(pri_ptr->family_info_offsets), (family_id_ct + 1) * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&(pri_ptr->family_rel_space_offsets), (family_id_ct + 1) * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&(pri_ptr->family_founder_cts), family_id_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(family_id_ct + 1, &(pri_ptr->family_info_offsets)) ||
+      bigstack_alloc_ul(family_id_ct + 1, &(pri_ptr->family_rel_space_offsets)) ||
+      bigstack_calloc_ui(family_id_ct, &(pri_ptr->family_founder_cts))) {
     return RET_NOMEM;
   }
-  fill_int_zero((int32_t*)(pri_ptr->family_founder_cts), family_id_ct);
 
   ii = 0; // running family_info offset
   for (fidx = 0; fidx < family_id_ct; fidx++) {
@@ -1428,10 +1401,9 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
     ii += family_size;
   }
 
-  if (wkspace_alloc_ui_checked(&uiptr, family_id_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_ui(family_id_ct, &uiptr)) {
     return RET_NOMEM;
   }
-  fill_uint_zero(uiptr, family_id_ct);
 
   // Fill family_idxs, family_founder_cts, and founder portion of
   // family_rel_nf_idxs.
@@ -1446,7 +1418,7 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
     }
     cur_sample_id = &(cur_sample_id[max_sample_id_len]);
   }
-  wkspace_reset(uiptr);
+  bigstack_reset(uiptr);
   ulii = 0; // running rel_space offset
   for (fidx = 0; fidx < family_id_ct; fidx++) {
     family_size = pri_ptr->family_sizes[fidx];
@@ -1464,12 +1436,12 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
   // offset, even if we're at the last family
   pri_ptr->family_info_offsets[family_id_ct] = unfiltered_sample_ct;
   pri_ptr->family_rel_space_offsets[family_id_ct] = ulii;
-  if (wkspace_alloc_d_checked(&(pri_ptr->rel_space), ulii * sizeof(double))) {
+  if (bigstack_alloc_d(ulii, &(pri_ptr->rel_space))) {
     return RET_NOMEM;
   }
 
-  wkspace_mark = wkspace_base;
-  if (wkspace_alloc_ui_checked(&uiptr, family_id_ct * sizeof(int32_t))) {
+  bigstack_mark = g_bigstack_base;
+  if (bigstack_alloc_ui(family_id_ct, &uiptr)) {
     return RET_NOMEM;
   }
   // populate family_info_space
@@ -1481,14 +1453,15 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
     pri_ptr->family_info_space[uiptr[fidx]] = sample_uidx;
     uiptr[fidx] += 1;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
 
-  if (wkspace_alloc_ul_checked(&processed_samples, (unfiltered_sample_ctl + (max_family_nf + (BITCT2 - 1)) / BITCT2) * sizeof(intptr_t))) {
+  ulii = QUATERCT_TO_WORDCT(max_family_nf);
+  if (bigstack_alloc_ul(unfiltered_sample_ctl + ulii, &processed_samples)) {
     return RET_NOMEM;
   }
-  fill_ulong_one(&(processed_samples[unfiltered_sample_ctl]), (max_family_nf + (BITCT2 - 1)) / BITCT2);
+  fill_ulong_one(&(processed_samples[unfiltered_sample_ctl]), ulii);
 
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
   for (fidx = 0; fidx < family_id_ct; fidx++) {
     family_size = family_sizes[fidx];
     founder_ct = pri_ptr->family_founder_cts[fidx];
@@ -1496,12 +1469,12 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
     stray_parent_ct = 0;
     if (remaining_sample_ct) {
       memcpy(processed_samples, founder_info, unfiltered_sample_ctl * sizeof(intptr_t));
-      if (wkspace_alloc_ui_checked(&complete_sample_idxs, family_size * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&remaining_sample_idxs, remaining_sample_ct * sizeof(int32_t)) ||
-          wkspace_alloc_c_checked(&indiv_ids, family_size * max_indiv_id_len) ||
-          wkspace_alloc_ui_checked(&sample_id_lookup, family_size * sizeof(int32_t)) ||
-          wkspace_alloc_i_checked(&remaining_sample_parent_idxs, remaining_sample_ct * 2 * sizeof(int32_t)) ||
-          wkspace_alloc_c_checked(&stray_parent_ids, remaining_sample_ct * 2 * max_pm_id_len)) {
+      if (bigstack_alloc_ui(family_size, &complete_sample_idxs) ||
+          bigstack_alloc_ui(remaining_sample_ct, &remaining_sample_idxs) ||
+          bigstack_alloc_c(family_size * max_indiv_id_len, &indiv_ids) ||
+          bigstack_alloc_ui(family_size, &sample_id_lookup) ||
+          bigstack_alloc_i(remaining_sample_ct * 2, &remaining_sample_parent_idxs) ||
+          bigstack_alloc_c(remaining_sample_ct * 2 * max_pm_id_len, &stray_parent_ids)) {
 	return RET_NOMEM;
       }
       ii = pri_ptr->family_info_offsets[fidx];
@@ -1593,7 +1566,7 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
       // Now allocate temporary relatedness table between nonfounders and
       // stray parents with multiple references.
       if (stray_parent_ct) {
-        if (wkspace_alloc_d_checked(&tmp_rel_space, (family_size - founder_ct) * stray_parent_ct * sizeof(double))) {
+        if (bigstack_alloc_d((family_size - founder_ct) * stray_parent_ct, &tmp_rel_space)) {
 	  return RET_NOMEM;
         }
 	tmp_rel_writer = tmp_rel_space;
@@ -1713,7 +1686,7 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
 	    }
 	    pri_ptr->family_rel_nf_idxs[jj] = complete_sample_idx_ct;
 	    complete_sample_idxs[complete_sample_idx_ct++] = jj;
-	    set_bit(processed_samples, jj);
+	    set_bit(jj, processed_samples);
 	  } else {
             remaining_sample_parent_idxs[sample_idx_write * 2] = kk;
 	    remaining_sample_parent_idxs[sample_idx_write * 2 + 1] = mm;
@@ -1726,10 +1699,10 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
 	}
 	remaining_sample_ct = sample_idx_write;
       }
-      wkspace_reset(wkspace_mark2);
+      bigstack_reset(bigstack_mark2);
     }
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return 0;
 }
 
@@ -1802,7 +1775,7 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
   uint32_t ujj;
   uint32_t ukk;
   memcpy(outname_end, ".tdt.poo", 9);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto tdt_poo_ret_OPEN_FAIL;
   }
   sprintf(textbuf, " CHR %%%us  A1:A2      T:U_PAT    CHISQ_PAT        P_PAT      T:U_MAT    CHISQ_MAT        P_MAT        Z_POO        P_POO \n", plink_maxsnp);
@@ -1820,18 +1793,18 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
     if (uii == chrom_end) {
       continue;
     }
-    wptr_start = width_force(4, textbuf, chrom_name_write(textbuf, chrom_info_ptr, chrom_idx));
+    wptr_start = width_force(4, textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, textbuf));
     *wptr_start++ = ' ';
     if (uii != marker_uidx) {
       marker_uidx = uii;
       goto tdt_poo_scan_seek;
     }
     while (1) {
-      if (load_raw2(bedfile, loadbuf, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+      if (load_raw2(unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask, bedfile, loadbuf)) {
 	goto tdt_poo_ret_READ_FAIL;
       }
       if (IS_SET(marker_reverse, marker_uidx)) {
-	reverse_loadbuf((unsigned char*)loadbuf, unfiltered_sample_ct);
+	reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf);
       }
       if (hh_exists && is_x) {
         hh_reset((unsigned char*)loadbuf, sample_male_include2, unfiltered_sample_ct);
@@ -1887,32 +1860,32 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
       *wptr++ = ' ';
       pat_a1transmit = 0.5 * ((double)poo_pat_a1transmit_x2);
       cur_a2transmit = 0.5 * ((double)(poo_obs_pat_x2 - poo_pat_a1transmit_x2));
-      wptr2 = double_g_writewx4x(wptr, pat_a1transmit, 1, ':');
-      wptr2 = double_g_writewx4(wptr2, cur_a2transmit, 1);
+      wptr2 = dtoa_g_wxp4x(pat_a1transmit, 1, ':', wptr);
+      wptr2 = dtoa_g_wxp4(cur_a2transmit, 1, wptr2);
       wptr = width_force(12, wptr, wptr2);
       *wptr++ = ' ';
       if (poo_obs_pat_x2) {
 	pat_a2transmit_recip = 1.0 / cur_a2transmit;
 	dxx = pat_a1transmit - cur_a2transmit;
 	chisq = dxx * dxx / (pat_a1transmit + cur_a2transmit);
-	wptr = double_g_writewx4x(wptr, chisq, 12, ' ');
-	wptr = double_g_writewx4(wptr, chiprob_p(chisq, 1), 12);
+	wptr = dtoa_g_wxp4x(chisq, 12, ' ', wptr);
+	wptr = dtoa_g_wxp4(chiprob_p(chisq, 1), 12, wptr);
       } else {
 	wptr = memcpya(wptr, "          NA           NA", 25);
       }
       *wptr++ = ' ';
       dxx = 0.5 * ((double)poo_mat_a1transmit_x2);
       cur_a2transmit = 0.5 * ((double)(poo_obs_mat_x2 - poo_mat_a1transmit_x2));
-      wptr2 = double_g_writewx4x(wptr, dxx, 1, ':');
-      wptr2 = double_g_writewx4(wptr2, cur_a2transmit, 1);
+      wptr2 = dtoa_g_wxp4x(dxx, 1, ':', wptr);
+      wptr2 = dtoa_g_wxp4(cur_a2transmit, 1, wptr2);
       wptr = width_force(12, wptr, wptr2);
       *wptr++ = ' ';
       if (poo_obs_mat_x2) {
 	mat_a1transmit_recip = 1.0 / dxx;
 	chisq = dxx - cur_a2transmit;
 	chisq = chisq * chisq / (dxx + cur_a2transmit);
-	wptr = double_g_writewx4x(wptr, chisq, 12, ' ');
-	wptr = double_g_writewx4(wptr, chiprob_p(chisq, 1), 12);
+	wptr = dtoa_g_wxp4x(chisq, 12, ' ', wptr);
+	wptr = dtoa_g_wxp4(chiprob_p(chisq, 1), 12, wptr);
       } else {
 	wptr = memcpya(wptr, "          NA           NA", 25);
       }
@@ -1921,13 +1894,13 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
 	// Z-score
 	dxx = (log(pat_a1transmit * pat_a2transmit_recip * mat_a1transmit_recip * cur_a2transmit) / sqrt(1.0 / pat_a1transmit + pat_a2transmit_recip + mat_a1transmit_recip + 1.0 / cur_a2transmit));
 
-        wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
+        wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
 	if (orig_chisq) {
 	  // todo: --pat/--mat support
 	  orig_chisq[markers_done] = dxx * dxx;
 	}
 	dxx = normdist(-fabs(dxx)) * 2;
-	wptr = double_g_writewx4(wptr, MAXV(dxx, output_min_p), 12);
+	wptr = dtoa_g_wxp4(MAXV(dxx, output_min_p), 12, wptr);
       } else {
 	wptr = memcpya(wptr, "          NA           NA", 25);
 	if (orig_chisq) {
@@ -1986,15 +1959,15 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
 }
 
 int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
-  char* textbuf = tbuf;
+  char* textbuf = g_textbuf;
   double* orig_chisq = NULL; // pval if exact test
   uint64_t last_parents = 0;
   // uint64_t mendel_error_ct = 0;
   double chisq = 0;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctp1l2 = 1 + (unfiltered_sample_ct / BITCT2);
   uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
   uintptr_t marker_uidx = ~ZEROLU;
@@ -2107,7 +2080,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
     goto tdt_ret_1;
   }
   // now assemble list of nuclear families with at least one case child
-  if (wkspace_alloc_ui_checked(&trio_nuclear_lookup, (3 * family_ct + trio_ct) * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(3LU * family_ct + trio_ct, &trio_nuclear_lookup)) {
     goto tdt_ret_NOMEM;
   }
   lookup_ptr = trio_nuclear_lookup;
@@ -2183,15 +2156,15 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
     LOGERRPRINTF("Warning: Skipping --tdt%s since there are no trios with an affected child%s.\n", poo_test? " poo" : "", poo_test? "" : ", and no\ndiscordant parent pairs");
     goto tdt_ret_1;
   }
-  wkspace_shrink_top(trio_nuclear_lookup, ((uintptr_t)(lookup_ptr - trio_nuclear_lookup)) * sizeof(int32_t));
+  bigstack_shrink_top(trio_nuclear_lookup, ((uintptr_t)(lookup_ptr - trio_nuclear_lookup)) * sizeof(int32_t));
 
   if (mtest_adjust) {
-    if (wkspace_alloc_d_checked(&orig_chisq, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ct, &orig_chisq)) {
       goto tdt_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&workbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctp1l2, &workbuf)) {
     goto tdt_ret_NOMEM;
   }
   loadbuf[unfiltered_sample_ctl2 - 1] = 0;
@@ -2207,7 +2180,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
   }
   ulii = 2 * max_marker_allele_len + plink_maxsnp + MAX_ID_LEN + 256;
   if (ulii > MAXLINELEN) {
-    if (wkspace_alloc_c_checked(&textbuf, ulii)) {
+    if (bigstack_alloc_c(ulii, &textbuf)) {
       goto tdt_ret_NOMEM;
     }
   }
@@ -2224,7 +2197,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
   }
   pct_thresh = marker_ct / 100;
   memcpy(outname_end, ".tdt", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto tdt_ret_OPEN_FAIL;
   }
   sprintf(textbuf, " CHR %%%us           BP  A1  A2      T      U           OR ", plink_maxsnp);
@@ -2260,18 +2233,18 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
     if (uii == chrom_end) {
       continue;
     }
-    wptr_start = width_force(4, textbuf, chrom_name_write(textbuf, chrom_info_ptr, chrom_idx));
+    wptr_start = width_force(4, textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, textbuf));
     *wptr_start++ = ' ';
     if (uii != marker_uidx) {
       marker_uidx = uii;
       goto tdt_scan_seek;
     }
     while (1) {
-      if (load_raw2(bedfile, loadbuf, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+      if (load_raw2(unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask, bedfile, loadbuf)) {
 	goto tdt_ret_READ_FAIL;
       }
       if (IS_SET(marker_reverse, marker_uidx)) {
-	reverse_loadbuf((unsigned char*)loadbuf, unfiltered_sample_ct);
+	reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf);
       }
       if (hh_exists && is_x) {
 	hh_reset((unsigned char*)loadbuf, sample_male_include2, unfiltered_sample_ct);
@@ -2356,24 +2329,24 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
       if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
 	wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
 	wptr = memseta(wptr, 32, 3);
-	wptr = uint32_writew10x(wptr, marker_pos[marker_uidx], ' ');
+	wptr = uint32toa_w10x(marker_pos[marker_uidx], ' ', wptr);
 	wptr = fw_strcpy(3, marker_allele_ptrs[2 * marker_uidx], wptr);
 	*wptr++ = ' ';
 	wptr = fw_strcpy(3, marker_allele_ptrs[2 * marker_uidx + 1], wptr);
 	*wptr++ = ' ';
-	wptr = uint32_writew6x(wptr, tdt_a1_trans_ct, ' ');
+	wptr = uint32toa_w6x(tdt_a1_trans_ct, ' ', wptr);
 	uii = tdt_obs_ct - tdt_a1_trans_ct; // untransmitted
-	wptr = uint32_writew6x(wptr, uii, ' ');
+	wptr = uint32toa_w6x(uii, ' ', wptr);
 	if (uii) {
 	  untransmitted_recip = 1.0 / ((double)((int32_t)uii));
 	  dxx = (double)((int32_t)tdt_a1_trans_ct);
 	  odds_ratio = dxx * untransmitted_recip;
-	  wptr = double_g_writewx4x(wptr, odds_ratio, 12, ' ');
+	  wptr = dtoa_g_wxp4x(odds_ratio, 12, ' ', wptr);
 	  if (display_ci) {
 	    odds_ratio = log(odds_ratio);
 	    dxx = ci_zt * sqrt(1.0 / dxx + untransmitted_recip);
-	    wptr = double_g_writewx4x(wptr, exp(odds_ratio - dxx), 12, ' ');
-	    wptr = double_g_writewx4x(wptr, exp(odds_ratio + dxx), 12, ' ');
+	    wptr = dtoa_g_wxp4x(exp(odds_ratio - dxx), 12, ' ', wptr);
+	    wptr = dtoa_g_wxp4x(exp(odds_ratio + dxx), 12, ' ', wptr);
 	  }
 	} else {
 	  wptr = memcpya(wptr, "          NA ", 13);
@@ -2382,11 +2355,11 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
 	  }
 	}
         if (is_exact) {
-	  wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, ' ');
+	  wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, ' ', wptr);
 	} else {
 	  if (pval >= 0) {
-	    wptr = double_g_writewx4x(wptr, chisq, 12, ' ');
-            wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, ' ');
+	    wptr = dtoa_g_wxp4x(chisq, 12, ' ', wptr);
+            wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, ' ', wptr);
 	  } else {
 	    wptr = memcpya(wptr, "          NA           NA ", 26);
 	  }
@@ -2400,8 +2373,8 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
 	  }
 	  uii = parentdt_case_a2_excess1 + 2 * parentdt_case_a2_excess2;
 	  ujj = parentdt_obs_ct1 + 2 * parentdt_obs_ct2;
-	  wptr2 = uint32_writex(wptr, uii, ':');
-	  wptr2 = uint32_write(wptr2, ujj - uii);
+	  wptr2 = uint32toa_x(uii, ':', wptr);
+	  wptr2 = uint32toa(ujj - uii, wptr2);
           wptr = width_force(12, wptr, wptr2);
           *wptr++ = ' ';
 	  // No exact test for now since we're dealing with a sum of step-1 and
@@ -2441,9 +2414,9 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
 	  } else {
 	    dxx = (double)(((int32_t)ujj) - 2 * ((int32_t)uii));
 	    chisq = dxx * dxx / ((double)((intptr_t)((uintptr_t)(ujj + 2 * parentdt_obs_ct2))));
-	    wptr = double_g_writewx4x(wptr, chisq, 12, ' ');
+	    wptr = dtoa_g_wxp4x(chisq, 12, ' ', wptr);
 	    dxx = chiprob_p(chisq, 1);
-	    wptr = double_g_writewx4(wptr, MAXV(dxx, output_min_p), 12);
+	    wptr = dtoa_g_wxp4(MAXV(dxx, output_min_p), 12, wptr);
 	  }
 	  *wptr++ = ' ';
 	  uii += tdt_a1_trans_ct;
@@ -2456,9 +2429,9 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
 	    // them consistent with each other.
             dxx = (double)((intptr_t)((uintptr_t)ujj) - 2 * ((intptr_t)((uintptr_t)uii)));
 	    chisq = dxx * dxx / ((double)((intptr_t)(((uintptr_t)ujj) + 2 * parentdt_obs_ct2)));
-	    wptr = double_g_writewx4x(wptr, chisq, 12, ' ');
+	    wptr = dtoa_g_wxp4x(chisq, 12, ' ', wptr);
 	    dxx = chiprob_p(chisq, 1);
-	    wptr = double_g_writewx4(wptr, MAXV(dxx, output_min_p), 12);
+	    wptr = dtoa_g_wxp4(MAXV(dxx, output_min_p), 12, wptr);
 	  }
 	}
 	wptr = memcpya(wptr, " \n", 2);
@@ -2497,9 +2470,9 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
   LOGPRINTF("--tdt: Report written to %s .\n", outname);
   if (mtest_adjust) {
   tdt_multcomp:
-    ulii = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
-    if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ul_checked(&marker_exclude_tmp, ulii * sizeof(intptr_t))) {
+    ulii = BITCT_TO_WORDCT(unfiltered_marker_ct);
+    if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx) ||
+        bigstack_alloc_ul(ulii, &marker_exclude_tmp)) {
       goto tdt_ret_NOMEM;
     }
     // need a custom marker_exclude that's set at Y/haploid/MT
@@ -2508,7 +2481,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
       chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
       if ((is_set(chrom_info_ptr->haploid_mask, chrom_idx) && ((int32_t)chrom_idx != chrom_info_ptr->x_code)) || ((int32_t)chrom_idx == chrom_info_ptr->mt_code)) {
 	uii = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx];
-	fill_bits(marker_exclude_tmp, uii, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - uii);
+	fill_bits(uii, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - uii, marker_exclude_tmp);
       }
     }
     fill_idx_to_uidx(marker_exclude_tmp, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
@@ -2532,7 +2505,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
     break;
   }
  tdt_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
@@ -2552,11 +2525,11 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   // It's probably appropriate to split this into two functions in the future,
   // one for dfam and one for qfam; the differences make this difficult to
   // maintain.
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uintptr_t max_merged_id_len = max_fid_len + max_paternal_id_len + max_maternal_id_len + sizeof(int32_t);
   uintptr_t trio_idx = 0;
-  uintptr_t topsize = 0;
   uintptr_t* tmp_within2_founder = NULL;
   uintptr_t* lm_within2_founder = NULL;
   uintptr_t* lm_eligible = NULL;
@@ -2576,8 +2549,7 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   uint32_t* sample_lm_to_fss_idx;
   uint32_t* fs_starts;
   uint32_t* fss_contents;
-  uintptr_t topsize_bak;
-  uintptr_t topsize_bak2;
+  unsigned char* bigstack_end_mark2;
   uintptr_t cur_sample_ct;
   uintptr_t sample_uidx;
   uintptr_t sample_idx;
@@ -2590,70 +2562,57 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   uint32_t umm;
   if (test_type) {
     if (is_within2) {
-      if (wkspace_alloc_ul_checked(&lm_within2_founder, sample_ctl * sizeof(intptr_t))) {
-	goto get_sibship_info_ret_NOMEM2;
+      if (bigstack_alloc_ul(sample_ctl, &lm_within2_founder)) {
+	goto get_sibship_info_ret_NOMEM;
       }
     }
-    if (wkspace_alloc_ul_checked(&lm_eligible, sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(sample_ctl, &lm_eligible)) {
       goto get_sibship_info_ret_NOMEM;
     }
   }
   if (test_type) {
     // shrink later
-    if (wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_ct + 2 * family_ct, &fss_contents)) {
       goto get_sibship_info_ret_NOMEM;
     }
     // this is the equivalent of PLINK 1.07's family pointers
-    sample_to_fss_idx = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-    if (!sample_to_fss_idx) {
+    if (bigstack_end_alloc_ui(sample_ct, &sample_to_fss_idx)) {
       goto get_sibship_info_ret_NOMEM;
     }
   } else {
-    if (wkspace_alloc_ui_checked(&sample_to_fss_idx, sample_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_ct, &sample_to_fss_idx)) {
       goto get_sibship_info_ret_NOMEM;
     }
     // shrink later
-    if (wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_ct + 2 * family_ct, &fss_contents)) {
       goto get_sibship_info_ret_NOMEM;
     }
   }
-  topsize_bak = topsize;
-  not_in_family = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctl * sizeof(intptr_t));
-  if (!not_in_family) {
-    goto get_sibship_info_ret_NOMEM;
-  }
+  bigstack_end_mark2 = g_bigstack_end;
 
-  // Temporary bitfields used to track which parents are (i) part of multiple
-  // families, and (ii) not a child in any of them.  To ensure results are not
-  // dependent on the order of samples in the dataset, we now exclude these
-  // parents from the QFAM permutation.  (todo: compute an average in this case
-  // instead?)
-  sample_uidx_to_idx = (uint32_t*)top_alloc(&topsize, unfiltered_sample_ct * sizeof(int32_t));
-  if (!sample_uidx_to_idx) {
-    goto get_sibship_info_ret_NOMEM2;
-  }
-  ulptr = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctl * sizeof(intptr_t));
-  if (!ulptr) {
-    goto get_sibship_info_ret_NOMEM2;
-  }
-  topsize_bak2 = topsize;
-  ulptr2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctl * sizeof(intptr_t));
-  if (!ulptr2) {
-    goto get_sibship_info_ret_NOMEM2;
+  if (bigstack_end_alloc_ul(unfiltered_sample_ctl, &not_in_family) ||
+
+      // Temporary bitfields used to track which parents are (i) part of
+      // multiple families, and (ii) not a child in any of them.  To ensure
+      // results are not dependent on the order of samples in the dataset, we
+      // now exclude these parents from the QFAM permutation.  (todo: compute
+      // an average in this case instead?)
+      // ulptr = is a double-parent
+      // ulptr2 = is a child
+      bigstack_end_alloc_ui(unfiltered_sample_ct, &sample_uidx_to_idx) ||
+      bigstack_end_calloc_ul(unfiltered_sample_ctl, &ulptr) ||
+      bigstack_end_calloc_ul(unfiltered_sample_ctl, &ulptr2)) {
+    goto get_sibship_info_ret_NOMEM;
   }
   if (is_within2) {
-    tmp_within2_founder = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctl * sizeof(intptr_t));
-    if (!tmp_within2_founder) {
-      goto get_sibship_info_ret_NOMEM2;
+    if (bigstack_end_calloc_ul(unfiltered_sample_ctl, &tmp_within2_founder)) {
+      goto get_sibship_info_ret_NOMEM;
     }
-    fill_ulong_zero(tmp_within2_founder, unfiltered_sample_ctl);
   }
 
-  bitfield_exclude_to_include(sample_exclude, not_in_family, unfiltered_sample_ct);
+  bitarr_invert_copy(sample_exclude, unfiltered_sample_ct, not_in_family);
   fill_uint_one(sample_to_fss_idx, sample_ct);
   fill_uidx_to_idx(sample_exclude, unfiltered_sample_ct, sample_ct, sample_uidx_to_idx);
-  fill_ulong_zero(ulptr, unfiltered_sample_ctl); // is a double-parent
-  fill_ulong_zero(ulptr2, unfiltered_sample_ctl); // is a child
   if (family_ct) {
     // iterate over all parents
     while (1) {
@@ -2667,8 +2626,8 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
       umm = sample_uidx_to_idx[ukk];
       if (is_within2) {
 	if (is_set(pheno_nm, uii) && is_set(pheno_nm, ukk)) {
-	  set_bit(tmp_within2_founder, uii);
-	  set_bit(tmp_within2_founder, ukk);
+	  set_bit(uii, tmp_within2_founder);
+	  set_bit(ukk, tmp_within2_founder);
 	}
       }
       if (is_set(not_in_family, uii)) {
@@ -2676,9 +2635,9 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
 	  // missing father
 	  sample_to_fss_idx[ujj] = family_idx;
 	}
-	clear_bit(not_in_family, uii);
+	clear_bit(uii, not_in_family);
       } else {
-	set_bit(ulptr, uii);
+	set_bit(uii, ulptr);
       }
       fss_contents[fssc_idx++] = umm;
       if (is_set(not_in_family, ukk)) {
@@ -2686,9 +2645,9 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
 	  // missing mother
 	  sample_to_fss_idx[umm] = family_idx;
 	}
-	clear_bit(not_in_family, ukk);
+	clear_bit(ukk, not_in_family);
       } else {
-	set_bit(ulptr, ukk);
+	set_bit(ukk, ulptr);
       }
 
       ullii = trio_list[trio_idx];
@@ -2698,7 +2657,7 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
 	ujj = sample_uidx_to_idx[uii];
 	fss_contents[fssc_idx++] = ujj;
 	sample_to_fss_idx[ujj] = family_idx;
-	set_bit(ulptr2, uii);
+	set_bit(uii, ulptr2);
 	if (++trio_idx == trio_ct) {
 	  goto get_sibship_info_first_pass_done;
 	}
@@ -2708,12 +2667,12 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
     }
   }
  get_sibship_info_first_pass_done:
-  bitfield_andnot(not_in_family, ulptr2, unfiltered_sample_ctl);
-  wkspace_shrink_top(fss_contents, (fssc_idx + popcount_longs(not_in_family, unfiltered_sample_ctl)) * sizeof(int32_t));
+  bitvec_andnot(ulptr2, unfiltered_sample_ctl, not_in_family);
+  bigstack_shrink_top(fss_contents, (fssc_idx + popcount_longs(not_in_family, unfiltered_sample_ctl)) * sizeof(int32_t));
   if (test_type) {
-    bitfield_andnot(ulptr, ulptr2, unfiltered_sample_ctl);
+    bitvec_andnot(ulptr2, unfiltered_sample_ctl, ulptr);
   } else {
-    bitfield_exclude_to_include(ulptr2, ulptr, unfiltered_sample_ct);
+    bitarr_invert_copy(ulptr2, unfiltered_sample_ct, ulptr);
   }
   // qfam: ulptr = double-parents who aren't also a child of two parents in
   //               immediate dataset
@@ -2721,35 +2680,33 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   //               dataset
 
   if (is_within2) {
-    bitfield_andnot(tmp_within2_founder, ulptr, unfiltered_sample_ctl);
-    bitfield_and(tmp_within2_founder, founder_info, unfiltered_sample_ctl);
+    bitvec_andnot(ulptr, unfiltered_sample_ctl, tmp_within2_founder);
+    bitvec_and(founder_info, unfiltered_sample_ctl, tmp_within2_founder);
     // now this only consists of founder parents who (i) aren't in multiple
     // families, and (ii) have a different phenotype from their partner.
-    collapse_copy_bitarr(unfiltered_sample_ct, tmp_within2_founder, sample_exclude, sample_ct, lm_within2_founder);
+    copy_bitarr_subset_excl(tmp_within2_founder, sample_exclude, unfiltered_sample_ct, sample_ct, lm_within2_founder);
   }
   if (test_type) {
-    bitfield_andnot_reversed_args(ulptr, pheno_nm, unfiltered_sample_ctl);
+    bitvec_andnot_reversed_args(pheno_nm, unfiltered_sample_ctl, ulptr);
     if (test_type == QFAM_WITHIN1) {
-      bitfield_andnot(ulptr, founder_info, unfiltered_sample_ctl);
+      bitvec_andnot(founder_info, unfiltered_sample_ctl, ulptr);
     }
-    collapse_copy_bitarr(unfiltered_sample_ct, ulptr, sample_exclude, sample_ct, lm_eligible);
-    bitfield_andnot_copy(unfiltered_sample_ctl, ulptr, not_in_family, founder_info);
+    copy_bitarr_subset_excl(ulptr, sample_exclude, unfiltered_sample_ct, sample_ct, lm_eligible);
+    bitvec_andnot_copy(not_in_family, founder_info, unfiltered_sample_ctl, ulptr);
   } else {
-    bitfield_and(ulptr, pheno_nm, unfiltered_sample_ctl);
-    bitfield_andnot(ulptr, founder_info, unfiltered_sample_ctl);
+    bitvec_and(pheno_nm, unfiltered_sample_ctl, ulptr);
+    bitvec_andnot(founder_info, unfiltered_sample_ctl, ulptr);
   }
-  topsize = topsize_bak2;
+  bigstack_end_reset(ulptr);
 
   // qfam: not a parent or child in a trio, not a founder
   // dfam: not a child in a trio, not a founder; parent ok
 
   cur_sample_ct = popcount_longs(ulptr, unfiltered_sample_ctl);
 
-  wkspace_left -= topsize;
-  if (wkspace_alloc_ui_checked(&fs_starts, (1 + family_ct + (cur_sample_ct / 2)) * sizeof(int32_t))) {
-    goto get_sibship_info_ret_NOMEM2;
+  if (bigstack_alloc_ui(1 + family_ct + (cur_sample_ct / 2), &fs_starts)) {
+    goto get_sibship_info_ret_NOMEM;
   }
-  wkspace_left += topsize;
   family_idx = 0;
   if (trio_ct) {
     fs_starts[0] = 0;
@@ -2766,10 +2723,8 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   }
   if (cur_sample_ct > 1) {
     // identify size-2+ sibships
-    ulii = topsize;
-    merged_ids = (char*)top_alloc(&topsize, max_merged_id_len * cur_sample_ct);
-    if (!merged_ids) {
-      goto get_sibship_info_ret_NOMEM2;
+    if (bigstack_end_alloc_c(max_merged_id_len * cur_sample_ct, &merged_ids)) {
+      goto get_sibship_info_ret_NOMEM;
     }
     for (sample_uidx = 0, sample_idx = 0; sample_idx < cur_sample_ct; sample_uidx++, sample_idx++) {
       next_set_ul_unsafe_ck(ulptr, &sample_uidx);
@@ -2788,13 +2743,13 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
       if (!memcmp(bufptr, bufptr2, slen)) {
         fs_starts[family_idx] = fssc_idx;
 	uii = *((uint32_t*)(&(bufptr[slen])));
-	clear_bit(not_in_family, uii);
+	clear_bit(uii, not_in_family);
 	ujj = sample_uidx_to_idx[uii];
         fss_contents[fssc_idx++] = ujj;
         sample_to_fss_idx[ujj] = family_idx;
 	do {
 	  uii = *((uint32_t*)(&(bufptr2[slen])));
-	  clear_bit(not_in_family, uii);
+	  clear_bit(uii, not_in_family);
 	  ujj = sample_uidx_to_idx[uii];
           sample_to_fss_idx[ujj] = family_idx;
           fss_contents[fssc_idx++] = ujj;
@@ -2812,7 +2767,7 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
  get_sibship_info_second_pass_done:
   *fs_ct_ptr = family_idx;
   fs_starts[family_idx] = fssc_idx;
-  wkspace_shrink_top(fs_starts, (family_idx + 1) * sizeof(int32_t));
+  bigstack_shrink_top(fs_starts, (family_idx + 1) * sizeof(int32_t));
   if (test_type) {
     // for qfam, save singletons, and collapse sample_to_fss_idx to
     // sample_lm_to_fss_idx
@@ -2824,13 +2779,11 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
       sample_to_fss_idx[ujj] = family_idx + sample_idx;
     }
     *singleton_ct_ptr = ulii;
-    topsize = topsize_bak;
-    wkspace_left -= topsize;
+    bigstack_end_reset(bigstack_end_mark2);
     ulii = popcount_longs(lm_eligible, sample_ctl);
-    if (wkspace_alloc_ui_checked(&sample_lm_to_fss_idx, ulii * sizeof(int32_t))) {
-      goto get_sibship_info_ret_NOMEM2;
+    if (bigstack_alloc_ui(ulii, &sample_lm_to_fss_idx)) {
+      goto get_sibship_info_ret_NOMEM;
     }
-    wkspace_left += topsize;
     for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
       next_set_ul_unsafe_ck(lm_eligible, &sample_uidx);
       sample_lm_to_fss_idx[sample_idx] = sample_to_fss_idx[sample_uidx];
@@ -2842,26 +2795,24 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   } else {
     // bugfix: for DFAM, we also need to prevent size-1 sibships from being
     // included in the unrelated cluster
-    if (wkspace_alloc_ul_checked(size_one_sibships_ptr, unfiltered_sample_ctl * sizeof(intptr_t))) {
-      goto get_sibship_info_ret_NOMEM2;
+    if (bigstack_alloc_ul(unfiltered_sample_ctl, size_one_sibships_ptr)) {
+      goto get_sibship_info_ret_NOMEM;
     }
     memcpy(*size_one_sibships_ptr, not_in_family, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_and(*size_one_sibships_ptr, ulptr, unfiltered_sample_ctl);
+    bitvec_and(ulptr, unfiltered_sample_ctl, *size_one_sibships_ptr);
 
     // return sample_to_fss_idx in place of sample_lm_to_fss_idx
     *sample_lm_to_fss_idx_ptr = sample_to_fss_idx;
   }
   *fs_starts_ptr = fs_starts;
   *fss_contents_ptr = fss_contents;
-  // topsize = 0;
 
   while (0) {
-  get_sibship_info_ret_NOMEM2:
-    wkspace_left += topsize;
   get_sibship_info_ret_NOMEM:
     retval = RET_NOMEM;
     break;
   }
+  bigstack_end_reset(bigstack_end_mark);
   return retval;
 }
 
@@ -2874,7 +2825,6 @@ static uintptr_t* g_dfam_flipa;
 #ifdef __LP64__
 static uintptr_t* g_dfam_flipa_shuffled;
 #endif
-static uintptr_t* g_dfam_perm_vecs;
 static uintptr_t* g_dfam_perm_vecst; // sample-major, shuffled
 static double* g_dfam_numers;
 static double* g_dfam_denoms;
@@ -2882,12 +2832,10 @@ static uintptr_t* g_dfam_acc;
 static int32_t* g_dfam_twice_numers;
 static uint32_t* g_dfam_total_counts;
 static uint32_t* g_dfam_iteration_order;
-static uintptr_t g_perm_vec_ct;
 static uint32_t g_dfam_family_all_case_children_ct;
 static uint32_t g_dfam_family_mixed_ct;
 static uint32_t g_dfam_sibship_mixed_ct;
 static uint32_t g_dfam_unrelated_cluster_ct;
-static uint32_t g_dfam_sample_ct;
 
 static uintptr_t* g_loadbuf;
 static uintptr_t* g_lm_eligible;
@@ -2912,6 +2860,7 @@ static double* g_beta_sum;
 static double* g_beta_ssq;
 static uint32_t* g_beta_fail_cts;
 static uintptr_t g_cur_perm_ct;
+static uintptr_t g_qfam_sample_ct;
 static double g_qt_sum_all;
 static double g_qt_ssq_all;
 static uint32_t g_test_type;
@@ -2964,7 +2913,7 @@ void dfam_sibship_or_unrelated_perm_calc(uintptr_t* loadbuf_ptr, const uint32_t*
 #else
   uintptr_t acc4_word_ct = perm_vec_ct128 * 16;
   uintptr_t acc8_word_ct = perm_vec_ct128 * 32;
-  uintptr_t perm_vec_wct = (perm_vec_ct + (BITCT - 1)) / BITCT;
+  uintptr_t perm_vec_wct = BITCT_TO_WORDCT(perm_vec_ct);
   const uintptr_t* pheno_perm_ptr;
   uintptr_t* acc4_ptr;
   uintptr_t loader;
@@ -3033,8 +2982,8 @@ void dfam_sibship_or_unrelated_perm_calc(uintptr_t* loadbuf_ptr, const uint32_t*
   }
 
 #ifdef __LP64__
-  fill_v128_zero(acc4, acc4_vec_ct);
-  fill_v128_zero(acc8, acc8_vec_ct);
+  fill_vec_zero(acc4, acc4_vec_ct);
+  fill_vec_zero(acc8, acc8_vec_ct);
 #else
   fill_ulong_zero(acc4, acc4_word_ct);
   fill_ulong_zero(acc8, acc8_word_ct);
@@ -3157,8 +3106,8 @@ void dfam_sibship_or_unrelated_perm_calc(uintptr_t* loadbuf_ptr, const uint32_t*
   }
 
 #ifdef __LP64__
-  fill_v128_zero(acc4, acc4_vec_ct);
-  fill_v128_zero(acc8, acc8_vec_ct);
+  fill_vec_zero(acc4, acc4_vec_ct);
+  fill_vec_zero(acc8, acc8_vec_ct);
 #else
   fill_ulong_zero(acc4, acc4_word_ct);
   fill_ulong_zero(acc8, acc8_word_ct);
@@ -3265,7 +3214,7 @@ THREAD_RET_TYPE dfam_perm_thread(void* arg) {
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uintptr_t perm_vec_ct128 = (perm_vec_ct + 127) / 128;
   uintptr_t perm_vec_cta128 = perm_vec_ct128 * 128;
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   uint32_t dfam_thread_ct = g_xfam_thread_ct;
   uint32_t pidx_offset = g_perms_done;
   uint32_t first_adapt_check = g_first_adapt_check;
@@ -3273,8 +3222,8 @@ THREAD_RET_TYPE dfam_perm_thread(void* arg) {
   uint32_t family_mixed_ct = g_dfam_family_mixed_ct;
   uint32_t sibship_mixed_ct = g_dfam_sibship_mixed_ct;
   uint32_t unrelated_cluster_ct = g_dfam_unrelated_cluster_ct;
-  uint32_t dfam_sample_ct = g_dfam_sample_ct;
-  uint32_t dfam_sample_ctl2 = (dfam_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uint32_t dfam_sample_ct = g_perm_pheno_nm_ct;
+  uint32_t dfam_sample_ctl2 = QUATERCT_TO_WORDCT(dfam_sample_ct);
   const uintptr_t perm_vec_wcta = perm_vec_ct128 * (128 / BITCT);
   const uintptr_t* flipa = g_dfam_flipa;
   const uintptr_t* perm_vecst = g_dfam_perm_vecst;
@@ -3328,7 +3277,7 @@ THREAD_RET_TYPE dfam_perm_thread(void* arg) {
   __m128i* acc8_ptr;
   uintptr_t vidx;
 #else
-  const uintptr_t perm_vec_wct = (perm_vec_ct + (BITCT - 1)) / BITCT;
+  const uintptr_t perm_vec_wct = BITCT_TO_WORDCT(perm_vec_ct);
   // acc8 requires (perm_vec_ct + 3) / 4 words
   // acc4 requires (perm_vec_ct + 7) / 8 words
   // sum reduces to perm_vec_ct128 * 304 since we also have 2 acc32s
@@ -3423,7 +3372,7 @@ THREAD_RET_TYPE dfam_perm_thread(void* arg) {
       chisq_high = orig_chisq[marker_idx] + EPSILON;
       chisq_low = orig_chisq[marker_idx] - EPSILON;
 #ifdef __LP64__
-      fill_v128_zero(case_a1_ct_acc8, acc8_vec_ct);
+      fill_vec_zero(case_a1_ct_acc8, acc8_vec_ct);
       max_incr4 = 0;
       max_incr8 = 0;
 #endif
@@ -3540,8 +3489,8 @@ THREAD_RET_TYPE dfam_perm_thread(void* arg) {
 	  cur_flipa = &(flipa[fs_idx * perm_vec_wcta]);
 	  fill_uint_zero(cur_case_a1_cts, perm_vec_ct);
 #ifdef __LP64__
-	  fill_v128_zero(acc4, acc4_vec_ct);
-	  fill_v128_zero(acc8, acc8_vec_ct);
+	  fill_vec_zero(acc4, acc4_vec_ct);
+	  fill_vec_zero(acc8, acc8_vec_ct);
 #else
 	  fill_ulong_zero(acc4, acc4_word_ct);
 	  fill_ulong_zero(acc8, acc8_word_ct);
@@ -3650,8 +3599,8 @@ THREAD_RET_TYPE dfam_perm_thread(void* arg) {
 	    cur_max_incr = 0;
 	    fill_uint_zero(cur_case_missing_cts, perm_vec_ct);
 #ifdef __LP64__
-	    fill_v128_zero(acc4, acc4_vec_ct);
-	    fill_v128_zero(acc8, acc8_vec_ct);
+	    fill_vec_zero(acc4, acc4_vec_ct);
+	    fill_vec_zero(acc8, acc8_vec_ct);
 #else
 	    fill_ulong_zero(acc4, acc4_word_ct);
 	    fill_ulong_zero(acc8, acc8_word_ct);
@@ -3800,7 +3749,7 @@ void dfam_sibship_calc(uint32_t cur_case_ct, uint32_t case_hom_a1_ct, uint32_t c
 void dfam_flipa_shuffle(uintptr_t* perms, uintptr_t* shuffled_perms, uint32_t perm_ct) {
   // 0 16 32 48 64 80 96 112 4 20 36 52 68 84 100 116 8 24 40 56 72 88 104 120 12 28 44 60 76 92 108 124
   // 1 17 ...
-  uint32_t vct = (perm_ct + 127) / 128;
+  uint32_t vct = BITCT_TO_VECCT(perm_ct);
   uint32_t vidx;
   uint32_t offset1;
   uint32_t offset8;
@@ -3826,15 +3775,15 @@ void dfam_flipa_shuffle(uintptr_t* perms, uintptr_t* shuffled_perms, uint32_t pe
 #endif
 
 int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude,  [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   FILE* outfile_msa = NULL;
-  char* textbuf = tbuf;
+  char* textbuf = g_textbuf;
   uintptr_t marker_ct_orig_autosomal = marker_ct_orig;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctp1l2 = 1 + (unfiltered_sample_ct / BITCT2);
   uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
   uintptr_t perm_vec_ct128 = 0;
@@ -3844,17 +3793,8 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   uintptr_t perm_vec_ctcl8m = 0;
   uintptr_t* marker_exclude_orig_autosomal = marker_exclude_orig;
   uintptr_t* founder_pnm = NULL;
-  uintptr_t* perm_preimage = NULL;
   double* orig_chisq = NULL;
   double* maxt_extreme_stat = NULL;
-  uint32_t* dfam_cluster_map = NULL;
-  uint32_t* dfam_cluster_starts = NULL;
-  uint32_t* dfam_cluster_case_cts = NULL;
-  uint32_t* dfam_tot_quotients = NULL;
-  uint64_t* dfam_totq_magics = NULL;
-  uint32_t* dfam_totq_preshifts = NULL;
-  uint32_t* dfam_totq_postshifts = NULL;
-  uint32_t* dfam_totq_incrs = NULL;
   uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
   uint32_t multigen = (fam_ip->mendel_modifier / MENDEL_MULTIGEN) & 1;
   uint32_t is_set_test = fam_ip->dfam_modifier & DFAM_SET_TEST;
@@ -3932,6 +3872,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   uint32_t dfam_sample_ct;
   uint32_t dfam_sample_ctl;
   uint32_t dfam_sample_ctl2;
+  uint32_t dfam_sample_ctv;
   uint32_t chrom_fo_idx;
   uint32_t chrom_end;
   uint32_t chrom_idx;
@@ -3960,7 +3901,6 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   uint32_t ctrl_het_ct;
   uint32_t hom_a1_ct;
   uint32_t het_ct;
-  uint32_t dfam_cluster_ct;
   uint32_t uii;
   uint32_t ujj;
   int32_t twice_numer;
@@ -3972,7 +3912,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       goto dfam_ret_INVALID_CMDLINE;
     }
     marker_ct_orig_autosomal -= uii;
-    if (wkspace_alloc_ul_checked(&marker_exclude_orig_autosomal, unfiltered_marker_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude_orig_autosomal)) {
       goto dfam_ret_NOMEM;
     }
     memcpy(marker_exclude_orig_autosomal, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
@@ -3980,7 +3920,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
       if (is_set(chrom_info_ptr->haploid_mask, chrom_idx) || ((int32_t)chrom_idx == chrom_info_ptr->mt_code)) {
 	uii = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx];
-	fill_bits(marker_exclude_orig_autosomal, uii, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - uii);
+	fill_bits(uii, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - uii, marker_exclude_orig_autosomal);
       }
     }
   } else if (is_set(chrom_info_ptr->haploid_mask, 0)) {
@@ -3996,16 +3936,16 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   marker_ct = marker_ct_orig_autosomal;
 
   // PLINK 1.07 treats missing phenotypes as controls here
-  if (wkspace_alloc_ul_checked(&pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &pheno_nm)) {
     goto dfam_ret_NOMEM;
   }
-  bitfield_exclude_to_include(sample_exclude, pheno_nm, unfiltered_sample_ct);
+  bitarr_invert_copy(sample_exclude, unfiltered_sample_ct, pheno_nm);
   if (is_set_test) {
-    if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
       goto dfam_ret_NOMEM;
     }
     memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_and(founder_pnm, founder_info, unfiltered_sample_ctl);
+    bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
     if (extract_set_union_unfiltered(sip, NULL, unfiltered_marker_ct, marker_exclude_orig_autosomal, &marker_exclude, &marker_ct)) {
       goto dfam_ret_NOMEM;
     }
@@ -4040,13 +3980,13 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   // computation is nearly I/O-bound without it.  Phenotypes are permuted
   // within each sibship/unrelated cluster, while transmitted alleles are
   // permuted in case-containing families.
-  if (wkspace_alloc_ul_checked(&dfam_sample_exclude, unfiltered_sample_ctl * sizeof(intptr_t)) ||
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &dfam_sample_exclude) ||
       // shrink this later
-      wkspace_alloc_ui_checked(&dfam_iteration_order, (sample_ct + (sample_ct / 2)) * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&idx_to_uidx, sample_ct * sizeof(int32_t))) {
+      bigstack_alloc_ui(sample_ct + (sample_ct / 2), &dfam_iteration_order) ||
+      bigstack_alloc_ui(sample_ct, &idx_to_uidx)) {
     goto dfam_ret_NOMEM;
   }
-  fill_all_bits(dfam_sample_exclude, unfiltered_sample_ct);
+  fill_all_bits(unfiltered_sample_ct, dfam_sample_exclude);
   fill_idx_to_uidx(sample_exclude, unfiltered_sample_ct, sample_ct, idx_to_uidx);
   cur_dfam_ptr = dfam_iteration_order;
   for (fs_idx = 0; fs_idx < family_ct; fs_idx++) {
@@ -4067,17 +4007,17 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       // [3...]: child uidxs
       // We collapse the indexes again later.
       sample_uidx = idx_to_uidx[fss_contents[fssc_start - 2]];
-      clear_bit(dfam_sample_exclude, sample_uidx);
+      clear_bit(sample_uidx, dfam_sample_exclude);
       *cur_dfam_ptr++ = sample_uidx;
 
       sample_uidx = idx_to_uidx[fss_contents[fssc_start - 1]];
-      clear_bit(dfam_sample_exclude, sample_uidx);
+      clear_bit(sample_uidx, dfam_sample_exclude);
       *cur_dfam_ptr++ = sample_uidx;
 
       *cur_dfam_ptr++ = cur_case_ct;
       for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
 	sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
-	clear_bit(dfam_sample_exclude, sample_uidx);
+	clear_bit(sample_uidx, dfam_sample_exclude);
 	*cur_dfam_ptr++ = sample_uidx;
       }
     }
@@ -4095,18 +4035,18 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     if (cur_case_ct && (cur_case_ct != sibling_ct)) {
       family_mixed_ct++;
       sample_uidx = idx_to_uidx[fss_contents[fssc_start - 2]];
-      clear_bit(dfam_sample_exclude, sample_uidx);
+      clear_bit(sample_uidx, dfam_sample_exclude);
       *cur_dfam_ptr++ = sample_uidx;
 
       sample_uidx = idx_to_uidx[fss_contents[fssc_start - 1]];
-      clear_bit(dfam_sample_exclude, sample_uidx);
+      clear_bit(sample_uidx, dfam_sample_exclude);
       *cur_dfam_ptr++ = sample_uidx;
 
       dfam_cluster_map_size += sibling_ct;
       *cur_dfam_ptr++ = sibling_ct;
       for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
 	sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
-	clear_bit(dfam_sample_exclude, sample_uidx);
+	clear_bit(sample_uidx, dfam_sample_exclude);
 	*cur_dfam_ptr++ = sample_uidx;
       }
     }
@@ -4128,14 +4068,14 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       *cur_dfam_ptr++ = sibling_ct;
       for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
 	sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
-	clear_bit(dfam_sample_exclude, sample_uidx);
+	clear_bit(sample_uidx, dfam_sample_exclude);
 	*cur_dfam_ptr++ = sample_uidx;
       }
     }
   }
   dfam_cluster_map_size = ((uintptr_t)(cur_dfam_ptr - dfam_mixed_start)) - 3 * family_mixed_ct - sibship_mixed_ct;
   if (!no_unrelateds) {
-    if (wkspace_alloc_ui_checked(&sample_to_cluster, sample_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_ct, &sample_to_cluster)) {
       goto dfam_ret_NOMEM;
     }
     // --within on an empty file actually causes --dfam to behave differently
@@ -4157,11 +4097,10 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       }
     }
 
-    if (wkspace_alloc_ui_checked(&cluster_ctrl_case_cts, cluster_ct * 2 * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&cluster_write_idxs, cluster_ct * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(cluster_ct * 2, &cluster_ctrl_case_cts) ||
+        bigstack_alloc_ui(cluster_ct, &cluster_write_idxs)) {
       goto dfam_ret_NOMEM;
     }
-    fill_uint_zero(cluster_ctrl_case_cts, 2 * cluster_ct);
     for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
       unrelated_cluster_idx = sample_to_cluster[sample_idx];
       if (unrelated_cluster_idx != 0xffffffffU) {
@@ -4191,19 +4130,20 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	if (cur_ctrl_ct && cur_case_ct) {
 	  uii = cluster_write_idxs[unrelated_cluster_idx];
 	  cur_dfam_ptr[uii] = sample_uidx;
-	  clear_bit(dfam_sample_exclude, sample_uidx);
+	  clear_bit(sample_uidx, dfam_sample_exclude);
 	  cluster_write_idxs[unrelated_cluster_idx] = uii + 1;
 	}
       }
     }
     cur_dfam_ptr = &(cur_dfam_ptr[write_idx]);
   }
-  wkspace_reset((unsigned char*)idx_to_uidx);
-  wkspace_shrink_top(dfam_iteration_order, (cur_dfam_ptr - dfam_iteration_order) * sizeof(int32_t));
+  bigstack_reset((unsigned char*)idx_to_uidx);
+  bigstack_shrink_top(dfam_iteration_order, (cur_dfam_ptr - dfam_iteration_order) * sizeof(int32_t));
   dfam_sample_ct = unfiltered_sample_ct - popcount_longs(dfam_sample_exclude, unfiltered_sample_ctl);
-  dfam_sample_ctl = (dfam_sample_ct + (BITCT - 1)) / BITCT;
-  dfam_sample_ctl2 = (dfam_sample_ct + (BITCT2 - 1)) / BITCT2;
-  if (wkspace_alloc_ui_checked(&sample_uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+  dfam_sample_ctl = BITCT_TO_WORDCT(dfam_sample_ct);
+  dfam_sample_ctl2 = QUATERCT_TO_WORDCT(dfam_sample_ct);
+  dfam_sample_ctv = BITCT_TO_ALIGNED_WORDCT(dfam_sample_ct);
+  if (bigstack_alloc_ui(unfiltered_sample_ct, &sample_uidx_to_idx)) {
     goto dfam_ret_NOMEM;
   }
   fill_uidx_to_idx(dfam_sample_exclude, unfiltered_sample_ct, dfam_sample_ct, sample_uidx_to_idx);
@@ -4228,19 +4168,19 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       cur_dfam_ptr++;
     }
   }
-  wkspace_reset((unsigned char*)sample_uidx_to_idx);
-  if (wkspace_alloc_ul_checked(&dfam_pheno_c, dfam_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&workbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * dfam_sample_ctl2 * sizeof(intptr_t))) {
+  bigstack_reset((unsigned char*)sample_uidx_to_idx);
+  if (bigstack_alloc_ul(dfam_sample_ctl2, &dfam_pheno_c) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(unfiltered_sample_ctp1l2, &workbuf) ||
+      bigstack_alloc_ul(MODEL_BLOCKSIZE * ((uintptr_t)dfam_sample_ctl2), &g_loadbuf)) {
     goto dfam_ret_NOMEM;
   }
-  collapse_copy_bitarr(sample_ct, pheno_c, dfam_sample_exclude, dfam_sample_ct, dfam_pheno_c);
+  copy_bitarr_subset_excl(pheno_c, dfam_sample_exclude, sample_ct, dfam_sample_ct, dfam_pheno_c);
   g_pheno_c = dfam_pheno_c;
   g_dfam_iteration_order = dfam_iteration_order;
   g_dfam_family_all_case_children_ct = family_all_case_children_ct;
   g_dfam_family_mixed_ct = family_mixed_ct;
-  g_dfam_sample_ct = dfam_sample_ct;
+  g_perm_pheno_nm_ct = dfam_sample_ct;
   g_dfam_sibship_mixed_ct = sibship_mixed_ct;
   g_dfam_unrelated_cluster_ct = unrelated_cluster_ct;
   g_test_type = perm_adapt_nst;
@@ -4253,68 +4193,67 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   // no X/haploid/MT, so no haploid filters
 
   if (fill_orig_chisq) {
-    if (wkspace_alloc_d_checked(&orig_chisq, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ct, &orig_chisq)) {
       goto dfam_ret_NOMEM;
     }
     g_orig_stat = orig_chisq;
   }
 
-  dfam_cluster_ct = family_mixed_ct + sibship_mixed_ct + unrelated_cluster_ct;
+  g_perm_cluster_ct = family_mixed_ct + sibship_mixed_ct + unrelated_cluster_ct;
   if (do_perms_nst) {
     logerrprint("Error: --dfam permutation tests are currently under development.\n");
     retval = RET_CALC_NOT_YET_SUPPORTED;
     goto dfam_ret_1;
-    if (wkspace_alloc_ui_checked(&dfam_cluster_map, dfam_cluster_map_size * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&dfam_cluster_starts, (dfam_cluster_ct + 1) * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&dfam_cluster_case_cts, dfam_cluster_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ul_checked(&perm_preimage, dfam_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ui(dfam_cluster_map_size, &g_perm_cluster_map) ||
+        bigstack_alloc_ui(g_perm_cluster_ct + 1, &g_perm_cluster_starts) ||
+        bigstack_alloc_ui(g_perm_cluster_ct, &g_perm_cluster_case_cts) ||
+        bigstack_calloc_ul(dfam_sample_ctl, &g_perm_cluster_cc_preimage)) {
       goto dfam_ret_NOMEM;
     }
-    fill_ulong_zero(perm_preimage, dfam_sample_ctl);
     cur_dfam_ptr = dfam_mixed_start;
     write_idx = 0;
     for (uii = 0; uii < family_mixed_ct; uii++) {
-      dfam_cluster_starts[uii] = write_idx;
+      g_perm_cluster_starts[uii] = write_idx;
       cur_dfam_ptr = &(cur_dfam_ptr[2]);
       sibling_ct = *cur_dfam_ptr++;
       cur_case_ct = 0;
       for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
 	sample_idx = cur_dfam_ptr[sib_idx];
-	dfam_cluster_map[write_idx++] = sample_idx;
+	g_perm_cluster_map[write_idx++] = sample_idx;
 	cur_case_ct += IS_SET(dfam_pheno_c, sample_idx);
       }
       if (cur_case_ct * 2 >= sibling_ct) {
 	for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
-	  SET_BIT(perm_preimage, cur_dfam_ptr[sib_idx]);
+	  SET_BIT(cur_dfam_ptr[sib_idx], g_perm_cluster_cc_preimage);
 	}
       }
       cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct]);
-      dfam_cluster_case_cts[uii] = cur_case_ct;
+      g_perm_cluster_case_cts[uii] = cur_case_ct;
     }
-    for (; uii < dfam_cluster_ct; uii++) {
-      dfam_cluster_starts[uii] = write_idx;
+    for (; uii < g_perm_cluster_ct; uii++) {
+      g_perm_cluster_starts[uii] = write_idx;
       sibling_ct = *cur_dfam_ptr++;
       cur_case_ct = 0;
       for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
 	sample_idx = cur_dfam_ptr[sib_idx];
-	dfam_cluster_map[write_idx++] = sample_idx;
+	g_perm_cluster_map[write_idx++] = sample_idx;
 	cur_case_ct += IS_SET(dfam_pheno_c, sample_idx);
       }
       if (cur_case_ct * 2 >= sibling_ct) {
 	for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
-	  SET_BIT(perm_preimage, cur_dfam_ptr[sib_idx]);
+	  SET_BIT(cur_dfam_ptr[sib_idx], g_perm_cluster_cc_preimage);
 	}
       }
       cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct]);
-      dfam_cluster_case_cts[uii] = cur_case_ct;
+      g_perm_cluster_case_cts[uii] = cur_case_ct;
     }
     if (write_idx != dfam_cluster_map_size) {
       logerrprint("assert failure: write_idx != dfam_cluster_map_size\n");
       exit(1);
     }
-    dfam_cluster_starts[dfam_cluster_ct] = write_idx;
+    g_perm_cluster_starts[g_perm_cluster_ct] = write_idx;
 
-    retval = cluster_alloc_and_populate_magic_nums(dfam_cluster_ct, dfam_cluster_map, dfam_cluster_starts, &dfam_tot_quotients, &dfam_totq_magics, &dfam_totq_preshifts, &dfam_totq_postshifts, &dfam_totq_incrs);
+    retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
     if (retval) {
       goto dfam_ret_1;
     }
@@ -4322,7 +4261,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 
   ulii = 2 * max_marker_allele_len + plink_maxsnp + MAX_ID_LEN + 256;
   if (ulii > MAXLINELEN) {
-    if (wkspace_alloc_c_checked(&textbuf, ulii)) {
+    if (bigstack_alloc_c(ulii, &textbuf)) {
       goto dfam_ret_NOMEM;
     }
   }
@@ -4331,17 +4270,16 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   // since it's also restricted to autosomes
   g_perms_done = 0;
   g_mperm_save_all = NULL;
-  g_dfam_perm_vecs = NULL;
+  g_perm_vecs = NULL;
   if (perm_maxt_nst) {
     perms_total = fam_ip->dfam_mperm_val;
-    if (wkspace_alloc_d_checked(&maxt_extreme_stat, perms_total * sizeof(double))) {
+    if (bigstack_calloc_d(perms_total, &maxt_extreme_stat)) {
       goto dfam_ret_NOMEM;
     }
     g_maxt_extreme_stat = maxt_extreme_stat;
-    fill_double_zero(maxt_extreme_stat, perms_total);
     if (mperm_save & MPERM_DUMP_ALL) {
       memcpy(outname_end, ".mperm.dump.all", 16);
-      if (fopen_checked(&outfile_msa, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile_msa)) {
         goto dfam_ret_OPEN_FAIL;
       }
       if (putc_checked('0', outfile_msa)) {
@@ -4354,15 +4292,14 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     if (perm_adapt_nst) {
       g_aperm_alpha = apip->alpha;
       perms_total = apip->max;
-      if (wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_ct * sizeof(int32_t)) ||
-          wkspace_alloc_uc_checked(&g_perm_adapt_stop, marker_ct)) {
+      if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct) ||
+          bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &g_perm_adapt_stop)) {
         goto dfam_ret_NOMEM;
       }
       ujj = apip->max;
       for (uii = 0; uii < marker_ct; uii++) {
 	g_perm_attempt_ct[uii] = ujj;
       }
-      fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
       g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
       if (apip->min < apip->init_interval) {
         g_first_adapt_check = (int32_t)(apip->init_interval);
@@ -4375,7 +4312,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   }
 
   outname_end2 = memcpyb(outname_end, ".dfam", 6);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto dfam_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing --dfam results to %s ... ", outname);
@@ -4395,6 +4332,10 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
         perm_batch_size = fam_ip->dfam_mperm_val;
       }
     }
+    if (bigstack_init_sfmtp(max_thread_ct)) {
+      goto dfam_ret_NOMEM;
+    }
+    g_perm_is_1bit = 1;
   }
   
   fputs("0%", stdout);
@@ -4415,29 +4356,39 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     }
     perm_vec_ct128 = (g_perm_vec_ct + 127) / 128;
     perm_vec_cta128 = perm_vec_ct128 * 128;
-    perm_vec_wct = (g_perm_vec_ct + (BITCT - 1)) / BITCT;
+    perm_vec_wct = BITCT_TO_WORDCT(g_perm_vec_ct);
     perm_vec_wcta = perm_vec_ct128 * (128 / BITCT);
-    perm_vec_ctcl8m = CACHEALIGN32_DBL(g_perm_vec_ct);
+    perm_vec_ctcl8m = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
 
-    if (wkspace_alloc_ul_checked(&g_dfam_perm_vecs, g_perm_vec_ct * dfam_sample_ctl * sizeof(intptr_t)) ||
-	wkspace_alloc_ul_checked(&g_dfam_perm_vecst, dfam_sample_ct * perm_vec_wcta * sizeof(intptr_t)) ||
-	wkspace_alloc_ul_checked(&g_dfam_flipa, family_ct * perm_vec_wct * sizeof(intptr_t)) ||
+    if (bigstack_alloc_ul(dfam_sample_ct * perm_vec_wcta, &g_dfam_perm_vecst) ||
+        bigstack_alloc_ul(g_perm_vec_ct * dfam_sample_ctv, &g_perm_vecs)) {
+      goto dfam_ret_NOMEM;
+    }
+    // initialize phenotype permutations.
+    g_perm_generation_thread_ct = MINV(max_thread_ct, g_perm_vec_ct);
+    if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
+      goto dfam_ret_THREAD_CREATE_FAIL;
+    }
+    ulii = 0;
+    generate_cc_cluster_perms_thread((void*)ulii);
+    join_threads(threads, g_perm_generation_thread_ct);
+
+    transpose_perm1s(g_perm_vecs, g_perm_vec_ct, sample_ct, (uint32_t*)g_dfam_perm_vecst);
+    bigstack_reset(g_perm_vecs);
+
+    if (bigstack_alloc_ul(family_ct * perm_vec_wct, &g_dfam_flipa) ||
 #ifdef __LP64__
-        wkspace_alloc_ul_checked(&g_dfam_flipa_shuffled, family_all_case_children_ct * perm_vec_wcta * sizeof(intptr_t)) ||
+        bigstack_alloc_ul(family_all_case_children_ct * perm_vec_wcta, &g_dfam_flipa_shuffled) ||
 #endif
-	wkspace_alloc_i_checked(&g_dfam_twice_numers, max_thread_ct * perm_vec_cta128 * sizeof(int32_t)) ||
-	wkspace_alloc_ui_checked(&g_dfam_total_counts, max_thread_ct * perm_vec_cta128 * sizeof(int32_t)) ||
-	wkspace_alloc_d_checked(&g_dfam_numers, max_thread_ct * perm_vec_cta128 * sizeof(double)) ||
-	wkspace_alloc_d_checked(&g_dfam_denoms, max_thread_ct * perm_vec_cta128 * sizeof(double))
-	) {
+	bigstack_alloc_i(max_thread_ct * perm_vec_cta128, &g_dfam_twice_numers) ||
+	bigstack_alloc_ui(max_thread_ct * perm_vec_cta128, &g_dfam_total_counts) ||
+	bigstack_alloc_d(max_thread_ct * perm_vec_cta128, &g_dfam_numers) ||
+	bigstack_alloc_d(max_thread_ct * perm_vec_cta128, &g_dfam_denoms)) {
       goto dfam_ret_NOMEM;
     }
-    // initialize phenotype and flipa permutations.
-    // don't bother multithreading for now
-    for (ulii = 0; ulii < g_perm_vec_ct; ulii++) {
-      generate_cc_cluster_perm1(dfam_sample_ct, perm_preimage, dfam_cluster_ct, dfam_cluster_map, dfam_cluster_starts, dfam_cluster_case_cts, dfam_tot_quotients, dfam_totq_magics, dfam_totq_preshifts, dfam_totq_postshifts, dfam_totq_incrs, &(g_dfam_perm_vecs[ulii * dfam_sample_ctl]), &sfmt);
-    }
-    transpose_perm1s(g_dfam_perm_vecs, g_perm_vec_ct, sample_ct, (uint32_t*)g_dfam_perm_vecst);
+    // initialize flipa permutations.
+    ;;;
+
     /*
     for () {
     }
@@ -4449,11 +4400,11 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     }
 #endif
     if (perm_maxt_nst) {
-      if (wkspace_alloc_d_checked(&g_maxt_thread_results, max_thread_ct * perm_vec_ctcl8m * sizeof(double))) {
+      if (bigstack_alloc_d(max_thread_ct * perm_vec_ctcl8m, &g_maxt_thread_results)) {
 	goto dfam_ret_NOMEM;
       }
       if (mperm_save & MPERM_DUMP_ALL) {
-	if (wkspace_alloc_d_checked(&g_mperm_save_all, marker_ct * g_perm_vec_ct * sizeof(double))) {
+	if (bigstack_alloc_d(marker_ct * g_perm_vec_ct, &g_mperm_save_all)) {
 	  goto dfam_ret_NOMEM;
 	}
       }
@@ -4485,14 +4436,14 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	  goto dfam_ret_READ_FAIL;
 	}
       }
-      if (load_raw2(bedfile, loadbuf_raw, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+      if (load_raw2(unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask, bedfile, loadbuf_raw)) {
 	goto dfam_ret_READ_FAIL;
       }
       if (IS_SET(marker_reverse, marker_uidx)) {
-	reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
+	reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf_raw);
       }
       erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, sex_male, trio_error_lookup, trio_ct, 0, multigen);
-      collapse_copy_2bitarr(loadbuf_raw, &(g_loadbuf[block_size * dfam_sample_ctl2]), unfiltered_sample_ct, dfam_sample_ct, dfam_sample_exclude);
+      copy_quaterarr_nonempty_subset_excl(loadbuf_raw, dfam_sample_exclude, unfiltered_sample_ct, dfam_sample_ct, &(g_loadbuf[block_size * dfam_sample_ctl2]));
       if (do_perms_nst) {
 	g_adapt_m_table[block_size] = marker_idx2++;
       }
@@ -4763,7 +4714,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	}
 	pval = chiprob_p(chisq, 1);
 	if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
-	  wptr = width_force(4, textbuf, chrom_name_write(textbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx2)));
+	  wptr = width_force(4, textbuf, chrom_name_write(chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx2), textbuf));
 	  *wptr++ = ' ';
 	  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
 	  *wptr++ = ' ';
@@ -4771,11 +4722,11 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	  *wptr++ = ' ';
 	  wptr = fw_strcpy(4, marker_allele_ptrs[2 * marker_uidx2 + 1], wptr);
 	  *wptr++ = ' ';
-	  wptr = uint32_writew8x(wptr, total_count, ' ');
-	  wptr = double_g_writewx4x(wptr, total_expected, 8, ' ');
+	  wptr = uint32toa_w8x(total_count, ' ', wptr);
+	  wptr = dtoa_g_wxp4x(total_expected, 8, ' ', wptr);
 	  if (denom != 0.0) {
-	    wptr = double_g_writewx4x(wptr, chisq, 12, ' ');
-	    wptr = double_g_writewx4(wptr, pval, 12);
+	    wptr = dtoa_g_wxp4x(chisq, 12, ' ', wptr);
+	    wptr = dtoa_g_wxp4(pval, 12, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "          NA           NA", 25);
 	  }
@@ -4811,17 +4762,17 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     fputs("\b\b", stdout);
     logprint("done.\n");
     if (do_perms_nst) {
-      // wkspace_reset();
+      // bigstack_reset();
     }
     if (fclose_null(&outfile)) {
       goto dfam_ret_WRITE_FAIL;
     }
     if (!is_set_test) {
       if (do_perms_nst) {
-	wkspace_reset(g_dfam_perm_vecs);
+	bigstack_reset(g_dfam_perm_vecst);
       }
       if (mtest_adjust) {
-	if (wkspace_alloc_ui_checked(&idx_to_uidx, marker_ct * sizeof(int32_t))) {
+	if (bigstack_alloc_ui(marker_ct, &idx_to_uidx)) {
 	  goto dfam_ret_NOMEM;
 	}
 	fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, idx_to_uidx);
@@ -4829,7 +4780,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	if (retval) {
 	  goto dfam_ret_1;
 	}
-	wkspace_reset(idx_to_uidx);
+	bigstack_reset(idx_to_uidx);
       }
       // if (mperm_save & MPERM_DUMP_ALL) { ...
     } else {
@@ -4841,7 +4792,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   }
   if (do_perms_nst) {
     // if (mperm_save & MPERM_DUMP_ALL) { ...
-    wkspace_reset(g_dfam_perm_vecs);
+    bigstack_reset(g_dfam_perm_vecst);
     if (g_perms_done < perms_total) {
       if (perm_adapt_nst) {
 	marker_unstopped_ct = marker_ct - popcount_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
@@ -4878,20 +4829,20 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       }
       memcpy(outname_end2, ".mperm", 7);
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto dfam_ret_OPEN_FAIL;
     }
     if (perm_adapt_nst) {
-      sprintf(tbuf, " CHR %%%us    CHISQ_TDT         EMP1           NP \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us    CHISQ_TDT         EMP1           NP \n", plink_maxsnp);
     } else {
-      sprintf(tbuf, " CHR %%%us    CHISQ_TDT         EMP1         EMP2 \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us    CHISQ_TDT         EMP1         EMP2 \n", plink_maxsnp);
 #ifdef __cplusplus
       std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
 #else
       qsort(g_maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
 #endif
     }
-    fprintf(outfile, tbuf, "SNP");
+    fprintf(outfile, g_textbuf, "SNP");
     chrom_fo_idx = 0xffffffffU;
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
     marker_idx = 0;
@@ -4902,7 +4853,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1U];
       } while (marker_uidx >= chrom_end);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start[plink_maxsnp] = ' ';
       for (; marker_uidx < chrom_end;) {
@@ -4918,16 +4869,16 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	    // invalid
 	    wptr = memcpya(wptr, "          NA           NA           NA", 38);
 	  } else {
-	    wptr = double_g_writewx4x(wptr, orig_chisq[marker_idx], 12, ' ');
+	    wptr = dtoa_g_wxp4x(orig_chisq[marker_idx], 12, ' ', wptr);
 	    if (!perm_count) {
-	      wptr = double_g_writewx4(wptr, pval, 12);
+	      wptr = dtoa_g_wxp4(pval, 12, wptr);
 	    } else {
-	      wptr = double_g_writewx4(wptr, ((double)g_perm_2success_ct[marker_idx]) * 0.5, 12);
+	      wptr = dtoa_g_wxp4(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, wptr);
 	    }
 	    *wptr++ = ' ';
 	    if (perm_adapt_nst) {
 	      wptr = memseta(wptr, 32, 2);
-	      wptr = uint32_writew10(wptr, g_perm_attempt_ct[marker_idx]);
+	      wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
 	    } else {
 	      // ...
 	      if (!perm_count) {
@@ -4935,7 +4886,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	      }
 	    }
 	    *wptr++ = '\n';
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto dfam_ret_WRITE_FAIL;
 	    }
 	  }
@@ -4971,14 +4922,12 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   dfam_ret_INVALID_CMDLINE:
     retval = RET_INVALID_CMDLINE;
     break;
-    /*
   dfam_ret_THREAD_CREATE_FAIL:
     retval = RET_THREAD_CREATE_FAIL;
     break;
-    */
   }
  dfam_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_msa);
   return retval;
@@ -5024,7 +4973,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
   uint32_t sample_idx;
   uint32_t fss_idx;
   uint32_t uii;
-  fill_all_bits(nm_fss, fss_ct);
+  fill_all_bits(fss_ct, nm_fss);
   cur_start = *fs_starts_ptr++;
   for (cur_idx = 0; cur_idx < family_ct; cur_idx++) {
     cur_end = *fs_starts_ptr++;
@@ -5053,7 +5002,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
       if (sib_ct) {
         qfam_b[cur_idx] = ((double)((intptr_t)(2 * (uintptr_t)sib_ct) - ((intptr_t)uljj))) / ((double)((int32_t)sib_ct));
       } else {
-	clear_bit(nm_fss, cur_idx);
+	clear_bit(cur_idx, nm_fss);
       }
     }
     cur_start = cur_end;
@@ -5077,7 +5026,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
     if (sib_ct) {
       qfam_b[cur_idx] = ((double)((intptr_t)(2 * (uintptr_t)sib_ct) - ((intptr_t)uljj))) / ((double)((int32_t)sib_ct));
     } else {
-      clear_bit(nm_fss, cur_idx);
+      clear_bit(cur_idx, nm_fss);
     }
     cur_start = cur_end;
   }
@@ -5088,10 +5037,10 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
     if (ulii != 1) {
       qfam_b[cur_idx] = (double)(2 - (intptr_t)(ulii + (ulii == 0)));
     } else {
-      clear_bit(nm_fss, cur_idx);
+      clear_bit(cur_idx, nm_fss);
     }
   }
-  fill_all_bits(nm_lm, lm_ct);
+  fill_all_bits(lm_ct, nm_lm);
   for (sample_uidx = 0, sample_idx = 0; sample_idx < lm_ct; sample_uidx++, sample_idx++) {
     next_set_unsafe_ck(lm_eligible, &sample_uidx);
     ulii = EXTRACT_2BIT_GENO(loadbuf, sample_uidx);
@@ -5118,7 +5067,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
       dxx = pheno_d2[sample_idx];
       qt_sum -= dxx;
       qt_ssq -= dxx * dxx;
-      clear_bit(nm_lm, sample_idx);
+      clear_bit(sample_idx, nm_lm);
     }
   }
   // 1.07 also excludes the nonmissing parent when only one out of two parents
@@ -5148,7 +5097,7 @@ void flip_precalc(uint32_t lm_ct, double* qfam_w, double* pheno_d2, uintptr_t* n
     }
     cur_geno = qfam_w[sample_idx];
     if (fabs(cur_geno) < SMALL_EPSILON) {
-      clear_bit(nm_lm, sample_idx);
+      clear_bit(sample_idx, nm_lm);
     } else {
       geno_sum += cur_geno;
       geno_ssq += cur_geno * cur_geno;
@@ -5238,22 +5187,22 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
   uint32_t lm_ct = g_lm_ct;
   uint32_t singleton_ct = g_singleton_ct;
   uint32_t fss_ct = fs_ct + singleton_ct;
-  uint32_t fss_ctl = (fss_ct + (BITCT - 1)) / BITCT;
-  uint32_t lm_ctl = (lm_ct + (BITCT - 1)) / BITCT;
+  uint32_t fss_ctl = BITCT_TO_WORDCT(fss_ct);
+  uint32_t lm_ctl = BITCT_TO_WORDCT(lm_ct);
   uint32_t test_type = g_test_type;
   uint32_t only_within = (test_type & (QFAM_WITHIN1 | QFAM_WITHIN2))? 1 : 0;
   uintptr_t* lm_eligible = g_lm_eligible;
   uintptr_t* lm_within2_founder = g_lm_within2_founder;
   uintptr_t* qfam_flip = g_qfam_flip;
-  uintptr_t* nm_fss = &(g_nm_fss[tidx * CACHEALIGN32_WORD(fss_ctl)]);
-  uintptr_t* nm_lm = &(g_nm_lm[tidx * CACHEALIGN32_WORD(lm_ctl)]);
-  double* qfam_b = &(g_qfam_b[tidx * CACHEALIGN32_DBL(fss_ct)]);
-  double* qfam_w = &(g_qfam_w[tidx * CACHEALIGN32_DBL(lm_ct)]);
+  uintptr_t* nm_fss = &(g_nm_fss[tidx * round_up_pow2(fss_ctl, CACHELINE_WORD)]);
+  uintptr_t* nm_lm = &(g_nm_lm[tidx * round_up_pow2(lm_ctl, CACHELINE_WORD)]);
+  double* qfam_b = &(g_qfam_b[tidx * round_up_pow2(fss_ct, CACHELINE_DBL)]);
+  double* qfam_w = &(g_qfam_w[tidx * round_up_pow2(lm_ct, CACHELINE_DBL)]);
   double* pheno_d2 = g_pheno_d2;
   double* beta_sum = g_beta_sum;
   double* beta_ssq = g_beta_ssq;
   uint32_t* qfam_permute = only_within? NULL : g_qfam_permute;
-  uint32_t* permute_edit_buf = only_within? NULL : (&(g_permute_edit[tidx * CACHEALIGN32_INT32(fss_ct)]));
+  uint32_t* permute_edit_buf = only_within? NULL : (&(g_permute_edit[tidx * round_up_pow2(fss_ct, CACHELINE_INT32)]));
   uint32_t* perm_2success_ct = g_perm_2success_ct;
   uint32_t* perm_attempt_ct = g_perm_attempt_ct;
   uint32_t* fs_starts = g_fs_starts;
@@ -5262,8 +5211,8 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
   uint32_t* perm_ptr = NULL;
   uint32_t* beta_fail_cts = g_beta_fail_cts;
   uintptr_t cur_perm_ct = g_cur_perm_ct;
-  uintptr_t sample_ct = g_sample_ct;
-  uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t sample_ct = g_qfam_sample_ct;
+  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   uintptr_t flip_ctl = only_within? lm_ctl : fss_ctl;
   double adaptive_intercept = g_adaptive_intercept;
   double adaptive_slope = g_adaptive_slope;
@@ -5421,12 +5370,12 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
 int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Aperm_info* apip, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* founder_info, u [...]
   // Fortunately, this can use some of qassoc()'s logic instead of punting to
   // LAPACK, since it doesn't support covariates.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctp1l2 = 1 + (unfiltered_sample_ct / BITCT2);
-  uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
   double qt_sum_all = 0.0;
   double qt_ssq_all = 0.0;
@@ -5574,27 +5523,25 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   g_lm_eligible = lm_eligible;
   g_lm_within2_founder = lm_within2_founder;
   g_test_type = test_type;
-  g_sample_ct = sample_ct;
+  g_qfam_sample_ct = sample_ct;
   g_fs_ct = fs_ct;
   g_singleton_ct = singleton_ct;
   g_lm_ct = lm_ct;
   g_xfam_thread_ct = qfam_thread_ct;
-  fss_ctl = (fss_ct + BITCT - 1) / BITCT;
-  lm_ctl = (lm_ct + BITCT - 1) / BITCT;
+  fss_ctl = BITCT_TO_WORDCT(fss_ct);
+  lm_ctl = BITCT_TO_WORDCT(lm_ct);
   flip_ctl = only_within? lm_ctl : fss_ctl;
 
-  if (wkspace_alloc_uc_checked(&perm_adapt_stop, marker_ct) ||
-      wkspace_alloc_ui_checked(&g_perm_2success_ct, marker_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_uc(round_up_pow2(marker_ct, BYTECT), &perm_adapt_stop) ||
+      bigstack_calloc_ui(marker_ct, &g_perm_2success_ct)) {
     goto qfam_ret_NOMEM;
   }
-  fill_ulong_zero((uintptr_t*)perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
-  fill_uint_zero(g_perm_2success_ct, marker_ct);
   g_perm_adapt_stop = perm_adapt_stop;
 
   if (perm_adapt) {
     g_aperm_alpha = apip->alpha;
     perms_total = apip->max;
-    if (wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(marker_ct, &g_perm_attempt_ct)) {
       goto qfam_ret_NOMEM;
     }
     ujj = apip->max;
@@ -5621,51 +5568,47 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   outname_end = memcpya(outname_end, ".qfam.", 6);
   outname_end = strcpya(outname_end, flag_suffix);
   *outname_end = '\0';
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&workbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(unfiltered_sample_ctp1l2, &workbuf)) {
     goto qfam_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   workbuf[unfiltered_sample_ctp1l2 - 1] = 0;
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto qfam_ret_OPEN_FAIL;
   }
   if (perms_total < perm_batch_size) {
     perm_batch_size = perms_total;
   }
-  ulii = CACHELINE * ((uintptr_t)qfam_thread_ct);
   if (!only_within) {
-    if (wkspace_alloc_ui_checked(&g_qfam_permute, perm_batch_size * fss_ct * sizeof(intptr_t)) ||
-        wkspace_alloc_ui_checked(&g_permute_edit, ((fss_ct + CACHELINE_INT32 - 1) / CACHELINE_INT32) * ulii)) {
+    if (bigstack_alloc_ui(perm_batch_size * fss_ct, &g_qfam_permute) ||
+        bigstack_alloc_ui(round_up_pow2(fss_ct, CACHELINE_INT32) * qfam_thread_ct, &g_permute_edit)) {
       goto qfam_ret_NOMEM;
     }
   }
   if (emp_se) {
-    if (wkspace_alloc_d_checked(&orig_beta, marker_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&g_beta_sum, marker_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&g_beta_ssq, marker_ct * sizeof(double)) ||
-        wkspace_alloc_ui_checked(&g_beta_fail_cts, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ct, &orig_beta) ||
+        bigstack_calloc_d(marker_ct, &g_beta_sum) ||
+        bigstack_calloc_d(marker_ct, &g_beta_ssq) ||
+        bigstack_calloc_ui(marker_ct, &g_beta_fail_cts)) {
       goto qfam_ret_NOMEM;
     }
-    fill_double_zero(g_beta_sum, marker_ct);
-    fill_double_zero(g_beta_ssq, marker_ct);
-    fill_uint_zero(g_beta_fail_cts, marker_ct);
   } else {
     g_beta_sum = NULL;
     g_beta_ssq = NULL;
     g_beta_fail_cts = NULL;
   }
-  if (wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&g_orig_stat, marker_ct * sizeof(double)) ||
-      wkspace_alloc_ul_checked(&g_qfam_flip, perm_batch_size * flip_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&precomputed_mods, (fss_ct - 1) * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&nm_fss, ((fss_ct + (CACHELINE * 8 - 1)) / (CACHELINE * 8)) * ulii) ||
-      wkspace_alloc_ul_checked(&nm_lm, ((lm_ct + (CACHELINE * 8 - 1)) / (CACHELINE * 8)) * ulii) ||
-      wkspace_alloc_d_checked(&pheno_d2, lm_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&qfam_b, ((fss_ct + CACHELINE_DBL - 1) / CACHELINE_DBL) * ulii) ||
-      wkspace_alloc_d_checked(&qfam_w, ((lm_ct + CACHELINE_DBL - 1) / CACHELINE_DBL) * ulii) ||
-      wkspace_alloc_ui_checked(&dummy_perm, fss_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&dummy_flip, fss_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(MODEL_BLOCKSIZE * sample_ctl2, &g_loadbuf) ||
+      bigstack_alloc_d(marker_ct, &g_orig_stat) ||
+      bigstack_alloc_ul(perm_batch_size * flip_ctl, &g_qfam_flip) ||
+      bigstack_alloc_ui(fss_ct - 1, &precomputed_mods) ||
+      bigstack_alloc_ul(round_up_pow2(fss_ct, CACHELINE_BIT) * qfam_thread_ct, &nm_fss) ||
+      bigstack_alloc_ul(round_up_pow2(lm_ct, CACHELINE_BIT) * qfam_thread_ct, &nm_lm) ||
+      bigstack_alloc_d(lm_ct, &pheno_d2) ||
+      bigstack_alloc_d(round_up_pow2(fss_ct, CACHELINE_DBL) * qfam_thread_ct, &qfam_b) ||
+      bigstack_alloc_d(round_up_pow2(lm_ct, CACHELINE_DBL) * qfam_thread_ct, &qfam_w) ||
+      bigstack_alloc_ui(fss_ct, &dummy_perm) ||
+      bigstack_alloc_ul(fss_ctl, &dummy_flip)) {
     goto qfam_ret_NOMEM;
   }
   for (uii = 0, ujj = 0, ukk = 0; ujj < sample_ct; uii++, ujj++) {
@@ -5700,8 +5643,8 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   fflush(stdout);
   // deliberately rename last field to RAW_P to reduce likelihood of
   // misinterpretation.  --adjust also disabled.
-  sprintf(tbuf, " CHR %%%us         BP   A1       TEST     NIND       BETA         STAT        RAW_P\n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us         BP   A1       TEST     NIND       BETA         STAT        RAW_P\n", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   marker_unstopped_ct = marker_ct;
   loop_end = marker_ct / 100;
 
@@ -5729,11 +5672,11 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       for (ulii = 0; ulii < cur_perm_ct; ulii++) {
         uiptr = (uint32_t*)dummy_flip;
 	for (uii = 0; uii < ujj; uii++) {
-          uiptr[uii] = sfmt_genrand_uint32(&sfmt);
+          uiptr[uii] = sfmt_genrand_uint32(&g_sfmt);
 	}
         for (uii = 0; uii < lm_ct; uii++) {
           if (is_set(dummy_flip, sample_lm_to_fss_idx[uii])) {
-	    set_bit(ulptr, uii);
+	    set_bit(uii, ulptr);
 	  }
 	}
 	ulptr = &(ulptr[lm_ctl]);
@@ -5741,12 +5684,12 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       fill_ulong_zero(dummy_flip, fss_ctl);
     } else {
       for (ulii = 0; ulii < cur_perm_ct; ulii++) {
-	uint32_permute(&(g_qfam_permute[ulii * fss_ct]), &(precomputed_mods[-1]), &sfmt, fss_ct);
+	uint32_permute(&(g_qfam_permute[ulii * fss_ct]), &(precomputed_mods[-1]), &g_sfmt, fss_ct);
       }
       uiptr = (uint32_t*)g_qfam_flip;
       uljj = cur_perm_ct * fss_ctl * (BITCT / 32);
       for (ulii = 0; ulii < uljj; ulii++) {
-	*uiptr++ = sfmt_genrand_uint32(&sfmt);
+	*uiptr++ = sfmt_genrand_uint32(&g_sfmt);
       }
     }
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
@@ -5788,15 +5731,15 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	  }
 	  seek_flag = 0;
 	}
-	if (load_raw2(bedfile, loadbuf_raw, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+	if (load_raw2(unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask, bedfile, loadbuf_raw)) {
 	  goto qfam_ret_READ_FAIL;
 	}
 	if (IS_SET(marker_reverse, marker_uidx)) {
-	  reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
+	  reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf_raw);
 	}
 	erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, sex_male, trio_error_lookup, trio_ct, 0, multigen);
 	loadbuf_ptr = &(g_loadbuf[block_idx * sample_ctl2]);
-	collapse_copy_2bitarr(loadbuf_raw, loadbuf_ptr, unfiltered_sample_ct, sample_ct, sample_exclude);
+	copy_quaterarr_nonempty_subset_excl(loadbuf_raw, sample_exclude, unfiltered_sample_ct, sample_ct, loadbuf_ptr);
 	g_adapt_m_table[block_idx] = marker_idx;
 	mu_table[block_idx++] = marker_uidx;
       }
@@ -5812,32 +5755,32 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	    chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx_cur);
 	    chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
 	    chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
-	    chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, chrom_idx, &chrom_name_len);
+	    chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, &chrom_name_len, chrom_name_buf);
 	  }
-	  bufptr = memcpyax(tbuf, chrom_name_ptr, chrom_name_len, ' ');
+	  bufptr = memcpyax(g_textbuf, chrom_name_ptr, chrom_name_len, ' ');
 	  bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx_cur * max_marker_id_len]), bufptr);
 	  *bufptr++ = ' ';
-	  bufptr = uint32_writew10x(bufptr, marker_pos[marker_uidx_cur], ' ');
-	  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	  bufptr = uint32toa_w10x(marker_pos[marker_uidx_cur], ' ', bufptr);
+	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	    goto qfam_ret_WRITE_FAIL;
 	  }
 	  fputs_w4(marker_allele_ptrs[marker_uidx_cur * 2], outfile);
 	  loadbuf_ptr = &(g_loadbuf[block_idx * sample_ctl2]);
 	  qfam_compute_bw(loadbuf_ptr, sample_ct, fs_starts, fss_contents, sample_lm_to_fss_idx, lm_eligible, lm_within2_founder, family_ct, fs_ct, singleton_ct, lm_ct, nm_fss, nm_lm, pheno_d2, qt_sum_all, qt_ssq_all, qfam_b, qfam_w, &qt_sum, &qt_ssq);
 	  nind = popcount_longs(nm_lm, lm_ctl);
-	  bufptr = memseta(tbuf, 32, 7);
+	  bufptr = memseta(g_textbuf, 32, 7);
 	  bufptr = memcpya(bufptr, qfam_test_ptr, 5);
-	  bufptr = uint32_writew8x(bufptr, nind, ' ');
+	  bufptr = uint32toa_w8x(nind, ' ', bufptr);
 	  nind_recip = 1.0 / ((double)((int32_t)nind));
 	  if (only_within) {
 	    flip_precalc(lm_ct, qfam_w, pheno_d2, nm_lm, &geno_sum, &geno_ssq, &qt_g_prod);
           }
 	  if (!qfam_regress(test_type, nind, lm_ct, sample_lm_to_fss_idx, nm_lm, pheno_d2, qfam_b, qfam_w, dummy_perm, dummy_flip, nind_recip, qt_sum, qt_ssq, geno_sum, geno_ssq, qt_g_prod, &beta, &tstat)) {
-	    bufptr = double_g_writewx4x(bufptr, beta, 10, ' ');
-	    bufptr = double_g_writewx4x(bufptr, tstat, 12, ' ');
+	    bufptr = dtoa_g_wxp4x(beta, 10, ' ', bufptr);
+	    bufptr = dtoa_g_wxp4x(tstat, 12, ' ', bufptr);
 	    // do not apply --output-min-p since only the empirical p-value is
 	    // supposed to be postprocessed here, not this one
-	    bufptr = double_g_writewx4x(bufptr, calc_tprob(tstat, nind - 2), 12, '\n');
+	    bufptr = dtoa_g_wxp4x(calc_tprob(tstat, nind - 2), 12, '\n', bufptr);
 	    if (emp_se) {
 	      orig_beta[marker_idx_base + block_idx] = beta;
 	    }
@@ -5848,7 +5791,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	    *orig_stat_ptr++ = -9;
 	    regress_fail_ct++;
 	  }
-	  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	    goto qfam_ret_WRITE_FAIL;
 	  }
 	}
@@ -5897,11 +5840,11 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   }
   putchar('\r');
   memcpy(outname_end, ".perm", 6);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto qfam_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, emp_se? " CHR %%%us         BETA     EMP_BETA       EMP_SE         EMP1           NP \n" : " CHR %%%us         EMP1           NP \n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, emp_se? " CHR %%%us         BETA     EMP_BETA       EMP_SE         EMP1           NP \n" : " CHR %%%us         EMP1           NP \n", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   chrom_fo_idx = 0xffffffffU;
   chrom_end = 0;
   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
@@ -5917,9 +5860,9 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	}
 	marker_uidx = next_unset_unsafe(marker_exclude, chrom_end);
       }
-      chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, chrom_idx, &chrom_name_len);
+      chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, &chrom_name_len, chrom_name_buf);
     }
-    bufptr = memcpyax(tbuf, chrom_name_ptr, chrom_name_len, ' ');
+    bufptr = memcpyax(g_textbuf, chrom_name_ptr, chrom_name_len, ' ');
     bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
     *bufptr++ = ' ';
     if (g_orig_stat[marker_idx] == -9) {
@@ -5935,15 +5878,15 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	ujj = perms_total;
       }
       if (emp_se) {
-	bufptr = double_g_writewx4x(bufptr, orig_beta[marker_idx], 12, ' ');
+	bufptr = dtoa_g_wxp4x(orig_beta[marker_idx], 12, ' ', bufptr);
 	ukk = ujj - g_beta_fail_cts[marker_idx];
 	if (ukk <= 1) {
           bufptr = memcpya(bufptr, "          NA ", 13);
 	} else {
 	  dxx = g_beta_sum[marker_idx] / ((double)((int32_t)ukk));
-	  bufptr = double_g_writewx4x(bufptr, dxx, 12, ' ');
+	  bufptr = dtoa_g_wxp4x(dxx, 12, ' ', bufptr);
 	  dxx = sqrt((g_beta_ssq[marker_idx] - g_beta_sum[marker_idx] * dxx) / ((double)((int32_t)(ukk - 1))));
-          bufptr = double_g_writewx4x(bufptr, dxx, 12, ' ');
+          bufptr = dtoa_g_wxp4x(dxx, 12, ' ', bufptr);
 	}
       }
       if (!perm_count) {
@@ -5951,11 +5894,11 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       } else {
 	dxx = ((double)uii) * 0.5;
       }
-      bufptr = double_g_writewx4(bufptr, dxx, 12);
+      bufptr = dtoa_g_wxp4(dxx, 12, bufptr);
       bufptr = memseta(bufptr, 32, 3);
-      bufptr = uint32_writew10x(bufptr, ujj, '\n');
+      bufptr = uint32toa_w10x(ujj, '\n', bufptr);
     }
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto qfam_ret_WRITE_FAIL;
     }
   }
@@ -5984,7 +5927,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     break;
   }
  qfam_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
diff --git a/plink_filter.c b/plink_filter.c
index baf4eb0..e2f44ce 100644
--- a/plink_filter.c
+++ b/plink_filter.c
@@ -26,10 +26,10 @@ void oblig_missing_cleanup(Oblig_missing_info* om_ip) {
   }
 }
 
-const char keep_str[] = "keep";
-const char keep_fam_str[] = "keep-fam";
-const char remove_str[] = "remove";
-const char remove_fam_str[] = "remove-fam";
+static const char keep_str[] = "keep";
+static const char keep_fam_str[] = "keep-fam";
+static const char remove_str[] = "remove";
+static const char remove_fam_str[] = "remove-fam";
 
 const char* keep_or_remove_flag_str(uint32_t flags) {
   switch (flags) {
@@ -47,9 +47,9 @@ const char* keep_or_remove_flag_str(uint32_t flags) {
 
 int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr, uint32_t flags, uint32_t allow_no_samples) {
   FILE* infile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t* exclude_arr_new = NULL;
-  uintptr_t unfiltered_ctl = (unfiltered_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_ctl = BITCT_TO_WORDCT(unfiltered_ct);
   uintptr_t duplicate_ct = 0;
   uintptr_t line_idx = 0;
   uint32_t do_exclude = flags & 1;
@@ -62,7 +62,7 @@ int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, u
   uint32_t cur_idx;
   uint32_t last_idx;
 
-  if (wkspace_alloc_ul_checked(&exclude_arr_new, unfiltered_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_ctl, &exclude_arr_new)) {
     goto keep_or_remove_ret_NOMEM;
   }
   if (do_exclude) {
@@ -71,28 +71,28 @@ int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, u
     // the already_seen strategy
     memcpy(exclude_arr_new, exclude_arr, unfiltered_ctl * sizeof(intptr_t));
   } else {
-    fill_all_bits(exclude_arr_new, unfiltered_ct);
+    fill_all_bits(unfiltered_ct, exclude_arr_new);
   }
-  if (fopen_checked(&infile, fname, "r")) {
+  if (fopen_checked(fname, "r", &infile)) {
     goto keep_or_remove_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  if (wkspace_alloc_c_checked(&id_buf, max_id_len)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  if (bigstack_alloc_c(max_id_len, &id_buf)) {
     goto keep_or_remove_ret_NOMEM;
   }
-  while (fgets(tbuf, MAXLINELEN, infile) != NULL) {
+  while (fgets(g_textbuf, MAXLINELEN, infile) != NULL) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --%s file is pathologically long.\n", line_idx, keep_or_remove_flag_str(flags));
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --%s file is pathologically long.\n", line_idx, keep_or_remove_flag_str(flags));
       goto keep_or_remove_ret_INVALID_FORMAT_2;
     }
-    bufptr0 = skip_initial_spaces(tbuf);
+    bufptr0 = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr0)) {
       continue;
     }
     if (!families_only) {
-      if (bsearch_read_fam_indiv(id_buf, sorted_ids, max_id_len, sorted_ids_ct, bufptr0, NULL, &ii)) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --%s file has fewer tokens than expected.\n", line_idx, keep_or_remove_flag_str(flags));
+      if (bsearch_read_fam_indiv(bufptr0, sorted_ids, max_id_len, sorted_ids_ct, NULL, &ii, id_buf)) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --%s file has fewer tokens than expected.\n", line_idx, keep_or_remove_flag_str(flags));
 	goto keep_or_remove_ret_INVALID_FORMAT_2;
       }
       if (ii != -1) {
@@ -102,19 +102,19 @@ int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, u
 	    if (IS_SET(exclude_arr_new, unfiltered_idx)) {
 	      duplicate_ct++;
 	    } else {
-	      SET_BIT(exclude_arr_new, unfiltered_idx);
+	      SET_BIT(unfiltered_idx, exclude_arr_new);
 	    }
 	  } else {
 	    if (!IS_SET(exclude_arr_new, unfiltered_idx)) {
 	      duplicate_ct++;
 	    } else {
-	      CLEAR_BIT(exclude_arr_new, unfiltered_idx);
+	      CLEAR_BIT(unfiltered_idx, exclude_arr_new);
 	    }
 	  }
 	}
       }
     } else {
-      bsearch_fam(id_buf, sorted_ids, max_id_len, sorted_ids_ct, bufptr0, &cur_idx, &last_idx);
+      bsearch_fam(bufptr0, sorted_ids, max_id_len, sorted_ids_ct, &cur_idx, &last_idx, id_buf);
       ii = 0;
       while (cur_idx < last_idx) {
 	unfiltered_idx = id_map[cur_idx++];
@@ -123,13 +123,13 @@ int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, u
 	    if (IS_SET(exclude_arr_new, unfiltered_idx)) {
 	      ii = 1;
 	    } else {
-	      SET_BIT(exclude_arr_new, unfiltered_idx);
+	      SET_BIT(unfiltered_idx, exclude_arr_new);
 	    }
 	  } else {
 	    if (!IS_SET(exclude_arr_new, unfiltered_idx)) {
 	      ii = 1;
 	    } else {
-	      CLEAR_BIT(exclude_arr_new, unfiltered_idx);
+	      CLEAR_BIT(unfiltered_idx, exclude_arr_new);
 	    }
 	  }
 	}
@@ -171,7 +171,7 @@ int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, u
     retval = RET_ALL_SAMPLES_EXCLUDED;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   return retval;
 }
@@ -211,7 +211,7 @@ void extract_exclude_process_token(const char* tok_start, const uint32_t* marker
       if (IS_SET(already_seen, marker_uidx)) {
 	*duplicate_ct_ptr += 1;
       } else {
-	SET_BIT(already_seen, marker_uidx);
+	SET_BIT(marker_uidx, already_seen);
 	if (!cur_dup) {
 	  return;
 	}
@@ -220,7 +220,7 @@ void extract_exclude_process_token(const char* tok_start, const uint32_t* marker
 	  if (cur_llidx == 0xffffffffU) {
 	    return;
 	  }
-	  SET_BIT(already_seen, extra_alloc_base[cur_llidx]);
+	  SET_BIT(extra_alloc_base[cur_llidx], already_seen);
 	}
       }
     }
@@ -236,13 +236,13 @@ void extract_exclude_process_token(const char* tok_start, const uint32_t* marker
 }
 
 int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t do_exclude, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t allow_no_variants) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t duplicate_ct = 0;
   // needs to be synced with populate_id_htable
-  const uint32_t* extra_alloc_base = &(marker_id_htable[CACHEALIGN32_INT32(marker_id_htable_size)]);
-  char* midbuf = &(tbuf[MAXLINELEN]);
+  const uint32_t* extra_alloc_base = &(marker_id_htable[round_up_pow2_ui(marker_id_htable_size, CACHELINE_INT32)]);
+  char* midbuf = &(g_textbuf[MAXLINELEN]);
   uint32_t curtoklen = 0;
   int32_t retval = 0;
   uintptr_t bufsize;
@@ -251,11 +251,10 @@ int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, ui
   char* bufptr;
   char* bufptr2;
   char* bufptr3;
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen)) {
     goto extract_exclude_flag_norange_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
-  if (fopen_checked(&infile, fname, "rb")) {
+  if (fopen_checked(fname, FOPEN_RB, &infile)) {
     goto extract_exclude_flag_norange_ret_OPEN_FAIL;
   }
   while (1) {
@@ -264,14 +263,14 @@ int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, ui
     }
     if (!bufsize) {
       if (curtoklen) {
-        extract_exclude_process_token(&(tbuf[MAXLINELEN - curtoklen]), marker_id_htable, marker_id_htable_size, extra_alloc_base, marker_ids, max_marker_id_len, marker_exclude, already_seen, &duplicate_ct, do_exclude, curtoklen);
+        extract_exclude_process_token(&(g_textbuf[MAXLINELEN - curtoklen]), marker_id_htable, marker_id_htable_size, extra_alloc_base, marker_ids, max_marker_id_len, marker_exclude, already_seen, &duplicate_ct, do_exclude, curtoklen);
       }
       break;
     }
     bufptr0 = &(midbuf[bufsize]);
     *bufptr0 = ' ';
     bufptr0[1] = '0';
-    bufptr = &(tbuf[MAXLINELEN - curtoklen]);
+    bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
     bufptr2 = midbuf;
     if (curtoklen) {
       goto extract_exclude_flag_norange_tok_start;
@@ -290,12 +289,12 @@ int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, ui
 	bufptr2++;
       }
       curtoklen = (uintptr_t)(bufptr2 - bufptr);
-      if (bufptr2 == &(tbuf[MAXLINELEN * 2])) {
+      if (bufptr2 == &(g_textbuf[MAXLINELEN * 2])) {
         if (curtoklen > MAXLINELEN) {
-	  sprintf(logbuf, "Error: Excessively long ID in --%s file.\n", do_exclude? "exclude" : "extract");
+	  sprintf(g_logbuf, "Error: Excessively long ID in --%s file.\n", do_exclude? "exclude" : "extract");
           goto extract_exclude_flag_norange_ret_INVALID_FORMAT_2;
 	}
-	bufptr3 = &(tbuf[MAXLINELEN - curtoklen]);
+	bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
         memcpy(bufptr3, bufptr, curtoklen);
 	break;
       }
@@ -307,10 +306,10 @@ int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, ui
     goto extract_exclude_flag_norange_ret_READ_FAIL;
   }
   if (do_exclude) {
-    bitfield_or(marker_exclude, already_seen, unfiltered_marker_ctl * sizeof(intptr_t));
+    bitvec_or(already_seen, unfiltered_marker_ctl, marker_exclude);
   } else {
-    bitfield_ornot(marker_exclude, already_seen, unfiltered_marker_ctl * sizeof(intptr_t));
-    zero_trailing_bits(marker_exclude, unfiltered_marker_ct);
+    bitvec_ornot(already_seen, unfiltered_marker_ctl, marker_exclude);
+    zero_trailing_bits(unfiltered_marker_ct, marker_exclude);
   }
   *marker_exclude_ct_ptr = popcount_longs(marker_exclude, unfiltered_marker_ctl);
   if ((*marker_exclude_ct_ptr == unfiltered_marker_ct) && (!allow_no_variants)) {
@@ -342,16 +341,16 @@ int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, ui
     retval = RET_ALL_MARKERS_EXCLUDED;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   return retval;
 }
 
 int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uint32_t id_htable_size, uint32_t allow_no_variants, char* item_ids, uintptr_t max_id_len, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr) {
   gzFile gz_infile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t include_ct = 0;
-  uintptr_t unfiltered_ctl = (unfiltered_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_ctl = BITCT_TO_WORDCT(unfiltered_ct);
   char* sorted_pos_match = NULL;
   char* sorted_neg_match = NULL;
   char* bufptr2 = NULL;
@@ -375,12 +374,11 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
   uint32_t pos_match_needed;
   int32_t sorted_idx;
   
-  if (wkspace_alloc_ul_checked(&exclude_arr_new, unfiltered_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&already_seen, unfiltered_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_ctl, &exclude_arr_new) ||
+      bigstack_calloc_ul(unfiltered_ctl, &already_seen)) {
     goto filter_attrib_ret_NOMEM;
   }
-  fill_all_bits(exclude_arr_new, unfiltered_ct);
-  fill_ulong_zero(already_seen, unfiltered_ctl);
+  fill_all_bits(unfiltered_ct, exclude_arr_new);
   if (condition_str) {
     // allow NULL condition_str; this means all samples/variants named in the
     // file are included
@@ -420,12 +418,12 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
       is_neg = 0;
     }
     if (pos_match_ct) {
-      if (wkspace_alloc_c_checked(&sorted_pos_match, max_pos_match_len * pos_match_ct)) {
+      if (bigstack_alloc_c(max_pos_match_len * pos_match_ct, &sorted_pos_match)) {
 	goto filter_attrib_ret_NOMEM;
       }
     }
     if (neg_match_ct) {
-      if (wkspace_alloc_c_checked(&sorted_neg_match, max_neg_match_len * neg_match_ct)) {
+      if (bigstack_alloc_c(max_neg_match_len * neg_match_ct, &sorted_neg_match)) {
         goto filter_attrib_ret_NOMEM;
       }
     }
@@ -479,19 +477,17 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
       }
     }
   }
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto filter_attrib_ret_NOMEM;
   }
-  if (gzopen_checked(&gz_infile, fname, "rb")) {
-    goto filter_attrib_ret_OPEN_FAIL;
-  }
-  if (gzbuffer(gz_infile, 131072)) {
-    goto filter_attrib_ret_NOMEM;
+  retval = gzopen_read_checked(fname, &gz_infile);
+  if (retval) {
+    goto filter_attrib_ret_1;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (1) {
     line_idx++;
@@ -503,7 +499,7 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
     }
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR" of --attrib file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR" of --attrib file is pathologically long.\n", line_idx);
         goto filter_attrib_ret_INVALID_FORMAT_2;
       }
       goto filter_attrib_ret_NOMEM;
@@ -524,7 +520,7 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
       LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --attrib file.\n", bufptr);
       goto filter_attrib_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, item_uidx);
+    set_bit(item_uidx, already_seen);
     pos_match_needed = pos_match_ct;
     while (!is_eoln_kns(*cond_ptr)) {
       bufptr2 = cond_ptr;
@@ -545,7 +541,7 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
     if (pos_match_needed) {
       continue;
     }
-    clear_bit(exclude_arr_new, item_uidx);
+    clear_bit(item_uidx, exclude_arr_new);
     include_ct++;
   }
   if ((!include_ct) && (!allow_no_variants)) {
@@ -560,9 +556,6 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
   filter_attrib_ret_NOMEM:
     retval = RET_NOMEM;
     break;
-  filter_attrib_ret_OPEN_FAIL:
-    retval = RET_OPEN_FAIL;
-    break;
   filter_attrib_ret_READ_FAIL:
     retval = RET_READ_FAIL;
     break;
@@ -577,7 +570,7 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
     break;
   }
  filter_attrib_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   gzclose_cond(gz_infile);
   return retval;
 }
@@ -586,9 +579,9 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
   // re-merge this with filter_attrib() after making sample ID lookup
   // hash-based
   gzFile gz_infile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t include_ct = 0;
-  uintptr_t unfiltered_ctl = (unfiltered_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_ctl = BITCT_TO_WORDCT(unfiltered_ct);
   char* sorted_pos_match = NULL;
   char* sorted_neg_match = NULL;
   char* id_buf = NULL;
@@ -613,13 +606,12 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
   uint32_t pos_match_needed;
   int32_t sorted_idx;
 
-  if (wkspace_alloc_ul_checked(&exclude_arr_new, unfiltered_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&already_seen, unfiltered_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_c_checked(&id_buf, max_id_len)) { 
+  if (bigstack_alloc_ul(unfiltered_ctl, &exclude_arr_new) ||
+      bigstack_calloc_ul(unfiltered_ctl, &already_seen) ||
+      bigstack_alloc_c(max_id_len, &id_buf)) { 
     goto filter_attrib_sample_ret_NOMEM;
   }
-  fill_all_bits(exclude_arr_new, unfiltered_ct);
-  fill_ulong_zero(already_seen, unfiltered_ctl);
+  fill_all_bits(unfiltered_ct, exclude_arr_new);
   if (condition_str) {
     // allow NULL condition_str; this means all samples/variants named in the
     // file are included
@@ -659,12 +651,12 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
       is_neg = 0;
     }
     if (pos_match_ct) {
-      if (wkspace_alloc_c_checked(&sorted_pos_match, max_pos_match_len * pos_match_ct)) {
+      if (bigstack_alloc_c(max_pos_match_len * pos_match_ct, &sorted_pos_match)) {
 	goto filter_attrib_sample_ret_NOMEM;
       }
     }
     if (neg_match_ct) {
-      if (wkspace_alloc_c_checked(&sorted_neg_match, max_neg_match_len * neg_match_ct)) {
+      if (bigstack_alloc_c(max_neg_match_len * neg_match_ct, &sorted_neg_match)) {
         goto filter_attrib_sample_ret_NOMEM;
       }
     }
@@ -714,19 +706,17 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
       }
     }
   }
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto filter_attrib_sample_ret_NOMEM;
   }
-  if (gzopen_checked(&gz_infile, fname, "rb")) {
-    goto filter_attrib_sample_ret_OPEN_FAIL;
-  }
-  if (gzbuffer(gz_infile, 131072)) {
-    goto filter_attrib_sample_ret_NOMEM;
+  retval = gzopen_read_checked(fname, &gz_infile);
+  if (retval) {
+    goto filter_attrib_sample_ret_1;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (1) {
     line_idx++;
@@ -738,7 +728,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
     }
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR" of --attrib-indiv file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR" of --attrib-indiv file is pathologically long.\n", line_idx);
         goto filter_attrib_sample_ret_INVALID_FORMAT_2;
       }
       goto filter_attrib_sample_ret_NOMEM;
@@ -747,8 +737,8 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(id_buf, sorted_ids, max_id_len, sorted_ids_ct, bufptr, &cond_ptr, &sorted_idx)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --attrib-indiv file has fewer tokens than\nexpected.\n", line_idx);
+    if (bsearch_read_fam_indiv(bufptr, sorted_ids, max_id_len, sorted_ids_ct, &cond_ptr, &sorted_idx, id_buf)) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --attrib-indiv file has fewer tokens than\nexpected.\n", line_idx);
       goto filter_attrib_sample_ret_INVALID_FORMAT_2;
     }
     if (sorted_idx == -1) {
@@ -759,7 +749,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
       LOGPREPRINTFWW("Error: Duplicate sample ID '%s' in --attrib-indiv file.\n", id_buf);
       goto filter_attrib_sample_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, sorted_idx);
+    set_bit(sorted_idx, already_seen);
     unfiltered_idx = id_map[(uint32_t)sorted_idx];
     if (is_set(exclude_arr, unfiltered_idx)) {
       // bugfix: don't proceed here
@@ -785,7 +775,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
     if (pos_match_needed) {
       continue;
     }
-    clear_bit(exclude_arr_new, unfiltered_idx);
+    clear_bit(unfiltered_idx, exclude_arr_new);
     include_ct++;
   }
   if ((!include_ct) && (!allow_no_samples)) {
@@ -801,9 +791,6 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
   filter_attrib_sample_ret_NOMEM:
     retval = RET_NOMEM;
     break;
-  filter_attrib_sample_ret_OPEN_FAIL:
-    retval = RET_OPEN_FAIL;
-    break;
   filter_attrib_sample_ret_READ_FAIL:
     retval = RET_READ_FAIL;
     break;
@@ -818,15 +805,15 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
     break;
   }
  filter_attrib_sample_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   gzclose_cond(gz_infile);
   return retval;
 }
 
 int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh, double qual_max_thresh, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t allow_no_variants, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t miss_ct = 0;
   uint32_t varid_first = (qual_filter->colid < qual_filter->colx);
   char skipchar = qual_filter->skipchar;
@@ -845,15 +832,14 @@ int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh,
   uint32_t marker_uidx;
   int32_t retval;
   char cc;
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen) ||
+      bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude_orig)) {
     goto filter_qual_scores_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
   memcpy(marker_exclude_orig, marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t));
 
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
+  loadbuf = (char*)g_bigstack_base;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   }
@@ -876,7 +862,7 @@ int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh,
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --qual-scores file is pathologically long.\n", line_idx);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --qual-scores file is pathologically long.\n", line_idx);
         goto filter_qual_scores_ret_INVALID_FORMAT_2;
       } else {
 	goto filter_qual_scores_ret_NOMEM;
@@ -911,9 +897,9 @@ int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh,
       LOGPREPRINTFWW("Error: Duplicate variant '%s' in --qual-scores file.\n", colid_ptr);
       goto filter_qual_scores_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, marker_uidx);
+    set_bit(marker_uidx, already_seen);
     if (scan_double(colx_ptr, &dxx) || (dxx < qual_min_thresh) || (dxx > qual_max_thresh)) {
-      set_bit(marker_exclude, marker_uidx);
+      set_bit(marker_uidx, marker_exclude);
     }
   }
   if (!feof(infile)) {
@@ -927,9 +913,9 @@ int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh,
     goto filter_qual_scores_ret_1;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--qual-scores: %" PRIuPTR " variant%s remaining, %" PRIuPTR " ID%s missing.\n", marker_ct, (marker_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--qual-scores: %" PRIuPTR " variant%s remaining, %" PRIuPTR " ID%s missing.\n", marker_ct, (marker_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--qual-scores: %" PRIuPTR " variant%s remaining.\n", marker_ct, (marker_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--qual-scores: %" PRIuPTR " variant%s remaining.\n", marker_ct, (marker_ct == 1)? "" : "s");
   }
   logprintb();
   while (0) {
@@ -940,14 +926,14 @@ int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh,
     retval = RET_READ_FAIL;
     break;
   filter_qual_scores_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --qual-scores file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --qual-scores file has fewer tokens than expected.\n", line_idx);
   filter_qual_scores_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
     break;
   }
  filter_qual_scores_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   return retval;
 }
@@ -964,8 +950,8 @@ uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_
     marker_uidx_stop = next_set(marker_exclude, marker_uidx, unfiltered_marker_ct);
     markers_done += marker_uidx_stop - marker_uidx;
     do {
-      if (sfmt_genrand_uint32(&sfmt) >= uint32_thresh) {
-	SET_BIT(marker_exclude, marker_uidx);
+      if (sfmt_genrand_uint32(&g_sfmt) >= uint32_thresh) {
+	SET_BIT(marker_uidx, marker_exclude);
 	removed_ct++;
       }
     } while (++marker_uidx < marker_uidx_stop);
@@ -980,10 +966,10 @@ uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_
 }
 
 int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t marker_ct = unfiltered_marker_ct - *marker_exclude_ct_ptr;
   uint32_t marker_uidx = 0;
-  uintptr_t marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
   int32_t retval = 0;
   uintptr_t* perm_buf;
   uint32_t marker_idx;
@@ -992,7 +978,7 @@ int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marke
     goto random_thin_markers_ct_ret_INVALID_CMDLINE;
   }
   if (marker_ct > 1) {
-    if (wkspace_alloc_ul_checked(&perm_buf, marker_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(marker_ctl, &perm_buf)) {
       goto random_thin_markers_ct_ret_NOMEM;
     }
     // no actual interleaving here, but may as well use this function
@@ -1002,12 +988,12 @@ int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marke
     for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
       next_unset_unsafe_ck(marker_exclude, &marker_uidx);
       if (is_set(perm_buf, marker_idx)) {
-	set_bit(marker_exclude, marker_uidx);
+	set_bit(marker_uidx, marker_exclude);
       }
     }
   } else if ((!thin_keep_ct) && marker_ct) {
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
-    set_bit(marker_exclude, marker_uidx);
+    set_bit(marker_uidx, marker_exclude);
   }
   LOGPRINTF("--thin-count: %u variant%s removed (%u remaining).\n", marker_ct - thin_keep_ct, (marker_ct - thin_keep_ct == 1)? "" : "s", thin_keep_ct);
   *marker_exclude_ct_ptr = unfiltered_marker_ct - thin_keep_ct;
@@ -1019,7 +1005,7 @@ int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marke
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -1035,8 +1021,8 @@ uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_
     sample_uidx_stop = next_set(sample_exclude, sample_uidx, unfiltered_sample_ct);
     samples_done += sample_uidx_stop - sample_uidx;
     do {
-      if(sfmt_genrand_uint32(&sfmt) >= uint32_thresh) {
-        SET_BIT(sample_exclude, sample_uidx);
+      if(sfmt_genrand_uint32(&g_sfmt) >= uint32_thresh) {
+        SET_BIT(sample_uidx, sample_exclude);
         removed_ct++;
       }
     } while (++sample_uidx < sample_uidx_stop);
@@ -1051,10 +1037,10 @@ uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_
 }
 
 int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t sample_ct = unfiltered_sample_ct - *sample_exclude_ct_ptr;
   uint32_t sample_uidx = 0;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   int32_t retval = 0;
   uintptr_t* perm_buf;
   uint32_t sample_idx;
@@ -1062,7 +1048,7 @@ int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sampl
     LOGERRPRINTF("Error: --thin-indiv-count parameter exceeds number of remaining %s.\n", g_species_plural);
     goto random_thin_samples_ct_ret_INVALID_CMDLINE;
   }
-  if (wkspace_alloc_ul_checked(&perm_buf, sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(sample_ctl, &perm_buf)) {
     goto random_thin_samples_ct_ret_NOMEM;
   }
 
@@ -1071,7 +1057,7 @@ int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sampl
   for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
     next_unset_unsafe_ck(sample_exclude, &sample_uidx);
     if (is_set(perm_buf, sample_idx)) {
-      set_bit(sample_exclude, sample_uidx);
+      set_bit(sample_uidx, sample_exclude);
     }
   }
   LOGPRINTF("--thin-indiv-count: %u %s removed (%u remaining).\n", sample_ct - thin_keep_ct, (sample_ct - thin_keep_ct == 1)? g_species_singular : g_species_plural, thin_keep_ct);
@@ -1084,7 +1070,7 @@ int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sampl
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -1094,16 +1080,16 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
   // 2. load marker file, sort by uidx
   // 3. check for early exit (no clusters and/or no .zero entries)
   // 4. scan through .bed sequentially, update oblig_missing_..._cts
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
-  char* idbuf = &(tbuf[MAXLINELEN]);
+  char* idbuf = &(g_textbuf[MAXLINELEN]);
   Ll_str* cluster_names = NULL;
   uint64_t tot_missing = 0;
   uintptr_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + BITCT2 - 1) / BITCT2;
-  uintptr_t sorted_sample_ctl = (sorted_sample_ct + BITCT - 1) / BITCT;
-  uintptr_t topsize = 0;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sorted_sample_ctl = BITCT_TO_WORDCT(sorted_sample_ct);
   uintptr_t max_cluster_id_len = 0;
   uintptr_t possible_distinct_ct = 0;
   uintptr_t missing_cluster_ct = 0;
@@ -1113,7 +1099,7 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
   int32_t y_code = chrom_info_ptr->y_code;
   uint32_t y_present = ((y_code != -1) && is_set(chrom_info_ptr->chrom_mask, y_code));
   int32_t retval = 0;
-  Ll_str* llptr;
+  Ll_str* ll_ptr;
   uintptr_t* loadbuf;
   uintptr_t* loadbuf_end;
   uintptr_t* cluster_zmask2s;
@@ -1130,7 +1116,6 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
   char* bufptr2;
   int64_t* zc_entries;
   int64_t* zc_entries_end;
-  int64_t* wkspace_end;
   uintptr_t cluster_ct;
   uintptr_t cluster_mct; // doubled if Y chrom present
   uintptr_t marker_uidx;
@@ -1145,29 +1130,29 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
     y_start = chrom_info_ptr->chrom_start[(uint32_t)y_code];
     y_end = chrom_info_ptr->chrom_end[(uint32_t)y_code];
   }
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf)) {
     goto load_oblig_missing_ret_NOMEM;
   }
   loadbuf_end = &(loadbuf[unfiltered_sample_ctl2]);
-  if (fopen_checked(&infile, om_ip->sample_fname, "r")) {
+  if (fopen_checked(om_ip->sample_fname, "r", &infile)) {
     goto load_oblig_missing_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
 
   // two-pass load, same as load_clusters()
   // use loadbuf as duplicate IID detector
   fill_ulong_zero(loadbuf, sorted_sample_ctl);
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, om_ip->sample_fname);
       goto load_oblig_missing_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(idbuf, sorted_sample_ids, max_sample_id_len, sorted_sample_ct, bufptr, &bufptr2, &ii)) {
+    if (bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sorted_sample_ct, &bufptr2, &ii, idbuf)) {
       LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, om_ip->sample_fname);
       goto load_oblig_missing_ret_INVALID_FORMAT_2;
     }
@@ -1177,20 +1162,19 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
         LOGPREPRINTFWW("Error: Duplicate sample ID '%s' in %s.\n", idbuf, om_ip->sample_fname);
 	goto load_oblig_missing_ret_INVALID_FORMAT_2;
       }
-      set_bit(loadbuf, ii);
+      set_bit(ii, loadbuf);
       slen = strlen_se(bufptr2);
       if (slen >= max_cluster_id_len) {
 	max_cluster_id_len = slen + 1;
       }
       bufptr2[slen] = '\0';
       if ((!cluster_names) || (strcmp(cluster_names->ss, bufptr2) && ((!cluster_names->next) || strcmp(cluster_names->next->ss, bufptr2)))) {
-	llptr = top_alloc_llstr(&topsize, slen + 1);
-	if (!llptr) {
+	if (bigstack_end_alloc_llstr(slen + 1, &ll_ptr)) {
 	  goto load_oblig_missing_ret_NOMEM;
 	}
-	llptr->next = cluster_names;
-	memcpy(llptr->ss, bufptr2, slen + 1);
-	cluster_names = llptr;
+	ll_ptr->next = cluster_names;
+	memcpy(ll_ptr->ss, bufptr2, slen + 1);
+	cluster_names = ll_ptr;
 	possible_distinct_ct++;
       }
     }
@@ -1202,41 +1186,37 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
     LOGERRPRINTFWW("Warning: --oblig-missing ignored, since no valid blocks were defined in %s.\n", om_ip->sample_fname);
     goto load_oblig_missing_ret_1;
   }
-  wkspace_left -= topsize;
-  if (wkspace_alloc_c_checked(&cluster_ids, possible_distinct_ct * max_cluster_id_len)) {
-    goto load_oblig_missing_ret_NOMEM2;
+  if (bigstack_alloc_c(possible_distinct_ct * max_cluster_id_len, &cluster_ids)) {
+    goto load_oblig_missing_ret_NOMEM;
   }
   for (ulii = 0; ulii < possible_distinct_ct; ulii++) {
     strcpy(&(cluster_ids[ulii * max_cluster_id_len]), cluster_names->ss);
     cluster_names = cluster_names->next;
   }
-  wkspace_left += topsize;
-  topsize = 0;
+  bigstack_end_reset(bigstack_end_mark);
   qsort(cluster_ids, possible_distinct_ct, max_cluster_id_len, strcmp_casted);
   cluster_ct = collapse_duplicate_ids(cluster_ids, possible_distinct_ct, max_cluster_id_len, NULL);
-  wkspace_shrink_top(cluster_ids, cluster_ct * max_cluster_id_len);
+  bigstack_shrink_top(cluster_ids, cluster_ct * max_cluster_id_len);
   cluster_mct = cluster_ct * (y_present + 1);
   sample_lookup = (uint32_t*)malloc(unfiltered_sample_ct * sizeof(int32_t));
   if (!sample_lookup) {
     goto load_oblig_missing_ret_NOMEM;
   }
   om_ip->sample_lookup = sample_lookup;
-  if (wkspace_alloc_ui_checked(&cluster_sizes, cluster_mct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&cluster_zmask2s, cluster_mct * unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_calloc_ui(cluster_mct, &cluster_sizes) ||
+      bigstack_calloc_ul(cluster_mct * unfiltered_sample_ctl2, &cluster_zmask2s)) {
     goto load_oblig_missing_ret_NOMEM;
   }
-  fill_uint_zero(cluster_sizes, cluster_mct);
   fill_uint_one(sample_lookup, unfiltered_sample_ct);
-  fill_ulong_zero(cluster_zmask2s, cluster_mct * unfiltered_sample_ctl2);
 
   // second pass
   rewind(infile);
-  while (fgets(tbuf, MAXLINELEN, infile)) {
-    bufptr = skip_initial_spaces(tbuf);
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    bsearch_read_fam_indiv(idbuf, sorted_sample_ids, max_sample_id_len, sorted_sample_ct, bufptr, &bufptr2, &ii);
+    bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sorted_sample_ct, &bufptr2, &ii, idbuf);
     if (ii == -1) {
       continue;
     }
@@ -1244,7 +1224,7 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
     slen = strlen_se(bufptr2);
     // guaranteed to succeed
     ii = bsearch_str(bufptr2, slen, cluster_ids, max_cluster_id_len, cluster_ct);
-    set_bit(&(cluster_zmask2s[((uintptr_t)((uint32_t)ii)) * unfiltered_sample_ctl2]), sample_uidx * 2);
+    set_bit(sample_uidx * 2, &(cluster_zmask2s[((uintptr_t)((uint32_t)ii)) * unfiltered_sample_ctl2]));
     cluster_sizes[(uint32_t)ii] += 1;
     sample_lookup[sample_uidx] = (uint32_t)ii;
   }
@@ -1252,7 +1232,7 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
     goto load_oblig_missing_ret_READ_FAIL;
   }
   if (y_present) {
-    vec_include_init(unfiltered_sample_ct, loadbuf, sex_male);
+    init_quaterarr_from_bitarr(sex_male, unfiltered_sample_ct, loadbuf);
     cur_cluster_zmask2 = cluster_zmask2s;
     ulptr = &(cur_cluster_zmask2[cluster_ct * unfiltered_sample_ctl2]);
     for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
@@ -1272,24 +1252,23 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
   }
   om_ip->cluster_ref_cts = cluster_ref_cts;
   fill_uint_zero(cluster_ref_cts, cluster_ct * 2);
-  retval = sort_item_ids(&sorted_marker_ids, &marker_id_map, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref);
+  retval = sort_item_ids(unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, &sorted_marker_ids, &marker_id_map);
   if (retval) {
     goto load_oblig_missing_ret_1;
   }
-  zc_entries = (int64_t*)wkspace_base;
+  zc_entries = (int64_t*)g_bigstack_base;
   zc_entries_end = zc_entries;
-  wkspace_end = (int64_t*)(&(wkspace_base[wkspace_left]));
-  if (fopen_checked(&infile, om_ip->marker_fname, "r")) {
+  if (fopen_checked(om_ip->marker_fname, "r", &infile)) {
     goto load_oblig_missing_ret_OPEN_FAIL;
   }
   line_idx = 0;
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, om_ip->marker_fname);
       goto load_oblig_missing_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -1305,7 +1284,7 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
       slen = strlen_se(bufptr);
       ii = bsearch_str(bufptr, slen, cluster_ids, max_cluster_id_len, cluster_ct);
       if (ii != -1) {
-	if (zc_entries_end == wkspace_end) {
+	if (((unsigned char*)zc_entries_end) == g_bigstack_end) {
           goto load_oblig_missing_ret_NOMEM;
 	}
 	cluster_idx = (uint32_t)ii;
@@ -1353,7 +1332,7 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
       if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
 	goto load_oblig_missing_ret_READ_FAIL;
       }
-      if (load_raw(bedfile, loadbuf, unfiltered_sample_ct4)) {
+      if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf)) {
 	goto load_oblig_missing_ret_READ_FAIL;
       }
       // no need for het haploid handling here
@@ -1372,8 +1351,6 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
   } while (zc_entries < zc_entries_end);
   LOGPRINTF("--oblig-missing: %" PRIu64 " call%s confirmed missing.\n", tot_missing, (tot_missing == 1)? "" : "s");
   while (0) {
-  load_oblig_missing_ret_NOMEM2:
-    wkspace_left += topsize;
   load_oblig_missing_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -1389,15 +1366,15 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
     break;
   }
  load_oblig_missing_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(infile);
   return retval;
 }
 
 int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t sorted_ids_len, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* filtervals_flattened, uint32_t mfilter_col, uint32_t allow_no_samples) {
   FILE* infile = NULL;
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t include_ct = 0;
   uintptr_t max_filterval_len = 0;
   uintptr_t line_idx = 0;
@@ -1410,12 +1387,12 @@ int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t
   uint32_t filterval_idx;
   uint32_t slen;
   int32_t sample_idx;
-  if (wkspace_alloc_c_checked(&id_buf, max_sample_id_len) ||
-      wkspace_alloc_ul_checked(&sample_exclude_new, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_c_checked(&sorted_filtervals, filterval_ct * max_filterval_len)) {
+  if (bigstack_alloc_c(max_sample_id_len, &id_buf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, &sample_exclude_new) ||
+      bigstack_alloc_c(filterval_ct * max_filterval_len, &sorted_filtervals)) {
     goto filter_samples_file_ret_NOMEM;
   }
-  fill_all_bits(sample_exclude_new, unfiltered_sample_ct);
+  fill_all_bits(unfiltered_sample_ct, sample_exclude_new);
   bufptr = filtervals_flattened;
   for (filterval_idx = 0; filterval_idx < filterval_ct; filterval_idx++) {
     slen = strlen(bufptr) + 1;
@@ -1424,21 +1401,21 @@ int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t
   }
   qsort(sorted_filtervals, filterval_ct, max_filterval_len, strcmp_casted);
 
-  if (fopen_checked(&infile, filtername, "r")) {
+  if (fopen_checked(filtername, "r", &infile)) {
     goto filter_samples_file_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --filter file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --filter file is pathologically long.\n", line_idx);
       goto filter_samples_file_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(id_buf, sorted_sample_ids, max_sample_id_len, sorted_ids_len, bufptr, &bufptr, &sample_idx)) {
+    if (bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sorted_ids_len, &bufptr, &sample_idx, id_buf)) {
       goto filter_samples_file_ret_MISSING_TOKENS;
     }
     if (sample_idx != -1) {
@@ -1452,7 +1429,7 @@ int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t
 	}
 	if (bsearch_str(bufptr, strlen_se(bufptr), sorted_filtervals, max_filterval_len, filterval_ct) != -1) {
 	  if (is_set(sample_exclude_new, sample_idx)) {
-	    clear_bit(sample_exclude_new, sample_idx);
+	    clear_bit(sample_idx, sample_exclude_new);
 	    include_ct++;
 	  }
 	}
@@ -1481,7 +1458,7 @@ int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t
     retval = RET_READ_FAIL;
     break;
   filter_samples_file_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --filter file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --filter file has fewer tokens than expected.\n", line_idx);
   filter_samples_file_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
@@ -1490,7 +1467,7 @@ int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t
     retval = RET_ALL_SAMPLES_EXCLUDED;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   return retval;
 }
@@ -1498,7 +1475,7 @@ int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t
 void filter_samples_bitfields(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, uintptr_t* orfield, int32_t orfield_flip, uintptr_t* ornot) {
   // sample_exclude := sample_exclude | orfield | (~ornot) if !orfield_flip
   //                := sample_exclude | (~orfield) | (~ornot) otherwise
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t* ieptr = sample_exclude;
   uintptr_t* ieend = &(sample_exclude[unfiltered_sample_ctl]);
   if (orfield_flip) {
@@ -1522,7 +1499,7 @@ void filter_samples_bitfields(uintptr_t unfiltered_sample_ct, uintptr_t* sample_
       } while (++ieptr < ieend);
     }
   }
-  zero_trailing_bits(sample_exclude, unfiltered_sample_ct);
+  zero_trailing_bits(unfiltered_sample_ct, sample_exclude);
   *sample_exclude_ct_ptr = popcount_longs(sample_exclude, unfiltered_sample_ctl);
 }
 
@@ -1532,12 +1509,12 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
   if (!sample_ct) {
     return 0;
   }
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uint32_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
   uintptr_t marker_idx = 0;
   uintptr_t y_start = 0;
@@ -1570,19 +1547,18 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
   if (y_present) {
     y_start = chrom_info_ptr->chrom_start[(uint32_t)y_code];
     y_end = chrom_info_ptr->chrom_end[(uint32_t)y_code];
-    if (wkspace_alloc_ul_checked(&sample_male_include2, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl2, &sample_male_include2)) {
       goto mind_filter_ret_NOMEM;
     }
-    vec_include_init(unfiltered_sample_ct, sample_male_include2, sex_male);
+    init_quaterarr_from_bitarr(sex_male, unfiltered_sample_ct, sample_male_include2);
     nony_marker_ct = marker_ct - (y_end - y_start - popcount_bit_idx(marker_exclude, y_start, y_end));
   }
-  if (wkspace_alloc_ui_checked(&missing_cts, unfiltered_sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&newly_excluded, unfiltered_sample_ctl * sizeof(int32_t))) {
+  if (bigstack_calloc_ui(unfiltered_sample_ct, &missing_cts) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, &newly_excluded)) {
     goto mind_filter_ret_NOMEM;
   }
   loadbuf[unfiltered_sample_ctl2 - 1] = 0;
-  fill_uint_zero(missing_cts, unfiltered_sample_ct);
   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
     goto mind_filter_ret_READ_FAIL;
   }
@@ -1595,10 +1571,10 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
 	goto mind_filter_ret_READ_FAIL;
       }
     }
-    if (load_raw2(bedfile, loadbuf, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+    if (load_raw2(unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask, bedfile, loadbuf)) {
       goto mind_filter_ret_READ_FAIL;
     }
-    // er, why doesn't this use load_and_collapse?
+    // todo: switch to load_and_collapse()
     lptr = loadbuf;
     if ((marker_uidx >= y_end) || (marker_uidx < y_start)) {
       for (uii = 0; uii < ujj; uii += BITCT2) {
@@ -1632,7 +1608,7 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
       sample_idx += sample_uidx_stop - sample_uidx;
       do {
 	if (missing_cts[sample_uidx] > mind_int_thresh[is_set(sex_male, sample_uidx)]) {
-	  SET_BIT(newly_excluded, sample_uidx);
+	  SET_BIT(sample_uidx, newly_excluded);
 	  removed_ct++;
 	}
       } while (++sample_uidx < sample_uidx_stop);
@@ -1660,16 +1636,16 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
 	  }
 	}
 	if ((missing_cts[sample_uidx] - ujj) > (uint32_t)((int32_t)(mind_thresh * ((int32_t)(cur_marker_ct - ujj)) * (1 + SMALL_EPSILON)))) {
-	  SET_BIT(newly_excluded, sample_uidx);
+	  SET_BIT(sample_uidx, newly_excluded);
 	  removed_ct++;
 	}
       } while (++sample_uidx < sample_uidx_stop);
     } while (sample_idx < sample_ct);
   }
   if (removed_ct) {
-    bitfield_or(sample_exclude, newly_excluded, unfiltered_sample_ctl);
+    bitvec_or(newly_excluded, unfiltered_sample_ctl, sample_exclude);
     memcpy(outname_end, ".irem", 6);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto mind_filter_ret_OPEN_FAIL;
     }
     sample_uidx = 0;
@@ -1709,7 +1685,7 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
     retval = RET_ALL_SAMPLES_EXCLUDED;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
@@ -1726,8 +1702,8 @@ void freq_hwe_haploid_count_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp,
   __m128i to_ct_hmaj1;
   __m128i to_ct_nm2;
   __m128i to_ct_hmaj2;
-  __uni16 acc_nm;
-  __uni16 acc_hmaj;
+  __univec acc_nm;
+  __univec acc_hmaj;
 
   acc_nm.vi = _mm_setzero_si128();
   acc_hmaj.vi = _mm_setzero_si128();
@@ -1979,13 +1955,13 @@ static inline void single_marker_freqs_and_hwe(uintptr_t unfiltered_sample_ctl2,
   while (unfiltered_sample_ctl2 >= 120) {
   single_marker_freqs_and_hwe_loop:
     lptr_12x_end = &(lptr[cur_decr]);
-    count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)sample_include2, &tot_a, &tot_b, &tot_c);
-    count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)founder_include2, &tot_a_f, &tot_b_f, &tot_c_f);
+    count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)sample_include2, &tot_a, &tot_b, &tot_c);
+    count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)founder_include2, &tot_a_f, &tot_b_f, &tot_c_f);
     if (hwe_or_geno_needed) {
-      count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)founder_ctrl_include2, &tot_a_hwe, &tot_b_hwe, &tot_c_hwe);
+      count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)founder_ctrl_include2, &tot_a_hwe, &tot_b_hwe, &tot_c_hwe);
       founder_ctrl_include2 = &(founder_ctrl_include2[cur_decr]);
       if (hardy_needed) {
-	count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)founder_case_include2, &tot_a_chwe, &tot_b_chwe, &tot_c_chwe);
+	count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)founder_case_include2, &tot_a_chwe, &tot_b_chwe, &tot_c_chwe);
 	founder_case_include2 = &(founder_case_include2[cur_decr]);
       }
     }
@@ -2001,13 +1977,13 @@ static inline void single_marker_freqs_and_hwe(uintptr_t unfiltered_sample_ctl2,
 #else
   uintptr_t* lptr_twelve_end = &(lptr[unfiltered_sample_ctl2 - unfiltered_sample_ctl2 % 12]);
   while (lptr < lptr_twelve_end) {
-    count_3freq_12(lptr, sample_include2, &tot_a, &tot_b, &tot_c);
-    count_3freq_12(lptr, founder_include2, &tot_a_f, &tot_b_f, &tot_c_f);
+    count_3freq_48b(lptr, sample_include2, &tot_a, &tot_b, &tot_c);
+    count_3freq_48b(lptr, founder_include2, &tot_a_f, &tot_b_f, &tot_c_f);
     if (hwe_or_geno_needed) {
-      count_3freq_12(lptr, founder_ctrl_include2, &tot_a_hwe, &tot_b_hwe, &tot_c_hwe);
+      count_3freq_48b(lptr, founder_ctrl_include2, &tot_a_hwe, &tot_b_hwe, &tot_c_hwe);
       founder_ctrl_include2 = &(founder_ctrl_include2[12]);
       if (hardy_needed) {
-	count_3freq_12(lptr, founder_case_include2, &tot_a_chwe, &tot_b_chwe, &tot_c_chwe);
+	count_3freq_48b(lptr, founder_case_include2, &tot_a_chwe, &tot_b_chwe, &tot_c_chwe);
 	founder_case_include2 = &(founder_case_include2[12]);
       }
     }
@@ -2117,7 +2093,7 @@ static inline void haploid_single_marker_freqs(uintptr_t unfiltered_sample_ct, u
   //   popcount(B) = het ct + homozyg major ct
   //   popcount(A) = missing_ct + homozyg major ct
   //               = sample_ct - homozyg minor ct - het ct
-    count_3freq_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)sample_include2, &tot_a, &tot_b, &tot_hmaj);
+    count_3freq_1920b((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)sample_include2, &tot_a, &tot_b, &tot_hmaj);
     freq_hwe_haploid_count_120v((__m128i*)lptr, (__m128i*)lptr_12x_end, (__m128i*)founder_include2, &tot_nm_f, &tot_hmaj_f);
     lptr = lptr_12x_end;
     sample_include2 = &(sample_include2[cur_decr]);
@@ -2131,7 +2107,7 @@ static inline void haploid_single_marker_freqs(uintptr_t unfiltered_sample_ct, u
 #else
   uintptr_t* lptr_twelve_end = &(lptr[unfiltered_sample_ctl2 - unfiltered_sample_ctl2 % 12]);
   while (lptr < lptr_twelve_end) {
-    count_3freq_12(lptr, sample_include2, &tot_a, &tot_b, &tot_hmaj);
+    count_3freq_48b(lptr, sample_include2, &tot_a, &tot_b, &tot_hmaj);
     freq_hwe_haploid_count_12(lptr, founder_include2, &tot_nm_f, &tot_hmaj_f);
     lptr = &(lptr[12]);
     sample_include2 = &(sample_include2[12]);
@@ -2164,12 +2140,12 @@ static inline void haploid_single_marker_freqs(uintptr_t unfiltered_sample_ct, u
   *hethap_incr_ptr = hethap_incr;
 }
 
-int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
+int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
   FILE* hhfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + BITCT - 1) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctv2 = 2 * unfiltered_sample_ctl;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   int32_t retval = 0;
   uint32_t pct = 1;
   uint32_t sample_ct = unfiltered_sample_ct - sample_exclude_ct;
@@ -2182,7 +2158,7 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
   uint32_t sample_f_ct = sample_ct;
   uintptr_t sample_f_ctrl_ct = sample_ct;
   uintptr_t sample_f_case_ct = sample_ct;
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uint32_t ll_hwe = 0;
   uint32_t lh_hwe = 0;
   uint32_t hh_hwe = 0;
@@ -2257,20 +2233,20 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
   double maf;
   double cur_genotyping_rate;
   if (!hwe_needed) {
-    *hwe_lls_ptr = (int32_t*)wkspace_base;
+    *hwe_lls_ptr = (int32_t*)g_bigstack_base;
   } else {
-    if (wkspace_alloc_i_checked(&hwe_lls, unfiltered_marker_ct * sizeof(int32_t)) ||
-	wkspace_alloc_i_checked(&hwe_lhs, unfiltered_marker_ct * sizeof(int32_t)) ||
-	wkspace_alloc_i_checked(&hwe_hhs, unfiltered_marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_i(unfiltered_marker_ct, &hwe_lls) ||
+	bigstack_alloc_i(unfiltered_marker_ct, &hwe_lhs) ||
+	bigstack_alloc_i(unfiltered_marker_ct, &hwe_hhs)) {
       goto calc_freqs_and_hwe_ret_NOMEM;
     }
     *hwe_lls_ptr = hwe_lls;
     *hwe_lhs_ptr = hwe_lhs;
     *hwe_hhs_ptr = hwe_hhs;
     if (hardy_needed) {
-      if (wkspace_alloc_i_checked(&hwe_ll_cases, unfiltered_marker_ct * sizeof(int32_t)) ||
-          wkspace_alloc_i_checked(&hwe_lh_cases, unfiltered_marker_ct * sizeof(int32_t)) ||
-          wkspace_alloc_i_checked(&hwe_hh_cases, unfiltered_marker_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_i(unfiltered_marker_ct, &hwe_ll_cases) ||
+          bigstack_alloc_i(unfiltered_marker_ct, &hwe_lh_cases) ||
+          bigstack_alloc_i(unfiltered_marker_ct, &hwe_hh_cases)) {
 	goto calc_freqs_and_hwe_ret_NOMEM;
       }
     }
@@ -2278,11 +2254,11 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
     *hwe_lh_cases_ptr = hwe_lh_cases;
     *hwe_hh_cases_ptr = hwe_hh_cases;
   }
-  if (wkspace_alloc_i_checked(&hwe_ll_allfs, unfiltered_marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_i_checked(&hwe_lh_allfs, unfiltered_marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_i_checked(&hwe_hh_allfs, unfiltered_marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_i_checked(&hwe_hapl_allfs, unfiltered_marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_i_checked(&hwe_haph_allfs, unfiltered_marker_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_i(unfiltered_marker_ct, &hwe_ll_allfs) ||
+      bigstack_calloc_i(unfiltered_marker_ct, &hwe_lh_allfs) ||
+      bigstack_calloc_i(unfiltered_marker_ct, &hwe_hh_allfs) ||
+      bigstack_calloc_i(unfiltered_marker_ct, &hwe_hapl_allfs) ||
+      bigstack_calloc_i(unfiltered_marker_ct, &hwe_haph_allfs)) {
     goto calc_freqs_and_hwe_ret_NOMEM;
   }
   *hwe_ll_allfs_ptr = hwe_ll_allfs;
@@ -2295,44 +2271,37 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
     hwe_all = 1;
   }
 
-  fill_int_zero(hwe_ll_allfs, unfiltered_marker_ct);
-  fill_int_zero(hwe_lh_allfs, unfiltered_marker_ct);
-  fill_int_zero(hwe_hh_allfs, unfiltered_marker_ct);
-  fill_int_zero(hwe_hapl_allfs, unfiltered_marker_ct);
-  fill_int_zero(hwe_haph_allfs, unfiltered_marker_ct);
   if (geno_thresh < 1.0) {
-    if (wkspace_alloc_ul_checked(geno_excl_bitfield_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(unfiltered_marker_ctl, geno_excl_bitfield_ptr)) {
       goto calc_freqs_and_hwe_ret_NOMEM;
     }
     geno_excl_bitfield = *geno_excl_bitfield_ptr;
-    fill_ulong_zero(geno_excl_bitfield, unfiltered_marker_ctl);
     // change this to a minimum nonmissing rate
     geno_thresh = (1.0 - geno_thresh) * (1 - SMALL_EPSILON);
   }
   if ((min_ac > 0) || (max_ac < sample_ct)) {
-    if (wkspace_alloc_ul_checked(ac_excl_bitfield_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(unfiltered_marker_ctl, ac_excl_bitfield_ptr)) {
       goto calc_freqs_and_hwe_ret_NOMEM;
     }
     ac_excl_bitfield = *ac_excl_bitfield_ptr;
-    fill_ulong_zero(ac_excl_bitfield, unfiltered_marker_ctl);
   }
-  wkspace_mark = wkspace_base;
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&sample_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+  bigstack_mark = g_bigstack_base;
+  if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctv2, &sample_include2)) {
     goto calc_freqs_and_hwe_ret_NOMEM;
   }
   loadbuf[unfiltered_sample_ctv2 - 2] = 0;
   loadbuf[unfiltered_sample_ctv2 - 1] = 0;
-  exclude_to_vec_include(unfiltered_sample_ct, sample_include2, sample_exclude);
+  init_quaterarr_from_inverted_bitarr(sample_exclude, unfiltered_sample_ct, sample_include2);
   ii = chrom_info_ptr->x_code;
   nonmales_needed = (!is_split_chrom) && (ii != -1) && is_set(chrom_info_ptr->chrom_mask, ii);
   ii = chrom_info_ptr->y_code;
   males_needed = nonmales_needed || ((!is_split_chrom) && (ii != -1) && is_set(chrom_info_ptr->chrom_mask, ii));
-  if (wkspace_alloc_ul_checked(&sample_male_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctv2, &sample_male_include2)) {
     goto calc_freqs_and_hwe_ret_NOMEM;
   }
   memcpy(sample_male_include2, sample_include2, unfiltered_sample_ctv2 * sizeof(intptr_t));
-  vec_include_mask_in(unfiltered_sample_ct, sample_male_include2, sex_male);
+  apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, sample_male_include2);
   sample_male_ct = popcount01_longs(sample_male_include2, unfiltered_sample_ctv2);
   if (sample_male_ct) {
     male_ct_recip = 1.0 / ((double)((int32_t)sample_male_ct));
@@ -2342,11 +2311,11 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
   if (males_needed) {
     founder_male_include2 = sample_male_include2;
     if (nonmales_needed) {
-      if (wkspace_alloc_ul_checked(&sample_nonmale_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(unfiltered_sample_ctv2, &sample_nonmale_include2)) {
 	goto calc_freqs_and_hwe_ret_NOMEM;
       }
       memcpy(sample_nonmale_include2, sample_include2, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      vec_include_mask_out_intersect(unfiltered_sample_ct, sample_nonmale_include2, sex_nm, sex_male);
+      apply_excl_intersect_to_quaterarr_01(sex_nm, sex_male, unfiltered_sample_ct, sample_nonmale_include2);
       sample_nonmale_ct = popcount01_longs(sample_nonmale_include2, unfiltered_sample_ctv2);
       sample_f_nonmale_ct = sample_nonmale_ct;
       founder_nonmale_include2 = sample_nonmale_include2;
@@ -2354,30 +2323,30 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
     }
   }
   founder_include2 = sample_include2;
-  if (wkspace_alloc_ul_checked(&tmp_sample_excl_mask, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &tmp_sample_excl_mask)) {
     goto calc_freqs_and_hwe_ret_NOMEM;
   }
   memcpy(tmp_sample_excl_mask, sample_exclude, unfiltered_sample_ctl * sizeof(intptr_t));
   if (!nonfounders) {
-    if (wkspace_alloc_ul_checked(&founder_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctv2, &founder_include2)) {
       goto calc_freqs_and_hwe_ret_NOMEM;
     }
-    bitfield_ornot(tmp_sample_excl_mask, founder_info, unfiltered_sample_ctl);
-    zero_trailing_bits(tmp_sample_excl_mask, unfiltered_sample_ct);
-    exclude_to_vec_include(unfiltered_sample_ct, founder_include2, tmp_sample_excl_mask);
+    bitvec_ornot(founder_info, unfiltered_sample_ctl, tmp_sample_excl_mask);
+    zero_trailing_bits(unfiltered_sample_ct, tmp_sample_excl_mask);
+    init_quaterarr_from_inverted_bitarr(tmp_sample_excl_mask, unfiltered_sample_ct, founder_include2);
     if (males_needed) {
-      if (wkspace_alloc_ul_checked(&founder_male_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(unfiltered_sample_ctv2, &founder_male_include2)) {
 	goto calc_freqs_and_hwe_ret_NOMEM;
       }
       memcpy(founder_male_include2, sample_male_include2, unfiltered_sample_ctl * 2 * sizeof(intptr_t));
-      vec_include_mask_in(unfiltered_sample_ct, founder_male_include2, founder_info);
+      apply_bitarr_mask_to_quaterarr_01(founder_info, unfiltered_sample_ct, founder_male_include2);
       sample_f_male_ct = popcount01_longs(founder_male_include2, unfiltered_sample_ctv2);
       if (nonmales_needed) {
-	if (wkspace_alloc_ul_checked(&founder_nonmale_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+	if (bigstack_alloc_ul(unfiltered_sample_ctv2, &founder_nonmale_include2)) {
 	  goto calc_freqs_and_hwe_ret_NOMEM;
 	}
 	memcpy(founder_nonmale_include2, sample_nonmale_include2, unfiltered_sample_ctv2 * sizeof(intptr_t));
-	vec_include_mask_in(unfiltered_sample_ct, founder_nonmale_include2, founder_info);
+	apply_bitarr_mask_to_quaterarr_01(founder_info, unfiltered_sample_ct, founder_nonmale_include2);
 	sample_f_nonmale_ct = popcount01_longs(founder_nonmale_include2, unfiltered_sample_ctv2);
       }
     }
@@ -2392,41 +2361,41 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
   // founder_case_include2 properly if --hardy was used in a situation where
   // hwe_all would be set (e.g. all-case datasets).
   if ((!hwe_all) || hardy_needed) {
-    if (wkspace_alloc_ul_checked(&founder_ctrl_include2, unfiltered_sample_ctv2 *  sizeof(intptr_t)) ||
-	wkspace_alloc_ul_checked(&tmp_sample_excl_mask2, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctv2, &founder_ctrl_include2) ||
+	bigstack_alloc_ul(unfiltered_sample_ctl, &tmp_sample_excl_mask2)) {
       goto calc_freqs_and_hwe_ret_NOMEM;
     }
     memcpy(tmp_sample_excl_mask2, tmp_sample_excl_mask, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_ornot(tmp_sample_excl_mask2, pheno_nm, unfiltered_sample_ctl);
-    bitfield_or(tmp_sample_excl_mask2, pheno_c, unfiltered_sample_ctl);
-    zero_trailing_bits(tmp_sample_excl_mask2, unfiltered_sample_ct);
+    bitvec_ornot(pheno_nm, unfiltered_sample_ctl, tmp_sample_excl_mask2);
+    bitvec_or(pheno_c, unfiltered_sample_ctl, tmp_sample_excl_mask2);
+    zero_trailing_bits(unfiltered_sample_ct, tmp_sample_excl_mask2);
     // tmp_sample_excl_mask2 is now set for each sample who is excluded, or a
     // nonfounder, or is noncontrol.
     sample_f_ctrl_ct = unfiltered_sample_ct - popcount_longs(tmp_sample_excl_mask2, unfiltered_sample_ctl);
-    exclude_to_vec_include(unfiltered_sample_ct, founder_ctrl_include2, tmp_sample_excl_mask2);
+    init_quaterarr_from_inverted_bitarr(tmp_sample_excl_mask2, unfiltered_sample_ct, founder_ctrl_include2);
     if (nonmales_needed) {
-      if (wkspace_alloc_ul_checked(&founder_ctrl_nonmale_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(unfiltered_sample_ctv2, &founder_ctrl_nonmale_include2)) {
 	goto calc_freqs_and_hwe_ret_NOMEM;
       }
       memcpy(founder_ctrl_nonmale_include2, sample_nonmale_include2, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      vec_include_mask_out(unfiltered_sample_ct, founder_ctrl_nonmale_include2, tmp_sample_excl_mask2);
+      apply_bitarr_excl_to_quaterarr_01(tmp_sample_excl_mask2, unfiltered_sample_ct, founder_ctrl_nonmale_include2);
       sample_f_ctl_nonmale_ct = popcount01_longs(founder_ctrl_nonmale_include2, unfiltered_sample_ctv2);
     }
     if (hardy_needed) {
-      if (wkspace_alloc_ul_checked(&founder_case_include2, unfiltered_sample_ctv2 *  sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(unfiltered_sample_ctv2, &founder_case_include2)) {
 	goto calc_freqs_and_hwe_ret_NOMEM;
       }
-      bitfield_ornot(tmp_sample_excl_mask, pheno_nm, unfiltered_sample_ctl);
-      bitfield_ornot(tmp_sample_excl_mask, pheno_c, unfiltered_sample_ctl);
-      zero_trailing_bits(tmp_sample_excl_mask, unfiltered_sample_ct);
+      bitvec_ornot(pheno_nm, unfiltered_sample_ctl, tmp_sample_excl_mask);
+      bitvec_ornot(pheno_c, unfiltered_sample_ctl, tmp_sample_excl_mask);
+      zero_trailing_bits(unfiltered_sample_ct, tmp_sample_excl_mask);
       sample_f_case_ct = unfiltered_sample_ct - popcount_longs(tmp_sample_excl_mask, unfiltered_sample_ctl);
-      exclude_to_vec_include(unfiltered_sample_ct, founder_case_include2, tmp_sample_excl_mask);
+      init_quaterarr_from_inverted_bitarr(tmp_sample_excl_mask, unfiltered_sample_ct, founder_case_include2);
       if (nonmales_needed) {
-	if (wkspace_alloc_ul_checked(&founder_case_nonmale_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+	if (bigstack_alloc_ul(unfiltered_sample_ctv2, &founder_case_nonmale_include2)) {
 	  goto calc_freqs_and_hwe_ret_NOMEM;
 	}
 	memcpy(founder_case_nonmale_include2, sample_nonmale_include2, unfiltered_sample_ctv2 * sizeof(intptr_t));
-	vec_include_mask_out(unfiltered_sample_ct, founder_case_nonmale_include2, tmp_sample_excl_mask);
+	apply_bitarr_excl_to_quaterarr_01(tmp_sample_excl_mask, unfiltered_sample_ct, founder_case_nonmale_include2);
 	sample_f_case_nonmale_ct = popcount01_longs(founder_case_nonmale_include2, unfiltered_sample_ctv2);
       }
     }
@@ -2436,10 +2405,9 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
     om_cluster_ct = om_ip->cluster_ct;
     om_sample_lookup = om_ip->sample_lookup;
     cur_om_entry = *om_entry_ptr;
-    if (wkspace_alloc_ui_checked(&om_cluster_sizes, om_cluster_ct * 2 * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(om_cluster_ct * 2, &om_cluster_sizes)) {
       goto calc_freqs_and_hwe_ret_NOMEM;
     }
-    fill_uint_zero(om_cluster_sizes, om_cluster_ct * 2);
     sample_uidx = 0;
     for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
       next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
@@ -2479,7 +2447,7 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
 	  goto calc_freqs_and_hwe_ret_READ_FAIL;
 	}
       }
-      if (load_raw(bedfile, loadbuf, unfiltered_sample_ct4)) {
+      if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf)) {
 	goto calc_freqs_and_hwe_ret_READ_FAIL;
       }
       if (marker_uidx >= next_chrom_start) {
@@ -2524,7 +2492,7 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
 	    uii = 2 * hh_ctf + lh_ctf;
 	  }
 	  if ((uii < min_ac) || (uii > max_ac)) {
-	    set_bit(ac_excl_bitfield, marker_uidx);
+	    set_bit(marker_uidx, ac_excl_bitfield);
 	  }
 	}
 	uii = 2 * (ll_ctf + lh_ctf + hh_ctf + maf_succ);
@@ -2604,7 +2572,7 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
 	if (hethap_incr) {
 	  if (!hhfile) {
 	    memcpy(outname_end, ".hh", 4);
-	    if (fopen_checked(&hhfile, outname, "w")) {
+	    if (fopen_checked(outname, "w", &hhfile)) {
 	      goto calc_freqs_and_hwe_ret_OPEN_FAIL;
 	    }
 	  }
@@ -2658,7 +2626,7 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
 	    ukk = uii - ujj;
 	  }
 	  if ((ukk < min_ac) || (ukk > max_ac)) {
-	    set_bit(ac_excl_bitfield, marker_uidx);
+	    set_bit(marker_uidx, ac_excl_bitfield);
 	  }
 	}
 	uii += 2 * maf_succ;
@@ -2672,7 +2640,7 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
       }
       nonmissing_rate_tot += cur_genotyping_rate;
       if (geno_excl_bitfield && (cur_genotyping_rate < geno_thresh)) {
-	SET_BIT(geno_excl_bitfield, marker_uidx);
+	SET_BIT(marker_uidx, geno_excl_bitfield);
       }
     }
     if (pct < 100) {
@@ -2710,16 +2678,16 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
     retval = RET_WRITE_FAIL;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(hhfile);
   return retval;
 }
 
 int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_ [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t unfiltered_sample_ctv2 = (unfiltered_sample_ctl2 + 1) & (~1);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctv2 = round_up_pow2(unfiltered_sample_ctl2, 2);
   uintptr_t marker_ct_y = 0;
   uintptr_t* sample_male_include2 = NULL;
   uint64_t* om_entry_ptr = NULL;
@@ -2773,23 +2741,22 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
   uint32_t umm;
   uint32_t unn;
   pzwrite_init_null(&ps);
-  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + MAXLINELEN) ||
-      wkspace_alloc_ui_checked(&missing_cts, unfiltered_sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&sample_include2, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&sample_male_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_uc(PIGZ_BLOCK_SIZE + MAXLINELEN, &overflow_buf) ||
+      bigstack_calloc_ui(unfiltered_sample_ct, &missing_cts) ||
+      bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctv2, &sample_include2) ||
+      bigstack_alloc_ul(unfiltered_sample_ctv2, &sample_male_include2)) {
     goto write_missingness_reports_ret_NOMEM;
   }
   loadbuf[unfiltered_sample_ctv2 - 2] = 0;
   loadbuf[unfiltered_sample_ctv2 - 1] = 0;
-  exclude_to_vec_include(unfiltered_sample_ct, sample_include2, sample_exclude);
+  init_quaterarr_from_inverted_bitarr(sample_exclude, unfiltered_sample_ct, sample_include2);
   memcpy(sample_male_include2, sample_include2, unfiltered_sample_ctv2 * sizeof(intptr_t));
-  vec_include_mask_in(unfiltered_sample_ct, sample_male_include2, sex_male);
+  apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, sample_male_include2);
   if (y_present) {
-    marker_ct_y = count_chrom_markers(chrom_info_ptr, chrom_info_ptr->y_code, marker_exclude);
+    marker_ct_y = count_chrom_markers(chrom_info_ptr, marker_exclude, chrom_info_ptr->y_code);
   }
   marker_ct_nony = marker_ct - marker_ct_y;
-  fill_uint_zero(missing_cts, unfiltered_sample_ct);
   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
     goto write_missingness_reports_ret_READ_FAIL;
   }
@@ -2804,13 +2771,12 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
     om_cluster_ref_cts = om_ip->cluster_ref_cts;
     cur_om_entry = *om_entry_ptr;
     om_cluster_ct = om_ip->cluster_ct;
-    // divide by BITCT2 instead of BITCT due to Ychr
-    om_cluster_ctl = (om_cluster_ct + BITCT - 1) / BITCT;
+    om_cluster_ctl = BITCT_TO_WORDCT(om_cluster_ct);
     om_sample_lookup = om_ip->sample_lookup;
-    if (wkspace_alloc_ui_checked(&om_cluster_sizes, om_cluster_ct * 2 * sizeof(int32_t))) {
+    // doubled because of Ychr
+    if (bigstack_calloc_ui(om_cluster_ct * 2, &om_cluster_sizes)) {
       goto write_missingness_reports_ret_NOMEM;
     }
-    fill_uint_zero(om_cluster_sizes, om_cluster_ct * 2);
     for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
       next_unset_unsafe_ck(sample_exclude, &sample_uidx);
       uii = om_sample_lookup[sample_uidx];
@@ -2824,24 +2790,21 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
     sample_uidx = 0;
     sample_idx = 0;
     if (cluster_ct) {
-      if (wkspace_alloc_ul_checked(&cur_omidxs, om_cluster_ctl * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(om_cluster_ctl, &cur_omidxs)) {
         goto write_missingness_reports_ret_NOMEM;
       }
     }
   }
   ujj = unfiltered_sample_ctl2 * BITCT2;
   if (!cluster_ct) {
-    sprintf(tbuf, " CHR %%%us   N_MISS   N_GENO   F_MISS" EOLN_STR, plink_maxsnp);
+    sprintf(g_textbuf, " CHR %%%us   N_MISS   N_GENO   F_MISS" EOLN_STR, plink_maxsnp);
   } else {
-    if (wkspace_alloc_ui_checked(&sample_to_cluster, unfiltered_sample_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&missing_ct_by_cluster, cluster_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&oblig_missing_ct_by_cluster, cluster_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&cluster_sizes, cluster_ct * 2 * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(unfiltered_sample_ct, &sample_to_cluster) ||
+        bigstack_alloc_ui(cluster_ct, &missing_ct_by_cluster) ||
+        bigstack_calloc_ui(cluster_ct, &oblig_missing_ct_by_cluster) ||
+        bigstack_calloc_ui(cluster_ct * 2, &cluster_sizes)) {
       goto write_missingness_reports_ret_NOMEM;
     }
-    fill_uint_zero(sample_to_cluster, unfiltered_sample_ct);
-    fill_uint_zero(cluster_sizes, cluster_ct * 2);
-    fill_uint_zero(oblig_missing_ct_by_cluster, cluster_ct);
     cluster_sizes_y = &(cluster_sizes[cluster_ct]);
     for (clidx = 0; clidx < cluster_ct; clidx++) {
       unn = cluster_starts[clidx + 1];
@@ -2857,10 +2820,10 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	}
       }
     }
-    sprintf(tbuf, " CHR %%%us       CLST   N_MISS   N_CLST   N_GENO   F_MISS" EOLN_STR, plink_maxsnp);
+    sprintf(g_textbuf, " CHR %%%us       CLST   N_MISS   N_CLST   N_GENO   F_MISS" EOLN_STR, plink_maxsnp);
   }
 
-  pzwritep += sprintf(pzwritep, tbuf, "SNP");
+  pzwritep += sprintf(pzwritep, g_textbuf, "SNP");
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
     chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
@@ -2880,13 +2843,13 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	cur_cluster_sizes = cluster_sizes_y;
 	om_ycorr = om_cluster_ct;
       }
-      cptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+      cptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
       *cptr++ = ' ';
       if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
 	goto write_missingness_reports_ret_READ_FAIL;
       }
       do {
-	if (load_raw(bedfile, loadbuf, unfiltered_sample_ct4)) {
+	if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf)) {
 	  goto write_missingness_reports_ret_READ_FAIL;
 	}
         if (is_haploid) {
@@ -2922,10 +2885,10 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	      ulii &= ulii - 1;
 	    }
 	  }
-          pzwritep = memcpya(pzwritep, tbuf, cptr2 - tbuf);
-	  pzwritep = uint32_writew8x(pzwritep, ukk - oblig_ct, ' ');
-          pzwritep = uint32_writew8x(pzwritep, cur_tot - oblig_ct, ' ');
-	  pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)(ukk - oblig_ct))) / ((double)((int32_t)(cur_tot - oblig_ct))), 8);
+          pzwritep = memcpya(pzwritep, g_textbuf, cptr2 - g_textbuf);
+	  pzwritep = uint32toa_w8x(ukk - oblig_ct, ' ', pzwritep);
+          pzwritep = uint32toa_w8x(cur_tot - oblig_ct, ' ', pzwritep);
+	  pzwritep = dtoa_g_wxp4(((double)((int32_t)(ukk - oblig_ct))) / ((double)((int32_t)(cur_tot - oblig_ct))), 8, pzwritep);
           append_binary_eoln(&pzwritep);
 	  if (flex_pzwrite(&ps, &pzwritep)) {
 	    goto write_missingness_reports_ret_WRITE_FAIL;
@@ -2949,7 +2912,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	  } else {
 	    fill_ulong_zero(cur_omidxs, om_cluster_ctl);
 	    do {
-              set_bit(cur_omidxs, ((uint32_t)cur_om_entry) - om_ycorr);
+              set_bit(((uint32_t)cur_om_entry) - om_ycorr, cur_omidxs);
 	      cur_om_entry = *(++om_entry_ptr);
 	    } while ((cur_om_entry >> 32) == marker_uidx);
 	    for (uii = 0; uii < ujj; uii += BITCT2) {
@@ -2971,16 +2934,16 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	    }
 	  }
 	  for (clidx = 0; clidx < cluster_ct; clidx++) {
-            pzwritep = memcpya(pzwritep, tbuf, cptr2 - tbuf);
+            pzwritep = memcpya(pzwritep, g_textbuf, cptr2 - g_textbuf);
             pzwritep = fw_strcpy(10, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
 	    *pzwritep++ = ' ';
 	    uii = missing_ct_by_cluster[clidx];
-            pzwritep = uint32_writew8x(pzwritep, uii, ' ');
+            pzwritep = uint32toa_w8x(uii, ' ', pzwritep);
 	    umm = cur_cluster_sizes[clidx];
-	    pzwritep = uint32_writew8x(pzwritep, umm, ' ');
+	    pzwritep = uint32toa_w8x(umm, ' ', pzwritep);
 	    umm -= oblig_missing_ct_by_cluster[clidx];
-	    pzwritep = uint32_writew8x(pzwritep, umm, ' ');
-            pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)uii)) / ((double)((int32_t)umm)), 8);
+	    pzwritep = uint32toa_w8x(umm, ' ', pzwritep);
+            pzwritep = dtoa_g_wxp4(((double)((int32_t)uii)) / ((double)((int32_t)umm)), 8, pzwritep);
 	    append_binary_eoln(&pzwritep);
 	    if (flex_pzwrite(&ps, &pzwritep)) {
 	      goto write_missingness_reports_ret_WRITE_FAIL;
@@ -3007,8 +2970,8 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
     goto write_missingness_reports_ret_OPEN_FAIL;
   }
   pzwritep = (char*)overflow_buf;
-  sprintf(tbuf, "%%%us %%%us MISS_PHENO   N_MISS   N_GENO   F_MISS" EOLN_STR, plink_maxfid, plink_maxiid);
-  pzwritep += sprintf(pzwritep, tbuf, "FID", "IID");
+  sprintf(g_textbuf, "%%%us %%%us MISS_PHENO   N_MISS   N_GENO   F_MISS" EOLN_STR, plink_maxfid, plink_maxiid);
+  pzwritep += sprintf(pzwritep, g_textbuf, "FID", "IID");
   do {
     sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx);
     sample_uidx_stop = next_set(sample_exclude, sample_uidx, unfiltered_sample_ct);
@@ -3034,9 +2997,9 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	  ujj -= umm;
 	}
       }
-      pzwritep = uint32_writew8x(pzwritep, uii, ' ');
-      pzwritep = uint32_writew8x(pzwritep, ujj, ' ');
-      pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 8);
+      pzwritep = uint32toa_w8x(uii, ' ', pzwritep);
+      pzwritep = uint32toa_w8x(ujj, ' ', pzwritep);
+      pzwritep = dtoa_g_wxp4(((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 8, pzwritep);
       append_binary_eoln(&pzwritep);
       if (flex_pzwrite(&ps, &pzwritep)) {
 	goto write_missingness_reports_ret_WRITE_FAIL;
@@ -3063,7 +3026,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
     retval = RET_WRITE_FAIL;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   flex_pzwrite_close_cond(&ps, pzwritep);
   return retval;
 }
@@ -3077,9 +3040,13 @@ int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* p
   double minor_freq;
   pzwritep = memcpya(pzwritep, prefix_buf, prefix_len);
   if (reverse) {
-    cptr = uint32_write(uint32_writex(uint32_writex(wbuf, hh_ct, '/'), lh_ct, '/'), ll_ct);
+    cptr = uint32toa_x(hh_ct, '/', wbuf);
+    cptr = uint32toa_x(lh_ct, '/', cptr);
+    cptr = uint32toa(ll_ct, cptr);
   } else {
-    cptr = uint32_write(uint32_writex(uint32_writex(wbuf, ll_ct, '/'), lh_ct, '/'), hh_ct);
+    cptr = uint32toa_x(ll_ct, '/', wbuf);
+    cptr = uint32toa_x(lh_ct, '/', cptr);
+    cptr = uint32toa(hh_ct, cptr);
   }
   pzwritep = fw_strcpyn(20, cptr - wbuf, wbuf, pzwritep);
   *pzwritep++ = ' ';
@@ -3087,7 +3054,9 @@ int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* p
   if (denom && (!is_mt)) {
     drecip = 1.0 / ((double)denom);
     minor_freq = (2 * ll_ct + lh_ct) * drecip;
-    pzwritep = double_g_writewx4(double_g_writewx4x(double_g_writewx4x(pzwritep, (lh_ct * 2) * drecip, 8, ' '), minor_freq * (2 * hh_ct + lh_ct) * drecip * 2, 8, ' '), MAXV(pval, output_min_p), 12);
+    pzwritep = dtoa_g_wxp4x((lh_ct * 2) * drecip, 8, ' ', pzwritep);
+    pzwritep = dtoa_g_wxp4x(minor_freq * (2 * hh_ct + lh_ct) * drecip * 2, 8, ' ', pzwritep);
+    pzwritep = dtoa_g_wxp4(MAXV(pval, output_min_p), 12, pzwritep);
   } else {
     pzwritep = memcpya(pzwritep, "     nan      nan          ", 27);
     pzwritep = memcpyl3a(pzwritep, hwe_midp? "0.5" : "  1");
@@ -3101,7 +3070,7 @@ int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* p
 }
 
 int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t nonfounders, int32_t* hwe_ll_cases, int32_t* hwe_lh_cases, int32_t* hwe_hh_cases, int [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   char* pzwritep = NULL;
   uintptr_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
   uintptr_t marker_uidx = 0;
@@ -3138,9 +3107,9 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
     report_type = 2;
   }
   uii = report_type? 1 : 3;
-  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN) ||
-      wkspace_alloc_d_checked(&p_values, uii * marker_ct * sizeof(double)) ||
-      wkspace_alloc_c_checked(&writebuf, 2 * max_marker_allele_len + MAXLINELEN)) {
+  if (bigstack_alloc_uc(PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN, &overflow_buf) ||
+      bigstack_alloc_d(uii * marker_ct, &p_values) ||
+      bigstack_alloc_c(2 * max_marker_allele_len + MAXLINELEN, &writebuf)) {
     goto hardy_report_ret_NOMEM;
   }
 
@@ -3175,7 +3144,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
 
   chrom_fo_idx = 0;
   refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
-  cptr0 = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]));
+  cptr0 = width_force(4, writebuf, chrom_name_write(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], writebuf));
   *cptr0++ = ' ';
   cptr = &(cptr0[10 + plink_maxsnp]);
   prefix_len = 10 + ((uintptr_t)(cptr - writebuf));
@@ -3193,7 +3162,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
 	if (marker_uidx >= chrom_end) {
 	  chrom_fo_idx++;
 	  refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
-	  cptr0 = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]));
+	  cptr0 = width_force(4, writebuf, chrom_name_write(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], writebuf));
 	  *cptr0++ = ' ';
 	  cptr = &(cptr0[10 + plink_maxsnp]);
 	  prefix_len = 10 + ((uintptr_t)(cptr - writebuf));
@@ -3235,7 +3204,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
 	if (marker_uidx >= chrom_end) {
 	  chrom_fo_idx++;
 	  refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
-	  cptr0 = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]));
+	  cptr0 = width_force(4, writebuf, chrom_name_write(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], writebuf));
 	  *cptr0++ = ' ';
           memset(&(cptr0[plink_maxsnp]), 32, 20);
 	  cptr = &(cptr0[10 + plink_maxsnp]);
@@ -3290,7 +3259,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
     break;
   }
   flex_pzwrite_close_cond(&ps, pzwritep);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -3338,7 +3307,7 @@ uint32_t enforce_hwe_threshold(double hwe_thresh, uintptr_t unfiltered_marker_ct
 	  test_failed = SNPHWE_t(hwe_lhs[marker_uidx], hwe_lls[marker_uidx], hwe_hhs[marker_uidx], hwe_thresh);
 	}
 	if (test_failed) {
-	  SET_BIT(marker_exclude, marker_uidx);
+	  SET_BIT(marker_uidx, marker_exclude);
 	  removed_ct++;
 	}
 	cur_obs = hwe_lhs[marker_uidx] + hwe_lls[marker_uidx] + hwe_hhs[marker_uidx];
@@ -3372,7 +3341,7 @@ uint32_t enforce_hwe_threshold(double hwe_thresh, uintptr_t unfiltered_marker_ct
 }
 
 uint32_t enforce_minor_allele_thresholds(double min_maf, double max_maf, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* ac_excl_bitfield, uintptr_t* marker_exclude_ct_ptr, double* set_allele_freqs, uint32_t allow_no_variants) {
-  uint32_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uint32_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uint32_t marker_ct = unfiltered_marker_ct - *marker_exclude_ct_ptr;
   uint32_t marker_uidx = 0;
   uint32_t removed_ct = 0;
@@ -3389,13 +3358,13 @@ uint32_t enforce_minor_allele_thresholds(double min_maf, double max_maf, uintptr
       do {
 	dxx = get_maf(set_allele_freqs[marker_uidx]);
 	if ((dxx < min_maf) || (dxx > max_maf)) {
-	  SET_BIT(marker_exclude, marker_uidx);
+	  SET_BIT(marker_uidx, marker_exclude);
 	}
       } while (++marker_uidx < marker_uidx_stop);
     }
   }
   if (ac_excl_bitfield) {
-    bitfield_or(marker_exclude, ac_excl_bitfield, unfiltered_marker_ctl);
+    bitvec_or(ac_excl_bitfield, unfiltered_marker_ctl, marker_exclude);
   }
   removed_ct = popcount_longs(marker_exclude, unfiltered_marker_ctl) - (*marker_exclude_ct_ptr);
   if ((marker_ct == removed_ct) && (!allow_no_variants)) {
@@ -3428,7 +3397,7 @@ void enforce_min_bp_space(int32_t min_bp_space, uint32_t unfiltered_marker_ct, u
       do {
         cur_pos = marker_pos[marker_uidx];
         if (cur_pos < last_pos + min_bp_space) {
-          SET_BIT(marker_exclude, marker_uidx);
+          SET_BIT(marker_uidx, marker_exclude);
 	  removed_ct++;
 	} else {
 	  last_pos = cur_pos;
diff --git a/plink_filter.h b/plink_filter.h
index 7ac2d96..5298c0e 100644
--- a/plink_filter.h
+++ b/plink_filter.h
@@ -43,7 +43,7 @@ void filter_samples_bitfields(uintptr_t unfiltered_sample_ct, uintptr_t* sample_
 
 int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double mind_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, uint32_t allow_no_samples);
 
-int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
+int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
 
 int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_ [...]
 
diff --git a/plink_glm.c b/plink_glm.c
index d71c59a..2d1c0c0 100644
--- a/plink_glm.c
+++ b/plink_glm.c
@@ -19,19 +19,11 @@ static double* g_mperm_save_all;
 // any better than the usual PLINK 2-bit format.
 static uintptr_t* g_loadbuf;
 
-static uintptr_t* g_perm_vecs;
-
-static double* g_pheno_d2;
 #ifndef NOLAPACK
 static double g_pheno_sum;
 static double g_pheno_ssq;
 #endif
 
-// permutation-major instead of sample-major order for --linear (PERMORY
-// speedups do not apply)
-static double* g_perm_pmajor;
-static uint32_t* g_precomputed_mods; // g_precomputed_mods[n] = 2^32 mod (n-2)
-
 static uint32_t* g_nm_cts;
 
 // This is *twice* the number of successes, because PLINK 1.07 counts tie as
@@ -47,12 +39,9 @@ static unsigned char* g_perm_adapt_stop;
 
 static uint32_t g_adapt_m_table[MODEL_BLOCKSIZE];
 static uint32_t g_assoc_thread_ct;
-static uintptr_t g_perm_vec_ct;
 static uint32_t g_block_diff;
 static uint32_t g_perms_done;
 static uint32_t g_first_adapt_check;
-static uint32_t g_pheno_nm_ct;
-static uint32_t g_case_ct;
 static double g_adaptive_intercept;
 static double g_adaptive_slope;
 static double g_aperm_alpha;
@@ -61,196 +50,19 @@ static uint32_t g_is_x;
 static uint32_t g_is_y;
 static uint32_t g_min_ploidy_1;
 
-static uint32_t g_tot_quotient;
-static uint64_t g_totq_magic;
-static uint32_t g_totq_preshift;
-static uint32_t g_totq_postshift;
-static uint32_t g_totq_incr;
-
-static uint32_t g_cluster_ct;
-static uint32_t* g_cluster_map;
-static uint32_t* g_cluster_starts;
-static uint32_t* g_cluster_case_cts;
-
-// per-cluster magic number sets
-static uintptr_t* g_cluster_cc_perm_preimage;
-static uint32_t* g_tot_quotients;
-static uint64_t* g_totq_magics;
-static uint32_t* g_totq_preshifts;
-static uint32_t* g_totq_postshifts;
-static uint32_t* g_totq_incrs;
-
-static uint32_t* g_sample_to_cluster;
-static uint32_t* g_qassoc_cluster_thread_wkspace;
-
-THREAD_RET_TYPE logistic_gen_perms_thread(void* arg) {
-  // just a clone of model_assoc_gen_perms_thread()
-  uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
-  uint32_t case_ct = g_case_ct;
-  uint32_t tot_quotient = g_tot_quotient;
-  uint64_t totq_magic = g_totq_magic;
-  uint32_t totq_preshift = g_totq_preshift;
-  uint32_t totq_postshift = g_totq_postshift;
-  uint32_t totq_incr = g_totq_incr;
-  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
-  sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-  uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_assoc_thread_ct;
-  uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_assoc_thread_ct;
-  for (; pidx < pmax; pidx++) {
-    generate_cc_perm_vec(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctl2]), sfmtp);
-  }
-  THREAD_RETURN;
-}
-
-THREAD_RET_TYPE logistic_gen_cluster_perms_thread(void* arg) {
-  uintptr_t tidx = (uintptr_t)arg;
-  uint32_t pheno_nm_ct = g_pheno_nm_ct;
-  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
-  sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx];
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-  uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_assoc_thread_ct;
-  uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_assoc_thread_ct;
-  uint32_t cluster_ct = g_cluster_ct;
-  uint32_t* cluster_map = g_cluster_map;
-  uint32_t* cluster_starts = g_cluster_starts;
-  uint32_t* cluster_case_cts = g_cluster_case_cts;
-  uintptr_t* cluster_cc_perm_preimage = g_cluster_cc_perm_preimage;
-  uint32_t* tot_quotients = g_tot_quotients;
-  uint64_t* totq_magics = g_totq_magics;
-  uint32_t* totq_preshifts = g_totq_preshifts;
-  uint32_t* totq_postshifts = g_totq_postshifts;
-  uint32_t* totq_incrs = g_totq_incrs;
-  for (; pidx < pmax; pidx++) {
-    generate_cc_cluster_perm_vec(pheno_nm_ct, cluster_cc_perm_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctl2]), sfmtp);
-  }
-  THREAD_RETURN;
-}
-
-THREAD_RET_TYPE linear_gen_perms_thread(void* arg) {
-  // Used by --linear.  Requires g_pheno_nm_ct, g_pheno_d2, g_sfmtp_arr,
-  // g_assoc_thread_ct, and g_perm_vec_ct to be initialized, and space must be
-  // allocated for g_perm_pmajor.  The nth permutation (0-based) is stored in
-  // g_perm_pmajor indices
-  //   [n * sample_valid_ct] to [(n + 1) * sample_valid_ct - 1]
-  // inclusive.
-  uintptr_t tidx = (uintptr_t)arg;
-  uint32_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
-  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
-  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_assoc_thread_ct);
-  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_assoc_thread_ct);
-  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]);
-  double* pheno_d2 = g_pheno_d2;
-  uint32_t* precomputed_mods = g_precomputed_mods;
-  uint32_t* lbound_ptr;
-  double* pheno_ptr;
-  uint32_t poffset;
-  uint32_t pdiff;
-  uint32_t sample_idx;
-  uint32_t urand;
-  uint32_t lbound;
-  if (tidx + 1 == g_assoc_thread_ct) {
-    pmax = g_perm_vec_ct;
-  }
-  pdiff = pmax - pmin;
-  for (poffset = 0; poffset < pdiff; poffset++) {
-    lbound_ptr = precomputed_mods;
-    pheno_ptr = pheno_d2;
-    perm_pmajor[0] = *pheno_ptr++;
-    for (sample_idx = 1; sample_idx < sample_valid_ct; sample_idx++) {
-      lbound = *lbound_ptr++;
-      do {
-        urand = sfmt_genrand_uint32(sfmtp);
-      } while (urand < lbound);
-      // er, this modulus operation is slow.  but doesn't seem to be worthwhile
-      // to use magic numbers here.
-      urand %= sample_idx + 1;
-      perm_pmajor[sample_idx] = perm_pmajor[urand];
-      perm_pmajor[urand] = *pheno_ptr++;
-    }
-    perm_pmajor = &(perm_pmajor[sample_valid_ct]);
-  }
-  THREAD_RETURN;
-}
-
-THREAD_RET_TYPE linear_gen_cluster_perms_thread(void* arg) {
-  // On top of the linear_gen_perms_thread requirements, this also needs
-  // g_cluster_ct, g_cluster_map, g_cluster_starts,
-  // g_qassoc_cluster_thread_wkspace, and g_sample_to_cluster to be
-  // initialized.
-  uintptr_t tidx = (uintptr_t)arg;
-  uint32_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
-  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
-  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_assoc_thread_ct);
-  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_assoc_thread_ct);
-  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]);
-  double* pheno_d2 = g_pheno_d2;
-  uint32_t* precomputed_mods = &(g_precomputed_mods[-1]);
-  uint32_t cluster_ct = g_cluster_ct;
-  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
-  uint32_t* cluster_map = g_cluster_map;
-  uint32_t* cluster_starts = g_cluster_starts;
-  uint32_t* in_cluster_positions = &(g_qassoc_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]);
-  uint32_t* sample_to_cluster = g_sample_to_cluster;
-  double* pheno_ptr;
-  uint32_t poffset;
-  uint32_t pdiff;
-  uint32_t cluster_idx;
-  uint32_t cur_in_cluster_pos;
-  uint32_t sample_idx;
-  uint32_t urand;
-  uint32_t lbound;
-  uint32_t uii;
-  if (tidx + 1 == g_assoc_thread_ct) {
-    pmax = g_perm_vec_ct;
-  }
-  pdiff = pmax - pmin;
-  for (poffset = 0; poffset < pdiff; poffset++) {
-    fill_uint_zero(in_cluster_positions, cluster_ct);
-    pheno_ptr = pheno_d2;
-    for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
-      cluster_idx = sample_to_cluster[sample_idx];
-      if (cluster_idx == 0xffffffffU) {
-	cur_in_cluster_pos = 0;
-      } else {
-	cur_in_cluster_pos = in_cluster_positions[cluster_idx];
-	in_cluster_positions[cluster_idx] += 1;
-      }
-      if (!cur_in_cluster_pos) {
-        perm_pmajor[sample_idx] = *pheno_ptr++;
-      } else {
-        lbound = precomputed_mods[cur_in_cluster_pos];
-        do {
-	  urand = sfmt_genrand_uint32(sfmtp);
-	} while (urand < lbound);
-	urand %= (cur_in_cluster_pos + 1);
-	uii = cluster_map[cluster_starts[cluster_idx] + urand];
-        perm_pmajor[sample_idx] = perm_pmajor[uii];
-	perm_pmajor[uii] = *pheno_ptr++;
-      }
-    }
-    perm_pmajor = &(perm_pmajor[sample_valid_ct]);
-  }
-  THREAD_RETURN;
-}
-
 uint32_t glm_init_load_mask(uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* covar_nm, uint32_t sample_ct, uintptr_t unfiltered_sample_ctv2, uintptr_t** load_mask_ptr) {
   uint32_t sample_uidx = 0;
   uintptr_t* load_mask;
   uint32_t sample_idx;
-  if (wkspace_alloc_ul_checked(load_mask_ptr, unfiltered_sample_ctv2 * (sizeof(intptr_t) / 2))) {
+  if (bigstack_calloc_ul(unfiltered_sample_ctv2 / 2, load_mask_ptr)) {
     return 1;
   }
   load_mask = *load_mask_ptr;
-  fill_ulong_zero(load_mask, unfiltered_sample_ctv2 / 2);
   if (covar_nm) {
     for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
       next_unset_unsafe_ck(sample_exclude, &sample_uidx);
       if (IS_SET(pheno_nm, sample_uidx) & IS_SET(covar_nm, sample_idx)) {
-	SET_BIT(load_mask, sample_uidx);
+	SET_BIT(sample_uidx, load_mask);
       }
     }
   } else {
@@ -262,12 +74,12 @@ uint32_t glm_init_load_mask(uintptr_t* sample_exclude, uintptr_t* pheno_nm, uint
 int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, uint32_t hh_or_mt_exists, uintptr_t* loadbuf_raw, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, uintptr_t* load_mask, uintptr_t* sample_valid_ct_ptr, uintptr_t* condition_ct_ptr, uint32_t** condition_uidxs_ptr, uintptr_ [...]
   // side effects: load_mask and sample_valid_ct potentially updated,
   //   condition_ct should be changed, condition_uidxs should be malloc'd
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* condition_file = NULL;
   uint32_t* condition_uidxs = NULL;
-  uintptr_t marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctv2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
   uintptr_t sample_valid_ct = *sample_valid_ct_ptr;
   uintptr_t miss_ct = 0;
   uintptr_t condition_ct = 0;
@@ -279,7 +91,7 @@ int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintpt
   __m128i* loadbuf_vend;
   __m128i vii;
 #else
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t* loadbuf_end;
 #endif
   uintptr_t* loadbuf_ptr;
@@ -313,35 +125,33 @@ int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintpt
     condition_uidxs = (uint32_t*)malloc(sizeof(int32_t));
     condition_uidxs[0] = (uint32_t)ii;
   } else {
-    if (wkspace_alloc_c_checked(&sorted_ids, marker_ct * max_marker_id_len) ||
-        wkspace_alloc_ui_checked(&id_map, marker_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ul_checked(&already_seen, marker_ctl * sizeof(intptr_t)) ||
-        wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_c(marker_ct * max_marker_id_len, &sorted_ids) ||
+        bigstack_alloc_ui(marker_ct, &id_map) ||
+        bigstack_calloc_ul(marker_ctl, &already_seen) ||
+        bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
       goto glm_scan_conditions_ret_NOMEM;
     }
     fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
-    fill_ulong_zero(already_seen, marker_ctl);
-    retval = sort_item_ids_noalloc(sorted_ids, id_map, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 1, strcmp_deref);
+    retval = sort_item_ids_noalloc(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 1, strcmp_deref, sorted_ids, id_map);
     if (retval) {
       goto glm_scan_conditions_ret_1;
     }
-    condition_uidxs_tmp = (uint32_t*)wkspace_base;
-    if (wkspace_left > marker_ct * sizeof(int32_t)) {
+    condition_uidxs_tmp = (uint32_t*)g_bigstack_base;
+    condition_ct_max = bigstack_left() / sizeof(int32_t);
+    if (condition_ct_max > marker_ct) {
       condition_ct_max = marker_ct;
-    } else {
-      condition_ct_max = wkspace_left / sizeof(int32_t);
     }
-    if (fopen_checked(&condition_file, condition_fname, "r")) {
+    if (fopen_checked(condition_fname, "r", &condition_file)) {
       goto glm_scan_conditions_ret_OPEN_FAIL;
     }
-    tbuf[MAXLINELEN - 1] = ' ';
-    while (fgets(tbuf, MAXLINELEN, condition_file)) {
+    g_textbuf[MAXLINELEN - 1] = ' ';
+    while (fgets(g_textbuf, MAXLINELEN, condition_file)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --condition-list file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --condition-list file is pathologically long.\n", line_idx);
         goto glm_scan_conditions_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       while (!is_eoln_kns(*bufptr)) {
         bufptr2 = token_endnn(bufptr);
 	ii = bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_ids, max_marker_id_len, marker_ct);
@@ -355,7 +165,7 @@ int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintpt
 	  if (condition_ct == condition_ct_max) {
 	    goto glm_scan_conditions_ret_NOMEM;
 	  }
-	  set_bit(already_seen, ii);
+	  set_bit(ii, already_seen);
 	  condition_uidxs_tmp[condition_ct++] = marker_idx_to_uidx[id_map[(uint32_t)ii]];
 	}
         bufptr = skip_initial_spaces(bufptr2);
@@ -372,18 +182,18 @@ int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintpt
       goto glm_scan_conditions_ret_1;
     }
     if (miss_ct) {
-      sprintf(logbuf, "--condition-list: %" PRIuPTR " of %" PRIuPTR " variant ID%s loaded from %s.\n", condition_ct, condition_ct + miss_ct, (condition_ct + miss_ct == 1)? "" : "s", condition_fname);
+      sprintf(g_logbuf, "--condition-list: %" PRIuPTR " of %" PRIuPTR " variant ID%s loaded from %s.\n", condition_ct, condition_ct + miss_ct, (condition_ct + miss_ct == 1)? "" : "s", condition_fname);
     } else {
-      sprintf(logbuf, "--condition-list: %" PRIuPTR " variant ID%s loaded from %s.\n", condition_ct, (condition_ct == 1)? "" : "s", condition_fname);
+      sprintf(g_logbuf, "--condition-list: %" PRIuPTR " variant ID%s loaded from %s.\n", condition_ct, (condition_ct == 1)? "" : "s", condition_fname);
     }
     logprintb();
   }
   if (condition_ct) {
-    if (wkspace_alloc_ul_checked(&loadbuf_mask_orig, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
-        wkspace_alloc_ul_checked(&loadbuf_mask, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_mask_orig) ||
+        bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_mask)) {
       goto glm_scan_conditions_ret_NOMEM;
     }
-    vec_include_init(unfiltered_sample_ct, loadbuf_mask_orig, load_mask);
+    init_quaterarr_from_bitarr(load_mask, unfiltered_sample_ct, loadbuf_mask_orig);
     memcpy(loadbuf_mask, loadbuf_mask_orig, unfiltered_sample_ctv2 * sizeof(intptr_t));
 #ifdef __LP64__
     loadbuf_vend = (__m128i*)(&(loadbuf_raw[unfiltered_sample_ctv2]));
@@ -397,7 +207,7 @@ int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintpt
 	goto glm_scan_conditions_ret_READ_FAIL;
       }
       // don't use load_and_collapse since collapse bitmask not finalized
-      if (load_raw(bedfile, loadbuf_raw, unfiltered_sample_ct4)) {
+      if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
 	goto glm_scan_conditions_ret_READ_FAIL;
       }
       chrom_idx = get_marker_chrom(chrom_info_ptr, marker_uidx);
@@ -433,7 +243,7 @@ int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintpt
       ulii = (*loadbuf_ptr++) & (~(*loadbuf_mask_ptr++));
       while (ulii) {
         uljj = CTZLU(ulii);
-        clear_bit_ul(load_mask, sample_uidx_offset + (uljj / 2));
+        clear_bit_ul(sample_uidx_offset + (uljj / 2), load_mask);
         sample_valid_ct--;
         ulii &= ulii - 1;
       }
@@ -459,7 +269,7 @@ int32_t glm_scan_conditions(char* condition_mname, char* condition_fname, uintpt
     break;
   }
  glm_scan_conditions_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(condition_file);
   return retval;
 }
@@ -480,7 +290,7 @@ uint32_t glm_loadbuf_to_doubles(uintptr_t* loadbuf_collapsed, uint32_t sample_va
         if (cur_genotype != 1) {
           *covar_row = geno_map[cur_genotype];
 	} else {
-          SET_BIT(cur_missing, sample_idx);
+          SET_BIT(sample_idx, cur_missing);
 	}
       }
       sample_idx_stop += BITCT2;
@@ -508,7 +318,7 @@ uint32_t glm_loadbuf_to_doubles_x(uintptr_t* loadbuf_collapsed, uintptr_t* sex_m
         if (cur_genotype != 1) {
           *covar_row = geno_map[cur_genotype + 4 * IS_SET(sex_male_collapsed, sample_idx)];
 	} else {
-          SET_BIT(cur_missing, sample_idx);
+          SET_BIT(sample_idx, cur_missing);
 	}
       }
       sample_idx_stop += BITCT2;
@@ -537,7 +347,7 @@ uint32_t glm_loadbuf_to_floats(uintptr_t* loadbuf_collapsed, uint32_t sample_val
         if (cur_genotype != 1) {
           *covar_row = geno_map[cur_genotype];
 	} else {
-          SET_BIT(cur_missing, sample_idx);
+          SET_BIT(sample_idx, cur_missing);
 	}
       }
       sample_idx_stop += BITCT2;
@@ -565,7 +375,7 @@ uint32_t glm_loadbuf_to_floats_x(uintptr_t* loadbuf_collapsed, uintptr_t* sex_ma
         if (cur_genotype != 1) {
           *covar_row = geno_map[cur_genotype + 4 * IS_SET(sex_male_collapsed, sample_idx)];
 	} else {
-          SET_BIT(cur_missing, sample_idx);
+          SET_BIT(sample_idx, cur_missing);
 	}
       }
       sample_idx_stop += BITCT2;
@@ -686,7 +496,7 @@ uint32_t glm_linear(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t samp
   double dxx;
   double dyy;
   double dzz;
-  fill_ulong_zero(perm_fails, (cur_batch_size + (BITCT - 1)) / BITCT);
+  fill_ulong_zero(perm_fails, BITCT_TO_WORDCT(cur_batch_size));
   col_major_matrix_multiply((uint32_t)param_ct, (uint32_t)param_ct, (uint32_t)sample_valid_ct, covars_sample_major, covars_cov_major, param_2d_buf);
   if (invert_matrix((uint32_t)param_ct, param_2d_buf, mi_buf, param_2d_buf2)) {
     return 1;
@@ -803,7 +613,7 @@ uint32_t glm_linear(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t samp
       dyy = 0;
       perm_fail_ct++;
       dptr = &(dptr[param_ctx_m1]);
-      SET_BIT(perm_fails, perm_idx);
+      SET_BIT(perm_idx, perm_fails);
     } else {
       dptr2 = param_2d_buf2;
       if (!joint_test_requested) {
@@ -1223,7 +1033,7 @@ static inline void mult_tmatrix_nxd_vect_d(const float* tm, const float* vect, f
   __m128 r2;
   __m128 r3;
   __m128 r4;
-  uintptr_t col_cta4 = (col_ct + 3) & (~3);
+  uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
   uint32_t row_idx = 0;
   uint32_t row_ctm3;
   uint32_t col_idx;
@@ -1314,7 +1124,7 @@ static inline void mult_tmatrix_nxd_vect_d(const float* tm, const float* vect, f
 }
 
 static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, float* dest, uint32_t col_ct, uint32_t row_ct) {
-  uintptr_t col_cta4 = (col_ct + 3) & (~3);
+  uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
   uint32_t row_idx = 0;
   const float* mm_ptr;
   __m128 s1;
@@ -1326,7 +1136,7 @@ static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, fl
   __m128 a2;
   __m128 a3;
   __m128 a4;
-  __uni16 u16;
+  __univec uvec;
   uint32_t row_ctm3;
   uint32_t col_idx;
   if (row_ct > 3) {
@@ -1354,14 +1164,14 @@ static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, fl
         s4 = _mm_add_ps(s4, a4);
       }
       // refrain from using SSE3 _mm_hadd_ps() for now
-      u16.vf = s1;
-      *dest++ = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-      u16.vf = s2;
-      *dest++ = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-      u16.vf = s3;
-      *dest++ = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-      u16.vf = s4;
-      *dest++ = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+      uvec.vf = s1;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+      uvec.vf = s2;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+      uvec.vf = s3;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+      uvec.vf = s4;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
     }
   }
   s1 = _mm_setzero_ps();
@@ -1382,12 +1192,12 @@ static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, fl
       s2 = _mm_add_ps(s2, a2);
       s3 = _mm_add_ps(s3, a3);
     }
-    u16.vf = s1;
-    *dest++ = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-    u16.vf = s2;
-    *dest++ = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-    u16.vf = s3;
-    *dest = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+    uvec.vf = s1;
+    *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    uvec.vf = s2;
+    *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    uvec.vf = s3;
+    *dest = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
     break;
   case 2:
     for (col_idx = 0; col_idx < col_ct; col_idx += 4) {
@@ -1400,10 +1210,10 @@ static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, fl
       s1 = _mm_add_ps(s1, a1);
       s2 = _mm_add_ps(s2, a2);
     }
-    u16.vf = s1;
-    *dest++ = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-    u16.vf = s2;
-    *dest = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+    uvec.vf = s1;
+    *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    uvec.vf = s2;
+    *dest = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
     break;
   case 1:
     for (col_idx = 0; col_idx < col_ct; col_idx += 4) {
@@ -1412,8 +1222,8 @@ static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, fl
       a1 = _mm_mul_ps(a1, vv);
       s1 = _mm_add_ps(s1, a1);
     }
-    u16.vf = s1;
-    *dest = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+    uvec.vf = s1;
+    *dest = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
     break;
   }
 }
@@ -1423,7 +1233,7 @@ static inline float triple_product(const float* v1, const float* v2, const float
   __m128 aa;
   __m128 bb;
   __m128 cc;
-  __uni16 u16;
+  __univec uvec;
   uint32_t uii;
   for (uii = 0; uii < nn; uii += 4) {
     aa = _mm_load_ps(&(v1[uii]));
@@ -1431,8 +1241,8 @@ static inline float triple_product(const float* v1, const float* v2, const float
     cc = _mm_load_ps(&(v3[uii]));
     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_mul_ps(aa, bb), cc));
   }
-  u16.vf = sum;
-  return u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+  uvec.vf = sum;
+  return uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
 }
 
 static inline void compute_two_diag_triple_product(const float* aa, const float* bb, const float* vv, float* raa_ptr, float* rab_ptr, float* rbb_ptr, uint32_t nn) {
@@ -1444,7 +1254,7 @@ static inline void compute_two_diag_triple_product(const float* aa, const float*
   __m128 btmp;
   __m128 av;
   __m128 bv;
-  __uni16 u16;
+  __univec uvec;
   uint32_t uii;
   for (uii = 0; uii < nn; uii += 4) {
     vtmp = _mm_load_ps(&(vv[uii]));
@@ -1456,12 +1266,12 @@ static inline void compute_two_diag_triple_product(const float* aa, const float*
     sab = _mm_add_ps(sab, _mm_mul_ps(atmp, bv));
     sbb = _mm_add_ps(sbb, _mm_mul_ps(btmp, bv));
   }
-  u16.vf = saa;
-  *raa_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-  u16.vf = sab;
-  *rab_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-  u16.vf = sbb;
-  *rbb_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+  uvec.vf = saa;
+  *raa_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = sab;
+  *rab_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = sbb;
+  *rbb_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
 }
 
 static inline void compute_three_triple_product(const float* bb, const float* a1, const float* a2, const float* a3, const float* vv, float* r1_ptr, float* r2_ptr, float* r3_ptr, uint32_t nn) {
@@ -1473,7 +1283,7 @@ static inline void compute_three_triple_product(const float* bb, const float* a1
   __m128 a3tmp;
   __m128 vtmp;
   __m128 btmp;
-  __uni16 u16;
+  __univec uvec;
   uint32_t uii;
   for (uii = 0; uii < nn; uii += 4) {
     a1tmp = _mm_load_ps(&(a1[uii]));
@@ -1486,12 +1296,12 @@ static inline void compute_three_triple_product(const float* bb, const float* a1
     s2 = _mm_add_ps(s2, _mm_mul_ps(a2tmp, btmp));
     s3 = _mm_add_ps(s3, _mm_mul_ps(a3tmp, btmp));
   }
-  u16.vf = s1;
-  *r1_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-  u16.vf = s2;
-  *r2_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-  u16.vf = s3;
-  *r3_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+  uvec.vf = s1;
+  *r1_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s2;
+  *r2_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s3;
+  *r3_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
 }
 
 static inline void compute_two_plus_one_triple_product(const float* bb, const float* a1, const float* a2, const float* vv, float* r1_ptr, float* r2_ptr, float* r3_ptr, uint32_t nn) {
@@ -1503,7 +1313,7 @@ static inline void compute_two_plus_one_triple_product(const float* bb, const fl
   __m128 btmp;
   __m128 vtmp;
   __m128 bv;
-  __uni16 u16;
+  __univec uvec;
   uint32_t uii;
   for (uii = 0; uii < nn; uii += 4) {
     a1tmp = _mm_load_ps(&(a1[uii]));
@@ -1515,12 +1325,12 @@ static inline void compute_two_plus_one_triple_product(const float* bb, const fl
     s2 = _mm_add_ps(s2, _mm_mul_ps(a1tmp, bv));
     s3 = _mm_add_ps(s3, _mm_mul_ps(a2tmp, bv));
   }
-  u16.vf = s1;
-  *r1_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-  u16.vf = s2;
-  *r2_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
-  u16.vf = s3;
-  *r3_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+  uvec.vf = s1;
+  *r1_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s2;
+  *r2_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s3;
+  *r3_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
 }
 #else // no __LP64__ (and hence, unsafe to assume presence of SSE2)
 static inline void logistic_sse(float* vect, uint32_t nn) {
@@ -1539,7 +1349,7 @@ static inline void compute_v_and_p_minus_y(float* pp, float* vv, const float* yy
 }
 
 static inline void mult_tmatrix_nxd_vect_d(const float* tm, const float* vect, float* dest, uint32_t col_ct, uint32_t row_ct) {
-  uintptr_t col_cta4 = (col_ct + 3) & (~3);
+  uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
   const float* tm_ptr;
   float vect_val;
   uint32_t col_idx;
@@ -1555,7 +1365,7 @@ static inline void mult_tmatrix_nxd_vect_d(const float* tm, const float* vect, f
 }
 
 static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, float* dest, uint32_t col_ct, uint32_t row_ct) {
-  uintptr_t col_cta4 = (col_ct + 3) & (~3);
+  uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
   const float* mm_ptr;
   const float* vect_ptr;
   uint32_t row_idx;
@@ -1641,8 +1451,8 @@ static inline void compute_two_plus_one_triple_product(const float* bb, const fl
 #endif
 
 static inline void compute_hessian(const float* mm, const float* vv, float* dest, uint32_t col_ct, uint32_t row_ct) {
-  uintptr_t col_cta4 = (col_ct + 3) & (~3);
-  uintptr_t row_cta4 = (row_ct + 3) & (~3);
+  uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
+  uintptr_t row_cta4 = round_up_pow2(row_ct, 4);
   uintptr_t row_cta4p1 = row_cta4 + 1;
   const float* mm_cur;
   uint32_t row_ctm3;
@@ -1674,7 +1484,7 @@ static inline void compute_hessian(const float* mm, const float* vv, float* dest
 void solve_linear_system(const float* ll, const float* yy, float* xx, uint32_t dd) {
   // if we're ever able to produce 32-bit Linux builds with statically linked
   // LAPACK, we might want to use it in place of this hardcoded stuff
-  uintptr_t dim_cta4 = (dd + 3) & (~3);
+  uintptr_t dim_cta4 = round_up_pow2(dd, 4);
   const float* ll_ptr;
   float* xx_ptr;
   uint32_t row_idx;
@@ -1700,7 +1510,7 @@ void solve_linear_system(const float* ll, const float* yy, float* xx, uint32_t d
 }
 
 float compute_wald(const float* ll, uint32_t dd, float* xbuf) {
-  uintptr_t dim_cta4 = (dd + 3) & (~3);
+  uintptr_t dim_cta4 = round_up_pow2(dd, 4);
   uint32_t row_idx = 0;
   const float* ll_ptr;
   float* xbuf_ptr;
@@ -1723,7 +1533,7 @@ float compute_wald(const float* ll, uint32_t dd, float* xbuf) {
 }
 
 void cholesky_decomposition(const float* aa, float* ll, uint32_t dd) {
-  uintptr_t dim_cta4 = (dd + 3) & (~3);
+  uintptr_t dim_cta4 = round_up_pow2(dd, 4);
   uintptr_t dim_cta4p1 = dim_cta4 + 1;
   float* ll_ptr;
   float* ll_ptr2;
@@ -1782,7 +1592,7 @@ uint32_t logistic_regression(uint32_t sample_ct, uint32_t param_ct, float* vv, f
   // pp    = final likelihoods minus Y[]
   //
   // Returns 0 on success, 1 on convergence failure.
-  uintptr_t param_cta4 = (param_ct + 3) & (~3);
+  uintptr_t param_cta4 = round_up_pow2(param_ct, 4);
   uint32_t iteration = 0;
   float min_delta_coef = 1e9;
   float delta_coef;
@@ -1825,7 +1635,7 @@ uint32_t logistic_regression(uint32_t sample_ct, uint32_t param_ct, float* vv, f
       return 1;
     }
     if (iteration > 4) {
-      if (((delta_coef > 20.0) && (delta_coef > 2 * min_delta_coef)) || ((iteration >= 8) && fabsf(1.0 - delta_coef) < 1e-3)) {
+      if (((delta_coef > 20.0) && (delta_coef > 2 * min_delta_coef)) || ((iteration >= 8) && fabsf(1.0f - delta_coef) < 1e-3)) {
 	return 1;
       }
       if (iteration >= 15) {
@@ -1850,13 +1660,13 @@ uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sa
   //   also need to add restart logic.)
   // * covars_cov_major must now have 16-byte aligned rows.
   // Returns number of regression failures.
-  uintptr_t param_cta4 = (param_ct + 3) & (~3);
+  uintptr_t param_cta4 = round_up_pow2(param_ct, 4);
   uintptr_t param_ct_p1 = param_ct + 1;
   uintptr_t param_ct_msi = param_ct - skip_intercept;
   uintptr_t joint_test_requested = (constraints_con_major? 1 : 0);
   uintptr_t param_ctx = param_ct + joint_test_requested;
   uintptr_t param_ctx_msi = param_ctx - skip_intercept;
-  uintptr_t sample_validx_ctv2 = 2 * ((sample_valid_ct + missing_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_validx_ctv = BITCT_TO_ALIGNED_WORDCT(sample_valid_ct + missing_ct);
   uintptr_t perm_fail_ct = 0;
   uintptr_t cur_word = 0;
   uintptr_t perm_idx;
@@ -1868,15 +1678,12 @@ uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sa
   float* fptr2;
   double dxx;
   float fxx;
-  fill_ulong_zero(perm_fails, (cur_batch_size + (BITCT - 1)) / BITCT);
+  fill_ulong_zero(perm_fails, BITCT_TO_WORDCT(cur_batch_size));
   for (perm_idx = 0; perm_idx < cur_batch_size; perm_idx++) {
     fptr = pheno_buf;
     if (!missing_ct) {
       for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
-	// strictly speaking, we can use 1 bit per permutation instead of 2
-	// bits here, but the gain is probably too small to justify even adding
-	// a parameter to generate_cc_[cluster_]perm_vec.
-	*fptr++ = (float)((int32_t)is_set_ul(perm_vecs, sample_idx * 2));
+	*fptr++ = (float)((int32_t)is_set_ul(perm_vecs, sample_idx));
       }
     } else {
       for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_valid_ct; sample_uidx++, sample_idx++) {
@@ -1892,7 +1699,7 @@ uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sa
 	  }
 	  sample_uidx++;
 	}
-        *fptr++ = (float)((int32_t)is_set_ul(perm_vecs, sample_uidx * 2));
+        *fptr++ = (float)((int32_t)is_set_ul(perm_vecs, sample_uidx));
       }
     }
     if (logistic_regression(sample_valid_ct, param_ct, sample_1d_buf, param_2d_buf, param_1d_buf, param_2d_buf2, param_1d_buf2, covars_cov_major, pheno_buf, coef, pp)) {
@@ -1956,14 +1763,14 @@ uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sa
     if (0) {
     glm_logistic_fail:
       fill_float_zero(&(logistic_results[perm_idx * param_ctx_msi]), param_ct_msi);
-      SET_BIT(perm_fails, perm_idx);
+      SET_BIT(perm_idx, perm_fails);
       perm_fail_ct++;
       if (joint_test_requested) {
         logistic_results[perm_idx * param_ctx_msi + param_ct_msi] = -9;
       }
     }
     coef = &(coef[param_cta4]);
-    perm_vecs = &(perm_vecs[sample_validx_ctv2]);
+    perm_vecs = &(perm_vecs[sample_validx_ctv]);
   }
   return perm_fail_ct;
 }
@@ -1990,7 +1797,7 @@ uint32_t glm_fill_design(uintptr_t* loadbuf_collapsed, double* fixed_covars_cov_
   double dzz;
   // don't need to recompute this during permutations, but it's so cheap that
   // it hardly matters
-  missing_ct = count_01(loadbuf_collapsed, (sample_valid_ct + BITCT2 - 1) / BITCT2);
+  missing_ct = count_01(loadbuf_collapsed, QUATERCT_TO_WORDCT(sample_valid_ct));
   if (missing_ct >= sample_valid_ct - 1) {
     // regression will be skipped in this case
     return missing_ct;
@@ -2408,13 +2215,13 @@ uint32_t glm_fill_design_float(uintptr_t* loadbuf_collapsed, float* fixed_covars
   uint32_t align_skip;
   // don't need to recompute this during permutations, but it's so cheap that
   // it hardly matters
-  missing_ct = count_01(loadbuf_collapsed, (sample_valid_ct + BITCT2 - 1) / BITCT2);
+  missing_ct = count_01(loadbuf_collapsed, QUATERCT_TO_WORDCT(sample_valid_ct));
   if (missing_ct >= sample_valid_ct - 1) {
     // regression will be skipped in this case
     return missing_ct;
   }
   cur_sample_valid_ct = sample_valid_ct - missing_ct;
-  cur_sample_valid_cta4 = (cur_sample_valid_ct + 3) & (~3);
+  cur_sample_valid_cta4 = round_up_pow2(cur_sample_valid_ct, 4);
   align_skip = cur_sample_valid_cta4 - cur_sample_valid_ct;
   for (sample_idx = 0; sample_idx < cur_sample_valid_ct; sample_idx++) {
     *fptr++ = 1;
@@ -2882,13 +2689,13 @@ static double* g_constraints_con_major;
 static uint32_t g_perm_batch_max;
 static float* g_fixed_covars_cov_major_f;
 
-const char glm_main_effects[] = "REC\0DOM\0HOM\0ADD";
+static const char glm_main_effects[] = "REC\0DOM\0HOM\0ADD";
 
 #ifndef NOLAPACK
 THREAD_RET_TYPE glm_linear_adapt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   // unlike the other permutation loops, g_perms_done is not preincremented
   // here
@@ -3073,8 +2880,8 @@ THREAD_RET_TYPE glm_linear_adapt_thread(void* arg) {
 
 THREAD_RET_TYPE glm_logistic_adapt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t pidx_offset = g_perms_done;
   uintptr_t marker_blocks = g_block_diff / CACHELINE_INT32;
@@ -3094,7 +2901,7 @@ THREAD_RET_TYPE glm_logistic_adapt_thread(void* arg) {
   double adaptive_ci_zt = g_adaptive_ci_zt;
   double aperm_alpha = g_aperm_alpha;
   uintptr_t cur_param_ct = g_cur_param_ct;
-  uintptr_t cur_param_cta4 = (cur_param_ct + 3) & (~3);
+  uintptr_t cur_param_cta4 = round_up_pow2(cur_param_ct, 4);
   uintptr_t cur_constraint_ct = g_cur_constraint_ct;
   uint32_t coding_flags = g_coding_flags;
   uint32_t glm_xchr_model = g_glm_xchr_model;
@@ -3223,8 +3030,8 @@ THREAD_RET_TYPE glm_linear_maxt_thread(void* arg) {
   // todo: either switch to spawn_threads2 interface, or document why that
   // isn't a good idea
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t pidx_offset = g_perms_done;
   uintptr_t marker_blocks = g_block_diff / CACHELINE_INT32;
@@ -3233,7 +3040,7 @@ THREAD_RET_TYPE glm_linear_maxt_thread(void* arg) {
   uintptr_t* loadbuf = g_loadbuf;
   uint32_t* adapt_m_table = &(g_adapt_m_table[marker_bidx]);
   double* perm_pmajor = g_perm_pmajor;
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
   uint32_t* __restrict__ perm_fail_cts = g_perm_attempt_ct;
@@ -3398,8 +3205,8 @@ THREAD_RET_TYPE glm_linear_maxt_thread(void* arg) {
 
 THREAD_RET_TYPE glm_logistic_maxt_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uint32_t pidx_offset = g_perms_done;
   uintptr_t marker_blocks = g_block_diff / CACHELINE_INT32;
@@ -3408,7 +3215,7 @@ THREAD_RET_TYPE glm_logistic_maxt_thread(void* arg) {
   uintptr_t* loadbuf = g_loadbuf;
   uint32_t* adapt_m_table = &(g_adapt_m_table[marker_bidx]);
   uintptr_t* perm_vecs = g_perm_vecs;
-  uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+  uintptr_t perm_vec_ctcl8m = round_up_pow2(perm_vec_ct, CACHELINE_DBL);
   double* __restrict__ results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
   unsigned char* __restrict__ perm_adapt_stop = g_perm_adapt_stop;
   uint32_t* __restrict__ perm_fail_cts = g_perm_attempt_ct;
@@ -3416,7 +3223,7 @@ THREAD_RET_TYPE glm_logistic_maxt_thread(void* arg) {
   uintptr_t* joint_test_params = g_joint_test_params;
   double* __restrict__ orig_stats = g_orig_stats;
   uintptr_t cur_param_ct = g_cur_param_ct;
-  uintptr_t cur_param_cta4 = (cur_param_ct + 3) & (~3);
+  uintptr_t cur_param_cta4 = round_up_pow2(cur_param_ct, 4);
   uintptr_t cur_constraint_ct = g_cur_constraint_ct;
   uint32_t coding_flags = g_coding_flags;
   uint32_t glm_xchr_model = g_glm_xchr_model;
@@ -3537,8 +3344,8 @@ THREAD_RET_TYPE glm_logistic_maxt_thread(void* arg) {
 THREAD_RET_TYPE glm_linear_set_thread(void* arg) {
   // Simplified version of what glm_linear_maxt_thread() does.
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uintptr_t marker_blocks = g_block_diff / CACHELINE_INT32;
   uint32_t marker_bidx = CACHELINE_INT32 * ((((uint64_t)tidx) * marker_blocks) / g_assoc_thread_ct);
@@ -3645,8 +3452,8 @@ THREAD_RET_TYPE glm_linear_set_thread(void* arg) {
 THREAD_RET_TYPE glm_logistic_set_thread(void* arg) {
   // Simplified version of what glm_logistic_maxt_thread() does.
   uintptr_t tidx = (uintptr_t)arg;
-  uintptr_t sample_valid_ct = g_pheno_nm_ct;
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + (BITCT - 1)) / BITCT);
+  uintptr_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   uintptr_t perm_vec_ct = g_perm_vec_ct;
   uintptr_t marker_blocks = g_block_diff / CACHELINE_INT32;
   uint32_t marker_bidx = CACHELINE_INT32 * ((((uint64_t)tidx) * marker_blocks) / g_assoc_thread_ct);
@@ -3655,7 +3462,7 @@ THREAD_RET_TYPE glm_logistic_set_thread(void* arg) {
   uintptr_t* perm_vecs = g_perm_vecs;
   uintptr_t cur_param_ct = g_cur_param_ct;
   uintptr_t param_ct_m1 = cur_param_ct - 1;
-  uintptr_t cur_param_cta4 = (cur_param_ct + 3) & (~3);
+  uintptr_t cur_param_cta4 = round_up_pow2(cur_param_ct, 4);
   uint32_t coding_flags = g_coding_flags;
   uint32_t glm_xchr_model = g_glm_xchr_model;
   uintptr_t condition_list_start_idx = g_condition_list_start_idx;
@@ -3711,10 +3518,10 @@ THREAD_RET_TYPE glm_logistic_set_thread(void* arg) {
 }
 
 int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifier, uint32_t standard_beta, uint32_t glm_xchr_model, Range_list* parameters_range_list_ptr, Range_list* tests_range_list_ptr, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, char* condition_mname, char* condition_fname, Chrom_info* chrom_info_ptr, ui [...]
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + BITCT - 1) / BITCT;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctv2 = 2 * unfiltered_sample_ctl;
   uintptr_t sample_uidx = 0;
-  uintptr_t topsize = 0;
   uintptr_t max_param_name_len = 2;
   uintptr_t np_base_raw = 2; // intercept, additive effect
   uintptr_t np_diploid_raw = 0; // genotypic, hethom
@@ -3763,11 +3570,11 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
   uint32_t ujj;
   g_joint_test_params = NULL;
   if (max_marker_allele_len > MAXLINELEN) {
-    if (wkspace_alloc_c_checked(writebuf_ptr, max_marker_allele_len + MAXLINELEN)) {
+    if (bigstack_alloc_c(max_marker_allele_len + MAXLINELEN, writebuf_ptr)) {
       goto glm_common_init_ret_NOMEM;
     }
   } else {
-    *writebuf_ptr = tbuf;
+    *writebuf_ptr = g_textbuf;
   }
   g_standard_beta = standard_beta;
   g_coding_flags = glm_modifier & (GLM_HETHOM | GLM_DOMINANT | GLM_RECESSIVE);
@@ -3796,7 +3603,7 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
   if (glm_init_load_mask(sample_exclude, pheno_nm, covar_nm, sample_ct, unfiltered_sample_ctv2, &load_mask)) {
     goto glm_common_init_ret_NOMEM;
   }
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw)) {
     goto glm_common_init_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctv2 - 2] = 0;
@@ -3806,27 +3613,22 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
     // temporary allocation of unfiltered sample_include2 and
     // sample_male_include2 for glm_scan_conditions()
     if (hh_or_mt_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
-      sample_include2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      if (!sample_include2) {
+      if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &sample_include2)) {
 	goto glm_common_init_ret_NOMEM;
       }
-      fill_vec_55(sample_include2, unfiltered_sample_ct);
+      fill_quatervec_55(unfiltered_sample_ct, sample_include2);
     }
     if (hh_or_mt_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
-      sample_male_include2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      if (!sample_male_include2) {
+      if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &sample_male_include2)) {
         goto glm_common_init_ret_NOMEM;
       }
-      fill_ulong_zero(sample_male_include2, unfiltered_sample_ctv2);
-      vec_include_init(unfiltered_sample_ct, sample_male_include2, sex_male);
+      init_quaterarr_from_bitarr(sex_male, unfiltered_sample_ct, sample_male_include2);
     }
-    wkspace_left -= topsize;
     retval = glm_scan_conditions(condition_mname, condition_fname, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, chrom_info_ptr, hh_or_mt_exists, loadbuf_raw, bedfile, bed_offset, unfiltered_sample_ct, sex_male, load_mask, &sample_valid_ct, &condition_ct, &condition_uidxs, sample_include2, sample_male_include2);
-    wkspace_left += topsize;
     if (retval) {
       goto glm_common_init_ret_1;
     }
-    // topsize = 0;
+    bigstack_end_reset(bigstack_end_mark);
 
     // need to set to null for next alloc_collapsed_haploid_filters() call to
     // work properly
@@ -3868,7 +3670,7 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
 	if (covar_interactions) {
 	  np_diploid_raw = genotypic_or_hethom;
 	}
-        bitfield_and(load_mask, sex_nm, unfiltered_sample_ctl);
+        bitvec_and(sex_nm, unfiltered_sample_ctl, load_mask);
         sample_valid_ct = popcount_longs(load_mask, unfiltered_sample_ctl);
       } else {
 	np_sex_raw = 1;
@@ -3901,22 +3703,22 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
     }
   }
 
-  sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, load_mask, sex_male, &sample_include2, &sample_male_include2)) {
+  sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
+  if (alloc_collapsed_haploid_filters(load_mask, sex_male, unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, &sample_include2, &sample_male_include2)) {
     goto glm_common_init_ret_NOMEM;
   }
-  if (wkspace_alloc_ul_checked(&g_loadbuf, GLM_BLOCKSIZE * sample_valid_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&sex_male_collapsed, sample_valid_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(GLM_BLOCKSIZE * sample_valid_ctv2, &g_loadbuf) ||
+      bigstack_alloc_ul(sample_valid_ctv2, &sex_male_collapsed)) {
     goto glm_common_init_ret_NOMEM;
   }
   for (uii = 1; uii <= GLM_BLOCKSIZE; uii++) {
     g_loadbuf[uii * sample_valid_ctv2 - 2] = 0;
     g_loadbuf[uii * sample_valid_ctv2 - 1] = 0;
   }
-  collapse_copy_bitarr_incl(unfiltered_sample_ct, sex_male, load_mask, sample_valid_ct, sex_male_collapsed);
+  copy_bitarr_subset(sex_male, load_mask, unfiltered_sample_ct, sample_valid_ct, sex_male_collapsed);
   param_raw_ct_max = np_base_raw + np_diploid_raw + np_sex_raw;
-  param_raw_ctl = (param_raw_ct_max + BITCT - 1) / BITCT;
-  if (wkspace_alloc_ul_checked(&active_params, param_raw_ctl * sizeof(intptr_t))) {
+  param_raw_ctl = BITCT_TO_WORDCT(param_raw_ct_max);
+  if (bigstack_alloc_ul(param_raw_ctl, &active_params)) {
     goto glm_common_init_ret_NOMEM;
   }
   condition_list_start_idx = 2 + genotypic_or_hethom;
@@ -3940,7 +3742,7 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
   if (parameters_range_list_ptr->name_ct) {
     fill_ulong_zero(active_params, param_raw_ctl);
     active_params[0] = 1;
-    numeric_range_list_to_bitfield(parameters_range_list_ptr, param_raw_ct_max, active_params, 0, 1);
+    numeric_range_list_to_bitarr(parameters_range_list_ptr, param_raw_ct_max, 0, 1, active_params);
     if ((!(active_params[0] & 2)) && ((!np_diploid_raw) || (active_params[0] & 4)) && ((!covar_interactions) || ((!popcount_bit_idx(active_params, interaction_start_idx, sex_start_idx)) && ((!variation_in_sex) || (!popcount_bit_idx(active_params, sex_start_idx + 1, param_raw_ct_max)))))) {
       // force the user to explicitly use no-snp if that's their intention
       logerrprint("Error: --parameters must retain at least one dosage-dependent variable.  To\nperform one-off regression(s), use the --linear 'no-snp' modifier instead.\n");
@@ -3964,7 +3766,7 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
       }
     }
   } else {
-    fill_all_bits(active_params, param_raw_ct_max);
+    fill_all_bits(param_raw_ct_max, active_params);
     param_ct_max = param_raw_ct_max;
     np_base = np_base_raw;
     np_diploid = np_diploid_raw;
@@ -4102,8 +3904,8 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
     }
   }
   param_ctx_max = param_ct_max;
-  param_ctl_max = (param_ct_max + BITCT - 1) / BITCT;
-  if (wkspace_alloc_ul_checked(&haploid_params, param_ctl_max * sizeof(intptr_t))) {
+  param_ctl_max = BITCT_TO_WORDCT(param_ct_max);
+  if (bigstack_alloc_ul(param_ctl_max, &haploid_params)) {
     goto glm_common_init_ret_NOMEM;
   }
   g_haploid_params = haploid_params;
@@ -4113,24 +3915,24 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
     for (uii = 0, param_idx = 0; uii < ujj; uii++, param_idx++) {
       next_set_unsafe_ck(active_params, &uii);
       if ((uii != 2) && ((uii < interaction_start_idx) || (!((uii - interaction_start_idx) & 1)))) {
-        SET_BIT(haploid_params, param_idx);
+        SET_BIT(param_idx, haploid_params);
       }
     }
   } else {
-    fill_all_bits(haploid_params, param_ct_max - np_sex);
+    fill_all_bits(param_ct_max - np_sex, haploid_params);
   }
   uii = 0;
   if ((genotypic_or_hethom && ((active_params[0] & 6) == 6)) || tests_range_list_ptr->name_ct || (glm_modifier & GLM_TEST_ALL)) {
-    if (wkspace_alloc_ul_checked(&g_joint_test_params, param_ctl_max * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(param_ctl_max, &g_joint_test_params)) {
       goto glm_common_init_ret_NOMEM;
     }
     fill_ulong_zero(g_joint_test_params, param_ctl_max);
     if (tests_range_list_ptr->name_ct) {
-      numeric_range_list_to_bitfield(tests_range_list_ptr, param_ct_max - 1, g_joint_test_params, 1, 1);
+      numeric_range_list_to_bitarr(tests_range_list_ptr, param_ct_max - 1, 1, 1, g_joint_test_params);
       constraint_ct_max = popcount_longs(g_joint_test_params, param_ctl_max);
     } else if (glm_modifier & GLM_TEST_ALL) {
       constraint_ct_max = param_ct_max - 1;
-      fill_bits(g_joint_test_params, 0, constraint_ct_max);
+      fill_bits(0, constraint_ct_max, g_joint_test_params);
     } else {
       // genotypic/hethom, neither of first two terms excluded by --parameters,
       // no --tests
@@ -4152,7 +3954,7 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
       }
       param_ctx_max++;
     } else {
-      wkspace_reset(g_joint_test_params);
+      bigstack_reset(g_joint_test_params);
       g_joint_test_params = NULL;
       constraint_ct_max = 0;
       logerrprint("Warning: Ignoring --tests since too few parameter indices are in range.\n");
@@ -4167,8 +3969,8 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
     goto glm_common_init_ret_INVALID_CMDLINE;
   }
 
-  g_cluster_ct = 0;
-  g_pheno_nm_ct = sample_valid_ct;
+  g_perm_cluster_ct = 0;
+  g_perm_pheno_nm_ct = sample_valid_ct;
   g_perms_done = 0;
   g_mperm_save_all = NULL;
   if ((!do_perms) || is_set_test) {
@@ -4210,6 +4012,7 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
     break;
   }
  glm_common_init_ret_1:
+  bigstack_end_reset(bigstack_end_mark);
   return retval;
 }
 
@@ -4221,7 +4024,7 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
   // We could apply a similar procedure to xdf joint test p-values, but I'll
   // refrain until/unless methods developers say that's actually a worthwhile
   // procedure.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t cur_param_ct = 0;
   uintptr_t* marker_exclude = marker_exclude_mid;
@@ -4238,13 +4041,13 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
   uintptr_t marker_ct = marker_ct_mid;
   uintptr_t set_ct = 0;
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   double adaptive_ci_zt = 0.0;
   uint32_t max_thread_ct = g_thread_ct;
   uint32_t perm_count = glm_modifier & GLM_PERM_COUNT;
   uint32_t perms_done = 0;
   int32_t retval = 0;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uintptr_t* set_incl;
   uintptr_t* loadbuf_ptr;
   double* orig_set_scores;
@@ -4296,7 +4099,7 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
   if (!set_ct) {
     goto glm_linear_assoc_set_test_write;
   }
-  marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  marker_ctl = BITCT_TO_WORDCT(marker_ct);
   if (marker_ct_mid != marker_ct) {
     inplace_delta_collapse_arr((char*)tcnt, sizeof(int32_t), marker_ct_mid, marker_ct, marker_exclude_mid, marker_exclude);
     inplace_delta_collapse_bitfield(regression_skip, marker_ct, marker_exclude_mid, marker_exclude);
@@ -4315,14 +4118,14 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
   if (max_thread_ct > perms_total) {
     max_thread_ct = perms_total;
   }
-  if (wkspace_init_sfmtp(max_thread_ct)) {
+  if (bigstack_init_sfmtp(max_thread_ct)) {
     goto glm_linear_assoc_set_test_ret_NOMEM;
   }
 
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
  glm_linear_assoc_set_test_more_perms:
-  bitfield_and(regression_skip, unstopped_markers, marker_ctl);
-  bitfield_andnot(unstopped_markers, regression_skip, marker_ctl);
+  bitvec_and(unstopped_markers, marker_ctl, regression_skip);
+  bitvec_andnot(regression_skip, marker_ctl, unstopped_markers);
   skip_ct = popcount_longs(regression_skip, marker_ctl);
   marker_unstopped_ct = popcount_longs(unstopped_markers, marker_ctl);
 
@@ -4339,28 +4142,25 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
   }
   g_perm_vec_ct = perm_vec_ct;
   if (perm_vec_ct >= CACHELINE_INT32 * max_thread_ct) {
-    g_assoc_thread_ct = max_thread_ct;
+    g_perm_generation_thread_ct = max_thread_ct;
   } else {
-    g_assoc_thread_ct = perm_vec_ct / CACHELINE_INT32;
-    if (!g_assoc_thread_ct) {
-      g_assoc_thread_ct = 1;
-    }
+    g_perm_generation_thread_ct = MAXV(perm_vec_ct / CACHELINE_INT32, 1);
   }
   ulii = 0;
-  if (!g_cluster_ct) {
-    if (spawn_threads(threads, &linear_gen_perms_thread, g_assoc_thread_ct)) {
+  if (!g_perm_cluster_ct) {
+    if (spawn_threads(threads, &generate_qt_perms_pmajor_thread, g_perm_generation_thread_ct)) {
       goto glm_linear_assoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    linear_gen_perms_thread((void*)ulii);
+    generate_qt_perms_pmajor_thread((void*)ulii);
   } else {
-    if (spawn_threads(threads, &linear_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+    if (spawn_threads(threads, &generate_qt_cluster_perms_pmajor_thread, g_perm_generation_thread_ct)) {
       goto glm_linear_assoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    linear_gen_cluster_perms_thread((void*)ulii);
+    generate_qt_cluster_perms_pmajor_thread((void*)ulii);
   }
-  join_threads(threads, g_assoc_thread_ct);
-  if (wkspace_alloc_d_checked(&g_mperm_save_all, MODEL_BLOCKSIZE * perm_vec_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&chisq_pmajor, marker_ct * perm_vec_ct * sizeof(double))) {
+  join_threads(threads, g_perm_generation_thread_ct);
+  if (bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all) ||
+      bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor)) {
     goto glm_linear_assoc_set_test_ret_NOMEM;
   }
   for (pidx = 0; pidx < perm_vec_ct; pidx++) {
@@ -4416,7 +4216,7 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
 	}
       }
       loadbuf_ptr = &(loadbuf[block_size * sample_valid_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto glm_linear_assoc_set_test_ret_READ_FAIL;
       }
       if (g_min_ploidy_1 && hh_or_mt_exists) {
@@ -4483,7 +4283,7 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
   } while (marker_idx < marker_unstopped_ct);
   perms_done += perm_vec_ct;
   compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
-  wkspace_reset(wkspace_mark2);
+  bigstack_reset(bigstack_mark2);
   if (perms_done < perms_total) {
     if (glm_modifier & GLM_PERM) {
       if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
@@ -4522,15 +4322,15 @@ int32_t glm_linear_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t b
     break;
   }
  glm_linear_assoc_set_test_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t glm_modifier, double glm_vif_thresh, uint32_t glm_xchr_model, uint32_t glm_mperm_val, Range_list* parameters_range_list_ptr, Range_list* tests_range_list_ptr, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char*  [...]
   // todo: investigate pre-orthogonalization of covariates
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   FILE* outfile = NULL;
   FILE* outfile_msa = NULL;
   uintptr_t marker_ct = marker_ct_orig;
@@ -4672,11 +4472,11 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     hh_or_mt_exists |= NXMHH_EXISTS;
   }
   if (is_set_test) {
-    if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
     memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_and(founder_pnm, founder_info, unfiltered_sample_ctl);
+    bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
     if (extract_set_union_unfiltered(sip, NULL, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
@@ -4701,28 +4501,25 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     }
     goto glm_linear_assoc_ret_1;
   }
-  sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
+  sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   final_mask = get_final_mask(sample_valid_ct);
   param_ctx_max_m1 = param_ctx_max - 1;
-  if (wkspace_alloc_d_checked(&g_orig_stats, marker_initial_ct * sizeof(double)) ||
-      wkspace_alloc_c_checked(&param_names, param_ctx_max * max_param_name_len) ||
-      wkspace_alloc_d_checked(&g_fixed_covars_cov_major, (variation_in_sex + interaction_start_idx - condition_list_start_idx) * sample_valid_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&g_nm_cts, marker_initial_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_d(marker_initial_ct, &g_orig_stats) ||
+      bigstack_alloc_c(param_ctx_max * max_param_name_len, &param_names) ||
+      bigstack_alloc_d((variation_in_sex + interaction_start_idx - condition_list_start_idx) * sample_valid_ct, &g_fixed_covars_cov_major) ||
+      bigstack_alloc_ui(marker_initial_ct, &g_nm_cts)) {
     goto glm_linear_assoc_ret_NOMEM;
   }
   if (!is_set_test) {
-    if (wkspace_alloc_uc_checked(&g_perm_adapt_stop, marker_initial_ct)) {
+    // use this array to track regression failures even in max(T) case
+    if (bigstack_calloc_uc(round_up_pow2(marker_initial_ct, BYTECT), &g_perm_adapt_stop)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
-    // use this array to track regression failures even in max(T) case
-    fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
   } else {
     g_perm_adapt_stop = NULL;
-    ulii = (marker_initial_ct + (BITCT - 1)) / BITCT;
-    if (wkspace_alloc_ul_checked(&regression_skip, ulii * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(BITCT_TO_WORDCT(marker_initial_ct), &regression_skip)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
-    fill_ulong_zero(regression_skip, ulii);
   }
 
   param_idx = 1;
@@ -4768,7 +4565,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
       goto glm_linear_assoc_ret_READ_FAIL;
     }
-    if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, g_loadbuf, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+    if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, g_loadbuf)) {
       goto glm_linear_assoc_ret_READ_FAIL;
     }
     chrom_idx = get_marker_chrom(chrom_info_ptr, marker_uidx);
@@ -4819,7 +4616,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       if (is_set(active_params, ujj++)) {
         wptr = memcpyl3a(&(param_names[param_idx * max_param_name_len]), main_effect);
 	wptr = memcpya(wptr, "xCSNP", 5);
-        uint32_writex(wptr, uii + 1, '\0');
+        uint32toa_x(uii + 1, '\0', wptr);
 	param_idx++;
       }
       if (genotypic_or_hethom) {
@@ -4830,7 +4627,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	  } else {
 	    wptr = memcpya(wptr, "DOMDEVxCSNP", 11);
 	  }
-	  uint32_writex(wptr, uii + 1, '\0');
+	  uint32toa_x(uii + 1, '\0', wptr);
 	  param_idx++;
 	}
       }
@@ -4861,7 +4658,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     fill_double_zero(dptr, sample_valid_ct);
     sample_idx = 0;
     while (1) {
-      next_set_ul_ck(sex_male_collapsed, &sample_idx, sample_valid_ct);
+      next_set_ul_ck(sex_male_collapsed, sample_valid_ct, &sample_idx);
       if (sample_idx == sample_valid_ct) {
 	break;
       }
@@ -4893,7 +4690,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     }
   }
   if (genotypic_or_hethom) {
-    if (wkspace_alloc_c_checked(&haploid_param_names, np_base * max_param_name_len)) {
+    if (bigstack_alloc_c(np_base * max_param_name_len, &haploid_param_names)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
     uii = 1;
@@ -4904,7 +4701,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   }
 
   if (constraint_ct_max) {
-    if (wkspace_alloc_d_checked(&constraints_con_major, constraint_ct_max * param_ct_max * sizeof(double))) {
+    if (bigstack_alloc_d(constraint_ct_max * param_ct_max, &constraints_con_major)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
     // special case: df may vary between chromosomes, so refill suffix at
@@ -4927,32 +4724,28 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (fill_orig_chiabs) {
     if (mtest_adjust || is_set_test) {
       if (constraint_ct_max) {
-	if (wkspace_alloc_d_checked(&orig_pvals, marker_initial_ct * sizeof(double))) {
+	if (bigstack_alloc_d(marker_initial_ct, &orig_pvals)) {
 	  goto glm_linear_assoc_ret_NOMEM;
 	}
       }
       if ((!constraint_ct_max) || is_set_test) {
-	if (wkspace_alloc_ui_checked(&tcnt, marker_initial_ct * sizeof(int32_t))) {
+	if (bigstack_calloc_ui(marker_initial_ct, &tcnt)) {
 	  goto glm_linear_assoc_ret_NOMEM;
 	}
-	fill_uint_zero(tcnt, marker_initial_ct);
       }
       if (!is_set_test) {
-	if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_initial_ct * sizeof(int32_t))) {
+	if (bigstack_alloc_ui(marker_initial_ct, &marker_idx_to_uidx)) {
 	  goto glm_linear_assoc_ret_NOMEM;
 	}
       }
     }
     if (do_perms_nst) {
-      if (wkspace_alloc_ui_checked(&g_perm_2success_ct, marker_initial_ct * sizeof(int32_t)) ||
-	  wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_initial_ct * sizeof(int32_t))) {
+      // need this for max(T) now since we need to track permutation failures
+      // bugfix: g_perm_2success_ct was uninitialized.
+      if (bigstack_calloc_ui(marker_initial_ct, &g_perm_2success_ct) ||
+	  bigstack_calloc_ui(marker_initial_ct, &g_perm_attempt_ct)) {
 	goto glm_linear_assoc_ret_NOMEM;
       }
-      // need this for max(T) now since we need to track permutation failures
-      // bugfix: g_perm_2success_ct was uninitialized.  add a
-      // wkspace_calloc_...() idiom to reduce the frequency of that mistake?
-      fill_uint_zero(g_perm_2success_ct, marker_initial_ct);
-      fill_uint_zero(g_perm_attempt_ct, marker_initial_ct);
       perms_total = perm_adapt_nst? apip->max : glm_mperm_val;
       if (perms_total < orig_perm_batch_size) {
 	orig_perm_batch_size = perms_total;
@@ -4968,18 +4761,16 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	max_thread_ct = uii;
       }
       if (!perm_adapt_nst) {
-	ulii = CACHEALIGN32_DBL(perm_batch_size);
-        if (wkspace_alloc_d_checked(&g_maxt_thread_results, ulii * max_thread_ct * sizeof(double)),
-            wkspace_alloc_d_checked(&g_maxt_extreme_stat, perms_total * sizeof(double))) {
+        if (bigstack_alloc_d(round_up_pow2(perm_batch_size, CACHELINE_DBL) * max_thread_ct, &g_maxt_thread_results) ||
+            bigstack_calloc_d(perms_total, &g_maxt_extreme_stat)) {
           goto glm_linear_assoc_ret_NOMEM;
 	}
-	fill_double_zero(g_maxt_extreme_stat, perms_total);
 	if (mperm_save_all) {
-	  if (wkspace_alloc_d_checked(&g_mperm_save_all, marker_initial_ct * perm_batch_size * sizeof(double))) {
+	  if (bigstack_alloc_d(((uintptr_t)marker_initial_ct) * perm_batch_size, &g_mperm_save_all)) {
 	    goto glm_linear_assoc_ret_NOMEM;
 	  }
 	  memcpy(outname_end, ".mperm.dump.all", 16);
-	  if (fopen_checked(&outfile_msa, outname, "w")) {
+	  if (fopen_checked(outname, "w", &outfile_msa)) {
 	    goto glm_linear_assoc_ret_OPEN_FAIL;
 	  }
 	  if (putc_checked('0', outfile_msa)) {
@@ -4994,59 +4785,59 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (do_perms) {
     if (cluster_starts) {
       // Pointless to include size-1 clusters in permutation.
-      retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, NULL, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, NULL, NULL);
+      retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, NULL, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, NULL, NULL);
       if (retval) {
 	goto glm_linear_assoc_ret_1;
       }
-      if (!g_cluster_ct) {
+      if (!g_perm_cluster_ct) {
 	goto glm_linear_assoc_ret_NO_PERMUTATION_CLUSTERS;
       }
-      if (wkspace_alloc_ui_checked(&g_qassoc_cluster_thread_wkspace, max_thread_ct * ((g_cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32) * CACHELINE) ||
-          wkspace_alloc_ui_checked(&g_sample_to_cluster, sample_valid_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_ui(max_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace) ||
+          bigstack_alloc_ui(sample_valid_ct, &g_perm_sample_to_cluster)) {
 	goto glm_linear_assoc_ret_NOMEM;
       }
-      fill_unfiltered_sample_to_cluster(sample_valid_ct, g_cluster_ct, g_cluster_map, g_cluster_starts, g_sample_to_cluster);
+      fill_unfiltered_sample_to_cluster(sample_valid_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
     }
     if (!is_set_test) {
-      if (wkspace_init_sfmtp(max_thread_ct)) {
+      if (bigstack_init_sfmtp(max_thread_ct)) {
 	goto glm_linear_assoc_ret_NOMEM;
       }
     }
   } else {
     orig_perm_batch_size = 1;
   }
-  if (wkspace_alloc_d_checked(&g_pheno_d2, sample_valid_ct * sizeof(double))) {
+  if (bigstack_alloc_d(sample_valid_ct, &g_perm_pheno_d2)) {
     goto glm_linear_assoc_ret_NOMEM;
   }
-  g_linear_mt = (Linear_multithread*)wkspace_alloc(max_thread_ct * sizeof(Linear_multithread));
+  g_linear_mt = (Linear_multithread*)bigstack_alloc(max_thread_ct * sizeof(Linear_multithread));
   if (!g_linear_mt) {
     goto glm_linear_assoc_ret_NOMEM;
   }
-  ulii = (orig_perm_batch_size + (BITCT - 1)) / BITCT;
+  ulii = BITCT_TO_WORDCT(orig_perm_batch_size);
   for (tidx = 0; tidx < max_thread_ct; tidx++) {
-    if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].cur_covars_cov_major), param_ct_max * sample_valid_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&(g_linear_mt[tidx].cur_covars_sample_major), param_ct_max * sample_valid_ct * sizeof(double)) ||
-        wkspace_alloc_ul_checked(&(g_linear_mt[tidx].perm_fails), ulii * sizeof(intptr_t)) ||
-        wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_2d_buf), param_ct_max * param_ct_max * sizeof(double)) ||
-        wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_2d_buf2), param_ct_max * param_ct_max * sizeof(double)) ||
-        wkspace_alloc_d_checked(&(g_linear_mt[tidx].regression_results), orig_perm_batch_size * param_ctx_max_m1 * sizeof(double))) {
+    if (bigstack_alloc_d(param_ct_max * sample_valid_ct, &(g_linear_mt[tidx].cur_covars_cov_major)) ||
+        bigstack_alloc_d(param_ct_max * sample_valid_ct, &(g_linear_mt[tidx].cur_covars_sample_major)) ||
+        bigstack_alloc_ul(ulii, &(g_linear_mt[tidx].perm_fails)) ||
+        bigstack_alloc_d(param_ct_max * param_ct_max, &(g_linear_mt[tidx].param_2d_buf)) ||
+        bigstack_alloc_d(param_ct_max * param_ct_max, &(g_linear_mt[tidx].param_2d_buf2)) ||
+        bigstack_alloc_d(orig_perm_batch_size * param_ctx_max_m1, &(g_linear_mt[tidx].regression_results))) {
       goto glm_linear_assoc_ret_NOMEM;
     }
 
-    g_linear_mt[tidx].mi_buf = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc(param_ct_max * sizeof(MATRIX_INVERT_BUF1_TYPE));
+    g_linear_mt[tidx].mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(param_ct_max * sizeof(MATRIX_INVERT_BUF1_TYPE));
     if (!(g_linear_mt[tidx].mi_buf)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
     if (constraint_ct_max) {
-      if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].df_df_buf), constraint_ct_max * constraint_ct_max * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&(g_linear_mt[tidx].df_buf), constraint_ct_max * sizeof(double)) ||
-          wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_df_buf), constraint_ct_max * param_ct_max * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_df_buf2), constraint_ct_max * param_ct_max * sizeof(double))) {
+      if (bigstack_alloc_d(constraint_ct_max * constraint_ct_max, &(g_linear_mt[tidx].df_df_buf)) ||
+	  bigstack_alloc_d(constraint_ct_max, &(g_linear_mt[tidx].df_buf)) ||
+          bigstack_alloc_d(constraint_ct_max * param_ct_max, &(g_linear_mt[tidx].param_df_buf)) ||
+	  bigstack_alloc_d(constraint_ct_max * param_ct_max, &(g_linear_mt[tidx].param_df_buf2))) {
 	goto glm_linear_assoc_ret_NOMEM;
       }
     }
-    if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].dgels_a), param_ct_max * sample_valid_ct * sizeof(double)) ||
-	wkspace_alloc_d_checked(&(g_linear_mt[tidx].dgels_b), orig_perm_batch_size * sample_valid_ct * sizeof(double))) {
+    if (bigstack_alloc_d(param_ct_max * sample_valid_ct, &(g_linear_mt[tidx].dgels_a)) ||
+	bigstack_alloc_d(orig_perm_batch_size * sample_valid_ct, &(g_linear_mt[tidx].dgels_b))) {
       goto glm_linear_assoc_ret_NOMEM;
     }
     if (!tidx) {
@@ -5065,12 +4856,12 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       }
       g_dgels_lwork = (int32_t)dxx;
     }
-    if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].dgels_work), g_dgels_lwork * sizeof(double))) {
+    if (bigstack_alloc_d(g_dgels_lwork, &(g_linear_mt[tidx].dgels_work))) {
       goto glm_linear_assoc_ret_NOMEM;
     }
   }
 
-  dptr = g_pheno_d2;
+  dptr = g_perm_pheno_d2;
   g_pheno_sum = 0;
   g_pheno_ssq = 0;
   sample_uidx = 0;
@@ -5095,7 +4886,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (standard_beta) {
     dxx = g_pheno_sum / ((double)((intptr_t)sample_valid_ct));
     dyy = sqrt(((double)((intptr_t)(sample_valid_ct - 1))) / (g_pheno_ssq - g_pheno_sum * dxx));
-    dptr = g_pheno_d2;
+    dptr = g_perm_pheno_d2;
     for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
       *dptr = ((*dptr) - dxx) * dyy;
       dptr++;
@@ -5104,21 +4895,21 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     g_pheno_ssq = (double)((intptr_t)(sample_valid_ct - 1));
   }
   if (do_perms) {
-    if (wkspace_alloc_ui_checked(&g_precomputed_mods, (sample_valid_ct - 1) * sizeof(int32_t)) ||
-	wkspace_alloc_d_checked(&g_perm_pmajor, orig_perm_batch_size * sample_valid_ct * sizeof(double))) {
+    if (bigstack_alloc_ui(sample_valid_ct - 1, &g_perm_precomputed_mods) ||
+	bigstack_alloc_d(orig_perm_batch_size * sample_valid_ct, &g_perm_pmajor)) {
       goto glm_linear_assoc_ret_NOMEM;
     }
-    precompute_mods(sample_valid_ct, g_precomputed_mods);
+    precompute_mods(sample_valid_ct, g_perm_precomputed_mods);
   }
 
   outname_end2 = memcpyb(outname_end, ".assoc.linear", 14);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto glm_linear_assoc_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing linear model association results to %s ... ", outname);
   fflush(stdout);
-  sprintf(tbuf, " CHR %%%us         BP   A1       TEST    NMISS       BETA ", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us         BP   A1       TEST    NMISS       BETA ", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   if (display_ci) {
     uii = (uint32_t)((int32_t)(ci_size * 100));
     if (uii >= 10) {
@@ -5167,26 +4958,23 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       g_perm_vec_ct = perms_total - g_perms_done;
     }
     if (g_perm_vec_ct >= CACHELINE_INT32 * max_thread_ct) {
-      g_assoc_thread_ct = max_thread_ct;
+      g_perm_generation_thread_ct = max_thread_ct;
     } else {
-      g_assoc_thread_ct = g_perm_vec_ct / CACHELINE_INT32;
-      if (!g_assoc_thread_ct) {
-	g_assoc_thread_ct = 1;
-      }
+      g_perm_generation_thread_ct = MAXV(g_perm_vec_ct / CACHELINE_INT32, 1);
     }
     ulii = 0;
-    if (!g_cluster_ct) {
-      if (spawn_threads(threads, &linear_gen_perms_thread, g_assoc_thread_ct)) {
+    if (!g_perm_cluster_ct) {
+      if (spawn_threads(threads, &generate_qt_perms_pmajor_thread, g_perm_generation_thread_ct)) {
 	goto glm_linear_assoc_ret_THREAD_CREATE_FAIL;
       }
-      linear_gen_perms_thread((void*)ulii);
+      generate_qt_perms_pmajor_thread((void*)ulii);
     } else {
-      if (spawn_threads(threads, &linear_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_qt_cluster_perms_pmajor_thread, g_perm_generation_thread_ct)) {
 	goto glm_linear_assoc_ret_THREAD_CREATE_FAIL;
       }
-      linear_gen_cluster_perms_thread((void*)ulii);
+      generate_qt_cluster_perms_pmajor_thread((void*)ulii);
     }
-    join_threads(threads, g_assoc_thread_ct);
+    join_threads(threads, g_perm_generation_thread_ct);
   }
   chrom_fo_idx = 0xffffffffU;
   marker_uidx = next_unset_unsafe(marker_exclude, 0);
@@ -5209,7 +4997,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	g_min_ploidy_1 |= uii;
       } while ((!glm_xchr_model) && g_min_ploidy_1);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, writebuf, chrom_name_write(chrom_info_ptr, uii, writebuf));
       *wptr_start++ = ' ';
       fill_double_zero(constraints_con_major, constraint_ct_max * param_ct_max);
       g_male_x_01 = 0;
@@ -5263,7 +5051,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	    constraints_con_major[constraint_idx * cur_param_ct + ulii + 1] = 1;
 	  }
 	}
-        wptr = uint32_write(&(param_names[param_ct_max * max_param_name_len + 5]), cur_constraint_ct);
+        wptr = uint32toa(cur_constraint_ct, &(param_names[param_ct_max * max_param_name_len + 5]));
 	memcpy(wptr, "DF", 3);
       }
     }
@@ -5287,7 +5075,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	}
       }
       loadbuf_ptr = &(g_loadbuf[block_size * sample_valid_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto glm_linear_assoc_ret_READ_FAIL;
       }
       if (g_min_ploidy_1 && hh_or_mt_exists) {
@@ -5332,7 +5120,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	if ((cur_sample_valid_ct > cur_param_ct) && (!glm_check_vif(glm_vif_thresh, cur_param_ct, cur_sample_valid_ct, g_linear_mt[0].cur_covars_cov_major, g_linear_mt[0].param_2d_buf, g_linear_mt[0].mi_buf, g_linear_mt[0].param_2d_buf2))) {
 	  regression_fail = 0;
 	  memcpy(g_linear_mt[0].dgels_a, g_linear_mt[0].cur_covars_cov_major, cur_param_ct * cur_sample_valid_ct * sizeof(double));
-	  copy_when_nonmissing(loadbuf_ptr, (char*)g_pheno_d2, sizeof(double), sample_valid_ct, cur_missing_ct, (char*)(g_linear_mt[0].dgels_b));
+	  copy_when_nonmissing(loadbuf_ptr, (char*)g_perm_pheno_d2, sizeof(double), sample_valid_ct, cur_missing_ct, (char*)(g_linear_mt[0].dgels_b));
 	  if (standard_beta && cur_missing_ct) {
 	    dxx = g_pheno_sum;
 	    dyy = g_pheno_ssq;
@@ -5341,7 +5129,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	      cur_word = *ulptr++;
 	      cur_word = cur_word & (~(cur_word >> 1)) & FIVEMASK;
 	      while (cur_word) {
-		dzz = g_pheno_d2[sample_idx + (CTZLU(cur_word) / 2)];
+		dzz = g_perm_pheno_d2[sample_idx + (CTZLU(cur_word) / 2)];
 		dxx -= dzz;
 		dyy -= dzz * dzz;
 		cur_word &= cur_word - 1;
@@ -5360,22 +5148,21 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	  dgels_ldb = dgels_m;
 
 	  dgels_(&dgels_trans, &dgels_m, &dgels_n, &dgels_nrhs, g_linear_mt[0].dgels_a, &dgels_m, g_linear_mt[0].dgels_b, &dgels_ldb, g_linear_mt[0].dgels_work, &g_dgels_lwork, &dgels_info);
-	  if (glm_linear(1, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, loadbuf_ptr, standard_beta, g_pheno_sum, g_pheno_ssq, g_linear_mt[0].cur_covars_cov_major, g_linear_mt[0].cur_covars_sample_major, g_pheno_d2, g_linear_mt[0].dgels_b, g_linear_mt[0].param_2d_buf, g_linear_mt[0].mi_buf, g_linear_mt[0].param_2d_buf2, g_linear_mt[0].regression_results, cur_constraint_ct, constraints_con_major, g_linear_mt[0].param_df_buf, g_linear_mt[0].param_df_buf2, g_linear_mt[0].df_df_buf, g_linear_ [...]
+	  if (glm_linear(1, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, loadbuf_ptr, standard_beta, g_pheno_sum, g_pheno_ssq, g_linear_mt[0].cur_covars_cov_major, g_linear_mt[0].cur_covars_sample_major, g_perm_pheno_d2, g_linear_mt[0].dgels_b, g_linear_mt[0].param_2d_buf, g_linear_mt[0].mi_buf, g_linear_mt[0].param_2d_buf2, g_linear_mt[0].regression_results, cur_constraint_ct, constraints_con_major, g_linear_mt[0].param_df_buf, g_linear_mt[0].param_df_buf2, g_linear_mt[0].df_df_buf, g_li [...]
 	    regression_fail = 1;
 	    if (is_set_test && is_monomorphic(loadbuf_ptr, sample_valid_ct)) {
-	      set_bit(regression_skip, marker_idx3);
+	      set_bit(marker_idx3, regression_skip);
 	    }
 	  }
 	} else {
 	  regression_fail = 1;
 	  if (is_set_test) {
-	    set_bit(regression_skip, marker_idx3);
+	    set_bit(marker_idx3, regression_skip);
 	  }
 	}
 	wptr_start2 = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start);
 	*wptr_start2++ = ' ';
-	wptr_start2 = uint32_writew10(wptr_start2, marker_pos[marker_uidx2]);
-	*wptr_start2++ = ' ';
+	wptr_start2 = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr_start2);
 	wptr_start2 = fw_strcpy(4, marker_allele_ptrs[marker_uidx2 * 2], wptr_start2);
 	*wptr_start2++ = ' ';
 	orig_stats_ptr = &(g_orig_stats[marker_idx3]);
@@ -5387,7 +5174,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	    pval = calc_tprob(zval, cur_sample_valid_ct - cur_param_ct);
 	    if (param_idx == 1) {
 	      if (mperm_save_all) {
-		double_g_writex(&(numbuf[1]), fabs(zval), '\0');
+		dtoa_gx(fabs(zval), '\0', &(numbuf[1]));
 		fputs(numbuf, outfile_msa);
 	      }
 	      if (!constraint_ct_max) {
@@ -5403,16 +5190,16 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	    if ((param_idx < param_idx_end) && ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0)))) {
 	      wptr = fw_strcpy(10, &(cur_param_names[param_idx * max_param_name_len]), wptr_start2);
 	      *wptr++ = ' ';
-	      wptr = uint32_writew8x(wptr, (uint32_t)cur_sample_valid_ct, ' ');
-	      wptr = double_g_writewx4x(wptr, dxx, 10, ' ');
+	      wptr = uint32toa_w8x((uint32_t)cur_sample_valid_ct, ' ', wptr);
+	      wptr = dtoa_g_wxp4x(dxx, 10, ' ', wptr);
 	      if (display_ci) {
 		dyy = ci_zt * se;
-		wptr = double_g_writewx4x(wptr, se, 8, ' ');
-		wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
-		wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+		wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
+		wptr = dtoa_g_wxp4x(dxx - dyy, 8, ' ', wptr);
+		wptr = dtoa_g_wxp4x(dxx + dyy, 8, ' ', wptr);
 	      }
-	      wptr = double_g_writewx4x(wptr, zval, 12, ' ');
-	      wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
+	      wptr = dtoa_g_wxp4x(zval, 12, ' ', wptr);
+	      wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
 	      if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
 		goto glm_linear_assoc_ret_WRITE_FAIL;
 	      }
@@ -5421,15 +5208,15 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	  if (report_intercept) {
 	    dxx = g_linear_mt[0].dgels_b[0];
 	    wptr = memcpya(wptr_start2, " INTERCEPT ", 11);
-	    wptr = uint32_writew8x(wptr, (uint32_t)cur_sample_valid_ct, ' ');
-	    wptr = double_g_writewx4x(wptr, dxx, 10, ' ');
+	    wptr = uint32toa_w8x((uint32_t)cur_sample_valid_ct, ' ', wptr);
+	    wptr = dtoa_g_wxp4x(dxx, 10, ' ', wptr);
 	    if (display_ci) {
 	      // okay, this should be made more maintainable...
 	      se = sqrt(g_linear_mt[0].param_2d_buf2[0]);
 	      dyy = ci_zt * se;
-	      wptr = double_g_writewx4x(wptr, se, 8, ' ');
-	      wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
-	      wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+	      wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
+	      wptr = dtoa_g_wxp4x(dxx - dyy, 8, ' ', wptr);
+	      wptr = dtoa_g_wxp4x(dxx + dyy, 8, ' ', wptr);
 	    }
 	    wptr = memcpya(wptr, "          NA           NA\n", 26);
 	    if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
@@ -5446,13 +5233,13 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	    if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
 	      wptr = fw_strcpy(10, &(param_names[param_ct_max * max_param_name_len]), wptr_start2);
               *wptr++ = ' ';
-              wptr = uint32_writew8(wptr, (uint32_t)cur_sample_valid_ct);
+              wptr = uint32toa_w8((uint32_t)cur_sample_valid_ct, wptr);
               wptr = memcpya(wptr, "         NA ", 12);
               if (display_ci) {
 		wptr = memcpya(wptr, "      NA       NA       NA ", 27);
 	      }
-              wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
-              wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
+              wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
+              wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
               if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
 		goto glm_linear_assoc_ret_WRITE_FAIL;
 	      }
@@ -5484,7 +5271,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 		  wptr = fw_strcpy(10, &(param_names[param_ct_max * max_param_name_len]), wptr_start2);
 		}
 		*wptr++ = ' ';
-		wptr = uint32_writew8(wptr, (uint32_t)cur_sample_valid_ct);
+		wptr = uint32toa_w8((uint32_t)cur_sample_valid_ct, wptr);
 		wptr = memcpya(wptr, "         NA ", 12);
 		if (display_ci) {
 		  wptr = memcpya(wptr, "      NA       NA       NA ", 27);
@@ -5527,7 +5314,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	}
 	glm_linear_maxt_thread((void*)ulii);
 	join_threads(threads, g_assoc_thread_ct);
-        ulii = CACHEALIGN32_DBL(g_perm_vec_ct);
+        ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
 	ukk = g_perms_done + g_perm_vec_ct;
         for (uii = 0; uii < g_assoc_thread_ct; uii++) {
           dptr = &(g_maxt_thread_results[uii * ulii]);
@@ -5591,29 +5378,29 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       fflush(stdout);
       ulii = g_perm_vec_ct;
       ujj = 1 + g_perms_done;
-      wptr = tbuf;
-      cptr = &(tbuf[MAXLINELEN]);
+      wptr = g_textbuf;
+      cptr = &(g_textbuf[MAXLINELEN]);
       for (uii = 0; uii < ulii; uii++) {
-	wptr = uint32_write(wptr, uii + ujj);
+	wptr = uint32toa(uii + ujj, wptr);
 	dptr = &(g_mperm_save_all[uii]);
 	for (ukk = 0; ukk < marker_ct; ukk++) {
 	  *wptr++ = ' ';
 	  dxx = dptr[ukk * ulii];
 	  if (dxx >= 0) {
-	    wptr = double_g_write(wptr, dxx);
+	    wptr = dtoa_g(dxx, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "NA", 2);
 	  }
 	  if (wptr >= cptr) {
-	    if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	      goto glm_linear_assoc_ret_WRITE_FAIL;
 	    }
-	    wptr = tbuf;
+	    wptr = g_textbuf;
 	  }
 	}
 	*wptr++ = '\n';
       }
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	goto glm_linear_assoc_ret_WRITE_FAIL;
       }
       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
@@ -5621,7 +5408,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     g_perms_done += g_perm_vec_ct;
     if (g_perms_done < perms_total) {
       if (perm_adapt_nst || (!perm_pass_idx)) {
-        marker_unstopped_ct = marker_initial_ct - popcount_longs((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
+        marker_unstopped_ct = marker_initial_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
         if (!marker_unstopped_ct) {
           goto glm_linear_assoc_perm_count;
 	}
@@ -5652,7 +5439,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       if (mperm_save & MPERM_DUMP_BEST) {
 	memcpy(outname_end, ".mperm.dump.best", 17);
 	LOGPRINTFWW("Dumping best permutation %s to %s .\n", (!constraint_ct_max)? "absolute t-stats" : "chi-square values", outname);
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto glm_linear_assoc_ret_OPEN_FAIL;
 	}
 	dxx = 0;
@@ -5661,15 +5448,15 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	    dxx = g_orig_stats[marker_idx];
 	  }
 	}
-	memcpy(tbuf, "0 ", 2);
-	wptr = double_g_writex(&(tbuf[2]), dxx, '\n');
-        if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+	memcpy(g_textbuf, "0 ", 2);
+	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
+        if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	  goto glm_linear_assoc_ret_WRITE_FAIL;
 	}
         for (uii = 0; uii < perms_total; uii++) {
-          wptr = uint32_writex(tbuf, uii + 1, ' ');
-          wptr = double_g_writex(wptr, g_maxt_extreme_stat[uii], '\n');
-          if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+          wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
+          wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
+          if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	    goto glm_linear_assoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -5680,13 +5467,13 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       }
       memcpy(outname_end2, ".mperm", 7);
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto glm_linear_assoc_ret_OPEN_FAIL;
     }
     if (perm_adapt_nst) {
-      sprintf(tbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
     } else {
-      sprintf(tbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
 #ifdef __cplusplus
       std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
 #else
@@ -5699,7 +5486,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	g_perm_attempt_ct[marker_idx] = perms_total - g_perm_attempt_ct[marker_idx];
       }
     }
-    fprintf(outfile, tbuf, "SNP");
+    fprintf(outfile, g_textbuf, "SNP");
     chrom_fo_idx = 0xffffffffU;
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
     marker_idx = 0;
@@ -5708,7 +5495,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1];
       } while (marker_uidx >= chrom_end);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start[plink_maxsnp] = ' ';
       for (; marker_uidx < chrom_end;) {
@@ -5720,24 +5507,24 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
             wptr = memcpya(wptr, "          NA           NA", 25);
 	  } else {
 	    if (!perm_count) {
-	      wptr = double_g_writewx4x(wptr, pval, 12, ' ');
+	      wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
 	    } else {
-	      wptr = double_g_writewx4x(wptr, ((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ');
+	      wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
 	    }
 	    if (perm_adapt_nst) {
 	      wptr = memseta(wptr, 32, 2);
-	      wptr = uint32_writew10(wptr, g_perm_attempt_ct[marker_idx]);
+	      wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
 	    } else {
 	      dzz = (int32_t)(perms_total - doublearr_greater_than(g_maxt_extreme_stat, perms_total, g_orig_stats[marker_idx] - EPSILON) + 1);
               if (!perm_count) {
-		wptr = double_g_writewx4(wptr, dzz / ((double)((int32_t)perms_total + 1)), 12);
+		wptr = dtoa_g_wxp4(dzz / ((double)((int32_t)perms_total + 1)), 12, wptr);
 	      } else {
-                wptr = double_g_writewx4(wptr, dzz - 1, 12);
+                wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
 	      }
 	    }
 	  }
 	  wptr = memcpya(wptr, " \n", 2);
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto glm_linear_assoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -5780,7 +5567,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     break;
   }
  glm_linear_assoc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_msa);
   free_cond(condition_uidxs);
@@ -5790,7 +5577,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 
 int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t glm_modifier, uint32_t glm_xchr_model, uint32_t glm_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, uintptr_t* re [...]
   // Very similar to glm_linear_assoc_set_test(); could merge them.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t cur_param_ct = 0;
   uintptr_t* marker_exclude = marker_exclude_mid;
@@ -5807,13 +5594,13 @@ int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t
   uintptr_t marker_ct = marker_ct_mid;
   uintptr_t set_ct = 0;
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
+  uintptr_t sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   double adaptive_ci_zt = 0.0;
   uint32_t max_thread_ct = g_thread_ct;
   uint32_t perm_count = glm_modifier & GLM_PERM_COUNT;
   uint32_t perms_done = 0;
   int32_t retval = 0;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uintptr_t* set_incl;
   uintptr_t* loadbuf_ptr;
   double* orig_set_scores;
@@ -5859,7 +5646,7 @@ int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t
   if (!set_ct) {
     goto glm_logistic_assoc_set_test_write;
   }
-  marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  marker_ctl = BITCT_TO_WORDCT(marker_ct);
   if (marker_ct_mid != marker_ct) {
     inplace_delta_collapse_bitfield(regression_skip, marker_ct, marker_exclude_mid, marker_exclude);
   }
@@ -5877,14 +5664,14 @@ int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t
   if (max_thread_ct > perms_total) {
     max_thread_ct = perms_total;
   }
-  if (wkspace_init_sfmtp(max_thread_ct)) {
+  if (bigstack_init_sfmtp(max_thread_ct)) {
     goto glm_logistic_assoc_set_test_ret_NOMEM;
   }
 
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
  glm_logistic_assoc_set_test_more_perms:
-  bitfield_and(regression_skip, unstopped_markers, marker_ctl);
-  bitfield_andnot(unstopped_markers, regression_skip, marker_ctl);
+  bitvec_and(unstopped_markers, marker_ctl, regression_skip);
+  bitvec_andnot(regression_skip, marker_ctl, unstopped_markers);
   skip_ct = popcount_longs(regression_skip, marker_ctl);
   marker_unstopped_ct = popcount_longs(unstopped_markers, marker_ctl);
 
@@ -5901,28 +5688,25 @@ int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t
   }
   g_perm_vec_ct = perm_vec_ct;
   if (perm_vec_ct >= CACHELINE_INT32 * max_thread_ct) {
-    g_assoc_thread_ct = max_thread_ct;
+    g_perm_generation_thread_ct = max_thread_ct;
   } else {
-    g_assoc_thread_ct = perm_vec_ct / CACHELINE_INT32;
-    if (!g_assoc_thread_ct) {
-      g_assoc_thread_ct = 1;
-    }
+    g_perm_generation_thread_ct = MAXV(perm_vec_ct / CACHELINE_INT32, 1);
   }
   ulii = 0;
-  if (!g_cluster_ct) {
-    if (spawn_threads(threads, &logistic_gen_perms_thread, g_assoc_thread_ct)) {
+  if (!g_perm_cluster_ct) {
+    if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
       goto glm_logistic_assoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    logistic_gen_perms_thread((void*)ulii);
+    generate_cc_perms_thread((void*)ulii);
   } else {
-    if (spawn_threads(threads, &logistic_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+    if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
       goto glm_logistic_assoc_set_test_ret_THREAD_CREATE_FAIL;
     }
-    logistic_gen_cluster_perms_thread((void*)ulii);
+    generate_cc_cluster_perms_thread((void*)ulii);
   }
-  join_threads(threads, g_assoc_thread_ct);
-  if (wkspace_alloc_d_checked(&g_mperm_save_all, MODEL_BLOCKSIZE * perm_vec_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&chisq_pmajor, marker_ct * perm_vec_ct * sizeof(double))) {
+  join_threads(threads, g_perm_generation_thread_ct);
+  if (bigstack_alloc_d(MODEL_BLOCKSIZE * perm_vec_ct, &g_mperm_save_all) ||
+      bigstack_alloc_d(marker_ct * perm_vec_ct, &chisq_pmajor)) {
     goto glm_logistic_assoc_set_test_ret_NOMEM;
   }
   for (pidx = 0; pidx < perm_vec_ct; pidx++) {
@@ -5978,7 +5762,7 @@ int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t
 	}
       }
       loadbuf_ptr = &(loadbuf[block_size * sample_valid_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto glm_logistic_assoc_set_test_ret_READ_FAIL;
       }
       if (g_min_ploidy_1 && hh_or_mt_exists) {
@@ -6028,7 +5812,7 @@ int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t
   } while (marker_idx < marker_unstopped_ct);
   perms_done += perm_vec_ct;
   compute_set_scores(marker_ct, perm_vec_ct, set_ct, chisq_pmajor, orig_set_scores, sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, setdefs, ld_map, apip, chisq_threshold, adaptive_ci_zt, first_adapt_check, perms_done, sip->set_max, perm_adapt_set_unstopped, perm_2success_ct, perm_attempt_ct);
-  wkspace_reset(wkspace_mark2);
+  bigstack_reset(bigstack_mark2);
   if (perms_done < perms_total) {
     if (glm_modifier & GLM_PERM) {
       if (!extract_set_union(setdefs, set_ct, perm_adapt_set_unstopped, unstopped_markers, marker_ct)) {
@@ -6067,14 +5851,14 @@ int32_t glm_logistic_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t
     break;
   }
  glm_logistic_assoc_set_test_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t glm_modifier, double glm_vif_thresh, uint32_t glm_xchr_model, uint32_t glm_mperm_val, Range_list* parameters_range_list_ptr, Range_list* tests_range_list_ptr, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   FILE* outfile = NULL;
   FILE* outfile_msa = NULL;
   uintptr_t marker_ct = marker_ct_orig;
@@ -6150,6 +5934,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   uintptr_t param_idx_end;
   uintptr_t sample_valid_ct;
   uintptr_t sample_valid_cta4;
+  uintptr_t sample_valid_ctv;
   uintptr_t sample_valid_ctv2;
   uintptr_t sample_idx;
   uintptr_t param_ctx_max;
@@ -6209,11 +5994,11 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     hh_or_mt_exists |= NXMHH_EXISTS;
   }
   if (is_set_test) {
-    if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm)) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
     memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
-    bitfield_and(founder_pnm, founder_info, unfiltered_sample_ctl);
+    bitvec_and(founder_info, unfiltered_sample_ctl, founder_pnm);
     if (extract_set_union_unfiltered(sip, NULL, unfiltered_marker_ct, marker_exclude_orig, &marker_exclude, &marker_ct)) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
@@ -6238,29 +6023,27 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     }
     goto glm_logistic_assoc_ret_1;
   }
-  sample_valid_cta4 = (sample_valid_ct + 3) & (~3);
-  sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
+  sample_valid_cta4 = round_up_pow2(sample_valid_ct, 4);
+  sample_valid_ctv = BITCT_TO_ALIGNED_WORDCT(sample_valid_ct);
+  sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   final_mask = get_final_mask(sample_valid_ct);
-  param_ct_maxa4 = (param_ct_max + 3) & (~3);
-  if (wkspace_alloc_d_checked(&g_orig_stats, marker_initial_ct * sizeof(double)) ||
-      wkspace_alloc_c_checked(&param_names, param_ctx_max * max_param_name_len) ||
-      wkspace_alloc_f_checked(&g_fixed_covars_cov_major_f, (variation_in_sex + interaction_start_idx - condition_list_start_idx) * sample_valid_ct * sizeof(float)) ||
-      wkspace_alloc_ui_checked(&g_nm_cts, marker_initial_ct * sizeof(int32_t))) {
+  param_ct_maxa4 = round_up_pow2(param_ct_max, 4);
+  if (bigstack_alloc_d(marker_initial_ct, &g_orig_stats) ||
+      bigstack_alloc_c(param_ctx_max * max_param_name_len, &param_names) ||
+      bigstack_alloc_f((variation_in_sex + interaction_start_idx - condition_list_start_idx) * sample_valid_ct, &g_fixed_covars_cov_major_f) ||
+      bigstack_alloc_ui(marker_initial_ct, &g_nm_cts)) {
     goto glm_logistic_assoc_ret_NOMEM;
   }
   if (!is_set_test) {
-    if (wkspace_alloc_uc_checked(&g_perm_adapt_stop, marker_initial_ct)) {
+    // use this array to track regression failures even in max(T) case
+    if (bigstack_calloc_uc(round_up_pow2(marker_initial_ct, BYTECT), &g_perm_adapt_stop)) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
-    // use this array to track regression failures even in max(T) case
-    fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
   } else {
     g_perm_adapt_stop = NULL;
-    ulii = (marker_initial_ct + (BITCT - 1)) / BITCT;
-    if (wkspace_alloc_ul_checked(&regression_skip, ulii * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(BITCT_TO_WORDCT(marker_initial_ct), &regression_skip)) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
-    fill_ulong_zero(regression_skip, ulii);
   }
 
   param_idx = 1;
@@ -6306,7 +6089,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
       goto glm_logistic_assoc_ret_READ_FAIL;
     }
-    if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, g_loadbuf, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+    if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, g_loadbuf)) {
       goto glm_logistic_assoc_ret_READ_FAIL;
     }
     chrom_idx = get_marker_chrom(chrom_info_ptr, marker_uidx);
@@ -6356,7 +6139,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       if (is_set(active_params, ujj++)) {
         wptr = memcpyl3a(&(param_names[param_idx * max_param_name_len]), main_effect);
 	wptr = memcpya(wptr, "xCSNP", 5);
-        uint32_writex(wptr, uii + 1, '\0');
+        uint32toa_x(uii + 1, '\0', wptr);
 	param_idx++;
       }
       if (genotypic_or_hethom) {
@@ -6367,7 +6150,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	  } else {
 	    wptr = memcpya(wptr, "DOMDEVxCSNP", 11);
 	  }
-	  uint32_writex(wptr, uii + 1, '\0');
+	  uint32toa_x(uii + 1, '\0', wptr);
 	  param_idx++;
 	}
       }
@@ -6398,7 +6181,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     fill_float_zero(fptr, sample_valid_ct);
     sample_idx = 0;
     while (1) {
-      next_set_ul_ck(sex_male_collapsed, &sample_idx, sample_valid_ct);
+      next_set_ul_ck(sex_male_collapsed, sample_valid_ct, &sample_idx);
       if (sample_idx == sample_valid_ct) {
 	break;
       }
@@ -6430,7 +6213,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     }
   }
   if (genotypic_or_hethom) {
-    if (wkspace_alloc_c_checked(&haploid_param_names, np_base * max_param_name_len)) {
+    if (bigstack_alloc_c(np_base * max_param_name_len, &haploid_param_names)) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
     for (uii = 1, param_idx = 1; param_idx < np_base; uii++, param_idx++) {
@@ -6440,7 +6223,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   }
 
   if (constraint_ct_max) {
-    if (wkspace_alloc_d_checked(&constraints_con_major, constraint_ct_max * param_ct_max * sizeof(double))) {
+    if (bigstack_alloc_d(constraint_ct_max * param_ct_max, &constraints_con_major)) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
     // special case: df may vary between chromosomes, so refill suffix at
@@ -6462,19 +6245,17 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   mperm_save_all = mperm_save & MPERM_DUMP_ALL;
   if (fill_orig_chiabs) {
     if (mtest_adjust && (!is_set_test)) {
-      if (wkspace_alloc_d_checked(&orig_pvals, marker_initial_ct * sizeof(double)) ||
-	  wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_initial_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_d(marker_initial_ct, &orig_pvals) ||
+	  bigstack_alloc_ui(marker_initial_ct, &marker_idx_to_uidx)) {
 	  goto glm_logistic_assoc_ret_NOMEM;
       }
     }
     if (do_perms_nst) {
-      if (wkspace_alloc_ui_checked(&g_perm_2success_ct, marker_initial_ct * sizeof(int32_t)) ||
-	  wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_initial_ct * sizeof(int32_t))) {
+      // need this for max(T) now since we need to track permutation failures
+      if (bigstack_calloc_ui(marker_initial_ct, &g_perm_2success_ct) ||
+	  bigstack_calloc_ui(marker_initial_ct, &g_perm_attempt_ct)) {
 	goto glm_logistic_assoc_ret_NOMEM;
       }
-      // need this for max(T) now since we need to track permutation failures
-      fill_uint_zero(g_perm_2success_ct, marker_initial_ct);
-      fill_uint_zero(g_perm_attempt_ct, marker_initial_ct);
       perms_total = perm_adapt_nst? apip->max : glm_mperm_val;
       if (perms_total < orig_perm_batch_size) {
 	orig_perm_batch_size = perms_total;
@@ -6488,18 +6269,16 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	max_thread_ct = uii;
       }
       if (!perm_adapt_nst) {
-	ulii = CACHEALIGN32_DBL(perm_batch_size);
-        if (wkspace_alloc_d_checked(&g_maxt_thread_results, ulii * max_thread_ct * sizeof(double)),
-            wkspace_alloc_d_checked(&g_maxt_extreme_stat, perms_total * sizeof(double))) {
+        if (bigstack_alloc_d(round_up_pow2(perm_batch_size, CACHELINE_DBL) * max_thread_ct, &g_maxt_thread_results) ||
+            bigstack_calloc_d(perms_total, &g_maxt_extreme_stat)) {
           goto glm_logistic_assoc_ret_NOMEM;
 	}
-	fill_double_zero(g_maxt_extreme_stat, perms_total);
 	if (mperm_save_all) {
-	  if (wkspace_alloc_d_checked(&g_mperm_save_all, marker_initial_ct * perm_batch_size * sizeof(double))) {
+	  if (bigstack_alloc_d(((uintptr_t)marker_initial_ct) * perm_batch_size, &g_mperm_save_all)) {
 	    goto glm_logistic_assoc_ret_NOMEM;
 	  }
 	  memcpy(outname_end, ".mperm.dump.all", 16);
-	  if (fopen_checked(&outfile_msa, outname, "w")) {
+	  if (fopen_checked(outname, "w", &outfile_msa)) {
 	    goto glm_logistic_assoc_ret_OPEN_FAIL;
 	  }
 	  if (putc_checked('0', outfile_msa)) {
@@ -6513,67 +6292,66 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   }
   if (do_perms) {
     if (cluster_starts) {
-      retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, pheno_c, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, &g_cluster_case_cts, &g_cluster_cc_perm_preimage);
+      retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, pheno_c, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
       if (retval) {
 	goto glm_logistic_assoc_ret_1;
       }
-      if (!g_cluster_ct) {
+      if (!g_perm_cluster_ct) {
 	goto glm_logistic_assoc_ret_NO_PERMUTATION_CLUSTERS;
       }
-      if (cluster_alloc_and_populate_magic_nums(g_cluster_ct, g_cluster_map, g_cluster_starts, &g_tot_quotients, &g_totq_magics, &g_totq_preshifts, &g_totq_postshifts, &g_totq_incrs)) {
+      if (cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs)) {
 	goto glm_logistic_assoc_ret_NOMEM;
       }
-      if (wkspace_alloc_ui_checked(&g_sample_to_cluster, sample_valid_ct * sizeof(int32_t))) {
+      if (bigstack_alloc_ui(sample_valid_ct, &g_perm_sample_to_cluster)) {
 	goto glm_logistic_assoc_ret_NOMEM;
       }
-      fill_unfiltered_sample_to_cluster(sample_valid_ct, g_cluster_ct, g_cluster_map, g_cluster_starts, g_sample_to_cluster);
+      fill_unfiltered_sample_to_cluster(sample_valid_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
     }
-    g_tot_quotient = 0x100000000LLU / sample_valid_ct;
-    magic_num(g_tot_quotient, &g_totq_magic, &g_totq_preshift, &g_totq_postshift, &g_totq_incr);
+    g_perm_tot_quotient = 0x100000000LLU / sample_valid_ct;
+    magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
     if (!is_set_test) {
-      if (wkspace_init_sfmtp(max_thread_ct)) {
+      if (bigstack_init_sfmtp(max_thread_ct)) {
 	goto glm_logistic_assoc_ret_NOMEM;
       }
     }
   } else {
     orig_perm_batch_size = 1;
   }
-  g_logistic_mt = (Logistic_multithread*)wkspace_alloc(max_thread_ct * sizeof(Logistic_multithread));
+  g_logistic_mt = (Logistic_multithread*)bigstack_alloc(max_thread_ct * sizeof(Logistic_multithread));
   if (!g_logistic_mt) {
     goto glm_logistic_assoc_ret_NOMEM;
   }
-  ulii = (orig_perm_batch_size + (BITCT - 1)) / BITCT;
   for (tidx = 0; tidx < max_thread_ct; tidx++) {
     // covars_cov_major, param_2d_buf, param_2d_buf2 matrices must have 16-byte
     // aligned rows
     // (no need to worry about 1D 16-byte alignment requirements since
-    // wkspace_alloc actually forces 64-byte alignment, and allocation sizes
+    // bigstack_alloc actually forces 64-byte alignment, and allocation sizes
     // are automatically rounded up)
     uii = (tidx || (orig_perm_batch_size > 1) || skip_intercept)? 1 : 0;
-    if (wkspace_alloc_f_checked(&(g_logistic_mt[tidx].cur_covars_cov_major), param_ct_max * sample_valid_cta4 * sizeof(float)) ||
-	wkspace_alloc_f_checked(&(g_logistic_mt[tidx].coef), param_ct_maxa4 * orig_perm_batch_size * sizeof(float)) ||
-	wkspace_alloc_f_checked(&(g_logistic_mt[tidx].pp), sample_valid_cta4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_logistic_mt[tidx].sample_1d_buf), sample_valid_ct * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_logistic_mt[tidx].pheno_buf), sample_valid_ct * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_logistic_mt[tidx].param_1d_buf), param_ct_max * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_logistic_mt[tidx].param_1d_buf2), param_ct_max * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_logistic_mt[tidx].param_2d_buf), param_ct_max * param_ct_maxa4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_logistic_mt[tidx].param_2d_buf2), param_ct_max * param_ct_maxa4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_logistic_mt[tidx].regression_results), orig_perm_batch_size * (param_ctx_max - uii) * sizeof(float)) ||
-        wkspace_alloc_ul_checked(&(g_logistic_mt[tidx].perm_fails), ulii * sizeof(intptr_t))) {
+    if (bigstack_alloc_f(param_ct_max * sample_valid_cta4, &(g_logistic_mt[tidx].cur_covars_cov_major)) ||
+	bigstack_alloc_f(param_ct_maxa4 * orig_perm_batch_size, &(g_logistic_mt[tidx].coef)) ||
+	bigstack_alloc_f(sample_valid_cta4, &(g_logistic_mt[tidx].pp)) ||
+        bigstack_alloc_f(sample_valid_ct, &(g_logistic_mt[tidx].sample_1d_buf)) ||
+        bigstack_alloc_f(sample_valid_ct, &(g_logistic_mt[tidx].pheno_buf)) ||
+        bigstack_alloc_f(param_ct_max, &(g_logistic_mt[tidx].param_1d_buf)) ||
+        bigstack_alloc_f(param_ct_max, &(g_logistic_mt[tidx].param_1d_buf2)) ||
+        bigstack_alloc_f(param_ct_max * param_ct_maxa4, &(g_logistic_mt[tidx].param_2d_buf)) ||
+        bigstack_alloc_f(param_ct_max * param_ct_maxa4, &(g_logistic_mt[tidx].param_2d_buf2)) ||
+        bigstack_alloc_f(orig_perm_batch_size * (param_ctx_max - uii), &(g_logistic_mt[tidx].regression_results)) ||
+        bigstack_alloc_ul(BITCT_TO_WORDCT(orig_perm_batch_size), &(g_logistic_mt[tidx].perm_fails))) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
     if (constraint_ct_max) {
-      g_logistic_mt[tidx].mi_buf = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc(param_ct_max * sizeof(MATRIX_INVERT_BUF1_TYPE));
+      g_logistic_mt[tidx].mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(param_ct_max * sizeof(MATRIX_INVERT_BUF1_TYPE));
       if (!(g_logistic_mt[tidx].mi_buf)) {
 	goto glm_logistic_assoc_ret_NOMEM;
       }
-      if (wkspace_alloc_d_checked(&(g_logistic_mt[tidx].param_1d_dbuf), param_ct_max * sizeof(double)) ||
-          wkspace_alloc_d_checked(&(g_logistic_mt[tidx].param_2d_dbuf), param_ct_max * param_ct_max * sizeof(double)) ||
-          wkspace_alloc_d_checked(&(g_logistic_mt[tidx].param_2d_dbuf2), param_ct_max * param_ct_max * sizeof(double)) ||
-          wkspace_alloc_d_checked(&(g_logistic_mt[tidx].param_df_dbuf), param_ct_max * constraint_ct_max * sizeof(double)) ||
-          wkspace_alloc_d_checked(&(g_logistic_mt[tidx].df_df_dbuf), constraint_ct_max * constraint_ct_max * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&(g_logistic_mt[tidx].df_dbuf), constraint_ct_max * sizeof(double))) {
+      if (bigstack_alloc_d(param_ct_max, &(g_logistic_mt[tidx].param_1d_dbuf)) ||
+          bigstack_alloc_d(param_ct_max * param_ct_max, &(g_logistic_mt[tidx].param_2d_dbuf)) ||
+          bigstack_alloc_d(param_ct_max * param_ct_max, &(g_logistic_mt[tidx].param_2d_dbuf2)) ||
+          bigstack_alloc_d(param_ct_max * constraint_ct_max, &(g_logistic_mt[tidx].param_df_dbuf)) ||
+          bigstack_alloc_d(constraint_ct_max * constraint_ct_max, &(g_logistic_mt[tidx].df_df_dbuf)) ||
+	  bigstack_alloc_d(constraint_ct_max, &(g_logistic_mt[tidx].df_dbuf))) {
 	goto glm_logistic_assoc_ret_NOMEM;
       }
     } else {
@@ -6587,28 +6365,29 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     }
   }
 
-  if (wkspace_alloc_ul_checked(&pheno_c_collapsed, sample_valid_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(sample_valid_ctv, &pheno_c_collapsed)) {
     goto glm_logistic_assoc_ret_NOMEM;
   }
-  vec_collapse_init(pheno_c, unfiltered_sample_ct, load_mask, sample_valid_ct, pheno_c_collapsed);
-  g_case_ct = popcount_longs(pheno_c_collapsed, sample_valid_ctv2);
-  if ((!g_case_ct) || (g_case_ct == sample_valid_ct)) {
+  copy_bitarr_subset(pheno_c, load_mask, unfiltered_sample_ct, sample_valid_ct, pheno_c_collapsed);
+  g_perm_case_ct = popcount_longs(pheno_c_collapsed, sample_valid_ctv);
+  if ((!g_perm_case_ct) || (g_perm_case_ct == sample_valid_ct)) {
     goto glm_logistic_assoc_ret_PHENO_CONSTANT;
   }
   if (do_perms) {
-    if (wkspace_alloc_ul_checked(&g_perm_vecs, orig_perm_batch_size * sample_valid_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(orig_perm_batch_size * sample_valid_ctv, &g_perm_vecs)) {
       goto glm_logistic_assoc_ret_NOMEM;
     }
+    g_perm_is_1bit = 1;
   }
 
   outname_end2 = memcpyb(outname_end, ".assoc.logistic", 16);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto glm_logistic_assoc_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing logistic model association results to %s ... ", outname);
   fflush(stdout);
-  sprintf(tbuf, " CHR %%%us         BP   A1       TEST    NMISS       %s ", plink_maxsnp, report_odds? "  OR" : "BETA");
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us         BP   A1       TEST    NMISS       %s ", plink_maxsnp, report_odds? "  OR" : "BETA");
+  fprintf(outfile, g_textbuf, "SNP");
   if (display_ci) {
     uii = (uint32_t)((int32_t)(ci_size * 100));
     if (uii >= 10) {
@@ -6657,23 +6436,19 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       g_perm_vec_ct = perms_total - g_perms_done;
     }
     ulii = 0;
-    if (g_perm_vec_ct > max_thread_ct) {
-      g_assoc_thread_ct = max_thread_ct;
-    } else {
-      g_assoc_thread_ct = g_perm_vec_ct;
-    }
-    if (!g_cluster_ct) {
-      if (spawn_threads(threads, &logistic_gen_perms_thread, g_assoc_thread_ct)) {
+    g_perm_generation_thread_ct = MINV(max_thread_ct, g_perm_vec_ct);
+    if (!g_perm_cluster_ct) {
+      if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
 	goto glm_logistic_assoc_ret_THREAD_CREATE_FAIL;
       }
-      logistic_gen_perms_thread((void*)ulii);
+      generate_cc_perms_thread((void*)ulii);
     } else {
-      if (spawn_threads(threads, &logistic_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
 	goto glm_logistic_assoc_ret_THREAD_CREATE_FAIL;
       }
-      logistic_gen_cluster_perms_thread((void*)ulii);
+      generate_cc_cluster_perms_thread((void*)ulii);
     }
-    join_threads(threads, g_assoc_thread_ct);
+    join_threads(threads, g_perm_generation_thread_ct);
   }
   chrom_fo_idx = 0xffffffffU;
   marker_uidx = next_unset_unsafe(marker_exclude, 0);
@@ -6696,7 +6471,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	g_min_ploidy_1 |= uii;
       } while ((!glm_xchr_model) && g_min_ploidy_1);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, writebuf, chrom_name_write(chrom_info_ptr, uii, writebuf));
       *wptr_start++ = ' ';
       fill_double_zero(constraints_con_major, constraint_ct_max * param_ct_max);
       g_male_x_01 = 0;
@@ -6747,7 +6522,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	    constraints_con_major[constraint_idx * cur_param_ct + ulii + 1] = 1;
 	  }
 	}
-        wptr = uint32_write(&(param_names[param_ct_max * max_param_name_len + 5]), cur_constraint_ct);
+        wptr = uint32toa(cur_constraint_ct, &(param_names[param_ct_max * max_param_name_len + 5]));
 	memcpy(wptr, "DF", 3);
       }
     }
@@ -6771,7 +6546,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	}
       }
       loadbuf_ptr = &(g_loadbuf[block_size * sample_valid_ctv2]);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_ptr, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_ptr)) {
 	goto glm_logistic_assoc_ret_READ_FAIL;
       }
       if (g_min_ploidy_1 && hh_or_mt_exists) {
@@ -6811,15 +6586,14 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	g_nm_cts[marker_idx3] = cur_sample_valid_ct;
 	if (cur_sample_valid_ct > cur_param_ct) {
 	  // todo: try better starting position
-	  fill_float_zero(g_logistic_mt[0].coef, (cur_param_ct + 3) & (~3));
+	  fill_float_zero(g_logistic_mt[0].coef, round_up_pow2(cur_param_ct, 4));
 	  regression_fail = glm_logistic(1, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, skip_intercept, loadbuf_ptr, g_logistic_mt[0].cur_covars_cov_major, pheno_c_collapsed, g_logistic_mt[0].coef, g_logistic_mt[0].pp, g_logistic_mt[0].sample_1d_buf, g_logistic_mt[0].pheno_buf, g_logistic_mt[0].param_1d_buf, g_logistic_mt[0].param_1d_buf2, g_logistic_mt[0].param_2d_buf, g_logistic_mt[0].param_2d_buf2, g_logistic_mt[0].regression_results, cur_constraint_ct, constraints_con_major, g_logist [...]
 	} else {
 	  regression_fail = 1;
 	}
 	wptr_start2 = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start);
 	*wptr_start2++ = ' ';
-	wptr_start2 = uint32_writew10(wptr_start2, marker_pos[marker_uidx2]);
-	*wptr_start2++ = ' ';
+	wptr_start2 = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr_start2);
 	wptr_start2 = fw_strcpy(4, marker_allele_ptrs[marker_uidx2 * 2], wptr_start2);
 	*wptr_start2++ = ' ';
 	orig_stats_ptr = &(g_orig_stats[marker_idx3]);
@@ -6831,7 +6605,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	    pval = chiprob_p(zval * zval, 1);
 	    if (param_idx == 1) {
 	      if (mperm_save_all) {
-		double_g_writex(&(numbuf[1]), zval * zval, '\0');
+		dtoa_gx(zval * zval, '\0', &(numbuf[1]));
 		fputs(numbuf, outfile_msa);
 	      }
 	      if (!constraint_ct_max) {
@@ -6844,21 +6618,21 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	    if ((param_idx < param_idx_end) && ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0)))) {
 	      wptr = fw_strcpy(10, &(cur_param_names[param_idx * max_param_name_len]), wptr_start2);
 	      *wptr++ = ' ';
-	      wptr = uint32_writew8x(wptr, (uint32_t)cur_sample_valid_ct, ' ');
-	      wptr = double_g_writewx4x(wptr, report_odds? exp(dxx) : dxx, 10, ' ');
+	      wptr = uint32toa_w8x((uint32_t)cur_sample_valid_ct, ' ', wptr);
+	      wptr = dtoa_g_wxp4x(report_odds? exp(dxx) : dxx, 10, ' ', wptr);
 	      if (display_ci) {
 		dyy = ci_zt * se;
-		wptr = double_g_writewx4x(wptr, se, 8, ' ');
+		wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
 		if (report_odds) {
-		  wptr = double_g_writewx4x(wptr, exp(dxx - dyy), 8, ' ');
-		  wptr = double_g_writewx4x(wptr, exp(dxx + dyy), 8, ' ');
+		  wptr = dtoa_g_wxp4x(exp(dxx - dyy), 8, ' ', wptr);
+		  wptr = dtoa_g_wxp4x(exp(dxx + dyy), 8, ' ', wptr);
 		} else {
-		  wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
-		  wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+		  wptr = dtoa_g_wxp4x(dxx - dyy, 8, ' ', wptr);
+		  wptr = dtoa_g_wxp4x(dxx + dyy, 8, ' ', wptr);
 		}
 	      }
-	      wptr = double_g_writewx4x(wptr, zval, 12, ' ');
-	      wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
+	      wptr = dtoa_g_wxp4x(zval, 12, ' ', wptr);
+	      wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
 	      if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
 		goto glm_logistic_assoc_ret_WRITE_FAIL;
 	      }
@@ -6867,18 +6641,18 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	  if (!skip_intercept) {
 	    dxx = (double)g_logistic_mt[0].coef[0];
 	    wptr = memcpya(wptr_start2, " INTERCEPT ", 11);
-	    wptr = uint32_writew8x(wptr, (uint32_t)cur_sample_valid_ct, ' ');
-	    wptr = double_g_writewx4x(wptr, dxx, 10, ' ');
+	    wptr = uint32toa_w8x((uint32_t)cur_sample_valid_ct, ' ', wptr);
+	    wptr = dtoa_g_wxp4x(dxx, 10, ' ', wptr);
 	    if (display_ci) {
 	      se = sqrt((double)g_logistic_mt[0].regression_results[0]);
 	      dyy = ci_zt * se;
-	      wptr = double_g_writewx4x(wptr, se, 8, ' ');
+	      wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
 	      if (report_odds) {
-		wptr = double_g_writewx4x(wptr, exp(dxx - dyy), 8, ' ');
-		wptr = double_g_writewx4x(wptr, exp(dxx + dyy), 8, ' ');
+		wptr = dtoa_g_wxp4x(exp(dxx - dyy), 8, ' ', wptr);
+		wptr = dtoa_g_wxp4x(exp(dxx + dyy), 8, ' ', wptr);
 	      } else {
-		wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
-		wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+		wptr = dtoa_g_wxp4x(dxx - dyy, 8, ' ', wptr);
+		wptr = dtoa_g_wxp4x(dxx + dyy, 8, ' ', wptr);
 	      }
 	    }
 	    wptr = memcpya(wptr, "          NA           NA\n", 26);
@@ -6896,13 +6670,13 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	    if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
 	      wptr = fw_strcpy(10, &(param_names[param_ct_max * max_param_name_len]), wptr_start2);
               *wptr++ = ' ';
-              wptr = uint32_writew8(wptr, (uint32_t)cur_sample_valid_ct);
+              wptr = uint32toa_w8((uint32_t)cur_sample_valid_ct, wptr);
               wptr = memcpya(wptr, "         NA ", 12);
               if (display_ci) {
 		wptr = memcpya(wptr, "      NA       NA       NA ", 27);
 	      }
-              wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
-              wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
+              wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
+              wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
               if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
 		goto glm_logistic_assoc_ret_WRITE_FAIL;
 	      }
@@ -6934,7 +6708,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 		  wptr = fw_strcpy(10, &(param_names[param_ct_max * max_param_name_len]), wptr_start2);
 		}
 		*wptr++ = ' ';
-		wptr = uint32_writew8(wptr, (uint32_t)cur_sample_valid_ct);
+		wptr = uint32toa_w8((uint32_t)cur_sample_valid_ct, wptr);
 		wptr = memcpya(wptr, "         NA ", 12);
 		if (display_ci) {
 		  wptr = memcpya(wptr, "      NA       NA       NA ", 27);
@@ -6975,7 +6749,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	}
 	glm_logistic_maxt_thread((void*)ulii);
 	join_threads(threads, g_assoc_thread_ct);
-        ulii = CACHEALIGN32_DBL(g_perm_vec_ct);
+        ulii = round_up_pow2(g_perm_vec_ct, CACHELINE_DBL);
 	ukk = g_perms_done + g_perm_vec_ct;
         for (uii = 0; uii < g_assoc_thread_ct; uii++) {
           dptr = &(g_maxt_thread_results[uii * ulii]);
@@ -7039,29 +6813,29 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       fflush(stdout);
       ulii = g_perm_vec_ct;
       ujj = 1 + g_perms_done;
-      wptr = tbuf;
-      cptr = &(tbuf[MAXLINELEN]);
+      wptr = g_textbuf;
+      cptr = &(g_textbuf[MAXLINELEN]);
       for (uii = 0; uii < ulii; uii++) {
-	wptr = uint32_write(wptr, uii + ujj);
+	wptr = uint32toa(uii + ujj, wptr);
 	dptr = &(g_mperm_save_all[uii]);
 	for (ukk = 0; ukk < marker_ct; ukk++) {
 	  *wptr++ = ' ';
 	  dxx = dptr[ukk * ulii];
 	  if (dxx >= 0) {
-	    wptr = double_g_write(wptr, dxx);
+	    wptr = dtoa_g(dxx, wptr);
 	  } else {
 	    wptr = memcpya(wptr, "NA", 2);
 	  }
 	  if (wptr >= cptr) {
-	    if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+	    if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	      goto glm_logistic_assoc_ret_WRITE_FAIL;
 	    }
-	    wptr = tbuf;
+	    wptr = g_textbuf;
 	  }
 	}
 	*wptr++ = '\n';
       }
-      if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile_msa)) {
+      if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile_msa)) {
 	goto glm_logistic_assoc_ret_WRITE_FAIL;
       }
       fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b               ", stdout);
@@ -7069,7 +6843,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     g_perms_done += g_perm_vec_ct;
     if (g_perms_done < perms_total) {
       if (perm_adapt_nst || (!perm_pass_idx)) {
-        marker_unstopped_ct = marker_initial_ct - popcount_longs((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
+        marker_unstopped_ct = marker_initial_ct - popcount01_longs((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
         if (!marker_unstopped_ct) {
           goto glm_logistic_assoc_perm_count;
 	}
@@ -7100,7 +6874,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       if (mperm_save & MPERM_DUMP_BEST) {
 	memcpy(outname_end, ".mperm.dump.best", 17);
 	LOGPRINTF("Dumping best permutation chi-square values to %s .\n", outname);
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto glm_logistic_assoc_ret_OPEN_FAIL;
 	}
 	dxx = 0;
@@ -7109,15 +6883,15 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	    dxx = g_orig_stats[marker_idx];
 	  }
 	}
-	memcpy(tbuf, "0 ", 2);
-	wptr = double_g_writex(&(tbuf[2]), dxx, '\n');
-        if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+	memcpy(g_textbuf, "0 ", 2);
+	wptr = dtoa_gx(dxx, '\n', &(g_textbuf[2]));
+        if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	  goto glm_logistic_assoc_ret_WRITE_FAIL;
 	}
         for (uii = 0; uii < perms_total; uii++) {
-          wptr = uint32_writex(tbuf, uii + 1, ' ');
-          wptr = double_g_writex(wptr, g_maxt_extreme_stat[uii], '\n');
-          if (fwrite_checked(tbuf, (uintptr_t)(wptr - tbuf), outfile)) {
+          wptr = uint32toa_x(uii + 1, ' ', g_textbuf);
+          wptr = dtoa_gx(g_maxt_extreme_stat[uii], '\n', wptr);
+          if (fwrite_checked(g_textbuf, (uintptr_t)(wptr - g_textbuf), outfile)) {
 	    goto glm_logistic_assoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -7128,13 +6902,13 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       }
       memcpy(outname_end2, ".mperm", 7);
     }
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto glm_logistic_assoc_ret_OPEN_FAIL;
     }
     if (perm_adapt_nst) {
-      sprintf(tbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
     } else {
-      sprintf(tbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
+      sprintf(g_textbuf, " CHR %%%us         EMP1         EMP2 \n", plink_maxsnp);
 #ifdef __cplusplus
       std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
 #else
@@ -7147,7 +6921,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	g_perm_attempt_ct[marker_idx] = perms_total - g_perm_attempt_ct[marker_idx];
       }
     }
-    fprintf(outfile, tbuf, "SNP");
+    fprintf(outfile, g_textbuf, "SNP");
     chrom_fo_idx = 0xffffffffU;
     marker_uidx = next_unset_unsafe(marker_exclude, 0);
     marker_idx = 0;
@@ -7156,7 +6930,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1];
       } while (marker_uidx >= chrom_end);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start[plink_maxsnp] = ' ';
       for (; marker_uidx < chrom_end;) {
@@ -7168,24 +6942,24 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
             wptr = memcpya(wptr, "          NA           NA", 25);
 	  } else {
 	    if (!perm_count) {
-	      wptr = double_g_writewx4x(wptr, pval, 12, ' ');
+	      wptr = dtoa_g_wxp4x(pval, 12, ' ', wptr);
 	    } else {
-	      wptr = double_g_writewx4x(wptr, ((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ');
+	      wptr = dtoa_g_wxp4x(((double)g_perm_2success_ct[marker_idx]) * 0.5, 12, ' ', wptr);
 	    }
 	    if (perm_adapt_nst) {
 	      wptr = memseta(wptr, 32, 2);
-	      wptr = uint32_writew10(wptr, g_perm_attempt_ct[marker_idx]);
+	      wptr = uint32toa_w10(g_perm_attempt_ct[marker_idx], wptr);
 	    } else {
 	      dzz = (int32_t)(perms_total - doublearr_greater_than(g_maxt_extreme_stat, perms_total, g_orig_stats[marker_idx] - EPSILON) + 1);
               if (!perm_count) {
-		wptr = double_g_writewx4(wptr, dzz / ((double)((int32_t)perms_total + 1)), 12);
+		wptr = dtoa_g_wxp4(dzz / ((double)((int32_t)perms_total + 1)), 12, wptr);
 	      } else {
-                wptr = double_g_writewx4(wptr, dzz - 1, 12);
+                wptr = dtoa_g_wxp4(dzz - 1, 12, wptr);
 	      }
 	    }
 	  }
 	  wptr = memcpya(wptr, " \n", 2);
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto glm_logistic_assoc_ret_WRITE_FAIL;
 	  }
 	}
@@ -7228,7 +7002,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     break;
   }
  glm_logistic_assoc_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_msa);
   free_cond(condition_uidxs);
@@ -7237,13 +7011,13 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 
 #ifndef NOLAPACK
 int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t glm_modifier, double glm_vif_thresh, uint32_t glm_xchr_model, uint32_t glm_mperm_val, Range_list* parameters_range_list_ptr, Range_list* tests_range_list_ptr, double ci_size, double ci_zt, double pfilter, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* mar [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + BITCT - 1) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctv2 = 2 * unfiltered_sample_ctl;
   FILE* outfile = NULL;
   uintptr_t sample_uidx = 0;
-  uintptr_t topsize = 0;
   uintptr_t max_param_name_len = 2;
   uintptr_t param_raw_ct = 1;
   uintptr_t condition_ct = 0;
@@ -7344,36 +7118,30 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     hh_or_mt_exists |= NXMHH_EXISTS;
   }
   if (condition_mname || condition_fname) {
-    loadbuf_raw = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-    if (!loadbuf_raw) {
+    if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
     loadbuf_raw[unfiltered_sample_ctv2 - 2] = 0;
     loadbuf_raw[unfiltered_sample_ctv2 - 1] = 0;
-    ulii = topsize;
 
     if (hh_or_mt_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
-      sample_include2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      if (!sample_include2) {
+      if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &sample_include2)) {
         goto glm_linear_nosnp_ret_NOMEM;
       }
-      fill_vec_55(sample_include2, unfiltered_sample_ct); // harmless
+      fill_quatervec_55(unfiltered_sample_ct, sample_include2); // harmless
     }
     if (hh_or_mt_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
-      sample_male_include2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      if (!sample_male_include2) {
+      if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &sample_male_include2)) {
 	goto glm_linear_nosnp_ret_NOMEM;
       }
-      fill_ulong_zero(sample_male_include2, unfiltered_sample_ctv2);
-      vec_include_init(unfiltered_sample_ct, sample_male_include2, sex_male);
+      init_quaterarr_from_bitarr(sex_male, unfiltered_sample_ct, sample_male_include2);
     }
-    wkspace_left -= topsize;
     retval = glm_scan_conditions(condition_mname, condition_fname, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, chrom_info_ptr, hh_or_mt_exists, loadbuf_raw, bedfile, bed_offset, unfiltered_sample_ct, sex_male, load_mask, &sample_valid_ct, &condition_ct, &condition_uidxs, sample_include2, sample_male_include2);
-    wkspace_left += topsize;
     if (retval) {
       goto glm_linear_nosnp_ret_1;
     }
-    topsize = ulii; // deallocate temporary sample[_male]_include2
+    // deallocate temporary sample[_male]_include2
+    bigstack_end_reset(loadbuf_raw);
     param_raw_ct += condition_ct;
   }
   param_raw_ct += covar_ct;
@@ -7399,48 +7167,44 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     } while (sample_idx < sample_valid_ct);
     if (variation_in_sex) {
       param_raw_ct++;
-      bitfield_and(load_mask, sex_nm, unfiltered_sample_ctl);
+      bitvec_and(sex_nm, unfiltered_sample_ctl, load_mask);
       sample_valid_ct = popcount_longs(load_mask, unfiltered_sample_ctl);
     } else {
       logerrprint("Warning: Ignoring --linear 'sex' modifier since sex is invariant.\n");
     }
   }
-  sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
+  sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
 
   if (condition_mname || condition_fname) {
     // now that we've determined which samples will be in the regression,
     // initialize collapsed sample_include2, sample_male_include2, sex_male
     if (hh_or_mt_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
-      sample_include2 = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * sizeof(intptr_t));
-      fill_vec_55(sample_include2, sample_valid_ct);
+      bigstack_end_alloc_ul(sample_valid_ctv2, &sample_include2);
+      fill_quatervec_55(sample_valid_ct, sample_include2);
     }
     if (hh_or_mt_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
-      sample_male_include2 = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * sizeof(intptr_t));
-      alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, load_mask, sex_male, &sample_include2, &sample_male_include2);
+      bigstack_end_alloc_ul(sample_valid_ctv2, &sample_male_include2);
+      alloc_collapsed_haploid_filters(load_mask, sex_male, unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, &sample_include2, &sample_male_include2);
     }
-    loadbuf_collapsed = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * sizeof(intptr_t));
-    if (!loadbuf_collapsed) {
+    if (bigstack_end_alloc_ul(sample_valid_ctv2, &loadbuf_collapsed) ||
+        bigstack_end_alloc_ul(sample_valid_ctv2 / 2, &sex_male_collapsed)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
     loadbuf_collapsed[sample_valid_ctv2 - 2] = 0;
     loadbuf_collapsed[sample_valid_ctv2 - 1] = 0;
-    sex_male_collapsed = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * (sizeof(intptr_t) / 2));
-    if (!sex_male_collapsed) {
-      goto glm_linear_nosnp_ret_NOMEM;
-    }
-    collapse_copy_bitarr_incl(unfiltered_sample_ct, sex_male, load_mask, sample_valid_ct, sex_male_collapsed);
+    copy_bitarr_subset(sex_male, load_mask, unfiltered_sample_ct, sample_valid_ct, sex_male_collapsed);
   }
-  param_raw_ctl = (param_raw_ct + BITCT - 1) / BITCT;
-  if (aligned_malloc(&active_params, param_raw_ctl * sizeof(intptr_t))) {
+  param_raw_ctl = BITCT_TO_WORDCT(param_raw_ct);
+  if (aligned_malloc(param_raw_ctl * sizeof(intptr_t), &active_params)) {
     goto glm_linear_nosnp_ret_NOMEM;
   }
   if (parameters_range_list_ptr->name_ct) {
     fill_ulong_zero(active_params, param_raw_ctl);
     active_params[0] = 1;
-    numeric_range_list_to_bitfield(parameters_range_list_ptr, param_raw_ct, active_params, 0, 1);
+    numeric_range_list_to_bitarr(parameters_range_list_ptr, param_raw_ct, 0, 1, active_params);
     param_ct = popcount_longs(active_params, param_raw_ctl);
   } else {
-    fill_all_bits(active_params, param_raw_ct);
+    fill_all_bits(param_raw_ct, active_params);
     param_ct = param_raw_ct;
   }
   if (param_ct == 1) {
@@ -7477,17 +7241,17 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   }
   param_ctx = param_ct;
   if (tests_range_list_ptr->name_ct || (glm_modifier & GLM_TEST_ALL)) {
-    ulii = (param_ct + (BITCT - 1)) / BITCT;
-    if (aligned_malloc(&joint_test_params, ulii * sizeof(intptr_t))) {
+    ulii = BITCT_TO_WORDCT(param_ct);
+    if (aligned_malloc(ulii * sizeof(intptr_t), &joint_test_params)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
     fill_ulong_zero(joint_test_params, ulii);
     if (tests_range_list_ptr->name_ct) {
-      numeric_range_list_to_bitfield(tests_range_list_ptr, param_ct - 1, joint_test_params, 1, 1);
+      numeric_range_list_to_bitarr(tests_range_list_ptr, param_ct - 1, 1, 1, joint_test_params);
       constraint_ct = popcount_longs(joint_test_params, ulii);
     } else {
       constraint_ct = param_ct - 1;
-      fill_bits(joint_test_params, 0, constraint_ct);
+      fill_bits(0, constraint_ct, joint_test_params);
     }
     if ((constraint_ct > 1) || (hide_covar && (constraint_ct == 1))) {
       // permit hide-covar + single --tests parameter combination
@@ -7509,12 +7273,10 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       max_param_name_len = 4;
     }
   }
-  wkspace_left -= topsize;
-  if (wkspace_alloc_c_checked(&param_names, param_ctx * max_param_name_len) ||
-      wkspace_alloc_d_checked(&covars_cov_major, param_ct * sample_valid_ct * sizeof(double))) {
-    goto glm_linear_nosnp_ret_NOMEM2;
+  if (bigstack_alloc_c(param_ctx * max_param_name_len, &param_names) ||
+      bigstack_alloc_d(param_ct * sample_valid_ct, &covars_cov_major)) {
+    goto glm_linear_nosnp_ret_NOMEM;
   }
-  wkspace_left += topsize;
   for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
     covars_cov_major[sample_idx] = 1;
   }
@@ -7539,7 +7301,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
 	goto glm_linear_nosnp_ret_READ_FAIL;
       }
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_collapsed)) {
 	goto glm_linear_nosnp_ret_READ_FAIL;
       }
       chrom_idx = get_marker_chrom(chrom_info_ptr, marker_uidx);
@@ -7565,16 +7327,15 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       param_idx++;
     }
   }
-  // topsize = 0;
+  bigstack_end_reset(bigstack_end_mark);
   if (constraint_ct) {
-    if (wkspace_alloc_d_checked(&constraints_con_major, constraint_ct * param_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&df_df_buf, constraint_ct * constraint_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&df_buf, constraint_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&param_df_buf, constraint_ct * param_ct * sizeof(double)) ||
-	wkspace_alloc_d_checked(&param_df_buf2, constraint_ct * param_ct * sizeof(double))) {
+    if (bigstack_calloc_d(constraint_ct * param_ct, &constraints_con_major) ||
+        bigstack_alloc_d(constraint_ct * constraint_ct, &df_df_buf) ||
+        bigstack_alloc_d(constraint_ct, &df_buf) ||
+        bigstack_alloc_d(constraint_ct * param_ct, &param_df_buf) ||
+	bigstack_alloc_d(constraint_ct * param_ct, &param_df_buf2)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
-    fill_double_zero(constraints_con_major, constraint_ct * param_ct);
     uljj = 0;
     for (constraint_idx = 0; constraint_idx < constraint_ct; uljj++, constraint_idx++) {
       next_set_ul_unsafe_ck(joint_test_params, &uljj);
@@ -7582,27 +7343,26 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     }
     wptr = memcpya(&(param_names[param_ct * max_param_name_len]), (glm_modifier & GLM_TEST_ALL)? "FULL" : "USER", 4);
     *wptr++ = '_';
-    wptr = uint32_write(wptr, constraint_ct);
+    wptr = uint32toa(constraint_ct, wptr);
     memcpy(wptr, "DF", 3);
   }
-  if (wkspace_alloc_d_checked(&covars_sample_major, param_ct * sample_valid_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&perm_2success_ct, (param_ctx - 1) * sizeof(int32_t)) ||
-      wkspace_alloc_d_checked(&orig_stats, (param_ctx - 1) * sizeof(double)) ||
-      wkspace_alloc_d_checked(&param_2d_buf, param_ct * param_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&param_2d_buf2, param_ct * param_ct * sizeof(double))) {
+  if (bigstack_alloc_d(param_ct * sample_valid_ct, &covars_sample_major) ||
+      bigstack_calloc_ui(param_ctx - 1, &perm_2success_ct) ||
+      bigstack_alloc_d(param_ctx - 1, &orig_stats) ||
+      bigstack_alloc_d(param_ct * param_ct, &param_2d_buf) ||
+      bigstack_alloc_d(param_ct * param_ct, &param_2d_buf2)) {
     goto glm_linear_nosnp_ret_NOMEM;
   }
-  mi_buf = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc(param_ct * sizeof(MATRIX_INVERT_BUF1_TYPE));
+  mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(param_ct * sizeof(MATRIX_INVERT_BUF1_TYPE));
   if (!mi_buf) {
     goto glm_linear_nosnp_ret_NOMEM;
   }
-  fill_uint_zero(perm_2success_ct, param_ctx - 1);
   sample_uidx = 0;
   sample_idx = 0;
-  if (wkspace_alloc_d_checked(&g_pheno_d2, sample_valid_ct * sizeof(double))) {
+  if (bigstack_alloc_d(sample_valid_ct, &g_perm_pheno_d2)) {
     goto glm_linear_nosnp_ret_NOMEM;
   }
-  dptr = g_pheno_d2;
+  dptr = g_perm_pheno_d2;
   g_pheno_sum = 0;
   g_pheno_ssq = 0;
   do {
@@ -7656,7 +7416,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     // with no SNPs, only need to do this once
     dzz = g_pheno_sum / ((double)((intptr_t)sample_valid_ct)); // mean
     dyy = sqrt(((double)((intptr_t)(sample_valid_ct - 1))) / (g_pheno_ssq - g_pheno_sum * dzz)); // 1/stdev
-    dptr = g_pheno_d2;
+    dptr = g_perm_pheno_d2;
     for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
       *dptr = ((*dptr) - dzz) * dyy;
       dptr++;
@@ -7693,8 +7453,8 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   }
 
   // required for multithreaded permutation generation
-  g_cluster_ct = 0;
-  g_pheno_nm_ct = sample_valid_ct;
+  g_perm_cluster_ct = 0;
+  g_perm_pheno_nm_ct = sample_valid_ct;
 
   perms_done = 0;
 
@@ -7707,17 +7467,16 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     perm_batch_size = 1;
     mperm_save = 0;
   }
-  ulii = (perm_batch_size + (BITCT - 1)) / BITCT;
-  if (wkspace_alloc_ul_checked(&perm_fails, ulii * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&regression_results, perm_batch_size * (param_ctx - 1) * sizeof(double))) {
+  if (bigstack_alloc_ul(BITCT_TO_WORDCT(perm_batch_size), &perm_fails) ||
+      bigstack_alloc_d(perm_batch_size * (param_ctx - 1), &regression_results)) {
     goto glm_linear_nosnp_ret_NOMEM;
   }
 
   if (do_perms) {
-    if (wkspace_alloc_ui_checked(&g_precomputed_mods, (sample_valid_ct - 1) * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_valid_ct - 1, &g_perm_precomputed_mods)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
-    precompute_mods(sample_valid_ct, g_precomputed_mods);
+    precompute_mods(sample_valid_ct, g_perm_precomputed_mods);
   }
   // may want to put a multiple linear regression wrapper function in
   // plink_matrix, perhaps with the PLINK 1.07 svdcmp/svbksb no-LAPACK
@@ -7728,14 +7487,14 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   dgels_n = (int32_t)((uint32_t)param_ct);
   dgels_nrhs = perm_batch_size;
   dgels_ldb = dgels_m;
-  if (wkspace_alloc_d_checked(&g_perm_pmajor, perm_batch_size * sample_valid_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&dgels_a, param_ct * sample_valid_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&dgels_b, perm_batch_size * sample_valid_ct * sizeof(double))) {
+  if (bigstack_alloc_d(perm_batch_size * sample_valid_ct, &g_perm_pmajor) ||
+      bigstack_alloc_d(param_ct * sample_valid_ct, &dgels_a) ||
+      bigstack_alloc_d(perm_batch_size * sample_valid_ct, &dgels_b)) {
     goto glm_linear_nosnp_ret_NOMEM;
   }
   fill_double_zero(regression_results, param_ctx - 1);
   memcpy(dgels_a, covars_cov_major, param_ct * sample_valid_ct * sizeof(double));
-  memcpy(dgels_b, g_pheno_d2, sample_valid_ct * sizeof(double));
+  memcpy(dgels_b, g_perm_pheno_d2, sample_valid_ct * sizeof(double));
   dgels_(&dgels_trans, &dgels_m, &dgels_n, &dgels_nrhs, dgels_a, &dgels_m, dgels_b, &dgels_ldb, &dxx, &dgels_lwork, &dgels_info);
   if (dxx > 2147483647.0) {
     // maybe this can't actually happen, but just in case...
@@ -7746,7 +7505,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     goto glm_linear_nosnp_ret_1;
   }
   dgels_lwork = (int32_t)dxx;
-  if (wkspace_alloc_d_checked(&dgels_work, dgels_lwork * sizeof(double))) {
+  if (bigstack_alloc_d(dgels_lwork, &dgels_work)) {
     goto glm_linear_nosnp_ret_NOMEM;
   }
   dgels_nrhs = 1;
@@ -7765,31 +7524,31 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     max_thread_ct = uii;
   }
   if (cluster_starts && do_perms) {
-    retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, NULL, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, NULL, NULL);
+    retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, NULL, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, NULL, NULL);
     if (retval) {
       goto glm_linear_nosnp_ret_1;
     }
-    if (!g_cluster_ct) {
+    if (!g_perm_cluster_ct) {
       goto glm_linear_nosnp_ret_NO_PERMUTATION_CLUSTERS;
     }
-    if (wkspace_alloc_ui_checked(&g_qassoc_cluster_thread_wkspace, max_thread_ct * ((g_cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32) * CACHELINE)) {
+    if (bigstack_alloc_ui(max_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
-    if (wkspace_alloc_ui_checked(&g_sample_to_cluster, sample_valid_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_valid_ct, &g_perm_sample_to_cluster)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
-    fill_unfiltered_sample_to_cluster(sample_valid_ct, g_cluster_ct, g_cluster_map, g_cluster_starts, g_sample_to_cluster);
+    fill_unfiltered_sample_to_cluster(sample_valid_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
   }
   if (do_perms) {
     // Note that, for now, the main nosnp regression loop is not multithreaded;
     // only the permutation generation process is.
-    if (wkspace_init_sfmtp(max_thread_ct)) {
+    if (bigstack_init_sfmtp(max_thread_ct)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
   }
 
   transpose_copy(param_ct, sample_valid_ct, covars_cov_major, covars_sample_major);
-  if (glm_linear(1, param_ct, sample_valid_ct, 0, NULL, 0, 0, 0, covars_cov_major, covars_sample_major, g_pheno_d2, dgels_b, param_2d_buf, mi_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_df_buf, param_df_buf2, df_df_buf, df_buf, &perm_fail_ct, perm_fails) || perm_fail_ct) {
+  if (glm_linear(1, param_ct, sample_valid_ct, 0, NULL, 0, 0, 0, covars_cov_major, covars_sample_major, g_perm_pheno_d2, dgels_b, param_2d_buf, mi_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_df_buf, param_df_buf2, df_df_buf, df_buf, &perm_fail_ct, perm_fails) || perm_fail_ct) {
     logerrprint("Warning: Skipping --linear no-snp due to multicollinearity.\n");
     goto glm_linear_nosnp_ret_1;
   }
@@ -7801,18 +7560,18 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (mperm_save) {
     // --mperm-save prevented during command-line parsing, so must be
     // --mperm-save-all
-    if (wkspace_alloc_d_checked(&mperm_save_stats, glm_mperm_val * (param_ctx - 1) * sizeof(double))) {
+    if (bigstack_alloc_d(glm_mperm_val * (param_ctx - 1), &mperm_save_stats)) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
     *outname_end = '\0';
-    LOGPREPRINTFWW(logbuf, "Dumping all permutation absolute t-stats to %s.[testID].mperm.dump.all.\n", outname);
-    fputs(logbuf, stdout);
+    LOGPREPRINTFWW(g_logbuf, "Dumping all permutation absolute t-stats to %s.[testID].mperm.dump.all.\n", outname);
+    fputs(g_logbuf, stdout);
     if (constraint_ct) {
       logprint("(exception: chi-square values will be dumped for joint test)\n");
     }
   }
   outname_end2 = memcpyb(outname_end, ".assoc.linear", 14);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto glm_linear_nosnp_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing linear model association results to %s ... ", outname);
@@ -7837,36 +7596,36 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     orig_stats[param_idx - 1] = fabs(zval);
     pval = calc_tprob(zval, sample_valid_ct - param_ct);
     if ((!hide_covar) && ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0)))) {
-      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
-      wptr = uint32_writew8x(wptr, (uint32_t)sample_valid_ct, ' ');
-      wptr = double_g_writewx4x(wptr, dxx, 10, ' ');
+      wptr = uint32toa_w8x((uint32_t)sample_valid_ct, ' ', wptr);
+      wptr = dtoa_g_wxp4x(dxx, 10, ' ', wptr);
       if (display_ci) {
 	dyy = ci_zt * se;
-	wptr = double_g_writewx4x(wptr, se, 8, ' ');
-	wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
-	wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+	wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
+	wptr = dtoa_g_wxp4x(dxx - dyy, 8, ' ', wptr);
+	wptr = dtoa_g_wxp4x(dxx + dyy, 8, ' ', wptr);
       }
-      wptr = double_g_writewx4x(wptr, zval, 12, ' ');
-      wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = dtoa_g_wxp4x(zval, 12, ' ', wptr);
+      wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto glm_linear_nosnp_ret_WRITE_FAIL;
       }
     }      
   }
   if (report_intercept) {
-    wptr = memcpya(tbuf, " INTERCEPT ", 11);
-    wptr = uint32_writew8x(wptr, (uint32_t)sample_valid_ct, ' ');
-    wptr = double_g_writewx4x(wptr, dgels_b[0], 10, ' ');
+    wptr = memcpya(g_textbuf, " INTERCEPT ", 11);
+    wptr = uint32toa_w8x((uint32_t)sample_valid_ct, ' ', wptr);
+    wptr = dtoa_g_wxp4x(dgels_b[0], 10, ' ', wptr);
     if (display_ci) {
       se = sqrt(param_2d_buf2[0]);
       dyy = ci_zt * se;
-      wptr = double_g_writewx4x(wptr, se, 8, ' ');
-      wptr = double_g_writewx4x(wptr, dgels_b[0] - dyy, 8, ' ');
-      wptr = double_g_writewx4x(wptr, dgels_b[0] + dyy, 8, ' ');
+      wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
+      wptr = dtoa_g_wxp4x(dgels_b[0] - dyy, 8, ' ', wptr);
+      wptr = dtoa_g_wxp4x(dgels_b[0] + dyy, 8, ' ', wptr);
     }
     wptr = memcpya(wptr, "          NA           NA\n", 26);
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto glm_linear_nosnp_ret_WRITE_FAIL;
     }
   }
@@ -7875,16 +7634,16 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     orig_stats[param_ct - 1] = dxx;
     pval = chiprob_p(dxx, constraint_ct);
     if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
-      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
-      wptr = uint32_writew8(wptr, (uint32_t)sample_valid_ct);
+      wptr = uint32toa_w8((uint32_t)sample_valid_ct, wptr);
       wptr = memcpya(wptr, "         NA ", 12);
       if (display_ci) {
 	wptr = memcpya(wptr, "      NA       NA       NA ", 27);
       }
-      wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
-      wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
+      wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto glm_linear_nosnp_ret_WRITE_FAIL;
       }
     }
@@ -7904,25 +7663,22 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     ulii = 0;
 
     if (cur_batch_size >= CACHELINE_INT32 * max_thread_ct) {
-      g_assoc_thread_ct = max_thread_ct;
+      g_perm_generation_thread_ct = max_thread_ct;
     } else {
-      g_assoc_thread_ct = cur_batch_size / CACHELINE_INT32;
-      if (!g_assoc_thread_ct) {
-	g_assoc_thread_ct = 1;
-      }
+      g_perm_generation_thread_ct = MAXV(cur_batch_size / CACHELINE_INT32, 1);
     }
-    if (!g_cluster_ct) {
-      if (spawn_threads(threads, &linear_gen_perms_thread, g_assoc_thread_ct)) {
+    if (!g_perm_cluster_ct) {
+      if (spawn_threads(threads, &generate_qt_perms_pmajor_thread, g_perm_generation_thread_ct)) {
 	goto glm_linear_nosnp_ret_THREAD_CREATE_FAIL;
       }
-      linear_gen_perms_thread((void*)ulii);
+      generate_qt_perms_pmajor_thread((void*)ulii);
     } else {
-      if (spawn_threads(threads, &linear_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_qt_cluster_perms_pmajor_thread, g_perm_generation_thread_ct)) {
 	goto glm_linear_nosnp_ret_THREAD_CREATE_FAIL;
       }
-      linear_gen_cluster_perms_thread((void*)ulii);
+      generate_qt_cluster_perms_pmajor_thread((void*)ulii);
     }
-    join_threads(threads, g_assoc_thread_ct);
+    join_threads(threads, g_perm_generation_thread_ct);
     dgels_nrhs = cur_batch_size;
     fill_double_zero(regression_results, (param_ctx - 1) * cur_batch_size);
     memcpy(dgels_a, covars_cov_major, param_ct * sample_valid_ct * sizeof(double));
@@ -7930,7 +7686,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     dgels_(&dgels_trans, &dgels_m, &dgels_n, &dgels_nrhs, dgels_a, &dgels_m, dgels_b, &dgels_ldb, dgels_work, &dgels_lwork, &dgels_info);
     if (glm_linear(cur_batch_size, param_ct, sample_valid_ct, 0, NULL, 0, 0, 0, covars_cov_major, covars_sample_major, g_perm_pmajor, dgels_b, param_2d_buf, mi_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_df_buf, param_df_buf2, df_df_buf, df_buf, &perm_fail_ct, perm_fails)) {
       perm_fail_ct = cur_batch_size;
-      fill_bits(perm_fails, 0, cur_batch_size);
+      fill_bits(0, cur_batch_size, perm_fails);
     }
     perm_fail_total += perm_fail_ct;
     ulii = param_ct - 1;
@@ -7991,7 +7747,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (do_perms) {
     putchar('\n');
     memcpy(outname_end2, ".mperm", 7);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto glm_linear_nosnp_ret_OPEN_FAIL;
     }
     if (fputs_checked("      TEST         EMP1           NP \n", outfile)) {
@@ -7999,37 +7755,37 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     }
     dxx = 0.5 / ((double)((int32_t)(glm_mperm_val - perm_fail_total) + 1));
     for (param_idx = 1; param_idx < param_ct; param_idx++) {
-      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
       pval = ((double)(perm_2success_ct[param_idx - 1] + 2)) * dxx;
       if (pval <= pfilter) {
 	if (!perm_count) {
-	  wptr = double_g_writewx4(wptr, pval, 12);
+	  wptr = dtoa_g_wxp4(pval, 12, wptr);
 	} else {
-          wptr = double_g_writewx4(wptr, ((double)perm_2success_ct[param_idx - 1]) * 0.5, 12);
+          wptr = dtoa_g_wxp4(((double)perm_2success_ct[param_idx - 1]) * 0.5, 12, wptr);
 	}
         wptr = memseta(wptr, 32, 3);
-        wptr = uint32_writew10(wptr, glm_mperm_val - perm_fail_total);
+        wptr = uint32toa_w10(glm_mperm_val - perm_fail_total, wptr);
         wptr = memcpya(wptr, " \n", 2);
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto glm_linear_nosnp_ret_WRITE_FAIL;
 	}
       }
     }
     if (constraint_ct) {
-      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
       pval = ((double)(perm_2success_ct[param_ct - 1] + 2)) * 0.5 / ((double)((int32_t)(glm_mperm_val - perm_fail_total - joint_perm_fail_extra) + 1));
       if (pval <= pfilter) {
 	if (!perm_count) {
-	  wptr = double_g_writewx4(wptr, pval, 12);
+	  wptr = dtoa_g_wxp4(pval, 12, wptr);
 	} else {
-          wptr = double_g_writewx4(wptr, ((double)perm_2success_ct[param_ct - 1]) * 0.5, 12);
+          wptr = dtoa_g_wxp4(((double)perm_2success_ct[param_ct - 1]) * 0.5, 12, wptr);
 	}
         wptr = memseta(wptr, 32, 3);
-        wptr = uint32_writew10(wptr, glm_mperm_val - perm_fail_total - joint_perm_fail_extra);
+        wptr = uint32toa_w10(glm_mperm_val - perm_fail_total - joint_perm_fail_extra, wptr);
         wptr = memcpya(wptr, " \n", 2);
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto glm_linear_nosnp_ret_WRITE_FAIL;
 	}
       }
@@ -8048,43 +7804,41 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       for (param_idx = 1; param_idx < param_ct; param_idx++) {
 	wptr = strcpya(&(outname_end[1]), &(param_names[param_idx * max_param_name_len]));
 	memcpy(wptr, ".mperm.dump.all", 17);
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto glm_linear_nosnp_ret_OPEN_FAIL;
 	}
 	ulii = param_ctx - 1;
-	wptr = memcpya(tbuf, "0 ", 2);
-	wptr = double_g_writex(wptr, orig_stats[param_idx - 1], '\n');
-	wptr2 = &(tbuf[MAXLINELEN]);
+	wptr = memcpya(g_textbuf, "0 ", 2);
+	wptr = dtoa_gx(orig_stats[param_idx - 1], '\n', wptr);
+	wptr2 = &(g_textbuf[MAXLINELEN]);
 	dptr = &(mperm_save_stats[param_idx - 1]);
 	for (perm_idx = 0; perm_idx < glm_mperm_val; perm_idx++) {
-	  wptr = uint32_writex(wptr, perm_idx + 1, ' ');
+	  wptr = uint32toa_x(perm_idx + 1, ' ', wptr);
 	  dxx = dptr[perm_idx * ulii];
 	  if (dxx >= 0) {
-	    wptr = double_g_writex(wptr, dxx, '\n');
+	    wptr = dtoa_gx(dxx, '\n', wptr);
 	  } else {
 	    wptr = memcpyl3a(wptr, "NA\n");
 	  }
 	  if (wptr >= wptr2) {
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto glm_linear_nosnp_ret_WRITE_FAIL;
 	    }
-	    wptr = tbuf;
+	    wptr = g_textbuf;
 	  }
 	}
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto glm_linear_nosnp_ret_WRITE_FAIL;
 	}
 	if (fclose_null(&outfile)) {
 	  goto glm_linear_nosnp_ret_WRITE_FAIL;
 	}
 	LOGPREPRINTFWW("%s written.\n", outname);
-	logstr(logbuf);
+	logstr(g_logbuf);
       }
     }
   }
   while (0) {
-  glm_linear_nosnp_ret_NOMEM2:
-    wkspace_left += topsize;
   glm_linear_nosnp_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -8110,7 +7864,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     break;
   }
  glm_linear_nosnp_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(outfile);
   aligned_free_cond(active_params);
   aligned_free_cond(joint_test_params);
@@ -8120,13 +7874,13 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 #endif
 
 int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t glm_modifier, double glm_vif_thresh, uint32_t glm_xchr_model, uint32_t glm_mperm_val, Range_list* parameters_range_list_ptr, Range_list* tests_range_list_ptr, double ci_size, double ci_zt, double pfilter, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* m [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + BITCT - 1) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t unfiltered_sample_ctv2 = 2 * unfiltered_sample_ctl;
   FILE* outfile = NULL;
   uintptr_t sample_uidx = 0;
-  uintptr_t topsize = 0;
   uintptr_t max_param_name_len = 2;
   uintptr_t param_raw_ct = 1;
   uintptr_t condition_ct = 0;
@@ -8187,6 +7941,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   double* dptr;
   uintptr_t sample_valid_ct;
   uintptr_t sample_valid_cta4;
+  uintptr_t sample_valid_ctv;
   uintptr_t sample_valid_ctv2;
   uintptr_t sample_uidx_stop;
   uintptr_t sample_idx;
@@ -8224,36 +7979,30 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     hh_or_mt_exists |= NXMHH_EXISTS;
   }
   if (condition_mname || condition_fname) {
-    loadbuf_raw = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-    if (!loadbuf_raw) {
+    if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
     loadbuf_raw[unfiltered_sample_ctv2 - 2] = 0;
     loadbuf_raw[unfiltered_sample_ctv2 - 1] = 0;
-    ulii = topsize;
 
     if (hh_or_mt_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
-      sample_include2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      if (!sample_include2) {
+      if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &sample_include2)) {
         goto glm_logistic_nosnp_ret_NOMEM;
       }
-      fill_vec_55(sample_include2, unfiltered_sample_ct); // harmless
+      fill_quatervec_55(unfiltered_sample_ct, sample_include2); // harmless
     }
     if (hh_or_mt_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
-      sample_male_include2 = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctv2 * sizeof(intptr_t));
-      if (!sample_male_include2) {
+      if (bigstack_end_alloc_ul(unfiltered_sample_ctv2, &sample_male_include2)) {
 	goto glm_logistic_nosnp_ret_NOMEM;
       }
-      fill_ulong_zero(sample_male_include2, unfiltered_sample_ctv2);
-      vec_include_init(unfiltered_sample_ct, sample_male_include2, sex_male);
+      init_quaterarr_from_bitarr(sex_male, unfiltered_sample_ct, sample_male_include2);
     }
-    wkspace_left -= topsize;
     retval = glm_scan_conditions(condition_mname, condition_fname, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, chrom_info_ptr, hh_or_mt_exists, loadbuf_raw, bedfile, bed_offset, unfiltered_sample_ct, sex_male, load_mask, &sample_valid_ct, &condition_ct, &condition_uidxs, sample_include2, sample_male_include2);
-    wkspace_left += topsize;
     if (retval) {
       goto glm_logistic_nosnp_ret_1;
     }
-    topsize = ulii; // deallocate temporary sample[_male]_include2
+    // deallocate temporary sample[_male]_include2
+    bigstack_end_reset(loadbuf_raw);
     param_raw_ct += condition_ct;
   }
   param_raw_ct += covar_ct;
@@ -8279,52 +8028,49 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     } while (sample_idx < sample_valid_ct);
     if (variation_in_sex) {
       param_raw_ct++;
-      bitfield_and(load_mask, sex_nm, unfiltered_sample_ctl);
+      bitvec_and(sex_nm, unfiltered_sample_ctl, load_mask);
       sample_valid_ct = popcount_longs(load_mask, unfiltered_sample_ctl);
     } else {
       logerrprint("Warning: Ignoring --logistic 'sex' modifier since sex is invariant.\n");
     }
   }
-  sample_valid_cta4 = (sample_valid_ct + 3) & (~(3 * ONELU));
-  sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
+  sample_valid_cta4 = round_up_pow2(sample_valid_ct, 4);
+  sample_valid_ctv = BITCT_TO_ALIGNED_WORDCT(sample_valid_ct);
+  sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
 
   if (condition_mname || condition_fname) {
     // now that we've determined which samples will be in the regression,
     // initialize collapsed sample_include2, sample_male_include2, sex_male
     if (hh_or_mt_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
-      sample_include2 = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * sizeof(intptr_t));
-      fill_vec_55(sample_include2, sample_valid_ct);
+      bigstack_end_alloc_ul(sample_valid_ctv2, &sample_include2);
+      fill_quatervec_55(sample_valid_ct, sample_include2);
     }
     if (hh_or_mt_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
-      sample_male_include2 = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * sizeof(intptr_t));
-      alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, load_mask, sex_male, &sample_include2, &sample_male_include2);
+      bigstack_end_alloc_ul(sample_valid_ctv2, &sample_male_include2);
+      alloc_collapsed_haploid_filters(load_mask, sex_male, unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, &sample_include2, &sample_male_include2);
     }
-    loadbuf_collapsed = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * sizeof(intptr_t));
-    if (!loadbuf_collapsed) {
+    if (bigstack_end_alloc_ul(sample_valid_ctv2, &loadbuf_collapsed) ||
+        bigstack_end_alloc_ul(sample_valid_ctv2 / 2, &sex_male_collapsed)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
     loadbuf_collapsed[sample_valid_ctv2 - 2] = 0;
     loadbuf_collapsed[sample_valid_ctv2 - 1] = 0;
-    sex_male_collapsed = (uintptr_t*)top_alloc(&topsize, sample_valid_ctv2 * (sizeof(intptr_t) / 2));
-    if (!sex_male_collapsed) {
-      goto glm_logistic_nosnp_ret_NOMEM;
-    }
-    collapse_copy_bitarr_incl(unfiltered_sample_ct, sex_male, load_mask, sample_valid_ct, sex_male_collapsed);
+    copy_bitarr_subset(sex_male, load_mask, unfiltered_sample_ct, sample_valid_ct, sex_male_collapsed);
   }
-  param_raw_ctl = (param_raw_ct + BITCT - 1) / BITCT;
-  if (aligned_malloc(&active_params, param_raw_ctl * sizeof(intptr_t))) {
+  param_raw_ctl = BITCT_TO_WORDCT(param_raw_ct);
+  if (aligned_malloc(param_raw_ctl * sizeof(intptr_t), &active_params)) {
     goto glm_logistic_nosnp_ret_NOMEM;
   }
   if (parameters_range_list_ptr->name_ct) {
     fill_ulong_zero(active_params, param_raw_ctl);
     active_params[0] = 1;
-    numeric_range_list_to_bitfield(parameters_range_list_ptr, param_raw_ct, active_params, 0, 1);
+    numeric_range_list_to_bitarr(parameters_range_list_ptr, param_raw_ct, 0, 1, active_params);
     param_ct = popcount_longs(active_params, param_raw_ctl);
   } else {
-    fill_all_bits(active_params, param_raw_ct);
+    fill_all_bits(param_raw_ct, active_params);
     param_ct = param_raw_ct;
   }
-  param_cta4 = (param_ct + 3) & (~(3 * ONELU));
+  param_cta4 = round_up_pow2(param_ct, 4);
   if (param_ct == 1) {
     logerrprint("Warning: Skipping --logistic since the intercept is the only variable.\n");
     goto glm_logistic_nosnp_ret_1;
@@ -8359,17 +8105,17 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   }
   param_ctx = param_ct;
   if (tests_range_list_ptr->name_ct || (glm_modifier & GLM_TEST_ALL)) {
-    ulii = (param_ct + (BITCT - 1)) / BITCT;
-    if (aligned_malloc(&joint_test_params, ulii * sizeof(intptr_t))) {
+    ulii = BITCT_TO_WORDCT(param_ct);
+    if (aligned_malloc(ulii * sizeof(intptr_t), &joint_test_params)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
     fill_ulong_zero(joint_test_params, ulii);
     if (tests_range_list_ptr->name_ct) {
-      numeric_range_list_to_bitfield(tests_range_list_ptr, param_ct - 1, joint_test_params, 1, 1);
+      numeric_range_list_to_bitarr(tests_range_list_ptr, param_ct - 1, 1, 1, joint_test_params);
       constraint_ct = popcount_longs(joint_test_params, ulii);
     } else {
       constraint_ct = param_ct - 1;
-      fill_bits(joint_test_params, 0, constraint_ct);
+      fill_bits(0, constraint_ct, joint_test_params);
     }
     if ((constraint_ct > 1) || (hide_covar && (constraint_ct == 1))) {
       // permit hide-covar + single --tests parameter combination
@@ -8391,12 +8137,10 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       max_param_name_len = 4;
     }
   }
-  wkspace_left -= topsize;
-  if (wkspace_alloc_c_checked(&param_names, param_ctx * max_param_name_len) ||
-      wkspace_alloc_f_checked(&covars_cov_major, param_ct * sample_valid_cta4 * sizeof(float))) {
-    goto glm_logistic_nosnp_ret_NOMEM2;
+  if (bigstack_alloc_c(param_ctx * max_param_name_len, &param_names) ||
+      bigstack_alloc_f(param_ct * sample_valid_cta4, &covars_cov_major)) {
+    goto glm_logistic_nosnp_ret_NOMEM;
   }
-  wkspace_left += topsize;
   for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
     covars_cov_major[sample_idx] = 1;
   }
@@ -8428,7 +8172,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
 	goto glm_logistic_nosnp_ret_READ_FAIL;
       }
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, load_mask, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_collapsed)) {
 	goto glm_logistic_nosnp_ret_READ_FAIL;
       }
       chrom_idx = get_marker_chrom(chrom_info_ptr, marker_uidx);
@@ -8454,17 +8198,16 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       param_idx++;
     }
   }
-  // topsize = 0;
-  if (wkspace_alloc_ui_checked(&perm_2success_ct, (param_ctx - 1) * sizeof(int32_t)) ||
-      wkspace_alloc_d_checked(&orig_stats, (param_ctx - 1) * sizeof(double))) {
+  bigstack_end_reset(bigstack_end_mark);
+  if (bigstack_alloc_ui(param_ctx - 1, &perm_2success_ct) ||
+      bigstack_alloc_d(param_ctx - 1, &orig_stats)) {
     goto glm_logistic_nosnp_ret_NOMEM;
   }
 
   if (constraint_ct) {
-    if (wkspace_alloc_d_checked(&constraints_con_major, constraint_ct * param_ct * sizeof(double))) {
+    if (bigstack_calloc_d(constraint_ct * param_ct, &constraints_con_major)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
-    fill_double_zero(constraints_con_major, constraint_ct * param_ct);
     uljj = 0;
     for (constraint_idx = 0; constraint_idx < constraint_ct; uljj++, constraint_idx++) {
       next_set_ul_unsafe_ck(joint_test_params, &uljj);
@@ -8472,7 +8215,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     }
     wptr = memcpya(&(param_names[param_ct * max_param_name_len]), (glm_modifier & GLM_TEST_ALL)? "FULL" : "USER", 4);
     *wptr++ = '_';
-    wptr = uint32_write(wptr, constraint_ct);
+    wptr = uint32toa(constraint_ct, wptr);
     memcpy(wptr, "DF", 3);
   }
   fill_uint_zero(perm_2success_ct, param_ctx - 1);
@@ -8511,8 +8254,8 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   // no more VIF check
 
   // required for multithreaded permutation generation
-  g_cluster_ct = 0;
-  g_pheno_nm_ct = sample_valid_ct;
+  g_perm_cluster_ct = 0;
+  g_perm_pheno_nm_ct = sample_valid_ct;
 
   perms_done = 0;
 
@@ -8525,24 +8268,23 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     perm_batch_size = 1;
     mperm_save = 0;
   }
-  ulii = (perm_batch_size + (BITCT - 1)) / BITCT;
   uii = ((perm_batch_size > 1) || skip_intercept)? 1 : 0;
-  if (wkspace_alloc_f_checked(&coef, param_cta4 * perm_batch_size * sizeof(float)) ||
-      wkspace_alloc_f_checked(&pp, sample_valid_cta4 * sizeof(float)) ||
-      wkspace_alloc_f_checked(&sample_1d_buf, sample_valid_ct * sizeof(float)) ||
-      wkspace_alloc_f_checked(&pheno_buf, sample_valid_ct * sizeof(float)) ||
-      wkspace_alloc_f_checked(&param_1d_buf, param_ct * sizeof(float)) ||
-      wkspace_alloc_f_checked(&param_1d_buf2, param_ct * sizeof(float)) ||
-      wkspace_alloc_f_checked(&param_2d_buf, param_ct * param_cta4 * sizeof(float)) ||
-      wkspace_alloc_f_checked(&param_2d_buf2, param_ct * param_cta4 * sizeof(float)) ||
-      wkspace_alloc_f_checked(&regression_results, perm_batch_size * (param_ctx - uii) * sizeof(float)) ||
-      wkspace_alloc_ul_checked(&perm_fails, ulii * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&g_perm_vecs, perm_batch_size * sample_valid_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_f(param_cta4 * perm_batch_size, &coef) ||
+      bigstack_alloc_f(sample_valid_cta4, &pp) ||
+      bigstack_alloc_f(sample_valid_ct, &sample_1d_buf) ||
+      bigstack_alloc_f(sample_valid_ct, &pheno_buf) ||
+      bigstack_alloc_f(param_ct, &param_1d_buf) ||
+      bigstack_alloc_f(param_ct, &param_1d_buf2) ||
+      bigstack_alloc_f(param_ct * param_cta4, &param_2d_buf) ||
+      bigstack_alloc_f(param_ct * param_cta4, &param_2d_buf2) ||
+      bigstack_alloc_f(perm_batch_size * (param_ctx - uii), &regression_results) ||
+      bigstack_alloc_ul(BITCT_TO_WORDCT(perm_batch_size), &perm_fails) ||
+      bigstack_alloc_ul(perm_batch_size * sample_valid_ctv, &g_perm_vecs)) {
     goto glm_logistic_nosnp_ret_NOMEM;
   }
-  vec_collapse_init(pheno_c, unfiltered_sample_ct, load_mask, sample_valid_ct, g_perm_vecs);
-  g_case_ct = popcount01_longs(g_perm_vecs, sample_valid_ctv2);
-  if ((!g_case_ct) || (g_case_ct == sample_valid_ct)) {
+  copy_bitarr_subset(pheno_c, load_mask, unfiltered_sample_ct, sample_valid_ct, g_perm_vecs);
+  g_perm_case_ct = popcount_longs(g_perm_vecs, sample_valid_ctv);
+  if ((!g_perm_case_ct) || (g_perm_case_ct == sample_valid_ct)) {
     goto glm_logistic_nosnp_ret_PHENO_CONSTANT;
   }
 
@@ -8551,43 +8293,44 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     max_thread_ct = uii;
   }
   if (cluster_starts && do_perms) {
-    retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, pheno_c, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_cluster_ct, &g_cluster_map, &g_cluster_starts, &g_cluster_case_cts, &g_cluster_cc_perm_preimage);
+    retval = cluster_include_and_reindex(unfiltered_sample_ct, load_mask, 1, pheno_c, sample_valid_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
     if (retval) {
       goto glm_logistic_nosnp_ret_1;
     }
-    if (!g_cluster_ct) {
+    if (!g_perm_cluster_ct) {
       goto glm_logistic_nosnp_ret_NO_PERMUTATION_CLUSTERS;
     }
-    if (cluster_alloc_and_populate_magic_nums(g_cluster_ct, g_cluster_map, g_cluster_starts, &g_tot_quotients, &g_totq_magics, &g_totq_preshifts, &g_totq_postshifts, &g_totq_incrs)) {
+    if (cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
-    if (wkspace_alloc_ui_checked(&g_sample_to_cluster, sample_valid_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(sample_valid_ct, &g_perm_sample_to_cluster)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
-    fill_unfiltered_sample_to_cluster(sample_valid_ct, g_cluster_ct, g_cluster_map, g_cluster_starts, g_sample_to_cluster);
+    fill_unfiltered_sample_to_cluster(sample_valid_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
   }
   if (constraint_ct) {
-    mi_buf = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc(param_ct * sizeof(MATRIX_INVERT_BUF1_TYPE));
+    mi_buf = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(param_ct * sizeof(MATRIX_INVERT_BUF1_TYPE));
     if (!mi_buf) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
-    if (wkspace_alloc_d_checked(&param_1d_dbuf, param_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&param_2d_dbuf, param_ct * param_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&param_2d_dbuf2, param_ct * param_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&param_df_dbuf, param_ct * constraint_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&df_df_dbuf, constraint_ct * constraint_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&df_dbuf, constraint_ct * sizeof(double))) {
+    if (bigstack_alloc_d(param_ct, &param_1d_dbuf) ||
+        bigstack_alloc_d(param_ct * param_ct, &param_2d_dbuf) ||
+        bigstack_alloc_d(param_ct * param_ct, &param_2d_dbuf2) ||
+        bigstack_alloc_d(param_ct * constraint_ct, &param_df_dbuf) ||
+        bigstack_alloc_d(constraint_ct * constraint_ct, &df_df_dbuf) ||
+        bigstack_alloc_d(constraint_ct, &df_dbuf)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
   }
   if (do_perms) {
-    g_tot_quotient = 0x100000000LLU / sample_valid_ct;
-    magic_num(g_tot_quotient, &g_totq_magic, &g_totq_preshift, &g_totq_postshift, &g_totq_incr);
+    g_perm_tot_quotient = 0x100000000LLU / sample_valid_ct;
+    magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
     // Note that, for now, the main nosnp regression loop is not multithreaded;
     // only the permutation generation process is.
-    if (wkspace_init_sfmtp(max_thread_ct)) {
+    if (bigstack_init_sfmtp(max_thread_ct)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
+    g_perm_is_1bit = 1;
   }
 
   fill_float_zero(coef, param_cta4);
@@ -8603,15 +8346,15 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   if (mperm_save) {
     // --mperm-save prevented during command-line parsing, so must be
     // --mperm-save-all
-    if (wkspace_alloc_d_checked(&mperm_save_stats, glm_mperm_val * (param_ctx - 1) * sizeof(double))) {
+    if (bigstack_alloc_d(glm_mperm_val * (param_ctx - 1), &mperm_save_stats)) {
       goto glm_logistic_nosnp_ret_NOMEM;
     }
     *outname_end = '\0';
     LOGPREPRINTFWW("Dumping all permutation chi-square values to %s.[testID].mperm.dump.all.\n", outname);
-    fputs(logbuf, stdout);
+    fputs(g_logbuf, stdout);
   }
   outname_end2 = memcpyb(outname_end, ".assoc.logistic", 16);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto glm_logistic_nosnp_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("Writing logistic model association results to %s ... ", outname);
@@ -8636,46 +8379,46 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     orig_stats[param_idx - 1] = zval * zval;
     pval = chiprob_p(zval * zval, 1);
     if ((!hide_covar) && ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0)))) {
-      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
-      wptr = uint32_writew8x(wptr, (uint32_t)sample_valid_ct, ' ');
-      wptr = double_g_writewx4x(wptr, report_odds? exp(dxx) : dxx, 10, ' ');
+      wptr = uint32toa_w8x((uint32_t)sample_valid_ct, ' ', wptr);
+      wptr = dtoa_g_wxp4x(report_odds? exp(dxx) : dxx, 10, ' ', wptr);
       if (display_ci) {
 	dyy = ci_zt * se;
-	wptr = double_g_writewx4x(wptr, se, 8, ' ');
+	wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
 	if (report_odds) {
-	  wptr = double_g_writewx4x(wptr, exp(dxx - dyy), 8, ' ');
-	  wptr = double_g_writewx4x(wptr, exp(dxx + dyy), 8, ' ');
+	  wptr = dtoa_g_wxp4x(exp(dxx - dyy), 8, ' ', wptr);
+	  wptr = dtoa_g_wxp4x(exp(dxx + dyy), 8, ' ', wptr);
 	} else {
-	  wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
-	  wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+	  wptr = dtoa_g_wxp4x(dxx - dyy, 8, ' ', wptr);
+	  wptr = dtoa_g_wxp4x(dxx + dyy, 8, ' ', wptr);
 	}
       }
-      wptr = double_g_writewx4x(wptr, zval, 12, ' ');
-      wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = dtoa_g_wxp4x(zval, 12, ' ', wptr);
+      wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto glm_logistic_nosnp_ret_WRITE_FAIL;
       }
     }
   }
   if (!skip_intercept) {
     dxx = (double)coef[0];
-    wptr = memcpya(tbuf, " INTERCEPT ", 11);
-    wptr = uint32_writew8x(wptr, (uint32_t)sample_valid_ct, ' ');
-    wptr = double_g_writewx4x(wptr, report_odds? exp(dxx) : dxx, 10, ' ');
+    wptr = memcpya(g_textbuf, " INTERCEPT ", 11);
+    wptr = uint32toa_w8x((uint32_t)sample_valid_ct, ' ', wptr);
+    wptr = dtoa_g_wxp4x(report_odds? exp(dxx) : dxx, 10, ' ', wptr);
     if (display_ci) {
       se = sqrt((double)regression_results[0]);
       dyy = ci_zt * se;
-      wptr = double_g_writewx4x(wptr, se, 8, ' ');
+      wptr = dtoa_g_wxp4x(se, 8, ' ', wptr);
       if (report_odds) {
-	wptr = double_g_writewx4x(wptr, exp(dxx - dyy), 8, ' ');
-	wptr = double_g_writewx4x(wptr, exp(dxx + dyy), 8, ' ');
+	wptr = dtoa_g_wxp4x(exp(dxx - dyy), 8, ' ', wptr);
+	wptr = dtoa_g_wxp4x(exp(dxx + dyy), 8, ' ', wptr);
       } else {
-	wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
-	wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+	wptr = dtoa_g_wxp4x(dxx - dyy, 8, ' ', wptr);
+	wptr = dtoa_g_wxp4x(dxx + dyy, 8, ' ', wptr);
       }
     }
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto glm_logistic_nosnp_ret_WRITE_FAIL;
     }
   }
@@ -8684,16 +8427,16 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     orig_stats[param_ct - 1] = dxx;
     pval = chiprob_p(dxx, constraint_ct);
     if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
-      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
-      wptr = uint32_writew8(wptr, (uint32_t)sample_valid_ct);
+      wptr = uint32toa_w8((uint32_t)sample_valid_ct, wptr);
       wptr = memcpya(wptr, "         NA ", 12);
       if (display_ci) {
 	wptr = memcpya(wptr, "      NA       NA       NA ", 27);
       }
-      wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
-      wptr = double_g_writewx4x(wptr, MAXV(pval, output_min_p), 12, '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
+      wptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, '\n', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto glm_logistic_nosnp_ret_WRITE_FAIL;
       }
     }
@@ -8713,22 +8456,22 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     ulii = 0;
 
     if (cur_batch_size > max_thread_ct) {
-      g_assoc_thread_ct = max_thread_ct;
+      g_perm_generation_thread_ct = max_thread_ct;
     } else {
-      g_assoc_thread_ct = g_perm_vec_ct;
+      g_perm_generation_thread_ct = g_perm_vec_ct;
     }
-    if (!g_cluster_ct) {
-      if (spawn_threads(threads, &logistic_gen_perms_thread, g_assoc_thread_ct)) {
+    if (!g_perm_cluster_ct) {
+      if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
 	goto glm_logistic_nosnp_ret_THREAD_CREATE_FAIL;
       }
-      logistic_gen_perms_thread((void*)ulii);
+      generate_cc_perms_thread((void*)ulii);
     } else {
-      if (spawn_threads(threads, &logistic_gen_cluster_perms_thread, g_assoc_thread_ct)) {
+      if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
 	goto glm_logistic_nosnp_ret_THREAD_CREATE_FAIL;
       }
-      logistic_gen_cluster_perms_thread((void*)ulii);
+      generate_cc_cluster_perms_thread((void*)ulii);
     }
-    join_threads(threads, g_assoc_thread_ct);
+    join_threads(threads, g_perm_generation_thread_ct);
     fill_float_zero(coef, cur_batch_size * param_cta4);
     perm_fail_total += glm_logistic(cur_batch_size, param_ct, sample_valid_ct, 0, 1, NULL, covars_cov_major, g_perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails);
     ulii = param_ct - 1;
@@ -8788,7 +8531,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   if (do_perms) {
     putchar('\n');
     memcpy(outname_end2, ".mperm", 7);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto glm_logistic_nosnp_ret_OPEN_FAIL;
     }
     if (fputs_checked("      TEST         EMP1           NP \n", outfile)) {
@@ -8796,37 +8539,37 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     }
     dxx = 0.5 / ((double)((int32_t)(glm_mperm_val - perm_fail_total) + 1));
     for (param_idx = 1; param_idx < param_ct; param_idx++) {
-      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_idx * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
       pval = ((double)(perm_2success_ct[param_idx - 1] + 2)) * dxx;
       if (pval <= pfilter) {
 	if (!perm_count) {
-	  wptr = double_g_writewx4(wptr, pval, 12);
+	  wptr = dtoa_g_wxp4(pval, 12, wptr);
 	} else {
-          wptr = double_g_writewx4(wptr, ((double)perm_2success_ct[param_idx - 1]) * 0.5, 12);
+          wptr = dtoa_g_wxp4(((double)perm_2success_ct[param_idx - 1]) * 0.5, 12, wptr);
 	}
         wptr = memseta(wptr, 32, 3);
-        wptr = uint32_writew10(wptr, glm_mperm_val - perm_fail_total);
+        wptr = uint32toa_w10(glm_mperm_val - perm_fail_total, wptr);
         wptr = memcpya(wptr, " \n", 2);
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto glm_logistic_nosnp_ret_WRITE_FAIL;
 	}
       }
     }
     if (constraint_ct) {
-      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), tbuf);
+      wptr = fw_strcpy(10, &(param_names[param_ct * max_param_name_len]), g_textbuf);
       *wptr++ = ' ';
       pval = ((double)(perm_2success_ct[param_ct - 1] + 2)) * 0.5 / ((double)((int32_t)(glm_mperm_val - perm_fail_total - joint_perm_fail_extra) + 1));
       if (pval <= pfilter) {
 	if (!perm_count) {
-	  wptr = double_g_writewx4(wptr, pval, 12);
+	  wptr = dtoa_g_wxp4(pval, 12, wptr);
 	} else {
-          wptr = double_g_writewx4(wptr, ((double)perm_2success_ct[param_ct - 1]) * 0.5, 12);
+          wptr = dtoa_g_wxp4(((double)perm_2success_ct[param_ct - 1]) * 0.5, 12, wptr);
 	}
         wptr = memseta(wptr, 32, 3);
-        wptr = uint32_writew10(wptr, glm_mperm_val - perm_fail_total - joint_perm_fail_extra);
+        wptr = uint32toa_w10(glm_mperm_val - perm_fail_total - joint_perm_fail_extra, wptr);
         wptr = memcpya(wptr, " \n", 2);
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto glm_logistic_nosnp_ret_WRITE_FAIL;
 	}
       }
@@ -8845,43 +8588,41 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
       for (param_idx = 1; param_idx < param_ct; param_idx++) {
 	wptr = strcpya(&(outname_end[1]), &(param_names[param_idx * max_param_name_len]));
 	memcpy(wptr, ".mperm.dump.all", 17);
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto glm_logistic_nosnp_ret_OPEN_FAIL;
 	}
 	ulii = param_ctx - 1;
-	wptr = memcpya(tbuf, "0 ", 2);
-	wptr = double_g_writex(wptr, orig_stats[param_idx - 1], '\n');
-	wptr2 = &(tbuf[MAXLINELEN]);
+	wptr = memcpya(g_textbuf, "0 ", 2);
+	wptr = dtoa_gx(orig_stats[param_idx - 1], '\n', wptr);
+	wptr2 = &(g_textbuf[MAXLINELEN]);
 	dptr = &(mperm_save_stats[param_idx - 1]);
 	for (perm_idx = 0; perm_idx < glm_mperm_val; perm_idx++) {
-	  wptr = uint32_writex(wptr, perm_idx + 1, ' ');
+	  wptr = uint32toa_x(perm_idx + 1, ' ', wptr);
 	  dxx = dptr[perm_idx * ulii];
 	  if (dxx >= 0) {
-	    wptr = double_g_writex(wptr, dxx, '\n');
+	    wptr = dtoa_gx(dxx, '\n', wptr);
 	  } else {
 	    wptr = memcpyl3a(wptr, "NA\n");
 	  }
 	  if (wptr >= wptr2) {
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto glm_logistic_nosnp_ret_WRITE_FAIL;
 	    }
-	    wptr = tbuf;
+	    wptr = g_textbuf;
 	  }
 	}
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto glm_logistic_nosnp_ret_WRITE_FAIL;
 	}
 	if (fclose_null(&outfile)) {
 	  goto glm_logistic_nosnp_ret_WRITE_FAIL;
 	}
 	LOGPREPRINTFWW("%s written.\n", outname);
-	logstr(logbuf);
+	logstr(g_logbuf);
       }
     }
   }
   while (0) {
-  glm_logistic_nosnp_ret_NOMEM2:
-    wkspace_left += topsize;
   glm_logistic_nosnp_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -8907,7 +8648,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     break;
   }
  glm_logistic_nosnp_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(outfile);
   aligned_free_cond(active_params);
   aligned_free_cond(joint_test_params);
@@ -9032,9 +8773,9 @@ uint32_t glm_logistic_dosage(uintptr_t sample_ct, uintptr_t* cur_samples, uintpt
   if (sample_valid_ct <= param_ct) {
     return 0;
   }
-  uintptr_t sample_valid_cta4 = (sample_valid_ct + 3) & (~(3 * ONELU));
-  uintptr_t sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
-  uintptr_t param_cta4 = (param_ct + 3) & (~3);
+  uintptr_t sample_valid_cta4 = round_up_pow2(sample_valid_ct, 4);
+  uintptr_t sample_valid_ctv = BITCT_TO_ALIGNED_WORDCT(sample_valid_ct);
+  uintptr_t param_cta4 = round_up_pow2(param_ct, 4);
   float* fptr = covars_cov_major;
   uintptr_t case_ct;
   uintptr_t sample_uidx;
@@ -9042,8 +8783,8 @@ uint32_t glm_logistic_dosage(uintptr_t sample_ct, uintptr_t* cur_samples, uintpt
   uintptr_t covar_idx;
   double dxx;
   double dyy;
-  vec_collapse_init(pheno_c, sample_ct, cur_samples, sample_valid_ct, perm_vec);
-  case_ct = popcount01_longs(perm_vec, sample_valid_ctv2);
+  copy_bitarr_subset(pheno_c, cur_samples, sample_ct, sample_valid_ct, perm_vec);
+  case_ct = popcount_longs(perm_vec, sample_valid_ctv);
   if ((!case_ct) || (case_ct == sample_valid_ct)) {
     return 0;
   }
diff --git a/plink_help.c b/plink_help.c
index a172b8c..d4e97e2 100644
--- a/plink_help.c
+++ b/plink_help.c
@@ -75,10 +75,10 @@ void help_print(const char* cur_params, Help_ctrl* help_ctrl_ptr, uint32_t postp
   char* line_end;
   char* payload_end;
   if (help_ctrl_ptr->param_ct) {
-    strcpy(tbuf, cur_params);
+    strcpy(g_textbuf, cur_params);
     cur_param_ct = 1;
-    cur_param_start[0] = tbuf;
-    payload_ptr = strchr(tbuf, '\t');
+    cur_param_start[0] = g_textbuf;
+    payload_ptr = strchr(g_textbuf, '\t');
     while (payload_ptr) {
       *payload_ptr++ = '\0';
       cur_param_start[cur_param_ct++] = payload_ptr;
@@ -92,9 +92,9 @@ void help_print(const char* cur_params, Help_ctrl* help_ctrl_ptr, uint32_t postp
 	    arg_uidx = next_unset_unsafe(help_ctrl_ptr->all_match_arr, arg_uidx);
 	    for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
 	      if (!strcmp(cur_param_start[cur_param_idx], help_ctrl_ptr->argv[arg_uidx])) {
-		SET_BIT(help_ctrl_ptr->perfect_match_arr, arg_uidx);
-		SET_BIT(help_ctrl_ptr->prefix_match_arr, arg_uidx);
-		SET_BIT(help_ctrl_ptr->all_match_arr, arg_uidx);
+		SET_BIT(arg_uidx, help_ctrl_ptr->perfect_match_arr);
+		SET_BIT(arg_uidx, help_ctrl_ptr->prefix_match_arr);
+		SET_BIT(arg_uidx, help_ctrl_ptr->all_match_arr);
 		help_ctrl_ptr->unmatched_ct -= 1;
 		break;
 	      }
@@ -111,8 +111,8 @@ void help_print(const char* cur_params, Help_ctrl* help_ctrl_ptr, uint32_t postp
 	    for (cur_param_idx = 0; cur_param_idx < cur_param_ct; cur_param_idx++) {
 	      if (cur_param_lens[cur_param_idx] > uii) {
 		if (!memcmp(help_ctrl_ptr->argv[arg_uidx], cur_param_start[cur_param_idx], uii)) {
-		  SET_BIT(help_ctrl_ptr->prefix_match_arr, arg_uidx);
-		  SET_BIT(help_ctrl_ptr->all_match_arr, arg_uidx);
+		  SET_BIT(arg_uidx, help_ctrl_ptr->prefix_match_arr);
+		  SET_BIT(arg_uidx, help_ctrl_ptr->all_match_arr);
 		  help_ctrl_ptr->unmatched_ct -= 1;
 		  break;
 		}
@@ -153,7 +153,7 @@ void help_print(const char* cur_params, Help_ctrl* help_ctrl_ptr, uint32_t postp
 	    if (edit1_match(cur_param_lens[cur_param_idx], cur_param_start[cur_param_idx], help_ctrl_ptr->param_lens[arg_uidx], help_ctrl_ptr->argv[arg_uidx])) {
 	      print_this = 1;
 	      if (!IS_SET(help_ctrl_ptr->all_match_arr, arg_uidx)) {
-		SET_BIT(help_ctrl_ptr->all_match_arr, arg_uidx);
+		SET_BIT(arg_uidx, help_ctrl_ptr->all_match_arr);
 		help_ctrl_ptr->unmatched_ct -= 1;
 	      }
 	      break;
@@ -180,8 +180,8 @@ void help_print(const char* cur_params, Help_ctrl* help_ctrl_ptr, uint32_t postp
 	    payload_ptr = &(payload_ptr[2]);
 	    uii -= 2;
 	  }
-	  memcpyx(tbuf, payload_ptr, uii, 0);
-	  fputs(tbuf, stdout);
+	  memcpyx(g_textbuf, payload_ptr, uii, 0);
+	  fputs(g_textbuf, stdout);
 	  payload_ptr = line_end;
 	} while (payload_ptr < payload_end);
       }
@@ -194,7 +194,7 @@ void help_print(const char* cur_params, Help_ctrl* help_ctrl_ptr, uint32_t postp
 int32_t disp_help(uint32_t param_ct, char** argv) {
   // yes, this is overkill.  But it should be a good template for other
   // command-line programs to use.
-  uint32_t param_ctl = (param_ct + (BITCT - 1)) / BITCT;
+  uint32_t param_ctl = BITCT_TO_WORDCT(param_ct);
   int32_t retval = 0;
   Help_ctrl help_ctrl;
   uint32_t arg_uidx;
@@ -268,7 +268,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  * An ellipsis (...) indicates that you may enter multiple parameters of the\n"
 "    specified type.\n"
 , stdout);
-    fputs(cmdline_format_str, stdout);
+    fputs(g_cmdline_format_str, stdout);
     fputs(
 "Most " PROG_NAME_CAPS " runs require exactly one main input fileset.  The following flags\n"
 "are available for defining its form and location:\n\n"
diff --git a/plink_homozyg.c b/plink_homozyg.c
index 09caa89..0364b81 100644
--- a/plink_homozyg.c
+++ b/plink_homozyg.c
@@ -23,7 +23,7 @@ void homozyg_init(Homozyg_info* homozyg_ptr) {
 void mask_out_homozyg_major(uintptr_t* readbuf_cur, uint32_t sample_ct) {
   // if readbuf_cur were 16-byte aligned, this could be vectorized, but it
   // isn't, and this isn't a limiting step anyway
-  uintptr_t* readbuf_cur_end = &(readbuf_cur[(sample_ct + (BITCT2 - 1)) / BITCT2]);
+  uintptr_t* readbuf_cur_end = &(readbuf_cur[QUATERCT_TO_WORDCT(sample_ct)]);
   uintptr_t cur_word;
   do {
     cur_word = *readbuf_cur;
@@ -259,7 +259,7 @@ uint32_t roh_update(Homozyg_info* hp, uintptr_t* readbuf_cur, uintptr_t* swbuf_c
       if (readbuf_cur) {
 	if (swbuf_cur) {
 	  if ((het_cts[sample_idx] <= max_sw_hets) && (missing_cts[sample_idx] <= max_sw_missings)) {
-	    SET_BIT(swbuf_cur, sample_idx);
+	    SET_BIT(sample_idx, swbuf_cur);
 	    swhit_cts[sample_idx] += 1;
 	  }
 	}
@@ -335,7 +335,7 @@ uint32_t roh_update(Homozyg_info* hp, uintptr_t* readbuf_cur, uintptr_t* swbuf_c
       if (readbuf_cur) {
 	if (swbuf_cur) {
 	  if ((het_cts[sample_idx] <= max_sw_hets) && (missing_cts[sample_idx] <= max_sw_missings)) {
-	    SET_BIT(swbuf_cur, sample_idx);
+	    SET_BIT(sample_idx, swbuf_cur);
 	    swhit_cts[sample_idx] += 1;
 	  }
 	}
@@ -393,11 +393,11 @@ uint32_t roh_update(Homozyg_info* hp, uintptr_t* readbuf_cur, uintptr_t* swbuf_c
 }
 
 int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* marker_exclude, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, uintptr_t sample_ct, uintptr_t* sample_exclude, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, char* missing_pheno_str, uint32_t omp_is_numeric, uint32_t missing_pheno_len, uin [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   FILE* outfile_indiv = NULL;
-  char* wptr_iid = &(tbuf[plink_maxfid + 1]);
-  char* wptr_phe = &(tbuf[plink_maxfid + plink_maxiid + 2]);
+  char* wptr_iid = &(g_textbuf[plink_maxfid + 1]);
+  char* wptr_phe = &(g_textbuf[plink_maxfid + plink_maxiid + 2]);
   int32_t* roh_ct_aff_adj = NULL;
   uintptr_t next_roh_idx = 0;
   uint32_t max_pool_size = 0;
@@ -428,26 +428,26 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
   uint32_t slen;
   uint32_t uii;
   memcpy(outname_end, ".hom", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_main_roh_reports_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, "%%%us %%%us      PHE  CHR %%%us %%%us         POS1         POS2         KB     NSNP  DENSITY     PHOM     PHET\n", plink_maxfid, plink_maxiid, plink_maxsnp, plink_maxsnp);
-  fprintf(outfile, tbuf, "FID", "IID", "SNP1", "SNP2");
+  sprintf(g_textbuf, "%%%us %%%us      PHE  CHR %%%us %%%us         POS1         POS2         KB     NSNP  DENSITY     PHOM     PHET\n", plink_maxfid, plink_maxiid, plink_maxsnp, plink_maxsnp);
+  fprintf(outfile, g_textbuf, "FID", "IID", "SNP1", "SNP2");
   memcpy(&(outname_end[4]), ".indiv", 7);
-  if (fopen_checked(&outfile_indiv, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_indiv)) {
     goto write_main_roh_reports_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, "%%%us %%%us  PHE     NSEG       KB    KBAVG\n", plink_maxfid, plink_maxiid);
-  fprintf(outfile_indiv, tbuf, "FID", "IID");
-  tbuf[plink_maxfid] = ' ';
-  tbuf[plink_maxfid + plink_maxiid + 1] = ' ';
+  sprintf(g_textbuf, "%%%us %%%us  PHE     NSEG       KB    KBAVG\n", plink_maxfid, plink_maxiid);
+  fprintf(outfile_indiv, g_textbuf, "FID", "IID");
+  g_textbuf[plink_maxfid] = ' ';
+  g_textbuf[plink_maxfid + plink_maxiid + 1] = ' ';
   sample_uidx = 0;
   for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
     next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
     cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
     cptr2 = (char*)memchr(cptr, '\t', max_sample_id_len);
     slen = (uintptr_t)(cptr2 - cptr);
-    memcpy(memseta(tbuf, 32, plink_maxfid - slen), cptr, slen);
+    memcpy(memseta(g_textbuf, 32, plink_maxfid - slen), cptr, slen);
     slen = strlen(++cptr2);
     memcpy(memseta(wptr_iid, 32, plink_maxiid - slen), cptr2, slen);
     if (!IS_SET(pheno_nm, sample_uidx)) {
@@ -456,7 +456,7 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
       wptr_chr = memseta(wptr_phe, 32, 7);
       *wptr_chr++ = '1' + IS_SET(pheno_c, sample_uidx);
     } else {
-      wptr_chr = width_force(8, wptr_phe, double_f_writew3(wptr_phe, pheno_d[sample_uidx]));
+      wptr_chr = width_force(8, wptr_phe, dtoa_f_p3(pheno_d[sample_uidx], wptr_phe));
     }
     *wptr_chr++ = ' ';
     // traverse roh_list backwards, reversing the direction of [5], then
@@ -483,7 +483,7 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
       cur_roh = &(roh_list[cur_roh_idx * ROH_ENTRY_INTS]);
       marker_uidx1 = cur_roh[0];
       marker_uidx2 = cur_roh[1];
-      wptr = width_force(4, wptr_chr, chrom_name_write(wptr_chr, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx1)));
+      wptr = width_force(4, wptr_chr, chrom_name_write(chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx1), wptr_chr));
       *wptr++ = ' ';
       cptr = &(marker_ids[marker_uidx1 * max_marker_id_len]);
       slen = strlen(cptr);
@@ -493,26 +493,26 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
       slen = strlen(cptr);
       wptr = memcpya(memseta(wptr, 32, plink_maxsnp - slen), cptr, slen);
       wptr = memseta(wptr, 32, 3);
-      wptr = uint32_writew10(wptr, marker_pos[marker_uidx1]);
+      wptr = uint32toa_w10(marker_pos[marker_uidx1], wptr);
       wptr = memseta(wptr, 32, 3);
-      wptr = uint32_writew10x(wptr, marker_pos[marker_uidx2], ' ');
+      wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
       dxx = ((double)(marker_pos[marker_uidx2] + is_new_lengths - marker_pos[marker_uidx1])) / (1000.0 - EPSILON);
       kb_tot += dxx;
-      wptr = width_force(10, wptr, double_f_writew3(wptr, dxx));
+      wptr = width_force(10, wptr, dtoa_f_p3(dxx, wptr));
       *wptr++ = ' ';
       if (cur_roh[2] > max_roh_len) {
 	max_roh_len = cur_roh[2];
       }
-      wptr = uint32_writew8x(wptr, cur_roh[2], ' ');
+      wptr = uint32toa_w8x(cur_roh[2], ' ', wptr);
       dyy = (1.0 + SMALLISH_EPSILON) / ((double)((int32_t)cur_roh[2]));
-      wptr = width_force(8, wptr, double_f_writew3(wptr, dxx * dyy));
+      wptr = width_force(8, wptr, dtoa_f_p3(dxx * dyy, wptr));
       // next two decimals guaranteed to be length 5
       wptr = memseta(wptr, 32, 4);
-      wptr = double_f_writew3(wptr, ((double)((int32_t)cur_roh[3])) * dyy);
+      wptr = dtoa_f_p3(((double)((int32_t)cur_roh[3])) * dyy, wptr);
       wptr = memseta(wptr, 32, 4);
-      wptr = double_f_writew3(wptr, ((double)((int32_t)cur_roh[4])) * dyy);
+      wptr = dtoa_f_p3(((double)((int32_t)cur_roh[4])) * dyy, wptr);
       *wptr++ = '\n';
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto write_main_roh_reports_ret_WRITE_FAIL;
       }
 #ifdef __LP64__
@@ -528,21 +528,21 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
       wptr = memseta(wptr_phe, 32, 3);
       *wptr++ = '1' + IS_SET(pheno_c, sample_uidx);
     } else {
-      wptr = width_force(4, wptr_phe, double_g_write(wptr_phe, pheno_d[sample_uidx]));
+      wptr = width_force(4, wptr_phe, dtoa_g(pheno_d[sample_uidx], wptr_phe));
     }
     *wptr++ = ' ';
-    wptr = uint32_writew8x(wptr, cur_roh_ct, ' ');
-    wptr = width_force(8, wptr, double_g_write(wptr, kb_tot));
+    wptr = uint32toa_w8x(cur_roh_ct, ' ', wptr);
+    wptr = width_force(8, wptr, dtoa_g(kb_tot, wptr));
     *wptr++ = ' ';
     if (cur_roh_ct) {
       kb_tot /= (double)((int32_t)cur_roh_ct);
     }
-    wptr = width_force(8, wptr, double_g_write(wptr, kb_tot));
+    wptr = width_force(8, wptr, dtoa_g(kb_tot, wptr));
     if (cur_roh_ct) {
       *wptr++ = ' ';
     }
     *wptr++ = '\n';
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile_indiv)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_indiv)) {
       goto write_main_roh_reports_ret_WRITE_FAIL;
     }
   }
@@ -553,27 +553,25 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
     goto write_main_roh_reports_ret_WRITE_FAIL;
   }
   memcpy(&(outname_end[5]), "summary", 8);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_main_roh_reports_ret_WRITE_FAIL;
   }
-  sprintf(tbuf, " CHR %%%us           BP      AFF    UNAFF\n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, " CHR %%%us           BP      AFF    UNAFF\n", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
     chrom_roh_start = roh_list_chrom_starts[chrom_fo_idx];
     chrom_roh_ct = roh_list_chrom_starts[chrom_fo_idx + 1] - chrom_roh_start;
     uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
     chrom_start = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx];
     chrom_len = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - chrom_start;
-    wkspace_reset(wkspace_mark);
-    if (wkspace_alloc_i_checked(&roh_ct_unaff_adj, (chrom_len + 1) * sizeof(int32_t))) {
+    bigstack_reset(bigstack_mark);
+    if (bigstack_calloc_i(chrom_len + 1, &roh_ct_unaff_adj)) {
       goto write_main_roh_reports_ret_NOMEM;
     }
-    fill_int_zero(roh_ct_unaff_adj, chrom_len);
     if (pheno_c) {
-      if (wkspace_alloc_i_checked(&roh_ct_aff_adj, (chrom_len + 1) * sizeof(int32_t))) {
+      if (bigstack_calloc_i(chrom_len + 1, &roh_ct_aff_adj)) {
         goto write_main_roh_reports_ret_NOMEM;
       }
-      fill_int_zero(roh_ct_aff_adj, chrom_len);
     }
     cur_roh = &(roh_list[chrom_roh_start * ROH_ENTRY_INTS]);
     for (cur_roh_idx = 0; cur_roh_idx < chrom_roh_ct; cur_roh_idx++) {
@@ -587,7 +585,7 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
       }
       cur_roh = &(cur_roh[ROH_ENTRY_INTS]);
     }
-    wptr_chr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+    wptr_chr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, uii, g_textbuf));
     *wptr_chr++ = ' ';
     memset(&(wptr_chr[plink_maxsnp]), 32, 3);
     wptr_bp1 = &(wptr_chr[plink_maxsnp + 3]);
@@ -609,17 +607,17 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
       cptr = &(marker_ids[marker_uidx1 * max_marker_id_len]);
       slen = strlen(cptr);
       memcpy(memseta(wptr_chr, 32, plink_maxsnp - slen), cptr, slen);
-      uint32_writew10(wptr_bp1, marker_pos[marker_uidx1]);
+      uint32toa_w10(marker_pos[marker_uidx1], wptr_bp1);
       if (!pheno_c) {
         wptr = &(wptr_bp1[20]);
       } else {
-        wptr = uint32_writew8x(&(wptr_bp1[11]), uii, ' ');
+        wptr = uint32toa_w8x(uii, ' ', &(wptr_bp1[11]));
       }
       if (cur_roh_ct + uii > max_pool_size) {
         max_pool_size = cur_roh_ct + uii;
       }
-      wptr = uint32_writew8x(wptr, cur_roh_ct, '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = uint32toa_w8x(cur_roh_ct, '\n', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
         goto write_main_roh_reports_ret_WRITE_FAIL;
       }
     }
@@ -640,7 +638,7 @@ int32_t write_main_roh_reports(char* outname, char* outname_end, uintptr_t* mark
     retval = RET_WRITE_FAIL;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_indiv);
   return retval;
@@ -694,7 +692,7 @@ void cur_roh_heap_removemax(uintptr_t* roh_slot_occupied, uint64_t* cur_roh_heap
   uint32_t initial_heap_max = *cur_roh_heap_max_ptr;
   uint32_t new_heap_max;
   do {
-    clear_bit(roh_slot_occupied, (uint32_t)(cur_roh_heap[1]));
+    clear_bit((uint32_t)(cur_roh_heap[1]), roh_slot_occupied);
     if ((--cur_roh_heap_top) == 1) {
       new_heap_max = 0;
       break;
@@ -871,7 +869,7 @@ int32_t populate_roh_slots_from_disk(FILE* bedfile, uint64_t bed_offset, uintptr
         return RET_READ_FAIL;
       }
     }
-    if (load_raw(bedfile, rawbuf, unfiltered_sample_ct4)) {
+    if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
       return RET_READ_FAIL;
     }
     marker_cidx = marker_uidx_to_cidx[marker_uidx - chrom_start];
@@ -910,8 +908,8 @@ static inline uint32_t is_allelic_match(double mismatch_max, uintptr_t* roh_slot
   __m128i mismatch_sum1;
   __m128i joint_sum2;
   __m128i mismatch_sum2;
-  __uni16 accj;
-  __uni16 accm;
+  __univec accj;
+  __univec accm;
   __m128i* vptrl;
   __m128i* vptrs;
   __m128i* vptrl_end;
@@ -1218,7 +1216,7 @@ void compute_allelic_match_matrix(double mismatch_max, uintptr_t roh_slot_wsize,
 	} else {
 	  tri_coord = tri_coord_no_diag(slot_idxl, slot_idxs);
 	}
-        SET_BIT(allelic_match_matrix, tri_coord);
+        SET_BIT(tri_coord, allelic_match_matrix);
         allelic_match_cts[map_idxs] += 1;
 	incr_idxl++;
       }
@@ -1308,23 +1306,22 @@ char* roh_pool_write_middle(char* wptr, char* marker_ids, uintptr_t max_marker_i
   *wptr++ = ' ';
   wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr);
   wptr = memseta(wptr, 32, 5);
-  wptr = uint32_writew10(wptr, marker_pos[marker_uidx1]);
+  wptr = uint32toa_w10(marker_pos[marker_uidx1], wptr);
   wptr = memseta(wptr, 32, 5);
-  wptr = uint32_writew10x(wptr, marker_pos[marker_uidx2], ' ');
-  wptr = double_g_writewx8(wptr, ((double)(marker_pos[marker_uidx2] + is_new_lengths - marker_pos[marker_uidx1])) / 1000.0, 8);
-  *wptr++ = ' ';
+  wptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr);
+  wptr = dtoa_g_wxp8x(((double)(marker_pos[marker_uidx2] + is_new_lengths - marker_pos[marker_uidx1])) / 1000.0, 8, ' ', wptr);
   return wptr;
 }
 
 int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* outname, char* outname_end, uintptr_t* rawbuf, uintptr_t* marker_exclude, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, uintptr_t sample_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, char* sample_ids, uint32_t plink_maxfid, uint32_t plin [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uint64_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   double mismatch_max = 1 - (hp->overlap_min * (1 - EPSILON)); // fuzz
   uint32_t is_consensus_match = hp->modifier & HOMOZYG_CONSENSUS_MATCH;
   uint32_t is_verbose = hp->modifier & HOMOZYG_GROUP_VERBOSE;
-  uint32_t max_pool_sizel = (max_pool_size + (BITCT - 1)) / BITCT;
+  uint32_t max_pool_sizel = BITCT_TO_WORDCT(max_pool_size);
   uint32_t pool_size_min = hp->pool_size_min;
   uint32_t pool_size_ct = max_pool_size + 1 - pool_size_min;
   uint32_t marker_uidx2 = 0;
@@ -1412,7 +1409,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
   uint32_t uii;
   uint32_t ujj;
   uint32_t ukk;
-  if (wkspace_alloc_ui_checked(&chrom_fo_idx_to_pidx, (chrom_info_ptr->chrom_ct + 1) * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(chrom_info_ptr->chrom_ct + 1, &chrom_fo_idx_to_pidx)) {
     goto roh_pool_ret_NOMEM;
   }
   uii = 0; // max chrom len
@@ -1436,31 +1433,31 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
   // max_roh_len in {2..17} -> 2 words, etc.
   roh_slot_wsize = (max_roh_len + 30) / 16;
 #endif
-  if (wkspace_alloc_ul_checked(&pool_size_first_plidx, pool_size_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&marker_uidx_to_cidx, uii * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&roh_slots, max_pool_size * roh_slot_wsize * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&roh_slot_occupied, max_pool_sizel * sizeof(intptr_t)) ||
-      wkspace_alloc_ull_checked(&roh_slot_map, (max_pool_size + 1) * sizeof(int64_t)) ||
-      wkspace_alloc_ui_checked(&roh_slot_cidx_start, max_pool_size * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&roh_slot_cidx_end, max_pool_size * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&roh_slot_end_uidx, max_pool_size * sizeof(int32_t))) {
+  if (bigstack_alloc_ul(pool_size_ct, &pool_size_first_plidx) ||
+      bigstack_alloc_ui(uii, &marker_uidx_to_cidx) ||
+      bigstack_alloc_ul(max_pool_size * roh_slot_wsize, &roh_slots) ||
+      bigstack_calloc_ul(max_pool_sizel, &roh_slot_occupied) ||
+      bigstack_alloc_ull(max_pool_size + 1, &roh_slot_map) ||
+      bigstack_alloc_ui(max_pool_size, &roh_slot_cidx_start) ||
+      bigstack_alloc_ui(max_pool_size, &roh_slot_cidx_end) ||
+      bigstack_alloc_ui(max_pool_size, &roh_slot_end_uidx)) {
     goto roh_pool_ret_NOMEM;
   }
   if (!is_consensus_match) {
-    if (wkspace_alloc_ul_checked(&roh_slot_uncached, max_pool_sizel * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(max_pool_sizel, &roh_slot_uncached)) {
       goto roh_pool_ret_NOMEM;
     }
   }
   if (is_verbose) {
-    if (wkspace_alloc_ull_checked(&verbose_group_sort_buf, max_pool_size * sizeof(int64_t)) ||
-        wkspace_alloc_ui_checked(&verbose_uidx_bounds, max_pool_size * 2 * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&verbose_sample_uidx, max_pool_size * sizeof(int32_t)) ||
-        wkspace_alloc_c_checked(&writebuf, 2 * max_marker_allele_len + 5)) {
+    if (bigstack_alloc_ull(max_pool_size, &verbose_group_sort_buf) ||
+        bigstack_alloc_ui(max_pool_size * 2, &verbose_uidx_bounds) ||
+        bigstack_alloc_ui(max_pool_size, &verbose_sample_uidx) ||
+        bigstack_alloc_c(2 * max_marker_allele_len + 5, &writebuf)) {
       goto roh_pool_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_ui_checked(&allelic_match_cts, max_pool_size * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&allelic_match_matrix, (((uintptr_t)max_pool_size) * (max_pool_size - 1)) * (sizeof(intptr_t) / 2))) {
+  if (bigstack_alloc_ui(max_pool_size, &allelic_match_cts) ||
+      bigstack_alloc_ul((((uintptr_t)max_pool_size) * (max_pool_size - 1)) / 2, &allelic_match_matrix)) {
     goto roh_pool_ret_NOMEM;
   }
   // roh_slot_map / roh_slot_cidx_start... not used at the same time as
@@ -1469,10 +1466,9 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
   sample_uidx_sort_buf = roh_slot_cidx_start;
 
   fill_ulong_one(pool_size_first_plidx, pool_size_ct);
-  fill_ulong_zero(roh_slot_occupied, max_pool_sizel);
 
-  pool_list = (uintptr_t*)wkspace_base;
-  max_pool_list_size = wkspace_left / sizeof(intptr_t);
+  pool_list = (uintptr_t*)g_bigstack_base;
+  max_pool_list_size = bigstack_left() / sizeof(intptr_t);
   // Since our ROH are sorted by *last* SNP, it's easiest to scan for pools
   // from back to front if we wish to painlessly produce sorted lists.
   chrom_fo_idx = chrom_info_ptr->chrom_ct;
@@ -1496,7 +1492,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	// check if this ROH doesn't intersect anything
 	if ((cur_roh_heap_top > 1) || ((roh_idx != chrom_roh_start) && (cur_roh[1 - ROH_ENTRY_INTS] >= uii))) {
 	  slot_idx1 = next_unset_unsafe(roh_slot_occupied, 0);
-	  SET_BIT(roh_slot_occupied, slot_idx1);
+	  SET_BIT(slot_idx1, roh_slot_occupied);
 	  // use roh_slots[0..(max_pool_size - 1)] to store references to
 	  // active ROH here
 	  roh_slots[slot_idx1] = roh_idx;
@@ -1585,10 +1581,10 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
   } while (chrom_fo_idx);
   chrom_fo_idx_to_pidx[0] = pool_ct;
 
-  wptr = uint32_write(logbuf, pool_ct);
+  wptr = uint32toa(pool_ct, g_logbuf);
   if (pool_size_min > 2) {
     wptr = memcpya(wptr, " size-", 6);
-    wptr = uint32_writex(wptr, pool_size_min, '+');
+    wptr = uint32toa_x(pool_size_min, '+', wptr);
   }
   wptr = memcpya(wptr, " pool", 5);
   if (pool_ct != 1) {
@@ -1599,9 +1595,9 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 
   // Now we know how much memory the pools require, so we can assign the rest
   // to a lookahead buffer.
-  wkspace_alloc(pool_list_size * sizeof(intptr_t)); // pool_list
-  max_lookahead = wkspace_left / (unfiltered_sample_ctl2 * sizeof(intptr_t));
-  lookahead_buf = (uintptr_t*)wkspace_base;
+  bigstack_alloc(pool_list_size * sizeof(intptr_t)); // pool_list
+  max_lookahead = bigstack_left() / (unfiltered_sample_ctl2 * sizeof(intptr_t));
+  lookahead_buf = (uintptr_t*)g_bigstack_base;
 
   // Now assign ID numbers.
   // We do not precisely imitate PLINK 1.07 here.  This is because
@@ -1684,11 +1680,11 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
           if (slot_idx1 == max_pool_size) {
 	    break;
 	  }
-          clear_bits(allelic_match_matrix, (((uintptr_t)slot_idx1) * (slot_idx1 - 1)) / 2, slot_idx1);
+          clear_bits((((uintptr_t)slot_idx1) * (slot_idx1 - 1)) / 2, slot_idx1, allelic_match_matrix);
           slot_idx1++;
 	}
       } else {
-        fill_ulong_zero(roh_slot_uncached, (pool_size + (BITCT - 1)) / BITCT);
+        fill_ulong_zero(roh_slot_uncached, BITCT_TO_WORDCT(pool_size));
       }
       slot_idx1 = 0;
       while (1) {
@@ -1697,16 +1693,16 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	  break;
 	}
 	if (roh_slot_end_uidx[slot_idx1] <= con_uidx2) {
-          CLEAR_BIT(roh_slot_occupied, slot_idx1);
+          CLEAR_BIT(slot_idx1, roh_slot_occupied);
           if (!is_consensus_match) {
-            clear_bits(allelic_match_matrix, (((uintptr_t)slot_idx1) * (slot_idx1 - 1)) / 2, slot_idx1);
+            clear_bits((((uintptr_t)slot_idx1) * (slot_idx1 - 1)) / 2, slot_idx1, allelic_match_matrix);
 	    slot_idx2 = slot_idx1;
 	    while (1) {
               slot_idx2 = next_set(roh_slot_occupied, slot_idx2 + 1, max_pool_size);
 	      if (slot_idx2 == max_pool_size) {
 		break;
 	      }
-	      clear_bit_ul(allelic_match_matrix, tri_coord_no_diag(slot_idx1, slot_idx2));
+	      clear_bit_ul(tri_coord_no_diag(slot_idx1, slot_idx2), allelic_match_matrix);
 	    }
 	  }
 	}
@@ -1738,9 +1734,9 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	    break;
 	  }
 	  ujj = next_unset_unsafe(roh_slot_occupied, 0);
-	  SET_BIT(roh_slot_occupied, ujj);
+	  SET_BIT(ujj, roh_slot_occupied);
 	  if (roh_slot_uncached) {
-	    SET_BIT(roh_slot_uncached, roh_idx - 1);
+	    SET_BIT(roh_idx - 1, roh_slot_uncached);
 	  }
           roh_slot_map[slot_idx2++] = (((uint64_t)sample_uidx2) << 32) | ((uint64_t)ujj);
 	  initialize_roh_slot(cur_roh, chrom_start, marker_uidx_to_cidx, &(roh_slots[ujj * roh_slot_wsize]), &(roh_slot_cidx_start[ujj]), &(roh_slot_cidx_end[ujj]), &(roh_slot_end_uidx[ujj]));
@@ -1751,9 +1747,9 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	cur_roh = &(roh_list[cur_pool[roh_idx] * ROH_ENTRY_INTS]);
         sample_uidx2 = cur_roh[5];
         ujj = next_unset_unsafe(roh_slot_occupied, 0);
-        SET_BIT(roh_slot_occupied, ujj);
+        SET_BIT(ujj, roh_slot_occupied);
 	if (roh_slot_uncached) {
-	  SET_BIT(roh_slot_uncached, roh_idx);
+	  SET_BIT(roh_idx, roh_slot_uncached);
 	}
         roh_slot_map[roh_idx++] = (((uint64_t)sample_uidx2) << 32) | ((uint64_t)ujj);
 	initialize_roh_slot(cur_roh, chrom_start, marker_uidx_to_cidx, &(roh_slots[ujj * roh_slot_wsize]), &(roh_slot_cidx_start[ujj]), &(roh_slot_cidx_end[ujj]), &(roh_slot_end_uidx[ujj]));
@@ -1827,7 +1823,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 
 	  // last few bytes of each lookahead_buf row may be filled with
 	  // garbage, but it doesn't matter
-	  if (load_raw(bedfile, &(lookahead_buf[ulii * unfiltered_sample_ctl2]), unfiltered_sample_ct4)) {
+	  if (load_raw(unfiltered_sample_ct4, bedfile, &(lookahead_buf[ulii * unfiltered_sample_ctl2]))) {
 	    goto roh_pool_ret_READ_FAIL;
 	  }
 	  ulii++;
@@ -1870,12 +1866,12 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 
       if (is_verbose) {
 #ifdef __LP64__
-	wptr = uint32_write(&(outname_end[14]), (uint32_t)(cur_pool[-1] >> 32));
+	wptr = uint32toa((uint32_t)(cur_pool[-1] >> 32), &(outname_end[14]));
 #else
-	wptr = uint32_write(&(outname_end[14]), (uint32_t)cur_pool[-1]);
+	wptr = uint32toa((uint32_t)cur_pool[-1], &(outname_end[14]));
 #endif
 	memcpy(wptr, ".verbose", 9);
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto roh_pool_ret_OPEN_FAIL;
 	}
 
@@ -1891,8 +1887,8 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 #else
 	qsort((int64_t*)verbose_group_sort_buf, pool_size, sizeof(int64_t), llcmp);
 #endif
-        sprintf(tbuf, "       %%%us %%%us  GRP \n", plink_maxfid, plink_maxiid);
-	fprintf(outfile, tbuf, "FID", "IID");
+        sprintf(g_textbuf, "       %%%us %%%us  GRP \n", plink_maxfid, plink_maxiid);
+	fprintf(outfile, g_textbuf, "FID", "IID");
 
 	for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
 	  slot_idx2 = (uint32_t)verbose_group_sort_buf[slot_idx1];
@@ -1902,7 +1898,8 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	  verbose_uidx_bounds[slot_idx1 * 2 + 1] = cur_roh[1];
 	  verbose_sample_uidx[slot_idx1] = cur_roh[5];
 	  sample_uidx1 = cur_roh[5];
-          wptr = width_force(4, tbuf, uint32_write(tbuf, slot_idx1 + 1));
+	  wptr = uint32toa(slot_idx1 + 1, g_textbuf);
+          wptr = width_force(4, g_textbuf, wptr);
 	  wptr = memcpyl3a(wptr, ")  ");
 	  cptr = &(sample_ids[sample_uidx1 * max_sample_id_len]);
 	  cptr2 = (char*)memchr(cptr, '\t', max_sample_id_len);
@@ -1910,20 +1907,21 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
           *wptr++ = ' ';
           wptr = fw_strcpy(plink_maxiid, &(cptr2[1]), wptr);
           wptr = memseta(wptr, 32, 3);
-          wptr = uint32_write(wptr, (uint32_t)(verbose_group_sort_buf[slot_idx1] >> 32));
+          wptr = uint32toa((uint32_t)(verbose_group_sort_buf[slot_idx1] >> 32), wptr);
 	  *wptr++ = '\n';
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto roh_pool_ret_WRITE_FAIL;
 	  }
 	}
 	putc('\n', outfile);
-	wptr = memseta(tbuf, 32, plink_maxsnp - 3);
+	wptr = memseta(g_textbuf, 32, plink_maxsnp - 3);
 	wptr = memcpya(wptr, "SNP ", 4);
-        fwrite(tbuf, 1, wptr - tbuf, outfile);
+        fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
-          wptr = width_force(4, tbuf, uint32_write(tbuf, slot_idx1 + 1));
+	  wptr = uint32toa(slot_idx1 + 1, g_textbuf);
+          wptr = width_force(4, g_textbuf, wptr);
 	  wptr = memseta(wptr, 32, 2);
-	  fwrite(tbuf, 1, wptr - tbuf, outfile);
+	  fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	}
 	if (fputs_checked("\n\n", outfile)) {
 	  goto roh_pool_ret_WRITE_FAIL;
@@ -1939,9 +1937,9 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	    ulii -= max_lookahead;
 	  }
 	  lookahead_row = &(lookahead_buf[ulii * unfiltered_sample_ctl2]);
-	  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), tbuf);
+	  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), g_textbuf);
 	  *wptr++ = ' ';
-          fwrite(tbuf, 1, wptr - tbuf, outfile);
+          fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	  allele_strs[2] = marker_allele_ptrs[marker_uidx1 * 2];
 	  allele_strs[3] = marker_allele_ptrs[marker_uidx1 * 2 + 1];
 	  allele_strs[0] = allele_strs[2 + IS_SET(marker_reverse, marker_uidx1)];
@@ -1998,14 +1996,14 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	  while ((group_slot_end < pool_size) && (((uint32_t)(verbose_group_sort_buf[group_slot_end] >> 32)) == ujj)) {
 	    group_slot_end++;
 	  }
-	  wptr = memcpya(tbuf, "Group ", 6);
-	  wptr = uint32_write(wptr, ujj);
+	  wptr = memcpya(g_textbuf, "Group ", 6);
+	  wptr = uint32toa(ujj, wptr);
 	  wptr = memcpya(wptr, "\n\n", 2);
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto roh_pool_ret_WRITE_FAIL;
 	  }
 	  for (slot_idx2 = slot_idx1; slot_idx2 < group_slot_end; slot_idx2++) {
-	    wptr = width_force(4, tbuf, uint32_write(tbuf, slot_idx2 + 1));
+            wptr = uint32toa_w4(slot_idx2 + 1, g_textbuf);
 	    wptr = memcpya(wptr, ") ", 2);
 	    sample_uidx1 = verbose_sample_uidx[slot_idx2];
 	    cptr = &(sample_ids[sample_uidx1 * max_sample_id_len]);
@@ -2019,26 +2017,26 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 		wptr = memseta(wptr, 32, 7);
 		*wptr++ = '1' + IS_SET(pheno_c, sample_uidx1);
 	      } else {
-		wptr = double_g_writewx2(wptr, pheno_d[sample_uidx1], 8);
+		wptr = dtoa_g_wxp2(pheno_d[sample_uidx1], 8, wptr);
 	      }
 	    } else {
               wptr = fw_strcpyn(8, missing_pheno_len, missing_pheno_str, wptr);
 	    }
 	    *wptr++ = '\n';
-	    fwrite(tbuf, 1, wptr - tbuf, outfile);
+	    fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	  }
 	  if (fputs_checked("\n\n", outfile)) {
 	    goto roh_pool_ret_WRITE_FAIL;
 	  }
-	  wptr = memseta(tbuf, 32, plink_maxsnp - 3);
+	  wptr = memseta(g_textbuf, 32, plink_maxsnp - 3);
 	  wptr = memcpya(wptr, "SNP         ", 12);
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto roh_pool_ret_WRITE_FAIL;
 	  }
 	  for (slot_idx2 = slot_idx1; slot_idx2 < group_slot_end; slot_idx2++) {
-	    wptr = width_force(4, tbuf, uint32_write(tbuf, slot_idx2 + 1));
+	    wptr = uint32toa_w4(slot_idx2 + 1, g_textbuf);
 	    wptr = memseta(wptr, 32, 2);
-	    fwrite(tbuf, 1, wptr - tbuf, outfile);
+	    fwrite(g_textbuf, 1, wptr - g_textbuf, outfile);
 	  }
 	  if (fputs_checked("\n\n", outfile)) {
 	    goto roh_pool_ret_WRITE_FAIL;
@@ -2054,9 +2052,9 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	      ulii -= max_lookahead;
 	    }
 	    lookahead_row = &(lookahead_buf[ulii * unfiltered_sample_ctl2]);
-	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), tbuf);
+	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), g_textbuf);
 	    *wptr++ = ' ';
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto roh_pool_ret_WRITE_FAIL;
 	    }
 	    ujj = 0; // A1 hom ct
@@ -2147,9 +2145,9 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	    ulii -= max_lookahead;
 	  }
           lookahead_row = &(lookahead_buf[ulii * unfiltered_sample_ctl2]);
-	  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), tbuf);
+	  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), g_textbuf);
 	  *wptr++ = ' ';
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	    goto roh_pool_ret_WRITE_FAIL;
 	  }
 
@@ -2199,7 +2197,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	  goto roh_pool_ret_WRITE_FAIL;
 	}
 	LOGPREPRINTFWW("%s written.\n", outname);
-        logstr(logbuf);
+        logstr(g_logbuf);
       }
     }
     if (chrom_info_ptr->chrom_file_order[chrom_fo_idx] > onechar_max) {
@@ -2209,11 +2207,11 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
   fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b               \b\b\b\b\b\b\b\b\b\b\b\b\b\b\bdone.\n", stdout);
 
   outname_end[12] = '\0';
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto roh_pool_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, " POOL %%%us %%%us      PHE  CHR %%%us %%%us            BP1            BP2       KB     NSNP NSIM    GRP\n", plink_maxfid, plink_maxiid, plink_maxsnp, plink_maxsnp);
-  fprintf(outfile, tbuf, "FID", "IID", "SNP1", "SNP2");
+  sprintf(g_textbuf, " POOL %%%us %%%us      PHE  CHR %%%us %%%us            BP1            BP2       KB     NSNP NSIM    GRP\n", plink_maxfid, plink_maxiid, plink_maxsnp, plink_maxsnp);
+  fprintf(outfile, g_textbuf, "FID", "IID", "SNP1", "SNP2");
   uii = 1; // pool ID
   fputs("Writing...", stdout);
   fflush(stdout);
@@ -2228,8 +2226,8 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
       cur_pool = &(cur_pool[3]);
 #endif
       case_ct = 0;
-      tbuf[0] = 'S';
-      wptr_start = width_force(5, tbuf, uint32_write(&(tbuf[1]), uii));
+      g_textbuf[0] = 'S';
+      wptr_start = width_force(5, g_textbuf, uint32toa(uii, &(g_textbuf[1])));
       *wptr_start++ = ' ';
       cur_roh = &(roh_list[cur_pool[0] * ROH_ENTRY_INTS]);
       con_uidx1 = cur_roh[0];
@@ -2273,13 +2271,13 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
               wptr = memcpya(wptr, "       1", 8);
 	    }
 	  } else {
-	    wptr = double_g_writewx4(wptr, pheno_d[sample_uidx1], 8);
+	    wptr = dtoa_g_wxp4(pheno_d[sample_uidx1], 8, wptr);
 	  }
 	} else {
           wptr = fw_strcpyn(8, missing_pheno_len, missing_pheno_str, wptr);
 	}
 	*wptr++ = ' ';
-	wptr = width_force(4, wptr, chrom_name_write(wptr, chrom_info_ptr, chrom_start));
+	wptr = width_force(4, wptr, chrom_name_write(chrom_info_ptr, chrom_start, wptr));
 	marker_uidx1 = cur_roh[0];
 	marker_uidx2 = cur_roh[1];
 	if (marker_uidx1 > con_uidx1) {
@@ -2293,12 +2291,11 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	  union_uidx2 = marker_uidx2;
 	}
         wptr = roh_pool_write_middle(wptr, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, is_new_lengths, marker_uidx1, marker_uidx2);
-	wptr = uint32_writew8x(wptr, cur_roh[2], ' ');
+	wptr = uint32toa_w8x(cur_roh[2], ' ', wptr);
 #ifdef __LP64__
 	ulii = cur_pool[pool_size + slot_idx2];
-        wptr = width_force(4, wptr, uint32_write(wptr, (uint32_t)(ulii >> 32)));
-        *wptr++ = ' ';
-        wptr = width_force(5, wptr, uint32_write(wptr, ((uint32_t)ulii) & 0x7fffffff));
+        wptr = uint32toa_w4x((uint32_t)(ulii >> 32), ' ', wptr);
+        wptr = width_force(5, wptr, uint32toa(ulii & 0x7fffffff, wptr));
         if (ulii & 0x80000000LLU) {
           *wptr++ = '*';
 	} else {
@@ -2306,9 +2303,8 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	}
 #else
 	ulii = cur_pool[pool_size + 2 * slot_idx2];
-        wptr = width_force(4, wptr, uint32_write(wptr, cur_pool[pool_size + 2 * slot_idx2 + 1]));
-	*wptr++ = ' ';
-        wptr = width_force(5, wptr, uint32_write(wptr, ulii & 0x7fffffff));
+        wptr = uint32toa_w4x(cur_pool[pool_size + 2 * slot_idx2 + 1], ' ', wptr);
+        wptr = width_force(5, wptr, uint32toa(ulii & 0x7fffffff, wptr));
 	if (ulii & 0x80000000U) {
 	  *wptr++ = '*';
 	} else {
@@ -2316,7 +2312,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 	}
 #endif
         wptr = memcpya(wptr, " \n", 2);
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto roh_pool_ret_WRITE_FAIL;
 	}
       }
@@ -2342,21 +2338,21 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
 #endif
 	}
         *wptr++ = ' ';
-	wptr = width_force(plink_maxiid, wptr, uint32_write(wptr, pool_size));
+	wptr = width_force(plink_maxiid, wptr, uint32toa(pool_size, wptr));
         *wptr++ = ' ';
-        cptr = uint32_write(wptr, case_ct);
+        cptr = uint32toa(case_ct, wptr);
 	*cptr++ = ':';
-	cptr = uint32_write(cptr, pool_size - case_ct);
+	cptr = uint32toa(pool_size - case_ct, cptr);
         wptr = width_force(8, wptr, cptr);
 	*wptr++ = ' ';
-	wptr = width_force(4, wptr, chrom_name_write(wptr, chrom_info_ptr, chrom_start));
+	wptr = width_force(4, wptr, chrom_name_write(chrom_info_ptr, chrom_start, wptr));
         wptr = roh_pool_write_middle(wptr, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, is_new_lengths, marker_uidx1, marker_uidx2);
-        wptr = uint32_writew8(wptr, marker_cidx);
+        wptr = uint32toa_w8(marker_cidx, wptr);
         wptr = memcpya(wptr, "    NA     NA \n", 15);
 	if (ujj) {
 	  *wptr++ = '\n';
 	}
-	if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	  goto roh_pool_ret_WRITE_FAIL;
 	}
       }
@@ -2370,7 +2366,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
   putchar('\r');
   LOGPRINTFWW("ROH pool report written to %s .\n", outname);
   if (is_verbose) {
-    wptr = strcpya(logbuf, "Per-pool report");
+    wptr = strcpya(g_logbuf, "Per-pool report");
     if (pool_ct != 1) {
       *wptr++ = 's';
     }
@@ -2383,11 +2379,11 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
       wptr = memcpya(wptr, "{1,2}", 5);
     } else {
       wptr = memcpya(wptr, "{1,...,", 7);
-      wptr = uint32_write(wptr, pool_ct);
+      wptr = uint32toa(pool_ct, wptr);
       *wptr++ = '}';
     }
     wptr = memcpya(wptr, ".verbose.\n", 11);
-    fputs(logbuf, stdout);
+    fputs(g_logbuf, stdout);
   }
 
   while (0) {
@@ -2405,17 +2401,18 @@ int32_t roh_pool(Homozyg_info* hp, FILE* bedfile, uint64_t bed_offset, char* out
     break;
   }
  roh_pool_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
 
 int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, uintptr_t sample_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, char* sample_ids, uint32_t plink_maxfid, uint32_t  [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uint64_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   uintptr_t window_size = hp->window_size;
   double hit_threshold = hp->hit_threshold;
   uint32_t is_new_lengths = 1 ^ ((hp->modifier / HOMOZYG_OLD_LENGTHS) & 1);
@@ -2423,7 +2420,6 @@ int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint
   int32_t x_code = chrom_info_ptr->x_code;
   int32_t mt_code = chrom_info_ptr->mt_code;
   uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
-  uintptr_t topsize = 0;
   uintptr_t roh_ct = 0;
   uintptr_t final_mask = get_final_mask(sample_ct);
   uintptr_t* sample_male = NULL;
@@ -2506,85 +2502,45 @@ int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint
     }
   }
 
-  if (wkspace_alloc_ul_checked(&roh_list_chrom_starts, (chrom_ct + 1) * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&rawbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-
-  readbuf = (uintptr_t*)top_alloc(&topsize, sample_ctl2 * window_size * sizeof(intptr_t));
-  if (!readbuf) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  swbuf = (uintptr_t*)top_alloc(&topsize, sample_ctl * window_size * sizeof(intptr_t));
-  if (!swbuf) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  het_cts = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!het_cts) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  missing_cts = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!missing_cts) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  swhit_cts = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!swhit_cts) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  cur_roh_uidx_starts = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!cur_roh_uidx_starts) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  cur_roh_cidx_starts = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!cur_roh_cidx_starts) {
+  if (bigstack_alloc_ul(chrom_ct + 1, &roh_list_chrom_starts) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &rawbuf) ||
+      bigstack_end_alloc_ul(sample_ctl2 * window_size, &readbuf) ||
+      bigstack_end_alloc_ul(sample_ctl * window_size, &swbuf) ||
+      bigstack_end_alloc_ui(sample_ct, &het_cts) ||
+      bigstack_end_alloc_ui(sample_ct, &missing_cts) ||
+      bigstack_end_alloc_ui(sample_ct, &swhit_cts) ||
+      bigstack_end_alloc_ui(sample_ct, &cur_roh_uidx_starts) ||
+      bigstack_end_alloc_ui(sample_ct, &cur_roh_cidx_starts)) {
     goto calc_homozyg_ret_NOMEM;
   }
   if (hp->modifier & HOMOZYG_EXTEND) {
-    prev_roh_end_cidxs = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-    if (!prev_roh_end_cidxs) {
-      goto calc_homozyg_ret_NOMEM;
-    }
-    end_nonhom_uidxs = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-    if (!end_nonhom_uidxs) {
-      goto calc_homozyg_ret_NOMEM;
-    }
-    cur_roh_earliest_extend_uidxs = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-    if (!cur_roh_earliest_extend_uidxs) {
+    if (bigstack_end_alloc_ui(sample_ct, &prev_roh_end_cidxs) ||
+        bigstack_end_alloc_ui(sample_ct, &end_nonhom_uidxs) ||
+        bigstack_end_alloc_ui(sample_ct, &cur_roh_earliest_extend_uidxs)) {
       goto calc_homozyg_ret_NOMEM;
     }
   }
-  cur_roh_het_cts = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!cur_roh_het_cts) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  cur_roh_missing_cts = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!cur_roh_missing_cts) {
-    goto calc_homozyg_ret_NOMEM;
-  }
-  sample_to_last_roh = (uintptr_t*)top_alloc(&topsize, sample_ct * sizeof(intptr_t));
-  if (!sample_to_last_roh) {
+  if (bigstack_end_alloc_ui(sample_ct, &cur_roh_het_cts) ||
+      bigstack_end_alloc_ui(sample_ct, &cur_roh_missing_cts) ||
+      bigstack_end_alloc_ul(sample_ct, &sample_to_last_roh) ||
+      bigstack_end_alloc_ui(window_size, &uidx_buf)) {
     goto calc_homozyg_ret_NOMEM;
   }
-  uidx_buf = (uint32_t*)top_alloc(&topsize, window_size * sizeof(int32_t));
-  if (!uidx_buf) {
-    goto calc_homozyg_ret_NOMEM;
+  if ((x_code != -1) && is_set(chrom_info_ptr->chrom_mask, x_code)) {
+    if (bigstack_end_alloc_ul(sample_ctl, &sample_male)) {
+      goto calc_homozyg_ret_NOMEM;
+    }
+    copy_bitarr_subset_excl(sex_male, sample_exclude, sample_ct, popcount_longs_exclude(sex_male, sample_exclude, sample_ctl), sample_male);
   }
   // no other workspace allocations during main scan, so we can assign it all
   // to the ROH list
-  max_roh_ct = ((wkspace_left - topsize) & (~(CACHELINE - ONELU))) / (ROH_ENTRY_INTS * sizeof(int32_t));
-  roh_list = (uint32_t*)wkspace_base;
+  max_roh_ct = (bigstack_left() & (~(CACHELINE - ONELU))) / (ROH_ENTRY_INTS * sizeof(int32_t));
+  roh_list = (uint32_t*)g_bigstack_base;
   ulii = sample_ctl2 - 1;
   rawbuf[unfiltered_sample_ctl2 - 1] = 0;
   for (widx = 0; widx < window_size; widx++) {
     readbuf[widx * sample_ctl2 + ulii] = 0;
   }
-  if ((x_code != -1) && is_set(chrom_info_ptr->chrom_mask, x_code)) {
-    sample_male = (uintptr_t*)top_alloc(&topsize, sample_ctl * sizeof(intptr_t));
-    if (!sample_male) {
-      goto calc_homozyg_ret_NOMEM;
-    }
-    collapse_copy_bitarr(sample_ct, sex_male, sample_exclude, popcount_longs_exclude(sex_male, sample_exclude, sample_ctl), sample_male);
-  }
   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
     goto calc_homozyg_ret_READ_FAIL;
   }
@@ -2627,7 +2583,7 @@ int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint
 	break;
       }
       readbuf_cur = &(readbuf[widx * sample_ctl2]);
-      if (load_and_collapse(bedfile, rawbuf, unfiltered_sample_ct, readbuf_cur, sample_ct, sample_exclude, final_mask, 0)) {
+      if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, 0, bedfile, rawbuf, readbuf_cur)) {
 	goto calc_homozyg_ret_READ_FAIL;
       }
       mask_out_homozyg_major(readbuf_cur, sample_ct);
@@ -2688,7 +2644,7 @@ int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint
 	  }
 	}
 	uidx_buf[widx] = marker_uidx;
-	if (load_and_collapse(bedfile, rawbuf, unfiltered_sample_ct, readbuf_cur, sample_ct, sample_exclude, final_mask, 0)) {
+	if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, 0, bedfile, rawbuf, readbuf_cur)) {
 	  goto calc_homozyg_ret_READ_FAIL;
 	}
 	mask_out_homozyg_major(readbuf_cur, sample_ct);
@@ -2730,7 +2686,7 @@ int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint
   roh_list_chrom_starts[chrom_ct] = roh_ct;
   // "truncate" the completed list so we can start making workspace allocations
   // again
-  wkspace_alloc(roh_ct * ROH_ENTRY_INTS * sizeof(int32_t)); // roh_list
+  bigstack_alloc(roh_ct * ROH_ENTRY_INTS * sizeof(int32_t)); // roh_list
   retval = write_main_roh_reports(outname, outname_end, marker_exclude, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, marker_pos, sample_ct, sample_exclude, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, pheno_nm, pheno_c, pheno_d, missing_pheno_str, omp_is_numeric, missing_pheno_len, is_new_lengths, roh_ct, roh_list, roh_list_chrom_starts, sample_to_last_roh, &max_pool_size, &max_roh_len);
   if (retval) {
     goto calc_homozyg_ret_1;
@@ -2748,7 +2704,7 @@ int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint
     } else {
       if (omp_is_numeric) {
 	scan_double(output_missing_pheno, &dxx);
-	wptr = double_g_writewx4(missing_pheno_str, dxx, 8);
+	wptr = dtoa_g_wxp4(dxx, 8, missing_pheno_str);
 	missing_pheno_len = (uintptr_t)(wptr - missing_pheno_str);
       }
       retval = roh_pool(hp, bedfile, bed_offset, outname, outname_end, rawbuf, marker_exclude, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, chrom_info_ptr, marker_pos, sample_ct, unfiltered_sample_ct, sample_exclude, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, pheno_nm, pheno_c, pheno_d, missing_pheno_str, missing_pheno_len, is_new_lengths, roh_ct, roh_list, roh_list_chrom_starts, max_pool_size, max_roh_len);
@@ -2767,6 +2723,6 @@ int32_t calc_homozyg(Homozyg_info* hp, FILE* bedfile, uintptr_t bed_offset, uint
     break;
   }
  calc_homozyg_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return retval;
 }
diff --git a/plink_lasso.c b/plink_lasso.c
index 98ed6f0..f45833c 100644
--- a/plink_lasso.c
+++ b/plink_lasso.c
@@ -3,7 +3,9 @@
 #include "plink_lasso.h"
 #include "plink_matrix.h"
 
+// need to force to 64-bit integer if >= 2^16
 #define WARM_START_ITERS 1000
+
 #define NLAMBDA 100
 #define DELTA_THRESHOLD 0.0001
 
@@ -39,12 +41,12 @@ int32_t transpose_covar(uintptr_t sample_valid_ct, uintptr_t covar_ct, uintptr_t
 
 int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm2, double lasso_h2, double lasso_minlambda, uint32_t select_covars, uintptr_t* select_covars_bitfield, double* pheno_d_collapsed, uintptr_t covar_ct, char* covar_names, uintptr_t max_covar_name_len, uintptr_t* covar_nm, double* covar_d, uint32_t hh_or_mt_exists, uintptr_t sample_ [...]
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  double* data_arr = (double*)wkspace_base; // marker-major
+  double* data_arr = (double*)g_bigstack_base; // marker-major
   double sqrt_n_recip = sqrt(1.0 / ((double)((intptr_t)sample_valid_ct)));
   double lambda_max = 0.0;
   double err_cur = 0.0;
   uint64_t iter_tot = 0;
-  uintptr_t sample_valid_ctl2 = (sample_valid_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t sample_valid_ctl2 = QUATERCT_TO_WORDCT(sample_valid_ct);
   uintptr_t polymorphic_marker_ct = 0;
   uintptr_t unselected_covar_ct = 0;
   uintptr_t final_mask = get_final_mask(sample_valid_ct);
@@ -151,18 +153,18 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
     }
     min_ploidy_1 |= uii;
-    if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, pheno_nm2, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+    if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, pheno_nm2, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf_collapsed)) {
       goto lasso_bigmem_ret_READ_FAIL;
     }
     if (min_ploidy_1) {
       haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, sample_valid_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
     }
-    vec_3freq(sample_valid_ctl2, loadbuf_collapsed, sample_include2, &missing_ct, &het_ct, &homset_ct);
+    genovec_3freq(loadbuf_collapsed, sample_include2, sample_valid_ctl2, &missing_ct, &het_ct, &homset_ct);
     uii = sample_valid_ct - missing_ct;
     homrar_ct = uii - het_ct - homset_ct;
     if (!(((!homrar_ct) && ((!het_ct) || (!homset_ct))) || ((!het_ct) && (!homset_ct)))) {
       // ok, not monomorphic.  standardize to zero mean, unit variance
-      SET_BIT(polymorphic_markers, marker_uidx);
+      SET_BIT(marker_uidx, polymorphic_markers);
       dyy = (double)(2 * homrar_ct + het_ct); // sum
       dxx = dyy / ((double)((int32_t)uii)); // mean
       dyy = sqrt_n_recip * sqrt(((double)((int32_t)(uii - 1))) / (4 * ((double)((int32_t)homrar_ct)) + ((double)((int32_t)het_ct)) - dyy * dxx)); // 1/(stdev * sqrt(n))
@@ -198,12 +200,12 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
     return 0;
   }
   col_ct = covar_ct + polymorphic_marker_ct;
-  col_ctl = (col_ct + (BITCT - 1)) / BITCT;
-  wkspace_shrink_top(data_arr, col_ct * sample_valid_ct * sizeof(double));
+  col_ctl = BITCT_TO_WORDCT(col_ct);
+  bigstack_shrink_top(data_arr, col_ct * sample_valid_ct * sizeof(double));
   sige = sqrt(1.0 - lasso_h2 + 1.0 / ((double)((intptr_t)sample_valid_ct)));
   zz = sige * sqrt_n_recip;
   if (rand_matrix) {
-    prod_matrix = (double*)wkspace_alloc(WARM_START_ITERS * WARM_START_ITERS * sizeof(double));
+    bigstack_alloc_d(WARM_START_ITERS * WARM_START_ITERS, &prod_matrix);
     fputs("\r--lasso: Initializing warm start matrix...", stdout);
     fflush(stdout);
     fill_double_zero(misc_arr, WARM_START_ITERS);
@@ -225,13 +227,13 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
 	}
       }
     }
-    lambda_min = destructive_get_dmedian(misc_arr, WARM_START_ITERS) * zz;
+    lambda_min = destructive_get_dmedian(WARM_START_ITERS, misc_arr) * zz;
     logstr("--lasso:");
     LOGPRINTF(" using min lambda = %g.\n", lambda_min);
-    wkspace_reset(prod_matrix);
+    bigstack_reset(prod_matrix);
   }
-  xhat = (double*)wkspace_alloc(col_ct * sizeof(double));
-  active_set = (uintptr_t*)wkspace_alloc(col_ctl * sizeof(intptr_t));
+  bigstack_alloc_d(col_ct, &xhat);
+  bigstack_alloc_ul(col_ctl, &active_set);
   *xhat_ptr = xhat;
   dptr = data_arr;
   for (col_idx = 0; col_idx < col_ct; col_idx++) {
@@ -283,7 +285,7 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
       }
     }
     iter = 0;
-    fill_all_bits(active_set, col_ct);
+    fill_all_bits(col_ct, active_set);
     col_nz_ct = col_ct;
     while (1) {
       col_uidx = 0;
@@ -306,7 +308,7 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
 	}
         xhat[col_uidx] = dxx;
         if (dxx == 0.0) {
-          CLEAR_BIT(active_set, col_uidx);
+          CLEAR_BIT(col_uidx, active_set);
 	  col_to_z++;
 	}
         dptr = residuals;
@@ -354,7 +356,7 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
 }
 
 uint32_t load_and_normalize(FILE* bedfile, uintptr_t* loadbuf_raw, uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf_collapsed, uintptr_t sample_valid_ct, uintptr_t* pheno_nm2, uintptr_t final_mask, uint32_t do_reverse, uint32_t min_ploidy_1, uint32_t hh_or_mt_exists, uintptr_t* sample_include2, uintptr_t* sample_male_include2, uint32_t is_x, uint32_t is_y, double sqrt_n_recip, double* data_window_ptr) {
-  uintptr_t sample_valid_ctl2 = (sample_valid_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t sample_valid_ctl2 = QUATERCT_TO_WORDCT(sample_valid_ct);
   uintptr_t sample_idx = 0;
   uintptr_t sample_idx_stop = BITCT2;
   uintptr_t* ulptr_end_init = &(loadbuf_collapsed[sample_valid_ct / BITCT2]);
@@ -370,13 +372,13 @@ uint32_t load_and_normalize(FILE* bedfile, uintptr_t* loadbuf_raw, uintptr_t unf
   uint32_t het_ct;
   uint32_t homset_ct;
   uint32_t uii;
-  if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, pheno_nm2, final_mask, do_reverse)) {
+  if (load_and_collapse_incl(unfiltered_sample_ct, sample_valid_ct, pheno_nm2, final_mask, do_reverse, bedfile, loadbuf_raw, loadbuf_collapsed)) {
     return 2; // read failure
   }
   if (min_ploidy_1) {
     haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, sample_valid_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
   }
-  vec_3freq(sample_valid_ctl2, loadbuf_collapsed, sample_include2, &missing_ct, &het_ct, &homset_ct);
+  genovec_3freq(loadbuf_collapsed, sample_include2, sample_valid_ctl2, &missing_ct, &het_ct, &homset_ct);
   uii = sample_valid_ct - missing_ct;
   homrar_ct = uii - het_ct - homset_ct;
   if (((!homrar_ct) && ((!het_ct) || (!homset_ct))) || ((!het_ct) && (!homset_ct))) {
@@ -467,7 +469,7 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   }
   fputs("Using memory-conserving LASSO implementation.\n", stdout);
   if (covar_ct) {
-    if (wkspace_alloc_d_checked(&covar_data_arr, covar_ct * sample_valid_ct * sizeof(double))) {
+    if (bigstack_alloc_d(covar_ct * sample_valid_ct, &covar_data_arr)) {
       goto lasso_smallmem_ret_NOMEM;
     }
     dxx = 1.0 / ((double)((intptr_t)sample_valid_ct));
@@ -508,19 +510,19 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
   zz = sige * sqrt_n_recip;
   // put this on top of the permanent stack portion so we can shrink it when we
   // know the true column count
-  if (wkspace_alloc_d_checked(&xhat, (covar_ct + marker_ct) * sizeof(double))) {
+  if (bigstack_alloc_d(covar_ct + marker_ct, &xhat)) {
     goto lasso_smallmem_ret_NOMEM;
   }
   if (rand_matrix) {
-    if (wkspace_alloc_d_checked(&data_window, sample_valid_ct * WARM_START_ITERS * sizeof(double)) ||
-        wkspace_alloc_d_checked(&prod_matrix, WARM_START_ITERS * WARM_START_ITERS * sizeof(double))) {
+    if (bigstack_alloc_d(sample_valid_ct * WARM_START_ITERS, &data_window) ||
+        bigstack_alloc_d(WARM_START_ITERS * WARM_START_ITERS, &prod_matrix)) {
       goto lasso_smallmem_ret_NOMEM;
     }
     fputs("\r--lasso: Initializing warm start matrix...", stdout);
     fflush(stdout);
     fill_double_zero(misc_arr, WARM_START_ITERS);
   } else {
-    if (wkspace_alloc_d_checked(&data_window, sample_valid_ct * sizeof(double))) {
+    if (bigstack_alloc_d(sample_valid_ct, &data_window)) {
       goto lasso_smallmem_ret_NOMEM;
     }
   }
@@ -545,7 +547,7 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
     if (uii == 1) {
       continue;
     }
-    SET_BIT(polymorphic_markers, marker_uidx);
+    SET_BIT(marker_uidx, polymorphic_markers);
     dxx = 0.0;
     for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
       dxx += dptr[sample_idx] * pheno_d_collapsed[sample_idx];
@@ -595,24 +597,24 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
 	}
       }
     }
-    lambda_min = destructive_get_dmedian(misc_arr, WARM_START_ITERS) * zz;
+    lambda_min = destructive_get_dmedian(WARM_START_ITERS, misc_arr) * zz;
     logstr("--lasso:");
     LOGPRINTF(" using min lambda = %g.\n", lambda_min);
   }
   polymorphic_marker_ct += partial_marker_idx;
   *polymorphic_marker_ct_ptr = polymorphic_marker_ct;
   col_ct = covar_ct + polymorphic_marker_ct;
-  wkspace_reset(data_window);
-  wkspace_shrink_top(xhat, col_ct * sizeof(double));
-  col_ctl = (col_ct + (BITCT - 1)) / BITCT;
+  bigstack_reset(data_window);
+  bigstack_shrink_top(xhat, col_ct * sizeof(double));
+  col_ctl = BITCT_TO_WORDCT(col_ct);
   *xhat_ptr = xhat;
   if (lambda_min >= lambda_max) {
     logprint("\n");
     logerrprint("Error: min lambda >= max lambda.\n");
     goto lasso_smallmem_ret_INVALID_CMDLINE;
   }
-  if (wkspace_alloc_ul_checked(&active_set, col_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&data_window, sample_valid_ct * sizeof(double))) {
+  if (bigstack_alloc_ul(col_ctl, &active_set) ||
+      bigstack_alloc_d(sample_valid_ct, &data_window)) {
     goto lasso_smallmem_ret_NOMEM;
   }
   loghi = log(lambda_max);
@@ -664,7 +666,7 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
       }
     }
     iter = 0;
-    fill_all_bits(active_set, col_ct);
+    fill_all_bits(col_ct, active_set);
     col_nz_ct = col_ct;
     while (1) {
       col_uidx = 0;
@@ -711,7 +713,7 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
 	}
         xhat[col_uidx] = dxx;
         if (dxx == 0.0) {
-          CLEAR_BIT(active_set, col_uidx);
+          CLEAR_BIT(col_uidx, active_set);
 	  col_to_z++;
 	}
         dptr = residuals;
@@ -771,11 +773,11 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
   // Vattikuti.
   // Not yet multithreaded.  (Main loop is fairly tightly coupled, so getting
   // a performance benefit will be a bit tricky.)
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctv2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t polymorphic_marker_ct = 0;
   uint64_t iter_tot = 0;
   double* xhat = NULL;
@@ -809,6 +811,7 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
   double dyy;
   double dzz;
   uint64_t ullii;
+  uint64_t ulljj;
   uintptr_t sample_valid_ct;
   uintptr_t sample_valid_ctv2;
   uintptr_t marker_idx;
@@ -823,7 +826,7 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
   if (!covar_ct) {
     sample_valid_ct = pheno_nm_ct;
   } else {
-    sample_valid_ct = popcount_longs(covar_nm, (pheno_nm_ct + (BITCT - 1)) / BITCT);
+    sample_valid_ct = popcount_longs(covar_nm, BITCT_TO_WORDCT(pheno_nm_ct));
   }
   if (sample_valid_ct < 2) {
     if (pheno_nm_ct < 2) {
@@ -836,30 +839,29 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
   if (sample_valid_ct == pheno_nm_ct) {
     pheno_nm2 = pheno_nm;
   } else {
-    if (wkspace_alloc_ul_checked(&pheno_nm2, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(unfiltered_sample_ctl, &pheno_nm2)) {
       goto lasso_ret_NOMEM;
     }
-    fill_ulong_zero(pheno_nm2, unfiltered_sample_ctl);
     for (sample_uidx = 0, sample_idx = 0; sample_idx < pheno_nm_ct; sample_uidx++, sample_idx++) {
       next_set_ul_unsafe_ck(pheno_nm, &sample_uidx);
       if (IS_SET(covar_nm, sample_idx)) {
-        SET_BIT(pheno_nm2, sample_uidx);
+        SET_BIT(sample_uidx, pheno_nm2);
       }
     }
   }
-  sample_valid_ctv2 = 2 * ((sample_valid_ct + (BITCT - 1)) / BITCT);
+  sample_valid_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_valid_ct);
   sqrt_n_recip = sqrt(1.0 / ((double)((intptr_t)sample_valid_ct)));
-  if (wkspace_alloc_ul_checked(&sample_include2, sample_valid_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_collapsed, sample_valid_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&polymorphic_markers, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&pheno_d_collapsed, sample_valid_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(&residuals, sample_valid_ct * sizeof(double))) {
+  if (bigstack_alloc_ul(sample_valid_ctv2, &sample_include2) ||
+      bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw) ||
+      bigstack_alloc_ul(sample_valid_ctv2, &loadbuf_collapsed) ||
+      bigstack_alloc_ul(unfiltered_marker_ctl, &polymorphic_markers) ||
+      bigstack_alloc_d(sample_valid_ct, &pheno_d_collapsed) ||
+      bigstack_alloc_d(sample_valid_ct, &residuals)) {
     goto lasso_ret_NOMEM;
   }
   if (lasso_minlambda == -1) {
-    if (wkspace_alloc_d_checked(&rand_matrix, sample_valid_ct * WARM_START_ITERS * sizeof(double)) ||
-        wkspace_alloc_d_checked(&misc_arr, WARM_START_ITERS * sizeof(double))) {
+    if (bigstack_alloc_d(sample_valid_ct * WARM_START_ITERS, &rand_matrix) ||
+        bigstack_alloc_d(WARM_START_ITERS, &misc_arr)) {
       goto lasso_ret_NOMEM;
     }
   }
@@ -907,12 +909,12 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
     *dptr = ((*dptr) - dzz) * dyy;
     dptr++;
   }
-  fill_vec_55(sample_include2, sample_valid_ct);
+  fill_quatervec_55(sample_valid_ct, sample_include2);
   fill_ulong_zero(polymorphic_markers, unfiltered_marker_ctl);
   if ((chrom_info_ptr->mt_code != -1) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->mt_code)) {
     hh_or_mt_exists |= NXMHH_EXISTS;
   }
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, pheno_nm2, sex_male, &sample_include2, &sample_male_include2)) {
+  if (alloc_collapsed_haploid_filters(pheno_nm2, sex_male, unfiltered_sample_ct, sample_valid_ct, hh_or_mt_exists, 1, &sample_include2, &sample_male_include2)) {
     goto lasso_ret_NOMEM;
   }
   if (select_covars && select_covars_range_list_ptr->name_ct) {
@@ -920,7 +922,7 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
       logerrprint("Error: No covariates loaded for --lasso-select-covars.\n");
       goto lasso_ret_INVALID_CMDLINE;
     }
-    retval = string_range_list_to_bitfield_alloc(covar_names, covar_ct, max_covar_name_len, select_covars_range_list_ptr, &select_covars_bitfield, "lasso-select-covars", "--covar file");
+    retval = string_range_list_to_bitarr_alloc(covar_names, covar_ct, max_covar_name_len, select_covars_range_list_ptr, "lasso-select-covars", "--covar file", &select_covars_bitfield);
     if (retval) {
       goto lasso_ret_1;
     }
@@ -935,39 +937,38 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
   // or
   //   3. prod_matrix: WARM_START_ITERS * WARM_START_ITERS * sizeof(double)
   // (whichever is larger)
-  ullii = CACHEALIGN(((uint64_t)uii) * sizeof(double)) + CACHEALIGN(((uii + 7) / 8));
+  ullii = round_up_pow2(((uint64_t)uii) * sizeof(double), CACHELINE) + round_up_pow2((uii + 7) / 8, CACHELINE);
   // assumes WARM_START_ITERS is even
   if (rand_matrix) {
     uljj = (sample_valid_ct * WARM_START_ITERS) - 1;
     for (ulii = 0; ulii < uljj; ulii += 2) {
       rand_matrix[ulii] = rand_normal(&(rand_matrix[ulii + 1]));
     }
-    if (ullii < CACHEALIGN(WARM_START_ITERS * WARM_START_ITERS * sizeof(double))) {
-      ullii = CACHEALIGN(WARM_START_ITERS * WARM_START_ITERS * sizeof(double));
+    ulljj = round_up_pow2(WARM_START_ITERS * WARM_START_ITERS * sizeof(double), CACHELINE);
+    if (ullii < ulljj) {
+      ullii = ulljj;
     }
   }
-  ullii += CACHEALIGN(((uint64_t)uii) * sample_valid_ct * sizeof(double));
-  // if (0) {
-  if (ullii <= wkspace_left) {
+  ullii += round_up_pow2(((uint64_t)uii) * sample_valid_ct * sizeof(double), CACHELINE);
+  if (ullii <= bigstack_left()) {
     retval = lasso_bigmem(bedfile, bed_offset, marker_exclude, marker_ct, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, pheno_nm2, lasso_h2, lasso_minlambda, select_covars, select_covars_bitfield, pheno_d_collapsed, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d, hh_or_mt_exists, sample_valid_ct, sample_include2, sample_male_include2, loadbuf_raw, loadbuf_collapsed, rand_matrix, misc_arr, residuals, polymorphic_markers, &polymorphic_marker_ct, &iter_tot, &xhat);
   } else {
     retval = lasso_smallmem(threads, bedfile, bed_offset, marker_exclude, marker_ct, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, pheno_nm2, lasso_h2, lasso_minlambda, select_covars, select_covars_bitfield, pheno_d_collapsed, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d, hh_or_mt_exists, sample_valid_ct, sample_include2, sample_male_include2, loadbuf_raw, loadbuf_collapsed, rand_matrix, misc_arr, residuals, polymorphic_markers, &polymorphic_marker_ct, &iter_tot, &xhat);
-    // retval = RET_NOMEM;
   }
   if (retval || (!polymorphic_marker_ct)) {
     goto lasso_ret_1;
   }
   memcpy(outname_end, ".lasso", 7);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto lasso_ret_OPEN_FAIL;
   }
   if (fputs_checked("CHR\tSNP\tA1\tEFFECT\n", outfile)) {
     goto lasso_ret_WRITE_FAIL;
   }
-  tbuf[MAXLINELEN] = '\t';
+  g_textbuf[MAXLINELEN] = '\t';
   if (select_covars) {
     if (select_covars_bitfield) {
-      marker_idx = covar_ct - popcount_longs(select_covars_bitfield, (covar_ct + (BITCT - 1)) / BITCT);
+      marker_idx = covar_ct - popcount_longs(select_covars_bitfield, BITCT_TO_WORDCT(covar_ct));
     } else {
       marker_idx = 0;
     }
@@ -979,11 +980,11 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
       if ((!report_zeroes) && (dxx == 0)) {
 	continue;
       }
-      wptr = memcpya(tbuf, "COV\t", 4);
+      wptr = memcpya(g_textbuf, "COV\t", 4);
       wptr = strcpyax(wptr, &(covar_names[marker_uidx * max_covar_name_len]), '\t');
       wptr = memcpyl3a(wptr, "NA\t");
-      wptr = double_g_writex(wptr, dxx, '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      wptr = dtoa_gx(dxx, '\n', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto lasso_ret_WRITE_FAIL;
       }
     }
@@ -994,7 +995,7 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
       chrom_fo_idx++;
       refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
       uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-      wptr_start = chrom_name_write(tbuf, chrom_info_ptr, uii);
+      wptr_start = chrom_name_write(chrom_info_ptr, uii, g_textbuf);
       *wptr_start++ = '\t';
     }
     wptr = strcpyax(wptr_start, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
@@ -1003,18 +1004,18 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
       if ((!report_zeroes) && (dxx == 0)) {
 	continue;
       }
-      wptr2 = double_g_writex(&(tbuf[MAXLINELEN + 1]), dxx, '\n');
+      wptr2 = dtoa_gx(dxx, '\n', &(g_textbuf[MAXLINELEN + 1]));
     } else {
       if (!report_zeroes) {
 	continue;
       }
-      wptr2 = memcpyl3a(&(tbuf[MAXLINELEN + 1]), "NA\n");
+      wptr2 = memcpyl3a(&(g_textbuf[MAXLINELEN + 1]), "NA\n");
     }
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto lasso_ret_WRITE_FAIL;
     }
     fputs(marker_allele_ptrs[2 * marker_uidx], outfile);
-    if (fwrite_checked(&(tbuf[MAXLINELEN]), (uintptr_t)(wptr2 - (&(tbuf[MAXLINELEN]))), outfile)) {
+    if (fwrite_checked(&(g_textbuf[MAXLINELEN]), (uintptr_t)(wptr2 - (&(g_textbuf[MAXLINELEN]))), outfile)) {
       goto lasso_ret_WRITE_FAIL;
     }
   }
@@ -1039,7 +1040,7 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
     break;
   }
  lasso_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
diff --git a/plink_ld.c b/plink_ld.c
index 3d78800..df5e2e6 100644
--- a/plink_ld.c
+++ b/plink_ld.c
@@ -149,11 +149,11 @@ static inline void ld_dot_prod_batch(__m128i* vec1, __m128i* vec2, __m128i* mask
   __m128i tmp_sum1;
   __m128i tmp_sum2;
   __m128i tmp_sum12;
-  __uni16 acc;
-  __uni16 acc1;
-  __uni16 acc2;
-  __uni16 acc11;
-  __uni16 acc22;
+  __univec acc;
+  __univec acc1;
+  __univec acc2;
+  __univec acc11;
+  __univec acc22;
   acc.vi = _mm_setzero_si128();
   acc1.vi = _mm_setzero_si128();
   acc2.vi = _mm_setzero_si128();
@@ -265,7 +265,7 @@ static inline int32_t ld_dot_prod_nm_batch(__m128i* vec1, __m128i* vec2, uint32_
   __m128i loader2;
   __m128i sum12;
   __m128i tmp_sum12;
-  __uni16 acc;
+  __univec acc;
   acc.vi = _mm_setzero_si128();
   do {
     loader1 = *vec1++;
@@ -498,7 +498,7 @@ int32_t ld_dot_prod_nm(uintptr_t* vec1, uintptr_t* vec2, uint32_t founder_ct, ui
 
 uint32_t ld_process_load(uintptr_t* geno_buf, uintptr_t* mask_buf, uintptr_t* missing_buf, uint32_t* missing_ct_ptr, double* sum_ptr, double* variance_recip_ptr, uint32_t founder_ct, uint32_t is_x, uint32_t weighted_x, uint32_t nonmale_founder_ct, uintptr_t* founder_male_include2, uintptr_t* nonmale_geno, uintptr_t* nonmale_masks, uintptr_t nonmale_offset) {
   uintptr_t* geno_ptr = geno_buf;
-  uintptr_t founder_ctl2 = (founder_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
   uintptr_t* geno_end = &(geno_buf[founder_ctl2]);
   uintptr_t* mask_buf_ptr = mask_buf;
   uintptr_t* missing_ptr = missing_buf;
@@ -638,14 +638,14 @@ void ld_prune_start_chrom(uint32_t ld_window_kb, uint32_t* cur_chrom_ptr, uint32
   uint32_t uii = 0;
   uint32_t window_size;
   live_indices[0] = window_unfiltered_start;
-  next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+  next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
   if (ld_window_kb) {
     window_size = 1;
     uii = window_unfiltered_end;
     while ((uii < chrom_end) && (marker_pos[uii] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
       window_size++;
       uii++;
-      next_unset_ck(marker_exclude, &uii, chrom_end);
+      next_unset_ck(marker_exclude, chrom_end, &uii);
     }
     uii = 0;
   } else {
@@ -658,7 +658,7 @@ void ld_prune_start_chrom(uint32_t ld_window_kb, uint32_t* cur_chrom_ptr, uint32
     start_arr[uii - 1] = window_unfiltered_end;
     live_indices[uii] = window_unfiltered_end;
     window_unfiltered_end++;
-    next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+    next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
   }
   *cur_window_size_ptr = uii;
   start_arr[uii - 1] = window_unfiltered_end;
@@ -679,7 +679,7 @@ int32_t ld_prune_write(char* outname, char* outname_end, uintptr_t* marker_exclu
   fputs("Writing...", stdout);
   fflush(stdout);
   strcpy(outname_end, ".prune.in");
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto ld_prune_write_ret_OPEN_FAIL;
   }
   for (cur_chrom = 1; cur_chrom < chrom_code_end; cur_chrom++) {
@@ -696,7 +696,7 @@ int32_t ld_prune_write(char* outname, char* outname_end, uintptr_t* marker_exclu
     goto ld_prune_write_ret_WRITE_FAIL;
   }
   strcpy(outname_end, ".prune.out");
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto ld_prune_write_ret_OPEN_FAIL;
   }
   for (cur_chrom = 1; cur_chrom < chrom_code_end; cur_chrom++) {
@@ -732,14 +732,14 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
   // variances don't exclude the missing markers).
 
   // for future consideration: chromosome-based multithread/parallel?
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
-  uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl2 / 2);
-  uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
+  uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctv2 / 2);
+  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
 #ifdef __LP64__
-  uintptr_t founder_ctv = 2 * ((founder_ct + 127) / 128);
+  uintptr_t founder_ctv = BITCT_TO_ALIGNED_WORDCT(founder_ct);
 #else
   uintptr_t founder_ctv = founder_ctl;
 #endif
@@ -829,10 +829,10 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
     goto ld_prune_ret_1;
   }
   if (is_set(chrom_info_ptr->chrom_mask, 0)) {
-    ulii = count_chrom_markers(chrom_info_ptr, 0, marker_exclude);
+    ulii = count_chrom_markers(chrom_info_ptr, marker_exclude, 0);
     if (chrom_info_ptr->zero_extra_chroms) {
       for (uii = chrom_info_ptr->max_code + 1; uii < chrom_code_end; uii++) {
-	ulii += count_chrom_markers(chrom_info_ptr, uii, marker_exclude);
+	ulii += count_chrom_markers(chrom_info_ptr, marker_exclude, uii);
       }
       chrom_code_end = chrom_info_ptr->max_code + 1;
     }
@@ -845,7 +845,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
   }
 
   // force founder_male_include2 allocation
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, founder_info, sex_male, &founder_include2, &founder_male_include2)) {
+  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2)) {
     goto ld_prune_ret_NOMEM;
   }
   if (weighted_x) {
@@ -883,7 +883,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 
   window_unfiltered_start = ld_prune_next_valid_chrom_start(marker_exclude, 0, chrom_info_ptr, chrom_code_end, unfiltered_marker_ct);
 
-  if (wkspace_alloc_ul_checked(&pruned_arr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_marker_ctl, &pruned_arr)) {
     goto ld_prune_ret_NOMEM;
   }
 
@@ -892,21 +892,20 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
   if (!window_is_kb) {
     window_max = ld_window_size;
   }
-  ulii = window_max;
-  if (wkspace_alloc_ui_checked(&live_indices, ulii * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&start_arr, ulii * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&geno, ulii * founder_ct_192_long * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&geno_masks, ulii * founder_ct_192_long * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&geno_mmasks, ulii * founder_ctv * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&missing_cts, ulii * sizeof(int32_t)) ||
-      wkspace_alloc_d_checked(&sums, ulii * sizeof(double)) ||
-      wkspace_alloc_d_checked(&variance_recips, ulii * sizeof(double))) {
+  if (bigstack_alloc_ui(window_max, &live_indices) ||
+      bigstack_alloc_ui(window_max, &start_arr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf) ||
+      bigstack_alloc_ul(window_max * founder_ct_192_long, &geno) ||
+      bigstack_alloc_ul(window_max * founder_ct_192_long, &geno_masks) ||
+      bigstack_alloc_ul(window_max * founder_ctv, &geno_mmasks) ||
+      bigstack_alloc_ui(window_max, &missing_cts) ||
+      bigstack_alloc_d(window_max, &sums) ||
+      bigstack_alloc_d(window_max, &variance_recips)) {
     goto ld_prune_ret_NOMEM;
   }
   if (weighted_x) {
-    if (wkspace_alloc_ul_checked(&nonmale_geno, ulii * founder_ct_192_long * sizeof(intptr_t)) ||
-        wkspace_alloc_ul_checked(&nonmale_masks, ulii * founder_ct_192_long * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(window_max * founder_ct_192_long, &nonmale_geno) ||
+        bigstack_alloc_ul(window_max * founder_ct_192_long, &nonmale_masks)) {
       goto ld_prune_ret_NOMEM;
     }
   }
@@ -919,13 +918,13 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
     }
   }
   if (!pairwise) {
-    if (wkspace_alloc_d_checked(&cov_matrix, window_max * window_max * sizeof(double)) ||
-        wkspace_alloc_d_checked(&new_cov_matrix, window_max * window_max * sizeof(double)) ||
-        wkspace_alloc_ui_checked(&idx_remap, window_max * sizeof(int32_t))) {
+    if (bigstack_alloc_d(window_max * window_max, &cov_matrix) ||
+        bigstack_alloc_d(window_max * window_max, &new_cov_matrix) ||
+        bigstack_alloc_ui(window_max, &idx_remap)) {
       goto ld_prune_ret_NOMEM;
     }
 
-    irow = (MATRIX_INVERT_BUF1_TYPE*)wkspace_alloc(window_max * 2 * sizeof(MATRIX_INVERT_BUF1_TYPE));
+    irow = (MATRIX_INVERT_BUF1_TYPE*)bigstack_alloc(window_max * 2 * sizeof(MATRIX_INVERT_BUF1_TYPE));
     if (!irow) {
       goto ld_prune_ret_NOMEM;
     }
@@ -935,7 +934,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
     } else {
       ulii = window_max;
     }
-    if (wkspace_alloc_d_checked(&work, ulii * window_max * sizeof(double))) {
+    if (bigstack_alloc_d(ulii * window_max, &work)) {
       goto ld_prune_ret_NOMEM;
     }
   }
@@ -957,14 +956,14 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	if (fseeko(bedfile, bed_offset + (uii * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
 	  goto ld_prune_ret_READ_FAIL;
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(geno[ulii * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, uii))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, uii), bedfile, loadbuf, &(geno[ulii * founder_ct_192_long]))) {
 	  goto ld_prune_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
 	  haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(geno[ulii * founder_ct_192_long])));
 	}
         if (!ld_process_load(&(geno[ulii * founder_ct_192_long]), &(geno_masks[ulii * founder_ct_192_long]), &(geno_mmasks[ulii * founder_ctv]), &(missing_cts[ulii]), &(sums[ulii]), &(variance_recips[ulii]), founder_ct, is_x && (!ignore_x), weighted_x, nonmale_founder_ct, founder_male_include2, nonmale_geno, nonmale_masks, ulii * founder_ct_192_long)) {
-	  SET_BIT(pruned_arr, uii);
+	  SET_BIT(uii, pruned_arr);
           cur_exclude_ct++;
 	}
       }
@@ -1040,9 +1039,9 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 		// get_maf() is too cheap for this to make a noticeable
 		// difference
 		if (get_maf(set_allele_freqs[live_indices[uii]]) < get_maf(set_allele_freqs[live_indices[ujj]])) {
-		  SET_BIT(pruned_arr, live_indices[uii]);
+		  SET_BIT(live_indices[uii], pruned_arr);
 		} else {
-		  SET_BIT(pruned_arr, live_indices[ujj]);
+		  SET_BIT(live_indices[ujj], pruned_arr);
 		  ujj++;
 		  while (ujj < cur_window_size) {
 		    if (!IS_SET(pruned_arr, live_indices[ujj])) {
@@ -1138,7 +1137,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	      ujj = bsearch_min;
 	      // bug reported by Kaustubh was a violation of this:
 	      // assert(!IS_SET(pruned_arr, live_indices[idx_remap[ujj]]));
-              SET_BIT(pruned_arr, live_indices[idx_remap[ujj]]);
+              SET_BIT(live_indices[idx_remap[ujj]], pruned_arr);
 	      cur_exclude_ct++;
 	      window_rem--;
 	      for (uii = ujj; uii < window_rem; uii++) {
@@ -1166,7 +1165,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	      }
 	    }
 	    if (dxx > ld_last_param) {
-	      SET_BIT(pruned_arr, live_indices[idx_remap[ujj]]);
+	      SET_BIT(live_indices[idx_remap[ujj]], pruned_arr);
 	      cur_exclude_ct++;
 	      window_rem--;
 	      if (idx_remap[ujj] < (uint32_t)old_window_size) {
@@ -1187,7 +1186,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	  break;
 	}
 	window_unfiltered_start++;
-	next_unset_ck(marker_exclude, &window_unfiltered_start, chrom_end);
+	next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_start);
       }
       if (window_unfiltered_start == chrom_end) {
 	break;
@@ -1244,7 +1243,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	while ((ukk < chrom_end) && (marker_pos[ukk] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
 	  ujj++;
 	  ukk++;
-	  next_unset_ck(marker_exclude, &ukk, chrom_end);
+	  next_unset_ck(marker_exclude, chrom_end, &ukk);
 	}
       } else {
 	ujj = ld_window_incr;
@@ -1261,19 +1260,19 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	if (fseeko(bedfile, bed_offset + (window_unfiltered_end * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
 	  goto ld_prune_ret_READ_FAIL;
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(geno[cur_window_size * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, window_unfiltered_end))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, window_unfiltered_end), bedfile, loadbuf, &(geno[cur_window_size * founder_ct_192_long]))) {
 	  goto ld_prune_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
 	  haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)(&(geno[cur_window_size * founder_ct_192_long])));
 	}
 	if (!ld_process_load(&(geno[cur_window_size * founder_ct_192_long]), &(geno_masks[cur_window_size * founder_ct_192_long]), &(geno_mmasks[cur_window_size * founder_ctv]), &(missing_cts[cur_window_size]), &(sums[cur_window_size]), &(variance_recips[cur_window_size]), founder_ct, is_x && (!ignore_x), weighted_x, nonmale_founder_ct, founder_male_include2, nonmale_geno, nonmale_masks, cur_window_size * founder_ct_192_long)) {
-	  SET_BIT(pruned_arr, window_unfiltered_end);
+	  SET_BIT(window_unfiltered_end, pruned_arr);
 	  cur_exclude_ct++;
 	}
 	cur_window_size++;
 	window_unfiltered_end++;
-	next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+	next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
       }
       if (cur_window_size > prev_end) {
 	start_arr[cur_window_size] = window_unfiltered_end;
@@ -1311,7 +1310,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 #endif
   }
  ld_prune_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -1320,7 +1319,7 @@ void ld_process_load2(uintptr_t* geno_buf, uintptr_t* mask_buf, uint32_t* missin
   // --ld-xchr 3 support yet), and no zero-variance check (we just want to
   // dump nans in that case)
   uintptr_t* geno_ptr = geno_buf;
-  uintptr_t founder_ctl2 = (founder_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
   uintptr_t* geno_end = &(geno_buf[founder_ctl2]);
   uintptr_t* mask_buf_ptr = mask_buf;
   uintptr_t cur_geno;
@@ -1362,7 +1361,7 @@ uint32_t ld_missing_ct_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t w
   __m128i* vend1;
   __m128i loader1;
   __m128i loader2;
-  __uni16 acc;
+  __univec acc;
 
   while (word12_ct >= 10) {
     word12_ct -= 10;
@@ -1429,15 +1428,15 @@ uint32_t ld_missing_ct_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t w
 }
 
 int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_pos, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t* founder_info, uintptr_t* s [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   FILE* outfile_verbose = NULL;
   uintptr_t* sample_include2 = NULL;
   uintptr_t* sample_male_include2 = NULL;
   double min_corr = ldip->flipscan_thresh * (1 - SMALL_EPSILON);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t marker_idx = 0;
   uintptr_t max_window_size = 1;
   uintptr_t pct = 1;
@@ -1518,26 +1517,26 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
   uint32_t uii;
   ulii = 2 * (max_marker_allele_len + plink_maxsnp) + 256;
   if (ulii <= MAXLINELEN) {
-    textbuf = tbuf;
+    textbuf = g_textbuf;
   } else {
-    if (wkspace_alloc_c_checked(&textbuf, ulii)) {
+    if (bigstack_alloc_c(ulii, &textbuf)) {
       goto flipscan_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_ul_checked(&(founder_phenos[0]), unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&(founder_phenos[1]), unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &(founder_phenos[0])) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, &(founder_phenos[1])) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
     goto flipscan_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   memcpy(founder_phenos[0], founder_info, unfiltered_sample_ctl * sizeof(intptr_t));
-  bitfield_and(founder_phenos[0], pheno_nm, unfiltered_sample_ctl);
+  bitvec_and(pheno_nm, unfiltered_sample_ctl, founder_phenos[0]);
   if (alloc_raw_haploid_filters(unfiltered_sample_ct, hh_exists, 0, founder_phenos[0], sex_male, &sample_include2, &sample_male_include2)) {
     goto flipscan_ret_NOMEM;
   }
   memcpy(founder_phenos[1], founder_phenos[0], unfiltered_sample_ctl * sizeof(intptr_t));
-  bitfield_and(founder_phenos[1], pheno_c, unfiltered_sample_ctl);
-  bitfield_andnot(founder_phenos[0], pheno_c, unfiltered_sample_ctl);
+  bitvec_and(pheno_c, unfiltered_sample_ctl, founder_phenos[1]);
+  bitvec_andnot(pheno_c, unfiltered_sample_ctl, founder_phenos[0]);
   pheno_ct[0] = popcount_longs(founder_phenos[0], unfiltered_sample_ctl);
   pheno_ct[1] = popcount_longs(founder_phenos[1], unfiltered_sample_ctl);
   if ((!pheno_ct[0]) || (!pheno_ct[1])) {
@@ -1549,7 +1548,7 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
     goto flipscan_ret_INVALID_CMDLINE;
   }
   for (is_case = 0; is_case < 2; is_case++) {
-    pheno_ctl[is_case] = (pheno_ct[is_case] + (BITCT - 1)) / BITCT;
+    pheno_ctl[is_case] = BITCT_TO_WORDCT(pheno_ct[is_case]);
     ulii = (pheno_ct[is_case] + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
     pheno_ct_mld_m1[is_case] = ulii - 1;
 #ifdef __LP64__
@@ -1562,20 +1561,20 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
     max_window_size = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], max_window_locus_ct * 2 + 1, window_bp * 2, max_window_size);
   }
-  if (wkspace_alloc_ui_checked(&window_uidxs, max_window_size * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&window_cidx_starts, max_window_size * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&neg_uidx_buf, max_window_size * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&(pheno_male_include2[0]), pheno_ctl[0] * 2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&(pheno_male_include2[1]), pheno_ctl[1] * 2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&missing_cts, max_window_size * 2 * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&(window_geno[0]), max_window_size * pheno_ct_192_long[0] * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&(window_mask[0]), max_window_size * pheno_ct_192_long[0] * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&(window_geno[1]), max_window_size * pheno_ct_192_long[1] * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&(window_mask[1]), max_window_size * pheno_ct_192_long[1] * sizeof(intptr_t)) ||
+  if (bigstack_alloc_ui(max_window_size, &window_uidxs) ||
+      bigstack_alloc_ui(max_window_size, &window_cidx_starts) ||
+      bigstack_alloc_ui(max_window_size, &neg_uidx_buf) ||
+      bigstack_alloc_ul(pheno_ctl[0] * 2, &(pheno_male_include2[0])) ||
+      bigstack_alloc_ul(pheno_ctl[1] * 2, &(pheno_male_include2[1])) ||
+      bigstack_alloc_ui(max_window_size * 2, &missing_cts) ||
+      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[0], &(window_geno[0])) ||
+      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[0], &(window_mask[0])) ||
+      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[1], &(window_geno[1])) ||
+      bigstack_alloc_ul(max_window_size * pheno_ct_192_long[1], &(window_mask[1])) ||
       // not advantageous to choose a very large block size here, so O(n^2)
       // memory is fine (though it can be avoided by calculating each
       // correlation twice).
-      wkspace_alloc_d_checked(&r_matrix, max_window_size * max_window_size * 2 * sizeof(double))) {
+      bigstack_alloc_d(max_window_size * max_window_size * 2, &r_matrix)) {
     goto flipscan_ret_NOMEM;
   }
   ulii = (max_window_size + 1) * 2;
@@ -1584,7 +1583,7 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
     neg_uidx_buf[uljj * ulii + 1] = 0.0;
   }
   for (is_case = 0; is_case < 2; is_case++) {
-    vec_collapse_init(sex_male, unfiltered_sample_ct, founder_phenos[is_case], pheno_ct[is_case], pheno_male_include2[is_case]);
+    quaterarr_collapse_init(sex_male, unfiltered_sample_ct, founder_phenos[is_case], pheno_ct[is_case], pheno_male_include2[is_case]);
     window_geno_ptr = window_geno[is_case];
     window_mask_ptr = window_mask[is_case];
     cur_192_long = pheno_ct_192_long[is_case];
@@ -1596,7 +1595,7 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
   }
 
   memcpy(outname_end, ".flipscan", 10);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto flipscan_ret_OPEN_FAIL;
   }
   wptr = memcpya(textbuf, "   CHR ", 7);
@@ -1607,7 +1606,7 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
   }
   if (verbose) {
     memcpy(&(outname_end[9]), ".verbose", 9);
-    if (fopen_checked(&outfile_verbose, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile_verbose)) {
       goto flipscan_ret_OPEN_FAIL;
     }
     outname_end[9] = '\0';
@@ -1632,7 +1631,7 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
       marker_idx += chrom_marker_ct;
       continue;
     }
-    wptr_start = width_force(6, textbuf, chrom_name_write(textbuf, chrom_info_ptr, chrom_idx));
+    wptr_start = width_force(6, textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, textbuf));
     *wptr_start++ = ' ';
     is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
     is_x = (chrom_idx == ((uint32_t)chrom_info_ptr->x_code));
@@ -1651,11 +1650,11 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 
       // circular index of beginning of window starting at current marker
       window_cidx_starts[window_cidx] = window_cidx2;
-      if (load_raw(bedfile, loadbuf_raw, unfiltered_sample_ct4)) {
+      if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf_raw)) {
 	goto flipscan_ret_READ_FAIL;
       }
       if (IS_SET(marker_reverse, marker_uidx)) {
-	reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
+	reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf_raw);
       }
       if (is_haploid && hh_exists) {
         haploid_fix(hh_exists, sample_include2, sample_male_include2, unfiltered_sample_ct, is_x, is_y, (unsigned char*)loadbuf_raw);
@@ -1676,7 +1675,7 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	r_matrix_ptr = &(r_matrix[is_case]);
 	geno_fixed_vec_ptr = &(window_geno_ptr[window_cidx * cur_192_long]);
 	mask_fixed_vec_ptr = &(window_mask_ptr[window_cidx * cur_192_long]);
-        collapse_copy_2bitarr_incl(loadbuf_raw, geno_fixed_vec_ptr, unfiltered_sample_ct, cur_pheno_ct, founder_phenos[is_case]);
+        copy_quaterarr_nonempty_subset(loadbuf_raw, founder_phenos[is_case], unfiltered_sample_ct, cur_pheno_ct, geno_fixed_vec_ptr);
         ld_process_load2(geno_fixed_vec_ptr, mask_fixed_vec_ptr, &fixed_missing_ct, cur_pheno_ct, is_x && (!ignore_x), pheno_male_include2[is_case]);
 	fixed_non_missing_ct = cur_pheno_ct - fixed_missing_ct;
         missing_cts_ptr[window_cidx] = fixed_missing_ct;
@@ -1767,24 +1766,24 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 	  }
 	  wptr_start2 = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start);
 	  wptr_start2 = memseta(wptr_start2, 32, 3);
-          wptr_start2 = uint32_writew10x(wptr_start2, marker_pos[marker_uidx2], ' ');
+          wptr_start2 = uint32toa_w10x(marker_pos[marker_uidx2], ' ', wptr_start2);
 	  wptr_start2 = fw_strcpy(4, marker_allele_ptrs[2 * marker_uidx2], wptr_start2);
 	  *wptr_start2++ = ' ';
 	  wptr = fw_strcpy(4, marker_allele_ptrs[2 * marker_uidx2 + 1], wptr_start2);
 	  *wptr++ = ' ';
-	  wptr = double_g_writewx3x(wptr, 1.0 - set_allele_freqs[marker_uidx2], 8, ' ');
-          wptr = uint32_writew6x(wptr, pos_r_ct, ' ');
+	  wptr = dtoa_g_wxp3x(1.0 - set_allele_freqs[marker_uidx2], 8, ' ', wptr);
+          wptr = uint32toa_w6x(pos_r_ct, ' ', wptr);
 	  if (!pos_r_ct) {
 	    wptr = memcpya(wptr, "      NA", 8);
 	  } else {
-            wptr = double_g_writewx3(wptr, pos_r_tot / ((int32_t)(pos_r_ct * 2)), 8);
+            wptr = dtoa_g_wxp3(pos_r_tot / ((int32_t)(pos_r_ct * 2)), 8, wptr);
 	  }
           *wptr++ = ' ';
-          wptr = uint32_writew6x(wptr, neg_r_ct, ' ');
+          wptr = uint32toa_w6x(neg_r_ct, ' ', wptr);
 	  if (!neg_r_ct) {
 	    wptr = memcpya(wptr, "      NA", 8);
 	  } else {
-	    wptr = double_g_writewx3(wptr, neg_r_tot / ((int32_t)(neg_r_ct * 2)), 8);
+	    wptr = dtoa_g_wxp3(neg_r_tot / ((int32_t)(neg_r_ct * 2)), 8, wptr);
 	  }
 	  *wptr++ = ' ';
           if (fwrite_checked(textbuf, wptr - textbuf, outfile)) {
@@ -1807,11 +1806,11 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
 		  marker_uidx3 = window_uidxs[window_cidx3];
 		  wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx3 * max_marker_id_len]), wptr_start2);
 		  wptr = memseta(wptr, 32, 3);
-		  wptr = uint32_writew10x(wptr, marker_pos[marker_uidx3], ' ');
+		  wptr = uint32toa_w10x(marker_pos[marker_uidx3], ' ', wptr);
 		  wptr = fw_strcpy(4, marker_allele_ptrs[2 * marker_uidx3], wptr);
                   *wptr++ = ' ';
-		  wptr = double_g_writewx3x(wptr, case_pheno, 8, ' ');
-		  wptr = double_g_writewx3x(wptr, ctrl_pheno, 8, '\n');
+		  wptr = dtoa_g_wxp3x(case_pheno, 8, ' ', wptr);
+		  wptr = dtoa_g_wxp3x(ctrl_pheno, 8, '\n', wptr);
 		  if (fwrite_checked(textbuf, wptr - textbuf, outfile_verbose)) {
 		    goto flipscan_ret_WRITE_FAIL;
 		  }
@@ -1885,7 +1884,7 @@ int32_t flipscan(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_verbose);
   return retval;
@@ -2099,7 +2098,7 @@ uint32_t ld_matrix_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   while (block_idx1 < block_size1) {
     dptr = &(results[block_idx1 * marker_ctm8 + marker_idx]);
     while (marker_idx < marker_idx_end) {
-      sptr_cur = double_g_writex(sptr_cur, *dptr++, delimiter);
+      sptr_cur = dtoa_gx(*dptr++, delimiter, sptr_cur);
       marker_idx++;
       if (sptr_cur > readbuf_end) {
 	goto ld_matrix_emitn_ret;
@@ -2110,11 +2109,11 @@ uint32_t ld_matrix_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
       // bugfix: can't be <= since tab delimiter wouldn't be handled correctly
       // on subsequent pass
       if (ulii < marker_ct - marker_idx) {
-	sptr_cur = memcpya(sptr_cur, tbuf, ulii * 2);
+	sptr_cur = memcpya(sptr_cur, g_textbuf, ulii * 2);
 	marker_idx += ulii;
 	goto ld_matrix_emitn_ret;
       } else {
-	sptr_cur = memcpya(sptr_cur, tbuf, (marker_ct - marker_idx) * 2);
+	sptr_cur = memcpya(sptr_cur, g_textbuf, (marker_ct - marker_idx) * 2);
 	marker_idx = marker_ct;
       }
     }
@@ -2146,9 +2145,9 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uint32_t ignore_x = (ld_modifier / LD_IGNORE_X) & 1;
   uintptr_t marker_ct = g_ld_marker_ct;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t marker_ctm8 = (marker_ct + 7) & (~(7 * ONELU));
+  uintptr_t marker_ctm8 = round_up_pow2(marker_ct, 8);
   uintptr_t founder_ct = g_ld_founder_ct;
-  uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
+  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
   uintptr_t founder_ct_192_long = g_ld_founder_ct_192_long;
   uintptr_t final_mask = get_final_mask(founder_ct);
   uintptr_t marker_uidx_base = next_unset_unsafe(marker_exclude, 0);
@@ -2168,7 +2167,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uint32_t is_y = 0;
   uint32_t not_first_write = 0;
   int32_t retval = 0;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uintptr_t* ulptr;
   unsigned char* overflow_buf;
   uint64_t tests_completed;
@@ -2187,7 +2186,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uint32_t chrom_end;
   uint32_t is_last_block;
 
-  if (wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+  if (bigstack_alloc_uc(262144, &overflow_buf)) {
     goto ld_report_matrix_ret_NOMEM;
   }
   if (output_single_prec) {
@@ -2195,7 +2194,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
     marker_ctm8 = (marker_ctm8 + 8) & (~15);
   }
   if (is_binary) {
-    if (fopen_checked(&outfile, outname, "wb")) {
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
       goto ld_report_matrix_ret_OPEN_FAIL;
     }
   }
@@ -2208,7 +2207,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   //     g_ld_results buffer
   // round down to multiple of thread_ct for better workload distribution
   ulii = founder_ct_192_long * 2 * sizeof(intptr_t) + 3 * sizeof(int32_t) + marker_ctm8 * (8 - 4 * output_single_prec);
-  idx1_block_size = wkspace_left / (ulii * 2);
+  idx1_block_size = bigstack_left() / (ulii * 2);
   thread_workload = idx1_block_size / thread_ct;
   if (!thread_workload) {
     goto ld_report_matrix_ret_NOMEM;
@@ -2235,19 +2234,19 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   if (idx1_block_size > marker_idx1_end - marker_idx1_start) {
     idx1_block_size = marker_idx1_end - marker_idx1_start;
   }
-  g_ld_geno1 = (uintptr_t*)wkspace_alloc(founder_ct_192_long * idx1_block_size * sizeof(intptr_t));
-  g_ld_geno_masks1 = (uintptr_t*)wkspace_alloc(founder_ct_192_long * idx1_block_size * sizeof(intptr_t));
-  g_ld_missing_cts1 = (uint32_t*)wkspace_alloc(idx1_block_size * sizeof(int32_t));
-  g_ld_interval1 = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
+  bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno1);
+  bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno_masks1);
+  bigstack_alloc_ui(idx1_block_size, &g_ld_missing_cts1);
+  bigstack_alloc_ui(idx1_block_size * 2, &g_ld_interval1);
 
   if (!output_single_prec) {
     // may want to set g_ld_results_f to NULL
-    if (wkspace_alloc_d_checked(&g_ld_results, marker_ctm8 * idx1_block_size * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ctm8 * idx1_block_size, &g_ld_results)) {
       goto ld_report_matrix_ret_NOMEM;
     }
   } else {
     g_ld_results = NULL;
-    if (wkspace_alloc_f_checked(&g_ld_results_f, marker_ctm8 * idx1_block_size * sizeof(float))) {
+    if (bigstack_alloc_f(marker_ctm8 * idx1_block_size, &g_ld_results_f)) {
       goto ld_report_matrix_ret_NOMEM;
     }
   }
@@ -2255,24 +2254,24 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   // claim the other half with idx2 buffer
   ulii -= marker_ctm8 * (8 - 4 * output_single_prec) + 2 * sizeof(int32_t);
   if (!output_single_prec) {
-    idx2_block_size = (wkspace_left / ulii) & (~(7 * ONELU));
+    idx2_block_size = (bigstack_left() / ulii) & (~(7 * ONELU));
   } else {
-    idx2_block_size = (wkspace_left / ulii) & (~(15 * ONELU));
+    idx2_block_size = (bigstack_left() / ulii) & (~(15 * ONELU));
   }
   if (idx2_block_size > marker_ctm8) {
     idx2_block_size = marker_ctm8;
   }
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
   while (1) {
     if (!idx2_block_size) {
       goto ld_report_matrix_ret_NOMEM;
     }
-    if (!(wkspace_alloc_ul_checked(&g_ld_geno2, founder_ct_192_long * idx2_block_size * sizeof(intptr_t)) ||
-          wkspace_alloc_ul_checked(&g_ld_geno_masks2, founder_ct_192_long * idx2_block_size * sizeof(intptr_t)) ||
-          wkspace_alloc_ui_checked(&g_ld_missing_cts2, idx2_block_size * sizeof(int32_t)))) {
+    if (!(bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno2) ||
+          bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno_masks2) ||
+          bigstack_alloc_ui(idx2_block_size, &g_ld_missing_cts2))) {
       break;
     }
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
     if (!output_single_prec) {
       idx2_block_size -= 8;
     } else {
@@ -2301,12 +2300,12 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
     if (is_square0) {
       if (is_binary) {
 	if (!output_single_prec) {
-          fill_double_zero((double*)tbuf, MAXLINELEN / sizeof(double));
+          fill_double_zero((double*)g_textbuf, MAXLINELEN / sizeof(double));
 	} else {
-          fill_float_zero((float*)tbuf, MAXLINELEN / sizeof(float));
+          fill_float_zero((float*)g_textbuf, MAXLINELEN / sizeof(float));
 	}
       } else {
-	ulptr = (uintptr_t*)tbuf;
+	ulptr = (uintptr_t*)g_textbuf;
 	// assume little-endian
 	// 0[delim]0[delim]...
 #ifdef __LP64__
@@ -2324,8 +2323,8 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
     marker_uidx1 = jump_forward_unset_unsafe(marker_exclude, marker_uidx1 + 1, marker_idx1);
   }
   g_ld_keep_sign = 0;
-  sprintf(logbuf, "--r%s %s%s to %s ... ", g_ld_is_r2? "2" : "", is_square? "square" : (is_square0? "square0" : "triangle"), is_binary? (output_single_prec? " bin4" : " bin") : (output_gz? " gz" : ""), outname);
-  wordwrap(logbuf, 16); // strlen("99% [processing]")
+  sprintf(g_logbuf, "--r%s %s%s to %s ... ", g_ld_is_r2? "2" : "", is_square? "square" : (is_square0? "square0" : "triangle"), is_binary? (output_single_prec? " bin4" : " bin") : (output_gz? " gz" : ""), outname);
+  wordwrapb(16); // strlen("99% [processing]")
   logprintb();
   fputs("0%", stdout);
   do {
@@ -2358,7 +2357,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
 	is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
       }
-      if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno1[block_idx1 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1), bedfile, loadbuf, &(g_ld_geno1[block_idx1 * founder_ct_192_long]))) {
 	goto ld_report_matrix_ret_READ_FAIL;
       }
       if (is_haploid && hh_exists) {
@@ -2376,9 +2375,9 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	g_ld_interval1[2 * ulii - 1] = ulii + marker_idx1;
       }
       if (!output_single_prec) {
-        marker_ctm8 = (marker_idx2_end + 7) & (~7);
+        marker_ctm8 = round_up_pow2(marker_idx2_end, 8);
       } else {
-        marker_ctm8 = (marker_idx2_end + 15) & (~15);
+        marker_ctm8 = round_up_pow2(marker_idx2_end, 16);
       }
       g_ld_marker_ctm8 = marker_ctm8;
     }
@@ -2405,7 +2404,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	  is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
 	  is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno2[block_idx2 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf, &(g_ld_geno2[block_idx2 * founder_ct_192_long]))) {
 	  goto ld_report_matrix_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
@@ -2448,7 +2447,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 		  uljj = ulii;
 		  ulii = 0;
 		}
-		if (fwrite_checked(tbuf, uljj * sizeof(double), outfile)) {
+		if (fwrite_checked(g_textbuf, uljj * sizeof(double), outfile)) {
 		  goto ld_report_matrix_ret_WRITE_FAIL;
 		}
 	      }
@@ -2477,7 +2476,7 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 		  uljj = ulii;
 		  ulii = 0;
 		}
-		if (fwrite_checked(tbuf, uljj * sizeof(float), outfile)) {
+		if (fwrite_checked(g_textbuf, uljj * sizeof(float), outfile)) {
 		  goto ld_report_matrix_ret_WRITE_FAIL;
 		}
 	      }
@@ -2637,20 +2636,20 @@ uint32_t ld_regular_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
     }
   ld_regular_emitn_start:
     marker_uidx2 = marker_uidx2_start;
-    sptr2 = width_force(6, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx1));
+    sptr2 = width_force(6, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx1, g_textbuf));
     sptr2 = memseta(sptr2, 32, 3);
-    sptr2 = uint32_writew10x(sptr2, marker_pos[marker_uidx1], ' ');
+    sptr2 = uint32toa_w10x(marker_pos[marker_uidx1], ' ', sptr2);
     sptr2 = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx1 * max_marker_id_len]), sptr2);
     *sptr2++ = ' ';
     if (set_allele_freqs) {
-      sptr2 = width_force(10, sptr2, double_g_write(sptr2, 1.0 - set_allele_freqs[marker_uidx1]));
+      sptr2 = width_force(10, sptr2, dtoa_g(1.0 - set_allele_freqs[marker_uidx1], sptr2));
       *sptr2++ = ' ';
     }
     if (!is_inter_chr) {
-      sptr2 = width_force(6, sptr2, chrom_name_write(sptr2, chrom_info_ptr, chrom_idx1));
+      sptr2 = width_force(6, sptr2, chrom_name_write(chrom_info_ptr, chrom_idx1, sptr2));
       sptr2 = memseta(sptr2, 32, 3);
     }
-    prefix_len = (uintptr_t)(sptr2 - tbuf);
+    prefix_len = (uintptr_t)(sptr2 - g_textbuf);
   ld_regular_emitn_start_2:
     if (marker_allele_ptrs) {
       fixed_a1 = marker_allele_ptrs[2 * marker_uidx1];
@@ -2665,17 +2664,17 @@ uint32_t ld_regular_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
       next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx2);
       dxx = *dptr++;
       if ((!is_r2) || (fabs(dxx) >= window_r2)) {
-	sptr_cur = memcpya(sptr_cur, tbuf, prefix_len);
+	sptr_cur = memcpya(sptr_cur, g_textbuf, prefix_len);
 	if (is_inter_chr) {
 	  if (marker_uidx2 >= chrom_end2) {
 	    chrom_fo_idx2 = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx2);
 	    chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
 	    chrom_end2 = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx2 + 1];
 	  }
-	  sptr_cur = width_force(6, sptr_cur, chrom_name_write(sptr_cur, chrom_info_ptr, chrom_idx2));
+	  sptr_cur = width_force(6, sptr_cur, chrom_name_write(chrom_info_ptr, chrom_idx2, sptr_cur));
 	  sptr_cur = memseta(sptr_cur, 32, 3);
 	}
-	sptr_cur = uint32_writew10x(sptr_cur, marker_pos[marker_uidx2], ' ');
+	sptr_cur = uint32toa_w10x(marker_pos[marker_uidx2], ' ', sptr_cur);
 	sptr_cur = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), sptr_cur);
 	*sptr_cur++ = ' ';
 	if (marker_allele_ptrs) {
@@ -2688,16 +2687,16 @@ uint32_t ld_regular_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
 	  *sptr_cur++ = ' ';
 	}
 	if (set_allele_freqs) {
-	  sptr_cur = width_force(10, sptr_cur, double_g_write(sptr_cur, 1.0 - set_allele_freqs[marker_uidx2]));
+	  sptr_cur = width_force(10, sptr_cur, dtoa_g(1.0 - set_allele_freqs[marker_uidx2], sptr_cur));
 	  *sptr_cur++ = ' ';
 	}
 	if (is_r2) {
 	  dxx = fabs(dxx);
 	}
-	sptr_cur = width_force(12, sptr_cur, double_g_write(sptr_cur, dxx));
+	sptr_cur = width_force(12, sptr_cur, dtoa_g(dxx, sptr_cur));
 	if (is_dprime) {
 	  *sptr_cur++ = ' ';
-          sptr_cur = width_force(12, sptr_cur, double_g_write(sptr_cur, *dptr++));
+          sptr_cur = width_force(12, sptr_cur, dtoa_g(*dptr++, sptr_cur));
 	}
 	sptr_cur = memcpya(sptr_cur, " \n", 2);
       } else if (is_dprime) {
@@ -2757,7 +2756,7 @@ uint32_t load_and_split3(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_s
   uintptr_t ulii;
   if (bedfile) {
     // ld_report_dprime() preloads this and does het. haploid handling, etc.
-    if (load_raw(bedfile, rawbuf, unfiltered_sample_ct4)) {
+    if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
       return RET_READ_FAIL;
     }
   }
@@ -2849,9 +2848,9 @@ static void two_locus_3x3_tablev(__m128i* vec1, __m128i* vec2, uint32_t* counts_
   __m128i count20;
   __m128i count21;
   __m128i count22;
-  __uni16 acc0;
-  __uni16 acc1;
-  __uni16 acc2;
+  __univec acc0;
+  __univec acc1;
+  __univec acc2;
   uint32_t ct;
   uint32_t ct2;
   while (iter_ct--) {
@@ -2970,10 +2969,10 @@ static inline void two_locus_3x3_zmiss_tablev(__m128i* veca0, __m128i* vecb0, ui
   __m128i county01;
   __m128i county11;
   __m128i county10;
-  __uni16 acc00;
-  __uni16 acc01;
-  __uni16 acc11;
-  __uni16 acc10;
+  __univec acc00;
+  __univec acc01;
+  __univec acc11;
+  __univec acc10;
   uint32_t ct2;
   while (sample_ctv6 >= 30) {
     sample_ctv6 -= 30;
@@ -3497,7 +3496,7 @@ uint32_t boost_calc_p_ca(uint32_t case0_ct, uint32_t case1_ct, uint32_t case2_ct
   return (df_adj > 1);
 }
 
-double fepi_counts_to_boost_chisq(uint32_t* counts, double* p_bc, double* p_ca, double* alpha1sq_ptr, double* alpha2sq_ptr, uint32_t df_adj, double* chisq_ptr, uint32_t* sig_ct1_ptr, uint32_t* sig_ct2_ptr) {
+double fepi_counts_to_boost_chisq(uint32_t* counts, double* p_bc, double* p_ca, double* alpha1sq_ptr, double* alpha2sq_ptr, uintptr_t df_adj, double* chisq_ptr, uint32_t* sig_ct1_ptr, uint32_t* sig_ct2_ptr) {
   // see BOOSTx64.c lines 625-903.
   double interaction_measure = 0.0;
   double tau = 0.0;
@@ -3513,8 +3512,10 @@ double fepi_counts_to_boost_chisq(uint32_t* counts, double* p_bc, double* p_ca,
   double dxx;
   double dyy;
   double mu_error;
+
   // dirty hack: encode df adjustment in low bits of *chisq_ptr
-  __double_ulong du;
+  uintptr_t ularr[sizeof(double) / BYTECT];
+
   uint32_t uii;
   uint32_t ujj;
   uint32_t ukk;
@@ -3666,11 +3667,11 @@ double fepi_counts_to_boost_chisq(uint32_t* counts, double* p_bc, double* p_ca,
       }
     }
     interaction_measure = (interaction_measure + log(tau)) * ((int32_t)(sum * 2));
-    du.dd = interaction_measure;
+    memcpy(ularr, &interaction_measure, sizeof(double));
     // save df_adj in low two bits
-    du.uu[0] &= ~(3 * ONELU);
-    du.uu[0] |= df_adj;
-    *chisq_ptr = du.dd;
+    ularr[0] &= ~(3 * ONELU);
+    ularr[0] |= df_adj;
+    memcpy(chisq_ptr, ularr, sizeof(double));
     if (interaction_measure < alpha1sq_ptr[df_adj]) {
       interaction_measure = alpha1sq_ptr[df_adj];
     }
@@ -3691,8 +3692,8 @@ THREAD_RET_TYPE fast_epi_thread(void* arg) {
   uintptr_t marker_ct = g_epi_marker_ct;
   uint32_t case_ct = g_epi_case_ct;
   uint32_t ctrl_ct = g_epi_ctrl_ct;
-  uint32_t case_ctv3 = 2 * ((case_ct + (2 * BITCT - 1)) / (2 * BITCT));
-  uint32_t ctrl_ctv3 = 2 * ((ctrl_ct + (2 * BITCT - 1)) / (2 * BITCT));
+  uint32_t case_ctv3 = BITCT_TO_ALIGNED_WORDCT(case_ct);
+  uint32_t ctrl_ctv3 = BITCT_TO_ALIGNED_WORDCT(ctrl_ct);
   uint32_t case_ctsplit = 3 * case_ctv3;
   uint32_t ctrl_ctsplit = 3 * ctrl_ctv3;
   uint32_t tot_ctsplit = case_ctsplit + ctrl_ctsplit;
@@ -3744,7 +3745,7 @@ THREAD_RET_TYPE fast_epi_thread(void* arg) {
   uintptr_t cur_idx2_block_size;
   uintptr_t idx2_block_start;
   uintptr_t idx2_block_end;
-  uintptr_t idx2_block_sizem16;
+  uintptr_t idx2_block_sizea16;
   uintptr_t block_idx1;
   uintptr_t block_delta1;
   uintptr_t block_idx2;
@@ -3771,19 +3772,19 @@ THREAD_RET_TYPE fast_epi_thread(void* arg) {
     cur_idx2_block_size = idx2_block_size;
     idx2_block_start = g_epi_idx2_block_start;
     idx2_block_end = idx2_block_start + idx2_block_size;
-    idx2_block_sizem16 = (idx2_block_size + 15) & (~(15 * ONELU));
+    idx2_block_sizea16 = round_up_pow2(idx2_block_size, 16);
     geno2 = g_epi_geno2;
     zmiss2 = g_epi_zmiss2;
     tot2 = g_epi_tot2;
     boost_precalc2 = g_epi_boost_precalc2;
     all_chisq = &(g_epi_all_chisq[idx2_block_start]);
     best_chisq1 = &(g_epi_best_chisq1[idx1_block_start16]);
-    best_chisq2 = &(g_epi_best_chisq2[tidx * idx2_block_sizem16]);
+    best_chisq2 = &(g_epi_best_chisq2[tidx * idx2_block_sizea16]);
     n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
     fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
-    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
-    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16]);
-    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16]);
+    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
+    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16]);
+    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16]);
     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
       ulii = geno1_offsets[2 * block_idx1];
       if (ulii > idx2_block_start) {
@@ -4102,7 +4103,7 @@ THREAD_RET_TYPE epi_linear_thread(void* arg) {
   uint32_t pheno_nm_ct = g_epi_pheno_nm_ct;
   uint32_t best_id_fixed = 0;
   uint32_t is_first_half = 0;
-  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
   uintptr_t* geno1 = g_epi_geno1;
   double* pheno_d2 = g_epi_pheno_d2;
   uint32_t* geno1_offsets = g_epi_geno1_offsets;
@@ -4138,7 +4139,7 @@ THREAD_RET_TYPE epi_linear_thread(void* arg) {
   uintptr_t cur_idx2_block_size;
   uintptr_t idx2_block_start;
   uintptr_t idx2_block_end;
-  uintptr_t idx2_block_sizem16;
+  uintptr_t idx2_block_sizea16;
   uintptr_t block_idx1;
   uintptr_t block_delta1;
   uintptr_t block_idx2;
@@ -4194,7 +4195,7 @@ THREAD_RET_TYPE epi_linear_thread(void* arg) {
     cur_idx2_block_size = idx2_block_size;
     idx2_block_start = g_epi_idx2_block_start;
     idx2_block_end = idx2_block_start + idx2_block_size;
-    idx2_block_sizem16 = (idx2_block_size + 15) & (~(15 * ONELU));
+    idx2_block_sizea16 = round_up_pow2(idx2_block_size, 16);
     geno2 = g_epi_geno2;
     phenogeno1 = g_epi_phenogeno1;
     phenogeno2 = g_epi_phenogeno2;
@@ -4202,12 +4203,12 @@ THREAD_RET_TYPE epi_linear_thread(void* arg) {
     genosums2 = g_epi_genosums2;
     all_chisq = &(g_epi_all_chisq[2 * idx2_block_start]);
     best_chisq1 = &(g_epi_best_chisq1[idx1_block_start16]);
-    best_chisq2 = &(g_epi_best_chisq2[tidx * idx2_block_sizem16]);
+    best_chisq2 = &(g_epi_best_chisq2[tidx * idx2_block_sizea16]);
     n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
     fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
-    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
-    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16]);
-    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16]);
+    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
+    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16]);
+    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16]);
     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
       ulii = geno1_offsets[2 * block_idx1];
       if (ulii > idx2_block_start) {
@@ -4543,7 +4544,7 @@ THREAD_RET_TYPE epi_logistic_thread(void* arg) {
   uint32_t pheno_nm_ct = g_epi_pheno_nm_ct;
   uint32_t best_id_fixed = 0;
   uint32_t is_first_half = 0;
-  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
   uintptr_t* geno1 = g_epi_geno1;
   uintptr_t* pheno_c = g_epi_pheno_c;
   float* covars_cov_major = g_epi_logistic_mt[tidx].cur_covars_cov_major;
@@ -4576,7 +4577,7 @@ THREAD_RET_TYPE epi_logistic_thread(void* arg) {
   uintptr_t cur_idx2_block_size;
   uintptr_t idx2_block_start;
   uintptr_t idx2_block_end;
-  uintptr_t idx2_block_sizem16;
+  uintptr_t idx2_block_sizea16;
   uintptr_t block_idx1;
   uintptr_t block_delta1;
   uintptr_t block_idx2;
@@ -4603,16 +4604,16 @@ THREAD_RET_TYPE epi_logistic_thread(void* arg) {
     cur_idx2_block_size = idx2_block_size;
     idx2_block_start = g_epi_idx2_block_start;
     idx2_block_end = idx2_block_start + idx2_block_size;
-    idx2_block_sizem16 = (idx2_block_size + 15) & (~(15 * ONELU));
+    idx2_block_sizea16 = round_up_pow2(idx2_block_size, 16);
     geno2 = g_epi_geno2;
     all_chisq = &(g_epi_all_chisq_f[2 * idx2_block_start]);
     best_chisq1 = &(g_epi_best_chisq_f1[idx1_block_start16]);
-    best_chisq2 = &(g_epi_best_chisq_f2[tidx * idx2_block_sizem16]);
+    best_chisq2 = &(g_epi_best_chisq_f2[tidx * idx2_block_sizea16]);
     n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
     fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
-    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
-    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16]);
-    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16]);
+    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
+    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16]);
+    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16]);
     for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
       ulii = geno1_offsets[2 * block_idx1];
       if (ulii > idx2_block_start) {
@@ -4667,7 +4668,7 @@ THREAD_RET_TYPE epi_logistic_thread(void* arg) {
 	}
 	// 2. now populate covariate-major matrix with 16-byte-aligned,
 	//    trailing-entries-zeroed rows
-	cur_sample_cta4 = (cur_sample_ct + 3) & (~3);
+	cur_sample_cta4 = round_up_pow2(cur_sample_ct, 4);
 	for (widx = 0; widx < pheno_nm_ctl2; widx++) {
 	  sample_idx = widx * BITCT2;
           cur_word1 = cur_geno1[widx];
@@ -5004,7 +5005,7 @@ THREAD_RET_TYPE ld_dprime_thread(void* arg) {
   uintptr_t block_idx1_end = ((tidx + 1) * g_ld_idx1_block_size) / g_ld_thread_ct;
   uintptr_t marker_idx2_maxw = g_ld_marker_ctm8;
   uintptr_t founder_ct = g_ld_founder_ct;
-  uint32_t founder_ctv3 = 2 * ((founder_ct + (2 * BITCT - 1)) / (2 * BITCT));
+  uint32_t founder_ctv3 = BITCT_TO_ALIGNED_WORDCT(founder_ct);
   uint32_t founder_ctsplit = 3 * founder_ctv3;
   uintptr_t* geno1 = g_ld_geno1;
   uintptr_t* zmiss1 = g_epi_zmiss1;
@@ -5044,7 +5045,7 @@ THREAD_RET_TYPE ld_dprime_thread(void* arg) {
   uint32_t is_x2;
   uint32_t nm_fixed;
   if (g_ld_thread_wkspace) {
-    cur_geno1_male = &(g_ld_thread_wkspace[tidx * CACHEALIGN32_WORD(founder_ctsplit)]);
+    cur_geno1_male = &(g_ld_thread_wkspace[tidx * round_up_pow2(founder_ctsplit, CACHELINE_WORD)]);
   }
   // suppress warning
   fill_uint_zero(&(tot1[3]), 3);
@@ -5084,11 +5085,11 @@ THREAD_RET_TYPE ld_dprime_thread(void* arg) {
       tot1[2] = popcount_longs(&(cur_geno1[2 * founder_ctv3]), founder_ctv3);
       if (is_x1 || x2_present) {
 	memcpy(cur_geno1_male, cur_geno1, founder_ctsplit * sizeof(intptr_t));
-        bitfield_and(cur_geno1_male, sex_male, founder_ctv3);
+        bitvec_and(sex_male, founder_ctv3, cur_geno1_male);
         tot1[3] = popcount_longs(cur_geno1_male, founder_ctv3);
-        bitfield_and(&(cur_geno1_male[founder_ctv3]), sex_male, founder_ctv3);
+        bitvec_and(sex_male, founder_ctv3, &(cur_geno1_male[founder_ctv3]));
 	tot1[4] = popcount_longs(&(cur_geno1_male[founder_ctv3]), founder_ctv3);
-        bitfield_and(&(cur_geno1_male[2 * founder_ctv3]), sex_male, founder_ctv3);
+        bitvec_and(sex_male, founder_ctv3, &(cur_geno1_male[2 * founder_ctv3]));
 	tot1[5] = popcount_longs(&(cur_geno1_male[2 * founder_ctv3]), founder_ctv3);
       }
       cur_geno2 = &(geno2[block_idx2 * founder_ctsplit]);
@@ -5165,8 +5166,8 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uintptr_t marker_ct = g_ld_marker_ct;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t founder_ct = g_ld_founder_ct;
-  uintptr_t founder_ctl = (founder_ct + (BITCT - 1)) / BITCT;
-  uintptr_t founder_ctv3 = 2 * ((founder_ct + (2 * BITCT - 1)) / (2 * BITCT));
+  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
+  uintptr_t founder_ctv3 = BITCT_TO_ALIGNED_WORDCT(founder_ct);
   uintptr_t founder_ctsplit = 3 * founder_ctv3;
   uintptr_t final_mask = get_final_mask(founder_ct);
   uintptr_t orig_marker_ctm8 = g_ld_marker_ctm8;
@@ -5202,7 +5203,8 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uintptr_t* ulptr;
   uint32_t* uiptr;
   unsigned char* overflow_buf;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
+  uintptr_t cur_bigstack_left;
   uintptr_t thread_workload;
   uintptr_t idx1_block_size;
   uintptr_t idx2_block_size;
@@ -5222,22 +5224,21 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uint32_t cur_marker_pos;
   uint32_t is_last_block;
   uint32_t uii;
-  if (wkspace_alloc_uc_checked(&overflow_buf, 262144) ||
-      wkspace_alloc_ul_checked(&loadbuf, founder_ctl * 2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&dummy_nm, founder_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_uc(262144, &overflow_buf) ||
+      bigstack_alloc_ul(founder_ctl * 2, &loadbuf) ||
+      bigstack_alloc_ul(founder_ctl, &dummy_nm)) {
     goto ld_report_dprime_ret_NOMEM;
   }
   loadbuf[founder_ctl * 2 - 2] = 0;
   loadbuf[founder_ctl * 2 - 1] = 0;
-  fill_all_bits(dummy_nm, founder_ct);
+  fill_all_bits(founder_ct, dummy_nm);
   g_ld_thread_wkspace = NULL;
   if ((x_code != -1) && is_set(chrom_info_ptr->chrom_mask, x_code)) {
     uii = chrom_info_ptr->chrom_start[(uint32_t)x_code];
     chrom_end = chrom_info_ptr->chrom_end[(uint32_t)x_code];
     chrom_end = chrom_end - uii - popcount_bit_idx(marker_exclude, uii, chrom_end);
     if (chrom_end) {
-      ulii = CACHEALIGN32_WORD(founder_ctsplit);
-      if (wkspace_alloc_ul_checked(&g_ld_thread_wkspace, ulii * thread_ct * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(round_up_pow2(founder_ctsplit, CACHELINE_WORD) * thread_ct, &g_ld_thread_wkspace)) {
 	goto ld_report_dprime_ret_NOMEM;
       }
       xstart = uii - popcount_bit_idx(marker_exclude, 0, uii);
@@ -5245,7 +5246,11 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
       g_ld_sex_male = sex_male;
     }
   }
-  idx1_block_size = (wkspace_left - 2 * CACHELINE) / (ulii * 2 + 1);
+  cur_bigstack_left = bigstack_left();
+  if (cur_bigstack_left < 2 * CACHELINE) {
+    goto ld_report_dprime_ret_NOMEM;
+  }
+  idx1_block_size = (cur_bigstack_left - 2 * CACHELINE) / (ulii * 2 + 1);
   thread_workload = idx1_block_size / thread_ct;
   if (!thread_workload) {
     goto ld_report_dprime_ret_NOMEM;
@@ -5254,12 +5259,12 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   if (idx1_block_size > job_size) {
     idx1_block_size = job_size;
   }
-  if (wkspace_alloc_ul_checked(&g_ld_geno1, founder_ctsplit * idx1_block_size * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&g_epi_zmiss1, ((idx1_block_size + BITCT - 1) / BITCT) * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&g_ld_interval1, idx1_block_size * 2 * sizeof(int32_t)) ||
+  if (bigstack_alloc_ul(founder_ctsplit * idx1_block_size, &g_ld_geno1) ||
+      bigstack_alloc_ul(BITCT_TO_WORDCT(idx1_block_size), &g_epi_zmiss1) ||
+      bigstack_alloc_ui(idx1_block_size * 2, &g_ld_interval1) ||
       // double size since both r/r^2 and dprime are needed
       // (marker_idx2_maxw only needs to be divisible by 4 as a result)
-      wkspace_alloc_d_checked(&g_ld_results, marker_idx2_maxw * 2 * idx1_block_size * sizeof(double))) {
+      bigstack_alloc_d(marker_idx2_maxw * 2 * idx1_block_size, &g_ld_results)) {
     goto ld_report_dprime_ret_NOMEM;
   }
   for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
@@ -5269,21 +5274,25 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   }
 
   ulii = founder_ctsplit * sizeof(intptr_t) + 1 + 3 * sizeof(int32_t);
-  idx2_block_size = ((wkspace_left - CACHELINE) / ulii) & (~(7 * ONELU));
+  cur_bigstack_left = bigstack_left();
+  if (cur_bigstack_left >= CACHELINE) {
+    cur_bigstack_left -= CACHELINE;
+  }
+  idx2_block_size = (cur_bigstack_left / ulii) & (~(7 * ONELU));
   if (idx2_block_size > marker_ct) {
-    idx2_block_size = (marker_ct + 7) & (~7);
+    idx2_block_size = round_up_pow2(marker_ct, 8);
   }
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
   while (1) {
     if (!idx2_block_size) {
       goto ld_report_dprime_ret_NOMEM;
     }
-    if (!(wkspace_alloc_ul_checked(&g_ld_geno2, founder_ctsplit * idx2_block_size * sizeof(intptr_t)) ||
-          wkspace_alloc_ul_checked(&g_epi_zmiss2, ((idx2_block_size + (BITCT - 1)) / BITCT) * sizeof(intptr_t)) ||
-          wkspace_alloc_ui_checked(&g_epi_tot2, idx2_block_size * 3 * sizeof(int32_t)))) {
+    if (!(bigstack_alloc_ul(founder_ctsplit * idx2_block_size, &g_ld_geno2) ||
+          bigstack_alloc_ul(BITCT_TO_WORDCT(idx2_block_size), &g_epi_zmiss2) ||
+          bigstack_alloc_ui(idx2_block_size * 3, &g_epi_tot2))) {
       break;
     }
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
     idx2_block_size -= 4;
   }
   for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
@@ -5338,7 +5347,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
       goto ld_report_dprime_ret_READ_FAIL;
     }
     chrom_end = 0;
-    fill_ulong_zero(g_epi_zmiss1, (idx1_block_size + (BITCT - 1)) / BITCT);
+    fill_ulong_zero(g_epi_zmiss1, BITCT_TO_WORDCT(idx1_block_size));
     for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1_tmp++, block_idx1++, marker_idx2++) {
       if (IS_SET(marker_exclude_idx1, marker_uidx1_tmp)) {
         ulii = next_unset_ul_unsafe(marker_exclude_idx1, marker_uidx1_tmp);
@@ -5426,7 +5435,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	g_ld_interval1[block_idx1 * 2 + 1] = marker_ct - marker_idx2_base;
       }
 
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp), bedfile, loadbuf_raw, loadbuf)) {
 	goto ld_report_dprime_ret_READ_FAIL;
       }
       if (is_haploid && hh_exists) {
@@ -5434,7 +5443,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
       }
       load_and_split3(NULL, loadbuf, founder_ct, &(g_ld_geno1[block_idx1 * founder_ctsplit]), dummy_nm, dummy_nm, founder_ctv3, 0, 0, 1, &ulii);
       if (ulii == 3) {
-        SET_BIT(g_epi_zmiss1, block_idx1);
+        SET_BIT(block_idx1, g_epi_zmiss1);
       }
     }
     marker_uidx2 = marker_uidx2_base;
@@ -5443,8 +5452,9 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
     }
 
     cur_idx2_block_size = idx2_block_size;
-    marker_idx2_end = g_ld_interval1[2 * idx1_block_size - 1] + marker_idx2_base;
-    marker_idx2_maxw = (marker_idx2_end + 3 - marker_idx2_base) & (~3);
+    uljj = g_ld_interval1[2 * idx1_block_size - 1];
+    marker_idx2_end = uljj + marker_idx2_base;
+    marker_idx2_maxw = round_up_pow2(uljj, 4);
     if (marker_idx2_maxw > orig_marker_ctm8) {
       marker_idx2_maxw = orig_marker_ctm8;
     }
@@ -5459,7 +5469,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	g_ld_xstart2 = uii - marker_idx2;
 	g_ld_xend2 = MINV(xend, marker_idx2 + cur_idx2_block_size) - uii;
       }
-      fill_ulong_zero(g_epi_zmiss2, (cur_idx2_block_size + (BITCT - 1)) / BITCT);
+      fill_ulong_zero(g_epi_zmiss2, BITCT_TO_WORDCT(cur_idx2_block_size));
       for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
 	if (IS_SET(marker_exclude, marker_uidx2)) {
           marker_uidx2 = next_unset_ul_unsafe(marker_exclude, marker_uidx2);
@@ -5474,7 +5484,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	  is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
 	  is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf_raw, loadbuf)) {
 	  goto ld_report_dprime_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
@@ -5487,7 +5497,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	uiptr[1] = popcount_longs(&(ulptr[founder_ctv3]), founder_ctv3);
         uiptr[2] = popcount_longs(&(ulptr[2 * founder_ctv3]), founder_ctv3);
 	if (ulii == 3) {
-	  SET_BIT(g_epi_zmiss2, block_idx2);
+	  SET_BIT(block_idx2, g_epi_zmiss2);
 	}
       }
       g_ld_idx2_block_size = cur_idx2_block_size;
@@ -5564,10 +5574,10 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   uint32_t snp_list_file = ld_modifier & LD_SNP_LIST_FILE;
   uintptr_t marker_ct = g_ld_marker_ct;
   uintptr_t marker_ct1 = marker_ct;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t founder_ct = g_ld_founder_ct;
-  uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
+  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
   uintptr_t founder_ct_192_long = g_ld_founder_ct_mld_m1 * (MULTIPLEX_LD / BITCT2) + g_ld_founder_ct_mld_rem * (192 / BITCT2);
   uintptr_t final_mask = get_final_mask(founder_ct);
   uintptr_t pct = 1;
@@ -5593,7 +5603,7 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   uint32_t window_lead_ct = 0;
   uint32_t chrom_last = 0;
   int32_t retval = 0;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   unsigned char* overflow_buf;
   uint32_t* id_map;
   char* sorted_ids;
@@ -5630,14 +5640,14 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   uint32_t is_last_block;
   uint32_t uii;
   int32_t ii;
-  if (wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+  if (bigstack_alloc_uc(262144, &overflow_buf)) {
     goto ld_report_regular_ret_NOMEM;
   }
   if (idx1_subset) {
-    if (wkspace_alloc_ul_checked(&marker_exclude_idx1, unfiltered_marker_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude_idx1)) {
       goto ld_report_regular_ret_NOMEM;
     }
-    fill_all_bits(marker_exclude_idx1, unfiltered_marker_ct);
+    fill_all_bits(unfiltered_marker_ct, marker_exclude_idx1);
     marker_uidx1 = next_unset_unsafe(marker_exclude, 0);
     if (ldip->snpstr && (!snp_list_file)) {
       bufptr = ldip->snpstr;
@@ -5654,32 +5664,32 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
       if (marker_idx1 == marker_ct) {
 	goto ld_report_regular_ret_EMPTY_SET1;
       }
-      clear_bit_ul(marker_exclude_idx1, marker_uidx1);
+      clear_bit_ul(marker_uidx1, marker_exclude_idx1);
       marker_ct1 = 1;
     } else {
       marker_ct1 = 0;
-      retval = sort_item_ids(&sorted_ids, &id_map, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref);
+      retval = sort_item_ids(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, &sorted_ids, &id_map);
       if (retval) {
 	goto ld_report_regular_ret_1;
       }
       if (snp_list_file) {
-        if (fopen_checked(&infile, ldip->snpstr, "rb")) {
+        if (fopen_checked(ldip->snpstr, FOPEN_RB, &infile)) {
 	  goto ld_report_regular_ret_OPEN_FAIL;
 	}
 	snplist_ct = 0;
 	max_snplist_id_len = 0;
-	retval = scan_token_ct_len(infile, tbuf, MAXLINELEN, &snplist_ct, &max_snplist_id_len);
+	retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &snplist_ct, &max_snplist_id_len);
 	if (retval) {
 	  goto ld_report_regular_ret_1;
 	}
 	if (!snplist_ct) {
 	  goto ld_report_regular_ret_EMPTY_SET1;
 	}
-	if (wkspace_alloc_c_checked(&bufptr, snplist_ct * max_snplist_id_len)) {
+	if (bigstack_alloc_c(snplist_ct * max_snplist_id_len, &bufptr)) {
 	  goto ld_report_regular_ret_NOMEM;
 	}
 	rewind(infile);
-	retval = read_tokens(infile, tbuf, MAXLINELEN, snplist_ct, max_snplist_id_len, bufptr);
+	retval = read_tokens(MAXLINELEN, snplist_ct, max_snplist_id_len, infile, g_textbuf, bufptr);
 	if (retval) {
 	  goto ld_report_regular_ret_1;
 	}
@@ -5694,19 +5704,19 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
 	      logerrprint("Error: Duplicate variant ID in --ld-snp-list file.\n");
 	      goto ld_report_regular_ret_INVALID_FORMAT;
 	    }
-            clear_bit(marker_exclude_idx1, uii);
+            clear_bit(uii, marker_exclude_idx1);
             marker_ct1++;
 	  }
 	}
       } else {
-        retval = string_range_list_to_bitfield2(sorted_ids, id_map, marker_ct, max_marker_id_len, &(ldip->snps_rl), "ld-snps", marker_exclude_idx1);
-        bitfield_or(marker_exclude_idx1, marker_exclude, unfiltered_marker_ctl);
+        retval = string_range_list_to_bitarr2(sorted_ids, id_map, marker_ct, max_marker_id_len, &(ldip->snps_rl), "ld-snps", marker_exclude_idx1);
+        bitvec_or(marker_exclude, unfiltered_marker_ctl, marker_exclude_idx1);
         marker_ct1 = marker_ct - popcount_longs(marker_exclude_idx1, unfiltered_marker_ctl);
       }
       if (!marker_ct1) {
 	goto ld_report_regular_ret_EMPTY_SET1;
       }
-      wkspace_reset(id_map);
+      bigstack_reset(id_map);
     }
   }
   if ((parallel_tot > 1) && (marker_ct1 < 2 * parallel_tot)) {
@@ -5743,11 +5753,11 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   if (ld_modifier & LD_DPRIME) {
     // this is more like --fast-epistasis under the hood, since it requires the
     // entire 3x3 table
-    g_ld_marker_ctm8 = (marker_idx2_maxw + 3) & (~(3 * ONELU));
+    g_ld_marker_ctm8 = round_up_pow2(marker_idx2_maxw, 4);
     retval = ld_report_dprime(threads, ldip, bedfile, bed_offset, marker_reverse, unfiltered_sample_ct, founder_info, sex_male, founder_include2, founder_male_include2, loadbuf, outname, hh_exists, marker_idx1_start, marker_idx1_end);
     goto ld_report_regular_ret_1;
   }
-  marker_idx2_maxw = (marker_idx2_maxw + 7) & (~(7 * ONELU));
+  marker_idx2_maxw = round_up_pow2(marker_idx2_maxw, 8);
   orig_marker_ctm8 = marker_idx2_maxw;
   g_ld_marker_ctm8 = marker_idx2_maxw;
   g_ld_keep_sign = 1;
@@ -5759,7 +5769,7 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   // + marker_idx2_maxw * sizeof(double) for g_ld_results buffer
   // round down to multiple of thread_ct for better workload distribution
   ulii = founder_ct_192_long * 2 * sizeof(intptr_t) + 3 * sizeof(int32_t) + marker_idx2_maxw * sizeof(double);
-  idx1_block_size = wkspace_left / (ulii * 2);
+  idx1_block_size = bigstack_left() / (ulii * 2);
   thread_workload = idx1_block_size / thread_ct;
   if (!thread_workload) {
     goto ld_report_regular_ret_NOMEM;
@@ -5768,30 +5778,30 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   if (idx1_block_size > job_size) {
     idx1_block_size = job_size;
   }
-  g_ld_geno1 = (uintptr_t*)wkspace_alloc(founder_ct_192_long * idx1_block_size * sizeof(intptr_t));
-  g_ld_geno_masks1 = (uintptr_t*)wkspace_alloc(founder_ct_192_long * idx1_block_size * sizeof(intptr_t));
-  g_ld_missing_cts1 = (uint32_t*)wkspace_alloc(idx1_block_size * sizeof(int32_t));
-  g_ld_interval1 = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
-  if (wkspace_alloc_d_checked(&g_ld_results, marker_idx2_maxw * idx1_block_size * sizeof(double))) {
+  bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno1);
+  bigstack_alloc_ul(founder_ct_192_long * idx1_block_size, &g_ld_geno_masks1);
+  bigstack_alloc_ui(idx1_block_size, &g_ld_missing_cts1);
+  bigstack_alloc_ui(idx1_block_size * 2, &g_ld_interval1);
+  if (bigstack_alloc_d(marker_idx2_maxw * idx1_block_size, &g_ld_results)) {
     goto ld_report_regular_ret_NOMEM;
   }
 
   ulii -= 2 * sizeof(int32_t) + marker_idx2_maxw * sizeof(double);
-  idx2_block_size = (wkspace_left / ulii) & (~(7 * ONELU));
+  idx2_block_size = (bigstack_left() / ulii) & (~(7 * ONELU));
   if (idx2_block_size > marker_ct) {
-    idx2_block_size = (marker_ct + 7) & (~(7 * ONELU));
+    idx2_block_size = round_up_pow2(marker_ct, 8);
   }
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
   while (1) {
     if (!idx2_block_size) {
       goto ld_report_regular_ret_NOMEM;
     }
-    if (!(wkspace_alloc_ul_checked(&g_ld_geno2, founder_ct_192_long * idx2_block_size * sizeof(intptr_t)) ||
-          wkspace_alloc_ul_checked(&g_ld_geno_masks2, founder_ct_192_long * idx2_block_size * sizeof(intptr_t)) ||
-          wkspace_alloc_ui_checked(&g_ld_missing_cts2, idx2_block_size * sizeof(int32_t)))) {
+    if (!(bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno2) ||
+          bigstack_alloc_ul(founder_ct_192_long * idx2_block_size, &g_ld_geno_masks2) ||
+          bigstack_alloc_ui(idx2_block_size, &g_ld_missing_cts2))) {
       break;
     }
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
     idx2_block_size -= 8;
   }
   uljj = founder_trail_ct + 2;
@@ -5807,8 +5817,8 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   if (marker_idx1) {
     marker_uidx1 = jump_forward_unset_unsafe(marker_exclude_idx1, marker_uidx1 + 1, marker_idx1);
   }
-  sprintf(logbuf, "--r%s%s%s%s to %s ... ", g_ld_is_r2? "2" : "", is_inter_chr? " inter-chr" : "", g_ld_marker_allele_ptrs? " in-phase" : "", g_ld_set_allele_freqs? " with-freqs" : "", outname);
-  wordwrap(logbuf, 16); // strlen("99% [processing]")
+  sprintf(g_logbuf, "--r%s%s%s%s to %s ... ", g_ld_is_r2? "2" : "", is_inter_chr? " inter-chr" : "", g_ld_marker_allele_ptrs? " in-phase" : "", g_ld_set_allele_freqs? " with-freqs" : "", outname);
+  wordwrapb(16); // strlen("99% [processing]")
   logprintb();
   fputs("0%", stdout);
   while (1) {
@@ -5934,7 +5944,7 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
 	g_ld_interval1[block_idx1 * 2 + 1] = marker_ct - marker_idx2_base;
       }
 
-      if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno1[block_idx1 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp), bedfile, loadbuf, &(g_ld_geno1[block_idx1 * founder_ct_192_long]))) {
 	goto ld_report_regular_ret_READ_FAIL;
       }
       if (is_haploid && hh_exists) {
@@ -5948,8 +5958,9 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
     }
 
     cur_idx2_block_size = idx2_block_size;
-    marker_idx2_end = g_ld_interval1[2 * idx1_block_size - 1] + marker_idx2_base;
-    marker_idx2_maxw = (marker_idx2_end + 7 - marker_idx2_base) & (~7);
+    uljj = g_ld_interval1[2 * idx1_block_size - 1];
+    marker_idx2_end = uljj + marker_idx2_base;
+    marker_idx2_maxw = round_up_pow2(uljj, 8);
     if (marker_idx2_maxw > orig_marker_ctm8) {
       marker_idx2_maxw = orig_marker_ctm8;
     }
@@ -5977,7 +5988,7 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
 	  is_x = (((int32_t)chrom_idx2) == chrom_info_ptr->x_code);
 	  is_y = (((int32_t)chrom_idx2) == chrom_info_ptr->y_code);
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno2[block_idx2 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf, &(g_ld_geno2[block_idx2 * founder_ct_192_long]))) {
 	  goto ld_report_regular_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
@@ -6061,8 +6072,8 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
 }
 
 int32_t ld_report(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, double* set_allele_freqs, Chrom_info* chrom_info_ptr, uint32_t* marker_pos, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t parallel_idx, uint32_t  [...]
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_sample_ctv2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
   uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctv2 / 2);
   uintptr_t* founder_include2 = NULL;
   uintptr_t* founder_male_include2 = NULL;
@@ -6102,10 +6113,10 @@ int32_t ld_report(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t be
     logerrprint("Error: Gigantic (over 400k loci) --r/--r2 unfiltered, non-distributed\ncomputation.  Rerun with the 'yes-really' modifier if you are SURE you have\nenough hard drive space and want to do this.\n");
     goto ld_report_ret_INVALID_CMDLINE;
   }
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, founder_info, sex_male, &founder_include2, &founder_male_include2)) {
+  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2)) {
     goto ld_report_ret_NOMEM;
   }
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf)) {
     goto ld_report_ret_NOMEM;
   }
   loadbuf[unfiltered_sample_ctv2 - 2] = 0;
@@ -6117,7 +6128,7 @@ int32_t ld_report(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t be
   }
   if (parallel_tot > 1) {
     *bufptr++ = '.';
-    bufptr = uint32_write(bufptr, parallel_idx + 1);
+    bufptr = uint32toa(parallel_idx + 1, bufptr);
   }
   if (!is_binary) {
     g_ld_delimiter = (ld_modifier & LD_MATRIX_SPACES)? ' ' : '\t';
@@ -6154,19 +6165,19 @@ int32_t ld_report(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintptr_t be
     break;
   }
  ld_report_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
   // Similar to ld_prune() and flipscan().
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl);
-  uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
+  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
   uintptr_t final_mask = get_final_mask(founder_ct);
   uintptr_t marker_idx = 0;
   uintptr_t max_window_size = 1;
@@ -6251,30 +6262,30 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
     logerrprint("Warning: Skipping --show-tags since there are less than two founders.\n(--make-founders may come in handy here.)\n");
     goto show_tags_ret_1;
   }
-  if (wkspace_alloc_ul_checked(&targets, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_marker_ctl, &targets) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
     goto show_tags_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   if (ldip->show_tags_fname) {
     fill_ulong_zero(targets, unfiltered_marker_ctl);
-    retval = sort_item_ids(&sorted_marker_ids, &marker_id_map, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref);
+    retval = sort_item_ids(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, &sorted_marker_ids, &marker_id_map);
     if (retval) {
       goto show_tags_ret_1;
     }
-    if (fopen_checked(&infile, ldip->show_tags_fname, "r")) {
+    if (fopen_checked(ldip->show_tags_fname, "r", &infile)) {
       goto show_tags_ret_OPEN_FAIL;
     }
-    tbuf[MAXLINELEN - 1] = ' ';
+    g_textbuf[MAXLINELEN - 1] = ' ';
     line_idx = 0;
     unrecog_ct = 0;
-    while (fgets(tbuf, MAXLINELEN, infile)) {
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
+      if (!g_textbuf[MAXLINELEN - 1]) {
 	LOGERRPRINTF("Error: Line %" PRIuPTR " of --show-tags file is pathologically long.\n", line_idx);
 	goto show_tags_ret_INVALID_FORMAT;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
@@ -6300,18 +6311,18 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
         LOGERRPRINTF("Error: Duplicate variant ID '%s' in --show-tags file.\n", bufptr);
 	goto show_tags_ret_INVALID_FORMAT;
       }
-      SET_BIT(targets, marker_uidx);
+      SET_BIT(marker_uidx, targets);
     }
     if (fclose_null(&infile)) {
       goto show_tags_ret_READ_FAIL;
     }
-    wkspace_reset((unsigned char*)marker_id_map);
+    bigstack_reset((unsigned char*)marker_id_map);
     target_ct = popcount_longs(targets, unfiltered_marker_ctl);
     if (!target_ct) {
       logerrprint("Error: No recognized variant IDs in --show-tags file.\n");
       goto show_tags_ret_INVALID_FORMAT;
     }
-    if (wkspace_alloc_ul_checked(&final_set, unfiltered_marker_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_marker_ctl, &final_set)) {
       goto show_tags_ret_NOMEM;
     }
     memcpy(final_set, targets, unfiltered_marker_ctl * sizeof(intptr_t));
@@ -6320,10 +6331,10 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
       LOGERRPRINTF("Warning: %" PRIuPTR " unrecognized variant ID%s in --show-tags file.\n", unrecog_ct, (unrecog_ct == 1)? "" : "s");
     }
   } else {
-    bitfield_exclude_to_include(marker_exclude, targets, unfiltered_marker_ct);
+    bitarr_invert_copy(marker_exclude, unfiltered_marker_ct, targets);
   }
   // force founder_male_include2 allocation
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, founder_info, sex_male, &founder_include2, &founder_male_include2)) {
+  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2)) {
     goto show_tags_ret_NOMEM;
   }
   founder_ct_mld_m1 = (founder_ct - 1) / MULTIPLEX_LD;
@@ -6341,15 +6352,15 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
     max_window_size = chrom_window_max(marker_pos, marker_exclude, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], 0x7fffffff, window_bp * 2, max_window_size);
   }
-  max_window_ctl = (max_window_size + (BITCT - 1)) / BITCT;
+  max_window_ctl = BITCT_TO_WORDCT(max_window_size);
   max_window_ctal = max_window_ctl * BITCT;
-  if (wkspace_alloc_ui_checked(&window_uidxs, max_window_size * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&window_cidx_starts, max_window_size * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&missing_cts, max_window_size * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&geno, max_window_size * founder_ct_192_long * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&geno_masks, max_window_size * founder_ct_192_long * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&cur_targets, max_window_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&tag_matrix, max_window_size * max_window_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ui(max_window_size, &window_uidxs) ||
+      bigstack_alloc_ui(max_window_size, &window_cidx_starts) ||
+      bigstack_alloc_ui(max_window_size, &missing_cts) ||
+      bigstack_alloc_ul(max_window_size * founder_ct_192_long, &geno) ||
+      bigstack_alloc_ul(max_window_size * founder_ct_192_long, &geno_masks) ||
+      bigstack_alloc_ul(max_window_ctl, &cur_targets) ||
+      bigstack_alloc_ul(max_window_size * max_window_ctl, &tag_matrix)) {
     goto show_tags_ret_NOMEM;
   }
   uii = 2 + founder_ct_192_long - founder_ctl * 2;
@@ -6360,11 +6371,11 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
 
   if (tags_list) {
     memcpy(outname_end, ".tags.list", 11);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto show_tags_ret_WRITE_FAIL;
     }
-    sprintf(tbuf, "%%%us  CHR         BP NTAG       LEFT      RIGHT   KBSPAN TAGS\n", plink_maxsnp);
-    fprintf(outfile, tbuf, "SNP");
+    sprintf(g_textbuf, "%%%us  CHR         BP NTAG       LEFT      RIGHT   KBSPAN TAGS\n", plink_maxsnp);
+    fprintf(outfile, g_textbuf, "SNP");
   }
   printf("--show-tags%s: 0%%", final_set? "" : " all");
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
@@ -6376,7 +6387,7 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
       marker_idx += chrom_marker_ct;
       continue;
     }
-    chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, chrom_idx, &chrom_name_len);
+    chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, chrom_idx, &chrom_name_len, chrom_name_buf);
     is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
     is_x = (chrom_idx == ((uint32_t)chrom_info_ptr->x_code));
     is_y = (chrom_idx == ((uint32_t)chrom_info_ptr->y_code));
@@ -6393,9 +6404,9 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
       window_uidxs[window_cidx] = marker_uidx;
       is_target = IS_SET(targets, marker_uidx);
       if (is_target) {
-	SET_BIT(cur_targets, window_cidx);
+	SET_BIT(window_cidx, cur_targets);
       } else {
-	CLEAR_BIT(cur_targets, window_cidx);
+	CLEAR_BIT(window_cidx, cur_targets);
       }
 
       // circular index of beginning of window starting at current marker
@@ -6403,7 +6414,7 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
       geno_fixed_vec_ptr = &(geno[window_cidx * founder_ct_192_long]);
       mask_fixed_vec_ptr = &(geno_masks[window_cidx * founder_ct_192_long]);
       fill_ulong_zero(&(tag_matrix[window_cidx * max_window_ctl]), max_window_ctl);
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, geno_fixed_vec_ptr, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, geno_fixed_vec_ptr)) {
         goto show_tags_ret_READ_FAIL;
       }
       if (is_haploid && hh_exists) {
@@ -6435,8 +6446,8 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
 	    cov12 = dp_result[0] * non_missing_ctd - dxx * dyy;
 	    dxx = (dp_result[3] * non_missing_ctd + dxx * dxx) * (dp_result[4] * non_missing_ctd + dyy * dyy);
 	    if (cov12 * cov12 > dxx * tag_thresh) {
-	      set_bit_ul(tag_matrix, window_cidx * max_window_ctal + window_cidx3);
-	      set_bit_ul(tag_matrix, window_cidx3 * max_window_ctal + window_cidx);
+	      set_bit_ul(window_cidx * max_window_ctal + window_cidx3, tag_matrix);
+	      set_bit_ul(window_cidx3 * max_window_ctal + window_cidx, tag_matrix);
 	    }
 	  }
 	}
@@ -6472,13 +6483,13 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
 	  max_bp = marker_pos[marker_uidx2];
 	  window_cidx3 = window_cidx_starts[window_cidx2];
 	  for (uii = 0; uii < tag_ct; uii++, window_cidx3++) {
-	    next_set_ul_ck(tag_matrix_row_ptr, &window_cidx3, max_window_size);
+	    next_set_ul_ck(tag_matrix_row_ptr, max_window_size, &window_cidx3);
 	    if (window_cidx3 == max_window_size) {
 	      window_cidx3 = next_set_unsafe(tag_matrix_row_ptr, 0);
 	    }
 	    marker_uidx3 = window_uidxs[window_cidx3];
 	    if (final_set) {
-	      SET_BIT(final_set, marker_uidx3);
+	      SET_BIT(marker_uidx3, final_set);
 	    }
 	    if (tags_list) {
 	      cur_bp = marker_pos[marker_uidx3];
@@ -6490,21 +6501,21 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
 	    }
 	  }
 	  if (tags_list) {
-	    bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), tbuf);
+	    bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), g_textbuf);
 	    *bufptr++ = ' ';
 	    bufptr = memcpyax(bufptr, chrom_name_ptr, chrom_name_len, ' ');
-	    bufptr = uint32_writew10x(bufptr, marker_pos[marker_uidx2], ' ');
-	    bufptr = uint32_writew4x(bufptr, tag_ct, ' ');
-	    bufptr = uint32_writew10x(bufptr, min_bp, ' ');
-	    bufptr = uint32_writew10x(bufptr, max_bp, ' ');
-	    bufptr = width_force(8, bufptr, double_g_write(bufptr, ((int32_t)(max_bp - min_bp + 1)) * 0.001));
+	    bufptr = uint32toa_w10x(marker_pos[marker_uidx2], ' ', bufptr);
+	    bufptr = uint32toa_w4x(tag_ct, ' ', bufptr);
+	    bufptr = uint32toa_w10x(min_bp, ' ', bufptr);
+	    bufptr = uint32toa_w10x(max_bp, ' ', bufptr);
+	    bufptr = width_force(8, bufptr, dtoa_g(((int32_t)(max_bp - min_bp + 1)) * 0.001, bufptr));
 	    *bufptr++ = ' ';
-	    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	      goto show_tags_ret_WRITE_FAIL;
 	    }
 	    window_cidx3 = window_cidx_starts[window_cidx2];
 	    for (uii = 0; uii < tag_ct; uii++, window_cidx3++) {
-	      next_set_ul_ck(tag_matrix_row_ptr, &window_cidx3, max_window_size);
+	      next_set_ul_ck(tag_matrix_row_ptr, max_window_size, &window_cidx3);
 	      if (window_cidx3 == max_window_size) {
 		window_cidx3 = next_set_unsafe(tag_matrix_row_ptr, 0);
 	      }
@@ -6554,7 +6565,7 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
   }
   if (final_set) {
     memcpy(outname_end, ".tags", 6);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto show_tags_ret_OPEN_FAIL;
     }
     if (!twocolumn) {
@@ -6563,7 +6574,7 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
 	fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
 	putc('\n', outfile);
 	marker_uidx++;
-	next_set_ul_ck(final_set, &marker_uidx, unfiltered_marker_ct);
+	next_set_ul_ck(final_set, unfiltered_marker_ct, &marker_uidx);
       }
     } else {
       for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
@@ -6602,7 +6613,7 @@ int32_t show_tags(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t
     break;
   }
  show_tags_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(infile);
   fclose_cond(outfile);
   return retval;
@@ -6933,11 +6944,11 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
   // MAF < 0.05 markers have a minor effect on PLINK 1.07 --blocks's behavior
   // when present, while Haploview completely ignores them.  We replicate
   // Haploview's behavior.
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   FILE* outfile = NULL;
   FILE* outfile_det = NULL;
   // circular.  [2n] = numStrong, [2n+1] = numRec
@@ -6982,7 +6993,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
   uintptr_t* index_data;
   uintptr_t* window_data;
   uintptr_t* window_data_ptr;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uint32_t* block_uidxs;
   uint32_t* forward_block_sizes;
   uint32_t* candidate_pairs;
@@ -7046,15 +7057,15 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
   //    secondarily by start uidx.
   // 4. Greedily construct blocks from the sorted list (i.e. form largest
   //    blocks first).
-  if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&in_haploblock, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &founder_pnm) ||
+      bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude) ||
+      bigstack_alloc_ul(unfiltered_marker_ctl, &in_haploblock) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw)) {
     goto haploview_blocks_ret_NOMEM;
   }
   memcpy(founder_pnm, founder_info, unfiltered_sample_ctl * sizeof(intptr_t));
   if (!no_pheno_req) {
-    bitfield_and(founder_pnm, pheno_nm, unfiltered_sample_ctl);
+    bitvec_and(pheno_nm, unfiltered_sample_ctl, founder_pnm);
   }
   founder_ct = popcount_longs(founder_pnm, unfiltered_sample_ctl);
   if (founder_ct < 2) {
@@ -7074,7 +7085,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
       next_unset_ul_unsafe_ck(marker_exclude_orig, &marker_uidx);
       dxx = set_allele_freqs[marker_uidx];
       if ((dxx < min_maf) || (dxx > max_maf)) {
-	set_bit_ul(marker_exclude, marker_uidx);
+	set_bit_ul(marker_uidx, marker_exclude);
       }
     }
     marker_ct = unfiltered_marker_ct - popcount_longs(marker_exclude, unfiltered_marker_ctl);
@@ -7086,26 +7097,26 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
   pct_thresh = marker_ct / 100;
   fill_ulong_zero(in_haploblock, unfiltered_marker_ctl);
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
-  founder_ctl2 = (founder_ct + (BITCT2 - 1)) / BITCT2;
-  founder_ctv2 = 2 * ((founder_ct + (BITCT - 1)) / BITCT);
-  if (wkspace_alloc_ul_checked(&index_data, 5 * founder_ctv2 * sizeof(intptr_t))) {
+  founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
+  founder_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(founder_ct);
+  if (bigstack_alloc_ul(5 * founder_ctv2, &index_data)) {
     goto haploview_blocks_ret_NOMEM;
   }
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, founder_ct, Y_FIX_NEEDED, 1, founder_info, sex_male, &founder_include2, &founder_male_include2)) {
+  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, Y_FIX_NEEDED, 1, &founder_include2, &founder_male_include2)) {
     goto haploview_blocks_ret_NOMEM;
   }
   memcpy(outname_end, ".blocks.det", 12);
-  if (fopen_checked(&outfile_det, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile_det)) {
     goto haploview_blocks_ret_OPEN_FAIL;
   }
   if (fputs_checked(" CHR          BP1          BP2           KB  NSNPS SNPS\n", outfile_det)) {
     goto haploview_blocks_ret_WRITE_FAIL;
   }
   outname_end[7] = '\0';
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto haploview_blocks_ret_OPEN_FAIL;
   }
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
   fputs("--blocks: 0%", stdout);
   fflush(stdout);
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++, markers_done += cur_marker_ct) {
@@ -7131,7 +7142,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
     is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
     is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
     is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
     // Need to compute full 3x3 count tables, but only for a limited window;
     // more similar to --clump than --fast-epistasis, so we don't bother with
     // precomputing 0-only/1-only/2-only bitfields or multithreading for now.
@@ -7157,9 +7168,9 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
     // detailed information on the most recent blocks) to find all potentially
     // valid blocks in a single pass.  So we can use practically all our memory
     // to track and sort those blocks by bp length.
-    if (wkspace_alloc_ui_checked(&block_uidxs, max_block_size * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&forward_block_sizes, max_block_size * sizeof(int32_t)) ||
-        wkspace_alloc_ul_checked(&window_data, max_block_size * founder_ctv2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ui(max_block_size, &block_uidxs) ||
+        bigstack_alloc_ui(max_block_size, &forward_block_sizes) ||
+        bigstack_alloc_ul(max_block_size * founder_ctv2, &window_data)) {
       goto haploview_blocks_ret_NOMEM;
     }
     if (max_block_size >= 4) {
@@ -7168,7 +7179,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
       //   strong_rec_cts[(block_cidx + delta) * 2 + 1] = numRec
       // for the potential [m - delta, m] block, taking array indices modulo
       // max_block_size * 2.
-      if (wkspace_alloc_ul_checked(&strong_rec_cts, max_block_size * 2 * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(max_block_size * 2, &strong_rec_cts)) {
 	goto haploview_blocks_ret_NOMEM;
       }
     }
@@ -7182,8 +7193,8 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
     block_uidx_first = chrom_start;
     marker_uidx = chrom_start;
     block_pos_first = marker_pos[chrom_start];
-    max_candidates = wkspace_left / (3 * sizeof(int32_t));
-    candidate_pairs = (uint32_t*)wkspace_alloc(max_candidates * 3 * sizeof(int32_t));
+    max_candidates = bigstack_left() / (3 * sizeof(int32_t));
+    bigstack_alloc_ui(max_candidates * 3, &candidate_pairs);
     candidate_ct = 0;
     cur_block_size = 0;
     fill_uint_zero(recent_ci_types, 3);
@@ -7207,7 +7218,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
 	}
       }
       block_uidxs[block_cidx] = marker_uidx;
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, window_data_ptr, founder_ct, founder_pnm, final_mask, 0)) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_pnm, final_mask, 0, bedfile, loadbuf_raw, window_data_ptr)) {
 	goto haploview_blocks_ret_READ_FAIL;
       }
       if (is_haploid) {
@@ -7244,7 +7255,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
       }
       while (marker_pos_thresh >= marker_pos[forward_scan_uidx]) {
 	uii = forward_scan_uidx + 1;
-	next_unset_ck(marker_exclude, &uii, chrom_end);
+	next_unset_ck(marker_exclude, chrom_end, &uii);
 	if (uii == chrom_end) {
 	  break;
 	}
@@ -7298,17 +7309,17 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
           prev_strong = strong_rec_cts[block_cidx2 * 2];
 	}
 	window_data_ptr = &(window_data[block_cidx2 * founder_ctv2]);
-	vec_3freq(founder_ctl2, window_data_ptr, index_data, &(counts[0]), &(counts[1]), &(counts[2]));
+	genovec_3freq(window_data_ptr, index_data, founder_ctl2, &(counts[0]), &(counts[1]), &(counts[2]));
 	counts[0] = index_tots[0] - counts[0] - counts[1] - counts[2];
-	vec_3freq(founder_ctl2, window_data_ptr, &(index_data[founder_ctv2]), &(counts[3]), &(counts[4]), &(counts[5]));
+	genovec_3freq(window_data_ptr, &(index_data[founder_ctv2]), founder_ctl2, &(counts[3]), &(counts[4]), &(counts[5]));
 	counts[3] = index_tots[1] - counts[3] - counts[4] - counts[5];
-	vec_3freq(founder_ctl2, window_data_ptr, &(index_data[2 * founder_ctv2]), &(counts[6]), &(counts[7]), &(counts[8]));
+	genovec_3freq(window_data_ptr, &(index_data[2 * founder_ctv2]), founder_ctl2, &(counts[6]), &(counts[7]), &(counts[8]));
 	counts[6] = index_tots[2] - counts[6] - counts[7] - counts[8];
 	if (is_x) {
-	  vec_3freq(founder_ctl2, window_data_ptr, &(index_data[3 * founder_ctv2]), &(counts[9]), &(counts[10]), &(counts[11]));
+	  genovec_3freq(window_data_ptr, &(index_data[3 * founder_ctv2]), founder_ctl2, &(counts[9]), &(counts[10]), &(counts[11]));
 	  // counts[10] should always be zero
 	  counts[9] = index_tots[3] - counts[9] - counts[11];
-	  vec_3freq(founder_ctl2, window_data_ptr, &(index_data[4 * founder_ctv2]), &(counts[12]), &(counts[13]), &(counts[14]));
+	  genovec_3freq(window_data_ptr, &(index_data[4 * founder_ctv2]), founder_ctl2, &(counts[12]), &(counts[13]), &(counts[14]));
 	  counts[12] = index_tots[4] - counts[12] - counts[14];
 	}
 	cur_ci_type = haploview_blocks_classify(counts, lowci_max, lowci_min, recomb_highci, strong_highci, strong_lowci, strong_lowci_outer, is_x, recomb_fast_ln_thresh);
@@ -7412,7 +7423,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
       }
       candidate_pairs[2 * ulii] = block_cidx;
       candidate_pairs[2 * ulii + 1] = block_cidx2;
-      fill_bits(in_haploblock, block_cidx, block_cidx2 + 1 - block_cidx);
+      fill_bits(block_cidx, block_cidx2 + 1 - block_cidx, in_haploblock);
       ulii++;
     }
 #ifdef __cplusplus
@@ -7420,20 +7431,20 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
 #else
     qsort(candidate_pairs, ulii, sizeof(int64_t), llcmp);
 #endif
-    wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+    wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
     wptr_start = memseta(wptr_start, 32, 3);
     for (candidate_idx = 0; candidate_idx < ulii; candidate_idx++) {
       putc('*', outfile);
       block_cidx = candidate_pairs[2 * candidate_idx];
       block_cidx2 = candidate_pairs[2 * candidate_idx + 1];
       marker_uidx = block_cidx;
-      wptr = uint32_writew10(wptr_start, marker_pos[block_cidx]);
+      wptr = uint32toa_w10(marker_pos[block_cidx], wptr_start);
       wptr = memseta(wptr, 32, 3);
-      wptr = uint32_writew10x(wptr, marker_pos[block_cidx2], ' ');
-      wptr = width_force(12, wptr, double_g_write(wptr, ((int32_t)(marker_pos[block_cidx2] + 1 - marker_pos[block_cidx])) * 0.001));
+      wptr = uint32toa_w10x(marker_pos[block_cidx2], ' ', wptr);
+      wptr = width_force(12, wptr, dtoa_g(((int32_t)(marker_pos[block_cidx2] + 1 - marker_pos[block_cidx])) * 0.001, wptr));
       *wptr++ = ' ';
-      wptr = uint32_writew6x(wptr, block_cidx2 + 1 - block_cidx - popcount_bit_idx(marker_exclude, block_cidx, block_cidx2), ' ');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile_det)) {
+      wptr = uint32toa_w6x(block_cidx2 + 1 - block_cidx - popcount_bit_idx(marker_exclude, block_cidx, block_cidx2), ' ', wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile_det)) {
 	goto haploview_blocks_ret_WRITE_FAIL;
       }
       for (marker_uidx = block_cidx; marker_uidx <= block_cidx2; marker_uidx++) {
@@ -7483,7 +7494,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
 #endif
   }
  haploview_blocks_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   fclose_cond(outfile_det);
   return retval;
@@ -7492,7 +7503,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
 void twolocus_write_table(FILE* outfile, uint32_t* counts, uint32_t plink_maxsnp, char* mkr1, char* mkr2, char* allele00, char* allele01, char* allele10, char* allele11, uint32_t alen00, uint32_t alen01, uint32_t alen10, uint32_t alen11) {
   // PLINK 1.07's print settings for this function don't handle large numbers
   // well so we break byte-for-byte compatibility.
-  char* bufptr = memseta(tbuf, 32, plink_maxsnp + 14);
+  char* bufptr = memseta(g_textbuf, 32, plink_maxsnp + 14);
   uint32_t* uiptr = counts;
   uint32_t total = 0;
   uint32_t marg_a[4];
@@ -7517,8 +7528,8 @@ void twolocus_write_table(FILE* outfile, uint32_t* counts, uint32_t plink_maxsnp
   }
   tot_recip = 1.0 / ((double)((int32_t)total));
   bufptr = strcpyax(bufptr, mkr2, '\n');
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-  fwrite(tbuf, 1, plink_maxsnp + 7, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
+  fwrite(g_textbuf, 1, plink_maxsnp + 7, outfile);
   if (alen10 < 4) {
     fwrite(spaces, 1, 9 - 2 * alen10, outfile);
   }
@@ -7541,81 +7552,81 @@ void twolocus_write_table(FILE* outfile, uint32_t* counts, uint32_t plink_maxsnp
   fputs(allele11, outfile);
   fputs("        0/0        */*\n", outfile);
 
-  bufptr = fw_strcpy(plink_maxsnp, mkr1, tbuf);
+  bufptr = fw_strcpy(plink_maxsnp, mkr1, g_textbuf);
   *bufptr++ = ' ';
   if (alen00 == 1) {
     bufptr = memseta(bufptr, 32, 2);
   }
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
   fputs(allele00, outfile);
   putc('/', outfile);
   fputs(allele00, outfile);
-  bufptr = tbuf;
+  bufptr = g_textbuf;
   *bufptr++ = ' ';
-  bufptr = uint32_writew10x(bufptr, counts[0], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[2], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[3], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[1], ' ');
-  bufptr = uint32_writew10x(bufptr, marg_a[0], '\n');
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 1);
+  bufptr = uint32toa_w10x(counts[0], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[2], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[3], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[1], ' ', bufptr);
+  bufptr = uint32toa_w10x(marg_a[0], '\n', bufptr);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
+
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
   if (alen00 + alen01 < 4) {
     bufptr = memseta(bufptr, 32, 4 - alen00 - alen01);
   }
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
   fputs(allele00, outfile);
   putc('/', outfile);
   fputs(allele01, outfile);
-  bufptr = tbuf;
+  bufptr = g_textbuf;
   *bufptr++ = ' ';
-  bufptr = uint32_writew10x(bufptr, counts[8], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[10], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[11], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[9], ' ');
-  bufptr = uint32_writew10x(bufptr, marg_a[2], '\n');
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 1);
+  bufptr = uint32toa_w10x(counts[8], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[10], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[11], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[9], ' ', bufptr);
+  bufptr = uint32toa_w10x(marg_a[2], '\n', bufptr);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
+
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
   if (alen01 == 1) {
     bufptr = memseta(bufptr, 32, 2);
   }
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
   fputs(allele01, outfile);
   putc('/', outfile);
   fputs(allele01, outfile);
-  bufptr = tbuf;
+  bufptr = g_textbuf;
   *bufptr++ = ' ';
-  bufptr = uint32_writew10x(bufptr, counts[12], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[14], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[15], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[13], ' ');
-  bufptr = uint32_writew10x(bufptr, marg_a[3], '\n');
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 3);
+  bufptr = uint32toa_w10x(counts[12], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[14], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[15], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[13], ' ', bufptr);
+  bufptr = uint32toa_w10x(marg_a[3], '\n', bufptr);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
+
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
   bufptr = memcpya(bufptr, "0/0 ", 4);
-  bufptr = uint32_writew10x(bufptr, counts[4], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[6], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[7], ' ');
-  bufptr = uint32_writew10x(bufptr, counts[5], ' ');
-  bufptr = uint32_writew10x(bufptr, marg_a[1], '\n');
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 3);
+  bufptr = uint32toa_w10x(counts[4], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[6], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[7], ' ', bufptr);
+  bufptr = uint32toa_w10x(counts[5], ' ', bufptr);
+  bufptr = uint32toa_w10x(marg_a[1], '\n', bufptr);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
+
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
   bufptr = memcpya(bufptr, "*/* ", 4);
-  bufptr = uint32_writew10x(bufptr, marg_b[0], ' ');
-  bufptr = uint32_writew10x(bufptr, marg_b[2], ' ');
-  bufptr = uint32_writew10x(bufptr, marg_b[3], ' ');
-  bufptr = uint32_writew10x(bufptr, marg_b[1], ' ');
-  bufptr = uint32_writew10x(bufptr, total, '\n');
+  bufptr = uint32toa_w10x(marg_b[0], ' ', bufptr);
+  bufptr = uint32toa_w10x(marg_b[2], ' ', bufptr);
+  bufptr = uint32toa_w10x(marg_b[3], ' ', bufptr);
+  bufptr = uint32toa_w10x(marg_b[1], ' ', bufptr);
+  bufptr = uint32toa_w10x(total, '\n', bufptr);
   *bufptr++ = '\n';
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 14);
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 14);
   bufptr = strcpyax(bufptr, mkr2, '\n');
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-  fwrite(tbuf, 1, plink_maxsnp + 9, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
+  fwrite(g_textbuf, 1, plink_maxsnp + 9, outfile);
   fputs(allele10, outfile);
   putc('/', outfile);
   fputs(allele10, outfile);
@@ -7638,106 +7649,106 @@ void twolocus_write_table(FILE* outfile, uint32_t* counts, uint32_t plink_maxsnp
   }
   fputs(" 0/0        */*\n", outfile);
 
-  bufptr = fw_strcpy(plink_maxsnp, mkr1, tbuf);
+  bufptr = fw_strcpy(plink_maxsnp, mkr1, g_textbuf);
   *bufptr++ = ' ';
   if (alen00 == 1) {
     bufptr = memseta(bufptr, 32, 2);
   }
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
   fputs(allele00, outfile);
   putc('/', outfile);
   fputs(allele00, outfile);
-  bufptr = memseta(tbuf, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[0]) * tot_recip);
+  bufptr = memseta(g_textbuf, 32, 2);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[0]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[2]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[2]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[3]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[3]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[1]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[1]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_clipped(bufptr, ((int32_t)marg_a[0]) * tot_recip);
+  bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[0]) * tot_recip, bufptr);
   *bufptr++ = '\n';
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 1);
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
   if (alen00 + alen01 < 4) {
     bufptr = memseta(bufptr, 32, 4 - alen00 - alen01);
   }
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
   fputs(allele00, outfile);
   putc('/', outfile);
   fputs(allele01, outfile);
-  bufptr = memseta(tbuf, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[8]) * tot_recip);
+  bufptr = memseta(g_textbuf, 32, 2);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[8]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[10]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[10]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[11]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[11]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[9]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[9]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_clipped(bufptr, ((int32_t)marg_a[2]) * tot_recip);
+  bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[2]) * tot_recip, bufptr);
   *bufptr++ = '\n';
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 1);
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 1);
   if (alen01 == 1) {
     bufptr = memseta(bufptr, 32, 2);
   }
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
   fputs(allele01, outfile);
   putc('/', outfile);
   fputs(allele01, outfile);
-  bufptr = memseta(tbuf, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[12]) * tot_recip);
+  bufptr = memseta(g_textbuf, 32, 2);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[12]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[14]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[14]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[15]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[15]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[13]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[13]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_clipped(bufptr, ((int32_t)marg_a[3]) * tot_recip);
+  bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[3]) * tot_recip, bufptr);
   *bufptr++ = '\n';
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 3);
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
   bufptr = memcpya(bufptr, "0/0  ", 5);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[4]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[4]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[6]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[6]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[7]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[7]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)counts[5]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)counts[5]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_clipped(bufptr, ((int32_t)marg_a[1]) * tot_recip);
+  bufptr = dtoa_f_w9p6_clipped(((int32_t)marg_a[1]) * tot_recip, bufptr);
   *bufptr++ = '\n';
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 
-  bufptr = memseta(tbuf, 32, plink_maxsnp + 3);
+  bufptr = memseta(g_textbuf, 32, plink_maxsnp + 3);
   bufptr = memcpya(bufptr, "*/*  ", 5);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)marg_b[0]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[0]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)marg_b[2]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[2]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)marg_b[3]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[3]) * tot_recip, bufptr);
   bufptr = memseta(bufptr, 32, 2);
-  bufptr = double_f_writew96_spaced(bufptr, ((int32_t)marg_b[1]) * tot_recip);
+  bufptr = dtoa_f_w9p6_spaced(((int32_t)marg_b[1]) * tot_recip, bufptr);
   bufptr = memcpya(bufptr, "   1\n\n", 6);
-  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+  fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 }
 
 int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t pheno_ctrl_ct, uintptr_t* pheno_c, uintptr_t* sex_male,  [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   char* mkr1 = outname? epi_ip->twolocus_mkr1 : epi_ip->ld_mkr1;
   char* mkr2 = outname? epi_ip->twolocus_mkr2 : epi_ip->ld_mkr2;
   uintptr_t* sample_include2 = NULL;
   uintptr_t* sample_male_include2 = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t ulii = strlen(mkr1) + 1;
   uintptr_t uljj = strlen(mkr2) + 1;
   uint32_t hwe_midp = epi_ip->modifier & EPI_HWE_MIDP;
@@ -7783,20 +7794,20 @@ int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_
   uint32_t alen11;
   uint32_t count_total;
   if (!outname) {
-    ulkk = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+    ulkk = BITCT_TO_WORDCT(unfiltered_sample_ct);
     // ulkk = (unfiltered_sample_ctl2 + 1) / 2;
     sample_ct = popcount_longs(sample_exclude, ulkk);
     if (!sample_ct) {
       logerrprint("Warning: Skipping --ld since there are no founders.  (--make-founders may come\nin handy here.)\n");
       goto twolocus_ret_1;
     }
-    if (wkspace_alloc_ul_checked(&loadbuf_raw, ulkk * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(ulkk, &loadbuf_raw)) {
       goto twolocus_ret_NOMEM;
     }
-    bitfield_exclude_to_include(sample_exclude, loadbuf_raw, unfiltered_sample_ct);
+    bitarr_invert_copy(sample_exclude, unfiltered_sample_ct, loadbuf_raw);
     sample_exclude = loadbuf_raw;
   }
-  sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
+  sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   final_mask = get_final_mask(sample_ct);
   if ((ulii > max_marker_id_len) || (uljj > max_marker_id_len)) {
     goto twolocus_ret_MARKER_NOT_FOUND;
@@ -7823,15 +7834,15 @@ int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_
   if (marker_idx == marker_ct) {
     goto twolocus_ret_MARKER_NOT_FOUND;
   }  
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbufs[0], sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbufs[1], sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(sample_ctl2, &loadbufs[0]) ||
+      bigstack_alloc_ul(sample_ctl2, &loadbufs[1])) {
     goto twolocus_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   loadbufs[0][sample_ctl2 - 1] = 0;
   loadbufs[1][sample_ctl2 - 1] = 0;
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_ct, hh_exists, 0, sample_exclude, sex_male, &sample_include2, &sample_male_include2)) {
+  if (alloc_collapsed_haploid_filters(sample_exclude, sex_male, unfiltered_sample_ct, sample_ct, hh_exists, 0, &sample_include2, &sample_male_include2)) {
     goto twolocus_ret_NOMEM;
   }
   is_haploid[0] = 0;
@@ -7843,7 +7854,7 @@ int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_
     if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
       goto twolocus_ret_READ_FAIL;
     }
-    if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbufs[marker_idx], sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+    if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbufs[marker_idx])) {
       goto twolocus_ret_READ_FAIL;
     }
     chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx);
@@ -7911,7 +7922,7 @@ int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_
   alen11 = strlen(marker_allele_ptrs[2 * marker_uidxs[1] + 1]);
   if (outname) {
     memcpy(outname_end, ".twolocus", 10);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto twolocus_ret_OPEN_FAIL;
     }
     fputs("\nAll individuals\n===============\n", outfile);
@@ -8057,69 +8068,69 @@ int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_
       if (fabs(dxx) < SMALL_EPSILON) {
 	dxx = 0;
       }
-      bufptr = memcpya(logbuf, "   R-sq = ", 10);
-      bufptr2 = double_g_write(bufptr, dxx * dxx / (freq1x * freqx1 * freq2x * freqx2));
+      bufptr = memcpya(g_logbuf, "   R-sq = ", 10);
+      bufptr2 = dtoa_g(dxx * dxx / (freq1x * freqx1 * freq2x * freqx2), bufptr);
       // assumes bufptr2 - bufptr < 15
       bufptr = memseta(bufptr2, 32, 15 - ((uintptr_t)(bufptr2 - bufptr)));
       bufptr = memcpya(bufptr, "D' = ", 5);
       if (dxx >= 0) {
-	bufptr = double_g_write(bufptr, dxx / MINV(freqx1 * freq2x, freqx2 * freq1x));
+	bufptr = dtoa_g(dxx / MINV(freqx1 * freq2x, freqx2 * freq1x), bufptr);
       } else {
-	bufptr = double_g_write(bufptr, -dxx / MINV(freqx1 * freq1x, freqx2 * freq2x));
+	bufptr = dtoa_g(-dxx / MINV(freqx1 * freq1x, freqx2 * freq2x), bufptr);
       }
       bufptr = memcpya(bufptr, "\n\n", 3);
       logprintb();
       logprint("   Haplotype     Frequency    Expectation under LE\n");
       logprint("   ---------     ---------    --------------------\n");
-      bufptr = memseta(logbuf, 32, 3);
+      bufptr = memseta(g_logbuf, 32, 3);
       if (alen00 + alen10 < 9) {
 	bufptr = memseta(bufptr, 32, 9 - alen00 - alen10);
       }
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0]], alen00);
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1]], alen10);
       bufptr = memseta(bufptr, 32, 5);
-      bufptr = double_f_writew96_spaced(bufptr, freq11 + solutions[ulkk]);
+      bufptr = dtoa_f_w9p6_spaced(freq11 + solutions[ulkk], bufptr);
       bufptr = memseta(bufptr, 32, 15);
-      bufptr = double_f_writew96_clipped(bufptr, freqx1 * freq1x);
+      bufptr = dtoa_f_w9p6_clipped(freqx1 * freq1x, bufptr);
       bufptr = memcpya(bufptr, "\n", 2);
       logprintb();
-      bufptr = &(logbuf[3]);
+      bufptr = &(g_logbuf[3]);
       if (alen01 + alen10 < 9) {
 	bufptr = memseta(bufptr, 32, 9 - alen01 - alen10);
       }
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0] + 1], alen01);
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1]], alen10);
       bufptr = memseta(bufptr, 32, 5);
-      bufptr = double_f_writew96_spaced(bufptr, freq21 + half_hethet_share - solutions[ulkk]);
+      bufptr = dtoa_f_w9p6_spaced(freq21 + half_hethet_share - solutions[ulkk], bufptr);
       bufptr = memseta(bufptr, 32, 15);
-      bufptr = double_f_writew96_clipped(bufptr, freqx1 * freq2x);
+      bufptr = dtoa_f_w9p6_clipped(freqx1 * freq2x, bufptr);
       bufptr = memcpya(bufptr, "\n", 2);
       logprintb();
-      bufptr = &(logbuf[3]);
+      bufptr = &(g_logbuf[3]);
       if (alen00 + alen11 < 9) {
 	bufptr = memseta(bufptr, 32, 9 - alen00 - alen11);
       }
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0]], alen00);
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen11);
       bufptr = memseta(bufptr, 32, 5);
-      bufptr = double_f_writew96_spaced(bufptr, freq12 + half_hethet_share - solutions[ulkk]);
+      bufptr = dtoa_f_w9p6_spaced(freq12 + half_hethet_share - solutions[ulkk], bufptr);
       bufptr = memseta(bufptr, 32, 15);
-      bufptr = double_f_writew96_clipped(bufptr, freqx2 * freq1x);
+      bufptr = dtoa_f_w9p6_clipped(freqx2 * freq1x, bufptr);
       bufptr = memcpya(bufptr, "\n", 2);
       logprintb();
-      bufptr = &(logbuf[3]);
+      bufptr = &(g_logbuf[3]);
       if (alen01 + alen11 < 9) {
 	bufptr = memseta(bufptr, 32, 9 - alen01 - alen11);
       }
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0] + 1], alen01);
       bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[1] + 1], alen11);
       bufptr = memseta(bufptr, 32, 5);
-      bufptr = double_f_writew96_spaced(bufptr, freq22 + solutions[ulkk]);
+      bufptr = dtoa_f_w9p6_spaced(freq22 + solutions[ulkk], bufptr);
       bufptr = memseta(bufptr, 32, 15);
-      bufptr = double_f_writew96_clipped(bufptr, freqx2 * freq2x);
+      bufptr = dtoa_f_w9p6_clipped(freqx2 * freq2x, bufptr);
       bufptr = memcpyl3a(bufptr, "\n\n");
       logprintb();
-      bufptr = &(logbuf[3]);
+      bufptr = &(g_logbuf[3]);
       bufptr = memcpya(bufptr, "In phase alleles are ", 21);
       if (dxx > 0) {
 	bufptr = memcpya(bufptr, marker_allele_ptrs[2 * marker_uidxs[0]], alen00);
@@ -8165,7 +8176,7 @@ int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_
   }
  twolocus_ret_1:
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -8224,7 +8235,7 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   // first two.  So we're able to use variations of the QT --assoc bit hacks.
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
   uintptr_t marker_uidx = marker_uidx_base;
   uintptr_t pct = 1;
@@ -8239,7 +8250,7 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   uint32_t chrom_idx = 0;
   uint32_t chrom_idx2 = 0;
   int32_t retval = 0;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   char* wptr_start;
   char* wptr_start2;
   char* wptr;
@@ -8251,10 +8262,11 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   uint32_t* uiptr3;
   uint32_t* uiptr4;
   uint32_t* uiptr5;
+  uintptr_t cur_bigstack_left;
   uintptr_t cur_workload;
   uintptr_t idx1_block_size;
   uintptr_t idx2_block_size;
-  uintptr_t idx2_block_sizem16;
+  uintptr_t idx2_block_sizea16;
   uintptr_t marker_uidx_tmp;
   uintptr_t block_idx1;
   uintptr_t block_idx2;
@@ -8272,7 +8284,7 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   uint32_t sample_idx;
   uint32_t uii;
   uint32_t ujj;
-  if (wkspace_alloc_d_checked(&pheno_d2, pheno_nm_ct * sizeof(double))) {
+  if (bigstack_alloc_d(pheno_nm_ct, &pheno_d2)) {
     goto epistasis_linear_regression_ret_NOMEM;
   }
   g_epi_pheno_d2 = pheno_d2;
@@ -8304,8 +8316,13 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   //     for other stuff (see epistasis_report() comment, starting from
   //     "offset"; main result buffer must be double-size to store both beta
   //     and chi-square stat)
+  cur_bigstack_left = bigstack_left();
+  ulii = 6 * CACHELINE + max_thread_ct * (5 * (CACHELINE - 4)) - 5 * sizeof(int32_t) - sizeof(double);
+  if (cur_bigstack_left >= ulii) {
+    cur_bigstack_left -= ulii;
+  }
   ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 6 * sizeof(int32_t) + 2 * sizeof(double) + marker_ct2 * 2 * sizeof(double);
-  idx1_block_size = (wkspace_left - 6 * CACHELINE + 5 * sizeof(int32_t) + sizeof(double) - max_thread_ct * (5 * (CACHELINE - 4))) / (ulii * 2 + 1);
+  idx1_block_size = cur_bigstack_left / (ulii * 2 + 1);
   if (!idx1_block_size) {
     goto epistasis_linear_regression_ret_NOMEM;
   }
@@ -8314,17 +8331,17 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   }
   // pad to avoid threads writing to same cacheline
   ulii = (max_thread_ct - 1) * 15 + idx1_block_size;
-  g_epi_geno1_offsets = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
-  g_epi_geno1 = (uintptr_t*)wkspace_alloc(pheno_nm_ctl2 * idx1_block_size * sizeof(intptr_t));
-  g_epi_phenogeno1 = (double*)wkspace_alloc(idx1_block_size * sizeof(double));
+  bigstack_alloc_ui(idx1_block_size * 2, &g_epi_geno1_offsets);
+  bigstack_alloc_ul(pheno_nm_ctl2 * idx1_block_size, &g_epi_geno1);
+  bigstack_alloc_d(idx1_block_size, &g_epi_phenogeno1);
   // may be better to just recompute genosums values in inner loop?  can test
   // this later
-  g_epi_genosums1 = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
-  g_epi_all_chisq = (double*)wkspace_alloc(idx1_block_size * marker_ct2 * 2 * sizeof(double));
-  g_epi_best_chisq1 = (double*)wkspace_alloc(ulii * sizeof(double));
-  g_epi_best_id1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
-  g_epi_n_sig_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
-  g_epi_fail_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  bigstack_alloc_ui(idx1_block_size * 2, &g_epi_genosums1);
+  bigstack_alloc_d(idx1_block_size * marker_ct2 * 2, &g_epi_all_chisq);
+  bigstack_alloc_d(ulii, &g_epi_best_chisq1);
+  bigstack_alloc_ui(ulii, &g_epi_best_id1);
+  bigstack_alloc_ui(ulii, &g_epi_n_sig_ct1);
+  bigstack_alloc_ui(ulii, &g_epi_fail_ct1);
   for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
     g_epi_geno1[block_idx1 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
   }
@@ -8333,46 +8350,46 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   }
 
   ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 2 * sizeof(int32_t) + sizeof(double) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
-  idx2_block_size = (wkspace_left - (3 * CACHELINE - sizeof(intptr_t) - 2 * sizeof(int32_t) - sizeof(double)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(double)))) / ulii;
+  idx2_block_size = (bigstack_left() - (3 * CACHELINE - sizeof(intptr_t) - 2 * sizeof(int32_t) - sizeof(double)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(double)))) / ulii;
   if (idx2_block_size > marker_ct2) {
     idx2_block_size = marker_ct2;
   }
-  idx2_block_size = (idx2_block_size + 15) & (~(15 * ONELU));
+  idx2_block_size = round_up_pow2(idx2_block_size, 16);
 
   memcpy(outname_end, ".epi.qt", 8);
   if (parallel_tot > 1) {
     outname_end[7] = '.';
-    uint32_writex(&(outname_end[8]), parallel_idx + 1, '\0');
+    uint32toa_x(parallel_idx + 1, '\0', &(outname_end[8]));
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto epistasis_linear_regression_ret_OPEN_FAIL;
   }
   if (!parallel_idx) {
-    wptr = memcpya(tbuf, "CHR1 ", 5);
+    wptr = memcpya(g_textbuf, "CHR1 ", 5);
     wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
     wptr = memcpya(wptr, " CHR2 ", 6);
     wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
     wptr = memcpya(wptr, "     BETA_INT         STAT            P \n", 41);
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto epistasis_linear_regression_ret_WRITE_FAIL;
     }
   }
 
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
   while (1) {
     if (!idx2_block_size) {
       goto epistasis_linear_regression_ret_NOMEM;
     }
-    if (!(wkspace_alloc_ul_checked(&g_epi_geno2, pheno_nm_ctl2 * idx2_block_size * sizeof(intptr_t)) ||
-          wkspace_alloc_d_checked(&g_epi_phenogeno2, idx2_block_size * sizeof(double)) ||
-          wkspace_alloc_ui_checked(&g_epi_genosums2, idx2_block_size * 2 * sizeof(int32_t)) ||
-          wkspace_alloc_d_checked(&g_epi_best_chisq2, max_thread_ct * idx2_block_size * sizeof(double)) ||
-          wkspace_alloc_ui_checked(&g_epi_best_id2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&g_epi_n_sig_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&g_epi_fail_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)))) {
+    if (!(bigstack_alloc_ul(pheno_nm_ctl2 * idx2_block_size, &g_epi_geno2) ||
+          bigstack_alloc_d(idx2_block_size, &g_epi_phenogeno2) ||
+          bigstack_alloc_ui(idx2_block_size * 2, &g_epi_genosums2) ||
+          bigstack_alloc_d(max_thread_ct * idx2_block_size, &g_epi_best_chisq2) ||
+          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_best_id2) ||
+          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_n_sig_ct2) ||
+          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_fail_ct2))) {
       break;
     }
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
     idx2_block_size -= 16;
   }
   for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
@@ -8382,10 +8399,10 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
   if (marker_idx1) {
     marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
   }
-  wptr = memcpya(logbuf, "QT --epistasis to ", 18);
+  wptr = memcpya(g_logbuf, "QT --epistasis to ", 18);
   wptr = strcpya(wptr, outname);
   memcpy(wptr, " ... ", 6);
-  wordwrap(logbuf, 16); // strlen("99% [processing]")
+  wordwrapb(16); // strlen("99% [processing]")
   logprintb();
   fputs("0%", stdout);
   do {
@@ -8441,7 +8458,7 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
       }
       uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
       g_epi_idx1_block_bounds[tidx] = block_idx1;
-      g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + ((uii + 15) & (~15));
+      g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + round_up_pow2_ui(uii, 16);
     }
     g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
     chrom_end = 0;
@@ -8452,7 +8469,7 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
           goto epistasis_linear_regression_ret_READ_FAIL;
 	}
       }
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx_tmp))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx_tmp), bedfile, loadbuf_raw, loadbuf)) {
         goto epistasis_linear_regression_ret_READ_FAIL;
       }
       rotate_loadbuf_and_compute_phenogeno(loadbuf, pheno_d2, pheno_nm_ct, &(g_epi_geno1[block_idx1 * pheno_nm_ctl2]), &(g_epi_phenogeno1[block_idx1]), &(g_epi_genosums1[block_idx1 * 2]));
@@ -8489,18 +8506,18 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
             goto epistasis_linear_regression_ret_READ_FAIL;
 	  }
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx2))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf_raw, loadbuf)) {
 	  goto epistasis_linear_regression_ret_READ_FAIL;
 	}
         rotate_loadbuf_and_compute_phenogeno(loadbuf, pheno_d2, pheno_nm_ct, &(g_epi_geno2[block_idx2 * pheno_nm_ctl2]), &(g_epi_phenogeno2[block_idx2]), &(g_epi_genosums2[block_idx2 * 2]));
       }
       g_epi_idx2_block_size = cur_idx2_block_size;
       g_epi_idx2_block_start = marker_idx2;
-      idx2_block_sizem16 = (cur_idx2_block_size + 15) & (~(15 * ONELU));
+      idx2_block_sizea16 = round_up_pow2(cur_idx2_block_size, 16);
       fill_uint_zero(g_epi_n_sig_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
       fill_uint_zero(g_epi_fail_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
-      fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizem16 * max_thread_ct);
-      fill_uint_zero(g_epi_fail_ct2, idx2_block_sizem16 * max_thread_ct);
+      fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizea16 * max_thread_ct);
+      fill_uint_zero(g_epi_fail_ct2, idx2_block_sizea16 * max_thread_ct);
       for (tidx = 0; tidx < max_thread_ct; tidx++) {
         ulii = g_epi_idx1_block_bounds[tidx];
         uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
@@ -8517,7 +8534,7 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
             ulii -= marker_idx2;
 	  }
           uljj = cur_idx2_block_size - ulii;
-	  dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizem16 + ulii]);
+	  dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizea16 + ulii]);
 	  dptr2 = &(g_epi_all_chisq[(marker_idx2 + ulii) * 2]);
           for (ulkk = 0; ulkk < uljj; ulkk++) {
             *dptr++ = dptr2[ulkk * 2];
@@ -8561,10 +8578,10 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
 	  } else {
 	    block_idx2 -= marker_idx2;
 	  }
-	  dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizem16 + block_idx2]);
-	  uiptr = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
-	  uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16 + block_idx2]);
-	  uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16 + block_idx2]);
+	  dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizea16 + block_idx2]);
+	  uiptr = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
+	  uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16 + block_idx2]);
+	  uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16 + block_idx2]);
 	  dptr2 = &(best_chisq[block_idx2 + marker_idx2]);
 	  uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
 	  uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
@@ -8595,7 +8612,7 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
 	chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
 	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
       }
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
       *wptr_start++ = ' ';
@@ -8603,9 +8620,9 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
       for (chrom_fo_idx2 = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
 	chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
 	chrom_end2 = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx2 + 1];
-	wptr_start2 = width_force(4, wptr_start, chrom_name_write(wptr_start, chrom_info_ptr, chrom_idx2));
+	wptr_start2 = width_force(4, wptr_start, chrom_name_write(chrom_info_ptr, chrom_idx2, wptr_start));
 	*wptr_start2++ = ' ';
-	for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, &marker_uidx2, chrom_end2), marker_idx2++, dptr = &(dptr[2])) {
+	for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, chrom_end2, &marker_uidx2), marker_idx2++, dptr = &(dptr[2])) {
 	  if (marker_idx2 == ujj) {
 	    marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
 	    if (marker_idx2 == marker_ct2) {
@@ -8626,14 +8643,14 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
 	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start2);
 	    *wptr++ = ' ';
 	    // beta
-	    wptr = width_force(12, wptr, double_g_write(wptr, dptr[1]));
+	    wptr = width_force(12, wptr, dtoa_g(dptr[1], wptr));
             *wptr++ = ' ';
-	    wptr = width_force(12, wptr, double_g_write(wptr, dxx));
+	    wptr = width_force(12, wptr, dtoa_g(dxx, wptr));
 	    *wptr++ = ' ';
 	    dxx = normdist(-sqrt(dxx)) * 2;
-	    wptr = double_g_writewx4x(wptr, MAXV(dxx, output_min_p), 12, ' ');
+	    wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
 	    *wptr++ = '\n';
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto epistasis_linear_regression_ret_WRITE_FAIL;
 	    }
 	    // could remove this writeback in --epi1 1 case
@@ -8694,8 +8711,8 @@ int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE*
 int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t marker_uidx_base, uintptr_t marker_ct1, uintptr_t* marker_exclude1, uintptr_t marker_idx1_start, uintptr_t marker_idx1_end, uintptr_t marker_ct2, uintptr_t* marker_exclude2, uint32_t is_triangular, uintptr_t job_ [...]
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t pheno_nm_cta4 = (pheno_nm_ct + 3) & (~3);
-  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t pheno_nm_cta4 = round_up_pow2(pheno_nm_ct, 4);
+  uintptr_t pheno_nm_ctl2 = QUATERCT_TO_WORDCT(pheno_nm_ct);
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
   uintptr_t marker_uidx = marker_uidx_base;
   uintptr_t pct = 1;
@@ -8710,7 +8727,7 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
   uint32_t chrom_idx = 0;
   uint32_t chrom_idx2 = 0;
   int32_t retval = 0;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   uintptr_t* ulptr;
   char* wptr_start;
   char* wptr_start2;
@@ -8723,10 +8740,11 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
   uint32_t* uiptr3;
   uint32_t* uiptr4;
   uint32_t* uiptr5;
+  uintptr_t cur_bigstack_left;
   uintptr_t cur_workload;
   uintptr_t idx1_block_size;
   uintptr_t idx2_block_size;
-  uintptr_t idx2_block_sizem16;
+  uintptr_t idx2_block_sizea16;
   uintptr_t marker_uidx_tmp;
   uintptr_t block_idx1;
   uintptr_t block_idx2;
@@ -8743,27 +8761,27 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
   uint32_t is_last_block;
   uint32_t uii;
   uint32_t ujj;
-  if (wkspace_alloc_ul_checked(&g_epi_pheno_c, pheno_nm_ctl2 * sizeof(uintptr_t))) {
+  if (bigstack_alloc_ul(pheno_nm_ctl2, &g_epi_pheno_c)) {
     goto epistasis_logistic_regression_ret_NOMEM;
   }
-  collapse_copy_bitarr_incl(unfiltered_sample_ct, pheno_c, pheno_nm, pheno_nm_ct, g_epi_pheno_c);
+  copy_bitarr_subset(pheno_c, pheno_nm, unfiltered_sample_ct, pheno_nm_ct, g_epi_pheno_c);
   g_epi_pheno_nm_ct = pheno_nm_ct;
   // per-thread buffers
-  g_epi_logistic_mt = (Epi_logistic_multithread*)wkspace_alloc(max_thread_ct * sizeof(Epi_logistic_multithread));
+  g_epi_logistic_mt = (Epi_logistic_multithread*)bigstack_alloc(max_thread_ct * sizeof(Epi_logistic_multithread));
   if (!g_epi_logistic_mt) {
     goto epistasis_logistic_regression_ret_NOMEM;
   }
   // param_ct_max = 4 (intercept, A, B, AB)
   for (tidx = 0; tidx < max_thread_ct; tidx++) {
-    if (wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].cur_covars_cov_major), pheno_nm_cta4 * 4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].coef), 4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].pp), pheno_nm_cta4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].sample_1d_buf), pheno_nm_ct * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].pheno_buf), pheno_nm_ct * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_1d_buf), pheno_nm_ct * 4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_1d_buf2), pheno_nm_ct * sizeof(float)) ||
-	wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_2d_buf), 4 * 4 * sizeof(float)) ||
-        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_2d_buf2), 4 * 4 * sizeof(float))) {
+    if (bigstack_alloc_f(pheno_nm_cta4 * 4, &(g_epi_logistic_mt[tidx].cur_covars_cov_major)) ||
+        bigstack_alloc_f(4, &(g_epi_logistic_mt[tidx].coef)) ||
+        bigstack_alloc_f(pheno_nm_cta4, &(g_epi_logistic_mt[tidx].pp)) ||
+        bigstack_alloc_f(pheno_nm_ct, &(g_epi_logistic_mt[tidx].sample_1d_buf)) ||
+        bigstack_alloc_f(pheno_nm_ct, &(g_epi_logistic_mt[tidx].pheno_buf)) ||
+        bigstack_alloc_f(pheno_nm_ct * 4, &(g_epi_logistic_mt[tidx].param_1d_buf)) ||
+        bigstack_alloc_f(pheno_nm_ct, &(g_epi_logistic_mt[tidx].param_1d_buf2)) ||
+	bigstack_alloc_f(4 * 4, &(g_epi_logistic_mt[tidx].param_2d_buf)) ||
+        bigstack_alloc_f(4 * 4, &(g_epi_logistic_mt[tidx].param_2d_buf2))) {
       goto epistasis_logistic_regression_ret_NOMEM;
     }
   }
@@ -8774,8 +8792,13 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
   //     for other stuff (see epistasis_report() comment, starting from
   //     "offset"; main result buffer must be double-size to store both beta
   //     and chi-square stat)
+  cur_bigstack_left = bigstack_left();
+  ulii = 4 * CACHELINE - 3 * sizeof(int32_t) + max_thread_ct * (5 * (CACHELINE - 4));
+  if (cur_bigstack_left >= ulii) {
+    cur_bigstack_left -= ulii;
+  }
   ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 4 * sizeof(int32_t) + sizeof(float) + marker_ct2 * 2 * sizeof(float);
-  idx1_block_size = (wkspace_left - 4 * CACHELINE + 3 * sizeof(int32_t) - max_thread_ct * (5 * (CACHELINE - 4))) / (ulii * 2 + 1);
+  idx1_block_size = cur_bigstack_left / (ulii * 2 + 1);
   if (!idx1_block_size) {
     goto epistasis_logistic_regression_ret_NOMEM;
   }
@@ -8784,13 +8807,13 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
   }
   // pad to avoid threads writing to same cacheline
   ulii = (max_thread_ct - 1) * 15 + idx1_block_size;
-  g_epi_geno1_offsets = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
-  g_epi_geno1 = (uintptr_t*)wkspace_alloc(pheno_nm_ctl2 * idx1_block_size * sizeof(intptr_t));
-  g_epi_all_chisq_f = (float*)wkspace_alloc(idx1_block_size * marker_ct2 * 2 * sizeof(float));
-  g_epi_best_chisq_f1 = (float*)wkspace_alloc(ulii * sizeof(float));
-  g_epi_best_id1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
-  g_epi_n_sig_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
-  g_epi_fail_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  bigstack_alloc_ui(idx1_block_size * 2, &g_epi_geno1_offsets);
+  bigstack_alloc_ul(pheno_nm_ctl2 * idx1_block_size, &g_epi_geno1);
+  bigstack_alloc_f(idx1_block_size * marker_ct2 * 2, &g_epi_all_chisq_f);
+  bigstack_alloc_f(ulii, &g_epi_best_chisq_f1);
+  bigstack_alloc_ui(ulii, &g_epi_best_id1);
+  bigstack_alloc_ui(ulii, &g_epi_n_sig_ct1);
+  bigstack_alloc_ui(ulii, &g_epi_fail_ct1);
   for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
     g_epi_geno1[block_idx1 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
   }
@@ -8799,44 +8822,44 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
   }
 
   ulii = pheno_nm_ctl2 * sizeof(intptr_t) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
-  idx2_block_size = (wkspace_left - (CACHELINE - sizeof(intptr_t)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(float)))) / ulii;
+  idx2_block_size = (bigstack_left() - (CACHELINE - sizeof(intptr_t)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(float)))) / ulii;
   if (idx2_block_size > marker_ct2) {
     idx2_block_size = marker_ct2;
   }
-  idx2_block_size = (idx2_block_size + 15) & (~(15 * ONELU));
+  idx2_block_size = round_up_pow2(idx2_block_size, 16);
 
   memcpy(outname_end, ".epi.cc", 8);
   if (parallel_tot > 1) {
     outname_end[7] = '.';
-    uint32_writex(&(outname_end[8]), parallel_idx + 1, '\0');
+    uint32toa_x(parallel_idx + 1, '\0', &(outname_end[8]));
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto epistasis_logistic_regression_ret_OPEN_FAIL;
   }
   if (!parallel_idx) {
-    wptr = memcpya(tbuf, "CHR1 ", 5);
+    wptr = memcpya(g_textbuf, "CHR1 ", 5);
     wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
     wptr = memcpya(wptr, " CHR2 ", 6);
     wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
     wptr = memcpya(wptr, "       OR_INT         STAT            P \n", 41);
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto epistasis_logistic_regression_ret_WRITE_FAIL;
     }
   }
 
-  wkspace_mark2 = wkspace_base;
+  bigstack_mark2 = g_bigstack_base;
   while (1) {
     if (!idx2_block_size) {
       goto epistasis_logistic_regression_ret_NOMEM;
     }
-    if (!(wkspace_alloc_ul_checked(&g_epi_geno2, pheno_nm_ctl2 * idx2_block_size * sizeof(intptr_t)) ||
-          wkspace_alloc_f_checked(&g_epi_best_chisq_f2, max_thread_ct * idx2_block_size * sizeof(float)) ||
-          wkspace_alloc_ui_checked(&g_epi_best_id2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&g_epi_n_sig_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
-          wkspace_alloc_ui_checked(&g_epi_fail_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)))) {
+    if (!(bigstack_alloc_ul(pheno_nm_ctl2 * idx2_block_size, &g_epi_geno2) ||
+          bigstack_alloc_f(max_thread_ct * idx2_block_size, &g_epi_best_chisq_f2) ||
+          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_best_id2) ||
+          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_n_sig_ct2) ||
+          bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_fail_ct2))) {
       break;
     }
-    wkspace_reset(wkspace_mark2);
+    bigstack_reset(bigstack_mark2);
     idx2_block_size -= 16;
   }
   for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
@@ -8846,10 +8869,10 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
   if (marker_idx1) {
     marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
   }
-  wptr = memcpya(logbuf, "C/C --epistasis to ", 19);
+  wptr = memcpya(g_logbuf, "C/C --epistasis to ", 19);
   wptr = strcpya(wptr, outname);
   memcpy(wptr, " ... ", 6);
-  wordwrap(logbuf, 16); // strlen("99% [processing]")
+  wordwrapb(16); // strlen("99% [processing]")
   logprintb();
   fputs("0%", stdout);
   do {
@@ -8905,7 +8928,7 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
       }
       uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
       g_epi_idx1_block_bounds[tidx] = block_idx1;
-      g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + ((uii + 15) & (~15));
+      g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + round_up_pow2_ui(uii, 16);
     }
     g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
     chrom_end = 0;
@@ -8917,14 +8940,14 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
 	}
       }
       // marker_reverse deliberately flipped
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx_tmp))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx_tmp), bedfile, loadbuf_raw, loadbuf)) {
         goto epistasis_logistic_regression_ret_READ_FAIL;
       }
       // rotate to hom A1 = 10, het = 01, hom A2 = 00, missing = 11, to allow
       // inner loop to use ordinary multiplication
       // this is a bit redundant with the forced reverse, but it's not a
       // bottleneck
-      rotate_plink1_to_plink2_and_copy(loadbuf, &(g_epi_geno1[block_idx1 * pheno_nm_ctl2]), pheno_nm_ctl2);
+      rotate_plink1_to_a2ct_and_copy(loadbuf, &(g_epi_geno1[block_idx1 * pheno_nm_ctl2]), pheno_nm_ctl2);
       if (!is_triangular) {
 	if (!IS_SET(marker_exclude2, marker_uidx_tmp)) {
           // do not compare against self
@@ -8960,18 +8983,18 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
 	}
         ulptr = &(g_epi_geno2[block_idx2 * pheno_nm_ctl2]);
 	// marker_reverse deliberately flipped
-	if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx2))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf_raw, loadbuf)) {
 	  goto epistasis_logistic_regression_ret_READ_FAIL;
 	}
-	rotate_plink1_to_plink2_and_copy(loadbuf, ulptr, pheno_nm_ctl2);
+	rotate_plink1_to_a2ct_and_copy(loadbuf, ulptr, pheno_nm_ctl2);
       }
       g_epi_idx2_block_size = cur_idx2_block_size;
       g_epi_idx2_block_start = marker_idx2;
-      idx2_block_sizem16 = (cur_idx2_block_size + 15) & (~(15 * ONELU));
+      idx2_block_sizea16 = round_up_pow2(cur_idx2_block_size, 16);
       fill_uint_zero(g_epi_n_sig_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
       fill_uint_zero(g_epi_fail_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
-      fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizem16 * max_thread_ct);
-      fill_uint_zero(g_epi_fail_ct2, idx2_block_sizem16 * max_thread_ct);
+      fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizea16 * max_thread_ct);
+      fill_uint_zero(g_epi_fail_ct2, idx2_block_sizea16 * max_thread_ct);
       for (tidx = 0; tidx < max_thread_ct; tidx++) {
         ulii = g_epi_idx1_block_bounds[tidx];
         uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
@@ -8988,7 +9011,7 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
             ulii -= marker_idx2;
 	  }
           uljj = cur_idx2_block_size - ulii;
-	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizem16 + ulii]);
+	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizea16 + ulii]);
 	  fptr2 = &(g_epi_all_chisq_f[(marker_idx2 + ulii) * 2]);
           for (ulkk = 0; ulkk < uljj; ulkk++) {
             *fptr++ = fptr2[ulkk * 2];
@@ -9032,10 +9055,10 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
 	  } else {
 	    block_idx2 -= marker_idx2;
 	  }
-	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizem16 + block_idx2]);
-	  uiptr = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
-	  uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16 + block_idx2]);
-	  uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16 + block_idx2]);
+	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizea16 + block_idx2]);
+	  uiptr = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
+	  uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16 + block_idx2]);
+	  uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16 + block_idx2]);
 	  dptr = &(best_chisq[block_idx2 + marker_idx2]);
 	  uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
 	  uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
@@ -9066,7 +9089,7 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
 	chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
 	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
       }
-      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+      wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
       *wptr_start++ = ' ';
       wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
       *wptr_start++ = ' ';
@@ -9074,9 +9097,9 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
       for (chrom_fo_idx2 = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
 	chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
 	chrom_end2 = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx2 + 1];
-	wptr_start2 = width_force(4, wptr_start, chrom_name_write(wptr_start, chrom_info_ptr, chrom_idx2));
+	wptr_start2 = width_force(4, wptr_start, chrom_name_write(chrom_info_ptr, chrom_idx2, wptr_start));
 	*wptr_start2++ = ' ';
-	for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, &marker_uidx2, chrom_end2), marker_idx2++, fptr = &(fptr[2])) {
+	for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, chrom_end2, &marker_uidx2), marker_idx2++, fptr = &(fptr[2])) {
 	  if (marker_idx2 == ujj) {
 	    marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
 	    if (marker_idx2 == marker_ct2) {
@@ -9098,14 +9121,14 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
 	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start2);
 	    *wptr++ = ' ';
 	    // odds ratio
-	    wptr = width_force(12, wptr, double_g_write(wptr, exp((double)fptr[1])));
+	    wptr = width_force(12, wptr, dtoa_g(exp((double)fptr[1]), wptr));
             *wptr++ = ' ';
-	    wptr = width_force(12, wptr, float_g_write(wptr, fxx));
+	    wptr = width_force(12, wptr, ftoa_g(fxx, wptr));
 	    *wptr++ = ' ';
 	    dxx = normdist(-sqrt(dxx)) * 2;
-	    wptr = double_g_writewx4x(wptr, MAXV(dxx, output_min_p), 12, ' ');
+	    wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
 	    *wptr++ = '\n';
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto epistasis_logistic_regression_ret_WRITE_FAIL;
 	    }
 	    // could remove this writeback in --epi1 1 case
@@ -9161,12 +9184,12 @@ int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE
 }
 
 int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct2, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t ctrl_ct, uintptr_t* pheno_c, double* pheno_d, uint32_t parallel_idx, uint32_t pa [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctv2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t marker_uidx_base = next_unset_unsafe(marker_exclude, 0);
   uintptr_t marker_uidx = marker_uidx_base;
   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
@@ -9185,11 +9208,11 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   uint32_t is_case_only_window = (is_case_only && case_only_gap);
   uint32_t case_ct = pheno_nm_ct - ctrl_ct;
   uint32_t cellminx3 = 0;
-  uintptr_t case_ctl2 = (case_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t case_ctv2 = 2 * ((case_ct + (BITCT - 1)) / BITCT);
-  uintptr_t ctrl_ctl2 = (ctrl_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t case_ctv3 = 2 * ((case_ct + (2 * BITCT - 1)) / (2 * BITCT));
-  uintptr_t ctrl_ctv3 = 2 * ((ctrl_ct + (2 * BITCT - 1)) / (2 * BITCT));
+  uintptr_t case_ctl2 = QUATERCT_TO_WORDCT(case_ct);
+  uintptr_t case_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(case_ct);
+  uintptr_t ctrl_ctl2 = QUATERCT_TO_WORDCT(ctrl_ct);
+  uintptr_t case_ctv3 = BITCT_TO_ALIGNED_WORDCT(case_ct);
+  uintptr_t ctrl_ctv3 = BITCT_TO_ALIGNED_WORDCT(ctrl_ct);
   uintptr_t case_ctsplit = 3 * case_ctv3;
   uintptr_t ctrl_ctsplit = 3 * ctrl_ctv3;
   uintptr_t pct = 1;
@@ -9210,6 +9233,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   uintptr_t* ctrlbuf = NULL;
   uintptr_t* marker_exclude1 = NULL;
   uintptr_t* ulptr = NULL;
+  uintptr_t ularr[sizeof(double) / BYTECT];
   uintptr_t* casebuf;
   uintptr_t* loadbuf;
   uintptr_t* marker_exclude2;
@@ -9218,8 +9242,8 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   uint32_t* n_sig_cts;
   uint32_t* fail_cts;
   uint32_t* marker_idx_to_uidx;
-  unsigned char* wkspace_mark2;
-  unsigned char* wkspace_mark3;
+  unsigned char* bigstack_mark2;
+  unsigned char* bigstack_mark3;
   char* wptr_start;
   char* wptr_start2;
   char* wptr;
@@ -9236,13 +9260,14 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   uintptr_t marker_ct1;
   uintptr_t tot_ctsplit;
   uintptr_t job_size;
+  uintptr_t cur_bigstack_left;
   uintptr_t cur_workload;
   uintptr_t marker_idx1_start;
   uintptr_t marker_idx1;
   uintptr_t marker_idx1_end;
   uintptr_t idx1_block_size;
   uintptr_t idx2_block_size;
-  uintptr_t idx2_block_sizem16;
+  uintptr_t idx2_block_sizea16;
   uintptr_t marker_uidx_tmp;
   uintptr_t block_idx1;
   uintptr_t block_idx2;
@@ -9251,7 +9276,6 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   uintptr_t ulii;
   uintptr_t uljj;
   uintptr_t chrom_end2;
-  __double_ulong du;
   uint32_t chrom_fo_idx;
   uint32_t chrom_fo_idx2;
   uint32_t chrom_idx2;
@@ -9264,7 +9288,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   // monomorphic and non-autosomal diploid sites
   if (is_custom_set1) {
     if (!sip->ct) {
-      sprintf(logbuf, "Error: --%sepistasis set-by-%s requires a variant set to be loaded.\n", is_fast? "fast-" : "", is_set_by_set? "set" : "all");
+      sprintf(g_logbuf, "Error: --%sepistasis set-by-%s requires a variant set to be loaded.\n", is_fast? "fast-" : "", is_set_by_set? "set" : "all");
       goto epistasis_report_ret_INVALID_CMDLINE_2;
     } else if (!is_set_by_set) {
       if (sip->ct > 1) {
@@ -9275,7 +9299,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
       logerrprint("Error: --{fast-}epistasis set-by-set requires exactly one or two sets.\n(--set-names or --set-collapse-all may be handy here.)\n");
       goto epistasis_report_ret_INVALID_CMDLINE;
     }
-    if (wkspace_alloc_ul_checked(&marker_exclude1, unfiltered_marker_ctl * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude1)) {
       goto epistasis_report_ret_NOMEM;
     }
     unpack_set_unfiltered(marker_ct2, unfiltered_marker_ct, marker_exclude, sip->setdefs[0], marker_exclude1);
@@ -9295,25 +9319,25 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   }
   if (!pheno_d) {
     if ((case_ct < 2) || ((!is_case_only) && (ctrl_ct < 2))) {
-      sprintf(logbuf, "Error: --%sepistasis requires at least two cases%s.\n", is_fast? "fast-" : "", is_case_only? "" : " and two controls");
+      sprintf(g_logbuf, "Error: --%sepistasis requires at least two cases%s.\n", is_fast? "fast-" : "", is_case_only? "" : " and two controls");
       goto epistasis_report_ret_INVALID_CMDLINE_2;
     }
-    if (wkspace_alloc_ul_checked(&casebuf, (case_ctv2 + ctrl_ctl2) * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(case_ctv2 + ctrl_ctl2, &casebuf)) {
       goto epistasis_report_ret_NOMEM;
     }
     ctrlbuf = &(casebuf[case_ctv2]);
     ctrlbuf[ctrl_ctl2 - 1] = 0;
   } else {
-    case_ctv2 = 2 * (pheno_nm_ct + (BITCT - 1)) / BITCT;
-    if (wkspace_alloc_ul_checked(&casebuf, case_ctv2 * sizeof(intptr_t))) {
+    case_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
+    if (bigstack_alloc_ul(case_ctv2, &casebuf)) {
       goto epistasis_report_ret_NOMEM;
     }
   }
   casebuf[case_ctv2 - 2] = 0;
   casebuf[case_ctv2 - 1] = 0;
   // marker_exclude2 should be on top since we might free it
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&marker_exclude2, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf) ||
+      bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude2)) {
     goto epistasis_report_ret_NOMEM;
   }
   loadbuf[unfiltered_sample_ctv2 - 2] = 0;
@@ -9326,24 +9350,24 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   if (do_joint_effects && epi_ip->je_cellmin) {
     cellminx3 = epi_ip->je_cellmin * 3;
     if ((case_ct < cellminx3 * 3) || ((!is_case_only) && (ctrl_ct < cellminx3 * 3))) {
-      sprintf(logbuf, "Error: Too few cases or controls for --je-cellmin %u.\n", epi_ip->je_cellmin);
+      sprintf(g_logbuf, "Error: Too few cases or controls for --je-cellmin %u.\n", epi_ip->je_cellmin);
       goto epistasis_report_ret_INVALID_CMDLINE_2;
     }
     ulii = case_ctl2;
     if ((!is_case_only) && (ctrl_ctl2 > case_ctl2)) {
       ulii = ctrl_ctl2;
     }
-    if (wkspace_alloc_ul_checked(&ulptr, ulii * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(ulii, &ulptr)) {
       goto epistasis_report_ret_NOMEM;
     }
-    fill_vec_55(ulptr, ulii * BITCT2);
+    fill_quatervec_55(ulii * BITCT2, ulptr);
   }
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
     chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
     if (is_set(chrom_info_ptr->haploid_mask, chrom_idx)) {
       uii = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx];
-      fill_bits(marker_exclude2, uii, chrom_end - uii);
+      fill_bits(uii, chrom_end - uii, marker_exclude2);
       marker_uidx = chrom_end;
       continue;
     }
@@ -9363,34 +9387,34 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	}
       }
       if ((!no_ueki) && (!cellminx3)) {
-	if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, casebuf, pheno_nm_ct, pheno_nm, final_mask, 0)) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, 0, bedfile, loadbuf, casebuf)) {
 	  goto epistasis_report_ret_READ_FAIL;
 	}
 	if (is_boost) {
 	  if (less_than_two_genotypes(casebuf, pheno_nm_ct)) {
-	    SET_BIT(marker_exclude2, marker_uidx);
+	    SET_BIT(marker_uidx, marker_exclude2);
 	  }
 	} else {
 	  if (is_monomorphic(casebuf, pheno_nm_ct)) {
-	    SET_BIT(marker_exclude2, marker_uidx);
+	    SET_BIT(marker_uidx, marker_exclude2);
 	  }
 	}
       } else {
-        if (load_and_split(bedfile, loadbuf, unfiltered_sample_ct, casebuf, ctrlbuf, pheno_nm, pheno_c)) {
+        if (load_and_split(unfiltered_sample_ct, pheno_nm, pheno_c, bedfile, loadbuf, casebuf, ctrlbuf)) {
           goto epistasis_report_ret_READ_FAIL;
 	}
 	if (no_ueki) {
 	  if (is_monomorphic(casebuf, case_ct) || ((!is_case_only) && is_monomorphic(ctrlbuf, ctrl_ct))) {
-	    SET_BIT(marker_exclude2, marker_uidx);
+	    SET_BIT(marker_uidx, marker_exclude2);
 	  }
 	} else {
-	  vec_3freq(case_ctl2, casebuf, ulptr, &missing_ct, &uii, &ujj);
+	  genovec_3freq(casebuf, ulptr, case_ctl2, &missing_ct, &uii, &ujj);
 	  if ((uii < cellminx3) || (ujj < cellminx3) || (case_ct - uii - ujj - missing_ct < cellminx3)) {
-	    SET_BIT(marker_exclude2, marker_uidx);
+	    SET_BIT(marker_uidx, marker_exclude2);
 	  } else if (!is_case_only) {
-	    vec_3freq(ctrl_ctl2, ctrlbuf, ulptr, &missing_ct, &uii, &ujj);
+	    genovec_3freq(ctrlbuf, ulptr, ctrl_ctl2, &missing_ct, &uii, &ujj);
 	    if ((uii < cellminx3) || (ujj < cellminx3) || (ctrl_ct - uii - ujj - missing_ct < cellminx3)) {
-	      SET_BIT(marker_exclude2, marker_uidx);
+	      SET_BIT(marker_uidx, marker_exclude2);
 	    }
 	  }
 	}
@@ -9407,11 +9431,11 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
       LOGPRINTF("--%sepistasis: Skipping %" PRIuPTR " monomorphic/non-autosomal site%s.\n", is_fast? "fast-" : "", marker_ct2 - ulii, (marker_ct2 - ulii == 1)? "" : "s");
     } else {
       LOGPRINTF("--%sepistasis: Skipping %" PRIuPTR " site%s due to --je-cellmin setting.\n", is_fast? "fast-" : "", marker_ct2 - ulii, (marker_ct2 - ulii == 1)? "" : "s");
-      wkspace_reset(ulptr);
+      bigstack_reset(ulptr);
     }
     marker_uidx_base = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
   } else if ((!is_custom_set1) || (!is_set_by_set)) {
-    wkspace_reset(marker_exclude2);
+    bigstack_reset(marker_exclude2);
     marker_exclude2 = marker_exclude;
   }
   if (is_triangular) {
@@ -9422,16 +9446,16 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     marker_ct2 = ulii;
     tests_expected = ((((uint64_t)marker_ct1) * (marker_ct1 - 1)) / 2);
   } else {
-    bitfield_or(marker_exclude1, marker_exclude2, unfiltered_marker_ctl);
+    bitvec_or(marker_exclude2, unfiltered_marker_ctl, marker_exclude1);
     marker_ct1 = unfiltered_marker_ct - popcount_longs(marker_exclude1, unfiltered_marker_ctl);
     if (sip->ct == 2) {
-      if (wkspace_alloc_ul_checked(&ulptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(unfiltered_marker_ctl, &ulptr)) {
 	goto epistasis_report_ret_NOMEM;
       }
       memcpy(ulptr, marker_exclude2, unfiltered_marker_ctl * sizeof(intptr_t));
       unpack_set_unfiltered(marker_ct2, unfiltered_marker_ct, marker_exclude, sip->setdefs[1], marker_exclude2);
-      bitfield_or(marker_exclude2, ulptr, unfiltered_marker_ctl);
-      wkspace_reset(ulptr);
+      bitvec_or(ulptr, unfiltered_marker_ctl, marker_exclude2);
+      bigstack_reset(ulptr);
       marker_ct2 = unfiltered_marker_ct - popcount_longs(marker_exclude2, unfiltered_marker_ctl);
     } else {
       marker_ct2 = ulii;
@@ -9443,7 +9467,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   }
   if (parallel_tot > 1) {
     if (marker_ct1 < (1 + is_triangular) * parallel_tot) {
-      sprintf(logbuf, "Error: Too few loci remaining for --parallel %u %u + --%sepistasis.\n", parallel_idx + 1, parallel_tot, is_fast? "fast-" : "");
+      sprintf(g_logbuf, "Error: Too few loci remaining for --parallel %u %u + --%sepistasis.\n", parallel_idx + 1, parallel_tot, is_fast? "fast-" : "");
       goto epistasis_report_ret_INVALID_CMDLINE_2;
     }
     if (is_triangular) {
@@ -9478,25 +9502,20 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   if (max_thread_ct > job_size) {
     max_thread_ct = job_size;
   }
-  if (wkspace_alloc_d_checked(&best_chisq, marker_ct1 * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&best_ids, marker_ct1 * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&n_sig_cts, marker_ct1 * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&fail_cts, marker_ct1 * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&g_epi_idx1_block_bounds, (max_thread_ct + 1) * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&g_epi_idx1_block_bounds16, max_thread_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_d(marker_ct1, &best_chisq) ||
+      bigstack_calloc_ui(marker_ct1, &best_ids) ||
+      bigstack_calloc_ui(marker_ct1, &n_sig_cts) ||
+      bigstack_calloc_ui(marker_ct1, &fail_cts) ||
+      bigstack_alloc_ui(max_thread_ct + 1, &g_epi_idx1_block_bounds) ||
+      bigstack_alloc_ui(max_thread_ct, &g_epi_idx1_block_bounds16)) {
     goto epistasis_report_ret_NOMEM;
   }
-  fill_double_zero(best_chisq, marker_ct1);
-  fill_uint_one(best_ids, marker_ct1);
-  fill_uint_zero(n_sig_cts, marker_ct1);
-  fill_uint_zero(fail_cts, marker_ct1);
   if (is_case_only_window || (!is_triangular)) {
-    if (wkspace_alloc_ui_checked(&gap_cts, marker_ct1 * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(marker_ct1, &gap_cts)) {
       goto epistasis_report_ret_NOMEM;
     }
-    fill_uint_zero(gap_cts, marker_ct1);
   }
-  wkspace_mark3 = wkspace_base;
+  bigstack_mark3 = g_bigstack_base;
 
   g_epi_thread_ct = max_thread_ct;
   g_epi_case_ct = case_ct;
@@ -9523,7 +9542,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
       g_epi_alpha2sq[1] = inverse_chiprob(epi_ip->epi2, 2);
       g_epi_alpha2sq[2] = inverse_chiprob(epi_ip->epi2, 1);
     }
-    if (wkspace_alloc_d_checked(&g_epi_recip_cache, (pheno_nm_ct + 1) * sizeof(double))) {
+    if (bigstack_alloc_d(pheno_nm_ct + 1, &g_epi_recip_cache)) {
       goto epistasis_report_ret_NOMEM;
     }
     g_epi_recip_cache[0] = 0.0;
@@ -9563,14 +9582,14 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     }
     if (parallel_tot > 1) {
       outname_end[7] = '.';
-      uint32_writex(&(outname_end[8]), parallel_idx + 1, '\0');
+      uint32toa_x(parallel_idx + 1, '\0', &(outname_end[8]));
     }
     tot_ctsplit = case_ctsplit + ctrl_ctsplit;
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto epistasis_report_ret_OPEN_FAIL;
     }
     if (!parallel_idx) {
-      wptr = memcpya(tbuf, "CHR1 ", 5);
+      wptr = memcpya(g_textbuf, "CHR1 ", 5);
       wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
       wptr = memcpya(wptr, " CHR2 ", 6);
       wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
@@ -9582,7 +9601,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
         wptr = memcpya(wptr, "           P ", 13);
       }
       *wptr++ = '\n';
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto epistasis_report_ret_WRITE_FAIL;
       }
     }
@@ -9597,8 +9616,13 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     //   sizeof(int32_t) for N_SIG count,
     //   sizeof(int32_t) for per-site fail counts, and (bleah)
     //   marker_ct2 * sizeof(double) for the usually oversized results space
+    cur_bigstack_left = bigstack_left();
+    ulii = 4 * CACHELINE - 3 * sizeof(int32_t) + max_thread_ct * (5 * (CACHELINE - 4));
+    if (cur_bigstack_left >= ulii) {
+      cur_bigstack_left -= ulii;
+    }
     ulii = tot_ctsplit * sizeof(intptr_t) + 4 * sizeof(int32_t) + sizeof(double) + marker_ct2 * sizeof(double);
-    idx1_block_size = (wkspace_left - 4 * CACHELINE + 3 * sizeof(int32_t) - max_thread_ct * (5 * (CACHELINE - 4))) / (ulii * 2 + 1);
+    idx1_block_size = cur_bigstack_left / (ulii * 2 + 1);
     if (!idx1_block_size) {
       goto epistasis_report_ret_NOMEM;
     }
@@ -9610,14 +9634,14 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     // offsets[] isn't really needed, but barely takes any memory
     // if 'case-only', want two more offsets columns to store where the "too
     // close" variants are
-    g_epi_geno1_offsets = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
-    g_epi_geno1 = (uintptr_t*)wkspace_alloc(tot_ctsplit * idx1_block_size * sizeof(intptr_t));
-    g_epi_zmiss1 = (uintptr_t*)wkspace_alloc(((idx1_block_size + (BITCT2 - 1)) / BITCT2) * sizeof(intptr_t));
-    g_epi_all_chisq = (double*)wkspace_alloc(idx1_block_size * marker_ct2 * sizeof(double));
-    g_epi_best_chisq1 = (double*)wkspace_alloc(ulii * sizeof(double));
-    g_epi_best_id1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
-    g_epi_n_sig_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
-    g_epi_fail_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+    bigstack_alloc_ui(idx1_block_size * 2, &g_epi_geno1_offsets);
+    bigstack_alloc_ul(tot_ctsplit * idx1_block_size, &g_epi_geno1);
+    bigstack_alloc_ul(QUATERCT_TO_WORDCT(idx1_block_size), &g_epi_zmiss1);
+    bigstack_alloc_d(idx1_block_size * marker_ct2, &g_epi_all_chisq);
+    bigstack_alloc_d(ulii, &g_epi_best_chisq1);
+    bigstack_alloc_ui(ulii, &g_epi_best_id1);
+    bigstack_alloc_ui(ulii, &g_epi_n_sig_ct1);
+    bigstack_alloc_ui(ulii, &g_epi_fail_ct1);
     for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
       g_epi_geno1[block_idx1 * tot_ctsplit + case_ctv3 - 1] = 0;
       g_epi_geno1[block_idx1 * tot_ctsplit + 2 * case_ctv3 - 1] = 0;
@@ -9633,28 +9657,28 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     // triangular, but rather not complicate/duplicate the common case inner
     // loop for now
     ulii = tot_ctsplit * sizeof(intptr_t) + 1 + is_boost * 6 * sizeof(double) + tot_stride * sizeof(int32_t) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
-    idx2_block_size = (wkspace_left - CACHELINE - is_boost * (CACHELINE - 8) - max_thread_ct * (5 * (CACHELINE - 4))) / ulii;
+    idx2_block_size = (bigstack_left() - CACHELINE - is_boost * (CACHELINE - 8) - max_thread_ct * (5 * (CACHELINE - 4))) / ulii;
     if (idx2_block_size > marker_ct2) {
       idx2_block_size = marker_ct2;
     }
-    idx2_block_size = (idx2_block_size + 15) & (~(15 * ONELU));
-    wkspace_mark2 = wkspace_base;
+    idx2_block_size = round_up_pow2(idx2_block_size, 16);
+    bigstack_mark2 = g_bigstack_base;
     while (1) {
       if (!idx2_block_size) {
 	goto epistasis_report_ret_NOMEM;
       }
-      if (!(wkspace_alloc_ul_checked(&g_epi_geno2, tot_ctsplit * idx2_block_size * sizeof(intptr_t)) ||
-            wkspace_alloc_ul_checked(&g_epi_zmiss2, ((idx2_block_size + (BITCT2 - 1)) / BITCT2) * sizeof(intptr_t)) ||
-	    wkspace_alloc_ui_checked(&g_epi_tot2, idx2_block_size * tot_stride * sizeof(int32_t)) ||
-	    wkspace_alloc_d_checked(&g_epi_best_chisq2, max_thread_ct * idx2_block_size * sizeof(double)) ||
-	    wkspace_alloc_ui_checked(&g_epi_best_id2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
-	    wkspace_alloc_ui_checked(&g_epi_n_sig_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
-	    wkspace_alloc_ui_checked(&g_epi_fail_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)))) {
-	if ((!is_boost) || (!wkspace_alloc_d_checked(&g_epi_boost_precalc2, 6 * idx2_block_size * sizeof(double)))) {
+      if (!(bigstack_alloc_ul(tot_ctsplit * idx2_block_size, &g_epi_geno2) ||
+            bigstack_alloc_ul(QUATERCT_TO_WORDCT(idx2_block_size), &g_epi_zmiss2) ||
+	    bigstack_alloc_ui(idx2_block_size * tot_stride, &g_epi_tot2) ||
+	    bigstack_alloc_d(max_thread_ct * idx2_block_size, &g_epi_best_chisq2) ||
+	    bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_best_id2) ||
+	    bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_n_sig_ct2) ||
+	    bigstack_alloc_ui(max_thread_ct * idx2_block_size, &g_epi_fail_ct2))) {
+	if ((!is_boost) || (!bigstack_alloc_d(6 * idx2_block_size, &g_epi_boost_precalc2))) {
 	  break;
 	}
       }
-      wkspace_reset(wkspace_mark2);
+      bigstack_reset(bigstack_mark2);
       idx2_block_size -= 16;
     }
     for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
@@ -9669,7 +9693,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     if (marker_idx1) {
       marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
     }
-    wptr = memcpya(logbuf, "--fast-epistasis", 16);
+    wptr = memcpya(g_logbuf, "--fast-epistasis", 16);
     if (is_boost) {
       wptr = memcpya(wptr, " boost", 6);
     } else if (no_ueki) {
@@ -9683,7 +9707,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     wptr = memcpya(wptr, " to ", 4);
     wptr = strcpya(wptr, outname);
     memcpy(wptr, " ... ", 6);
-    wordwrap(logbuf, 16); // strlen("99% [processing]") 
+    wordwrapb(16); // strlen("99% [processing]") 
     logprintb();
     fputs("0%", stdout);
     do {
@@ -9743,10 +9767,10 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	}
 	uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
         g_epi_idx1_block_bounds[tidx] = block_idx1;
-        g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + ((uii + 15) & (~15));
+        g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + round_up_pow2_ui(uii, 16);
       }
       g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
-      fill_ulong_zero(g_epi_zmiss1, (idx1_block_size + (BITCT2 - 1)) / BITCT2);
+      fill_ulong_zero(g_epi_zmiss1, QUATERCT_TO_WORDCT(idx1_block_size));
       chrom_end = 0;
       for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx_tmp++, block_idx1++) {
         if (IS_SET(marker_exclude1, marker_uidx_tmp)) {
@@ -9801,7 +9825,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	  while (last_pos < cur_window_end) {
 	    marker_idx2++;
 	    marker_uidx2++;
-	    next_unset_ul_ck(marker_exclude2, &marker_uidx2, chrom_end);
+	    next_unset_ul_ck(marker_exclude2, chrom_end, &marker_uidx2);
 	    if (marker_uidx2 != chrom_end) {
 	      last_pos = marker_pos[marker_uidx2];
 	    } else {
@@ -9820,7 +9844,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	    while (first_pos + case_only_gap <= uii) {
 	      marker_idx2_trail++;
 	      marker_uidx2_trail++;
-	      next_unset_ul_ck(marker_exclude2, &marker_uidx2_trail, chrom_end);
+	      next_unset_ul_ck(marker_exclude2, chrom_end, &marker_uidx2_trail);
               if (marker_uidx2_trail != chrom_end) {
 		first_pos = marker_pos[marker_uidx2_trail];
 	      } else {
@@ -9859,7 +9883,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	if (cur_idx2_block_size > marker_ct2 - marker_idx2) {
 	  cur_idx2_block_size = marker_ct2 - marker_idx2;
 	}
-	fill_ulong_zero(g_epi_zmiss2, (cur_idx2_block_size + (BITCT2 - 1)) / BITCT2);
+	fill_ulong_zero(g_epi_zmiss2, QUATERCT_TO_WORDCT(cur_idx2_block_size));
         for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
           if (IS_SET(marker_exclude2, marker_uidx2)) {
 	    marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx2);
@@ -9890,11 +9914,11 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	}
 	g_epi_idx2_block_size = cur_idx2_block_size;
 	g_epi_idx2_block_start = marker_idx2;
-	idx2_block_sizem16 = (cur_idx2_block_size + 15) & (~(15 * ONELU));
+	idx2_block_sizea16 = round_up_pow2(cur_idx2_block_size, 16);
         fill_uint_zero(g_epi_n_sig_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
 	fill_uint_zero(g_epi_fail_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
-        fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizem16 * max_thread_ct);
-	fill_uint_zero(g_epi_fail_ct2, idx2_block_sizem16 * max_thread_ct);
+        fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizea16 * max_thread_ct);
+	fill_uint_zero(g_epi_fail_ct2, idx2_block_sizea16 * max_thread_ct);
 	for (tidx = 0; tidx < max_thread_ct; tidx++) {
 	  ulii = g_epi_idx1_block_bounds[tidx];
 	  uljj = g_epi_idx1_block_bounds[tidx + 1];
@@ -9906,7 +9930,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	    } else {
 	      ulii -= marker_idx2;
 	    }
-	    memcpy(&(g_epi_best_chisq2[tidx * idx2_block_sizem16 + ulii]), &(g_epi_all_chisq[marker_idx2 + ulii]), (cur_idx2_block_size - ulii) * sizeof(double));
+	    memcpy(&(g_epi_best_chisq2[tidx * idx2_block_sizea16 + ulii]), &(g_epi_all_chisq[marker_idx2 + ulii]), (cur_idx2_block_size - ulii) * sizeof(double));
 	  }
 	  // no need to initialize IDs since they are only referenced when a
 	  // higher chisq value is present, and when that happens an ID is
@@ -9949,10 +9973,10 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	    } else {
 	      block_idx2 -= marker_idx2;
 	    }
-	    dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizem16 + block_idx2]);
-	    uiptr = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
-	    uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16 + block_idx2]);
-	    uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16 + block_idx2]);
+	    dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizea16 + block_idx2]);
+	    uiptr = &(g_epi_best_id2[tidx * idx2_block_sizea16]);
+	    uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizea16 + block_idx2]);
+	    uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizea16 + block_idx2]);
 	    dptr2 = &(best_chisq[block_idx2 + marker_idx2]);
 	    uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
 	    uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
@@ -9983,7 +10007,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	  chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
 	  chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
 	}
-        wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+        wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
 	*wptr_start++ = ' ';
 	wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
 	*wptr_start++ = ' ';
@@ -9994,9 +10018,9 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	    continue;
 	  }
           chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
-          wptr_start2 = width_force(4, wptr_start, chrom_name_write(wptr_start, chrom_info_ptr, chrom_idx2));
+          wptr_start2 = width_force(4, wptr_start, chrom_name_write(chrom_info_ptr, chrom_idx2, wptr_start));
 	  *wptr_start2++ = ' ';
-	  for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, &marker_uidx2, unfiltered_marker_ct), marker_idx2++, dptr++) {
+	  for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, unfiltered_marker_ct, &marker_uidx2), marker_idx2++, dptr++) {
 	    if (marker_idx2 == ujj) {
 	      marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
 	      if (marker_idx2 == marker_ct2) {
@@ -10018,12 +10042,12 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	      *wptr++ = ' ';
 	      if (is_boost) {
 		if (dxx == dxx) { // not nan
-		  du.dd = dxx;
-		  uii = 4 >> (du.uu[0] & 3);
+		  memcpy(ularr, &dxx, sizeof(double));
+		  uii = 4 >> (ularr[0] & 3);
 		  // don't want ugly e-324s when zero belongs
-		  du.uu[0] &= ~(3 * ONELU);
-		  dxx = du.dd;
-		  wptr = width_force(12, wptr, double_g_write(wptr, dxx));
+		  ularr[0] &= ~(3 * ONELU);
+		  memcpy(&dxx, ularr, sizeof(double));
+		  wptr = width_force(12, wptr, dtoa_g(dxx, wptr));
 		  wptr = memseta(wptr, 32, 4);
 		  *wptr++ = '0' + uii;
 		  *wptr++ = ' ';
@@ -10032,25 +10056,25 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 		  uii = 0;
 		}
 	      } else if (!no_ueki) {
-		wptr = width_force(12, wptr, double_g_write(wptr, dxx));
+		wptr = width_force(12, wptr, dtoa_g(dxx, wptr));
 		*wptr++ = ' ';
 	      } else {
 		// lower precision compatibility mode
-                wptr = double_g_writewx4x(wptr, dxx, 12, ' ');
+                wptr = dtoa_g_wxp4x(dxx, 12, ' ', wptr);
 	      }
 	      if (!no_p_value) {
 		if (!is_boost) {
 		  dxx = normdist(-sqrt(dxx)) * 2;
-		  wptr = double_g_writewx4x(wptr, MAXV(dxx, output_min_p), 12, ' ');
+		  wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
 		} else if (uii) {
 		  dxx = chiprob_p(dxx, uii);
-		  wptr = double_g_writewx4x(wptr, MAXV(dxx, output_min_p), 12, ' ');
+		  wptr = dtoa_g_wxp4x(MAXV(dxx, output_min_p), 12, ' ', wptr);
 		} else {
 		  wptr = memcpya(wptr, "          NA ", 13);
 		}
 	      }
 	      *wptr++ = '\n';
-	      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 		goto epistasis_report_ret_WRITE_FAIL;
 	      }
 	      // could remove this writeback in --epi1 1 case
@@ -10087,12 +10111,12 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   memcpy(&(outname_end[7]), ".summary", 9);
   if (parallel_tot > 1) {
     outname_end[15] = '.';
-    uint32_writex(&(outname_end[16]), parallel_idx + 1, '\0');
+    uint32toa_x(parallel_idx + 1, '\0', &(outname_end[16]));
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto epistasis_report_ret_OPEN_FAIL;
   }
-  wptr = memcpya(tbuf, " CHR ", 5);
+  wptr = memcpya(g_textbuf, " CHR ", 5);
   wptr = fw_strcpyn(plink_maxsnp, 3, "SNP", wptr);
   if (parallel_tot == 1) {
     wptr = strcpya(wptr, "        N_SIG        N_TOT         PROP   BEST_CHISQ BEST_CHR ");
@@ -10101,11 +10125,11 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   }
   wptr = fw_strcpyn(plink_maxsnp, 8, "BEST_SNP", wptr);
   wptr = memcpya(wptr, " \n", 2);
-  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+  if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
     goto epistasis_report_ret_WRITE_FAIL;
   }
-  wkspace_reset(wkspace_mark3);
-  if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct1 * sizeof(int32_t))) {
+  bigstack_reset(bigstack_mark3);
+  if (bigstack_alloc_ui(marker_ct1, &marker_idx_to_uidx)) {
     goto epistasis_report_ret_NOMEM;
   }
   fill_idx_to_uidx(marker_exclude2, unfiltered_marker_ct, marker_ct2, marker_idx_to_uidx);
@@ -10120,9 +10144,9 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
       continue;
     }
     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-    wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+    wptr_start = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf));
     *wptr_start++ = ' ';
-    for (; marker_uidx < chrom_end; marker_uidx++, next_unset_ul_ck(marker_exclude1, &marker_uidx, unfiltered_marker_ct), marker_idx1++) {
+    for (; marker_uidx < chrom_end; marker_uidx++, next_unset_ul_ck(marker_exclude1, unfiltered_marker_ct, &marker_uidx), marker_idx1++) {
       uii = n_sig_cts[marker_idx1];
       ujj = fail_cts[marker_idx1];
       if (gap_cts) {
@@ -10145,23 +10169,23 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	ujj = job_size - ujj;
       }
       wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
-      wptr = memcpyl3a(wptr, "   ");
-      wptr = uint32_writew10(wptr, uii);
-      wptr = memcpyl3a(wptr, "   ");
-      wptr = uint32_writew10x(wptr, ujj, ' ');
+      wptr = memseta(wptr, 32, 3);
+      wptr = uint32toa_w10(uii, wptr);
+      wptr = memseta(wptr, 32, 3);
+      wptr = uint32toa_w10x(ujj, ' ', wptr);
       if (parallel_tot == 1) {
-        wptr = double_g_writewx4x(wptr, ((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 12, ' ');
+        wptr = dtoa_g_wxp4x(((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 12, ' ', wptr);
       }
       if (ujj) {
 	if (parallel_tot == 1) {
 	  // or cat mode
-	  wptr = double_g_writewx4x(wptr, best_chisq[marker_idx1], 12, ' ');
+	  wptr = dtoa_g_wxp4x(best_chisq[marker_idx1], 12, ' ', wptr);
 	} else {
 	  // greater precision for accurate merges
-	  wptr = double_g_writewx8x(wptr, best_chisq[marker_idx1], 12, ' ');
+	  wptr = dtoa_g_wxp8x(best_chisq[marker_idx1], 12, ' ', wptr);
 	}
 	uii = marker_idx_to_uidx[best_ids[marker_idx1]];
-	wptr = width_force(4, wptr, chrom_name_write(wptr, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, uii)));
+	wptr = width_force(4, wptr, chrom_name_write(chrom_info_ptr, get_marker_chrom(chrom_info_ptr, uii), wptr));
 	*wptr++ = ' ';
 	wptr = fw_strcpy(plink_maxsnp, &(marker_ids[uii * max_marker_id_len]), wptr);
       } else {
@@ -10170,7 +10194,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	wptr = memcpya(wptr, "NA", 2);
       }
       wptr = memcpya(wptr, " \n", 2);
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto epistasis_report_ret_WRITE_FAIL;
       }
     }
@@ -10222,20 +10246,20 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   }
  epistasis_report_ret_1:
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_pos, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists) {
   // Like ld_prune(), except that it computes the full 3x3 contingency table,
   // and is always in pairwise mode.
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
-  uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl2 / 2);
-  uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
-  uintptr_t founder_ctv3 = 2 * ((founder_ct + (2 * BITCT - 1)) / (2 * BITCT));
+  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
+  uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctv2 / 2);
+  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
+  uintptr_t founder_ctv3 = BITCT_TO_ALIGNED_WORDCT(founder_ct);
   // no actual case/control split here, but keep the variables the same to
   // minimize divergence from ld_report_dprime()
   uintptr_t founder_ctsplit = 3 * founder_ctv3;
@@ -10298,10 +10322,10 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
     goto indep_pairphase_ret_1;
   }
   if (is_set(chrom_info_ptr->chrom_mask, 0)) {
-    ulii = count_chrom_markers(chrom_info_ptr, 0, marker_exclude);
+    ulii = count_chrom_markers(chrom_info_ptr, marker_exclude, 0);
     if (chrom_info_ptr->zero_extra_chroms) {
       for (uii = chrom_info_ptr->max_code + 1; uii < chrom_code_end; uii++) {
-	ulii += count_chrom_markers(chrom_info_ptr, uii, marker_exclude);
+	ulii += count_chrom_markers(chrom_info_ptr, marker_exclude, uii);
       }
       chrom_code_end = chrom_info_ptr->max_code + 1;
     }
@@ -10314,7 +10338,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
   }
 
   // no need to force founder_male_include2 initialization here
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, founder_ct, hh_exists, 1, founder_info, sex_male, &founder_include2, &founder_male_include2)) {
+  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, hh_exists, 1, &founder_include2, &founder_male_include2)) {
     goto indep_pairphase_ret_NOMEM;
   }
 
@@ -10329,7 +10353,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 
   window_unfiltered_start = ld_prune_next_valid_chrom_start(marker_exclude, 0, chrom_info_ptr, chrom_code_end, unfiltered_marker_ct);
 
-  if (wkspace_alloc_ul_checked(&pruned_arr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_marker_ctl, &pruned_arr)) {
     goto indep_pairphase_ret_NOMEM;
   }
 
@@ -10338,21 +10362,20 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
   if (!window_is_kb) {
     window_max = ld_window_size;
   }
-  ulii = window_max;
-  window_maxl = (window_max + (BITCT - 1)) / BITCT;
-  if (wkspace_alloc_ui_checked(&live_indices, ulii * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&start_arr, ulii * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, founder_ctl * 2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&dummy_nm, founder_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&geno, founder_ctsplit * ulii * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&zmiss, window_maxl * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&cur_tots, ulii * 3 * sizeof(int32_t))) {
+  window_maxl = BITCT_TO_WORDCT(window_max);
+  if (bigstack_alloc_ui(window_max, &live_indices) ||
+      bigstack_alloc_ui(window_max, &start_arr) ||
+      bigstack_alloc_ul(unfiltered_sample_ctv2, &loadbuf_raw) ||
+      bigstack_alloc_ul(founder_ctl * 2, &loadbuf) ||
+      bigstack_alloc_ul(founder_ctl, &dummy_nm) ||
+      bigstack_alloc_ul(founder_ctsplit * window_max, &geno) ||
+      bigstack_alloc_ul(window_maxl, &zmiss) ||
+      bigstack_alloc_ui(window_max * 3, &cur_tots)) {
     goto indep_pairphase_ret_NOMEM;
   }
   loadbuf[founder_ctl * 2 - 2] = 0;
   loadbuf[founder_ctl * 2 - 1] = 0;
-  fill_all_bits(dummy_nm, founder_ct);
+  fill_all_bits(founder_ct, dummy_nm);
   // bugfix: this loop must start at 0, not 1
   for (ulii = 0; ulii < window_max; ulii++) {
     geno[ulii * founder_ctsplit + founder_ctv3 - 1] = 0;
@@ -10360,11 +10383,11 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
     geno[ulii * founder_ctsplit + founder_ctsplit - 1] = 0;
   }
   if ((chrom_info_ptr->x_code != -1) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->x_code)) {
-    if (wkspace_alloc_ul_checked(&sex_male_collapsed, founder_ctl * sizeof(intptr_t)) ||
-        wkspace_alloc_ul_checked(&cur_geno1_male, founder_ctsplit * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(founder_ctl, &sex_male_collapsed) ||
+        bigstack_alloc_ul(founder_ctsplit, &cur_geno1_male)) {
       goto indep_pairphase_ret_NOMEM;
     }
-    collapse_copy_bitarr_incl(unfiltered_sample_ct, sex_male, founder_info, founder_ct, sex_male_collapsed);
+    copy_bitarr_subset(sex_male, founder_info, unfiltered_sample_ct, founder_ct, sex_male_collapsed);
   }
   do {
     prev_end = 0;
@@ -10377,7 +10400,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	if (fseeko(bedfile, bed_offset + (uljj * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
 	  goto indep_pairphase_ret_READ_FAIL;
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, uljj))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, uljj), bedfile, loadbuf_raw, loadbuf)) {
 	  goto indep_pairphase_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
@@ -10389,10 +10412,10 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	cur_tots[ulii * 3 + 1] = popcount_longs(&(cur_geno1[founder_ctv3]), founder_ctv3);
 	cur_tots[ulii * 3 + 2] = popcount_longs(&(cur_geno1[2 * founder_ctv3]), founder_ctv3);
 	if ((!cur_tots[ulii * 3 + 1]) && ((!cur_tots[ulii * 3]) || (!cur_tots[ulii * 3 + 2]))) {
-	  SET_BIT(pruned_arr, uljj);
+	  SET_BIT(uljj, pruned_arr);
 	  cur_exclude_ct++;
 	} else if (ulkk == 3) {
-	  SET_BIT(zmiss, ulii);
+	  SET_BIT(ulii, zmiss);
 	}
       }
     }
@@ -10417,11 +10440,11 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	    nm_fixed = is_set_ul(zmiss, ulii);
 	    if (is_x) {
 	      memcpy(cur_geno1_male, cur_geno1, founder_ctsplit * sizeof(intptr_t));
-              bitfield_and(cur_geno1_male, sex_male_collapsed, founder_ctv3);
+              bitvec_and(sex_male_collapsed, founder_ctv3, cur_geno1_male);
 	      tot1[3] = popcount_longs(cur_geno1_male, founder_ctv3);
-              bitfield_and(&(cur_geno1_male[founder_ctv3]), sex_male_collapsed, founder_ctv3);
+              bitvec_and(sex_male_collapsed, founder_ctv3, &(cur_geno1_male[founder_ctv3]));
 	      tot1[4] = popcount_longs(&(cur_geno1_male[founder_ctv3]), founder_ctv3);
-              bitfield_and(&(cur_geno1_male[2 * founder_ctv3]), sex_male_collapsed, founder_ctv3);
+              bitvec_and(sex_male_collapsed, founder_ctv3, &(cur_geno1_male[2 * founder_ctv3]));
 	      tot1[5] = popcount_longs(&(cur_geno1_male[2 * founder_ctv3]), founder_ctv3);
 	    }
 	    for (; uljj < cur_window_size; uljj++) {
@@ -10465,9 +10488,9 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 		  cur_exclude_ct++;
 		  // remove marker with lower MAF
 		  if (get_maf(set_allele_freqs[live_indices[ulii]]) < get_maf(set_allele_freqs[live_indices[uljj]])) {
-		    SET_BIT(pruned_arr, live_indices[ulii]);
+		    SET_BIT(live_indices[ulii], pruned_arr);
 		  } else {
-		    SET_BIT(pruned_arr, live_indices[uljj]);
+		    SET_BIT(live_indices[uljj], pruned_arr);
 		    uljj++;
 		    while (uljj < cur_window_size) {
 		      if (!IS_SET(pruned_arr, live_indices[uljj])) {
@@ -10495,7 +10518,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	  break;
 	}
 	window_unfiltered_start++;
-	next_unset_ck(marker_exclude, &window_unfiltered_start, chrom_end);
+	next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_start);
       }
       if (window_unfiltered_start == chrom_end) {
 	break;
@@ -10527,13 +10550,13 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	memcpy(&(cur_tots[ulii * 3]), &(cur_tots[uljj * 3]), 3 * sizeof(int32_t));
 	// bugfix: forgot to update zmiss
 	if (IS_SET(zmiss, uljj)) {
-	  SET_BIT(zmiss, ulii);
+	  SET_BIT(ulii, zmiss);
 	} else {
-	  CLEAR_BIT(zmiss, ulii);
+	  CLEAR_BIT(ulii, zmiss);
 	}
 	ulii++;
       }
-      clear_bits(zmiss, ulii, window_max);
+      clear_bits(ulii, window_max, zmiss);
 
       prev_end = ulii;
       cur_window_size = ulii;
@@ -10543,7 +10566,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	while ((window_unfiltered_end < chrom_end) && (marker_pos[window_unfiltered_end] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
 	  uljj++;
 	  window_unfiltered_end++;
-	  next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+	  next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
 	}
 	window_unfiltered_end = ulkk;
       } else {
@@ -10560,7 +10583,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	if (fseeko(bedfile, bed_offset + (window_unfiltered_end * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
 	  goto indep_pairphase_ret_READ_FAIL;
 	}
-	if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, window_unfiltered_end))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, IS_SET(marker_reverse, window_unfiltered_end), bedfile, loadbuf_raw, loadbuf)) {
 	  goto indep_pairphase_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
@@ -10572,14 +10595,14 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
 	cur_tots[((uintptr_t)cur_window_size) * 3 + 1] = popcount_longs(&(cur_geno1[founder_ctv3]), founder_ctv3);
 	cur_tots[((uintptr_t)cur_window_size) * 3 + 2] = popcount_longs(&(cur_geno1[2 * founder_ctv3]), founder_ctv3);
 	if ((!cur_tots[((uintptr_t)cur_window_size) * 3 + 1]) && ((!cur_tots[((uintptr_t)cur_window_size) * 3]) || (!cur_tots[((uintptr_t)cur_window_size) * 3 + 2]))) {
-	  SET_BIT(pruned_arr, window_unfiltered_end);
+	  SET_BIT(window_unfiltered_end, pruned_arr);
 	  cur_exclude_ct++;
 	} else if (ulkk == 3) {
-	  SET_BIT(zmiss, cur_window_size);
+	  SET_BIT(cur_window_size, zmiss);
 	}
 	cur_window_size++;
 	window_unfiltered_end++;
-	next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+	next_unset_ck(marker_exclude, chrom_end, &window_unfiltered_end);
       }
       if (cur_window_size > prev_end) {
 	start_arr[cur_window_size] = window_unfiltered_end;
@@ -10612,7 +10635,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
     break;
   }
  indep_pairphase_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -10626,16 +10649,14 @@ typedef struct ll_epi_summary_struct {
   char strbuf[];
 } Ll_epi_summary;
 
-// N.B. moves wkspace_base/wkspace_left in word-size instead of cacheline
-// increments
+// N.B. moves g_bigstack_base in word-size instead of cacheline increments
 Ll_epi_summary* lle_alloc(char* chrom_id, uint32_t chrom_len, char* marker_id, uint32_t marker_id_len, uint32_t nsig, uint32_t ntot, double chisq) {
   uintptr_t alloc_size = (sizeof(Ll_epi_summary) + chrom_len + marker_id_len + sizeof(intptr_t)) & (~(sizeof(intptr_t) - ONELU));
-  Ll_epi_summary* newptr = (Ll_epi_summary*)wkspace_base;
-  if (wkspace_left < alloc_size) {
+  Ll_epi_summary* newptr = (Ll_epi_summary*)g_bigstack_base;
+  if (bigstack_left() < alloc_size) {
     return NULL;
   }
-  wkspace_base = &(wkspace_base[alloc_size]);
-  wkspace_left -= alloc_size;
+  g_bigstack_base = &(g_bigstack_base[alloc_size]);
   newptr->next = NULL;
   newptr->best_chisq = chisq;
   newptr->n_sig = nsig;
@@ -10698,7 +10719,7 @@ int32_t validate_epistasis_summary_header(char* bufptr) {
 }
 
 int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   FILE* outfile = NULL;
   char* inprefix = epi_ip->summary_merge_prefix;
@@ -10757,12 +10778,12 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
   //    would add confusion for little reason; instead we detect the telltale
   //    "PROP" in the first file's header line and switch to cat.
 
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
   memcpy(inprefix_end, "1", 2);
-  if (fopen_checked(&infile, inprefix, "r")) {
+  if (fopen_checked(inprefix, "r", &infile)) {
     goto epi_summary_merge_ret_OPEN_FAIL;
   }
-  retval = load_to_first_token(infile, MAXLINELEN, '\0', "--epistasis-summary-merge file", tbuf, &bufptr, &line_idx);
+  retval = load_to_first_token(infile, MAXLINELEN, '\0', "--epistasis-summary-merge file", g_textbuf, &bufptr, &line_idx);
   if (retval) {
     goto epi_summary_merge_ret_1;
   }
@@ -10771,20 +10792,20 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     if (retval == -1) {
       // switch to cat mode.  meow.
       fclose_null(&infile);
-      if (fopen_checked(&outfile, outname, "wb")) {
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
 	goto epi_summary_merge_ret_OPEN_FAIL;
       }
       for (file_idx = 1; file_idx <= file_ct; file_idx++) {
-        uint32_writex(inprefix_end, file_idx, '\0');
-	if (fopen_checked(&infile, inprefix, "rb")) {
+        uint32toa_x(file_idx, '\0', inprefix_end);
+	if (fopen_checked(inprefix, FOPEN_RB, &infile)) {
 	  goto epi_summary_merge_ret_OPEN_FAIL;
 	}
 	while (1) {
-	  ulii = fread(tbuf, 1, MAXLINELEN, infile);
+	  ulii = fread(g_textbuf, 1, MAXLINELEN, infile);
           if (!ulii) {
 	    break;
 	  }
-	  if (fwrite_checked(tbuf, ulii, outfile)) {
+	  if (fwrite_checked(g_textbuf, ulii, outfile)) {
 	    goto epi_summary_merge_ret_WRITE_FAIL;
 	  }
 	}
@@ -10797,15 +10818,15 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     }
     goto epi_summary_merge_ret_INVALID_HEADER;
   }
-  bufptr2 = token_end(bufptr);
+  bufptr2 = token_endnn(bufptr);
   bufptr = skip_initial_spaces(bufptr2);
-  plink_maxsnp = ((uintptr_t)(token_end(bufptr) - bufptr2)) - 1;
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  plink_maxsnp = ((uintptr_t)(token_endnn(bufptr) - bufptr2)) - 1;
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       goto epi_summary_merge_ret_LONG_LINE;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -10844,10 +10865,10 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     }
     // throw in an extra word, to reduce the need for reallocation
     ulii = (chrom_len + id_len + 1 + 2 * sizeof(intptr_t)) & (~(sizeof(intptr_t) - 1));
-    if (ulii > wkspace_left) {
+    if (ulii > bigstack_left()) {
       goto epi_summary_merge_ret_NOMEM;
     }
-    bufptr = (char*)wkspace_base;
+    bufptr = (char*)g_bigstack_base;
     memcpyx(bufptr, best_chr_ptr, chrom_len, '\t');
     memcpy(&(bufptr[chrom_len + 1]), best_marker_ptr, id_len);
     // pad with nulls then tab-terminate, so we can find the buffer end later
@@ -10855,8 +10876,7 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     bufptr[ulii - 1] = '\t';
     (*lle_pp)->best_chr_and_snp = bufptr;
     lle_pp = &((*lle_pp)->next);
-    wkspace_base = &(wkspace_base[ulii]);
-    wkspace_left -= ulii;
+    g_bigstack_base = &(g_bigstack_base[ulii]);
   }
   if (fclose_null(&infile)) {
     goto epi_summary_merge_ret_READ_FAIL;
@@ -10867,11 +10887,11 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
   }
   last_start = list_start->next;
   for (file_idx = 2; file_idx <= file_ct; file_idx++) {
-    uint32_writex(inprefix_end, file_idx, '\0');
-    if (fopen_checked(&infile, inprefix, "r")) {
+    uint32toa_x(file_idx, '\0', inprefix_end);
+    if (fopen_checked(inprefix, "r", &infile)) {
       goto epi_summary_merge_ret_OPEN_FAIL;
     }
-    retval = load_to_first_token(infile, MAXLINELEN, '\0', "--epistasis-summary-merge file", tbuf, &bufptr, &line_idx);
+    retval = load_to_first_token(infile, MAXLINELEN, '\0', "--epistasis-summary-merge file", g_textbuf, &bufptr, &line_idx);
     if (retval) {
       goto epi_summary_merge_ret_1;
     }
@@ -10881,12 +10901,12 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     }
     lle_ptr = last_start;
     is_first_entry = 1;
-    while (fgets(tbuf, MAXLINELEN, infile)) {
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
+      if (!g_textbuf[MAXLINELEN - 1]) {
 	goto epi_summary_merge_ret_LONG_LINE;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
@@ -10951,15 +10971,14 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
 	  ulii = (uintptr_t)(bufptr4 - bufptr);
 	  if (ulii <= chrom_len + id_len + 1) {
 	    ulii = (chrom_len + id_len + 1 + sizeof(intptr_t)) & (~(sizeof(intptr_t) - 1));
-            if (ulii > wkspace_left) {
+            if (ulii > bigstack_left()) {
 	      goto epi_summary_merge_ret_NOMEM;
 	    }
-            bufptr = (char*)wkspace_base;
+            bufptr = (char*)g_bigstack_base;
 	    bufptr3 = &(bufptr[ulii - 1]);
 	    *bufptr3 = '\t';
             lle_ptr->best_chr_and_snp = bufptr;
-            wkspace_base = &(wkspace_base[ulii]);
-	    wkspace_left -= ulii;
+            g_bigstack_base = &(g_bigstack_base[ulii]);
 	  }
 	  bufptr = memcpyax(bufptr, best_chr_ptr, chrom_len, '\t');
 	  bufptr = memcpya(bufptr, best_marker_ptr, id_len);
@@ -10975,15 +10994,15 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     }
   }
 
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto epi_summary_merge_ret_OPEN_FAIL;
   }
-  bufptr = memcpya(tbuf, " CHR ", 5);
+  bufptr = memcpya(g_textbuf, " CHR ", 5);
   bufptr = fw_strcpyn(plink_maxsnp, 3, "SNP", bufptr);
   bufptr = strcpya(bufptr, "        N_SIG        N_TOT         PROP   BEST_CHISQ BEST_CHR ");
   bufptr = fw_strcpyn(plink_maxsnp, 8, "BEST_SNP", bufptr);
   bufptr = memcpya(bufptr, " \n", 2);
-  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
     goto epi_summary_merge_ret_WRITE_FAIL;
   }
   lle_ptr = list_start;
@@ -10991,17 +11010,17 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     bufptr2 = lle_ptr->strbuf;
     id_len = lle_ptr->id_len;
     bufptr3 = &(bufptr2[id_len]);
-    bufptr = fw_strcpy(4, bufptr3, tbuf);
+    bufptr = fw_strcpy(4, bufptr3, g_textbuf);
     *bufptr++ = ' ';
     bufptr = fw_strcpyn(plink_maxsnp, id_len, bufptr2, bufptr);
     nsig = lle_ptr->n_sig;
     ntot = lle_ptr->n_tot;
-    bufptr = memcpyl3a(bufptr, "   ");
-    bufptr = uint32_writew10(bufptr, nsig);
-    bufptr = memcpyl3a(bufptr, "   ");
-    bufptr = uint32_writew10x(bufptr, ntot, ' ');
-    bufptr = double_g_writewx4x(bufptr, ((double)((int32_t)nsig)) / ((double)((int32_t)ntot)), 12, ' ');
-    bufptr = double_g_writewx4x(bufptr, lle_ptr->best_chisq, 12, ' ');
+    bufptr = memseta(bufptr, 32, 3);
+    bufptr = uint32toa_w10(nsig, bufptr);
+    bufptr = memseta(bufptr, 32, 3);
+    bufptr = uint32toa_w10x(ntot, ' ', bufptr);
+    bufptr = dtoa_g_wxp4x(((double)((int32_t)nsig)) / ((double)((int32_t)ntot)), 12, ' ', bufptr);
+    bufptr = dtoa_g_wxp4x(lle_ptr->best_chisq, 12, ' ', bufptr);
     // no need to special-case ntot == 0, this code correctly copies 'NA'
     bufptr2 = lle_ptr->best_chr_and_snp;
     bufptr3 = (char*)memchr(bufptr2, '\t', MAXLINELEN);
@@ -11010,7 +11029,7 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     *bufptr++ = ' ';
     bufptr = fw_strcpy(plink_maxsnp, &(bufptr3[1]), bufptr);
     bufptr = memcpya(bufptr, " \n", 2);
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto epi_summary_merge_ret_WRITE_FAIL;
     }
     lle_ptr = lle_ptr->next;
@@ -11063,7 +11082,7 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
     retval = RET_INVALID_FORMAT;
     break;
   epi_summary_merge_ret_INVALID_HEADER:
-    LOGPREPRINTFWW(logbuf, "Error: Invalid --epistasis-summary-merge header in %s.\n", inprefix);
+    LOGPREPRINTFWW(g_logbuf, "Error: Invalid --epistasis-summary-merge header in %s.\n", inprefix);
   epi_summary_merge_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
@@ -11072,14 +11091,14 @@ int32_t epi_summary_merge(Epi_info* epi_ip, char* outname, char* outname_end) {
  epi_summary_merge_ret_1:
   fclose_cond(infile);
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 void test_mishap_write_line(FILE* outfile, char* wptr, uint32_t prev_alen, uint32_t next_alen, const char* prev_aptr, const char* next_aptr, double* total_cts, double* curhap_cts, double tot_recip, double output_min_p, char* flankstr, uint32_t flanklen) {
   // total_cts[0] = caseN[0] + caseN[1]
   // total_cts[1] = controlN[0] + controlN[1]
-  char* tbuf_cur = tbuf;
+  char* tbuf_cur = g_textbuf;
   double casen_1 = total_cts[0] - curhap_cts[0];
   double ctrln_1 = total_cts[1] - curhap_cts[1];
   uint32_t uii = prev_alen + next_alen;
@@ -11097,7 +11116,7 @@ void test_mishap_write_line(FILE* outfile, char* wptr, uint32_t prev_alen, uint3
       wptr = memcpya(wptr, next_aptr, next_alen);
     }
   } else {
-    fwrite(tbuf, 1, (uintptr_t)(wptr - tbuf), outfile);
+    fwrite(g_textbuf, 1, (uintptr_t)(wptr - g_textbuf), outfile);
     if (prev_alen) {
       fputs(prev_aptr, outfile);
     }
@@ -11108,25 +11127,25 @@ void test_mishap_write_line(FILE* outfile, char* wptr, uint32_t prev_alen, uint3
   }
   *wptr++ = ' ';
   if (total_cts[0] > 0.0) {
-    wptr = double_g_writewx3(wptr, curhap_cts[0] / total_cts[0], 8);
+    wptr = dtoa_g_wxp3(curhap_cts[0] / total_cts[0], 8, wptr);
   } else {
     wptr = memcpya(wptr, "      NA", 8);
   }
   *wptr++ = ' ';
   if (total_cts[1] > 0.0) {
-    wptr = double_g_writewx3(wptr, curhap_cts[1] / total_cts[1], 8);
+    wptr = dtoa_g_wxp3(curhap_cts[1] / total_cts[1], 8, wptr);
   } else {
     wptr = memcpya(wptr, "      NA", 8);
   }
   *wptr++ = ' ';
-  wptr2 = double_g_write(wptr, curhap_cts[0]);
+  wptr2 = dtoa_g(curhap_cts[0], wptr);
   *wptr2++ = '/';
-  wptr2 = double_g_write(wptr2, curhap_cts[1]);
+  wptr2 = dtoa_g(curhap_cts[1], wptr2);
   wptr = width_force(20, wptr, wptr2);
   *wptr++ = ' ';
-  wptr2 = double_g_write(wptr, casen_1);
+  wptr2 = dtoa_g(casen_1, wptr);
   *wptr2++ = '/';
-  wptr2 = double_g_write(wptr2, ctrln_1);
+  wptr2 = dtoa_g(ctrln_1, wptr2);
   wptr = width_force(20, wptr, wptr2);
   *wptr++ = ' ';
   if ((curhap_cts[0] > 0.0) && (curhap_cts[1] > 0.0) && (casen_1 > 0.0) && (ctrln_1 > 0.0)) {
@@ -11144,10 +11163,10 @@ void test_mishap_write_line(FILE* outfile, char* wptr, uint32_t prev_alen, uint3
     cur_expected = row_mult * total_cts[1];
     dxx = ctrln_1 - cur_expected;
     chisq += dxx * dxx / cur_expected;
-    wptr = double_g_writewx3(wptr, chisq, 8);
+    wptr = dtoa_g_wxp3(chisq, 8, wptr);
     *wptr++ = ' ';
     dxx = chiprob_p(chisq, 1);
-    wptr = double_g_writewx3(wptr, MAXV(dxx, output_min_p), 8);
+    wptr = dtoa_g_wxp3(MAXV(dxx, output_min_p), 8, wptr);
   } else {
     wptr = memcpya(wptr, "      NA       NA", 17);
   }
@@ -11156,14 +11175,14 @@ void test_mishap_write_line(FILE* outfile, char* wptr, uint32_t prev_alen, uint3
 }
 
 int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, double min_maf, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + BITCT2 - 1) / BITCT2;
-  uintptr_t sample_ctl2 = (sample_ct + BITCT2 - 1) / BITCT2;
-  uintptr_t sample_ctv2 = 2 * ((sample_ct + BITCT - 1) / BITCT);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
   uintptr_t final_mask = get_final_mask(sample_ct);
-  char* tbuf2 = &(tbuf[MAXLINELEN]);
+  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
   char* wptr2 = NULL;
   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
   uint32_t inspected_ct = 0;
@@ -11226,10 +11245,10 @@ int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
     logerrprint("Error: --test-mishap does not support >= 2^30 samples.\n");
     goto test_mishap_ret_INVALID_CMDLINE;
   }
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, sample_ctv2 * 3 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&maskbuf_mid, sample_ctv2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&maskbuf, sample_ctv2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(sample_ctv2 * 3, &loadbuf) ||
+      bigstack_alloc_ul(sample_ctv2, &maskbuf_mid) ||
+      bigstack_alloc_ul(sample_ctv2, &maskbuf)) {
     goto test_mishap_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
@@ -11242,11 +11261,11 @@ int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
   loadbuf_end = &(loadbuf[sample_ctv2 * 3]);
   tbuf2[0] = ' ';
   memcpy(outname_end, ".missing.hap", 13);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto test_mishap_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, "%%%us  HAPLOTYPE      F_0      F_1                 M_H1                 M_H2    CHISQ        P FLANKING\n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  sprintf(g_textbuf, "%%%us  HAPLOTYPE      F_0      F_1                 M_H1                 M_H2    CHISQ        P FLANKING\n", plink_maxsnp);
+  fprintf(outfile, g_textbuf, "SNP");
   min_maf *= 1 - SMALL_EPSILON;
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
@@ -11269,7 +11288,7 @@ int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
     if (fseeko(bedfile, bed_offset + marker_uidx_cur * ((uint64_t)unfiltered_sample_ct4), SEEK_SET)) {
       goto test_mishap_ret_READ_FAIL;
     }
-    if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, cursnp_ptr, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx_cur))) {
+    if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx_cur), bedfile, loadbuf_raw, cursnp_ptr)) {
       goto test_mishap_ret_READ_FAIL;
     }
     missing_ct_cur = count_01(cursnp_ptr, sample_ctl2);
@@ -11289,7 +11308,7 @@ int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
 	    goto test_mishap_ret_READ_FAIL;
 	  }
 	}
-        if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, nextsnp_ptr, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx_next))) {
+        if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx_next), bedfile, loadbuf_raw, nextsnp_ptr)) {
           goto test_mishap_ret_READ_FAIL;
 	}
         missing_ct_next = count_01(nextsnp_ptr, sample_ctl2);
@@ -11300,21 +11319,21 @@ int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
       if (missing_ct_cur < 5) {
 	continue;
       }
-      vec_init_01(unfiltered_sample_ct, cursnp_ptr, maskbuf_mid);
+      quatervec_copy_only_01(cursnp_ptr, unfiltered_sample_ct, maskbuf_mid);
       uiptr = counts;
       for (uii = 0; uii < 2; uii++) {
 	if (uii) {
-	  vec_invert(unfiltered_sample_ct, maskbuf_mid);
+	  quatervec_01_invert(unfiltered_sample_ct, maskbuf_mid);
 	}
         for (ujj = 0; ujj < 3; ujj++) {
           vec_datamask(unfiltered_sample_ct, ujj + (ujj + 1) / 2, prevsnp_ptr, maskbuf_mid, maskbuf);
 	  ukk = popcount01_longs(maskbuf, sample_ctl2);
-	  vec_3freq(sample_ctl2, nextsnp_ptr, maskbuf, &umm, &(uiptr[1]), &(uiptr[2]));
+	  genovec_3freq(nextsnp_ptr, maskbuf, sample_ctl2, &umm, &(uiptr[1]), &(uiptr[2]));
 	  uiptr[0] = ukk - umm - uiptr[1] - uiptr[2];
 	  uiptr = &(uiptr[3]);
 	}
       }
-      wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx_cur * max_marker_id_len]), tbuf);
+      wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx_cur * max_marker_id_len]), g_textbuf);
       *wptr++ = ' ';
       if (marker_uidx_prev != ~ZEROLU) {
 	prev_a1len = strlen(marker_allele_ptrs[2 * marker_uidx_prev]);
@@ -11515,7 +11534,7 @@ int32_t test_mishap(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
     break;
   }
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -11525,6 +11544,7 @@ static uintptr_t* g_ld_result_bitfield;
 THREAD_RET_TYPE ld_map_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
   uint32_t thread_ct = g_ld_thread_ct;
+  // er, this use of "ctv" is nonstandard, probably want to fix this later
   uintptr_t marker_ctv = ((g_ld_marker_ct + 127) / 128) * (128 / BITCT);
   uintptr_t idx1_offset = g_ld_block_idx1;
   uintptr_t block_idx1_start = (tidx * g_ld_idx1_block_size) / thread_ct;
@@ -11606,7 +11626,7 @@ THREAD_RET_TYPE ld_map_thread(void* arg) {
         dyy = dp_result[2];
         cov12 = dp_result[0] * non_missing_ctd - dxx * dyy;
         if (cov12 * cov12 <= r2_thresh * ((dp_result[3] * non_missing_ctd + dxx * dxx) * (dp_result[4] * non_missing_ctd + dyy * dyy))) {
-          clear_bit(rb_cur, marker_idx2);
+          clear_bit(marker_idx2, rb_cur);
 	}
 	uii = marker_idx2++;
 	if (is_set(rb_cur, marker_idx2)) {
@@ -11639,13 +11659,13 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   // Note that, when very large set(s) are present, and there's a moderate
   // amount of "random" long-range LD, the memory requirement may be huge.
   FILE* outfile = NULL;
-  uintptr_t topsize = 0;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uintptr_t marker_ctv = ((marker_ct + 127) / 128) * (128 / BITCT);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t max_set_id_len = sip->max_name_len;
   uintptr_t founder_ct = popcount_longs(founder_pnm, unfiltered_sample_ctl);
-  uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
+  uintptr_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
   uintptr_t founder_ctv2 = founder_ctl * 2;
   uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
   uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
@@ -11687,7 +11707,6 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   uintptr_t memreq1;
   uintptr_t memreq2;
   uintptr_t minmem;
-  uintptr_t topsize_base;
   uintptr_t idx1_block_size;
   uintptr_t idx2_block_size;
   uintptr_t cur_idx2_block_size;
@@ -11716,7 +11735,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     logerrprint("Error: Cannot construct LD map, since there are no founders with nonmissing\nphenotypes.  (--make-founders may come in handy here.)\n");
     goto construct_ld_map_ret_INVALID_CMDLINE;
   }
-  ld_map = (uint32_t**)wkspace_alloc(marker_ct * sizeof(intptr_t));
+  ld_map = (uint32_t**)bigstack_alloc(marker_ct * sizeof(intptr_t));
   if (!ld_map) {
     goto construct_ld_map_ret_NOMEM;
   }
@@ -11724,11 +11743,11 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   // To avoid too much back-and-forth disk seeking for large datasets, we
   // construct the LD map in blocks, using similar logic to the --r/--r2 and
   // --fast-epistasis computations.
-  // 1. top_alloc space for main window markers' raw data, bitfields for them
-  //    listing intersecting markers in front (i.e. we only look at the upper
-  //    right triangle of the LD matrix), and another union bitfield.
+  // 1. bigstack_end_alloc space for main window markers' raw data, bitfields
+  //    for them listing intersecting markers in front (i.e. we only look at
+  //    the upper right triangle of the LD matrix), and another union bitfield.
   //    Break the union into secondary windows, and for each secondary window:
-  //    a. top_alloc secondary window markers' raw data
+  //    a. bigstack_end_alloc secondary window markers' raw data
   //    b. perform multithreaded LD calculations, saving results via in-place
   //       clearing of the first markers' bitfields
   //    Memory requirement per main window marker is:
@@ -11745,34 +11764,24 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   // 3. save final results for each marker in compressed setdef format at the
   //    current workspace bottom
   // 4. dump .ldset file if necessary
-  loadbuf = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ct4);
+  loadbuf = (uintptr_t*)bigstack_end_alloc(unfiltered_sample_ct4);
   if (!loadbuf) {
+    // separate since unfiltered_sample_ct4 is a byte, not word, count
     goto construct_ld_map_ret_NOMEM;
   }
-  load2_bitfield = (uintptr_t*)top_alloc(&topsize, marker_ctv * sizeof(intptr_t));
-  if (!load2_bitfield) {
-    goto construct_ld_map_ret_NOMEM0;
-  }
-  tmp_set_bitfield = (uintptr_t*)top_alloc(&topsize, marker_ctv * sizeof(intptr_t));
-  if (!tmp_set_bitfield) {
-    goto construct_ld_map_ret_NOMEM0;
+  if (bigstack_end_alloc_ul(marker_ctv, &load2_bitfield) ||
+      bigstack_end_alloc_ul(marker_ctv, &tmp_set_bitfield) ||
+      bigstack_end_alloc_ul(founder_ctv2, &founder_include2) ||
+      bigstack_end_alloc_ul(founder_ctv2, &founder_male_include2)) {
+    goto construct_ld_map_ret_NOMEM;
   }
   // bugfix: last word might not be initialized by unpack_set().  Also
   // initialize second-to-last word to defend against an unpack_set()
   // implementation change.
   tmp_set_bitfield[marker_ctv - 2] = 0;
   tmp_set_bitfield[marker_ctv - 1] = 0;
-  founder_include2 = (uintptr_t*)top_alloc(&topsize, founder_ctv2 * sizeof(intptr_t));
-  if (!founder_include2) {
-    goto construct_ld_map_ret_NOMEM0;
-  }
-  founder_male_include2 = (uintptr_t*)top_alloc(&topsize, founder_ctv2 * sizeof(intptr_t));
-  if (!founder_male_include2) {
-    goto construct_ld_map_ret_NOMEM0;
-  }
   g_ld_load2_bitfield = load2_bitfield;
-  wkspace_left -= topsize;
-  alloc_collapsed_haploid_filters(unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, founder_pnm, sex_male, &founder_include2, &founder_male_include2);
+  alloc_collapsed_haploid_filters(founder_pnm, sex_male, unfiltered_sample_ct, founder_ct, XMHH_EXISTS | hh_exists, 1, &founder_include2, &founder_male_include2);
   memreq2 = founder_ct_192_long * sizeof(intptr_t) * 2 + 4;
 
   // this guarantees enough room for save_set_bitfield() worst case
@@ -11782,7 +11791,6 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (minmem < memreq1 * 4) {
     minmem = memreq1 * 4;
   }
-  topsize_base = topsize;
   g_ld_marker_ct = marker_ct;
   g_ld_founder_ct = founder_ct;
   g_ld_founder_ct_192_long = founder_ct_192_long;
@@ -11790,7 +11798,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   g_ld_founder_ct_mld_rem = founder_ct_mld_rem;
   g_ld_window_r2 = sip->set_r2 * (1 - SMALL_EPSILON);
   do {
-    ulii = (wkspace_left - topsize) / 2;
+    ulii = bigstack_left() / 2;
     if (ulii < minmem) {
       goto construct_ld_map_ret_NOMEM;
     }
@@ -11812,13 +11820,13 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     }
     g_ld_block_idx1 = marker_idx;
     g_ld_idx1_block_size = idx1_block_size;
-    geno1 = (uintptr_t*)top_alloc(&topsize, idx1_block_size * founder_ct_192_long * sizeof(intptr_t));
-    geno_masks1 = (uintptr_t*)top_alloc(&topsize, idx1_block_size * founder_ct_192_long * sizeof(intptr_t));
-    g_ld_missing_cts1 = (uint32_t*)top_alloc(&topsize, idx1_block_size * sizeof(int32_t));
-    geno2 = (uintptr_t*)top_alloc(&topsize, idx2_block_size * founder_ct_192_long * sizeof(intptr_t));
-    geno_masks2 = (uintptr_t*)top_alloc(&topsize, idx2_block_size * founder_ct_192_long * sizeof(intptr_t));
-    g_ld_missing_cts2 = (uint32_t*)top_alloc(&topsize, idx2_block_size * sizeof(int32_t));
-    result_bitfield = (uintptr_t*)top_alloc(&topsize, idx1_block_size * marker_ctv * sizeof(intptr_t));
+    bigstack_end_alloc_ul(idx1_block_size * founder_ct_192_long, &geno1);
+    bigstack_end_alloc_ul(idx1_block_size * founder_ct_192_long, &geno_masks1);
+    bigstack_end_alloc_ui(idx1_block_size, &g_ld_missing_cts1);
+    bigstack_end_alloc_ul(idx2_block_size * founder_ct_192_long, &geno2);
+    bigstack_end_alloc_ul(idx2_block_size * founder_ct_192_long, &geno_masks2);
+    bigstack_end_alloc_ui(idx2_block_size, &g_ld_missing_cts2);
+    bigstack_end_alloc_ul(idx1_block_size * marker_ctv, &result_bitfield);
     uljj = founder_trail_ct + 2;
     for (ulii = 1; ulii <= idx1_block_size; ulii++) {
       fill_ulong_zero(&(geno1[ulii * founder_ct_192_long - uljj]), uljj);
@@ -11846,9 +11854,9 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	if (wlen) {
 	  uii = marker_idx2;
 	  do {
-	    bitfield_or(&(result_bitfield[((marker_idx2 - marker_idx) * marker_ctv + firstw)]), &(tmp_set_bitfield[firstw]), wlen);
+	    bitvec_or(&(tmp_set_bitfield[firstw]), wlen, &(result_bitfield[((marker_idx2 - marker_idx) * marker_ctv + firstw)]));
 	    marker_idx2++;
-	    next_set_ck(tmp_set_bitfield, &marker_idx2, idx1_block_end);
+	    next_set_ck(tmp_set_bitfield, idx1_block_end, &marker_idx2);
 	  } while (marker_idx2 < idx1_block_end);
 	  // don't need to load the first intersecting member or anything
 	  // before it, since we're only traversing the upper right triangle
@@ -11858,8 +11866,8 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 #else
 	  firstw = uii / 32;
 #endif
-	  clear_bits(&(tmp_set_bitfield[firstw]), 0, uii + 1 - firstw * BITCT);
-	  bitfield_or(&(load2_bitfield[firstw]), &(tmp_set_bitfield[firstw]), wlen - firstw);
+	  clear_bits(0, uii + 1 - firstw * BITCT, &(tmp_set_bitfield[firstw]));
+	  bitvec_or(&(tmp_set_bitfield[firstw]), wlen - firstw, &(load2_bitfield[firstw]));
 	}
       }
     }
@@ -11894,7 +11902,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       }
       ulii = block_idx1 * founder_ct_192_long;
       loadbuf_ptr = &(geno1[ulii]);
-      if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, loadbuf_ptr, founder_ct, founder_pnm, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_pnm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf, loadbuf_ptr)) {
 	goto construct_ld_map_ret_READ_FAIL;
       }
       if (is_haploid && hh_exists) {
@@ -11927,7 +11935,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	}
 	ulii = block_idx2 * founder_ct_192_long;
 	loadbuf_ptr = &(geno2[ulii]);
-	if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, loadbuf_ptr, founder_ct, founder_pnm, final_mask, IS_SET(marker_reverse, marker_uidx2))) {
+	if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_pnm, final_mask, IS_SET(marker_reverse, marker_uidx2), bedfile, loadbuf, loadbuf_ptr)) {
 	  goto construct_ld_map_ret_READ_FAIL;
 	}
 	if (is_haploid && hh_exists) {
@@ -11967,11 +11975,11 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       while (1) {
 	marker_idx2 = next_set(rb_ptr, marker_idx2, block_idx1);
 	if (marker_idx2 == block_idx1) {
-	  clear_bit(rb_ptr, block_idx1);
+	  clear_bit(block_idx1, rb_ptr);
 	  break;
 	}
 	if (!in_setdef(ld_map[marker_idx2], block_idx1)) {
-	  clear_bit(rb_ptr, marker_idx2);
+	  clear_bit(marker_idx2, rb_ptr);
 	}
 	marker_idx2++;
       }
@@ -11981,13 +11989,14 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       }
       save_set_bitfield(rb_ptr, marker_ct, range_start, range_end, 0, &(ld_map[block_idx1]));
     }
-    topsize = topsize_base; // "free" previous round of allocations
+    // free previous round of allocations
+    bigstack_end_reset(founder_male_include2);
     marker_idx = idx1_block_end;
   } while (marker_idx < marker_ct);
   if (sip->modifier & SET_R2_WRITE) {
     memcpy(charbuf, outname_end, 8);
     memcpy(outname_end, ".ldset", 7);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto construct_ld_map_ret_OPEN_FAIL;
     }
     set_uidx = 0;
@@ -11995,7 +12004,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       next_set_unsafe_ck(set_incl, &set_uidx);
       sptr = &(sip->names[set_uidx * max_set_id_len]);
       uii = strlen(sptr);
-      wptr_start = memcpyax(tbuf, sptr, uii, ' ');
+      wptr_start = memcpyax(g_textbuf, sptr, uii, ' ');
       cur_setdef = setdefs[set_idx];
       setdef_iter_init(cur_setdef, marker_ct, 0, &marker_idx, &setdef_incr_aux);
 
@@ -12010,7 +12019,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 	    if (!uii) {
 	      uii = 1;
 	      wptr = strcpyax(wptr_start, &(marker_ids[marker_idx_to_uidx[marker_idx] * max_marker_id_len]), ' ');
-	      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 		goto construct_ld_map_ret_WRITE_FAIL;
 	      }
 	    }
@@ -12035,10 +12044,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   } else {
     logprint("LD map constructed.\n");
   }
-  wkspace_left += topsize;
   while (0) {
-  construct_ld_map_ret_NOMEM0:
-    topsize = 0;
   construct_ld_map_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -12059,7 +12065,7 @@ int32_t construct_ld_map(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     break;
   }
   fclose_cond(outfile);
-  wkspace_left += topsize;
+  bigstack_end_reset(bigstack_end_mark);
   return retval;
 }
 
@@ -12129,7 +12135,7 @@ int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   uintptr_t marker_ct_mid = *marker_ct_ptr;
   uintptr_t marker_ct = marker_ct_mid;
   uintptr_t raw_set_ct = sip->ct;
-  uintptr_t raw_set_ctl = (raw_set_ct + (BITCT - 1)) / BITCT;
+  uintptr_t raw_set_ctl = BITCT_TO_WORDCT(raw_set_ct);
   uintptr_t set_ct = 0;
   uintptr_t* marker_exclude_mid = *marker_exclude_ptr;
   double chisq_threshold = inverse_chiprob(sip->set_p, 1);
@@ -12154,12 +12160,11 @@ int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   uint32_t include_out_of_bounds;
   uint32_t cur_set_size;
   uint32_t uii;
-  if (wkspace_alloc_ul_checked(set_incl_ptr, raw_set_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&marker_midx_to_idx, marker_ct_orig * sizeof(int32_t))) {
+  if (bigstack_calloc_ul(raw_set_ctl, set_incl_ptr) ||
+      bigstack_alloc_ui(marker_ct_orig, &marker_midx_to_idx)) {
     goto set_test_common_init_ret_NOMEM;
   }
   set_incl = *set_incl_ptr;
-  fill_ulong_zero(set_incl, raw_set_ctl);
   fill_midx_to_idx(marker_exclude_orig, marker_exclude_mid, marker_ct, marker_midx_to_idx);
 
   // determine which sets contain at least one significant marker.  do not
@@ -12226,7 +12231,7 @@ int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
       }
     }
     if (uii) {
-      SET_BIT(set_incl, set_uidx);
+      SET_BIT(set_uidx, set_incl);
       set_ct++;
       if (cur_set_size > max_sigset_size) {
 	max_sigset_size = cur_set_size;
@@ -12238,7 +12243,7 @@ int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
     goto set_test_common_init_ret_1;
   }
   LOGPRINTFWW("%s set test: Testing %" PRIuPTR " set%s with at least one significant variant.\n", flag_descrip, set_ct, (set_ct == 1)? "" : "s");
-  wkspace_reset((unsigned char*)marker_midx_to_idx);
+  bigstack_reset((unsigned char*)marker_midx_to_idx);
   if (set_ct < raw_set_ct) {
     marker_ct = marker_ct_orig;
     if (extract_set_union_unfiltered(sip, set_incl, unfiltered_marker_ct, marker_exclude_orig, marker_exclude_ptr, &marker_ct)) {
@@ -12247,7 +12252,7 @@ int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   }
   // Okay, we've pruned all we can, now it's time to suck it up and construct
   // the potentially huge LD map
-  if (wkspace_alloc_ui_checked(marker_idx_to_uidx_ptr, marker_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(marker_ct, marker_idx_to_uidx_ptr)) {
     goto set_test_common_init_ret_NOMEM;
   }
   fill_idx_to_uidx(*marker_exclude_ptr, unfiltered_marker_ct, marker_ct, *marker_idx_to_uidx_ptr);
@@ -12267,11 +12272,11 @@ int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
     // caller needs to collapse other arrays
     inplace_delta_collapse_arr((char*)orig_chisq, sizeof(double), marker_ct_mid, marker_ct, marker_exclude_mid, *marker_exclude_ptr);
   }
-  if (wkspace_alloc_d_checked(orig_set_scores_ptr, set_ct * sizeof(double)) ||
-      wkspace_alloc_d_checked(sorted_chisq_buf_ptr, max_sigset_size * sizeof(double)) ||
-      wkspace_alloc_ui_checked(sorted_marker_idx_buf_ptr, max_sigset_size * sizeof(int32_t)) ||
-      // technically assumes sizeof(double) >= sizeof(intptr_t)
-      wkspace_alloc_ui_checked(proxy_arr_ptr, max_sigset_size * (sizeof(double) + sizeof(int32_t)))) {
+  if (bigstack_alloc_d(set_ct, orig_set_scores_ptr) ||
+      bigstack_alloc_d(max_sigset_size, sorted_chisq_buf_ptr) ||
+      bigstack_alloc_ui(max_sigset_size, sorted_marker_idx_buf_ptr) ||
+      // 3 int32s = max(sizeof(double), sizeof(intptr_t)) + sizeof(int32_t)
+      bigstack_alloc_ui(max_sigset_size * 3LU, proxy_arr_ptr)) {
     goto set_test_common_init_ret_NOMEM;
   }
   orig_set_scores = *orig_set_scores_ptr;
@@ -12282,15 +12287,14 @@ int32_t set_test_common_init(pthread_t* threads, FILE* bedfile, uintptr_t bed_of
   }
   // just treat --mperm as --perm with min_perms == max_perms, since this isn't
   // a proper max(T) test
-  if (wkspace_alloc_ul_checked(perm_adapt_set_unstopped_ptr, ((set_ct + (BITCT - 1)) / BITCT) * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(perm_2success_ct_ptr, set_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(perm_attempt_ct_ptr, set_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(unstopped_markers_ptr, ((marker_ct + (BITCT - 1)) / BITCT) * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(BITCT_TO_WORDCT(set_ct), perm_adapt_set_unstopped_ptr) ||
+      bigstack_calloc_ui(set_ct, perm_2success_ct_ptr) ||
+      bigstack_alloc_ui(set_ct, perm_attempt_ct_ptr) ||
+      bigstack_alloc_ul(BITCT_TO_WORDCT(marker_ct), unstopped_markers_ptr)) {
     goto set_test_common_init_ret_NOMEM;
   }
-  fill_all_bits(*perm_adapt_set_unstopped_ptr, set_ct);
-  fill_uint_zero(*perm_2success_ct_ptr, set_ct);
-  fill_all_bits(*unstopped_markers_ptr, marker_ct);
+  fill_all_bits(set_ct, *perm_adapt_set_unstopped_ptr);
+  fill_all_bits(marker_ct, *unstopped_markers_ptr);
   while (0) {
   set_test_common_init_ret_NOMEM:
     retval = RET_NOMEM;
@@ -12334,7 +12338,7 @@ void compute_set_scores(uintptr_t marker_ct, uintptr_t perm_vec_ct, uintptr_t se
 	    pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
 	    dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
 	    if ((pval - dxx > apip->alpha) || (pval + dxx < apip->alpha)) {
-	      CLEAR_BIT(perm_adapt_set_unstopped, set_idx);
+	      CLEAR_BIT(set_idx, perm_adapt_set_unstopped);
 	      perm_attempt_ct[set_idx] = next_adapt_check;
 	      break;
 	    }
@@ -12370,35 +12374,35 @@ int32_t write_set_test_results(char* outname, char* outname_end2, Set_info* sip,
     if (alloc_and_populate_nonempty_set_incl(sip, &nonempty_set_ct, &nonempty_set_incl)) {
       goto write_set_test_results_ret_NOMEM;
     }
-    if (wkspace_alloc_d_checked(&empirical_pvals, nonempty_set_ct * sizeof(double))) {
+    if (bigstack_alloc_d(nonempty_set_ct, &empirical_pvals)) {
       goto write_set_test_results_ret_NOMEM;
     }
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_set_test_results_ret_OPEN_FAIL;
   }
   fprintf(outfile, "         SET   NSNP   NSIG   ISIG         EMP1 %sSNPS\n", perm_count? "          NP " : "");
   for (set_uidx = 0, set_midx = 0, set_idx = 0; set_uidx < raw_set_ct; set_uidx++) {
-    bufptr = fw_strcpy(12, &(sip->names[set_uidx * max_set_id_len]), tbuf);
+    bufptr = fw_strcpy(12, &(sip->names[set_uidx * max_set_id_len]), g_textbuf);
     *bufptr++ = ' ';
-    bufptr = uint32_writew6x(bufptr, setdef_size(sip->setdefs[set_uidx], marker_ct_orig), ' ');
+    bufptr = uint32toa_w6x(setdef_size(sip->setdefs[set_uidx], marker_ct_orig), ' ', bufptr);
     if (IS_SET(set_incl, set_uidx)) {
       set_test_score(marker_ct, chisq_threshold, sip->set_max, orig_stats, ld_map, setdefs[set_idx], sorted_chisq_buf, sorted_marker_idx_buf, proxy_arr, &raw_sig_ct, &final_sig_ct, &cur_score);
-      bufptr = uint32_writew6x(bufptr, raw_sig_ct, ' ');
-      bufptr = uint32_writew6x(bufptr, final_sig_ct, ' ');
+      bufptr = uint32toa_w6x(raw_sig_ct, ' ', bufptr);
+      bufptr = uint32toa_w6x(final_sig_ct, ' ', bufptr);
       pval = ((double)(perm_2success_ct[set_idx] + 2)) / ((double)(2 * (perm_attempt_ct[set_idx] + 1)));
       if (empirical_pvals) {
 	empirical_pvals[set_midx] = pval;
       }
       if (pval <= pfilter) {
 	if (!perm_count) {
-	  bufptr = double_g_writewx4x(bufptr, MAXV(pval, output_min_p), 12, ' ');
+	  bufptr = dtoa_g_wxp4x(MAXV(pval, output_min_p), 12, ' ', bufptr);
 	} else {
-	  bufptr = double_g_writewx4(bufptr, ((double)perm_2success_ct[set_idx]) * 0.5, 12);
+	  bufptr = dtoa_g_wxp4(((double)perm_2success_ct[set_idx]) * 0.5, 12, bufptr);
 	  bufptr = memseta(bufptr, 32, 3);
-	  bufptr = uint32_writew10x(bufptr, perm_attempt_ct[set_idx], ' ');
+	  bufptr = uint32toa_w10x(perm_attempt_ct[set_idx], ' ', bufptr);
 	}
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto write_set_test_results_ret_WRITE_FAIL;
 	}
 	fputs(&(marker_ids[marker_idx_to_uidx[proxy_arr[0]] * max_marker_id_len]), outfile);
@@ -12418,7 +12422,7 @@ int32_t write_set_test_results(char* outname, char* outname_end2, Set_info* sip,
       } else {
         bufptr = memcpya(bufptr, "     0      0            0            0 NA\n", 43);
       }
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto write_set_test_results_ret_WRITE_FAIL;
       }
       if (nonempty_set_incl && is_set(nonempty_set_incl, set_uidx)) {
@@ -12432,7 +12436,7 @@ int32_t write_set_test_results(char* outname, char* outname_end2, Set_info* sip,
   }
   LOGPRINTFWW("Set test results written to %s .\n", outname);
   if (empirical_pvals) {
-    if (wkspace_alloc_ui_checked(&nonempty_set_idx_to_uidx, nonempty_set_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(nonempty_set_ct, &nonempty_set_idx_to_uidx)) {
       goto write_set_test_results_ret_NOMEM;
     }
     fill_idx_to_uidx_incl(nonempty_set_incl, raw_set_ct, nonempty_set_ct, nonempty_set_idx_to_uidx);
@@ -12491,20 +12495,20 @@ void update_clump_histo(double pval, uintptr_t* histo) {
 }
 
 int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, Clump_info* clump_ip, uintptr_t* sex_male, uint32_t hh_exists) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   gzFile gz_infile = NULL;
   FILE* outfile = NULL;
   FILE* outfile_ranges = NULL;
   FILE* outfile_best = NULL;
-  uintptr_t marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl);
-  uintptr_t founder_ctl2 = (founder_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t founder_ctv2 = 2 * ((founder_ct + (BITCT - 1)) / BITCT);
+  uintptr_t founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
+  uintptr_t founder_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(founder_ct);
   uintptr_t final_mask = get_final_mask(founder_ct);
-  uintptr_t topsize = 0;
   uintptr_t range_group_ct = 0;
   uintptr_t max_range_group_id_len = 0;
   uintptr_t max_header_len = 2;
@@ -12519,7 +12523,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   char* range_group_names = NULL;
   char* fname_ptr = NULL;
   char* annot_flattened = clump_ip->annotate_flattened;
-  char* tbuf2 = &(tbuf[MAXLINELEN]);
+  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
   char* header2_ptr = NULL;
   char* annot_ptr = NULL;
   char* cur_rg_names = NULL;
@@ -12555,7 +12559,6 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   Clump_entry* clump_entry_ptr;
   Clump_entry* best_entry_ptr;
   Cur_clump_info* cur_clump_base;
-  Cur_clump_info* cur_clump_ceil;
   Cur_clump_info* cc_ptr;
   uintptr_t* col_bitfield;
   uintptr_t* cur_bitfield;
@@ -12584,6 +12587,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   Clump_missing_id* cm_ptr;
   uintptr_t header_dict_ct;
   uintptr_t extra_annot_space;
+  uintptr_t cur_bigstack_left;
   uintptr_t loadbuft_size;
   uintptr_t marker_idx;
   uintptr_t last_marker_idx;
@@ -12665,15 +12669,14 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     }
   }
   // 2. create marker ID hash table, allocate index-tracking bitfield
-  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
   if (retval) {
     goto clump_reports_ret_1;
   }
-  if (wkspace_alloc_ul_checked(&cur_bitfield, marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&marker_uidx_to_idx, unfiltered_marker_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_ul(marker_ctl, &cur_bitfield) ||
+      bigstack_alloc_ui(unfiltered_marker_ct, &marker_uidx_to_idx)) {
     goto clump_reports_ret_NOMEM;
   }
-  fill_ulong_zero(cur_bitfield, marker_ctl);
   fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_uidx_to_idx);
   if (clump_ip->snpfield_search_order) {
     snpfield_search_ct = count_and_measure_multistr(clump_ip->snpfield_search_order, &max_header_len);
@@ -12698,11 +12701,11 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   // stores string length. 
   annot_ct_p2 = 2 + annot_ct;
   annot_ct_p2_ctl = (annot_ct + (BITCT + 1)) / BITCT;
-  if (wkspace_alloc_c_checked(&sorted_header_dict, max_header_len * header_dict_ct) ||
-      wkspace_alloc_ui_checked(&header_id_map, header_dict_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&col_bitfield, annot_ct_p2_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&parse_table, annot_ct_p2 * 2 * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&cur_parse_info, annot_ct_p2 * 2 * sizeof(int32_t))) {
+  if (bigstack_alloc_c(max_header_len * header_dict_ct, &sorted_header_dict) ||
+      bigstack_alloc_ui(header_dict_ct, &header_id_map) ||
+      bigstack_alloc_ul(annot_ct_p2_ctl, &col_bitfield) ||
+      bigstack_alloc_ui(annot_ct_p2 * 2, &parse_table) ||
+      bigstack_alloc_ui(annot_ct_p2 * 2, &cur_parse_info)) {
     goto clump_reports_ret_NOMEM;
   }
   ulii = 0; // write position
@@ -12751,11 +12754,10 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     goto clump_reports_ret_INVALID_CMDLINE;
   }
 
-  if (wkspace_alloc_ui_checked(&nsig_arr, marker_ct * sizeof(int32_t))) {
+  if (bigstack_calloc_ui(marker_ct, &nsig_arr)) {
     goto clump_reports_ret_NOMEM;
   }
-  fill_uint_zero(nsig_arr, marker_ct);
-  clump_entries = (Clump_entry**)wkspace_alloc(marker_ct * sizeof(intptr_t));
+  clump_entries = (Clump_entry**)bigstack_alloc(marker_ct * sizeof(intptr_t));
   if (!clump_entries) {
     goto clump_reports_ret_NOMEM;
   }
@@ -12771,7 +12773,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     bufptr++;
     file_ct++;
   } while (*bufptr);
-  loadbuft = (char*)wkspace_base;
+  loadbuft = (char*)g_bigstack_base;
   if (clump_best) {
     if (file_ct == 2) {
       if (!clump_index_first) {
@@ -12792,9 +12794,10 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   // To reduce the risk of 32-bit integer overflow bugs, we cap line length at
   // a bit under 2^30 instead of 2^31 here.
   extra_annot_space = (48 + 2 * annot_ct) & (~(15 * ONELU));
-  if (wkspace_left <= 2 * MAXLINELEN + extra_annot_space) {
+  cur_bigstack_left = bigstack_left();
+  if (cur_bigstack_left <= 2 * MAXLINELEN + extra_annot_space) {
     goto clump_reports_ret_NOMEM;
-  } else if (wkspace_left - extra_annot_space >= MAXLINEBUFLEN) {
+  } else if (cur_bigstack_left - extra_annot_space >= MAXLINEBUFLEN) {
     loadbuft[(MAXLINEBUFLEN / 2) - 1] = ' ';
   }
   if (clump_index_first && (file_ct > 1)) {
@@ -12802,15 +12805,13 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   }
   // load in reverse order since we're adding to the front of the linked lists
   for (file_idx = file_ct; file_idx; file_idx--) {
-    if (gzopen_checked(&gz_infile, fname_ptr, "rb")) {
-      goto clump_reports_ret_OPEN_FAIL;
-    }
-    if (gzbuffer(gz_infile, 131072)) {
-      goto clump_reports_ret_NOMEM;
+    retval = gzopen_read_checked(fname_ptr, &gz_infile);
+    if (retval) {
+      goto clump_reports_ret_1;
     }
-    loadbuft_size = wkspace_left - topsize;
+    loadbuft_size = bigstack_left();
     if (loadbuft_size <= 2 * MAXLINELEN + extra_annot_space) {
-      goto clump_reports_ret_NOMEM2;
+      goto clump_reports_ret_NOMEM;
     }
     loadbuft_size = (loadbuft_size - extra_annot_space) / 2;
     if (loadbuft_size >= MAXLINEBUFLEN / 2) {
@@ -12865,7 +12866,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
         if (ujj >= 0x40000000) {
           if (ujj < ukk) {
 	    // ignore title if higher-precedence title already seen
-	    set_bit(col_bitfield, 0);
+	    set_bit(0, col_bitfield);
 	    ukk = ujj;
 	    parse_table[1] = uii; // temporary storage
 	  } else if (ujj == ukk) {
@@ -12873,7 +12874,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	  }
 	} else if (ujj >= 0x20000000) {
 	  if (ujj < umm) {
-	    set_bit(col_bitfield, 1);
+	    set_bit(1, col_bitfield);
             umm = ujj;
 	    parse_table[3] = uii;
 	  } else if (ujj == umm) {
@@ -12883,7 +12884,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	  if (is_set(col_bitfield, ujj)) {
 	    goto clump_reports_ret_DUPLICATE_HEADER_COL;
 	  }
-	  set_bit(col_bitfield, ujj);
+	  set_bit(ujj, col_bitfield);
           parse_table[cur_read_ct * 2 + 1] = uii;
 	  parse_table[cur_read_ct * 2] = ujj;
 	  cur_read_ct++;
@@ -12983,7 +12984,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	}
 	continue;
       }
-      clump_entry_ptr = (Clump_entry*)top_alloc(&topsize, offsetof(Clump_entry, annot) + ukk - 1);
+      clump_entry_ptr = (Clump_entry*)bigstack_end_alloc(offsetof(Clump_entry, annot) + ukk - 1);
       if (!clump_entry_ptr) {
 	goto clump_reports_ret_NOMEM;
       }
@@ -13004,11 +13005,11 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       }
       clump_entries[marker_idx] = clump_entry_ptr;
       if ((pval <= p1_thresh) && index_eligible) {
-	set_bit(cur_bitfield, marker_idx);
+	set_bit(marker_idx, cur_bitfield);
       }
-      loadbuft_size = wkspace_left - topsize;
+      loadbuft_size = bigstack_left();
       if (loadbuft_size <= 2 * MAXLINELEN + extra_annot_space) {
-	goto clump_reports_ret_NOMEM2;
+	goto clump_reports_ret_NOMEM;
       }
       loadbuft_size = (loadbuft_size - extra_annot_space) / 2;
       if (loadbuft_size >= MAXLINEBUFLEN / 2) {
@@ -13038,10 +13039,9 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     logerrprint("Warning: No significant --clump results.  Skipping.\n");
     goto clump_reports_ret_1;
   }
-  wkspace_left -= topsize;
-  if (wkspace_alloc_d_checked(&sorted_pvals, index_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&pval_map, index_ct * sizeof(int32_t))) {
-    goto clump_reports_ret_NOMEM2;
+  if (bigstack_alloc_d(index_ct, &sorted_pvals) ||
+      bigstack_alloc_ui(index_ct, &pval_map)) {
+    goto clump_reports_ret_NOMEM;
   }
   marker_idx = 0;
   for (uii = 0; uii < index_ct; uii++, marker_idx++) {
@@ -13060,39 +13060,37 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     pval_map[uii] = marker_idx;
   }
   if (qsort_ext((char*)sorted_pvals, index_ct, sizeof(double), double_cmp_deref, (char*)pval_map, sizeof(int32_t))) {
-    goto clump_reports_ret_NOMEM2;
+    goto clump_reports_ret_NOMEM;
   }
-  if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&index_data, 5 * founder_ctv2 * sizeof(intptr_t))) {
-    goto clump_reports_ret_NOMEM2;
+  if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(5 * founder_ctv2, &index_data)) {
+    goto clump_reports_ret_NOMEM;
   }
   for (uii = 1; uii <= 5; uii++) {
     index_data[uii * founder_ctv2 - 2] = 0;
     index_data[uii * founder_ctv2 - 1] = 0;
   }
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, founder_ct, Y_FIX_NEEDED, 1, founder_info, sex_male, &founder_include2, &founder_male_include2)) {
-    goto clump_reports_ret_NOMEM2; 
+  if (alloc_collapsed_haploid_filters(founder_info, sex_male, unfiltered_sample_ct, founder_ct, Y_FIX_NEEDED, 1, &founder_include2, &founder_male_include2)) {
+    goto clump_reports_ret_NOMEM; 
  }
   if (clump_verbose && rg_setdefs) {
-    if (wkspace_alloc_ul_checked(&rangematch_bitfield, ((range_chrom_max + (BITCT - 1)) / BITCT) * sizeof(intptr_t))) {
-      goto clump_reports_ret_NOMEM2;
+    if (bigstack_alloc_ul(BITCT_TO_WORDCT(range_chrom_max), &rangematch_bitfield)) {
+      goto clump_reports_ret_NOMEM;
     }
   }
-  window_data = (uintptr_t*)wkspace_base;
-  max_window_size = wkspace_left / (founder_ctv2 * sizeof(intptr_t) + sizeof(Cur_clump_info));
+  window_data = (uintptr_t*)g_bigstack_base;
+  max_window_size = bigstack_left() / (founder_ctv2 * sizeof(intptr_t) + sizeof(Cur_clump_info));
   if (!max_window_size) {
-    goto clump_reports_ret_NOMEM2;
+    goto clump_reports_ret_NOMEM;
   }
-  cur_clump_ceil = (Cur_clump_info*)(&(wkspace_base[wkspace_left]));
   fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
-  wkspace_left += topsize;
   // now this indicates whether a variant has previously been in a clump
   fill_ulong_zero(cur_bitfield, marker_ctl);
   // 5. iterate through clumps, calculate r^2 and write output
   memcpy(outname_end, ".clumped", 9);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto clump_reports_ret_OPEN_FAIL;
   }
   bufptr = tbuf2;
@@ -13111,7 +13109,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     }
     if (rg_setdefs) {
       memcpy(&(outname_end[8]), ".ranges", 8);
-      if (fopen_checked(&outfile_ranges, outname, "w")) {
+      if (fopen_checked(outname, "w", &outfile_ranges)) {
 	goto clump_reports_ret_OPEN_FAIL;
       }
       bufptr = fw_strcpyn(plink_maxsnp, 3, "SNP", &(tbuf2[5]));
@@ -13135,14 +13133,14 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   }
   if (clump_best) {
     memcpy(&(outname_end[8]), ".best", 6);
-    if (fopen_checked(&outfile_best, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile_best)) {
       goto clump_reports_ret_OPEN_FAIL;
     }
-    bufptr = fw_strcpyn(plink_maxsnp, 5, "INDEX", tbuf);
+    bufptr = fw_strcpyn(plink_maxsnp, 5, "INDEX", g_textbuf);
     *bufptr++ = ' ';
     bufptr = fw_strcpyn(plink_maxsnp, 4, "PSNP", bufptr);
     bufptr = strcpya(bufptr, "    RSQ       KB        P  ALLELES        F\n");
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile_best)) {
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
       goto clump_reports_ret_WRITE_FAIL;
     }
   }
@@ -13189,7 +13187,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       }
       window_data_ptr[founder_ctv2 - 2] = 0;
       window_data_ptr[founder_ctv2 - 1] = 0;
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, window_data_ptr, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx), bedfile, loadbuf_raw, window_data_ptr)) {
 	goto clump_reports_ret_READ_FAIL;
       }
       if (is_haploid) {
@@ -13203,7 +13201,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     }
     window_data_ptr[founder_ctv2 - 2] = 0;
     window_data_ptr[founder_ctv2 - 1] = 0;
-    if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, window_data_ptr, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx))) {
+    if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx), bedfile, loadbuf_raw, window_data_ptr)) {
       goto clump_reports_ret_READ_FAIL;
     }
     if (is_haploid) {
@@ -13240,16 +13238,16 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       if (((!allow_overlap) && is_set(cur_bitfield, marker_idx)) || ((!clump_entry_ptr) && (!nsig_arr[marker_idx]))) {
 	continue;
       }
-      vec_3freq(founder_ctl2, window_data_ptr, index_data, &(counts[0]), &(counts[1]), &(counts[2]));
+      genovec_3freq(window_data_ptr, index_data, founder_ctl2, &(counts[0]), &(counts[1]), &(counts[2]));
       counts[0] = index_tots[0] - counts[0] - counts[1] - counts[2];
-      vec_3freq(founder_ctl2, window_data_ptr, &(index_data[founder_ctv2]), &(counts[3]), &(counts[4]), &(counts[5]));
+      genovec_3freq(window_data_ptr, &(index_data[founder_ctv2]), founder_ctl2, &(counts[3]), &(counts[4]), &(counts[5]));
       counts[3] = index_tots[1] - counts[3] - counts[4] - counts[5];
-      vec_3freq(founder_ctl2, window_data_ptr, &(index_data[2 * founder_ctv2]), &(counts[6]), &(counts[7]), &(counts[8]));
+      genovec_3freq(window_data_ptr, &(index_data[2 * founder_ctv2]), founder_ctl2, &(counts[6]), &(counts[7]), &(counts[8]));
       counts[6] = index_tots[2] - counts[6] - counts[7] - counts[8];
       if (is_x) {
-        vec_3freq(founder_ctl2, window_data_ptr, &(index_data[3 * founder_ctv2]), &(counts[9]), &(counts[10]), &(counts[11]));
+        genovec_3freq(window_data_ptr, &(index_data[3 * founder_ctv2]), founder_ctl2, &(counts[9]), &(counts[10]), &(counts[11]));
         counts[9] = index_tots[3] - counts[9] - counts[11];
-        vec_3freq(founder_ctl2, window_data_ptr, &(index_data[4 * founder_ctv2]), &(counts[15]), &(counts[16]), &(counts[17]));
+        genovec_3freq(window_data_ptr, &(index_data[4 * founder_ctv2]), founder_ctl2, &(counts[15]), &(counts[16]), &(counts[17]));
         counts[15] = index_tots[4] - counts[15] - counts[17];
       }
       if (!em_phase_hethet_nobase(counts, is_x, is_x, &freq1x, &freq2x, &freqx1, &freqx2, &freq11)) {
@@ -13269,7 +13267,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	    dxx = clump_entry_ptr->pval;
 	    update_clump_histo(dxx, histo);
 	    if (dxx < p2_thresh) {
-	      if (cc_ptr >= cur_clump_ceil) {
+	      if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
 		goto clump_reports_ret_NOMEM;
 	      }
 	      cc_ptr->r2 = cur_r2;
@@ -13286,7 +13284,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	    clump_entry_ptr = clump_entry_ptr->next;
 	  }
 	  histo[0] += nsig_arr[marker_idx];
-	  set_bit(cur_bitfield, marker_idx);
+	  set_bit(marker_idx, cur_bitfield);
 	}
       }
       window_data_ptr = &(window_data_ptr[founder_ctv2]);
@@ -13300,7 +13298,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	dxx = clump_entry_ptr->pval;
 	update_clump_histo(dxx, histo);
 	if (dxx < p2_thresh) {
-	  if (cc_ptr >= cur_clump_ceil) {
+	  if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
 	    goto clump_reports_ret_NOMEM;
 	  }
 	  cc_ptr->r2 = 1;
@@ -13324,7 +13322,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	  dxx = clump_entry_ptr->pval;
 	  update_clump_histo(dxx, histo);
 	  if (dxx < p2_thresh) {
-	    if (cc_ptr >= cur_clump_ceil) {
+	    if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
 	      goto clump_reports_ret_NOMEM;
 	    }
 	    cc_ptr->r2 = 1;
@@ -13347,7 +13345,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     //     ii-b. index variant position was not previously clumped 
     if ((uii || nsig_arr[ivar_idx]) && (allow_overlap || (!is_set(cur_bitfield, ivar_idx)))) {
       histo[0] += nsig_arr[ivar_idx];
-      set_bit(cur_bitfield, ivar_idx);
+      set_bit(ivar_idx, cur_bitfield);
     }
     marker_uidx = ivar_uidx;
     marker_idx = ivar_idx;
@@ -13364,22 +13362,22 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       }
       window_data[founder_ctv2 - 2] = 0;
       window_data[founder_ctv2 - 1] = 0;
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, window_data, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, founder_ct, founder_info, final_mask, is_set(marker_reverse, marker_uidx), bedfile, loadbuf_raw, window_data)) {
 	goto clump_reports_ret_READ_FAIL;
       }
       if (is_haploid) {
         haploid_fix(hh_exists, founder_include2, founder_male_include2, founder_ct, is_x, is_y, (unsigned char*)window_data);
       }
-      vec_3freq(founder_ctl2, window_data, index_data, &(counts[0]), &(counts[1]), &(counts[2]));
+      genovec_3freq(window_data, index_data, founder_ctl2, &(counts[0]), &(counts[1]), &(counts[2]));
       counts[0] = index_tots[0] - counts[0] - counts[1] - counts[2];
-      vec_3freq(founder_ctl2, window_data, &(index_data[founder_ctv2]), &(counts[3]), &(counts[4]), &(counts[5]));
+      genovec_3freq(window_data, &(index_data[founder_ctv2]), founder_ctl2, &(counts[3]), &(counts[4]), &(counts[5]));
       counts[3] = index_tots[1] - counts[3] - counts[4] - counts[5];
-      vec_3freq(founder_ctl2, window_data, &(index_data[2 * founder_ctv2]), &(counts[6]), &(counts[7]), &(counts[8]));
+      genovec_3freq(window_data, &(index_data[2 * founder_ctv2]), founder_ctl2, &(counts[6]), &(counts[7]), &(counts[8]));
       counts[6] = index_tots[2] - counts[6] - counts[7] - counts[8];
       if (is_x) {
-        vec_3freq(founder_ctl2, window_data, &(index_data[3 * founder_ctv2]), &(counts[9]), &(counts[10]), &(counts[11]));
+        genovec_3freq(window_data, &(index_data[3 * founder_ctv2]), founder_ctl2, &(counts[9]), &(counts[10]), &(counts[11]));
         counts[9] = index_tots[3] - counts[9] - counts[11];
-        vec_3freq(founder_ctl2, window_data, &(index_data[4 * founder_ctv2]), &(counts[15]), &(counts[16]), &(counts[17]));
+        genovec_3freq(window_data, &(index_data[4 * founder_ctv2]), founder_ctl2, &(counts[15]), &(counts[16]), &(counts[17]));
         counts[15] = index_tots[4] - counts[15] - counts[17];
       }
       if (!em_phase_hethet_nobase(counts, is_x, is_x, &freq1x, &freq2x, &freqx1, &freqx2, &freq11)) {
@@ -13396,7 +13394,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	    dxx = clump_entry_ptr->pval;
             update_clump_histo(dxx, histo);
 	    if (dxx < p2_thresh) {
-	      if (cc_ptr >= cur_clump_ceil) {
+	      if (((unsigned char*)cc_ptr) >= g_bigstack_end) {
 		goto clump_reports_ret_NOMEM;
 	      }
 	      cc_ptr->r2 = cur_r2;
@@ -13413,7 +13411,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	    clump_entry_ptr = clump_entry_ptr->next;
 	  }
 	  histo[0] += nsig_arr[marker_idx];
-	  set_bit(cur_bitfield, marker_idx);
+	  set_bit(marker_idx, cur_bitfield);
 	}
       }
     }
@@ -13433,7 +13431,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       }
     }
     if (clump_best) {
-      bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), tbuf);
+      bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), g_textbuf);
       *bufptr++ = ' ';
       if (best_entry_ptr) {
 	bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[max_r2_uidx * max_marker_id_len]), bufptr);
@@ -13441,11 +13439,11 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
         if (max_r2_uidx == ivar_uidx) {
 	  bufptr = memcpya(bufptr, "     *", 6);
 	} else {
-	  bufptr = double_g_writewx3(bufptr, fabs(max_r2), 6);
+	  bufptr = dtoa_g_wxp3(fabs(max_r2), 6, bufptr);
 	}
 	*bufptr++ = ' ';
-	bufptr = double_g_writewx3x(bufptr, ((double)((int32_t)(marker_pos[max_r2_uidx] - cur_bp))) * 0.001, 8, ' ');
-	bufptr = double_g_writewx3x(bufptr, best_entry_ptr->pval, 8, ' ');
+	bufptr = dtoa_g_wxp3x(((double)((int32_t)(marker_pos[max_r2_uidx] - cur_bp))) * 0.001, 8, ' ', bufptr);
+	bufptr = dtoa_g_wxp3x(best_entry_ptr->pval, 8, ' ', bufptr);
 	if (max_r2 > 0) {
 	  uii = 0;
 	} else {
@@ -13480,7 +13478,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	    break;
 	  }
 	}
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile_best)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
         fputs(cur_a1, outfile_best);
@@ -13488,9 +13486,9 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	putc('/', outfile_best);
         fputs(cur_a2, outfile_best);
         fputs(bufptr3, outfile_best);
-        tbuf[0] = ' ';
-        bufptr = uint32_writew8x(&(tbuf[1]), best_fidx_match, ' ');
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile_best)) {
+        g_textbuf[0] = ' ';
+        bufptr = uint32toa_w8x(best_fidx_match, ' ', &(g_textbuf[1]));
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
 	if (annot_flattened) {
@@ -13500,31 +13498,31 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       } else {
 	bufptr = fw_strcpyn(plink_maxsnp, 2, "NA", bufptr);
         bufptr = memcpya(bufptr, "     NA       NA       NA       NA       NA \n", 45);
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile_best)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_best)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
       }
     }
-    bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, clump_chrom_idx));
+    bufptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, clump_chrom_idx, g_textbuf));
     *bufptr++ = ' ';
-    bufptr = uint32_writew4(bufptr, index_fidx);
+    bufptr = uint32toa_w4(index_fidx, bufptr);
     *bufptr++ = ' ';
     bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), bufptr);
     *bufptr++ = ' ';
-    bufptr = uint32_writew10x(bufptr, cur_bp, ' ');
-    bufptr = double_g_writewx3x(bufptr, pval, 10, ' ');
+    bufptr = uint32toa_w10x(cur_bp, ' ', bufptr);
+    bufptr = dtoa_g_wxp3x(pval, 10, ' ', bufptr);
 #ifdef __LP64__
     // may as well be paranoid
-    bufptr = width_force(8, bufptr, int64_write(bufptr, (int64_t)(histo[0] + histo[1] + histo[2] + histo[3] + histo[4])));
+    bufptr = width_force(8, bufptr, int64toa((int64_t)(histo[0] + histo[1] + histo[2] + histo[3] + histo[4]), bufptr));
     *bufptr++ = ' ';
     for (uii = 0; uii < 5; uii++) {
-      bufptr = width_force(6, bufptr, int64_write(bufptr, (int64_t)((uintptr_t)histo[uii])));
+      bufptr = width_force(6, bufptr, int64toa((int64_t)((uintptr_t)histo[uii]), bufptr));
       *bufptr++ = ' ';
     }
 #else
-    bufptr = uint32_writew8x(bufptr, histo[0] + histo[1] + histo[2] + histo[3] + histo[4], ' ');
+    bufptr = uint32toa_w8x(histo[0] + histo[1] + histo[2] + histo[3] + histo[4], ' ', bufptr);
     for (uii = 0; uii < 5; uii++) {
-      bufptr = uint32_writew6x(bufptr, histo[uii], ' ');
+      bufptr = uint32toa_w6x(histo[uii], ' ', bufptr);
     }
 #endif
     final_clump_ct++;
@@ -13549,52 +13547,52 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     if (!clump_verbose) {
       if (!cur_window_size) {
 	bufptr = memcpya(bufptr, "NONE\n", 5);
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
       } else {
 	// avoid buffer overflow
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
-	tbuf[0] = '(';
+	g_textbuf[0] = '(';
 	for (ulii = 0; ulii < cur_window_size;) {
           fputs(&(marker_ids[marker_idx_to_uidx[cur_clump_base[ulii].marker_idx] * max_marker_id_len]), outfile);
-	  bufptr = uint32_writex(&(tbuf[1]), cur_clump_base[ulii].fidx, ')');
+	  bufptr = uint32toa_x(cur_clump_base[ulii].fidx, ')', &(g_textbuf[1]));
 	  ulii++;
 	  if (ulii != cur_window_size) {
 	    *bufptr++ = ',';
 	  }
-	  fwrite(tbuf, 1, (uintptr_t)(bufptr - tbuf), outfile);
+	  fwrite(g_textbuf, 1, (uintptr_t)(bufptr - g_textbuf), outfile);
 	}
 	if (putc_checked('\n', outfile)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
       }
       if (rg_setdefs) {
-        bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, clump_chrom_idx));
+        bufptr = width_force(4, g_textbuf, chrom_name_write(chrom_info_ptr, clump_chrom_idx, g_textbuf));
         *bufptr++ = ' ';
         bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), bufptr);
         *bufptr++ = ' ';
-        bufptr = double_g_writewx4x(bufptr, pval, 10, ' ');
-        bufptr = uint32_writew6x(bufptr, cur_window_size + 1, ' ');
+        bufptr = dtoa_g_wxp4x(pval, 10, ' ', bufptr);
+        bufptr = uint32toa_w6x(cur_window_size + 1, ' ', bufptr);
 	if (clump_chrom_idx <= chrom_info_ptr->max_code) {
 	  bufptr2 = memcpyl3a(bufptr, "chr");
-	  bufptr2 = uint32_write(bufptr2, clump_chrom_idx);
+	  bufptr2 = uint32toa(clump_chrom_idx, bufptr2);
 	} else if (chrom_info_ptr->zero_extra_chroms) {
 	  bufptr2 = memcpya(bufptr, "chr0", 4);
 	} else {
 	  bufptr2 = strcpya(bufptr, chrom_info_ptr->nonstd_names[clump_chrom_idx]);
 	}
         *bufptr2++ = ':';
-        bufptr2 = uint32_write(bufptr2, min_bp);
+        bufptr2 = uint32toa(min_bp, bufptr2);
         bufptr2 = memcpya(bufptr2, "..", 2);
-        bufptr2 = uint32_write(bufptr2, max_bp);
+        bufptr2 = uint32toa(max_bp, bufptr2);
         bufptr = width_force(28, bufptr, bufptr2);
         *bufptr++ = ' ';
-        bufptr = width_force(10, bufptr, double_g_write(bufptr, ((int32_t)(max_bp - min_bp + 1)) * 0.001));
+        bufptr = width_force(10, bufptr, dtoa_g(((int32_t)(max_bp - min_bp + 1)) * 0.001, bufptr));
 	bufptr = memcpya(bufptr, " [", 2);
-        if (fwrite_checked(tbuf, bufptr - tbuf, outfile_ranges)) {
+        if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile_ranges)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
 	uljj = 0;
@@ -13615,21 +13613,21 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	goto clump_reports_ret_WRITE_FAIL;
       }
       *bufptr++ = '\n';
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto clump_reports_ret_WRITE_FAIL;
       }
       if (cur_window_size) {
 	if (fwrite_checked(header2_ptr, header2_len, outfile)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
-        bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), tbuf);
+        bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[ivar_uidx * max_marker_id_len]), g_textbuf);
 	bufptr = memcpya(bufptr, "          0    1.000 ", 21);
 	cur_a1 = marker_allele_ptrs[2 * ivar_uidx];
 	a1_len = strlen(cur_a1);
 	if (a1_len < 8) {
 	  bufptr = memseta(bufptr, 32, 8 - a1_len);
 	}
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
 	fwrite(cur_a1, 1, a1_len, outfile);
@@ -13640,10 +13638,10 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	} else {
 	  allele_padding = 0;
 	}
-	tbuf[0] = ' ';
-        bufptr = uint32_writew4x(&(tbuf[1]), index_fidx, ' ');
-	bufptr = double_g_writewx3x(bufptr, pval, 12, ' ');
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	g_textbuf[0] = ' ';
+        bufptr = uint32toa_w4x(index_fidx, ' ', &(g_textbuf[1]));
+	bufptr = dtoa_g_wxp3x(pval, 12, ' ', bufptr);
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
 	if (annot_flattened) {
@@ -13659,11 +13657,11 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	fputs("\n\n", outfile);
 	last_marker_idx = ~ZEROLU;
 	if (rg_setdefs) {
-	  fill_ulong_zero(rangematch_bitfield, (cur_rg_ct + (BITCT - 1)) / BITCT);
+	  fill_ulong_zero(rangematch_bitfield, BITCT_TO_WORDCT(cur_rg_ct));
 	  unmatched_group_ct = cur_rg_ct;
 	}
 	for (ulii = 0; ulii < cur_window_size; ulii++) {
-	  bufptr = memseta(tbuf, 32, 10);
+	  bufptr = memseta(g_textbuf, 32, 10);
 	  marker_idx = cur_clump_base[ulii].marker_idx;
 	  if (last_marker_idx != marker_idx) {
 	    marker_uidx = marker_idx_to_uidx[marker_idx];
@@ -13675,7 +13673,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	      for (ulmm = 0; ulmm < unmatched_group_ct; uljj++, ulmm++) {
 		next_unset_ul_unsafe_ck(rangematch_bitfield, &uljj);
 		if (interval_in_setdef(cur_rg_setdefs[uljj], uii, uii + 1)) {
-		  set_bit(rangematch_bitfield, uljj);
+		  set_bit(uljj, rangematch_bitfield);
 		  ulkk++;
 		}
 	      }
@@ -13688,14 +13686,14 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	  }
 	  bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
 	  *bufptr++ = ' ';
-	  bufptr = double_g_writewx3x(bufptr, ((double)(((int32_t)marker_pos[marker_uidx]) - ((int32_t)cur_bp))) * 0.001, 10, ' ');
+	  bufptr = dtoa_g_wxp3x(((double)(((int32_t)marker_pos[marker_uidx]) - ((int32_t)cur_bp))) * 0.001, 10, ' ', bufptr);
 	  cur_r2 = cur_clump_base[ulii].r2;
 	  if (cur_r2 > 0) {
 	    ujj = 0;
 	  } else {
 	    ujj = 1; // reversed phase
 	  }
-	  bufptr = double_g_writewx3x(bufptr, fabs(cur_r2), 8, ' ');
+	  bufptr = dtoa_g_wxp3x(fabs(cur_r2), 8, ' ', bufptr);
 	  bufptr2 = marker_allele_ptrs[marker_uidx * 2 + ujj];
 	  bufptr3 = marker_allele_ptrs[marker_uidx * 2 + 1 - ujj];
 	  if (allele_padding) {
@@ -13715,7 +13713,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	      }
 	    }
 	  }
-	  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	    goto clump_reports_ret_WRITE_FAIL;
 	  }
 	  fwrite(cur_a1, 1, a1_len, outfile);
@@ -13723,10 +13721,10 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	  putc('/', outfile);
 	  fwrite(cur_a2, 1, a2_len, outfile);
 	  fputs(bufptr3, outfile);
-	  tbuf[0] = ' ';
-	  bufptr = uint32_writew4x(&(tbuf[1]), cur_clump_base[ulii].fidx, ' ');
-	  bufptr = double_g_writewx3x(bufptr, clump_entry_ptr->pval, 12, ' ');
-	  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	  g_textbuf[0] = ' ';
+	  bufptr = uint32toa_w4x(cur_clump_base[ulii].fidx, ' ', &(g_textbuf[1]));
+	  bufptr = dtoa_g_wxp3x(clump_entry_ptr->pval, 12, ' ', bufptr);
+	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	    goto clump_reports_ret_WRITE_FAIL;
 	  }
 	  if (annot_flattened) {
@@ -13743,23 +13741,23 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
 	  putc('\n', outfile);
 	  last_marker_idx = marker_idx;
 	}
-	bufptr = memcpya(tbuf, "\n          RANGE: ", 18);
+	bufptr = memcpya(g_textbuf, "\n          RANGE: ", 18);
 	if (clump_chrom_idx <= chrom_info_ptr->max_code) {
 	  bufptr = memcpyl3a(bufptr, "chr");
-	  bufptr = uint32_write(bufptr, clump_chrom_idx);
+	  bufptr = uint32toa(clump_chrom_idx, bufptr);
 	} else if (chrom_info_ptr->zero_extra_chroms) {
 	  bufptr = memcpya(bufptr, "chr0", 4);
 	} else {
 	  bufptr = strcpya(bufptr, chrom_info_ptr->nonstd_names[clump_chrom_idx]);
 	}
 	*bufptr++ = ':';
-	bufptr = uint32_write(bufptr, min_bp);
+	bufptr = uint32toa(min_bp, bufptr);
 	bufptr = memcpya(bufptr, "..", 2);
-	bufptr = uint32_write(bufptr, max_bp);
+	bufptr = uint32toa(max_bp, bufptr);
 	bufptr = memcpya(bufptr, "\n           SPAN: ", 18);
-	bufptr = uint32_write(bufptr, (max_bp - min_bp + 1) / 1000);
+	bufptr = uint32toa((max_bp - min_bp + 1) / 1000, bufptr);
 	bufptr = memcpyl3a(bufptr, "kb\n");
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto clump_reports_ret_WRITE_FAIL;
 	}
 	if (rg_setdefs) {
@@ -13813,9 +13811,9 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     // 2. pick smallest pval when duplicates present
     // 3. sort by pval
     // 4. write results
-    wkspace_reset(wkspace_mark);
-    if (wkspace_alloc_c_checked(&sorted_missing_variant_ids, missing_variant_ct * max_missing_id_len) ||
-	wkspace_alloc_d_checked(&sorted_pvals, missing_variant_ct * sizeof(double))) {
+    bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+    if (bigstack_alloc_c(missing_variant_ct * max_missing_id_len, &sorted_missing_variant_ids) ||
+	bigstack_alloc_d(missing_variant_ct, &sorted_pvals)) {
       goto clump_reports_ret_NOMEM;
     }
     for (ulii = 0; ulii < missing_variant_ct; ulii++) {
@@ -13880,7 +13878,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
         fprintf(stderr, "%" PRIuPTR " more top variant ID%s missing; see log file.\n", missing_variant_ct - 3, (missing_variant_ct == 4)? "" : "s");
 	for (ulii = 3; ulii < missing_variant_ct; ulii++) {
 	  LOGPREPRINTFWW("Warning: '%s' is missing from the main dataset, and is a top variant.\n", &(sorted_missing_variant_ids[ulii * max_missing_id_len]));
-	  logstr(logbuf);
+	  logstr(g_logbuf);
 	}
       }
     }
@@ -13901,8 +13899,6 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     LOGPRINTFWW("--clump-best: Best proxies written to %s .\n", outname);
   }
   while (0) {
-  clump_reports_ret_NOMEM2:
-    wkspace_left += topsize;
   clump_reports_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -13927,7 +13923,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     break;
   }
  clump_reports_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   gzclose_cond(gz_infile);
   fclose_cond(outfile);
   fclose_cond(outfile_ranges);
diff --git a/plink_matrix.c b/plink_matrix.c
index 4abb262..9e2e167 100644
--- a/plink_matrix.c
+++ b/plink_matrix.c
@@ -40,7 +40,7 @@ int32_t svdcmp_c(int32_t m, double* a, double* w, double* v) {
   // Note that this function is NOT thread-safe, due to the buffer allocated
   // from the workspace stack.  Pass in a preallocated buffer if that's not
   // okay.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   int32_t n = m;
   int32_t flag;
   int32_t l = 0; // suppress compile warning
@@ -48,7 +48,7 @@ int32_t svdcmp_c(int32_t m, double* a, double* w, double* v) {
   double anorm,c,f,g,h,s,scale,x,y,z;
   double volatile temp;
   double* rv1;
-  if (wkspace_alloc_d_checked(&rv1, m * sizeof(double))) {
+  if (bigstack_alloc_d(m, &rv1)) {
     return -1;
   }
 
@@ -225,7 +225,7 @@ int32_t svdcmp_c(int32_t m, double* a, double* w, double* v) {
       w[k]=x;
     }
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return 1;
 }
 
diff --git a/plink_misc.c b/plink_misc.c
index 3955967..2d120cb 100644
--- a/plink_misc.c
+++ b/plink_misc.c
@@ -24,8 +24,8 @@ void misc_cleanup(Score_info* sc_ip) {
 }
 
 int32_t make_founders(uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uint32_t require_two, uintptr_t* sample_exclude, uintptr_t* founder_info) {
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uint32_t new_founder_ct = 0;
   int32_t retval = 0;
   char* sorted_ids;
@@ -38,12 +38,12 @@ int32_t make_founders(uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, char*
   uint32_t fam_len_p1;
   uint32_t missing_parent_ct;
   uint32_t cur_len;
-  if (wkspace_alloc_c_checked(&id_buf, max_sample_id_len) ||
-      wkspace_alloc_ul_checked(&nf_bitarr, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_c(max_sample_id_len, &id_buf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl, &nf_bitarr)) {
     goto make_founders_ret_NOMEM;
   }
-  bitfield_exclude_to_include(sample_exclude, nf_bitarr, unfiltered_sample_ct);
-  bitfield_andnot(nf_bitarr, founder_info, unfiltered_sample_ctl);
+  bitarr_invert_copy(sample_exclude, unfiltered_sample_ct, nf_bitarr);
+  bitvec_andnot(founder_info, unfiltered_sample_ctl, nf_bitarr);
   sample_uidx = unfiltered_sample_ct? next_set(nf_bitarr, 0, unfiltered_sample_ct) : 0;
   if (sample_uidx == unfiltered_sample_ct) {
     logprint("Note: Skipping --make-founders since there are no nonfounders.\n");
@@ -80,13 +80,13 @@ int32_t make_founders(uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, char*
       }
     }
     if (missing_parent_ct > require_two) {
-      SET_BIT(founder_info, sample_uidx);
+      SET_BIT(sample_uidx, founder_info);
       memcpy(pat_ptr, "0", 2);
       memcpy(mat_ptr, "0", 2);
       new_founder_ct++;
     }
     sample_uidx++;
-    next_set_ul_ck(nf_bitarr, &sample_uidx, unfiltered_sample_ct);
+    next_set_ul_ck(nf_bitarr, unfiltered_sample_ct, &sample_uidx);
   } while (sample_uidx < unfiltered_sample_ct);
   LOGPRINTF("--make-founders: %u sample%s affected.\n", new_founder_ct, (new_founder_ct == 1)? "" : "s");
   while (0) {
@@ -95,25 +95,25 @@ int32_t make_founders(uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, char*
     break;
   }
  make_founders_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t write_nosex(char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sex_nm, uintptr_t gender_unk_ct, char* sample_ids, uintptr_t max_sample_id_len) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t sample_uidx = 0;
   int32_t retval = 0;
   uintptr_t* sex_missing;
   uintptr_t sample_idx;
-  if (wkspace_alloc_ul_checked(&sex_missing, unfiltered_sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl, &sex_missing)) {
     goto write_nosex_ret_NOMEM;
   }
-  bitfield_exclude_to_include(sample_exclude, sex_missing, unfiltered_sample_ct);
-  bitfield_andnot(sex_missing, sex_nm, unfiltered_sample_ctl);
+  bitarr_invert_copy(sample_exclude, unfiltered_sample_ct, sex_missing);
+  bitvec_andnot(sex_nm, unfiltered_sample_ctl, sex_missing);
   memcpy(outname_end, ".nosex", 7);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_nosex_ret_OPEN_FAIL;
   }
   for (sample_idx = 0; sample_idx < gender_unk_ct; sample_idx++, sample_uidx++) {
@@ -136,7 +136,7 @@ int32_t write_nosex(char* outname, char* outname_end, uintptr_t unfiltered_sampl
     retval = RET_WRITE_FAIL;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
@@ -144,9 +144,9 @@ int32_t write_nosex(char* outname, char* outname_end, uintptr_t unfiltered_sampl
 int32_t makepheno_load(FILE* phenofile, char* makepheno_str, uintptr_t unfiltered_sample_ct, char* sorted_sample_ids, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t* pheno_nm, uintptr_t** pheno_c_ptr) {
   uint32_t mp_strlen = strlen(makepheno_str);
   uint32_t makepheno_all = ((mp_strlen == 1) && (makepheno_str[0] == '*'));
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t* pheno_c = *pheno_c_ptr;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t line_idx = 0;
   int32_t retval = 0;
   char* id_buf;
@@ -155,44 +155,44 @@ int32_t makepheno_load(FILE* phenofile, char* makepheno_str, uintptr_t unfiltere
   int32_t ii;
   uint32_t sample_idx;
   uint32_t tmp_len;
-  if (wkspace_alloc_c_checked(&id_buf, max_sample_id_len)) {
+  if (bigstack_alloc_c(max_sample_id_len, &id_buf)) {
     goto makepheno_load_ret_NOMEM;
   }
   if (!pheno_c) {
-    if (aligned_malloc(pheno_c_ptr, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), pheno_c_ptr)) {
       goto makepheno_load_ret_NOMEM;
     }
     pheno_c = *pheno_c_ptr;
     fill_ulong_zero(pheno_c, unfiltered_sample_ctl);
   }
   if (makepheno_all) {
-    fill_all_bits(pheno_nm, unfiltered_sample_ct);
+    fill_all_bits(unfiltered_sample_ct, pheno_nm);
   }
   // probably want to permit long lines here
-  tbuf[MAXLINELEN - 1] = ' '; 
-  while (fgets(tbuf, MAXLINELEN, phenofile) != NULL) {
+  g_textbuf[MAXLINELEN - 1] = ' '; 
+  while (fgets(g_textbuf, MAXLINELEN, phenofile) != NULL) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --make-pheno file is pathologically long.\n", line_idx);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --make-pheno file is pathologically long.\n", line_idx);
       goto makepheno_load_ret_INVALID_FORMAT_2;
     }
-    bufptr0 = skip_initial_spaces(tbuf);
+    bufptr0 = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr0)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(id_buf, sorted_sample_ids, max_sample_id_len, unfiltered_sample_ct, bufptr0, &bufptr, &ii)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --make-pheno file has fewer tokens than expected.\n", line_idx);
+    if (bsearch_read_fam_indiv(bufptr0, sorted_sample_ids, max_sample_id_len, unfiltered_sample_ct, &bufptr, &ii, id_buf)) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --make-pheno file has fewer tokens than expected.\n", line_idx);
       goto makepheno_load_ret_INVALID_FORMAT_2;
     }
     if (ii != -1) {
       sample_idx = id_map[(uint32_t)ii];
       if (makepheno_all) {
-	SET_BIT(pheno_c, sample_idx);
+	SET_BIT(sample_idx, pheno_c);
       } else {
-	SET_BIT(pheno_nm, sample_idx);
+	SET_BIT(sample_idx, pheno_nm);
         tmp_len = strlen_se(bufptr);
 	if ((tmp_len == mp_strlen) && (!memcmp(bufptr, makepheno_str, mp_strlen))) {
-	  SET_BIT(pheno_c, sample_idx);
+	  SET_BIT(sample_idx, pheno_c);
 	}
       }
     }
@@ -214,7 +214,7 @@ int32_t makepheno_load(FILE* phenofile, char* makepheno_str, uintptr_t unfiltere
     retval = RET_INVALID_FORMAT;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -223,32 +223,33 @@ int32_t load_pheno(FILE* phenofile, uintptr_t unfiltered_sample_ct, uintptr_t sa
   uintptr_t* pheno_c = *pheno_c_ptr;
   double* pheno_d = *pheno_d_ptr;
   int32_t header_processed = 0;
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t sample_ct = unfiltered_sample_ct - sample_exclude_ct;
   uintptr_t line_idx = 0;
-  char case_char = affection_01? '1' : '2';
   uintptr_t* isz = NULL;
+  double pheno_ctrld = (double)((int32_t)(1 - affection_01));
+  double pheno_cased = pheno_ctrld + 1.0;
   double missing_phenod = (double)missing_pheno;
   int32_t retval = 0;
+  double dxx;
+  uintptr_t loadbuf_size;
   char* loadbuf;
-  uint32_t loadbuf_size;
-  int32_t sample_idx;
   char* bufptr0;
   char* bufptr;
+  char* ss;
   uint32_t tmp_len;
   uint32_t tmp_len2;
   uint32_t uii;
-  double dxx;
+  int32_t sample_idx;
   if (pheno_d) {
     affection = 0;
   } else {
-    if (wkspace_alloc_ul_checked(&isz, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    if (bigstack_calloc_ul(unfiltered_sample_ctl, &isz)) {
       goto load_pheno_ret_NOMEM;
     }
-    fill_ulong_zero(isz, unfiltered_sample_ctl);
     if (!pheno_c) {
-      if (aligned_malloc(pheno_c_ptr, unfiltered_sample_ctl * sizeof(intptr_t))) {
+      if (aligned_malloc(unfiltered_sample_ctl * sizeof(intptr_t), pheno_c_ptr)) {
 	goto load_pheno_ret_NOMEM;
       }
       pheno_c = *pheno_c_ptr;
@@ -256,21 +257,20 @@ int32_t load_pheno(FILE* phenofile, uintptr_t unfiltered_sample_ct, uintptr_t sa
     }
   }
   // ----- phenotype file load -----
-  // worthwhile to support very long lines here...
-  if (wkspace_left > MAXLINEBUFLEN) {
+  // worthwhile to support very long lines here
+  loadbuf_size = bigstack_left();
+  if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
-  } else if (wkspace_left > MAXLINELEN) {
-    loadbuf_size = wkspace_left;
-  } else {
+  } else if (loadbuf_size <= MAXLINELEN) {
     goto load_pheno_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (fgets(loadbuf, loadbuf_size, phenofile) != NULL) {
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --pheno file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --pheno file is pathologically long.\n", line_idx);
 	goto load_pheno_ret_INVALID_FORMAT_2;
       } else {
 	goto load_pheno_ret_NOMEM;
@@ -324,7 +324,7 @@ int32_t load_pheno(FILE* phenofile, uintptr_t unfiltered_sample_ct, uintptr_t sa
     if (!header_processed) {
       header_processed = 1;
     } else {
-      if (bsearch_read_fam_indiv(tbuf, sorted_sample_ids, max_sample_id_len, sample_ct, bufptr0, &bufptr, &sample_idx)) {
+      if (bsearch_read_fam_indiv(bufptr0, sorted_sample_ids, max_sample_id_len, sample_ct, &bufptr, &sample_idx, g_textbuf)) {
 	goto load_pheno_ret_MISSING_TOKENS;
       }
       if (sample_idx != -1) {
@@ -333,31 +333,33 @@ int32_t load_pheno(FILE* phenofile, uintptr_t unfiltered_sample_ct, uintptr_t sa
 	  bufptr = next_token_mult(bufptr, mpheno_col - 1);
 	}
 	if (no_more_tokens_kns(bufptr)) {
-	  // Sometimes, but not always, an error.  So we populate logbuf but
+	  // Sometimes, but not always, an error.  So we populate g_logbuf but
 	  // let the caller decide whether to actually log it.
-          sprintf(logbuf, "Error: Line %" PRIuPTR " of --pheno file has fewer tokens than expected.\n", line_idx);
+          sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --pheno file has fewer tokens than expected.\n", line_idx);
 	  return LOAD_PHENO_LAST_COL;
 	}
+	dxx = strtod(bufptr, &ss);
 	if (affection) {
-	  if (affection_01 || eval_affection(bufptr, missing_phenod)) {
-	    if (is_missing_pheno_cc(bufptr, missing_phenod, affection_01)) {
-	      // Since we're only making one pass through the file, we don't
-	      // have the luxury of knowing in advance whether the phenotype is
-	      // binary or scalar.  If there is a '0' entry that occurs before
-	      // we know the phenotype is scalar, we need to not set the
-	      // phenotype to zero during the binary -> scalar conversion step.
-	      if (*bufptr == '0') {
-		set_bit(isz, sample_idx);
-	      }
-	      clear_bit(pheno_c, sample_idx);
-	    } else {
-	      if (*bufptr == case_char) {
-		set_bit(pheno_c, sample_idx);
-	      } else {
-		clear_bit(pheno_c, sample_idx);
-	      }
-	      set_bit(pheno_nm, sample_idx);
+	  // er, this was calling strtod() twice on the same string.  time to
+	  // drop down a level and remove that redundancy...
+
+	  if (dxx == pheno_cased) {
+	    set_bit(sample_idx, pheno_c);
+	    set_bit(sample_idx, pheno_nm);
+	  } else if ((ss != bufptr) && (dxx == pheno_ctrld)) {
+	    clear_bit(sample_idx, pheno_c);
+	    set_bit(sample_idx, pheno_nm);
+	  } else if (affection_01 || (dxx == missing_phenod) || (dxx == 0.0)) {
+	    // Since we're only making one pass through the file, we don't
+	    // have the luxury of knowing in advance whether the phenotype is
+	    // binary or scalar.  If there is a '0' entry that occurs before
+	    // we know the phenotype is scalar, we need to not set the
+	    // phenotype to zero during the binary -> scalar conversion step.
+	    if (dxx == 0.0) {
+	      set_bit(sample_idx, isz);
 	    }
+	    clear_bit(sample_idx, pheno_nm);
+	    clear_bit(sample_idx, pheno_c);
 	  } else {
 	    pheno_d = (double*)malloc(unfiltered_sample_ct * sizeof(double));
 	    if (!pheno_d) {
@@ -367,7 +369,7 @@ int32_t load_pheno(FILE* phenofile, uintptr_t unfiltered_sample_ct, uintptr_t sa
 	    for (uii = 0; uii < unfiltered_sample_ct; uii++) {
 	      if (is_set(isz, uii)) {
 		pheno_d[uii] = 0.0;
-		set_bit(pheno_nm, uii);
+		set_bit(uii, pheno_nm);
 	      } else if (is_set(pheno_nm, uii)) {
 		pheno_d[uii] = (double)((int32_t)(1 + is_set(pheno_c, uii)));
 	      }
@@ -377,9 +379,9 @@ int32_t load_pheno(FILE* phenofile, uintptr_t unfiltered_sample_ct, uintptr_t sa
 	  }
 	}
 	if (!affection) {
-	  if ((!scan_double(bufptr, &dxx)) && (dxx != missing_phenod)) {
+	  if ((ss != bufptr) && (dxx != missing_phenod)) {
 	    pheno_d[(uint32_t)sample_idx] = dxx;
-	    set_bit(pheno_nm, sample_idx);
+	    set_bit(sample_idx, pheno_nm);
 	  }
 	}
       }
@@ -398,14 +400,14 @@ int32_t load_pheno(FILE* phenofile, uintptr_t unfiltered_sample_ct, uintptr_t sa
     retval = RET_READ_FAIL;
     break;
   load_pheno_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --pheno file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --pheno file has fewer tokens than expected.\n", line_idx);
   load_pheno_ret_INVALID_FORMAT_2:
     logerrprintb();
   load_pheno_ret_INVALID_FORMAT:
     retval = RET_INVALID_FORMAT;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -419,9 +421,9 @@ int32_t convert_tail_pheno(uint32_t unfiltered_sample_ct, uintptr_t* pheno_nm, u
     logerrprint("Error: --tail-pheno requires scalar phenotype data.\n");
     return RET_INVALID_FORMAT;
   }
-  sample_uidx = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  sample_uidx = BITCT_TO_WORDCT(unfiltered_sample_ct);
   if (!pheno_c) {
-    if (aligned_malloc(pheno_c_ptr, sample_uidx * sizeof(intptr_t))) {
+    if (aligned_malloc(sample_uidx * sizeof(intptr_t), pheno_c_ptr)) {
       return RET_NOMEM;
     }
     pheno_c = *pheno_c_ptr;
@@ -435,16 +437,16 @@ int32_t convert_tail_pheno(uint32_t unfiltered_sample_ct, uintptr_t* pheno_nm, u
       dxx = pheno_d[sample_uidx];
       if (dxx > tail_bottom) {
         if (dxx > tail_top) {
-          SET_BIT(pheno_c, sample_uidx);
+          SET_BIT(sample_uidx, pheno_c);
         } else {
-	  CLEAR_BIT(pheno_nm, sample_uidx);
+	  CLEAR_BIT(sample_uidx, pheno_nm);
         }
       }
     }
   } while (sample_uidx_stop < unfiltered_sample_ct);
   free(pheno_d);
   *pheno_d_ptr = NULL;
-  sample_uidx = popcount_longs(pheno_nm, (unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+  sample_uidx = popcount_longs(pheno_nm, BITCT_TO_WORDCT(unfiltered_sample_ct));
   LOGPRINTF("--tail-pheno: %u phenotype value%s remaining.\n", sample_uidx, (sample_uidx == 1)? "" : "s");
   return 0;
 }
@@ -453,7 +455,7 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
   FILE* shapeitfile = NULL;
   char* at_sign_ptr = NULL;
   char* fname_write = NULL;
-  char* fname_buf = &(tbuf[MAXLINELEN]);
+  char* fname_buf = &(g_textbuf[MAXLINELEN]);
   double cm_old = 0.0;
   uint32_t autosome_ct = chrom_info_ptr->autosome_ct;
   uint32_t post_at_sign_len = 0;
@@ -490,7 +492,7 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
     chrom_ct = chrom_fo_idx + 1;
     fname_buf = cm_map_fname;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
   for (; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
     chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
     marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx], chrom_end);
@@ -502,14 +504,14 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
       if ((!uii) || (uii > autosome_ct)) {
         continue;
       }
-      bufptr = uint32_write(fname_write, uii);
+      bufptr = uint32toa(uii, fname_write);
       memcpy(bufptr, at_sign_ptr, post_at_sign_len);
-      if (fopen_checked(&shapeitfile, fname_buf, "r")) {
+      if (fopen_checked(fname_buf, "r", &shapeitfile)) {
 	LOGERRPRINTFWW("Warning: --cm-map failed to open %s.\n", fname_buf);
         continue;
       }
     } else {
-      if (fopen_checked(&shapeitfile, cm_map_fname, "r")) {
+      if (fopen_checked(cm_map_fname, "r", &shapeitfile)) {
         goto apply_cm_map_ret_OPEN_FAIL;
       }
     }
@@ -522,7 +524,7 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
     //   3. current cM position
     // We mostly ignore field 2, since depending just on fields 1 and 3
     // maximizes accuracy.  The one exception is the very first nonheader line.
-    retval = load_to_first_token(shapeitfile, MAXLINELEN, '\0', "--cm-map file", tbuf, &bufptr, &line_idx);
+    retval = load_to_first_token(shapeitfile, MAXLINELEN, '\0', "--cm-map file", g_textbuf, &bufptr, &line_idx);
     if (retval) {
       goto apply_cm_map_ret_1;
     }
@@ -535,13 +537,13 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
       goto apply_cm_map_ret_MISSING_TOKENS;
     }
     bp_old = -1;
-    while (fgets(tbuf, MAXLINELEN, shapeitfile)) {
+    while (fgets(g_textbuf, MAXLINELEN, shapeitfile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --cm-map file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --cm-map file is pathologically long.\n", line_idx);
         goto apply_cm_map_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if ((*bufptr < '+') || (*bufptr > '9')) {
 	// warning instead of error if text line found, since as of 8 Jan 2014
         // the posted chromosome 19 map has such a line
@@ -551,7 +553,7 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
         continue;
       }
       if (scan_uint_defcap(bufptr, (uint32_t*)&bp_new)) {
-	sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of --cm-map file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of --cm-map file.\n", line_idx);
         goto apply_cm_map_ret_INVALID_FORMAT_2;
       }
       if (bp_new <= bp_old) {
@@ -563,14 +565,14 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
 	goto apply_cm_map_ret_MISSING_TOKENS;
       }
       if (scan_double(bufptr2, &cm_new)) {
-	sprintf(logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of --cm-map file.\n", line_idx);
+	sprintf(g_logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of --cm-map file.\n", line_idx);
 	goto apply_cm_map_ret_INVALID_FORMAT_2;
       }
       if (bp_old == -1) {
 	// parse field 2 only in this case
         bufptr = next_token(bufptr);
         if (scan_double(bufptr, &dxx)) {
-	  sprintf(logbuf, "Error: Invalid recombination rate on line %" PRIuPTR " of --cm-map file.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Invalid recombination rate on line %" PRIuPTR " of --cm-map file.\n", line_idx);
 	  goto apply_cm_map_ret_INVALID_FORMAT_2;
 	}
         cm_old = cm_new - dxx * 0.000001 * ((double)(bp_new + 1));
@@ -579,7 +581,7 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
       while (marker_pos[marker_uidx] <= ((uint32_t)bp_new)) {
 	marker_cms[marker_uidx] = cm_new - ((int32_t)(((uint32_t)bp_new) - marker_pos[marker_uidx])) * dxx;
 	marker_uidx++;
-	next_unset_ck(marker_exclude, &marker_uidx, chrom_end);
+	next_unset_ck(marker_exclude, chrom_end, &marker_uidx);
 	if (marker_uidx == chrom_end) {
 	  goto apply_cm_map_chrom_done;
 	}
@@ -611,7 +613,7 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
     retval = RET_INVALID_CMDLINE;
     break;
   apply_cm_map_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --cm-map file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --cm-map file has fewer tokens than expected.\n", line_idx);
   apply_cm_map_ret_INVALID_FORMAT_2:
     logerrprintb();
   apply_cm_map_ret_INVALID_FORMAT:
@@ -624,11 +626,11 @@ int32_t apply_cm_map(char* cm_map_fname, char* cm_map_chrname, uintptr_t unfilte
 }
 
 int32_t update_marker_cms(Two_col_params* update_cm, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, double* marker_cms) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   char skipchar = update_cm->skipchar;
   uint32_t colid_first = (update_cm->colid < update_cm->colx);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
   uintptr_t* already_seen;
@@ -643,13 +645,12 @@ int32_t update_marker_cms(Two_col_params* update_cm, uint32_t* marker_id_htable,
   uint32_t marker_uidx;
   char cc;
   int32_t retval;
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen)) {
     goto update_marker_cms_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
 
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
+  loadbuf = (char*)g_bigstack_base;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   }
@@ -672,7 +673,7 @@ int32_t update_marker_cms(Two_col_params* update_cm, uint32_t* marker_id_htable,
     line_idx++;
     if (!(loadbuf[loadbuf_size - 1])) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-cm file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-cm file is pathologically long.\n", line_idx);
 	goto update_marker_cms_ret_INVALID_FORMAT_2;
       } else {
         goto update_marker_cms_ret_NOMEM;
@@ -707,9 +708,9 @@ int32_t update_marker_cms(Two_col_params* update_cm, uint32_t* marker_id_htable,
       LOGPREPRINTFWW("Error: Duplicate variant '%s' in --update-cm file.\n", colid_ptr);
       goto update_marker_cms_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, marker_uidx);
+    set_bit(marker_uidx, already_seen);
     if (scan_double(colx_ptr, &(marker_cms[marker_uidx]))) {
-      sprintf(logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of --update-cm file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of --update-cm file.\n", line_idx);
       goto update_marker_cms_ret_INVALID_FORMAT_2;
     }
     hit_ct++;
@@ -718,9 +719,9 @@ int32_t update_marker_cms(Two_col_params* update_cm, uint32_t* marker_id_htable,
     goto update_marker_cms_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--update-cm: %" PRIuPTR " value%s changed, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-cm: %" PRIuPTR " value%s changed, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-cm: %" PRIuPTR " value%s changed.\n", hit_ct, (hit_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-cm: %" PRIuPTR " value%s changed.\n", hit_ct, (hit_ct == 1)? "" : "s");
   }
   logprintb();
   while (0) {
@@ -731,7 +732,7 @@ int32_t update_marker_cms(Two_col_params* update_cm, uint32_t* marker_id_htable,
     retval = RET_READ_FAIL;
     break;
   update_marker_cms_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-cm file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-cm file has fewer tokens than expected.\n", line_idx);
   update_marker_cms_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
@@ -739,16 +740,16 @@ int32_t update_marker_cms(Two_col_params* update_cm, uint32_t* marker_id_htable,
   }
  update_marker_cms_ret_1:
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t update_marker_pos(Two_col_params* update_map, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t* marker_pos, uint32_t* map_is_unsorted_ptr, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   char skipchar = update_map->skipchar;
   uint32_t colid_first = (update_map->colid < update_map->colx);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
   uint32_t marker_ct = unfiltered_marker_ct - *marker_exclude_ct_ptr;
@@ -770,13 +771,12 @@ int32_t update_marker_pos(Two_col_params* update_map, uint32_t* marker_id_htable
   int32_t bp_coord;
   int32_t retval;
   char cc;
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen)) {
     goto update_marker_pos_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
 
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
+  loadbuf = (char*)g_bigstack_base;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   }
@@ -799,7 +799,7 @@ int32_t update_marker_pos(Two_col_params* update_map, uint32_t* marker_id_htable
     line_idx++;
     if (!(loadbuf[loadbuf_size - 1])) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-map file is pathologically long.\n", line_idx);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-map file is pathologically long.\n", line_idx);
 	goto update_marker_pos_ret_INVALID_FORMAT_2;
       } else {
         goto update_marker_pos_ret_NOMEM;
@@ -834,13 +834,13 @@ int32_t update_marker_pos(Two_col_params* update_map, uint32_t* marker_id_htable
       LOGPREPRINTFWW("Error: Duplicate variant '%s' in --update-map file.\n", colid_ptr);
       goto update_marker_pos_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, marker_uidx);
+    set_bit(marker_uidx, already_seen);
     if (scan_int_abs_defcap(colx_ptr, &bp_coord)) {
-      sprintf(logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of --update-map file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of --update-map file.\n", line_idx);
       goto update_marker_pos_ret_INVALID_FORMAT_2;
     }
     if (bp_coord < 0) {
-      set_bit(marker_exclude, marker_uidx);
+      set_bit(marker_uidx, marker_exclude);
       marker_ct--;
     } else {
       marker_pos[marker_uidx] = bp_coord;
@@ -851,9 +851,9 @@ int32_t update_marker_pos(Two_col_params* update_map, uint32_t* marker_id_htable
     goto update_marker_pos_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--update-map: %" PRIuPTR " value%s updated, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-map: %" PRIuPTR " value%s updated, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-map: %" PRIuPTR " value%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-map: %" PRIuPTR " value%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
   }
   logprintb();
   *marker_exclude_ct_ptr = unfiltered_marker_ct - marker_ct;
@@ -888,7 +888,7 @@ int32_t update_marker_pos(Two_col_params* update_map, uint32_t* marker_id_htable
     retval = RET_READ_FAIL;
     break;
   update_marker_pos_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-map file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-map file has fewer tokens than expected.\n", line_idx);
   update_marker_pos_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
@@ -899,16 +899,16 @@ int32_t update_marker_pos(Two_col_params* update_map, uint32_t* marker_id_htable
   }
  update_marker_pos_ret_1:
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t update_marker_names(Two_col_params* update_name, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   char skipchar = update_name->skipchar;
   uint32_t colold_first = (update_name->colid < update_name->colx);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
   uintptr_t* already_seen;
@@ -923,14 +923,13 @@ int32_t update_marker_names(Two_col_params* update_name, uint32_t* marker_id_hta
   uint32_t slen;
   int32_t retval;
   char cc;
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_c_checked(&marker_ids_copy, unfiltered_marker_ct * max_marker_id_len)) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen) ||
+      bigstack_alloc_c(unfiltered_marker_ct * max_marker_id_len, &marker_ids_copy)) {
     goto update_marker_names_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
   memcpy(marker_ids_copy, marker_ids, unfiltered_marker_ct * max_marker_id_len);
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
+  loadbuf = (char*)g_bigstack_base;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   }
@@ -976,7 +975,7 @@ int32_t update_marker_names(Two_col_params* update_name, uint32_t* marker_id_hta
       LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --update-name file.\n", colold_ptr);
       goto update_marker_names_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, marker_uidx);
+    set_bit(marker_uidx, already_seen);
     slen = strlen_se(colnew_ptr);
     colnew_ptr[slen] = '\0';
     memcpy(&(marker_ids[marker_uidx * max_marker_id_len]), colnew_ptr, slen + 1);
@@ -986,9 +985,9 @@ int32_t update_marker_names(Two_col_params* update_name, uint32_t* marker_id_hta
     goto update_marker_names_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--update-name: %" PRIuPTR " value%s updated, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-name: %" PRIuPTR " value%s updated, %" PRIuPTR " variant ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-name: %" PRIuPTR " value%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-name: %" PRIuPTR " value%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
   }
   logprintb();
   while (0) {
@@ -1005,15 +1004,15 @@ int32_t update_marker_names(Two_col_params* update_name, uint32_t* marker_id_hta
   }
  update_marker_names_ret_1:
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t update_marker_alleles(char* update_alleles_fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, char** marker_allele_ptrs, uintptr_t* max_marker_allele_len_ptr, char* outname, char* outname_end) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   FILE* errfile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t max_marker_allele_len = *max_marker_allele_len_ptr;
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
@@ -1039,26 +1038,25 @@ int32_t update_marker_alleles(char* update_alleles_fname, uint32_t* marker_id_ht
   uint32_t uoo;
   uint32_t upp;
   uint32_t uqq;
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen)) {
     goto update_marker_alleles_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
-  if (fopen_checked(&infile, update_alleles_fname, "r")) {
+  if (fopen_checked(update_alleles_fname, "r", &infile)) {
     goto update_marker_alleles_ret_OPEN_FAIL;
   }
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto update_marker_alleles_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_alloc(loadbuf_size);
+  loadbuf = (char*)bigstack_alloc(loadbuf_size);
   loadbuf[loadbuf_size - 1] = ' ';
   while (fgets(loadbuf, loadbuf_size, infile)) {
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-alleles file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-alleles file is pathologically long.\n", line_idx);
 	goto update_marker_alleles_ret_INVALID_FORMAT_2;
       } else {
 	goto update_marker_alleles_ret_NOMEM;
@@ -1079,7 +1077,7 @@ int32_t update_marker_alleles(char* update_alleles_fname, uint32_t* marker_id_ht
       LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --update-alleles file.\n", bufptr3);
       goto update_marker_alleles_ret_INVALID_FORMAT_2;
     }
-    SET_BIT(already_seen, marker_uidx);
+    SET_BIT(marker_uidx, already_seen);
     bufptr2 = skip_initial_spaces(bufptr2);
     len2 = strlen_se(bufptr2);
     bufptr = &(bufptr2[len2]);
@@ -1119,13 +1117,13 @@ int32_t update_marker_alleles(char* update_alleles_fname, uint32_t* marker_id_ht
       if (len2 >= max_marker_allele_len) {
 	max_marker_allele_len = len2 + 1;
       }
-      allele_reset(&(marker_allele_ptrs[2 * marker_uidx]), bufptr2, len2);
-      allele_reset(&(marker_allele_ptrs[2 * marker_uidx + 1]), bufptr, len);
+      allele_reset(bufptr2, len2, &(marker_allele_ptrs[2 * marker_uidx]));
+      allele_reset(bufptr, len, &(marker_allele_ptrs[2 * marker_uidx + 1]));
       hit_ct++;
     } else {
       if (!err_ct) {
 	memcpy(outname_end, ".allele.no.snp", 15);
-	if (fopen_checked(&errfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &errfile)) {
 	  goto update_marker_alleles_ret_OPEN_FAIL;
 	}
       }
@@ -1146,9 +1144,9 @@ int32_t update_marker_alleles(char* update_alleles_fname, uint32_t* marker_id_ht
   }
   *max_marker_allele_len_ptr = max_marker_allele_len;
   if (miss_ct) {
-    sprintf(logbuf, "--update-alleles: %" PRIuPTR " variant%s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-alleles: %" PRIuPTR " variant%s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-alleles: %" PRIuPTR " variant%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-alleles: %" PRIuPTR " variant%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
   }
   logprintb();
   if (err_ct) {
@@ -1177,7 +1175,7 @@ int32_t update_marker_alleles(char* update_alleles_fname, uint32_t* marker_id_ht
   }
   fclose_cond(infile);
   fclose_cond(errfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -1216,7 +1214,7 @@ uint32_t flip_process_token(char* tok_start, uint32_t* marker_id_htable, uint32_
     LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --flip file.\n", tok_start);
     return 1;
   }
-  SET_BIT(already_seen, marker_uidx);
+  SET_BIT(marker_uidx, already_seen);
   cur_non_acgt0 = 0;
   cur_non_acgt0 |= flip_str(&(marker_allele_ptrs[2 * marker_uidx]));
   cur_non_acgt0 |= flip_str(&(marker_allele_ptrs[2 * marker_uidx + 1]));
@@ -1226,12 +1224,12 @@ uint32_t flip_process_token(char* tok_start, uint32_t* marker_id_htable, uint32_
 }
 
 int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, char** marker_allele_ptrs) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* flipfile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
-  char* midbuf = &(tbuf[MAXLINELEN]);
+  char* midbuf = &(g_textbuf[MAXLINELEN]);
   uint32_t non_acgt_ct = 0;
   uint32_t curtoklen = 0;
   int32_t retval = 0;
@@ -1241,13 +1239,12 @@ int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marke
   char* bufptr;
   char* bufptr2;
   char* bufptr3;
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen)) {
     goto flip_strand_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
   // Compatibility fix: PLINK 1.07 uses a token- rather than a line-based
   // loader here.
-  if (fopen_checked(&flipfile, flip_fname, "rb")) {
+  if (fopen_checked(flip_fname, FOPEN_RB, &flipfile)) {
     goto flip_strand_ret_OPEN_FAIL;
   }
   while (1) {
@@ -1256,7 +1253,7 @@ int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marke
     }
     if (!bufsize) {
       if (curtoklen) {
-        if (flip_process_token(&(tbuf[MAXLINELEN - curtoklen]), marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, marker_exclude, already_seen, marker_allele_ptrs, &hit_ct, &miss_ct, &non_acgt_ct)) {
+        if (flip_process_token(&(g_textbuf[MAXLINELEN - curtoklen]), marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, marker_exclude, already_seen, marker_allele_ptrs, &hit_ct, &miss_ct, &non_acgt_ct)) {
 	  goto flip_strand_ret_INVALID_FORMAT_2;
 	}
       }
@@ -1265,7 +1262,7 @@ int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marke
     bufptr0 = &(midbuf[bufsize]);
     *bufptr0 = ' ';
     bufptr0[1] = '0';
-    bufptr = &(tbuf[MAXLINELEN - curtoklen]);
+    bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
     bufptr2 = midbuf;
     if (curtoklen) {
       goto flip_strand_tok_start;
@@ -1284,12 +1281,12 @@ int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marke
         bufptr2++;
       }
       curtoklen = (uintptr_t)(bufptr2 - bufptr);
-      if (bufptr2 == &(tbuf[MAXLINELEN * 2])) {
+      if (bufptr2 == &(g_textbuf[MAXLINELEN * 2])) {
         if (curtoklen > MAX_ID_LEN) {
 	  logerrprint("Error: Excessively long ID in --flip file.\n");
 	  goto flip_strand_ret_INVALID_FORMAT;
 	}
-        bufptr3 = &(tbuf[MAXLINELEN - curtoklen]);
+        bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
         memcpy(bufptr3, bufptr, curtoklen);
 	break;
       }
@@ -1303,9 +1300,9 @@ int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marke
     goto flip_strand_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--flip: %" PRIuPTR " SNP%s flipped, %" PRIuPTR " SNP ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--flip: %" PRIuPTR " SNP%s flipped, %" PRIuPTR " SNP ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--flip: %" PRIuPTR " SNP%s flipped.\n", hit_ct, (hit_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--flip: %" PRIuPTR " SNP%s flipped.\n", hit_ct, (hit_ct == 1)? "" : "s");
   }
   logprintb();
   if (non_acgt_ct) {
@@ -1328,16 +1325,16 @@ int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marke
     break;
   }
   fclose_cond(flipfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t update_sample_ids(char* update_ids_fname, char* sorted_sample_ids, uintptr_t sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, char* sample_ids) {
   // file has been pre-scanned
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   int32_t retval = 0;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
   uintptr_t line_idx = 0;
@@ -1349,28 +1346,27 @@ int32_t update_sample_ids(char* update_ids_fname, char* sorted_sample_ids, uintp
   uintptr_t sample_uidx;
   uint32_t len;
   int32_t sorted_idx;
-  if (wkspace_alloc_c_checked(&idbuf, max_sample_id_len) ||
-      wkspace_alloc_ul_checked(&already_seen, sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_c(max_sample_id_len, &idbuf) ||
+      bigstack_calloc_ul(sample_ctl, &already_seen)) {
     goto update_sample_ids_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, sample_ctl);
-  if (fopen_checked(&infile, update_ids_fname, "r")) {
+  if (fopen_checked(update_ids_fname, "r", &infile)) {
     goto update_sample_ids_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  g_textbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
+    if (!g_textbuf[MAXLINELEN - 1]) {
       // er, either this buffer should be extended, or the
       // scan_max_fam_indiv_strlen() should use this length...
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-ids file is pathologically long.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-ids file is pathologically long.\n", line_idx);
       goto update_sample_ids_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    bsearch_read_fam_indiv(idbuf, sorted_sample_ids, max_sample_id_len, sample_ct, bufptr, &bufptr, &sorted_idx);
+    bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sample_ct, &bufptr, &sorted_idx, idbuf);
     if (sorted_idx == -1) {
       miss_ct++;
       continue;
@@ -1380,7 +1376,7 @@ int32_t update_sample_ids(char* update_ids_fname, char* sorted_sample_ids, uintp
       LOGPREPRINTFWW("Error: Duplicate sample ID '%s' in --update-ids file.\n", idbuf);
       goto update_sample_ids_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, sorted_idx);
+    set_bit(sorted_idx, already_seen);
     sample_uidx = sample_id_map[((uint32_t)sorted_idx)];
     wptr = &(sample_ids[sample_uidx * max_sample_id_len]);
     len = strlen_se(bufptr);
@@ -1389,7 +1385,7 @@ int32_t update_sample_ids(char* update_ids_fname, char* sorted_sample_ids, uintp
     bufptr = skip_initial_spaces(&(bufptr2[1]));
     len = strlen_se(bufptr);
     if ((len == 1) && (*bufptr == '0')) {
-      sprintf(logbuf, "Error: Invalid IID '0' on line %" PRIuPTR " of --update-ids file.\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid IID '0' on line %" PRIuPTR " of --update-ids file.\n", line_idx);
       goto update_sample_ids_ret_INVALID_FORMAT_2;
     }
     memcpyx(wptr, bufptr, len, '\0');
@@ -1399,9 +1395,9 @@ int32_t update_sample_ids(char* update_ids_fname, char* sorted_sample_ids, uintp
     goto update_sample_ids_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--update-ids: %" PRIuPTR " %s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, species_str(hit_ct), miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-ids: %" PRIuPTR " %s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, species_str(hit_ct), miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-ids: %" PRIuPTR " %s updated.\n", hit_ct, species_str(hit_ct));
+    sprintf(g_logbuf, "--update-ids: %" PRIuPTR " %s updated.\n", hit_ct, species_str(hit_ct));
   }
   logprintb();
 
@@ -1421,15 +1417,15 @@ int32_t update_sample_ids(char* update_ids_fname, char* sorted_sample_ids, uintp
     break;
   }
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t update_sample_parents(char* update_parents_fname, char* sorted_sample_ids, uintptr_t sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uintptr_t* founder_info) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   int32_t retval = 0;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
   char* idbuf;
@@ -1444,23 +1440,21 @@ int32_t update_sample_parents(char* update_parents_fname, char* sorted_sample_id
   uint32_t len;
   uint32_t len2;
   int32_t sorted_idx;
-  if (wkspace_alloc_c_checked(&idbuf, max_sample_id_len) ||
-      wkspace_alloc_ul_checked(&already_seen, sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_c(max_sample_id_len, &idbuf) ||
+      bigstack_calloc_ul(sample_ctl, &already_seen)) {
     goto update_sample_parents_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, sample_ctl);
-  if (fopen_checked(&infile, update_parents_fname, "r")) {
+  if (fopen_checked(update_parents_fname, "r", &infile)) {
     goto update_sample_parents_ret_OPEN_FAIL;
   }
   // permit very long lines since this can be pointed at .ped files
-  if (wkspace_left > MAXLINEBUFLEN) {
+  loadbuf_size = bigstack_left();
+  if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
-  } else if (wkspace_left > MAXLINELEN) {
-    loadbuf_size = wkspace_left;
-  } else {
+  } else if (loadbuf_size <= MAXLINELEN) {
     goto update_sample_parents_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (fgets(loadbuf, loadbuf_size, infile)) {
     // no line_idx since all the validation happened earlier
@@ -1471,7 +1465,7 @@ int32_t update_sample_parents(char* update_parents_fname, char* sorted_sample_id
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    bsearch_read_fam_indiv(idbuf, sorted_sample_ids, max_sample_id_len, sample_ct, bufptr, &bufptr, &sorted_idx);
+    bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sample_ct, &bufptr, &sorted_idx, idbuf);
     if (sorted_idx == -1) {
       miss_ct++;
       continue;
@@ -1481,7 +1475,7 @@ int32_t update_sample_parents(char* update_parents_fname, char* sorted_sample_id
       LOGPREPRINTFWW("Error: Duplicate sample ID '%s' in --update-parents file.\n", idbuf);
       goto update_sample_parents_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, sorted_idx);
+    set_bit(sorted_idx, already_seen);
     sample_uidx = sample_id_map[((uint32_t)sorted_idx)];
     wptr = &(paternal_ids[sample_uidx * max_paternal_id_len]);
     len = strlen_se(bufptr);
@@ -1492,9 +1486,9 @@ int32_t update_sample_parents(char* update_parents_fname, char* sorted_sample_id
     len2 = strlen_se(bufptr3);
     memcpyx(wptr, bufptr3, len2, '\0');
     if ((len == 1) && (*bufptr == '0') && (len2 == 1) && (*bufptr3 == '0')) {
-      SET_BIT(founder_info, sample_uidx);
+      SET_BIT(sample_uidx, founder_info);
     } else {
-      CLEAR_BIT(founder_info, sample_uidx);
+      CLEAR_BIT(sample_uidx, founder_info);
     }
     hit_ct++;
   }
@@ -1502,9 +1496,9 @@ int32_t update_sample_parents(char* update_parents_fname, char* sorted_sample_id
     goto update_sample_parents_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--update-parents: %" PRIuPTR " %s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, species_str(hit_ct), miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-parents: %" PRIuPTR " %s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, species_str(hit_ct), miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-parents: %" PRIuPTR " %s updated.\n", hit_ct, species_str(hit_ct));
+    sprintf(g_logbuf, "--update-parents: %" PRIuPTR " %s updated.\n", hit_ct, species_str(hit_ct));
   }
   logprintb();
 
@@ -1524,15 +1518,15 @@ int32_t update_sample_parents(char* update_parents_fname, char* sorted_sample_id
     break;
   }
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t update_sample_sexes(char* update_sex_fname, uint32_t update_sex_col, char* sorted_sample_ids, uintptr_t sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t* sex_nm, uintptr_t* sex_male) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   int32_t retval = 0;
-  uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
   uintptr_t line_idx = 0;
@@ -1546,29 +1540,27 @@ int32_t update_sample_sexes(char* update_sex_fname, uint32_t update_sex_col, cha
   char cc;
   unsigned char ucc;
   update_sex_col--;
-  if (wkspace_alloc_c_checked(&idbuf, max_sample_id_len) ||
-      wkspace_alloc_ul_checked(&already_seen, sample_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_c(max_sample_id_len, &idbuf) ||
+      bigstack_calloc_ul(sample_ctl, &already_seen)) {
     goto update_sample_sexes_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, sample_ctl);
-  if (fopen_checked(&infile, update_sex_fname, "r")) {
+  if (fopen_checked(update_sex_fname, "r", &infile)) {
     goto update_sample_sexes_ret_OPEN_FAIL;
   }
   // permit very long lines since this can be pointed at .ped files
-  if (wkspace_left > MAXLINEBUFLEN) {
+  loadbuf_size = bigstack_left();
+  if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
-  } else if (wkspace_left > MAXLINELEN) {
-    loadbuf_size = wkspace_left;
-  } else {
+  } else if (loadbuf_size <= MAXLINELEN) {
     goto update_sample_sexes_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   while (fgets(loadbuf, loadbuf_size, infile)) {
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-sex file is pathologically long.\n", line_idx);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-sex file is pathologically long.\n", line_idx);
 	goto update_sample_sexes_ret_INVALID_FORMAT_2;
       } else {
 	goto update_sample_sexes_ret_NOMEM;
@@ -1578,7 +1570,7 @@ int32_t update_sample_sexes(char* update_sex_fname, uint32_t update_sex_col, cha
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
-    if (bsearch_read_fam_indiv(idbuf, sorted_sample_ids, max_sample_id_len, sample_ct, bufptr, &bufptr, &sorted_idx)) {
+    if (bsearch_read_fam_indiv(bufptr, sorted_sample_ids, max_sample_id_len, sample_ct, &bufptr, &sorted_idx, idbuf)) {
       goto update_sample_sexes_ret_MISSING_TOKENS;
     }
     if (sorted_idx == -1) {
@@ -1590,7 +1582,7 @@ int32_t update_sample_sexes(char* update_sex_fname, uint32_t update_sex_col, cha
       LOGPREPRINTFWW("Error: Duplicate sample ID '%s' in --update-sex file.\n", idbuf);
       goto update_sample_sexes_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, sorted_idx);
+    set_bit(sorted_idx, already_seen);
     sample_uidx = sample_id_map[((uint32_t)sorted_idx)];
     bufptr = next_token_multz(bufptr, update_sex_col);
     if (no_more_tokens_kns(bufptr)) {
@@ -1599,18 +1591,18 @@ int32_t update_sample_sexes(char* update_sex_fname, uint32_t update_sex_col, cha
     cc = *bufptr;
     ucc = ((unsigned char)cc) & 0xdfU;
     if ((cc < '0') || ((cc > '2') && (ucc != 'M') && (ucc != 'F')) || (bufptr[1] > ' ')) {
-      sprintf(logbuf, "Error: Invalid sex value on line %" PRIuPTR " of --update-sex file.\n(Acceptable values: 1/M = male, 2/F = female, 0 = missing.)\n", line_idx);
+      sprintf(g_logbuf, "Error: Invalid sex value on line %" PRIuPTR " of --update-sex file.\n(Acceptable values: 1/M = male, 2/F = female, 0 = missing.)\n", line_idx);
       goto update_sample_sexes_ret_INVALID_FORMAT_2;
     }
     if (cc == '0') {
-      CLEAR_BIT(sex_nm, sample_uidx);
-      CLEAR_BIT(sex_male, sample_uidx);
+      CLEAR_BIT(sample_uidx, sex_nm);
+      CLEAR_BIT(sample_uidx, sex_male);
     } else {
-      SET_BIT(sex_nm, sample_uidx);
+      SET_BIT(sample_uidx, sex_nm);
       if ((cc == '1') || (ucc == 'M')) {
-	SET_BIT(sex_male, sample_uidx);
+	SET_BIT(sample_uidx, sex_male);
       } else {
-	CLEAR_BIT(sex_male, sample_uidx);
+	CLEAR_BIT(sample_uidx, sex_male);
       }
     }
     hit_ct++;
@@ -1619,9 +1611,9 @@ int32_t update_sample_sexes(char* update_sex_fname, uint32_t update_sex_col, cha
     goto update_sample_sexes_ret_READ_FAIL;
   }
   if (miss_ct) {
-    sprintf(logbuf, "--update-sex: %" PRIuPTR " %s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, species_str(hit_ct), miss_ct, (miss_ct == 1)? "" : "s");
+    sprintf(g_logbuf, "--update-sex: %" PRIuPTR " %s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, species_str(hit_ct), miss_ct, (miss_ct == 1)? "" : "s");
   } else {
-    sprintf(logbuf, "--update-sex: %" PRIuPTR " %s updated.\n", hit_ct, species_str(hit_ct));
+    sprintf(g_logbuf, "--update-sex: %" PRIuPTR " %s updated.\n", hit_ct, species_str(hit_ct));
   }
   logprintb();
 
@@ -1636,14 +1628,14 @@ int32_t update_sample_sexes(char* update_sex_fname, uint32_t update_sex_col, cha
     retval = RET_READ_FAIL;
     break;
   update_sample_sexes_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --update-sex file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-sex file has fewer tokens than expected.\n", line_idx);
   update_sample_sexes_ret_INVALID_FORMAT_2:
     logerrprintb();
     retval = RET_INVALID_FORMAT;
     break;
   }
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
@@ -1734,7 +1726,7 @@ int32_t load_one_freq(uint32_t alen1, const char* aptr1, uint32_t alen2, const c
   if ((malen0 == alen1) && (!memcmp(mastrs_ptr[0], aptr1, alen1))) {
     if (((malen1 != alen2)) || memcmp(mastrs_ptr[1], aptr2, alen2)) {
       if (missing1) {
-	if (allele_set(&(mastrs_ptr[1]), aptr2, alen2)) {
+	if (allele_set(aptr2, alen2, &(mastrs_ptr[1]))) {
 	  return RET_NOMEM;
 	}
       } else {
@@ -1745,7 +1737,7 @@ int32_t load_one_freq(uint32_t alen1, const char* aptr1, uint32_t alen2, const c
   } else if ((malen1 == alen1) && (!memcmp(mastrs_ptr[1], aptr1, alen1))) {
     if ((malen0 != alen2) || memcmp(mastrs_ptr[0], aptr2, alen2)) {
       if (missing0) {
-        if (allele_set(&(mastrs_ptr[0]), aptr2, alen2)) {
+        if (allele_set(aptr2, alen2, &(mastrs_ptr[0]))) {
 	  return RET_NOMEM;
 	}
       } else {
@@ -1754,12 +1746,12 @@ int32_t load_one_freq(uint32_t alen1, const char* aptr1, uint32_t alen2, const c
     }
     *set_allele_freq_ptr = maf;
   } else if (missing0 && (!missing1) && (malen1 == alen2) && (!memcmp(mastrs_ptr[1], aptr2, alen2))) {
-    if (allele_set(&(mastrs_ptr[0]), aptr1, alen1)) {
+    if (allele_set(aptr1, alen1, &(mastrs_ptr[0]))) {
       return RET_NOMEM;
     }
     *set_allele_freq_ptr = 1.0 - maf;
   } else if (missing1 && (!missing0) && (malen0 == alen2) && (!memcmp(mastrs_ptr[0], aptr2, alen2))) {
-    if (allele_set(&(mastrs_ptr[1]), aptr1, alen1)) {
+    if (allele_set(aptr1, alen1, &(mastrs_ptr[1]))) {
       return RET_NOMEM;
     }
     *set_allele_freq_ptr = maf;
@@ -1837,7 +1829,7 @@ uint32_t get_freq_file_type(char* bufptr) {
 }
 
 int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* freqfile = NULL;
   uintptr_t line_idx = 0;
   uint32_t freq_counts = 0;
@@ -1869,20 +1861,20 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
   int32_t c_hap_a1;
   int32_t c_hap_a2;
   int32_t ii;
-  if (fopen_checked(&freqfile, freqname, "r")) {
+  if (fopen_checked(freqname, "r", &freqfile)) {
     goto read_external_freqs_ret_OPEN_FAIL;
   }
-  retval = sort_item_ids(&sorted_ids, &id_map, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref);
+  retval = sort_item_ids(unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, &sorted_ids, &id_map);
   if (retval) {
     goto read_external_freqs_ret_1;
   }
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto read_external_freqs_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   loadbuf[loadbuf_size - 1] = ' ';
   do {
     if (!fgets(loadbuf, loadbuf_size, freqfile)) {
@@ -2041,18 +2033,18 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
 	    goto read_external_freqs_ret_INVALID_HOM_A1;
 	  }
 	  if (scan_uint_icap(bufptr2, (uint32_t*)&c_het)) {
-	    sprintf(logbuf, "Error: Invalid het count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	    sprintf(g_logbuf, "Error: Invalid het count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
 	    goto read_external_freqs_ret_INVALID_FORMAT_2;
 	  }
 	  if (scan_uint_icap(bufptr3, (uint32_t*)&c_hom_a2)) {
 	    goto read_external_freqs_ret_INVALID_HOM_A2;
 	  }
 	  if (scan_uint_icap(bufptr4, (uint32_t*)&c_hap_a1)) {
-	    sprintf(logbuf, "Error: Invalid hap. A1 count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	    sprintf(g_logbuf, "Error: Invalid hap. A1 count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
 	    goto read_external_freqs_ret_INVALID_FORMAT_2;
 	  }
 	  if (scan_uint_icap(bufptr5, (uint32_t*)&c_hap_a2)) {
-	    sprintf(logbuf, "Error: Invalid hap. A2 count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	    sprintf(g_logbuf, "Error: Invalid hap. A2 count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
 	    goto read_external_freqs_ret_INVALID_FORMAT_2;
 	  }
 	  cur_nchrobs = 2 * (c_hom_a1 + c_het + c_hom_a2 + maf_succ) + c_hap_a1 + c_hap_a2;
@@ -2118,7 +2110,7 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
 	  goto read_external_freqs_ret_MISSING_TOKENS;
 	}
         if (!no_more_tokens_kns(next_token(bufptr))) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of --read-freq has more tokens than expected.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --read-freq has more tokens than expected.\n", line_idx);
 	  goto read_external_freqs_ret_INVALID_FORMAT_2;
 	}
       }
@@ -2162,7 +2154,7 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
     retval = RET_INVALID_FORMAT;
     break;
   read_external_freqs_ret_INVALID_CHROM:
-    sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR" of --read-freq file.\n", line_idx);
+    sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR" of --read-freq file.\n", line_idx);
   read_external_freqs_ret_INVALID_FORMAT_2:
     logerrprintb();
   read_external_freqs_ret_INVALID_FORMAT:
@@ -2181,18 +2173,18 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
   }
  read_external_freqs_ret_1:
   fclose_cond(freqfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char** marker_allele_ptrs, uintptr_t* max_marker_allele_len_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uint32_t is_a2) {
   // note that swap_reversed_marker_alleles() has NOT been called yet
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   char skipchar = axalleles->skipchar;
   const char* missing_geno_ptr = g_missing_geno_ptr;
   uint32_t colid_first = (axalleles->colid < axalleles->colx);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t max_marker_allele_len = *max_marker_allele_len_ptr;
   uintptr_t* already_seen;
   char* loadbuf;
@@ -2210,21 +2202,20 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
   char cc;
   uint32_t replace_other;
   int32_t retval;
-  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable_size, &marker_id_htable);
   if (retval) {
     goto load_ax_alleles_ret_1;
   }
-  if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(unfiltered_marker_ctl, &already_seen)) {
     goto load_ax_alleles_ret_NOMEM;
   }
-  fill_ulong_zero(already_seen, unfiltered_marker_ctl);
-  loadbuf_size = wkspace_left;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto load_ax_alleles_ret_NOMEM;
   }
-  loadbuf = (char*)wkspace_base;
+  loadbuf = (char*)g_bigstack_base;
   retval = open_and_skip_first_lines(&infile, axalleles->fname, loadbuf, loadbuf_size, axalleles->skip);
   if (retval) {
     goto load_ax_alleles_ret_1;
@@ -2269,22 +2260,22 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
       LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --a%c-allele file.\n", colid_ptr, is_a2? '2' : '1');
       goto load_ax_alleles_ret_INVALID_FORMAT_2;
     }
-    set_bit(already_seen, marker_uidx);
+    set_bit(marker_uidx, already_seen);
     alen = strlen_se(colx_ptr);
     colx_ptr[alen] = '\0';
     if (!strcmp(colx_ptr, marker_allele_ptrs[marker_uidx * 2 + is_a2])) {
       if (IS_SET(marker_reverse, marker_uidx)) {
         set_allele_freqs[marker_uidx] = 1.0 - set_allele_freqs[marker_uidx];
-        CLEAR_BIT(marker_reverse, marker_uidx);
+        CLEAR_BIT(marker_uidx, marker_reverse);
       }
     } else if (!strcmp(colx_ptr, marker_allele_ptrs[marker_uidx * 2 + 1 - is_a2])) {
       if (!IS_SET(marker_reverse, marker_uidx)) {
         set_allele_freqs[marker_uidx] = 1.0 - set_allele_freqs[marker_uidx];
-        SET_BIT(marker_reverse, marker_uidx);
+        SET_BIT(marker_uidx, marker_reverse);
       }
     } else if ((marker_allele_ptrs[marker_uidx * 2] == missing_geno_ptr) || (marker_allele_ptrs[marker_uidx * 2 + 1] == missing_geno_ptr)) {
       replace_other = (marker_allele_ptrs[marker_uidx * 2 + is_a2] != missing_geno_ptr);
-      if (allele_reset(&(marker_allele_ptrs[marker_uidx * 2 + (is_a2 ^ replace_other)]), colx_ptr, alen)) {
+      if (allele_reset(colx_ptr, alen, &(marker_allele_ptrs[marker_uidx * 2 + (is_a2 ^ replace_other)]))) {
 	goto load_ax_alleles_ret_NOMEM;
       }
       if (alen >= max_marker_allele_len) {
@@ -2293,12 +2284,12 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
       if (!replace_other) {
 	if (IS_SET(marker_reverse, marker_uidx)) {
 	  set_allele_freqs[marker_uidx] = 1.0 - set_allele_freqs[marker_uidx];
-	  CLEAR_BIT(marker_reverse, marker_uidx);
+	  CLEAR_BIT(marker_uidx, marker_reverse);
 	}
       } else {
 	if (!IS_SET(marker_reverse, marker_uidx)) {
 	  set_allele_freqs[marker_uidx] = 1.0 - set_allele_freqs[marker_uidx];
-	  SET_BIT(marker_reverse, marker_uidx);
+	  SET_BIT(marker_uidx, marker_reverse);
 	}
       }
     } else {
@@ -2326,17 +2317,17 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
   }
  load_ax_alleles_ret_1:
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t s [...]
   // unfiltered_sample_ct == 0 ok
-  unsigned char* wkspace_mark = wkspace_base;
-  char* writebuf = tbuf;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* writebuf = g_textbuf;
   char* pzwritep = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uint32_t* cur_cluster_map = cluster_map;
   uint32_t* cur_cluster_starts = cluster_starts;
   uint32_t* cluster_map_nonmale = NULL;
@@ -2371,18 +2362,18 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
   uint32_t uii;
   pzwrite_init_null(&ps);
   uii = 2 * max_marker_allele_len + max_marker_id_len + max_cluster_id_len + 256;
-  if (wkspace_alloc_uc_checked(&overflow_buf, uii + PIGZ_BLOCK_SIZE) ||
-      wkspace_alloc_ul_checked(&readbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_uc(uii + PIGZ_BLOCK_SIZE, &overflow_buf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &readbuf)) {
     goto write_stratified_freqs_ret_NOMEM;
   }
   if (uii > MAXLINELEN) {
-    if (wkspace_alloc_c_checked(&writebuf, uii)) {
+    if (bigstack_alloc_c(uii, &writebuf)) {
       goto write_stratified_freqs_ret_NOMEM;
     }
   }
   if ((sample_ct > sample_f_ct) && (!nonfounders)) {
-    if (wkspace_alloc_ui_checked(&cur_cluster_starts, (cluster_ct + 1) * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&cur_cluster_map, sample_f_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(cluster_ct + 1, &cur_cluster_starts) ||
+        bigstack_alloc_ui(sample_f_ct, &cur_cluster_map)) {
       goto write_stratified_freqs_ret_NOMEM;
     }
     clmpos = 0;
@@ -2401,8 +2392,8 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
   }
   chrom_idx = chrom_info_ptr->x_code;
   if ((chrom_idx != -1) && is_set(chrom_info_ptr->chrom_mask, chrom_idx)) {
-    if (wkspace_alloc_ui_checked(&cluster_starts_nonmale, (cluster_ct + 1) * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&cluster_map_nonmale, (sample_f_ct - sample_f_male_ct) * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(cluster_ct + 1, &cluster_starts_nonmale) ||
+        bigstack_alloc_ui(sample_f_ct - sample_f_male_ct, &cluster_map_nonmale)) {
       goto write_stratified_freqs_ret_NOMEM;
     }
     clmpos = 0;
@@ -2421,8 +2412,8 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
   }
   chrom_idx = chrom_info_ptr->y_code;
   if (cluster_map_nonmale || ((chrom_idx != -1) && is_set(chrom_info_ptr->chrom_mask, chrom_idx))) {
-    if (wkspace_alloc_ui_checked(&cluster_starts_male, (cluster_ct + 1) * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&cluster_map_male, sample_f_male_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(cluster_ct + 1, &cluster_starts_male) ||
+        bigstack_alloc_ui(sample_f_male_ct, &cluster_map_male)) {
       goto write_stratified_freqs_ret_NOMEM;
     }
     clmpos = 0;
@@ -2444,9 +2435,9 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
     goto write_stratified_freqs_ret_OPEN_FAIL;
   }
   pzwritep = (char*)overflow_buf;
-  sprintf(tbuf, " CHR %%%us     CLST   A1   A2      MAF    MAC  NCHROBS" EOLN_STR, plink_maxsnp);
-  pzwritep += sprintf(pzwritep, tbuf, "SNP");
-  if (wkspace_alloc_c_checked(&csptr, 2 * max_marker_allele_len + 16)) {
+  sprintf(g_textbuf, " CHR %%%us     CLST   A1   A2      MAF    MAC  NCHROBS" EOLN_STR, plink_maxsnp);
+  pzwritep += sprintf(pzwritep, g_textbuf, "SNP");
+  if (bigstack_alloc_c(2 * max_marker_allele_len + 16, &csptr)) {
     goto write_stratified_freqs_ret_NOMEM;
   }
   memset(csptr, 32, 10);
@@ -2465,7 +2456,7 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
       goto write_stratified_freqs_ret_READ_FAIL;
     }
-    col_2_start = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_idx));
+    col_2_start = width_force(4, writebuf, chrom_name_write(chrom_info_ptr, chrom_idx, writebuf));
     *col_2_start++ = ' ';
     do {
       sptr = &(marker_ids[marker_uidx * max_marker_id_len]);
@@ -2480,11 +2471,11 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
       *wptr++ = ' ';
       cslen = (uintptr_t)(wptr - csptr);
 
-      if (load_raw(bedfile, readbuf, unfiltered_sample_ct4)) {
+      if (load_raw(unfiltered_sample_ct4, bedfile, readbuf)) {
 	goto write_stratified_freqs_ret_READ_FAIL;
       }
       if (IS_SET(marker_reverse, marker_uidx)) {
-	reverse_loadbuf((unsigned char*)readbuf, unfiltered_sample_ct);
+	reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)readbuf);
       }
       if (is_x) {
 	uiptr = cluster_map_nonmale;
@@ -2510,10 +2501,9 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
 	  a1_obs += cur_cts[0];
 	  tot_obs += cur_cts[0] + cur_cts[3];
 	  if (tot_obs) {
-            pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
-	    pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
-	    pzwritep = uint32_writew8(pzwritep, tot_obs);
-	    *pzwritep++ = ' ';
+            pzwritep = dtoa_g_wxp4x(((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ', pzwritep);
+	    pzwritep = uint32toa_w6x(a1_obs, ' ', pzwritep);
+	    pzwritep = uint32toa_w8x(tot_obs, ' ', pzwritep);
 	  } else {
 	    pzwritep = memcpya(pzwritep, "       0      0        0 ", 25);
 	  }
@@ -2542,10 +2532,9 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
 	    tot_obs = 2 * (cur_cts[0] + cur_cts[2] + cur_cts[3]);
 	  }
 	  if (tot_obs) {
-            pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
-	    pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
-	    pzwritep = uint32_writew8(pzwritep, tot_obs);
-	    *pzwritep++ = ' ';
+            pzwritep = dtoa_g_wxp4x(((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ', pzwritep);
+	    pzwritep = uint32toa_w6x(a1_obs, ' ', pzwritep);
+	    pzwritep = uint32toa_w8x(tot_obs, ' ', pzwritep);
 	  } else {
 	    pzwritep = memcpya(pzwritep, "       0      0        0 ", 25);
 	  }
@@ -2574,10 +2563,9 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
 	    tot_obs = 2 * (cur_cts[0] + cur_cts[2] + cur_cts[3]);
 	  }
 	  if (tot_obs) {
-            pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
-	    pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
-	    pzwritep = uint32_writew8(pzwritep, tot_obs);
-	    *pzwritep++ = ' ';
+            pzwritep = dtoa_g_wxp4x(((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ', pzwritep);
+	    pzwritep = uint32toa_w6x(a1_obs, ' ', pzwritep);
+	    pzwritep = uint32toa_w8x(tot_obs, ' ', pzwritep);
 	  } else {
 	    pzwritep = memcpya(pzwritep, "       0      0        0 ", 25);
 	  }
@@ -2614,17 +2602,17 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
     retval = RET_WRITE_FAIL;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   flex_pzwrite_close_cond(&ps, pzwritep);
   return retval;
 }
 
 int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uintptr_t* marker_reverse, uintptr_t* pheno_nm, uintptr_t* ph [...]
   // unfiltered_sample_ct must be positive
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   char* pzwritep = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
   uintptr_t* loadbuf = NULL;
   uintptr_t* case_include2 = NULL;
   uintptr_t* ctrl_include2 = NULL;
@@ -2652,34 +2640,34 @@ int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
   uint32_t ctrl_obs;
   uint32_t uii;
   pzwrite_init_null(&ps);
-  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + max_marker_id_len + 256)) {
+  if (bigstack_alloc_uc(PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + max_marker_id_len + 256, &overflow_buf)) {
     goto write_cc_freqs_ret_NOMEM;
   }
 
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&case_include2, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&ctrl_include2, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&male_vec, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&nonmale_vec, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &case_include2) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &ctrl_include2) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &male_vec) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &nonmale_vec)) {
       goto write_cc_freqs_ret_NOMEM;
   }
   loadbuf[unfiltered_sample_ctl2 - 1] = 0;
-  vec_include_init(unfiltered_sample_ct, case_include2, pheno_c);
-  vec_include_init(unfiltered_sample_ct, ctrl_include2, pheno_nm);
+  init_quaterarr_from_bitarr(pheno_c, unfiltered_sample_ct, case_include2);
+  init_quaterarr_from_bitarr(pheno_nm, unfiltered_sample_ct, ctrl_include2);
   memcpy(nonmale_vec, ctrl_include2, unfiltered_sample_ctl2 * sizeof(intptr_t));
-  bitfield_andnot(ctrl_include2, case_include2, unfiltered_sample_ctl2);
-  vec_include_init(unfiltered_sample_ct, male_vec, sex_male);
-  bitfield_andnot(nonmale_vec, male_vec, unfiltered_sample_ctl2);
+  bitvec_andnot(case_include2, unfiltered_sample_ctl2, ctrl_include2);
+  init_quaterarr_from_bitarr(sex_male, unfiltered_sample_ct, male_vec);
+  bitvec_andnot(male_vec, unfiltered_sample_ctl2, nonmale_vec);
   if (!nonfounders) {
-    if (wkspace_alloc_ul_checked(&ulptr, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+    if (bigstack_alloc_ul(unfiltered_sample_ctl2, &ulptr)) {
       goto write_cc_freqs_ret_NOMEM;
     }
-    vec_include_init(unfiltered_sample_ct, ulptr, founder_info);
-    bitfield_and(case_include2, ulptr, unfiltered_sample_ctl2);
-    bitfield_and(ctrl_include2, ulptr, unfiltered_sample_ctl2);
-    bitfield_and(male_vec, ulptr, unfiltered_sample_ctl2);
-    bitfield_and(nonmale_vec, ulptr, unfiltered_sample_ctl2);
-    wkspace_reset(ulptr);
+    init_quaterarr_from_bitarr(founder_info, unfiltered_sample_ct, ulptr);
+    bitvec_and(ulptr, unfiltered_sample_ctl2, case_include2);
+    bitvec_and(ulptr, unfiltered_sample_ctl2, ctrl_include2);
+    bitvec_and(ulptr, unfiltered_sample_ctl2, male_vec);
+    bitvec_and(ulptr, unfiltered_sample_ctl2, nonmale_vec);
+    bigstack_reset(ulptr);
   }
   case_ct = popcount2_longs(case_include2, unfiltered_sample_ctl2);
   ctrl_ct = popcount2_longs(ctrl_include2, unfiltered_sample_ctl2);
@@ -2716,7 +2704,7 @@ int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       goto write_cc_freqs_ret_READ_FAIL;
     }
     do {
-      pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx));
+      pzwritep = width_force(4, pzwritep, chrom_name_write(chrom_info_ptr, chrom_idx, pzwritep));
       *pzwritep++ = ' ';
       pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
       *pzwritep++ = ' ';
@@ -2725,48 +2713,48 @@ int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
       pzwritep = fw_strcpy(4, marker_allele_ptrs[marker_uidx * 2 + 1], pzwritep);
       *pzwritep++ = ' ';
 
-      if (load_raw(bedfile, loadbuf, unfiltered_sample_ct4)) {
+      if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf)) {
 	goto write_cc_freqs_ret_READ_FAIL;
       }
       if (IS_SET(marker_reverse, marker_uidx)) {
-	reverse_loadbuf((unsigned char*)loadbuf, unfiltered_sample_ct);
+	reverse_loadbuf(unfiltered_sample_ct, (unsigned char*)loadbuf);
       }
       if (is_x) {
-	vec_set_freq_x(unfiltered_sample_ctl2, loadbuf, case_include2, male_vec, &case_set_ct, &case_missing_ct);
-	vec_set_freq_x(unfiltered_sample_ctl2, loadbuf, ctrl_include2, male_vec, &ctrl_set_ct, &ctrl_missing_ct);
+	genovec_set_freq_x(loadbuf, case_include2, male_vec, unfiltered_sample_ctl2, &case_set_ct, &case_missing_ct);
+	genovec_set_freq_x(loadbuf, ctrl_include2, male_vec, unfiltered_sample_ctl2, &ctrl_set_ct, &ctrl_missing_ct);
 	case_obs = 2 * case_ct - case_missing_ct;
 	ctrl_obs = 2 * ctrl_ct - ctrl_missing_ct;
       } else if (!is_haploid) {
-	vec_set_freq(unfiltered_sample_ctl2, loadbuf, case_include2, &case_set_ct, &case_missing_ct);
-	vec_set_freq(unfiltered_sample_ctl2, loadbuf, ctrl_include2, &ctrl_set_ct, &ctrl_missing_ct);
+	genovec_set_freq(loadbuf, case_include2, unfiltered_sample_ctl2, &case_set_ct, &case_missing_ct);
+	genovec_set_freq(loadbuf, ctrl_include2, unfiltered_sample_ctl2, &ctrl_set_ct, &ctrl_missing_ct);
 	case_obs = 2 * (case_ct - case_missing_ct);
 	ctrl_obs = 2 * (ctrl_ct - ctrl_missing_ct);
       } else {
         if (is_y) {
-	  vec_set_freq_y(unfiltered_sample_ctl2, loadbuf, case_include2, nonmale_vec, &case_set_ct, &case_missing_ct);
-	  vec_set_freq_y(unfiltered_sample_ctl2, loadbuf, ctrl_include2, nonmale_vec, &ctrl_set_ct, &ctrl_missing_ct);
+	  genovec_set_freq_y(loadbuf, case_include2, nonmale_vec, unfiltered_sample_ctl2, &case_set_ct, &case_missing_ct);
+	  genovec_set_freq_y(loadbuf, ctrl_include2, nonmale_vec, unfiltered_sample_ctl2, &ctrl_set_ct, &ctrl_missing_ct);
         } else {
-	  vec_3freq(unfiltered_sample_ctl2, loadbuf, case_include2, &case_missing_ct, &uii, &case_set_ct);
+	  genovec_3freq(loadbuf, case_include2, unfiltered_sample_ctl2, &case_missing_ct, &uii, &case_set_ct);
 	  case_missing_ct += uii;
-	  vec_3freq(unfiltered_sample_ctl2, loadbuf, ctrl_include2, &ctrl_missing_ct, &uii, &ctrl_set_ct);
+	  genovec_3freq(loadbuf, ctrl_include2, unfiltered_sample_ctl2, &ctrl_missing_ct, &uii, &ctrl_set_ct);
 	  ctrl_missing_ct += uii;
 	}
 	case_obs = case_ct - case_missing_ct;
 	ctrl_obs = ctrl_ct - ctrl_missing_ct;
       }
       if (case_obs) {
-	pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)(case_obs - case_set_ct))) / ((double)case_obs), 12, ' ');
+	pzwritep = dtoa_g_wxp4x(((double)((int32_t)(case_obs - case_set_ct))) / ((double)case_obs), 12, ' ', pzwritep);
       } else {
 	pzwritep = memcpya(pzwritep, "        NA ", 11);
       }
       if (ctrl_obs) {
-	pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)(ctrl_obs - ctrl_set_ct))) / ((double)ctrl_obs), 12, ' ');
+	pzwritep = dtoa_g_wxp4x(((double)((int32_t)(ctrl_obs - ctrl_set_ct))) / ((double)ctrl_obs), 12, ' ', pzwritep);
       } else {
 	pzwritep = memcpya(pzwritep, "        NA ", 11);
       }
 
-      pzwritep = uint32_writew10x(pzwritep, case_obs, ' ');
-      pzwritep = uint32_writew10(pzwritep, ctrl_obs);
+      pzwritep = uint32toa_w10x(case_obs, ' ', pzwritep);
+      pzwritep = uint32toa_w10(ctrl_obs, pzwritep);
       append_binary_eoln(&pzwritep);
       if (flex_pzwrite(&ps, &pzwritep)) {
 	goto write_cc_freqs_ret_WRITE_FAIL;
@@ -2799,13 +2787,13 @@ int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
     break;
   }
   flex_pzwrite_close_cond(&ps, pzwritep);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
   // unfiltered_sample_ct == 0 ok
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   char* pzwritep = NULL;
   uint32_t reverse = 0;
   uint32_t freq_counts = (misc_flags / MISC_FREQ_COUNTS) & 1;
@@ -2828,7 +2816,7 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
   int32_t chrom_idx;
   uint32_t uii;
   pzwrite_init_null(&ps);
-  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN)) {
+  if (bigstack_alloc_uc(PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN, &overflow_buf)) {
     goto write_freqs_ret_NOMEM;
   }
 
@@ -2886,19 +2874,19 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
 	  missing_ct = sample_f_ct - (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]);
 	}
 	if (freqx) {
-	  pzwritep = chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx);
+	  pzwritep = chrom_name_write(chrom_info_ptr, chrom_idx, pzwritep);
 	  *pzwritep++ = '\t';
 	  pzwritep = strcpyax(pzwritep, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
 	  pzwritep = strcpyax(pzwritep, minor_ptr, '\t');
           pzwritep = strcpyax(pzwritep, major_ptr, '\t');
-          pzwritep = uint32_writex(pzwritep, reverse? hh_cts[marker_uidx] : ll_cts[marker_uidx], '\t');
-	  pzwritep = uint32_writex(pzwritep, lh_cts[marker_uidx], '\t');
-          pzwritep = uint32_writex(pzwritep, reverse? ll_cts[marker_uidx] : hh_cts[marker_uidx], '\t');
-          pzwritep = uint32_writex(pzwritep, reverse? haph_cts[marker_uidx] : hapl_cts[marker_uidx], '\t');
-          pzwritep = uint32_writex(pzwritep, reverse? hapl_cts[marker_uidx] : haph_cts[marker_uidx], '\t');
-          pzwritep = uint32_write(pzwritep, missing_ct);
+          pzwritep = uint32toa_x(reverse? hh_cts[marker_uidx] : ll_cts[marker_uidx], '\t', pzwritep);
+	  pzwritep = uint32toa_x(lh_cts[marker_uidx], '\t', pzwritep);
+          pzwritep = uint32toa_x(reverse? ll_cts[marker_uidx] : hh_cts[marker_uidx], '\t', pzwritep);
+          pzwritep = uint32toa_x(reverse? haph_cts[marker_uidx] : hapl_cts[marker_uidx], '\t', pzwritep);
+          pzwritep = uint32toa_x(reverse? hapl_cts[marker_uidx] : haph_cts[marker_uidx], '\t', pzwritep);
+          pzwritep = uint32toa(missing_ct, pzwritep);
 	} else {
-	  pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx));
+	  pzwritep = width_force(4, pzwritep, chrom_name_write(chrom_info_ptr, chrom_idx, pzwritep));
 	  *pzwritep++ = ' ';
 	  pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
 	  *pzwritep++ = ' ';
@@ -2906,12 +2894,12 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
 	  *pzwritep++ = ' ';
 	  pzwritep = fw_strcpy(4, major_ptr, pzwritep);
 	  *pzwritep++ = ' ';
-          pzwritep = uint32_writew6x(pzwritep, 2 * ll_cts[marker_uidx] + lh_cts[marker_uidx] + hapl_cts[marker_uidx], ' ');
-	  pzwritep = uint32_writew6x(pzwritep, 2 * hh_cts[marker_uidx] + lh_cts[marker_uidx] + haph_cts[marker_uidx], ' ');
-	  pzwritep = uint32_writew6(pzwritep, missing_ct);
+          pzwritep = uint32toa_w6x(2 * ll_cts[marker_uidx] + lh_cts[marker_uidx] + hapl_cts[marker_uidx], ' ', pzwritep);
+	  pzwritep = uint32toa_w6x(2 * hh_cts[marker_uidx] + lh_cts[marker_uidx] + haph_cts[marker_uidx], ' ', pzwritep);
+	  pzwritep = uint32toa_w6(missing_ct, pzwritep);
 	}
       } else {
-	pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx));
+	pzwritep = width_force(4, pzwritep, chrom_name_write(chrom_info_ptr, chrom_idx, pzwritep));
 	*pzwritep++ = ' ';
 	pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
         *pzwritep++ = ' ';
@@ -2921,12 +2909,12 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
 	*pzwritep++ = ' ';
 	uii = 2 * (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]) + hapl_cts[marker_uidx] + haph_cts[marker_uidx];
 	if (maf_succ || uii || (set_allele_freqs[marker_uidx] != 0.5)) {
-	  pzwritep = double_g_writewx4(pzwritep, 1.0 - set_allele_freqs[marker_uidx], 12);
+	  pzwritep = dtoa_g_wxp4(1.0 - set_allele_freqs[marker_uidx], 12, pzwritep);
 	} else {
 	  pzwritep = memcpya(pzwritep, "          NA", 12);
 	}
 	*pzwritep++ = ' ';
-	pzwritep = uint32_writew8(pzwritep, uii);
+	pzwritep = uint32toa_w8(uii, pzwritep);
       }
       append_binary_eoln(&pzwritep);
       if (flex_pzwrite(&ps, &pzwritep)) {
@@ -2951,21 +2939,21 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
     break;
   }
   flex_pzwrite_close_cond(&ps, pzwritep);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* sex_nm, uintptr_t* sex_male, uint64_t misc_flags, double check_sex_fthresh, double check_sex_mthresh, uint32_t max_f_yobs, uint32_t min_m_yobs, Chrom_info* chrom [...]
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uint32_t* het_cts = NULL;
   uint32_t* missing_cts = NULL;
   double* nei_offsets = NULL;
   uint32_t* ymiss_cts = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   uintptr_t final_mask = get_final_mask(sample_ct);
   uintptr_t x_variant_ct = 0;
   uintptr_t ytotal = 0;
@@ -3010,17 +2998,16 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
   uintptr_t ulii;
   uint32_t orig_sex_code;
   uint32_t imputed_sex_code;
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, sample_ctl2 * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(sample_ctl2, &loadbuf)) {
     goto sexcheck_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   loadbuf[sample_ctl2 - 1] = 0;
   if (check_y) {
-    if (wkspace_alloc_ui_checked(&ymiss_cts, sample_ct * sizeof(int32_t))) {
+    if (bigstack_calloc_ui(sample_ct, &ymiss_cts)) {
       goto sexcheck_ret_NOMEM;
     }
-    fill_uint_zero(ymiss_cts, sample_ct);
   }
   if (!yonly) {
     if ((x_code == -1) || (!is_set(chrom_info_ptr->chrom_mask, (uint32_t)x_code))) {
@@ -3032,14 +3019,11 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
       goto sexcheck_ret_NO_X_VAR;
     }
     marker_idxs_left = marker_uidx_end - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, marker_uidx_end);
-    if (wkspace_alloc_ui_checked(&het_cts, sample_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&missing_cts, sample_ct * sizeof(int32_t)) ||
-        wkspace_alloc_d_checked(&nei_offsets, sample_ct * sizeof(double))) {
+    if (bigstack_calloc_ui(sample_ct, &het_cts) ||
+        bigstack_calloc_ui(sample_ct, &missing_cts) ||
+        bigstack_calloc_d(sample_ct, &nei_offsets)) {
       goto sexcheck_ret_NOMEM;
     }
-    fill_uint_zero(het_cts, sample_ct);
-    fill_uint_zero(missing_cts, sample_ct);
-    fill_double_zero(nei_offsets, sample_ct);
     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
       goto sexcheck_ret_READ_FAIL;
     }
@@ -3050,7 +3034,7 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 	  goto sexcheck_ret_READ_FAIL;
 	}
       }
-      if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, sample_ct, sample_exclude, final_mask, 0)) {
+      if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, 0, bedfile, loadbuf_raw, loadbuf)) {
 	goto sexcheck_ret_READ_FAIL;
       }
       cur_missing_ct = count_01(loadbuf, sample_ctl2);
@@ -3109,7 +3093,7 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 	    goto sexcheck_ret_READ_FAIL;
 	  }
 	}
-	if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, sample_ct, sample_exclude, final_mask, 0)) {
+	if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, 0, bedfile, loadbuf_raw, loadbuf)) {
 	  goto sexcheck_ret_READ_FAIL;
 	}
 	lptr = loadbuf;
@@ -3132,20 +3116,20 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
     }
   }
   memcpy(outname_end, ".sexcheck", 10);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto sexcheck_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, "%%%us %%%us       PEDSEX       SNPSEX       STATUS%s%s\n", plink_maxfid, plink_maxiid, yonly? "" : "            F", check_y? "   YCOUNT" : "");
-  fprintf(outfile, tbuf, "FID", "IID");
+  sprintf(g_textbuf, "%%%us %%%us       PEDSEX       SNPSEX       STATUS%s%s\n", plink_maxfid, plink_maxiid, yonly? "" : "            F", check_y? "   YCOUNT" : "");
+  fprintf(outfile, g_textbuf, "FID", "IID");
   sample_uidx = 0;
   if (do_impute) {
-    bitfield_andnot(sex_nm, sample_exclude, unfiltered_sample_ctl);
+    bitvec_andnot(sample_exclude, unfiltered_sample_ctl, sex_nm);
   }
   for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, sample_uidx++) {
     next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
     fid_ptr = &(sample_ids[sample_uidx * max_sample_id_len]);
     iid_ptr = (char*)memchr(fid_ptr, '\t', max_sample_id_len);
-    wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(iid_ptr - fid_ptr), fid_ptr, tbuf);
+    wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(iid_ptr - fid_ptr), fid_ptr, g_textbuf);
     *wptr++ = ' ';
     wptr = fw_strcpy(plink_maxiid, &(iid_ptr[1]), wptr);
     if (!IS_SET(sex_nm, sample_uidx)) {
@@ -3177,14 +3161,14 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 	  wptr = memcpya(wptr, "      PROBLEM ", 14);
 	  problem_ct++;
         }
-        wptr = double_g_writewx4(wptr, dff, 12);
+        wptr = dtoa_g_wxp4(dff, 12, wptr);
       } else {
         wptr = memcpya(wptr, "0      PROBLEM          nan", 27);
         problem_ct++;
       }
       if (check_y) {
 	*wptr++ = ' ';
-	wptr = uint32_writew8(wptr, ytotal - ymiss_cts[sample_idx]);
+	wptr = uint32toa_w8(ytotal - ymiss_cts[sample_idx], wptr);
       }
     } else {
       if (ymiss_cts[sample_idx] + min_m_yobs <= ytotal) {
@@ -3199,22 +3183,22 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 	wptr = memcpya(wptr, "      PROBLEM ", 14);
 	problem_ct++;
       }
-      wptr = uint32_writew8(wptr, ytotal - ymiss_cts[sample_idx]);
+      wptr = uint32toa_w8(ytotal - ymiss_cts[sample_idx], wptr);
     }
     *wptr++ = '\n';
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto sexcheck_ret_WRITE_FAIL;
     }
     if (do_impute) {
       if (imputed_sex_code) {
-	SET_BIT(sex_nm, sample_uidx);
+	SET_BIT(sample_uidx, sex_nm);
 	if (imputed_sex_code == 1) {
-	  SET_BIT(sex_male, sample_uidx);
+	  SET_BIT(sample_uidx, sex_male);
 	} else {
-	  CLEAR_BIT(sex_male, sample_uidx);
+	  CLEAR_BIT(sample_uidx, sex_male);
 	}
       } else {
-	CLEAR_BIT(sex_nm, sample_uidx);
+	CLEAR_BIT(sample_uidx, sex_nm);
       }
     }
   }
@@ -3222,7 +3206,7 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
     goto sexcheck_ret_WRITE_FAIL;
   }
   if (do_impute) {
-    bitfield_and(sex_male, sex_nm, unfiltered_sample_ctl);
+    bitvec_and(sex_nm, unfiltered_sample_ctl, sex_male);
     gender_unk_ct = sample_ct - popcount_longs(sex_nm, unfiltered_sample_ctl);
     if (!gender_unk_ct) {
       LOGPREPRINTFWW("--impute-sex: %" PRIuPTR " Xchr and %" PRIuPTR " Ychr variant(s) scanned, all sexes imputed. Report written to %s .\n", x_variant_ct, ytotal, outname);
@@ -3257,7 +3241,7 @@ int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
@@ -3280,7 +3264,7 @@ int32_t write_snplist(char* outname, char* outname_end, uintptr_t unfiltered_mar
   } else {
     memcpy(outname_end, ".indel", 7);
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_snplist_ret_OPEN_FAIL;
   }
   if (!list_23_indels) {
@@ -3345,7 +3329,7 @@ int32_t write_var_ranges(char* outname, char* outname_end, uintptr_t unfiltered_
     goto write_var_ranges_ret_INVALID_CMDLINE;
   }
   memcpy(outname_end, ".var.ranges", 12);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto write_var_ranges_ret_OPEN_FAIL;
   }
   if (fputs_checked("FIRST\tLAST\n", outfile)) {
@@ -3384,11 +3368,11 @@ int32_t write_var_ranges(char* outname, char* outname_end, uintptr_t unfiltered_
 }
 
 int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_modifier, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
-  uint32_t* uidx_list_end = (uint32_t*)(&(wkspace_base[wkspace_left]));
-  uint32_t* group_list_start = (uint32_t*)wkspace_base;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
+  uint32_t* uidx_list_end = (uint32_t*)g_bigstack_end;
+  uint32_t* group_list_start = (uint32_t*)g_bigstack_base;
   uint32_t* group_write = group_list_start;
   uint32_t require_same_ref = dupvar_modifier & DUPVAR_REF;
   uint32_t ids_only = dupvar_modifier & DUPVAR_IDS_ONLY;
@@ -3428,7 +3412,7 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
   uint32_t ujj;
   uidx_list_end--;
   memcpy(outname_end, ".dupvar", 8);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto list_duplicate_vars_ret_OPEN_FAIL;
   }
   if (!ids_only) {
@@ -3436,20 +3420,20 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
       goto list_duplicate_vars_ret_WRITE_FAIL;
     }
   }
-  max_batch_size = wkspace_left / (5 * sizeof(int32_t));
+  max_batch_size = bigstack_left() / (5 * sizeof(int32_t));
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
     chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
     marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx], chrom_end);
     if (marker_uidx == chrom_end) {
       continue;
     }
-    wptr_start = chrom_name_write(tbuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]);
+    wptr_start = chrom_name_write(chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx], g_textbuf);
     *wptr_start++ = '\t';
     last_pos = marker_pos[marker_uidx];
     while (1) {
       last_uidx = marker_uidx;
       marker_uidx++;
-      next_unset_ck(marker_exclude, &marker_uidx, chrom_end);
+      next_unset_ck(marker_exclude, chrom_end, &marker_uidx);
       if (marker_uidx == chrom_end) {
 	break;
       }
@@ -3464,7 +3448,7 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
 	  }
 	  *(--uidx_list) = marker_uidx;
 	  marker_uidx++;
-	  next_unset_ck(marker_exclude, &marker_uidx, chrom_end);
+	  next_unset_ck(marker_exclude, chrom_end, &marker_uidx);
 	  if (marker_uidx == chrom_end) {
 	    break;
 	  }
@@ -3553,8 +3537,8 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
 	  if (!ids_only) {
 	    read_uiptr = group_write;
 	    for (group_idx = 0; group_idx < duplicate_group_ct; group_idx++) {
-	      wptr = uint32_writex(wptr_start, last_pos, '\t');
-	      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	      wptr = uint32toa_x(last_pos, '\t', wptr_start);
+	      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 		goto list_duplicate_vars_ret_WRITE_FAIL;
 	      }
 	      uii = *read_uiptr;
@@ -3607,20 +3591,20 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
   }
   if (ids_only && (group_write != group_list_start)) {
     htable_entry_ct = (uintptr_t)(group_write - group_list_start);
-    group_list_start = (uint32_t*)wkspace_alloc(htable_entry_ct * sizeof(int32_t));
-    if (wkspace_alloc_ul_checked(&uniqueness_check_bitfield, unfiltered_marker_ctl * sizeof(intptr_t))) {
+    bigstack_alloc_ui(htable_entry_ct, &group_list_start);
+    if (bigstack_alloc_ul(unfiltered_marker_ctl, &uniqueness_check_bitfield)) {
       goto list_duplicate_vars_ret_NOMEM;
     }
-    fill_all_bits(uniqueness_check_bitfield, unfiltered_marker_ct);
+    fill_all_bits(unfiltered_marker_ct, uniqueness_check_bitfield);
     for (uii = 0; uii < htable_entry_ct; uii++) {
-      clear_bit(uniqueness_check_bitfield, (group_list_start[uii] & 0x7fffffff));
+      clear_bit((group_list_start[uii] & 0x7fffffff), uniqueness_check_bitfield);
     }
-    retval = alloc_and_populate_id_htable(unfiltered_marker_ct, uniqueness_check_bitfield, htable_entry_ct, marker_ids, max_marker_id_len, 0, &reported_id_htable, &reported_id_htable_size);
+    retval = alloc_and_populate_id_htable(unfiltered_marker_ct, uniqueness_check_bitfield, htable_entry_ct, marker_ids, max_marker_id_len, 0, &reported_id_htable_size, &reported_id_htable);
     if (retval) {
       goto list_duplicate_vars_ret_1;
     }
-    bitfield_invert(uniqueness_check_bitfield, unfiltered_marker_ct);
-    bitfield_or(uniqueness_check_bitfield, marker_exclude, unfiltered_marker_ctl);
+    bitarr_invert(unfiltered_marker_ct, uniqueness_check_bitfield);
+    bitvec_or(marker_exclude, unfiltered_marker_ctl, uniqueness_check_bitfield);
     uniqueness_check_ct = marker_ct - htable_entry_ct;
     for (marker_uidx2 = 0, marker_idx = 0; marker_idx < uniqueness_check_ct; marker_uidx2++, marker_idx++) {
       next_unset_ul_unsafe_ck(uniqueness_check_bitfield, &marker_uidx2);
@@ -3662,21 +3646,21 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
     break;
   }
  list_duplicate_vars_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
 
 int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs) {
   // Same F coefficient computation as sexcheck().
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   uintptr_t* loadbuf_f = NULL;
   uintptr_t* founder_vec11 = NULL;
   char* pzwritep = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
-  uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   uintptr_t final_mask = get_final_mask(sample_ct);
   uintptr_t founder_ct = 0;
   uintptr_t monomorphic_ct = 0;
@@ -3715,19 +3699,16 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     logerrprint("Error: --het cannot be used on haploid genomes.\n");
     goto het_report_ret_INVALID_CMDLINE;
   }
-  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + MAXLINELEN) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&het_cts, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&missing_cts, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_d_checked(&nei_offsets, sample_ct * sizeof(double))) {
+  if (bigstack_alloc_uc(PIGZ_BLOCK_SIZE + MAXLINELEN, &overflow_buf) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(sample_ctl2, &loadbuf) ||
+      bigstack_calloc_ui(sample_ct, &het_cts) ||
+      bigstack_calloc_ui(sample_ct, &missing_cts) ||
+      bigstack_calloc_d(sample_ct, &nei_offsets)) {
     goto het_report_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   loadbuf[sample_ctl2 - 1] = 0;
-  fill_uint_zero(het_cts, sample_ct);
-  fill_uint_zero(missing_cts, sample_ct);
-  fill_double_zero(nei_offsets, sample_ct);
   marker_ct -= count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
   if (!marker_ct) {
     goto het_report_ret_INVALID_CMDLINE;
@@ -3738,11 +3719,11 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     // missing/allele counts in that subset.
     founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl);
     if (founder_ct < sample_ct) {
-      if (wkspace_alloc_ul_checked(&founder_vec11, sample_ctl2 * sizeof(intptr_t)) ||
-          wkspace_alloc_ul_checked(&loadbuf_f, sample_ctl2 * sizeof(intptr_t))) {
+      if (bigstack_alloc_ul(sample_ctl2, &founder_vec11) ||
+          bigstack_alloc_ul(sample_ctl2, &loadbuf_f)) {
 	goto het_report_ret_NOMEM;
       }
-      vec_collapse_init_exclude(founder_info, unfiltered_sample_ct, sample_exclude, sample_ct, founder_vec11);
+      quaterarr_collapse_init_exclude(founder_info, unfiltered_sample_ct, sample_exclude, sample_ct, founder_vec11);
       lptr = founder_vec11;
       for (ulii = 0; ulii < sample_ctl2; ulii++) {
 	*lptr = (*lptr) * 3;
@@ -3771,7 +3752,7 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
         goto het_report_ret_READ_FAIL;
       }
     }
-    if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, sample_ct, sample_exclude, final_mask, 0)) {
+    if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, 0, bedfile, loadbuf_raw, loadbuf)) {
       goto het_report_ret_READ_FAIL;
     }
     cur_missing_ct = count_01(loadbuf, sample_ctl2);
@@ -3786,7 +3767,7 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     } else {
       if (founder_vec11) {
         memcpy(loadbuf_f, loadbuf, sample_ctl2 * sizeof(intptr_t));
-        bitfield_and(loadbuf_f, founder_vec11, sample_ctl2);
+        bitvec_and(founder_vec11, sample_ctl2, loadbuf_f);
         f_missing_ct = count_01(loadbuf_f, sample_ctl2);
       } else {
 	f_missing_ct = cur_missing_ct;
@@ -3837,8 +3818,8 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     goto het_report_ret_OPEN_FAIL;
   }
   pzwritep = (char*)overflow_buf;
-  sprintf(tbuf, "%%%us %%%us       O(HOM)       E(HOM)        N(NM)            F\n", plink_maxfid, plink_maxiid);
-  pzwritep += sprintf(pzwritep, tbuf, "FID", "IID");
+  sprintf(g_textbuf, "%%%us %%%us       O(HOM)       E(HOM)        N(NM)            F\n", plink_maxfid, plink_maxiid);
+  pzwritep += sprintf(pzwritep, g_textbuf, "FID", "IID");
   sample_uidx = 0;
   for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, sample_uidx++) {
     next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
@@ -3850,14 +3831,14 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     pzwritep = memseta(pzwritep, 32, 3);
     obs_ct = marker_ct - missing_cts[sample_idx];
     if (obs_ct) {
-      pzwritep = uint32_writew10x(pzwritep, obs_ct - het_cts[sample_idx], ' ');
+      pzwritep = uint32toa_w10x(obs_ct - het_cts[sample_idx], ' ', pzwritep);
       dee = nei_sum - nei_offsets[sample_idx];
-      pzwritep = double_g_writewx4(pzwritep, dee, 12);
+      pzwritep = dtoa_g_wxp4(dee, 12, pzwritep);
       pzwritep = memseta(pzwritep, 32, 3);
-      pzwritep = uint32_writew10x(pzwritep, obs_ct, ' ');
+      pzwritep = uint32toa_w10x(obs_ct, ' ', pzwritep);
       dtot = (double)((int32_t)obs_ct) - dee;
       dff = (dtot - ((double)((int32_t)(het_cts[sample_idx])))) / dtot;
-      pzwritep = double_g_writewx4(pzwritep, dff, 12);
+      pzwritep = dtoa_g_wxp4(dff, 12, pzwritep);
     } else {
       pzwritep = memcpya(pzwritep, "         0            0            0          nan", 49);
     }
@@ -3888,7 +3869,7 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   flex_pzwrite_close_cond(&ps, pzwritep);
   return retval;
 }
@@ -3896,12 +3877,12 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
 int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts) {
   // Math based on VCFtools variant_file::output_weir_and_cockerham_fst();
   // frequency counting logic similar to cmh_assoc().
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   char* wptr_start = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + BITCT - 1) / BITCT;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
   double sum1 = 0.0;
   double sum2 = 0.0;
   double sum3 = 0.0;
@@ -3960,13 +3941,12 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
   if (pheno_c) {
     cluster_ct = 2;
   }
-  if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&cluster_mask, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ui_checked(&sample_to_cluster, unfiltered_sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&cluster_sizes, cluster_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf) ||
+      bigstack_calloc_ul(unfiltered_sample_ctl2, &cluster_mask) ||
+      bigstack_alloc_ui(unfiltered_sample_ct, &sample_to_cluster) ||
+      bigstack_alloc_ui(cluster_ct, &cluster_sizes)) {
     goto fst_report_ret_NOMEM;
   }
-  fill_ulong_zero(cluster_mask, unfiltered_sample_ctl2);
   if (pheno_c) {
     cur_sample_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
     uii = popcount_longs(pheno_c, unfiltered_sample_ctl);
@@ -4004,17 +3984,17 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
       logerrprint("Error: --fst requires at least two nonempty clusters.\n");
       goto fst_report_ret_INVALID_CMDLINE;
     }
-    wkspace_shrink_top(cluster_sizes, cluster_ct * sizeof(int32_t));
+    bigstack_shrink_top(cluster_sizes, cluster_ct * sizeof(int32_t));
   }
-  cluster_ctd = (double)((int32_t)cluster_ct);
+  cluster_ctd = (double)((intptr_t)cluster_ct);
   cluster_ct_recip = 1.0 / cluster_ctd;
   cluster_ctm1_recip = 1.0 / (cluster_ctd - 1.0);
   one_minus_cluster_ct_recip = 1.0 - cluster_ct_recip;
-  if (wkspace_alloc_ui_checked(&cluster_geno_cts, cluster_ct * 3 * sizeof(int32_t))) {
+  if (bigstack_alloc_ui(cluster_ct * 3, &cluster_geno_cts)) {
     goto fst_report_ret_NOMEM;
   }
   memcpy(outname_end, ".fst", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto fst_report_ret_OPEN_FAIL;
   }
   if (fputs_checked("CHR\tSNP\tPOS\tNMISS\tFST\n", outfile)) {
@@ -4042,7 +4022,7 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
 	seek_flag = 1;
 	marker_uidx = next_unset_unsafe(marker_exclude, chrom_end);
       }
-      wptr_start = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
+      wptr_start = chrom_name_write(chrom_info_ptr, chrom_idx, g_textbuf);
       *wptr_start++ = '\t';
     }
     if (seek_flag) {
@@ -4051,7 +4031,7 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
       }
       seek_flag = 0;
     }
-    if (load_raw(bedfile, loadbuf, unfiltered_sample_ct4)) {
+    if (load_raw(unfiltered_sample_ct4, bedfile, loadbuf)) {
       goto fst_report_ret_READ_FAIL;
     }
     fill_uint_zero(cluster_geno_cts, cluster_ct * 3);
@@ -4112,10 +4092,10 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
       sum3 += dyy;
     }
     wptr = strcpyax(wptr_start, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
-    wptr = uint32_writex(wptr, marker_pos[marker_uidx], '\t');
-    wptr = uint32_writex(wptr, cur_sample_ct, '\t');
-    wptr = double_g_writex(wptr, dyy, '\n');
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    wptr = uint32toa_x(marker_pos[marker_uidx], '\t', wptr);
+    wptr = uint32toa_x(cur_sample_ct, '\t', wptr);
+    wptr = dtoa_gx(dyy, '\n', wptr);
+    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
       goto fst_report_ret_WRITE_FAIL;
     }
     if (marker_idx >= loop_end) {
@@ -4164,7 +4144,7 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     retval = RET_INVALID_CMDLINE;
     break;
   }
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   fclose_cond(outfile);
   return retval;
 }
@@ -4172,22 +4152,22 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
 int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, double* set_allele_freqs, uintptr_t sample_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* sex_male, uintptr_t [...]
   // Note that there is a dosage-only implementation of this logic in
   // plink_dosage.c.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
   FILE* outfile = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctl2 = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
   uintptr_t final_mask = get_final_mask(sample_ct);
-  uintptr_t topsize = 0;
   uintptr_t miss_ct = 0;
   uint32_t miss_varid_ct = 0;
   uint32_t miss_allele_ct = 0;
   uintptr_t range_ct = 0;
   uintptr_t range_skip = 0;
   uintptr_t ulii = 0;
-  char* tbuf2 = &(tbuf[MAXLINELEN]);
+  char* tbuf2 = &(g_textbuf[MAXLINELEN]);
   uintptr_t* marker_exclude_main = NULL;
   uintptr_t* sample_include2 = NULL;
   uintptr_t* sample_male_include2 = NULL;
@@ -4259,33 +4239,26 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
   int32_t delta1;
   int32_t delta2;
   int32_t deltam;
-  marker_id_htable = (uint32_t*)top_alloc(&topsize, marker_id_htable_size * sizeof(int32_t));
-  if (!marker_id_htable) {
+  if (bigstack_end_alloc_ui(marker_id_htable_size, &marker_id_htable)) {
     goto score_report_ret_NOMEM;
   }
-  retval = populate_id_htable(unfiltered_marker_ct, marker_exclude_orig, marker_ct, marker_ids, max_marker_id_len, 0, marker_id_htable, marker_id_htable_size);
+  retval = populate_id_htable(unfiltered_marker_ct, marker_exclude_orig, marker_ct, marker_ids, max_marker_id_len, 0, marker_id_htable_size, marker_id_htable);
   if (retval) {
     goto score_report_ret_1;
   }
-  dptr = (double*)top_alloc(&topsize, unfiltered_marker_ct * sizeof(double));
-  if (!dptr) {
+  if (bigstack_end_alloc_d(unfiltered_marker_ct, &dptr) ||
+      bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude) ||
+      bigstack_calloc_ul(unfiltered_marker_ctl, &a2_effect)) {
     goto score_report_ret_NOMEM;
   }
-  wkspace_left -= topsize;
-  if (wkspace_alloc_ul_checked(&marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&a2_effect, unfiltered_marker_ctl * sizeof(intptr_t))) {
-    goto score_report_ret_NOMEM2;
-  }
-  fill_all_bits(marker_exclude, unfiltered_marker_ct);
-  fill_ulong_zero(a2_effect, unfiltered_marker_ctl);
-  loadbuf_size = wkspace_left;
-  wkspace_left += topsize;
+  fill_all_bits(unfiltered_marker_ct, marker_exclude);
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
     goto score_report_ret_NOMEM;
   }
-  loadbuf_c = (char*)wkspace_base;
+  loadbuf_c = (char*)g_bigstack_base;
   retval = open_and_load_to_first_token(&infile, sc_ip->fname, loadbuf_size, '\0', "--score file", loadbuf_c, &bufptr, &line_idx);
   if (retval) {
     goto score_report_ret_1;
@@ -4355,7 +4328,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
       if ((!uii) || (!strcmp(bufptr_arr[allele_idx], marker_allele_ptrs[2 * marker_uidx + 1]))) {
         if (scan_double(bufptr_arr[effect_idx], &(dptr[marker_uidx]))) {
 	  if (!miss_ct) {
-	    if (fopen_checked(&outfile, outname, "w")) {
+	    if (fopen_checked(outname, "w", &outfile)) {
 	      goto score_report_ret_OPEN_FAIL;
 	    }
 	  }
@@ -4371,14 +4344,14 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
             LOGPREPRINTFWW("Error: Duplicate variant '%s' in --score file.\n", bufptr_arr[varid_idx]);
             goto score_report_ret_INVALID_FORMAT_2;
 	  }
-          CLEAR_BIT(marker_exclude, marker_uidx);
+          CLEAR_BIT(marker_uidx, marker_exclude);
 	  if (uii) {
-	    SET_BIT(a2_effect, marker_uidx);
+	    SET_BIT(marker_uidx, a2_effect);
 	  }
 	}
       } else {
 	if (!miss_ct) {
-	  if (fopen_checked(&outfile, outname, "w")) {
+	  if (fopen_checked(outname, "w", &outfile)) {
 	    goto score_report_ret_OPEN_FAIL;
 	  }
 	}
@@ -4398,7 +4371,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
       }
     } else {
       if (!miss_ct) {
-	if (fopen_checked(&outfile, outname, "w")) {
+	if (fopen_checked(outname, "w", &outfile)) {
 	  goto score_report_ret_OPEN_FAIL;
 	}
       }
@@ -4417,7 +4390,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
     }
     if (!(loadbuf_c[loadbuf_size - 1])) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of --score file is pathologically long.\n", line_idx);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --score file is pathologically long.\n", line_idx);
         goto score_report_ret_INVALID_FORMAT_2;
       }
       goto score_report_ret_NOMEM;
@@ -4447,24 +4420,18 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
   LOGPRINTF("--score: %u valid predictor%s loaded.\n", cur_marker_ct, (cur_marker_ct == 1)? "" : "s");
   if (sc_ip->data_fname) {
     effect_sizes_cur = dptr; // not collapsed yet
-    ulii = topsize;
-    dptr = (double*)top_alloc(&topsize, unfiltered_marker_ct * sizeof(double));
-    if (!dptr) {
+    if (bigstack_end_alloc_d(unfiltered_marker_ct, &dptr) ||
+        bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude_main)) {
       goto score_report_ret_NOMEM;
     }
-    wkspace_left -= topsize;
-    if (wkspace_alloc_ul_checked(&marker_exclude_main, unfiltered_marker_ctl * sizeof(intptr_t))) {
-      goto score_report_ret_NOMEM2;
-    }
-    fill_all_bits(marker_exclude_main, unfiltered_marker_ct);
-    wkspace_left += topsize;
-    loadbuf_size = wkspace_left - topsize;
+    fill_all_bits(unfiltered_marker_ct, marker_exclude_main);
+    loadbuf_size = bigstack_left();
     if (loadbuf_size > MAXLINEBUFLEN) {
       loadbuf_size = MAXLINEBUFLEN;
     } else if (loadbuf_size <= MAXLINELEN) {
       goto score_report_ret_NOMEM;
     }
-    loadbuf_c = (char*)wkspace_base;
+    loadbuf_c = (char*)g_bigstack_base;
     retval = open_and_load_to_first_token(&infile, sc_ip->data_fname, loadbuf_size, '\0', "--q-score-range data file", loadbuf_c, &bufptr, &line_idx);
     if (retval) {
       goto score_report_ret_1;
@@ -4501,7 +4468,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
 	      LOGPREPRINTFWW("Error: Duplicate variant '%s' in --q-score-range data file.\n", bufptr_arr[varid_idx]);
 	      goto score_report_ret_INVALID_FORMAT_2;
 	    }
-            CLEAR_BIT(marker_exclude_main, marker_uidx);
+            CLEAR_BIT(marker_uidx, marker_exclude_main);
 	  }
 	} else {
 	  miss_ct++;
@@ -4516,7 +4483,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
       }
       if (!(loadbuf_c[loadbuf_size - 1])) {
 	if (loadbuf_size == MAXLINEBUFLEN) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of --q-score-range data file is pathologically long.\n", line_idx);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --q-score-range data file is pathologically long.\n", line_idx);
 	  goto score_report_ret_INVALID_FORMAT_2;
 	}
 	goto score_report_ret_NOMEM;
@@ -4534,52 +4501,46 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
       logerrprint("Error: No valid entries in --q-score-range data file.\n");
       goto score_report_ret_INVALID_FORMAT;
     }
-    wkspace_left -= topsize;
     qrange_keys = (double*)alloc_and_init_collapsed_arr((char*)dptr, sizeof(double), unfiltered_marker_ct, marker_exclude_main, marker_ct, 0);
-    wkspace_left += topsize;
     if (!qrange_keys) {
       goto score_report_ret_NOMEM;
     }
-    topsize = ulii;
-    wkspace_left -= topsize;
+    bigstack_end_reset(effect_sizes_cur);
     effect_sizes = (double*)alloc_and_init_collapsed_arr((char*)effect_sizes_cur, sizeof(double), unfiltered_marker_ct, marker_exclude_main, marker_ct, 0);
-    wkspace_left += topsize;
     if (!effect_sizes) {
       goto score_report_ret_NOMEM;
     }
-    if (wkspace_alloc_d_checked(&effect_sizes_cur, marker_ct * sizeof(double))) {
+    if (bigstack_alloc_d(marker_ct, &effect_sizes_cur)) {
       goto score_report_ret_NOMEM;
     }
     if (miss_ct) {
       LOGERRPRINTF("Warning: %" PRIuPTR " line%s skipped in --q-score-range data file.\n", miss_ct, (miss_ct == 1)? "" : "s");
     }
     miss_ct = 0;
-    if (fopen_checked(&infile, sc_ip->range_fname, "r")) {
+    if (fopen_checked(sc_ip->range_fname, "r", &infile)) {
       goto score_report_ret_OPEN_FAIL;
     }
     max_rangename_len = (FNAMESIZE - 10) - ((uintptr_t)(outname_end - outname));
-    tbuf[MAXLINELEN - 1] = ' ';
+    g_textbuf[MAXLINELEN - 1] = ' ';
     *outname_end = '.';
   } else {
-    wkspace_left -= topsize;
     effect_sizes = (double*)alloc_and_init_collapsed_arr((char*)dptr, sizeof(double), unfiltered_marker_ct, marker_exclude, cur_marker_ct, 0);
-    wkspace_left += topsize;
     if (!effect_sizes) {
       goto score_report_ret_NOMEM;
     }
   }
-  // topsize = 0;
-  if (wkspace_alloc_d_checked(&score_deltas, sample_ct * sizeof(double)) ||
-      wkspace_alloc_ui_checked(&miss_cts, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_i_checked(&named_allele_ct_deltas, sample_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, sample_ctl2 * sizeof(intptr_t))) {
+  bigstack_end_reset(bigstack_end_mark);
+  if (bigstack_alloc_d(sample_ct, &score_deltas) ||
+      bigstack_alloc_ui(sample_ct, &miss_cts) ||
+      bigstack_alloc_i(sample_ct, &named_allele_ct_deltas) ||
+      bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(sample_ctl2, &loadbuf)) {
     goto score_report_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   loadbuf[sample_ctl2 - 1] = 0;
   // force sample_male_include2 allocation
-  if (alloc_collapsed_haploid_filters(unfiltered_sample_ct, sample_ct, hh_exists | XMHH_EXISTS, 0, sample_exclude, sex_male, &sample_include2, &sample_male_include2)) {
+  if (alloc_collapsed_haploid_filters(sample_exclude, sex_male, unfiltered_sample_ct, sample_ct, hh_exists | XMHH_EXISTS, 0, &sample_include2, &sample_male_include2)) {
     goto score_report_ret_NOMEM;
   }
   missing_pheno_len = strlen(output_missing_pheno);
@@ -4595,7 +4556,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
   score_report_qrange_next:
     while (1) {
       line_idx++;
-      if (!fgets(tbuf, MAXLINELEN, infile)) {
+      if (!fgets(g_textbuf, MAXLINELEN, infile)) {
 	if (fclose_null(&infile)) {
 	  goto score_report_ret_READ_FAIL;
 	}
@@ -4606,14 +4567,14 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
 	}
 	logprint(".\n");
 	LOGPREPRINTFWW("Results written to %s.*.profile.\n", outname);
-	fputs(logbuf, stdout);
+	fputs(g_logbuf, stdout);
 	goto score_report_ret_1;
       }
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of --q-score-range range file is pathologically long.\n", line_idx);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --q-score-range range file is pathologically long.\n", line_idx);
 	goto score_report_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
@@ -4624,20 +4585,20 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
 	continue;
       }
       if (rangename_len > max_rangename_len) {
-	sprintf(logbuf, "Error: Excessively long range name on line %" PRIuPTR " of --q-score-range range\nfile.\n", line_idx);
+	sprintf(g_logbuf, "Error: Excessively long range name on line %" PRIuPTR " of --q-score-range range\nfile.\n", line_idx);
 	goto score_report_ret_INVALID_FORMAT_2;
       }
       bufptr_arr[0] = bufptr;
       break;
     }
-    fill_all_bits(marker_exclude, unfiltered_marker_ct);
+    fill_all_bits(unfiltered_marker_ct, marker_exclude);
     marker_uidx = next_unset_unsafe(marker_exclude_main, 0);
     dptr = effect_sizes_cur;
     for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
       next_unset_ul_unsafe_ck(marker_exclude_main, &marker_uidx);
       dxx = qrange_keys[marker_idx];
       if ((dxx >= lbound) && (dxx <= ubound)) {
-	CLEAR_BIT(marker_exclude, marker_uidx);
+	CLEAR_BIT(marker_uidx, marker_exclude);
 	*dptr++ = effect_sizes[marker_idx];
       }
     }
@@ -4684,7 +4645,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
       ploidy = 2 - is_haploid;
       ploidy_d = (double)((int32_t)ploidy);
     }
-    if (load_and_collapse(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+    if (load_and_collapse(unfiltered_sample_ct, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, loadbuf)) {
       goto score_report_ret_READ_FAIL;
     }
     if (is_haploid && hh_exists) {
@@ -4818,7 +4779,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
   } else {
     memcpy(outname_end, ".profile", 9);
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto score_report_ret_OPEN_FAIL;
   }
   sprintf(tbuf2, "%%%us %%%us  PHENO    CNT   CNT2 %s\n", plink_maxfid, plink_maxiid, report_average? "   SCORE" : "SCORESUM");
@@ -4836,7 +4797,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
 	bufptr = memseta(bufptr, 32, 5);
 	*bufptr++ = '1' + IS_SET(pheno_c, sample_uidx);
       } else {
-	bufptr = width_force(6, bufptr, double_g_write(bufptr, pheno_d[sample_uidx]));
+	bufptr = width_force(6, bufptr, dtoa_g(pheno_d[sample_uidx], bufptr));
       }
     } else {
       bufptr = memcpya(bufptr, missing_pheno_str, missing_pheno_len);
@@ -4844,18 +4805,18 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
     *bufptr++ = ' ';
     ujj = 1 - IS_SET_DBL(sample_male_include2, sample_idx); // female?
     uii = obs_expected + ((int32_t)ujj) * obs_expected_female_delta - miss_cts[sample_idx];
-    bufptr = uint32_writew6x(bufptr, uii, ' ');
+    bufptr = uint32toa_w6x(uii, ' ', bufptr);
     if (mean_impute) {
       uii += miss_cts[sample_idx];
     }
-    bufptr = uint32_writew6x(bufptr, ((int32_t)named_allele_ct_expected) - ujj * named_allele_ct_female_delta + named_allele_ct_deltas[sample_idx], ' ');
+    bufptr = uint32toa_w6x(((int32_t)named_allele_ct_expected) - ujj * named_allele_ct_female_delta + named_allele_ct_deltas[sample_idx], ' ', bufptr);
     dxx = (score_base + ((int32_t)ujj) * female_y_offset + score_deltas[sample_idx]);
     if (fabs(dxx) < SMALL_EPSILON) {
       dxx = 0;
     } else if (report_average) {
       dxx /= ((double)((int32_t)uii));
     }
-    bufptr = width_force(8, bufptr, double_g_write(bufptr, dxx));
+    bufptr = width_force(8, bufptr, dtoa_g(dxx, bufptr));
     *bufptr++ = '\n';
     if (fwrite_checked(tbuf2, bufptr - tbuf2, outfile)) {
       goto score_report_ret_WRITE_FAIL;
@@ -4866,7 +4827,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
   }
   if (marker_exclude_main) {
     LOGPREPRINTFWW("%s written.\n", outname);
-    logstr(logbuf);
+    logstr(g_logbuf);
     goto score_report_qrange_next;
   }
   LOGPRINTFWW("--score: Results written to %s .\n", outname);
@@ -4874,8 +4835,6 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
   score_report_ret_OPEN_FAIL:
     retval = RET_OPEN_FAIL;
     break;
-  score_report_ret_NOMEM2:
-    wkspace_left += topsize;
   score_report_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -4890,7 +4849,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
     retval = RET_INVALID_FORMAT;
     break;
   score_report_ret_MISSING_TOKENS:
-    sprintf(logbuf, "Error: Line %" PRIuPTR " of --score file has fewer tokens than expected.\n", line_idx);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --score file has fewer tokens than expected.\n", line_idx);
   score_report_ret_INVALID_FORMAT_2:
     logerrprintb();
   score_report_ret_INVALID_FORMAT:
@@ -4898,7 +4857,7 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
     break;
   }
  score_report_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(infile);
   fclose_cond(outfile);
   return retval;
@@ -4924,11 +4883,9 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
   if (line_idx_ptr) {
     line_max = *line_max_ptr;
   }
-  if (gzopen_checked(gz_infile_ptr, fname, "rb")) {
-    goto meta_analysis_open_and_read_header_ret_OPEN_FAIL;
-  }
-  if (gzbuffer(*gz_infile_ptr, 131072)) {
-    goto meta_analysis_open_and_read_header_ret_NOMEM;
+  retval = gzopen_read_checked(fname, gz_infile_ptr);
+  if (retval) {
+    goto meta_analysis_open_and_read_header_ret_1;
   }
 
   while (1) {
@@ -4936,12 +4893,12 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
       if (!gzeof(*gz_infile_ptr)) {
 	goto meta_analysis_open_and_read_header_ret_READ_FAIL;
       }
-      sprintf(logbuf, "Error: %s is empty.\n", fname);
+      sprintf(g_logbuf, "Error: %s is empty.\n", fname);
       goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
     }
     if (line_max && (!loadbuf[loadbuf_size - 1])) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
 	goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
       }
       goto meta_analysis_open_and_read_header_ret_NOMEM;
@@ -5022,7 +4979,7 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
     bufptr = skip_initial_spaces(&(bufptr[slen]));
     if (++colnum == 0x8000000) {
       // pathological case
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of %s has too many columns.\n", line_idx, fname);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has too many columns.\n", line_idx, fname);
       goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
     }
   } while (!is_eoln_kns(*bufptr));
@@ -5034,29 +4991,29 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
     *line_idx_ptr = line_idx;
     *line_max_ptr = line_max;
     if (parse_table[0] == 0xffffffffU) {
-      sprintf(logbuf, "Error: No variant ID field found in %s.\n", fname);
+      sprintf(g_logbuf, "Error: No variant ID field found in %s.\n", fname);
       goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
     } else if (parse_table[1] == 0xffffffffU) {
-      sprintf(logbuf, "Error: No effect size field found in %s.\n", fname);
+      sprintf(g_logbuf, "Error: No effect size field found in %s.\n", fname);
       goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
     } else if (parse_table[2] == 0xffffffffU) {
-      sprintf(logbuf, "Error: No standard error field found in %s.\n", fname);
+      sprintf(g_logbuf, "Error: No standard error field found in %s.\n", fname);
       goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
     } else if (weighted_z && (parse_table[3] == 0xffffffffU)) {
-      sprintf(logbuf, "Error: No p-value field found in %s.\n", fname);
+      sprintf(g_logbuf, "Error: No p-value field found in %s.\n", fname);
       goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
     } else if (weighted_z && (parse_table[4] == 0xffffffffU)) {
-      sprintf(logbuf, "Error: No effective sample size field found in %s.\n", fname);
+      sprintf(g_logbuf, "Error: No effective sample size field found in %s.\n", fname);
       goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
     } else if (token_ct > 5) {
       if (parse_table[5] == 0xffffffffU) {
-	sprintf(logbuf, "Error: No CHR field found in %s.\n", fname);
+	sprintf(g_logbuf, "Error: No CHR field found in %s.\n", fname);
 	goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
       } else if (parse_table[6] == 0xffffffffU) {
-	sprintf(logbuf, "Error: No POS field found in %s.\n", fname);
+	sprintf(g_logbuf, "Error: No POS field found in %s.\n", fname);
 	goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
       } else if ((token_ct > 7) && (parse_table[7] == 0xffffffffU)) {
-	sprintf(logbuf, "Error: No A1 allele field found in %s.\n", fname);
+	sprintf(g_logbuf, "Error: No A1 allele field found in %s.\n", fname);
 	goto meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW;
       }
     }
@@ -5090,21 +5047,19 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
   meta_analysis_open_and_read_header_ret_NOMEM:
     retval = RET_NOMEM;
     break;
-  meta_analysis_open_and_read_header_ret_OPEN_FAIL:
-    retval = RET_OPEN_FAIL;
-    break;
   meta_analysis_open_and_read_header_ret_READ_FAIL:
     retval = RET_READ_FAIL;
     break;
   meta_analysis_open_and_read_header_ret_DUPLICATE_HEADER_COL:
     bufptr[slen] = '\0';
-    sprintf(logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, fname);
+    sprintf(g_logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, fname);
   meta_analysis_open_and_read_header_ret_INVALID_FORMAT_WW:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logerrprintb();
     retval = RET_INVALID_FORMAT;
     break;
   }
+ meta_analysis_open_and_read_header_ret_1:
   return retval;
 }
 
@@ -5118,12 +5073,31 @@ uint32_t meta_analysis_allelic_match(const char* existing_a1ptr, char** token_pt
   return ((!(token_ct & 1)) || (!memcmp(&(existing_a1ptr[a1lenp1]), token_ptrs[8], a2lenp1)));
 }
 
+static inline char* uint32_encode_5_hi_uchar(uint32_t uii, char* start) { // writes uii to start[0..4], most significant bits first; returns start + 5
+  // tried a few bit hacks here, but turns out nothing really beats this
+  *start++ = (unsigned char)((uii >> 28) | 0x80); // bits 28..31 of uii, with bit 7 of the byte forced on
+  *start++ = (unsigned char)((uii >> 21) | 0x80); // cast keeps the low 8 bits: bits 21..27 of uii plus the forced bit 7
+  *start++ = (unsigned char)((uii >> 14) | 0x80); // bits 14..20
+  *start++ = (unsigned char)((uii >> 7) | 0x80); // bits 7..13
+  *start++ = (unsigned char)(uii | 0x80); // bits 0..6
+  return start; // every byte written is >= 0x80, so none of them is '\0'
+}
+
+static inline uint32_t uint32_decode_5_hi_uchar(const char* start) { // reads start[0..4] and rebuilds the 32-bit value, first byte most significant
+  uint32_t uii = ((uint32_t)((unsigned char)(*start++))) << 28; // no 0x7f mask needed: the 32-bit shift by 28 keeps only the byte's low 4 bits
+  uii |= (((uint32_t)((unsigned char)(*start++))) & 0x7f) << 21; // mask off bit 7, then place the 7 payload bits at 21..27
+  uii |= (((uint32_t)((unsigned char)(*start++))) & 0x7f) << 14; // payload bits 14..20
+  uii |= (((uint32_t)((unsigned char)(*start++))) & 0x7f) << 7; // payload bits 7..13
+  uii |= ((uint32_t)((unsigned char)(*start))) & 0x7f; // payload bits 0..6; start is not advanced past the last byte
+  return uii;
+}
+
 int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1field_search_order, char* a2field_search_order, char* pfield_search_order, char* essfield_search_order, uint32_t flags, char* extractname, char* outname, char* outname_end, double output_min_p, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   gzFile gz_infile = NULL;
   FILE* infile = NULL;
   FILE* outfile = NULL;
-  char* loadbuf_end = (char*)(&(wkspace_base[wkspace_left]));
   char* cur_window_marker_ids = NULL;
   char* sorted_extract_ids = NULL;
   uintptr_t* duplicate_id_bitfield = NULL;
@@ -5177,10 +5151,6 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
   char* token_ptrs[9];
   uint32_t col_skips[9];
   uint32_t col_sequence[9];
-
-  // always initialized when allocating space for master variant list
-  uintptr_t topsize;
-
   uintptr_t loadbuf_size;
   uintptr_t max_var_id_len_p5;
   uintptr_t line_idx;
@@ -5198,7 +5168,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
   Ll_str* ll_ptr;
   Ll_str* htable_write;
   Ll_str* duplicate_id_htable_write;
-  unsigned char* wkspace_mark2;
+  unsigned char* bigstack_mark2;
   char* sorted_header_dict;
   char* master_var_list;
   char* cur_entry_list_window;
@@ -5297,8 +5267,8 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
       parse_max = 7;
     }
   }
-  if (wkspace_alloc_c_checked(&sorted_header_dict, header_dict_ct * max_header_len) ||
-      wkspace_alloc_ui_checked(&header_id_map, header_dict_ct * sizeof(int32_t))) {
+  if (bigstack_alloc_c(header_dict_ct * max_header_len, &sorted_header_dict) ||
+      bigstack_alloc_ui(header_dict_ct, &header_id_map)) {
     goto meta_analysis_ret_NOMEM;
   }
   ulii = 0; // write position
@@ -5396,10 +5366,10 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 
   // 2. If --extract specified, load and sort permitted variant list.
   if (extractname) {
-    if (fopen_checked(&infile, extractname, "rb")) {
+    if (fopen_checked(extractname, FOPEN_RB, &infile)) {
       goto meta_analysis_ret_OPEN_FAIL;
     }
-    retval = scan_token_ct_len(infile, tbuf, MAXLINELEN, &extract_ct, &max_extract_id_len);
+    retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &extract_ct, &max_extract_id_len);
     if (retval) {
       goto meta_analysis_ret_1;
     }
@@ -5411,7 +5381,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
       logerrprint("Error: --extract IDs are limited to " MAX_ID_LEN_STR " characters.\n");
       goto meta_analysis_ret_INVALID_FORMAT;
     }
-    if (wkspace_alloc_c_checked(&sorted_extract_ids, extract_ct * max_extract_id_len)) {
+    if (bigstack_alloc_c(extract_ct * max_extract_id_len, &sorted_extract_ids)) {
       goto meta_analysis_ret_NOMEM;
     }
     rewind(infile);
@@ -5421,7 +5391,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
     // revisiting this decision in the future, though, since there are
     // reasonable use cases involving 40-80 million line --extract files, and
     // skipping the sort step there is a big win.
-    retval = read_tokens(infile, tbuf, MAXLINELEN, extract_ct, max_extract_id_len, sorted_extract_ids);
+    retval = read_tokens(MAXLINELEN, extract_ct, max_extract_id_len, infile, g_textbuf, sorted_extract_ids);
     if (retval) {
       goto meta_analysis_ret_1;
     }
@@ -5432,14 +5402,14 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
     ulii = collapse_duplicate_ids(sorted_extract_ids, extract_ct, max_extract_id_len, NULL);
     if (ulii < extract_ct) {
       extract_ct = ulii;
-      wkspace_shrink_top(sorted_extract_ids, extract_ct * max_extract_id_len);
+      bigstack_shrink_top(sorted_extract_ids, extract_ct * max_extract_id_len);
     }
-    extract_ctl = (extract_ct + BITCT - 1) / BITCT;
-    if (wkspace_alloc_ul_checked(&duplicate_id_bitfield, extract_ctl * sizeof(intptr_t))) {
+    extract_ctl = BITCT_TO_WORDCT(extract_ct);
+    if (bigstack_alloc_ul(extract_ctl, &duplicate_id_bitfield)) {
       goto meta_analysis_ret_NOMEM;
     }
   } else {
-    duplicate_id_htable = (Ll_str**)wkspace_alloc(HASHMEM);
+    duplicate_id_htable = (Ll_str**)bigstack_alloc(HASHMEM);
   }
 
   // 3. Allocate space for initial hash table.
@@ -5451,8 +5421,8 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
   //                        need to widen chromosome byte later
   // [W+5M+1]: null-terminated variant ID.  Followed by null-terminated A1/A2
   //           if 'no-allele' not specified
-  wkspace_mark2 = wkspace_base;
-  htable = (Ll_str**)wkspace_alloc(HASHMEM);
+  bigstack_mark2 = g_bigstack_base;
+  htable = (Ll_str**)bigstack_alloc(HASHMEM);
   if (!htable) {
     goto meta_analysis_ret_NOMEM;
   }
@@ -5479,9 +5449,8 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
     slen_base += 5;
   }
   fname_ptr = input_fnames;
-  loadbuf_end = (char*)(&(wkspace_base[wkspace_left]));
-  htable_write = (Ll_str*)wkspace_base;
-  loadbuf_end[-1] = ' ';
+  htable_write = (Ll_str*)g_bigstack_base;
+  bigstack_end_mark[-1] = ' ';
   for (file_idx = 0; file_idx < file_ct; file_idx++) {
     if (sorted_extract_ids) {
       fill_ulong_zero(duplicate_id_bitfield, extract_ctl);
@@ -5492,13 +5461,13 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
     }
     fname_len = strlen(fname_ptr);
     // prevent overlap between loadbuf and new hash table entries.
-    loadbuf_size = (((uintptr_t)(loadbuf_end - ((char*)htable_write))) / 4);
+    loadbuf_size = (((uintptr_t)(bigstack_end_mark - ((unsigned char*)htable_write))) / 4);
     if (loadbuf_size > MAXLINEBUFLEN) {
       loadbuf_size = MAXLINEBUFLEN;
     } else if (loadbuf_size <= MAXLINELEN) {
       goto meta_analysis_ret_NOMEM;
     }
-    loadbuf = &(loadbuf_end[-((intptr_t)loadbuf_size)]);
+    loadbuf = (char*)(&(bigstack_end_mark[-((intptr_t)loadbuf_size)]));
     duplicate_id_htable_write = (Ll_str*)loadbuf;
     htable_write_limit = ((uintptr_t)loadbuf) - loadbuf_size - 16;
     token_ct = parse_max;
@@ -5516,7 +5485,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
       }
       if (!loadbuf[loadbuf_size - 1]) {
 	if (loadbuf_size == MAXLINEBUFLEN) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname_ptr);
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname_ptr);
 	  goto meta_analysis_ret_INVALID_FORMAT_WW;
 	}
 	goto meta_analysis_ret_NOMEM;
@@ -5551,7 +5520,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
       bufptr = token_ptrs[0];
       var_id_len = strlen_se(bufptr);
       if (var_id_len > MAX_ID_LEN) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of %s has an excessively long variant ID.\n", line_idx, fname_ptr);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has an excessively long variant ID.\n", line_idx, fname_ptr);
 	goto meta_analysis_ret_INVALID_FORMAT_WW;
       }
       bufptr[var_id_len] = '\0';
@@ -5566,7 +5535,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	  problem_mask = 0x200;
 	  goto meta_analysis_report_error;
 	}
-	set_bit(duplicate_id_bitfield, ii);
+	set_bit(ii, duplicate_id_bitfield);
       } else {
 	ll_pptr = &(duplicate_id_htable[uii]);
 	while (1) {
@@ -5582,7 +5551,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	}
 	// word-align for now
 	// note that it is NOT safe to use uii here.
-	ulii = sizeof(intptr_t) + ((var_id_len + BYTECT - 1) & (~(BYTECT - 1)));
+	ulii = sizeof(intptr_t) + round_up_pow2(var_id_len, BYTECT);
 	if (((uintptr_t)htable_write) + ulii > ((uintptr_t)duplicate_id_htable_write)) {
 	  goto meta_analysis_ret_NOMEM;
 	}
@@ -5685,7 +5654,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	if (report_all) {
 	  final_variant_ct++;
 	}
-	htable_write = (Ll_str*)((((uintptr_t)wptr) + sizeof(uintptr_t) - 1) & (~(sizeof(uintptr_t) - ONELU)));
+	htable_write = (Ll_str*)round_up_pow2((uintptr_t)wptr, sizeof(uintptr_t));
 	if ((((uintptr_t)htable_write) > ((uintptr_t)duplicate_id_htable_write)) || (((uintptr_t)htable_write) > htable_write_limit)) {
 	  goto meta_analysis_ret_NOMEM;
 	}
@@ -5710,15 +5679,15 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	  }
 	  if (!outfile) {
 	    memcpy(outname_end, ".prob", 6);
-	    if (fopen_checked(&outfile, outname, "w")) {
+	    if (fopen_checked(outname, "w", &outfile)) {
 	      goto meta_analysis_ret_OPEN_FAIL;
 	    }
 	  }
-	  bufptr = memcpyax(tbuf, fname_ptr, fname_len, '\t');
+	  bufptr = memcpyax(g_textbuf, fname_ptr, fname_len, '\t');
 	  bufptr = memcpyax(bufptr, token_ptrs[0], var_id_len - 1, '\t');
 	  do {
 	    wptr = strcpyax(bufptr, problem_strings[__builtin_ctz(problem_mask)], '\n');
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	      goto meta_analysis_ret_WRITE_FAIL;
 	    }
 	    problem_mask &= problem_mask - 1;
@@ -5764,17 +5733,17 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
   }
   // bp coordinate, if present, expands from 4 to 5 bytes
   master_var_entry_len = slen_base + use_map + max_var_id_len_p1 + combined_allele_len_byte_width;
-  loadbuf_size = (line_max + 15) & (~15);
-  loadbuf = &(loadbuf_end[-((intptr_t)loadbuf_size)]);
-  topsize = loadbuf_size + ((final_variant_ct * master_var_entry_len + 15) & (~(15 * ONELU)));
-  if ((uintptr_t)(loadbuf_end - ((char*)htable_write)) < topsize) {
+  loadbuf_size = round_up_pow2(line_max, END_ALLOC_CHUNK);
+  loadbuf = (char*)bigstack_end_alloc_presized(loadbuf_size);
+  if ((!loadbuf) ||
+      bigstack_end_alloc_c(final_variant_ct * master_var_entry_len, &master_var_list) ||
+      (((uintptr_t)htable_write) > ((uintptr_t)master_var_list))) {
     goto meta_analysis_ret_NOMEM;
   }
-  master_var_list = &(loadbuf_end[-((intptr_t)topsize)]);
   // instead of following hash table pointers, we just plow through the table
   // entries in the order they were allocated in; this lets us access memory
   // sequentially
-  ll_ptr = (Ll_str*)wkspace_base;
+  ll_ptr = (Ll_str*)g_bigstack_base;
   for (master_var_idx = 0; master_var_idx < final_variant_ct;) {
     cur_file_ct_m1 = 0; // clear high bits
     memcpy(&cur_file_ct_m1, ll_ptr->ss, file_ct_byte_width);
@@ -5784,7 +5753,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
       if (use_map) {
 	*wptr++ = ll_ptr->ss[file_ct_byte_width];
 	memcpy(&uii, &(ll_ptr->ss[file_ct_byte_width + 1]), 4);
-	wptr = uint32_encode_5_hi_uchar(wptr, uii);
+	wptr = uint32_encode_5_hi_uchar(uii, wptr);
       }
       bufptr = &(ll_ptr->ss[slen_base]);
       slen = strlen(bufptr) + 1;
@@ -5809,21 +5778,21 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
     // now bufptr points to the byte past the end of the hash table entry
     // allocation, and we know the next allocation starts at [this byte,
     // rounded up to nearest word boundary]
-    ll_ptr = (Ll_str*)((((uintptr_t)bufptr) + sizeof(uintptr_t) - 1) & (~(sizeof(uintptr_t) - ONELU)));
+    ll_ptr = (Ll_str*)round_up_pow2((uintptr_t)bufptr, sizeof(intptr_t));
   }
   qsort(master_var_list, final_variant_ct, master_var_entry_len, strcmp_natural);
   // don't need htable anymore
-  wkspace_reset(wkspace_mark2);
+  bigstack_reset(bigstack_mark2);
   if (!sorted_extract_ids) {
-    wkspace_alloc(duplicate_id_htable_max_alloc);
+    bigstack_alloc(duplicate_id_htable_max_alloc);
   }
-  total_data_slots = (wkspace_left - topsize) / sizeof(uintptr_t);
+  total_data_slots = bigstack_left() / sizeof(uintptr_t);
 
   // 6. Remaining load passes: determine how many remaining variants' worth of
   //    effect sizes/SEs/Ps/ESSes fit in memory, load and meta-analyze just
   //    those variants, rinse and repeat.
   memcpy(outname_end, ".meta", 6);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto meta_analysis_ret_OPEN_FAIL;
   }
   if (use_map) {
@@ -5839,18 +5808,18 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
   }
   if (report_study_specific) {
     for (file_idx = 0; file_idx < file_ct; file_idx++) {
-      tbuf[0] = ' ';
-      tbuf[1] = 'F';
-      wptr = uint32_write(&(tbuf[2]), file_idx);
-      wptr = width_force(8, tbuf, wptr);
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      g_textbuf[0] = ' ';
+      g_textbuf[1] = 'F';
+      wptr = uint32toa(file_idx, &(g_textbuf[2]));
+      wptr = width_force(8, g_textbuf, wptr);
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto meta_analysis_ret_WRITE_FAIL;
       }
     }
   }
   putc('\n', outfile);
 
-  cur_data_index = (uintptr_t*)wkspace_base;
+  cur_data_index = (uintptr_t*)g_bigstack_base;
   if (use_map) {
     // chr/bp values can be discordant; when they are, we can't directly search
     // master_var_list for variant IDs.  Instead, we populate
@@ -5951,7 +5920,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	  duplicate_id_htable[uii] = NULL;
 	}
       }
-      duplicate_id_htable_write = (Ll_str*)wkspace_mark2;
+      duplicate_id_htable_write = (Ll_str*)bigstack_mark2;
       fname_len = strlen(fname_ptr);
       token_ct = parse_max;
       retval = meta_analysis_open_and_read_header(fname_ptr, loadbuf, loadbuf_size, sorted_header_dict, header_id_map, header_dict_ct, max_header_len, weighted_z, &token_ct, &gz_infile, col_skips, col_sequence, NULL, NULL);
@@ -5993,7 +5962,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	  if (is_set(duplicate_id_bitfield, ii)) {
 	    continue;
 	  }
-	  set_bit(duplicate_id_bitfield, ii);
+	  set_bit(ii, duplicate_id_bitfield);
 	} else {
 	  uii = hashval2(bufptr, var_id_len);
 	  ll_pptr = &(duplicate_id_htable[uii]);
@@ -6108,7 +6077,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	  }
 	}
 	if (report_study_specific) {
-          set_bit((uintptr_t*)cur_data_ptr, file_idx);
+          set_bit(file_idx, (uintptr_t*)cur_data_ptr);
 	}
 	if (weighted_z) {
 	  dxx = ltqnorm(1.0 - cur_p * 0.5) * sqrt(cur_ess);
@@ -6135,20 +6104,20 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
       cur_data_ptr = (double*)cur_data_index[2 * cur_var_idx];
       cur_file_ct = cur_data_index[2 * cur_var_idx + 1];
       bufptr = &(cur_entry_list_window[cur_var_idx * master_var_entry_len]);
-      wptr = tbuf;
+      wptr = g_textbuf;
       if (use_map) {
 	cur_chrom = (uint32_t)((unsigned char)(*bufptr++));
-	wptr = width_force(4, wptr, chrom_name_write(wptr, chrom_info_ptr, cur_chrom));
+	wptr = width_force(4, wptr, chrom_name_write(chrom_info_ptr, cur_chrom, wptr));
 	wptr = memseta(wptr, 32, 2);
 	cur_bp = uint32_decode_5_hi_uchar(bufptr);
 	bufptr = &(bufptr[5]);
-	wptr = uint32_writew10(wptr, cur_bp);
+	wptr = uint32toa_w10(cur_bp, wptr);
       }
       *wptr++ = ' ';
       var_id_len = strlen(bufptr);
       // bleah, this column width was not adaptive
       wptr = fw_strcpyn(14, var_id_len, bufptr, wptr);
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto meta_analysis_ret_WRITE_FAIL;
       }
       if (!no_allele) {
@@ -6188,9 +6157,9 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	  fputs("   ?", outfile);
 	}
       }
-      tbuf[0] = ' ';
-      wptr = &(tbuf[1]);
-      wptr = width_force(3, wptr, uint32_write(wptr, cur_file_ct));
+      g_textbuf[0] = ' ';
+      wptr = &(g_textbuf[1]);
+      wptr = width_force(3, wptr, uint32toa(cur_file_ct, wptr));
       if (cur_file_ct >= 2) {
 	// and here's the actual computation.
 	numer = 0.0;
@@ -6249,31 +6218,31 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	}
 	*wptr++ = ' ';
         if (p1 >= 0.0) {
-	  wptr = double_g_writewx4x(wptr, MAXV(p1, output_min_p), 11, ' ');
+	  wptr = dtoa_g_wxp4x(MAXV(p1, output_min_p), 11, ' ', wptr);
 	} else {
 	  wptr = memcpya(wptr, "         NA ", 12);
 	}
 	if (pr >= 0.0) {
-	  wptr = double_g_writewx4x(wptr, MAXV(pr, output_min_p), 11, ' ');
+	  wptr = dtoa_g_wxp4x(MAXV(pr, output_min_p), 11, ' ', wptr);
 	} else {
 	  wptr = memcpya(wptr, "         NA ", 12);
 	}
-	wptr = double_f_writew74x(wptr, summ, ' ');
-	wptr = double_f_writew74x(wptr, summ_random, ' ');
+	wptr = dtoa_f_w7p4x(summ, ' ', wptr);
+	wptr = dtoa_f_w7p4x(summ_random, ' ', wptr);
 	if (pq >= 0.0) {
-	  wptr = double_f_writew74x(wptr, MAXV(pq, output_min_p), ' ');
+	  wptr = dtoa_f_w7p4x(MAXV(pq, output_min_p), ' ', wptr);
 	} else {
 	  wptr = memcpya(wptr, "     NA ", 8);
 	}
-	wptr = width_force(7, wptr, double_f_writew2(wptr, meta_i));
+	wptr = width_force(7, wptr, dtoa_f_p2(meta_i, wptr));
 	if (weighted_z) {
 	  numer = cur_data_ptr[-2];
 	  denom2 = cur_data_ptr[-1];
 	  dxx = numer / sqrt(denom2);
 	  *wptr++ = ' ';
-	  wptr = double_g_writewx4x(wptr, dxx, 11, ' ');
+	  wptr = dtoa_g_wxp4x(dxx, 11, ' ', wptr);
 	  dxx = 1.0 - 2 * fabs(normdist(fabs(dxx)) - 0.5);
-	  wptr = double_g_writewx4(wptr, MAXV(dxx, output_min_p), 11);
+	  wptr = dtoa_g_wxp4(MAXV(dxx, output_min_p), 11, wptr);
 	}
       } else {
 	wptr = memcpya(wptr, "          NA          NA      NA      NA      NA      NA", 56);
@@ -6281,7 +6250,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	  wptr = memcpya(wptr, "          NA          NA", 24);
 	}
       }
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, wptr - g_textbuf, outfile)) {
 	goto meta_analysis_ret_WRITE_FAIL;
       }
       if (report_study_specific) {
@@ -6295,8 +6264,8 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
 	      // finish fixing PLINK 1.07 bug
 	      dxx = exp(dxx);
 	    }
-	    double_f_writew74x(&(tbuf[1]), dxx, '\0');
-	    fputs(tbuf, outfile);
+	    dtoa_f_w7p4x(dxx, '\0', &(g_textbuf[1]));
+	    fputs(g_textbuf, outfile);
 	  } else {
 	    fputs("      NA", outfile);
 	  }
@@ -6333,7 +6302,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
     retval = RET_INVALID_CMDLINE;
     break;
   meta_analysis_ret_INVALID_FORMAT_WW:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logerrprintb();
   meta_analysis_ret_INVALID_FORMAT:
     retval = RET_INVALID_FORMAT;
@@ -6343,6 +6312,6 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
   gzclose_cond(gz_infile);
   fclose_cond(infile);
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return retval;
 }
diff --git a/plink_perm.c b/plink_perm.c
index 20320f4..aaad0b4 100644
--- a/plink_perm.c
+++ b/plink_perm.c
@@ -1,5 +1,43 @@
 #include "plink_common.h"
 
+#include "plink_cluster.h"
+
+// Inputs/outputs for multithreaded permutation generators.
+uint32_t g_perm_pheno_nm_ct; // number of phenotype values in each permutation
+uint32_t g_perm_case_ct; // passed as set_ct to generate_cc_perm_vec()/generate_cc_perm1()
+uint32_t g_perm_tot_quotient; // make_perm_pheno() sets this to 2^32 / pheno_nm_ct
+uint64_t g_perm_totq_magic; // this and the next three: magic_num() outputs for g_perm_tot_quotient
+uint32_t g_perm_totq_preshift;
+uint32_t g_perm_totq_postshift;
+uint32_t g_perm_totq_incr;
+uint32_t g_perm_is_1bit; // nonzero: generate_cc*_perm1() layout; zero: generate_cc*_perm_vec() layout
+uint32_t g_perm_generation_thread_ct; // number of workers the permutation range is split across
+uintptr_t g_perm_vec_ct; // number of permutations to generate
+
+uint32_t g_perm_cluster_ct;
+uint32_t* g_perm_cluster_map;
+uint32_t* g_perm_cluster_starts;
+uint32_t* g_perm_cluster_case_cts;
+uintptr_t* g_perm_cluster_cc_preimage;
+uint32_t* g_perm_tot_quotients;
+uint64_t* g_perm_totq_magics;
+uint32_t* g_perm_totq_preshifts;
+uint32_t* g_perm_totq_postshifts;
+uint32_t* g_perm_totq_incrs;
+
+uintptr_t* g_perm_vecs; // output of the case/control generator threads
+
+// always use genotype indexing for QT --assoc
+double* g_perm_vecstd; // sample-major output of the *_smajor_thread generators
+double* g_perm_pheno_d2; // source phenotype values, g_perm_pheno_nm_ct entries
+uint32_t* g_perm_sample_to_cluster;
+uint32_t* g_perm_qt_cluster_thread_wkspace;
+
+// permutation-major instead of sample-major order for --linear (PERMORY
+// speedups do not apply)
+double* g_perm_pmajor;
+uint32_t* g_perm_precomputed_mods; // [n] = 2^32 mod (n+2)
+
 void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
   // Assumes tot_quotient is 2^32 / tot_ct, and
   // totq_magic/totq_preshift/totq_postshift/totq_incr have been precomputed
@@ -12,7 +50,7 @@ void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotien
   uint32_t urand;
   uint32_t uii;
   if (set_ct * 2 < tot_ct) {
-    fill_ulong_zero(perm_vec, 2 * ((tot_ct + (BITCT - 1)) / BITCT));
+    fill_ulong_zero(perm_vec, QUATERCT_TO_ALIGNED_WORDCT(tot_ct));
     for (; num_set < set_ct; num_set++) {
       do {
 	do {
@@ -26,7 +64,7 @@ void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotien
       perm_vec[widx] = pv_val | wcomp;
     }
   } else {
-    fill_vec_55(perm_vec, tot_ct);
+    fill_quatervec_55(tot_ct, perm_vec);
     set_ct = tot_ct - set_ct;
     for (; num_set < set_ct; num_set++) {
       do {
@@ -53,7 +91,7 @@ void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient,
   uint32_t urand;
   uint32_t uii;
   if (set_ct * 2 < tot_ct) {
-    fill_ulong_zero(perm_vec, (tot_ct + (BITCT - 1)) / BITCT);
+    fill_ulong_zero(perm_vec, BITCT_TO_WORDCT(tot_ct));
     for (; num_set < set_ct; num_set++) {
       do {
 	do {
@@ -67,7 +105,7 @@ void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient,
       perm_vec[widx] = pv_val | wcomp;
     }
   } else {
-    fill_all_bits(perm_vec, tot_ct);
+    fill_all_bits(tot_ct, perm_vec);
     set_ct = tot_ct - set_ct;
     for (; num_set < set_ct; num_set++) {
       do {
@@ -85,7 +123,6 @@ void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient,
 }
 
 void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
-  uint32_t tot_ctl2 = 2 * ((tot_ct + (BITCT - 1)) / BITCT);
   uint32_t cluster_idx;
   uint32_t target_ct;
   uint32_t cluster_end;
@@ -102,7 +139,7 @@ void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t
   uintptr_t pv_val;
   uint32_t urand;
   uint32_t uii;
-  memcpy(perm_vec, preimage, tot_ctl2 * sizeof(intptr_t));
+  memcpy(perm_vec, preimage, QUATERCT_TO_ALIGNED_WORDCT(tot_ct) * sizeof(intptr_t));
   for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
     target_ct = cluster_case_cts[cluster_idx];
     cluster_end = cluster_starts[cluster_idx + 1];
@@ -147,7 +184,7 @@ void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t
 }
 
 void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
-  uint32_t tot_ctl = (tot_ct + (BITCT - 1)) / BITCT;
+  uint32_t tot_ctl = BITCT_TO_WORDCT(tot_ct);
   uint32_t cluster_idx;
   uint32_t target_ct;
   uint32_t cluster_end;
@@ -208,6 +245,326 @@ void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cl
   }
 }
 
+THREAD_RET_TYPE generate_cc_perms_thread(void* arg) { // worker: fills this thread's share of g_perm_vecs
+  intptr_t tidx = (intptr_t)arg; // thread index, passed through the void* argument
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uint32_t case_ct = g_perm_case_ct;
+  uint32_t tot_quotient = g_perm_tot_quotient;
+  uint64_t totq_magic = g_perm_totq_magic;
+  uint32_t totq_preshift = g_perm_totq_preshift;
+  uint32_t totq_postshift = g_perm_totq_postshift;
+  uint32_t totq_incr = g_perm_totq_incr;
+  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
+  sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx]; // generator state selected by thread index
+  uintptr_t pheno_nm_ctv = BITCT_TO_WORDCT(pheno_nm_ct); // adjusted below into the per-permutation stride, in words
+  uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_perm_generation_thread_ct; // first permutation owned by this thread
+  uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_perm_generation_thread_ct; // one past the last permutation owned
+  if (!g_perm_is_1bit) {
+    pheno_nm_ctv *= 2; // generate_cc_perm_vec() output spans twice as many words per permutation
+    for (; pidx < pmax; pidx++) {
+      generate_cc_perm_vec(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
+    }
+  } else {
+    // 16-byte alignment currently isn't needed; but it might be useful in the
+    // future, and the cost is low enough that I won't bother with writing the
+    // tiny-bit-more-efficient-half-the-time 8-byte alignment version for now.
+    pheno_nm_ctv = round_up_pow2(pheno_nm_ctv, 2); // stride rounded up to an even word count
+    for (; pidx < pmax; pidx++) {
+      generate_cc_perm1(pheno_nm_ct, case_ct, tot_quotient, totq_magic, totq_preshift, totq_postshift, totq_incr, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
+    }
+  }
+  THREAD_RETURN;
+}
+
+THREAD_RET_TYPE generate_cc_cluster_perms_thread(void* arg) { // worker: cluster-aware variant of generate_cc_perms_thread()
+  intptr_t tidx = (intptr_t)arg; // thread index, passed through the void* argument
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t* __restrict__ perm_vecs = g_perm_vecs;
+  sfmt_t* __restrict__ sfmtp = g_sfmtp_arr[tidx]; // generator state selected by thread index
+  uintptr_t pheno_nm_ctv = BITCT_TO_WORDCT(pheno_nm_ct); // adjusted below into the per-permutation stride, in words
+  uint32_t pidx = (((uint64_t)tidx) * g_perm_vec_ct) / g_perm_generation_thread_ct; // first permutation owned by this thread
+  uint32_t pmax = (((uint64_t)tidx + 1) * g_perm_vec_ct) / g_perm_generation_thread_ct; // one past the last permutation owned
+  uint32_t cluster_ct = g_perm_cluster_ct;
+  uint32_t* cluster_map = g_perm_cluster_map;
+  uint32_t* cluster_starts = g_perm_cluster_starts;
+  uint32_t* cluster_case_cts = g_perm_cluster_case_cts;
+  uintptr_t* perm_cluster_cc_preimage = g_perm_cluster_cc_preimage; // copied into each output vector by the generators
+  uint32_t* tot_quotients = g_perm_tot_quotients;
+  uint64_t* totq_magics = g_perm_totq_magics;
+  uint32_t* totq_preshifts = g_perm_totq_preshifts;
+  uint32_t* totq_postshifts = g_perm_totq_postshifts;
+  uint32_t* totq_incrs = g_perm_totq_incrs;
+  if (!g_perm_is_1bit) {
+    pheno_nm_ctv *= 2; // generate_cc_cluster_perm_vec() output spans twice as many words per permutation
+    for (; pidx < pmax; pidx++) {
+      generate_cc_cluster_perm_vec(pheno_nm_ct, perm_cluster_cc_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
+    }
+  } else {
+    pheno_nm_ctv = round_up_pow2(pheno_nm_ctv, 2); // same even-word stride as generate_cc_perms_thread()
+    for (; pidx < pmax; pidx++) {
+      generate_cc_cluster_perm1(pheno_nm_ct, perm_cluster_cc_preimage, cluster_ct, cluster_map, cluster_starts, cluster_case_cts, tot_quotients, totq_magics, totq_preshifts, totq_postshifts, totq_incrs, &(perm_vecs[pidx * pheno_nm_ctv]), sfmtp);
+    }
+  }
+  THREAD_RETURN;
+}
+
+THREAD_RET_TYPE generate_qt_perms_smajor_thread(void* arg) {
+  // Used by QT --assoc and --make-perm-pheno.
+  //
+  // Takes an array of phenotype values in g_perm_pheno_d2 of length
+  // g_perm_pheno_nm_ct, and populates g_perm_vecstd[] with permutations of
+  // those values.  Also requires g_sfmtp_arr[] and
+  // g_perm_generation_thread_ct to be initialized.
+  //
+  // g_perm_vecstd is sample-major.  The nth permutation is stored across
+  //   g_perm_vecstd[n]
+  //   g_perm_vecstd[n + perm_vec_ctcl8m]
+  //   g_perm_vecstd[n + 2 * perm_vec_ctcl8m]
+  //   ...
+  uintptr_t tidx = (uintptr_t)arg; // thread index, passed through the void* argument
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL; // permutation count in CACHELINE_DBL units, rounded up
+  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL; // row stride of g_perm_vecstd, in doubles
+  double* pheno_d2 = g_perm_pheno_d2;
+  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
+  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_perm_generation_thread_ct); // thread ranges start on CACHELINE_DBL multiples
+  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
+  double* perm_vecstd = &(g_perm_vecstd[pmin]); // column 0 of this thread's slice
+  uint32_t poffset = 0;
+  uint32_t sample_idx = 1;
+  uint32_t pdiff;
+  uint32_t tot_quotient;
+  uint32_t upper_bound;
+  uint64_t totq_magic;
+  uint32_t totq_preshift;
+  uint32_t totq_postshift;
+  uint32_t totq_incr;
+  uint32_t urand;
+  uint32_t uii;
+  double* wptr;
+  double* wptr2;
+  double* wptr3;
+  double cur_source;
+  if (tidx + 1 == g_perm_generation_thread_ct) {
+    pmax = g_perm_vec_ct; // last thread stops at the real permutation count, not the rounded-up one
+  }
+  pdiff = pmax - pmin; // number of permutation columns this thread fills
+  cur_source = *pheno_d2++; // row 0: every permutation starts out holding the first phenotype value
+  wptr = perm_vecstd;
+  for (; poffset < pdiff; poffset++) {
+    *wptr++ = cur_source;
+  }
+  for (; sample_idx < pheno_nm_ct; sample_idx++) {
+    tot_quotient = 0x100000000LLU / (sample_idx + 1); // floor(2^32 / number of rows filled so far, including this one)
+    upper_bound = (sample_idx + 1) * tot_quotient - 1; // draws above this are rejected so all rows are equally likely
+    magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
+    cur_source = *pheno_d2++;
+    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]); // the new row being appended
+    wptr2 = perm_vecstd;
+    for (poffset = 0; poffset < pdiff; poffset++) {
+      do {
+	urand = sfmt_genrand_uint32(sfmtp);
+      } while (urand > upper_bound);
+      uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift; // presumably urand / tot_quotient, i.e. a row in 0..sample_idx -- TODO confirm against magic_num()
+      wptr3 = &(wptr2[uii * perm_vec_ctcl8m]);
+      *wptr++ = *wptr3; // chosen row's value moves into the new row...
+      *wptr3 = cur_source; // ...and the new phenotype value takes its place
+      wptr2++; // next permutation column
+    }
+  }
+  THREAD_RETURN;
+}
+
+THREAD_RET_TYPE generate_qt_cluster_perms_smajor_thread(void* arg) {
+  // Variant of generate_qt_perms_smajor_thread() which restricts permutations
+  // to be within-cluster.
+  // On top of the generate_qt_perms_smajor_thread requirements, this also
+  // needs g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts,
+  // g_perm_qt_cluster_thread_wkspace, and g_perm_sample_to_cluster to be
+  // initialized.
+  uintptr_t tidx = (uintptr_t)arg; // thread index, passed through the void* argument
+  uint32_t pheno_nm_ct = g_perm_pheno_nm_ct;
+  uintptr_t perm_vec_ctcl8 = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL; // permutation count in CACHELINE_DBL units, rounded up
+  uintptr_t perm_vec_ctcl8m = perm_vec_ctcl8 * CACHELINE_DBL; // row stride of g_perm_vecstd, in doubles
+  double* pheno_d2 = g_perm_pheno_d2;
+  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
+  uint32_t pmin = CACHELINE_DBL * ((((uint64_t)tidx) * perm_vec_ctcl8) / g_perm_generation_thread_ct); // thread ranges start on CACHELINE_DBL multiples
+  uint32_t pmax = CACHELINE_DBL * ((((uint64_t)tidx + 1) * perm_vec_ctcl8) / g_perm_generation_thread_ct);
+  double* perm_vecstd = &(g_perm_vecstd[pmin]); // column 0 of this thread's slice
+  uint32_t cluster_ct = g_perm_cluster_ct;
+  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
+  uint32_t* cluster_map = g_perm_cluster_map;
+  uint32_t* cluster_starts = g_perm_cluster_starts;
+  uint32_t* in_cluster_positions = &(g_perm_qt_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]); // this thread's private slice of the workspace
+  uint32_t* sample_to_cluster = g_perm_sample_to_cluster;
+  uint32_t poffset = 0;
+  uint32_t sample_idx = 0;
+  uint32_t* cur_map_start;
+  uint32_t pdiff;
+  uint32_t cluster_idx;
+  uint32_t cur_in_cluster_pos;
+  uint32_t tot_quotient;
+  uint32_t upper_bound;
+  uint64_t totq_magic;
+  uint32_t totq_preshift;
+  uint32_t totq_postshift;
+  uint32_t totq_incr;
+  uint32_t urand;
+  uint32_t uii;
+  double* wptr;
+  double* wptr2;
+  double* wptr3;
+  double cur_source;
+  if (tidx + 1 == g_perm_generation_thread_ct) {
+    pmax = g_perm_vec_ct; // last thread stops at the real permutation count, not the rounded-up one
+  }
+  pdiff = pmax - pmin; // number of permutation columns this thread fills
+  fill_uint_zero(in_cluster_positions, cluster_ct); // no cluster members seen yet
+  for (; sample_idx < pheno_nm_ct; sample_idx++) {
+    cur_source = *pheno_d2++;
+    cluster_idx = sample_to_cluster[sample_idx];
+    if (cluster_idx == 0xffffffffU) {
+      cur_in_cluster_pos = 0; // 0xffffffff entry: handled like a cluster's first member, so the value stays in place
+    } else {
+      cur_in_cluster_pos = in_cluster_positions[cluster_idx]; // how many members of this cluster came before this sample
+      in_cluster_positions[cluster_idx] += 1;
+    }
+    wptr = &(perm_vecstd[sample_idx * perm_vec_ctcl8m]); // this sample's row
+    if (!cur_in_cluster_pos) {
+      for (poffset = 0; poffset < pdiff; poffset++) {
+        *wptr++ = cur_source;
+      }
+    } else {
+      cur_map_start = &(cluster_map[cluster_starts[cluster_idx]]); // row indices of this cluster's members
+      tot_quotient = 0x100000000LLU / (cur_in_cluster_pos + 1); // floor(2^32 / members seen so far, including this one)
+      upper_bound = (cur_in_cluster_pos + 1) * tot_quotient - 1; // draws above this are rejected so all members are equally likely
+      magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
+      wptr2 = perm_vecstd;
+      for (poffset = 0; poffset < pdiff; poffset++) {
+	do {
+	  urand = sfmt_genrand_uint32(sfmtp);
+	} while (urand > upper_bound);
+	uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift; // presumably urand / tot_quotient, i.e. 0..cur_in_cluster_pos -- TODO confirm against magic_num()
+	wptr3 = &(wptr2[cur_map_start[uii] * perm_vec_ctcl8m]); // row of the chosen cluster member
+	*wptr++ = *wptr3; // chosen member's value moves into this sample's row...
+	*wptr3 = cur_source; // ...and this sample's value takes its place
+	wptr2++; // next permutation column
+      }
+    }
+  }
+  THREAD_RETURN;
+}
+
+THREAD_RET_TYPE generate_qt_perms_pmajor_thread(void* arg) {
+  // Used by --linear.  Requires g_perm_pheno_nm_ct, g_perm_pheno_d2,
+  // g_sfmtp_arr, g_perm_generation_thread_ct, and g_perm_vec_ct to be
+  // initialized, and space must be allocated for g_perm_pmajor.  The nth
+  // permutation (0-based) is stored in g_perm_pmajor indices
+  //   [n * sample_valid_ct] to [(n + 1) * sample_valid_ct - 1]
+  // inclusive.
+  uintptr_t tidx = (uintptr_t)arg; // thread index, passed through the void* argument
+  uint32_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32; // permutation count in CACHELINE_INT32 units, rounded up
+  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
+  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_perm_generation_thread_ct); // thread ranges start on CACHELINE_INT32 multiples
+  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_perm_generation_thread_ct);
+  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]); // first permutation owned by this thread
+  double* pheno_d2 = g_perm_pheno_d2;
+  uint32_t* precomputed_mods = g_perm_precomputed_mods; // also read by this function, although not listed above
+  uint32_t* lbound_ptr;
+  double* pheno_ptr;
+  uint32_t poffset;
+  uint32_t pdiff;
+  uint32_t sample_idx;
+  uint32_t urand;
+  uint32_t lbound;
+  if (tidx + 1 == g_perm_generation_thread_ct) {
+    pmax = g_perm_vec_ct; // last thread stops at the real permutation count, not the rounded-up one
+  }
+  pdiff = pmax - pmin; // number of permutations this thread generates
+  for (poffset = 0; poffset < pdiff; poffset++) {
+    lbound_ptr = precomputed_mods; // entry k is consumed together with modulus k + 2 below
+    pheno_ptr = pheno_d2;
+    perm_pmajor[0] = *pheno_ptr++;
+    for (sample_idx = 1; sample_idx < sample_valid_ct; sample_idx++) {
+      lbound = *lbound_ptr++;
+      do {
+        urand = sfmt_genrand_uint32(sfmtp);
+      } while (urand < lbound);
+      // er, this modulus operation is slow.  but doesn't seem to be worthwhile
+      // to use magic numbers here.
+      urand %= sample_idx + 1; // slot in 0..sample_idx
+      perm_pmajor[sample_idx] = perm_pmajor[urand]; // chosen slot's value moves to the new slot...
+      perm_pmajor[urand] = *pheno_ptr++; // ...and the next phenotype value takes its place
+    }
+    perm_pmajor = &(perm_pmajor[sample_valid_ct]); // advance to the next permutation
+  }
+  THREAD_RETURN;
+}
+
+THREAD_RET_TYPE generate_qt_cluster_perms_pmajor_thread(void* arg) {
+  // On top of the generate_qt_perms_pmajor_thread() requirements, this also
+  // needs g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts,
+  // g_perm_qt_cluster_thread_wkspace, and g_perm_sample_to_cluster to be
+  // initialized.
+  uintptr_t tidx = (uintptr_t)arg; // thread index, passed through the void* argument
+  uint32_t sample_valid_ct = g_perm_pheno_nm_ct;
+  uintptr_t perm_vec_ctcl = (g_perm_vec_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32; // permutation count in CACHELINE_INT32 units, rounded up
+  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
+  uintptr_t pmin = CACHELINE_INT32 * ((((uint64_t)tidx) * perm_vec_ctcl) / g_perm_generation_thread_ct); // thread ranges start on CACHELINE_INT32 multiples
+  uintptr_t pmax = CACHELINE_INT32 * ((((uint64_t)tidx + 1) * perm_vec_ctcl) / g_perm_generation_thread_ct);
+  double* perm_pmajor = &(g_perm_pmajor[pmin * sample_valid_ct]); // first permutation owned by this thread
+  double* pheno_d2 = g_perm_pheno_d2;
+  uint32_t* precomputed_mods = &(g_perm_precomputed_mods[-1]); // shifted by one: indexed below by cur_in_cluster_pos, which is >= 1 there
+  uint32_t cluster_ct = g_perm_cluster_ct;
+  uint32_t cluster_ctcl = (cluster_ct + (CACHELINE_INT32 - 1)) / CACHELINE_INT32;
+  uint32_t* cluster_map = g_perm_cluster_map;
+  uint32_t* cluster_starts = g_perm_cluster_starts;
+  uint32_t* in_cluster_positions = &(g_perm_qt_cluster_thread_wkspace[tidx * cluster_ctcl * CACHELINE_INT32]); // this thread's private slice of the workspace
+  uint32_t* sample_to_cluster = g_perm_sample_to_cluster;
+  double* pheno_ptr;
+  uint32_t poffset;
+  uint32_t pdiff;
+  uint32_t cluster_idx;
+  uint32_t cur_in_cluster_pos;
+  uint32_t sample_idx;
+  uint32_t urand;
+  uint32_t lbound;
+  uint32_t uii;
+  if (tidx + 1 == g_perm_generation_thread_ct) {
+    pmax = g_perm_vec_ct; // last thread stops at the real permutation count, not the rounded-up one
+  }
+  pdiff = pmax - pmin; // number of permutations this thread generates
+  for (poffset = 0; poffset < pdiff; poffset++) {
+    fill_uint_zero(in_cluster_positions, cluster_ct); // restart the per-cluster counters for each permutation
+    pheno_ptr = pheno_d2;
+    for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
+      cluster_idx = sample_to_cluster[sample_idx];
+      if (cluster_idx == 0xffffffffU) {
+	cur_in_cluster_pos = 0; // 0xffffffff entry: handled like a cluster's first member, so the value stays in place
+      } else {
+	cur_in_cluster_pos = in_cluster_positions[cluster_idx]; // how many members of this cluster came before this sample
+	in_cluster_positions[cluster_idx] += 1;
+      }
+      if (!cur_in_cluster_pos) {
+        perm_pmajor[sample_idx] = *pheno_ptr++;
+      } else {
+        lbound = precomputed_mods[cur_in_cluster_pos];
+        do {
+	  urand = sfmt_genrand_uint32(sfmtp);
+	} while (urand < lbound);
+	urand %= (cur_in_cluster_pos + 1); // position among the cluster members seen so far, including this one
+	uii = cluster_map[cluster_starts[cluster_idx] + urand]; // slot of the chosen cluster member
+        perm_pmajor[sample_idx] = perm_pmajor[uii]; // chosen member's value moves to this sample's slot...
+	perm_pmajor[uii] = *pheno_ptr++; // ...and this sample's value takes its place
+      }
+    }
+    perm_pmajor = &(perm_pmajor[sample_valid_ct]); // advance to the next permutation
+  }
+  THREAD_RETURN;
+}
+
+
 void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
   // Transpose permutations so PRESTO/PERMORY-style genotype indexing can work.
   //
@@ -224,7 +581,7 @@ void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_
   //   first 4 bytes: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18...
   //   next 4 bytes: 32 40 48...
   uintptr_t sample_idx = 0;
-  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
 #ifdef __LP64__
   uint32_t wbuf[4];
   uint32_t* wbptr;
@@ -254,7 +611,7 @@ void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_
 	}
 	wbptr = wbuf;
       }
-      *wbptr |= ((pvptr[perm_idx * pheno_nm_ctl2] >> rshift) & 1) << wshift;
+      *wbptr |= ((pvptr[perm_idx * pheno_nm_ctv2] >> rshift) & 1) << wshift;
       wbptr++;
     } while (++perm_idx < perm_vec_ct);
     memcpy(perm_vecst, wbuf, 16);
@@ -269,7 +626,7 @@ void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_
 	wval = 0;
 	wshift = 0;
       }
-      wval |= ((pvptr[perm_idx * pheno_nm_ctl2] >> rshift) & 1) << wshift;
+      wval |= ((pvptr[perm_idx * pheno_nm_ctv2] >> rshift) & 1) << wshift;
     } while (++perm_idx < perm_vec_ct);
     *perm_vecst++ = wval;
 #endif
@@ -278,7 +635,7 @@ void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_
 
 void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
   uintptr_t sample_idx = 0;
-  uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
+  uintptr_t pheno_nm_ctv = BITCT_TO_ALIGNED_WORDCT(pheno_nm_ct);
 #ifdef __LP64__
   uint32_t wbuf[4];
   uint32_t* wbptr;
@@ -308,7 +665,7 @@ void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno
 	}
 	wbptr = wbuf;
       }
-      *wbptr |= ((pvptr[perm_idx * pheno_nm_ctl] >> rshift) & 1) << wshift;
+      *wbptr |= ((pvptr[perm_idx * pheno_nm_ctv] >> rshift) & 1) << wshift;
       wbptr++;
     } while (++perm_idx < perm_vec_ct);
     memcpy(perm_vecst, wbuf, 16);
@@ -323,11 +680,179 @@ void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno
 	wval = 0;
 	wshift = 0;
       }
-      wval |= ((pvptr[perm_idx * pheno_nm_ctl] >> rshift) & 1) << wshift;
+      wval |= ((pvptr[perm_idx * pheno_nm_ctv] >> rshift) & 1) << wshift;
     } while (++perm_idx < perm_vec_ct);
     *perm_vecst++ = wval;
 #endif
   }
 }
 
-// todo: add multithread globals with extern linkage
+
+int32_t make_perm_pheno(pthread_t* threads, char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, char* output_missing_pheno, uint32_t permphe_ct) { // writes permphe_ct permuted phenotype columns per sample to <outname>.pphe; returns 0 or a RET_* code
+  unsigned char* bigstack_mark = g_bigstack_base; // everything allocated below is released by bigstack_reset() on exit
+  FILE* outfile = NULL;
+  uintptr_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
+  uintptr_t pheno_nm_ctl = BITCT_TO_WORDCT(pheno_nm_ct);
+  uintptr_t pheno_nm_ctv = round_up_pow2(pheno_nm_ctl, VEC_WORDS); // per-permutation stride of g_perm_vecs, in words
+  uintptr_t perm_vec_ctcl8m = 0;
+  char* writebuf = NULL;
+  int32_t retval = 0;
+  uintptr_t* ulptr;
+  double* dptr;
+  char* wptr;
+  uintptr_t sample_uidx;
+  uintptr_t sample_idx;
+  uintptr_t perm_idx;
+  uintptr_t ulii;
+  uint32_t sample_nmidx;
+  uint32_t rshift;
+  if (!pheno_nm_ct) {
+    logerrprint("Error: --make-perm-pheno requires phenotype data.\n");
+    goto make_perm_pheno_ret_INVALID_CMDLINE;
+  }
+  g_perm_generation_thread_ct = MINV(g_thread_ct, permphe_ct); // never more workers than permutations
+  if (bigstack_init_sfmtp(g_perm_generation_thread_ct)) {
+    goto make_perm_pheno_ret_NOMEM;
+  }
+  g_perm_pheno_nm_ct = pheno_nm_ct;
+  g_perm_vec_ct = permphe_ct;
+  ulii = 0; // thread index 0, handed to the generator calls this thread makes itself below
+  if (pheno_c) {
+    g_perm_is_1bit = 1; // case/control: generators use the generate_cc*_perm1() layout
+    g_perm_case_ct = popcount_longs(pheno_c, unfiltered_sample_ctl);
+    // could seamlessly support multipass by using different permutation logic,
+    // but pointless in practice; better to just generate multiple files
+    if (bigstack_alloc_ul(permphe_ct * pheno_nm_ctv, &g_perm_vecs)) {
+      goto make_perm_pheno_ret_NOMEM;
+    }
+    if (cluster_starts) {
+      // most similar to testmiss()
+      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, pheno_c, pheno_nm_ct, 1, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, &g_perm_cluster_case_cts, &g_perm_cluster_cc_preimage);
+      if (retval) {
+	goto make_perm_pheno_ret_1;
+      }
+      if (!g_perm_cluster_ct) {
+        logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
+        goto make_perm_pheno_ret_INVALID_CMDLINE;
+      }
+      retval = cluster_alloc_and_populate_magic_nums(g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, &g_perm_tot_quotients, &g_perm_totq_magics, &g_perm_totq_preshifts, &g_perm_totq_postshifts, &g_perm_totq_incrs);
+      if (retval) {
+        goto make_perm_pheno_ret_1;
+      }
+      // not actually much of a point to multithreading since this is I/O
+      // bound, but what the hell, the permutation generators already support
+      // it
+      if (spawn_threads(threads, &generate_cc_cluster_perms_thread, g_perm_generation_thread_ct)) {
+	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
+      }
+      generate_cc_cluster_perms_thread((void*)ulii); // this thread does share 0; presumably spawn_threads() starts only workers 1..ct-1 -- TODO confirm
+    } else {
+      g_perm_cluster_starts = NULL;
+      g_perm_tot_quotient = 0x100000000LLU / pheno_nm_ct; // pheno_nm_ct is nonzero here (checked above)
+      magic_num(g_perm_tot_quotient, &g_perm_totq_magic, &g_perm_totq_preshift, &g_perm_totq_postshift, &g_perm_totq_incr);
+      if (spawn_threads(threads, &generate_cc_perms_thread, g_perm_generation_thread_ct)) {
+	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
+      }
+      generate_cc_perms_thread((void*)ulii); // this thread does share 0
+    }
+  } else {
+    g_perm_pheno_d2 = (double*)alloc_and_init_collapsed_arr_incl((char*)pheno_d, sizeof(double), unfiltered_sample_ct, pheno_nm, pheno_nm_ct, 1);
+    if (!g_perm_pheno_d2) {
+      goto make_perm_pheno_ret_NOMEM;
+    }
+    perm_vec_ctcl8m = round_up_pow2(permphe_ct, CACHELINE_DBL); // row stride of g_perm_vecstd, as computed by the smajor generators
+    if (bigstack_alloc_d(perm_vec_ctcl8m * pheno_nm_ct, &g_perm_vecstd)) {
+      goto make_perm_pheno_ret_NOMEM;
+    }
+    if (cluster_starts) {
+      retval = cluster_include_and_reindex(unfiltered_sample_ct, pheno_nm, 1, NULL, pheno_nm_ct, 0, cluster_ct, cluster_map, cluster_starts, &g_perm_cluster_ct, &g_perm_cluster_map, &g_perm_cluster_starts, NULL, NULL);
+      if (retval) {
+	goto make_perm_pheno_ret_1;
+      }
+      if (!g_perm_cluster_ct) {
+        logerrprint("Error: Degenerate --make-perm-pheno invocation (no size 2+ clusters).\n");
+        goto make_perm_pheno_ret_INVALID_CMDLINE;
+      }
+      if (bigstack_alloc_ui(pheno_nm_ct, &g_perm_sample_to_cluster) ||
+          bigstack_alloc_ui(g_perm_generation_thread_ct * round_up_pow2(g_perm_cluster_ct, CACHELINE_INT32), &g_perm_qt_cluster_thread_wkspace)) {
+	goto make_perm_pheno_ret_NOMEM;
+      }
+      fill_unfiltered_sample_to_cluster(pheno_nm_ct, g_perm_cluster_ct, g_perm_cluster_map, g_perm_cluster_starts, g_perm_sample_to_cluster);
+      if (spawn_threads(threads, &generate_qt_cluster_perms_smajor_thread, g_perm_generation_thread_ct)) {
+	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
+      }
+      generate_qt_cluster_perms_smajor_thread((void*)ulii); // this thread does share 0
+    } else {
+      if (spawn_threads(threads, &generate_qt_perms_smajor_thread, g_perm_generation_thread_ct)) {
+	goto make_perm_pheno_ret_THREAD_CREATE_FAIL;
+      }
+      generate_qt_perms_smajor_thread((void*)ulii); // this thread does share 0
+    }
+    if (bigstack_alloc_c(permphe_ct * 16LU, &writebuf)) { // NOTE(review): runs after spawn_threads(); this NOMEM exit skips join_threads() below
+      goto make_perm_pheno_ret_NOMEM;
+    }
+  }
+  join_threads(threads, g_perm_generation_thread_ct);
+  memcpy(outname_end, ".pphe", 6); // 5 characters plus the terminating NUL
+  if (fopen_checked(outname, "w", &outfile)) {
+    goto make_perm_pheno_ret_OPEN_FAIL;
+  }
+  sample_nmidx = 0; // index among samples with a nonmissing phenotype
+  for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
+    next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx); // presumably advances sample_uidx past excluded samples -- TODO confirm
+    fputs(&(sample_ids[sample_uidx * max_sample_id_len]), outfile);
+    if (!IS_SET(pheno_nm, sample_uidx)) {
+      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) { // phenotype missing: every column gets output_missing_pheno
+	putc('\t', outfile);
+	fputs(output_missing_pheno, outfile);
+      }
+    } else if (pheno_c) {
+      ulptr = &(g_perm_vecs[sample_nmidx / BITCT]); // word holding this sample's bit in permutation 0
+      rshift = sample_nmidx % BITCT;
+      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
+	putc('\t', outfile);
+        putc('1' + ((ulptr[perm_idx * pheno_nm_ctv] >> rshift) & 1), outfile); // writes '1' for a clear bit, '2' for a set bit
+      }
+      sample_nmidx++;
+    } else {
+      wptr = writebuf;
+      dptr = &(g_perm_vecstd[sample_nmidx * perm_vec_ctcl8m]); // this sample's row; consecutive entries are consecutive permutations
+      for (perm_idx = 0; perm_idx < permphe_ct; perm_idx++) {
+	*wptr++ = '\t';
+        wptr = dtoa_g(*dptr++, wptr); // writebuf was sized at 16 bytes per permutation, tab included
+      }
+      if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
+	goto make_perm_pheno_ret_WRITE_FAIL;
+      }
+      sample_nmidx++;
+    }
+    if (putc_checked('\n', outfile)) {
+      goto make_perm_pheno_ret_WRITE_FAIL;
+    }
+  }
+  if (fclose_null(&outfile)) {
+    goto make_perm_pheno_ret_WRITE_FAIL;
+  }
+  LOGPRINTFWW("--make-perm-pheno: Permuted phenotypes written to %s .\n", outname);
+  while (0) { // body never runs on fall-through; the labels inside are reached only by goto
+  make_perm_pheno_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
+  make_perm_pheno_ret_OPEN_FAIL:
+    retval = RET_OPEN_FAIL;
+    break;
+  make_perm_pheno_ret_WRITE_FAIL:
+    retval = RET_WRITE_FAIL;
+    break;
+  make_perm_pheno_ret_INVALID_CMDLINE:
+    retval = RET_INVALID_CMDLINE;
+    break;
+  make_perm_pheno_ret_THREAD_CREATE_FAIL:
+    retval = RET_THREAD_CREATE_FAIL;
+    break;
+  }
+ make_perm_pheno_ret_1:
+  bigstack_reset(bigstack_mark); // common exit: release workspace, then close the file if it is still open
+  fclose_cond(outfile);
+  return retval;
+}
diff --git a/plink_perm.h b/plink_perm.h
index 8de84e5..6b182a9 100644
--- a/plink_perm.h
+++ b/plink_perm.h
@@ -3,6 +3,39 @@
 // Permutation generation and interpretation code common to many association
 // tests.
 
+// Inputs/outputs for multithreaded permutation generators.
+extern uint32_t g_perm_pheno_nm_ct;
+extern uint32_t g_perm_case_ct;
+extern uint32_t g_perm_tot_quotient;
+extern uint64_t g_perm_totq_magic;
+extern uint32_t g_perm_totq_preshift;
+extern uint32_t g_perm_totq_postshift;
+extern uint32_t g_perm_totq_incr;
+extern uint32_t g_perm_is_1bit;
+extern uint32_t g_perm_generation_thread_ct;
+extern uintptr_t g_perm_vec_ct;
+
+extern uint32_t g_perm_cluster_ct;
+extern uint32_t* g_perm_cluster_map;
+extern uint32_t* g_perm_cluster_starts;
+extern uint32_t* g_perm_cluster_case_cts;
+extern uintptr_t* g_perm_cluster_cc_preimage;
+extern uint32_t* g_perm_tot_quotients;
+extern uint64_t* g_perm_totq_magics;
+extern uint32_t* g_perm_totq_preshifts;
+extern uint32_t* g_perm_totq_postshifts;
+extern uint32_t* g_perm_totq_incrs;
+
+extern uintptr_t* g_perm_vecs;
+
+extern double* g_perm_vecstd;
+extern double* g_perm_pheno_d2;
+extern uint32_t* g_perm_sample_to_cluster;
+extern uint32_t* g_perm_qt_cluster_thread_wkspace;
+
+extern double* g_perm_pmajor;
+extern uint32_t* g_perm_precomputed_mods; // [n] = 2^32 mod (n+2)
+
 void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp);
 
 void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp);
@@ -11,13 +44,25 @@ void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t
 
 void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp);
 
+THREAD_RET_TYPE generate_cc_perms_thread(void* arg);
+
+THREAD_RET_TYPE generate_cc_cluster_perms_thread(void* arg);
+
+THREAD_RET_TYPE generate_qt_perms_smajor_thread(void* arg);
+
+THREAD_RET_TYPE generate_qt_cluster_perms_smajor_thread(void* arg);
+
+THREAD_RET_TYPE generate_qt_perms_pmajor_thread(void* arg);
+
+THREAD_RET_TYPE generate_qt_cluster_perms_pmajor_thread(void* arg);
+
 // Efficient "vertical popcount" support.
 void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst);
 
 void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst);
 
 #ifdef __LP64__
-static inline void unroll_incr_1_4(const __m128i* acc1, __m128i* acc4, uint32_t acc1_vec_ct) {
+HEADER_INLINE void unroll_incr_1_4(const __m128i* acc1, __m128i* acc4, uint32_t acc1_vec_ct) {
   const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
   __m128i loader;
   uint32_t vidx;
@@ -37,7 +82,7 @@ static inline void unroll_incr_1_4(const __m128i* acc1, __m128i* acc4, uint32_t
   }
 }
 
-static inline void unroll_incr_4_8(const __m128i* acc4, __m128i* acc8, uint32_t acc4_vec_ct) {
+HEADER_INLINE void unroll_incr_4_8(const __m128i* acc4, __m128i* acc8, uint32_t acc4_vec_ct) {
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   __m128i loader;
   uint32_t vidx;
@@ -51,7 +96,7 @@ static inline void unroll_incr_4_8(const __m128i* acc4, __m128i* acc8, uint32_t
   }
 }
 
-static inline void unroll_zero_incr_4_8(__m128i* acc4, __m128i* acc8, uint32_t acc4_vec_ct) {
+HEADER_INLINE void unroll_zero_incr_4_8(__m128i* acc4, __m128i* acc8, uint32_t acc4_vec_ct) {
   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
   __m128i loader;
   uint32_t vidx;
@@ -66,7 +111,7 @@ static inline void unroll_zero_incr_4_8(__m128i* acc4, __m128i* acc8, uint32_t a
   }
 }
 
-static inline void unroll_incr_8_32(const __m128i* acc8, __m128i* acc32, uint32_t acc8_vec_ct) {
+HEADER_INLINE void unroll_incr_8_32(const __m128i* acc8, __m128i* acc32, uint32_t acc8_vec_ct) {
   const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU};
   __m128i loader;
   uint32_t vidx;
@@ -86,7 +131,7 @@ static inline void unroll_incr_8_32(const __m128i* acc8, __m128i* acc32, uint32_
   }
 }
 
-static inline void unroll_zero_incr_8_32(__m128i* acc8, __m128i* acc32, uint32_t acc8_vec_ct) {
+HEADER_INLINE void unroll_zero_incr_8_32(__m128i* acc8, __m128i* acc32, uint32_t acc8_vec_ct) {
   const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU};
   __m128i loader;
   uint32_t vidx;
@@ -107,7 +152,7 @@ static inline void unroll_zero_incr_8_32(__m128i* acc8, __m128i* acc32, uint32_t
   }
 }
 #else
-static inline void unroll_incr_1_4(const uintptr_t* acc1, uintptr_t* acc4, uint32_t acc1_word_ct) {
+HEADER_INLINE void unroll_incr_1_4(const uintptr_t* acc1, uintptr_t* acc4, uint32_t acc1_word_ct) {
   uint32_t widx;
   uintptr_t loader;
   for (widx = 0; widx < acc1_word_ct; widx++) {
@@ -126,7 +171,7 @@ static inline void unroll_incr_1_4(const uintptr_t* acc1, uintptr_t* acc4, uint3
   }
 }
 
-static inline void unroll_incr_4_8(const uintptr_t* acc4, uintptr_t* acc8, uint32_t acc4_word_ct) {
+HEADER_INLINE void unroll_incr_4_8(const uintptr_t* acc4, uintptr_t* acc8, uint32_t acc4_word_ct) {
   uint32_t widx;
   uintptr_t loader;
   for (widx = 0; widx < acc4_word_ct; widx++) {
@@ -139,7 +184,7 @@ static inline void unroll_incr_4_8(const uintptr_t* acc4, uintptr_t* acc8, uint3
   }
 }
 
-static inline void unroll_zero_incr_4_8(uintptr_t* acc4, uintptr_t* acc8, uint32_t acc4_word_ct) {
+HEADER_INLINE void unroll_zero_incr_4_8(uintptr_t* acc4, uintptr_t* acc8, uint32_t acc4_word_ct) {
   uint32_t widx;
   uintptr_t loader;
   for (widx = 0; widx < acc4_word_ct; widx++) {
@@ -153,7 +198,7 @@ static inline void unroll_zero_incr_4_8(uintptr_t* acc4, uintptr_t* acc8, uint32
   }
 }
 
-static inline void unroll_incr_8_32(const uintptr_t* acc8, uintptr_t* acc32, uint32_t acc8_word_ct) {
+HEADER_INLINE void unroll_incr_8_32(const uintptr_t* acc8, uintptr_t* acc32, uint32_t acc8_word_ct) {
   uint32_t widx;
   uintptr_t loader;
   for (widx = 0; widx < acc8_word_ct; widx++) {
@@ -172,7 +217,7 @@ static inline void unroll_incr_8_32(const uintptr_t* acc8, uintptr_t* acc32, uin
   }
 }
 
-static inline void unroll_zero_incr_8_32(uintptr_t* acc8, uintptr_t* acc32, uint32_t acc8_word_ct) {
+HEADER_INLINE void unroll_zero_incr_8_32(uintptr_t* acc8, uintptr_t* acc32, uint32_t acc8_word_ct) {
   uint32_t widx;
   uintptr_t loader;
   for (widx = 0; widx < acc8_word_ct; widx++) {
@@ -193,4 +238,6 @@ static inline void unroll_zero_incr_8_32(uintptr_t* acc8, uintptr_t* acc32, uint
 }
 #endif
 
+int32_t make_perm_pheno(pthread_t* threads, char* outname, char* outname_end, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t pheno_nm_ct, uintptr_t* pheno_nm, uintptr_t* pheno_c, double* pheno_d, char* output_missing_pheno, uint32_t permphe_ct);
+
 #endif // __PLINK_PERM_H__
diff --git a/plink_rserve.c b/plink_rserve.c
index 8c401c1..277d129 100644
--- a/plink_rserve.c
+++ b/plink_rserve.c
@@ -15,10 +15,9 @@
 
 int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin_debug, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uint32_t* marker_pos, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uintptr_t* pheno_c, double* pheno_d, u [...]
   // See PLINK 1.07 r.cpp.
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* infile = NULL;
   FILE* outfile = NULL;
-  char* wkspace_end = (char*)(&(wkspace_base[wkspace_left]));
   int32_t* geno_int_buf = NULL;
   Rinteger* r_n = NULL;
   Rinteger* r_s = NULL;
@@ -64,21 +63,21 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
   uint32_t cur_data_len;
   uint32_t uii;
   int32_t ii;
-  if (fopen_checked(&infile, rplugin_fname, "r")) {
+  if (fopen_checked(rplugin_fname, "r", &infile)) {
     goto rserve_call_ret_OPEN_FAIL;
   }
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&loadbuf, pheno_nm_ctl2 * RPLUGIN_BLOCK_SIZE * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_sample_ctl2, &loadbuf_raw) ||
+      bigstack_alloc_ul(pheno_nm_ctl2 * RPLUGIN_BLOCK_SIZE, &loadbuf)) {
     goto rserve_call_ret_NOMEM;
   }
   loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
   for (ulii = 1; ulii <= RPLUGIN_BLOCK_SIZE; ulii++) {
     loadbuf[ulii * pheno_nm_ctl2 - 1] = 0;
   }
-  inbuf_start = (char*)wkspace_base;
+  inbuf_start = (char*)g_bigstack_base;
   inbuf_end = inbuf_start;
   while (1) {
-    if ((uintptr_t)(wkspace_end - inbuf_start) < MAXLINELEN) {
+    if (((uintptr_t)g_bigstack_end) - ((uintptr_t)inbuf_start) < MAXLINELEN) {
       goto rserve_call_ret_NOMEM;
     }
     inbuf_end[MAXLINELEN - 1] = ' ';
@@ -87,7 +86,7 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
     }
     line_idx++;
     if (!(inbuf_end[MAXLINELEN - 1])) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --R file is pathologically long.\n", line_idx);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --R file is pathologically long.\n", line_idx);
       goto rserve_call_ret_INVALID_FORMAT_2;
     }
     uii = strlen(inbuf_end);
@@ -116,9 +115,9 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
     goto rserve_call_ret_INVALID_FORMAT;
   }
   *inbuf_end = '\0';
-  wkspace_alloc(1 + ((uintptr_t)(inbuf_end - inbuf_start)));
+  bigstack_alloc(1 + ((uintptr_t)(inbuf_end - inbuf_start)));
   if (pheno_c) {
-    if (wkspace_alloc_d_checked(&pheno_d2, pheno_nm_ct * sizeof(double))) {
+    if (bigstack_alloc_d(pheno_nm_ct, &pheno_d2)) {
       goto rserve_call_ret_NOMEM;
     }
     for (sample_uidx = 0, sample_idx = 0; sample_idx < pheno_nm_ct; sample_uidx++, sample_idx++) {
@@ -134,28 +133,27 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
     pheno_d2 = pheno_d;
   }
   if (cluster_ct) {
-    if (wkspace_alloc_i_checked(&sample_to_cluster, unfiltered_sample_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_i(unfiltered_sample_ct, &sample_to_cluster)) {
       goto rserve_call_ret_NOMEM;
     }
     fill_int_one(sample_to_cluster, pheno_nm_ct);
     fill_unfiltered_sample_to_cluster(unfiltered_sample_ct, cluster_ct, cluster_map, cluster_starts, (uint32_t*)sample_to_cluster);
     inplace_collapse_uint32_incl((uint32_t*)sample_to_cluster, unfiltered_sample_ct, pheno_nm, pheno_nm_ct);
-    wkspace_shrink_top(sample_to_cluster, pheno_nm_ct * sizeof(int32_t));
+    bigstack_shrink_top(sample_to_cluster, pheno_nm_ct * sizeof(int32_t));
   } else {
-    if (wkspace_alloc_i_checked(&sample_to_cluster, pheno_nm_ct * sizeof(int32_t))) {
+    if (bigstack_calloc_i(pheno_nm_ct, &sample_to_cluster)) {
       goto rserve_call_ret_NOMEM;
     }
-    fill_int_zero(sample_to_cluster, pheno_nm_ct);
   }
   if (!rplugin_debug) {
-    if (wkspace_alloc_i_checked(&geno_int_buf, RPLUGIN_BLOCK_SIZE * pheno_nm_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_i(RPLUGIN_BLOCK_SIZE * ((uintptr_t)pheno_nm_ct), &geno_int_buf)) {
       goto rserve_call_ret_NOMEM;
     }
     rc = new Rconnection("127.0.0.1", rplugin_port);
     ii = rc->connect();
     if (ii) {
-      sockerrorchecks(tbuf, 128, -1);
-      LOGERRPRINTFWW("Error: Unable to connect (code %d: %s).\n", ii, tbuf);
+      sockerrorchecks(g_textbuf, 128, -1);
+      LOGERRPRINTFWW("Error: Unable to connect (code %d: %s).\n", ii, g_textbuf);
       goto rserve_call_ret_NETWORK;
     }
     rc->eval("options(echo=F)");
@@ -178,7 +176,7 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
   } else {
     memcpy(outname_end, ".debug.R", 9);
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto rserve_call_ret_OPEN_FAIL;
   }
   LOGPRINTFWW5("--R%s: writing to %s ... ", rplugin_debug? " debug" : "", outname);
@@ -186,32 +184,32 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
   fflush(stdout);
   loop_end = marker_ct / 100;
   if (rplugin_debug) {
-    bufptr = memcpya(tbuf, "n <- ", 5);
-    bufptr = uint32_write(bufptr, pheno_nm_ct);
+    bufptr = memcpya(g_textbuf, "n <- ", 5);
+    bufptr = uint32toa(pheno_nm_ct, bufptr);
     bufptr = memcpya(bufptr, "\nPHENO <- c( ", 13);
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto rserve_call_ret_WRITE_FAIL;
     }
     for (sample_idx = 0; sample_idx < pheno_nm_ct - 1; sample_idx++) {
-      bufptr = double_g_write(tbuf, pheno_d2[sample_idx]);
+      bufptr = dtoa_g(pheno_d2[sample_idx], g_textbuf);
       bufptr = memcpya(bufptr, ", ", 2);
-      fwrite(tbuf, 1, (uintptr_t)(bufptr - tbuf), outfile);
+      fwrite(g_textbuf, 1, (uintptr_t)(bufptr - g_textbuf), outfile);
     }
-    bufptr = double_g_write(tbuf, pheno_d2[sample_idx]);
+    bufptr = dtoa_g(pheno_d2[sample_idx], g_textbuf);
     bufptr = memcpya(bufptr, " ) \n", 4);
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto rserve_call_ret_WRITE_FAIL;
     }
     if (covar_ct) {
       fputs("c <- c( ", outfile);
       uljj = pheno_nm_ct * covar_ct - 1;
       for (ulii = 0; ulii < uljj; ulii++) {
-	bufptr = double_g_write(tbuf, covar_d[ulii]);
+	bufptr = dtoa_g(covar_d[ulii], g_textbuf);
 	bufptr = memcpya(bufptr, ", ", 2);
-	fwrite(tbuf, 1, (uintptr_t)(bufptr - tbuf), outfile);
+	fwrite(g_textbuf, 1, (uintptr_t)(bufptr - g_textbuf), outfile);
       }
-      bufptr = double_g_write(tbuf, covar_d[ulii]);
-      fwrite(tbuf, 1, (uintptr_t)(bufptr - tbuf), outfile);
+      bufptr = dtoa_g(covar_d[ulii], g_textbuf);
+      fwrite(g_textbuf, 1, (uintptr_t)(bufptr - g_textbuf), outfile);
       fputs(" ) \nCOVAR <- matrix( c , nrow = n , byrow=T)\n", outfile);
     } else {
       // old code (this might be better?  but --R backward compatibility is
@@ -221,14 +219,14 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
     }
     fputs("CLUSTER <- c( ", outfile);
     for (sample_idx = 0; sample_idx < pheno_nm_ct - 1; sample_idx++) {
-      bufptr = int32_write(tbuf, sample_to_cluster[sample_idx]);
+      bufptr = int32toa(sample_to_cluster[sample_idx], g_textbuf);
       bufptr = memcpya(bufptr, ", ", 2);
-      fwrite(tbuf, 1, (uintptr_t)(bufptr - tbuf), outfile);
+      fwrite(g_textbuf, 1, (uintptr_t)(bufptr - g_textbuf), outfile);
     }
-    bufptr = int32_write(tbuf, sample_to_cluster[sample_idx]);
+    bufptr = int32toa(sample_to_cluster[sample_idx], g_textbuf);
     bufptr = memcpya(bufptr, " ) \n", 4);
     fputs("CLUSTER[CLUSTER==-1] <- NA\n", outfile);
-    if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+    if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
       goto rserve_call_ret_WRITE_FAIL;
     }
   }
@@ -250,7 +248,7 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
 	  goto rserve_call_ret_READ_FAIL;
 	}
       }
-      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, ulptr, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+      if (load_and_collapse_incl(unfiltered_sample_ct, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx), bedfile, loadbuf_raw, ulptr)) {
 	goto rserve_call_ret_READ_FAIL;
       }
       // 0 -> 3
@@ -298,36 +296,36 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
 	      chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1];
 	    } while (marker_uidx >= chrom_end);
 	    uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-	    chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, uii, &chrom_name_len);
+	    chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
 	  }
-	  bufptr = memcpyax(tbuf, chrom_name_ptr, chrom_name_len, ' ');
+	  bufptr = memcpyax(g_textbuf, chrom_name_ptr, chrom_name_len, ' ');
 	  bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
 	  *bufptr++ = ' ';
-	  bufptr = uint32_writew10x(bufptr, marker_pos[marker_uidx], ' ');
-	  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	  bufptr = uint32toa_w10x(marker_pos[marker_uidx], ' ', bufptr);
+	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	    goto rserve_call_ret_WRITE_FAIL;
 	  }
 	  fputs_w4(marker_allele_ptrs[2 * marker_uidx], outfile);
-	  tbuf[0] = ' ';
-	  bufptr = &(tbuf[1]);
+	  g_textbuf[0] = ' ';
+	  bufptr = &(g_textbuf[1]);
 	  cur_data_len = (int32_t)(*dptr++);
 	  for (uii = 0; uii < cur_data_len; uii++) {
 	    dxx = *dptr++;
 	    if (realnum(dxx)) {
-	      bufptr = double_g_write(bufptr, dxx);
+	      bufptr = dtoa_g(dxx, bufptr);
 	    } else {
 	      bufptr = memcpya(bufptr, "NA", 2);
 	    }
 	    *bufptr++ = '\t';
-	    if (bufptr > &(tbuf[MAXLINELEN])) {
-	      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	    if (bufptr > &(g_textbuf[MAXLINELEN])) {
+	      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 		goto rserve_call_ret_WRITE_FAIL;
 	      }
-	      bufptr = tbuf;
+	      bufptr = g_textbuf;
 	    }
 	  }
 	  *bufptr++ = '\n';
-	  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	    goto rserve_call_ret_WRITE_FAIL;
 	  }
 	}
@@ -340,13 +338,13 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
 	      chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1];
 	    } while (marker_uidx >= chrom_end);
 	    uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-	    chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, uii, &chrom_name_len);
+	    chrom_name_ptr = chrom_name_buf5w4write(chrom_info_ptr, uii, &chrom_name_len, chrom_name_buf);
 	  }
-	  bufptr = memcpyax(tbuf, chrom_name_ptr, chrom_name_len, ' ');
+	  bufptr = memcpyax(g_textbuf, chrom_name_ptr, chrom_name_len, ' ');
 	  bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
 	  *bufptr++ = ' ';
-	  bufptr = uint32_writew10x(bufptr, marker_pos[marker_uidx], ' ');
-	  if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	  bufptr = uint32toa_w10x(marker_pos[marker_uidx], ' ', bufptr);
+	  if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	    goto rserve_call_ret_WRITE_FAIL;
 	  }
 	  fputs_w4(marker_allele_ptrs[2 * marker_uidx], outfile);
@@ -354,16 +352,16 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
 	}
       }
     } else {
-      bufptr = memcpya(tbuf, "l <- ", 5);
-      bufptr = uint32_write(bufptr, block_size);
+      bufptr = memcpya(g_textbuf, "l <- ", 5);
+      bufptr = uint32toa(block_size, bufptr);
       bufptr = memcpya(bufptr, "\ng <- c( ", 9);
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto rserve_call_ret_WRITE_FAIL;
       }
       block_offset = 0;
       sample_idx = 0;
       while (1) {
-        bufptr = tbuf;
+        bufptr = g_textbuf;
 	ulptr = &(loadbuf[sample_idx / BITCT2]);
 	uii = 2 * (sample_idx & (BITCT2 - 1));
 	for (block_offset = 0; block_offset < block_size; block_offset++, ulptr = &(ulptr[pheno_nm_ctl2])) {
@@ -379,12 +377,12 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
 	if (++sample_idx == pheno_nm_ct) {
 	  break;
 	}
-	if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+	if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	  goto rserve_call_ret_WRITE_FAIL;
 	}
       }
       bufptr = memcpya(&(bufptr[-2]), " ) \nGENO <- matrix( g , nrow = n ,byrow=T)\nGENO[GENO == -1 ] <- NA \n\n\n", 70);
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr - g_textbuf, outfile)) {
 	goto rserve_call_ret_WRITE_FAIL;
       }
       if (fwrite_checked(inbuf_start, inbuf_end - inbuf_start, outfile)) {
@@ -435,7 +433,7 @@ int32_t rserve_call(char* rplugin_fname, uint32_t rplugin_port, uint32_t rplugin
   }
   fclose_cond(infile);
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   delete r_n;
   delete r_p;
   delete r_s;
diff --git a/plink_set.c b/plink_set.c
index 014de18..ea6c551 100644
--- a/plink_set.c
+++ b/plink_set.c
@@ -231,18 +231,17 @@ uint32_t setdef_iter(uint32_t* setdef, uint32_t* cur_idx_ptr, uint32_t* aux_ptr)
 
 uint32_t alloc_and_populate_nonempty_set_incl(Set_info* sip, uint32_t* nonempty_set_ct_ptr, uintptr_t** nonempty_set_incl_ptr) {
   uint32_t raw_set_ct = sip->ct;
-  uint32_t raw_set_ctl = (raw_set_ct + (BITCT - 1)) / BITCT;
+  uint32_t raw_set_ctl = BITCT_TO_WORDCT(raw_set_ct);
   uint32_t nonempty_set_ct = 0;
   uintptr_t* nonempty_set_incl;
   uint32_t set_uidx;
-  if (wkspace_alloc_ul_checked(nonempty_set_incl_ptr, raw_set_ctl * sizeof(intptr_t))) {
+  if (bigstack_calloc_ul(raw_set_ctl, nonempty_set_incl_ptr)) {
     return 1;
   }
   nonempty_set_incl = *nonempty_set_incl_ptr;
-  fill_ulong_zero(nonempty_set_incl, raw_set_ctl);
   for (set_uidx = 0; set_uidx < raw_set_ct; set_uidx++) {
     if (sip->setdefs[set_uidx][0]) {
-      set_bit(nonempty_set_incl, set_uidx);
+      set_bit(set_uidx, nonempty_set_incl);
       nonempty_set_ct++;
     }
   }
@@ -250,11 +249,10 @@ uint32_t alloc_and_populate_nonempty_set_incl(Set_info* sip, uint32_t* nonempty_
   return 0;
 }
 
-int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_extend, uint32_t collapse_group, uint32_t fail_on_no_sets, uint32_t c_prefix, uint32_t allow_no_variants, uintptr_t subset_ct, char* sorted_subset_ids, uintptr_t max_subset_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t* topsize_ptr, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_len_ptr, Make_set_range*** make_set_range_arr_ptr, uint64_t** range_sort_buf_ptr, const ch [...]
+int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_extend, uint32_t collapse_group, uint32_t fail_on_no_sets, uint32_t c_prefix, uint32_t allow_no_variants, uintptr_t subset_ct, char* sorted_subset_ids, uintptr_t max_subset_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_len_ptr, Make_set_range*** make_set_range_arr_ptr, uint64_t** range_sort_buf_ptr, const char* file_descrip) {
   // Called directly by extract_exclude_range(), define_sets(), and indirectly
   // by annotate(), gene_report(), and clump_reports().
-  // Assumes topsize has not been subtracted off wkspace_left.  (This remains
-  // true on exit.)
+  // Assumes caller will reset g_bigstack_end later.
   Ll_str* make_set_ll = NULL;
   char* set_names = NULL;
   uintptr_t set_ct = 0;
@@ -277,16 +275,16 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
   uint32_t uii;
   uint32_t ujj;
   int32_t ii;
-  tbuf[MAXLINELEN - 1] = ' ';
+  g_textbuf[MAXLINELEN - 1] = ' ';
   // if we need to track set names, put together a sorted list
   if (track_set_names) {
-    while (fgets(tbuf, MAXLINELEN, infile)) {
+    while (fgets(g_textbuf, MAXLINELEN, infile)) {
       line_idx++;
-      if (!tbuf[MAXLINELEN - 1]) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
+      if (!g_textbuf[MAXLINELEN - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
 	goto load_range_list_ret_INVALID_FORMAT_2;
       }
-      bufptr = skip_initial_spaces(tbuf);
+      bufptr = skip_initial_spaces(g_textbuf);
       if (is_eoln_kns(*bufptr)) {
 	continue;
       }
@@ -297,12 +295,12 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
 	bufptr3 = next_token(bufptr2);
       }
       if (no_more_tokens_kns(bufptr3)) {
-	sprintf(logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
 	goto load_range_list_ret_INVALID_FORMAT_2;
       }
       ii = get_chrom_code(chrom_info_ptr, bufptr);
       if (ii < 0) {
-	sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+	sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
 	goto load_range_list_ret_INVALID_FORMAT_2;
       }
       // chrom_mask check removed, we want to track empty sets
@@ -332,12 +330,14 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
       if (uii > max_set_id_len) {
 	max_set_id_len = uii;
       }
-      ll_tmp = top_alloc_llstr(topsize_ptr, uii);
+      if (bigstack_end_alloc_llstr(uii, &ll_tmp)) {
+        goto load_range_list_ret_NOMEM;
+      }
       ll_tmp->next = make_set_ll;
       if (marker_pos) {
         memcpy(ll_tmp->ss, bufptr3, uii);
       } else {
-	uint32_write4(ll_tmp->ss, (uint32_t)ii);
+	uitoa_z4((uint32_t)ii, ll_tmp->ss);
 	// if first character of gene name is a digit, natural sort has strange
 	// effects unless we force [3] to be nonnumeric...
 	ll_tmp->ss[3] -= 15;
@@ -373,11 +373,9 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
       logerrprint("Error: Set IDs are limited to " MAX_ID_LEN_STR " characters.\n");
       goto load_range_list_ret_INVALID_FORMAT;
     }
-    wkspace_left -= *topsize_ptr;
-    if (wkspace_alloc_c_checked(set_names_ptr, set_ct)) {
-      goto load_range_list_ret_NOMEM2;
+    if (bigstack_alloc_c(set_ct * max_set_id_len, set_names_ptr)) {
+      goto load_range_list_ret_NOMEM;
     }
-    wkspace_left += *topsize_ptr;
     set_names = *set_names_ptr;
     if (!c_prefix) {
       for (ulii = 0; ulii < set_ct; ulii++) {
@@ -393,23 +391,26 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
     }
     qsort(set_names, set_ct, max_set_id_len, strcmp_natural);
     set_ct = collapse_duplicate_ids(set_names, set_ct, max_set_id_len, NULL);
-    wkspace_shrink_top(set_names, set_ct * max_set_id_len);
+    bigstack_shrink_top(set_names, set_ct * max_set_id_len);
     rewind(infile);
   } else {
     set_ct = 1;
   }
-  make_set_range_arr = (Make_set_range**)top_alloc(topsize_ptr, set_ct * sizeof(intptr_t));
+  make_set_range_arr = (Make_set_range**)bigstack_end_alloc(set_ct * sizeof(intptr_t));
+  if (!make_set_range_arr) {
+    goto load_range_list_ret_NOMEM;
+  }
   for (set_idx = 0; set_idx < set_ct; set_idx++) {
     make_set_range_arr[set_idx] = NULL;
   }
   line_idx = 0;
-  while (fgets(tbuf, MAXLINELEN, infile)) {
+  while (fgets(g_textbuf, MAXLINELEN, infile)) {
     line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
+    if (!g_textbuf[MAXLINELEN - 1]) {
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
       goto load_range_list_ret_INVALID_FORMAT_2;
     }
-    bufptr = skip_initial_spaces(tbuf);
+    bufptr = skip_initial_spaces(g_textbuf);
     if (is_eoln_kns(*bufptr)) {
       continue;
     }
@@ -420,12 +421,12 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
       bufptr3 = next_token(bufptr2);
     }
     if (no_more_tokens_kns(bufptr3)) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
+      sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
       goto load_range_list_ret_INVALID_FORMAT_2;
     }
     ii = get_chrom_code(chrom_info_ptr, bufptr);
     if (ii < 0) {
-      sprintf(logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+      sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
       goto load_range_list_ret_INVALID_FORMAT_2;
     }
     if (!is_set(chrom_info_ptr->chrom_mask, ii)) {
@@ -445,17 +446,17 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
     }
     bufptr = next_token(bufptr);
     if (scan_uint_defcap(bufptr, &range_first)) {
-      sprintf(logbuf, "Error: Invalid range start position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+      sprintf(g_logbuf, "Error: Invalid range start position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
       goto load_range_list_ret_INVALID_FORMAT_2;
     }
     bufptr = next_token(bufptr);
     if (scan_uint_defcap(bufptr, &range_last)) {
-      sprintf(logbuf, "Error: Invalid range end position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+      sprintf(g_logbuf, "Error: Invalid range end position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
       goto load_range_list_ret_INVALID_FORMAT_2;
     }
     if (range_last < range_first) {
-      sprintf(logbuf, "Error: Range end position smaller than range start on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
-      wordwrap(logbuf, 0);
+      sprintf(g_logbuf, "Error: Range end position smaller than range start on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+      wordwrapb(0);
       goto load_range_list_ret_INVALID_FORMAT_2;
     }
     if (border_extend > range_first) {
@@ -473,7 +474,7 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
 	memcpy(bufptr3, "C_", 2);
       } else if (!marker_pos) {
 	bufptr3 = &(bufptr3[-4]);
-	uint32_write4(bufptr3, chrom_idx);
+	uitoa_z4(chrom_idx, bufptr3);
 	bufptr3[3] -= 15;
       }
       // this should never fail
@@ -486,7 +487,7 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
       range_first = uint32arr_greater_than(&(marker_pos[chrom_start]), chrom_end - chrom_start, range_first);
       range_last = uint32arr_greater_than(&(marker_pos[chrom_start]), chrom_end - chrom_start, range_last + 1);
       if (range_last > range_first) {
-	msr_tmp = (Make_set_range*)top_alloc(topsize_ptr, sizeof(Make_set_range));
+	msr_tmp = (Make_set_range*)bigstack_end_alloc(sizeof(Make_set_range)); if (!msr_tmp) { goto load_range_list_ret_NOMEM; } // bigstack_end_alloc() yields NULL when out of memory; bail out before the dereference below
 	msr_tmp->next = make_set_range_arr[set_idx];
 	// normally, I'd keep chrom_idx here since that enables by-chromosome
 	// sorting, but that's probably not worth bloating Make_set_range from
@@ -496,7 +497,7 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
 	make_set_range_arr[set_idx] = msr_tmp;
       }
     } else {
-      msr_tmp = (Make_set_range*)top_alloc(topsize_ptr, sizeof(Make_set_range));
+      msr_tmp = (Make_set_range*)bigstack_end_alloc(sizeof(Make_set_range)); if (!msr_tmp) { goto load_range_list_ret_NOMEM; } // bigstack_end_alloc() yields NULL when out of memory; bail out before the dereference below
       msr_tmp->next = make_set_range_arr[set_idx];
       msr_tmp->uidx_start = range_first;
       msr_tmp->uidx_end = range_last + 1;
@@ -517,7 +518,7 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
     }
   }
   if (range_sort_buf_ptr) {
-    *range_sort_buf_ptr = (uint64_t*)top_alloc(topsize_ptr, uii * sizeof(int64_t));
+    if (bigstack_end_alloc_ull(uii, range_sort_buf_ptr)) { goto load_range_list_ret_NOMEM; } // report allocation failure instead of returning success with an unusable sort buffer
   }
   if (set_ct_ptr) {
     *set_ct_ptr = set_ct;
@@ -527,9 +528,7 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
   }
   *make_set_range_arr_ptr = make_set_range_arr;
   while (0) {
-  load_range_list_ret_NOMEM2:
-    wkspace_left += *topsize_ptr;
-    *topsize_ptr = 0;
+  load_range_list_ret_NOMEM:
     retval = RET_NOMEM;
     break;
   load_range_list_ret_INVALID_FORMAT_2:
@@ -546,19 +545,19 @@ int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfil
   if (unfiltered_marker_ct == *marker_exclude_ct_ptr) {
     return 0;
   }
-  unsigned char* wkspace_mark = wkspace_base;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   FILE* infile = NULL;
-  uintptr_t topsize = 0;
   uintptr_t orig_marker_exclude_ct = *marker_exclude_ct_ptr;
   Make_set_range** range_arr = NULL;
   int32_t retval = 0;
   Make_set_range* msr_tmp;
   uintptr_t* marker_exclude_new;
-  if (fopen_checked(&infile, fname, "r")) {
+  if (fopen_checked(fname, "r", &infile)) {
     goto extract_exclude_range_ret_OPEN_FAIL;
   }
-  retval = load_range_list(infile, 0, 0, 0, 0, 0, allow_no_variants, 0, NULL, 0, marker_pos, chrom_info_ptr, &topsize, NULL, NULL, NULL, &range_arr, NULL, is_exclude? "--exclude range" : "--extract range");
+  retval = load_range_list(infile, 0, 0, 0, 0, 0, allow_no_variants, 0, NULL, 0, marker_pos, chrom_info_ptr, NULL, NULL, NULL, &range_arr, NULL, is_exclude? "--exclude range" : "--extract range");
   if (retval) {
     goto extract_exclude_range_ret_1;
   }
@@ -568,22 +567,20 @@ int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfil
   msr_tmp = range_arr[0];
   if (is_exclude) {
     while (msr_tmp) {
-      fill_bits(marker_exclude, msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start);
+      fill_bits(msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start, marker_exclude);
       msr_tmp = msr_tmp->next;
     }
   } else {
-    wkspace_base -= topsize;
-    marker_exclude_new = (uintptr_t*)wkspace_alloc(unfiltered_marker_ctl * sizeof(intptr_t));
-    wkspace_base += topsize;
+    bigstack_alloc_ul(unfiltered_marker_ctl, &marker_exclude_new);
     if (!marker_exclude_new) {
       goto extract_exclude_range_ret_NOMEM;
     }
-    fill_all_bits(marker_exclude_new, unfiltered_marker_ct);
+    fill_all_bits(unfiltered_marker_ct, marker_exclude_new);
     while (msr_tmp) {
-      clear_bits(marker_exclude_new, msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start);
+      clear_bits(msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start, marker_exclude_new);
       msr_tmp = msr_tmp->next;
     }
-    bitfield_or(marker_exclude, marker_exclude_new, unfiltered_marker_ctl);
+    bitvec_or(marker_exclude_new, unfiltered_marker_ctl, marker_exclude);
   }
   *marker_exclude_ct_ptr = popcount_longs(marker_exclude, unfiltered_marker_ctl);
   if ((*marker_exclude_ct_ptr == unfiltered_marker_ct) && (!allow_no_variants)) {
@@ -608,7 +605,7 @@ int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfil
   }
  extract_exclude_range_ret_1:
   fclose_cond(infile);
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   return retval;
 }
 
@@ -625,15 +622,14 @@ uint32_t save_set_bitfield(uintptr_t* marker_bitfield_tmp, uint32_t marker_ct, u
   uint32_t uii;
   uint32_t ujj;
   uint32_t ukk;
-  if (wkspace_left < mem_req) {
+  if (bigstack_left() < mem_req) {
     return 1;
   }
-  *set_range_pp = (uint32_t*)wkspace_base;
+  *set_range_pp = (uint32_t*)g_bigstack_base;
   if (range_start == marker_ct) {
     // empty or full set
   save_set_bitfield_degen:
-    wkspace_left -= 16;
-    wkspace_base = &(wkspace_base[16]);
+    g_bigstack_base = &(g_bigstack_base[16]);
     if (complement_sets) {
       (*set_range_pp)[0] = 1;
       (*set_range_pp)[1] = 0;
@@ -711,8 +707,8 @@ uint32_t save_set_bitfield(uintptr_t* marker_bitfield_tmp, uint32_t marker_ct, u
       *uiptr++ = bit_idx;
     } while (bit_idx < range_end);
   } else {
-    set_bit(marker_bitfield_tmp, ujj);
-    clear_bit(marker_bitfield_tmp, ujj + 1);
+    set_bit(ujj, marker_bitfield_tmp);
+    clear_bit(ujj + 1, marker_bitfield_tmp);
     ukk = ujj;
     if (ukk > marker_ct) {
       ukk = marker_ct;
@@ -761,16 +757,15 @@ uint32_t save_set_bitfield(uintptr_t* marker_bitfield_tmp, uint32_t marker_ct, u
     (*set_range_pp)[3] = set_bits_outer;
     memcpy(&((*set_range_pp)[4]), &(marker_bitfield_tmp[bound_bottom_d128 / BITCT]), mem_req - 16);
     if (complement_sets) {
-      bitfield_invert((uintptr_t*)(&((*set_range_pp)[4])), bound_top_d128 - bound_bottom_d128);
+      bitarr_invert(bound_top_d128 - bound_bottom_d128, (uintptr_t*)(&((*set_range_pp)[4])));
     }
   }
-  wkspace_left -= mem_req;
-  wkspace_base = &(wkspace_base[mem_req]);
+  g_bigstack_base = &(g_bigstack_base[mem_req]);
   return 0;
 }
 
 uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t rsb_last_idx, uint32_t complement_sets, uint32_t** set_range_pp) {
-  uint32_t* uiptr = (uint32_t*)wkspace_base;
+  uint32_t* uiptr = (uint32_t*)g_bigstack_base;
   uint32_t range_start = (uint32_t)(range_sort_buf[0] >> 32);
   uint32_t range_end = (uint32_t)(range_sort_buf[rsb_last_idx]);
   uint32_t bound_bottom_d128 = range_start / 128;
@@ -785,7 +780,7 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
   uintptr_t ulii;
   uint32_t uii;
   uint32_t ujj;
-  if (wkspace_left < (rsb_last_idx / 2) * 16 + 32) {
+  if (bigstack_left() < (rsb_last_idx / 2) * 16 + 32) {
     return 1;
   }
   *set_range_pp = uiptr;
@@ -797,8 +792,7 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
         range_start = uii;
 	range_end = (uint32_t)(range_sort_buf[rsb_last_idx] >> 32);
       } else {
-	wkspace_left -= 16;
-	wkspace_base = &(wkspace_base[16]);
+	g_bigstack_base = &(g_bigstack_base[16]);
 	if (!complement_sets) {
 	  uiptr[0] = 1;
 	  uiptr[1] = 0;
@@ -855,7 +849,7 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
 	uii = (uint32_t)(ullii >> 32);
 	ujj = (uint32_t)ullii;
       save_set_range_late_start_1:
-	fill_bits(bitfield_ptr, uii - range_start, ujj - uii);
+	fill_bits(uii - range_start, ujj - uii, bitfield_ptr);
       }
       if (do_flip) {
 	// last range may go past bitfield end
@@ -865,12 +859,11 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
         if (ujj > range_end) {
 	  ujj = range_end;
 	}
-	fill_bits(bitfield_ptr, uii - range_start, ujj - uii);
+	fill_bits(uii - range_start, ujj - uii, bitfield_ptr);
       }
       goto save_set_range_bitfield_finish_encode;
     }
-    wkspace_left -= ulii;
-    wkspace_base = &(wkspace_base[ulii]);
+    g_bigstack_base = &(g_bigstack_base[ulii]);
     *uiptr++ = rsb_last_idx + 1;
     for (; rsb_idx <= rsb_last_idx; rsb_idx++) {
       ullii = range_sort_buf[rsb_idx];
@@ -892,7 +885,7 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
     ulii *= 16;
     if (ulii > mem_req) {
       range_start = bound_bottom_d128 * 128;
-      fill_all_bits(bitfield_ptr, range_end - range_start);
+      fill_all_bits(range_end - range_start, bitfield_ptr);
       if (do_flip) {
 	rsb_last_idx--;
 	if (range_start) {
@@ -908,7 +901,7 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
 	uii = (uint32_t)(ullii >> 32);
         ujj = (uint32_t)ullii;
       save_set_range_late_start_2:
-        clear_bits(bitfield_ptr, uii - range_start, ujj - uii);
+        clear_bits(uii - range_start, ujj - uii, bitfield_ptr);
       }
       if (do_flip) {
         ullii = range_sort_buf[rsb_idx];
@@ -917,18 +910,16 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
         if (ujj > range_end) {
 	  ujj = range_end;
 	}
-	clear_bits(bitfield_ptr, uii - range_start, ujj - uii);
+	clear_bits(uii - range_start, ujj - uii, bitfield_ptr);
       }
     save_set_range_bitfield_finish_encode:
-      wkspace_left -= mem_req;
-      wkspace_base = &(wkspace_base[mem_req]);
+      g_bigstack_base = &(g_bigstack_base[mem_req]);
       uiptr[0] = 0xffffffffU;
       uiptr[1] = range_start;
       uiptr[2] = range_end - range_start;
       uiptr[3] = set_bits_outer;
     } else {
-      wkspace_left -= ulii;
-      wkspace_base = &(wkspace_base[ulii]);
+      g_bigstack_base = &(g_bigstack_base[ulii]);
       if (range_start) {
 	*uiptr++ = range_ct;
 	*uiptr++ = 0;
@@ -951,11 +942,11 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
 }
 
 int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* marker_pos, uintptr_t* marker_exclude_ct_ptr, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, uint32_t allow_no_variants) {
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
-  uintptr_t topsize = 0;
   char* sorted_marker_ids = NULL;
   char* sorted_genekeep_ids = NULL;
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t marker_exclude_ct = *marker_exclude_ct_ptr;
   uintptr_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
   uintptr_t set_ct = 0;
@@ -972,7 +963,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
   uintptr_t max_genekeep_len = 0;
   uintptr_t max_set_id_len = 0;
   Make_set_range** make_set_range_arr = NULL;
-  char* midbuf = &(tbuf[MAXLINELEN]);
+  char* midbuf = &(g_textbuf[MAXLINELEN]);
   char* sorted_subset_ids = NULL;
   char* set_names = NULL;
   char* bufptr = NULL;
@@ -981,6 +972,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
   char* bufptr3;
   char* buf_end;
   Make_set_range* msr_tmp;
+  unsigned char* bigstack_end_mark2;
   uint32_t* marker_id_map;
   uint32_t* marker_uidx_to_idx;
   uint32_t** all_setdefs;
@@ -988,7 +980,6 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
   uintptr_t* marker_bitfield_tmp;
   uintptr_t set_idx;
   uintptr_t bufsize;
-  uintptr_t topsize_bak;
   uintptr_t marker_ctp2l;
   uintptr_t ulii;
   uint64_t ullii;
@@ -1043,8 +1034,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	  goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
 	}
       }
-      sorted_genekeep_ids = (char*)top_alloc(&topsize, genekeep_ct * max_genekeep_len);
-      if (!sorted_genekeep_ids) {
+      if (bigstack_end_alloc_c(genekeep_ct * max_genekeep_len, &sorted_genekeep_ids)) {
 	goto define_sets_ret_NOMEM;
       }
       bufptr = sip->genekeep_flattened;
@@ -1063,10 +1053,10 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
   // 2. if --set-names and/or --subset is present, (load and) sort those lists
   if (sip->setnames_flattened || sip->subset_fname) {
     if (sip->subset_fname) {
-      if (fopen_checked(&infile, sip->subset_fname, "rb")) {
+      if (fopen_checked(sip->subset_fname, FOPEN_RB, &infile)) {
 	goto define_sets_ret_OPEN_FAIL;
       }
-      retval = scan_token_ct_len(infile, tbuf, MAXLINELEN, &subset_ct, &max_subset_id_len);
+      retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &subset_ct, &max_subset_id_len);
       if (retval) {
 	if (retval == RET_INVALID_FORMAT) {
 	  logerrprint("Error: Pathologically long token in --subset file.\n");
@@ -1106,14 +1096,13 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
       logerrprint("Error: Subset IDs are limited to " MAX_ID_LEN_STR " characters.\n");
       goto define_sets_ret_INVALID_FORMAT;
     }
-    sorted_subset_ids = (char*)top_alloc(&topsize, subset_ct * max_subset_id_len);
-    if (!sorted_subset_ids) {
+    if (bigstack_end_alloc_c(subset_ct * max_subset_id_len, &sorted_subset_ids)) {
       goto define_sets_ret_NOMEM;
     }
     if (sip->subset_fname) {
       if (ulii) {
 	rewind(infile);
-	retval = read_tokens(infile, tbuf, MAXLINELEN, ulii, max_subset_id_len, sorted_subset_ids);
+	retval = read_tokens(MAXLINELEN, ulii, max_subset_id_len, infile, g_textbuf, sorted_subset_ids);
 	if (retval) {
 	  goto define_sets_ret_1;
 	}
@@ -1134,12 +1123,12 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
     qsort(sorted_subset_ids, subset_ct, max_subset_id_len, strcmp_casted);
     subset_ct = collapse_duplicate_ids(sorted_subset_ids, subset_ct, max_subset_id_len, NULL);
   }
-  if (fopen_checked(&infile, sip->fname, make_set? "r" : "rb")) {
+  if (fopen_checked(sip->fname, make_set? "r" : FOPEN_RB, &infile)) {
     goto define_sets_ret_OPEN_FAIL;
   }
   // 3. load --make-set range list
   if (make_set) {
-    retval = load_range_list(infile, !sip->merged_set_name, sip->make_set_border, sip->modifier & SET_MAKE_COLLAPSE_GROUP, gene_all || sip->genekeep_flattened, c_prefix, allow_no_variants, subset_ct, sorted_subset_ids, max_subset_id_len, marker_pos, chrom_info_ptr, &topsize, &set_ct, &set_names, &max_set_id_len, &make_set_range_arr, &range_sort_buf, "--make-set");
+    retval = load_range_list(infile, !sip->merged_set_name, sip->make_set_border, sip->modifier & SET_MAKE_COLLAPSE_GROUP, gene_all || sip->genekeep_flattened, c_prefix, allow_no_variants, subset_ct, sorted_subset_ids, max_subset_id_len, marker_pos, chrom_info_ptr, &set_ct, &set_names, &max_set_id_len, &make_set_range_arr, &range_sort_buf, "--make-set");
     if (retval) {
       goto define_sets_ret_1;
     }
@@ -1147,17 +1136,12 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 
   // 4. if --gene or --gene-all is present, pre-filter variants.
   if (gene_all || sip->genekeep_flattened) {
-    topsize_bak = topsize;
-    marker_bitfield_tmp = (uintptr_t*)top_alloc(&topsize, unfiltered_marker_ctl * sizeof(intptr_t));
-    if (!marker_bitfield_tmp) {
-      goto define_sets_ret_NOMEM;
-    }
-    marker_exclude_new = (uintptr_t*)top_alloc(&topsize, unfiltered_marker_ctl * sizeof(intptr_t));
-    if (!marker_exclude_new) {
+    bigstack_end_mark2 = g_bigstack_end;
+    if (bigstack_end_calloc_ul(unfiltered_marker_ctl, &marker_bitfield_tmp) ||
+        bigstack_end_alloc_ul(unfiltered_marker_ctl, &marker_exclude_new)) {
       goto define_sets_ret_NOMEM;
     }
-    fill_ulong_zero(marker_bitfield_tmp, unfiltered_marker_ctl);
-    fill_all_bits(marker_exclude_new, unfiltered_marker_ct);
+    fill_all_bits(unfiltered_marker_ct, marker_exclude_new);
     // then include every variant that appears, or include every variant that
     // fails to appear in a fully loaded set in the complement case
     if (make_set) {
@@ -1165,27 +1149,21 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	if (gene_all || (bsearch_str_nl(&(set_names[set_idx * max_set_id_len]), sorted_genekeep_ids, max_genekeep_len, genekeep_ct) != -1)) {
 	  msr_tmp = make_set_range_arr[set_idx];
 	  while (msr_tmp) {
-	    fill_bits(marker_bitfield_tmp, msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start);
+	    fill_bits(msr_tmp->uidx_start, msr_tmp->uidx_end - msr_tmp->uidx_start, marker_bitfield_tmp);
 	    msr_tmp = msr_tmp->next;
 	  }
 	}
         if (complement_sets) {
-	  bitfield_and(marker_exclude_new, marker_bitfield_tmp, unfiltered_marker_ctl);
+	  bitvec_and(marker_bitfield_tmp, unfiltered_marker_ctl, marker_exclude_new);
           fill_ulong_zero(marker_bitfield_tmp, unfiltered_marker_ctl);
 	}
       }
     } else {
-      sorted_marker_ids = (char*)top_alloc(&topsize, marker_ct * max_marker_id_len);
-      if (!sorted_marker_ids) {
+      if (bigstack_end_alloc_c(marker_ct * max_marker_id_len, &sorted_marker_ids) ||
+          bigstack_end_alloc_ui(marker_ct, &marker_id_map)) {
 	goto define_sets_ret_NOMEM;
       }
-      marker_id_map = (uint32_t*)top_alloc(&topsize, marker_ct * sizeof(int32_t));
-      if (!marker_id_map) {
-	goto define_sets_ret_NOMEM;
-      }
-      wkspace_left -= topsize;
-      retval = sort_item_ids_noalloc(sorted_marker_ids, marker_id_map, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref);
-      wkspace_left += topsize;
+      retval = sort_item_ids_noalloc(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref, sorted_marker_ids, marker_id_map);
       if (retval) {
 	goto define_sets_ret_NOMEM;
       }
@@ -1210,7 +1188,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
         buf_end = &(midbuf[bufsize]);
         *buf_end = ' ';
         buf_end[1] = '0';
-        bufptr = &(tbuf[MAXLINELEN - curtoklen]);
+        bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
         bufptr2 = midbuf;
         if (curtoklen) {
           goto define_sets_tok_start_1;
@@ -1229,12 +1207,12 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	    bufptr2++;
 	  }
           curtoklen = (uintptr_t)(bufptr2 - bufptr);
-          if (bufptr2 == &(tbuf[MAXLINELEN * 2])) {
+          if (bufptr2 == &(g_textbuf[MAXLINELEN * 2])) {
 	    if (curtoklen > MAXLINELEN) {
 	      logerrprint("Error: Excessively long token in --set file.\n");
 	      goto define_sets_ret_INVALID_FORMAT;
 	    }
-            bufptr3 = &(tbuf[MAXLINELEN - curtoklen]);
+            bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
             memcpy(bufptr3, bufptr, curtoklen);
             bufptr = bufptr3;
 	    break;
@@ -1244,7 +1222,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	      goto define_sets_ret_INVALID_FORMAT_EXTRA_END;
 	    }
             if (complement_sets) {
-	      bitfield_and(marker_exclude_new, marker_bitfield_tmp, unfiltered_marker_ctl);
+	      bitvec_and(marker_bitfield_tmp, unfiltered_marker_ctl, marker_exclude_new);
               fill_ulong_zero(marker_bitfield_tmp, unfiltered_marker_ctl);
 	    }
             in_set = 0;
@@ -1262,7 +1240,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	  } else if (in_set == 1) {
 	    ii = bsearch_str(bufptr, (uintptr_t)(bufptr2 - bufptr), sorted_marker_ids, max_marker_id_len, marker_ct);
 	    if (ii != -1) {
-	      set_bit(marker_bitfield_tmp, marker_id_map[(uint32_t)ii]);
+	      set_bit(marker_id_map[(uint32_t)ii], marker_bitfield_tmp);
 	    }
 	  }
 	  bufptr = &(bufptr2[1]);
@@ -1286,9 +1264,9 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
       }
     }
     if (!complement_sets) {
-      bitfield_andnot(marker_exclude_new, marker_bitfield_tmp, unfiltered_marker_ctl);
+      bitvec_andnot(marker_bitfield_tmp, unfiltered_marker_ctl, marker_exclude_new);
     }
-    bitfield_or(marker_exclude, marker_exclude_new, unfiltered_marker_ctl);
+    bitvec_or(marker_exclude_new, unfiltered_marker_ctl, marker_exclude);
     marker_exclude_ct = popcount_longs(marker_exclude, unfiltered_marker_ctl);
     *marker_exclude_ct_ptr = marker_exclude_ct;
     if (marker_exclude_ct == unfiltered_marker_ct) {
@@ -1300,7 +1278,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
     }
     marker_ct = unfiltered_marker_ct - marker_exclude_ct;
     rewind(infile);
-    topsize = topsize_bak;
+    bigstack_end_reset(bigstack_end_mark2);
   } else if ((!make_set) && (!sip->merged_set_name)) {
     // 5. otherwise, with --set and no --set-collapse-all, count number of sets
     //    and max_name_len.
@@ -1323,7 +1301,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
       buf_end = &(midbuf[bufsize]);
       *buf_end = ' ';
       buf_end[1] = '0';
-      bufptr = &(tbuf[MAXLINELEN - curtoklen]);
+      bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
       bufptr2 = midbuf;
       if (curtoklen) {
 	goto define_sets_tok_start_2;
@@ -1342,8 +1320,8 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	  bufptr2++;
 	}
         curtoklen = (uintptr_t)(bufptr2 - bufptr);
-        if (bufptr2 == &(tbuf[MAXLINELEN * 2])) {
-          bufptr3 = &(tbuf[MAXLINELEN - curtoklen]);
+        if (bufptr2 == &(g_textbuf[MAXLINELEN * 2])) {
+          bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
           memcpy(bufptr3, bufptr, curtoklen);
 	  bufptr = bufptr3;
 	  break;
@@ -1375,12 +1353,10 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
     rewind(infile);
   }
   // 6. allocate sip->names[], setdefs[] on stack
-  marker_uidx_to_idx = (uint32_t*)top_alloc(&topsize, unfiltered_marker_ct * sizeof(int32_t));
-  if (!marker_uidx_to_idx) {
+  if (bigstack_end_alloc_ui(unfiltered_marker_ct, &marker_uidx_to_idx)) {
     goto define_sets_ret_NOMEM;
   }
   fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_uidx_to_idx);
-  wkspace_left -= topsize;
   if (!set_names) {
     if (sip->merged_set_name) {
       set_ct = 1;
@@ -1389,8 +1365,8 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	logerrprint("Error: Set IDs are limited to " MAX_ID_LEN_STR " characters.\n");
 	goto define_sets_ret_INVALID_FORMAT;
       }
-      if (wkspace_alloc_c_checked(&set_names, max_set_id_len)) {
-	goto define_sets_ret_NOMEM2;
+      if (bigstack_alloc_c(max_set_id_len, &set_names)) {
+	goto define_sets_ret_NOMEM;
       }
       memcpy(set_names, sip->merged_set_name, max_set_id_len);
     } else {
@@ -1398,14 +1374,14 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	logerrprint("Error: Set IDs are limited to " MAX_ID_LEN_STR " characters.\n");
 	goto define_sets_ret_INVALID_FORMAT;
       }
-      if (wkspace_alloc_c_checked(&set_names, set_ct * max_set_id_len)) {
-	goto define_sets_ret_NOMEM2;
+      if (bigstack_alloc_c(set_ct * max_set_id_len, &set_names)) {
+	goto define_sets_ret_NOMEM;
       }
     }
   }
-  all_setdefs = (uint32_t**)wkspace_alloc(set_ct * sizeof(intptr_t));
+  all_setdefs = (uint32_t**)bigstack_alloc(set_ct * sizeof(intptr_t));
   if (!all_setdefs) {
-    goto define_sets_ret_NOMEM2;
+    goto define_sets_ret_NOMEM;
   }
   if (make_set) {
     // 7. If --make-set, allocate entries on stack
@@ -1439,12 +1415,11 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
       }
       if (!uii) {
 	// special case: empty set
-	if (wkspace_left < 16) {
+	if (bigstack_left() < 16) {
 	  goto define_sets_ret_NOMEM;
 	}
-	all_setdefs[set_idx] = (uint32_t*)wkspace_base;
-	wkspace_left -= 16;
-	wkspace_base = &(wkspace_base[16]);
+	all_setdefs[set_idx] = (uint32_t*)g_bigstack_base;
+	g_bigstack_base = &(g_bigstack_base[16]);
 	if (!complement_sets) {
 	  all_setdefs[set_idx][0] = 0;
 	} else {
@@ -1486,35 +1461,24 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
     set_idx = 0;
     in_set = 0;
     curtoklen = 0;
-    topsize_bak = topsize;
     range_first = marker_ct;
     range_last = 0;
     // guarantee two free bits at end to simplify loop termination checks (may
     // want to default to doing this...)
     marker_ctp2l = (marker_ct + (BITCT + 1)) / BITCT;
-    marker_bitfield_tmp = (uintptr_t*)top_alloc(&topsize, marker_ctp2l * sizeof(intptr_t));
-    if (!marker_bitfield_tmp) {
-      goto define_sets_ret_NOMEM2;
-    }
-    sorted_marker_ids = (char*)top_alloc(&topsize, marker_ct * max_marker_id_len);
-    if (!sorted_marker_ids) {
-      wkspace_left += topsize_bak;
-      goto define_sets_ret_NOMEM;
-    }
-    marker_id_map = (uint32_t*)top_alloc(&topsize, marker_ct * sizeof(int32_t));
-    if (!marker_id_map) {
-      wkspace_left += topsize_bak;
+    if (bigstack_end_alloc_ul(marker_ctp2l, &marker_bitfield_tmp) ||
+        bigstack_end_alloc_c(marker_ct * max_marker_id_len, &sorted_marker_ids) ||
+        bigstack_end_alloc_ui(marker_ct, &marker_id_map)) {
       goto define_sets_ret_NOMEM;
     }
-    wkspace_left -= topsize - topsize_bak;
-    retval = sort_item_ids_noalloc(sorted_marker_ids, marker_id_map, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 1, strcmp_deref);
+    retval = sort_item_ids_noalloc(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, 1, strcmp_deref, sorted_marker_ids, marker_id_map);
     if (retval) {
-      goto define_sets_ret_NOMEM2;
+      goto define_sets_ret_NOMEM;
     }
 #ifdef __LP64__
-    fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
+    fill_ulong_zero(marker_bitfield_tmp, round_up_pow2(marker_ctp2l, 2));
 #else
-    fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
+    fill_ulong_zero(marker_bitfield_tmp, round_up_pow2(marker_ctp2l, 4));
 #endif
     while (1) {
       if (fread_checked(midbuf, MAXLINELEN, infile, &bufsize)) {
@@ -1535,7 +1499,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
       buf_end = &(midbuf[bufsize]);
       *buf_end = ' ';
       buf_end[1] = '0';
-      bufptr = &(tbuf[MAXLINELEN - curtoklen]);
+      bufptr = &(g_textbuf[MAXLINELEN - curtoklen]);
       bufptr2 = midbuf;
       if (curtoklen) {
 	goto define_sets_tok_start_3;
@@ -1554,8 +1518,8 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	  bufptr2++;
 	}
 	curtoklen = (uintptr_t)(bufptr2 - bufptr);
-        if ((bufptr2 == buf_end) && (buf_end == &(tbuf[MAXLINELEN * 2]))) {
-	  bufptr3 = &(tbuf[MAXLINELEN - curtoklen]);
+        if ((bufptr2 == buf_end) && (buf_end == &(g_textbuf[MAXLINELEN * 2]))) {
+	  bufptr3 = &(g_textbuf[MAXLINELEN - curtoklen]);
           memcpy(bufptr3, bufptr, curtoklen);
           bufptr = bufptr3;
           break;
@@ -1594,7 +1558,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
 	    if (uii > range_last) {
 	      range_last = uii;
 	    }
-	    set_bit(marker_bitfield_tmp, uii);
+	    set_bit(uii, marker_bitfield_tmp);
 	  }
 	}
 	bufptr = &(bufptr2[1]);
@@ -1606,7 +1570,6 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
       }
     }
   }
-  wkspace_left += topsize;
   if (fclose_null(&infile)) {
     goto define_sets_ret_READ_FAIL;
   }
@@ -1619,13 +1582,13 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
   define_sets_merge_nothing:
     sip->ct = 1;
     uii = strlen(sip->merged_set_name) + 1;
-    // topsize = 0;
-    sip->setdefs = (uint32_t**)wkspace_alloc(sizeof(intptr_t));
+    bigstack_end_reset(bigstack_end_mark);
+    sip->setdefs = (uint32_t**)bigstack_alloc(sizeof(intptr_t));
     if (!sip->setdefs) {
       goto define_sets_ret_NOMEM;
     }
-    if (wkspace_alloc_c_checked(&sip->names, uii) ||
-	wkspace_alloc_ui_checked(&(sip->setdefs[0]), (1 + 2 * complement_sets) * sizeof(int32_t))) {
+    if (bigstack_alloc_c(uii, &sip->names) ||
+	bigstack_alloc_ui(1 + 2 * complement_sets, &(sip->setdefs[0]))) {
       goto define_sets_ret_NOMEM;
     }
     memcpy(sip->names, sip->merged_set_name, uii);
@@ -1639,8 +1602,6 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
     }
     LOGPRINTF("--%sset: 1 set defined.\n", make_set? "make-" : "");
     break;
-  define_sets_ret_NOMEM2:
-    wkspace_left += topsize;
   define_sets_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -1656,7 +1617,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
     retval = RET_ALL_MARKERS_EXCLUDED;
     break;
   define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED:
-    fill_all_bits(marker_exclude, unfiltered_marker_ct);
+    fill_all_bits(unfiltered_marker_ct, marker_exclude);
     *marker_exclude_ct_ptr = unfiltered_marker_ct;
     break;
   define_sets_ret_INVALID_FORMAT_EXTRA_END:
@@ -1670,12 +1631,13 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
     break;
   }
  define_sets_ret_1:
+  bigstack_end_reset(bigstack_end_mark);
   fclose_cond(infile);
   return retval;
 }
 
 int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
   FILE* outfile = NULL;
   uintptr_t set_ct = sip->ct;
   uintptr_t max_set_name_len = sip->max_name_len;
@@ -1701,7 +1663,7 @@ int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t mark
   uint32_t ukk;
   if (sip->modifier & SET_WRITE_TABLE) {
     memcpy(outname_end, ".set.table", 11);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto write_set_ret_OPEN_FAIL;
     }
     fputs("SNP\tCHR\tBP", outfile);
@@ -1712,16 +1674,14 @@ int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t mark
     if (putc_checked('\n', outfile)) {
       goto write_set_ret_WRITE_FAIL;
     }
-    if (wkspace_alloc_ui_checked(&last_idx, set_ct * sizeof(int32_t)) ||
-        wkspace_alloc_ui_checked(&next_adj, set_ct * sizeof(int32_t)) ||
-        wkspace_alloc_c_checked(&cur_setting, set_ct) ||
-        wkspace_alloc_c_checked(&writebuf, 2 * set_ct)) {
+    if (bigstack_calloc_ui(set_ct, &last_idx) ||
+        bigstack_calloc_ui(set_ct, &next_adj) ||
+        bigstack_alloc_c(set_ct, &cur_setting) ||
+        bigstack_alloc_c(2 * set_ct, &writebuf)) {
       goto write_set_ret_NOMEM;
     }
-    fill_uint_zero(last_idx, set_ct);
-    fill_uint_zero(next_adj, set_ct);
     marker_uidx = 0;
-    tbuf[0] = '\t';
+    g_textbuf[0] = '\t';
     chrom_end = 0;
     for (set_idx = 1; set_idx < set_ct; set_idx++) {
       writebuf[2 * set_idx - 1] = '\t';
@@ -1735,12 +1695,12 @@ int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t mark
         chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[uii];
       }
       fputs(&(marker_ids[marker_uidx * max_marker_id_len]), outfile);
-      bufptr = chrom_name_write(&(tbuf[1]), chrom_info_ptr, chrom_idx);
+      bufptr = chrom_name_write(chrom_info_ptr, chrom_idx, &(g_textbuf[1]));
       *bufptr++ = '\t';
-      bufptr = uint32_writex(bufptr, marker_pos[marker_uidx], '\t');
+      bufptr = uint32toa_x(marker_pos[marker_uidx], '\t', bufptr);
       // do not keep double-tab (if it was intentional, it should have been in
       // the header line too...)
-      fwrite(tbuf, 1, bufptr - tbuf, outfile);
+      fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
       bufptr = writebuf;
       cptr = cur_setting;
       for (set_idx = 0; set_idx < set_ct; set_idx++) {
@@ -1774,7 +1734,7 @@ int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t mark
                 if (IS_SET(ulptr, ukk)) {
 		  *cptr = '1';
 		  ukk++;
-		  next_unset_ck(ulptr, &ukk, ujj);
+		  next_unset_ck(ulptr, ujj, &ukk);
 		} else {
                   *cptr = '0';
                   ukk = next_set(ulptr, ukk, ujj);
@@ -1804,10 +1764,10 @@ int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t mark
   }
   if (sip->modifier & SET_WRITE_LIST) {
     memcpy(outname_end, ".set", 5);
-    if (fopen_checked(&outfile, outname, "w")) {
+    if (fopen_checked(outname, "w", &outfile)) {
       goto write_set_ret_OPEN_FAIL;
     }
-    if (wkspace_alloc_ui_checked(&marker_idx_to_uidx, marker_ct * sizeof(int32_t))) {
+    if (bigstack_alloc_ui(marker_ct, &marker_idx_to_uidx)) {
       goto write_set_ret_NOMEM;
     }
     fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, marker_idx_to_uidx);
@@ -1836,7 +1796,7 @@ int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t mark
 	}
 	marker_idx = 0;
 	while (1) {
-	  next_set_ck(ulptr, &marker_idx, uii);
+	  next_set_ck(ulptr, uii, &marker_idx);
 	  if (marker_idx == uii) {
 	    break;
 	  }
@@ -1872,12 +1832,12 @@ int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t mark
     break;
   }
   fclose_cond(outfile);
-  wkspace_reset(wkspace_mark);
+  bigstack_reset(bigstack_mark);
   return retval;
 }
 
 void unpack_set(uintptr_t marker_ct, uint32_t* setdef, uintptr_t* include_bitfield) {
-  uintptr_t marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
   uint32_t range_ct = setdef[0];
   uint32_t keep_outer;
   uint32_t range_start;
@@ -1897,7 +1857,7 @@ void unpack_set(uintptr_t marker_ct, uint32_t* setdef, uintptr_t* include_bitfie
     uii = range_start + range_ct;
     if (uii < marker_ct) {
       if (keep_outer) {
-        fill_bits(include_bitfield, uii, marker_ct - uii);
+        fill_bits(uii, marker_ct - uii, include_bitfield);
       } else {
         fill_ulong_zero(&(include_bitfield[uii / BITCT]), marker_ctl - uii / BITCT);
       }
@@ -1906,13 +1866,13 @@ void unpack_set(uintptr_t marker_ct, uint32_t* setdef, uintptr_t* include_bitfie
     fill_ulong_zero(include_bitfield, marker_ctl);
     for (uii = 0; uii < range_ct; uii++) {
       range_start = setdef[uii * 2 + 1];
-      fill_bits(include_bitfield, range_start, setdef[uii * 2 + 2] - range_start);
+      fill_bits(range_start, setdef[uii * 2 + 2] - range_start, include_bitfield);
     }
   }
 }
 
 void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* setdef, uintptr_t* new_exclude) {
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t last_uidx = next_unset_unsafe(marker_exclude, 0);
   uintptr_t marker_uidx = last_uidx;
   uint32_t range_ct = setdef[0];
@@ -1932,7 +1892,7 @@ void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct,
       // if nonzero, range_start also must be greater than 1
       marker_uidx = jump_forward_unset_unsafe(marker_exclude, last_uidx + 1, range_start);
       if (!keep_outer) {
-	fill_bits(new_exclude, last_uidx, marker_uidx - last_uidx);
+	fill_bits(last_uidx, marker_uidx - last_uidx, new_exclude);
       }
     }
     for (range_idx = 0; range_idx < range_ct; range_idx++, marker_uidx++) {
@@ -1940,11 +1900,11 @@ void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct,
       // we know that range representation is not more compact, so probably not
       // worthwhile to use next_unset/next_set/fill_bits() here
       if (!IS_SET(bitfield_ptr, range_idx)) {
-	SET_BIT(new_exclude, marker_uidx);
+	SET_BIT(marker_uidx, new_exclude);
       }
     }
     if ((!keep_outer) && (range_start + range_ct < marker_ct)) {
-      fill_bits(new_exclude, marker_uidx, unfiltered_marker_ct - marker_uidx);
+      fill_bits(marker_uidx, unfiltered_marker_ct - marker_uidx, new_exclude);
     }
   } else {
     uiptr = &(setdef[1]);
@@ -1958,7 +1918,7 @@ void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct,
       if (range_start > range_end) {
         marker_uidx = jump_forward_unset_unsafe(marker_exclude, last_uidx + 1, range_start - range_end);
       }
-      fill_bits(new_exclude, last_uidx, marker_uidx - last_uidx);
+      fill_bits(last_uidx, marker_uidx - last_uidx, new_exclude);
       next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
     unpack_set_unfiltered_late_start:
       range_end = *uiptr++;
@@ -1968,12 +1928,12 @@ void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct,
       }
       last_uidx = jump_forward_unset_unsafe(marker_exclude, marker_uidx + 1, range_end - range_start);
     }
-    fill_bits(new_exclude, last_uidx, unfiltered_marker_ct - last_uidx);
+    fill_bits(last_uidx, unfiltered_marker_ct - last_uidx, new_exclude);
   }
 }
 
 uint32_t extract_set_union(uint32_t** setdefs, uintptr_t set_ct, uintptr_t* set_incl, uintptr_t* filtered_union, uintptr_t marker_ct) {
-  uintptr_t marker_ctl = (marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t marker_ctl = BITCT_TO_WORDCT(marker_ct);
 
   // these track known filled words at the beginning and end.  (just intended
   // to detect early exit opportunities; doesn't need to be perfect.)
@@ -2014,7 +1974,7 @@ uint32_t extract_set_union(uint32_t** setdefs, uintptr_t set_ct, uintptr_t* set_
           range_end = unset_endw;
 	}
         if (range_start < range_end) {
-	  bitfield_or(&(filtered_union[range_start]), (uintptr_t*)(&(cur_setdef[4 + (BITCT / 32) * read_offset])), range_end - range_start);
+	  bitvec_or((uintptr_t*)(&(cur_setdef[4 + (BITCT / 32) * read_offset])), range_end - range_start, &(filtered_union[range_start]));
 	}
       }
       if (keep_outer && (range_end < unset_endw)) {
@@ -2046,11 +2006,11 @@ uint32_t extract_set_union(uint32_t** setdefs, uintptr_t set_ct, uintptr_t* set_
 	  range_start = unset_startw * BITCT;
 	}
 	if (range_ct > 1) {
-          fill_bits(filtered_union, range_start, range_end - range_start);
+          fill_bits(range_start, range_end - range_start, filtered_union);
 	  for (range_idx = 2; range_idx < range_ct; range_idx++) {
 	    range_start = *(cur_setdef++);
 	    range_end = *(cur_setdef++);
-	    fill_bits(filtered_union, range_start, range_end - range_start);
+	    fill_bits(range_start, range_end - range_start, filtered_union);
 	  }
           range_start = *(cur_setdef++);
           range_end = *(cur_setdef++);
@@ -2058,7 +2018,7 @@ uint32_t extract_set_union(uint32_t** setdefs, uintptr_t set_ct, uintptr_t* set_
 	if (range_end > unset_endw * BITCT) {
 	  range_end = unset_endw * BITCT;
 	}
-        fill_bits(filtered_union, range_start, range_end - range_start);
+        fill_bits(range_start, range_end - range_start, filtered_union);
       }
     }
     while (1) {
@@ -2076,7 +2036,7 @@ uint32_t extract_set_union(uint32_t** setdefs, uintptr_t set_ct, uintptr_t* set_
     }
   }
  extract_set_union_exit_early:
-  zero_trailing_bits(filtered_union, marker_ct);
+  zero_trailing_bits(marker_ct, filtered_union);
   return popcount_longs(filtered_union, marker_ctl);
 }
 
@@ -2085,22 +2045,22 @@ uint32_t extract_set_union_unfiltered(Set_info* sip, uintptr_t* set_incl, uintpt
   // point to marker_exclude.  Otherwise, allocates union_marker_exclude on the
   // "stack".
   // Assumes marker_ct is initial value of *union_marker_ct_ptr.
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
   uintptr_t orig_marker_ct = *union_marker_ct_ptr;
-  uintptr_t orig_marker_ctl = (orig_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t orig_marker_ctl = BITCT_TO_WORDCT(orig_marker_ct);
   uintptr_t* union_marker_exclude;
   uintptr_t* filtered_union;
-  if (wkspace_alloc_ul_checked(&union_marker_exclude, unfiltered_marker_ctl * sizeof(intptr_t)) ||
-      wkspace_alloc_ul_checked(&filtered_union, orig_marker_ctl * sizeof(intptr_t))) {
+  if (bigstack_alloc_ul(unfiltered_marker_ctl, &union_marker_exclude) ||
+      bigstack_alloc_ul(orig_marker_ctl, &filtered_union)) {
     return 1;
   }
   *union_marker_ct_ptr = extract_set_union(sip->setdefs, sip->ct, set_incl, filtered_union, orig_marker_ct);
   if ((*union_marker_ct_ptr) == orig_marker_ct) {
-    wkspace_reset(union_marker_exclude);
+    bigstack_reset(union_marker_exclude);
     *union_marker_exclude_ptr = marker_exclude;
   } else {
     uncollapse_copy_flip_include_arr(filtered_union, unfiltered_marker_ct, marker_exclude, union_marker_exclude);
-    wkspace_reset(filtered_union);
+    bigstack_reset(filtered_union);
     *union_marker_exclude_ptr = union_marker_exclude;
   }
   return 0;
@@ -2109,8 +2069,8 @@ uint32_t extract_set_union_unfiltered(Set_info* sip, uintptr_t* set_incl, uintpt
 uint32_t setdefs_compress(Set_info* sip, uintptr_t* set_incl, uintptr_t set_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude, uintptr_t marker_ct, uint32_t*** new_setdefs_ptr) {
   // currently assumes marker_exclude does not exclude anything in the union of
   // the remaining sets
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   uintptr_t marker_ctlv = ((marker_ct + 127) / 128) * (128 / BITCT);
-  uintptr_t topsize = 0;
   uint32_t set_uidx = 0;
   uintptr_t* cur_bitfield;
   uintptr_t* read_bitfield;
@@ -2126,20 +2086,16 @@ uint32_t setdefs_compress(Set_info* sip, uintptr_t* set_incl, uintptr_t set_ct,
   uint32_t range_end;
   uint32_t include_out_of_bounds;
   uint32_t marker_midx;
-  new_setdefs = (uint32_t**)wkspace_alloc(set_ct * sizeof(intptr_t));
+  new_setdefs = (uint32_t**)bigstack_alloc(set_ct * sizeof(intptr_t));
   if (!new_setdefs) {
     return 1;
   }
-  cur_bitfield = (uintptr_t*)top_alloc(&topsize, marker_ctlv * sizeof(intptr_t));
-  if (!cur_bitfield) {
-    return 1;
-  }
-  marker_midx_to_idx = (uint32_t*)top_alloc(&topsize, marker_ct_orig * sizeof(int32_t));
-  if (!marker_midx_to_idx) {
+  if (bigstack_end_alloc_ul(marker_ctlv, &cur_bitfield) ||
+      bigstack_end_alloc_ui(marker_ct_orig, &marker_midx_to_idx)) {
+    bigstack_end_reset(bigstack_end_mark);
     return 1;
   }
   fill_midx_to_idx(marker_exclude_orig, marker_exclude, marker_ct, marker_midx_to_idx);
-  wkspace_left -= topsize;
   for (set_idx = 0; set_idx < set_ct; set_uidx++, set_idx++) {
     if (set_incl) {
       next_set_unsafe_ck(set_incl, &set_uidx);
@@ -2155,7 +2111,7 @@ uint32_t setdefs_compress(Set_info* sip, uintptr_t* set_incl, uintptr_t set_ct,
 	for (range_idx = 0; range_idx < range_ct; range_idx++) {
 	  range_offset = *(++cur_setdef);
 	  range_stop = *(++cur_setdef);
-	  fill_bits(cur_bitfield, marker_midx_to_idx[range_offset], range_stop - range_offset);
+	  fill_bits(marker_midx_to_idx[range_offset], range_stop - range_offset, cur_bitfield);
 	}
         range_end = marker_midx_to_idx[range_offset] + range_stop - range_offset;
       }
@@ -2170,36 +2126,36 @@ uint32_t setdefs_compress(Set_info* sip, uintptr_t* set_incl, uintptr_t set_ct,
       }
       for (marker_midx = 0; marker_midx < range_stop; marker_midx++) {
         if (IS_SET(read_bitfield, marker_midx)) {
-          set_bit(cur_bitfield, marker_midx_to_idx[marker_midx + range_offset]);
+          set_bit(marker_midx_to_idx[marker_midx + range_offset], cur_bitfield);
 	}
       }
       if (include_out_of_bounds && (range_offset + range_stop < marker_ct_orig)) {
-        fill_bits(cur_bitfield, marker_midx_to_idx[range_offset + range_stop], marker_ct_orig - range_offset - range_stop);
+        fill_bits(marker_midx_to_idx[range_offset + range_stop], marker_ct_orig - range_offset - range_stop, cur_bitfield);
         range_end = marker_ct;
       } else {
-        range_end = 1 + last_set_bit(cur_bitfield, (marker_ct + (BITCT - 1)) / BITCT);
+        range_end = 1 + last_set_bit(cur_bitfield, BITCT_TO_WORDCT(marker_ct));
       }
       if (range_start) {
         range_start = marker_midx_to_idx[next_set_unsafe(read_bitfield, 0) + range_offset];
       }
     }
     if (save_set_bitfield(cur_bitfield, marker_ct, range_start, range_end, 0, &(new_setdefs[set_idx]))) {
-      wkspace_left += topsize;
+      bigstack_end_reset(bigstack_end_mark);
       return 1;
     }
   }
   *new_setdefs_ptr = new_setdefs;
-  wkspace_left += topsize;
+  bigstack_end_reset(bigstack_end_mark);
   return 0;
 }
 
 int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t subset_ct, char* sorted_subset_ids, uintptr_t max_subset_id_len, Chrom_info* chrom_info_ptr, uintptr_t* gene_ct_ptr, char** gene_names_ptr, uintptr_t* max_gene_id_len_ptr, uintptr_t** chrom_bounds_ptr, uint32_t*** genedefs_ptr, uintptr_t* chrom_max_gene_ct_ptr, const char* file_descrip) {
   // --annotate, --clump-range, --gene-report
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
   uintptr_t gene_ct = 0;
   uintptr_t max_gene_id_len = 0;
   uintptr_t chrom_max_gene_ct = 0;
-  uintptr_t topsize = 0;
   uint32_t chrom_code_end = chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct;
   uint32_t chrom_idx = 0;
   Make_set_range** gene_arr;
@@ -2220,23 +2176,22 @@ int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t s
   uint32_t ukk;
   uint32_t umm;
   int32_t retval;
-  if (fopen_checked(&infile, fname, "r")) {
+  if (fopen_checked(fname, "r", &infile)) {
     goto load_range_list_sortpos_ret_OPEN_FAIL;
   }
-  retval = load_range_list(infile, 1, border_extend, 0, 0, 0, 0, subset_ct, sorted_subset_ids, 0, NULL, chrom_info_ptr, &topsize, &gene_ct, gene_names_ptr, &max_gene_id_len, &gene_arr, &range_sort_buf, file_descrip);
+  retval = load_range_list(infile, 1, border_extend, 0, 0, 0, 0, subset_ct, sorted_subset_ids, 0, NULL, chrom_info_ptr, &gene_ct, gene_names_ptr, &max_gene_id_len, &gene_arr, &range_sort_buf, file_descrip);
   if (retval) {
     goto load_range_list_sortpos_ret_1;
   }
   gene_names = *gene_names_ptr;
-  wkspace_left -= topsize;
-  if (wkspace_alloc_ul_checked(chrom_bounds_ptr, (chrom_code_end + 1) * sizeof(intptr_t))) {
-    goto load_range_list_sortpos_ret_NOMEM2;
+  if (bigstack_alloc_ul(chrom_code_end + 1, chrom_bounds_ptr)) {
+    goto load_range_list_sortpos_ret_NOMEM;
   }
   chrom_bounds = *chrom_bounds_ptr;
   chrom_bounds[0] = 0;
-  genedefs = (uint32_t**)wkspace_alloc(gene_ct * sizeof(intptr_t));
+  genedefs = (uint32_t**)bigstack_alloc(gene_ct * sizeof(intptr_t));
   if (!genedefs) {
-    goto load_range_list_sortpos_ret_NOMEM2;
+    goto load_range_list_sortpos_ret_NOMEM;
   }
   for (gene_idx = 0; gene_idx < gene_ct; gene_idx++) {
     bufptr = &(gene_names[gene_idx * max_gene_id_len]);
@@ -2262,12 +2217,11 @@ int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t s
       msr_tmp = msr_tmp->next;
     }
     if (!uii) {
-      if (wkspace_left < 16) {
-	goto load_range_list_sortpos_ret_NOMEM2;
+      if (bigstack_left() < 16) {
+	goto load_range_list_sortpos_ret_NOMEM;
       }
-      genedefs[gene_idx] = (uint32_t*)wkspace_base;
-      wkspace_left -= 16;
-      wkspace_base = &(wkspace_base[16]);
+      genedefs[gene_idx] = (uint32_t*)g_bigstack_base;
+      g_bigstack_base = &(g_bigstack_base[16]);
       genedefs[gene_idx][0] = 0;
       continue;
     }
@@ -2294,13 +2248,15 @@ int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t s
 	range_last = (uint32_t)ullii;
       }
     }
+
+    // this boilerplate can be removed once a 16-instead-of-64-byte-aligned
+    // bigstack_alloc() exists
     ulii = (((++ukk) * 2 + 4) * sizeof(int32_t)) & (~(15 * ONELU));
-    if (wkspace_left < ulii) {
-      goto load_range_list_sortpos_ret_NOMEM2;
+    if (bigstack_left() < ulii) {
+      goto load_range_list_sortpos_ret_NOMEM;
     }
-    genedefs[gene_idx] = (uint32_t*)wkspace_base;
-    wkspace_left -= ulii;
-    wkspace_base = &(wkspace_base[ulii]);
+    genedefs[gene_idx] = (uint32_t*)g_bigstack_base;
+    g_bigstack_base = &(g_bigstack_base[ulii]);
     uiptr = genedefs[gene_idx];
     *uiptr++ = ukk;
     for (uii = 0; uii < ukk; uii++) {
@@ -2316,7 +2272,6 @@ int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t s
   while (chrom_idx < chrom_code_end) {
     chrom_bounds[++chrom_idx] = gene_ct;
   }
-  wkspace_left += topsize;
   if (fclose_null(&infile)) {
     goto load_range_list_sortpos_ret_READ_FAIL;
   }
@@ -2325,8 +2280,7 @@ int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t s
   *chrom_max_gene_ct_ptr = chrom_max_gene_ct;
   *genedefs_ptr = genedefs;
   while (0) {
-  load_range_list_sortpos_ret_NOMEM2:
-    wkspace_left += topsize;
+  load_range_list_sortpos_ret_NOMEM:
     retval = RET_NOMEM;
     break;
   load_range_list_sortpos_ret_OPEN_FAIL:
@@ -2337,12 +2291,14 @@ int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t s
     break;
   }
  load_range_list_sortpos_ret_1:
+  bigstack_end_reset(bigstack_end_mark);
   fclose_cond(infile);
   return retval;
 }
 
 int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilter, Chrom_info* chrom_info_ptr) {
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   gzFile gz_attribfile = NULL;
   FILE* infile = NULL;
   FILE* outfile = NULL;
@@ -2369,7 +2325,6 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
   const char constna4str[] = "  NA";
   const char* no_annot_str = (aip->modifier & ANNOT_NA)? constnastr : constdotstr;
   const char* no_sign_str = (aip->modifier & ANNOT_NA)? constna4str : constdot4str;
-  uintptr_t topsize = 0;
   uintptr_t snplist_ct = 0;
   uintptr_t max_snplist_id_len = 0;
   uintptr_t snplist_attr_ct = 0;
@@ -2427,7 +2382,6 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
   uintptr_t* ulptr;
   uint32_t* uiptr;
   uintptr_t loadbuf_size;
-  uintptr_t topsize_bak;
   uintptr_t line_idx;
   uintptr_t range_idx;
   uintptr_t ulii;
@@ -2454,10 +2408,10 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
       snp_field_len = 3;
     }
     if (aip->snps_fname) {
-      if (fopen_checked(&infile, aip->snps_fname, "rb")) {
+      if (fopen_checked(aip->snps_fname, FOPEN_RB, &infile)) {
 	goto annotate_ret_OPEN_FAIL;
       }
-      retval = scan_token_ct_len(infile, tbuf, MAXLINELEN, &snplist_ct, &max_snplist_id_len);
+      retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &snplist_ct, &max_snplist_id_len);
       if (retval) {
 	if (retval == RET_INVALID_FORMAT) {
 	  logerrprint("Error: Pathologically long token in --annotate snps file.\n");
@@ -2465,14 +2419,14 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	goto annotate_ret_1;
       }
       if (!snplist_ct) {
-	sprintf(logbuf, "Error: %s is empty.\n", aip->snps_fname);
+	sprintf(g_logbuf, "Error: %s is empty.\n", aip->snps_fname);
 	goto annotate_ret_INVALID_FORMAT_WW;
       }
-      if (wkspace_alloc_c_checked(&sorted_snplist, snplist_ct * max_snplist_id_len)) {
+      if (bigstack_alloc_c(snplist_ct * max_snplist_id_len, &sorted_snplist)) {
 	goto annotate_ret_NOMEM;
       }
       rewind(infile);
-      retval = read_tokens(infile, tbuf, MAXLINELEN, snplist_ct, max_snplist_id_len, sorted_snplist);
+      retval = read_tokens(MAXLINELEN, snplist_ct, max_snplist_id_len, infile, g_textbuf, sorted_snplist);
       if (retval) {
 	goto annotate_ret_1;
       }
@@ -2483,41 +2437,42 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
       ulii = collapse_duplicate_ids(sorted_snplist, snplist_ct, max_snplist_id_len, NULL);
       if (ulii < snplist_ct) {
 	snplist_ct = ulii;
-	wkspace_shrink_top(sorted_snplist, snplist_ct * max_snplist_id_len);
+	bigstack_shrink_top(sorted_snplist, snplist_ct * max_snplist_id_len);
       }
     }
     if (aip->attrib_fname) {
-      if (gzopen_checked(&gz_attribfile, aip->attrib_fname, "rb")) {
-	goto annotate_ret_OPEN_FAIL;
-      }
-      if (gzbuffer(gz_attribfile, 131072)) {
-	goto annotate_ret_NOMEM;
+      retval = gzopen_read_checked(aip->attrib_fname, &gz_attribfile);
+      if (retval) {
+	goto annotate_ret_1;
       }
       line_idx = 0;
-      tbuf[MAXLINELEN - 1] = ' ';
+      g_textbuf[MAXLINELEN - 1] = ' ';
       // two-pass load.
       // 1. determine attribute set, as well as relevant variant ID count and
       //    max length
       // intermission. extract attribute names from hash table, natural sort,
       //               deallocate hash table
       // 2. save relevant variant IDs and attribute bitfields, then qsort_ext()
-      attr_id_htable = (Ll_str**)top_alloc(&topsize, HASHMEM);
+      attr_id_htable = (Ll_str**)bigstack_end_alloc(HASHMEM);
+      if (!attr_id_htable) {
+	goto annotate_ret_NOMEM;
+      }
       for (uii = 0; uii < HASHSIZE; uii++) {
 	attr_id_htable[uii] = NULL;
       }
       while (1) {
 	line_idx++;
-	if (!gzgets(gz_attribfile, tbuf, MAXLINELEN)) {
+	if (!gzgets(gz_attribfile, g_textbuf, MAXLINELEN)) {
 	  if (!gzeof(gz_attribfile)) {
 	    goto annotate_ret_READ_FAIL;
 	  }
 	  break;
 	}
-	if (!tbuf[MAXLINELEN - 1]) {
-	  sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, aip->attrib_fname);
+	if (!g_textbuf[MAXLINELEN - 1]) {
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, aip->attrib_fname);
 	  goto annotate_ret_INVALID_FORMAT_WW;
 	}
-	bufptr = skip_initial_spaces(tbuf);
+	bufptr = skip_initial_spaces(g_textbuf);
 	if (is_eoln_kns(*bufptr)) {
 	  continue;
 	}
@@ -2546,13 +2501,12 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 #ifdef __LP64__
 	      // we'll run out of memory way earlier in 32-bit mode
 	      if (attr_id_ct == 0x80000000LLU) {
-	        sprintf(logbuf, "Error: Too many unique attributes in %s (max 2147483648).\n", aip->attrib_fname);
+	        sprintf(g_logbuf, "Error: Too many unique attributes in %s (max 2147483648).\n", aip->attrib_fname);
 	        goto annotate_ret_INVALID_FORMAT_WW;
 	      }
 #endif
 	      attr_id_ct++;
-	      ll_ptr = top_alloc_llstr(&topsize, slen);
-	      if (!ll_ptr) {
+	      if (bigstack_end_alloc_llstr(slen, &ll_ptr)) {
 	        goto annotate_ret_NOMEM;
 	      }
 	      ll_ptr->next = NULL;
@@ -2575,7 +2529,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	}
       }
       if (!attr_id_ct) {
-	sprintf(logbuf, "Error: No attributes in %s.\n", aip->attrib_fname);
+	sprintf(g_logbuf, "Error: No attributes in %s.\n", aip->attrib_fname);
 	goto annotate_ret_INVALID_FORMAT_WW;
       }
       if (max_onevar_attr_ct > attr_id_ct) {
@@ -2583,11 +2537,9 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	// attribute repeated over and over again for some reason
 	max_onevar_attr_ct = attr_id_ct;
       }
-      wkspace_left -= topsize;
-      if (wkspace_alloc_c_checked(&sorted_attr_ids, attr_id_ct * max_attr_id_len)) {
-	goto annotate_ret_NOMEM2;
+      if (bigstack_alloc_c(attr_id_ct * max_attr_id_len, &sorted_attr_ids)) {
+	goto annotate_ret_NOMEM;
       }
-      wkspace_left += topsize;
       ulii = 0;
       for (uii = 0; uii < HASHSIZE; uii++) {
 	ll_ptr = attr_id_htable[uii];
@@ -2598,20 +2550,19 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	}
       }
       qsort(sorted_attr_ids, attr_id_ct, max_attr_id_len, strcmp_natural);
-      topsize = 0;
+      bigstack_end_reset(bigstack_end_mark);
       gzrewind(gz_attribfile);
-      attr_id_ctl = (attr_id_ct + (BITCT - 1)) / BITCT;
-      if (wkspace_alloc_c_checked(&sorted_snplist_attr_ids, snplist_attr_ct * max_snplist_attr_id_len) ||
-	  wkspace_alloc_ul_checked(&attr_bitfields, snplist_attr_ct * attr_id_ctl * sizeof(intptr_t))) {
+      attr_id_ctl = BITCT_TO_WORDCT(attr_id_ct);
+      if (bigstack_alloc_c(snplist_attr_ct * max_snplist_attr_id_len, &sorted_snplist_attr_ids) ||
+	  bigstack_calloc_ul(snplist_attr_ct * attr_id_ctl, &attr_bitfields)) {
 	goto annotate_ret_NOMEM;
       }
-      fill_ulong_zero(attr_bitfields, snplist_attr_ct * attr_id_ctl);
       for (ulii = 0; ulii < snplist_attr_ct; ulii++) {
       annotate_skip_line:
-	if (!gzgets(gz_attribfile, tbuf, MAXLINELEN)) {
+	if (!gzgets(gz_attribfile, g_textbuf, MAXLINELEN)) {
 	  goto annotate_ret_READ_FAIL;
 	}
-	bufptr = skip_initial_spaces(tbuf);
+	bufptr = skip_initial_spaces(g_textbuf);
 	if (is_eoln_kns(*bufptr)) {
 	  goto annotate_skip_line;
 	}
@@ -2629,7 +2580,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	  bufptr = skip_initial_spaces(bufptr);
 	  bufptr2[slen] = '\0';
 	  sorted_idx = bsearch_str_natural(bufptr2, sorted_attr_ids, max_attr_id_len, attr_id_ct);
-	  set_bit(ulptr, sorted_idx);
+	  set_bit(sorted_idx, ulptr);
 	  bufptr2 = bufptr;
 	} while (!is_eoln_kns(*bufptr2));
       }
@@ -2644,10 +2595,10 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
   if (need_pos) {
     if (aip->ranges_fname) {
       if (aip->subset_fname) {
-	if (fopen_checked(&infile, aip->subset_fname, "rb")) {
+	if (fopen_checked(aip->subset_fname, FOPEN_RB, &infile)) {
 	  goto annotate_ret_OPEN_FAIL;
 	}
-	retval = scan_token_ct_len(infile, tbuf, MAXLINELEN, &subset_ct, &max_subset_id_len);
+	retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &subset_ct, &max_subset_id_len);
 	if (retval) {
 	  if (retval == RET_INVALID_FORMAT) {
 	    logerrprint("Error: Pathologically long token in --annotate subset file.\n");
@@ -2662,12 +2613,11 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	  logerrprint("Error: --annotate subset IDs are limited to " MAX_ID_LEN_STR " characters.\n");
 	  goto annotate_ret_INVALID_FORMAT;
 	}
-	sorted_subset_ids = (char*)top_alloc(&topsize, subset_ct * max_subset_id_len);
-	if (!sorted_subset_ids) {
+	if (bigstack_end_alloc_c(subset_ct * max_subset_id_len, &sorted_subset_ids)) {
 	  goto annotate_ret_NOMEM;
 	}
 	rewind(infile);
-	retval = read_tokens(infile, tbuf, MAXLINELEN, subset_ct, max_subset_id_len, sorted_subset_ids);
+	retval = read_tokens(MAXLINELEN, subset_ct, max_subset_id_len, infile, g_textbuf, sorted_subset_ids);
 	if (retval) {
 	  goto annotate_ret_1;
 	}
@@ -2684,7 +2634,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
       }
 #ifdef __LP64__
       if (range_ct > 0x80000000LLU) {
-	sprintf(logbuf, "Error: Too many annotations in %s (max 2147483648, counting multi-chromosome annotations once per spanned chromosome).\n", aip->ranges_fname);
+	sprintf(g_logbuf, "Error: Too many annotations in %s (max 2147483648, counting multi-chromosome annotations once per spanned chromosome).\n", aip->ranges_fname);
 	goto annotate_ret_INVALID_FORMAT_WW;
       }
 #endif
@@ -2694,7 +2644,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	LOGPRINTFWW("--annotate ranges: 1 annotation loaded from %s.\n", aip->ranges_fname);
       }
     }
-    topsize = 0;
+    bigstack_end_reset(bigstack_end_mark);
     if (aip->filter_fname) {
       retval = load_range_list_sortpos(aip->filter_fname, border, 0, NULL, 0, chrom_info_ptr, &filter_range_ct, &filter_range_names, &max_filter_range_name_len, &chrom_filter_bounds, &filter_rangedefs, &chrom_max_filter_range_ct, "--annotate filter");
       if (retval) {
@@ -2711,7 +2661,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	logerrprint("Error: Too many unique annotations for '--annotate block' (max 1073741823).\n");
         goto annotate_ret_INVALID_FORMAT;
       }
-      if (wkspace_alloc_ui_checked(&range_idx_lookup, ulii * sizeof(int32_t))) {
+      if (bigstack_alloc_ui(ulii, &range_idx_lookup)) {
 	goto annotate_ret_NOMEM;
       }
       if (attr_id_ct) {
@@ -2719,16 +2669,11 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
       }
       // create a master natural-sorted annotation ID list
 
-      // this must persist until the output header line has been written
-      merged_attr_idx_buf = (uint32_t*)top_alloc(&topsize, ulii * sizeof(int32_t));
-      if (!merged_attr_idx_buf) {
-	goto annotate_ret_NOMEM;
-      }
       uii = MAXV((max_range_name_len - 4), max_attr_id_len);
-      // this is larger and doesn't need to persist
-      topsize_bak = topsize;
-      merged_attr_ids = (char*)top_alloc(&topsize, ulii * uii);
-      if (!merged_attr_ids) {
+      // this must persist until the output header line has been written
+      if (bigstack_end_alloc_ui(ulii, &merged_attr_idx_buf) ||
+          // this is larger and doesn't need to persist
+          bigstack_end_alloc_c(ulii * uii, &merged_attr_ids)) {
 	goto annotate_ret_NOMEM;
       }
       uiptr = merged_attr_idx_buf;
@@ -2742,12 +2687,10 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	  *uiptr++ = ujj + 0x80000000U;
 	}
       }
-      wkspace_left -= topsize;
       if (qsort_ext(merged_attr_ids, ulii, uii, strcmp_natural_deref, (char*)merged_attr_idx_buf, sizeof(int32_t))) {
-	goto annotate_ret_NOMEM2;
+	goto annotate_ret_NOMEM;
       }
-      wkspace_left += topsize;
-      topsize = topsize_bak;
+      bigstack_end_reset(merged_attr_idx_buf);
 
       // similar to collapse_duplicate_ids(), except we need to save lookup
       // info
@@ -2787,9 +2730,8 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
     if (unique_annot_ct > 1000) {
       logerrprint("Warning: Output file may be very large.  Are you sure you want >1000 additional\ncolumns per line?  If not, restart without 'block'.\n");
     }
-    wkspace_left -= topsize;
-    if (wkspace_alloc_c_checked(&writebuf, unique_annot_ctlw * sizeof(intptr_t))) {
-      goto annotate_ret_NOMEM2;
+    if (bigstack_alloc_c(unique_annot_ctlw * sizeof(intptr_t), &writebuf)) {
+      goto annotate_ret_NOMEM;
     }
     ulptr = (uintptr_t*)writebuf;
     for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
@@ -2804,13 +2746,12 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
   } else {
     // worst case: max_onevar_attr_ct attributes and chrom_max_range_ct range
     // annotations
-    if (wkspace_alloc_c_checked(&writebuf, (max_onevar_attr_ct * max_attr_id_len) + (chrom_max_range_ct * (max_range_name_len + (3 + 16 * (border != 0)) * range_dist)))) {
+    if (bigstack_alloc_c((max_onevar_attr_ct * max_attr_id_len) + (chrom_max_range_ct * (max_range_name_len + (3 + 16 * (border != 0)) * range_dist)), &writebuf)) {
       goto annotate_ret_NOMEM;
     }
   }
-  loadbuf = (char*)wkspace_base;
-  loadbuf_size = wkspace_left;
-  wkspace_left += topsize;
+  loadbuf = (char*)g_bigstack_base;
+  loadbuf_size = bigstack_left();
   if (loadbuf_size > MAXLINEBUFLEN) {
     loadbuf_size = MAXLINEBUFLEN;
   } else if (loadbuf_size <= MAXLINELEN) {
@@ -2842,7 +2783,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
       if (uii != 4) {
         if ((ujj >> uii) & 1) {
 	  *bufptr2 = '\0';
-          sprintf(logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, aip->fname);
+          sprintf(g_logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, aip->fname);
           goto annotate_ret_INVALID_FORMAT_WW;
 	}
 	ujj |= 1 << uii;
@@ -2858,11 +2799,11 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
     col_idx++;
   } while (!is_eoln_kns(*bufptr));
   if (seq_idx != token_ct) {
-    sprintf(logbuf, "Error: Missing column header%s in %s.\n", (seq_idx + 1 == token_ct)? "" : "s", aip->fname);
+    sprintf(g_logbuf, "Error: Missing column header%s in %s.\n", (seq_idx + 1 == token_ct)? "" : "s", aip->fname);
     goto annotate_ret_INVALID_FORMAT_WW;
   }
   memcpy(outname_end, ".annot", 7);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto annotate_ret_OPEN_FAIL;
   }
   if (fwrite_checked(loadbuf, (uintptr_t)(bufptr - loadbuf), outfile)) {
@@ -2887,8 +2828,8 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 	  fputs(&(sorted_attr_ids[(ujj & 0x7fffffff) * max_attr_id_len]), outfile);
 	}
       }
-      loadbuf_size += topsize;
-      topsize = 0;
+      bigstack_end_reset(bigstack_end_mark);
+      loadbuf_size = bigstack_left();
       if (loadbuf_size > MAXLINEBUFLEN) {
 	loadbuf_size = MAXLINEBUFLEN;
       }
@@ -2902,7 +2843,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
     line_idx++;
     if (!loadbuf[loadbuf_size - 1]) {
       if (loadbuf_size == MAXLINEBUFLEN) {
-        sprintf(logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, aip->fname);
+        sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, aip->fname);
 	goto annotate_ret_INVALID_FORMAT_WW;
       } else {
         goto annotate_ret_NOMEM;
@@ -2995,7 +2936,7 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 		  if (ii > 0) {
 		    *wptr++ = '+';
 		  }
-		  wptr = double_g_writewx4(wptr, ((double)ii) * 0.001, 1);
+		  wptr = dtoa_g_wxp4(((double)ii) * 0.001, 1, wptr);
 		  wptr = memcpya(wptr, "kb)|", 4);
 		}
 	      } else {
@@ -3105,23 +3046,19 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
     }
     if (track_distance) {
       if (abs_min_dist != 0x80000000U) {
-	bufptr2 = width_force(12, tbuf, double_g_write(tbuf, ((double)((int32_t)abs_min_dist)) * 0.001));
+	bufptr2 = width_force(12, g_textbuf, dtoa_g(((double)((int32_t)abs_min_dist)) * 0.001, g_textbuf));
 	if (!abs_min_dist) {
           bufptr2 = memcpya(bufptr2, no_sign_str, 4);
 	} else {
-          bufptr2 = memcpyl3a(bufptr2, "   ");
-          if (min_dist > 0) {
-	    *bufptr2++ = '+';
-	  } else {
-	    *bufptr2++ = '-';
-	  }
+          bufptr2 = memseta(bufptr2, 32, 3);
+	  *bufptr2++ = (min_dist > 0)? '+' : '-';
 	}
       } else {
-	bufptr2 = memseta(tbuf, 32, 8);
+	bufptr2 = memseta(g_textbuf, 32, 8);
 	bufptr2 = memcpya(bufptr2, no_sign_str, 4);
 	bufptr2 = memcpya(bufptr2, no_sign_str, 4);
       }
-      if (fwrite_checked(tbuf, bufptr2 - tbuf, outfile)) {
+      if (fwrite_checked(g_textbuf, bufptr2 - g_textbuf, outfile)) {
 	goto annotate_ret_WRITE_FAIL;
       }
     }
@@ -3155,8 +3092,6 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
     LOGPRINTFWW("--annotate: %" PRIuPTR " row%s annotated; new report written to %s .\n", total_row_ct, (total_row_ct == 1)? "" : "s", outname);
   }
   while (0) {
-  annotate_ret_NOMEM2:
-    wkspace_left += topsize;
   annotate_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -3170,14 +3105,14 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
     retval = RET_WRITE_FAIL;
     break;
   annotate_ret_INVALID_FORMAT_WW:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logerrprintb();
   annotate_ret_INVALID_FORMAT:
     retval = RET_INVALID_FORMAT;
     break;
   }
  annotate_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(infile);
   gzclose_cond(gz_attribfile);
   fclose_cond(outfile);
@@ -3186,10 +3121,10 @@ int32_t annotate(Annot_info* aip, char* outname, char* outname_end, double pfilt
 
 int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t border, char* extractname, const char* snp_field, char* outname, char* outname_end, double pfilter, Chrom_info* chrom_info_ptr) {
   // similar to define_sets() and --clump
-  unsigned char* wkspace_mark = wkspace_base;
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   FILE* infile = NULL;
   FILE* outfile = NULL;
-  uintptr_t topsize = 0;
   uintptr_t subset_ct = 0;
   uintptr_t max_subset_id_len = 0;
   uintptr_t extract_ct = 0;
@@ -3245,10 +3180,10 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
   uint32_t ujj;
   int32_t chrom_idx;
   if (subset_fname) {
-    if (fopen_checked(&infile, subset_fname, "rb")) {
+    if (fopen_checked(subset_fname, FOPEN_RB, &infile)) {
       goto gene_report_ret_OPEN_FAIL;
     }
-    retval = scan_token_ct_len(infile, tbuf, MAXLINELEN, &subset_ct, &max_subset_id_len);
+    retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &subset_ct, &max_subset_id_len);
     if (retval) {
       if (retval == RET_INVALID_FORMAT) {
 	logerrprint("Error: Pathologically long token in --gene-subset file.\n");
@@ -3263,12 +3198,11 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
       logerrprint("Error: --gene-subset IDs are limited to " MAX_ID_LEN_STR " characters.\n");
       goto gene_report_ret_INVALID_FORMAT;
     }
-    sorted_subset_ids = (char*)top_alloc(&topsize, subset_ct * max_subset_id_len);
-    if (!sorted_subset_ids) {
+    if (bigstack_end_alloc_c(subset_ct * max_subset_id_len, &sorted_subset_ids)) {
       goto gene_report_ret_NOMEM;
     }
     rewind(infile);
-    retval = read_tokens(infile, tbuf, MAXLINELEN, subset_ct, max_subset_id_len, sorted_subset_ids);
+    retval = read_tokens(MAXLINELEN, subset_ct, max_subset_id_len, infile, g_textbuf, sorted_subset_ids);
     if (retval) {
       goto gene_report_ret_1;
     }
@@ -3279,10 +3213,10 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
     subset_ct = collapse_duplicate_ids(sorted_subset_ids, subset_ct, max_subset_id_len, NULL);
   }
   if (extractname) {
-    if (fopen_checked(&infile, extractname, "rb")) {
+    if (fopen_checked(extractname, FOPEN_RB, &infile)) {
       goto gene_report_ret_OPEN_FAIL;
     }
-    retval = scan_token_ct_len(infile, tbuf, MAXLINELEN, &extract_ct, &max_extract_id_len);
+    retval = scan_token_ct_len(MAXLINELEN, infile, g_textbuf, &extract_ct, &max_extract_id_len);
     if (retval) {
       goto gene_report_ret_1;
     }
@@ -3294,14 +3228,12 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
       logerrprint("Error: --extract IDs are limited to " MAX_ID_LEN_STR " characters.\n");
       goto gene_report_ret_INVALID_FORMAT;
     }
-    wkspace_left -= topsize;
-    if (wkspace_alloc_c_checked(&sorted_extract_ids, extract_ct * max_extract_id_len)) {
-      goto gene_report_ret_NOMEM2;
+    if (bigstack_alloc_c(extract_ct * max_extract_id_len, &sorted_extract_ids)) {
+      goto gene_report_ret_NOMEM;
     }
-    wkspace_left += topsize;
     rewind(infile);
     // todo: switch to hash table to avoid sort
-    retval = read_tokens(infile, tbuf, MAXLINELEN, extract_ct, max_extract_id_len, sorted_extract_ids);
+    retval = read_tokens(MAXLINELEN, extract_ct, max_extract_id_len, infile, g_textbuf, sorted_extract_ids);
     if (retval) {
       goto gene_report_ret_1;
     }
@@ -3312,30 +3244,29 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
     ulii = collapse_duplicate_ids(sorted_extract_ids, extract_ct, max_extract_id_len, NULL);
     if (ulii < extract_ct) {
       extract_ct = ulii;
-      wkspace_shrink_top(sorted_extract_ids, extract_ct * max_extract_id_len);
+      bigstack_shrink_top(sorted_extract_ids, extract_ct * max_extract_id_len);
     }
   }
   retval = load_range_list_sortpos(glist, 0, subset_ct, sorted_subset_ids, max_subset_id_len, chrom_info_ptr, &gene_ct, &gene_names, &max_gene_name_len, &chrom_bounds, &genedefs, &chrom_max_gene_ct, "--gene-report");
-  wkspace_left += topsize;
   if (retval) {
     goto gene_report_ret_1;
   }
 #ifdef __LP64__
   if (gene_ct > 0x80000000LLU) {
-    sprintf(logbuf, "Error: Too many genes in %s (max 2147483648).\n", glist);
+    sprintf(g_logbuf, "Error: Too many genes in %s (max 2147483648).\n", glist);
     goto gene_report_ret_INVALID_FORMAT_WW;
   }
 #endif
 
-  topsize = 0;
+  bigstack_end_reset(bigstack_end_mark);
   // gene_names is sorted primarily by chromosome index, and secondarily by
   // gene name.  Final output will be the other way around, so we need a
   // remapping table.
   // This logic needs to change a bit if support for unplaced contigs is added
   // or MAX_CHROM_TEXTNUM_LEN changes.
-  if (wkspace_alloc_ui_checked(&gene_chridx_to_nameidx, gene_ct * sizeof(int32_t)) ||
-      wkspace_alloc_ui_checked(&gene_nameidx_to_chridx, gene_ct * sizeof(int32_t)) ||
-      wkspace_alloc_c_checked(&loadbuf, gene_ct * max_gene_name_len)) {
+  if (bigstack_alloc_ui(gene_ct, &gene_chridx_to_nameidx) ||
+      bigstack_alloc_ui(gene_ct, &gene_nameidx_to_chridx) ||
+      bigstack_alloc_c(gene_ct * max_gene_name_len, &loadbuf)) {
     goto gene_report_ret_NOMEM;
   }
   for (gene_idx = 0; gene_idx < gene_ct; gene_idx++) {
@@ -3354,23 +3285,23 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
   for (gene_idx = 0; gene_idx < gene_ct; gene_idx++) {
     gene_chridx_to_nameidx[gene_nameidx_to_chridx[gene_idx]] = gene_idx;
   }
-  wkspace_reset((unsigned char*)loadbuf);
+  bigstack_reset((unsigned char*)loadbuf);
 
-  if (wkspace_left < MAXLINELEN + 64) {
+  linebuf_left = bigstack_left();
+  if (linebuf_left < MAXLINELEN + 64) {
     goto gene_report_ret_NOMEM;
   }
-  // mirror wkspace_base/wkspace_base since we'll be doing nonstandard-size
-  // allocations
-  linebuf_top = (char*)wkspace_base;
-  linebuf_left = wkspace_left;
-  gene_match_list_end = (uint64_t*)(&(wkspace_base[wkspace_left]));
+  // mirror g_bigstack_base/g_bigstack_end since we'll be doing
+  // nonstandard-size allocations
+  linebuf_top = (char*)g_bigstack_base;
+  gene_match_list_end = (uint64_t*)g_bigstack_end;
   gene_match_list = gene_match_list_end;
 
   header_ptr = linebuf_top;
   loadbuf = memcpya(header_ptr, "kb ) ", 5);
   if (border) {
     loadbuf = memcpya(loadbuf, " plus ", 6);
-    loadbuf = double_g_write(loadbuf, ((double)((int32_t)border)) * 0.001);
+    loadbuf = dtoa_g(((double)((int32_t)border)) * 0.001, loadbuf);
     loadbuf = memcpya(loadbuf, "kb border ", 10);
   }
   loadbuf = memcpya(loadbuf, "\n\n        DIST ", 15);
@@ -3414,7 +3345,7 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
       if (ujj != 4) {
 	if ((found_header_bitfield >> ujj) & 1) {
 	  *bufptr2 = '\0';
-	  sprintf(logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, fname);
+	  sprintf(g_logbuf, "Error: Duplicate column header '%s' in %s.\n", bufptr, fname);
 	  goto gene_report_ret_INVALID_FORMAT_WW;
 	}
 	found_header_bitfield |= 1 << ujj;
@@ -3430,7 +3361,7 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
     col_idx++;
   } while (!is_eoln_kns(*bufptr));
   if (seq_idx != token_ct) {
-    sprintf(logbuf, "Error: Missing column header%s in %s.\n", (seq_idx + 1 == token_ct)? "" : "s", fname);
+    sprintf(g_logbuf, "Error: Missing column header%s in %s.\n", (seq_idx + 1 == token_ct)? "" : "s", fname);
     goto gene_report_ret_INVALID_FORMAT_WW;
   }
   // assume *bufptr is now \n (if it isn't, header line is never written to
@@ -3532,7 +3463,7 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
     linebuf_top = &(linebuf_top[slen + 8]);
 #ifdef __LP64__
     if (saved_line_ct == 0x100000000LLU) {
-      sprintf(logbuf, "Error: Too many valid lines in %s (--gene-report can only handle 4294967296).\n", fname);
+      sprintf(g_logbuf, "Error: Too many valid lines in %s (--gene-report can only handle 4294967296).\n", fname);
       goto gene_report_ret_INVALID_FORMAT_WW;
     }
 #endif
@@ -3559,7 +3490,7 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
 #endif
 
   memcpy(outname_end, ".range.report", 14);
-  if (fopen_checked(&outfile, outname, "w")) {
+  if (fopen_checked(outname, "w", &outfile)) {
     goto gene_report_ret_OPEN_FAIL;
   }
   ulii = ~ZEROLU; // current gene index
@@ -3588,16 +3519,16 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
 	  fputs(", ", outfile);
 	}
         cur_bp = *uiptr++;
-        bufptr = uint32_write(tbuf, cur_bp);
+        bufptr = uint32toa(cur_bp, g_textbuf);
 	bufptr = memcpya(bufptr, "..", 2);
         uii = *uiptr++;
-        bufptr = uint32_write(bufptr, uii - 1);
-        fwrite(tbuf, 1, bufptr - tbuf, outfile);
+        bufptr = uint32toa(uii - 1, bufptr);
+        fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
 	ujj += uii - cur_bp;
       }
-      bufptr = memcpyl3a(tbuf, " ( ");
-      bufptr = double_g_write(bufptr, ((double)((int32_t)ujj)) * 0.001);
-      fwrite(tbuf, 1, bufptr - tbuf, outfile);
+      bufptr = memcpyl3a(g_textbuf, " ( ");
+      bufptr = dtoa_g(((double)((int32_t)ujj)) * 0.001, bufptr);
+      fwrite(g_textbuf, 1, bufptr - g_textbuf, outfile);
       if (fwrite_checked(header_ptr, header_len, outfile)) {
 	goto gene_report_ret_WRITE_FAIL;
       }
@@ -3606,9 +3537,9 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
     bufptr = line_lookup[(uint32_t)ullii];
     uii = *((uint32_t*)bufptr); // line length
     ujj = ((uint32_t*)bufptr)[1]; // bp
-    bufptr2 = double_g_writewx4(tbuf, ((double)((int32_t)(ujj - cur_bp))) * 0.001, 10);
+    bufptr2 = dtoa_g_wxp4(((double)((int32_t)(ujj - cur_bp))) * 0.001, 10, g_textbuf);
     bufptr2 = memcpyl3a(bufptr2, "kb ");
-    fwrite(tbuf, 1, bufptr2 - tbuf, outfile);
+    fwrite(g_textbuf, 1, bufptr2 - g_textbuf, outfile);
     fwrite(&(bufptr[8]), 1, uii, outfile);
   }
   if (ulii != ~ZEROLU) {
@@ -3625,8 +3556,6 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
       retval = RET_INVALID_FORMAT;
       break;
     }
-  gene_report_ret_NOMEM2:
-    wkspace_left += topsize;
   gene_report_ret_NOMEM:
     retval = RET_NOMEM;
     break;
@@ -3640,14 +3569,14 @@ int32_t gene_report(char* fname, char* glist, char* subset_fname, uint32_t borde
     retval = RET_WRITE_FAIL;
     break;
   gene_report_ret_INVALID_FORMAT_WW:
-    wordwrap(logbuf, 0);
+    wordwrapb(0);
     logerrprintb();
   gene_report_ret_INVALID_FORMAT:
     retval = RET_INVALID_FORMAT;
     break;
   }
  gene_report_ret_1:
-  wkspace_reset(wkspace_mark);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
   fclose_cond(infile);
   fclose_cond(outfile);
   return retval;
diff --git a/plink_stats.c b/plink_stats.c
index 60fea0a..bef8785 100644
--- a/plink_stats.c
+++ b/plink_stats.c
@@ -1947,7 +1947,11 @@ double chi23_eval(intptr_t m11, intptr_t m12, intptr_t row1_sum, intptr_t col1_s
   chisq += delta * delta / expect;
   expect = dxx * col3_sumd;
   delta = (col3_sum - m13) - expect;
-  return chisq + (delta * delta / expect);
+  chisq += delta * delta / expect;
+  if (chisq < (SMALL_EPSILON * SMALL_EPSILON)) {
+    chisq = 0;
+  }
+  return chisq;
 }
 
 void chi23_evalx(intptr_t m11, intptr_t m12, intptr_t m13, intptr_t m21, intptr_t m22, intptr_t m23, double* chip, uint32_t* dfp) {
@@ -2013,7 +2017,11 @@ void chi23_evalx(intptr_t m11, intptr_t m12, intptr_t m13, intptr_t m21, intptr_
   chisq += delta * delta / expect;
   expect = dxx * col3_sumd;
   delta = m23 - expect;
-  *chip = chisq + (delta * delta / expect);
+  chisq += delta * delta / expect;
+  if (chisq < (SMALL_EPSILON * SMALL_EPSILON)) {
+    chisq = 0;
+  }
+  *chip = chisq;
   *dfp = 2;
 }
 
diff --git a/yarn.c b/yarn.c
index 955903e..0724714 100644
--- a/yarn.c
+++ b/yarn.c
@@ -3,7 +3,6 @@
  * Version 1.3  13 Jan 2012  Mark Adler
  * For conditions of distribution and use, see copyright notice in yarn.h
  */
-
 #ifndef _WIN32
 
 /* Basic thread operations implemented using the POSIX pthread library.  All

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git



More information about the debian-med-commit mailing list