[med-svn] [plink2] 01/03: Imported Upstream version 1.90~b3-150112

Dylan Aïssi bob.dybian-guest at moszumanska.debian.org
Tue Jan 13 06:02:01 UTC 2015


This is an automated email from the git hooks/post-receive script.

bob.dybian-guest pushed a commit to branch master
in repository plink2.

commit 2feb514f4359c3177e7b1e13c9184f697bdae21a
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date:   Tue Jan 13 06:58:51 2015 +0100

    Imported Upstream version 1.90~b3-150112
---
 SFMT.c         |    5 +
 plink.c        |   77 +--
 plink_common.c |   57 ++
 plink_common.h |    7 +
 plink_data.c   |  173 +++--
 plink_glm.c    |   50 +-
 plink_glm.h    |    6 +
 plink_help.c   |   34 +-
 plink_ld.c     | 1912 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 plink_ld.h     |    2 +-
 plink_matrix.c |    2 +-
 plink_misc.c   |  110 +++-
 12 files changed, 2204 insertions(+), 231 deletions(-)

diff --git a/SFMT.c b/SFMT.c
index 21ccb96..b79f2b2 100644
--- a/SFMT.c
+++ b/SFMT.c
@@ -47,8 +47,11 @@ extern "C" {
 #include <string.h>
 #include <assert.h>
 #include "SFMT.h"
+
+#ifndef __LP64__
 inline static void do_recursion(w128_t * r, w128_t * a, w128_t * b,
 				w128_t * c, w128_t * d);
+#endif
 
 inline static void rshift128(w128_t *out,  w128_t const *in, int shift);
 inline static void lshift128(w128_t *out,  w128_t const *in, int shift);
@@ -107,6 +110,7 @@ inline static void lshift128(w128_t *out, w128_t const *in, int shift)
  * @param c a 128-bit part of the internal state array
  * @param d a 128-bit part of the internal state array
  */
+#ifndef __LP64__
 inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
 				w128_t *c, w128_t *d)
 {
@@ -124,6 +128,7 @@ inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
     r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SFMT_SR1) & SFMT_MSK4)
 	^ y.u[3] ^ (d->u[3] << SFMT_SL1);
 }
+#endif
 
 /**
  * parameters used by sse2.
diff --git a/plink.c b/plink.c
index 6852cd9..ca3e837 100644
--- a/plink.c
+++ b/plink.c
@@ -1,5 +1,5 @@
 // PLINK 1.90
-// Copyright (C) 2005-2014 Shaun Purcell, Christopher Chang
+// Copyright (C) 2005-2015 Shaun Purcell, Christopher Chang
 
 // This program is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -86,9 +86,9 @@
 
 const char ver_str[] =
 #ifdef STABLE_BUILD
-  "PLINK v1.90b2t"
+  "PLINK v1.90b3a"
 #else
-  "PLINK v1.90b3p"
+  "PLINK v1.90p"
 #endif
 #ifdef NOLAPACK
   "NL"
@@ -99,28 +99,30 @@ const char ver_str[] =
   " 32-bit"
 #endif
   // include trailing space if day < 10, so character length stays the same
-  " (20 Dec 2014)";
+  " (12 Jan 2015)";
 const char ver_str2[] =
 #ifdef STABLE_BUILD
-  // " " // (don't want this when version number has a trailing letter)
+  "" // (don't want this when version number has a trailing letter)
+#else
+  "  " // (don't want this when version number has e.g. "b3" before "p")
 #endif
 #ifndef NOLAPACK
   "  "
 #endif
   "      https://www.cog-genomics.org/plink2\n"
-  "(C) 2005-2014 Shaun Purcell, Christopher Chang   GNU General Public License v3\n";
+  "(C) 2005-2015 Shaun Purcell, Christopher Chang   GNU General Public License v3\n";
 const char errstr_append[] = "For more information, try '" PROG_NAME_STR " --help [flag name]' or '" PROG_NAME_STR " --help | more'.\n";
 #ifdef STABLE_BUILD
   #ifndef NOLAPACK
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --freqx, --missing, --test-mishap, --hardy, --mendel, --ibc,\n--impute-sex, --indep-pairphase, --r2, --blocks, --distance, --genome,\n--homozyg, --make-rel, --make-grm-gz, --rel-cutoff, --cluster, --pca,\n--neighbour, --ibs-test, --regress-distance, --model, --bd, --gxe, --logistic,\n--dosage, --lasso, --test-missing, --make-perm-pheno, --tdt, --qfam,\n--annotate, --clum [...]
+const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,\n--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,\n--make-perm-phen [...]
   #else
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --freqx, --missing, --test-mishap, --hardy, --mendel, --ibc,\n--impute-sex, --indep-pairphase, --r2, --blocks, --distance, --genome,\n--homozyg, --make-rel, --make-grm-gz, --rel-cutoff, --cluster, --neighbour,\n--ibs-test, --regress-distance, --model, --bd, --gxe, --logistic, --dosage,\n--lasso, --test-missing, --make-perm-pheno, --tdt, --qfam, --annotate, --clump,\n--ge [...]
+const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --neighbour, --ibs-test, --regress-distance, --model,\n--bd, --gxe, --logistic, --dosage, --lasso, --test-missing, --make-perm-pheno,\n--td [...]
   #endif
 #else
   #ifndef NOLAPACK
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,\n--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,\n--make-perm-phen [...]
+const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,\n--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,\n--make-perm-phen [...]
   #else
-const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --neighbour, --ibs-test, --regress-distance, --model,\n--bd, --gxe, --logistic, --dosage, --lasso, --test-missing, --make-perm-pheno,\n--td [...]
+const char notestr_null_calc2[] = "Commands include --make-bed, --recode, --flip-scan, --merge-list,\n--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,\n--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,\n--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,\n--rel-cutoff, --cluster, --neighbour, --ibs-test, --regress-distance, --model,\n--bd, --gxe, --logistic, --dosage, --lasso, --test-missing, --make-perm-pheno,\n--td [...]
   #endif
 #endif
 
@@ -1097,7 +1099,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     }
   }
   if (g_thread_ct > 1) {
-    if ((calculation_type & (CALC_RELATIONSHIP | CALC_REL_CUTOFF | CALC_GDISTANCE_MASK | CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE | CALC_GENOME | CALC_REGRESS_REL | CALC_UNRELATED_HERITABILITY | CALC_LD | CALC_PCA | CALC_MAKE_PERM_PHENO | CALC_QFAM)) || ((calculation_type & CALC_MODEL) && (model_modifier & (MODEL_PERM | MODEL_MPERM))) || ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_PERM | GLM_MPERM))) || ((calculation_type & CALC_TESTMISS) && (testmiss_modifier &  [...]
+    if ((calculation_type & (CALC_RELATIONSHIP | CALC_REL_CUTOFF | CALC_GDISTANCE_MASK | CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE | CALC_GENOME | CALC_REGRESS_REL | CALC_UNRELATED_HERITABILITY | CALC_LD | CALC_PCA | CALC_MAKE_PERM_PHENO | CALC_QFAM)) || ((calculation_type & CALC_MODEL) && (model_modifier & (MODEL_PERM | MODEL_MPERM))) || ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_PERM | GLM_MPERM))) || ((calculation_type & CALC_TESTMISS) && (testmiss_modifier &  [...]
       LOGPRINTF("Using up to %u threads (change this with --threads).\n", g_thread_ct);
     } else {
       logprint("Using 1 thread (no multithreaded calculations invoked).\n");
@@ -1823,7 +1825,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       logprint("Error: --fast-epistasis case-only requires a sorted .bim.  Retry this command\nafter using --make-bed to sort your data.\n");
       goto plink_ret_INVALID_CMDLINE;
     }
-    retval = epistasis_report(threads, epi_ip, bedfile, bed_offset, marker_ct, unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ids, max_marker_id_len, marker_pos, plink_maxsnp, chrom_info_ptr, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, pheno_ctrl_ct, pheno_c, pheno_d, parallel_idx, parallel_tot, outname, outname_end, output_min_p, sip);
+    retval = epistasis_report(threads, epi_ip, bedfile, bed_offset, marker_ct, unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ids, max_marker_id_len, marker_pos, plink_maxsnp, chrom_info_ptr, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, pheno_ctrl_ct, pheno_c, pheno_d, parallel_idx, parallel_tot, outname, outname_end, output_min_p, glm_vif_thresh, sip);
     if (retval) {
       goto plink_ret_1;
     }
@@ -7812,11 +7814,9 @@ int32_t main(int32_t argc, char** argv) {
 	}
         calculation_type |= CALC_EPI;
       } else if (!memcmp(argptr2, "ist-all", 8)) {
-	UNSTABLE;
 	ld_info.modifier |= LD_SHOW_TAGS_LIST_ALL;
         goto main_param_zero;
       } else if (!memcmp(argptr2, "ist-duplicate-vars", 19)) {
-	UNSTABLE;
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 3)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -8884,7 +8884,6 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	calculation_type |= CALC_MAKE_PERM_PHENO;
       } else if (!memcmp(argptr2, "eta-analysis", 13)) {
-	UNSTABLE;
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 2, 0x1fffffff)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -9239,7 +9238,6 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_NOMEM;
 	}
       } else if (!memcmp(argptr2, "utput-min-p", 12)) {
-	UNSTABLE;
         if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -10410,6 +10408,13 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE;
 	}
 	calculation_type |= CALC_LD;
+      } else if (!memcmp(argptr2, "eal-ref-alleles", 16)) {
+	if (load_rare & (LOAD_RARE_CNV | LOAD_RARE_DOSAGE)) {
+	  sprintf(logbuf, "Error: --real-ref-alleles has no effect with %s.\n", (load_rare == LOAD_RARE_CNV)? "a .cnv fileset" : "--dosage");
+	  goto main_ret_INVALID_CMDLINE_2A;
+	}
+        misc_flags |= MISC_REAL_REF_ALLELES | MISC_KEEP_ALLELE_ORDER;
+        goto main_param_zero;
       } else if (!memcmp(argptr2, "ange", 5)) {
         if (extractname) {
 	  misc_flags |= MISC_EXTRACT_RANGE;
@@ -10693,17 +10698,18 @@ int32_t main(int32_t argc, char** argv) {
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "ex", 3)) {
 	if (load_rare == LOAD_RARE_DOSAGE) {
-	  logprint("Error: --dosage + --sex did not work properly in PLINK 1.07 (--sex had no\neffect).  If you have used that flag combination in the past, we recommend\nthat you rerun your analysis with PLINK 1.9 '--dosage ... sex'.\n");
-	  goto main_ret_INVALID_CMDLINE_A;
-	} else if (!(calculation_type & CALC_GLM)) {
-	  logprint("Error: --sex must be used with --linear or --logistic.\n");
-	  goto main_ret_INVALID_CMDLINE_A;
-	} else if (glm_modifier & GLM_NO_X_SEX) {
-	  sprintf(logbuf, "Error: --sex conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
-	  goto main_ret_INVALID_CMDLINE_2A;
+	  dosage_info.modifier |= DOSAGE_SEX;
+	} else {
+	  if (!(calculation_type & CALC_GLM)) {
+	    logprint("Error: --sex must be used with --linear/--logistic/--dosage.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  } else if (glm_modifier & GLM_NO_X_SEX) {
+	    sprintf(logbuf, "Error: --sex conflicts with a --%s modifier.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear");
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  logprint("Note: --sex flag deprecated.  Use e.g. '--linear sex'.\n");
+	  glm_modifier |= GLM_SEX;
 	}
-	logprint("Note: --sex flag deprecated.  Use e.g. '--linear sex'.\n");
-	glm_modifier |= GLM_SEX;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "tandard-beta", 13)) {
 	if (((!(calculation_type & CALC_GLM)) || (glm_modifier & GLM_LOGISTIC)) && (!(dosage_info.modifier & DOSAGE_GLM))) {
@@ -11757,19 +11763,19 @@ int32_t main(int32_t argc, char** argv) {
 
     case 'v':
       if (!memcmp(argptr2, "if", 3)) {
-	if ((!(calculation_type & CALC_GLM)) || (glm_modifier & GLM_LOGISTIC)) {
-	  logprint("Error: --vif must be used with --linear.\n");
+	if (((!(calculation_type & CALC_GLM)) || (glm_modifier & GLM_LOGISTIC)) && (!((calculation_type & CALC_EPI) || (!(epi_info.modifier & EPI_REG))))) {
+	  logprint("Error: --vif must be used with --linear/--epistasis.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (scan_double(argv[cur_arg + 1], &glm_vif_thresh)) {
-	  sprintf(logbuf, "Error: Invalid --linear/--logistic VIF threshold '%s'.\n", argv[cur_arg + 1]);
+	  sprintf(logbuf, "Error: Invalid --linear/--epistasis VIF threshold '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	if (glm_vif_thresh < 1.0) {
-	  sprintf(logbuf, "Error: --linear/--logistic VIF threshold '%s' too small (must be >= 1).\n", argv[cur_arg + 1]);
+	  sprintf(logbuf, "Error: --linear/--epistasis VIF threshold '%s' too small (must be >= 1).\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "egas", 5)) {
@@ -11821,7 +11827,6 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	misc_flags |= MISC_VCF_FILTER;
       } else if (!memcmp(argptr2, "cf-min-gp", 10)) {
-	UNSTABLE;
 	if (!(load_rare & LOAD_RARE_VCF)) {
 	  // er, probably want to support BCF too
 	  logprint("Error: --vcf-min-gp must be used with --vcf.\n");
@@ -11836,7 +11841,6 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	vcf_min_gp *= 1 - SMALL_EPSILON;
       } else if (!memcmp(argptr2, "cf-min-gq", 10)) {
-	UNSTABLE;
 	if (!(load_rare & LOAD_RARE_VCF)) {
 	  // probably want to support BCF too
 	  logprint("Error: --vcf-min-gq must be used with --vcf.\n");
@@ -11898,7 +11902,6 @@ int32_t main(int32_t argc, char** argv) {
 	calculation_type |= CALC_WRITE_SNPLIST;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "rite-var-ranges", 16)) {
-	UNSTABLE;
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -12167,6 +12170,10 @@ int32_t main(int32_t argc, char** argv) {
       goto main_ret_INVALID_CMDLINE_A;
     }
   }
+  if (sample_sort && (calculation_type & (~(CALC_MERGE | CALC_MAKE_BED)))) {
+    logprint("Error: --indiv-sort only affects --make-bed and --merge/--bmerge/--merge-list.\n");
+    goto main_ret_INVALID_CMDLINE_A;
+  }
   if ((cnv_intersect_filter_type & CNV_COUNT) && (!(cnv_calc_type & (CNV_SAMPLE_PERM | CNV_ENRICHMENT_TEST)))) {
     logprint("Error: --cnv-count must be used with --cnv-indiv-perm or --cnv-enrichment-test.\n");
     goto main_ret_INVALID_CMDLINE;
@@ -12219,8 +12226,8 @@ int32_t main(int32_t argc, char** argv) {
     }
   }
   if ((parallel_tot > 1) && (!(calculation_type & (CALC_LD | CALC_DISTANCE | CALC_GENOME | CALC_RELATIONSHIP)))) {
-    if ((!(calculation_type & CALC_EPI)) || (!(epi_info.modifier & EPI_FAST))) {
-      logprint("Error: --parallel only affects --r/--r2, --distance, --genome, --make-rel,\n--make-grm-gz/--make-grm-bin, and --fast-epistasis.\n");
+    if ((!(calculation_type & CALC_EPI)) || (!(epi_info.modifier & (EPI_FAST | EPI_REG)))) {
+      logprint("Error: --parallel only affects --r/--r2, --distance, --genome, --make-rel,\n--make-grm-gz/--make-grm-bin, and --epistasis/--fast-epistasis.\n");
       goto main_ret_INVALID_CMDLINE_A;
     }
   }
diff --git a/plink_common.c b/plink_common.c
index adc0a7f..0ee85cd 100644
--- a/plink_common.c
+++ b/plink_common.c
@@ -4325,6 +4325,11 @@ int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int3
     *chrom_idx_ptr = (int32_t)(chrom_idx + max_code_p1);
     return 0;
   }
+  if (*bufptr == '#') {
+    // this breaks VCF and PLINK 2 binary
+    logprint("Error: Chromosome/contig names may not begin with '#'.\n");
+    return RET_INVALID_FORMAT;
+  }
   if (slen > MAX_ID_LEN) {
     if (line_idx) {
       LOGPRINTFWW("Error: Line %" PRIuPTR " of %s has an excessively long chromosome/contig name.  (The " PROG_NAME_CAPS " limit is " MAX_ID_LEN_STR " characters.)\n", line_idx, file_descrip);
@@ -5281,7 +5286,17 @@ void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct) {
     *bit_arr++ ^= *xor_arr++;
   } while (bit_arr < bit_arr_end);
 #endif
+}
 
+uint32_t is_monomorphic_a2(uintptr_t* lptr, uint32_t sample_ct) {
+  uintptr_t* loop_end = &(lptr[sample_ct / BITCT2]);
+  uint32_t sample_rem = sample_ct % BITCT2;
+  for (; lptr < loop_end; lptr++) {
+    if ((~(*lptr)) & FIVEMASK) {
+      return 0;
+    }
+  }
+  return (sample_rem && ((~(*lptr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
 }
 
 uint32_t is_monomorphic(uintptr_t* lptr, uint32_t sample_ct) {
@@ -8337,6 +8352,48 @@ void vec_datamask(uintptr_t unfiltered_sample_ct, uint32_t matchval, uintptr_t*
   }
 }
 
+/*
+void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
+#ifdef __LP64__
+  const __m128i m1 = {FIVEMASK, FIVEMASK};
+  __m128i* vptr = (__m128i*)lptr;
+  __m128i* vend = (__m128i*)(&(lptr[word_ct]));
+  __m128i vii;
+  __m128i vjj;
+  do {
+    // new high bit set iff old low bit was set
+    // new low bit set iff old bits differed 
+    vii = *vptr;
+    vjj = _mm_and_si128(vii, m1); // old low bit
+    vii = _mm_and_si128(_mm_srli_epi64(vii, 1), m1); // old high bit, shifted
+    *vptr = _mm_or_si128(_mm_slli_epi64(vjj, 1), _mm_xor_si128(vii, vjj));
+  } while (++vptr != vend);
+#else
+  uintptr_t* lend = &(lptr[word_ct]);
+  uintptr_t ulii;
+  uintptr_t uljj;
+  do {
+    ulii = *lptr;
+    uljj = ulii & FIVEMASK;
+    ulii = (ulii >> 1) & FIVEMASK;
+    *lptr = ulii ^ (uljj * 3);
+  } while (++lptr != lend);
+#endif
+}
+*/
+
+void rotate_plink1_to_plink2_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct) {
+  uintptr_t* loadbuf_end = &(loadbuf[word_ct]);
+  uintptr_t ulii;
+  uintptr_t uljj;
+  do {
+    ulii = *loadbuf++;
+    uljj = ulii & FIVEMASK;
+    ulii = (ulii >> 1) & FIVEMASK;
+    *writebuf++ = ulii ^ (uljj * 3);
+  } while (loadbuf < loadbuf_end);
+}
+
 void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include2, uintptr_t sample_ct, uintptr_t* missing_bitfield) {
   uint32_t word_ct = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
   uintptr_t sample_idx;
diff --git a/plink_common.h b/plink_common.h
index a4751cf..137e872 100644
--- a/plink_common.h
+++ b/plink_common.h
@@ -220,6 +220,7 @@
 #define MISC_HET_SMALL_SAMPLE 0x100000000LLU
 #define MISC_FST_CC 0x200000000LLU
 #define MISC_SPLIT_MERGE_NOFAIL 0x400000000LLU
+#define MISC_REAL_REF_ALLELES 0x800000000LLU
 
 // assume for now that .bed must always be accompanied by both .bim and .fam
 #define FILTER_ALL_REQ 1LLU
@@ -1987,6 +1988,8 @@ static inline uint32_t popcount_long(uintptr_t val) {
   return popcount2_long(val - ((val >> 1) & FIVEMASK));
 }
 
+uint32_t is_monomorphic_a2(uintptr_t* lptr, uint32_t sample_ct);
+
 uint32_t is_monomorphic(uintptr_t* lptr, uint32_t sample_ct);
 
 // same as is_monomorphic, except it also flags the all-heterozygote case
@@ -2153,6 +2156,10 @@ void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2);
 
 void vec_datamask(uintptr_t unfiltered_sample_ct, uint32_t matchval, uintptr_t* data_ptr, uintptr_t* mask_ptr, uintptr_t* result_ptr);
 
+// void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct);
+
+void rotate_plink1_to_plink2_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct);
+
 void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include2, uintptr_t sample_ct, uintptr_t* missing_bitfield);
 
 void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include2, uintptr_t unfiltered_sample_ct);
diff --git a/plink_data.c b/plink_data.c
index 0c72017..228a7d7 100644
--- a/plink_data.c
+++ b/plink_data.c
@@ -1523,7 +1523,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
   //   if a single covariate is missing for a person, that person's covar_nm
   //   bit is zero.
   // * covar_d is in sample-major order (i.e. the value of covariate m for
-  //   sample n is covar_d[n * covar_ct + m], where m and n are zero-based).
+  //   sample n is covar_d[n * covar_ctx + m], where m and n are zero-based).
   //   It does track when some covariates are missing and others aren't
   //   (missing covariates are represented as the --missing-phenotype value).
   if (covar_range_list_ptr) {
@@ -1635,7 +1635,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
       goto load_covars_ret_MISSING_TOKENS;
     }
     if (covar_range_list_ptr) {
-      dptr = &(covar_d[sample_idx * covar_ct]);
+      dptr = &(covar_d[sample_idx * covar_ctx]);
     }
     covar_missing = 0;
     for (uii = 0; uii < min_covar_col_ct; uii++) {
@@ -3325,11 +3325,18 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
   uintptr_t* writebuf_ptr;
   uint32_t* map_reverse;
   const char* errptr;
+  uintptr_t pass_size;
   uint32_t is_haploid;
   uint32_t is_x;
   uint32_t is_y;
   uint32_t is_mt;
   uint32_t loop_end;
+  uint32_t pass_ct;
+  uint32_t pass_idx;
+  uint32_t pass_start;
+  uint32_t pass_end;
+  uint32_t seek_needed;
+  uint32_t markers_done;
   if (flip_subset_fname) {
     if (wkspace_alloc_ul_checked(&flip_subset_markers, unfiltered_marker_ctl * sizeof(intptr_t)) ||
         wkspace_alloc_ul_checked(&flip_subset_vec2, sample_ctv2 * sizeof(intptr_t))) {
@@ -3367,9 +3374,6 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       goto make_bed_ret_WRITE_FAIL;
     }
     fflush(stdout);
-    if (fseeko(bedfile, bed_offset, SEEK_SET)) {
-      goto make_bed_ret_READ_FAIL;
-    }
     if (resort_map) {
       if (set_hh_missing || set_me_missing || fill_missing_a2) {
 	// could remove this restriction if we added a chromosome check to the
@@ -3431,25 +3435,43 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       }
       wkspace_reset(ll_buf);
 
-      if (wkspace_alloc_ul_checked(&writebuf, marker_ct * sample_ctv2)) {
-	// todo: implement multipass.  should now be a minimal change
-	logprint("\nError: Insufficient memory for current --make-bed implementation.  Try raising\nthe --memory value for now.\n");
-	retval = RET_CALC_NOT_YET_SUPPORTED;
-	goto make_bed_ret_1;
+      // oops, forgot to multiply by sizeof(intptr_t)!  fortunately, this
+      // segfaulted instead of corrupting any data.
+      // anyway, it's now time to implement multipass.
+      if (wkspace_left < sample_ctv2 * sizeof(intptr_t)) {
+        goto make_bed_ret_NOMEM;
       }
+      writebuf = (uintptr_t*)wkspace_base;
+      pass_ct = 1 + ((sample_ctv2 * marker_ct * sizeof(intptr_t) - 1) / wkspace_left);
+      pass_size = 1 + ((marker_ct - 1) / pass_ct);
       *outname_end = '\0';
       LOGPRINTFWW5("--make-bed to %s.bed + %s.bim + %s.fam ... ", outname, outname, outname);
       fputs("0%", stdout);
-      for (; pct <= 100; pct++) {
-	loop_end = (pct * ((uint64_t)marker_ct)) / 100;
-	for (; marker_idx < loop_end; marker_uidx++, marker_idx++) {
+      loop_end = marker_ct / 100;
+      markers_done = 0;
+      for (pass_idx = 0; pass_idx < pass_ct; pass_idx++) {
+        pass_start = pass_idx * pass_size;
+	pass_end = (pass_idx + 1) * pass_size;
+	if (pass_idx + 1 == pass_ct) {
+	  pass_end = marker_ct;
+	}
+	seek_needed = 1;
+	for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
 	  if (IS_SET(marker_exclude, marker_uidx)) {
 	    marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
+	    seek_needed = 1;
+	  }
+	  if ((map_reverse[marker_uidx] < pass_start) || (map_reverse[marker_uidx] >= pass_end)) {
+	    seek_needed = 1;
+	    continue;
+	  }
+	  writebuf_ptr = &(writebuf[sample_ctv2 * (map_reverse[marker_uidx] - pass_start)]);
+	  if (seek_needed) {
 	    if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
 	      goto make_bed_ret_READ_FAIL;
 	    }
+	    seek_needed = 0;
 	  }
-	  writebuf_ptr = &(writebuf[sample_ctv2 * map_reverse[marker_uidx]]);
 	  retval = make_bed_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, IS_SET(marker_reverse, marker_uidx), writebuf_ptr);
 	  if (retval) {
 	    goto make_bed_ret_1;
@@ -3460,20 +3482,25 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
 	  if (flip_subset_markers && is_set(flip_subset_markers, marker_uidx)) {
 	    reverse_subset(writebuf_ptr, flip_subset_vec2, sample_ctv2);
 	  }
-	}
-	if (pct < 100) {
-	  if (pct > 10) {
-	    putchar('\b');
+	  if (markers_done >= loop_end) {
+	    if (pct > 10) {
+	      putchar('\b');
+	    }
+	    pct = (markers_done * 100LLU) / marker_ct;
+	    printf("\b\b%u%%", pct);
+	    fflush(stdout);
+	    pct++;
+	    loop_end = (pct * ((uint64_t)marker_ct)) / 100;
 	  }
-	  printf("\b\b%u%%", pct);
-	  fflush(stdout);
+	  markers_done++;
 	}
-      }
-      for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
-	if (fwrite_checked(writebuf, sample_ct4, bedoutfile)) {
-	  goto make_bed_ret_WRITE_FAIL;
+	writebuf_ptr = writebuf;
+	for (marker_idx = pass_start; marker_idx < pass_end; marker_idx++) {
+	  if (fwrite_checked(writebuf_ptr, sample_ct4, bedoutfile)) {
+	    goto make_bed_ret_WRITE_FAIL;
+	  }
+	  writebuf_ptr = &(writebuf_ptr[sample_ctv2]);
 	}
-	writebuf = &(writebuf[sample_ctv2]);
       }
     } else {
       if (!hh_exists) {
@@ -3508,6 +3535,9 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
       if (wkspace_alloc_ul_checked(&writebuf, sample_ctv2)) {
 	goto make_bed_ret_NOMEM;
       }
+      if (fseeko(bedfile, bed_offset, SEEK_SET)) {
+	goto make_bed_ret_READ_FAIL;
+      }
       *outname_end = '\0';
       LOGPRINTFWW5("--make-bed to %s.bed + %s.bim + %s.fam ... ", outname, outname, outname);
       fputs("0%", stdout);
@@ -7836,12 +7866,12 @@ uint32_t vcf_gp_invalid(char* bufptr, char* bufptr2, double vcf_min_gp, uint32_t
   for (uii = 0; uii < gp_field_pos; uii++) {
     bufptr = (char*)memchr(bufptr, ':', (uintptr_t)(bufptr2 - bufptr));
     if (!bufptr) {
-      *is_error_ptr = 1;
-      return 1;
+      *is_error_ptr = 0;
+      return 0;
     }
     bufptr++;
   }
-  // hmm, defend against decimal without leading zero
+  // do not treat decimal without leading zero as missing value
   if (((*bufptr == '.') || (*bufptr == '?')) && (bufptr[1] < '.')) {
     *is_error_ptr = 0;
     return 0;
@@ -8254,7 +8284,8 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      for (ujj = 0; ujj < gq_field_pos; ujj++) {
 		gq_scan_ptr = (char*)memchr(gq_scan_ptr, ':', (uintptr_t)(bufptr2 - gq_scan_ptr));
 		if (!gq_scan_ptr) {
-		  goto vcf_to_bed_ret_MISSING_GQ;
+		  // non-GT fields are allowed to be missing
+		  goto vcf_to_bed_missing_gq_1;
 		}
 		gq_scan_ptr++;
 	      }
@@ -8262,6 +8293,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 		continue;
 	      }
 	    }
+	  vcf_to_bed_missing_gq_1:
 	    cc = bufptr[1];
 	    if ((cc != '/') && (cc != '|')) {
 	      // haploid
@@ -8269,7 +8301,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      if (gp_field_pos) {
 		if (vcf_gp_invalid(bufptr, bufptr2, vcf_min_gp, gp_field_pos, uii, &ukk)) {
 		  if (ukk) {
-		    goto vcf_to_bed_ret_MISSING_GP;
+		    goto vcf_to_bed_ret_INVALID_GP;
 		  }
 		  continue;
 		}
@@ -8295,7 +8327,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 		  if (gp_field_pos) {
 		    if (vcf_gp_diploid_invalid(bufptr, bufptr2, vcf_min_gp, gp_field_pos, uii, ujj, &ukk)) {
 		      if (ukk) {
-			goto vcf_to_bed_ret_MISSING_GP;
+			goto vcf_to_bed_ret_INVALID_GP;
 		      }
 		      continue;
 		    }
@@ -8347,7 +8379,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	    for (ujj = 0; ujj < gq_field_pos; ujj++) {
 	      gq_scan_ptr = (char*)memchr(gq_scan_ptr, ':', (uintptr_t)(bufptr2 - gq_scan_ptr));
               if (!gq_scan_ptr) {
-		goto vcf_to_bed_ret_MISSING_GQ;
+                goto vcf_to_bed_missing_gq_2;
 	      }
 	      gq_scan_ptr++;
 	    }
@@ -8355,13 +8387,14 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      continue;
 	    }
 	  }
+	vcf_to_bed_missing_gq_2:
 	  cc = bufptr[1];
 	  if ((cc != '/') && (cc != '|')) {
 	  vcf_to_bed_haploid_2:
 	    if (gp_field_pos) {
 	      if (vcf_gp_invalid(bufptr, bufptr2, vcf_min_gp, gp_field_pos, uii, &ukk)) {
 		if (ukk) {
-		  goto vcf_to_bed_ret_MISSING_GP;
+		  goto vcf_to_bed_ret_INVALID_GP;
 		}
 	        continue;
 	      }
@@ -8389,7 +8422,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      if (gp_field_pos) {
 		if (vcf_gp_diploid_invalid(bufptr, bufptr2, vcf_min_gp, gp_field_pos, uii, ujj, &ukk)) {
 		  if (ukk) {
-		    goto vcf_to_bed_ret_MISSING_GP;
+		    goto vcf_to_bed_ret_INVALID_GP;
 		  }
 		  continue;
 		}
@@ -8426,7 +8459,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	    for (ujj = 0; ujj < gq_field_pos; ujj++) {
 	      gq_scan_ptr = (char*)memchr(gq_scan_ptr, ':', (uintptr_t)(bufptr2 - gq_scan_ptr));
 	      if (!gq_scan_ptr) {
-		goto vcf_to_bed_ret_MISSING_GQ;
+                goto vcf_to_bed_missing_gq_3;
 	      }
 	      gq_scan_ptr++;
 	    }
@@ -8434,6 +8467,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      continue;
 	    }
 	  }
+	vcf_to_bed_missing_gq_3:
 	  while (1) {
 	    ujj = ((unsigned char)(*(++bufptr))) - 48;
 	    if (ujj > 9) {
@@ -8448,7 +8482,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	    if (gp_field_pos) {
 	      if (vcf_gp_invalid(bufptr, bufptr2, vcf_min_gp, gp_field_pos, uii, &ukk)) {
 		if (ukk) {
-		  goto vcf_to_bed_ret_MISSING_GP;
+		  goto vcf_to_bed_ret_INVALID_GP;
 		}
 		continue;
 	      }
@@ -8484,7 +8518,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
 	      if (gp_field_pos) {
                 if (vcf_gp_diploid_invalid(bufptr, bufptr2, vcf_min_gp, gp_field_pos, uii, ujj, &ukk)) {
 		  if (ukk) {
-		    goto vcf_to_bed_ret_MISSING_GP;
+		    goto vcf_to_bed_ret_INVALID_GP;
 		  }
 		  continue;
 		}
@@ -8679,13 +8713,9 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
     }
     retval = RET_INVALID_FORMAT;
     break;
-  vcf_to_bed_ret_MISSING_GQ:
-    LOGPRINTF("\nError: Line %" PRIuPTR " of .vcf file has a missing GQ field.\n", line_idx);
-    retval = RET_INVALID_FORMAT;
-    break;
-  vcf_to_bed_ret_MISSING_GP:
+  vcf_to_bed_ret_INVALID_GP:
     logprint("\n");
-    LOGPRINTFWW("Error: Line %" PRIuPTR " of .vcf file has a missing or improperly formatted GP field.\n", line_idx);
+    LOGPRINTF("Error: Line %" PRIuPTR " of .vcf file has an improperly formatted GP field.\n", line_idx);
     retval = RET_INVALID_FORMAT;
     break;
   vcf_to_bed_ret_INVALID_GT:
@@ -11541,6 +11571,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   uint32_t vcf_two_ids = vcf_not_fid && vcf_not_iid;
   uint32_t recode_012 = recode_modifier & (RECODE_01 | RECODE_12);
   uint32_t set_hh_missing = (misc_flags / MISC_SET_HH_MISSING) & 1;
+  uint32_t real_ref_alleles = (misc_flags / MISC_REAL_REF_ALLELES) & 1;
   uint32_t xmhh_exists_orig = hh_exists & XMHH_EXISTS;
   uintptr_t header_len = 0;
   uintptr_t max_chrom_size = 0;
@@ -12264,42 +12295,52 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       goto recode_ret_OPEN_FAIL;
     }
     if (fputs_checked(
-"##fileformat=VCFv4.1\n"
-"##filedate=", outfile)) {
+"##fileformat=VCFv4.2\n"
+"##fileDate=", outfile)) {
       goto recode_ret_WRITE_FAIL;
     }
     time(&rawtime);
     loctime = localtime(&rawtime);
     strftime(tbuf, MAXLINELEN, "%Y%m%d", loctime);
     fputs(tbuf, outfile);
-    fputs(
-"\n##source=" PROG_NAME_CAPS "\n"
-"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
-, outfile);
+    fputs("\n##source=PLINKv1.90\n", outfile);
     uii = 0; // '0' written already?
     memcpy(tbuf, "##contig=<ID=", 13);
     for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
       chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+      if (!IS_SET(chrom_info_ptr->chrom_mask, chrom_idx)) {
+	continue;
+      }
       cptr = chrom_name_write(&(tbuf[13]), chrom_info_ptr, chrom_idx);
       if ((tbuf[13] == '0') && (cptr == &(tbuf[14]))) {
 	if (uii) {
 	  continue;
 	}
 	uii = 1;
-	cptr = memcpya(cptr, ",length=536870911", 17);
+	cptr = memcpya(cptr, ",length=2147483645", 18);
       } else {
+	*cptr = '\0';
+	if (strchr(&(tbuf[13]), ':')) {
+	  logprint("Error: VCF chromosome codes may not include the ':' character.\n");
+	  goto recode_ret_INVALID_FORMAT;
+	}
         cptr = memcpya(cptr, ",length=", 8);
 	if (!(map_is_unsorted & UNSORTED_BP)) {
 	  cptr = uint32_write(cptr, marker_pos[chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - 1] + 1);
 	} else {
-	  cptr = memcpya(cptr, "536870911", 9); // unknown
+	  cptr = memcpya(cptr, "2147483645", 10); // unknown
 	}
       }
       cptr = memcpya(cptr, ">\n", 2);
       fwrite(tbuf, 1, cptr - tbuf, outfile);
     }
+    if (!real_ref_alleles) {
+      fputs("##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\"\n", outfile);
+    }
+    fputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", outfile);
+    // todo: include PEDIGREE in header, and make --vcf be able to read it?
+    // Can't find a specification for how this should be done...
     fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", outfile);
-
     chrom_fo_idx = 0;
     refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
@@ -12371,23 +12412,35 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  fputs(cptr, outfile);
 	}
 	putc('\t', outfile);
+
+	if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+	  goto recode_ret_READ_FAIL;
+	}
+	if (is_haploid && set_hh_missing) {
+	  haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+	}
+
 	cptr = mk_allele_ptrs[2 * marker_uidx];
 	if (cptr != missing_geno_ptr) {
           if ((!invalid_allele_code_seen) && (!valid_vcf_allele_code(cptr))) {
 	    invalid_allele_code_seen = 1;
 	  }
-	  fputs(cptr, outfile);
+	  // if ALT allele is not actually present in immediate dataset, VCF
+	  // spec actually requires '.'
+	  if (!is_monomorphic_a2(loadbuf_collapsed, sample_ct)) {
+	    fputs(cptr, outfile);
+	  } else {
+	    putc('.', outfile);
+	  }
 	} else {
 	  putc('.', outfile);
 	}
-	fputs("\t.\t.\t.\tGT", outfile);
-
-	if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
-	  goto recode_ret_READ_FAIL;
-	}
-	if (is_haploid && set_hh_missing) {
-	  haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+	if (!real_ref_alleles) {
+	  fputs("\t.\t.\tPR\tGT", outfile);
+	} else {
+	  fputs("\t.\t.\t.\tGT", outfile);
 	}
+
 	wbufptr = writebuf;
 	ulptr = loadbuf_collapsed;
 	ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
diff --git a/plink_glm.c b/plink_glm.c
index 4591d91..7d93bdb 100644
--- a/plink_glm.c
+++ b/plink_glm.c
@@ -591,7 +591,6 @@ int32_t glm_check_vif(double vif_thresh, uintptr_t param_ct, uintptr_t sample_va
   uintptr_t sample_idx;
   double dxx;
   double dyy;
-  int32_t ii;
   for (param_idx = 1; param_idx < param_ct; param_idx++) {
     dyy = 0; // sum
     dptr = &(covars_collapsed[param_idx * sample_valid_ct]);
@@ -603,7 +602,7 @@ int32_t glm_check_vif(double vif_thresh, uintptr_t param_ct, uintptr_t sample_va
   }
   for (param_idx = 1; param_idx < param_ct; param_idx++) {
     dptr = &(param_2d_buf[(param_idx - 1) * param_ct]);
-    dyy = param_2d_buf2[param_idx];
+    dyy = param_2d_buf2[param_idx] * sample_ct_d;
     for (param_idx2 = param_idx; param_idx2 < param_ct; param_idx2++) {
       dxx = 0;
       dptr2 = &(covars_collapsed[param_idx * sample_valid_ct]);
@@ -611,7 +610,7 @@ int32_t glm_check_vif(double vif_thresh, uintptr_t param_ct, uintptr_t sample_va
       for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
 	dxx += (*dptr2++) * (*dptr3++);
       }
-      dxx -= dyy * param_2d_buf2[param_idx2] * sample_ct_d;
+      dxx -= dyy * param_2d_buf2[param_idx2];
       *dptr++ = dxx * sample_ct_m1_recip;
     }
   }
@@ -640,9 +639,8 @@ int32_t glm_check_vif(double vif_thresh, uintptr_t param_ct, uintptr_t sample_va
     param_2d_buf[param_idx * param_ct] = 1;
   }
   
-  ii = invert_matrix(dim, param_2d_buf, mi_buf, param_2d_buf2);
-  if (ii) {
-    return ii;
+  if (invert_matrix(dim, param_2d_buf, mi_buf, param_2d_buf2)) {
+    return 1;
   }
   for (param_idx = 0; param_idx < param_ct_m1; param_idx++) {
     if (param_2d_buf[param_idx * param_ct] > vif_thresh) {
@@ -737,7 +735,7 @@ uint32_t glm_linear(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t samp
     dptr2 = covars_sample_major;
     dxx = 0;
     if (!missing_ct) {
-      for (sample_idx = 0; sample_idx < sample_valid_ct + missing_ct; sample_idx++) {
+      for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
 	partial = 0;
 	dptr3 = dptr;
 	for (param_idx = 0; param_idx < param_ct; param_idx++) {
@@ -1161,9 +1159,14 @@ static inline __m128 fmath_exp_ps(__m128 xx) {
   u4 = _mm_srli_epi32(u4, 10);
   u4 = _mm_slli_epi32(u4, 23);
   uint32_t v0 = _mm_cvtsi128_si32(v4);
-  uint32_t v1 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(2)));
-  uint32_t v2 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(4)));
-  uint32_t v3 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(6)));
+  // uint32_t v1 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(2)));
+  // uint32_t v2 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(4)));
+  // uint32_t v3 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(6)));
+  // make this work with LLVM
+  uint32_t v1 = _mm_extract_epi16(((__m128i)(v4)), ((int32_t)(2)));
+  uint32_t v2 = _mm_extract_epi16(((__m128i)(v4)), ((int32_t)(4)));
+  uint32_t v3 = _mm_extract_epi16(((__m128i)(v4)), ((int32_t)(6)));
+
   __m128 t0 = _mm_set_ss(float_exp_lookup[v0]);
   __m128 t1 = _mm_set_ss(float_exp_lookup[v1]);
   __m128 t2 = _mm_set_ss(float_exp_lookup[v2]);
@@ -1836,9 +1839,6 @@ uint32_t logistic_regression(uint32_t sample_ct, uint32_t param_ct, float* vv, f
   }
 }
 
-// debug
-static uint32_t g_marker_uidx2;
-
 uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sample_valid_ct, uint32_t missing_ct, uintptr_t* loadbuf, float* covars_cov_major, uintptr_t* perm_vecs, float* coef, float* pp, float* sample_1d_buf, float* pheno_buf, float* param_1d_buf, float* param_1d_buf2, float* param_2d_buf, float* param_2d_buf2, float* logistic_results, uintptr_t constraint_ct, double* constraints_con_major, double* param_1d_dbuf, double* param_2d_dbuf, double* param_2d_dbuf2, double*  [...]
   // Similar to logistic.cpp fitLM(), but incorporates changes from the
   // postprocessed TopCoder contest code.
@@ -5015,6 +5015,9 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     goto glm_linear_assoc_ret_NOMEM;
   }
   g_linear_mt = (Linear_multithread*)wkspace_alloc(max_thread_ct * sizeof(Linear_multithread));
+  if (!g_linear_mt) {
+    goto glm_linear_assoc_ret_NOMEM;
+  }
   ulii = (orig_perm_batch_size + (BITCT - 1)) / BITCT;
   for (tidx = 0; tidx < max_thread_ct; tidx++) {
     if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].cur_covars_cov_major), param_ct_max * sample_valid_ct * sizeof(double)) ||
@@ -5032,13 +5035,13 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     }
     if (constraint_ct_max) {
       if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].df_df_buf), constraint_ct_max * constraint_ct_max * sizeof(double)) ||
-	  wkspace_alloc_d_checked(&(g_linear_mt[tidx].df_buf), constraint_ct_max * sizeof(double))) {
+	  wkspace_alloc_d_checked(&(g_linear_mt[tidx].df_buf), constraint_ct_max * sizeof(double)) ||
+          wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_df_buf), constraint_ct_max * param_ct_max * sizeof(double)) ||
+	  wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_df_buf2), constraint_ct_max * param_ct_max * sizeof(double))) {
 	goto glm_linear_assoc_ret_NOMEM;
       }
     }
-    if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_df_buf), constraint_ct_max * param_ct_max * sizeof(double)) ||
-	wkspace_alloc_d_checked(&(g_linear_mt[tidx].param_df_buf2), constraint_ct_max * param_ct_max * sizeof(double)) ||
-	wkspace_alloc_d_checked(&(g_linear_mt[tidx].dgels_a), param_ct_max * sample_valid_ct * sizeof(double)) ||
+    if (wkspace_alloc_d_checked(&(g_linear_mt[tidx].dgels_a), param_ct_max * sample_valid_ct * sizeof(double)) ||
 	wkspace_alloc_d_checked(&(g_linear_mt[tidx].dgels_b), orig_perm_batch_size * sample_valid_ct * sizeof(double))) {
       goto glm_linear_assoc_ret_NOMEM;
     }
@@ -6519,6 +6522,9 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     orig_perm_batch_size = 1;
   }
   g_logistic_mt = (Logistic_multithread*)wkspace_alloc(max_thread_ct * sizeof(Logistic_multithread));
+  if (!g_logistic_mt) {
+    goto glm_logistic_assoc_ret_NOMEM;
+  }
   ulii = (orig_perm_batch_size + (BITCT - 1)) / BITCT;
   for (tidx = 0; tidx < max_thread_ct; tidx++) {
     // covars_cov_major, param_2d_buf, param_2d_buf2 matrices must have 16-byte
@@ -6788,7 +6794,6 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 	if (cur_sample_valid_ct > cur_param_ct) {
 	  // todo: try better starting position
 	  fill_float_zero(g_logistic_mt[0].coef, (cur_param_ct + 3) & (~3));
-	  g_marker_uidx2 = marker_uidx2;
 	  regression_fail = glm_logistic(1, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, loadbuf_ptr, g_logistic_mt[0].cur_covars_cov_major, pheno_c_collapsed, g_logistic_mt[0].coef, g_logistic_mt[0].pp, g_logistic_mt[0].sample_1d_buf, g_logistic_mt[0].pheno_buf, g_logistic_mt[0].param_1d_buf, g_logistic_mt[0].param_1d_buf2, g_logistic_mt[0].param_2d_buf, g_logistic_mt[0].param_2d_buf2, g_logistic_mt[0].regression_results, cur_constraint_ct, constraints_con_major, g_logistic_mt[0].param_1 [...]
 	} else {
 	  regression_fail = 1;
@@ -7524,10 +7529,8 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (constraint_ct) {
     if (wkspace_alloc_d_checked(&constraints_con_major, constraint_ct * param_ct * sizeof(double)) ||
         wkspace_alloc_d_checked(&df_df_buf, constraint_ct * constraint_ct * sizeof(double)) ||
-        wkspace_alloc_d_checked(&df_buf, constraint_ct * sizeof(double))) {
-      goto glm_linear_nosnp_ret_NOMEM;
-    }
-    if (wkspace_alloc_d_checked(&param_df_buf, constraint_ct * param_ct * sizeof(double)) ||
+        wkspace_alloc_d_checked(&df_buf, constraint_ct * sizeof(double)) ||
+        wkspace_alloc_d_checked(&param_df_buf, constraint_ct * param_ct * sizeof(double)) ||
 	wkspace_alloc_d_checked(&param_df_buf2, constraint_ct * param_ct * sizeof(double))) {
       goto glm_linear_nosnp_ret_NOMEM;
     }
@@ -9013,6 +9016,7 @@ uint32_t glm_logistic_dosage(uintptr_t sample_ct, uintptr_t* cur_samples, uintpt
   dyy = sqrt((double)regression_results[0]);
   *beta_ptr = dxx;
   *se_ptr = dyy;
-  *pval_ptr = calc_tprob(dxx / dyy, sample_valid_ct - param_ct);
+  dxx /= dyy;
+  *pval_ptr = chiprob_p(dxx * dxx, 1);
   return 1;
 }
diff --git a/plink_glm.h b/plink_glm.h
index da359ac..d312046 100644
--- a/plink_glm.h
+++ b/plink_glm.h
@@ -4,6 +4,12 @@
 #include "plink_matrix.h"
 #include "plink_set.h"
 
+// int32_t glm_check_vif(double vif_thresh, uintptr_t param_ct, uintptr_t sample_valid_ct, double* covars_collapsed, double* param_2d_buf, MATRIX_INVERT_BUF1_TYPE* mi_buf, double* param_2d_buf2);
+
+void solve_linear_system(const float* ll, const float* yy, float* xx, uint32_t dd);
+
+uint32_t logistic_regression(uint32_t sample_ct, uint32_t param_ct, float* vv, float* hh, float* grad, float* ll, float* dcoef, const float* xx, const float* yy, float* coef, float* pp);
+
 #ifndef NOLAPACK
 int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t glm_modifier, double glm_vif_thresh, uint32_t glm_xchr_model, uint32_t glm_mperm_val, Range_list* parameters_range_list_ptr, Range_list* tests_range_list_ptr, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct, char* marke [...]
 #endif
diff --git a/plink_help.c b/plink_help.c
index 8b6ff41..c384356 100644
--- a/plink_help.c
+++ b/plink_help.c
@@ -418,13 +418,14 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    * 'oxford' causes an Oxford-format .gen + .sample fileset to be generated.\n"
 "    * The 'structure' modifier causes a Structure-format file to be generated.\n"
 "    * 'transpose' creates a transposed text fileset (loadable with --tfile).\n"
-"    * 'vcf', 'vcf-fid', and 'vcf-iid' result in production of a VCFv4.1 file.\n"
+"    * 'vcf', 'vcf-fid', and 'vcf-iid' result in production of a VCFv4.2 file.\n"
 "      'vcf-fid' and 'vcf-iid' cause family IDs or within-family IDs\n"
 "      respectively to be used for the sample IDs in the last header row, while\n"
-"      'vcf' merges both IDs and puts an underscore between them.  The A2 allele\n"
-"      is saved as the reference; when it is important for reference alleles to\n"
-"      be correct, you'll usually also want to include --a2-allele in your\n"
-"      command.\n"
+"      'vcf' merges both IDs and puts an underscore between them.\n"
+"      The A2 allele is saved as the reference and normally flagged as not based\n"
+"      on a real reference genome ('PR' INFO field value).  When it is important\n"
+"      for reference alleles to be correct, you'll also want to include\n"
+"      --a2-allele and --real-ref-alleles in your command.\n"
 "    * The 'tab' modifier makes the output mostly tab-delimited instead of\n"
 "      mostly space-delimited.  'tabx' and 'spacex' force all tabs and all\n"
 "      spaces, respectively.\n\n"
@@ -483,7 +484,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    --list-23-indels writes the subset with 23andMe-style indel calls (D/I\n"
 "    allele codes).\n\n"
 	       );
-#ifndef STABLE_BUILD
     help_print("list-duplicate-vars", &help_ctrl, 1,
 "  --list-duplicate-vars <require-same-ref> <ids-only> <suppress-first>\n"
 "    --list-duplicate-vars writes a .dupvar file describing all groups of\n"
@@ -497,7 +497,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    * 'suppress-first' causes the first variant ID in each group to be omitted\n"
 "      from the report.\n\n"
 	       );
-#endif
     help_print("freq\tfreqx\tfrqx\tcounts", &help_ctrl, 1,
 "  --freq <counts>\n"
 "  --freqx\n"
@@ -635,7 +634,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    maximum likelihood solution identified by --r/--r2), along with HWE exact\n"
 "    test statistics.\n\n"
 	       );
-#ifndef STABLE_BUILD
     help_print("show-tags", &help_ctrl, 1,
 "  --show-tags [filename]\n"
 "  --show-tags all\n"
@@ -645,7 +643,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    * If 'all' mode is specified, for each variant, each *other* variant which\n"
 "      tags it is reported.\n\n"
 	       );
-#endif
     help_print("blocks\thap\thap-all\thap-assoc\thap-freq\thap-impute\thap-impute-verbose\thap-linear\thap-logistic\thap-max-phase\thap-min-phase-prob\thap-miss\thap-omnibus\thap-only\thap-phase\thap-phase-wide\thap-pp\thap-snps\thap-tdt\thap-window\tchap\twhap", &help_ctrl, 1,
 "  --blocks <no-pheno-req> <no-small-max-span>\n"
 "    Estimate haplotype blocks, via Haploview's interpretation of the block\n"
@@ -1029,8 +1026,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "      0..1 hom A2.\n"
 "    * 'Zout' causes the output file to be gzipped.\n"
 "    * Normally, an association analysis is performed.  'standard-beta' and\n"
-"      'sex' behave as they are supposed to with --linear/--logistic.  (Note\n"
-"      that --dosage + --sex did NOT work properly in PLINK 1.07.)\n"
+"      'sex' behave as they are supposed to with --linear/--logistic.\n"
 "    * There are three alternate modes which cause the association analysis to\n"
 "      be skipped.\n"
 "      * 'occur' requests a simple variant occurrence report.\n"
@@ -1152,7 +1148,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    * When --extract (without 'range') is present, only variants named in the\n"
 "      --extract file are considered.\n\n"
 	       );
-#ifndef STABLE_BUILD
     help_print("meta-analysis", &help_ctrl, 1,
 "  --meta-analysis [PLINK report filenames...]\n"
 "  --meta-analysis [PLINK report filenames...] + <logscale | qt>\n"
@@ -1181,7 +1176,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "      --extract file are considered.\n"
 "    * Unless 'no-map' is specified, chromosome filters are also respected.\n\n"
 	       );
-#endif
     help_print("fast-epistasis\tepistasis\tset-test\tset-by-all\tcase-only\tnop\tepistasis-summary-merge", &help_ctrl, 1,
 "  --fast-epistasis <boost | joint-effects | no-ueki> <case-only>\n"
 "                   <set-by-set | set-by-all> <nop>\n"
@@ -1339,12 +1333,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  --biallelic-only <strict> <list> : Skip VCF variants with 2+ alt. alleles.\n"
 "  --vcf-min-qual [val]             : Skip VCF variants with low/missing QUAL.\n"
 "  --vcf-filter {exception(s)...}   : Skip variants which have FILTER failures.\n"
-#ifndef STABLE_BUILD
 "  --vcf-min-gq [val]               : No-call a genotype when GQ is below the\n"
 "                                     given threshold.\n"
 "  --vcf-min-gp [val]               : No-call a genotype when 0-1 scaled GP is\n"
 "                                     below the given threshold.\n"
-#endif
 "  --vcf-half-call [m]  : Specify how '0/.' and similar VCF GT values should be\n"
 "                         handled.  The following three modes are supported:\n"
 "                         * 'error'/'e' (default) errors out and reports line #.\n"
@@ -1729,9 +1721,11 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  --flip-scan-window-kb [x] : Set --flip-scan max kb distance (default 1000).\n"
 "  --flip-scan-threshold [x] : Set --flip-scan min correlation (default 0.5).\n"
 	       );
-    help_print("keep-allele-order\tmake-bed\tmerge\tbmerge\tmerge-list", &help_ctrl, 0,
+    help_print("keep-allele-order\treal-ref-alleles\tmake-bed\tmerge\tbmerge\tmerge-list\trecode", &help_ctrl, 0,
 "  --keep-allele-order  : Keep the allele order defined in the .bim file,\n"
-"                         instead of forcing A2 to be the major allele.\n"
+"  --real-ref-alleles     instead of forcing A2 to be the major allele.\n"
+"                         --real-ref-alleles also removes 'PR' from the INFO\n"
+"                         values emitted by --recode vcf{-fid/-iid}.\n"
 	       );
     help_print("a1-allele\treference-allele\tupdate-ref-allele\ta2-allele", &help_ctrl, 0,
 "  --a1-allele [f] {a1col} {IDcol} {skip} : Force alleles in the file to A1.\n"
@@ -1789,7 +1783,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  --ld-snps [vID...] : Restrict first --r/--r2 variant to the given ranges.\n"
 "  --ld-snp-list [f]  : Restrict first --r/--r2 var. to those named in the file.\n"
 	       );
-#ifndef STABLE_BUILD
     help_print("list-all\ttag-kb\ttag-r2\ttag-mode2\tshow-tags", &help_ctrl, 0,
 "  --list-all         : Generate the 'all' mode report when using --show-tags in\n"
 "                       file mode.\n"
@@ -1797,7 +1790,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  --tag-r2 [val]     : Set --show-tags min tag r-squared (default 0.8)\n"
 "  --tag-mode2        : Use two-column --show-tags (file mode) I/O format.\n"
 	       );
-#endif
     help_print("indep\tindep-pairwise\tr\tr2\tflip-scan\tflipscan\tshow-tags\tld-xchr", &help_ctrl, 0,
 "  --ld-xchr [code]   : Set Xchr model for --indep{-pairwise}, --r/--r2,\n"
 "                       --flip-scan, and --show-tags.\n"
@@ -1996,7 +1988,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
     help_print("clump-best\tclump", &help_ctrl, 0,
 "  --clump-best              : Report best proxy for each --clump index var.\n"
 	       );
-#ifndef STABLE_BUILD
     help_print("meta-analysis-snp-field\tmeta-analysis-a1-field\tmeta-analysis-a2-field\tmeta-analysis-p-field\tmeta-analysis-ess-field\tmeta-analysis", &help_ctrl, 0,
 "  --meta-analysis-snp-field [n...] : Set --meta-analysis variant ID, A1/A2\n"
 "  --meta-analysis-a1-field [n...]    allele, p-value, and/or effective sample\n"
@@ -2010,7 +2001,6 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "                                     size should be\n"
 "                                       4 / (1/[# cases] + 1/[# controls]).\n"
 	       );
-#endif
     help_print("gene-list-border\tgene-report\tgene-subset\tgene-list\tgene-report-snp-field", &help_ctrl, 0,
 "  --gene-list-border [kbs]   : Extend --gene-report regions by given # of kbs.\n"
 "  --gene-subset [filename]   : Specify gene name subset for --gene-report.\n"
@@ -2070,11 +2060,9 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  --perm-batch-size [val] : Set number of permutations per batch for some\n"
 "                            permutation tests.\n"
 	       );
-#ifndef STABLE_BUILD
     help_print("output-min-p", &help_ctrl, 0,
 "  --output-min-p [p] : Specify minimum p-value to write to reports.\n"
 	       );
-#endif
     help_print("debug", &help_ctrl, 0,
 "  --debug            : Use slower, more crash-resistant logging method.\n"
 	       );
diff --git a/plink_ld.c b/plink_ld.c
index d16a0b4..2c22be4 100644
--- a/plink_ld.c
+++ b/plink_ld.c
@@ -2,6 +2,7 @@
 
 #include <stddef.h>
 #include "plink_assoc.h"
+#include "plink_glm.h"
 #include "plink_ld.h"
 #include "plink_stats.h"
 #include "pigz.h"
@@ -3929,7 +3930,7 @@ THREAD_RET_TYPE fast_epi_thread(void* arg) {
 	  fail_ct_fixed++;
 	  fail_ct2[block_idx2] += 1;
 	  if (alpha1sq == 0.0) {
-	    // special case: log NA
+	    // special case: log NA when '--epi1 1' specified
 	    all_chisq_write[block_idx2] = NAN;
 	  }
 	}
@@ -3964,6 +3965,814 @@ THREAD_RET_TYPE fast_epi_thread(void* arg) {
   }
 }
 
+// epistasis linear/logistic regression multithread globals
+
+static double* g_epi_pheno_d2;
+static double* g_epi_phenogeno1;
+static double* g_epi_phenogeno2;
+static uint32_t* g_epi_genosums1;
+static uint32_t* g_epi_genosums2;
+static double g_epi_pheno_sum;
+static double g_epi_pheno_ssq;
+static double g_epi_vif_thresh;
+
+static uint32_t g_epi_pheno_nm_ct;
+
+typedef struct epi_logistic_multithread_struct {
+  float* cur_covars_cov_major;
+  float* coef;
+  float* pp;
+  float* sample_1d_buf;
+  float* pheno_buf;
+  float* param_1d_buf;
+  float* param_1d_buf2;
+  float* param_2d_buf;
+  float* param_2d_buf2;
+} Epi_logistic_multithread;
+
+static Epi_logistic_multithread* g_epi_logistic_mt;
+static uintptr_t* g_epi_pheno_c;
+static float* g_epi_all_chisq_f;
+static float* g_epi_best_chisq_f1;
+static float* g_epi_best_chisq_f2;
+
+uint32_t matrix_invert_4x4symm(double* dmatrix) {
+  double buf[16];
+  double determinant;
+  // initially, dww = A_{22}A_{34} - A_{23}A_{24}
+  //            dxx = A_{23}A_{34} - A_{24}A_{33}
+  //            dyy = A_{23}A_{44} - A_{24}A_{34}
+  //            dzz = A_{33}A_{44} - A_{34}A_{34}
+  double dww = dmatrix[5] * dmatrix[11] - dmatrix[6] * dmatrix[7];
+  double dxx = dmatrix[6] * dmatrix[11] - dmatrix[7] * dmatrix[10];
+  double dyy = dmatrix[6] * dmatrix[15] - dmatrix[7] * dmatrix[11];
+  double dzz = dmatrix[10] * dmatrix[15] - dmatrix[11] * dmatrix[11];
+  double dvv;
+  double duu;
+  buf[0] = dmatrix[5] * dzz
+         - dmatrix[6] * dyy
+         + dmatrix[7] * dxx;
+  buf[1] = dmatrix[2] * dyy
+         - dmatrix[1] * dzz
+         - dmatrix[3] * dxx;
+  buf[2] = dmatrix[1] * dyy
+         + dmatrix[2] * (dmatrix[7] * dmatrix[7] - dmatrix[5] * dmatrix[15])
+         + dmatrix[3] * dww;
+  duu = dmatrix[5] * dmatrix[10] - dmatrix[6] * dmatrix[6];
+  buf[3] = dmatrix[2] * dww
+         - dmatrix[1] * dxx
+         - dmatrix[3] * duu;
+  determinant = dmatrix[0] * buf[0] + dmatrix[1] * buf[1] + dmatrix[2] * buf[2] + dmatrix[3] * buf[3];
+  if (fabs(determinant) < EPSILON) {
+    return 1;
+  }
+  buf[5] = dmatrix[0] * dzz
+         + dmatrix[2] * (dmatrix[3] * dmatrix[11] - dmatrix[2] * dmatrix[15])
+         + dmatrix[3] * (dmatrix[2] * dmatrix[11] - dmatrix[3] * dmatrix[10]);
+  dzz = dmatrix[1] * dmatrix[15] - dmatrix[3] * dmatrix[7];
+  buf[6] = dmatrix[2] * dzz
+         - dmatrix[0] * dyy
+         + dmatrix[3] * (dmatrix[3] * dmatrix[6] - dmatrix[1] * dmatrix[11]);
+  dyy = dmatrix[1] * dmatrix[11] - dmatrix[2] * dmatrix[7];
+  dvv = dmatrix[1] * dmatrix[10] - dmatrix[2] * dmatrix[6];
+  buf[7] = dmatrix[0] * dxx
+         - dmatrix[2] * dyy
+         + dmatrix[3] * dvv;
+  buf[10] = dmatrix[0] * (dmatrix[5] * dmatrix[15] - dmatrix[7] * dmatrix[7])
+          - dmatrix[1] * dzz
+          + dmatrix[3] * (dmatrix[1] * dmatrix[7] - dmatrix[3] * dmatrix[5]);
+  dxx = dmatrix[1] * dmatrix[6] - dmatrix[2] * dmatrix[5];
+  buf[11] = dmatrix[1] * dyy
+          - dmatrix[0] * dww
+          - dmatrix[3] * dxx;
+  buf[15] = dmatrix[0] * duu
+          - dmatrix[1] * dvv
+          + dmatrix[2] * dxx;
+  determinant = 1.0 / determinant; // now reciprocal
+  dmatrix[0] = buf[0] * determinant;
+  dmatrix[1] = buf[1] * determinant;
+  dmatrix[2] = buf[2] * determinant;
+  dmatrix[3] = buf[3] * determinant;
+  dmatrix[4] = dmatrix[1];
+  dmatrix[5] = buf[5] * determinant;
+  dmatrix[6] = buf[6] * determinant;
+  dmatrix[7] = buf[7] * determinant;
+  dmatrix[8] = dmatrix[2];
+  dmatrix[9] = dmatrix[6];
+  dmatrix[10] = buf[10] * determinant;
+  dmatrix[11] = buf[11] * determinant;
+  dmatrix[12] = dmatrix[3];
+  dmatrix[13] = dmatrix[7];
+  dmatrix[14] = dmatrix[11];
+  dmatrix[15] = buf[15] * determinant;
+  return 0;
+}
+
+THREAD_RET_TYPE epi_linear_thread(void* arg) {
+  uintptr_t tidx = (uintptr_t)arg;
+  uintptr_t block_idx1_start = g_epi_idx1_block_bounds[tidx];
+  uintptr_t block_idx1_end = g_epi_idx1_block_bounds[tidx + 1];
+  uintptr_t idx1_block_start16 = g_epi_idx1_block_bounds16[tidx];
+  uintptr_t marker_idx1 = g_epi_marker_idx1 + block_idx1_start;
+  uintptr_t marker_ct = g_epi_marker_ct;
+  double alpha1sq = g_epi_alpha1sq[0];
+  double alpha2sq = g_epi_alpha2sq[0];
+  double pheno_sum = g_epi_pheno_sum;
+  double pheno_ssq = g_epi_pheno_ssq;
+  double vif_thresh = g_epi_vif_thresh;
+  uint32_t pheno_nm_ct = g_epi_pheno_nm_ct;
+  uint32_t best_id_fixed = 0;
+  uint32_t is_first_half = 0;
+  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t* geno1 = g_epi_geno1;
+  double* pheno_d2 = g_epi_pheno_d2;
+  uint32_t* geno1_offsets = g_epi_geno1_offsets;
+  uint32_t* best_id1 = &(g_epi_best_id1[idx1_block_start16]);
+  const double dconst[] = {1.0, 2.0, 2.0, 4.0};
+  double dmatrix_buf[16];
+  double dmatrix_buf2[4];
+
+  // sum(aa), sum(ab), sum(bb), sum(aab), sum(abb), and sum(aabb) can all be
+  // derived from these four quantities.
+  uint32_t cur_minor_cts[4]; // 11, 12, 21, 22
+
+  uintptr_t* cur_geno1;
+  uintptr_t* geno2;
+  uintptr_t* cur_geno2;
+  double* phenogeno1;
+  double* phenogeno2;
+  double* all_chisq_write;
+  double* chisq2_ptr;
+  double* all_chisq;
+  double* best_chisq1;
+  double* best_chisq2;
+  double* dptr;
+  double* dptr2;
+  uint32_t* n_sig_ct1;
+  uint32_t* fail_ct1;
+  uint32_t* best_id2;
+  uint32_t* n_sig_ct2;
+  uint32_t* fail_ct2;
+  uint32_t* genosums1;
+  uint32_t* genosums2;
+  uintptr_t idx2_block_size;
+  uintptr_t cur_idx2_block_size;
+  uintptr_t idx2_block_start;
+  uintptr_t idx2_block_end;
+  uintptr_t idx2_block_sizem16;
+  uintptr_t block_idx1;
+  uintptr_t block_delta1;
+  uintptr_t block_idx2;
+  uintptr_t cur_word1;
+  uintptr_t cur_word2;
+  uintptr_t active_mask;
+  uintptr_t param_idx;
+  uintptr_t param_idx2;
+  uintptr_t cur_sum_aab;
+  uintptr_t cur_sum_abb;
+  uintptr_t cur_sum_aabb;
+  uintptr_t ulii;
+  uintptr_t uljj;
+  double best_chisq_fixed;
+  double sum_a_pheno_base;
+  double cur_pheno_sum;
+  double cur_pheno_ssq;
+  double cur_sum_a_pheno;
+  double cur_sum_b_pheno;
+  double cur_sum_ab_pheno;
+  double sample_ctd;
+  double sample_ct_recip;
+  double sample_ct_m1_recip;
+  double cur_sum_ad;
+  double cur_sum_bd;
+  double cur_sum_abd;
+  double determinant;
+  double min_sigma;
+  double sigma;
+  double dxx;
+  double dyy;
+  double dzz;
+  double dww;
+  double dvv;
+  double duu;
+  double zsq;
+  uint32_t n_sig_ct_fixed;
+  uint32_t fail_ct_fixed;
+
+  uint32_t sum_a_base;
+  uint32_t sum_aa_base;
+  uint32_t cur_sum_a;
+  uint32_t cur_sum_aa;
+  uint32_t cur_sum_b;
+  uint32_t cur_sum_bb;
+  uint32_t cur_sum_ab;
+  uint32_t widx;
+  uint32_t sample_idx;
+  uint32_t cur_sample_ct;
+  uint32_t woffset;
+  while (1) {
+    idx2_block_size = g_epi_idx2_block_size;
+    cur_idx2_block_size = idx2_block_size;
+    idx2_block_start = g_epi_idx2_block_start;
+    idx2_block_end = idx2_block_start + idx2_block_size;
+    idx2_block_sizem16 = (idx2_block_size + 15) & (~(15 * ONELU));
+    geno2 = g_epi_geno2;
+    phenogeno1 = g_epi_phenogeno1;
+    phenogeno2 = g_epi_phenogeno2;
+    genosums1 = g_epi_genosums1;
+    genosums2 = g_epi_genosums2;
+    all_chisq = &(g_epi_all_chisq[2 * idx2_block_start]);
+    best_chisq1 = &(g_epi_best_chisq1[idx1_block_start16]);
+    best_chisq2 = &(g_epi_best_chisq2[tidx * idx2_block_sizem16]);
+    n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
+    fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
+    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
+    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16]);
+    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16]);
+    for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
+      ulii = geno1_offsets[2 * block_idx1];
+      if (ulii > idx2_block_start) {
+        block_idx2 = 0;
+        cur_idx2_block_size = ulii - idx2_block_start;
+	if (cur_idx2_block_size >= idx2_block_size) {
+          cur_idx2_block_size = idx2_block_size;
+	} else {
+	  is_first_half = 1;
+        }
+      } else {
+        ulii = geno1_offsets[2 * block_idx1 + 1];
+        if (ulii >= idx2_block_end) {
+          // may not be done in set1 x all or set1 x set2 cases
+          continue;
+	} else {
+          if (ulii <= idx2_block_start) {
+            block_idx2 = 0;
+	  } else {
+            block_idx2 = ulii - idx2_block_start;
+	  }
+	}
+      }
+      cur_geno1 = &(geno1[block_idx1 * pheno_nm_ctl2]);
+      n_sig_ct_fixed = 0;
+      fail_ct_fixed = 0;
+      block_delta1 = block_idx1 - block_idx1_start;
+      best_chisq_fixed = best_chisq1[block_delta1];
+      sum_a_pheno_base = phenogeno1[block_idx1];
+      sum_a_base = genosums1[2 * block_idx1];
+      sum_aa_base = genosums1[2 * block_idx1 + 1];
+
+      // [0] = chisq, [1] = beta
+      all_chisq_write = &(all_chisq[block_idx1 * marker_ct * 2]);
+
+    epi_linear_thread_second_half:
+      cur_geno2 = &(geno2[block_idx2 * pheno_nm_ctl2]);
+      chisq2_ptr = &(best_chisq2[block_idx2]);
+      for (; block_idx2 < cur_idx2_block_size; block_idx2++, chisq2_ptr++, cur_geno2 = &(cur_geno2[pheno_nm_ctl2])) {
+	// Our covariates are 1, genotype A (in {0, 1, 2}), genotype B, and
+	//   [genotype A] * [genotype B].
+	// The ordinary least squares solution to this system is
+	//   (X^T X)^{-1} X^T Y
+	// where X^T X is the following 4x4 matrix (where n = # of samples):
+	//   [ n       sum(A)   sum(B)   sum(AB)   ]
+	//   [ sum(A)  sum(AA)  sum(AB)  sum(AAB)  ]
+	//   [ sum(B)  sum(AB)  sum(BB)  sum(ABB)  ]
+	//   [ sum(AB) sum(AAB) sum(ABB) sum(AABB) ]
+        // (sum(.) denotes the sum of that (product of) genotypes, across all
+	// samples.)
+	// Meanwhile, X^T Y is the following 4x1 matrix:
+	//   [ sum(pheno)      ]
+	//   [ sum(A * pheno)  ]
+	//   [ sum(B * pheno)  ]
+	//   [ sum(AB * pheno) ]
+	// Crucially, the VIF and valid parameters checks can also operate
+	// purely on the terms above and sum(pheno * pheno).
+
+	// these nine values can be mostly precomputed; just need to subtract
+	// from them sometimes when missing values are present.
+	cur_pheno_sum = pheno_sum;
+	cur_pheno_ssq = pheno_ssq;
+	cur_sum_a_pheno = sum_a_pheno_base;
+	cur_sum_b_pheno = phenogeno2[block_idx2];
+	cur_sum_a = sum_a_base;
+	cur_sum_aa = sum_aa_base;
+	cur_sum_b = genosums2[block_idx2 * 2];
+	cur_sum_bb = genosums2[block_idx2 * 2 + 1];
+	cur_sample_ct = pheno_nm_ct;
+
+	cur_sum_ab_pheno = 0.0;
+	fill_uint_zero(cur_minor_cts, 4);
+	for (widx = 0; widx < pheno_nm_ctl2; widx++) {
+	  sample_idx = widx * BITCT2;
+          cur_word1 = cur_geno1[widx];
+          cur_word2 = cur_geno2[widx];
+	  // we can entirely skip 5 common cases: 00/00, 00/01, 00/10, 01/00,
+	  // 10/00.
+	  active_mask = cur_word1 | cur_word2;
+	  active_mask = (active_mask & (active_mask >> 1) & FIVEMASK) | (cur_word1 & cur_word2);
+	  dptr = &(pheno_d2[sample_idx]);
+	  while (active_mask) {
+            woffset = CTZLU(active_mask) / 2;
+	    dxx = dptr[woffset];
+	    woffset *= 2;
+	    ulii = (cur_word1 >> woffset) & (3 * ONELU);
+            uljj = (cur_word2 >> woffset) & (3 * ONELU);
+	    active_mask &= ~((3 * ONELU) << woffset);
+	    if (ulii && uljj) {
+	      if (ulii == 3) {
+		if (uljj == 1) {
+		  cur_sum_b_pheno -= dxx;
+		  cur_sum_b--;
+		  cur_sum_bb--;
+		} else if (uljj == 2) {
+		  cur_sum_b_pheno -= 2 * dxx;
+		  cur_sum_b -= 2;
+		  cur_sum_bb -= 4;
+		}
+	      } else if (uljj == 3) {
+		// ulii must be 1 or 2
+		cur_sum_a_pheno -= dxx;
+		if (ulii == 2) {
+		  cur_sum_a_pheno -= dxx;
+		}
+		cur_sum_a -= ulii;
+		cur_sum_aa -= ulii * ulii;
+	      } else {
+		ulii = ulii * 2 + uljj - 3;
+		cur_sum_ab_pheno += dconst[ulii] * dxx;
+		cur_minor_cts[ulii] += 1;
+		continue;
+	      }
+	    }
+	    cur_pheno_sum -= dxx;
+	    cur_pheno_ssq -= dxx * dxx;
+	    cur_sample_ct--;
+	  }
+	}
+	if (cur_sample_ct <= 4) {
+          goto epi_linear_thread_regression_fail;
+	}
+	// VIF check.  Mirrors glm_check_vif(), but param_ct is hardcoded to 4
+	// and we avoid additional iteration over the sample_idxs.
+	sample_ctd = (double)((int32_t)cur_sample_ct);
+	sample_ct_recip = 1.0 / sample_ctd;
+	sample_ct_m1_recip = 1.0 / ((double)((int32_t)(cur_sample_ct - 1)));
+	cur_sum_ab = cur_minor_cts[0] + 2 * (cur_minor_cts[1] + cur_minor_cts[2]) + 4 * cur_minor_cts[3];
+	cur_sum_aab = cur_minor_cts[0] + 2 * cur_minor_cts[1] + 4 * cur_minor_cts[2] + (8 * ONELU) * cur_minor_cts[3];
+	cur_sum_abb = cur_minor_cts[0] + 4 * cur_minor_cts[1] + 2 * cur_minor_cts[2] + (8 * ONELU) * cur_minor_cts[3];
+	cur_sum_aabb = cur_minor_cts[0] + 4 * (cur_minor_cts[1] + cur_minor_cts[2]) + (16 * ONELU) * cur_minor_cts[3];
+
+	cur_sum_ad = (double)((int32_t)cur_sum_a);
+	cur_sum_bd = (double)((int32_t)cur_sum_b);
+	cur_sum_abd = (double)((int32_t)cur_sum_ab);
+
+	// some genotype means
+	dxx = cur_sum_bd * sample_ct_recip;
+	dyy = cur_sum_abd * sample_ct_recip;
+
+	dww = ((double)((int32_t)cur_sum_aa)) - cur_sum_ad * cur_sum_ad * sample_ct_recip;
+	dvv = ((double)((int32_t)cur_sum_bb)) - cur_sum_bd * dxx;
+	duu = ((double)((intptr_t)cur_sum_aabb)) - cur_sum_abd * dyy;
+	if ((dww <= 0) || (dvv <= 0) || (duu <= 0)) {
+	  goto epi_linear_thread_regression_fail;
+	}
+	dww = 1.0 / sqrt(dww * sample_ct_m1_recip);
+	dvv = 1.0 / sqrt(dvv * sample_ct_m1_recip);
+	duu = 1.0 / sqrt(duu * sample_ct_m1_recip);
+
+	dxx = (cur_sum_abd - cur_sum_ad * dxx) * sample_ct_m1_recip;
+	dzz = (((double)((intptr_t)cur_sum_abb)) - cur_sum_bd * dyy) * sample_ct_m1_recip;
+	dyy = (((double)((intptr_t)cur_sum_aab)) - cur_sum_ad * dyy) * sample_ct_m1_recip;
+	// now dxx = A_{12}, dyy = A_{13}, dzz = A_{23}
+
+	dxx *= dww * dvv;
+	dyy *= dww * duu;
+	dzz *= dvv * duu;
+	if ((dxx > 0.999) || (dyy > 0.999) || (dzz > 0.999)) {
+	  goto epi_linear_thread_regression_fail;
+	}
+	// Use analytic formula for 3x3 symmetric matrix inverse.
+	// det A = A_{11}A_{22}A_{33} + 2 * A_{12}A_{13}A_{23}
+	//       - A_{11}(A_{23}^2) - A_{22}(A_{13}^2) - A_{33}(A_{12}^2)
+	// upper left of inverse = (A_{22}A_{33} - (A_{23}^2))(det A)^{-1}
+        //                middle = (A_{11}A_{33} - (A_{13}^2))(det A)^{-1}
+	//           lower right = (A_{11}A_{22} - (A_{12}^2))(det A)^{-1}
+	dww = dxx * dxx;
+	dvv = dyy * dyy;
+	duu = dzz * dzz;
+	determinant = 1 + 2 * dxx * dyy * dzz - dww - dvv - duu;
+	if (fabs(determinant) < EPSILON) {
+	  goto epi_linear_thread_regression_fail;
+	}
+	// (1 - x^2)/det > vif_thresh
+	// if det > 0:
+	//   1 - x^2 > vif_thresh * det
+	//   1 - vif_thresh * det > x^2
+	// otherwise:
+	//   1 - x^2 < vif_thresh * det
+	//   1 - vif_thresh * det < x^2
+        dxx = 1 - vif_thresh * determinant; // now a threshold
+	if (((determinant > 0) && ((dxx > dww) || (dxx > dvv) || (dxx > duu))) || ((determinant < 0) && ((dxx < dww) || (dxx < dvv) || (dxx < duu)))) {
+	  goto epi_linear_thread_regression_fail;
+	}
+
+	// VIF check done, now perform linear regression
+	dmatrix_buf[0] = sample_ctd;
+	dmatrix_buf[1] = cur_sum_ad;
+        dmatrix_buf[2] = cur_sum_bd;
+	dmatrix_buf[3] = cur_sum_abd;
+	dmatrix_buf[5] = (double)((int32_t)cur_sum_aa);
+	dmatrix_buf[6] = cur_sum_abd;
+	dmatrix_buf[7] = (double)((intptr_t)cur_sum_aab);
+	dmatrix_buf[10] = (double)((int32_t)cur_sum_bb);
+	dmatrix_buf[11] = (double)((intptr_t)cur_sum_abb);
+	dmatrix_buf[15] = (double)((intptr_t)cur_sum_aabb);
+	if (matrix_invert_4x4symm(dmatrix_buf)) {
+	  goto epi_linear_thread_regression_fail;
+	}
+
+	for (param_idx = 0; param_idx < 4; param_idx++) {
+	  dmatrix_buf2[param_idx] = sqrt(dmatrix_buf[param_idx * 5]);
+	}
+        for (param_idx = 1; param_idx < 4; param_idx++) {
+          dxx = 0.99999 * dmatrix_buf2[param_idx];
+          dptr = &(dmatrix_buf[param_idx * 4]);
+          dptr2 = dmatrix_buf2;
+          for (param_idx2 = 0; param_idx2 < param_idx; param_idx2++) {
+            if ((*dptr++) > dxx * (*dptr2++)) {
+              goto epi_linear_thread_regression_fail;
+	    }
+	  }
+	}
+        min_sigma = MAXV(dmatrix_buf[5], dmatrix_buf[10]);
+	if (dmatrix_buf[15] > min_sigma) {
+          min_sigma = dmatrix_buf[15];
+	}
+	min_sigma = 1e-20 / min_sigma;
+
+	for (param_idx = 0; param_idx < 4; param_idx++) {
+	  dptr = &(dmatrix_buf[param_idx * 4]);
+	  dmatrix_buf2[param_idx] = cur_pheno_sum * dptr[0] + cur_sum_a_pheno * dptr[1] + cur_sum_b_pheno * dptr[2] + cur_sum_ab_pheno * dptr[3];
+	}
+	// dmatrix_buf2[0..3] now has linear regression result
+
+	// partial = coef[0] + A * coef[1] + B * coef[2] + AB * coef[3] - pheno
+	// sigma = \sum_{all samples} (partial * partial)
+	//       = \sum (coef[0]^2
+        //               + 2 * A * coef[0] * coef[1]
+	//               + 2 * B * coef[0] * coef[2]
+	//               + 2 * AB * coef[0] * coef[3]
+	//               - 2 * coef[0] * pheno
+	//               + AA * coef[1]^2
+	//               + 2 * AB * coef[1] * coef[2]
+	//               + 2 * AAB * coef[1] * coef[3]
+	//               - 2 * A * coef[1] * pheno
+	//               + BB * coef[2]^2
+	//               + 2 * ABB * coef[2] * coef[3]
+	//               - 2 * B * coef[2] * pheno
+	//               + AABB * coef[3]^2
+	//               - 2 * AB * coef[3] * pheno
+	//               + pheno * pheno
+	sigma = dmatrix_buf2[0] * dmatrix_buf2[0] * sample_ctd
+	      + dmatrix_buf2[1] * dmatrix_buf2[1] * ((double)((int32_t)cur_sum_aa))
+	      + dmatrix_buf2[2] * dmatrix_buf2[2] * ((double)((int32_t)cur_sum_bb))
+	      + dmatrix_buf2[3] * dmatrix_buf2[3] * ((double)((intptr_t)cur_sum_aabb))
+              + cur_pheno_ssq
+	      + 2 * (dmatrix_buf2[0] * (dmatrix_buf2[1] * cur_sum_ad
+                                      + dmatrix_buf2[2] * cur_sum_bd
+                                      + dmatrix_buf2[3] * cur_sum_abd
+				      - cur_pheno_sum)
+		   + dmatrix_buf2[1] * (dmatrix_buf2[2] * cur_sum_abd
+				      + dmatrix_buf2[3] * ((double)((intptr_t)cur_sum_aab))
+				      - cur_sum_a_pheno)
+		   + dmatrix_buf2[2] * (dmatrix_buf2[3] * ((double)((intptr_t)cur_sum_abb))
+				      - cur_sum_b_pheno)
+		   - dmatrix_buf2[3] * cur_sum_ab_pheno);
+	sigma /= (double)((int32_t)(cur_sample_ct - 4));
+        if (sigma < min_sigma) {
+          goto epi_linear_thread_regression_fail;
+	}
+
+	// dmatrix_buf2[3] = linear regression beta for AB term
+	// sqrt(dmatrix_buf[15] * sigma) = standard error for AB term
+	dxx = dmatrix_buf2[3];
+	zsq = (dxx * dxx) / (dmatrix_buf[15] * sigma);
+	if (zsq >= alpha1sq) {
+          all_chisq_write[2 * block_idx2] = zsq;
+          all_chisq_write[2 * block_idx2 + 1] = dxx;
+	}
+	if (zsq >= alpha2sq) {
+          n_sig_ct_fixed++;
+	  n_sig_ct2[block_idx2] += 1;
+	}
+	if (zsq > best_chisq_fixed) {
+          best_chisq_fixed = zsq;
+	  best_id_fixed = block_idx2 + idx2_block_start;
+	}
+        dxx = *chisq2_ptr;
+        if (zsq > dxx) {
+          *chisq2_ptr = zsq;
+	  best_id2[block_idx2] = marker_idx1;
+	}
+	while (0) {
+	epi_linear_thread_regression_fail:
+	  zsq = 0;
+	  fail_ct_fixed++;
+	  fail_ct2[block_idx2] += 1;
+	  if (alpha1sq == 0.0) {
+	    // special case: log NA when '--epi1 1' specified
+	    all_chisq_write[block_idx2 * 2] = NAN;
+	    all_chisq_write[block_idx2 * 2 + 1] = NAN;
+	  }
+	}
+      }
+      if (is_first_half) {
+        is_first_half = 0;
+	ulii = geno1_offsets[2 * block_idx1 + 1];
+        cur_idx2_block_size = idx2_block_size;
+        if (ulii < idx2_block_end) {
+	  // guaranteed to be larger than idx2_block_start, otherwise there
+	  // would have been no first half
+          block_idx2 = ulii - idx2_block_start;
+	  goto epi_linear_thread_second_half;
+	}
+      }
+      if (best_chisq_fixed > best_chisq1[block_delta1]) {
+        best_chisq1[block_delta1] = best_chisq_fixed;
+	best_id1[block_delta1] = best_id_fixed;
+      }
+      n_sig_ct1[block_delta1] = n_sig_ct_fixed;
+      if (fail_ct_fixed) {
+        fail_ct1[block_delta1] = fail_ct_fixed;
+      }
+    }
+    if ((!tidx) || g_is_last_thread_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+  }
+}
+
+THREAD_RET_TYPE epi_logistic_thread(void* arg) {
+  uintptr_t tidx = (uintptr_t)arg;
+  uintptr_t block_idx1_start = g_epi_idx1_block_bounds[tidx];
+  uintptr_t block_idx1_end = g_epi_idx1_block_bounds[tidx + 1];
+  uintptr_t idx1_block_start16 = g_epi_idx1_block_bounds16[tidx];
+  uintptr_t marker_idx1 = g_epi_marker_idx1 + block_idx1_start;
+  uintptr_t marker_ct = g_epi_marker_ct;
+  float alpha1sq = (float)g_epi_alpha1sq[0];
+  float alpha2sq = (float)g_epi_alpha2sq[0];
+  uint32_t pheno_nm_ct = g_epi_pheno_nm_ct;
+  uint32_t best_id_fixed = 0;
+  uint32_t is_first_half = 0;
+  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t* geno1 = g_epi_geno1;
+  uintptr_t* pheno_c = g_epi_pheno_c;
+  float* covars_cov_major = g_epi_logistic_mt[tidx].cur_covars_cov_major;
+  float* coef = g_epi_logistic_mt[tidx].coef;
+  float* pp = g_epi_logistic_mt[tidx].pp;
+  float* sample_1d_buf = g_epi_logistic_mt[tidx].sample_1d_buf;
+  float* pheno_buf = g_epi_logistic_mt[tidx].pheno_buf;
+  float* param_1d_buf = g_epi_logistic_mt[tidx].param_1d_buf;
+  float* param_1d_buf2 = g_epi_logistic_mt[tidx].param_1d_buf2;
+  float* param_2d_buf = g_epi_logistic_mt[tidx].param_2d_buf;
+  float* param_2d_buf2 = g_epi_logistic_mt[tidx].param_2d_buf2;
+  uint32_t* geno1_offsets = g_epi_geno1_offsets;
+  uint32_t* best_id1 = &(g_epi_best_id1[idx1_block_start16]);
+  uintptr_t* cur_geno1;
+  uintptr_t* geno2;
+  uintptr_t* cur_geno2;
+  float* all_chisq_write;
+  float* chisq2_ptr;
+  float* all_chisq;
+  float* best_chisq1;
+  float* best_chisq2;
+  float* fptr;
+  float* fptr2;
+  uint32_t* n_sig_ct1;
+  uint32_t* fail_ct1;
+  uint32_t* best_id2;
+  uint32_t* n_sig_ct2;
+  uint32_t* fail_ct2;
+  uintptr_t idx2_block_size;
+  uintptr_t cur_idx2_block_size;
+  uintptr_t idx2_block_start;
+  uintptr_t idx2_block_end;
+  uintptr_t idx2_block_sizem16;
+  uintptr_t block_idx1;
+  uintptr_t block_delta1;
+  uintptr_t block_idx2;
+  uintptr_t cur_word1;
+  uintptr_t cur_word2;
+  uintptr_t param_idx;
+  uintptr_t param_idx2;
+  uintptr_t cur_sample_cta4;
+  uintptr_t ulii;
+  uintptr_t uljj;
+  float best_chisq_fixed;
+  // todo
+  float fxx;
+  float fyy;
+  float zsq;
+  uint32_t n_sig_ct_fixed;
+  uint32_t fail_ct_fixed;
+  uint32_t widx;
+  uint32_t loop_end;
+  uint32_t sample_idx;
+  uint32_t cur_sample_ct;
+  while (1) {
+    idx2_block_size = g_epi_idx2_block_size;
+    cur_idx2_block_size = idx2_block_size;
+    idx2_block_start = g_epi_idx2_block_start;
+    idx2_block_end = idx2_block_start + idx2_block_size;
+    idx2_block_sizem16 = (idx2_block_size + 15) & (~(15 * ONELU));
+    geno2 = g_epi_geno2;
+    all_chisq = &(g_epi_all_chisq_f[2 * idx2_block_start]);
+    best_chisq1 = &(g_epi_best_chisq_f1[idx1_block_start16]);
+    best_chisq2 = &(g_epi_best_chisq_f2[tidx * idx2_block_sizem16]);
+    n_sig_ct1 = &(g_epi_n_sig_ct1[idx1_block_start16]);
+    fail_ct1 = &(g_epi_fail_ct1[idx1_block_start16]);
+    best_id2 = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
+    n_sig_ct2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16]);
+    fail_ct2 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16]);
+    for (block_idx1 = block_idx1_start; block_idx1 < block_idx1_end; block_idx1++, marker_idx1++) {
+      ulii = geno1_offsets[2 * block_idx1];
+      if (ulii > idx2_block_start) {
+        block_idx2 = 0;
+        cur_idx2_block_size = ulii - idx2_block_start;
+	if (cur_idx2_block_size >= idx2_block_size) {
+          cur_idx2_block_size = idx2_block_size;
+	} else {
+	  is_first_half = 1;
+        }
+      } else {
+        ulii = geno1_offsets[2 * block_idx1 + 1];
+        if (ulii >= idx2_block_end) {
+          // may not be done in set1 x all or set1 x set2 cases
+          continue;
+	} else {
+          if (ulii <= idx2_block_start) {
+            block_idx2 = 0;
+	  } else {
+            block_idx2 = ulii - idx2_block_start;
+	  }
+	}
+      }
+      cur_geno1 = &(geno1[block_idx1 * pheno_nm_ctl2]);
+      n_sig_ct_fixed = 0;
+      fail_ct_fixed = 0;
+      block_delta1 = block_idx1 - block_idx1_start;
+      best_chisq_fixed = best_chisq1[block_delta1];
+
+      // [0] = chisq, [1] = beta
+      all_chisq_write = &(all_chisq[block_idx1 * marker_ct * 2]);
+
+    epi_logistic_thread_second_half:
+      cur_geno2 = &(geno2[block_idx2 * pheno_nm_ctl2]);
+      chisq2_ptr = &(best_chisq2[block_idx2]);
+      for (; block_idx2 < cur_idx2_block_size; block_idx2++, chisq2_ptr++, cur_geno2 = &(cur_geno2[pheno_nm_ctl2])) {
+        fptr = covars_cov_major;
+	fptr2 = pheno_buf;
+	cur_sample_ct = pheno_nm_ct;
+	// this part is similar to glm_logistic().
+
+	// 1. determine number of samples with at least one missing genotype
+	for (widx = 0; widx < pheno_nm_ctl2; widx++) {
+	  cur_word1 = cur_geno1[widx];
+	  cur_word2 = cur_geno2[widx];
+	  cur_word1 = cur_word1 & (cur_word1 >> 1);
+	  cur_word2 = cur_word2 & (cur_word2 >> 1);
+	  cur_sample_ct -= popcount2_long((cur_word1 | cur_word2) & FIVEMASK);
+	}
+	if (cur_sample_ct <= 4) {
+	  goto epi_logistic_thread_regression_fail;
+	}
+	// 2. now populate covariate-major matrix with 16-byte-aligned,
+	//    trailing-entries-zeroed rows
+	cur_sample_cta4 = (cur_sample_ct + 3) & (~3);
+	for (widx = 0; widx < pheno_nm_ctl2; widx++) {
+	  sample_idx = widx * BITCT2;
+          cur_word1 = cur_geno1[widx];
+          cur_word2 = cur_geno2[widx];
+          loop_end = sample_idx + BITCT2;
+	  if (loop_end > pheno_nm_ct) {
+            loop_end = pheno_nm_ct;
+	  }
+          for (; sample_idx < loop_end; sample_idx++) {
+            ulii = cur_word1 & (3 * ONELU);
+            uljj = cur_word2 & (3 * ONELU);
+	    if ((ulii != 3) && (uljj != 3)) {
+              *fptr = 1.0;
+	      fxx = (float)((intptr_t)ulii);
+	      fyy = (float)((intptr_t)uljj);
+	      // maybe this is faster with continuous writes instead of
+	      // continuous reads?  can experiment later
+              fptr[cur_sample_cta4] = fxx;
+	      fptr[2 * cur_sample_cta4] = fyy;
+              fptr[3 * cur_sample_cta4] = fxx * fyy;
+	      fptr++;
+	      *fptr2++ = (float)((int32_t)is_set(pheno_c, sample_idx));
+	    }
+            cur_word1 >>= 2;
+            cur_word2 >>= 2;
+	  }
+	}
+	if (cur_sample_ct < cur_sample_cta4) {
+	  loop_end = cur_sample_cta4 - cur_sample_ct;
+	  fill_float_zero(fptr, loop_end);
+	  fill_float_zero(&(fptr[cur_sample_cta4]), loop_end);
+	  fill_float_zero(&(fptr[2 * cur_sample_cta4]), loop_end);
+	  fill_float_zero(&(fptr[3 * cur_sample_cta4]), loop_end);
+	  fill_float_zero(fptr2, loop_end);
+	}
+
+	fill_float_zero(coef, 4);
+	if (logistic_regression(cur_sample_ct, 4, sample_1d_buf, param_2d_buf, param_1d_buf, param_2d_buf2, param_1d_buf2, covars_cov_major, pheno_buf, coef, pp)) {
+          goto epi_logistic_thread_regression_fail;
+	}
+
+	// compute S
+	for (param_idx = 0; param_idx < 4; param_idx++) {
+          fill_float_zero(param_1d_buf, 4);
+          param_1d_buf[param_idx] = 1.0;
+	  solve_linear_system(param_2d_buf2, param_1d_buf, param_1d_buf2, 4);
+	  memcpy(&(param_2d_buf[param_idx * 4]), param_1d_buf2, 4 * sizeof(float));
+	}
+	for (param_idx = 1; param_idx < 4; param_idx++) {
+          fxx = param_2d_buf[param_idx * 5];
+	  if ((fxx < 1e-20) || (!realnum(fxx))) {
+	    goto epi_logistic_thread_regression_fail;
+	  }
+          param_2d_buf2[param_idx] = sqrtf(fxx);
+	}
+	param_2d_buf2[0] = sqrtf(param_2d_buf[0]);
+	for (param_idx = 1; param_idx < 4; param_idx++) {
+          fxx = 0.99999 * param_2d_buf2[param_idx];
+	  fptr = &(param_2d_buf[param_idx * 4]);
+	  fptr2 = param_2d_buf2;
+	  for (param_idx2 = 0; param_idx2 < param_idx; param_idx2++) {
+            if ((*fptr++) > fxx * (*fptr2++)) {
+	      goto epi_logistic_thread_regression_fail;
+	    }
+	  }
+	}
+
+	// coef[3] = logistic regression beta for AB term
+	// sqrt(param_2d_buf[15]) = standard error for AB term
+	zsq = coef[3] * coef[3] / param_2d_buf[15];
+	if (zsq >= alpha1sq) {
+          all_chisq_write[2 * block_idx2] = zsq;
+          all_chisq_write[2 * block_idx2 + 1] = coef[3];
+	}
+	if (zsq >= alpha2sq) {
+          n_sig_ct_fixed++;
+	  n_sig_ct2[block_idx2] += 1;
+	}
+	if (zsq > best_chisq_fixed) {
+          best_chisq_fixed = zsq;
+	  best_id_fixed = block_idx2 + idx2_block_start;
+	}
+        fxx = *chisq2_ptr;
+        if (zsq > fxx) {
+          *chisq2_ptr = zsq;
+	  best_id2[block_idx2] = marker_idx1;
+	}
+	while (0) {
+	epi_logistic_thread_regression_fail:
+	  zsq = 0;
+	  fail_ct_fixed++;
+	  fail_ct2[block_idx2] += 1;
+	  if (alpha1sq == 0.0) {
+	    // special case: log NA when '--epi1 1' specified
+	    all_chisq_write[block_idx2 * 2] = NAN;
+	    all_chisq_write[block_idx2 * 2 + 1] = NAN;
+	  }
+	}
+      }
+      if (is_first_half) {
+        is_first_half = 0;
+	ulii = geno1_offsets[2 * block_idx1 + 1];
+        cur_idx2_block_size = idx2_block_size;
+        if (ulii < idx2_block_end) {
+          block_idx2 = ulii - idx2_block_start;
+	  goto epi_logistic_thread_second_half;
+	}
+      }
+      if (best_chisq_fixed > best_chisq1[block_delta1]) {
+        best_chisq1[block_delta1] = best_chisq_fixed;
+	best_id1[block_delta1] = best_id_fixed;
+      }
+      n_sig_ct1[block_delta1] = n_sig_ct_fixed;
+      if (fail_ct_fixed) {
+        fail_ct1[block_delta1] = fail_ct_fixed;
+      }
+    }
+    if ((!tidx) || g_is_last_thread_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+  }
+}
+
 double calc_lnlike(double known11, double known12, double known21, double known22, double center_ct_d, double freq11, double freq12, double freq21, double freq22, double half_hethet_share, double freq11_incr) {
   double lnlike;
   freq11 += freq11_incr;
@@ -6246,7 +7055,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
     marker_ct = unfiltered_marker_ct - popcount_longs(marker_exclude, unfiltered_marker_ctl);
   }
   if (marker_ct < 2) {
-    logprint("Warning: Skipping --blocks since there are too few markers with MAF >= 0.05.\n");
+    logprint("Warning: Skipping --blocks since there are too few variants with MAF >= 0.05.\n");
     goto haploview_blocks_ret_1;
   }
   pct_thresh = marker_ct / 100;
@@ -6289,7 +7098,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
     }
 #ifndef __LP64__
     if (max_block_size > 65536) {
-      logprint("\nError: 32-bit --blocks cannot analyze potential blocks with more than 65536\nmarkers.  Use a 64-bit PLINK build or a smaller --blocks-window-kb value.\n");
+      logprint("\nError: 32-bit --blocks cannot analyze potential blocks with more than 65536\nvariants.  Use a 64-bit PLINK build or a smaller --blocks-window-kb value.\n");
       goto haploview_blocks_ret_INVALID_CMDLINE;
     }
 #endif
@@ -7334,49 +8143,1035 @@ int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_
   return retval;
 }
 
-int32_t epistasis_regression() {
-  logprint("Error: --epistasis has not been implemented yet.\n");
-  return RET_CALC_NOT_YET_SUPPORTED;
+void rotate_loadbuf_and_compute_phenogeno(uintptr_t* loadbuf, double* pheno_d2, uint32_t pheno_nm_ct, uintptr_t* loadbuf_write, double* phenogeno, uint32_t* genosums) {
+  double cur_phenogeno = 0;
+  uint32_t geno1_ct = 0;
+  uint32_t geno2_ct = 0;
+  uintptr_t cur_word;
+  uintptr_t ulii;
+  uintptr_t uljj;
+  double dxx;
+  uint32_t sample_idx;
+  uint32_t sample_idx_base;
+  uint32_t uii;
+  for (sample_idx = 0; sample_idx < pheno_nm_ct;) {
+    // we're interested in hom A1 (non-trailing 00) and het (10) bit
+    // values here.
+    cur_word = ~(*loadbuf++);
+    sample_idx_base = sample_idx;
+    sample_idx += BITCT2;
+    if (sample_idx > pheno_nm_ct) {
+      cur_word &= (ONELU << (2 * (pheno_nm_ct % BITCT2))) - ONELU;
+    }
+    // now hom A1 = 11 and het = 01.  Temporarily erase the 10s.
+    uljj = cur_word & FIVEMASK;
+    ulii = uljj | (cur_word & (uljj << 1));
+    while (ulii) {
+      uii = CTZLU(ulii);
+      dxx = pheno_d2[sample_idx_base + uii / 2];
+      if (!((ulii >> (uii + 1)) & 1)) {
+	// het
+	cur_phenogeno += dxx;
+	geno1_ct++;
+      } else {
+	// hom A1
+	cur_phenogeno += 2 * dxx;
+	geno2_ct++;
+      }
+      ulii &= ~((3 * ONELU) << uii);
+    }
+    // currently hom A1 = 11, missing = 10, het = 01, hom A2 = 00
+    // rotate to hom A1 = 10, missing = 11, het = 01, hom A2 = 00
+    // to allow inner loop to use ordinary multiplication
+    *loadbuf_write++ = cur_word ^ ((cur_word >> 1) & FIVEMASK);
+  }
+  *phenogeno = cur_phenogeno;
+  genosums[0] = geno1_ct + 2 * geno2_ct;
+  genosums[1] = geno1_ct + 4 * geno2_ct;
 }
 
-int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct2, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t ctrl_ct, uintptr_t* pheno_c, double* pheno_d, uint32_t parallel_idx, uint32_t pa [...]
-  unsigned char* wkspace_mark = wkspace_base;
+int32_t epistasis_linear_regression(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t marker_uidx_base, uintptr_t marker_ct1, uintptr_t* marker_exclude1, uintptr_t marker_idx1_start, uintptr_t marker_idx1_end, uintptr_t marker_ct2, uintptr_t* marker_exclude2, uint32_t is_triangular, uintptr_t job_si [...]
+  // We use QT --assoc's strategy for speeding up linear regression, since we
+  // do not need to support arbitrary covariates.  It's more complicated here
+  // because we have 3 covariates instead of one, but two of them are still
+  // restricted to the values {0, 1, 2} and the last is the product of the
+  // first two.  So we're able to use variations of the QT --assoc bit hacks.
   FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
-  uintptr_t unfiltered_sample_ctv2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
   uintptr_t final_mask = get_final_mask(pheno_nm_ct);
-  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
-  uintptr_t marker_uidx_base = next_unset_unsafe(marker_exclude, 0);
   uintptr_t marker_uidx = marker_uidx_base;
-  uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
-  uint32_t modifier = epi_ip->modifier;
-  uint32_t is_fast = modifier & EPI_FAST;
-  uint32_t is_boost = (modifier / EPI_FAST_BOOST) & 1;
-  uint32_t do_joint_effects = modifier & EPI_FAST_JOINT_EFFECTS;
-  uint32_t no_ueki = modifier & EPI_FAST_NO_UEKI;
-  uint32_t is_case_only = (modifier / EPI_FAST_CASE_ONLY) & 1;
-  uint32_t is_triangular = 1;
-  uint32_t is_custom_set1 = modifier & (EPI_SET_BY_SET | EPI_SET_BY_ALL)? 1 : 0;
-  uint32_t is_set_by_set = modifier & EPI_SET_BY_SET;
-  uint32_t tot_stride = 6 - 3 * is_case_only;
-  uint32_t no_p_value = modifier & EPI_FAST_NO_P_VALUE;
-  uint32_t case_only_gap = epi_ip->case_only_gap;
-  uint32_t is_case_only_window = (is_case_only && case_only_gap);
-  uint32_t case_ct = pheno_nm_ct - ctrl_ct;
-  uint32_t cellminx3 = 0;
-  uintptr_t case_ctl2 = (case_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t case_ctv2 = 2 * ((case_ct + (BITCT - 1)) / BITCT);
-  uintptr_t ctrl_ctl2 = (ctrl_ct + (BITCT2 - 1)) / BITCT2;
-  uintptr_t case_ctv3 = 2 * ((case_ct + (2 * BITCT - 1)) / (2 * BITCT));
-  uintptr_t ctrl_ctv3 = 2 * ((ctrl_ct + (2 * BITCT - 1)) / (2 * BITCT));
-  uintptr_t case_ctsplit = 3 * case_ctv3;
-  uintptr_t ctrl_ctsplit = 3 * ctrl_ctv3;
   uintptr_t pct = 1;
   uintptr_t marker_uidx2 = 0;
-  uintptr_t marker_uidx2_trail = 0;
+  uintptr_t marker_idx1 = marker_idx1_start;
   uintptr_t marker_idx2 = 0;
-  uintptr_t marker_idx2_trail = 0;
-  uint64_t tests_thrown_out = 0;
+  uint64_t pct_thresh = tests_expected / 100;
+  uint64_t tests_complete = 0;
+  uint32_t max_thread_ct = g_epi_thread_ct;
+  uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
+  uint32_t chrom_end = 0;
+  uint32_t chrom_idx = 0;
+  uint32_t chrom_idx2 = 0;
+  int32_t retval = 0;
+  unsigned char* wkspace_mark2;
+  char* wptr_start;
+  char* wptr_start2;
+  char* wptr;
+  double* pheno_d2;
+  double* dptr;
+  double* dptr2;
+  uint32_t* uiptr;
+  uint32_t* uiptr2;
+  uint32_t* uiptr3;
+  uint32_t* uiptr4;
+  uint32_t* uiptr5;
+  uintptr_t cur_workload;
+  uintptr_t idx1_block_size;
+  uintptr_t idx2_block_size;
+  uintptr_t idx2_block_sizem16;
+  uintptr_t marker_uidx_tmp;
+  uintptr_t block_idx1;
+  uintptr_t block_idx2;
+  uintptr_t cur_idx2_block_size;
+  uintptr_t chrom_end2;
+  uintptr_t tidx;
+  uintptr_t ulii;
+  uintptr_t uljj;
+  uintptr_t ulkk;
+  double dxx;
+  uint32_t chrom_fo_idx;
+  uint32_t chrom_fo_idx2;
+  uint32_t is_last_block;
+  uint32_t sample_uidx;
+  uint32_t sample_idx;
+  uint32_t uii;
+  uint32_t ujj;
+  if (wkspace_alloc_d_checked(&pheno_d2, pheno_nm_ct * sizeof(double))) {
+    goto epistasis_linear_regression_ret_NOMEM;
+  }
+  g_epi_pheno_d2 = pheno_d2;
+  g_epi_pheno_nm_ct = pheno_nm_ct;
+  dptr = pheno_d2;
+  g_epi_pheno_sum = 0;
+  g_epi_pheno_ssq = 0;
+  for (sample_uidx = 0, sample_idx = 0; sample_idx < pheno_nm_ct; sample_uidx++, sample_idx++) {
+    next_set_unsafe_ck(pheno_nm, &sample_uidx);
+    dxx = pheno_d[sample_uidx];
+    g_epi_pheno_sum += dxx;
+    g_epi_pheno_ssq += dxx * dxx;
+    pheno_d2[sample_idx] = dxx;
+  }
+  // could add an epsilon here, but this is good enough to catch the most
+  // common case (all phenotypes are the same integer near zero).
+  if (g_epi_pheno_ssq * ((double)((int32_t)pheno_nm_ct)) == g_epi_pheno_sum * g_epi_pheno_sum) {
+    logprint("Error: Phenotype is constant.\n");
+    goto epistasis_linear_regression_ret_INVALID_CMDLINE;
+  }
+  g_epi_vif_thresh = glm_vif_thresh;
+
+  // claim up to half of memory with idx1 bufs; each marker currently costs:
+  //   pheno_nm_ctl2 * sizeof(intptr_t) for geno buf
+  //   sizeof(double) for precomputed sum(phenotype * genotype) values
+  //   2 * sizeof(int32_t) for precomputed sum(genotype) and sum(genotype^2)
+  //     values
+  //   4 * sizeof(int32_t) + sizeof(double) + marker_ct2 * 2 * sizeof(double)
+  //     for other stuff (see epistasis_report() comment, starting from
+  //     "offset"; main result buffer must be double-size to store both beta
+  //     and chi-square stat)
+  ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 6 * sizeof(int32_t) + 2 * sizeof(double) + marker_ct2 * 2 * sizeof(double);
+  idx1_block_size = (wkspace_left - 6 * CACHELINE + 5 * sizeof(int32_t) + sizeof(double) - max_thread_ct * (5 * (CACHELINE - 4))) / (ulii * 2 + 1);
+  if (!idx1_block_size) {
+    goto epistasis_linear_regression_ret_NOMEM;
+  }
+  if (idx1_block_size > job_size) {
+    idx1_block_size = job_size;
+  }
+  // pad to avoid threads writing to same cacheline
+  ulii = (max_thread_ct - 1) * 15 + idx1_block_size;
+  g_epi_geno1_offsets = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
+  g_epi_geno1 = (uintptr_t*)wkspace_alloc(pheno_nm_ctl2 * idx1_block_size * sizeof(intptr_t));
+  g_epi_phenogeno1 = (double*)wkspace_alloc(idx1_block_size * sizeof(double));
+  // may be better to just recompute genosums values in inner loop?  can test
+  // this later
+  g_epi_genosums1 = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
+  g_epi_all_chisq = (double*)wkspace_alloc(idx1_block_size * marker_ct2 * 2 * sizeof(double));
+  g_epi_best_chisq1 = (double*)wkspace_alloc(ulii * sizeof(double));
+  g_epi_best_id1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  g_epi_n_sig_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  g_epi_fail_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
+    g_epi_geno1[block_idx1 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
+  }
+  if (is_triangular) {
+    fill_uint_zero(g_epi_geno1_offsets, 2 * idx1_block_size);
+  }
+
+  ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 2 * sizeof(int32_t) + sizeof(double) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
+  idx2_block_size = (wkspace_left - (3 * CACHELINE - sizeof(intptr_t) - 2 * sizeof(int32_t) - sizeof(double)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(double)))) / ulii;
+  if (idx2_block_size > marker_ct2) {
+    idx2_block_size = marker_ct2;
+  }
+  idx2_block_size = (idx2_block_size + 15) & (~(15 * ONELU));
+
+  memcpy(outname_end, ".epi.qt", 8);
+  if (parallel_tot > 1) {
+    outname_end[7] = '.';
+    uint32_writex(&(outname_end[8]), parallel_idx + 1, '\0');
+  }
+  if (fopen_checked(&outfile, outname, "w")) {
+    goto epistasis_linear_regression_ret_OPEN_FAIL;
+  }
+  if (!parallel_idx) {
+    wptr = memcpya(tbuf, "CHR1 ", 5);
+    wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
+    wptr = memcpya(wptr, " CHR2 ", 6);
+    wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
+    wptr = memcpya(wptr, "     BETA_INT         STAT            P \n", 41);
+    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      goto epistasis_linear_regression_ret_WRITE_FAIL;
+    }
+  }
+
+  wkspace_mark2 = wkspace_base;
+  while (1) {
+    if (!idx2_block_size) {
+      goto epistasis_linear_regression_ret_NOMEM;
+    }
+    if (!(wkspace_alloc_ul_checked(&g_epi_geno2, pheno_nm_ctl2 * idx2_block_size * sizeof(intptr_t)) ||
+          wkspace_alloc_d_checked(&g_epi_phenogeno2, idx2_block_size * sizeof(double)) ||
+          wkspace_alloc_ui_checked(&g_epi_genosums2, idx2_block_size * 2 * sizeof(int32_t)) ||
+          wkspace_alloc_d_checked(&g_epi_best_chisq2, max_thread_ct * idx2_block_size * sizeof(double)) ||
+          wkspace_alloc_ui_checked(&g_epi_best_id2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
+          wkspace_alloc_ui_checked(&g_epi_n_sig_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
+          wkspace_alloc_ui_checked(&g_epi_fail_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)))) {
+      break;
+    }
+    wkspace_reset(wkspace_mark2);
+    idx2_block_size -= 16;
+  }
+  for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
+    g_epi_geno2[block_idx2 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
+  }
+  marker_uidx = next_unset_ul_unsafe(marker_exclude1, marker_uidx_base);
+  if (marker_idx1) {
+    marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
+  }
+  wptr = memcpya(logbuf, "QT --epistasis to ", 18);
+  wptr = strcpya(wptr, outname);
+  memcpy(wptr, " ... ", 6);
+  wordwrap(logbuf, 16); // strlen("99% [processing]")
+  logprintb();
+  fputs("0%", stdout);
+  do {
+    fputs(" [processing]", stdout);
+    fflush(stdout);
+    if (idx1_block_size > marker_idx1_end - marker_idx1) {
+      idx1_block_size = marker_idx1_end - marker_idx1;
+      if (idx1_block_size < max_thread_ct) {
+        max_thread_ct = idx1_block_size;
+        g_epi_thread_ct = max_thread_ct;
+      }
+    }
+    g_epi_marker_idx1 = marker_idx1;
+    dptr = g_epi_all_chisq;
+    dptr2 = &(g_epi_all_chisq[idx1_block_size * marker_ct2 * 2]);
+    do {
+      *dptr = -1;
+      dptr = &(dptr[2]);
+    } while (dptr < dptr2);
+    marker_uidx_tmp = marker_uidx;
+    if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+      goto epistasis_linear_regression_ret_READ_FAIL;
+    }
+    cur_workload = idx1_block_size * marker_ct2;
+    if (is_triangular) {
+      for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
+        ulii = block_idx1 + marker_idx1 + 1;
+        cur_workload -= ulii;
+        g_epi_geno1_offsets[2 * block_idx1 + 1] = ulii;
+      }
+    } else {
+      fill_uint_zero(g_epi_geno1_offsets, 2 * idx1_block_size);
+      marker_uidx2 = marker_uidx_base;
+      marker_idx2 = 0;
+    }
+    tests_complete += cur_workload;
+    ulii = 0; // total number of tests
+    g_epi_idx1_block_bounds[0] = 0;
+    g_epi_idx1_block_bounds16[0] = 0;
+    block_idx1 = 0;
+    for (tidx = 1; tidx < max_thread_ct; tidx++) {
+      uljj = (((uint64_t)cur_workload) * tidx) / max_thread_ct;
+      if (is_triangular) {
+        do {
+          ulii += marker_ct2 - g_epi_geno1_offsets[2 * block_idx1 + 1];
+          block_idx1++;
+	} while (ulii < uljj);
+      } else {
+        do {
+	  ulii += marker_ct2;
+	  block_idx1++;
+	} while (ulii < uljj);
+      }
+      uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
+      g_epi_idx1_block_bounds[tidx] = block_idx1;
+      g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + ((uii + 15) & (~15));
+    }
+    g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
+    chrom_end = 0;
+    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx_tmp++, block_idx1++) {
+      if (IS_SET(marker_exclude1, marker_uidx_tmp)) {
+	marker_uidx_tmp = next_unset_ul_unsafe(marker_exclude1, marker_uidx_tmp);
+        if (fseeko(bedfile, bed_offset + (marker_uidx_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+          goto epistasis_linear_regression_ret_READ_FAIL;
+	}
+      }
+      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx_tmp))) {
+        goto epistasis_linear_regression_ret_READ_FAIL;
+      }
+      rotate_loadbuf_and_compute_phenogeno(loadbuf, pheno_d2, pheno_nm_ct, &(g_epi_geno1[block_idx1 * pheno_nm_ctl2]), &(g_epi_phenogeno1[block_idx1]), &(g_epi_genosums1[block_idx1 * 2]));
+      if (!is_triangular) {
+	if (!IS_SET(marker_exclude2, marker_uidx_tmp)) {
+          // do not compare against self
+          marker_idx2 += marker_uidx_tmp - marker_uidx2 - popcount_bit_idx(marker_exclude2, marker_uidx2, marker_uidx_tmp);
+          marker_uidx2 = marker_uidx_tmp;
+          g_epi_geno1_offsets[2 * block_idx1] = marker_idx2;
+          g_epi_geno1_offsets[2 * block_idx1 + 1] = marker_idx2 + 1;
+          gap_cts[block_idx1 + marker_idx1] = 1;
+	}
+      }
+    }
+    marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
+    if (is_triangular) {
+      marker_idx2 = marker_idx1 + 1;
+      marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2);
+    } else {
+      marker_idx2 = 0;
+    }
+    if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+      goto epistasis_linear_regression_ret_READ_FAIL;
+    }
+    cur_idx2_block_size = idx2_block_size;
+    do {
+      if (cur_idx2_block_size > marker_ct2 - marker_idx2) {
+        cur_idx2_block_size = marker_ct2 - marker_idx2;
+      }
+      for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
+        if (IS_SET(marker_exclude2, marker_uidx2)) {
+          marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx2);
+          if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+            goto epistasis_linear_regression_ret_READ_FAIL;
+	  }
+	}
+	if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, IS_SET(marker_reverse, marker_uidx2))) {
+	  goto epistasis_linear_regression_ret_READ_FAIL;
+	}
+        rotate_loadbuf_and_compute_phenogeno(loadbuf, pheno_d2, pheno_nm_ct, &(g_epi_geno2[block_idx2 * pheno_nm_ctl2]), &(g_epi_phenogeno2[block_idx2]), &(g_epi_genosums2[block_idx2 * 2]));
+      }
+      g_epi_idx2_block_size = cur_idx2_block_size;
+      g_epi_idx2_block_start = marker_idx2;
+      idx2_block_sizem16 = (cur_idx2_block_size + 15) & (~(15 * ONELU));
+      fill_uint_zero(g_epi_n_sig_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
+      fill_uint_zero(g_epi_fail_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
+      fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizem16 * max_thread_ct);
+      fill_uint_zero(g_epi_fail_ct2, idx2_block_sizem16 * max_thread_ct);
+      for (tidx = 0; tidx < max_thread_ct; tidx++) {
+        ulii = g_epi_idx1_block_bounds[tidx];
+        uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
+        dptr = &(g_epi_best_chisq1[g_epi_idx1_block_bounds[tidx]]);
+	dptr2 = &(g_epi_all_chisq[(marker_idx1 + ulii) * 2]);
+	for (ulkk = 0; ulkk < uljj; ulkk++) {
+          *dptr++ = dptr2[ulkk * 2];
+	}
+        ulii = g_epi_geno1_offsets[2 * ulii + 1];
+        if (ulii < marker_idx2 + cur_idx2_block_size) {
+          if (ulii <= marker_idx2) {
+            ulii = 0;
+	  } else {
+            ulii -= marker_idx2;
+	  }
+          uljj = cur_idx2_block_size - ulii;
+	  dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizem16 + ulii]);
+	  dptr2 = &(g_epi_all_chisq[(marker_idx2 + ulii) * 2]);
+          for (ulkk = 0; ulkk < uljj; ulkk++) {
+            *dptr++ = dptr2[ulkk * 2];
+	  }
+	}
+      }
+      is_last_block = (marker_idx2 + cur_idx2_block_size >= marker_ct2);
+      if (spawn_threads2(threads, &epi_linear_thread, max_thread_ct, is_last_block)) {
+	goto epistasis_linear_regression_ret_THREAD_CREATE_FAIL;
+      }
+      epi_linear_thread((void*)0);
+      join_threads2(threads, max_thread_ct, is_last_block);
+      // merge best_chisq, best_ids, fail_cts
+      for (tidx = 0; tidx < max_thread_ct; tidx++) {
+	ulii = g_epi_idx1_block_bounds[tidx];
+	uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
+	uii = g_epi_idx1_block_bounds16[tidx];
+	dptr = &(g_epi_best_chisq1[uii]);
+	uiptr = &(g_epi_best_id1[uii]);
+	uiptr2 = &(g_epi_n_sig_ct1[uii]);
+	uiptr3 = &(g_epi_fail_ct1[uii]);
+	ulii += marker_idx1;
+	dptr2 = &(best_chisq[ulii]);
+	uiptr4 = &(n_sig_cts[ulii]);
+	uiptr5 = &(fail_cts[ulii]);
+	for (block_idx1 = 0; block_idx1 < uljj; block_idx1++, dptr2++, uiptr4++, uiptr5++) {
+	  dxx = *dptr++;
+	  if (dxx > (*dptr2)) {
+	    *dptr2 = dxx;
+	    best_ids[block_idx1 + ulii] = uiptr[block_idx1];
+	  }
+	  *uiptr4 += *uiptr2++;
+	  *uiptr5 += *uiptr3++;
+	}
+      }
+      if (is_triangular) {
+	for (tidx = 0; tidx < max_thread_ct; tidx++) {
+	  block_idx2 = g_epi_geno1_offsets[2 * g_epi_idx1_block_bounds[tidx] + 1];
+	  if (block_idx2 <= marker_idx2) {
+	    block_idx2 = 0;
+	  } else {
+	    block_idx2 -= marker_idx2;
+	  }
+	  dptr = &(g_epi_best_chisq2[tidx * idx2_block_sizem16 + block_idx2]);
+	  uiptr = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
+	  uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16 + block_idx2]);
+	  uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16 + block_idx2]);
+	  dptr2 = &(best_chisq[block_idx2 + marker_idx2]);
+	  uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
+	  uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
+	  for (; block_idx2 < cur_idx2_block_size; block_idx2++, dptr2++, uiptr4++, uiptr5++) {
+	    dxx = *dptr++;
+	    if (dxx > (*dptr2)) {
+	      *dptr2 = dxx;
+	      best_ids[block_idx2 + marker_idx2] = uiptr[block_idx2];
+	    }
+	    *uiptr4 += *uiptr2++;
+	    *uiptr5 += *uiptr3++;
+	  }
+	}
+      }
+      marker_idx2 += cur_idx2_block_size;
+    } while (marker_idx2 < marker_ct2);
+    fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
+    fflush(stdout);
+    chrom_end = 0;
+    block_idx1 = 0;
+    while (1) {
+      next_unset_ul_unsafe_ck(marker_exclude1, &marker_uidx);
+      ujj = g_epi_geno1_offsets[2 * block_idx1];
+      marker_idx2 = 0;
+      dptr = &(g_epi_all_chisq[block_idx1 * 2 * marker_ct2]);
+      if (marker_uidx >= chrom_end) {
+	chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx);
+	chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
+      }
+      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+      *wptr_start++ = ' ';
+      wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
+      *wptr_start++ = ' ';
+      marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
+      for (chrom_fo_idx2 = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
+	chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
+	chrom_end2 = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx2 + 1];
+	wptr_start2 = width_force(4, wptr_start, chrom_name_write(wptr_start, chrom_info_ptr, chrom_idx2));
+	*wptr_start2++ = ' ';
+	for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, &marker_uidx2, chrom_end2), marker_idx2++, dptr = &(dptr[2])) {
+	  if (marker_idx2 == ujj) {
+	    marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
+	    if (marker_idx2 == marker_ct2) {
+	      goto epistasis_linear_regression_write_loop;
+	    }
+	    if (marker_idx2 > ujj) {
+	      marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2 - ujj);
+	      dptr = &(dptr[2 * (marker_idx2 - ujj)]);
+	      if (marker_uidx2 >= chrom_end2) {
+		break;
+	      }
+	    }
+	  } else if (marker_idx2 == marker_ct2) {
+	    goto epistasis_linear_regression_write_loop;
+	  }
+	  dxx = *dptr;
+	  if (dxx != -1) {
+	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start2);
+	    *wptr++ = ' ';
+	    // beta
+	    wptr = width_force(12, wptr, double_g_write(wptr, dptr[1]));
+            *wptr++ = ' ';
+	    wptr = width_force(12, wptr, double_g_write(wptr, dxx));
+	    *wptr++ = ' ';
+	    dxx = normdist(-sqrt(dxx)) * 2;
+	    wptr = double_g_writewx4x(wptr, MAXV(dxx, output_min_p), 12, ' ');
+	    *wptr++ = '\n';
+	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	      goto epistasis_linear_regression_ret_WRITE_FAIL;
+	    }
+	    // could remove this writeback in --epi1 1 case
+	    *dptr = -1;
+	  }
+	  marker_uidx2++;
+	}
+      }
+    epistasis_linear_regression_write_loop:
+      block_idx1++;
+      marker_uidx++;
+      if (block_idx1 >= idx1_block_size) {
+        break;
+      }
+    }
+    marker_idx1 += idx1_block_size;
+    fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
+    if (tests_complete >= pct_thresh) {
+      if (pct > 10) {
+        putchar('\b');
+      }
+      pct = (tests_complete * 100LLU) / tests_expected;
+      if (pct < 100) {
+        printf("\b\b%" PRIuPTR "%%", pct);
+        fflush(stdout);
+        pct_thresh = ((++pct) * ((uint64_t)tests_expected)) / 100;
+      }
+    }
+  } while (marker_idx1 < marker_idx1_end);
+  if (fclose_null(&outfile)) {
+    goto epistasis_linear_regression_ret_WRITE_FAIL;
+  }
+  while (0) {
+  epistasis_linear_regression_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
+  epistasis_linear_regression_ret_OPEN_FAIL:
+    retval = RET_OPEN_FAIL;
+    break;
+  epistasis_linear_regression_ret_READ_FAIL:
+    retval = RET_READ_FAIL;
+    break;
+  epistasis_linear_regression_ret_WRITE_FAIL:
+    retval = RET_WRITE_FAIL;
+    break;
+  epistasis_linear_regression_ret_INVALID_CMDLINE:
+    retval = RET_INVALID_CMDLINE;
+    break;
+  epistasis_linear_regression_ret_THREAD_CREATE_FAIL:
+    retval = RET_THREAD_CREATE_FAIL;
+    break;
+  }
+  fclose_cond(outfile);
+  // caller will free memory
+  return retval;
+}
+
+int32_t epistasis_logistic_regression(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t marker_uidx_base, uintptr_t marker_ct1, uintptr_t* marker_exclude1, uintptr_t marker_idx1_start, uintptr_t marker_idx1_end, uintptr_t marker_ct2, uintptr_t* marker_exclude2, uint32_t is_triangular, uintptr_t job_ [...]
+  FILE* outfile = NULL;
+  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
+  uintptr_t pheno_nm_cta4 = (pheno_nm_ct + 3) & (~3);
+  uintptr_t pheno_nm_ctl2 = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t final_mask = get_final_mask(pheno_nm_ct);
+  uintptr_t marker_uidx = marker_uidx_base;
+  uintptr_t pct = 1;
+  uintptr_t marker_uidx2 = 0;
+  uintptr_t marker_idx1 = marker_idx1_start;
+  uintptr_t marker_idx2 = 0;
+  uint64_t pct_thresh = tests_expected / 100;
+  uint64_t tests_complete = 0;
+  uint32_t max_thread_ct = g_epi_thread_ct;
+  uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
+  uint32_t chrom_end = 0;
+  uint32_t chrom_idx = 0;
+  uint32_t chrom_idx2 = 0;
+  int32_t retval = 0;
+  unsigned char* wkspace_mark2;
+  uintptr_t* ulptr;
+  char* wptr_start;
+  char* wptr_start2;
+  char* wptr;
+  float* fptr;
+  float* fptr2;
+  double* dptr;
+  uint32_t* uiptr;
+  uint32_t* uiptr2;
+  uint32_t* uiptr3;
+  uint32_t* uiptr4;
+  uint32_t* uiptr5;
+  uintptr_t cur_workload;
+  uintptr_t idx1_block_size;
+  uintptr_t idx2_block_size;
+  uintptr_t idx2_block_sizem16;
+  uintptr_t marker_uidx_tmp;
+  uintptr_t block_idx1;
+  uintptr_t block_idx2;
+  uintptr_t cur_idx2_block_size;
+  uintptr_t chrom_end2;
+  uintptr_t tidx;
+  uintptr_t ulii;
+  uintptr_t uljj;
+  uintptr_t ulkk;
+  double dxx;
+  float fxx;
+  uint32_t chrom_fo_idx;
+  uint32_t chrom_fo_idx2;
+  uint32_t is_last_block;
+  uint32_t uii;
+  uint32_t ujj;
+  if (wkspace_alloc_ul_checked(&g_epi_pheno_c, pheno_nm_ctl2 * sizeof(uintptr_t))) {
+    goto epistasis_logistic_regression_ret_NOMEM;
+  }
+  collapse_copy_bitarr_incl(unfiltered_sample_ct, pheno_c, pheno_nm, pheno_nm_ct, g_epi_pheno_c);
+  g_epi_pheno_nm_ct = pheno_nm_ct;
+  // per-thread buffers
+  g_epi_logistic_mt = (Epi_logistic_multithread*)wkspace_alloc(max_thread_ct * sizeof(Epi_logistic_multithread));
+  if (!g_epi_logistic_mt) {
+    goto epistasis_logistic_regression_ret_NOMEM;
+  }
+  // param_ct_max = 4 (intercept, A, B, AB)
+  for (tidx = 0; tidx < max_thread_ct; tidx++) {
+    if (wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].cur_covars_cov_major), pheno_nm_cta4 * 4 * sizeof(float)) ||
+        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].coef), 4 * sizeof(float)) ||
+        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].pp), pheno_nm_cta4 * sizeof(float)) ||
+        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].sample_1d_buf), pheno_nm_ct * sizeof(float)) ||
+        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].pheno_buf), pheno_nm_ct * sizeof(float)) ||
+        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_1d_buf), pheno_nm_ct * 4 * sizeof(float)) ||
+        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_1d_buf2), pheno_nm_ct * sizeof(float)) ||
+	wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_2d_buf), 4 * 4 * sizeof(float)) ||
+        wkspace_alloc_f_checked(&(g_epi_logistic_mt[tidx].param_2d_buf2), 4 * 4 * sizeof(float))) {
+      goto epistasis_logistic_regression_ret_NOMEM;
+    }
+  }
+
+  // claim up to half of memory with idx1 bufs; each marker currently costs:
+  //   pheno_nm_ctl2 * sizeof(intptr_t) for geno buf
+  //   4 * sizeof(int32_t) + sizeof(float) + marker_ct2 * 2 * sizeof(float)
+  //     for other stuff (see epistasis_report() comment, starting from
+  //     "offset"; main result buffer must be double-size to store both beta
+  //     and chi-square stat)
+  ulii = pheno_nm_ctl2 * sizeof(intptr_t) + 4 * sizeof(int32_t) + sizeof(float) + marker_ct2 * 2 * sizeof(float);
+  idx1_block_size = (wkspace_left - 4 * CACHELINE + 3 * sizeof(int32_t) - max_thread_ct * (5 * (CACHELINE - 4))) / (ulii * 2 + 1);
+  if (!idx1_block_size) {
+    goto epistasis_logistic_regression_ret_NOMEM;
+  }
+  if (idx1_block_size > job_size) {
+    idx1_block_size = job_size;
+  }
+  // pad to avoid threads writing to same cacheline
+  ulii = (max_thread_ct - 1) * 15 + idx1_block_size;
+  g_epi_geno1_offsets = (uint32_t*)wkspace_alloc(idx1_block_size * 2 * sizeof(int32_t));
+  g_epi_geno1 = (uintptr_t*)wkspace_alloc(pheno_nm_ctl2 * idx1_block_size * sizeof(intptr_t));
+  g_epi_all_chisq_f = (float*)wkspace_alloc(idx1_block_size * marker_ct2 * 2 * sizeof(float));
+  g_epi_best_chisq_f1 = (float*)wkspace_alloc(ulii * sizeof(float));
+  g_epi_best_id1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  g_epi_n_sig_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  g_epi_fail_ct1 = (uint32_t*)wkspace_alloc(ulii * sizeof(int32_t));
+  for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
+    g_epi_geno1[block_idx1 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
+  }
+  if (is_triangular) {
+    fill_uint_zero(g_epi_geno1_offsets, 2 * idx1_block_size);
+  }
+
+  ulii = pheno_nm_ctl2 * sizeof(intptr_t) + max_thread_ct * (3 * sizeof(int32_t) + sizeof(double));
+  idx2_block_size = (wkspace_left - (CACHELINE - sizeof(intptr_t)) - max_thread_ct * (3 * (CACHELINE - sizeof(int32_t)) + (CACHELINE - sizeof(float)))) / ulii;
+  if (idx2_block_size > marker_ct2) {
+    idx2_block_size = marker_ct2;
+  }
+  idx2_block_size = (idx2_block_size + 15) & (~(15 * ONELU));
+
+  memcpy(outname_end, ".epi.cc", 8);
+  if (parallel_tot > 1) {
+    outname_end[7] = '.';
+    uint32_writex(&(outname_end[8]), parallel_idx + 1, '\0');
+  }
+  if (fopen_checked(&outfile, outname, "w")) {
+    goto epistasis_logistic_regression_ret_OPEN_FAIL;
+  }
+  if (!parallel_idx) {
+    wptr = memcpya(tbuf, "CHR1 ", 5);
+    wptr = fw_strcpyn(plink_maxsnp, 4, "SNP1", wptr);
+    wptr = memcpya(wptr, " CHR2 ", 6);
+    wptr = fw_strcpyn(plink_maxsnp, 4, "SNP2", wptr);
+    wptr = memcpya(wptr, "       OR_INT         STAT            P \n", 41);
+    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      goto epistasis_logistic_regression_ret_WRITE_FAIL;
+    }
+  }
+
+  wkspace_mark2 = wkspace_base;
+  while (1) {
+    if (!idx2_block_size) {
+      goto epistasis_logistic_regression_ret_NOMEM;
+    }
+    if (!(wkspace_alloc_ul_checked(&g_epi_geno2, pheno_nm_ctl2 * idx2_block_size * sizeof(intptr_t)) ||
+          wkspace_alloc_f_checked(&g_epi_best_chisq_f2, max_thread_ct * idx2_block_size * sizeof(float)) ||
+          wkspace_alloc_ui_checked(&g_epi_best_id2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
+          wkspace_alloc_ui_checked(&g_epi_n_sig_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)) ||
+          wkspace_alloc_ui_checked(&g_epi_fail_ct2, max_thread_ct * idx2_block_size * sizeof(int32_t)))) {
+      break;
+    }
+    wkspace_reset(wkspace_mark2);
+    idx2_block_size -= 16;
+  }
+  for (block_idx2 = 0; block_idx2 < idx2_block_size; block_idx2++) {
+    g_epi_geno2[block_idx2 * pheno_nm_ctl2 + pheno_nm_ctl2 - 1] = 0;
+  }
+  marker_uidx = next_unset_ul_unsafe(marker_exclude1, marker_uidx_base);
+  if (marker_idx1) {
+    marker_uidx = jump_forward_unset_unsafe(marker_exclude1, marker_uidx + 1, marker_idx1);
+  }
+  wptr = memcpya(logbuf, "C/C --epistasis to ", 19);
+  wptr = strcpya(wptr, outname);
+  memcpy(wptr, " ... ", 6);
+  wordwrap(logbuf, 16); // strlen("99% [processing]")
+  logprintb();
+  fputs("0%", stdout);
+  do {
+    fputs(" [processing]", stdout);
+    fflush(stdout);
+    if (idx1_block_size > marker_idx1_end - marker_idx1) {
+      idx1_block_size = marker_idx1_end - marker_idx1;
+      if (idx1_block_size < max_thread_ct) {
+        max_thread_ct = idx1_block_size;
+        g_epi_thread_ct = max_thread_ct;
+      }
+    }
+    g_epi_marker_idx1 = marker_idx1;
+    fptr = g_epi_all_chisq_f;
+    fptr2 = &(g_epi_all_chisq_f[idx1_block_size * marker_ct2 * 2]);
+    do {
+      *fptr = -1;
+      fptr = &(fptr[2]);
+    } while (fptr < fptr2);
+    marker_uidx_tmp = marker_uidx;
+    if (fseeko(bedfile, bed_offset + (marker_uidx * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+      goto epistasis_logistic_regression_ret_READ_FAIL;
+    }
+    cur_workload = idx1_block_size * marker_ct2;
+    if (is_triangular) {
+      for (block_idx1 = 0; block_idx1 < idx1_block_size; block_idx1++) {
+        ulii = block_idx1 + marker_idx1 + 1;
+        cur_workload -= ulii;
+        g_epi_geno1_offsets[2 * block_idx1 + 1] = ulii;
+      }
+    } else {
+      fill_uint_zero(g_epi_geno1_offsets, 2 * idx1_block_size);
+      marker_uidx2 = marker_uidx_base;
+      marker_idx2 = 0;
+    }
+    tests_complete += cur_workload;
+    ulii = 0; // total number of tests
+    g_epi_idx1_block_bounds[0] = 0;
+    g_epi_idx1_block_bounds16[0] = 0;
+    block_idx1 = 0;
+    for (tidx = 1; tidx < max_thread_ct; tidx++) {
+      uljj = (((uint64_t)cur_workload) * tidx) / max_thread_ct;
+      if (is_triangular) {
+        do {
+          ulii += marker_ct2 - g_epi_geno1_offsets[2 * block_idx1 + 1];
+          block_idx1++;
+	} while (ulii < uljj);
+      } else {
+        do {
+	  ulii += marker_ct2;
+	  block_idx1++;
+	} while (ulii < uljj);
+      }
+      uii = block_idx1 - g_epi_idx1_block_bounds[tidx - 1];
+      g_epi_idx1_block_bounds[tidx] = block_idx1;
+      g_epi_idx1_block_bounds16[tidx] = g_epi_idx1_block_bounds16[tidx - 1] + ((uii + 15) & (~15));
+    }
+    g_epi_idx1_block_bounds[max_thread_ct] = idx1_block_size;
+    chrom_end = 0;
+    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx_tmp++, block_idx1++) {
+      if (IS_SET(marker_exclude1, marker_uidx_tmp)) {
+	marker_uidx_tmp = next_unset_ul_unsafe(marker_exclude1, marker_uidx_tmp);
+        if (fseeko(bedfile, bed_offset + (marker_uidx_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+          goto epistasis_logistic_regression_ret_READ_FAIL;
+	}
+      }
+      // marker_reverse deliberately flipped
+      if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx_tmp))) {
+        goto epistasis_logistic_regression_ret_READ_FAIL;
+      }
+      // rotate to hom A1 = 10, het = 01, hom A2 = 00, missing = 11, to allow
+      // inner loop to use ordinary multiplication
+      // this is a bit redundant with the forced reverse, but it's not a
+      // bottleneck
+      rotate_plink1_to_plink2_and_copy(loadbuf, &(g_epi_geno1[block_idx1 * pheno_nm_ctl2]), pheno_nm_ctl2);
+      if (!is_triangular) {
+	if (!IS_SET(marker_exclude2, marker_uidx_tmp)) {
+          // do not compare against self
+          marker_idx2 += marker_uidx_tmp - marker_uidx2 - popcount_bit_idx(marker_exclude2, marker_uidx2, marker_uidx_tmp);
+          marker_uidx2 = marker_uidx_tmp;
+          g_epi_geno1_offsets[2 * block_idx1] = marker_idx2;
+          g_epi_geno1_offsets[2 * block_idx1 + 1] = marker_idx2 + 1;
+          gap_cts[block_idx1 + marker_idx1] = 1;
+	}
+      }
+    }
+    marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
+    if (is_triangular) {
+      marker_idx2 = marker_idx1 + 1;
+      marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2);
+    } else {
+      marker_idx2 = 0;
+    }
+    if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+      goto epistasis_logistic_regression_ret_READ_FAIL;
+    }
+    cur_idx2_block_size = idx2_block_size;
+    do {
+      if (cur_idx2_block_size > marker_ct2 - marker_idx2) {
+        cur_idx2_block_size = marker_ct2 - marker_idx2;
+      }
+      for (block_idx2 = 0; block_idx2 < cur_idx2_block_size; marker_uidx2++, block_idx2++) {
+        if (IS_SET(marker_exclude2, marker_uidx2)) {
+          marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx2);
+          if (fseeko(bedfile, bed_offset + (marker_uidx2 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+            goto epistasis_logistic_regression_ret_READ_FAIL;
+	  }
+	}
+        ulptr = &(g_epi_geno2[block_idx2 * pheno_nm_ctl2]);
+	// marker_reverse deliberately flipped
+	if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf, pheno_nm_ct, pheno_nm, final_mask, !IS_SET(marker_reverse, marker_uidx2))) {
+	  goto epistasis_logistic_regression_ret_READ_FAIL;
+	}
+	rotate_plink1_to_plink2_and_copy(loadbuf, ulptr, pheno_nm_ctl2);
+      }
+      g_epi_idx2_block_size = cur_idx2_block_size;
+      g_epi_idx2_block_start = marker_idx2;
+      idx2_block_sizem16 = (cur_idx2_block_size + 15) & (~(15 * ONELU));
+      fill_uint_zero(g_epi_n_sig_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
+      fill_uint_zero(g_epi_fail_ct1, idx1_block_size + 15 * (max_thread_ct - 1));
+      fill_uint_zero(g_epi_n_sig_ct2, idx2_block_sizem16 * max_thread_ct);
+      fill_uint_zero(g_epi_fail_ct2, idx2_block_sizem16 * max_thread_ct);
+      for (tidx = 0; tidx < max_thread_ct; tidx++) {
+        ulii = g_epi_idx1_block_bounds[tidx];
+        uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
+        fptr = &(g_epi_best_chisq_f1[g_epi_idx1_block_bounds[tidx]]);
+	fptr2 = &(g_epi_all_chisq_f[(marker_idx1 + ulii) * 2]);
+	for (ulkk = 0; ulkk < uljj; ulkk++) {
+          *fptr++ = fptr2[ulkk * 2];
+	}
+        ulii = g_epi_geno1_offsets[2 * ulii + 1];
+        if (ulii < marker_idx2 + cur_idx2_block_size) {
+          if (ulii <= marker_idx2) {
+            ulii = 0;
+	  } else {
+            ulii -= marker_idx2;
+	  }
+          uljj = cur_idx2_block_size - ulii;
+	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizem16 + ulii]);
+	  fptr2 = &(g_epi_all_chisq_f[(marker_idx2 + ulii) * 2]);
+          for (ulkk = 0; ulkk < uljj; ulkk++) {
+            *fptr++ = fptr2[ulkk * 2];
+	  }
+	}
+      }
+      is_last_block = (marker_idx2 + cur_idx2_block_size >= marker_ct2);
+      if (spawn_threads2(threads, &epi_logistic_thread, max_thread_ct, is_last_block)) {
+	goto epistasis_logistic_regression_ret_THREAD_CREATE_FAIL;
+      }
+      epi_logistic_thread((void*)0);
+      join_threads2(threads, max_thread_ct, is_last_block);
+      // merge best_chisq, best_ids, fail_cts
+      for (tidx = 0; tidx < max_thread_ct; tidx++) {
+	ulii = g_epi_idx1_block_bounds[tidx];
+	uljj = g_epi_idx1_block_bounds[tidx + 1] - ulii;
+	uii = g_epi_idx1_block_bounds16[tidx];
+	fptr = &(g_epi_best_chisq_f1[uii]);
+	uiptr = &(g_epi_best_id1[uii]);
+	uiptr2 = &(g_epi_n_sig_ct1[uii]);
+	uiptr3 = &(g_epi_fail_ct1[uii]);
+	ulii += marker_idx1;
+	dptr = &(best_chisq[ulii]);
+	uiptr4 = &(n_sig_cts[ulii]);
+	uiptr5 = &(fail_cts[ulii]);
+	for (block_idx1 = 0; block_idx1 < uljj; block_idx1++, dptr++, uiptr4++, uiptr5++) {
+	  dxx = (double)(*fptr++);
+	  if (dxx > (*dptr)) {
+	    *dptr = dxx;
+	    best_ids[block_idx1 + ulii] = uiptr[block_idx1];
+	  }
+	  *uiptr4 += *uiptr2++;
+	  *uiptr5 += *uiptr3++;
+	}
+      }
+      if (is_triangular) {
+	for (tidx = 0; tidx < max_thread_ct; tidx++) {
+	  block_idx2 = g_epi_geno1_offsets[2 * g_epi_idx1_block_bounds[tidx] + 1];
+	  if (block_idx2 <= marker_idx2) {
+	    block_idx2 = 0;
+	  } else {
+	    block_idx2 -= marker_idx2;
+	  }
+	  fptr = &(g_epi_best_chisq_f2[tidx * idx2_block_sizem16 + block_idx2]);
+	  uiptr = &(g_epi_best_id2[tidx * idx2_block_sizem16]);
+	  uiptr2 = &(g_epi_n_sig_ct2[tidx * idx2_block_sizem16 + block_idx2]);
+	  uiptr3 = &(g_epi_fail_ct2[tidx * idx2_block_sizem16 + block_idx2]);
+	  dptr = &(best_chisq[block_idx2 + marker_idx2]);
+	  uiptr4 = &(n_sig_cts[block_idx2 + marker_idx2]);
+	  uiptr5 = &(fail_cts[block_idx2 + marker_idx2]);
+	  for (; block_idx2 < cur_idx2_block_size; block_idx2++, dptr++, uiptr4++, uiptr5++) {
+	    dxx = (double)(*fptr++);
+	    if (dxx > (*dptr)) {
+	      *dptr = dxx;
+	      best_ids[block_idx2 + marker_idx2] = uiptr[block_idx2];
+	    }
+	    *uiptr4 += *uiptr2++;
+	    *uiptr5 += *uiptr3++;
+	  }
+	}
+      }
+      marker_idx2 += cur_idx2_block_size;
+    } while (marker_idx2 < marker_ct2);
+    fputs("\b\b\b\b\b\b\b\b\b\b\bwriting]   \b\b\b", stdout);
+    fflush(stdout);
+    chrom_end = 0;
+    block_idx1 = 0;
+    while (1) {
+      next_unset_ul_unsafe_ck(marker_exclude1, &marker_uidx);
+      ujj = g_epi_geno1_offsets[2 * block_idx1];
+      marker_idx2 = 0;
+      fptr = &(g_epi_all_chisq_f[block_idx1 * 2 * marker_ct2]);
+      if (marker_uidx >= chrom_end) {
+	chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx);
+	chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+	chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
+      }
+      wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+      *wptr_start++ = ' ';
+      wptr_start = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
+      *wptr_start++ = ' ';
+      marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
+      for (chrom_fo_idx2 = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
+	chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
+	chrom_end2 = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx2 + 1];
+	wptr_start2 = width_force(4, wptr_start, chrom_name_write(wptr_start, chrom_info_ptr, chrom_idx2));
+	*wptr_start2++ = ' ';
+	for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, &marker_uidx2, chrom_end2), marker_idx2++, fptr = &(fptr[2])) {
+	  if (marker_idx2 == ujj) {
+	    marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
+	    if (marker_idx2 == marker_ct2) {
+	      goto epistasis_logistic_regression_write_loop;
+	    }
+	    if (marker_idx2 > ujj) {
+	      marker_uidx2 = jump_forward_unset_unsafe(marker_exclude2, marker_uidx2 + 1, marker_idx2 - ujj);
+	      fptr = &(fptr[2 * (marker_idx2 - ujj)]);
+	      if (marker_uidx2 >= chrom_end2) {
+		break;
+	      }
+	    }
+	  } else if (marker_idx2 == marker_ct2) {
+	    goto epistasis_logistic_regression_write_loop;
+	  }
+	  fxx = *fptr;
+	  if (fxx != -1) {
+	    dxx = (double)fxx;
+	    wptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx2 * max_marker_id_len]), wptr_start2);
+	    *wptr++ = ' ';
+	    // odds ratio
+	    wptr = width_force(12, wptr, double_g_write(wptr, exp((double)fptr[1])));
+            *wptr++ = ' ';
+	    wptr = width_force(12, wptr, float_g_write(wptr, fxx));
+	    *wptr++ = ' ';
+	    dxx = normdist(-sqrt(dxx)) * 2;
+	    wptr = double_g_writewx4x(wptr, MAXV(dxx, output_min_p), 12, ' ');
+	    *wptr++ = '\n';
+	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	      goto epistasis_logistic_regression_ret_WRITE_FAIL;
+	    }
+	    // could remove this writeback in --epi1 1 case
+	    *fptr = -1;
+	  }
+	  marker_uidx2++;
+	}
+      }
+    epistasis_logistic_regression_write_loop:
+      block_idx1++;
+      marker_uidx++;
+      if (block_idx1 >= idx1_block_size) {
+        break;
+      }
+    }
+    marker_idx1 += idx1_block_size;
+    fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
+    if (tests_complete >= pct_thresh) {
+      if (pct > 10) {
+        putchar('\b');
+      }
+      pct = (tests_complete * 100LLU) / tests_expected;
+      if (pct < 100) {
+        printf("\b\b%" PRIuPTR "%%", pct);
+        fflush(stdout);
+        pct_thresh = ((++pct) * ((uint64_t)tests_expected)) / 100;
+      }
+    }
+  } while (marker_idx1 < marker_idx1_end);
+  if (fclose_null(&outfile)) {
+    goto epistasis_logistic_regression_ret_WRITE_FAIL;
+  }
+  while (0) {
+  epistasis_logistic_regression_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
+  epistasis_logistic_regression_ret_OPEN_FAIL:
+    retval = RET_OPEN_FAIL;
+    break;
+  epistasis_logistic_regression_ret_READ_FAIL:
+    retval = RET_READ_FAIL;
+    break;
+  epistasis_logistic_regression_ret_WRITE_FAIL:
+    retval = RET_WRITE_FAIL;
+    break;
+  epistasis_logistic_regression_ret_THREAD_CREATE_FAIL:
+    retval = RET_THREAD_CREATE_FAIL;
+    break;
+  }
+  fclose_cond(outfile);
+  // caller will free memory
+  return retval;
+}
+
+int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct2, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t ctrl_ct, uintptr_t* pheno_c, double* pheno_d, uint32_t parallel_idx, uint32_t pa [...]
+  unsigned char* wkspace_mark = wkspace_base;
+  FILE* outfile = NULL;
+  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
+  uintptr_t unfiltered_sample_ctv2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+  uintptr_t final_mask = get_final_mask(pheno_nm_ct);
+  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t marker_uidx_base = next_unset_unsafe(marker_exclude, 0);
+  uintptr_t marker_uidx = marker_uidx_base;
+  uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
+  uint32_t modifier = epi_ip->modifier;
+  uint32_t is_fast = modifier & EPI_FAST;
+  uint32_t is_boost = (modifier / EPI_FAST_BOOST) & 1;
+  uint32_t do_joint_effects = modifier & EPI_FAST_JOINT_EFFECTS;
+  uint32_t no_ueki = modifier & EPI_FAST_NO_UEKI;
+  uint32_t is_case_only = (modifier / EPI_FAST_CASE_ONLY) & 1;
+  uint32_t is_triangular = 1;
+  uint32_t is_custom_set1 = modifier & (EPI_SET_BY_SET | EPI_SET_BY_ALL)? 1 : 0;
+  uint32_t is_set_by_set = modifier & EPI_SET_BY_SET;
+  uint32_t tot_stride = 6 - 3 * is_case_only;
+  uint32_t no_p_value = modifier & EPI_FAST_NO_P_VALUE;
+  uint32_t case_only_gap = epi_ip->case_only_gap;
+  uint32_t is_case_only_window = (is_case_only && case_only_gap);
+  uint32_t case_ct = pheno_nm_ct - ctrl_ct;
+  uint32_t cellminx3 = 0;
+  uintptr_t case_ctl2 = (case_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t case_ctv2 = 2 * ((case_ct + (BITCT - 1)) / BITCT);
+  uintptr_t ctrl_ctl2 = (ctrl_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t case_ctv3 = 2 * ((case_ct + (2 * BITCT - 1)) / (2 * BITCT));
+  uintptr_t ctrl_ctv3 = 2 * ((ctrl_ct + (2 * BITCT - 1)) / (2 * BITCT));
+  uintptr_t case_ctsplit = 3 * case_ctv3;
+  uintptr_t ctrl_ctsplit = 3 * ctrl_ctv3;
+  uintptr_t pct = 1;
+  uintptr_t marker_uidx2 = 0;
+  uintptr_t marker_uidx2_trail = 0;
+  uintptr_t marker_idx2 = 0;
+  uintptr_t marker_idx2_trail = 0;
+  uint64_t tests_thrown_out = 0;
   uint64_t tests_complete = 0;
   uint32_t max_thread_ct = g_thread_ct;
   uint32_t chrom_idx = 0;
@@ -7438,6 +9233,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   uint32_t is_last_block;
   uint32_t missing_ct;
   uint32_t ujj;
+
   // common initialization between --epistasis and --fast-epistasis: remove
   // monomorphic and non-autosomal diploid sites
   if (is_custom_set1) {
@@ -7719,14 +9515,17 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     dxx = ltqnorm(epi_ip->epi2 / 2);
     g_epi_alpha2sq[0] = dxx * dxx;
   }
-  pct_thresh = tests_expected / 100;
   if (!is_fast) {
-    // hmm, this might not end up as a separate function
-    retval = epistasis_regression();
+    if (pheno_d) {
+      retval = epistasis_linear_regression(threads, epi_ip, bedfile, bed_offset, unfiltered_marker_ct, marker_reverse, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, marker_uidx_base, marker_ct1, marker_exclude1, marker_idx1_start, marker_idx1_end, marker_ct2, marker_exclude2, is_triangular, job_size, tests_expected, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, pheno_d, parallel_idx, parallel_tot, outname, outname_end, output_min_p, glm_vif_thresh, loadbuf, casebuf, best_ch [...]
+    } else {
+      retval = epistasis_logistic_regression(threads, epi_ip, bedfile, bed_offset, unfiltered_marker_ct, marker_reverse, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, marker_uidx_base, marker_ct1, marker_exclude1, marker_idx1_start, marker_idx1_end, marker_ct2, marker_exclude2, is_triangular, job_size, tests_expected, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, pheno_c, parallel_idx, parallel_tot, outname, outname_end, output_min_p, loadbuf, casebuf, best_chisq, best_ids, [...]
+    }
     if (retval) {
       goto epistasis_report_ret_1;
     }
   } else {
+    pct_thresh = tests_expected / 100;
     if (is_case_only) {
       g_epi_ctrl_ct = 0;
       ctrl_ctv3 = 0;
@@ -7761,7 +9560,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	goto epistasis_report_ret_WRITE_FAIL;
       }
     }
-    // claim up to half of memory with idx1 bufs; each marker currently costs
+    // claim up to half of memory with idx1 bufs; each marker currently costs:
     //   (case_ctsplit + ctrl_ctsplit) * sizeof(intptr_t) for loose geno buf
     //   0.25 for missing tracker
     //   sizeof(int32_t) for offset (to skip bottom left triangle, and/or
@@ -7857,9 +9656,10 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
     }
     wptr = memcpya(wptr, " to ", 4);
     wptr = strcpya(wptr, outname);
-    memcpy(wptr, "...", 4);
+    memcpy(wptr, " ... ", 6);
+    wordwrap(logbuf, 16); // strlen("99% [processing]") 
     logprintb();
-    fputs(" 0%", stdout);
+    fputs("0%", stdout);
     do {
       fputs(" [processing]", stdout);
       fflush(stdout);
@@ -8147,15 +9947,7 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
       fflush(stdout);
       chrom_end = 0;
       block_idx1 = 0;
-      goto epistasis_report_write_start;
       while (1) {
-      epistasis_report_write_loop:
-	block_idx1++;
-	marker_uidx++;
-	if (block_idx1 >= idx1_block_size) {
-	  break;
-	}
-      epistasis_report_write_start:
 	next_unset_ul_unsafe_ck(marker_exclude1, &marker_uidx);
 	ujj = g_epi_geno1_offsets[2 * block_idx1];
 	marker_idx2 = 0;
@@ -8238,6 +10030,12 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	    marker_uidx2++;
 	  }
 	}
+      epistasis_report_write_loop:
+	block_idx1++;
+	marker_uidx++;
+	if (block_idx1 >= idx1_block_size) {
+	  break;
+	}
       }
       marker_idx1 += idx1_block_size;
       fputs("\b\b\b\b\b\b\b\b\b\b          \b\b\b\b\b\b\b\b\b\b", stdout);
@@ -8253,9 +10051,9 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
 	}
       }
     } while (marker_idx1 < marker_idx1_end);
-  }
-  if (fclose_null(&outfile)) {
-    goto epistasis_report_ret_WRITE_FAIL;
+    if (fclose_null(&outfile)) {
+      goto epistasis_report_ret_WRITE_FAIL;
+    }
   }
   memcpy(&(outname_end[7]), ".summary", 9);
   if (parallel_tot > 1) {
@@ -8351,9 +10149,9 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
   if (is_triangular) {
     tests_thrown_out /= 2; // all fails double-counted in triangle case
   }
-  fputs("\b\b\b", stdout);
-  LOGPRINTF(" done.\n");
-  LOGPRINTFWW("%" PRIu64 " valid tests performed, summary written to %s .\n", tests_expected - tests_thrown_out, outname);
+  fputs("\b\b", stdout);
+  LOGPRINTF("done.\n");
+  LOGPRINTFWW("%" PRIu64 " valid test%s performed, summary written to %s .\n", tests_expected - tests_thrown_out, (tests_expected - tests_thrown_out == 1)? "" : "s", outname);
 
   while (0) {
   epistasis_report_ret_NOMEM:
diff --git a/plink_ld.h b/plink_ld.h
index e0a1e72..573a579 100644
--- a/plink_ld.h
+++ b/plink_ld.h
@@ -120,7 +120,7 @@ int32_t haploview_blocks(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uin
 
 int32_t twolocus(Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t pheno_ctrl_ct, uintptr_t* pheno_c, uintptr_t* sex_male,  [...]
 
-int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t ctrl_ct, uintptr_t* pheno_c, double* pheno_d, uint32_t parallel_idx, uint32_t par [...]
+int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm, uint32_t pheno_nm_ct, uint32_t ctrl_ct, uintptr_t* pheno_c, double* pheno_d, uint32_t parallel_idx, uint32_t par [...]
 
 int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_pos, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uintptr_t* sex_male, char* outname, char* outname_end, uint32_t hh_exists);
 
diff --git a/plink_matrix.c b/plink_matrix.c
index 9e3e989..4abb262 100644
--- a/plink_matrix.c
+++ b/plink_matrix.c
@@ -283,7 +283,7 @@ int32_t invert_matrix(__CLPK_integer dim, double* matrix, MATRIX_INVERT_BUF1_TYP
   dgetrf_(&dim, &dim, matrix, &dim, int_1d_buf, &info);
   dgetri_(&dim, matrix, &dim, int_1d_buf, dbl_2d_buf, &lwork, &info);
   if (info) {
-    return -1;
+    return 1;
   }
   return 0;
 }
diff --git a/plink_misc.c b/plink_misc.c
index cc05893..70a6fe4 100644
--- a/plink_misc.c
+++ b/plink_misc.c
@@ -1200,55 +1200,102 @@ uint32_t flip_str(char** allele_str_ptr) {
   return 2;
 }
 
+uint32_t flip_process_token(char* tok_start, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_exclude, uintptr_t* already_seen, char** marker_allele_ptrs, uintptr_t* hit_ct_ptr, uintptr_t* miss_ct_ptr, uint32_t* non_acgt_ct_ptr) {
+  uint32_t slen = strlen_se(tok_start);
+  uintptr_t marker_uidx;
+  uint32_t cur_non_acgt0;
+  marker_uidx = id_htable_find(tok_start, slen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
+  if ((marker_uidx == 0xffffffffU) || IS_SET(marker_exclude, marker_uidx)) {
+    *miss_ct_ptr += 1;
+    return 0;
+  }
+  if (IS_SET(already_seen, marker_uidx)) {
+    tok_start[slen] = '\0';
+    LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --flip file.\n", tok_start);
+    return 1;
+  }
+  SET_BIT(already_seen, marker_uidx);
+  cur_non_acgt0 = 0;
+  cur_non_acgt0 |= flip_str(&(marker_allele_ptrs[2 * marker_uidx]));
+  cur_non_acgt0 |= flip_str(&(marker_allele_ptrs[2 * marker_uidx + 1]));
+  *non_acgt_ct_ptr += (cur_non_acgt0 & 1);
+  *hit_ct_ptr += (cur_non_acgt0 >> 1);
+  return 0;
+}
+
 int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, char** marker_allele_ptrs) {
   unsigned char* wkspace_mark = wkspace_base;
   FILE* flipfile = NULL;
-  uint32_t non_acgt_ct = 0;
-  int32_t retval = 0;
   uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
   uintptr_t hit_ct = 0;
   uintptr_t miss_ct = 0;
-  uintptr_t line_idx = 0;
+  char* midbuf = &(tbuf[MAXLINELEN]);
+  uint32_t non_acgt_ct = 0;
+  uint32_t curtoklen = 0;
+  int32_t retval = 0;
+  uintptr_t bufsize;
   uintptr_t* already_seen;
+  char* bufptr0;
   char* bufptr;
-  uint32_t slen;
-  uint32_t cur_non_acgt0;
-  uintptr_t marker_uidx;
+  char* bufptr2;
+  char* bufptr3;
   if (wkspace_alloc_ul_checked(&already_seen, unfiltered_marker_ctl * sizeof(intptr_t))) {
     goto flip_strand_ret_NOMEM;
   }
   fill_ulong_zero(already_seen, unfiltered_marker_ctl);
-  if (fopen_checked(&flipfile, flip_fname, "r")) {
+  // Compatibility fix: PLINK 1.07 uses a token- rather than a line-based
+  // loader here.
+  if (fopen_checked(&flipfile, flip_fname, "rb")) {
     goto flip_strand_ret_OPEN_FAIL;
   }
-  tbuf[MAXLINELEN - 1] = ' ';
-  while (fgets(tbuf, MAXLINELEN, flipfile)) {
-    line_idx++;
-    if (!tbuf[MAXLINELEN - 1]) {
-      sprintf(logbuf, "Error: Line %" PRIuPTR " of --flip file is pathologically long.\n", line_idx);
-      goto flip_strand_ret_INVALID_FORMAT_2;
+  while (1) {
+    if (fread_checked(midbuf, MAXLINELEN, flipfile, &bufsize)) {
+      goto flip_strand_ret_READ_FAIL;
     }
-    bufptr = skip_initial_spaces(tbuf);
-    if (is_eoln_kns(*bufptr)) {
-      continue;
+    if (!bufsize) {
+      if (curtoklen) {
+        if (flip_process_token(&(tbuf[MAXLINELEN - curtoklen]), marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, marker_exclude, already_seen, marker_allele_ptrs, &hit_ct, &miss_ct, &non_acgt_ct)) {
+	  goto flip_strand_ret_INVALID_FORMAT_2;
+	}
+      }
+      break;
     }
-    slen = strlen_se(bufptr);
-    marker_uidx = id_htable_find(bufptr, slen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
-    if ((marker_uidx == 0xffffffffU) || IS_SET(marker_exclude, marker_uidx)) {
-      miss_ct++;
-      continue;
+    bufptr0 = &(midbuf[bufsize]);
+    *bufptr0 = ' ';
+    bufptr0[1] = '0';
+    bufptr = &(tbuf[MAXLINELEN - curtoklen]);
+    bufptr2 = midbuf;
+    if (curtoklen) {
+      goto flip_strand_tok_start;
     }
-    if (IS_SET(already_seen, marker_uidx)) {
-      bufptr[slen] = '\0';
-      LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --flip file.\n", bufptr);
-      goto flip_strand_ret_INVALID_FORMAT_2;
+    while (1) {
+      while (*bufptr <= ' ') {
+	bufptr++;
+      }
+      if (bufptr >= bufptr0) {
+        curtoklen = 0;
+        break;
+      }
+      bufptr2 = &(bufptr[1]);
+    flip_strand_tok_start:
+      while (*bufptr2 > ' ') {
+        bufptr2++;
+      }
+      curtoklen = (uintptr_t)(bufptr2 - bufptr);
+      if (bufptr2 == &(tbuf[MAXLINELEN * 2])) {
+        if (curtoklen > MAX_ID_LEN) {
+	  logprint("Error: Excessively long ID in --flip file.\n");
+	  goto flip_strand_ret_INVALID_FORMAT;
+	}
+        bufptr3 = &(tbuf[MAXLINELEN - curtoklen]);
+        memcpy(bufptr3, bufptr, curtoklen);
+	break;
+      }
+      if (flip_process_token(bufptr, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, marker_exclude, already_seen, marker_allele_ptrs, &hit_ct, &miss_ct, &non_acgt_ct)) {
+	goto flip_strand_ret_INVALID_FORMAT_2;
+      }
+      bufptr = &(bufptr2[1]);
     }
-    SET_BIT(already_seen, marker_uidx);
-    cur_non_acgt0 = 0;
-    cur_non_acgt0 |= flip_str(&(marker_allele_ptrs[2 * marker_uidx]));
-    cur_non_acgt0 |= flip_str(&(marker_allele_ptrs[2 * marker_uidx + 1]));
-    non_acgt_ct += (cur_non_acgt0 & 1);
-    hit_ct += (cur_non_acgt0 >> 1);
   }
   if (!feof(flipfile)) {
     goto flip_strand_ret_READ_FAIL;
@@ -1274,6 +1321,7 @@ int32_t flip_strand(char* flip_fname, uint32_t* marker_id_htable, uint32_t marke
     break;
   flip_strand_ret_INVALID_FORMAT_2:
     logprintb();
+  flip_strand_ret_INVALID_FORMAT:
     retval = RET_INVALID_FORMAT;
     break;
   }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink2.git



More information about the debian-med-commit mailing list