[med-svn] [plink1.9] 01/01: Remove the LP64->SSE2 patch again
Gert Wollny
gert-guest at moszumanska.debian.org
Sat Sep 19 18:17:14 UTC 2015
This is an automated email from the git hooks/post-receive script.
gert-guest pushed a commit to branch master
in repository plink1.9.
commit 9d0c74fca25259b2edba6791b5c5b94be02373d9
Author: Gert Wollny <gw.fossdev at gmail.com>
Date: Sat Sep 19 20:14:53 2015 +0200
Remove the LP64->SSE2 patch again
It would fix the build on non-x86 64-bit archs, but these builds would
not be usable, because sometimes the define __LP64__ is used for enabling
the SSE2 code path, but sometimes it is also used to select an alternative
64-bit code path.
---
debian/changelog | 3 -
debian/patches/03_replace_LP64_by_SSE2.patch | 2589 --------------------------
debian/patches/series | 1 -
3 files changed, 2593 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index 41a6cff..0647af9 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,9 +5,6 @@ plink1.9 (1.90~b3w-150903-1) UNRELEASED; urgency=low
* debian/upstream/metadata: Update reference paper
* debian/copyright: Update
- [Gert Wollny]
- * Add patch 03_replace_LP64_by_SSE2 (Closes: #799471)
-
-- Dylan Aïssi <bob.dybian at gmail.com> Sat, 19 Sep 2015 15:44:10 +0200
plink1.9 (1.90~b3b-150117-1) unstable; urgency=low
diff --git a/debian/patches/03_replace_LP64_by_SSE2.patch b/debian/patches/03_replace_LP64_by_SSE2.patch
deleted file mode 100644
index b6f7c23..0000000
--- a/debian/patches/03_replace_LP64_by_SSE2.patch
+++ /dev/null
@@ -1,2589 +0,0 @@
-Description: Replace the test for LP64 by a test for SSE2
- This patch replaces the test for the define __LP64__ by a test for
- __SSE2__ since the former only provides information abouth whether
- the arch is 64 bit, while actually the availability of SSE2 needs
- to be tested.
-Author: Gert Wollny <gw.fossdev at gmail.com>
-Bug-Debian: http://bugs.debian.org/799471
-Forwarded: no
-Last-Update: 2015-09-19
-
---- plink1.9-1.90~b3w-150903.orig/Rsrv.h
-+++ plink1.9-1.90~b3w-150903/Rsrv.h
-@@ -355,11 +355,11 @@ typedef unsigned long rlen_t;
- #ifdef ULONG_MAX
- #define rlen_max ULONG_MAX
- #else
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define rlen_max 0xffffffffffffffffL
- #else
- #define rlen_max 0xffffffffL
--#endif /* __LP64__ */
-+#endif /* __SSE2__ */
- #endif /* ULONG_MAX */
-
-
---- plink1.9-1.90~b3w-150903.orig/SFMT.c
-+++ plink1.9-1.90~b3w-150903/SFMT.c
-@@ -48,7 +48,7 @@ extern "C" {
- #include <assert.h>
- #include "SFMT.h"
-
--#ifndef __LP64__
-+#ifndef __SSE2__
- inline static void do_recursion(w128_t * r, w128_t * a, w128_t * b,
- w128_t * c, w128_t * d);
- #endif
-@@ -110,7 +110,7 @@ inline static void lshift128(w128_t *out
- * @param c a 128-bit part of the internal state array
- * @param d a 128-bit part of the internal state array
- */
--#ifndef __LP64__
-+#ifndef __SSE2__
- inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
- w128_t *c, w128_t *d)
- {
-@@ -144,7 +144,7 @@ inline static uint32_t func1(uint32_t x)
- inline static uint32_t func2(uint32_t x);
- static void period_certification(sfmt_t * sfmt);
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- inline static void mm_recursion(__m128i * r, __m128i a, __m128i b,
- __m128i c, __m128i d);
-
-@@ -255,7 +255,7 @@ inline static int idxof(int i) {
- return i;
- }
-
--#ifndef __LP64__
-+#ifndef __SSE2__
- /**
- * This function fills the user-specified array with pseudorandom
- * integers.
-@@ -391,7 +391,7 @@ int sfmt_get_min_array_size64(sfmt_t * s
- return SFMT_N64;
- }
-
--#ifndef __LP64__
-+#ifndef __SSE2__
- /**
- * This function fills the internal state array with pseudorandom
- * integers.
---- plink1.9-1.90~b3w-150903.orig/SFMT.h
-+++ plink1.9-1.90~b3w-150903/SFMT.h
-@@ -128,7 +128,7 @@ extern "C" {
- /*------------------------------------------
- 128-bit SIMD like data type for standard C
- ------------------------------------------*/
--#ifdef __LP64__
-+#ifdef __SSE2__
- #include <emmintrin.h>
-
- /** 128-bit data structure */
---- plink1.9-1.90~b3w-150903.orig/plink.c
-+++ plink1.9-1.90~b3w-150903/plink.c
-@@ -98,7 +98,7 @@ const char ver_str[] =
- #ifdef NOLAPACK
- "NL"
- #endif
--#ifdef __LP64__
-+#ifdef __SSE2__
- " 64-bit"
- #else
- " 32-bit"
-@@ -1670,7 +1670,7 @@ int32_t plink(char* outname, char* outna
- wkspace_mark_postcluster = wkspace_base;
- ulii = (sample_ct * (sample_ct - 1)) >> 1;
- if (cluster_ptr->mds_dim_ct) {
--#ifndef __LP64__
-+#ifndef __SSE2__
- // catch 32-bit intptr_t overflow
- if (sample_ct > 23169) {
- goto plink_ret_NOMEM;
-@@ -1692,13 +1692,13 @@ int32_t plink(char* outname, char* outna
-
- if (cluster_ct) {
- ulii = cluster_ct + sample_ct - cluster_starts[cluster_ct];
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (ulii > 23169) {
- goto plink_ret_NOMEM;
- }
- #endif
- ulii = (ulii * (ulii - 1)) >> 1;
--#ifndef __LP64__
-+#ifndef __SSE2__
- } else if (sample_ct > 23169) {
- goto plink_ret_NOMEM;
- #endif
-@@ -3056,7 +3056,7 @@ int32_t init_delim_and_species(uint32_t
- break;
- case SPECIES_DOG:
- chrom_info_ptr->autosome_ct = 38;
--#ifdef __LP64__
-+#ifdef __SSE2__
- chrom_info_ptr->haploid_mask[0] = 0x18000000000LLU;
- #else
- chrom_info_ptr->haploid_mask[1] = 0x180;
-@@ -3064,7 +3064,7 @@ int32_t init_delim_and_species(uint32_t
- break;
- case SPECIES_HORSE:
- chrom_info_ptr->autosome_ct = 31;
--#ifdef __LP64__
-+#ifdef __SSE2__
- chrom_info_ptr->haploid_mask[0] = 0x300000000LLU;
- #else
- chrom_info_ptr->haploid_mask[1] = 3;
-@@ -8292,7 +8292,7 @@ int32_t main(int32_t argc, char** argv)
- sprintf(logbuf, "Error: Invalid --memory parameter '%s' (minimum %u).\n", argv[cur_arg + 1], WKSPACE_MIN_MB);
- goto main_ret_INVALID_CMDLINE_WWA;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (malloc_size_mb > 2047) {
- logerrprint("Error: --memory parameter too large for 32-bit version (max 2047).\n");
- goto main_ret_INVALID_CMDLINE;
-@@ -13200,7 +13200,7 @@ int32_t main(int32_t argc, char** argv)
- } else if (malloc_size_mb < WKSPACE_MIN_MB) {
- malloc_size_mb = WKSPACE_MIN_MB;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (malloc_size_mb > 2047) {
- malloc_size_mb = 2047;
- }
---- plink1.9-1.90~b3w-150903.orig/plink_assoc.c
-+++ plink1.9-1.90~b3w-150903/plink_assoc.c
-@@ -34,7 +34,7 @@ void single_marker_cc_freqs(uintptr_t sa
- uintptr_t loader2;
- uintptr_t loader3;
- uintptr_t loader4;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctl2 -= sample_ctl2 % 6;
-@@ -136,7 +136,7 @@ void single_marker_cc_3freqs(uintptr_t s
- uintptr_t loader;
- uintptr_t loader2;
- uintptr_t loader3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- sample_ctl2 -= sample_ctl2 % 12;
-@@ -786,7 +786,7 @@ void transpose_perms(uintptr_t* perm_vec
- // next 4 bytes: 32 40 48...
- uintptr_t sample_idx = 0;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t wbuf[4];
- uint32_t* wbptr;
- #else
-@@ -801,7 +801,7 @@ void transpose_perms(uintptr_t* perm_vec
- pvptr = &(perm_vecs[sample_idx / BITCT2]);
- rshift = 2 * (sample_idx % BITCT2);
- goto transpose_perms_loop_start;
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- if (!(perm_idx % 4)) {
- if (perm_idx % 128) {
-@@ -840,7 +840,7 @@ void transpose_perms(uintptr_t* perm_vec
- void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
- uintptr_t sample_idx = 0;
- uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t wbuf[4];
- uint32_t* wbptr;
- #else
-@@ -855,7 +855,7 @@ void transpose_perm1s(uintptr_t* perm_ve
- pvptr = &(perm_vecs[sample_idx / BITCT]);
- rshift = sample_idx % BITCT;
- goto transpose_perm1s_loop_start;
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- if (!(perm_idx % 4)) {
- if (perm_idx % 128) {
-@@ -919,7 +919,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- // is called.
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
- uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t perm_ct128x4 = perm_ct128 * 4;
- uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
-@@ -955,7 +955,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- uint32_t ujj;
- uint32_t ukk;
- uint32_t sample_type;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // 4- and 8-bit partial counts
- gitv[0] = (__m128i*)thread_wkspace;
- gitv[1] = &(((__m128i*)thread_wkspace)[perm_ct128x4]);
-@@ -992,7 +992,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- ujj = CTZLU(ulii) & (BITCT - 2); // get pos of next non-[hom A2] sample
- sample_type = ((ulii >> ujj) & 3) - 1;
- git_merge4 = gitv[sample_type];
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
- for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
- loader = *perm_ptr++;
-@@ -1067,7 +1067,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- #endif
- ulii &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[BITCT2 * perm_ct128]);
- #else
- permsv = &(permsv[BITCT2 * perm_ct32]);
-@@ -1075,7 +1075,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- }
- for (sample_type = 0; sample_type < 3; sample_type++) {
- uii = cur_cts[sample_type];
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (uii % 15) {
- git_merge4 = gitv[sample_type];
- git_merge8 = gitv[sample_type + 3];
-@@ -1127,7 +1127,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
-
- void calc_qgit(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -1160,7 +1160,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
- while (ulii) {
- ujj = CTZLU(ulii) & (BITCT - 2);
- sample_type = (ulii >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // note that the gain from using SSE2 for double-precision arithmetic is
- // typically minimal because modern cores tend to have two FPUs, so we
- // should only use it opportunistically. it's painless here, though.
-@@ -1220,7 +1220,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
- #endif
- ulii &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -1230,7 +1230,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
-
- void calc_qgit_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -1263,7 +1263,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
- while (ulii) {
- ujj = CTZLU(ulii) & (BITCT - 2);
- sample_type = (ulii >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_readv = &(permsv[ujj * row_mult]);
- if (sample_type == 1) {
- git_writev = (__m128d*)thread_bufs;
-@@ -1306,7 +1306,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
- #endif
- ulii &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -1314,7 +1314,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t rem_cost_60v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
-@@ -1522,7 +1522,7 @@ uintptr_t rem_cost(uintptr_t sample_ctv2
- uintptr_t detect_homcom;
- uintptr_t result_a;
- uintptr_t result_b;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctv2 -= sample_ctv2 % 6;
-@@ -1583,7 +1583,7 @@ uintptr_t qrem_cost2(uintptr_t sample_ct
- uintptr_t result_a;
- uintptr_t result_b;
- uintptr_t result_c;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 40;
- uintptr_t* lptr_4x_end;
- sample_ctl2 &= ~3LLU;
-@@ -1620,7 +1620,7 @@ uintptr_t qrem_cost2(uintptr_t sample_ct
- return cost;
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void calc_rem_merge4_one(uint32_t perm_ct128, __m128i* __restrict__ perm_ptr, __m128i* __restrict__ rem_merge4) {
- const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
- __m128i loader;
-@@ -1788,7 +1788,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- // low 8 bits give index of first remv[] array to increment; next 8 bits give
- // second index if nonzero, or indicate its absence
- const uint32_t idx_table[3][4] = {{0x300, 0x102, 4, 5}, {0x500, 2, 0x104, 3}, {0, 0x502, 0x304, 1}};
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t perm_ct128x4 = perm_ct128 * 4;
- uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
-@@ -1819,7 +1819,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- uint32_t uii;
- uint32_t ujj;
- uint32_t ukk;
--#ifdef __LP64__
-+#ifdef __SSE2__
- for (uii = 0; uii < 6; uii++) {
- remv[uii] = &(((__m128i*)thread_wkspace)[uii * perm_ct128x4]);
- }
-@@ -1860,7 +1860,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- idx1 = idx_table[cur_xor - 1][cur_raw];
- idx2 = idx1 >> 8;
- idx1 &= 255;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
- if (!idx2) {
- calc_rem_merge4_one(perm_ct128, perm_ptr, remv[idx1]);
-@@ -1917,7 +1917,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- #endif
- ulxor &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[BITCT2 * perm_ct128]);
- #else
- permsv = &(permsv[BITCT2 * perm_ct32]);
-@@ -1925,7 +1925,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- }
- for (idx1 = 0; idx1 < 6; idx1++) {
- uii = cur_cts[idx1];
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (uii % 15) {
- calc_rem_merge8(perm_ct32, remv[idx1], remv[idx1 + 6]);
- }
-@@ -1954,7 +1954,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- void calc_qrem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
- uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -1993,7 +1993,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
- ujj = CTZLU(ulxor) & (BITCT - 2);
- cur_xor = (ulxor >> ujj) & 3;
- cur_raw = (ulraw1 >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_readv = &(permsv[ujj * row_mult]);
- rem_writev = (__m128d*)outbufs;
- rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
-@@ -2208,7 +2208,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
- #endif
- ulxor &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -2219,7 +2219,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
- void calc_qrem_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
- uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -2264,7 +2264,7 @@ void calc_qrem_lin(uint32_t pheno_nm_ct,
- ujj = CTZLU(ulxor) & (BITCT - 2);
- cur_xor = (ulxor >> ujj) & 3;
- cur_raw = (ulraw1 >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_readv = &(permsv[ujj * row_mult]);
- if (cur_raw == 3) {
- if (cur_xor == 1) {
-@@ -2589,7 +2589,7 @@ void calc_qrem_lin(uint32_t pheno_nm_ct,
- #endif
- ulxor &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -3197,7 +3197,7 @@ THREAD_RET_TYPE assoc_maxt_thread(void*
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -3359,13 +3359,13 @@ THREAD_RET_TYPE assoc_maxt_thread(void*
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -3478,7 +3478,7 @@ THREAD_RET_TYPE assoc_set_thread(void* a
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -3556,13 +3556,13 @@ THREAD_RET_TYPE assoc_set_thread(void* a
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -4619,7 +4619,7 @@ THREAD_RET_TYPE model_maxt_domrec_thread
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -4760,13 +4760,13 @@ THREAD_RET_TYPE model_maxt_domrec_thread
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -4862,7 +4862,7 @@ THREAD_RET_TYPE model_set_domrec_thread(
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -4929,13 +4929,13 @@ THREAD_RET_TYPE model_set_domrec_thread(
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5104,7 +5104,7 @@ THREAD_RET_TYPE model_maxt_trend_thread(
- uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -5221,13 +5221,13 @@ THREAD_RET_TYPE model_maxt_trend_thread(
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5299,7 +5299,7 @@ THREAD_RET_TYPE model_set_trend_thread(v
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -5359,13 +5359,13 @@ THREAD_RET_TYPE model_set_trend_thread(v
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5542,7 +5542,7 @@ THREAD_RET_TYPE model_maxt_gen_thread(vo
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -5665,13 +5665,13 @@ THREAD_RET_TYPE model_maxt_gen_thread(vo
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5971,7 +5971,7 @@ THREAD_RET_TYPE model_maxt_best_thread(v
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -6112,13 +6112,13 @@ THREAD_RET_TYPE model_maxt_best_thread(v
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -6307,7 +6307,7 @@ THREAD_RET_TYPE model_set_best_thread(vo
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -6381,13 +6381,13 @@ THREAD_RET_TYPE model_set_best_thread(vo
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -6612,7 +6612,7 @@ int32_t model_assoc_set_test(pthread_t*
- join_threads(threads, assoc_thread_ct);
- g_assoc_thread_ct = max_thread_ct;
- g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = ((perm_vec_ct + 127) / 128) * 16;
- g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
- #else
-@@ -6622,7 +6622,7 @@ int32_t model_assoc_set_test(pthread_t*
- #endif
- g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
- transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
- #else
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
-@@ -7211,7 +7211,7 @@ int32_t model_assoc(pthread_t* threads,
- if (!g_ldrefs) {
- goto model_assoc_ret_NOMEM;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 3) / 4);
- #else
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 1) / 2);
-@@ -7408,7 +7408,7 @@ int32_t model_assoc(pthread_t* threads,
- ulii = (perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
- g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
- g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = ((perm_vec_ct + 127) / 128) * 16;
- g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
- #else
-@@ -7418,7 +7418,7 @@ int32_t model_assoc(pthread_t* threads,
- #endif
- g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
- transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
- #else
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
-@@ -9037,7 +9037,7 @@ int32_t qassoc(pthread_t* threads, FILE*
- if (!g_ldrefs) {
- goto qassoc_ret_NOMEM;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 3) / 4);
- #else
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 1) / 2);
-@@ -10433,7 +10433,7 @@ void calc_git_missing(uint32_t pheno_nm_
- // thread_wkspace[] is assumed to be zeroed out before this function is
- // called.
- uint32_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t perm_ct128x4 = perm_ct128 * 4;
-@@ -10466,7 +10466,7 @@ void calc_git_missing(uint32_t pheno_nm_
- uint32_t pbidx;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // 4- and 8-bit partial counts
- gitv[0] = &(((__m128i*)thread_wkspace)[8 * perm_ct128x4]);
- gitv[1] = &(((__m128i*)thread_wkspace)[9 * perm_ct128x4]);
-@@ -10488,7 +10488,7 @@ void calc_git_missing(uint32_t pheno_nm_
- while (ulii) {
- ujj = CTZLU(ulii);
- git_merge4 = gitv[0];
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_ptr = &(permsv[ujj * perm_ct128]);
- for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
- loader = *perm_ptr++;
-@@ -10561,13 +10561,13 @@ void calc_git_missing(uint32_t pheno_nm_
- #endif
- ulii &= ulii - 1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[BITCT * perm_ct128]);
- #else
- permsv = &(permsv[BITCT * perm_ct32]);
- #endif
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cur_ct % 15) {
- git_merge4 = gitv[0];
- git_merge8 = gitv[1];
-@@ -11325,7 +11325,7 @@ int32_t testmiss(pthread_t* threads, FIL
- if (perm_maxt) {
- ulii = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
- g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = ((g_perm_vec_ct + 127) / 128) * 16;
- g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
- #else
-@@ -11335,7 +11335,7 @@ int32_t testmiss(pthread_t* threads, FIL
- #endif
- g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 44 * max_thread_ct);
- transpose_perm1s(g_perm_vecs, g_perm_vec_ct, pheno_nm_ct, g_perm_vecst);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, (ulii / 2) * 11 * max_thread_ct);
- #else
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 11 * max_thread_ct);
-@@ -12552,7 +12552,7 @@ int32_t cmh2_assoc(FILE* bedfile, uintpt
- logerrprint("Error: --mh2 requires at least two cases and two controls.\n");
- goto cmh2_assoc_ret_INVALID_CMDLINE;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cluster_ct1 > 46341) {
- // might actually be ok, but play it safe in case LAPACK matrix inversion
- // routine has an integer overflow here
---- plink1.9-1.90~b3w-150903.orig/plink_calc.c
-+++ plink1.9-1.90~b3w-150903/plink_calc.c
-@@ -132,7 +132,7 @@ void update_rel_ibc(double* rel_ibc, uin
- double* weights2 = &(weights[128]);
- double* weights3 = &(weights[256]);
- double* weights4 = &(weights[320]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- double* weights5 = &(weights[384]);
- double* weights6 = &(weights[448]);
- double* weights7 = &(weights[512]);
-@@ -207,7 +207,7 @@ void update_rel_ibc(double* rel_ibc, uin
- }
- for (ukk = 0; ukk < (BITCT * 5) / 32; ukk++) {
- wtptr = &(wtarr[16 * ukk]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((ukk == 2) || (ukk == 7)) {
- for (uii = 0; uii < 8; uii++) {
- twt = wtptr[uii + 8];
-@@ -245,7 +245,7 @@ void update_rel_ibc(double* rel_ibc, uin
- }
- for (umm = 0; umm < sample_ct; umm++) {
- ulii = *geno++;
--#ifdef __LP64__
-+#ifdef __SSE2__
- *rel_ibc += weights9[ulii >> 57] + weights8[(ulii >> 51) & 63] + weights7[(ulii >> 44) & 127] + weights6[(ulii >> 38) & 63] + weights5[(ulii >> 32) & 63] + weights4[(ulii >> 25) & 63] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
- #else
- *rel_ibc += weights4[ulii >> 25] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
-@@ -263,7 +263,7 @@ void fill_subset_weights(double* subset_
- uint32_t uoo;
- double wtarr[MULTIPLEX_DIST_EXP / 2];
- double* wt;
--#ifdef __LP64__
-+#ifdef __SSE2__
- double twt[5];
- double twtf;
- __m128d* swpairs = (__m128d*)subset_weights;
-@@ -278,7 +278,7 @@ void fill_subset_weights(double* subset_
- memcpy(wtarr, main_weights, (MULTIPLEX_DIST_EXP / 2) * sizeof(double));
- for (uoo = 0; uoo < 2; uoo++) {
- wt = &(wtarr[7 * uoo]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- vfinal1 = _mm_set_pd(wt[0], 0.0);
- vfinal2 = _mm_set_pd(wt[0] * 2, wt[0]);
- #endif
-@@ -309,7 +309,7 @@ void fill_subset_weights(double* subset_
- if (unn & 1) {
- twt[4] += wt[2];
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- twtf = twt[4];
- vpen = _mm_set1_pd(twtf);
- *swpairs++ = _mm_add_pd(vpen, vfinal1);
-@@ -346,7 +346,7 @@ void fill_subset_weights(double* subset_
- }
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- for (uoo = 0; uoo < 3; uoo++) {
- wt = &(wtarr[14 + 6 * uoo]);
- vfinal1 = _mm_set_pd(wt[0], 0.0);
-@@ -415,7 +415,7 @@ void fill_subset_weights_r(double* subse
- double mean_m2;
- double mult = 1.0;
- double aux;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128d* swpairs = (__m128d*)subset_weights;
- __m128d vpen;
- __m128d vfinal1;
-@@ -492,7 +492,7 @@ void fill_subset_weights_r(double* subse
- }
- for (unn = 0; unn < BITCT / 16; unn++) {
- wtptr = &(wtarr[40 * unn]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- vfinal1 = _mm_load_pd(wtptr);
- vfinal2 = _mm_load_pd(&(wtptr[2]));
- vfinal3 = _mm_load_pd(&(wtptr[4]));
-@@ -506,7 +506,7 @@ void fill_subset_weights_r(double* subse
- twt3 = twt2 + wtptr[ukk + 16];
- for (umm = 0; umm < 8; umm++) {
- twt4 = twt3 + wtptr[umm + 8];
--#ifdef __LP64__
-+#ifdef __SSE2__
- vpen = _mm_set1_pd(twt4);
- *swpairs++ = _mm_add_pd(vpen, vfinal1);
- *swpairs++ = _mm_add_pd(vpen, vfinal2);
-@@ -572,7 +572,7 @@ static inline void collapse_copy_phenod_
- } while (target < target_end);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // XOR + mask variants of vectorized Lauradoux/Walisch popcount. (See
- // popcount_vecs() in plink_common.c for basic documentation.)
- // Note that the size of the popcounted buffer is a hardcoded constant
-@@ -852,7 +852,7 @@ void ibs_test_process_perms(uintptr_t* p
- do {
- sub_block_idx = 0;
- ulii = *perm_row_start++;
--#ifdef __LP64__
-+#ifdef __SSE2__
- dxx = psbuf[(uint8_t)ulii] + psbuf[256 + ((uint8_t)(ulii >> 8))] + psbuf[512 + ((uint8_t)(ulii >> 16))] + psbuf[768 + ((uint8_t)(ulii >> 24))] + psbuf[1024 + ((uint8_t)(ulii >> 32))] + psbuf[1280 + ((uint8_t)(ulii >> 40))] + psbuf[1536 + ((uint8_t)(ulii >> 48))] + psbuf[1792 + (ulii >> 56)];
- #else
- dxx = psbuf[(uint8_t)ulii] + psbuf[256 + ((uint8_t)(ulii >> 8))] + psbuf[512 + ((uint8_t)(ulii >> 16))] + psbuf[768 + (ulii >> 24)];
-@@ -969,7 +969,7 @@ THREAD_RET_TYPE ibs_test_thread(void* ar
- }
-
- void incr_dists_i(uint32_t* idists, uintptr_t* geno, uintptr_t* masks, uint32_t start_idx, uint32_t end_idx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* glptr;
- __m128i* glptr2;
- __m128i* mptr;
-@@ -986,7 +986,7 @@ void incr_dists_i(uint32_t* idists, uint
- uintptr_t mask_fixed;
- for (uii = start_idx; uii < end_idx; uii++) {
- jj = uii * (MULTIPLEX_2DIST / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- glptr = (__m128i*)geno;
- glptr2 = (__m128i*)(&(geno[jj]));
- lptr = &(masks[jj]);
-@@ -1104,7 +1104,7 @@ THREAD_RET_TYPE calc_ibs_thread(void* ar
- }
-
- void incr_genome(uint32_t* genome_main, uintptr_t* geno, uintptr_t* masks, uintptr_t sample_ct, uint32_t start_idx, uint32_t end_idx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1161,14 +1161,14 @@ void incr_genome(uint32_t* genome_main,
- uintptr_t* marker_window_ptr;
- int32_t lowct2 = g_ctrl_ct * 2;
- int32_t highct2 = g_case_ct * 2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- glptr_end = (__m128i*)(&(geno[sample_ct * (GENOME_MULTIPLEX2 / BITCT)]));
- #else
- glptr_end = &(geno[sample_ct * (GENOME_MULTIPLEX2 / BITCT)]);
- #endif
- for (uii = start_idx; uii < end_idx; uii++) {
- ujj = uii * (GENOME_MULTIPLEX2 / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- glptr_fixed = (__m128i*)(&(geno[ujj]));
- glptr = (__m128i*)(&(geno[ujj + (GENOME_MULTIPLEX2 / BITCT)]));
- lptr = &(masks[ujj]);
-@@ -1194,7 +1194,7 @@ void incr_genome(uint32_t* genome_main,
- glptr_back = (uintptr_t*)glptr;
- glptr_fixed_tmp = glptr_fixed;
- maskptr_fixed_tmp = maskptr_fixed;
--#ifdef __LP64__
-+#ifdef __SSE2__
- acc_ibs1.vi = _mm_setzero_si128();
- acc_ibs0.vi = _mm_setzero_si128();
- do {
-@@ -1356,7 +1356,7 @@ void incr_genome(uint32_t* genome_main,
- xor_ptr = xor_buf;
- glptr_back = (uintptr_t*)glptr;
- glptr_fixed_tmp = glptr_fixed;
--#ifdef __LP64__
-+#ifdef __SSE2__
- acc_ibs1.vi = _mm_setzero_si128();
- acc_ibs0.vi = _mm_setzero_si128();
- do {
-@@ -1549,7 +1549,7 @@ void incr_dists(double* dists, uintptr_t
- uintptr_t uljj;
- uintptr_t* mptr;
- double* weights1 = &(weights[16384]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- double* weights2 = &(weights[32768]);
- double* weights3 = &(weights[36864]);
- double* weights4 = &(weights[40960]);
-@@ -1561,7 +1561,7 @@ void incr_dists(double* dists, uintptr_t
- ulii = geno[uii];
- mptr = masks;
- mask_fixed = masks[uii];
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (mask_fixed == ~ZEROLU) {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = (*glptr++ ^ ulii) & (*mptr++);
-@@ -1628,7 +1628,7 @@ void incr_dists_r(double* dists, uintptr
- uintptr_t uljj;
- uintptr_t basemask;
- double* weights1 = &(weights[32768]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- double* weights2 = &(weights[65536]);
- double* weights3 = &(weights[98304]);
- #endif
-@@ -1642,7 +1642,7 @@ void incr_dists_r(double* dists, uintptr
- if (!basemask) {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = ((*glptr++) + ulii) | (*maskptr++);
--#ifdef __LP64__
-+#ifdef __SSE2__
- *dists += weights[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
- #else
- *dists += weights[(uint16_t)uljj] + weights1[uljj >> 16];
-@@ -1652,7 +1652,7 @@ void incr_dists_r(double* dists, uintptr
- } else {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = ((*glptr++) + ulii) | ((*maskptr++) | basemask);
--#ifdef __LP64__
-+#ifdef __SSE2__
- *dists += weights[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
- #else
- *dists += weights[(uint16_t)uljj] + weights1[uljj >> 16];
-@@ -1803,7 +1803,7 @@ void pick_d(unsigned char* cbuf, uint32_
- uint32_t ujj;
- uint32_t ukk;
- memset(cbuf, 0, ct);
--#ifdef __LP64__
-+#ifdef __SSE2__
- ukk = (uint32_t)(0x100000000LLU % ct);
- #else
- ukk = 2 * (0x80000000U % ct);
-@@ -2116,14 +2116,14 @@ void matrix_const_mult_add(uint32_t samp
- uint32_t loop_end = sample_ct - 1;
- uint32_t ujj;
- double* dptr = matrix;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128d* vptr;
- __m128d v_mult_val = _mm_set1_pd(mult_val);
- #endif
- for (uii = 0; uii < loop_end; uii++) {
- *dptr = (*dptr) * mult_val + add_val;
- dptr++;
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((uintptr_t)dptr & 8) {
- *dptr *= mult_val;
- dptr++;
-@@ -2727,7 +2727,7 @@ int32_t ibs_test_calc(pthread_t* threads
- double perm_ct_recip;
- uintptr_t ulii;
- uintptr_t uljj = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128d* rvptr1;
- __m128d* rvptr2;
- #else
-@@ -2802,7 +2802,7 @@ int32_t ibs_test_calc(pthread_t* threads
- ctrl_ctrl_ssq += g_calc_result[tidx][1];
- ctrl_case_ssq += g_calc_result[tidx][2];
- case_case_ssq += g_calc_result[tidx][3];
--#ifdef __LP64__
-+#ifdef __SSE2__
- rvptr1 = (__m128d*)perm_results;
- rvptr2 = (__m128d*)(&(perm_results[2 * perm_ctcldm * tidx]));
- for (perm_idx = 0; perm_idx < perm_ct; perm_idx++) {
-@@ -4542,7 +4542,7 @@ int32_t distance_d_write(FILE** outfile_
- } else {
- if (shape == DISTANCE_SQ0) {
- // assume little-endian
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = 0x3009300930093009LLU;
- #else
- ulii = 0x30093009;
-@@ -5923,7 +5923,7 @@ int32_t rel_cutoff_batch(uint32_t load_g
- fclose_null(&idfile);
- ullii = sample_ct;
- ullii = ((ullii * (ullii - 1)) / 2 + BITCT - 1) / BITCT;
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (ullii >= 0x20000000) {
- goto rel_cutoff_batch_ret_NOMEM;
- }
-@@ -7353,7 +7353,7 @@ int32_t calc_rel(pthread_t* threads, uin
- // cptr2[uii] = '\t';
- // cptr2[uii + 1] = '0';
- // }
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = 0x3009300930093009LLU;
- #else
- ulii = 0x30093009LU;
-@@ -8077,7 +8077,7 @@ int32_t calc_distance(pthread_t* threads
- llxx = g_thread_start[dist_thread_ct];
- llxx = ((llxx * (llxx - 1)) - (int64_t)g_thread_start[0] * (g_thread_start[0] - 1)) / 2;
- dists_alloc = llxx * sizeof(double);
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (dists_alloc > 0x7fffffff) {
- goto calc_distance_ret_NOMEM;
- }
-@@ -8236,7 +8236,7 @@ int32_t calc_distance(pthread_t* threads
- goto calc_distance_ret_NOMEM;
- }
- if (main_weights) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (wkspace_alloc_d_checked(&subset_weights, 45056 * sizeof(double))) {
- goto calc_distance_ret_NOMEM;
- }
-@@ -8424,7 +8424,7 @@ int32_t calc_distance(pthread_t* threads
- *giptr3 += wtbuf[umm + ukk];
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii ^= FIVEMASK;
- *glptr++ = ulii;
- ulii = (ulii | (ulii >> 1)) & FIVEMASK;
-@@ -8725,7 +8725,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- // as a special case in the future.
- FILE* outfile = NULL;
- uint32_t* cluster_sorted_ibs_indices = NULL;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // uint64_t* cluster_sorted_ibs_indices_big = NULL;
- #endif
- uint32_t* sample_to_cluster = NULL;
-@@ -9457,7 +9457,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- logprint("Clustering...");
- printf(" [sorting IB%c values]", cluster_missing? 'M' : 'S');
- fflush(stdout);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cur_cluster_ct <= 65536) {
- #endif
- // Objective: Produce a list of inter-cluster IBS values sorted in
-@@ -9488,7 +9488,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- // f(0) = 1
- // f(1) = f(2) = 2
- // f(3) = f(4) = f(5) = 3... (triangle_divide() with different rounding)
--#ifdef __LP64__
-+#ifdef __SSE2__
- umm = (int32_t)sqrt((intptr_t)(tcoord * 2));
- #else
- umm = (int32_t)sqrt(2 * ((double)((intptr_t)tcoord)));
-@@ -9601,7 +9601,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- cluster_index[tri_coord_no_diag_32(ukk & 65535, ukk >> 16)] = uii + 1;
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- } else {
- logerrprint("Error: --cluster cannot handle >65536 initial clusters yet.\n");
- retval = RET_CALC_NOT_YET_SUPPORTED;
---- plink1.9-1.90~b3w-150903.orig/plink_cnv.c
-+++ plink1.9-1.90~b3w-150903/plink_cnv.c
-@@ -13,7 +13,7 @@ int32_t cnv_subset_load(char* subset_fna
- logerrprint("Error: Empty --cnv-subset file.\n");
- goto cnv_subset_load_ret_INVALID_FORMAT;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (((uint64_t)subset_ct) * max_subset_name_len > 0x7fffffffLLU) {
- goto cnv_subset_load_ret_NOMEM;
- }
-@@ -625,7 +625,7 @@ int32_t cnv_make_map(FILE* cnvfile, char
- }
- for (ulii = 1; ulii < raw_marker_ct; ulii++) {
- if (marker_pos_arr[ulii] != llii) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((++distinct_marker_ct) == 0x80000000U) {
- logprint("\n");
- logerrprint("Error: Too many distinct .cnv.map positions (max 2^31 - 1).\n");
-@@ -983,7 +983,7 @@ int32_t plink_cnv(char* outname, char* o
- }
- }
- ulii = marker_chrom_start[chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct];
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (((uint64_t)ulii) * max_marker_id_len > 0x7fffffffLLU) {
- goto plink_cnv_ret_NOMEM;
- }
---- plink1.9-1.90~b3w-150903.orig/plink_common.c
-+++ plink1.9-1.90~b3w-150903/plink_common.c
-@@ -30,7 +30,7 @@ uintptr_t g_sample_ct;
- uint32_t g_thread_ct;
-
- uint32_t aligned_malloc(uintptr_t** aligned_pp, uintptr_t size) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- // Avoid random segfaults on 64-bit machines which have 8-byte- instead of
- // 16-byte-aligned malloc(). (Slightly different code is needed if malloc()
- // does not even guarantee 8-byte alignment.)
-@@ -51,7 +51,7 @@ uint32_t aligned_malloc(uintptr_t** alig
- }
-
- void aligned_free(uintptr_t* aligned_pp) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- free((uintptr_t*)(aligned_pp[-1]));
- #else
- free(aligned_pp);
-@@ -3349,7 +3349,7 @@ uint32_t next_unset_unsafe(uintptr_t* bi
- return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(~ulii);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
-@@ -3383,7 +3383,7 @@ uint32_t next_unset(uintptr_t* bit_arr,
- return MINV(loc, ceil);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
-@@ -3416,7 +3416,7 @@ uint32_t next_set_unsafe(uintptr_t* bit_
- return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(ulii);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
-@@ -3450,7 +3450,7 @@ uint32_t next_set(uintptr_t* bit_arr, ui
- return MINV(rval, ceil);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
-@@ -3718,7 +3718,7 @@ int32_t populate_id_htable(uintptr_t unf
- }
- }
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (wkspace_left >= 0x400000000LLU) {
- max_extra_alloc = 0xfffffffeU;
- } else {
-@@ -3888,7 +3888,7 @@ void fill_vec_55(uintptr_t* vec, uint32_
- uint32_t ctl = 2 * ((ct + (BITCT - 1)) / BITCT);
- uint32_t rem = ct & (BITCT - 1);
- uintptr_t* second_to_last = &(vec[ctl - 2]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vecp = (__m128i*)vec;
- __m128i* vec_end = (__m128i*)(&(vec[ctl]));
-@@ -4009,7 +4009,7 @@ void sample_delim_convert(uintptr_t unfi
- void get_set_wrange_align(uintptr_t* bitfield, uintptr_t word_ct, uintptr_t* firstw_ptr, uintptr_t* wlen_ptr) {
- uintptr_t* bitfield_ptr = bitfield;
- uintptr_t* bitfield_end = &(bitfield[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t* bitfield_end2 = &(bitfield[word_ct & (~ONELU)]);
- while (bitfield_ptr < bitfield_end2) {
- if (bitfield_ptr[0] || bitfield_ptr[1]) {
-@@ -5223,7 +5223,7 @@ void bitfield_exclude_to_include(uintptr
- void bitfield_and(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
- // vv := vv AND include_vec
- // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* iv128 = (__m128i*)include_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5247,7 +5247,7 @@ void bitfield_andnot(uintptr_t* vv, uint
- // vv := vv ANDNOT exclude_vec
- // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
- // note that this is the reverse of the _mm_andnot() operand order
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* ev128 = (__m128i*)exclude_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5270,7 +5270,7 @@ void bitfield_andnot(uintptr_t* vv, uint
- void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
- // vv := (~vv) AND include_vec
- // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* iv128 = (__m128i*)include_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5294,7 +5294,7 @@ void bitfield_andnot_reversed_args(uintp
- void bitfield_or(uintptr_t* vv, uintptr_t* or_vec, uintptr_t word_ct) {
- // vv := vv OR include_vec
- // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* ov128 = (__m128i*)or_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5317,7 +5317,7 @@ void bitfield_or(uintptr_t* vv, uintptr_
- void bitfield_ornot(uintptr_t* vv, uintptr_t* inverted_or_vec, uintptr_t word_ct) {
- // vv := vv OR (~inverted_or_vec)
- // on 64-bit systems, assumes vv and inverted_or_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- #ifdef __APPLE__
- const __m128i all1 = {0xffffffffffffffffLLU, 0xffffffffffffffffLLU};
- #else
-@@ -5345,7 +5345,7 @@ void bitfield_ornot(uintptr_t* vv, uintp
- void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct) {
- // bit_arr := bit_arr XOR xor_arr
- // on 64-bit systems, assumes bit_arr and xor_arr are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* bitv128 = (__m128i*)bit_arr;
- __m128i* xorv128 = (__m128i*)xor_arr;
- __m128i* bitv128_end = &(bitv128[word_ct / 2]);
-@@ -5539,7 +5539,7 @@ uint32_t has_three_genotypes(uintptr_t*
- }
- */
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // Basic SSE2 implementation of Lauradoux/Walisch popcount.
- static inline uintptr_t popcount_vecs(__m128i* vptr, uintptr_t ct) {
- // popcounts vptr[0..(ct-1)]. Assumes ct is a multiple of 3 (0 ok).
-@@ -5730,7 +5730,7 @@ uintptr_t popcount_longs(uintptr_t* lptr
- // index.
- uintptr_t tot = 0;
- uintptr_t* lptr_end = &(lptr[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t six_ct;
- __m128i* vptr;
- vptr = (__m128i*)lptr;
-@@ -5788,7 +5788,7 @@ uintptr_t popcount2_longs(uintptr_t* lpt
- // treats lptr[] as an array of two-bit instead of one-bit numbers
- uintptr_t tot = 0;
- uintptr_t* lptr_end = &(lptr[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t twelve_ct;
- __m128i* vptr;
- vptr = (__m128i*)lptr;
-@@ -6035,7 +6035,7 @@ uintptr_t jump_forward_unset_unsafe(uint
- uintptr_t* bptr = &(bit_arr[widx]);
- uintptr_t uljj;
- uintptr_t ulkk;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vptr;
- #endif
- if (ulii) {
-@@ -6055,7 +6055,7 @@ uintptr_t jump_forward_unset_unsafe(uint
- bptr++;
- }
- ulii = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (widx & 1) {
- uljj = ~(*bptr);
- ulkk = popcount_long(uljj);
-@@ -6101,7 +6101,7 @@ uintptr_t popcount_longs_exclude(uintptr
- // N.B. on 64-bit systems, assumes lptr and exclude_arr are 16-byte aligned.
- uintptr_t tot = 0;
- uintptr_t* lptr_end = &(lptr[end_idx]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t six_ct = end_idx / 6;
- tot += popcount_vecs_exclude((__m128i*)lptr, (__m128i*)exclude_arr, six_ct * 3);
- lptr = &(lptr[six_ct * 6]);
-@@ -6151,7 +6151,7 @@ uintptr_t popcount_longs_exclude(uintptr
- uintptr_t popcount_longs_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t word_ct) {
- uintptr_t tot = 0;
- uintptr_t* lptr1_end = &(lptr1[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t six_ct = word_ct / 6;
- tot += popcount_vecs_intersect((__m128i*)lptr1, (__m128i*)lptr2, six_ct * 3);
- lptr1 = &(lptr1[six_ct * 6]);
-@@ -6213,7 +6213,7 @@ void vertical_bitct_subtract(uintptr_t*
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp) {
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -6594,7 +6594,7 @@ void count_3freq_12(uintptr_t* lptr, uin
- }
- #endif
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void count_set_freq_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -7310,7 +7310,7 @@ void vec_set_freq(uintptr_t sample_ctl2,
- uintptr_t missing_incr;
- uint32_t acc = 0;
- uint32_t accm = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctl2 -= sample_ctl2 % 6;
-@@ -7356,7 +7356,7 @@ void vec_set_freq_x(uintptr_t sample_ctl
- uintptr_t missing_incr;
- uint32_t acc = 0;
- uint32_t accm = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctl2 -= sample_ctl2 % 6;
-@@ -7408,7 +7408,7 @@ void vec_set_freq_y(uintptr_t sample_ctl
- uintptr_t loader4;
- uint32_t acc = 0;
- uint32_t accm = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- sample_ctl2 -= sample_ctl2 % 12;
-@@ -7455,7 +7455,7 @@ void vec_3freq(uintptr_t sample_ctl2, ui
- uint32_t acc_even = 0;
- uint32_t acc_odd = 0;
- uint32_t acc_and = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- sample_ctl2 -= sample_ctl2 % 12;
-@@ -7497,7 +7497,7 @@ uintptr_t count_01(uintptr_t* lptr, uint
- // unlike popcount01_longs, this does not assume lptr[] has no 11s
- uintptr_t* lptr_end = &(lptr[word_ct]);
- uintptr_t loader;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t acc;
- word_ct -= word_ct % 12;
- acc = count_01_vecs((__m128i*)lptr, word_ct / 2);
-@@ -7818,7 +7818,7 @@ void reverse_loadbuf(unsigned char* load
- uint32_t* loadbuf_alias32;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* loadbuf_alias;
- __m128i vii;
-@@ -8044,7 +8044,7 @@ void vec_include_init(uintptr_t unfilter
- ulmm = FIVEMASK;
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8092,7 +8092,7 @@ void exclude_to_vec_include(uintptr_t un
- ulmm = FIVEMASK;
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8133,7 +8133,7 @@ void vec_init_invert(uintptr_t entry_ct,
- uint32_t vec_wsize = 2 * ((entry_ct + (BITCT - 1)) / BITCT);
- uintptr_t* second_to_last = &(target_arr[vec_wsize - 2]);
- uint32_t rem = entry_ct & (BITCT - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* tptr = (__m128i*)target_arr;
- __m128i* sptr = (__m128i*)source_arr;
-@@ -8158,7 +8158,7 @@ void vec_init_invert(uintptr_t entry_ct,
- void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr) {
- // target_arr := source_arr ANDNOT exclude_arr
- // may write an extra word
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* tptr = (__m128i*)target_arr;
- __m128i* sptr = (__m128i*)source_arr;
- __m128i* xptr = (__m128i*)exclude_arr;
-@@ -8187,7 +8187,7 @@ void vec_include_mask_in(uintptr_t unfil
- ulmm = include_arr[1];
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8225,7 +8225,7 @@ void vec_include_mask_out(uintptr_t unfi
- ulmm = include_arr[1];
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8263,7 +8263,7 @@ void vec_include_mask_out_intersect(uint
- ulmm = include_arr[1];
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8290,7 +8290,7 @@ void vec_include_mask_out_intersect(uint
-
- void vec_init_01(uintptr_t unfiltered_sample_ct, uintptr_t* data_ptr, uintptr_t* result_ptr) {
- // initializes result_ptr bits 01 iff data_ptr bits are 01
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vec2_read = (__m128i*)data_ptr;
- __m128i* read_end = &(vec2_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
-@@ -8313,7 +8313,7 @@ void vec_init_01(uintptr_t unfiltered_sa
- void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2) {
- uintptr_t* vec2_last = &(vec2[unfiltered_sample_ct / BITCT2]);
- uint32_t remainder = unfiltered_sample_ct & (BITCT2 - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vec2_128 = (__m128i*)vec2;
- __m128i* vec2_last128 = &(vec2_128[unfiltered_sample_ct / BITCT]);
-@@ -8342,7 +8342,7 @@ void vec_datamask(uintptr_t unfiltered_s
- // sets result_vec bits to 01 iff data_ptr bits are equal to matchval and
- // vec_ptr bit is set, 00 otherwise.
- // currently assumes matchval is not 1.
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* data_read = (__m128i*)data_ptr;
- __m128i* mask_read = (__m128i*)mask_ptr;
- __m128i* data_read_end = &(data_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
-@@ -8354,7 +8354,7 @@ void vec_datamask(uintptr_t unfiltered_s
- #endif
- if (matchval) {
- if (matchval == 2) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- loader = *data_read++;
- *writer++ = _mm_and_si128(_mm_andnot_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
-@@ -8366,7 +8366,7 @@ void vec_datamask(uintptr_t unfiltered_s
- } while (data_ptr < data_read_end);
- #endif
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- loader = *data_read++;
- *writer++ = _mm_and_si128(_mm_and_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
-@@ -8379,7 +8379,7 @@ void vec_datamask(uintptr_t unfiltered_s
- #endif
- }
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- loader = *data_read++;
- *writer++ = _mm_andnot_si128(_mm_or_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
-@@ -8395,7 +8395,7 @@ void vec_datamask(uintptr_t unfiltered_s
-
- /*
- void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vptr = (__m128i*)lptr;
- __m128i* vend = (__m128i*)(&(lptr[word_ct]));
-@@ -8512,7 +8512,7 @@ void hh_reset(unsigned char* loadbuf, ui
- uint32_t* loadbuf_alias32;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t* sample_include2_alias32;
- __m128i* loadbuf_alias;
- __m128i* iivp;
-@@ -8576,7 +8576,7 @@ void hh_reset_y(unsigned char* loadbuf,
- uint32_t uii;
- uint32_t ujj;
- uint32_t ukk;
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- uint32_t* sample_include2_alias32;
- uint32_t* sample_male_include2_alias32;
-@@ -8730,7 +8730,7 @@ void force_missing(unsigned char* loadbu
- uint32_t* loadbuf_alias32;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t* force_missing_include2_alias32;
- __m128i* loadbuf_alias;
- __m128i* fmivp;
---- plink1.9-1.90~b3w-150903.orig/plink_common.h
-+++ plink1.9-1.90~b3w-150903/plink_common.h
-@@ -63,13 +63,13 @@
- #endif
-
- #ifdef _WIN64
-- #define __LP64__
-+ #define __SSE2__
- #define CTZLU __builtin_ctzll
- #define CLZLU __builtin_clzll
- #else
- #define CTZLU __builtin_ctzl
- #define CLZLU __builtin_clzl
-- #ifndef __LP64__
-+ #ifndef __SSE2__
- #ifndef uintptr_t
- #define uintptr_t unsigned long
- #endif
-@@ -83,7 +83,7 @@
- #include <algorithm>
- #endif
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #include <emmintrin.h>
- #define FIVEMASK 0x5555555555555555LLU
- typedef union {
-@@ -120,7 +120,7 @@
-
- #endif // Win64
-
--#else // not __LP64__
-+#else // not __SSE2__
-
- #define FIVEMASK 0x55555555
- #define ZEROLU 0LU
-@@ -133,7 +133,7 @@
- #endif
- #define PRIxPTR2 "08lx"
-
--#endif // __LP64__
-+#endif // __SSE2__
-
- #include <zlib.h>
- #include "SFMT.h"
-@@ -601,7 +601,7 @@
- #define MAX_THREADS_P1 513
- #endif
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define BITCT 64
- #else
- #define BITCT 32
-@@ -647,7 +647,7 @@
- #define JACKKNIFE_VALS_DIST 5
- #define JACKKNIFE_VALS_GROUPDIST 3
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define AAAAMASK 0xaaaaaaaaaaaaaaaaLLU
- // number of snp-major .bed lines to read at once for distance calc if
- // exponent is nonzero.
-@@ -679,7 +679,7 @@
- #define HASHSIZE 524287
- #define HASHSIZE_S 524287
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define HASHMEM 4194304
- #define HASHMEM_S 4194304
- #else
-@@ -779,7 +779,7 @@ typedef union {
-
- typedef union {
- double dd;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t uu[1];
- #else
- uintptr_t uu[2];
-@@ -1031,7 +1031,7 @@ static inline char* skip_initial_spaces(
- /*
- static inline int32_t is_space_or_eoln(unsigned char cc) {
- // ' ', \t, \n, \0, \r
--#ifdef __LP64__
-+#ifdef __SSE2__
- return (ucc <= 32) && (0x100002601LLU & (1LLU << ucc));
- #else
- return ((ucc <= 32) && ((ucc == ' ') || (0x2601LU & (ONELU << ucc))));
-@@ -1526,7 +1526,7 @@ static inline void next_unset_unsafe_ck(
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
- #else
- static inline uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-@@ -1548,7 +1548,7 @@ static inline void next_unset_ck(uintptr
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
- #else
- static inline uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-@@ -1570,7 +1570,7 @@ static inline void next_set_unsafe_ck(ui
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
- #else
- static inline uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-@@ -1592,7 +1592,7 @@ static inline void next_set_ck(uintptr_t
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
- #else
- static inline uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-@@ -1639,7 +1639,7 @@ static inline void fill_ulong_zero(uintp
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
- fill_ulong_zero((uintptr_t*)ullarr, size);
- }
-@@ -1663,7 +1663,7 @@ static inline void fill_ulong_one(uintpt
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void fill_ull_one(uint64_t* ullarr, size_t size) {
- fill_ulong_one((uintptr_t*)ullarr, size);
- }
-@@ -1812,7 +1812,7 @@ void get_set_wrange_align(uintptr_t* bit
- #define CHROM_XY (MAX_POSSIBLE_CHROM + 2)
- #define CHROM_MT (MAX_POSSIBLE_CHROM + 3)
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // dog requires 42 bits, and other species require less
- #define CHROM_MASK_INITIAL_WORDS 1
- #else
-@@ -2046,7 +2046,7 @@ void bitfield_ornot(uintptr_t* vv, uintp
- void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct);
-
- static inline uint32_t popcount2_long(uintptr_t val) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- val = (val & 0x3333333333333333LLU) + ((val >> 2) & 0x3333333333333333LLU);
- return (((val + (val >> 4)) & 0x0f0f0f0f0f0f0f0fLLU) * 0x0101010101010101LLU) >> 56;
- #else
-@@ -2071,7 +2071,7 @@ uint32_t less_than_two_genotypes(uintptr
-
- uintptr_t popcount_longs(uintptr_t* lptr, uintptr_t word_ct);
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline uintptr_t popcount_longs_nzbase(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
- uintptr_t prefix_ct = 0;
- if (start_idx & 1) {
-@@ -2112,7 +2112,7 @@ uintptr_t popcount_longs_intersect(uintp
-
- void vertical_bitct_subtract(uintptr_t* bit_arr, uint32_t item_ct, uint32_t* sum_arr);
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp);
-
- void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* ctap, uint32_t* ctbp, uint32_t* ctcp);
---- plink1.9-1.90~b3w-150903.orig/plink_data.c
-+++ plink1.9-1.90~b3w-150903/plink_data.c
-@@ -2222,7 +2222,7 @@ int32_t zero_cluster_init(char* zerofnam
- if (!marker_bitfield_tmp) {
- goto zero_cluster_init_ret_NOMEM;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
- #else
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
-@@ -3203,7 +3203,7 @@ int32_t make_bed_me_missing_one_marker(F
- }
-
- void zeropatch(uintptr_t sample_ctv2, uintptr_t cluster_ct, uintptr_t* cluster_zc_masks, uint32_t** zcdefs, uintptr_t* patchbuf, uintptr_t marker_idx, uintptr_t* writebuf) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* writevec = (__m128i*)writebuf;
- __m128i* patchvec = (__m128i*)patchbuf;
- __m128i* patchvec_end = (__m128i*)(&(patchbuf[sample_ctv2]));
-@@ -3227,7 +3227,7 @@ void zeropatch(uintptr_t sample_ctv2, ui
- if (!at_least_one_cluster) {
- return;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- vec1 = *writevec;
- vec2 = *patchvec++;
-@@ -3246,7 +3246,7 @@ void zeropatch(uintptr_t sample_ctv2, ui
-
- void reverse_subset(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t word_ct) {
- // reverse_loadbuf() variant that requires subset_vec2 bit to be set
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* wvec = (__m128i*)writebuf;
- __m128i* svec = (__m128i*)subset_vec2;
- __m128i* wvec_end = (__m128i*)(&(writebuf[word_ct]));
-@@ -3273,7 +3273,7 @@ void reverse_subset(uintptr_t* writebuf,
-
- void replace_missing_a2(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t word_ct) {
- // 01 -> 11 for each set bit in subset_vec2
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* wvec = (__m128i*)writebuf;
- __m128i* svec = (__m128i*)subset_vec2;
- __m128i* wvec_end = (__m128i*)(&(writebuf[word_ct]));
-@@ -5139,7 +5139,7 @@ int32_t incr_text_allele0(char cc, char*
-
- typedef struct ll_str_fixed_struct {
- struct ll_str_struct* next;
--#ifdef __LP64__
-+#ifdef __SSE2__
- char ss[8];
- #else
- char ss[12];
-@@ -11630,7 +11630,7 @@ uint32_t valid_vcf_allele_code(const cha
- uii -= 64;
- // A = 1, C = 3, G = 7, N = 14, T = 20, so (0x10408a >> ucc) & 1 works as a
- // set membership test
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((uii > 63) || (!((0x10408a0010408aLLU >> uii) & 1))) {
- // if '[', ']', or '.', assume breakend
- return ((uii == 27) || (uii == 29) || (uii == 0xffffffeeU))? 1 : 0;
-@@ -15707,7 +15707,7 @@ int32_t merge_datasets(char* bedname, ch
- logerrprint("Warning: --merge-list file is empty.\n");
- }
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (ullxx > 0x7fffffff) {
- goto merge_datasets_ret_NOMEM;
- }
-@@ -15807,7 +15807,7 @@ int32_t merge_datasets(char* bedname, ch
- max_cur_sample_ct = cur_sample_ct;
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (ullxx > 0x7fffffff) {
- sprintf(logbuf, "Error: Too many %s (max 2147483647).\n", g_species_plural);
- goto merge_datasets_ret_INVALID_FORMAT_2;
-@@ -16045,7 +16045,7 @@ int32_t merge_datasets(char* bedname, ch
- if (position_warning_ct > 3) {
- fprintf(stderr, "%" PRIu64 " more multiple-position warning%s: see log file.\n", position_warning_ct - 3, (position_warning_ct == 4)? "" : "s");
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (ullxx > 0x7fffffff) {
- logerrprint("Error: Too many variants (max 2147483647).\n");
- goto merge_datasets_ret_INVALID_FORMAT;
---- plink1.9-1.90~b3w-150903.orig/plink_dosage.c
-+++ plink1.9-1.90~b3w-150903/plink_dosage.c
-@@ -177,7 +177,7 @@ int32_t dosage_load_score_files(Score_in
- logerrprint("Error: --score does not support >= 2^30 variants.\n");
- goto dosage_load_score_files_ret_INVALID_FORMAT;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (allele_code_buf_len > 0x7fffffff) {
- goto dosage_load_score_files_ret_NOMEM;
- }
-@@ -1762,7 +1762,7 @@ int32_t plink1_dosage(Dosage_info* doip,
- if (load_map) {
- marker_idx = id_htable_find(bufptr, slen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
- if (marker_idx == 0xffffffffU) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- marker_idx = ~ZEROLU;
- #endif
- continue;
---- plink1.9-1.90~b3w-150903.orig/plink_family.c
-+++ plink1.9-1.90~b3w-150903/plink_family.c
-@@ -728,7 +728,7 @@ int32_t mendel_error_scan(Family_info* f
- uint32_t* error_cts_tmp;
- uint32_t* error_cts_tmp2;
- uint32_t* uiptr;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vptr;
- __m128i* vptr2;
- #endif
-@@ -993,7 +993,7 @@ int32_t mendel_error_scan(Family_info* f
- }
- if ((cur_error_ct <= var_error_max) || (!var_first)) {
- if (var_first) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- vptr = (__m128i*)error_cts_tmp;
- vptr2 = (__m128i*)error_cts_tmp2;
- for (trio_idx = 0; trio_idx < trio_ct4; trio_idx++) {
-@@ -3125,7 +3125,7 @@ int32_t dfam(pthread_t* threads, FILE* b
- if (retval) {
- goto dfam_ret_1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((12 * sample_ct + 2 * family_ct) > 0xffffffffLLU) {
- logerrprint("Error: Too many samples and families for DFAM test.\n");
- goto dfam_ret_INVALID_CMDLINE;
-@@ -4429,7 +4429,7 @@ int32_t qfam(pthread_t* threads, FILE* b
- goto qfam_ret_1;
- }
- g_family_ct = family_ct;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // no need to check in 32-bit case since a nomem error would have occurred
- // earlier...
- // (okay, no need to check anyway, but best to document this overflow
---- plink1.9-1.90~b3w-150903.orig/plink_filter.c
-+++ plink1.9-1.90~b3w-150903/plink_filter.c
-@@ -1695,7 +1695,7 @@ int32_t mind_filter(FILE* bedfile, uintp
- return retval;
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void freq_hwe_haploid_count_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* ct_nmp, uint32_t* ct_hmajp) {
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1953,7 +1953,7 @@ static inline void single_marker_freqs_a
- uintptr_t loader;
- uintptr_t loader2;
- uintptr_t loader3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- unfiltered_sample_ctl2 -= unfiltered_sample_ctl2 % 12;
-@@ -2082,7 +2082,7 @@ static inline void haploid_single_marker
- uintptr_t loader2;
- uintptr_t loader3;
- uintptr_t loader4;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- unfiltered_sample_ctl2 -= unfiltered_sample_ctl2 % 12;
---- plink1.9-1.90~b3w-150903.orig/plink_glm.c
-+++ plink1.9-1.90~b3w-150903/plink_glm.c
-@@ -272,7 +272,7 @@ int32_t glm_scan_conditions(char* condit
- uintptr_t condition_ct = 0;
- uintptr_t line_idx = 0;
- int32_t retval = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* loadbuf_vptr;
- __m128i* loadbuf_mask_vptr;
- __m128i* loadbuf_vend;
-@@ -384,7 +384,7 @@ int32_t glm_scan_conditions(char* condit
- }
- vec_include_init(unfiltered_sample_ct, loadbuf_mask_orig, load_mask);
- memcpy(loadbuf_mask, loadbuf_mask_orig, unfiltered_sample_ctv2 * sizeof(intptr_t));
--#ifdef __LP64__
-+#ifdef __SSE2__
- loadbuf_vend = (__m128i*)(&(loadbuf_raw[unfiltered_sample_ctv2]));
- #else
- loadbuf_end = &(loadbuf_raw[unfiltered_sample_ctl2]);
-@@ -406,7 +406,7 @@ int32_t glm_scan_conditions(char* condit
- haploid_fix(hh_or_mt_exists, sample_raw_include2, sample_raw_male_include2, unfiltered_sample_ct, is_x, is_y, (unsigned char*)loadbuf_raw);
- }
- // clear loadbuf_mask bits where loadbuf is 01.
--#ifdef __LP64__
-+#ifdef __SSE2__
- loadbuf_vptr = (__m128i*)loadbuf_raw;
- loadbuf_mask_vptr = (__m128i*)loadbuf_mask;
- do {
-@@ -845,7 +845,7 @@ uint32_t glm_linear(uintptr_t cur_batch_
- // Lakhani and Eva Guinan.
- // #####
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // exp_ps is a C port of Shigeo Mitsunari's fast math library posted at
- // http://homepage1.nifty.com/herumi/ . License is
- // http://opensource.org/licenses/BSD-3-Clause .
-@@ -1180,7 +1180,7 @@ static inline __m128 fmath_exp_ps(__m128
- return tt;
- }
-
--// For equivalent "normal" C/C++ code, see the non-__LP64__ versions of these
-+// For equivalent "normal" C/C++ code, see the non-__SSE2__ versions of these
- // functions.
- static inline void logistic_sse(float* vect, uint32_t nn) {
- __m128 zero = _mm_setzero_ps();
-@@ -1521,7 +1521,7 @@ static inline void compute_two_plus_one_
- u16.vf = s3;
- *r3_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
- }
--#else // no __LP64__ (and hence, unsafe to assume presence of SSE2)
-+#else // no __SSE2__ (and hence, unsafe to assume presence of SSE2)
- static inline void logistic_sse(float* vect, uint32_t nn) {
- uint32_t uii;
- for (uii = 0; uii < nn; uii++) {
---- plink1.9-1.90~b3w-150903.orig/plink_homozyg.c
-+++ plink1.9-1.90~b3w-150903/plink_homozyg.c
-@@ -85,7 +85,7 @@ void update_end_nonhom(uintptr_t* readbu
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define ROH_ENTRY_INTS 7
- #else
- #define ROH_ENTRY_INTS 6
-@@ -210,7 +210,7 @@ void save_confirmed_roh_extend(uint32_t
- *roh_list++ = cidx_len;
- *roh_list++ = cidx_len - cur_roh_het_ct - cur_roh_missing_ct;
- *roh_list++ = cur_roh_het_ct;
--#ifdef __LP64__
-+#ifdef __SSE2__
- *roh_list++ = (uint32_t)sample_last_roh_idx;
- *roh_list++ = (uint32_t)(sample_last_roh_idx >> 32);
- #else
-@@ -283,7 +283,7 @@ uint32_t roh_update(Homozyg_info* hp, ui
- *roh_list_cur++ = cidx_len - cur_het_ct - cur_roh_missing_cts[sample_idx];
- *roh_list_cur++ = cur_het_ct;
- last_roh_idx = sample_to_last_roh[sample_idx];
--#ifdef __LP64__
-+#ifdef __SSE2__
- *roh_list_cur++ = (uint32_t)last_roh_idx;
- *roh_list_cur++ = (uint32_t)(last_roh_idx >> 32);
- #else
-@@ -465,7 +465,7 @@ int32_t write_main_roh_reports(char* out
- cur_roh_ct = 0;
- while (cur_roh_idx != ~ZEROLU) {
- cur_roh = &(roh_list[cur_roh_idx * ROH_ENTRY_INTS]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- prev_roh_idx = ((uintptr_t)cur_roh[5]) | (((uintptr_t)cur_roh[6]) << 32);
- cur_roh[5] = (uint32_t)next_roh_idx;
- cur_roh[6] = (uint32_t)(next_roh_idx >> 32);
-@@ -515,7 +515,7 @@ int32_t write_main_roh_reports(char* out
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
- goto write_main_roh_reports_ret_WRITE_FAIL;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_roh_idx = ((uintptr_t)cur_roh[5]) | (((uintptr_t)cur_roh[6]) << 32);
- #else
- cur_roh_idx = (uintptr_t)cur_roh[5];
-@@ -739,7 +739,7 @@ void extract_pool_info(uint32_t pool_siz
- void initialize_roh_slot(uint32_t* cur_roh, uint32_t chrom_start, uint32_t* marker_uidx_to_cidx, uintptr_t* roh_slot, uint32_t* roh_slot_cidx_start, uint32_t* roh_slot_cidx_end, uint32_t* roh_slot_end_uidx) {
- uint32_t cidx_first = marker_uidx_to_cidx[cur_roh[0] - chrom_start];
- uint32_t cidx_last = marker_uidx_to_cidx[cur_roh[1] - chrom_start];
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t cidx_first_block = cidx_first & (~63);
- uint32_t cidx_last_block = cidx_last & (~63);
- uint32_t cur_bidx = 2;
-@@ -759,7 +759,7 @@ void initialize_roh_slot(uint32_t* cur_r
- *roh_slot_cidx_end = cidx_last + 1;
- *roh_slot_end_uidx = cur_roh[1] + 1;
- uii = cidx_first & (BITCT2 - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cidx_first & 32) {
- roh_slot[0] = FIVEMASK;
- roh_slot[1] = 0x1555555555555555LLU >> (2 * (31 - uii));
-@@ -772,7 +772,7 @@ void initialize_roh_slot(uint32_t* cur_r
- #endif
- fill_ulong_zero(&(roh_slot[cur_bidx]), end_bidx - cur_bidx);
- uii = cidx_last & (BITCT2 - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cidx_last & 32) {
- // |= instead of = in case first_block and last_block are the same
- roh_slot[end_bidx - 1] |= 0x5555555555555554LLU << (2 * uii);
-@@ -821,7 +821,7 @@ void populate_roh_slots_from_lookahead_b
- read_shift = 2 * (sample_uidx & (BITCT2 - 1));
- slot_idx = (uintptr_t)((*roh_slot_map) & 0xffffffffU);
- cidx_start = roh_slot_cidx_start[slot_idx];
--#ifdef __LP64__
-+#ifdef __SSE2__
- cidx_start_block = cidx_start & (~63);
- #else
- cidx_start_block = cidx_start & (~15);
-@@ -882,7 +882,7 @@ int32_t populate_roh_slots_from_disk(FIL
- roh_write_slot_idx = (uintptr_t)(roh_slot_map[roh_read_slot_idx] & 0xffffffffU);
- cidx_start = roh_slot_cidx_start[roh_write_slot_idx];
- if ((marker_cidx >= cidx_start) && (marker_cidx < roh_slot_cidx_end[roh_write_slot_idx])) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- start_c_bidx = 2 * (cidx_start / 64);
- #else
- start_c_bidx = cidx_start / 16;
-@@ -895,7 +895,7 @@ int32_t populate_roh_slots_from_disk(FIL
- }
-
- static inline uint32_t is_allelic_match(double mismatch_max, uintptr_t* roh_slot_idxl, uintptr_t* roh_slot_idxs, uint32_t block_start_idxl, uint32_t block_start_idxs, uint32_t overlap_cidx_start, uint32_t overlap_cidx_end) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1179,7 +1179,7 @@ void compute_allelic_match_matrix(double
- incr_idxl = 0;
- roh_slot_idxl = &(roh_slots[slot_idxl * roh_slot_wsize]);
- cidx_start_idxl = roh_slot_cidx_start[slot_idxl];
--#ifdef __LP64__
-+#ifdef __SSE2__
- block_start_idxl = cidx_start_idxl & (~63);
- #else
- block_start_idxl = cidx_start_idxl & (~15);
-@@ -1197,7 +1197,7 @@ void compute_allelic_match_matrix(double
- }
- slot_idxs = (uint32_t)(roh_slot_map[map_idxs]);
- cidx_start_idxs = roh_slot_cidx_start[slot_idxs];
--#ifdef __LP64__
-+#ifdef __SSE2__
- block_start_idxs = cidx_start_idxs & (~63);
- #else
- block_start_idxs = cidx_start_idxs & (~15);
-@@ -1244,7 +1244,7 @@ void assign_allelic_match_groups(uint32_
- if (ulii) {
- nsim_nz_ct++;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[pool_idx] = ulii << 32;
- #else
- cur_pool[2 * pool_idx + 1] = ulii;
-@@ -1278,14 +1278,14 @@ void assign_allelic_match_groups(uint32_
- nsim_nz_ct--;
- allelic_match_cts[pool_idx] = 0xffffffffU;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[pool_idx] = (cur_pool[pool_idx] & 0xffffffff00000000LLU) | group_idx;
- #else
- cur_pool[2 * pool_idx] = group_idx;
- #endif
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[max_nsim_pidx] |= 0x80000000LLU | (group_idx++);
- #else
- cur_pool[2 * max_nsim_pidx] = 0x80000000U | (group_idx++);
-@@ -1293,7 +1293,7 @@ void assign_allelic_match_groups(uint32_
- }
- for (pool_idx = 0; pool_idx < pool_size; pool_idx++) {
- if (allelic_match_cts[pool_idx] != 0xffffffffU) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[pool_idx] |= 0x80000000LLU | (group_idx++);
- #else
- cur_pool[2 * pool_idx] = 0x80000000U | (group_idx++);
-@@ -1425,7 +1425,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- uii = chrom_len;
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- // want each roh_slots space to be 16-byte aligned, to enable SSE2
- // max_roh_len = 1 -> 1 vec
- // max_roh_len in {2..65} -> 2 vecs
-@@ -1538,7 +1538,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- // [3P+3]: consensus NSNP
- // [3P+4]: union NSNP
- old_pool_list_size = pool_list_size;
--#ifdef __LP64__
-+#ifdef __SSE2__
- pool_list_size += 2 * pool_size + 3;
- #else
- pool_list_size += 3 * pool_size + 5;
-@@ -1550,7 +1550,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- *cur_pool++ = pool_size_first_plidx[pool_size - pool_size_min];
- pool_size_first_plidx[pool_size - pool_size_min] = old_pool_list_size;
- *cur_pool++ = pool_size;
--#ifndef __LP64__
-+#ifndef __SSE2__
- *cur_pool++ = 0;
- #endif
- uiptr = sample_uidx_sort_buf;
-@@ -1560,14 +1560,14 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- pool_list_idx = roh_slots[slot_idx1]; // actually a ROH idx
- *uiptr++ = roh_list[pool_list_idx * ROH_ENTRY_INTS + 5]; // sample_uidx
- *uiptr++ = (uint32_t)pool_list_idx;
--#ifdef __LP64__
-+#ifdef __SSE2__
- *uiptr++ = (uint32_t)(pool_list_idx >> 32);
- #endif
- }
- // sort in increasing sample_uidx order, for reproducible results
- qsort(sample_uidx_sort_buf, pool_size, 4 + sizeof(intptr_t), intcmp);
- for (uii = 0; uii < pool_size; uii++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- *cur_pool++ = ((uintptr_t)sample_uidx_sort_buf[3 * uii + 1]) | (((uintptr_t)sample_uidx_sort_buf[3 * uii + 2]) << 32);
- #else
- *cur_pool++ = sample_uidx_sort_buf[2 * uii + 1];
-@@ -1616,7 +1616,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- for (pool_size = max_pool_size; pool_size >= pool_size_min; --pool_size) {
- pool_list_idx = pool_size_first_plidx[pool_size - pool_size_min];
- while (pool_list_idx != ~ZEROLU) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- pool_list[pool_list_idx + 1] |= ((uintptr_t)(++uii)) << 32;
- #else
- pool_list[pool_list_idx + 2] = ++uii;
-@@ -1666,7 +1666,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- pool_list_idx = pool_list[pool_list_idx - 1];
- cur_pool = &(pool_list[pool_list_idx]);
- pool_size = (uint32_t)cur_pool[1];
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool = &(cur_pool[2]);
- #else
- cur_pool = &(cur_pool[3]);
-@@ -1861,7 +1861,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
-
- assign_allelic_match_groups(pool_size, allelic_match_cts, allelic_match_matrix, roh_slot_map, &(cur_pool[pool_size]));
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[2 * pool_size] = (((uintptr_t)(marker_uidx_to_cidx[union_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[union_uidx1 - chrom_start])) << 32) | ((uintptr_t)(marker_uidx_to_cidx[con_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[con_uidx1 - chrom_start]));
- #else
- cur_pool[3 * pool_size] = marker_uidx_to_cidx[con_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[con_uidx1 - chrom_start];
-@@ -1869,7 +1869,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- #endif
-
- if (is_verbose) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- wptr = uint32_write(&(outname_end[14]), (uint32_t)(cur_pool[-1] >> 32));
- #else
- wptr = uint32_write(&(outname_end[14]), (uint32_t)cur_pool[-1]);
-@@ -1880,7 +1880,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- }
-
- for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- verbose_group_sort_buf[slot_idx1] = ((cur_pool[pool_size + slot_idx1] & 0x7fffffffLLU) << 32) | ((uint64_t)slot_idx1);
- #else
- verbose_group_sort_buf[slot_idx1] = (((uint64_t)(cur_pool[pool_size + 2 * slot_idx1] & 0x7fffffff)) << 32) | ((uint64_t)slot_idx1);
-@@ -2222,7 +2222,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- while (pool_list_idx != ~ZEROLU) {
- cur_pool = &(pool_list[pool_list_idx]);
- pool_list_idx = *cur_pool;
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool = &(cur_pool[2]);
- #else
- cur_pool = &(cur_pool[3]);
-@@ -2240,7 +2240,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- // sort pool members primarily by allelic-match group number, then by
- // internal ID
- for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- roh_slot_map[slot_idx1] = ((cur_pool[pool_size + slot_idx1] & 0x7fffffffLLU) << 32) | ((uint64_t)slot_idx1);
- #else
- // would like to just sort 32-bit integers, but if there are >32k
-@@ -2294,7 +2294,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- }
- wptr = roh_pool_write_middle(wptr, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, is_new_lengths, marker_uidx1, marker_uidx2);
- wptr = uint32_writew8x(wptr, cur_roh[2], ' ');
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = cur_pool[pool_size + slot_idx2];
- wptr = width_force(4, wptr, uint32_write(wptr, (uint32_t)(ulii >> 32)));
- *wptr++ = ' ';
-@@ -2325,7 +2325,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- wptr = fw_strcpyn(plink_maxfid, 3, "CON", wptr_start);
- marker_uidx1 = con_uidx1;
- marker_uidx2 = con_uidx2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- marker_cidx = (uint32_t)(cur_pool[2 * pool_size]);
- #else
- marker_cidx = cur_pool[3 * pool_size];
-@@ -2334,7 +2334,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- wptr = fw_strcpyn(plink_maxfid, 5, "UNION", wptr_start);
- marker_uidx1 = union_uidx1;
- marker_uidx2 = union_uidx2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // NSNP
- marker_cidx = (uint32_t)(cur_pool[2 * pool_size] >> 32);
- #else
-@@ -2740,7 +2740,7 @@ int32_t calc_homozyg(Homozyg_info* hp, F
- if (hp->modifier & (HOMOZYG_GROUP | HOMOZYG_GROUP_VERBOSE)) {
- if (max_pool_size < hp->pool_size_min) {
- LOGERRPRINTF("Warning: Skipping --homozyg group%s report since there are no pools.\n", (hp->modifier & HOMOZYG_GROUP_VERBOSE)? "-verbose" : "");
--#ifndef __LP64__
-+#ifndef __SSE2__
- } else if (max_pool_size > 65536) {
- logerrprint("Error: 32-bit " PROG_NAME_STR "'s --homozyg group cannot handle a pool of size >65536.\n");
- goto calc_homozyg_ret_NOMEM;
---- plink1.9-1.90~b3w-150903.orig/plink_ld.c
-+++ plink1.9-1.90~b3w-150903/plink_ld.c
-@@ -73,7 +73,7 @@ void ld_epi_cleanup(Ld_info* ldip, Epi_i
- free_cond(clump_ip->range_fname);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void ld_dot_prod_batch(__m128i* vec1, __m128i* vec2, __m128i* mask1, __m128i* mask2, int32_t* return_vals, uint32_t iters) {
- // Main routine for computation of \sum_i^M (x_i - \mu_x)(y_i - \mu_y), where
- // x_i, y_i \in \{-1, 0, 1\}, but there are missing values.
-@@ -494,7 +494,7 @@ int32_t ld_dot_prod_nm(uintptr_t* vec1,
- result -= ld_dot_prod_nm_batch(vec1, vec2, last_batch_size);
- return result;
- }
--#endif // __LP64__
-+#endif // __SSE2__
-
- uint32_t ld_process_load(uintptr_t* geno_buf, uintptr_t* mask_buf, uintptr_t* missing_buf, uint32_t* missing_ct_ptr, double* sum_ptr, double* variance_recip_ptr, uint32_t founder_ct, uint32_t is_x, uint32_t weighted_x, uint32_t nonmale_founder_ct, uintptr_t* founder_male_include2, uintptr_t* nonmale_geno, uintptr_t* nonmale_masks, uintptr_t nonmale_offset) {
- uintptr_t* geno_ptr = geno_buf;
-@@ -732,14 +732,14 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
- uintptr_t unfiltered_sample_ctl2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
- uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl2 / 2);
- uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t founder_ctv = 2 * ((founder_ct + 127) / 128);
- #else
- uintptr_t founder_ctv = founder_ctl;
- #endif
- uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
- #else
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
-@@ -862,7 +862,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
- if (pairwise) {
- prune_ld_thresh = ld_last_param * (1 + SMALL_EPSILON);
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (window_max > 46340) {
- // todo: check what LAPACK's matrix inversion limit actually is. Guess
- // sqrt(2^31 - 1) for now.
-@@ -1290,7 +1290,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
- ld_prune_ret_INVALID_FORMAT:
- retval = RET_INVALID_FORMAT;
- break;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ld_prune_ret_INVALID_CMDLINE:
- retval = RET_INVALID_CMDLINE;
- break;
-@@ -1338,7 +1338,7 @@ uint32_t ld_missing_ct_intersect(uintptr
- // variant of popcount_longs_intersect()
- uintptr_t tot = 0;
- uintptr_t* lptr1_end2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1538,7 +1538,7 @@ int32_t flipscan(Ld_info* ldip, FILE* be
- pheno_ctl[is_case] = (pheno_ct[is_case] + (BITCT - 1)) / BITCT;
- ulii = (pheno_ct[is_case] + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- pheno_ct_mld_m1[is_case] = ulii - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 192;
- #else
- pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 48;
-@@ -2295,7 +2295,7 @@ int32_t ld_report_matrix(pthread_t* thre
- ulptr = (uintptr_t*)tbuf;
- // assume little-endian
- // 0[delim]0[delim]...
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = 0x30003000300030LLU | (0x100010001000100LLU * ((unsigned char)g_ld_delimiter));
- #else
- ulii = 0x300030 | (0x1000100 * ((unsigned char)g_ld_delimiter));
-@@ -2816,7 +2816,7 @@ uint32_t load_and_split3(FILE* bedfile,
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static void two_locus_3x3_tablev(__m128i* vec1, __m128i* vec2, uint32_t* counts_3x3, uint32_t sample_ctv6, uint32_t iter_ct) {
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
-@@ -3039,7 +3039,7 @@ static inline void two_locus_3x3_zmiss_t
- #endif
-
- static void two_locus_count_table_zmiss1(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_uint_zero(counts_3x3, 6);
- if (is_zmiss2) {
- two_locus_3x3_zmiss_tablev((__m128i*)lptr1, (__m128i*)lptr2, counts_3x3, sample_ctv3 / 2);
-@@ -3060,7 +3060,7 @@ static void two_locus_count_table_zmiss1
- }
-
- static void two_locus_count_table(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t uii;
- fill_uint_zero(counts_3x3, 9);
- if (!is_zmiss2) {
-@@ -6052,7 +6052,7 @@ int32_t ld_report(pthread_t* threads, Ld
- uintptr_t* founder_male_include2 = NULL;
- uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
- #else
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
-@@ -6312,7 +6312,7 @@ int32_t show_tags(Ld_info* ldip, FILE* b
- }
- founder_ct_mld_m1 = (founder_ct - 1) / MULTIPLEX_LD;
- ulii = founder_ct_mld_m1 + 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - founder_ct) / 192;
- #else
- founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - founder_ct) / 48;
-@@ -7105,7 +7105,7 @@ int32_t haploview_blocks(Ld_info* ldip,
- if (max_block_size < 2) {
- continue;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (max_block_size > 65536) {
- logprint("\n");
- logerrprint("Error: 32-bit --blocks cannot analyze potential blocks with more than 65536\nvariants. Use a 64-bit PLINK build or a smaller --blocks-window-kb value.\n");
-@@ -7460,7 +7460,7 @@ int32_t haploview_blocks(Ld_info* ldip,
- haploview_blocks_ret_WRITE_FAIL:
- retval = RET_WRITE_FAIL;
- break;
--#ifndef __LP64__
-+#ifndef __SSE2__
- haploview_blocks_ret_INVALID_CMDLINE:
- retval = RET_INVALID_CMDLINE;
- break;
-@@ -11619,7 +11619,7 @@ int32_t construct_ld_map(pthread_t* thre
- uintptr_t founder_ctv2 = founder_ctl * 2;
- uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
- #else
- uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
-@@ -11823,7 +11823,7 @@ int32_t construct_ld_map(pthread_t* thre
- // don't need to load the first intersecting member or anything
- // before it, since we're only traversing the upper right triangle
- wlen += firstw;
--#ifdef __LP64__
-+#ifdef __SSE2__
- firstw = 2 * (uii / 128);
- #else
- firstw = uii / 32;
-@@ -13483,7 +13483,7 @@ int32_t clump_reports(FILE* bedfile, uin
- *bufptr++ = ' ';
- bufptr = uint32_writew10x(bufptr, cur_bp, ' ');
- bufptr = double_g_writewx3x(bufptr, pval, 10, ' ');
--#ifdef __LP64__
-+#ifdef __SSE2__
- // may as well be paranoid
- bufptr = width_force(8, bufptr, int64_write(bufptr, (int64_t)(histo[0] + histo[1] + histo[2] + histo[3] + histo[4])));
- *bufptr++ = ' ';
---- plink1.9-1.90~b3w-150903.orig/plink_matrix.h
-+++ plink1.9-1.90~b3w-150903/plink_matrix.h
-@@ -55,7 +55,7 @@ extern "C" {
-
- #else // not _WIN32
- #include <cblas.h>
--#ifdef __LP64__
-+#ifdef __SSE2__
- typedef int32_t __CLPK_integer;
- #else
- typedef long int __CLPK_integer;
---- plink1.9-1.90~b3w-150903.orig/plink_misc.c
-+++ plink1.9-1.90~b3w-150903/plink_misc.c
-@@ -5687,7 +5687,7 @@ int32_t meta_analysis(char* input_fnames
- if (!final_variant_ct) {
- logerrprint("Error: No --meta-analysis variants.\n");
- goto meta_analysis_ret_INVALID_CMDLINE;
--#ifdef __LP64__
-+#ifdef __SSE2__
- } else if (final_variant_ct > 0x7fffffff) {
- logerrprint("Error: Too many distinct --meta-analysis variants (max 2^31 - 1).\n");
- #endif
-@@ -5818,7 +5818,7 @@ int32_t meta_analysis(char* input_fnames
- memcpy(&cur_file_ct_m1, bufptr2, file_ct_byte_width);
- cur_data_slots = 0;
- if (report_study_specific) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_data_slots += file_ct64;
- #else
- cur_data_slots += 2 * file_ct64;
---- plink1.9-1.90~b3w-150903.orig/plink_set.c
-+++ plink1.9-1.90~b3w-150903/plink_set.c
-@@ -1480,7 +1480,7 @@ int32_t define_sets(Set_info* sip, uintp
- if (retval) {
- goto define_sets_ret_NOMEM2;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
- #else
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
-@@ -2508,7 +2508,7 @@ int32_t annotate(Annot_info* aip, char*
- while (1) {
- ll_ptr = *ll_pptr;
- if (!ll_ptr) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- // we'll run out of memory way earlier in 32-bit mode
- if (attr_id_ct == 0x80000000LLU) {
- sprintf(logbuf, "Error: Too many unique attributes in %s (max 2147483648).\n", aip->attrib_fname);
-@@ -2647,7 +2647,7 @@ int32_t annotate(Annot_info* aip, char*
- if (retval) {
- goto annotate_ret_1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (range_ct > 0x80000000LLU) {
- sprintf(logbuf, "Error: Too many annotations in %s (max 2147483648, counting multi-chromosome annotations once per spanned chromosome).\n", aip->ranges_fname);
- goto annotate_ret_INVALID_FORMAT_WW;
-@@ -2743,7 +2743,7 @@ int32_t annotate(Annot_info* aip, char*
- } else {
- unique_annot_ct = attr_id_ct;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- unique_annot_ctlw = (unique_annot_ct + 3) / 4;
- #else
- unique_annot_ctlw = (unique_annot_ct + 1) / 2;
-@@ -2759,7 +2759,7 @@ int32_t annotate(Annot_info* aip, char*
- ulptr = (uintptr_t*)writebuf;
- for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
- // fill with repeated " 0"
--#ifdef __LP64__
-+#ifdef __SSE2__
- *ulptr++ = 0x3020302030203020LLU;
- #else
- *ulptr++ = 0x30203020;
-@@ -3100,7 +3100,7 @@ int32_t annotate(Annot_info* aip, char*
- // reinitialize
- ulptr = (uintptr_t*)writebuf;
- for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- *ulptr++ = 0x3020302030203020LLU;
- #else
- *ulptr++ = 0x30203020;
-@@ -3285,7 +3285,7 @@ int32_t gene_report(char* fname, char* g
- if (retval) {
- goto gene_report_ret_1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (gene_ct > 0x80000000LLU) {
- sprintf(logbuf, "Error: Too many genes in %s (max 2147483648).\n", glist);
- goto gene_report_ret_INVALID_FORMAT_WW;
-@@ -3495,7 +3495,7 @@ int32_t gene_report(char* fname, char* g
- ((uint32_t*)linebuf_top)[1] = cur_bp;
- linebuf_left -= slen + 8;
- linebuf_top = &(linebuf_top[slen + 8]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (saved_line_ct == 0x100000000LLU) {
- sprintf(logbuf, "Error: Too many valid lines in %s (--gene-report can only handle 4294967296).\n", fname);
- goto gene_report_ret_INVALID_FORMAT_WW;
diff --git a/debian/patches/series b/debian/patches/series
index 0f5610a..c1895f8 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,3 +1,2 @@
01_Fix_use_internal_lib.patch
02_Activate_Stable_Build.patch
-03_replace_LP64_by_SSE2.patch
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git
More information about the debian-med-commit
mailing list