[med-svn] [plink1.9] 01/01: add patch to properly test for SSE2

Gert Wollny gert-guest at moszumanska.debian.org
Sat Sep 19 17:45:43 UTC 2015


This is an automated email from the git hooks/post-receive script.

gert-guest pushed a commit to branch master
in repository plink1.9.

commit b91b5d1cca1586d90e1c7de8a7508603207fe277
Author: Gert Wollny <gw.fossdev at gmail.com>
Date:   Sat Sep 19 19:31:59 2015 +0200

    add patch to properly test for SSE2
---
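
Note on the two macros: GCC and Clang predefine __LP64__ on any target
using the LP64 data model (long and pointers are 64-bit), including
64-bit architectures without SSE2 such as arm64 or ppc64el; __SSE2__ is
predefined only when SSE2 code generation is enabled, which is what
makes <emmintrin.h> and the _mm_* intrinsics safe to use. A minimal
standalone sketch of the distinction (assumes GCC or Clang; the file
name demo.c is illustrative):

/* demo.c - probe the two predefined macros this patch distinguishes.
 * Build with: cc -O2 demo.c  (on 32-bit x86, add -msse2 to enable SSE2) */
#include <stdio.h>

#ifdef __SSE2__
#include <emmintrin.h>  /* SSE2 intrinsics header */
#endif

int main(void) {
#ifdef __LP64__
  puts("__LP64__ defined: 64-bit data model (long and pointers are 64-bit)");
#else
  puts("__LP64__ not defined: 32-bit data model");
#endif
#ifdef __SSE2__
  /* SSE2 intrinsics are usable on this target. */
  __m128i v = _mm_set1_epi32(1);  /* four 32-bit lanes, each set to 1 */
  v = _mm_add_epi32(v, v);        /* lane-wise add: each lane becomes 2 */
  (void)v;
  puts("__SSE2__ defined: SSE2 intrinsics available");
#else
  puts("__SSE2__ not defined: generic (non-vector) code path required");
#endif
  return 0;
}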
 debian/patches/03_replace_LP64_by_SSE2.patch | 2589 ++++++++++++++++++++++++++
 debian/patches/series                        |    1 +
 2 files changed, 2590 insertions(+)

diff --git a/debian/patches/03_replace_LP64_by_SSE2.patch b/debian/patches/03_replace_LP64_by_SSE2.patch
new file mode 100644
index 0000000..b6f7c23
--- /dev/null
+++ b/debian/patches/03_replace_LP64_by_SSE2.patch
@@ -0,0 +1,2589 @@
+Description: Replace the test for LP64 with a test for SSE2
+ This patch replaces the test for the define __LP64__ with a test for
+ __SSE2__, since the former only indicates whether the architecture is
+ 64-bit, whereas it is actually the availability of SSE2 that needs to
+ be tested.
+Author: Gert Wollny <gw.fossdev at gmail.com> 
+Bug-Debian: http://bugs.debian.org/799471
+Forwarded: no
+Last-Update: 2015-09-19
+
+--- plink1.9-1.90~b3w-150903.orig/Rsrv.h
++++ plink1.9-1.90~b3w-150903/Rsrv.h
+@@ -355,11 +355,11 @@ typedef unsigned long rlen_t;
+ #ifdef ULONG_MAX
+ #define rlen_max ULONG_MAX
+ #else
+-#ifdef __LP64__
++#ifdef __SSE2__
+ #define rlen_max 0xffffffffffffffffL 
+ #else
+ #define rlen_max 0xffffffffL
+-#endif /* __LP64__ */
++#endif /* __SSE2__ */
+ #endif /* ULONG_MAX */
+ 
+ 
+--- plink1.9-1.90~b3w-150903.orig/SFMT.c
++++ plink1.9-1.90~b3w-150903/SFMT.c
+@@ -48,7 +48,7 @@ extern "C" {
+ #include <assert.h>
+ #include "SFMT.h"
+ 
+-#ifndef __LP64__
++#ifndef __SSE2__
+ inline static void do_recursion(w128_t * r, w128_t * a, w128_t * b,
+ 				w128_t * c, w128_t * d);
+ #endif
+@@ -110,7 +110,7 @@ inline static void lshift128(w128_t *out
+  * @param c a 128-bit part of the internal state array
+  * @param d a 128-bit part of the internal state array
+  */
+-#ifndef __LP64__
++#ifndef __SSE2__
+ inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
+ 				w128_t *c, w128_t *d)
+ {
+@@ -144,7 +144,7 @@ inline static uint32_t func1(uint32_t x)
+ inline static uint32_t func2(uint32_t x);
+ static void period_certification(sfmt_t * sfmt);
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ inline static void mm_recursion(__m128i * r, __m128i a, __m128i b,
+ 				__m128i c, __m128i d);
+ 
+@@ -255,7 +255,7 @@ inline static int idxof(int i) {
+     return i;
+ }
+ 
+-#ifndef __LP64__
++#ifndef __SSE2__
+ /**
+  * This function fills the user-specified array with pseudorandom
+  * integers.
+@@ -391,7 +391,7 @@ int sfmt_get_min_array_size64(sfmt_t * s
+     return SFMT_N64;
+ }
+ 
+-#ifndef __LP64__
++#ifndef __SSE2__
+ /**
+  * This function fills the internal state array with pseudorandom
+  * integers.
+--- plink1.9-1.90~b3w-150903.orig/SFMT.h
++++ plink1.9-1.90~b3w-150903/SFMT.h
+@@ -128,7 +128,7 @@ extern "C" {
+ /*------------------------------------------
+   128-bit SIMD like data type for standard C
+   ------------------------------------------*/
+-#ifdef __LP64__
++#ifdef __SSE2__
+   #include <emmintrin.h>
+ 
+ /** 128-bit data structure */
+--- plink1.9-1.90~b3w-150903.orig/plink.c
++++ plink1.9-1.90~b3w-150903/plink.c
+@@ -98,7 +98,7 @@ const char ver_str[] =
+ #ifdef NOLAPACK
+   "NL"
+ #endif
+-#ifdef __LP64__
++#ifdef __SSE2__
+   " 64-bit"
+ #else
+   " 32-bit"
+@@ -1670,7 +1670,7 @@ int32_t plink(char* outname, char* outna
+     wkspace_mark_postcluster = wkspace_base;
+     ulii = (sample_ct * (sample_ct - 1)) >> 1;
+     if (cluster_ptr->mds_dim_ct) {
+-#ifndef __LP64__
++#ifndef __SSE2__
+       // catch 32-bit intptr_t overflow
+       if (sample_ct > 23169) {
+         goto plink_ret_NOMEM;
+@@ -1692,13 +1692,13 @@ int32_t plink(char* outname, char* outna
+ 
+     if (cluster_ct) {
+       ulii = cluster_ct + sample_ct - cluster_starts[cluster_ct];
+-#ifndef __LP64__
++#ifndef __SSE2__
+       if (ulii > 23169) {
+ 	goto plink_ret_NOMEM;
+       }
+ #endif
+       ulii = (ulii * (ulii - 1)) >> 1;
+-#ifndef __LP64__
++#ifndef __SSE2__
+     } else if (sample_ct > 23169) {
+       goto plink_ret_NOMEM;
+ #endif
+@@ -3056,7 +3056,7 @@ int32_t init_delim_and_species(uint32_t
+     break;
+   case SPECIES_DOG:
+     chrom_info_ptr->autosome_ct = 38;
+-#ifdef __LP64__
++#ifdef __SSE2__
+     chrom_info_ptr->haploid_mask[0] = 0x18000000000LLU;
+ #else
+     chrom_info_ptr->haploid_mask[1] = 0x180;
+@@ -3064,7 +3064,7 @@ int32_t init_delim_and_species(uint32_t
+     break;
+   case SPECIES_HORSE:
+     chrom_info_ptr->autosome_ct = 31;
+-#ifdef __LP64__
++#ifdef __SSE2__
+     chrom_info_ptr->haploid_mask[0] = 0x300000000LLU;
+ #else
+     chrom_info_ptr->haploid_mask[1] = 3;
+@@ -8292,7 +8292,7 @@ int32_t main(int32_t argc, char** argv)
+ 	  sprintf(logbuf, "Error: Invalid --memory parameter '%s' (minimum %u).\n", argv[cur_arg + 1], WKSPACE_MIN_MB);
+ 	  goto main_ret_INVALID_CMDLINE_WWA;
+ 	}
+-#ifndef __LP64__
++#ifndef __SSE2__
+ 	if (malloc_size_mb > 2047) {
+ 	  logerrprint("Error: --memory parameter too large for 32-bit version (max 2047).\n");
+ 	  goto main_ret_INVALID_CMDLINE;
+@@ -13200,7 +13200,7 @@ int32_t main(int32_t argc, char** argv)
+   } else if (malloc_size_mb < WKSPACE_MIN_MB) {
+     malloc_size_mb = WKSPACE_MIN_MB;
+   }
+-#ifndef __LP64__
++#ifndef __SSE2__
+   if (malloc_size_mb > 2047) {
+     malloc_size_mb = 2047;
+   }
+--- plink1.9-1.90~b3w-150903.orig/plink_assoc.c
++++ plink1.9-1.90~b3w-150903/plink_assoc.c
+@@ -34,7 +34,7 @@ void single_marker_cc_freqs(uintptr_t sa
+   uintptr_t loader2;
+   uintptr_t loader3;
+   uintptr_t loader4;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 60;
+   uintptr_t* lptr_6x_end;
+   sample_ctl2 -= sample_ctl2 % 6;
+@@ -136,7 +136,7 @@ void single_marker_cc_3freqs(uintptr_t s
+   uintptr_t loader;
+   uintptr_t loader2;
+   uintptr_t loader3;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 120;
+   uintptr_t* lptr_12x_end;
+   sample_ctl2 -= sample_ctl2 % 12;
+@@ -786,7 +786,7 @@ void transpose_perms(uintptr_t* perm_vec
+   //   next 4 bytes: 32 40 48...
+   uintptr_t sample_idx = 0;
+   uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t wbuf[4];
+   uint32_t* wbptr;
+ #else
+@@ -801,7 +801,7 @@ void transpose_perms(uintptr_t* perm_vec
+     pvptr = &(perm_vecs[sample_idx / BITCT2]);
+     rshift = 2 * (sample_idx % BITCT2);
+     goto transpose_perms_loop_start;
+-#ifdef __LP64__
++#ifdef __SSE2__
+     do {
+       if (!(perm_idx % 4)) {
+ 	if (perm_idx % 128) {
+@@ -840,7 +840,7 @@ void transpose_perms(uintptr_t* perm_vec
+ void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
+   uintptr_t sample_idx = 0;
+   uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t wbuf[4];
+   uint32_t* wbptr;
+ #else
+@@ -855,7 +855,7 @@ void transpose_perm1s(uintptr_t* perm_ve
+     pvptr = &(perm_vecs[sample_idx / BITCT]);
+     rshift = sample_idx % BITCT;
+     goto transpose_perm1s_loop_start;
+-#ifdef __LP64__
++#ifdef __SSE2__
+     do {
+       if (!(perm_idx % 4)) {
+ 	if (perm_idx % 128) {
+@@ -919,7 +919,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
+   // is called.
+   uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t perm_ct128x4 = perm_ct128 * 4;
+   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
+@@ -955,7 +955,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
+   uint32_t ujj;
+   uint32_t ukk;
+   uint32_t sample_type;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // 4- and 8-bit partial counts
+   gitv[0] = (__m128i*)thread_wkspace;
+   gitv[1] = &(((__m128i*)thread_wkspace)[perm_ct128x4]);
+@@ -992,7 +992,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
+       ujj = CTZLU(ulii) & (BITCT - 2); // get pos of next non-[hom A2] sample
+       sample_type = ((ulii >> ujj) & 3) - 1;
+       git_merge4 = gitv[sample_type];
+-#ifdef __LP64__
++#ifdef __SSE2__
+       perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
+       for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
+ 	loader = *perm_ptr++;
+@@ -1067,7 +1067,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
+ #endif
+       ulii &= ~((3 * ONELU) << ujj);
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     permsv = &(permsv[BITCT2 * perm_ct128]);
+ #else
+     permsv = &(permsv[BITCT2 * perm_ct32]);
+@@ -1075,7 +1075,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
+   }
+   for (sample_type = 0; sample_type < 3; sample_type++) {
+     uii = cur_cts[sample_type];
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if (uii % 15) {
+       git_merge4 = gitv[sample_type];
+       git_merge8 = gitv[sample_type + 3];
+@@ -1127,7 +1127,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
+ 
+ void calc_qgit(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
+   uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
+   uint32_t row_mult = perm_vec_ctcl8m / 4;
+ 
+@@ -1160,7 +1160,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
+     while (ulii) {
+       ujj = CTZLU(ulii) & (BITCT - 2);
+       sample_type = (ulii >> ujj) & 3;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       // note that the gain from using SSE2 for double-precision arithmetic is
+       // typically minimal because modern cores tend to have two FPUs, so we
+       // should only use it opportunistically.  it's painless here, though.
+@@ -1220,7 +1220,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
+ #endif
+       ulii &= ~((3 * ONELU) << ujj);
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
+ #else
+     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
+@@ -1230,7 +1230,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
+ 
+ void calc_qgit_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
+   uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
+   uint32_t row_mult = perm_vec_ctcl8m / 4;
+ 
+@@ -1263,7 +1263,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
+     while (ulii) {
+       ujj = CTZLU(ulii) & (BITCT - 2);
+       sample_type = (ulii >> ujj) & 3;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       perm_readv = &(permsv[ujj * row_mult]);
+       if (sample_type == 1) {
+ 	git_writev = (__m128d*)thread_bufs;
+@@ -1306,7 +1306,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
+ #endif
+       ulii &= ~((3 * ONELU) << ujj);
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
+ #else
+     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
+@@ -1314,7 +1314,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t rem_cost_60v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+@@ -1522,7 +1522,7 @@ uintptr_t rem_cost(uintptr_t sample_ctv2
+   uintptr_t detect_homcom;
+   uintptr_t result_a;
+   uintptr_t result_b;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 60;
+   uintptr_t* lptr_6x_end;
+   sample_ctv2 -= sample_ctv2 % 6;
+@@ -1583,7 +1583,7 @@ uintptr_t qrem_cost2(uintptr_t sample_ct
+   uintptr_t result_a;
+   uintptr_t result_b;
+   uintptr_t result_c;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 40;
+   uintptr_t* lptr_4x_end;
+   sample_ctl2 &= ~3LLU;
+@@ -1620,7 +1620,7 @@ uintptr_t qrem_cost2(uintptr_t sample_ct
+   return cost;
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ static inline void calc_rem_merge4_one(uint32_t perm_ct128, __m128i* __restrict__ perm_ptr, __m128i* __restrict__ rem_merge4) {
+   const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
+   __m128i loader;
+@@ -1788,7 +1788,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
+   // low 8 bits give index of first remv[] array to increment; next 8 bits give
+   // second index if nonzero, or indicate its absence
+   const uint32_t idx_table[3][4] = {{0x300, 0x102, 4, 5}, {0x500, 2, 0x104, 3}, {0, 0x502, 0x304, 1}};
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t perm_ct128x4 = perm_ct128 * 4;
+   uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
+@@ -1819,7 +1819,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
+   uint32_t uii;
+   uint32_t ujj;
+   uint32_t ukk;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   for (uii = 0; uii < 6; uii++) {
+     remv[uii] = &(((__m128i*)thread_wkspace)[uii * perm_ct128x4]);
+   }
+@@ -1860,7 +1860,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
+       idx1 = idx_table[cur_xor - 1][cur_raw];
+       idx2 = idx1 >> 8;
+       idx1 &= 255;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
+       if (!idx2) {
+ 	calc_rem_merge4_one(perm_ct128, perm_ptr, remv[idx1]);
+@@ -1917,7 +1917,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
+ #endif
+       ulxor &= ~((3 * ONELU) << ujj);
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     permsv = &(permsv[BITCT2 * perm_ct128]);
+ #else
+     permsv = &(permsv[BITCT2 * perm_ct32]);
+@@ -1925,7 +1925,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
+   }
+   for (idx1 = 0; idx1 < 6; idx1++) {
+     uii = cur_cts[idx1];
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if (uii % 15) {
+       calc_rem_merge8(perm_ct32, remv[idx1], remv[idx1 + 6]);
+     }
+@@ -1954,7 +1954,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
+ void calc_qrem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
+   uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+   uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
+   uint32_t row_mult = perm_vec_ctcl8m / 4;
+ 
+@@ -1993,7 +1993,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
+       ujj = CTZLU(ulxor) & (BITCT - 2);
+       cur_xor = (ulxor >> ujj) & 3;
+       cur_raw = (ulraw1 >> ujj) & 3;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       perm_readv = &(permsv[ujj * row_mult]);
+       rem_writev = (__m128d*)outbufs;
+       rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
+@@ -2208,7 +2208,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
+ #endif
+       ulxor &= ~((3 * ONELU) << ujj);
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
+ #else
+     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
+@@ -2219,7 +2219,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
+ void calc_qrem_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
+   uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+   uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
+   uint32_t row_mult = perm_vec_ctcl8m / 4;
+ 
+@@ -2264,7 +2264,7 @@ void calc_qrem_lin(uint32_t pheno_nm_ct,
+       ujj = CTZLU(ulxor) & (BITCT - 2);
+       cur_xor = (ulxor >> ujj) & 3;
+       cur_raw = (ulraw1 >> ujj) & 3;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       perm_readv = &(permsv[ujj * row_mult]);
+       if (cur_raw == 3) {
+ 	if (cur_xor == 1) {
+@@ -2589,7 +2589,7 @@ void calc_qrem_lin(uint32_t pheno_nm_ct,
+ #endif
+       ulxor &= ~((3 * ONELU) << ujj);
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
+ #else
+     perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
+@@ -3197,7 +3197,7 @@ THREAD_RET_TYPE assoc_maxt_thread(void*
+   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
+   uint32_t model_fisher = g_model_fisher;
+   uint32_t fisher_midp = g_fisher_midp;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -3359,13 +3359,13 @@ THREAD_RET_TYPE assoc_maxt_thread(void*
+ 	  ldrefs[marker_idx] = ldref;
+ 	}
+ 	if (ldref == marker_bidx) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	  fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+ 	  fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+ 	  calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	  fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+ 	  fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -3478,7 +3478,7 @@ THREAD_RET_TYPE assoc_set_thread(void* a
+   uint32_t assoc_thread_ct = g_assoc_thread_ct;
+   uintptr_t perm_vec_ct = g_perm_vec_ct;
+   uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -3556,13 +3556,13 @@ THREAD_RET_TYPE assoc_set_thread(void* a
+ 	git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
+ 	git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
+ 	git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+ 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -4619,7 +4619,7 @@ THREAD_RET_TYPE model_maxt_domrec_thread
+   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
+   uint32_t model_fisher = g_model_fisher;
+   uint32_t fisher_midp = g_fisher_midp;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -4760,13 +4760,13 @@ THREAD_RET_TYPE model_maxt_domrec_thread
+ 	ldrefs[marker_idx] = ldref;
+       }
+       if (ldref == marker_bidx) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+ 	calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -4862,7 +4862,7 @@ THREAD_RET_TYPE model_set_domrec_thread(
+   uint32_t assoc_thread_ct = g_assoc_thread_ct;
+   uintptr_t perm_vec_ct = g_perm_vec_ct;
+   uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -4929,13 +4929,13 @@ THREAD_RET_TYPE model_set_domrec_thread(
+       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
+       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
+       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+       fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+       fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -5104,7 +5104,7 @@ THREAD_RET_TYPE model_maxt_trend_thread(
+   uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+   uint32_t assoc_thread_ct = g_assoc_thread_ct;
+   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -5221,13 +5221,13 @@ THREAD_RET_TYPE model_maxt_trend_thread(
+ 	ldrefs[marker_idx] = ldref;
+       }
+       if (ldref == marker_bidx) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+ 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -5299,7 +5299,7 @@ THREAD_RET_TYPE model_set_trend_thread(v
+   uint32_t assoc_thread_ct = g_assoc_thread_ct;
+   uintptr_t perm_vec_ct = g_perm_vec_ct;
+   uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -5359,13 +5359,13 @@ THREAD_RET_TYPE model_set_trend_thread(v
+       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
+       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
+       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+       fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+       fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -5542,7 +5542,7 @@ THREAD_RET_TYPE model_maxt_gen_thread(vo
+   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
+   uint32_t model_fisher = g_model_fisher;
+   uint32_t fisher_midp = g_fisher_midp;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -5665,13 +5665,13 @@ THREAD_RET_TYPE model_maxt_gen_thread(vo
+ 	ldrefs[marker_idx] = ldref;
+       }
+       if (ldref == marker_bidx) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+ 	calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -5971,7 +5971,7 @@ THREAD_RET_TYPE model_maxt_best_thread(v
+   uint32_t pidx_offset = g_perms_done - perm_vec_ct;
+   uint32_t model_fisher = g_model_fisher;
+   uint32_t fisher_midp = g_fisher_midp;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -6112,13 +6112,13 @@ THREAD_RET_TYPE model_maxt_best_thread(v
+ 	ldrefs[marker_idx] = ldref;
+       }
+       if (ldref == marker_bidx) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+ 	fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+ 	calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+ 	fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -6307,7 +6307,7 @@ THREAD_RET_TYPE model_set_best_thread(vo
+   uint32_t assoc_thread_ct = g_assoc_thread_ct;
+   uintptr_t perm_vec_ct = g_perm_vec_ct;
+   uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
+ #else
+@@ -6381,13 +6381,13 @@ THREAD_RET_TYPE model_set_best_thread(vo
+       git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
+       git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
+       git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
+ #else
+       fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
+ #endif
+       calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
+ #else
+       fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
+@@ -6612,7 +6612,7 @@ int32_t model_assoc_set_test(pthread_t*
+   join_threads(threads, assoc_thread_ct);
+   g_assoc_thread_ct = max_thread_ct;
+   g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
+-#ifdef __LP64__
++#ifdef __SSE2__
+   ulii = ((perm_vec_ct + 127) / 128) * 16;
+   g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
+ #else
+@@ -6622,7 +6622,7 @@ int32_t model_assoc_set_test(pthread_t*
+ #endif
+   g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
+   transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
+ #else
+   fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
+@@ -7211,7 +7211,7 @@ int32_t model_assoc(pthread_t* threads,
+       if (!g_ldrefs) {
+ 	goto model_assoc_ret_NOMEM;
+       }
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 3) / 4);
+ #else
+       fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 1) / 2);
+@@ -7408,7 +7408,7 @@ int32_t model_assoc(pthread_t* threads,
+       ulii = (perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
+       g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
+       g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii = ((perm_vec_ct + 127) / 128) * 16;
+       g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
+ #else
+@@ -7418,7 +7418,7 @@ int32_t model_assoc(pthread_t* threads,
+ #endif
+       g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
+       transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
+ #else
+       fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
+@@ -9037,7 +9037,7 @@ int32_t qassoc(pthread_t* threads, FILE*
+     if (!g_ldrefs) {
+       goto qassoc_ret_NOMEM;
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 3) / 4);
+ #else
+     fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 1) / 2);
+@@ -10433,7 +10433,7 @@ void calc_git_missing(uint32_t pheno_nm_
+   // thread_wkspace[] is assumed to be zeroed out before this function is
+   // called.
+   uint32_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
+   uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
+   uint32_t perm_ct128x4 = perm_ct128 * 4;
+@@ -10466,7 +10466,7 @@ void calc_git_missing(uint32_t pheno_nm_
+   uint32_t pbidx;
+   uint32_t uii;
+   uint32_t ujj;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // 4- and 8-bit partial counts
+   gitv[0] = &(((__m128i*)thread_wkspace)[8 * perm_ct128x4]);
+   gitv[1] = &(((__m128i*)thread_wkspace)[9 * perm_ct128x4]);
+@@ -10488,7 +10488,7 @@ void calc_git_missing(uint32_t pheno_nm_
+     while (ulii) {
+       ujj = CTZLU(ulii);
+       git_merge4 = gitv[0];
+-#ifdef __LP64__
++#ifdef __SSE2__
+       perm_ptr = &(permsv[ujj * perm_ct128]);
+       for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
+ 	loader = *perm_ptr++;
+@@ -10561,13 +10561,13 @@ void calc_git_missing(uint32_t pheno_nm_
+ #endif
+       ulii &= ulii - 1;
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     permsv = &(permsv[BITCT * perm_ct128]);
+ #else
+     permsv = &(permsv[BITCT * perm_ct32]);
+ #endif
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (cur_ct % 15) {
+     git_merge4 = gitv[0];
+     git_merge8 = gitv[1];
+@@ -11325,7 +11325,7 @@ int32_t testmiss(pthread_t* threads, FIL
+     if (perm_maxt) {
+       ulii = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
+       g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii = ((g_perm_vec_ct + 127) / 128) * 16;
+       g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
+ #else
+@@ -11335,7 +11335,7 @@ int32_t testmiss(pthread_t* threads, FIL
+ #endif
+       g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 44 * max_thread_ct);
+       transpose_perm1s(g_perm_vecs, g_perm_vec_ct, pheno_nm_ct, g_perm_vecst);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, (ulii / 2) * 11 * max_thread_ct);
+ #else
+       fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 11 * max_thread_ct);
+@@ -12552,7 +12552,7 @@ int32_t cmh2_assoc(FILE* bedfile, uintpt
+     logerrprint("Error: --mh2 requires at least two cases and two controls.\n");
+     goto cmh2_assoc_ret_INVALID_CMDLINE;
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (cluster_ct1 > 46341) {
+     // might actually be ok, but play it safe in case LAPACK matrix inversion
+     // routine has an integer overflow here
+--- plink1.9-1.90~b3w-150903.orig/plink_calc.c
++++ plink1.9-1.90~b3w-150903/plink_calc.c
+@@ -132,7 +132,7 @@ void update_rel_ibc(double* rel_ibc, uin
+   double* weights2 = &(weights[128]);
+   double* weights3 = &(weights[256]);
+   double* weights4 = &(weights[320]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   double* weights5 = &(weights[384]);
+   double* weights6 = &(weights[448]);
+   double* weights7 = &(weights[512]);
+@@ -207,7 +207,7 @@ void update_rel_ibc(double* rel_ibc, uin
+   }
+   for (ukk = 0; ukk < (BITCT * 5) / 32; ukk++) {
+     wtptr = &(wtarr[16 * ukk]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if ((ukk == 2) || (ukk == 7)) {
+       for (uii = 0; uii < 8; uii++) {
+ 	twt = wtptr[uii + 8];
+@@ -245,7 +245,7 @@ void update_rel_ibc(double* rel_ibc, uin
+   }
+   for (umm = 0; umm < sample_ct; umm++) {
+     ulii = *geno++;
+-#ifdef __LP64__
++#ifdef __SSE2__
+     *rel_ibc += weights9[ulii >> 57] + weights8[(ulii >> 51) & 63] + weights7[(ulii >> 44) & 127] + weights6[(ulii >> 38) & 63] + weights5[(ulii >> 32) & 63] + weights4[(ulii >> 25) & 63] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
+ #else
+     *rel_ibc += weights4[ulii >> 25] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
+@@ -263,7 +263,7 @@ void fill_subset_weights(double* subset_
+   uint32_t uoo;
+   double wtarr[MULTIPLEX_DIST_EXP / 2];
+   double* wt;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   double twt[5];
+   double twtf;
+   __m128d* swpairs = (__m128d*)subset_weights;
+@@ -278,7 +278,7 @@ void fill_subset_weights(double* subset_
+   memcpy(wtarr, main_weights, (MULTIPLEX_DIST_EXP / 2) * sizeof(double));
+   for (uoo = 0; uoo < 2; uoo++) {
+     wt = &(wtarr[7 * uoo]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+     vfinal1 = _mm_set_pd(wt[0], 0.0);
+     vfinal2 = _mm_set_pd(wt[0] * 2, wt[0]);
+ #endif
+@@ -309,7 +309,7 @@ void fill_subset_weights(double* subset_
+ 	      if (unn & 1) {
+ 		twt[4] += wt[2];
+ 	      }
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	      twtf = twt[4];
+ 	      vpen = _mm_set1_pd(twtf);
+ 	      *swpairs++ = _mm_add_pd(vpen, vfinal1);
+@@ -346,7 +346,7 @@ void fill_subset_weights(double* subset_
+       }
+     }
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   for (uoo = 0; uoo < 3; uoo++) {
+     wt = &(wtarr[14 + 6 * uoo]);
+     vfinal1 = _mm_set_pd(wt[0], 0.0);
+@@ -415,7 +415,7 @@ void fill_subset_weights_r(double* subse
+   double mean_m2;
+   double mult = 1.0;
+   double aux;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128d* swpairs = (__m128d*)subset_weights;
+   __m128d vpen;
+   __m128d vfinal1;
+@@ -492,7 +492,7 @@ void fill_subset_weights_r(double* subse
+   }
+   for (unn = 0; unn < BITCT / 16; unn++) {
+     wtptr = &(wtarr[40 * unn]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+     vfinal1 = _mm_load_pd(wtptr);
+     vfinal2 = _mm_load_pd(&(wtptr[2]));
+     vfinal3 = _mm_load_pd(&(wtptr[4]));
+@@ -506,7 +506,7 @@ void fill_subset_weights_r(double* subse
+           twt3 = twt2 + wtptr[ukk + 16];
+           for (umm = 0; umm < 8; umm++) {
+             twt4 = twt3 + wtptr[umm + 8];
+-#ifdef __LP64__
++#ifdef __SSE2__
+             vpen = _mm_set1_pd(twt4);
+             *swpairs++ = _mm_add_pd(vpen, vfinal1);
+             *swpairs++ = _mm_add_pd(vpen, vfinal2);
+@@ -572,7 +572,7 @@ static inline void collapse_copy_phenod_
+   } while (target < target_end);
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ // XOR + mask variants of vectorized Lauradoux/Walisch popcount.  (See
+ // popcount_vecs() in plink_common.c for basic documentation.)
+ // Note that the size of the popcounted buffer is a hardcoded constant
+@@ -852,7 +852,7 @@ void ibs_test_process_perms(uintptr_t* p
+       do {
+ 	sub_block_idx = 0;
+ 	ulii = *perm_row_start++;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	dxx = psbuf[(uint8_t)ulii] + psbuf[256 + ((uint8_t)(ulii >> 8))] + psbuf[512 + ((uint8_t)(ulii >> 16))] + psbuf[768 + ((uint8_t)(ulii >> 24))] + psbuf[1024 + ((uint8_t)(ulii >> 32))] + psbuf[1280 + ((uint8_t)(ulii >> 40))] + psbuf[1536 + ((uint8_t)(ulii >> 48))] + psbuf[1792 + (ulii >> 56)];
+ #else
+         dxx = psbuf[(uint8_t)ulii] + psbuf[256 + ((uint8_t)(ulii >> 8))] + psbuf[512 + ((uint8_t)(ulii >> 16))] + psbuf[768 + (ulii >> 24)];
+@@ -969,7 +969,7 @@ THREAD_RET_TYPE ibs_test_thread(void* ar
+ }
+ 
+ void incr_dists_i(uint32_t* idists, uintptr_t* geno, uintptr_t* masks, uint32_t start_idx, uint32_t end_idx) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* glptr;
+   __m128i* glptr2;
+   __m128i* mptr;
+@@ -986,7 +986,7 @@ void incr_dists_i(uint32_t* idists, uint
+   uintptr_t mask_fixed;
+   for (uii = start_idx; uii < end_idx; uii++) {
+     jj = uii * (MULTIPLEX_2DIST / BITCT);
+-#ifdef __LP64__
++#ifdef __SSE2__
+     glptr = (__m128i*)geno;
+     glptr2 = (__m128i*)(&(geno[jj]));
+     lptr = &(masks[jj]);
+@@ -1104,7 +1104,7 @@ THREAD_RET_TYPE calc_ibs_thread(void* ar
+ }
+ 
+ void incr_genome(uint32_t* genome_main, uintptr_t* geno, uintptr_t* masks, uintptr_t sample_ct, uint32_t start_idx, uint32_t end_idx) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
+@@ -1161,14 +1161,14 @@ void incr_genome(uint32_t* genome_main,
+   uintptr_t* marker_window_ptr;
+   int32_t lowct2 = g_ctrl_ct * 2;
+   int32_t highct2 = g_case_ct * 2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   glptr_end = (__m128i*)(&(geno[sample_ct * (GENOME_MULTIPLEX2 / BITCT)]));
+ #else
+   glptr_end = &(geno[sample_ct * (GENOME_MULTIPLEX2 / BITCT)]);
+ #endif
+   for (uii = start_idx; uii < end_idx; uii++) {
+     ujj = uii * (GENOME_MULTIPLEX2 / BITCT);
+-#ifdef __LP64__
++#ifdef __SSE2__
+     glptr_fixed = (__m128i*)(&(geno[ujj]));
+     glptr = (__m128i*)(&(geno[ujj + (GENOME_MULTIPLEX2 / BITCT)]));
+     lptr = &(masks[ujj]);
+@@ -1194,7 +1194,7 @@ void incr_genome(uint32_t* genome_main,
+ 	glptr_back = (uintptr_t*)glptr;
+ 	glptr_fixed_tmp = glptr_fixed;
+ 	maskptr_fixed_tmp = maskptr_fixed;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	acc_ibs1.vi = _mm_setzero_si128();
+ 	acc_ibs0.vi = _mm_setzero_si128();
+ 	do {
+@@ -1356,7 +1356,7 @@ void incr_genome(uint32_t* genome_main,
+ 	xor_ptr = xor_buf;
+ 	glptr_back = (uintptr_t*)glptr;
+ 	glptr_fixed_tmp = glptr_fixed;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	acc_ibs1.vi = _mm_setzero_si128();
+ 	acc_ibs0.vi = _mm_setzero_si128();
+ 	do {
+@@ -1549,7 +1549,7 @@ void incr_dists(double* dists, uintptr_t
+   uintptr_t uljj;
+   uintptr_t* mptr;
+   double* weights1 = &(weights[16384]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   double* weights2 = &(weights[32768]);
+   double* weights3 = &(weights[36864]);
+   double* weights4 = &(weights[40960]);
+@@ -1561,7 +1561,7 @@ void incr_dists(double* dists, uintptr_t
+     ulii = geno[uii];
+     mptr = masks;
+     mask_fixed = masks[uii];
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if (mask_fixed == ~ZEROLU) {
+       for (ujj = 0; ujj < uii; ujj++) {
+ 	uljj = (*glptr++ ^ ulii) & (*mptr++);
+@@ -1628,7 +1628,7 @@ void incr_dists_r(double* dists, uintptr
+   uintptr_t uljj;
+   uintptr_t basemask;
+   double* weights1 = &(weights[32768]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   double* weights2 = &(weights[65536]);
+   double* weights3 = &(weights[98304]);
+ #endif
+@@ -1642,7 +1642,7 @@ void incr_dists_r(double* dists, uintptr
+     if (!basemask) {
+       for (ujj = 0; ujj < uii; ujj++) {
+ 	uljj = ((*glptr++) + ulii) | (*maskptr++);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	*dists += weights[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
+ #else
+ 	*dists += weights[(uint16_t)uljj] + weights1[uljj >> 16];
+@@ -1652,7 +1652,7 @@ void incr_dists_r(double* dists, uintptr
+     } else {
+       for (ujj = 0; ujj < uii; ujj++) {
+         uljj = ((*glptr++) + ulii) | ((*maskptr++) | basemask);
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	*dists += weights[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
+ #else
+ 	*dists += weights[(uint16_t)uljj] + weights1[uljj >> 16];
+@@ -1803,7 +1803,7 @@ void pick_d(unsigned char* cbuf, uint32_
+   uint32_t ujj;
+   uint32_t ukk;
+   memset(cbuf, 0, ct);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   ukk = (uint32_t)(0x100000000LLU % ct);
+ #else
+   ukk = 2 * (0x80000000U % ct);
+@@ -2116,14 +2116,14 @@ void matrix_const_mult_add(uint32_t samp
+   uint32_t loop_end = sample_ct - 1;
+   uint32_t ujj;
+   double* dptr = matrix;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128d* vptr;
+   __m128d v_mult_val = _mm_set1_pd(mult_val);
+ #endif
+   for (uii = 0; uii < loop_end; uii++) {
+     *dptr = (*dptr) * mult_val + add_val;
+     dptr++;
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if ((uintptr_t)dptr & 8) {
+       *dptr *= mult_val;
+       dptr++;
+@@ -2727,7 +2727,7 @@ int32_t ibs_test_calc(pthread_t* threads
+   double perm_ct_recip;
+   uintptr_t ulii;
+   uintptr_t uljj = 0;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128d* rvptr1;
+   __m128d* rvptr2;
+ #else
+@@ -2802,7 +2802,7 @@ int32_t ibs_test_calc(pthread_t* threads
+     ctrl_ctrl_ssq += g_calc_result[tidx][1];
+     ctrl_case_ssq += g_calc_result[tidx][2];
+     case_case_ssq += g_calc_result[tidx][3];
+-#ifdef __LP64__
++#ifdef __SSE2__
+     rvptr1 = (__m128d*)perm_results;
+     rvptr2 = (__m128d*)(&(perm_results[2 * perm_ctcldm * tidx]));
+     for (perm_idx = 0; perm_idx < perm_ct; perm_idx++) {
+@@ -4542,7 +4542,7 @@ int32_t distance_d_write(FILE** outfile_
+   } else {
+     if (shape == DISTANCE_SQ0) {
+       // assume little-endian
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii = 0x3009300930093009LLU;
+ #else
+       ulii = 0x30093009;
+@@ -5923,7 +5923,7 @@ int32_t rel_cutoff_batch(uint32_t load_g
+   fclose_null(&idfile);
+   ullii = sample_ct;
+   ullii = ((ullii * (ullii - 1)) / 2 + BITCT - 1) / BITCT;
+-#ifndef __LP64__
++#ifndef __SSE2__
+   if (ullii >= 0x20000000) {
+     goto rel_cutoff_batch_ret_NOMEM;
+   }
+@@ -7353,7 +7353,7 @@ int32_t calc_rel(pthread_t* threads, uin
+ 	  //   cptr2[uii] = '\t';
+ 	  //   cptr2[uii + 1] = '0';
+ 	  // }
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	  ulii = 0x3009300930093009LLU;
+ #else
+ 	  ulii = 0x30093009LU;
+@@ -8077,7 +8077,7 @@ int32_t calc_distance(pthread_t* threads
+   llxx = g_thread_start[dist_thread_ct];
+   llxx = ((llxx * (llxx - 1)) - (int64_t)g_thread_start[0] * (g_thread_start[0] - 1)) / 2;
+   dists_alloc = llxx * sizeof(double);
+-#ifndef __LP64__
++#ifndef __SSE2__
+   if (dists_alloc > 0x7fffffff) {
+     goto calc_distance_ret_NOMEM;
+   }
+@@ -8236,7 +8236,7 @@ int32_t calc_distance(pthread_t* threads
+     goto calc_distance_ret_NOMEM;
+   }
+   if (main_weights) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if (wkspace_alloc_d_checked(&subset_weights, 45056 * sizeof(double))) {
+       goto calc_distance_ret_NOMEM;
+     }
+@@ -8424,7 +8424,7 @@ int32_t calc_distance(pthread_t* threads
+ 		*giptr3 += wtbuf[umm + ukk];
+ 	      }
+ 	    }
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	    ulii ^= FIVEMASK;
+ 	    *glptr++ = ulii;
+ 	    ulii = (ulii | (ulii >> 1)) & FIVEMASK;
+@@ -8725,7 +8725,7 @@ int32_t calc_cluster_neighbor(pthread_t*
+   // as a special case in the future.
+   FILE* outfile = NULL;
+   uint32_t* cluster_sorted_ibs_indices = NULL;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // uint64_t* cluster_sorted_ibs_indices_big = NULL;
+ #endif
+   uint32_t* sample_to_cluster = NULL;
+@@ -9457,7 +9457,7 @@ int32_t calc_cluster_neighbor(pthread_t*
+   logprint("Clustering...");
+   printf(" [sorting IB%c values]", cluster_missing? 'M' : 'S');
+   fflush(stdout);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (cur_cluster_ct <= 65536) {
+ #endif
+     // Objective: Produce a list of inter-cluster IBS values sorted in
+@@ -9488,7 +9488,7 @@ int32_t calc_cluster_neighbor(pthread_t*
+       // f(0) = 1
+       // f(1) = f(2) = 2
+       // f(3) = f(4) = f(5) = 3... (triangle_divide() with different rounding)
+-#ifdef __LP64__
++#ifdef __SSE2__
+       umm = (int32_t)sqrt((intptr_t)(tcoord * 2));
+ #else
+       umm = (int32_t)sqrt(2 * ((double)((intptr_t)tcoord)));
+@@ -9601,7 +9601,7 @@ int32_t calc_cluster_neighbor(pthread_t*
+ 	cluster_index[tri_coord_no_diag_32(ukk & 65535, ukk >> 16)] = uii + 1;
+       }
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   } else {
+     logerrprint("Error: --cluster cannot handle >65536 initial clusters yet.\n");
+     retval = RET_CALC_NOT_YET_SUPPORTED;
+--- plink1.9-1.90~b3w-150903.orig/plink_cnv.c
++++ plink1.9-1.90~b3w-150903/plink_cnv.c
+@@ -13,7 +13,7 @@ int32_t cnv_subset_load(char* subset_fna
+     logerrprint("Error: Empty --cnv-subset file.\n");
+     goto cnv_subset_load_ret_INVALID_FORMAT;
+   }
+-#ifndef __LP64__
++#ifndef __SSE2__
+   if (((uint64_t)subset_ct) * max_subset_name_len > 0x7fffffffLLU) {
+     goto cnv_subset_load_ret_NOMEM;
+   }
+@@ -625,7 +625,7 @@ int32_t cnv_make_map(FILE* cnvfile, char
+   }
+   for (ulii = 1; ulii < raw_marker_ct; ulii++) {
+     if (marker_pos_arr[ulii] != llii) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+       if ((++distinct_marker_ct) == 0x80000000U) {
+ 	logprint("\n");
+ 	logerrprint("Error: Too many distinct .cnv.map positions (max 2^31 - 1).\n");
+@@ -983,7 +983,7 @@ int32_t plink_cnv(char* outname, char* o
+     }
+   }
+   ulii = marker_chrom_start[chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct];
+-#ifndef __LP64__
++#ifndef __SSE2__
+   if (((uint64_t)ulii) * max_marker_id_len > 0x7fffffffLLU) {
+     goto plink_cnv_ret_NOMEM;
+   }
+--- plink1.9-1.90~b3w-150903.orig/plink_common.c
++++ plink1.9-1.90~b3w-150903/plink_common.c
+@@ -30,7 +30,7 @@ uintptr_t g_sample_ct;
+ uint32_t g_thread_ct;
+ 
+ uint32_t aligned_malloc(uintptr_t** aligned_pp, uintptr_t size) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // Avoid random segfaults on 64-bit machines which have 8-byte- instead of
+   // 16-byte-aligned malloc().  (Slightly different code is needed if malloc()
+   // does not even guarantee 8-byte alignment.)
+@@ -51,7 +51,7 @@ uint32_t aligned_malloc(uintptr_t** alig
+ }
+ 
+ void aligned_free(uintptr_t* aligned_pp) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   free((uintptr_t*)(aligned_pp[-1]));
+ #else
+   free(aligned_pp);
+@@ -3349,7 +3349,7 @@ uint32_t next_unset_unsafe(uintptr_t* bi
+   return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(~ulii);
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
+   uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
+   uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
+@@ -3383,7 +3383,7 @@ uint32_t next_unset(uintptr_t* bit_arr,
+   return MINV(loc, ceil);
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
+   uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
+   uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
+@@ -3416,7 +3416,7 @@ uint32_t next_set_unsafe(uintptr_t* bit_
+   return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(ulii);
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
+   uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
+   uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
+@@ -3450,7 +3450,7 @@ uint32_t next_set(uintptr_t* bit_arr, ui
+   return MINV(rval, ceil);
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
+   uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
+   uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
+@@ -3718,7 +3718,7 @@ int32_t populate_id_htable(uintptr_t unf
+       }
+     }
+   } else {
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if (wkspace_left >= 0x400000000LLU) {
+       max_extra_alloc = 0xfffffffeU;
+     } else {
+@@ -3888,7 +3888,7 @@ void fill_vec_55(uintptr_t* vec, uint32_
+   uint32_t ctl = 2 * ((ct + (BITCT - 1)) / BITCT);
+   uint32_t rem = ct & (BITCT - 1);
+   uintptr_t* second_to_last = &(vec[ctl - 2]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   __m128i* vecp = (__m128i*)vec;
+   __m128i* vec_end = (__m128i*)(&(vec[ctl]));
+@@ -4009,7 +4009,7 @@ void sample_delim_convert(uintptr_t unfi
+ void get_set_wrange_align(uintptr_t* bitfield, uintptr_t word_ct, uintptr_t* firstw_ptr, uintptr_t* wlen_ptr) {
+   uintptr_t* bitfield_ptr = bitfield;
+   uintptr_t* bitfield_end = &(bitfield[word_ct]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t* bitfield_end2 = &(bitfield[word_ct & (~ONELU)]);
+   while (bitfield_ptr < bitfield_end2) {
+     if (bitfield_ptr[0] || bitfield_ptr[1]) {
+@@ -5223,7 +5223,7 @@ void bitfield_exclude_to_include(uintptr
+ void bitfield_and(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
+   // vv := vv AND include_vec
+   // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* vv128 = (__m128i*)vv;
+   __m128i* iv128 = (__m128i*)include_vec;
+   __m128i* vv128_end = &(vv128[word_ct / 2]);
+@@ -5247,7 +5247,7 @@ void bitfield_andnot(uintptr_t* vv, uint
+   // vv := vv ANDNOT exclude_vec
+   // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
+   // note that this is the reverse of the _mm_andnot() operand order
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* vv128 = (__m128i*)vv;
+   __m128i* ev128 = (__m128i*)exclude_vec;
+   __m128i* vv128_end = &(vv128[word_ct / 2]);
+@@ -5270,7 +5270,7 @@ void bitfield_andnot(uintptr_t* vv, uint
+ void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
+   // vv := (~vv) AND include_vec
+   // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* vv128 = (__m128i*)vv;
+   __m128i* iv128 = (__m128i*)include_vec;
+   __m128i* vv128_end = &(vv128[word_ct / 2]);
+@@ -5294,7 +5294,7 @@ void bitfield_andnot_reversed_args(uintp
+ void bitfield_or(uintptr_t* vv, uintptr_t* or_vec, uintptr_t word_ct) {
+   // vv := vv OR include_vec
+   // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* vv128 = (__m128i*)vv;
+   __m128i* ov128 = (__m128i*)or_vec;
+   __m128i* vv128_end = &(vv128[word_ct / 2]);
+@@ -5317,7 +5317,7 @@ void bitfield_or(uintptr_t* vv, uintptr_
+ void bitfield_ornot(uintptr_t* vv, uintptr_t* inverted_or_vec, uintptr_t word_ct) {
+   // vv := vv OR (~inverted_or_vec)
+   // on 64-bit systems, assumes vv and inverted_or_vec are 16-byte aligned
+-#ifdef __LP64__
++#ifdef __SSE2__
+ #ifdef __APPLE__
+   const __m128i all1 = {0xffffffffffffffffLLU, 0xffffffffffffffffLLU};
+ #else
+@@ -5345,7 +5345,7 @@ void bitfield_ornot(uintptr_t* vv, uintp
+ void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct) {
+   // bit_arr := bit_arr XOR xor_arr
+   // on 64-bit systems, assumes bit_arr and xor_arr are 16-byte aligned
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* bitv128 = (__m128i*)bit_arr;
+   __m128i* xorv128 = (__m128i*)xor_arr;
+   __m128i* bitv128_end = &(bitv128[word_ct / 2]);
+@@ -5539,7 +5539,7 @@ uint32_t has_three_genotypes(uintptr_t*
+ }
+ */
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ // Basic SSE2 implementation of Lauradoux/Walisch popcount.
+ static inline uintptr_t popcount_vecs(__m128i* vptr, uintptr_t ct) {
+   // popcounts vptr[0..(ct-1)].  Assumes ct is a multiple of 3 (0 ok).
+@@ -5730,7 +5730,7 @@ uintptr_t popcount_longs(uintptr_t* lptr
+   // index.
+   uintptr_t tot = 0;
+   uintptr_t* lptr_end = &(lptr[word_ct]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t six_ct;
+   __m128i* vptr;
+   vptr = (__m128i*)lptr;
+@@ -5788,7 +5788,7 @@ uintptr_t popcount2_longs(uintptr_t* lpt
+   // treats lptr[] as an array of two-bit instead of one-bit numbers
+   uintptr_t tot = 0;
+   uintptr_t* lptr_end = &(lptr[word_ct]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t twelve_ct;
+   __m128i* vptr;
+   vptr = (__m128i*)lptr;
+@@ -6035,7 +6035,7 @@ uintptr_t jump_forward_unset_unsafe(uint
+   uintptr_t* bptr = &(bit_arr[widx]);
+   uintptr_t uljj;
+   uintptr_t ulkk;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* vptr;
+ #endif
+   if (ulii) {
+@@ -6055,7 +6055,7 @@ uintptr_t jump_forward_unset_unsafe(uint
+     bptr++;
+   }
+   ulii = 0;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (widx & 1) {
+     uljj = ~(*bptr);
+     ulkk = popcount_long(uljj);
+@@ -6101,7 +6101,7 @@ uintptr_t popcount_longs_exclude(uintptr
+   // N.B. on 64-bit systems, assumes lptr and exclude_arr are 16-byte aligned.
+   uintptr_t tot = 0;
+   uintptr_t* lptr_end = &(lptr[end_idx]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t six_ct = end_idx / 6;
+   tot += popcount_vecs_exclude((__m128i*)lptr, (__m128i*)exclude_arr, six_ct * 3);
+   lptr = &(lptr[six_ct * 6]);
+@@ -6151,7 +6151,7 @@ uintptr_t popcount_longs_exclude(uintptr
+ uintptr_t popcount_longs_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t word_ct) {
+   uintptr_t tot = 0;
+   uintptr_t* lptr1_end = &(lptr1[word_ct]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t six_ct = word_ct / 6;
+   tot += popcount_vecs_intersect((__m128i*)lptr1, (__m128i*)lptr2, six_ct * 3);
+   lptr1 = &(lptr1[six_ct * 6]);
+@@ -6213,7 +6213,7 @@ void vertical_bitct_subtract(uintptr_t*
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp) {
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
+@@ -6594,7 +6594,7 @@ void count_3freq_12(uintptr_t* lptr, uin
+ }
+ #endif
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ void count_set_freq_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
+@@ -7310,7 +7310,7 @@ void vec_set_freq(uintptr_t sample_ctl2,
+   uintptr_t missing_incr;
+   uint32_t acc = 0;
+   uint32_t accm = 0;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 60;
+   uintptr_t* lptr_6x_end;
+   sample_ctl2 -= sample_ctl2 % 6;
+@@ -7356,7 +7356,7 @@ void vec_set_freq_x(uintptr_t sample_ctl
+   uintptr_t missing_incr;
+   uint32_t acc = 0;
+   uint32_t accm = 0;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 60;
+   uintptr_t* lptr_6x_end;
+   sample_ctl2 -= sample_ctl2 % 6;
+@@ -7408,7 +7408,7 @@ void vec_set_freq_y(uintptr_t sample_ctl
+   uintptr_t loader4;
+   uint32_t acc = 0;
+   uint32_t accm = 0;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 120;
+   uintptr_t* lptr_12x_end;
+   sample_ctl2 -= sample_ctl2 % 12;
+@@ -7455,7 +7455,7 @@ void vec_3freq(uintptr_t sample_ctl2, ui
+   uint32_t acc_even = 0;
+   uint32_t acc_odd = 0;
+   uint32_t acc_and = 0;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 120;
+   uintptr_t* lptr_12x_end;
+   sample_ctl2 -= sample_ctl2 % 12;
+@@ -7497,7 +7497,7 @@ uintptr_t count_01(uintptr_t* lptr, uint
+   // unlike popcount01_longs, this does not assume lptr[] has no 11s
+   uintptr_t* lptr_end = &(lptr[word_ct]);
+   uintptr_t loader;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t acc;
+   word_ct -= word_ct % 12;
+   acc = count_01_vecs((__m128i*)lptr, word_ct / 2);
+@@ -7818,7 +7818,7 @@ void reverse_loadbuf(unsigned char* load
+   uint32_t* loadbuf_alias32;
+   uint32_t uii;
+   uint32_t ujj;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   __m128i* loadbuf_alias;
+   __m128i vii;
+@@ -8044,7 +8044,7 @@ void vec_include_init(uintptr_t unfilter
+     ulmm = FIVEMASK;
+     if (ulii) {
+       uljj = ulii >> BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii &= 0xffffffffLLU;
+ #else
+       ulii &= 0xffffLU;
+@@ -8092,7 +8092,7 @@ void exclude_to_vec_include(uintptr_t un
+     ulmm = FIVEMASK;
+     if (ulii) {
+       uljj = ulii >> BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii &= 0xffffffffLLU;
+ #else
+       ulii &= 0xffffLU;
+@@ -8133,7 +8133,7 @@ void vec_init_invert(uintptr_t entry_ct,
+   uint32_t vec_wsize = 2 * ((entry_ct + (BITCT - 1)) / BITCT);
+   uintptr_t* second_to_last = &(target_arr[vec_wsize - 2]);
+   uint32_t rem = entry_ct & (BITCT - 1);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   __m128i* tptr = (__m128i*)target_arr;
+   __m128i* sptr = (__m128i*)source_arr;
+@@ -8158,7 +8158,7 @@ void vec_init_invert(uintptr_t entry_ct,
+ void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr) {
+   // target_arr := source_arr ANDNOT exclude_arr
+   // may write an extra word
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* tptr = (__m128i*)target_arr;
+   __m128i* sptr = (__m128i*)source_arr;
+   __m128i* xptr = (__m128i*)exclude_arr;
+@@ -8187,7 +8187,7 @@ void vec_include_mask_in(uintptr_t unfil
+     ulmm = include_arr[1];
+     if (ulii) {
+       uljj = ulii >> BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii &= 0xffffffffLLU;
+ #else
+       ulii &= 0xffffLU;
+@@ -8225,7 +8225,7 @@ void vec_include_mask_out(uintptr_t unfi
+     ulmm = include_arr[1];
+     if (ulii) {
+       uljj = ulii >> BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii &= 0xffffffffLLU;
+ #else
+       ulii &= 0xffffLU;
+@@ -8263,7 +8263,7 @@ void vec_include_mask_out_intersect(uint
+     ulmm = include_arr[1];
+     if (ulii) {
+       uljj = ulii >> BITCT2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       ulii &= 0xffffffffLLU;
+ #else
+       ulii &= 0xffffLU;
+@@ -8290,7 +8290,7 @@ void vec_include_mask_out_intersect(uint
+ 
+ void vec_init_01(uintptr_t unfiltered_sample_ct, uintptr_t* data_ptr, uintptr_t* result_ptr) {
+   // initializes result_ptr bits 01 iff data_ptr bits are 01
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   __m128i* vec2_read = (__m128i*)data_ptr;
+   __m128i* read_end = &(vec2_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
+@@ -8313,7 +8313,7 @@ void vec_init_01(uintptr_t unfiltered_sa
+ void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2) {
+   uintptr_t* vec2_last = &(vec2[unfiltered_sample_ct / BITCT2]);
+   uint32_t remainder = unfiltered_sample_ct & (BITCT2 - 1);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   __m128i* vec2_128 = (__m128i*)vec2;
+   __m128i* vec2_last128 = &(vec2_128[unfiltered_sample_ct / BITCT]);
+@@ -8342,7 +8342,7 @@ void vec_datamask(uintptr_t unfiltered_s
+   // sets result_vec bits to 01 iff data_ptr bits are equal to matchval and
+   // vec_ptr bit is set, 00 otherwise.
+   // currently assumes matchval is not 1.
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* data_read = (__m128i*)data_ptr;
+   __m128i* mask_read = (__m128i*)mask_ptr;
+   __m128i* data_read_end = &(data_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
+@@ -8354,7 +8354,7 @@ void vec_datamask(uintptr_t unfiltered_s
+ #endif
+   if (matchval) {
+     if (matchval == 2) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+       do {
+         loader = *data_read++;
+         *writer++ = _mm_and_si128(_mm_andnot_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
+@@ -8366,7 +8366,7 @@ void vec_datamask(uintptr_t unfiltered_s
+       } while (data_ptr < data_read_end);
+ #endif
+     } else {
+-#ifdef __LP64__
++#ifdef __SSE2__
+       do {
+         loader = *data_read++;
+         *writer++ = _mm_and_si128(_mm_and_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
+@@ -8379,7 +8379,7 @@ void vec_datamask(uintptr_t unfiltered_s
+ #endif
+     }
+   } else {
+-#ifdef __LP64__
++#ifdef __SSE2__
+     do {
+       loader = *data_read++;
+       *writer++ = _mm_andnot_si128(_mm_or_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
+@@ -8395,7 +8395,7 @@ void vec_datamask(uintptr_t unfiltered_s
+ 
+ /*
+ void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   __m128i* vptr = (__m128i*)lptr;
+   __m128i* vend = (__m128i*)(&(lptr[word_ct]));
+@@ -8512,7 +8512,7 @@ void hh_reset(unsigned char* loadbuf, ui
+   uint32_t* loadbuf_alias32;
+   uint32_t uii;
+   uint32_t ujj;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t* sample_include2_alias32;
+   __m128i* loadbuf_alias;
+   __m128i* iivp;
+@@ -8576,7 +8576,7 @@ void hh_reset_y(unsigned char* loadbuf,
+   uint32_t uii;
+   uint32_t ujj;
+   uint32_t ukk;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   uint32_t* sample_include2_alias32;
+   uint32_t* sample_male_include2_alias32;
+@@ -8730,7 +8730,7 @@ void force_missing(unsigned char* loadbu
+   uint32_t* loadbuf_alias32;
+   uint32_t uii;
+   uint32_t ujj;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t* force_missing_include2_alias32;
+   __m128i* loadbuf_alias;
+   __m128i* fmivp;
+--- plink1.9-1.90~b3w-150903.orig/plink_common.h
++++ plink1.9-1.90~b3w-150903/plink_common.h
+@@ -63,13 +63,13 @@
+ #endif
+ 
+ #ifdef _WIN64
+-  #define __LP64__
++  #define __SSE2__
+   #define CTZLU __builtin_ctzll
+   #define CLZLU __builtin_clzll
+ #else
+   #define CTZLU __builtin_ctzl
+   #define CLZLU __builtin_clzl
+-  #ifndef __LP64__
++  #ifndef __SSE2__
+     #ifndef uintptr_t
+       #define uintptr_t unsigned long
+     #endif
+@@ -83,7 +83,7 @@
+   #include <algorithm>
+ #endif
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+   #include <emmintrin.h>
+   #define FIVEMASK 0x5555555555555555LLU
+   typedef union {
+@@ -120,7 +120,7 @@
+ 
+   #endif // Win64
+ 
+-#else // not __LP64__
++#else // not __SSE2__
+ 
+   #define FIVEMASK 0x55555555
+   #define ZEROLU 0LU
+@@ -133,7 +133,7 @@
+   #endif
+   #define PRIxPTR2 "08lx"
+ 
+-#endif // __LP64__
++#endif // __SSE2__
+ 
+ #include <zlib.h>
+ #include "SFMT.h"
+@@ -601,7 +601,7 @@
+   #define MAX_THREADS_P1 513
+ #endif
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+   #define BITCT 64
+ #else
+   #define BITCT 32
+@@ -647,7 +647,7 @@
+ #define JACKKNIFE_VALS_DIST 5
+ #define JACKKNIFE_VALS_GROUPDIST 3
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+   #define AAAAMASK 0xaaaaaaaaaaaaaaaaLLU
+   // number of snp-major .bed lines to read at once for distance calc if
+   // exponent is nonzero.
+@@ -679,7 +679,7 @@
+ #define HASHSIZE 524287
+ #define HASHSIZE_S 524287
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ #define HASHMEM 4194304
+ #define HASHMEM_S 4194304
+ #else
+@@ -779,7 +779,7 @@ typedef union {
+ 
+ typedef union {
+   double dd;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t uu[1];
+ #else
+   uintptr_t uu[2];
+@@ -1031,7 +1031,7 @@ static inline char* skip_initial_spaces(
+ /*
+ static inline int32_t is_space_or_eoln(unsigned char cc) {
+   // ' ', \t, \n, \0, \r
+-#ifdef __LP64__
++#ifdef __SSE2__
+   return (ucc <= 32) && (0x100002601LLU & (1LLU << ucc));
+ #else
+   return ((ucc <= 32) && ((ucc == ' ') || (0x2601LU & (ONELU << ucc))));
+@@ -1526,7 +1526,7 @@ static inline void next_unset_unsafe_ck(
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
+ #else
+ static inline uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
+@@ -1548,7 +1548,7 @@ static inline void next_unset_ck(uintptr
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
+ #else
+ static inline uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
+@@ -1570,7 +1570,7 @@ static inline void next_set_unsafe_ck(ui
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
+ #else
+ static inline uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
+@@ -1592,7 +1592,7 @@ static inline void next_set_ck(uintptr_t
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
+ #else
+ static inline uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
+@@ -1639,7 +1639,7 @@ static inline void fill_ulong_zero(uintp
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
+   fill_ulong_zero((uintptr_t*)ullarr, size);
+ }
+@@ -1663,7 +1663,7 @@ static inline void fill_ulong_one(uintpt
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ static inline void fill_ull_one(uint64_t* ullarr, size_t size) {
+   fill_ulong_one((uintptr_t*)ullarr, size);
+ }
+@@ -1812,7 +1812,7 @@ void get_set_wrange_align(uintptr_t* bit
+ #define CHROM_XY (MAX_POSSIBLE_CHROM + 2)
+ #define CHROM_MT (MAX_POSSIBLE_CHROM + 3)
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // dog requires 42 bits, and other species require less
+   #define CHROM_MASK_INITIAL_WORDS 1
+ #else
+@@ -2046,7 +2046,7 @@ void bitfield_ornot(uintptr_t* vv, uintp
+ void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct);
+ 
+ static inline uint32_t popcount2_long(uintptr_t val) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   val = (val & 0x3333333333333333LLU) + ((val >> 2) & 0x3333333333333333LLU);
+   return (((val + (val >> 4)) & 0x0f0f0f0f0f0f0f0fLLU) * 0x0101010101010101LLU) >> 56;
+ #else
+@@ -2071,7 +2071,7 @@ uint32_t less_than_two_genotypes(uintptr
+ 
+ uintptr_t popcount_longs(uintptr_t* lptr, uintptr_t word_ct);
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ static inline uintptr_t popcount_longs_nzbase(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
+   uintptr_t prefix_ct = 0;
+   if (start_idx & 1) {
+@@ -2112,7 +2112,7 @@ uintptr_t popcount_longs_intersect(uintp
+ 
+ void vertical_bitct_subtract(uintptr_t* bit_arr, uint32_t item_ct, uint32_t* sum_arr);
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp);
+ 
+ void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* ctap, uint32_t* ctbp, uint32_t* ctcp);
+--- plink1.9-1.90~b3w-150903.orig/plink_data.c
++++ plink1.9-1.90~b3w-150903/plink_data.c
+@@ -2222,7 +2222,7 @@ int32_t zero_cluster_init(char* zerofnam
+   if (!marker_bitfield_tmp) {
+     goto zero_cluster_init_ret_NOMEM;
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
+ #else
+   fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
+@@ -3203,7 +3203,7 @@ int32_t make_bed_me_missing_one_marker(F
+ }
+ 
+ void zeropatch(uintptr_t sample_ctv2, uintptr_t cluster_ct, uintptr_t* cluster_zc_masks, uint32_t** zcdefs, uintptr_t* patchbuf, uintptr_t marker_idx, uintptr_t* writebuf) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* writevec = (__m128i*)writebuf;
+   __m128i* patchvec = (__m128i*)patchbuf;
+   __m128i* patchvec_end = (__m128i*)(&(patchbuf[sample_ctv2]));
+@@ -3227,7 +3227,7 @@ void zeropatch(uintptr_t sample_ctv2, ui
+   if (!at_least_one_cluster) {
+     return;
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   do {
+     vec1 = *writevec;
+     vec2 = *patchvec++;
+@@ -3246,7 +3246,7 @@ void zeropatch(uintptr_t sample_ctv2, ui
+ 
+ void reverse_subset(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t word_ct) {
+   // reverse_loadbuf() variant that requires subset_vec2 bit to be set
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* wvec = (__m128i*)writebuf;
+   __m128i* svec = (__m128i*)subset_vec2;
+   __m128i* wvec_end = (__m128i*)(&(writebuf[word_ct]));
+@@ -3273,7 +3273,7 @@ void reverse_subset(uintptr_t* writebuf,
+ 
+ void replace_missing_a2(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t word_ct) {
+   // 01 -> 11 for each set bit in subset_vec2
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* wvec = (__m128i*)writebuf;
+   __m128i* svec = (__m128i*)subset_vec2;
+   __m128i* wvec_end = (__m128i*)(&(writebuf[word_ct]));
+@@ -5139,7 +5139,7 @@ int32_t incr_text_allele0(char cc, char*
+ 
+ typedef struct ll_str_fixed_struct {
+   struct ll_str_struct* next;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   char ss[8];
+ #else
+   char ss[12];
+@@ -11630,7 +11630,7 @@ uint32_t valid_vcf_allele_code(const cha
+     uii -= 64;
+     // A = 1, C = 3, G = 7, N = 14, T = 20, so (0x10408a >> ucc) & 1 works as a
+     // set membership test
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if ((uii > 63) || (!((0x10408a0010408aLLU >> uii) & 1))) {
+       // if '[', ']', or '.', assume breakend
+       return ((uii == 27) || (uii == 29) || (uii == 0xffffffeeU))? 1 : 0;
+@@ -15707,7 +15707,7 @@ int32_t merge_datasets(char* bedname, ch
+         logerrprint("Warning: --merge-list file is empty.\n");
+       }
+     }
+-#ifndef __LP64__
++#ifndef __SSE2__
+     if (ullxx > 0x7fffffff) {
+       goto merge_datasets_ret_NOMEM;
+     }
+@@ -15807,7 +15807,7 @@ int32_t merge_datasets(char* bedname, ch
+       max_cur_sample_ct = cur_sample_ct;
+     }
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (ullxx > 0x7fffffff) {
+     sprintf(logbuf, "Error: Too many %s (max 2147483647).\n", g_species_plural);
+     goto merge_datasets_ret_INVALID_FORMAT_2;
+@@ -16045,7 +16045,7 @@ int32_t merge_datasets(char* bedname, ch
+   if (position_warning_ct > 3) {
+     fprintf(stderr, "%" PRIu64 " more multiple-position warning%s: see log file.\n", position_warning_ct - 3, (position_warning_ct == 4)? "" : "s");
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (ullxx > 0x7fffffff) {
+     logerrprint("Error: Too many variants (max 2147483647).\n");
+     goto merge_datasets_ret_INVALID_FORMAT;
+--- plink1.9-1.90~b3w-150903.orig/plink_dosage.c
++++ plink1.9-1.90~b3w-150903/plink_dosage.c
+@@ -177,7 +177,7 @@ int32_t dosage_load_score_files(Score_in
+     logerrprint("Error: --score does not support >= 2^30 variants.\n");
+     goto dosage_load_score_files_ret_INVALID_FORMAT;
+   }
+-#ifndef __LP64__
++#ifndef __SSE2__
+   if (allele_code_buf_len > 0x7fffffff) {
+     goto dosage_load_score_files_ret_NOMEM;
+   }
+@@ -1762,7 +1762,7 @@ int32_t plink1_dosage(Dosage_info* doip,
+ 	  if (load_map) {
+ 	    marker_idx = id_htable_find(bufptr, slen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
+ 	    if (marker_idx == 0xffffffffU) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	      marker_idx = ~ZEROLU;
+ #endif
+ 	      continue;
+--- plink1.9-1.90~b3w-150903.orig/plink_family.c
++++ plink1.9-1.90~b3w-150903/plink_family.c
+@@ -728,7 +728,7 @@ int32_t mendel_error_scan(Family_info* f
+   uint32_t* error_cts_tmp;
+   uint32_t* error_cts_tmp2;
+   uint32_t* uiptr;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* vptr;
+   __m128i* vptr2;
+ #endif
+@@ -993,7 +993,7 @@ int32_t mendel_error_scan(Family_info* f
+ 	}
+ 	if ((cur_error_ct <= var_error_max) || (!var_first)) {
+ 	  if (var_first) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	    vptr = (__m128i*)error_cts_tmp;
+ 	    vptr2 = (__m128i*)error_cts_tmp2;
+ 	    for (trio_idx = 0; trio_idx < trio_ct4; trio_idx++) {
+@@ -3125,7 +3125,7 @@ int32_t dfam(pthread_t* threads, FILE* b
+   if (retval) {
+     goto dfam_ret_1;
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if ((12 * sample_ct + 2 * family_ct) > 0xffffffffLLU) {
+     logerrprint("Error: Too many samples and families for DFAM test.\n");
+     goto dfam_ret_INVALID_CMDLINE;
+@@ -4429,7 +4429,7 @@ int32_t qfam(pthread_t* threads, FILE* b
+     goto qfam_ret_1;
+   }
+   g_family_ct = family_ct;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // no need to check in 32-bit case since a nomem error would have occurred
+   // earlier...
+   // (okay, no need to check anyway, but best to document this overflow
+--- plink1.9-1.90~b3w-150903.orig/plink_filter.c
++++ plink1.9-1.90~b3w-150903/plink_filter.c
+@@ -1695,7 +1695,7 @@ int32_t mind_filter(FILE* bedfile, uintp
+   return retval;
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ void freq_hwe_haploid_count_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* ct_nmp, uint32_t* ct_hmajp) {
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
+@@ -1953,7 +1953,7 @@ static inline void single_marker_freqs_a
+   uintptr_t loader;
+   uintptr_t loader2;
+   uintptr_t loader3;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 120;
+   uintptr_t* lptr_12x_end;
+   unfiltered_sample_ctl2 -= unfiltered_sample_ctl2 % 12;
+@@ -2082,7 +2082,7 @@ static inline void haploid_single_marker
+   uintptr_t loader2;
+   uintptr_t loader3;
+   uintptr_t loader4;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t cur_decr = 120;
+   uintptr_t* lptr_12x_end;
+   unfiltered_sample_ctl2 -= unfiltered_sample_ctl2 % 12;
+--- plink1.9-1.90~b3w-150903.orig/plink_glm.c
++++ plink1.9-1.90~b3w-150903/plink_glm.c
+@@ -272,7 +272,7 @@ int32_t glm_scan_conditions(char* condit
+   uintptr_t condition_ct = 0;
+   uintptr_t line_idx = 0;
+   int32_t retval = 0;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   __m128i* loadbuf_vptr;
+   __m128i* loadbuf_mask_vptr;
+   __m128i* loadbuf_vend;
+@@ -384,7 +384,7 @@ int32_t glm_scan_conditions(char* condit
+     }
+     vec_include_init(unfiltered_sample_ct, loadbuf_mask_orig, load_mask);
+     memcpy(loadbuf_mask, loadbuf_mask_orig, unfiltered_sample_ctv2 * sizeof(intptr_t));
+-#ifdef __LP64__
++#ifdef __SSE2__
+     loadbuf_vend = (__m128i*)(&(loadbuf_raw[unfiltered_sample_ctv2]));
+ #else
+     loadbuf_end = &(loadbuf_raw[unfiltered_sample_ctl2]);
+@@ -406,7 +406,7 @@ int32_t glm_scan_conditions(char* condit
+ 	haploid_fix(hh_or_mt_exists, sample_raw_include2, sample_raw_male_include2, unfiltered_sample_ct, is_x, is_y, (unsigned char*)loadbuf_raw);
+       }
+       // clear loadbuf_mask bits where loadbuf is 01.
+-#ifdef __LP64__
++#ifdef __SSE2__
+       loadbuf_vptr = (__m128i*)loadbuf_raw;
+       loadbuf_mask_vptr = (__m128i*)loadbuf_mask;
+       do {
+@@ -845,7 +845,7 @@ uint32_t glm_linear(uintptr_t cur_batch_
+ // Lakhani and Eva Guinan.
+ // #####
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ // exp_ps is a C port of Shigeo Mitsunari's fast math library posted at
+ // http://homepage1.nifty.com/herumi/ .  License is
+ // http://opensource.org/licenses/BSD-3-Clause .
+@@ -1180,7 +1180,7 @@ static inline __m128 fmath_exp_ps(__m128
+   return tt;
+ }
+ 
+-// For equivalent "normal" C/C++ code, see the non-__LP64__ versions of these
++// For equivalent "normal" C/C++ code, see the non-__SSE2__ versions of these
+ // functions.
+ static inline void logistic_sse(float* vect, uint32_t nn) {
+   __m128 zero = _mm_setzero_ps();
+@@ -1521,7 +1521,7 @@ static inline void compute_two_plus_one_
+   u16.vf = s3;
+   *r3_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
+ }
+-#else // no __LP64__ (and hence, unsafe to assume presence of SSE2)
++#else // no __SSE2__ (and hence, unsafe to assume presence of SSE2)
+ static inline void logistic_sse(float* vect, uint32_t nn) {
+   uint32_t uii;
+   for (uii = 0; uii < nn; uii++) {
+--- plink1.9-1.90~b3w-150903.orig/plink_homozyg.c
++++ plink1.9-1.90~b3w-150903/plink_homozyg.c
+@@ -85,7 +85,7 @@ void update_end_nonhom(uintptr_t* readbu
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ #define ROH_ENTRY_INTS 7
+ #else
+ #define ROH_ENTRY_INTS 6
+@@ -210,7 +210,7 @@ void save_confirmed_roh_extend(uint32_t
+   *roh_list++ = cidx_len;
+   *roh_list++ = cidx_len - cur_roh_het_ct - cur_roh_missing_ct;
+   *roh_list++ = cur_roh_het_ct;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   *roh_list++ = (uint32_t)sample_last_roh_idx;
+   *roh_list++ = (uint32_t)(sample_last_roh_idx >> 32);
+ #else
+@@ -283,7 +283,7 @@ uint32_t roh_update(Homozyg_info* hp, ui
+ 	      *roh_list_cur++ = cidx_len - cur_het_ct - cur_roh_missing_cts[sample_idx];
+ 	      *roh_list_cur++ = cur_het_ct;
+ 	      last_roh_idx = sample_to_last_roh[sample_idx];
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	      *roh_list_cur++ = (uint32_t)last_roh_idx;
+ 	      *roh_list_cur++ = (uint32_t)(last_roh_idx >> 32);
+ #else
+@@ -465,7 +465,7 @@ int32_t write_main_roh_reports(char* out
+     cur_roh_ct = 0;
+     while (cur_roh_idx != ~ZEROLU) {
+       cur_roh = &(roh_list[cur_roh_idx * ROH_ENTRY_INTS]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+       prev_roh_idx = ((uintptr_t)cur_roh[5]) | (((uintptr_t)cur_roh[6]) << 32);
+       cur_roh[5] = (uint32_t)next_roh_idx;
+       cur_roh[6] = (uint32_t)(next_roh_idx >> 32);
+@@ -515,7 +515,7 @@ int32_t write_main_roh_reports(char* out
+       if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ 	goto write_main_roh_reports_ret_WRITE_FAIL;
+       }
+-#ifdef __LP64__
++#ifdef __SSE2__
+       cur_roh_idx = ((uintptr_t)cur_roh[5]) | (((uintptr_t)cur_roh[6]) << 32);
+ #else
+       cur_roh_idx = (uintptr_t)cur_roh[5];
+@@ -739,7 +739,7 @@ void extract_pool_info(uint32_t pool_siz
+ void initialize_roh_slot(uint32_t* cur_roh, uint32_t chrom_start, uint32_t* marker_uidx_to_cidx, uintptr_t* roh_slot, uint32_t* roh_slot_cidx_start, uint32_t* roh_slot_cidx_end, uint32_t* roh_slot_end_uidx) {
+   uint32_t cidx_first = marker_uidx_to_cidx[cur_roh[0] - chrom_start];
+   uint32_t cidx_last = marker_uidx_to_cidx[cur_roh[1] - chrom_start];
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t cidx_first_block = cidx_first & (~63);
+   uint32_t cidx_last_block = cidx_last & (~63);
+   uint32_t cur_bidx = 2;
+@@ -759,7 +759,7 @@ void initialize_roh_slot(uint32_t* cur_r
+   *roh_slot_cidx_end = cidx_last + 1;
+   *roh_slot_end_uidx = cur_roh[1] + 1;
+   uii = cidx_first & (BITCT2 - 1);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (cidx_first & 32) {
+     roh_slot[0] = FIVEMASK;
+     roh_slot[1] = 0x1555555555555555LLU >> (2 * (31 - uii));
+@@ -772,7 +772,7 @@ void initialize_roh_slot(uint32_t* cur_r
+ #endif
+   fill_ulong_zero(&(roh_slot[cur_bidx]), end_bidx - cur_bidx);
+   uii = cidx_last & (BITCT2 - 1);
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (cidx_last & 32) {
+     // |= instead of = in case first_block and last_block are the same
+     roh_slot[end_bidx - 1] |= 0x5555555555555554LLU << (2 * uii);
+@@ -821,7 +821,7 @@ void populate_roh_slots_from_lookahead_b
+     read_shift = 2 * (sample_uidx & (BITCT2 - 1));
+     slot_idx = (uintptr_t)((*roh_slot_map) & 0xffffffffU);
+     cidx_start = roh_slot_cidx_start[slot_idx];
+-#ifdef __LP64__
++#ifdef __SSE2__
+     cidx_start_block = cidx_start & (~63);
+ #else
+     cidx_start_block = cidx_start & (~15);
+@@ -882,7 +882,7 @@ int32_t populate_roh_slots_from_disk(FIL
+       roh_write_slot_idx = (uintptr_t)(roh_slot_map[roh_read_slot_idx] & 0xffffffffU);
+       cidx_start = roh_slot_cidx_start[roh_write_slot_idx];
+       if ((marker_cidx >= cidx_start) && (marker_cidx < roh_slot_cidx_end[roh_write_slot_idx])) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+         start_c_bidx = 2 * (cidx_start / 64);
+ #else
+         start_c_bidx = cidx_start / 16;
+@@ -895,7 +895,7 @@ int32_t populate_roh_slots_from_disk(FIL
+ }
+ 
+ static inline uint32_t is_allelic_match(double mismatch_max, uintptr_t* roh_slot_idxl, uintptr_t* roh_slot_idxs, uint32_t block_start_idxl, uint32_t block_start_idxs, uint32_t overlap_cidx_start, uint32_t overlap_cidx_end) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
+@@ -1179,7 +1179,7 @@ void compute_allelic_match_matrix(double
+     incr_idxl = 0;
+     roh_slot_idxl = &(roh_slots[slot_idxl * roh_slot_wsize]);
+     cidx_start_idxl = roh_slot_cidx_start[slot_idxl];
+-#ifdef __LP64__
++#ifdef __SSE2__
+     block_start_idxl = cidx_start_idxl & (~63);
+ #else
+     block_start_idxl = cidx_start_idxl & (~15);
+@@ -1197,7 +1197,7 @@ void compute_allelic_match_matrix(double
+       }
+       slot_idxs = (uint32_t)(roh_slot_map[map_idxs]);
+       cidx_start_idxs = roh_slot_cidx_start[slot_idxs];
+-#ifdef __LP64__
++#ifdef __SSE2__
+       block_start_idxs = cidx_start_idxs & (~63);
+ #else
+       block_start_idxs = cidx_start_idxs & (~15);
+@@ -1244,7 +1244,7 @@ void assign_allelic_match_groups(uint32_
+     if (ulii) {
+       nsim_nz_ct++;
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     cur_pool[pool_idx] = ulii << 32;
+ #else
+     cur_pool[2 * pool_idx + 1] = ulii;
+@@ -1278,14 +1278,14 @@ void assign_allelic_match_groups(uint32_
+ 	  nsim_nz_ct--;
+ 	  allelic_match_cts[pool_idx] = 0xffffffffU;
+ 	}
+-#ifdef __LP64__
++#ifdef __SSE2__
+         cur_pool[pool_idx] = (cur_pool[pool_idx] & 0xffffffff00000000LLU) | group_idx;
+ #else
+         cur_pool[2 * pool_idx] = group_idx;
+ #endif
+       }
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     cur_pool[max_nsim_pidx] |= 0x80000000LLU | (group_idx++);
+ #else
+     cur_pool[2 * max_nsim_pidx] = 0x80000000U | (group_idx++);
+@@ -1293,7 +1293,7 @@ void assign_allelic_match_groups(uint32_
+   }
+   for (pool_idx = 0; pool_idx < pool_size; pool_idx++) {
+     if (allelic_match_cts[pool_idx] != 0xffffffffU) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+       cur_pool[pool_idx] |= 0x80000000LLU | (group_idx++);
+ #else
+       cur_pool[2 * pool_idx] = 0x80000000U | (group_idx++);
+@@ -1425,7 +1425,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+       uii = chrom_len;
+     }
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   // want each roh_slots space to be 16-byte aligned, to enable SSE2
+   // max_roh_len = 1 -> 1 vec
+   // max_roh_len in {2..65} -> 2 vecs
+@@ -1538,7 +1538,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ 	    //   [3P+3]: consensus NSNP
+ 	    //   [3P+4]: union NSNP
+ 	    old_pool_list_size = pool_list_size;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	    pool_list_size += 2 * pool_size + 3;
+ #else
+             pool_list_size += 3 * pool_size + 5;
+@@ -1550,7 +1550,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+             *cur_pool++ = pool_size_first_plidx[pool_size - pool_size_min];
+             pool_size_first_plidx[pool_size - pool_size_min] = old_pool_list_size;
+ 	    *cur_pool++ = pool_size;
+-#ifndef __LP64__
++#ifndef __SSE2__
+ 	    *cur_pool++ = 0;
+ #endif
+ 	    uiptr = sample_uidx_sort_buf;
+@@ -1560,14 +1560,14 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ 	      pool_list_idx = roh_slots[slot_idx1]; // actually a ROH idx
+ 	      *uiptr++ = roh_list[pool_list_idx * ROH_ENTRY_INTS + 5]; // sample_uidx
+               *uiptr++ = (uint32_t)pool_list_idx;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	      *uiptr++ = (uint32_t)(pool_list_idx >> 32);
+ #endif
+ 	    }
+ 	    // sort in increasing sample_uidx order, for reproducible results
+             qsort(sample_uidx_sort_buf, pool_size, 4 + sizeof(intptr_t), intcmp);
+ 	    for (uii = 0; uii < pool_size; uii++) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+               *cur_pool++ = ((uintptr_t)sample_uidx_sort_buf[3 * uii + 1]) | (((uintptr_t)sample_uidx_sort_buf[3 * uii + 2]) << 32);
+ #else
+ 	      *cur_pool++ = sample_uidx_sort_buf[2 * uii + 1];
+@@ -1616,7 +1616,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+   for (pool_size = max_pool_size; pool_size >= pool_size_min; --pool_size) {
+     pool_list_idx = pool_size_first_plidx[pool_size - pool_size_min];
+     while (pool_list_idx != ~ZEROLU) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+       pool_list[pool_list_idx + 1] |= ((uintptr_t)(++uii)) << 32;
+ #else
+       pool_list[pool_list_idx + 2] = ++uii;
+@@ -1666,7 +1666,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+       pool_list_idx = pool_list[pool_list_idx - 1];
+       cur_pool = &(pool_list[pool_list_idx]);
+       pool_size = (uint32_t)cur_pool[1];
+-#ifdef __LP64__
++#ifdef __SSE2__
+       cur_pool = &(cur_pool[2]);
+ #else
+       cur_pool = &(cur_pool[3]);
+@@ -1861,7 +1861,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ 
+       assign_allelic_match_groups(pool_size, allelic_match_cts, allelic_match_matrix, roh_slot_map, &(cur_pool[pool_size]));
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+       cur_pool[2 * pool_size] = (((uintptr_t)(marker_uidx_to_cidx[union_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[union_uidx1 - chrom_start])) << 32) | ((uintptr_t)(marker_uidx_to_cidx[con_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[con_uidx1 - chrom_start]));
+ #else
+       cur_pool[3 * pool_size] = marker_uidx_to_cidx[con_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[con_uidx1 - chrom_start];
+@@ -1869,7 +1869,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ #endif
+ 
+       if (is_verbose) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	wptr = uint32_write(&(outname_end[14]), (uint32_t)(cur_pool[-1] >> 32));
+ #else
+ 	wptr = uint32_write(&(outname_end[14]), (uint32_t)cur_pool[-1]);
+@@ -1880,7 +1880,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ 	}
+ 
+ 	for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	  verbose_group_sort_buf[slot_idx1] = ((cur_pool[pool_size + slot_idx1] & 0x7fffffffLLU) << 32) | ((uint64_t)slot_idx1);
+ #else
+ 	  verbose_group_sort_buf[slot_idx1] = (((uint64_t)(cur_pool[pool_size + 2 * slot_idx1] & 0x7fffffff)) << 32) | ((uint64_t)slot_idx1);
+@@ -2222,7 +2222,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+     while (pool_list_idx != ~ZEROLU) {
+       cur_pool = &(pool_list[pool_list_idx]);
+       pool_list_idx = *cur_pool;
+-#ifdef __LP64__
++#ifdef __SSE2__
+       cur_pool = &(cur_pool[2]);
+ #else
+       cur_pool = &(cur_pool[3]);
+@@ -2240,7 +2240,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+       // sort pool members primarily by allelic-match group number, then by
+       // internal ID
+       for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	roh_slot_map[slot_idx1] = ((cur_pool[pool_size + slot_idx1] & 0x7fffffffLLU) << 32) | ((uint64_t)slot_idx1);
+ #else
+ 	// would like to just sort 32-bit integers, but if there are >32k
+@@ -2294,7 +2294,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ 	}
+         wptr = roh_pool_write_middle(wptr, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, is_new_lengths, marker_uidx1, marker_uidx2);
+ 	wptr = uint32_writew8x(wptr, cur_roh[2], ' ');
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	ulii = cur_pool[pool_size + slot_idx2];
+         wptr = width_force(4, wptr, uint32_write(wptr, (uint32_t)(ulii >> 32)));
+         *wptr++ = ' ';
+@@ -2325,7 +2325,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ 	  wptr = fw_strcpyn(plink_maxfid, 3, "CON", wptr_start);
+ 	  marker_uidx1 = con_uidx1;
+ 	  marker_uidx2 = con_uidx2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	  marker_cidx = (uint32_t)(cur_pool[2 * pool_size]);
+ #else
+ 	  marker_cidx = cur_pool[3 * pool_size];
+@@ -2334,7 +2334,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
+ 	  wptr = fw_strcpyn(plink_maxfid, 5, "UNION", wptr_start);
+ 	  marker_uidx1 = union_uidx1;
+ 	  marker_uidx2 = union_uidx2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	  // NSNP
+ 	  marker_cidx = (uint32_t)(cur_pool[2 * pool_size] >> 32);
+ #else
+@@ -2740,7 +2740,7 @@ int32_t calc_homozyg(Homozyg_info* hp, F
+   if (hp->modifier & (HOMOZYG_GROUP | HOMOZYG_GROUP_VERBOSE)) {
+     if (max_pool_size < hp->pool_size_min) {
+       LOGERRPRINTF("Warning: Skipping --homozyg group%s report since there are no pools.\n", (hp->modifier & HOMOZYG_GROUP_VERBOSE)? "-verbose" : "");
+-#ifndef __LP64__
++#ifndef __SSE2__
+     } else if (max_pool_size > 65536) {
+       logerrprint("Error: 32-bit " PROG_NAME_STR "'s --homozyg group cannot handle a pool of size >65536.\n");
+       goto calc_homozyg_ret_NOMEM;
+--- plink1.9-1.90~b3w-150903.orig/plink_ld.c
++++ plink1.9-1.90~b3w-150903/plink_ld.c
+@@ -73,7 +73,7 @@ void ld_epi_cleanup(Ld_info* ldip, Epi_i
+   free_cond(clump_ip->range_fname);
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ static inline void ld_dot_prod_batch(__m128i* vec1, __m128i* vec2, __m128i* mask1, __m128i* mask2, int32_t* return_vals, uint32_t iters) {
+   // Main routine for computation of \sum_i^M (x_i - \mu_x)(y_i - \mu_y), where
+   // x_i, y_i \in \{-1, 0, 1\}, but there are missing values.
+@@ -494,7 +494,7 @@ int32_t ld_dot_prod_nm(uintptr_t* vec1,
+   result -= ld_dot_prod_nm_batch(vec1, vec2, last_batch_size);
+   return result;
+ }
+-#endif // __LP64__
++#endif // __SSE2__
+ 
+ uint32_t ld_process_load(uintptr_t* geno_buf, uintptr_t* mask_buf, uintptr_t* missing_buf, uint32_t* missing_ct_ptr, double* sum_ptr, double* variance_recip_ptr, uint32_t founder_ct, uint32_t is_x, uint32_t weighted_x, uint32_t nonmale_founder_ct, uintptr_t* founder_male_include2, uintptr_t* nonmale_geno, uintptr_t* nonmale_masks, uintptr_t nonmale_offset) {
+   uintptr_t* geno_ptr = geno_buf;
+@@ -732,14 +732,14 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
+   uintptr_t unfiltered_sample_ctl2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
+   uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl2 / 2);
+   uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t founder_ctv = 2 * ((founder_ct + 127) / 128);
+ #else
+   uintptr_t founder_ctv = founder_ctl;
+ #endif
+   uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
+   uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
+ #else
+   uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
+@@ -862,7 +862,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
+   if (pairwise) {
+     prune_ld_thresh = ld_last_param * (1 + SMALL_EPSILON);
+   } else {
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if (window_max > 46340) {
+       // todo: check what LAPACK's matrix inversion limit actually is.  Guess
+       // sqrt(2^31 - 1) for now.
+@@ -1290,7 +1290,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
+   ld_prune_ret_INVALID_FORMAT:
+     retval = RET_INVALID_FORMAT;
+     break;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   ld_prune_ret_INVALID_CMDLINE:
+     retval = RET_INVALID_CMDLINE;
+     break;
+@@ -1338,7 +1338,7 @@ uint32_t ld_missing_ct_intersect(uintptr
+   // variant of popcount_longs_intersect()
+   uintptr_t tot = 0;
+   uintptr_t* lptr1_end2;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
+@@ -1538,7 +1538,7 @@ int32_t flipscan(Ld_info* ldip, FILE* be
+     pheno_ctl[is_case] = (pheno_ct[is_case] + (BITCT - 1)) / BITCT;
+     ulii = (pheno_ct[is_case] + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
+     pheno_ct_mld_m1[is_case] = ulii - 1;
+-#ifdef __LP64__
++#ifdef __SSE2__
+     pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 192;
+ #else
+     pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 48;
+@@ -2295,7 +2295,7 @@ int32_t ld_report_matrix(pthread_t* thre
+ 	ulptr = (uintptr_t*)tbuf;
+ 	// assume little-endian
+ 	// 0[delim]0[delim]...
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	ulii = 0x30003000300030LLU | (0x100010001000100LLU * ((unsigned char)g_ld_delimiter));
+ #else
+ 	ulii = 0x300030 | (0x1000100 * ((unsigned char)g_ld_delimiter));
+@@ -2816,7 +2816,7 @@ uint32_t load_and_split3(FILE* bedfile,
+   }
+ }
+ 
+-#ifdef __LP64__
++#ifdef __SSE2__
+ static void two_locus_3x3_tablev(__m128i* vec1, __m128i* vec2, uint32_t* counts_3x3, uint32_t sample_ctv6, uint32_t iter_ct) {
+   const __m128i m1 = {FIVEMASK, FIVEMASK};
+   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
+@@ -3039,7 +3039,7 @@ static inline void two_locus_3x3_zmiss_t
+ #endif
+ 
+ static void two_locus_count_table_zmiss1(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   fill_uint_zero(counts_3x3, 6);
+   if (is_zmiss2) {
+     two_locus_3x3_zmiss_tablev((__m128i*)lptr1, (__m128i*)lptr2, counts_3x3, sample_ctv3 / 2);
+@@ -3060,7 +3060,7 @@ static void two_locus_count_table_zmiss1
+ }
+ 
+ static void two_locus_count_table(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t uii;
+   fill_uint_zero(counts_3x3, 9);
+   if (!is_zmiss2) {
+@@ -6052,7 +6052,7 @@ int32_t ld_report(pthread_t* threads, Ld
+   uintptr_t* founder_male_include2 = NULL;
+   uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
+   uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
+ #else
+   uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
+@@ -6312,7 +6312,7 @@ int32_t show_tags(Ld_info* ldip, FILE* b
+   }
+   founder_ct_mld_m1 = (founder_ct - 1) / MULTIPLEX_LD;
+   ulii = founder_ct_mld_m1 + 1;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - founder_ct) / 192;
+ #else
+   founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - founder_ct) / 48;
+@@ -7105,7 +7105,7 @@ int32_t haploview_blocks(Ld_info* ldip,
+     if (max_block_size < 2) {
+       continue;
+     }
+-#ifndef __LP64__
++#ifndef __SSE2__
+     if (max_block_size > 65536) {
+       logprint("\n");
+       logerrprint("Error: 32-bit --blocks cannot analyze potential blocks with more than 65536\nvariants.  Use a 64-bit PLINK build or a smaller --blocks-window-kb value.\n");
+@@ -7460,7 +7460,7 @@ int32_t haploview_blocks(Ld_info* ldip,
+   haploview_blocks_ret_WRITE_FAIL:
+     retval = RET_WRITE_FAIL;
+     break;
+-#ifndef __LP64__
++#ifndef __SSE2__
+   haploview_blocks_ret_INVALID_CMDLINE:
+     retval = RET_INVALID_CMDLINE;
+     break;
+@@ -11619,7 +11619,7 @@ int32_t construct_ld_map(pthread_t* thre
+   uintptr_t founder_ctv2 = founder_ctl * 2;
+   uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
+   uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
+ #else
+   uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
+@@ -11823,7 +11823,7 @@ int32_t construct_ld_map(pthread_t* thre
+ 	  // don't need to load the first intersecting member or anything
+ 	  // before it, since we're only traversing the upper right triangle
+ 	  wlen += firstw;
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	  firstw = 2 * (uii / 128);
+ #else
+ 	  firstw = uii / 32;
+@@ -13483,7 +13483,7 @@ int32_t clump_reports(FILE* bedfile, uin
+     *bufptr++ = ' ';
+     bufptr = uint32_writew10x(bufptr, cur_bp, ' ');
+     bufptr = double_g_writewx3x(bufptr, pval, 10, ' ');
+-#ifdef __LP64__
++#ifdef __SSE2__
+     // may as well be paranoid
+     bufptr = width_force(8, bufptr, int64_write(bufptr, (int64_t)(histo[0] + histo[1] + histo[2] + histo[3] + histo[4])));
+     *bufptr++ = ' ';
+--- plink1.9-1.90~b3w-150903.orig/plink_matrix.h
++++ plink1.9-1.90~b3w-150903/plink_matrix.h
+@@ -55,7 +55,7 @@ extern "C" {
+ 
+ #else // not _WIN32
+ #include <cblas.h>
+-#ifdef __LP64__
++#ifdef __SSE2__
+   typedef int32_t __CLPK_integer;
+ #else
+   typedef long int __CLPK_integer;
+--- plink1.9-1.90~b3w-150903.orig/plink_misc.c
++++ plink1.9-1.90~b3w-150903/plink_misc.c
+@@ -5687,7 +5687,7 @@ int32_t meta_analysis(char* input_fnames
+   if (!final_variant_ct) {
+     logerrprint("Error: No --meta-analysis variants.\n");
+     goto meta_analysis_ret_INVALID_CMDLINE;
+-#ifdef __LP64__
++#ifdef __SSE2__
+   } else if (final_variant_ct > 0x7fffffff) {
+     logerrprint("Error: Too many distinct --meta-analysis variants (max 2^31 - 1).\n");
+ #endif
+@@ -5818,7 +5818,7 @@ int32_t meta_analysis(char* input_fnames
+       memcpy(&cur_file_ct_m1, bufptr2, file_ct_byte_width);
+       cur_data_slots = 0;
+       if (report_study_specific) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	cur_data_slots += file_ct64;
+ #else
+ 	cur_data_slots += 2 * file_ct64;
+--- plink1.9-1.90~b3w-150903.orig/plink_set.c
++++ plink1.9-1.90~b3w-150903/plink_set.c
+@@ -1480,7 +1480,7 @@ int32_t define_sets(Set_info* sip, uintp
+     if (retval) {
+       goto define_sets_ret_NOMEM2;
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
+ #else
+     fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
+@@ -2508,7 +2508,7 @@ int32_t annotate(Annot_info* aip, char*
+ 	  while (1) {
+ 	    ll_ptr = *ll_pptr;
+             if (!ll_ptr) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	      // we'll run out of memory way earlier in 32-bit mode
+ 	      if (attr_id_ct == 0x80000000LLU) {
+ 	        sprintf(logbuf, "Error: Too many unique attributes in %s (max 2147483648).\n", aip->attrib_fname);
+@@ -2647,7 +2647,7 @@ int32_t annotate(Annot_info* aip, char*
+       if (retval) {
+ 	goto annotate_ret_1;
+       }
+-#ifdef __LP64__
++#ifdef __SSE2__
+       if (range_ct > 0x80000000LLU) {
+ 	sprintf(logbuf, "Error: Too many annotations in %s (max 2147483648, counting multi-chromosome annotations once per spanned chromosome).\n", aip->ranges_fname);
+ 	goto annotate_ret_INVALID_FORMAT_WW;
+@@ -2743,7 +2743,7 @@ int32_t annotate(Annot_info* aip, char*
+     } else {
+       unique_annot_ct = attr_id_ct;
+     }
+-#ifdef __LP64__
++#ifdef __SSE2__
+     unique_annot_ctlw = (unique_annot_ct + 3) / 4;
+ #else
+     unique_annot_ctlw = (unique_annot_ct + 1) / 2;
+@@ -2759,7 +2759,7 @@ int32_t annotate(Annot_info* aip, char*
+     ulptr = (uintptr_t*)writebuf;
+     for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
+       // fill with repeated " 0"
+-#ifdef __LP64__
++#ifdef __SSE2__
+       *ulptr++ = 0x3020302030203020LLU;
+ #else
+       *ulptr++ = 0x30203020;
+@@ -3100,7 +3100,7 @@ int32_t annotate(Annot_info* aip, char*
+       // reinitialize
+       ulptr = (uintptr_t*)writebuf;
+       for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
+-#ifdef __LP64__
++#ifdef __SSE2__
+ 	*ulptr++ = 0x3020302030203020LLU;
+ #else
+ 	*ulptr++ = 0x30203020;
+@@ -3285,7 +3285,7 @@ int32_t gene_report(char* fname, char* g
+   if (retval) {
+     goto gene_report_ret_1;
+   }
+-#ifdef __LP64__
++#ifdef __SSE2__
+   if (gene_ct > 0x80000000LLU) {
+     sprintf(logbuf, "Error: Too many genes in %s (max 2147483648).\n", glist);
+     goto gene_report_ret_INVALID_FORMAT_WW;
+@@ -3495,7 +3495,7 @@ int32_t gene_report(char* fname, char* g
+     ((uint32_t*)linebuf_top)[1] = cur_bp;
+     linebuf_left -= slen + 8;
+     linebuf_top = &(linebuf_top[slen + 8]);
+-#ifdef __LP64__
++#ifdef __SSE2__
+     if (saved_line_ct == 0x100000000LLU) {
+       sprintf(logbuf, "Error: Too many valid lines in %s (--gene-report can only handle 4294967296).\n", fname);
+       goto gene_report_ret_INVALID_FORMAT_WW;
diff --git a/debian/patches/series b/debian/patches/series
index c1895f8..0f5610a 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,2 +1,3 @@
 01_Fix_use_internal_lib.patch
 02_Activate_Stable_Build.patch
+03_replace_LP64_by_SSE2.patch
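
Most of the hunks above toggle the same conditional-compilation idiom: a
vector path built on 128-bit __m128i operations (or 64-bit word tricks)
versus a 32-bit scalar fallback.  A minimal self-contained sketch of that
idiom follows; it is illustrative only and not part of the patch, with
BITCT and FIVEMASK mirroring the plink_common.h definitions shown in the
hunks.  gcc and clang predefine __SSE2__ whenever SSE2 code generation is
enabled (the default on x86_64, or via -msse2 on 32-bit x86), whereas
__LP64__ merely states that long and pointers are 64-bit; that also holds
on 64-bit architectures without SSE2, which is the bug being fixed.

#include <stdio.h>

#ifdef __SSE2__
  #include <emmintrin.h>                    /* SSE2 intrinsics */
  #define FIVEMASK 0x5555555555555555LLU    /* as in plink_common.h */
  #define BITCT 64
#else
  #define FIVEMASK 0x55555555
  #define BITCT 32
#endif

int main(void) {
#ifdef __SSE2__
  /* vector path: mask two 64-bit words at once, as the patched code does */
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i v = _mm_and_si128(_mm_set1_epi32(-1), m1);
  (void)v;
  printf("SSE2 path, BITCT = %d\n", BITCT);
#else
  /* scalar path: plain 32-bit word arithmetic */
  printf("scalar path, BITCT = %d, mask = %lx\n", BITCT,
         (unsigned long)FIVEMASK);
#endif
  return 0;
}

On Debian's 32-bit x86 toolchain, "gcc sketch.c" typically takes the scalar
branch and "gcc -msse2 sketch.c" the vector one; on x86_64 the vector branch
is always taken.  That compiler-driven selection is what testing __SSE2__
rather than __LP64__ provides on non-x86 64-bit ports such as arm64 and
ppc64el, where __LP64__ is defined but no SSE2 exists.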

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git