[med-svn] [plink1.9] 01/01: Remove the LP64->SSE2 patch again
Gert Wollny
gert-guest at moszumanska.debian.org
Sat Sep 19 18:17:14 UTC 2015
This is an automated email from the git hooks/post-receive script.
gert-guest pushed a commit to branch master
in repository plink1.9.
commit 9d0c74fca25259b2edba6791b5c5b94be02373d9
Author: Gert Wollny <gw.fossdev at gmail.com>
Date: Sat Sep 19 20:14:53 2015 +0200
Remove the LP64->SSE2 patch again
It would fix the build on non-x86 64-bit archs, but these builds would
not be usable, because sometimes the define __LP64__ is used for enabling
the SSE2 code path, but sometimes it is also used to select an alternative
64-bit code path.
---
debian/changelog | 3 -
debian/patches/03_replace_LP64_by_SSE2.patch | 2589 --------------------------
debian/patches/series | 1 -
3 files changed, 2593 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index 41a6cff..0647af9 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,9 +5,6 @@ plink1.9 (1.90~b3w-150903-1) UNRELEASED; urgency=low
* debian/upstream/metadata: Update reference paper
* debian/copyright: Update
- [Gert Wollny]
- * Add patch 03_replace_LP64_by_SSE2 (Closes: #799471)
-
-- Dylan Aïssi <bob.dybian at gmail.com> Sat, 19 Sep 2015 15:44:10 +0200
plink1.9 (1.90~b3b-150117-1) unstable; urgency=low
diff --git a/debian/patches/03_replace_LP64_by_SSE2.patch b/debian/patches/03_replace_LP64_by_SSE2.patch
deleted file mode 100644
index b6f7c23..0000000
--- a/debian/patches/03_replace_LP64_by_SSE2.patch
+++ /dev/null
@@ -1,2589 +0,0 @@
-Description: Replace the test for LP64 by a test for SSE2
- This patch replaces the test for the define __LP64__ by a test for
- __SSE2__ since the former only provides information abouth whether
- the arch is 64 bit, while actually the availability of SSE2 needs
- to be tested.
-Author: Gert Wollny <gw.fossdev at gmail.com>
-Bug-Debian: http://bugs.debian.org/799471
-Forwarded: no
-Last-Update: 2015-09-19
-
---- plink1.9-1.90~b3w-150903.orig/Rsrv.h
-+++ plink1.9-1.90~b3w-150903/Rsrv.h
-@@ -355,11 +355,11 @@ typedef unsigned long rlen_t;
- #ifdef ULONG_MAX
- #define rlen_max ULONG_MAX
- #else
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define rlen_max 0xffffffffffffffffL
- #else
- #define rlen_max 0xffffffffL
--#endif /* __LP64__ */
-+#endif /* __SSE2__ */
- #endif /* ULONG_MAX */
-
-
---- plink1.9-1.90~b3w-150903.orig/SFMT.c
-+++ plink1.9-1.90~b3w-150903/SFMT.c
-@@ -48,7 +48,7 @@ extern "C" {
- #include <assert.h>
- #include "SFMT.h"
-
--#ifndef __LP64__
-+#ifndef __SSE2__
- inline static void do_recursion(w128_t * r, w128_t * a, w128_t * b,
- w128_t * c, w128_t * d);
- #endif
-@@ -110,7 +110,7 @@ inline static void lshift128(w128_t *out
- * @param c a 128-bit part of the internal state array
- * @param d a 128-bit part of the internal state array
- */
--#ifndef __LP64__
-+#ifndef __SSE2__
- inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
- w128_t *c, w128_t *d)
- {
-@@ -144,7 +144,7 @@ inline static uint32_t func1(uint32_t x)
- inline static uint32_t func2(uint32_t x);
- static void period_certification(sfmt_t * sfmt);
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- inline static void mm_recursion(__m128i * r, __m128i a, __m128i b,
- __m128i c, __m128i d);
-
-@@ -255,7 +255,7 @@ inline static int idxof(int i) {
- return i;
- }
-
--#ifndef __LP64__
-+#ifndef __SSE2__
- /**
- * This function fills the user-specified array with pseudorandom
- * integers.
-@@ -391,7 +391,7 @@ int sfmt_get_min_array_size64(sfmt_t * s
- return SFMT_N64;
- }
-
--#ifndef __LP64__
-+#ifndef __SSE2__
- /**
- * This function fills the internal state array with pseudorandom
- * integers.
---- plink1.9-1.90~b3w-150903.orig/SFMT.h
-+++ plink1.9-1.90~b3w-150903/SFMT.h
-@@ -128,7 +128,7 @@ extern "C" {
- /*------------------------------------------
- 128-bit SIMD like data type for standard C
- ------------------------------------------*/
--#ifdef __LP64__
-+#ifdef __SSE2__
- #include <emmintrin.h>
-
- /** 128-bit data structure */
---- plink1.9-1.90~b3w-150903.orig/plink.c
-+++ plink1.9-1.90~b3w-150903/plink.c
-@@ -98,7 +98,7 @@ const char ver_str[] =
- #ifdef NOLAPACK
- "NL"
- #endif
--#ifdef __LP64__
-+#ifdef __SSE2__
- " 64-bit"
- #else
- " 32-bit"
-@@ -1670,7 +1670,7 @@ int32_t plink(char* outname, char* outna
- wkspace_mark_postcluster = wkspace_base;
- ulii = (sample_ct * (sample_ct - 1)) >> 1;
- if (cluster_ptr->mds_dim_ct) {
--#ifndef __LP64__
-+#ifndef __SSE2__
- // catch 32-bit intptr_t overflow
- if (sample_ct > 23169) {
- goto plink_ret_NOMEM;
-@@ -1692,13 +1692,13 @@ int32_t plink(char* outname, char* outna
-
- if (cluster_ct) {
- ulii = cluster_ct + sample_ct - cluster_starts[cluster_ct];
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (ulii > 23169) {
- goto plink_ret_NOMEM;
- }
- #endif
- ulii = (ulii * (ulii - 1)) >> 1;
--#ifndef __LP64__
-+#ifndef __SSE2__
- } else if (sample_ct > 23169) {
- goto plink_ret_NOMEM;
- #endif
-@@ -3056,7 +3056,7 @@ int32_t init_delim_and_species(uint32_t
- break;
- case SPECIES_DOG:
- chrom_info_ptr->autosome_ct = 38;
--#ifdef __LP64__
-+#ifdef __SSE2__
- chrom_info_ptr->haploid_mask[0] = 0x18000000000LLU;
- #else
- chrom_info_ptr->haploid_mask[1] = 0x180;
-@@ -3064,7 +3064,7 @@ int32_t init_delim_and_species(uint32_t
- break;
- case SPECIES_HORSE:
- chrom_info_ptr->autosome_ct = 31;
--#ifdef __LP64__
-+#ifdef __SSE2__
- chrom_info_ptr->haploid_mask[0] = 0x300000000LLU;
- #else
- chrom_info_ptr->haploid_mask[1] = 3;
-@@ -8292,7 +8292,7 @@ int32_t main(int32_t argc, char** argv)
- sprintf(logbuf, "Error: Invalid --memory parameter '%s' (minimum %u).\n", argv[cur_arg + 1], WKSPACE_MIN_MB);
- goto main_ret_INVALID_CMDLINE_WWA;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (malloc_size_mb > 2047) {
- logerrprint("Error: --memory parameter too large for 32-bit version (max 2047).\n");
- goto main_ret_INVALID_CMDLINE;
-@@ -13200,7 +13200,7 @@ int32_t main(int32_t argc, char** argv)
- } else if (malloc_size_mb < WKSPACE_MIN_MB) {
- malloc_size_mb = WKSPACE_MIN_MB;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (malloc_size_mb > 2047) {
- malloc_size_mb = 2047;
- }
---- plink1.9-1.90~b3w-150903.orig/plink_assoc.c
-+++ plink1.9-1.90~b3w-150903/plink_assoc.c
-@@ -34,7 +34,7 @@ void single_marker_cc_freqs(uintptr_t sa
- uintptr_t loader2;
- uintptr_t loader3;
- uintptr_t loader4;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctl2 -= sample_ctl2 % 6;
-@@ -136,7 +136,7 @@ void single_marker_cc_3freqs(uintptr_t s
- uintptr_t loader;
- uintptr_t loader2;
- uintptr_t loader3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- sample_ctl2 -= sample_ctl2 % 12;
-@@ -786,7 +786,7 @@ void transpose_perms(uintptr_t* perm_vec
- // next 4 bytes: 32 40 48...
- uintptr_t sample_idx = 0;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t wbuf[4];
- uint32_t* wbptr;
- #else
-@@ -801,7 +801,7 @@ void transpose_perms(uintptr_t* perm_vec
- pvptr = &(perm_vecs[sample_idx / BITCT2]);
- rshift = 2 * (sample_idx % BITCT2);
- goto transpose_perms_loop_start;
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- if (!(perm_idx % 4)) {
- if (perm_idx % 128) {
-@@ -840,7 +840,7 @@ void transpose_perms(uintptr_t* perm_vec
- void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
- uintptr_t sample_idx = 0;
- uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t wbuf[4];
- uint32_t* wbptr;
- #else
-@@ -855,7 +855,7 @@ void transpose_perm1s(uintptr_t* perm_ve
- pvptr = &(perm_vecs[sample_idx / BITCT]);
- rshift = sample_idx % BITCT;
- goto transpose_perm1s_loop_start;
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- if (!(perm_idx % 4)) {
- if (perm_idx % 128) {
-@@ -919,7 +919,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- // is called.
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
- uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t perm_ct128x4 = perm_ct128 * 4;
- uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
-@@ -955,7 +955,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- uint32_t ujj;
- uint32_t ukk;
- uint32_t sample_type;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // 4- and 8-bit partial counts
- gitv[0] = (__m128i*)thread_wkspace;
- gitv[1] = &(((__m128i*)thread_wkspace)[perm_ct128x4]);
-@@ -992,7 +992,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- ujj = CTZLU(ulii) & (BITCT - 2); // get pos of next non-[hom A2] sample
- sample_type = ((ulii >> ujj) & 3) - 1;
- git_merge4 = gitv[sample_type];
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
- for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
- loader = *perm_ptr++;
-@@ -1067,7 +1067,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- #endif
- ulii &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[BITCT2 * perm_ct128]);
- #else
- permsv = &(permsv[BITCT2 * perm_ct32]);
-@@ -1075,7 +1075,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
- }
- for (sample_type = 0; sample_type < 3; sample_type++) {
- uii = cur_cts[sample_type];
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (uii % 15) {
- git_merge4 = gitv[sample_type];
- git_merge8 = gitv[sample_type + 3];
-@@ -1127,7 +1127,7 @@ void calc_git(uint32_t pheno_nm_ct, uint
-
- void calc_qgit(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -1160,7 +1160,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
- while (ulii) {
- ujj = CTZLU(ulii) & (BITCT - 2);
- sample_type = (ulii >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // note that the gain from using SSE2 for double-precision arithmetic is
- // typically minimal because modern cores tend to have two FPUs, so we
- // should only use it opportunistically. it's painless here, though.
-@@ -1220,7 +1220,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
- #endif
- ulii &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -1230,7 +1230,7 @@ void calc_qgit(uint32_t pheno_nm_ct, uin
-
- void calc_qgit_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ctcl8m, uint32_t num_perms_now, uintptr_t* __restrict__ loadbuf, double* perm_vecstd, double* thread_bufs) {
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -1263,7 +1263,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
- while (ulii) {
- ujj = CTZLU(ulii) & (BITCT - 2);
- sample_type = (ulii >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_readv = &(permsv[ujj * row_mult]);
- if (sample_type == 1) {
- git_writev = (__m128d*)thread_bufs;
-@@ -1306,7 +1306,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
- #endif
- ulii &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -1314,7 +1314,7 @@ void calc_qgit_lin(uint32_t pheno_nm_ct,
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t rem_cost_60v(__m128i* vec1, __m128i* vend, __m128i* vec2) {
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
-@@ -1522,7 +1522,7 @@ uintptr_t rem_cost(uintptr_t sample_ctv2
- uintptr_t detect_homcom;
- uintptr_t result_a;
- uintptr_t result_b;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctv2 -= sample_ctv2 % 6;
-@@ -1583,7 +1583,7 @@ uintptr_t qrem_cost2(uintptr_t sample_ct
- uintptr_t result_a;
- uintptr_t result_b;
- uintptr_t result_c;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 40;
- uintptr_t* lptr_4x_end;
- sample_ctl2 &= ~3LLU;
-@@ -1620,7 +1620,7 @@ uintptr_t qrem_cost2(uintptr_t sample_ct
- return cost;
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void calc_rem_merge4_one(uint32_t perm_ct128, __m128i* __restrict__ perm_ptr, __m128i* __restrict__ rem_merge4) {
- const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
- __m128i loader;
-@@ -1788,7 +1788,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- // low 8 bits give index of first remv[] array to increment; next 8 bits give
- // second index if nonzero, or indicate its absence
- const uint32_t idx_table[3][4] = {{0x300, 0x102, 4, 5}, {0x500, 2, 0x104, 3}, {0, 0x502, 0x304, 1}};
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t perm_ct128x4 = perm_ct128 * 4;
- uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
-@@ -1819,7 +1819,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- uint32_t uii;
- uint32_t ujj;
- uint32_t ukk;
--#ifdef __LP64__
-+#ifdef __SSE2__
- for (uii = 0; uii < 6; uii++) {
- remv[uii] = &(((__m128i*)thread_wkspace)[uii * perm_ct128x4]);
- }
-@@ -1860,7 +1860,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- idx1 = idx_table[cur_xor - 1][cur_raw];
- idx2 = idx1 >> 8;
- idx1 &= 255;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
- if (!idx2) {
- calc_rem_merge4_one(perm_ct128, perm_ptr, remv[idx1]);
-@@ -1917,7 +1917,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- #endif
- ulxor &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[BITCT2 * perm_ct128]);
- #else
- permsv = &(permsv[BITCT2 * perm_ct32]);
-@@ -1925,7 +1925,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- }
- for (idx1 = 0; idx1 < 6; idx1++) {
- uii = cur_cts[idx1];
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (uii % 15) {
- calc_rem_merge8(perm_ct32, remv[idx1], remv[idx1 + 6]);
- }
-@@ -1954,7 +1954,7 @@ void calc_rem(uint32_t pheno_nm_ct, uint
- void calc_qrem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
- uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -1993,7 +1993,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
- ujj = CTZLU(ulxor) & (BITCT - 2);
- cur_xor = (ulxor >> ujj) & 3;
- cur_raw = (ulraw1 >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_readv = &(permsv[ujj * row_mult]);
- rem_writev = (__m128d*)outbufs;
- rem_write2v = (__m128d*)(&(outbufs[perm_vec_ctcl8m]));
-@@ -2208,7 +2208,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
- #endif
- ulxor &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -2219,7 +2219,7 @@ void calc_qrem(uint32_t pheno_nm_ct, uin
- void calc_qrem_lin(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, uintptr_t* loadbuf_ref, double* perm_vecstd, double* outbufs) {
- uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
- uint32_t pheno_nm_ctl2x = (pheno_nm_ct + (BITCT2 - 1)) / BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // halve for 8 bytes vs. 16, halve again for ujj being double the sample idx
- uint32_t row_mult = perm_vec_ctcl8m / 4;
-
-@@ -2264,7 +2264,7 @@ void calc_qrem_lin(uint32_t pheno_nm_ct,
- ujj = CTZLU(ulxor) & (BITCT - 2);
- cur_xor = (ulxor >> ujj) & 3;
- cur_raw = (ulraw1 >> ujj) & 3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_readv = &(permsv[ujj * row_mult]);
- if (cur_raw == 3) {
- if (cur_xor == 1) {
-@@ -2589,7 +2589,7 @@ void calc_qrem_lin(uint32_t pheno_nm_ct,
- #endif
- ulxor &= ~((3 * ONELU) << ujj);
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[(BITCT2 / 2) * perm_vec_ctcl8m]);
- #else
- perm_vecstd = &(perm_vecstd[BITCT2 * perm_vec_ctcl8m]);
-@@ -3197,7 +3197,7 @@ THREAD_RET_TYPE assoc_maxt_thread(void*
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -3359,13 +3359,13 @@ THREAD_RET_TYPE assoc_maxt_thread(void*
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -3478,7 +3478,7 @@ THREAD_RET_TYPE assoc_set_thread(void* a
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -3556,13 +3556,13 @@ THREAD_RET_TYPE assoc_set_thread(void* a
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -4619,7 +4619,7 @@ THREAD_RET_TYPE model_maxt_domrec_thread
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -4760,13 +4760,13 @@ THREAD_RET_TYPE model_maxt_domrec_thread
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -4862,7 +4862,7 @@ THREAD_RET_TYPE model_set_domrec_thread(
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -4929,13 +4929,13 @@ THREAD_RET_TYPE model_set_domrec_thread(
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5104,7 +5104,7 @@ THREAD_RET_TYPE model_maxt_trend_thread(
- uintptr_t pheno_nm_ctv2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -5221,13 +5221,13 @@ THREAD_RET_TYPE model_maxt_trend_thread(
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5299,7 +5299,7 @@ THREAD_RET_TYPE model_set_trend_thread(v
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -5359,13 +5359,13 @@ THREAD_RET_TYPE model_set_trend_thread(v
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5542,7 +5542,7 @@ THREAD_RET_TYPE model_maxt_gen_thread(vo
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -5665,13 +5665,13 @@ THREAD_RET_TYPE model_maxt_gen_thread(vo
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -5971,7 +5971,7 @@ THREAD_RET_TYPE model_maxt_best_thread(v
- uint32_t pidx_offset = g_perms_done - perm_vec_ct;
- uint32_t model_fisher = g_model_fisher;
- uint32_t fisher_midp = g_fisher_midp;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -6112,13 +6112,13 @@ THREAD_RET_TYPE model_maxt_best_thread(v
- ldrefs[marker_idx] = ldref;
- }
- if (ldref == marker_bidx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, &(loadbuf[marker_bidx * pheno_nm_ctv2]), perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -6307,7 +6307,7 @@ THREAD_RET_TYPE model_set_best_thread(vo
- uint32_t assoc_thread_ct = g_assoc_thread_ct;
- uintptr_t perm_vec_ct = g_perm_vec_ct;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t* thread_git_wkspace = &(g_thread_git_wkspace[tidx * perm_ct128 * 288]);
- #else
-@@ -6381,13 +6381,13 @@ THREAD_RET_TYPE model_set_best_thread(vo
- git_homrar_cts = &(resultbuf[3 * marker_bidx * perm_vec_ctcl4m]);
- git_missing_cts = &(git_homrar_cts[perm_vec_ctcl4m]);
- git_het_cts = &(git_homrar_cts[2 * perm_vec_ctcl4m]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * (perm_vec_ctcl4m / 2));
- #else
- fill_ulong_zero((uintptr_t*)git_homrar_cts, 3 * perm_vec_ctcl4m);
- #endif
- calc_git(pheno_nm_ct, perm_vec_ct, loadbuf_cur, perm_vecst, git_homrar_cts, thread_git_wkspace);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct128 * 72);
- #else
- fill_ulong_zero((uintptr_t*)thread_git_wkspace, perm_ct64 * 72);
-@@ -6612,7 +6612,7 @@ int32_t model_assoc_set_test(pthread_t*
- join_threads(threads, assoc_thread_ct);
- g_assoc_thread_ct = max_thread_ct;
- g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = ((perm_vec_ct + 127) / 128) * 16;
- g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
- #else
-@@ -6622,7 +6622,7 @@ int32_t model_assoc_set_test(pthread_t*
- #endif
- g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
- transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
- #else
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
-@@ -7211,7 +7211,7 @@ int32_t model_assoc(pthread_t* threads,
- if (!g_ldrefs) {
- goto model_assoc_ret_NOMEM;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 3) / 4);
- #else
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 1) / 2);
-@@ -7408,7 +7408,7 @@ int32_t model_assoc(pthread_t* threads,
- ulii = (perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
- g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
- g_resultbuf = (uint32_t*)wkspace_alloc(perm_vec_ctcl4m * 3 * MODEL_BLOCKSIZE * sizeof(int32_t));
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = ((perm_vec_ct + 127) / 128) * 16;
- g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
- #else
-@@ -7418,7 +7418,7 @@ int32_t model_assoc(pthread_t* threads,
- #endif
- g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 72 * max_thread_ct);
- transpose_perms(g_perm_vecs, perm_vec_ct, pheno_nm_ct, g_perm_vecst);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 9 * max_thread_ct);
- #else
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 18 * max_thread_ct);
-@@ -9037,7 +9037,7 @@ int32_t qassoc(pthread_t* threads, FILE*
- if (!g_ldrefs) {
- goto qassoc_ret_NOMEM;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 3) / 4);
- #else
- fill_ulong_one((uintptr_t*)g_ldrefs, (marker_ct + 1) / 2);
-@@ -10433,7 +10433,7 @@ void calc_git_missing(uint32_t pheno_nm_
- // thread_wkspace[] is assumed to be zeroed out before this function is
- // called.
- uint32_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t perm_ct16 = (perm_vec_ct + 15) / 16;
- uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
- uint32_t perm_ct128x4 = perm_ct128 * 4;
-@@ -10466,7 +10466,7 @@ void calc_git_missing(uint32_t pheno_nm_
- uint32_t pbidx;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // 4- and 8-bit partial counts
- gitv[0] = &(((__m128i*)thread_wkspace)[8 * perm_ct128x4]);
- gitv[1] = &(((__m128i*)thread_wkspace)[9 * perm_ct128x4]);
-@@ -10488,7 +10488,7 @@ void calc_git_missing(uint32_t pheno_nm_
- while (ulii) {
- ujj = CTZLU(ulii);
- git_merge4 = gitv[0];
--#ifdef __LP64__
-+#ifdef __SSE2__
- perm_ptr = &(permsv[ujj * perm_ct128]);
- for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
- loader = *perm_ptr++;
-@@ -10561,13 +10561,13 @@ void calc_git_missing(uint32_t pheno_nm_
- #endif
- ulii &= ulii - 1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- permsv = &(permsv[BITCT * perm_ct128]);
- #else
- permsv = &(permsv[BITCT * perm_ct32]);
- #endif
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cur_ct % 15) {
- git_merge4 = gitv[0];
- git_merge8 = gitv[1];
-@@ -11325,7 +11325,7 @@ int32_t testmiss(pthread_t* threads, FIL
- if (perm_maxt) {
- ulii = (g_perm_vec_ct + (CACHELINE_DBL - 1)) / CACHELINE_DBL;
- g_maxt_thread_results = (double*)wkspace_alloc(max_thread_ct * ulii * CACHELINE);
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = ((g_perm_vec_ct + 127) / 128) * 16;
- g_perm_vecst = (uint32_t*)wkspace_alloc(ulii * pheno_nm_ct);
- #else
-@@ -11335,7 +11335,7 @@ int32_t testmiss(pthread_t* threads, FIL
- #endif
- g_thread_git_wkspace = (uint32_t*)wkspace_alloc(ulii * 44 * max_thread_ct);
- transpose_perm1s(g_perm_vecs, g_perm_vec_ct, pheno_nm_ct, g_perm_vecst);
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, (ulii / 2) * 11 * max_thread_ct);
- #else
- fill_ulong_zero((uintptr_t*)g_thread_git_wkspace, ulii * 11 * max_thread_ct);
-@@ -12552,7 +12552,7 @@ int32_t cmh2_assoc(FILE* bedfile, uintpt
- logerrprint("Error: --mh2 requires at least two cases and two controls.\n");
- goto cmh2_assoc_ret_INVALID_CMDLINE;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cluster_ct1 > 46341) {
- // might actually be ok, but play it safe in case LAPACK matrix inversion
- // routine has an integer overflow here
---- plink1.9-1.90~b3w-150903.orig/plink_calc.c
-+++ plink1.9-1.90~b3w-150903/plink_calc.c
-@@ -132,7 +132,7 @@ void update_rel_ibc(double* rel_ibc, uin
- double* weights2 = &(weights[128]);
- double* weights3 = &(weights[256]);
- double* weights4 = &(weights[320]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- double* weights5 = &(weights[384]);
- double* weights6 = &(weights[448]);
- double* weights7 = &(weights[512]);
-@@ -207,7 +207,7 @@ void update_rel_ibc(double* rel_ibc, uin
- }
- for (ukk = 0; ukk < (BITCT * 5) / 32; ukk++) {
- wtptr = &(wtarr[16 * ukk]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((ukk == 2) || (ukk == 7)) {
- for (uii = 0; uii < 8; uii++) {
- twt = wtptr[uii + 8];
-@@ -245,7 +245,7 @@ void update_rel_ibc(double* rel_ibc, uin
- }
- for (umm = 0; umm < sample_ct; umm++) {
- ulii = *geno++;
--#ifdef __LP64__
-+#ifdef __SSE2__
- *rel_ibc += weights9[ulii >> 57] + weights8[(ulii >> 51) & 63] + weights7[(ulii >> 44) & 127] + weights6[(ulii >> 38) & 63] + weights5[(ulii >> 32) & 63] + weights4[(ulii >> 25) & 63] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
- #else
- *rel_ibc += weights4[ulii >> 25] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
-@@ -263,7 +263,7 @@ void fill_subset_weights(double* subset_
- uint32_t uoo;
- double wtarr[MULTIPLEX_DIST_EXP / 2];
- double* wt;
--#ifdef __LP64__
-+#ifdef __SSE2__
- double twt[5];
- double twtf;
- __m128d* swpairs = (__m128d*)subset_weights;
-@@ -278,7 +278,7 @@ void fill_subset_weights(double* subset_
- memcpy(wtarr, main_weights, (MULTIPLEX_DIST_EXP / 2) * sizeof(double));
- for (uoo = 0; uoo < 2; uoo++) {
- wt = &(wtarr[7 * uoo]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- vfinal1 = _mm_set_pd(wt[0], 0.0);
- vfinal2 = _mm_set_pd(wt[0] * 2, wt[0]);
- #endif
-@@ -309,7 +309,7 @@ void fill_subset_weights(double* subset_
- if (unn & 1) {
- twt[4] += wt[2];
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- twtf = twt[4];
- vpen = _mm_set1_pd(twtf);
- *swpairs++ = _mm_add_pd(vpen, vfinal1);
-@@ -346,7 +346,7 @@ void fill_subset_weights(double* subset_
- }
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- for (uoo = 0; uoo < 3; uoo++) {
- wt = &(wtarr[14 + 6 * uoo]);
- vfinal1 = _mm_set_pd(wt[0], 0.0);
-@@ -415,7 +415,7 @@ void fill_subset_weights_r(double* subse
- double mean_m2;
- double mult = 1.0;
- double aux;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128d* swpairs = (__m128d*)subset_weights;
- __m128d vpen;
- __m128d vfinal1;
-@@ -492,7 +492,7 @@ void fill_subset_weights_r(double* subse
- }
- for (unn = 0; unn < BITCT / 16; unn++) {
- wtptr = &(wtarr[40 * unn]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- vfinal1 = _mm_load_pd(wtptr);
- vfinal2 = _mm_load_pd(&(wtptr[2]));
- vfinal3 = _mm_load_pd(&(wtptr[4]));
-@@ -506,7 +506,7 @@ void fill_subset_weights_r(double* subse
- twt3 = twt2 + wtptr[ukk + 16];
- for (umm = 0; umm < 8; umm++) {
- twt4 = twt3 + wtptr[umm + 8];
--#ifdef __LP64__
-+#ifdef __SSE2__
- vpen = _mm_set1_pd(twt4);
- *swpairs++ = _mm_add_pd(vpen, vfinal1);
- *swpairs++ = _mm_add_pd(vpen, vfinal2);
-@@ -572,7 +572,7 @@ static inline void collapse_copy_phenod_
- } while (target < target_end);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // XOR + mask variants of vectorized Lauradoux/Walisch popcount. (See
- // popcount_vecs() in plink_common.c for basic documentation.)
- // Note that the size of the popcounted buffer is a hardcoded constant
-@@ -852,7 +852,7 @@ void ibs_test_process_perms(uintptr_t* p
- do {
- sub_block_idx = 0;
- ulii = *perm_row_start++;
--#ifdef __LP64__
-+#ifdef __SSE2__
- dxx = psbuf[(uint8_t)ulii] + psbuf[256 + ((uint8_t)(ulii >> 8))] + psbuf[512 + ((uint8_t)(ulii >> 16))] + psbuf[768 + ((uint8_t)(ulii >> 24))] + psbuf[1024 + ((uint8_t)(ulii >> 32))] + psbuf[1280 + ((uint8_t)(ulii >> 40))] + psbuf[1536 + ((uint8_t)(ulii >> 48))] + psbuf[1792 + (ulii >> 56)];
- #else
- dxx = psbuf[(uint8_t)ulii] + psbuf[256 + ((uint8_t)(ulii >> 8))] + psbuf[512 + ((uint8_t)(ulii >> 16))] + psbuf[768 + (ulii >> 24)];
-@@ -969,7 +969,7 @@ THREAD_RET_TYPE ibs_test_thread(void* ar
- }
-
- void incr_dists_i(uint32_t* idists, uintptr_t* geno, uintptr_t* masks, uint32_t start_idx, uint32_t end_idx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* glptr;
- __m128i* glptr2;
- __m128i* mptr;
-@@ -986,7 +986,7 @@ void incr_dists_i(uint32_t* idists, uint
- uintptr_t mask_fixed;
- for (uii = start_idx; uii < end_idx; uii++) {
- jj = uii * (MULTIPLEX_2DIST / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- glptr = (__m128i*)geno;
- glptr2 = (__m128i*)(&(geno[jj]));
- lptr = &(masks[jj]);
-@@ -1104,7 +1104,7 @@ THREAD_RET_TYPE calc_ibs_thread(void* ar
- }
-
- void incr_genome(uint32_t* genome_main, uintptr_t* geno, uintptr_t* masks, uintptr_t sample_ct, uint32_t start_idx, uint32_t end_idx) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1161,14 +1161,14 @@ void incr_genome(uint32_t* genome_main,
- uintptr_t* marker_window_ptr;
- int32_t lowct2 = g_ctrl_ct * 2;
- int32_t highct2 = g_case_ct * 2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- glptr_end = (__m128i*)(&(geno[sample_ct * (GENOME_MULTIPLEX2 / BITCT)]));
- #else
- glptr_end = &(geno[sample_ct * (GENOME_MULTIPLEX2 / BITCT)]);
- #endif
- for (uii = start_idx; uii < end_idx; uii++) {
- ujj = uii * (GENOME_MULTIPLEX2 / BITCT);
--#ifdef __LP64__
-+#ifdef __SSE2__
- glptr_fixed = (__m128i*)(&(geno[ujj]));
- glptr = (__m128i*)(&(geno[ujj + (GENOME_MULTIPLEX2 / BITCT)]));
- lptr = &(masks[ujj]);
-@@ -1194,7 +1194,7 @@ void incr_genome(uint32_t* genome_main,
- glptr_back = (uintptr_t*)glptr;
- glptr_fixed_tmp = glptr_fixed;
- maskptr_fixed_tmp = maskptr_fixed;
--#ifdef __LP64__
-+#ifdef __SSE2__
- acc_ibs1.vi = _mm_setzero_si128();
- acc_ibs0.vi = _mm_setzero_si128();
- do {
-@@ -1356,7 +1356,7 @@ void incr_genome(uint32_t* genome_main,
- xor_ptr = xor_buf;
- glptr_back = (uintptr_t*)glptr;
- glptr_fixed_tmp = glptr_fixed;
--#ifdef __LP64__
-+#ifdef __SSE2__
- acc_ibs1.vi = _mm_setzero_si128();
- acc_ibs0.vi = _mm_setzero_si128();
- do {
-@@ -1549,7 +1549,7 @@ void incr_dists(double* dists, uintptr_t
- uintptr_t uljj;
- uintptr_t* mptr;
- double* weights1 = &(weights[16384]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- double* weights2 = &(weights[32768]);
- double* weights3 = &(weights[36864]);
- double* weights4 = &(weights[40960]);
-@@ -1561,7 +1561,7 @@ void incr_dists(double* dists, uintptr_t
- ulii = geno[uii];
- mptr = masks;
- mask_fixed = masks[uii];
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (mask_fixed == ~ZEROLU) {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = (*glptr++ ^ ulii) & (*mptr++);
-@@ -1628,7 +1628,7 @@ void incr_dists_r(double* dists, uintptr
- uintptr_t uljj;
- uintptr_t basemask;
- double* weights1 = &(weights[32768]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- double* weights2 = &(weights[65536]);
- double* weights3 = &(weights[98304]);
- #endif
-@@ -1642,7 +1642,7 @@ void incr_dists_r(double* dists, uintptr
- if (!basemask) {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = ((*glptr++) + ulii) | (*maskptr++);
--#ifdef __LP64__
-+#ifdef __SSE2__
- *dists += weights[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
- #else
- *dists += weights[(uint16_t)uljj] + weights1[uljj >> 16];
-@@ -1652,7 +1652,7 @@ void incr_dists_r(double* dists, uintptr
- } else {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = ((*glptr++) + ulii) | ((*maskptr++) | basemask);
--#ifdef __LP64__
-+#ifdef __SSE2__
- *dists += weights[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
- #else
- *dists += weights[(uint16_t)uljj] + weights1[uljj >> 16];
-@@ -1803,7 +1803,7 @@ void pick_d(unsigned char* cbuf, uint32_
- uint32_t ujj;
- uint32_t ukk;
- memset(cbuf, 0, ct);
--#ifdef __LP64__
-+#ifdef __SSE2__
- ukk = (uint32_t)(0x100000000LLU % ct);
- #else
- ukk = 2 * (0x80000000U % ct);
-@@ -2116,14 +2116,14 @@ void matrix_const_mult_add(uint32_t samp
- uint32_t loop_end = sample_ct - 1;
- uint32_t ujj;
- double* dptr = matrix;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128d* vptr;
- __m128d v_mult_val = _mm_set1_pd(mult_val);
- #endif
- for (uii = 0; uii < loop_end; uii++) {
- *dptr = (*dptr) * mult_val + add_val;
- dptr++;
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((uintptr_t)dptr & 8) {
- *dptr *= mult_val;
- dptr++;
-@@ -2727,7 +2727,7 @@ int32_t ibs_test_calc(pthread_t* threads
- double perm_ct_recip;
- uintptr_t ulii;
- uintptr_t uljj = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128d* rvptr1;
- __m128d* rvptr2;
- #else
-@@ -2802,7 +2802,7 @@ int32_t ibs_test_calc(pthread_t* threads
- ctrl_ctrl_ssq += g_calc_result[tidx][1];
- ctrl_case_ssq += g_calc_result[tidx][2];
- case_case_ssq += g_calc_result[tidx][3];
--#ifdef __LP64__
-+#ifdef __SSE2__
- rvptr1 = (__m128d*)perm_results;
- rvptr2 = (__m128d*)(&(perm_results[2 * perm_ctcldm * tidx]));
- for (perm_idx = 0; perm_idx < perm_ct; perm_idx++) {
-@@ -4542,7 +4542,7 @@ int32_t distance_d_write(FILE** outfile_
- } else {
- if (shape == DISTANCE_SQ0) {
- // assume little-endian
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = 0x3009300930093009LLU;
- #else
- ulii = 0x30093009;
-@@ -5923,7 +5923,7 @@ int32_t rel_cutoff_batch(uint32_t load_g
- fclose_null(&idfile);
- ullii = sample_ct;
- ullii = ((ullii * (ullii - 1)) / 2 + BITCT - 1) / BITCT;
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (ullii >= 0x20000000) {
- goto rel_cutoff_batch_ret_NOMEM;
- }
-@@ -7353,7 +7353,7 @@ int32_t calc_rel(pthread_t* threads, uin
- // cptr2[uii] = '\t';
- // cptr2[uii + 1] = '0';
- // }
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = 0x3009300930093009LLU;
- #else
- ulii = 0x30093009LU;
-@@ -8077,7 +8077,7 @@ int32_t calc_distance(pthread_t* threads
- llxx = g_thread_start[dist_thread_ct];
- llxx = ((llxx * (llxx - 1)) - (int64_t)g_thread_start[0] * (g_thread_start[0] - 1)) / 2;
- dists_alloc = llxx * sizeof(double);
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (dists_alloc > 0x7fffffff) {
- goto calc_distance_ret_NOMEM;
- }
-@@ -8236,7 +8236,7 @@ int32_t calc_distance(pthread_t* threads
- goto calc_distance_ret_NOMEM;
- }
- if (main_weights) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (wkspace_alloc_d_checked(&subset_weights, 45056 * sizeof(double))) {
- goto calc_distance_ret_NOMEM;
- }
-@@ -8424,7 +8424,7 @@ int32_t calc_distance(pthread_t* threads
- *giptr3 += wtbuf[umm + ukk];
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii ^= FIVEMASK;
- *glptr++ = ulii;
- ulii = (ulii | (ulii >> 1)) & FIVEMASK;
-@@ -8725,7 +8725,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- // as a special case in the future.
- FILE* outfile = NULL;
- uint32_t* cluster_sorted_ibs_indices = NULL;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // uint64_t* cluster_sorted_ibs_indices_big = NULL;
- #endif
- uint32_t* sample_to_cluster = NULL;
-@@ -9457,7 +9457,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- logprint("Clustering...");
- printf(" [sorting IB%c values]", cluster_missing? 'M' : 'S');
- fflush(stdout);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cur_cluster_ct <= 65536) {
- #endif
- // Objective: Produce a list of inter-cluster IBS values sorted in
-@@ -9488,7 +9488,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- // f(0) = 1
- // f(1) = f(2) = 2
- // f(3) = f(4) = f(5) = 3... (triangle_divide() with different rounding)
--#ifdef __LP64__
-+#ifdef __SSE2__
- umm = (int32_t)sqrt((intptr_t)(tcoord * 2));
- #else
- umm = (int32_t)sqrt(2 * ((double)((intptr_t)tcoord)));
-@@ -9601,7 +9601,7 @@ int32_t calc_cluster_neighbor(pthread_t*
- cluster_index[tri_coord_no_diag_32(ukk & 65535, ukk >> 16)] = uii + 1;
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- } else {
- logerrprint("Error: --cluster cannot handle >65536 initial clusters yet.\n");
- retval = RET_CALC_NOT_YET_SUPPORTED;
---- plink1.9-1.90~b3w-150903.orig/plink_cnv.c
-+++ plink1.9-1.90~b3w-150903/plink_cnv.c
-@@ -13,7 +13,7 @@ int32_t cnv_subset_load(char* subset_fna
- logerrprint("Error: Empty --cnv-subset file.\n");
- goto cnv_subset_load_ret_INVALID_FORMAT;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (((uint64_t)subset_ct) * max_subset_name_len > 0x7fffffffLLU) {
- goto cnv_subset_load_ret_NOMEM;
- }
-@@ -625,7 +625,7 @@ int32_t cnv_make_map(FILE* cnvfile, char
- }
- for (ulii = 1; ulii < raw_marker_ct; ulii++) {
- if (marker_pos_arr[ulii] != llii) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((++distinct_marker_ct) == 0x80000000U) {
- logprint("\n");
- logerrprint("Error: Too many distinct .cnv.map positions (max 2^31 - 1).\n");
-@@ -983,7 +983,7 @@ int32_t plink_cnv(char* outname, char* o
- }
- }
- ulii = marker_chrom_start[chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct];
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (((uint64_t)ulii) * max_marker_id_len > 0x7fffffffLLU) {
- goto plink_cnv_ret_NOMEM;
- }
---- plink1.9-1.90~b3w-150903.orig/plink_common.c
-+++ plink1.9-1.90~b3w-150903/plink_common.c
-@@ -30,7 +30,7 @@ uintptr_t g_sample_ct;
- uint32_t g_thread_ct;
-
- uint32_t aligned_malloc(uintptr_t** aligned_pp, uintptr_t size) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- // Avoid random segfaults on 64-bit machines which have 8-byte- instead of
- // 16-byte-aligned malloc(). (Slightly different code is needed if malloc()
- // does not even guarantee 8-byte alignment.)
-@@ -51,7 +51,7 @@ uint32_t aligned_malloc(uintptr_t** alig
- }
-
- void aligned_free(uintptr_t* aligned_pp) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- free((uintptr_t*)(aligned_pp[-1]));
- #else
- free(aligned_pp);
-@@ -3349,7 +3349,7 @@ uint32_t next_unset_unsafe(uintptr_t* bi
- return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(~ulii);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
-@@ -3383,7 +3383,7 @@ uint32_t next_unset(uintptr_t* bit_arr,
- return MINV(loc, ceil);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (~(*bit_arr_ptr)) >> (loc % BITCT);
-@@ -3416,7 +3416,7 @@ uint32_t next_set_unsafe(uintptr_t* bit_
- return ((uintptr_t)(bit_arr_ptr - bit_arr)) * BITCT + CTZLU(ulii);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
-@@ -3450,7 +3450,7 @@ uint32_t next_set(uintptr_t* bit_arr, ui
- return MINV(rval, ceil);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
- uintptr_t* bit_arr_ptr = &(bit_arr[loc / BITCT]);
- uintptr_t ulii = (*bit_arr_ptr) >> (loc % BITCT);
-@@ -3718,7 +3718,7 @@ int32_t populate_id_htable(uintptr_t unf
- }
- }
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (wkspace_left >= 0x400000000LLU) {
- max_extra_alloc = 0xfffffffeU;
- } else {
-@@ -3888,7 +3888,7 @@ void fill_vec_55(uintptr_t* vec, uint32_
- uint32_t ctl = 2 * ((ct + (BITCT - 1)) / BITCT);
- uint32_t rem = ct & (BITCT - 1);
- uintptr_t* second_to_last = &(vec[ctl - 2]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vecp = (__m128i*)vec;
- __m128i* vec_end = (__m128i*)(&(vec[ctl]));
-@@ -4009,7 +4009,7 @@ void sample_delim_convert(uintptr_t unfi
- void get_set_wrange_align(uintptr_t* bitfield, uintptr_t word_ct, uintptr_t* firstw_ptr, uintptr_t* wlen_ptr) {
- uintptr_t* bitfield_ptr = bitfield;
- uintptr_t* bitfield_end = &(bitfield[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t* bitfield_end2 = &(bitfield[word_ct & (~ONELU)]);
- while (bitfield_ptr < bitfield_end2) {
- if (bitfield_ptr[0] || bitfield_ptr[1]) {
-@@ -5223,7 +5223,7 @@ void bitfield_exclude_to_include(uintptr
- void bitfield_and(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
- // vv := vv AND include_vec
- // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* iv128 = (__m128i*)include_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5247,7 +5247,7 @@ void bitfield_andnot(uintptr_t* vv, uint
- // vv := vv ANDNOT exclude_vec
- // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
- // note that this is the reverse of the _mm_andnot() operand order
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* ev128 = (__m128i*)exclude_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5270,7 +5270,7 @@ void bitfield_andnot(uintptr_t* vv, uint
- void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
- // vv := (~vv) AND include_vec
- // on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* iv128 = (__m128i*)include_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5294,7 +5294,7 @@ void bitfield_andnot_reversed_args(uintp
- void bitfield_or(uintptr_t* vv, uintptr_t* or_vec, uintptr_t word_ct) {
- // vv := vv OR include_vec
- // on 64-bit systems, assumes vv and include_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vv128 = (__m128i*)vv;
- __m128i* ov128 = (__m128i*)or_vec;
- __m128i* vv128_end = &(vv128[word_ct / 2]);
-@@ -5317,7 +5317,7 @@ void bitfield_or(uintptr_t* vv, uintptr_
- void bitfield_ornot(uintptr_t* vv, uintptr_t* inverted_or_vec, uintptr_t word_ct) {
- // vv := vv OR (~inverted_or_vec)
- // on 64-bit systems, assumes vv and inverted_or_vec are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- #ifdef __APPLE__
- const __m128i all1 = {0xffffffffffffffffLLU, 0xffffffffffffffffLLU};
- #else
-@@ -5345,7 +5345,7 @@ void bitfield_ornot(uintptr_t* vv, uintp
- void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct) {
- // bit_arr := bit_arr XOR xor_arr
- // on 64-bit systems, assumes bit_arr and xor_arr are 16-byte aligned
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* bitv128 = (__m128i*)bit_arr;
- __m128i* xorv128 = (__m128i*)xor_arr;
- __m128i* bitv128_end = &(bitv128[word_ct / 2]);
-@@ -5539,7 +5539,7 @@ uint32_t has_three_genotypes(uintptr_t*
- }
- */
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // Basic SSE2 implementation of Lauradoux/Walisch popcount.
- static inline uintptr_t popcount_vecs(__m128i* vptr, uintptr_t ct) {
- // popcounts vptr[0..(ct-1)]. Assumes ct is a multiple of 3 (0 ok).
-@@ -5730,7 +5730,7 @@ uintptr_t popcount_longs(uintptr_t* lptr
- // index.
- uintptr_t tot = 0;
- uintptr_t* lptr_end = &(lptr[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t six_ct;
- __m128i* vptr;
- vptr = (__m128i*)lptr;
-@@ -5788,7 +5788,7 @@ uintptr_t popcount2_longs(uintptr_t* lpt
- // treats lptr[] as an array of two-bit instead of one-bit numbers
- uintptr_t tot = 0;
- uintptr_t* lptr_end = &(lptr[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t twelve_ct;
- __m128i* vptr;
- vptr = (__m128i*)lptr;
-@@ -6035,7 +6035,7 @@ uintptr_t jump_forward_unset_unsafe(uint
- uintptr_t* bptr = &(bit_arr[widx]);
- uintptr_t uljj;
- uintptr_t ulkk;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vptr;
- #endif
- if (ulii) {
-@@ -6055,7 +6055,7 @@ uintptr_t jump_forward_unset_unsafe(uint
- bptr++;
- }
- ulii = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (widx & 1) {
- uljj = ~(*bptr);
- ulkk = popcount_long(uljj);
-@@ -6101,7 +6101,7 @@ uintptr_t popcount_longs_exclude(uintptr
- // N.B. on 64-bit systems, assumes lptr and exclude_arr are 16-byte aligned.
- uintptr_t tot = 0;
- uintptr_t* lptr_end = &(lptr[end_idx]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t six_ct = end_idx / 6;
- tot += popcount_vecs_exclude((__m128i*)lptr, (__m128i*)exclude_arr, six_ct * 3);
- lptr = &(lptr[six_ct * 6]);
-@@ -6151,7 +6151,7 @@ uintptr_t popcount_longs_exclude(uintptr
- uintptr_t popcount_longs_intersect(uintptr_t* lptr1, uintptr_t* lptr2, uintptr_t word_ct) {
- uintptr_t tot = 0;
- uintptr_t* lptr1_end = &(lptr1[word_ct]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t six_ct = word_ct / 6;
- tot += popcount_vecs_intersect((__m128i*)lptr1, (__m128i*)lptr2, six_ct * 3);
- lptr1 = &(lptr1[six_ct * 6]);
-@@ -6213,7 +6213,7 @@ void vertical_bitct_subtract(uintptr_t*
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp) {
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -6594,7 +6594,7 @@ void count_3freq_12(uintptr_t* lptr, uin
- }
- #endif
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void count_set_freq_60v(__m128i* vptr, __m128i* vend, __m128i* include_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -7310,7 +7310,7 @@ void vec_set_freq(uintptr_t sample_ctl2,
- uintptr_t missing_incr;
- uint32_t acc = 0;
- uint32_t accm = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctl2 -= sample_ctl2 % 6;
-@@ -7356,7 +7356,7 @@ void vec_set_freq_x(uintptr_t sample_ctl
- uintptr_t missing_incr;
- uint32_t acc = 0;
- uint32_t accm = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 60;
- uintptr_t* lptr_6x_end;
- sample_ctl2 -= sample_ctl2 % 6;
-@@ -7408,7 +7408,7 @@ void vec_set_freq_y(uintptr_t sample_ctl
- uintptr_t loader4;
- uint32_t acc = 0;
- uint32_t accm = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- sample_ctl2 -= sample_ctl2 % 12;
-@@ -7455,7 +7455,7 @@ void vec_3freq(uintptr_t sample_ctl2, ui
- uint32_t acc_even = 0;
- uint32_t acc_odd = 0;
- uint32_t acc_and = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- sample_ctl2 -= sample_ctl2 % 12;
-@@ -7497,7 +7497,7 @@ uintptr_t count_01(uintptr_t* lptr, uint
- // unlike popcount01_longs, this does not assume lptr[] has no 11s
- uintptr_t* lptr_end = &(lptr[word_ct]);
- uintptr_t loader;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t acc;
- word_ct -= word_ct % 12;
- acc = count_01_vecs((__m128i*)lptr, word_ct / 2);
-@@ -7818,7 +7818,7 @@ void reverse_loadbuf(unsigned char* load
- uint32_t* loadbuf_alias32;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* loadbuf_alias;
- __m128i vii;
-@@ -8044,7 +8044,7 @@ void vec_include_init(uintptr_t unfilter
- ulmm = FIVEMASK;
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8092,7 +8092,7 @@ void exclude_to_vec_include(uintptr_t un
- ulmm = FIVEMASK;
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8133,7 +8133,7 @@ void vec_init_invert(uintptr_t entry_ct,
- uint32_t vec_wsize = 2 * ((entry_ct + (BITCT - 1)) / BITCT);
- uintptr_t* second_to_last = &(target_arr[vec_wsize - 2]);
- uint32_t rem = entry_ct & (BITCT - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* tptr = (__m128i*)target_arr;
- __m128i* sptr = (__m128i*)source_arr;
-@@ -8158,7 +8158,7 @@ void vec_init_invert(uintptr_t entry_ct,
- void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr) {
- // target_arr := source_arr ANDNOT exclude_arr
- // may write an extra word
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* tptr = (__m128i*)target_arr;
- __m128i* sptr = (__m128i*)source_arr;
- __m128i* xptr = (__m128i*)exclude_arr;
-@@ -8187,7 +8187,7 @@ void vec_include_mask_in(uintptr_t unfil
- ulmm = include_arr[1];
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8225,7 +8225,7 @@ void vec_include_mask_out(uintptr_t unfi
- ulmm = include_arr[1];
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8263,7 +8263,7 @@ void vec_include_mask_out_intersect(uint
- ulmm = include_arr[1];
- if (ulii) {
- uljj = ulii >> BITCT2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii &= 0xffffffffLLU;
- #else
- ulii &= 0xffffLU;
-@@ -8290,7 +8290,7 @@ void vec_include_mask_out_intersect(uint
-
- void vec_init_01(uintptr_t unfiltered_sample_ct, uintptr_t* data_ptr, uintptr_t* result_ptr) {
- // initializes result_ptr bits 01 iff data_ptr bits are 01
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vec2_read = (__m128i*)data_ptr;
- __m128i* read_end = &(vec2_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
-@@ -8313,7 +8313,7 @@ void vec_init_01(uintptr_t unfiltered_sa
- void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2) {
- uintptr_t* vec2_last = &(vec2[unfiltered_sample_ct / BITCT2]);
- uint32_t remainder = unfiltered_sample_ct & (BITCT2 - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vec2_128 = (__m128i*)vec2;
- __m128i* vec2_last128 = &(vec2_128[unfiltered_sample_ct / BITCT]);
-@@ -8342,7 +8342,7 @@ void vec_datamask(uintptr_t unfiltered_s
- // sets result_vec bits to 01 iff data_ptr bits are equal to matchval and
- // vec_ptr bit is set, 00 otherwise.
- // currently assumes matchval is not 1.
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* data_read = (__m128i*)data_ptr;
- __m128i* mask_read = (__m128i*)mask_ptr;
- __m128i* data_read_end = &(data_read[(unfiltered_sample_ct + (BITCT - 1)) / BITCT]);
-@@ -8354,7 +8354,7 @@ void vec_datamask(uintptr_t unfiltered_s
- #endif
- if (matchval) {
- if (matchval == 2) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- loader = *data_read++;
- *writer++ = _mm_and_si128(_mm_andnot_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
-@@ -8366,7 +8366,7 @@ void vec_datamask(uintptr_t unfiltered_s
- } while (data_ptr < data_read_end);
- #endif
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- loader = *data_read++;
- *writer++ = _mm_and_si128(_mm_and_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
-@@ -8379,7 +8379,7 @@ void vec_datamask(uintptr_t unfiltered_s
- #endif
- }
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- loader = *data_read++;
- *writer++ = _mm_andnot_si128(_mm_or_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
-@@ -8395,7 +8395,7 @@ void vec_datamask(uintptr_t unfiltered_s
-
- /*
- void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- __m128i* vptr = (__m128i*)lptr;
- __m128i* vend = (__m128i*)(&(lptr[word_ct]));
-@@ -8512,7 +8512,7 @@ void hh_reset(unsigned char* loadbuf, ui
- uint32_t* loadbuf_alias32;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t* sample_include2_alias32;
- __m128i* loadbuf_alias;
- __m128i* iivp;
-@@ -8576,7 +8576,7 @@ void hh_reset_y(unsigned char* loadbuf,
- uint32_t uii;
- uint32_t ujj;
- uint32_t ukk;
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- uint32_t* sample_include2_alias32;
- uint32_t* sample_male_include2_alias32;
-@@ -8730,7 +8730,7 @@ void force_missing(unsigned char* loadbu
- uint32_t* loadbuf_alias32;
- uint32_t uii;
- uint32_t ujj;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t* force_missing_include2_alias32;
- __m128i* loadbuf_alias;
- __m128i* fmivp;
---- plink1.9-1.90~b3w-150903.orig/plink_common.h
-+++ plink1.9-1.90~b3w-150903/plink_common.h
-@@ -63,13 +63,13 @@
- #endif
-
- #ifdef _WIN64
-- #define __LP64__
-+ #define __SSE2__
- #define CTZLU __builtin_ctzll
- #define CLZLU __builtin_clzll
- #else
- #define CTZLU __builtin_ctzl
- #define CLZLU __builtin_clzl
-- #ifndef __LP64__
-+ #ifndef __SSE2__
- #ifndef uintptr_t
- #define uintptr_t unsigned long
- #endif
-@@ -83,7 +83,7 @@
- #include <algorithm>
- #endif
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #include <emmintrin.h>
- #define FIVEMASK 0x5555555555555555LLU
- typedef union {
-@@ -120,7 +120,7 @@
-
- #endif // Win64
-
--#else // not __LP64__
-+#else // not __SSE2__
-
- #define FIVEMASK 0x55555555
- #define ZEROLU 0LU
-@@ -133,7 +133,7 @@
- #endif
- #define PRIxPTR2 "08lx"
-
--#endif // __LP64__
-+#endif // __SSE2__
-
- #include <zlib.h>
- #include "SFMT.h"
-@@ -601,7 +601,7 @@
- #define MAX_THREADS_P1 513
- #endif
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define BITCT 64
- #else
- #define BITCT 32
-@@ -647,7 +647,7 @@
- #define JACKKNIFE_VALS_DIST 5
- #define JACKKNIFE_VALS_GROUPDIST 3
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define AAAAMASK 0xaaaaaaaaaaaaaaaaLLU
- // number of snp-major .bed lines to read at once for distance calc if
- // exponent is nonzero.
-@@ -679,7 +679,7 @@
- #define HASHSIZE 524287
- #define HASHSIZE_S 524287
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define HASHMEM 4194304
- #define HASHMEM_S 4194304
- #else
-@@ -779,7 +779,7 @@ typedef union {
-
- typedef union {
- double dd;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t uu[1];
- #else
- uintptr_t uu[2];
-@@ -1031,7 +1031,7 @@ static inline char* skip_initial_spaces(
- /*
- static inline int32_t is_space_or_eoln(unsigned char cc) {
- // ' ', \t, \n, \0, \r
--#ifdef __LP64__
-+#ifdef __SSE2__
- return (ucc <= 32) && (0x100002601LLU & (1LLU << ucc));
- #else
- return ((ucc <= 32) && ((ucc == ' ') || (0x2601LU & (ONELU << ucc))));
-@@ -1526,7 +1526,7 @@ static inline void next_unset_unsafe_ck(
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
- #else
- static inline uintptr_t next_unset_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-@@ -1548,7 +1548,7 @@ static inline void next_unset_ck(uintptr
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
- #else
- static inline uintptr_t next_unset_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-@@ -1570,7 +1570,7 @@ static inline void next_set_unsafe_ck(ui
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc);
- #else
- static inline uintptr_t next_set_ul_unsafe(uintptr_t* bit_arr, uintptr_t loc) {
-@@ -1592,7 +1592,7 @@ static inline void next_set_ck(uintptr_t
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil);
- #else
- static inline uintptr_t next_set_ul(uintptr_t* bit_arr, uintptr_t loc, uintptr_t ceil) {
-@@ -1639,7 +1639,7 @@ static inline void fill_ulong_zero(uintp
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
- fill_ulong_zero((uintptr_t*)ullarr, size);
- }
-@@ -1663,7 +1663,7 @@ static inline void fill_ulong_one(uintpt
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void fill_ull_one(uint64_t* ullarr, size_t size) {
- fill_ulong_one((uintptr_t*)ullarr, size);
- }
-@@ -1812,7 +1812,7 @@ void get_set_wrange_align(uintptr_t* bit
- #define CHROM_XY (MAX_POSSIBLE_CHROM + 2)
- #define CHROM_MT (MAX_POSSIBLE_CHROM + 3)
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // dog requires 42 bits, and other species require less
- #define CHROM_MASK_INITIAL_WORDS 1
- #else
-@@ -2046,7 +2046,7 @@ void bitfield_ornot(uintptr_t* vv, uintp
- void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct);
-
- static inline uint32_t popcount2_long(uintptr_t val) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- val = (val & 0x3333333333333333LLU) + ((val >> 2) & 0x3333333333333333LLU);
- return (((val + (val >> 4)) & 0x0f0f0f0f0f0f0f0fLLU) * 0x0101010101010101LLU) >> 56;
- #else
-@@ -2071,7 +2071,7 @@ uint32_t less_than_two_genotypes(uintptr
-
- uintptr_t popcount_longs(uintptr_t* lptr, uintptr_t word_ct);
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline uintptr_t popcount_longs_nzbase(uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
- uintptr_t prefix_ct = 0;
- if (start_idx & 1) {
-@@ -2112,7 +2112,7 @@ uintptr_t popcount_longs_intersect(uintp
-
- void vertical_bitct_subtract(uintptr_t* bit_arr, uint32_t item_ct, uint32_t* sum_arr);
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void count_2freq_dbl_60v(__m128i* vptr, __m128i* vend, __m128i* mask1vp, __m128i* mask2vp, uint32_t* ct1abp, uint32_t* ct1cp, uint32_t* ct2abp, uint32_t* ct2cp);
-
- void count_3freq_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* ctap, uint32_t* ctbp, uint32_t* ctcp);
---- plink1.9-1.90~b3w-150903.orig/plink_data.c
-+++ plink1.9-1.90~b3w-150903/plink_data.c
-@@ -2222,7 +2222,7 @@ int32_t zero_cluster_init(char* zerofnam
- if (!marker_bitfield_tmp) {
- goto zero_cluster_init_ret_NOMEM;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
- #else
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
-@@ -3203,7 +3203,7 @@ int32_t make_bed_me_missing_one_marker(F
- }
-
- void zeropatch(uintptr_t sample_ctv2, uintptr_t cluster_ct, uintptr_t* cluster_zc_masks, uint32_t** zcdefs, uintptr_t* patchbuf, uintptr_t marker_idx, uintptr_t* writebuf) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* writevec = (__m128i*)writebuf;
- __m128i* patchvec = (__m128i*)patchbuf;
- __m128i* patchvec_end = (__m128i*)(&(patchbuf[sample_ctv2]));
-@@ -3227,7 +3227,7 @@ void zeropatch(uintptr_t sample_ctv2, ui
- if (!at_least_one_cluster) {
- return;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- do {
- vec1 = *writevec;
- vec2 = *patchvec++;
-@@ -3246,7 +3246,7 @@ void zeropatch(uintptr_t sample_ctv2, ui
-
- void reverse_subset(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t word_ct) {
- // reverse_loadbuf() variant that requires subset_vec2 bit to be set
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* wvec = (__m128i*)writebuf;
- __m128i* svec = (__m128i*)subset_vec2;
- __m128i* wvec_end = (__m128i*)(&(writebuf[word_ct]));
-@@ -3273,7 +3273,7 @@ void reverse_subset(uintptr_t* writebuf,
-
- void replace_missing_a2(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t word_ct) {
- // 01 -> 11 for each set bit in subset_vec2
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* wvec = (__m128i*)writebuf;
- __m128i* svec = (__m128i*)subset_vec2;
- __m128i* wvec_end = (__m128i*)(&(writebuf[word_ct]));
-@@ -5139,7 +5139,7 @@ int32_t incr_text_allele0(char cc, char*
-
- typedef struct ll_str_fixed_struct {
- struct ll_str_struct* next;
--#ifdef __LP64__
-+#ifdef __SSE2__
- char ss[8];
- #else
- char ss[12];
-@@ -11630,7 +11630,7 @@ uint32_t valid_vcf_allele_code(const cha
- uii -= 64;
- // A = 1, C = 3, G = 7, N = 14, T = 20, so (0x10408a >> ucc) & 1 works as a
- // set membership test
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((uii > 63) || (!((0x10408a0010408aLLU >> uii) & 1))) {
- // if '[', ']', or '.', assume breakend
- return ((uii == 27) || (uii == 29) || (uii == 0xffffffeeU))? 1 : 0;
-@@ -15707,7 +15707,7 @@ int32_t merge_datasets(char* bedname, ch
- logerrprint("Warning: --merge-list file is empty.\n");
- }
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (ullxx > 0x7fffffff) {
- goto merge_datasets_ret_NOMEM;
- }
-@@ -15807,7 +15807,7 @@ int32_t merge_datasets(char* bedname, ch
- max_cur_sample_ct = cur_sample_ct;
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (ullxx > 0x7fffffff) {
- sprintf(logbuf, "Error: Too many %s (max 2147483647).\n", g_species_plural);
- goto merge_datasets_ret_INVALID_FORMAT_2;
-@@ -16045,7 +16045,7 @@ int32_t merge_datasets(char* bedname, ch
- if (position_warning_ct > 3) {
- fprintf(stderr, "%" PRIu64 " more multiple-position warning%s: see log file.\n", position_warning_ct - 3, (position_warning_ct == 4)? "" : "s");
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (ullxx > 0x7fffffff) {
- logerrprint("Error: Too many variants (max 2147483647).\n");
- goto merge_datasets_ret_INVALID_FORMAT;
---- plink1.9-1.90~b3w-150903.orig/plink_dosage.c
-+++ plink1.9-1.90~b3w-150903/plink_dosage.c
-@@ -177,7 +177,7 @@ int32_t dosage_load_score_files(Score_in
- logerrprint("Error: --score does not support >= 2^30 variants.\n");
- goto dosage_load_score_files_ret_INVALID_FORMAT;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (allele_code_buf_len > 0x7fffffff) {
- goto dosage_load_score_files_ret_NOMEM;
- }
-@@ -1762,7 +1762,7 @@ int32_t plink1_dosage(Dosage_info* doip,
- if (load_map) {
- marker_idx = id_htable_find(bufptr, slen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
- if (marker_idx == 0xffffffffU) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- marker_idx = ~ZEROLU;
- #endif
- continue;
---- plink1.9-1.90~b3w-150903.orig/plink_family.c
-+++ plink1.9-1.90~b3w-150903/plink_family.c
-@@ -728,7 +728,7 @@ int32_t mendel_error_scan(Family_info* f
- uint32_t* error_cts_tmp;
- uint32_t* error_cts_tmp2;
- uint32_t* uiptr;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* vptr;
- __m128i* vptr2;
- #endif
-@@ -993,7 +993,7 @@ int32_t mendel_error_scan(Family_info* f
- }
- if ((cur_error_ct <= var_error_max) || (!var_first)) {
- if (var_first) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- vptr = (__m128i*)error_cts_tmp;
- vptr2 = (__m128i*)error_cts_tmp2;
- for (trio_idx = 0; trio_idx < trio_ct4; trio_idx++) {
-@@ -3125,7 +3125,7 @@ int32_t dfam(pthread_t* threads, FILE* b
- if (retval) {
- goto dfam_ret_1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if ((12 * sample_ct + 2 * family_ct) > 0xffffffffLLU) {
- logerrprint("Error: Too many samples and families for DFAM test.\n");
- goto dfam_ret_INVALID_CMDLINE;
-@@ -4429,7 +4429,7 @@ int32_t qfam(pthread_t* threads, FILE* b
- goto qfam_ret_1;
- }
- g_family_ct = family_ct;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // no need to check in 32-bit case since a nomem error would have occurred
- // earlier...
- // (okay, no need to check anyway, but best to document this overflow
---- plink1.9-1.90~b3w-150903.orig/plink_filter.c
-+++ plink1.9-1.90~b3w-150903/plink_filter.c
-@@ -1695,7 +1695,7 @@ int32_t mind_filter(FILE* bedfile, uintp
- return retval;
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- void freq_hwe_haploid_count_120v(__m128i* vptr, __m128i* vend, __m128i* maskvp, uint32_t* ct_nmp, uint32_t* ct_hmajp) {
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1953,7 +1953,7 @@ static inline void single_marker_freqs_a
- uintptr_t loader;
- uintptr_t loader2;
- uintptr_t loader3;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- unfiltered_sample_ctl2 -= unfiltered_sample_ctl2 % 12;
-@@ -2082,7 +2082,7 @@ static inline void haploid_single_marker
- uintptr_t loader2;
- uintptr_t loader3;
- uintptr_t loader4;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t cur_decr = 120;
- uintptr_t* lptr_12x_end;
- unfiltered_sample_ctl2 -= unfiltered_sample_ctl2 % 12;
---- plink1.9-1.90~b3w-150903.orig/plink_glm.c
-+++ plink1.9-1.90~b3w-150903/plink_glm.c
-@@ -272,7 +272,7 @@ int32_t glm_scan_conditions(char* condit
- uintptr_t condition_ct = 0;
- uintptr_t line_idx = 0;
- int32_t retval = 0;
--#ifdef __LP64__
-+#ifdef __SSE2__
- __m128i* loadbuf_vptr;
- __m128i* loadbuf_mask_vptr;
- __m128i* loadbuf_vend;
-@@ -384,7 +384,7 @@ int32_t glm_scan_conditions(char* condit
- }
- vec_include_init(unfiltered_sample_ct, loadbuf_mask_orig, load_mask);
- memcpy(loadbuf_mask, loadbuf_mask_orig, unfiltered_sample_ctv2 * sizeof(intptr_t));
--#ifdef __LP64__
-+#ifdef __SSE2__
- loadbuf_vend = (__m128i*)(&(loadbuf_raw[unfiltered_sample_ctv2]));
- #else
- loadbuf_end = &(loadbuf_raw[unfiltered_sample_ctl2]);
-@@ -406,7 +406,7 @@ int32_t glm_scan_conditions(char* condit
- haploid_fix(hh_or_mt_exists, sample_raw_include2, sample_raw_male_include2, unfiltered_sample_ct, is_x, is_y, (unsigned char*)loadbuf_raw);
- }
- // clear loadbuf_mask bits where loadbuf is 01.
--#ifdef __LP64__
-+#ifdef __SSE2__
- loadbuf_vptr = (__m128i*)loadbuf_raw;
- loadbuf_mask_vptr = (__m128i*)loadbuf_mask;
- do {
-@@ -845,7 +845,7 @@ uint32_t glm_linear(uintptr_t cur_batch_
- // Lakhani and Eva Guinan.
- // #####
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- // exp_ps is a C port of Shigeo Mitsunari's fast math library posted at
- // http://homepage1.nifty.com/herumi/ . License is
- // http://opensource.org/licenses/BSD-3-Clause .
-@@ -1180,7 +1180,7 @@ static inline __m128 fmath_exp_ps(__m128
- return tt;
- }
-
--// For equivalent "normal" C/C++ code, see the non-__LP64__ versions of these
-+// For equivalent "normal" C/C++ code, see the non-__SSE2__ versions of these
- // functions.
- static inline void logistic_sse(float* vect, uint32_t nn) {
- __m128 zero = _mm_setzero_ps();
-@@ -1521,7 +1521,7 @@ static inline void compute_two_plus_one_
- u16.vf = s3;
- *r3_ptr = u16.f4[0] + u16.f4[1] + u16.f4[2] + u16.f4[3];
- }
--#else // no __LP64__ (and hence, unsafe to assume presence of SSE2)
-+#else // no __SSE2__ (and hence, unsafe to assume presence of SSE2)
- static inline void logistic_sse(float* vect, uint32_t nn) {
- uint32_t uii;
- for (uii = 0; uii < nn; uii++) {
---- plink1.9-1.90~b3w-150903.orig/plink_homozyg.c
-+++ plink1.9-1.90~b3w-150903/plink_homozyg.c
-@@ -85,7 +85,7 @@ void update_end_nonhom(uintptr_t* readbu
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- #define ROH_ENTRY_INTS 7
- #else
- #define ROH_ENTRY_INTS 6
-@@ -210,7 +210,7 @@ void save_confirmed_roh_extend(uint32_t
- *roh_list++ = cidx_len;
- *roh_list++ = cidx_len - cur_roh_het_ct - cur_roh_missing_ct;
- *roh_list++ = cur_roh_het_ct;
--#ifdef __LP64__
-+#ifdef __SSE2__
- *roh_list++ = (uint32_t)sample_last_roh_idx;
- *roh_list++ = (uint32_t)(sample_last_roh_idx >> 32);
- #else
-@@ -283,7 +283,7 @@ uint32_t roh_update(Homozyg_info* hp, ui
- *roh_list_cur++ = cidx_len - cur_het_ct - cur_roh_missing_cts[sample_idx];
- *roh_list_cur++ = cur_het_ct;
- last_roh_idx = sample_to_last_roh[sample_idx];
--#ifdef __LP64__
-+#ifdef __SSE2__
- *roh_list_cur++ = (uint32_t)last_roh_idx;
- *roh_list_cur++ = (uint32_t)(last_roh_idx >> 32);
- #else
-@@ -465,7 +465,7 @@ int32_t write_main_roh_reports(char* out
- cur_roh_ct = 0;
- while (cur_roh_idx != ~ZEROLU) {
- cur_roh = &(roh_list[cur_roh_idx * ROH_ENTRY_INTS]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- prev_roh_idx = ((uintptr_t)cur_roh[5]) | (((uintptr_t)cur_roh[6]) << 32);
- cur_roh[5] = (uint32_t)next_roh_idx;
- cur_roh[6] = (uint32_t)(next_roh_idx >> 32);
-@@ -515,7 +515,7 @@ int32_t write_main_roh_reports(char* out
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
- goto write_main_roh_reports_ret_WRITE_FAIL;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_roh_idx = ((uintptr_t)cur_roh[5]) | (((uintptr_t)cur_roh[6]) << 32);
- #else
- cur_roh_idx = (uintptr_t)cur_roh[5];
-@@ -739,7 +739,7 @@ void extract_pool_info(uint32_t pool_siz
- void initialize_roh_slot(uint32_t* cur_roh, uint32_t chrom_start, uint32_t* marker_uidx_to_cidx, uintptr_t* roh_slot, uint32_t* roh_slot_cidx_start, uint32_t* roh_slot_cidx_end, uint32_t* roh_slot_end_uidx) {
- uint32_t cidx_first = marker_uidx_to_cidx[cur_roh[0] - chrom_start];
- uint32_t cidx_last = marker_uidx_to_cidx[cur_roh[1] - chrom_start];
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t cidx_first_block = cidx_first & (~63);
- uint32_t cidx_last_block = cidx_last & (~63);
- uint32_t cur_bidx = 2;
-@@ -759,7 +759,7 @@ void initialize_roh_slot(uint32_t* cur_r
- *roh_slot_cidx_end = cidx_last + 1;
- *roh_slot_end_uidx = cur_roh[1] + 1;
- uii = cidx_first & (BITCT2 - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cidx_first & 32) {
- roh_slot[0] = FIVEMASK;
- roh_slot[1] = 0x1555555555555555LLU >> (2 * (31 - uii));
-@@ -772,7 +772,7 @@ void initialize_roh_slot(uint32_t* cur_r
- #endif
- fill_ulong_zero(&(roh_slot[cur_bidx]), end_bidx - cur_bidx);
- uii = cidx_last & (BITCT2 - 1);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (cidx_last & 32) {
- // |= instead of = in case first_block and last_block are the same
- roh_slot[end_bidx - 1] |= 0x5555555555555554LLU << (2 * uii);
-@@ -821,7 +821,7 @@ void populate_roh_slots_from_lookahead_b
- read_shift = 2 * (sample_uidx & (BITCT2 - 1));
- slot_idx = (uintptr_t)((*roh_slot_map) & 0xffffffffU);
- cidx_start = roh_slot_cidx_start[slot_idx];
--#ifdef __LP64__
-+#ifdef __SSE2__
- cidx_start_block = cidx_start & (~63);
- #else
- cidx_start_block = cidx_start & (~15);
-@@ -882,7 +882,7 @@ int32_t populate_roh_slots_from_disk(FIL
- roh_write_slot_idx = (uintptr_t)(roh_slot_map[roh_read_slot_idx] & 0xffffffffU);
- cidx_start = roh_slot_cidx_start[roh_write_slot_idx];
- if ((marker_cidx >= cidx_start) && (marker_cidx < roh_slot_cidx_end[roh_write_slot_idx])) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- start_c_bidx = 2 * (cidx_start / 64);
- #else
- start_c_bidx = cidx_start / 16;
-@@ -895,7 +895,7 @@ int32_t populate_roh_slots_from_disk(FIL
- }
-
- static inline uint32_t is_allelic_match(double mismatch_max, uintptr_t* roh_slot_idxl, uintptr_t* roh_slot_idxs, uint32_t block_start_idxl, uint32_t block_start_idxs, uint32_t overlap_cidx_start, uint32_t overlap_cidx_end) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1179,7 +1179,7 @@ void compute_allelic_match_matrix(double
- incr_idxl = 0;
- roh_slot_idxl = &(roh_slots[slot_idxl * roh_slot_wsize]);
- cidx_start_idxl = roh_slot_cidx_start[slot_idxl];
--#ifdef __LP64__
-+#ifdef __SSE2__
- block_start_idxl = cidx_start_idxl & (~63);
- #else
- block_start_idxl = cidx_start_idxl & (~15);
-@@ -1197,7 +1197,7 @@ void compute_allelic_match_matrix(double
- }
- slot_idxs = (uint32_t)(roh_slot_map[map_idxs]);
- cidx_start_idxs = roh_slot_cidx_start[slot_idxs];
--#ifdef __LP64__
-+#ifdef __SSE2__
- block_start_idxs = cidx_start_idxs & (~63);
- #else
- block_start_idxs = cidx_start_idxs & (~15);
-@@ -1244,7 +1244,7 @@ void assign_allelic_match_groups(uint32_
- if (ulii) {
- nsim_nz_ct++;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[pool_idx] = ulii << 32;
- #else
- cur_pool[2 * pool_idx + 1] = ulii;
-@@ -1278,14 +1278,14 @@ void assign_allelic_match_groups(uint32_
- nsim_nz_ct--;
- allelic_match_cts[pool_idx] = 0xffffffffU;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[pool_idx] = (cur_pool[pool_idx] & 0xffffffff00000000LLU) | group_idx;
- #else
- cur_pool[2 * pool_idx] = group_idx;
- #endif
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[max_nsim_pidx] |= 0x80000000LLU | (group_idx++);
- #else
- cur_pool[2 * max_nsim_pidx] = 0x80000000U | (group_idx++);
-@@ -1293,7 +1293,7 @@ void assign_allelic_match_groups(uint32_
- }
- for (pool_idx = 0; pool_idx < pool_size; pool_idx++) {
- if (allelic_match_cts[pool_idx] != 0xffffffffU) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[pool_idx] |= 0x80000000LLU | (group_idx++);
- #else
- cur_pool[2 * pool_idx] = 0x80000000U | (group_idx++);
-@@ -1425,7 +1425,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- uii = chrom_len;
- }
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- // want each roh_slots space to be 16-byte aligned, to enable SSE2
- // max_roh_len = 1 -> 1 vec
- // max_roh_len in {2..65} -> 2 vecs
-@@ -1538,7 +1538,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- // [3P+3]: consensus NSNP
- // [3P+4]: union NSNP
- old_pool_list_size = pool_list_size;
--#ifdef __LP64__
-+#ifdef __SSE2__
- pool_list_size += 2 * pool_size + 3;
- #else
- pool_list_size += 3 * pool_size + 5;
-@@ -1550,7 +1550,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- *cur_pool++ = pool_size_first_plidx[pool_size - pool_size_min];
- pool_size_first_plidx[pool_size - pool_size_min] = old_pool_list_size;
- *cur_pool++ = pool_size;
--#ifndef __LP64__
-+#ifndef __SSE2__
- *cur_pool++ = 0;
- #endif
- uiptr = sample_uidx_sort_buf;
-@@ -1560,14 +1560,14 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- pool_list_idx = roh_slots[slot_idx1]; // actually a ROH idx
- *uiptr++ = roh_list[pool_list_idx * ROH_ENTRY_INTS + 5]; // sample_uidx
- *uiptr++ = (uint32_t)pool_list_idx;
--#ifdef __LP64__
-+#ifdef __SSE2__
- *uiptr++ = (uint32_t)(pool_list_idx >> 32);
- #endif
- }
- // sort in increasing sample_uidx order, for reproducible results
- qsort(sample_uidx_sort_buf, pool_size, 4 + sizeof(intptr_t), intcmp);
- for (uii = 0; uii < pool_size; uii++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- *cur_pool++ = ((uintptr_t)sample_uidx_sort_buf[3 * uii + 1]) | (((uintptr_t)sample_uidx_sort_buf[3 * uii + 2]) << 32);
- #else
- *cur_pool++ = sample_uidx_sort_buf[2 * uii + 1];
-@@ -1616,7 +1616,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- for (pool_size = max_pool_size; pool_size >= pool_size_min; --pool_size) {
- pool_list_idx = pool_size_first_plidx[pool_size - pool_size_min];
- while (pool_list_idx != ~ZEROLU) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- pool_list[pool_list_idx + 1] |= ((uintptr_t)(++uii)) << 32;
- #else
- pool_list[pool_list_idx + 2] = ++uii;
-@@ -1666,7 +1666,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- pool_list_idx = pool_list[pool_list_idx - 1];
- cur_pool = &(pool_list[pool_list_idx]);
- pool_size = (uint32_t)cur_pool[1];
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool = &(cur_pool[2]);
- #else
- cur_pool = &(cur_pool[3]);
-@@ -1861,7 +1861,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
-
- assign_allelic_match_groups(pool_size, allelic_match_cts, allelic_match_matrix, roh_slot_map, &(cur_pool[pool_size]));
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool[2 * pool_size] = (((uintptr_t)(marker_uidx_to_cidx[union_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[union_uidx1 - chrom_start])) << 32) | ((uintptr_t)(marker_uidx_to_cidx[con_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[con_uidx1 - chrom_start]));
- #else
- cur_pool[3 * pool_size] = marker_uidx_to_cidx[con_uidx2 - chrom_start] + 1 - marker_uidx_to_cidx[con_uidx1 - chrom_start];
-@@ -1869,7 +1869,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- #endif
-
- if (is_verbose) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- wptr = uint32_write(&(outname_end[14]), (uint32_t)(cur_pool[-1] >> 32));
- #else
- wptr = uint32_write(&(outname_end[14]), (uint32_t)cur_pool[-1]);
-@@ -1880,7 +1880,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- }
-
- for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- verbose_group_sort_buf[slot_idx1] = ((cur_pool[pool_size + slot_idx1] & 0x7fffffffLLU) << 32) | ((uint64_t)slot_idx1);
- #else
- verbose_group_sort_buf[slot_idx1] = (((uint64_t)(cur_pool[pool_size + 2 * slot_idx1] & 0x7fffffff)) << 32) | ((uint64_t)slot_idx1);
-@@ -2222,7 +2222,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- while (pool_list_idx != ~ZEROLU) {
- cur_pool = &(pool_list[pool_list_idx]);
- pool_list_idx = *cur_pool;
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_pool = &(cur_pool[2]);
- #else
- cur_pool = &(cur_pool[3]);
-@@ -2240,7 +2240,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- // sort pool members primarily by allelic-match group number, then by
- // internal ID
- for (slot_idx1 = 0; slot_idx1 < pool_size; slot_idx1++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- roh_slot_map[slot_idx1] = ((cur_pool[pool_size + slot_idx1] & 0x7fffffffLLU) << 32) | ((uint64_t)slot_idx1);
- #else
- // would like to just sort 32-bit integers, but if there are >32k
-@@ -2294,7 +2294,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- }
- wptr = roh_pool_write_middle(wptr, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, is_new_lengths, marker_uidx1, marker_uidx2);
- wptr = uint32_writew8x(wptr, cur_roh[2], ' ');
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = cur_pool[pool_size + slot_idx2];
- wptr = width_force(4, wptr, uint32_write(wptr, (uint32_t)(ulii >> 32)));
- *wptr++ = ' ';
-@@ -2325,7 +2325,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- wptr = fw_strcpyn(plink_maxfid, 3, "CON", wptr_start);
- marker_uidx1 = con_uidx1;
- marker_uidx2 = con_uidx2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- marker_cidx = (uint32_t)(cur_pool[2 * pool_size]);
- #else
- marker_cidx = cur_pool[3 * pool_size];
-@@ -2334,7 +2334,7 @@ int32_t roh_pool(Homozyg_info* hp, FILE*
- wptr = fw_strcpyn(plink_maxfid, 5, "UNION", wptr_start);
- marker_uidx1 = union_uidx1;
- marker_uidx2 = union_uidx2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- // NSNP
- marker_cidx = (uint32_t)(cur_pool[2 * pool_size] >> 32);
- #else
-@@ -2740,7 +2740,7 @@ int32_t calc_homozyg(Homozyg_info* hp, F
- if (hp->modifier & (HOMOZYG_GROUP | HOMOZYG_GROUP_VERBOSE)) {
- if (max_pool_size < hp->pool_size_min) {
- LOGERRPRINTF("Warning: Skipping --homozyg group%s report since there are no pools.\n", (hp->modifier & HOMOZYG_GROUP_VERBOSE)? "-verbose" : "");
--#ifndef __LP64__
-+#ifndef __SSE2__
- } else if (max_pool_size > 65536) {
- logerrprint("Error: 32-bit " PROG_NAME_STR "'s --homozyg group cannot handle a pool of size >65536.\n");
- goto calc_homozyg_ret_NOMEM;
---- plink1.9-1.90~b3w-150903.orig/plink_ld.c
-+++ plink1.9-1.90~b3w-150903/plink_ld.c
-@@ -73,7 +73,7 @@ void ld_epi_cleanup(Ld_info* ldip, Epi_i
- free_cond(clump_ip->range_fname);
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static inline void ld_dot_prod_batch(__m128i* vec1, __m128i* vec2, __m128i* mask1, __m128i* mask2, int32_t* return_vals, uint32_t iters) {
- // Main routine for computation of \sum_i^M (x_i - \mu_x)(y_i - \mu_y), where
- // x_i, y_i \in \{-1, 0, 1\}, but there are missing values.
-@@ -494,7 +494,7 @@ int32_t ld_dot_prod_nm(uintptr_t* vec1,
- result -= ld_dot_prod_nm_batch(vec1, vec2, last_batch_size);
- return result;
- }
--#endif // __LP64__
-+#endif // __SSE2__
-
- uint32_t ld_process_load(uintptr_t* geno_buf, uintptr_t* mask_buf, uintptr_t* missing_buf, uint32_t* missing_ct_ptr, double* sum_ptr, double* variance_recip_ptr, uint32_t founder_ct, uint32_t is_x, uint32_t weighted_x, uint32_t nonmale_founder_ct, uintptr_t* founder_male_include2, uintptr_t* nonmale_geno, uintptr_t* nonmale_masks, uintptr_t nonmale_offset) {
- uintptr_t* geno_ptr = geno_buf;
-@@ -732,14 +732,14 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
- uintptr_t unfiltered_sample_ctl2 = 2 * ((unfiltered_sample_ct + (BITCT - 1)) / BITCT);
- uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl2 / 2);
- uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t founder_ctv = 2 * ((founder_ct + 127) / 128);
- #else
- uintptr_t founder_ctv = founder_ctl;
- #endif
- uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
- #else
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
-@@ -862,7 +862,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
- if (pairwise) {
- prune_ld_thresh = ld_last_param * (1 + SMALL_EPSILON);
- } else {
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (window_max > 46340) {
- // todo: check what LAPACK's matrix inversion limit actually is. Guess
- // sqrt(2^31 - 1) for now.
-@@ -1290,7 +1290,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* be
- ld_prune_ret_INVALID_FORMAT:
- retval = RET_INVALID_FORMAT;
- break;
--#ifdef __LP64__
-+#ifdef __SSE2__
- ld_prune_ret_INVALID_CMDLINE:
- retval = RET_INVALID_CMDLINE;
- break;
-@@ -1338,7 +1338,7 @@ uint32_t ld_missing_ct_intersect(uintptr
- // variant of popcount_longs_intersect()
- uintptr_t tot = 0;
- uintptr_t* lptr1_end2;
--#ifdef __LP64__
-+#ifdef __SSE2__
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
-@@ -1538,7 +1538,7 @@ int32_t flipscan(Ld_info* ldip, FILE* be
- pheno_ctl[is_case] = (pheno_ct[is_case] + (BITCT - 1)) / BITCT;
- ulii = (pheno_ct[is_case] + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- pheno_ct_mld_m1[is_case] = ulii - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 192;
- #else
- pheno_ct_mld_rem[is_case] = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - pheno_ct[is_case]) / 48;
-@@ -2295,7 +2295,7 @@ int32_t ld_report_matrix(pthread_t* thre
- ulptr = (uintptr_t*)tbuf;
- // assume little-endian
- // 0[delim]0[delim]...
--#ifdef __LP64__
-+#ifdef __SSE2__
- ulii = 0x30003000300030LLU | (0x100010001000100LLU * ((unsigned char)g_ld_delimiter));
- #else
- ulii = 0x300030 | (0x1000100 * ((unsigned char)g_ld_delimiter));
-@@ -2816,7 +2816,7 @@ uint32_t load_and_split3(FILE* bedfile,
- }
- }
-
--#ifdef __LP64__
-+#ifdef __SSE2__
- static void two_locus_3x3_tablev(__m128i* vec1, __m128i* vec2, uint32_t* counts_3x3, uint32_t sample_ctv6, uint32_t iter_ct) {
- const __m128i m1 = {FIVEMASK, FIVEMASK};
- const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
-@@ -3039,7 +3039,7 @@ static inline void two_locus_3x3_zmiss_t
- #endif
-
- static void two_locus_count_table_zmiss1(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_uint_zero(counts_3x3, 6);
- if (is_zmiss2) {
- two_locus_3x3_zmiss_tablev((__m128i*)lptr1, (__m128i*)lptr2, counts_3x3, sample_ctv3 / 2);
-@@ -3060,7 +3060,7 @@ static void two_locus_count_table_zmiss1
- }
-
- static void two_locus_count_table(uintptr_t* lptr1, uintptr_t* lptr2, uint32_t* counts_3x3, uint32_t sample_ctv3, uint32_t is_zmiss2) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t uii;
- fill_uint_zero(counts_3x3, 9);
- if (!is_zmiss2) {
-@@ -6052,7 +6052,7 @@ int32_t ld_report(pthread_t* threads, Ld
- uintptr_t* founder_male_include2 = NULL;
- uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
- #else
- uint32_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
-@@ -6312,7 +6312,7 @@ int32_t show_tags(Ld_info* ldip, FILE* b
- }
- founder_ct_mld_m1 = (founder_ct - 1) / MULTIPLEX_LD;
- ulii = founder_ct_mld_m1 + 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (ulii * MULTIPLEX_LD - founder_ct) / 192;
- #else
- founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (ulii * MULTIPLEX_LD - founder_ct) / 48;
-@@ -7105,7 +7105,7 @@ int32_t haploview_blocks(Ld_info* ldip,
- if (max_block_size < 2) {
- continue;
- }
--#ifndef __LP64__
-+#ifndef __SSE2__
- if (max_block_size > 65536) {
- logprint("\n");
- logerrprint("Error: 32-bit --blocks cannot analyze potential blocks with more than 65536\nvariants. Use a 64-bit PLINK build or a smaller --blocks-window-kb value.\n");
-@@ -7460,7 +7460,7 @@ int32_t haploview_blocks(Ld_info* ldip,
- haploview_blocks_ret_WRITE_FAIL:
- retval = RET_WRITE_FAIL;
- break;
--#ifndef __LP64__
-+#ifndef __SSE2__
- haploview_blocks_ret_INVALID_CMDLINE:
- retval = RET_INVALID_CMDLINE;
- break;
-@@ -11619,7 +11619,7 @@ int32_t construct_ld_map(pthread_t* thre
- uintptr_t founder_ctv2 = founder_ctl * 2;
- uintptr_t founder_ct_mld = (founder_ct + MULTIPLEX_LD - 1) / MULTIPLEX_LD;
- uint32_t founder_ct_mld_m1 = ((uint32_t)founder_ct_mld) - 1;
--#ifdef __LP64__
-+#ifdef __SSE2__
- uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 192) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 192;
- #else
- uintptr_t founder_ct_mld_rem = (MULTIPLEX_LD / 48) - (founder_ct_mld * MULTIPLEX_LD - founder_ct) / 48;
-@@ -11823,7 +11823,7 @@ int32_t construct_ld_map(pthread_t* thre
- // don't need to load the first intersecting member or anything
- // before it, since we're only traversing the upper right triangle
- wlen += firstw;
--#ifdef __LP64__
-+#ifdef __SSE2__
- firstw = 2 * (uii / 128);
- #else
- firstw = uii / 32;
-@@ -13483,7 +13483,7 @@ int32_t clump_reports(FILE* bedfile, uin
- *bufptr++ = ' ';
- bufptr = uint32_writew10x(bufptr, cur_bp, ' ');
- bufptr = double_g_writewx3x(bufptr, pval, 10, ' ');
--#ifdef __LP64__
-+#ifdef __SSE2__
- // may as well be paranoid
- bufptr = width_force(8, bufptr, int64_write(bufptr, (int64_t)(histo[0] + histo[1] + histo[2] + histo[3] + histo[4])));
- *bufptr++ = ' ';
---- plink1.9-1.90~b3w-150903.orig/plink_matrix.h
-+++ plink1.9-1.90~b3w-150903/plink_matrix.h
-@@ -55,7 +55,7 @@ extern "C" {
-
- #else // not _WIN32
- #include <cblas.h>
--#ifdef __LP64__
-+#ifdef __SSE2__
- typedef int32_t __CLPK_integer;
- #else
- typedef long int __CLPK_integer;
---- plink1.9-1.90~b3w-150903.orig/plink_misc.c
-+++ plink1.9-1.90~b3w-150903/plink_misc.c
-@@ -5687,7 +5687,7 @@ int32_t meta_analysis(char* input_fnames
- if (!final_variant_ct) {
- logerrprint("Error: No --meta-analysis variants.\n");
- goto meta_analysis_ret_INVALID_CMDLINE;
--#ifdef __LP64__
-+#ifdef __SSE2__
- } else if (final_variant_ct > 0x7fffffff) {
- logerrprint("Error: Too many distinct --meta-analysis variants (max 2^31 - 1).\n");
- #endif
-@@ -5818,7 +5818,7 @@ int32_t meta_analysis(char* input_fnames
- memcpy(&cur_file_ct_m1, bufptr2, file_ct_byte_width);
- cur_data_slots = 0;
- if (report_study_specific) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- cur_data_slots += file_ct64;
- #else
- cur_data_slots += 2 * file_ct64;
---- plink1.9-1.90~b3w-150903.orig/plink_set.c
-+++ plink1.9-1.90~b3w-150903/plink_set.c
-@@ -1480,7 +1480,7 @@ int32_t define_sets(Set_info* sip, uintp
- if (retval) {
- goto define_sets_ret_NOMEM2;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 1) & (~1));
- #else
- fill_ulong_zero(marker_bitfield_tmp, (marker_ctp2l + 3) & (~3));
-@@ -2508,7 +2508,7 @@ int32_t annotate(Annot_info* aip, char*
- while (1) {
- ll_ptr = *ll_pptr;
- if (!ll_ptr) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- // we'll run out of memory way earlier in 32-bit mode
- if (attr_id_ct == 0x80000000LLU) {
- sprintf(logbuf, "Error: Too many unique attributes in %s (max 2147483648).\n", aip->attrib_fname);
-@@ -2647,7 +2647,7 @@ int32_t annotate(Annot_info* aip, char*
- if (retval) {
- goto annotate_ret_1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (range_ct > 0x80000000LLU) {
- sprintf(logbuf, "Error: Too many annotations in %s (max 2147483648, counting multi-chromosome annotations once per spanned chromosome).\n", aip->ranges_fname);
- goto annotate_ret_INVALID_FORMAT_WW;
-@@ -2743,7 +2743,7 @@ int32_t annotate(Annot_info* aip, char*
- } else {
- unique_annot_ct = attr_id_ct;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- unique_annot_ctlw = (unique_annot_ct + 3) / 4;
- #else
- unique_annot_ctlw = (unique_annot_ct + 1) / 2;
-@@ -2759,7 +2759,7 @@ int32_t annotate(Annot_info* aip, char*
- ulptr = (uintptr_t*)writebuf;
- for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
- // fill with repeated " 0"
--#ifdef __LP64__
-+#ifdef __SSE2__
- *ulptr++ = 0x3020302030203020LLU;
- #else
- *ulptr++ = 0x30203020;
-@@ -3100,7 +3100,7 @@ int32_t annotate(Annot_info* aip, char*
- // reinitialize
- ulptr = (uintptr_t*)writebuf;
- for (ulii = 0; ulii < unique_annot_ctlw; ulii++) {
--#ifdef __LP64__
-+#ifdef __SSE2__
- *ulptr++ = 0x3020302030203020LLU;
- #else
- *ulptr++ = 0x30203020;
-@@ -3285,7 +3285,7 @@ int32_t gene_report(char* fname, char* g
- if (retval) {
- goto gene_report_ret_1;
- }
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (gene_ct > 0x80000000LLU) {
- sprintf(logbuf, "Error: Too many genes in %s (max 2147483648).\n", glist);
- goto gene_report_ret_INVALID_FORMAT_WW;
-@@ -3495,7 +3495,7 @@ int32_t gene_report(char* fname, char* g
- ((uint32_t*)linebuf_top)[1] = cur_bp;
- linebuf_left -= slen + 8;
- linebuf_top = &(linebuf_top[slen + 8]);
--#ifdef __LP64__
-+#ifdef __SSE2__
- if (saved_line_ct == 0x100000000LLU) {
- sprintf(logbuf, "Error: Too many valid lines in %s (--gene-report can only handle 4294967296).\n", fname);
- goto gene_report_ret_INVALID_FORMAT_WW;
diff --git a/debian/patches/series b/debian/patches/series
index 0f5610a..c1895f8 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,3 +1,2 @@
01_Fix_use_internal_lib.patch
02_Activate_Stable_Build.patch
-03_replace_LP64_by_SSE2.patch
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git
More information about the debian-med-commit
mailing list