[med-svn] [plink1.9] 01/02: Imported Upstream version 1.90~b3l-150418
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Mon Apr 27 03:25:07 UTC 2015
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository plink1.9.
commit 71a56b4eed006fdeec03b1aaa664070b112037f8
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Mon Apr 27 05:21:36 2015 +0200
Imported Upstream version 1.90~b3l-150418
---
Makefile | 2 +-
bgzf.c | 1123 +++++++++++++++++++++++++++++++++++++++++++++++++
bgzf.h | 313 ++++++++++++++
hfile.c | 578 ++++++++++++++++++++++++++
hfile.h | 212 ++++++++++
hfile_internal.h | 76 ++++
hts.h | 456 ++++++++++++++++++++
hts_defs.h | 55 +++
khash.h | 619 +++++++++++++++++++++++++++
pigz.c | 463 +++++++++++++++++----
pigz.h | 148 ++++++-
plink.c | 525 +++++++++++++++--------
plink_assoc.c | 8 +-
plink_calc.c | 993 +++++++++++++++++++++++---------------------
plink_calc.h | 4 +-
plink_cluster.c | 7 +-
plink_common.c | 153 +++----
plink_common.h | 149 ++++---
plink_data.c | 376 ++++++++++++-----
plink_dosage.c | 330 +++++----------
plink_family.c | 1216 +++++++++++++++++++++++++++++++++++++++++++++++++-----
plink_family.h | 26 +-
plink_filter.c | 233 +++++++----
plink_filter.h | 8 +-
plink_glm.c | 45 +-
plink_help.c | 78 ++--
plink_ld.c | 43 +-
plink_misc.c | 326 +++++++--------
plink_misc.h | 10 +-
plink_set.c | 1 +
yarn.h | 5 +
31 files changed, 6969 insertions(+), 1612 deletions(-)
diff --git a/Makefile b/Makefile
index 5c8ac77..c7ef5ee 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ ifdef NO_LAPACK
BLASFLAGS=
endif
-SRC = plink.c plink_assoc.c plink_calc.c plink_cluster.c plink_cnv.c plink_common.c plink_data.c plink_dosage.c plink_family.c plink_filter.c plink_glm.c plink_help.c plink_homozyg.c plink_lasso.c plink_ld.c plink_matrix.c plink_misc.c plink_rserve.c plink_set.c plink_stats.c SFMT.c dcdflib.c pigz.c yarn.c Rconnection.cc
+SRC = plink.c plink_assoc.c plink_calc.c plink_cluster.c plink_cnv.c plink_common.c plink_data.c plink_dosage.c plink_family.c plink_filter.c plink_glm.c plink_help.c plink_homozyg.c plink_lasso.c plink_ld.c plink_matrix.c plink_misc.c plink_rserve.c plink_set.c plink_stats.c SFMT.c dcdflib.c pigz.c yarn.c Rconnection.cc hfile.c bgzf.c
# In the event that you are still concurrently using PLINK 1.07, we suggest
# renaming that binary to "plink107" and "plink1". (Previously,
diff --git a/bgzf.c b/bgzf.c
new file mode 100644
index 0000000..a6c9c9b
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,1123 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+ Copyright (C) 2009, 2013, 2014 Genome Research Ltd
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <errno.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/types.h>
+
+#include "hts.h"
+#include "bgzf.h"
+#include "hfile.h"
+
+#define BGZF_CACHE
+
+#ifndef _WIN32
+ #define BGZF_MT
+#endif
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+
+/* BGZF/GZIP header (specialized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ BGZF extension:
+ ^ ^ ^ ^
+ | | | |
+ FLG.EXTRA XLEN B C
+
+ BGZF format is compatible with GZIP. It limits the size of each compressed
+ block to 2^16 bytes and adds an extra "BC" field in the gzip header which
+ records the size.
+
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
+
+#ifdef BGZF_CACHE
+typedef struct { // one entry of the read-side block cache (an already inflated block)
+ int size; // length of the inflated data (copied from fp->block_length when cached)
+ uint8_t *block; // heap buffer of BGZF_MAX_BLOCK_SIZE bytes holding a copy of the inflated block
+ int64_t end_offset; // file offset to seek to after a cache hit: block address + compressed bytes consumed
+} cache_t;
+#include "khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t)
+#endif
+
+typedef struct // one index record: where a block starts in both coordinate systems
+{
+ uint64_t uaddr; // start of the block as an offset into the uncompressed data
+ uint64_t caddr; // start of the block as an offset into the compressed file
+}
+bgzidx1_t;
+
+struct __bgzidx_t // block index mapping uncompressed offsets to compressed block addresses
+{
+ int noffs, moffs; // the size of the index, n:used, m:allocated
+ bgzidx1_t *offs; // offsets (array of moffs records, the first noffs are valid)
+ uint64_t ublock_addr; // offset of the current block (uncompressed data); advanced as blocks are indexed
+};
+
+void bgzf_index_destroy(BGZF *fp);
+int bgzf_index_add_block(BGZF *fp);
+
+// Store a 16-bit value into buffer[0..1], least-significant byte first.
+static inline void packInt16(uint8_t *buffer, uint16_t value)
+{
+    uint8_t *out = buffer;
+    *out++ = (uint8_t)value;
+    *out = (uint8_t)(value >> 8);
+}
+
+// Read an unsigned 16-bit little-endian value from buffer[0..1].
+static inline int unpackInt16(const uint8_t *buffer)
+{
+    const int low = buffer[0];
+    const int high = buffer[1] << 8;
+    return low | high;
+}
+
+// Store a 32-bit value into buffer[0..3], least-significant byte first.
+static inline void packInt32(uint8_t *buffer, uint32_t value)
+{
+    for (int i = 0; i < 4; ++i)
+        buffer[i] = (uint8_t)(value >> (8 * i));
+}
+
+// Allocate and initialise a BGZF handle for reading from hfpr.
+// The first 18 bytes are peeked via hpeek() to decide whether the input looks
+// gzip-compressed and, if so, whether it carries the BGZF "BC" extra field.
+// Returns the new handle (fp->fp is left for the caller to set), or NULL if
+// peeking or any allocation fails.
+static BGZF *bgzf_read_init(hFILE *hfpr)
+{
+    BGZF *fp;
+    uint8_t magic[18];
+    ssize_t n = hpeek(hfpr, magic, 18);
+    if (n < 0) return NULL;
+
+    fp = (BGZF*)calloc(1, sizeof(BGZF));
+    if (fp == NULL) return NULL;
+
+    fp->is_write = 0;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (fp->uncompressed_block == NULL || fp->compressed_block == NULL) goto fail;
+    // Only trust magic[] when all 18 bytes were available; shorter inputs are
+    // treated as uncompressed, so magic[3] and magic[12..15] are never read
+    // uninitialised below.
+    fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b) ? 1 : 0;
+    fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;
+#ifdef BGZF_CACHE
+    fp->cache = kh_init(cache);
+    if (fp->cache == NULL) goto fail;
+#endif
+    return fp;
+
+fail:
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free(fp);
+    return NULL;
+}
+
+// get the compress level from the mode string: compress_level==-1 for the default level, -2 plain uncompressed
+// Derive the compression level from an fopen-style mode string.
+// Returns the first decimal digit found in mode (0-9), -1 when there is no
+// digit (default level), or -2 when the mode contains 'u' (plain uncompressed),
+// which takes precedence over any digit.
+static int mode2level(const char *__restrict mode)
+{
+    int level = -1;
+    const char *p = mode;
+    while (*p && (*p < '0' || *p > '9')) ++p; // stop at the first digit, if any
+    if (*p) level = (int)*p - '0';
+    if (strchr(mode, 'u')) level = -2;
+    return level;
+}
+// Allocate and initialise a BGZF handle for writing, configured from mode:
+// 'u' selects plain uncompressed output, a digit selects the compression
+// level, and 'g' selects a single gzip stream instead of BGZF blocks.
+// Returns the new handle (fp->fp is left for the caller to set), or NULL if an
+// allocation or deflateInit2() fails; nothing is leaked on failure.
+static BGZF *bgzf_write_init(const char *mode)
+{
+    BGZF *fp = (BGZF*)calloc(1, sizeof(BGZF));
+    if (fp == NULL) return NULL;
+    fp->is_write = 1;
+    int compress_level = mode2level(mode);
+    if ( compress_level==-2 )
+    {
+        fp->is_compressed = 0;
+        return fp;
+    }
+    fp->is_compressed = 1;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (fp->uncompressed_block == NULL || fp->compressed_block == NULL) goto fail;
+    fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+    if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
+    if ( strchr(mode,'g') )
+    {
+        // gzip output
+        fp->is_gzip = 1;
+        fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream));
+        if (fp->gz_stream == NULL) goto fail;
+        fp->gz_stream->zalloc = NULL;
+        fp->gz_stream->zfree = NULL;
+        if ( deflateInit2(fp->gz_stream, fp->compress_level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) goto fail;
+    }
+    return fp;
+
+fail:
+    free(fp->gz_stream);
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free(fp);
+    return NULL;
+}
+
+// Open the file at path for BGZF reading ('r') or writing ('w'/'a').
+// Returns a new handle, or NULL (0) if the file cannot be opened, the handle
+// cannot be initialised, or mode names no direction (errno = EINVAL).
+BGZF *bgzf_open(const char *path, const char *mode)
+{
+    BGZF *fp = 0;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        hFILE *fpr;
+        if ((fpr = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_read_init(fpr);
+        if (fp == 0) { hclose_abruptly(fpr); return NULL; }
+        fp->fp = fpr;
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        hFILE *fpw;
+        if ((fpw = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_write_init(mode);
+        // bgzf_write_init() can fail; do not dereference NULL or leak fpw
+        if (fp == 0) { hclose_abruptly(fpw); return NULL; }
+        fp->fp = fpw;
+    }
+    else { errno = EINVAL; return 0; }
+
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+// Same as bgzf_open(), but wraps an already open file descriptor.
+// Returns a new handle, or NULL (0) on failure; errno is EINVAL when mode
+// names no direction.
+BGZF *bgzf_dopen(int fd, const char *mode)
+{
+    BGZF *fp = 0;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        hFILE *fpr;
+        if ((fpr = hdopen(fd, mode)) == 0) return 0;
+        fp = bgzf_read_init(fpr);
+        if (fp == 0) { hclose_abruptly(fpr); return NULL; } // FIXME this closes fd
+        fp->fp = fpr;
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        hFILE *fpw;
+        if ((fpw = hdopen(fd, mode)) == 0) return 0;
+        fp = bgzf_write_init(mode);
+        // bgzf_write_init() can fail; do not dereference NULL or leak fpw
+        if (fp == 0) { hclose_abruptly(fpw); return NULL; } // FIXME this closes fd
+        fp->fp = fpw;
+    }
+    else { errno = EINVAL; return 0; }
+
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+// Wrap an existing hFILE stream in a BGZF handle for reading or writing.
+// hfp is only stored in the returned handle; it is not closed on failure.
+// Returns NULL if the handle cannot be initialised or mode names no
+// direction (errno = EINVAL in the latter case).
+BGZF *bgzf_hopen(hFILE *hfp, const char *mode)
+{
+    BGZF *fp = NULL;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        fp = bgzf_read_init(hfp);
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        fp = bgzf_write_init(mode);
+    }
+    else { errno = EINVAL; return 0; }
+    // both initialisers can fail; never dereference a NULL handle below
+    if (fp == NULL) return NULL;
+
+    fp->fp = hfp;
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+// Compress slen bytes from src into one complete BGZF block at _dst.
+// On entry *dlen is the capacity of _dst; on success it is replaced by the
+// total block size (header + deflate data + footer).
+// Returns 0 on success, -1 if the output buffer is too small for a header and
+// footer or if zlib reports an error.
+static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
+{
+    uint32_t crc;
+    z_stream zs;
+    uint8_t *dst = (uint8_t*)_dst;
+
+    // avail_out is unsigned: a too-small buffer would wrap to a huge capacity
+    if (*dlen < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) return -1;
+
+    // compress the body
+    zs.zalloc = NULL; zs.zfree = NULL; zs.opaque = NULL; // zlib requires all three to be set before init
+    zs.next_in = (Bytef*)src;
+    zs.avail_in = slen;
+    zs.next_out = dst + BLOCK_HEADER_LENGTH;
+    zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+    if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
+    if (deflate(&zs, Z_FINISH) != Z_STREAM_END) {
+        (void)deflateEnd(&zs); // release the stream state instead of leaking it
+        return -1;
+    }
+    if (deflateEnd(&zs) != Z_OK) return -1;
+    *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+    // write the header
+    memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+    packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
+    // write the footer
+    crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen);
+    packInt32((uint8_t*)&dst[*dlen - 8], crc);
+    packInt32((uint8_t*)&dst[*dlen - 4], slen);
+    return 0;
+}
+
+// Feed slen bytes from src through the handle's long-lived gzip deflate
+// stream, writing whatever output is produced to _dst.
+// On entry *dlen is the capacity of _dst; on return it is the number of bytes
+// produced. An empty input (slen == 0) finishes the stream. The level
+// argument is not used here.
+// Returns 0 on success, -1 if deflate() reports Z_STREAM_ERROR.
+static int bgzf_gzip_compress(BGZF *fp, void *_dst, int *dlen, void *src, int slen, int level)
+{
+    z_stream *strm = fp->gz_stream;
+    strm->next_in = (Bytef*)src;
+    strm->avail_in = slen;
+    strm->next_out = (uint8_t*)_dst;
+    strm->avail_out = *dlen;
+    if ( deflate(strm, slen ? Z_NO_FLUSH : Z_FINISH) == Z_STREAM_ERROR ) return -1;
+    *dlen -= strm->avail_out;
+    return 0;
+}
+
+// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
+// Compress the first block_length bytes of fp->uncompressed_block into
+// fp->compressed_block, as a BGZF block or as gzip stream output depending on
+// fp->is_gzip. On success fp->block_offset is reset to 0.
+// Returns the number of compressed bytes, or -1 with BGZF_ERR_ZLIB set.
+static int deflate_block(BGZF *fp, int block_length)
+{
+    int out_len = BGZF_MAX_BLOCK_SIZE;
+    int status = fp->is_gzip
+        ? bgzf_gzip_compress(fp, fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level)
+        : bgzf_compress(fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level);
+
+    if ( status == 0 )
+    {
+        fp->block_offset = 0;
+        return out_len;
+    }
+    fp->errcode |= BGZF_ERR_ZLIB;
+    return -1;
+}
+
+// Inflate the block in fp->compressed_block into fp->uncompressed_block
+// Inflate the BGZF block held in fp->compressed_block into
+// fp->uncompressed_block. block_length is the full size of the compressed
+// block, header included.
+// Returns the number of uncompressed bytes, or -1 with fp->errcode updated.
+static int inflate_block(BGZF* fp, int block_length)
+{
+    z_stream zs;
+    if (block_length < BLOCK_HEADER_LENGTH) { // avail_in is unsigned and would wrap
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.opaque = NULL; // zlib requires zalloc, zfree and opaque to be set before init
+    zs.next_in = (Bytef*)fp->compressed_block + BLOCK_HEADER_LENGTH;
+    // Everything after the header. The previous "block_length - 16" let zlib
+    // look 2 bytes past the end of the block when the deflate data was
+    // truncated or malformed.
+    zs.avail_in = block_length - BLOCK_HEADER_LENGTH;
+    zs.next_out = (Bytef*)fp->uncompressed_block;
+    zs.avail_out = BGZF_MAX_BLOCK_SIZE;
+
+    if (inflateInit2(&zs, -15) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
+        inflateEnd(&zs);
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflateEnd(&zs) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    return zs.total_out;
+}
+
+// Inflate more data from a plain gzip stream into fp->uncompressed_block,
+// starting at fp->block_offset. When cached is non-zero the input already
+// staged in fp->gz_stream is consumed before anything is read from the file.
+// Returns the value of BGZF_MAX_BLOCK_SIZE - avail_out once inflate() has
+// produced output or the stream ends, 0 when the file has no more input, and
+// -1 on a read or zlib error.
+static int inflate_gzip_block(BGZF *fp, int cached)
+{
+    int ret = Z_OK;
+    do
+    {
+        if ( !cached && fp->gz_stream->avail_out!=0 )
+        {
+            // Keep the signed hread() result: storing it straight into the
+            // unsigned avail_in turned a read error into a huge input length.
+            ssize_t nread = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE);
+            if ( nread<=0 )
+            {
+                fp->gz_stream->avail_in = 0;
+                return nread<0 ? -1 : 0;
+            }
+            fp->gz_stream->avail_in = nread;
+            fp->gz_stream->next_in = (Bytef*)(fp->compressed_block);
+        }
+        else cached = 0;
+        do
+        {
+            fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset;
+            fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset;
+            ret = inflate(fp->gz_stream, Z_NO_FLUSH);
+            if ( ret==Z_BUF_ERROR ) continue; // non-critical error
+            if ( ret<0 ) return -1;
+            unsigned int have = BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
+            if ( have ) return have;
+        }
+        while ( fp->gz_stream->avail_out == 0 );
+    }
+    while (ret != Z_STREAM_END);
+    return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
+}
+
+// Returns: 0 on success (BGZF header); -1 on non-BGZF GZIP header; -2 on error
+// Classify an 18-byte block header.
+// Returns: 0 on success (BGZF header); -1 on non-BGZF GZIP header; -2 on error
+static int check_header(const uint8_t *header)
+{
+    // gzip magic bytes and the deflate compression method
+    if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2;
+    // BGZF needs FLG.FEXTRA, XLEN == 6 and a "BC" subfield of length 2
+    if ( (header[3] & 4) == 0 ) return -1;
+    if ( unpackInt16((uint8_t*)&header[10]) != 6 ) return -1;
+    if ( header[12] != 'B' || header[13] != 'C' ) return -1;
+    if ( unpackInt16((uint8_t*)&header[14]) != 2 ) return -1;
+    return 0;
+}
+
+#ifdef BGZF_CACHE
+// Free every cached block and the cache table itself.
+// Does nothing for handles opened for writing.
+static void free_cache(BGZF *fp)
+{
+    if (fp->is_write) return;
+    khash_t(cache) *tbl = (khash_t(cache)*)fp->cache;
+    khint_t it = kh_begin(tbl);
+    while (it < kh_end(tbl)) {
+        if (kh_exist(tbl, it)) free(kh_val(tbl, it).block);
+        ++it;
+    }
+    kh_destroy(cache, tbl);
+}
+
+// Try to satisfy a block read from the cache.
+// On a hit the cached data is copied into fp->uncompressed_block, the block
+// bookkeeping is updated and the file is positioned just past the block; the
+// cached block's size is returned. Returns 0 on a miss. A failed seek after
+// a hit terminates the process.
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{
+    khash_t(cache) *tbl = (khash_t(cache)*)fp->cache;
+    khint_t slot = kh_get(cache, tbl, block_address);
+    if (slot == kh_end(tbl)) return 0;
+
+    cache_t *hit = &kh_val(tbl, slot);
+    if (fp->block_length != 0) fp->block_offset = 0;
+    fp->block_address = block_address;
+    fp->block_length = hit->size;
+    memcpy(fp->uncompressed_block, hit->block, BGZF_MAX_BLOCK_SIZE);
+    if ( hseek(fp->fp, hit->end_offset, SEEK_SET) < 0 )
+    {
+        // todo: move the error up
+        fprintf(stderr, "Could not hseek to %" PRId64 "\n", hit->end_offset);
+        exit(1);
+    }
+    return hit->size;
+}
+
+// Store a copy of the current uncompressed block in the cache, keyed by its
+// compressed file address. size is the number of compressed bytes the block
+// occupied in the file. When the cache is at its size limit one existing
+// entry is evicted first. Caching is best-effort: if memory cannot be
+// obtained the block is simply not cached.
+static void cache_block(BGZF *fp, int size)
+{
+    int ret;
+    khint_t k;
+    cache_t *p;
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
+    if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
+        /* A better way would be to remove the oldest block in the
+         * cache, but here we remove a random one for simplicity. This
+         * should not have a big impact on performance. */
+        for (k = kh_begin(h); k < kh_end(h); ++k)
+            if (kh_exist(h, k)) break;
+        if (k < kh_end(h)) {
+            free(kh_val(h, k).block);
+            kh_del(cache, h, k);
+        }
+    }
+    k = kh_put(cache, h, fp->block_address, &ret);
+    // ret == 0: key already present (a bug if it happens); ret < 0: the
+    // insertion failed and k does not name a usable slot
+    if (ret <= 0) return;
+    uint8_t *copy = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
+    if (copy == NULL) {
+        kh_del(cache, h, k); // do not leave an entry without data behind
+        return;
+    }
+    memcpy(copy, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+    p = &kh_val(h, k);
+    p->size = fp->block_length;
+    p->end_offset = fp->block_address + size;
+    p->block = copy;
+}
+#else
+static void free_cache(BGZF *fp) {} // no-op stand-in used when BGZF_CACHE is not defined
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} // stand-in: always returns 0, i.e. a cache miss
+static void cache_block(BGZF *fp, int size) {} // no-op stand-in: nothing is ever cached
+#endif
+
+// Load the next block of input into fp->uncompressed_block.
+// Three kinds of input are handled: plain uncompressed data, a generic gzip
+// stream (inflated incrementally through fp->gz_stream) and BGZF blocks
+// (optionally served from, and added to, the block cache).
+// Returns 0 on success, with fp->block_length == 0 when no more data could be
+// read, and -1 on error. fp->errcode is updated for I/O, header and zlib
+// errors; there is no dedicated code for an index allocation failure.
+int bgzf_read_block(BGZF *fp)
+{
+    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+    int count, size = 0, block_length, remaining;
+
+    // Reading an uncompressed file
+    if ( !fp->is_compressed )
+    {
+        count = hread(fp->fp, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+        if ( count<0 ) // read error: do not treat it as a (negative) block length
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        if ( count==0 )
+        {
+            fp->block_length = 0;
+            return 0;
+        }
+        if (fp->block_length != 0) fp->block_offset = 0;
+        fp->block_address += count;
+        fp->block_length = count;
+        return 0;
+    }
+
+    // Reading compressed file
+    int64_t block_address;
+    block_address = htell(fp->fp);
+    if ( fp->is_gzip && fp->gz_stream ) // is this an initialized gzip stream?
+    {
+        count = inflate_gzip_block(fp, 0);
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        return 0;
+    }
+    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
+    count = hread(fp->fp, header, sizeof(header));
+    if (count == 0) { // no data read
+        fp->block_length = 0;
+        return 0;
+    }
+    int ret;
+    if ( count != sizeof(header) || (ret=check_header(header))==-2 )
+    {
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    if ( ret==-1 )
+    {
+        // GZIP, not BGZF
+        uint8_t *cblock = (uint8_t*)fp->compressed_block;
+        memcpy(cblock, header, sizeof(header));
+        ssize_t nread = hread(fp->fp, cblock+sizeof(header), BGZF_BLOCK_SIZE - sizeof(header));
+        if ( nread<0 )
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        count = (int)nread + (int)sizeof(header); // bytes of cblock that hold real data
+        int nskip = 10;
+
+        // Check optional fields to skip: FLG.FNAME,FLG.FCOMMENT,FLG.FHCRC,FLG.FEXTRA
+        // Note: Some of these fields are untested, I did not have appropriate data available
+        // Every scan is bounded by count so that bytes which were never read are not inspected.
+        if ( header[3] & 0x4 ) // FLG.FEXTRA
+        {
+            nskip += unpackInt16(&cblock[nskip]) + 2;
+        }
+        if ( header[3] & 0x8 ) // FLG.FNAME
+        {
+            while ( nskip<count && cblock[nskip] ) nskip++;
+            if ( nskip>=count )
+            {
+                fp->errcode |= BGZF_ERR_HEADER;
+                return -1;
+            }
+            nskip++;
+        }
+        if ( header[3] & 0x10 ) // FLG.FCOMMENT
+        {
+            while ( nskip<count && cblock[nskip] ) nskip++;
+            if ( nskip>=count )
+            {
+                fp->errcode |= BGZF_ERR_HEADER;
+                return -1;
+            }
+            nskip++;
+        }
+        if ( header[3] & 0x2 ) nskip += 2; // FLG.FHCRC
+        if ( nskip>count ) // header extends past the data that was read
+        {
+            fp->errcode |= BGZF_ERR_HEADER;
+            return -1;
+        }
+
+        fp->is_gzip = 1;
+        fp->gz_stream = (z_stream*) calloc(1,sizeof(z_stream));
+        if ( fp->gz_stream==NULL || inflateInit2(fp->gz_stream, -15)!=Z_OK )
+        {
+            // never leave a half-initialised stream behind for the next call
+            free(fp->gz_stream);
+            fp->gz_stream = NULL;
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->gz_stream->avail_in = count - nskip;
+        fp->gz_stream->next_in = cblock + nskip;
+        count = inflate_gzip_block(fp, 1);
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        if ( fp->idx_build_otf ) return -1; // cannot build index for gzip
+        return 0;
+    }
+    size = count;
+    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+    if (block_length < BLOCK_HEADER_LENGTH)
+    {
+        // A block cannot be shorter than its own header. Without this check
+        // "remaining" goes negative and hread() is asked for a huge length.
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    compressed_block = (uint8_t*)fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = hread(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+    if (count != remaining) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    size += count;
+    if ((count = inflate_block(fp, block_length)) < 0) return -1;
+    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+    fp->block_address = block_address;
+    fp->block_length = count;
+    if ( fp->idx_build_otf )
+    {
+        if (bgzf_index_add_block(fp) < 0) return -1;
+        fp->idx->ublock_addr += count;
+    }
+    cache_block(fp, size);
+    return 0;
+}
+
+// Read up to length uncompressed bytes into data, loading further blocks as
+// needed. Returns the number of bytes copied (fewer than length only when
+// the input runs out), or -1 if loading a block fails.
+ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
+{
+    if (length <= 0) return 0;
+    assert(fp->is_write == 0);
+
+    uint8_t *dest = (uint8_t*)data;
+    ssize_t done = 0;
+    // kludge to address signed vs. unsigned comparison warning
+    while (done < ((intptr_t)length)) {
+        int avail = fp->block_length - fp->block_offset;
+        if (avail <= 0) {
+            if (bgzf_read_block(fp) != 0) return -1;
+            avail = fp->block_length - fp->block_offset;
+            if (avail <= 0) break; // nothing more to read
+        }
+        int chunk = avail;
+        if (((intptr_t)(length - done)) < avail) chunk = length - done;
+        const uint8_t *from = (uint8_t*)fp->uncompressed_block;
+        memcpy(dest, from + fp->block_offset, chunk);
+        fp->block_offset += chunk;
+        dest += chunk;
+        done += chunk;
+    }
+    // block fully consumed: point at the next block and mark nothing loaded
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_offset = fp->block_length = 0;
+    }
+    fp->uncompressed_address += done;
+    return done;
+}
+
+// Read length bytes straight from the underlying stream with hread(),
+// bypassing block handling. Returns whatever hread() returns.
+ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
+{
+    ssize_t nread = hread(fp->fp, data, length);
+    return nread;
+}
+
+#ifdef BGZF_MT
+
+typedef struct { // per-thread state of one compression worker
+ struct bgzf_mtaux_t *mt; // shared pool state this worker belongs to
+ void *buf; // scratch buffer (BGZF_MAX_BLOCK_SIZE bytes) receiving compressed output
+ int i, errcode, toproc, compress_level; // i: worker index and first block handled; errcode: errors of the last batch; toproc: set when a batch awaits processing
+} worker_t;
+
+typedef struct bgzf_mtaux_t { // shared state of the multi-threaded writer
+ int n_threads, n_blks, curr, done; // n_blks: queue capacity; curr: blocks currently queued; done: set to tell workers to exit
+ volatile int proc_cnt; // number of workers that have finished the current batch
+ void **blk; // n_blks block buffers: uncompressed data on entry, compressed data after a batch
+ int *len; // length of the data in each blk[] entry
+ worker_t *w; // n_threads worker records
+ pthread_t *tid; // thread ids; tid[0] is unused because worker 0 runs on the calling thread
+ pthread_mutex_t lock; // protects toproc/done together with cv
+ pthread_cond_t cv; // signalled when a batch is ready or shutdown is requested
+} mtaux_t;
+
+static int worker_aux(worker_t *w) // one work cycle of a compression worker: wait, compress this worker's share, report
+{ // returns 1 when shutdown was requested (the thread should exit), 0 after finishing a batch
+ int i, stop = 0;
+ // wait for condition: to process or all done
+ pthread_mutex_lock(&w->mt->lock);
+ while (!w->toproc && !w->mt->done)
+ pthread_cond_wait(&w->mt->cv, &w->mt->lock);
+ if (w->mt->done) stop = 1;
+ w->toproc = 0; // consume the request while still holding the lock
+ pthread_mutex_unlock(&w->mt->lock);
+ if (stop) return 1; // to quit the thread
+ w->errcode = 0;
+ for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) { // blocks are interleaved: worker k takes k, k+n_threads, ...
+ int clen = BGZF_MAX_BLOCK_SIZE;
+ if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->compress_level) != 0)
+ w->errcode |= BGZF_ERR_ZLIB;
+ memcpy(w->mt->blk[i], w->buf, clen); // compressed data replaces the uncompressed block in place
+ w->mt->len[i] = clen;
+ }
+ __sync_fetch_and_add(&w->mt->proc_cnt, 1); // atomically report completion to mt_flush_queue()
+ return 0;
+}
+
+// Thread entry point: run work cycles until worker_aux() asks to stop.
+static void *mt_worker(void *data)
+{
+    worker_t *self = (worker_t*)data;
+    for (;;) {
+        if (worker_aux(self) != 0) break;
+    }
+    return 0;
+}
+
+// Enable multi-threaded compression on a handle opened for writing.
+// n_threads workers are used (worker 0 runs on the calling thread) and up to
+// n_threads * n_sub_blks blocks are queued before each batch is compressed.
+// Returns 0 on success; -1 if the handle is not writable, threading is
+// already enabled, the arguments are out of range or memory cannot be
+// allocated. If only some threads can be started the pool runs with the
+// threads that did start.
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+{
+    int i;
+    mtaux_t *mt;
+    pthread_attr_t attr;
+    if (!fp->is_write || fp->mt || n_threads <= 1 || n_sub_blks <= 0) return -1;
+    mt = (mtaux_t*)calloc(1, sizeof(mtaux_t));
+    if (mt == NULL) return -1;
+    mt->n_threads = n_threads;
+    mt->n_blks = n_threads * n_sub_blks;
+    mt->len = (int*)calloc(mt->n_blks, sizeof(int));
+    mt->blk = (void**)calloc(mt->n_blks, sizeof(void*));
+    mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
+    mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t));
+    if (mt->len == NULL || mt->blk == NULL || mt->tid == NULL || mt->w == NULL) goto fail;
+    for (i = 0; i < mt->n_blks; ++i) {
+        mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
+        if (mt->blk[i] == NULL) goto fail;
+    }
+    for (i = 0; i < mt->n_threads; ++i) {
+        mt->w[i].i = i;
+        mt->w[i].mt = mt;
+        mt->w[i].compress_level = fp->compress_level;
+        mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
+        if (mt->w[i].buf == NULL) goto fail;
+    }
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    pthread_mutex_init(&mt->lock, 0);
+    pthread_cond_init(&mt->cv, 0);
+    for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
+        if (pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]) != 0) break;
+    pthread_attr_destroy(&attr); // the attribute object is not needed once the threads exist
+    if (i < mt->n_threads) {
+        // Not every thread could be started: shrink the pool to the workers
+        // that exist so that nobody waits for, or joins, a missing thread.
+        int j;
+        for (j = i; j < mt->n_threads; ++j) {
+            free(mt->w[j].buf);
+            mt->w[j].buf = NULL;
+        }
+        mt->n_threads = i;
+    }
+    fp->mt = mt;
+    return 0;
+
+fail:
+    // no thread has been started yet, so plain frees are sufficient
+    if (mt->blk)
+        for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+    if (mt->w)
+        for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+    free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+    free(mt);
+    return -1;
+}
+
+static void mt_destroy(mtaux_t *mt) // stop all worker threads and release everything owned by the pool, including mt itself
+{
+ int i;
+ // signal all workers to quit
+ pthread_mutex_lock(&mt->lock);
+ mt->done = 1; mt->proc_cnt = 0;
+ pthread_cond_broadcast(&mt->cv); // wake every worker waiting in worker_aux()
+ pthread_mutex_unlock(&mt->lock);
+ for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread
+ // free other data allocated on heap
+ for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+ for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+ free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+ pthread_cond_destroy(&mt->cv); // destroyed only after every thread has been joined
+ pthread_mutex_destroy(&mt->lock);
+ free(mt);
+}
+
+// Move the pending uncompressed bytes of fp into the next free queue slot
+// and reset fp->block_offset. The caller guarantees a free slot.
+static void mt_queue(BGZF *fp)
+{
+    mtaux_t *pool = fp->mt;
+    assert(pool->curr < pool->n_blks); // guaranteed by the caller
+    const int slot = pool->curr;
+    const int nbytes = fp->block_offset;
+    memcpy(pool->blk[slot], fp->uncompressed_block, nbytes);
+    pool->len[slot] = nbytes;
+    fp->block_offset = 0;
+    pool->curr = slot + 1;
+}
+
+static int mt_flush_queue(BGZF *fp) // compress all queued blocks in parallel and write them out in queue order
+{ // returns 0 if fp->errcode is clear afterwards, -1 otherwise
+ int i;
+ mtaux_t *mt = fp->mt;
+ // signal all the workers to compress
+ pthread_mutex_lock(&mt->lock);
+ for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
+ mt->proc_cnt = 0;
+ pthread_cond_broadcast(&mt->cv);
+ pthread_mutex_unlock(&mt->lock);
+ // worker 0 is doing things here
+ worker_aux(&mt->w[0]);
+ // wait for all the threads to complete
+ while (mt->proc_cnt < mt->n_threads); // NOTE(review): busy-wait on the volatile counter, no blocking primitive
+ // dump data to disk
+ for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
+ for (i = 0; i < mt->curr; ++i)
+ if (hwrite(fp->fp, mt->blk[i], mt->len[i]) != mt->len[i]) {
+ fp->errcode |= BGZF_ERR_IO;
+ break;
+ }
+ mt->curr = 0; // the queue is empty again, even when a write failed
+ return (fp->errcode == 0)? 0 : -1;
+}
+
+// Flush policy used while writing: without threads flush immediately; with
+// threads queue the pending block and compress only once the queue is full.
+// Returns 0 on success, otherwise the result of the flush that was run.
+static int lazy_flush(BGZF *fp)
+{
+    if (!fp->mt) return bgzf_flush(fp);
+    if (fp->block_offset) mt_queue(fp);
+    if (fp->mt->curr < fp->mt->n_blks) return 0;
+    return mt_flush_queue(fp);
+}
+
+#else // ~ #ifdef BGZF_MT
+
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) // stand-in used when BGZF_MT is not defined
+{
+ return 0; // arguments are ignored; always returns 0
+}
+
+static inline int lazy_flush(BGZF *fp) // version used when BGZF_MT is not defined: there is no queue, so flush right away
+{
+ return bgzf_flush(fp);
+}
+
+#endif // ~ #ifdef BGZF_MT
+
+// Compress and write out all data buffered in fp.
+// A no-op returning 0 for handles opened for reading. With multi-threading
+// enabled the pending block is queued and the whole queue is flushed.
+// Returns 0 on success and -1 on error; fp->errcode is updated for
+// compression and I/O errors.
+int bgzf_flush(BGZF *fp)
+{
+    if (!fp->is_write) return 0;
+#ifdef BGZF_MT
+    if (fp->mt) {
+        if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
+        return mt_flush_queue(fp);
+    }
+#endif
+    while (fp->block_offset > 0) {
+        if ( fp->idx_build_otf )
+        {
+            // stop when the index cannot grow instead of carrying on with a broken index
+            if (bgzf_index_add_block(fp) < 0) return -1;
+            fp->idx->ublock_addr += fp->block_offset;
+        }
+        int block_length = deflate_block(fp, fp->block_offset);
+        if (block_length < 0) return -1;
+        if (hwrite(fp->fp, fp->compressed_block, block_length) != block_length) {
+            fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+            return -1;
+        }
+        fp->block_address += block_length;
+    }
+    return 0;
+}
+
+// Flush only if appending size more bytes would overflow the current block.
+// Returns 0 when nothing had to be done, otherwise the result of the flush.
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+    const int fits = (fp->block_offset + size <= BGZF_BLOCK_SIZE);
+    return fits ? 0 : lazy_flush(fp);
+}
+
+// Append length bytes from data to the output.
+// Uncompressed handles pass the data straight to hwrite(). Otherwise the
+// bytes are staged in fp->uncompressed_block and a (lazy) flush is triggered
+// each time the block fills up.
+// Returns the number of bytes accepted, or -1 if a flush fails.
+ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
+{
+    if ( !fp->is_compressed )
+        return hwrite(fp->fp, data, length);
+
+    const uint8_t *src = (const uint8_t*)data;
+    ssize_t left = length;
+    assert(fp->is_write);
+    while (left > 0) {
+        uint8_t *block = (uint8_t*)fp->uncompressed_block;
+        int chunk = BGZF_BLOCK_SIZE - fp->block_offset; // free space in the block
+        if (chunk > left) chunk = left;
+        memcpy(block + fp->block_offset, src, chunk);
+        fp->block_offset += chunk;
+        src += chunk;
+        left -= chunk;
+        if (fp->block_offset != BGZF_BLOCK_SIZE) continue;
+        if (lazy_flush(fp) != 0) return -1;
+    }
+    return length - left;
+}
+
+// Write length bytes straight to the underlying stream with hwrite(),
+// bypassing compression. Returns whatever hwrite() returns.
+ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
+{
+    ssize_t nwritten = hwrite(fp->fp, data, length);
+    return nwritten;
+}
+
+// Flush pending output, write the terminating empty block for compressed
+// writers, close the underlying stream and free the handle.
+// Returns 0 on success and -1 on error.
+// NOTE(review): as before, the early error returns leave fp allocated and the
+// stream open; callers cannot tell whether fp is still valid after a failure.
+int bgzf_close(BGZF* fp)
+{
+    int ret, block_length;
+    if (fp == 0) return -1;
+    if (fp->is_write && fp->is_compressed) {
+        if (bgzf_flush(fp) != 0) return -1;
+        fp->compress_level = -1;
+        block_length = deflate_block(fp, 0); // write an empty block
+        // deflate_block() returns -1 on failure (errcode already set); a
+        // negative value must never reach hwrite() as a length
+        if (block_length < 0) return -1;
+        if (hwrite(fp->fp, fp->compressed_block, block_length) != block_length
+            || hflush(fp->fp) != 0) {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+#ifdef BGZF_MT
+        if (fp->mt) mt_destroy(fp->mt);
+#endif
+    }
+    if ( fp->is_gzip )
+    {
+        if (!fp->is_write) (void)inflateEnd(fp->gz_stream);
+        else (void)deflateEnd(fp->gz_stream);
+        free(fp->gz_stream);
+    }
+    ret = hclose(fp->fp);
+    if (ret != 0) return -1;
+    bgzf_index_destroy(fp);
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free_cache(fp);
+    free(fp);
+    return 0;
+}
+
+// Set the block cache size limit for fp; a NULL handle is ignored.
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+    if (fp == NULL) return;
+    fp->cache_size = cache_size;
+}
+
+// Check whether the file ends with the 28-byte empty BGZF block.
+// The current file position is restored when the check completes.
+// Returns 1 if the marker is present, 0 if it is not, 2 if the stream cannot
+// seek (ESPIPE) and -1 on any other error.
+int bgzf_check_EOF(BGZF *fp)
+{
+    uint8_t tail[28];
+    off_t saved = htell(fp->fp);
+    if (hseek(fp->fp, -28, SEEK_END) < 0) {
+        if (errno != ESPIPE) return -1;
+        hclearerr(fp->fp);
+        return 2;
+    }
+    if ( hread(fp->fp, tail, 28) != 28 ) return -1;
+    if ( hseek(fp->fp, saved, SEEK_SET) < 0 ) return -1;
+    if ( memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", tail, 28) != 0 ) return 0;
+    return 1;
+}
+
+// Seek to a virtual file offset: the upper bits of pos (pos >> 16) are the
+// compressed block address, the low 16 bits the offset inside that block.
+// Only SEEK_SET on handles opened for reading is supported.
+// Returns 0 on success, -1 on misuse or seek failure (fp->errcode updated).
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    if (fp->is_write || where != SEEK_SET) {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+
+    const int within_block = pos & 0xFFFF;
+    const int64_t caddr = pos >> 16;
+    if (hseek(fp->fp, caddr, SEEK_SET) < 0) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_length = 0; // indicates current block has not been loaded
+    fp->block_address = caddr;
+    fp->block_offset = within_block;
+    return 0;
+}
+
+// Test whether the file fn starts with the first 16 bytes of the BGZF header.
+// Returns 1 if it does, 0 if it does not (or the file cannot be opened or is
+// shorter than 16 bytes) and -1 if closing the file fails.
+int bgzf_is_bgzf(const char *fn)
+{
+    uint8_t head[16];
+    hFILE *in = hopen(fn, "r");
+    if (in == 0) return 0;
+    int got = hread(in, head, 16);
+    if ( hclose(in) < 0 ) return -1;
+    if (got != 16) return 0;
+    return memcmp(g_magic, head, 16) == 0;
+}
+
+// Read one byte from the uncompressed stream.
+// Returns the byte (0-255), -1 at end-of-file or -2 on error.
+int bgzf_getc(BGZF *fp)
+{
+    if (fp->block_offset >= fp->block_length) {
+        if (bgzf_read_block(fp) != 0) return -2; /* error */
+        if (fp->block_length == 0) return -1; /* end-of-file */
+    }
+    const unsigned char *bytes = (unsigned char*)fp->uncompressed_block;
+    const int ch = bytes[fp->block_offset];
+    fp->block_offset += 1;
+    // block fully consumed: point at the next block and mark nothing loaded
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+    fp->uncompressed_address += 1;
+    return ch;
+}
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+// Read one delim-terminated line into str (the delimiter is not stored).
+// For delim == '\n' a trailing '\r' is removed as well.
+// Returns the length of the line, -1 at end-of-file with nothing read, or -2
+// on error. When the line buffer cannot be grown, -2 is returned and str
+// keeps its previous buffer (which may hold an unterminated partial line).
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+    int l, state = 0;
+    unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+    str->l = 0;
+    do {
+        if (fp->block_offset >= fp->block_length) {
+            if (bgzf_read_block(fp) != 0) { state = -2; break; }
+            if (fp->block_length == 0) { state = -1; break; }
+        }
+        for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+        if (l < fp->block_length) state = 1;
+        l -= fp->block_offset;
+        if (str->l + l + 1 >= str->m) {
+            // Grow through temporaries so that a failed realloc() neither
+            // leaks the old buffer nor leaves str->m larger than the buffer.
+            // (assumes kstring_t::m is a size_t -- TODO confirm)
+            size_t new_m = str->l + l + 2;
+            kroundup32(new_m);
+            char *new_s = (char*)realloc(str->s, new_m);
+            if (new_s == NULL) return -2;
+            str->s = new_s;
+            str->m = new_m;
+        }
+        memcpy(str->s + str->l, buf + fp->block_offset, l);
+        str->l += l;
+        fp->block_offset += l + 1;
+        if (fp->block_offset >= fp->block_length) {
+            fp->block_address = htell(fp->fp);
+            fp->block_offset = 0;
+            fp->block_length = 0;
+        }
+    } while (state == 0);
+    if (str->l == 0 && state < 0) return state;
+    // Account for every byte consumed: the line itself plus the delimiter
+    // when one was found (state == 1). The delimiter used to be left out,
+    // which made bgzf_utell() drift by one byte per line.
+    fp->uncompressed_address += str->l + (state == 1 ? 1 : 0);
+    if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--;
+    str->s[str->l] = 0;
+    return str->l;
+}
+
+// Free the block index attached to fp, if any, and switch off on-the-fly
+// index building.
+void bgzf_index_destroy(BGZF *fp)
+{
+    bgzidx_t *index = fp->idx;
+    if ( index == NULL ) return;
+    free(index->offs);
+    free(index);
+    fp->idx = NULL;
+    fp->idx_build_otf = 0;
+}
+
+// Discard any existing index and start building a new one on the fly.
+// Returns 0 on success, -1 if the index cannot be allocated.
+int bgzf_index_build_init(BGZF *fp)
+{
+    bgzf_index_destroy(fp);
+    bgzidx_t *fresh = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    fp->idx = fresh;
+    if ( fresh == NULL ) return -1;
+    fp->idx_build_otf = 1; // build index on the fly
+    return 0;
+}
+
+// Append an index record for the current block: its uncompressed start
+// (fp->idx->ublock_addr) paired with its compressed address
+// (fp->block_address).
+// Returns 0 on success, -1 if the index cannot grow; in that case the index
+// is left exactly as it was.
+int bgzf_index_add_block(BGZF *fp)
+{
+    bgzidx_t *index = fp->idx;
+    if ( index->noffs + 1 > index->moffs )
+    {
+        int new_m = index->noffs + 1;
+        kroundup32(new_m);
+        // realloc into a temporary: on failure the old array must stay
+        // reachable and noffs/moffs must not have been advanced
+        bgzidx1_t *grown = (bgzidx1_t*) realloc(index->offs, (size_t)new_m*sizeof(bgzidx1_t));
+        if ( !grown ) return -1;
+        index->offs = grown;
+        index->moffs = new_m;
+    }
+    index->offs[ index->noffs ].uaddr = index->ublock_addr;
+    index->offs[ index->noffs ].caddr = fp->block_address;
+    index->noffs++;
+    return 0;
+}
+
+// Write one 64-bit value to out, passing it through ed_swap_8p() first when
+// is_be is set. Returns 0 on success, -1 on a short write.
+static int index_write_u64(FILE *out, uint64_t x, int is_be)
+{
+    const void *bytes = &x;
+    if ( is_be ) bytes = ed_swap_8p(&x);
+    return fwrite(bytes, 1, sizeof(x), out) == sizeof(x) ? 0 : -1;
+}
+
+// Flush fp and write its block index to the file named bname, with suffix
+// appended when suffix is not NULL.
+// The file holds the number of records followed by one (caddr, uaddr) pair
+// per record; the record at position 0 of the in-memory index is not written.
+// Returns 0 on success, -1 if flushing, allocating the file name, opening,
+// writing or closing fails.
+int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
+{
+    if (bgzf_flush(fp) != 0) return -1;
+
+    assert(fp->idx);
+    char *tmp = NULL;
+    if ( suffix )
+    {
+        size_t blen = strlen(bname);
+        size_t slen = strlen(suffix);
+        tmp = (char*) malloc(blen + slen + 1);
+        if ( !tmp ) return -1;
+        memcpy(tmp,bname,blen);
+        memcpy(tmp+blen,suffix,slen+1);
+    }
+
+    FILE *idx = fopen(tmp?tmp:bname,"wb");
+    free(tmp);
+    if ( !idx ) return -1;
+
+    // Note that the index contains one extra record when indexing files opened
+    // for reading. The terminating record is not present when opened for writing.
+    // This is not a bug.
+
+    int i;
+    int failed = index_write_u64(idx, fp->idx->noffs - 1, fp->is_be);
+    for (i=1; !failed && i<fp->idx->noffs; i++)
+    {
+        failed = index_write_u64(idx, fp->idx->offs[i].caddr, fp->is_be)
+              || index_write_u64(idx, fp->idx->offs[i].uaddr, fp->is_be);
+    }
+    // buffered data may only hit the disk in fclose(): its result matters too
+    if ( fclose(idx) != 0 ) failed = 1;
+    return failed ? -1 : 0;
+}
+
+
+// Load a block index written by bgzf_index_dump() from the file named bname,
+// with suffix appended when suffix is not NULL, and attach it to fp.
+// Any index previously attached to fp is freed on success.
+// Returns 0 on success. Returns -1 if the file cannot be opened or read, is
+// truncated, declares an implausible record count, or memory runs out; on
+// failure fp->idx is left untouched and the file is closed.
+int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
+{
+    char *tmp = NULL;
+    if ( suffix )
+    {
+        size_t blen = strlen(bname);
+        size_t slen = strlen(suffix);
+        tmp = (char*) malloc(blen + slen + 1);
+        if ( !tmp ) return -1;
+        memcpy(tmp,bname,blen);
+        memcpy(tmp+blen,suffix,slen+1);
+    }
+
+    FILE *idx = fopen(tmp?tmp:bname,"rb");
+    free(tmp);
+    if ( !idx ) return -1;
+
+    int i;
+    uint64_t x;
+    bgzidx_t *index = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    if ( !index ) goto fail;
+    if ( fread(&x, 1, sizeof(x), idx) != sizeof(x) ) goto fail;
+    if ( fp->is_be ) x = ed_swap_8(x);
+
+    // The count comes from the file: make sure count + 1 fits the int fields
+    // before trusting it for the allocation below.
+    int n = (int)(x + 1);
+    if ( n < 1 || (uint64_t)n != x + 1 ) goto fail;
+    index->noffs = index->moffs = n;
+    index->offs = (bgzidx1_t*) malloc((size_t)n*sizeof(bgzidx1_t));
+    if ( !index->offs ) goto fail;
+    index->offs[0].caddr = index->offs[0].uaddr = 0;
+
+    for (i=1; i<index->noffs; i++)
+    {
+        uint64_t pair[2]; // caddr followed by uaddr, as written by bgzf_index_dump()
+        if ( fread(pair, sizeof(x), 2, idx) != 2 ) goto fail;
+        index->offs[i].caddr = fp->is_be ? ed_swap_8(pair[0]) : pair[0];
+        index->offs[i].uaddr = fp->is_be ? ed_swap_8(pair[1]) : pair[1];
+    }
+    fclose(idx);
+
+    if ( fp->idx ) // replace, rather than leak, an index that is already attached
+    {
+        free(fp->idx->offs);
+        free(fp->idx);
+    }
+    fp->idx = index;
+    return 0;
+
+fail:
+    if ( index )
+    {
+        free(index->offs);
+        free(index);
+    }
+    fclose(idx);
+    return -1;
+}
+
+// Position the stream at uoffset, an offset into the uncompressed data.
+// Uncompressed files are seeked directly; compressed files need a block index
+// (built on the fly or loaded), which is searched for the block containing
+// uoffset. Only SEEK_SET is supported, as in bgzf_seek().
+// Returns 0 on success and -1 on error (fp->errcode updated).
+int bgzf_useek(BGZF *fp, long uoffset, int where)
+{
+    if ( where != SEEK_SET || uoffset < 0 )
+    {
+        // other origins used to be silently treated as SEEK_SET, and a
+        // negative offset indexed before the start of the index array
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+    if ( !fp->is_compressed )
+    {
+        if (hseek(fp->fp, uoffset, SEEK_SET) < 0)
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        fp->block_length = 0; // indicates current block has not been loaded
+        fp->block_address = uoffset;
+        fp->block_offset = 0;
+        if ( bgzf_read_block(fp) < 0 ) return -1;
+        fp->uncompressed_address = uoffset;
+        return 0;
+    }
+
+    if ( !fp->idx )
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+
+    // binary search for the last record whose uaddr is <= uoffset
+    int ilo = 0, ihi = fp->idx->noffs - 1;
+    while ( ilo<=ihi )
+    {
+        int mid = ilo + (ihi - ilo)/2; // integer midpoint, no floating point needed
+        if ( uoffset < ((intptr_t)fp->idx->offs[mid].uaddr) ) ihi = mid - 1;
+        else ilo = mid + 1;
+    }
+    int i = ilo-1;
+    if ( i < 0 ) // empty index, or no block starts at or before uoffset
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    if (hseek(fp->fp, fp->idx->offs[i].caddr, SEEK_SET) < 0)
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_length = 0; // indicates current block has not been loaded
+    fp->block_address = fp->idx->offs[i].caddr;
+    fp->block_offset = 0;
+    if ( bgzf_read_block(fp) < 0 ) return -1;
+    if ( uoffset - fp->idx->offs[i].uaddr > 0 )
+    {
+        fp->block_offset = uoffset - fp->idx->offs[i].uaddr;
+        assert( fp->block_offset <= fp->block_length ); // todo: skipped, unindexed, blocks
+    }
+    fp->uncompressed_address = uoffset;
+    return 0;
+}
+
+// Report the current position in the uncompressed data stream.
+long bgzf_utell(BGZF *fp)
+{
+    const long upos = fp->uncompressed_address; // currently maintained only when reading
+    return upos;
+}
+
diff --git a/bgzf.h b/bgzf.h
new file mode 100644
index 0000000..acdb673
--- /dev/null
+++ b/bgzf.h
@@ -0,0 +1,313 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+ Copyright (C) 2009, 2013, 2014 Genome Research Ltd
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/* The BGZF library was originally written by Bob Handsaker from the Broad
+ * Institute. It was later improved by the SAMtools developers. */
+
+#ifndef HTSLIB_BGZF_H
+#define HTSLIB_BGZF_H
+
+#include "plink_common.h"
+#include <sys/types.h>
+
+#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
+#define BGZF_MAX_BLOCK_SIZE 0x10000
+
+#define BGZF_ERR_ZLIB 1
+#define BGZF_ERR_HEADER 2
+#define BGZF_ERR_IO 4
+#define BGZF_ERR_MISUSE 8
+
+struct hFILE;
+struct bgzf_mtaux_t;
+typedef struct __bgzidx_t bgzidx_t;
+
+// State of one open BGZF stream.
+struct BGZF {
+ // errcode accumulates BGZF_ERR_* bits (OR-ed in on failure); the rest are mode flags.
+ // NOTE(review): these are signed bit-fields, so a 1-bit field such as is_gzip
+ // may read back as -1 rather than 1; test them for non-zero only.
+ int errcode:16, is_write:2, is_be:2, compress_level:9, is_compressed:2, is_gzip:1;
+ int cache_size;
+ // block_length == 0 indicates the current block has not been loaded;
+ // block_offset is the position within the current uncompressed block.
+ int block_length, block_offset;
+ // block_address: offset of the current block in the underlying file;
+ // uncompressed_address: offset in the uncompressed stream (maintained only when reading).
+ int64_t block_address, uncompressed_address;
+ void *uncompressed_block, *compressed_block;
+ void *cache; // a pointer to a hash table
+ struct hFILE *fp; // actual file handle
+ struct bgzf_mtaux_t *mt; // only used for multi-threading
+ bgzidx_t *idx; // BGZF index
+ int idx_build_otf; // build index on the fly, set by bgzf_index_build_init()
+ z_stream *gz_stream;// for gzip-compressed files
+};
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+// String buffer type filled in by bgzf_getline().
+// NOTE(review): presumably l is the current length and m the allocated size
+// of s, as in klib's kstring -- confirm against the implementation.
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+ /******************
+ * Basic routines *
+ ******************/
+
+ /**
+ * Open an existing file descriptor for reading or writing.
+ *
+ * @param fd file descriptor
+ * @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+ * writing, 'a' for appending, 'g' for gzip rather than BGZF
+ * compression (with 'w' only), and digit specifies the zlib
+ * compression level.
+ * Note that there is a distinction between 'u' and '0': the
+ * first yields plain uncompressed output whereas the latter
+ * outputs uncompressed data wrapped in the zlib format.
+ * @return BGZF file handler; 0 on error
+ */
+ BGZF* bgzf_dopen(int fd, const char *mode);
+
+ #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
+
+ /**
+ * Open the specified file for reading or writing.
+ */
+ BGZF* bgzf_open(const char* path, const char *mode);
+
+ /**
+ * Open an existing hFILE stream for reading or writing.
+ */
+ BGZF* bgzf_hopen(struct hFILE *fp, const char *mode);
+
+ /**
+ * Close the BGZF and free all associated resources.
+ *
+ * @param fp BGZF file handler
+ * @return 0 on success and -1 on error
+ */
+ int bgzf_close(BGZF *fp);
+
+ /**
+ * Read up to _length_ bytes from the file storing into _data_.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to read into
+ * @param length size of data to read
+ * @return number of bytes actually read; 0 on end-of-file and -1 on error
+ */
+ ssize_t bgzf_read(BGZF *fp, void *data, size_t length);
+
+ /**
+ * Write _length_ bytes from _data_ to the file. If no I/O errors occur,
+ * the complete _length_ bytes will be written (or queued for writing).
+ *
+ * @param fp BGZF file handler
+ * @param data data array to write
+ * @param length size of data to write
+ * @return number of bytes written (i.e., _length_); negative on error
+ */
+ ssize_t bgzf_write(BGZF *fp, const void *data, size_t length);
+
+ /**
+ * Read up to _length_ bytes directly from the underlying stream without
+ * decompressing. Bypasses BGZF blocking, so must be used with care in
+ * specialised circumstances only.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to read into
+ * @param length number of raw bytes to read
+ * @return number of bytes actually read; 0 on end-of-file and -1 on error
+ */
+ ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length);
+
+ /**
+ * Write _length_ bytes directly to the underlying stream without
+ * compressing. Bypasses BGZF blocking, so must be used with care
+ * in specialised circumstances only.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to write
+ * @param length number of raw bytes to write
+ * @return number of bytes actually written; -1 on error
+ */
+ ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length);
+
+ /**
+ * Write the data in the buffer to the file.
+ */
+ int bgzf_flush(BGZF *fp);
+
+ /**
+ * Return a virtual file pointer to the current location in the file.
+ * No interpretation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ */
+ #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
+
+ /**
+ * Set the file to read from the location specified by _pos_.
+ *
+ * @param fp BGZF file handler
+ * @param pos virtual file offset returned by bgzf_tell()
+ * @param whence must be SEEK_SET
+ * @return 0 on success and -1 on error
+ */
+ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
+
+ /**
+ * Check if the BGZF end-of-file (EOF) marker is present
+ *
+ * @param fp BGZF file handler opened for reading
+ * @return 1 if the EOF marker is present and correct;
+ * 2 if it can't be checked, e.g., because fp isn't seekable;
+ * 0 if the EOF marker is absent;
+ * -1 (with errno set) on error
+ */
+ int bgzf_check_EOF(BGZF *fp);
+
+ /**
+ * Check if a file is in the BGZF format
+ *
+ * @param fn file name
+ * @return 1 if _fn_ is BGZF; 0 if not or on I/O error
+ */
+ int bgzf_is_bgzf(const char *fn);
+
+ /*********************
+ * Advanced routines *
+ *********************/
+
+ /**
+ * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+ *
+ * @param fp BGZF file handler
+ * @param size size of cache in bytes; 0 to disable caching (default)
+ */
+ void bgzf_set_cache_size(BGZF *fp, int size);
+
+ /**
+ * Flush the file if the remaining buffer size is smaller than _size_
+ * @return 0 if flushing succeeded or was not needed; negative on error
+ */
+ int bgzf_flush_try(BGZF *fp, ssize_t size);
+
+ /**
+ * Read one byte from a BGZF file. It is faster than bgzf_read()
+ * @param fp BGZF file handler
+ * @return byte read; -1 on end-of-file or error
+ */
+ int bgzf_getc(BGZF *fp);
+
+ /**
+ * Read one line from a BGZF file. It is faster than bgzf_getc()
+ *
+ * @param fp BGZF file handler
+ * @param delim delimiter
+ * @param str string to write to; must be initialized
+ * @return length of the string; 0 on end-of-file; negative on error
+ */
+ int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
+
+ /**
+ * Read the next BGZF block.
+ */
+ int bgzf_read_block(BGZF *fp);
+
+ /**
+ * Enable multi-threading (only effective on writing and when the
+ * library was compiled with -DBGZF_MT)
+ *
+ * @param fp BGZF file handler; must be opened for writing
+ * @param n_threads #threads used for writing
+ * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
+ */
+ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
+
+
+ /*******************
+ * bgzidx routines *
+ *******************/
+
+ /**
+ * Position BGZF at the uncompressed offset
+ *
+ * @param fp BGZF file handler; must be opened for reading
+ * @param uoffset file offset in the uncompressed data
+ * @param where only SEEK_SET is supported at the moment
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_useek(BGZF *fp, long uoffset, int where);
+
+ /**
+ * Position in uncompressed BGZF
+ *
+ * @param fp BGZF file handler; must be opened for reading
+ *
+ * Returns the current offset on success and -1 on error.
+ */
+ long bgzf_utell(BGZF *fp);
+
+ /**
+ * Tell BGZF to build index while compressing.
+ *
+ * @param fp BGZF file handler; can be opened for reading or writing.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_index_build_init(BGZF *fp);
+
+ /**
+ * Load BGZF index
+ *
+ * @param fp BGZF file handler
+ * @param bname base name
+ * @param suffix suffix to add to bname (can be NULL)
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix);
+
+ /**
+ * Save BGZF index
+ *
+ * @param fp BGZF file handler
+ * @param bname base name
+ * @param suffix suffix to add to bname (can be NULL)
+ *
+ * Returns 0 on success and -1 on error.
+ */
+ int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix);
+
+// #ifdef __cplusplus
+// }
+// #endif
+
+#endif
diff --git a/hfile.c b/hfile.c
new file mode 100644
index 0000000..9ab1ea9
--- /dev/null
+++ b/hfile.c
@@ -0,0 +1,578 @@
+/* hfile.c -- buffered low-level input/output streams.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include "plink_common.h"
+#include <errno.h>
+
+#include "hfile.h"
+#include "hfile_internal.h"
+
+/* hFILE fields are used as follows:
+
+ char *buffer; // Pointer to the start of the I/O buffer
+ char *begin; // First not-yet-read character / unused position
+ char *end; // First unfilled/unfillable position
+ char *limit; // Pointer to the first position past the buffer
+
+ const hFILE_backend *backend; // Methods to refill/flush I/O buffer
+
+ off_t offset; // Offset within the stream of buffer position 0
+ int at_eof:1; // For reading, whether EOF has been seen
+ int has_errno; // Error number from the last failure on this stream
+
+For reading, begin is the first unread character in the buffer and end is the
+first unfilled position:
+
+ -----------ABCDEFGHIJKLMNO---------------
+ ^buffer ^begin ^end ^limit
+
+For writing, begin is the first unused position and end is unused so remains
+equal to buffer:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ---------------
+ ^buffer ^begin ^limit
+ ^end
+
+Thus if begin > end then there is a non-empty write buffer, if begin < end
+then there is a non-empty read buffer, and if begin == end then both buffers
+are empty. In all cases, the stream's file position indicator corresponds
+to the position pointed to by begin. */
+
+/* Allocate a stream object of struct_size bytes (an hFILE, or a backend
+   struct that begins with one) together with its I/O buffer.  A capacity of
+   0 selects the 32768-byte default.  The backend pointer is not set here;
+   the caller fills it in.  Returns NULL if either allocation fails.  */
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
+{
+    hFILE *stream = (hFILE *) malloc(struct_size);
+    if (stream == NULL) return NULL;
+
+    if (capacity == 0) capacity = 32768;
+    // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory
+    else if (capacity > 32768 && strchr(mode, 'r') != NULL) capacity = 32768;
+
+    stream->buffer = (char *) malloc(capacity);
+    if (stream->buffer == NULL) {
+        hfile_destroy(stream);
+        return NULL;
+    }
+
+    // Both the read and the write buffer start out empty
+    stream->limit = stream->buffer + capacity;
+    stream->begin = stream->buffer;
+    stream->end = stream->buffer;
+
+    stream->offset = 0;
+    stream->at_eof = 0;
+    stream->has_errno = 0;
+    return stream;
+}
+
+/* Release a stream's buffer and the stream object itself.  Accepts NULL.
+   errno is preserved so this can be used on error paths.  */
+void hfile_destroy(hFILE *fp)
+{
+    const int saved_errno = errno;
+    if (fp != NULL) {
+        free(fp->buffer);
+        free(fp);
+    }
+    errno = saved_errno;
+}
+
+/* Non-zero when bytes are pending in the write buffer.  While writing, end
+   stays equal to buffer, so begin lying beyond end means unflushed data.  */
+static inline int writebuffer_is_nonempty(hFILE *fp)
+{
+    return fp->end < fp->begin;
+}
+
+/* Top up the read buffer with a single backend read.  Unread bytes are
+   first slid down to the start of the buffer (advancing fp->offset by the
+   amount discarded) so the free space is contiguous.  Returns the number of
+   bytes added, which is 0 at EOF or when the buffer is already full, or the
+   backend's negative result on error (errno is saved in fp->has_errno).  */
+static ssize_t refill_buffer(hFILE *fp)
+{
+    ssize_t added;
+
+    if (fp->begin > fp->buffer) {
+        const size_t unread = fp->end - fp->begin;
+        fp->offset += fp->begin - fp->buffer;
+        memmove(fp->buffer, fp->begin, unread);
+        fp->begin = fp->buffer;
+        fp->end = fp->buffer + unread;
+    }
+
+    // Nothing to fetch once EOF has been seen or no room is left
+    if (fp->at_eof || fp->end == fp->limit) return 0;
+
+    added = fp->backend->read(fp, fp->end, fp->limit - fp->end);
+    if (added < 0) {
+        fp->has_errno = errno;
+        return added;
+    }
+    if (added == 0) fp->at_eof = 1;
+
+    fp->end += added;
+    return added;
+}
+
+/* Slow path of hgetc(), entered only when the read buffer is empty: refill
+   once and hand back the first new byte, or EOF if nothing could be read.  */
+int hgetc2(hFILE *fp)
+{
+    if (refill_buffer(fp) <= 0) return EOF;
+    return (unsigned char) *(fp->begin++);
+}
+
+/* Copy up to nbytes of upcoming input into buffer without consuming it.
+   The read buffer is refilled until it holds nbytes or a refill adds
+   nothing (EOF, or the buffer is full).  Returns the number of bytes
+   copied, or the negative refill result on an I/O error.  */
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
+{
+    size_t avail = fp->end - fp->begin;
+
+    while (avail < nbytes) {
+        ssize_t got = refill_buffer(fp);
+        if (got < 0) return got;
+        if (got == 0) break;
+        avail += got;
+    }
+
+    if (avail > nbytes) avail = nbytes;
+    memcpy(buffer, fp->begin, avail);
+    return avail;
+}
+
+/* Slow path of hread().  Called only from hread(); when called, our buffer is
+ empty and nread bytes have already been placed in the destination buffer.
+ Returns the total number of bytes delivered (including those nread bytes),
+ which is short only if EOF was reached, or a negative value if a backend
+ read failed (bytes already copied are then not reported). */
+ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread)
+{
+ const size_t capacity = fp->limit - fp->buffer;
+ char *dest = (char *) destv;
+ // Skip past what hread() already delivered
+ dest += nread, nbytes -= nread;
+
+ // Read large requests directly into the destination buffer
+ // ("large" = at least half the buffer capacity); these bytes bypass the
+ // buffer, so the stream offset is advanced by hand.
+ while (nbytes * 2 >= capacity && !fp->at_eof) {
+ ssize_t n = fp->backend->read(fp, dest, nbytes);
+ if (n < 0) { fp->has_errno = errno; return n; }
+ else if (n == 0) fp->at_eof = 1;
+ fp->offset += n;
+ dest += n, nbytes -= n;
+ nread += n;
+ }
+
+ // Serve the (small) remainder through the buffer
+ while (nbytes > 0 && !fp->at_eof) {
+ size_t n;
+ ssize_t ret = refill_buffer(fp);
+ if (ret < 0) return ret;
+
+ n = fp->end - fp->begin;
+ if (n > nbytes) n = nbytes;
+ memcpy(dest, fp->begin, n);
+ fp->begin += n;
+ dest += n, nbytes -= n;
+ nread += n;
+ }
+
+ return nread;
+}
+
+/* Push the pending write data, fp->[buffer,begin), out through the backend,
+   calling it repeatedly until everything has been accepted.  Returns 0 on
+   success, or the backend's negative result on error (errno is saved in
+   fp->has_errno and the buffer is left as it was).  */
+static ssize_t flush_buffer(hFILE *fp)
+{
+    const char *cursor;
+
+    for (cursor = fp->buffer; cursor < fp->begin; ) {
+        ssize_t written = fp->backend->write(fp, cursor, fp->begin - cursor);
+        if (written < 0) {
+            fp->has_errno = errno;
+            return written;
+        }
+        cursor += written;
+        fp->offset += written;
+    }
+
+    fp->begin = fp->buffer;  // Leave the buffer empty
+    return 0;
+}
+
+/* Write out any buffered output, then ask the backend to flush as well.
+   Returns 0 on success or EOF on failure.  */
+int hflush(hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+    if (fp->backend->flush(fp) >= 0) return 0;
+
+    fp->has_errno = errno;
+    return EOF;
+}
+
+/* Slow path of hputc(), entered only when the buffer is already full: flush
+   it, then store the character.  Returns c, or EOF if the flush failed.  */
+int hputc2(int c, hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+
+    *fp->begin = c;
+    fp->begin++;
+    return c;
+}
+
+/* Slow path of hwrite().  Called only from hwrite() and hputs2(); when
+ called, our buffer is full and ncopied bytes from the source have already
+ been copied to our buffer.  Returns totalbytes on success, or a negative
+ value if flushing or a direct backend write failed. */
+ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied)
+{
+ const char *src = (const char *) srcv;
+ ssize_t ret;
+ const size_t capacity = fp->limit - fp->buffer;
+ size_t remaining = totalbytes - ncopied;
+ src += ncopied;
+
+ // Empty the full buffer first so output stays in order
+ ret = flush_buffer(fp);
+ if (ret < 0) return ret;
+
+ // Write large blocks out directly from the source buffer
+ // ("large" = at least half the buffer capacity); these bytes bypass the
+ // buffer, so the stream offset is advanced by hand.
+ while (remaining * 2 >= capacity) {
+ ssize_t n = fp->backend->write(fp, src, remaining);
+ if (n < 0) { fp->has_errno = errno; return n; }
+ fp->offset += n;
+ src += n, remaining -= n;
+ }
+
+ // Just buffer any remaining characters
+ // (fewer than capacity/2 are left, so they fit in the empty buffer)
+ memcpy(fp->begin, src, remaining);
+ fp->begin += remaining;
+
+ return totalbytes;
+}
+
+/* Slow path of hputs(), entered only when the buffer is already full.
+   Delegates to hwrite2() and maps its result to 0 (success) or EOF.  */
+int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp)
+{
+    if (hwrite2(fp, text, totalbytes, ncopied) < 0) return EOF;
+    return 0;
+}
+
+/* Reposition the stream.  Pending output is flushed first; for a reading
+   stream any buffered input is discarded once the backend seek succeeds.
+   Returns the new offset, or a negative value on failure.  */
+off_t hseek(hFILE *fp, off_t offset, int whence)
+{
+    off_t newpos;
+
+    if (writebuffer_is_nonempty(fp)) {
+        int rc = flush_buffer(fp);
+        if (rc < 0) return rc;
+    }
+    else if (whence == SEEK_CUR) {
+        // The backend's physical position is at end (it has read ahead),
+        // whereas the caller's position is at begin; compensate so that a
+        // relative offset is applied from where the caller thinks it is.
+        offset -= fp->end - fp->begin;
+    }
+
+    newpos = fp->backend->seek(fp, offset, whence);
+    if (newpos < 0) {
+        fp->has_errno = errno;
+        return newpos;
+    }
+
+    // The seek worked, so whatever was buffered for reading is now stale
+    fp->at_eof = 0;
+    fp->begin = fp->end = fp->buffer;
+    fp->offset = newpos;
+    return newpos;
+}
+
+/* Flush pending output, close the backend and free the stream.  Returns 0,
+   or EOF with errno set if the stream had a recorded error or if the flush
+   or the close failed (the most recent failure wins).  */
+int hclose(hFILE *fp)
+{
+    int status = fp->has_errno;
+
+    if (writebuffer_is_nonempty(fp)) {
+        if (hflush(fp) < 0) status = fp->has_errno;
+    }
+    if (fp->backend->close(fp) < 0) status = errno;
+    hfile_destroy(fp);
+
+    if (status == 0) return 0;
+
+    errno = status;
+    return EOF;
+}
+
+/* Close the backend and free the stream without flushing buffered output.
+   A failure of the close itself is ignored, and errno is left untouched.  */
+void hclose_abruptly(hFILE *fp)
+{
+    const int saved_errno = errno;
+
+    if (fp->backend->close(fp) < 0) {
+        /* Ignore subsequent errors */
+    }
+    hfile_destroy(fp);
+
+    errno = saved_errno;
+}
+
+
+/***************************
+ * File descriptor backend *
+ ***************************/
+
+// #include <sys/socket.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+// #ifdef _WIN32
+// #define HAVE_CLOSESOCKET
+// #endif
+
+/* For Unix, it doesn't matter whether a file descriptor is a socket.
+ However Windows insists on send()/recv() and its own closesocket()
+ being used when fd happens to be a socket. */
+
+// Stream backed by a file descriptor.  base must remain the first member:
+// the fd_* callbacks cast the hFILE pointer they receive to hFILE_fd.
+typedef struct {
+ hFILE base;
+ int fd; // descriptor handed to read()/write()/lseek()/close()
+ // int is_socket:1;
+} hFILE_fd;
+
+/* Backend read: one read(2) on the descriptor, retried for as long as it is
+   interrupted by a signal (EINTR).  The recv() path used for sockets is
+   disabled here, matching the commented-out is_socket field.  */
+static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_fd *self = (hFILE_fd *) fpv;
+
+    for (;;) {
+        ssize_t got = read(self->fd, buffer, nbytes);
+        if (got >= 0 || errno != EINTR) return got;
+    }
+}
+
+/* Backend write: one write(2) on the descriptor, retried for as long as it
+   is interrupted by a signal (EINTR).  May write fewer than nbytes.  The
+   send() path used for sockets is disabled here, matching the
+   commented-out is_socket field.  */
+static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes)
+{
+    hFILE_fd *self = (hFILE_fd *) fpv;
+
+    for (;;) {
+        ssize_t put = write(self->fd, buffer, nbytes);
+        if (put >= 0 || errno != EINTR) return put;
+    }
+}
+
+/* Backend seek: forwards straight to lseek(2) on the descriptor.  */
+static off_t fd_seek(hFILE *fpv, off_t offset, int whence)
+{
+    const int fd = ((hFILE_fd *) fpv)->fd;
+    return lseek(fd, offset, whence);
+}
+
+/* Backend flush: ask the OS to commit the descriptor's data to storage.
+   Returns 0 on success, or -1 with errno set on failure. */
+static int fd_flush(hFILE *fpv)
+{
+ hFILE_fd *fp = (hFILE_fd *) fpv;
+#ifdef _WIN32
+ // Windows: flush through the OS handle behind the descriptor and map
+ // the failure to an errno value.
+ // See the patch at
+ // https://lists.gnu.org/archive/html/bug-gnulib/2008-10/msg00004.html .
+ HANDLE hh = (HANDLE)_get_osfhandle(fp->fd);
+ DWORD err;
+ if (hh == INVALID_HANDLE_VALUE) {
+ errno = EBADF;
+ return -1;
+ }
+ if (!FlushFileBuffers(hh)) {
+ err = GetLastError();
+ switch (err) {
+ case ERROR_INVALID_HANDLE:
+ errno = EINVAL;
+ break;
+ default:
+ errno = EIO;
+ }
+ return -1;
+ }
+ return 0;
+#else
+ // Elsewhere: fsync(), retried while it is interrupted by a signal.
+ int ret;
+ do {
+ ret = fsync(fp->fd);
+ // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe,
+ // and operation-not-supported errors (Mac OS X)
+ if (ret < 0 && (errno == EINVAL || errno == ENOTSUP)) ret = 0;
+ } while (ret < 0 && errno == EINTR);
+ return ret;
+#endif
+}
+
+/* Backend close: close the descriptor, retrying while the call reports
+   EINTR.  Returns the result of the final close attempt.
+   NOTE(review): POSIX leaves the descriptor's state unspecified after
+   close() fails with EINTR, and on some systems it is already released;
+   retrying could then close a descriptor reused elsewhere -- confirm this
+   loop is wanted on the target platforms.
+   NOTE(review): the HAVE_CLOSESOCKET branch reads fp->is_socket, which is
+   commented out of hFILE_fd, so it would not compile if that macro were
+   defined. */
+static int fd_close(hFILE *fpv)
+{
+ hFILE_fd *fp = (hFILE_fd *) fpv;
+ int ret;
+ do {
+#ifdef HAVE_CLOSESOCKET
+ ret = fp->is_socket? closesocket(fp->fd) : close(fp->fd);
+#else
+ ret = close(fp->fd);
+#endif
+ } while (ret < 0 && errno == EINTR);
+ return ret;
+}
+
+// Method table for descriptor-backed streams; the initializers are
+// positional, so they must stay in the order the fields of
+// struct hFILE_backend are declared.
+static const struct hFILE_backend fd_backend =
+{
+ fd_read, fd_write, fd_seek, fd_flush, fd_close
+};
+
+/* Suggest an I/O buffer size for fd: the st_blksize reported by fstat(), or
+   a fixed 512 on Windows.  Returns 0 if fstat() fails; hfile_init() treats a
+   capacity of 0 as a request for its default size. */
+static size_t blksize(int fd)
+{
+ struct stat sbuf;
+ if (fstat(fd, &sbuf) != 0) return 0;
+#ifdef _WIN32
+ return 512;
+#else
+ return sbuf.st_blksize;
+#endif
+}
+
+/* Open filename with the flags derived from mode (creation permissions
+   0666) and wrap the descriptor in a new stream.  Returns NULL with errno
+   describing the failure; the descriptor is closed again if the stream
+   object cannot be allocated.  */
+static hFILE *hopen_fd(const char *filename, const char *mode)
+{
+    hFILE_fd *fdstream;
+    int fd = open(filename, hfile_oflags(mode), 0666);
+    if (fd < 0) return NULL;
+
+    fdstream = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+    if (fdstream == NULL) {
+        // Keep the allocation failure's errno across the close
+        int saved_errno = errno;
+        (void) close(fd);
+        errno = saved_errno;
+        return NULL;
+    }
+
+    fdstream->fd = fd;
+    // fdstream->is_socket = 0;
+    fdstream->base.backend = &fd_backend;
+    return &fdstream->base;
+}
+
+/* Wrap an already-open descriptor in a new stream.  Returns NULL if the
+   stream object cannot be allocated; fd itself is left untouched then.
+   The upstream check for 's' (socket) in mode is disabled here.  */
+hFILE *hdopen(int fd, const char *mode)
+{
+    hFILE_fd *fdstream =
+        (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+
+    if (fdstream != NULL) {
+        fdstream->fd = fd;
+        // fdstream->is_socket = (strchr(mode, 's') != NULL);
+        fdstream->base.backend = &fd_backend;
+        return &fdstream->base;
+    }
+    return NULL;
+}
+
+/* Open a stream on standard input when mode contains 'r', otherwise on
+   standard output.  */
+static hFILE *hopen_fd_stdinout(const char *mode)
+{
+    // TODO Set binary mode (for Windows)
+    if (strchr(mode, 'r') != NULL) return hdopen(STDIN_FILENO, mode);
+    return hdopen(STDOUT_FILENO, mode);
+}
+
+/* Translate an fopen-style mode string into open(2) flags.  Characters are
+   processed left to right: 'r', 'w', 'a' and '+' each replace the access
+   mode chosen so far, 'w' and 'a' additionally accumulate creation flags,
+   and any other character is ignored.  O_BINARY is added where the platform
+   defines it.  */
+int hfile_oflags(const char *mode)
+{
+    int rw_mode = 0, extra = 0;
+    const char *p = mode;
+
+    while (*p != '\0') {
+        const char c = *p++;
+        if (c == 'r') rw_mode = O_RDONLY;
+        else if (c == 'w') { rw_mode = O_WRONLY; extra |= O_CREAT | O_TRUNC; }
+        else if (c == 'a') { rw_mode = O_WRONLY; extra |= O_CREAT | O_APPEND; }
+        else if (c == '+') rw_mode = O_RDWR;
+    }
+
+#ifdef O_BINARY
+    extra |= O_BINARY;
+#endif
+
+    return rw_mode | extra;
+}
+
+
+/*********************
+ * In-memory backend *
+ *********************/
+
+// Read-only stream over a caller-supplied memory region.  base must remain
+// the first member: the mem_* callbacks cast the hFILE pointer they receive
+// to hFILE_mem.
+typedef struct {
+ hFILE base;
+ const char *buffer; // start of the data
+ size_t length, pos; // total size of the data; current read position within it
+} hFILE_mem;
+
+/* Backend read for in-memory streams: copy out up to nbytes starting at the
+   current position and advance past them.  Returns the number of bytes
+   copied, which is 0 once the end of the data has been reached.  */
+static ssize_t mem_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_mem *mem = (hFILE_mem *) fpv;
+    const size_t left = mem->length - mem->pos;
+    const size_t ncopy = (nbytes > left)? left : nbytes;
+
+    memcpy(buffer, mem->buffer + mem->pos, ncopy);
+    mem->pos += ncopy;
+    return ncopy;
+}
+
+/* Backend seek for in-memory streams.  The target must lie within
+   [0, length]; anything else, or an unknown whence, fails with EINVAL and
+   leaves the position unchanged.  Returns the new position or -1.  */
+static off_t mem_seek(hFILE *fpv, off_t offset, int whence)
+{
+    hFILE_mem *mem = (hFILE_mem *) fpv;
+    const size_t magnitude = (offset < 0)? -offset : offset;
+    size_t base;
+    int out_of_range;
+
+    if (whence == SEEK_SET) base = 0;
+    else if (whence == SEEK_CUR) base = mem->pos;
+    else if (whence == SEEK_END) base = mem->length;
+    else { errno = EINVAL; return -1; }
+
+    // Moving backwards must not pass the start; forwards must not pass the end
+    if (offset < 0) out_of_range = (magnitude > base);
+    else out_of_range = (magnitude > mem->length - base);
+
+    if (out_of_range) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    mem->pos = base + offset;
+    return mem->pos;
+}
+
+/* Backend close for in-memory streams: there is nothing to release, so this
+   always succeeds.  */
+static int mem_close(hFILE *fpv)
+{
+    (void) fpv;
+    return 0;
+}
+
+// Method table for in-memory streams (positional: read, write, seek, flush,
+// close, as in fd_backend).  The write and flush slots are NULL, so such a
+// stream must never be written to or passed to hflush(), which calls the
+// flush slot unconditionally.
+// NOTE(review): the only user visible here, hopen_mem(), is commented out
+// below, so this table appears to be unreferenced in this import -- confirm.
+static const struct hFILE_backend mem_backend =
+{
+ mem_read, NULL, mem_seek, NULL, mem_close
+};
+
+/*
+static hFILE *hopen_mem(const char *data, const char *mode)
+{
+ // TODO Implement write modes, which will require memory allocation
+ if (strchr(mode, 'r') == NULL) { errno = EINVAL; return NULL; }
+
+ hFILE_mem *fp = (hFILE_mem *) hfile_init(sizeof (hFILE_mem), mode, 0);
+ if (fp == NULL) return NULL;
+
+ fp->buffer = data;
+ fp->length = strlen(data);
+ fp->pos = 0;
+ fp->base.backend = &mem_backend;
+ return &fp->base;
+}
+*/
+
+
+/******************************
+ * hopen() backend dispatcher *
+ ******************************/
+
+/* Open fname as a stream: "-" selects standard input or output (depending
+   on mode), anything else is opened as a local file.  The remote (http/ftp),
+   irods and data: dispatch branches are commented out below, so such names
+   are treated as ordinary file names. */
+hFILE *hopen(const char *fname, const char *mode)
+{
+ // if (strncmp(fname, "http://", 7) == 0 ||
+ // strncmp(fname, "ftp://", 6) == 0) return hopen_net(fname, mode);
+#ifdef HAVE_IRODS
+ // else if (strncmp(fname, "irods:", 6) == 0) return hopen_irods(fname, mode);
+#endif
+ // else if (strncmp(fname, "data:", 5) == 0) return hopen_mem(fname + 5, mode);
+ if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode);
+ else return hopen_fd(fname, mode);
+}
+
+/*
+int hisremote(const char *fname)
+{
+ // FIXME Make a new backend entry to return this
+ if (strncmp(fname, "http://", 7) == 0 ||
+ strncmp(fname, "https://", 8) == 0 ||
+ strncmp(fname, "ftp://", 6) == 0) return 1;
+#ifdef HAVE_IRODS
+ else if (strncmp(fname, "irods:", 6) == 0) return 1;
+#endif
+ else return 0;
+}
+*/
diff --git a/hfile.h b/hfile.h
new file mode 100644
index 0000000..f8b59f3
--- /dev/null
+++ b/hfile.h
@@ -0,0 +1,212 @@
+/* hfile.h -- buffered low-level input/output streams.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_HFILE_H
+#define HTSLIB_HFILE_H
+
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "hts_defs.h"
+
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+/* These fields are declared here solely for the benefit of the inline functions
+ below. They may change in future releases. User code should not use them
+ directly; you should imagine that hFILE is an opaque incomplete type. */
+struct hFILE_backend;
+typedef struct hFILE {
+ // buffer: start of the I/O buffer; begin: first not-yet-read character
+ // (reading) or first unused position (writing); end: first unfilled
+ // position (reading; stays equal to buffer when writing); limit: first
+ // position past the buffer.
+ char *buffer, *begin, *end, *limit;
+ const struct hFILE_backend *backend; // methods to refill/flush the I/O buffer
+ off_t offset; // offset within the stream of buffer position 0
+ // For reading, whether EOF has been seen.
+ // NOTE(review): a signed 1-bit field may read back as -1 rather than 1;
+ // only test it for zero/non-zero.
+ int at_eof:1;
+ int has_errno; // error number from the last failure on this stream, 0 if none
+} hFILE;
+
+/*!
+ @abstract Open the named file or URL as a stream
+ @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+*/
+hFILE *hopen(const char *filename, const char *mode) HTS_RESULT_USED;
+
+/*!
+ @abstract Associate a stream with an existing open file descriptor
+ @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ @notes Upstream uses an 's' in mode to mark socket descriptors (on Windows);
+ in this import that check is commented out, so 's' has no effect.
+*/
+hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED;
+
+/*!
+ @abstract Report whether the file name or URL denotes remote storage
+ @return 0 if local, 1 if remote.
+ @notes "Remote" means involving e.g. explicit network access, with the
+ implication that callers may wish to cache such files' contents locally.
+*/
+// int hisremote(const char *filename) HTS_RESULT_USED;
+
+/*!
+ @abstract Flush (for output streams) and close the stream
+ @return 0 if successful, or EOF (with errno set) if an error occurred.
+*/
+int hclose(hFILE *fp) HTS_RESULT_USED;
+
+/*!
+ @abstract Close the stream, without flushing or propagating errors
+ @notes For use while cleaning up after an error only. Preserves errno.
+*/
+void hclose_abruptly(hFILE *fp);
+
+/*!
+ @abstract Return the stream's error indicator
+ @return Non-zero (in fact, an errno value) if an error has occurred.
+ @notes This would be called herror() and return true/false to parallel
+ ferror(3), but a networking-related herror(3) function already exists. */
+static inline int herrno(hFILE *fp)
+{
+ // has_errno holds the errno value saved when an operation on this stream
+ // last failed; it stays set until hclearerr() resets it.
+ return fp->has_errno;
+}
+
+/*!
+ @abstract Clear the stream's error indicator
+*/
+static inline void hclearerr(hFILE *fp)
+{
+ // Only the saved error number is reset; buffer contents and the EOF flag
+ // are left as they are.
+ fp->has_errno = 0;
+}
+
+/*!
+ @abstract Reposition the read/write stream offset
+ @return The resulting offset within the stream (as per lseek(2)),
+ or negative if an error occurred.
+*/
+off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+/*!
+ @abstract Report the current stream offset
+ @return The offset within the stream, starting from zero.
+*/
+static inline off_t htell(hFILE *fp)
+{
+    // offset is the stream position of buffer[0]; begin marks the caller's
+    // current position within the buffer.
+    const off_t within_buffer = fp->begin - fp->buffer;
+    return fp->offset + within_buffer;
+}
+
+/*!
+ @abstract Read one character from the stream
+ @return The character read, or EOF on end-of-file or error
+*/
+static inline int hgetc(hFILE *fp)
+{
+    extern int hgetc2(hFILE *);
+
+    // Fast path: serve the byte straight from the read buffer
+    if (fp->begin < fp->end) return (unsigned char) *(fp->begin++);
+
+    // Buffer exhausted: let hgetc2() refill it
+    return hgetc2(fp);
+}
+
+/*!
+ @abstract Peek at characters to be read without removing them from buffers
+ @param fp The file stream
+ @param buffer The buffer to which the peeked bytes will be written
+ @param nbytes The number of bytes to peek at; limited by the size of the
+ internal buffer, which could be as small as 4K.
+ @return The number of bytes peeked, which may be less than nbytes if EOF
+ is encountered; or negative, if there was an I/O error.
+ @notes The characters peeked at remain in the stream's internal buffer,
+ and will be returned by later hread() etc calls.
+*/
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+/*!
+ @abstract Read a block of characters from the file
+ @return The number of bytes read, or negative if an error occurred.
+ @notes The full nbytes requested will be returned, except as limited
+ by EOF or I/O errors.
+*/
+static inline ssize_t HTS_RESULT_USED
+hread(hFILE *fp, void *buffer, size_t nbytes)
+{
+    extern ssize_t hread2(hFILE *, void *, size_t, size_t);
+
+    // Hand over whatever is already buffered, up to the amount requested
+    const size_t buffered = fp->end - fp->begin;
+    const size_t ncopy = (buffered < nbytes)? buffered : nbytes;
+    memcpy(buffer, fp->begin, ncopy);
+    fp->begin += ncopy;
+
+    if (ncopy == nbytes) return (ssize_t) ncopy;
+
+    // The buffer ran dry: hread2() fetches the rest
+    return hread2(fp, buffer, nbytes, ncopy);
+}
+
+/*!
+ @abstract Write a character to the stream
+ @return The character written, or EOF if an error occurred.
+*/
+static inline int hputc(int c, hFILE *fp)
+{
+    extern int hputc2(int, hFILE *);
+
+    // Buffer full: hputc2() flushes it before storing the character
+    if (fp->begin >= fp->limit) return hputc2(c, fp);
+
+    *(fp->begin++) = c;
+    return c;
+}
+
+/*!
+ @abstract Write a string to the stream
+ @return 0 if successful, or EOF if an error occurred.
+*/
+static inline int hputs(const char *text, hFILE *fp)
+{
+    extern int hputs2(const char *, size_t, size_t, hFILE *);
+
+    const size_t nbytes = strlen(text);
+    const size_t room = fp->limit - fp->begin;
+    const size_t ncopy = (room < nbytes)? room : nbytes;
+
+    // Copy as much as fits into the free part of the buffer
+    memcpy(fp->begin, text, ncopy);
+    fp->begin += ncopy;
+
+    if (ncopy == nbytes) return 0;
+
+    // The buffer filled up: hputs2() flushes it and writes the rest
+    return hputs2(text, nbytes, ncopy, fp);
+}
+
+/*!
+ @abstract Write a block of characters to the file
+ @return Either nbytes, or negative if an error occurred.
+ @notes In the absence of I/O errors, the full nbytes will be written.
+*/
+static inline ssize_t HTS_RESULT_USED
+hwrite(hFILE *fp, const void *buffer, size_t nbytes)
+{
+    extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t);
+
+    const size_t room = fp->limit - fp->begin;
+    const size_t ncopy = (room < nbytes)? room : nbytes;
+
+    // Copy as much as fits into the free part of the buffer
+    memcpy(fp->begin, buffer, ncopy);
+    fp->begin += ncopy;
+
+    if (ncopy == nbytes) return (ssize_t) ncopy;
+
+    // The buffer filled up: hwrite2() flushes it and writes the rest
+    return hwrite2(fp, buffer, nbytes, ncopy);
+}
+
+/*!
+ @abstract For writing streams, flush buffered output to the underlying stream
+ @return 0 if successful, or EOF if an error occurred.
+*/
+int hflush(hFILE *fp) HTS_RESULT_USED;
+
+// #ifdef __cplusplus
+// }
+// #endif
+
+#endif
diff --git a/hfile_internal.h b/hfile_internal.h
new file mode 100644
index 0000000..0997705
--- /dev/null
+++ b/hfile_internal.h
@@ -0,0 +1,76 @@
+/* hfile_internal.h -- internal parts of low-level input/output streams.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HFILE_INTERNAL_H
+#define HFILE_INTERNAL_H
+
+#include "hfile.h"
+
+struct hFILE_backend { /* Vector of I/O callbacks that each hopen_*() implementation installs as base.backend */
+ /* As per read(2), returning the number of bytes read (possibly 0) or
+ negative (and setting errno) on errors. Front-end code will call this
+ repeatedly if necessary to attempt to get the desired byte count. */
+ ssize_t (*read)(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+ /* As per write(2), returning the number of bytes written or negative (and
+ setting errno) on errors. Front-end code will call this repeatedly if
+ necessary until the desired block is written or an error occurs. */
+ ssize_t (*write)(hFILE *fp, const void *buffer, size_t nbytes)
+ HTS_RESULT_USED;
+
+ /* As per lseek(2), returning the resulting offset within the stream or
+ negative (and setting errno) on errors. */
+ off_t (*seek)(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+ /* Performs low-level flushing, if any, e.g., fsync(2); for writing streams
+ only. Returns 0 for success or negative (and sets errno) on errors. */
+ int (*flush)(hFILE *fp) HTS_RESULT_USED;
+
+ /* Closes the underlying stream (for output streams, the buffer will
+ already have been flushed), returning 0 for success or negative (and
+ setting errno) on errors, as per close(2). */
+ int (*close)(hFILE *fp) HTS_RESULT_USED;
+};
+
+/* These are called from the hopen() dispatcher, and should call hfile_init()
+ to malloc a struct "derived" from hFILE and initialise it appropriately,
+ including setting base.backend to their own backend vector. */
+hFILE *hopen_irods(const char *filename, const char *mode);
+hFILE *hopen_net(const char *filename, const char *mode);
+
+/* May be called by hopen_*() functions to decode a fopen()-style mode into
+ open(2)-style flags. */
+int hfile_oflags(const char *mode);
+
+/* Must be called by hopen_*() functions to allocate the hFILE struct and set
+ up its base. Capacity is a suggested buffer size (e.g., via fstat(2))
+ or 0 for a default-sized buffer. */
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity);
+
+/* May be called by hopen_*() functions to undo the effects of hfile_init()
+ in the event opening the stream subsequently fails. (This is safe to use
+ even if fp is NULL. This takes care to preserve errno.) */
+void hfile_destroy(hFILE *fp);
+
+#endif
diff --git a/hts.h b/hts.h
new file mode 100644
index 0000000..084c162
--- /dev/null
+++ b/hts.h
@@ -0,0 +1,456 @@
+/* hts.h -- format-neutral I/O, indexing, and iterator API functions.
+
+ Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012 Broad Institute.
+
+ Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_HTS_H
+#define HTSLIB_HTS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+struct cram_fd;
+struct hFILE;
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/**
+ * hts_expand() - grows the memory block pointed to by $ptr when n > m;
+ * hts_expand0() does the same and also zeroes the newly allocated part.
+ * Caveats: both expand to a bare `if` block, evaluate their arguments more than once, and do not check realloc(); hts_expand0() keeps the old m in an int.
+ * @param n requested number of elements of type type_t
+ * @param m allocated capacity in elements; updated to n rounded up by kroundup32()
+ */
+#define hts_expand(type_t, n, m, ptr) if ((n) > (m)) { \
+ (m) = (n); kroundup32(m); \
+ (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+ }
+#define hts_expand0(type_t, n, m, ptr) if ((n) > (m)) { \
+ int t = (m); (m) = (n); kroundup32(m); \
+ (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+ memset(((type_t*)ptr)+t,0,sizeof(type_t)*((m)-t)); \
+ }
+
+/************
+ * File I/O *
+ ************/
+
+// Add new entries only at the end (but before the *_maximum entry)
+// of these enums, as their numbering is part of the htslib ABI.
+
+enum htsFormatCategory {
+ unknown_category,
+ sequence_data, // Sequence data -- SAM, BAM, CRAM, etc
+ variant_data, // Variant calling data -- VCF, BCF, etc
+ index_file, // Index file associated with some data file
+ region_list, // Coordinate intervals or regions -- BED, etc
+ category_maximum = 32767
+};
+
+enum htsExactFormat {
+ unknown_format,
+ binary_format, text_format,
+ sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed,
+ format_maximum = 32767
+};
+
+enum htsCompression {
+ no_compression, gzip, bgzf, custom,
+ compression_maximum = 32767
+};
+
+typedef struct htsFormat {
+ enum htsFormatCategory category;
+ enum htsExactFormat format;
+ struct { short major, minor; } version;
+ enum htsCompression compression;
+ short compression_level; // currently unused
+ void *specific; // currently unused
+} htsFormat;
+
+// Maintainers note htsFile cannot be an opaque structure because some of its
+// fields are part of libhts.so's ABI (hence these fields must not be moved):
+// - fp is used in the public sam_itr_next()/etc macros
+// - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
+// - is_write and is_cram are used directly in samtools <= 1.1
+// - fp is used directly in samtools (up to and including current develop)
+// - line is used directly in bcftools (up to and including current develop)
+typedef struct {
+ uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, dummy:28;
+ int64_t lineno;
+ kstring_t line;
+ char *fn, *fn_aux;
+ union {
+ BGZF *bgzf;
+ struct cram_fd *cram;
+ struct hFILE *hfile;
+ void *voidp;
+ } fp;
+ htsFormat format;
+} htsFile;
+
+// REQUIRED_FIELDS
+enum sam_fields {
+ SAM_QNAME = 0x00000001,
+ SAM_FLAG = 0x00000002,
+ SAM_RNAME = 0x00000004,
+ SAM_POS = 0x00000008,
+ SAM_MAPQ = 0x00000010,
+ SAM_CIGAR = 0x00000020,
+ SAM_RNEXT = 0x00000040,
+ SAM_PNEXT = 0x00000080,
+ SAM_TLEN = 0x00000100,
+ SAM_SEQ = 0x00000200,
+ SAM_QUAL = 0x00000400,
+ SAM_AUX = 0x00000800,
+ SAM_RGAUX = 0x00001000,
+};
+
+enum cram_option {
+ CRAM_OPT_DECODE_MD,
+ CRAM_OPT_PREFIX,
+ CRAM_OPT_VERBOSITY,
+ CRAM_OPT_SEQS_PER_SLICE,
+ CRAM_OPT_SLICES_PER_CONTAINER,
+ CRAM_OPT_RANGE,
+ CRAM_OPT_VERSION,
+ CRAM_OPT_EMBED_REF,
+ CRAM_OPT_IGNORE_MD5,
+ CRAM_OPT_REFERENCE,
+ CRAM_OPT_MULTI_SEQ_PER_SLICE,
+ CRAM_OPT_NO_REF,
+ CRAM_OPT_USE_BZIP2,
+ CRAM_OPT_SHARED_REF,
+ CRAM_OPT_NTHREADS,
+ CRAM_OPT_THREAD_POOL,
+ CRAM_OPT_USE_LZMA,
+ CRAM_OPT_USE_RANS,
+ CRAM_OPT_REQUIRED_FIELDS,
+};
+
+/**********************
+ * Exported functions *
+ **********************/
+
+extern int hts_verbose;
+
+/*! @abstract Table for converting a nucleotide character to 4-bit encoding.
+The input character may be either an IUPAC ambiguity code, '=' for 0, or
+'0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
+for A/C/G/T or combinations of these bits for ambiguous bases.
+*/
+extern const unsigned char seq_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ambiguity code letter (or '=' when given 0).
+*/
+extern const char seq_nt16_str[];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+*/
+extern const int seq_nt16_int[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ @abstract Get the htslib version number
+ @return For released versions, a string like "N.N[.N]"; or git describe
+ output if using a library built within a Git repository.
+*/
+const char *hts_version(void);
+
+/*!
+ @abstract Determine format by peeking at the start of a file
+ @param fp File opened for reading, positioned at the beginning
+ @param fmt Format structure that will be filled out on return
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_detect_format(struct hFILE *fp, htsFormat *fmt);
+
+/*!
+ @abstract Get a human-readable description of the file format
+ @return Description string, to be freed by the caller after use.
+*/
+char *hts_format_description(const htsFormat *format);
+
+/*!
+ @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ @param fn The file name or "-" for stdin/stdout
+ @param mode Mode matching /[rwa][bcguz0-9]*/
+ @discussion
+ With 'r' opens for reading; any further format mode letters are ignored
+ as the format is detected by checking the first few bytes or BGZF blocks
+ of the file. With 'w' or 'a' opens for writing or appending, with format
+ specifier letters:
+ b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
+ c CRAM format
+ g gzip compressed
+ u uncompressed
+ z bgzf compressed
+ [0-9] zlib compression level
+ Note that there is a distinction between 'u' and '0': the former yields
+ plain uncompressed output whereas the latter outputs uncompressed data
+ wrapped in the zlib format.
+ @example
+ [rw]b .. compressed BCF, BAM, FAI
+ [rw]u .. uncompressed BCF
+ [rw]z .. compressed VCF
+ [rw] .. uncompressed VCF
+*/
+htsFile *hts_open(const char *fn, const char *mode);
+
+/*!
+ @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+ @param fp The already-open file handle
+ @param mode Open mode, as per hts_open()
+*/
+htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode);
+
+/*!
+ @abstract Close a file handle, flushing buffered data for output streams
+ @param fp The file handle to be closed
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_close(htsFile *fp);
+
+/*!
+ @abstract Returns the file's format information
+ @param fp The file handle
+ @return Read-only pointer to the file's htsFormat.
+*/
+const htsFormat *hts_get_format(htsFile *fp);
+
+/*!
+ @abstract Sets a specified CRAM option on the open file handle.
+ @param fp The file handle of the open file.
+ @param opt The CRAM_OPT_* option.
+ @param ... Optional arguments, dependent on the option used.
+ @return 0 for success, or negative if an error occurred.
+*/
+int hts_set_opt(htsFile *fp, enum cram_option opt, ...);
+
+int hts_getline(htsFile *fp, int delimiter, kstring_t *str);
+char **hts_readlines(const char *fn, int *_n);
+/*!
+ @abstract Parse comma-separated list or read list from a file
+ @param fn File name or comma-separated list
+ @param is_file Presumably nonzero when fn names a file to read and zero when fn is itself the list (TODO confirm against the implementation)
+ @param _n Size of the output array (number of items read)
+ @return NULL on failure or pointer to newly allocated array of
+ strings
+*/
+char **hts_readlist(const char *fn, int is_file, int *_n);
+
+/*!
+ @abstract Create extra threads to aid compress/decompression for this file
+ @param fp The file handle
+ @param n The number of worker threads to create
+ @return 0 for success, or negative if an error occurred.
+ @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
+*/
+int hts_set_threads(htsFile *fp, int n);
+
+/*!
+ @abstract Set .fai filename for a file opened for reading
+ @return 0 for success, negative on failure
+ @discussion
+ Called before *_hdr_read(), this provides the name of a .fai file
+ used to provide a reference list if the htsFile contains no @SQ headers.
+*/
+int hts_set_fai_filename(htsFile *fp, const char *fn_aux);
+
+#ifdef __cplusplus
+}
+#endif
+
+/************
+ * Indexing *
+ ************/
+
+/*!
+These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc,
+producing iterators operating as follows:
+ - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
+ - HTS_IDX_START iterates over the entire file
+ - HTS_IDX_REST iterates from the current position to the end of the file
+ - HTS_IDX_NONE always returns "no more alignment records"
+When one of these special tid values is used, beg and end are ignored.
+When REST or NONE is used, idx is also ignored and may be NULL.
+*/
+#define HTS_IDX_NOCOOR (-2)
+#define HTS_IDX_START (-3)
+#define HTS_IDX_REST (-4)
+#define HTS_IDX_NONE (-5)
+
+#define HTS_FMT_CSI 0
+#define HTS_FMT_BAI 1
+#define HTS_FMT_TBI 2
+#define HTS_FMT_CRAI 3
+
+struct __hts_idx_t;
+typedef struct __hts_idx_t hts_idx_t;
+
+typedef struct {
+ uint64_t u, v;
+} hts_pair64_t;
+
+typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end);
+
+typedef struct {
+ uint32_t read_rest:1, finished:1, dummy:29;
+ int tid, beg, end, n_off, i;
+ int curr_tid, curr_beg, curr_end;
+ uint64_t curr_off;
+ hts_pair64_t *off;
+ hts_readrec_func *readrec;
+ struct {
+ int n, m;
+ int *a;
+ } bins;
+} hts_itr_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7)
+ #define hts_bin_parent(l) (((l) - 1) >> 3)
+
+ hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls);
+ void hts_idx_destroy(hts_idx_t *idx);
+ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped);
+ void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset);
+
+ void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt);
+ hts_idx_t *hts_idx_load(const char *fn, int fmt);
+
+ uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta);
+ void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy);
+
+ int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped);
+ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx);
+
+ const char *hts_parse_reg(const char *s, int *beg, int *end);
+ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+ void hts_itr_destroy(hts_itr_t *iter);
+
+ typedef int (*hts_name2id_f)(void*, const char*);
+ typedef const char *(*hts_id2name_f)(void*, int);
+ typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+
+ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec);
+ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data);
+ const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values
+
+ /**
+ * hts_file_type() - Convenience function to determine file type
+ * DEPRECATED: This function has been replaced by hts_detect_format().
+ * It and these FT_* macros will be removed in a future HTSlib release.
+ */
+ #define FT_UNKN 0
+ #define FT_GZ 1
+ #define FT_VCF 2
+ #define FT_VCF_GZ (FT_GZ|FT_VCF)
+ #define FT_BCF (1<<2)
+ #define FT_BCF_GZ (FT_GZ|FT_BCF)
+ #define FT_STDIN (1<<3)
+ int hts_file_type(const char *fname);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) // bin number for the interval beg..end
+{
+ int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; // t = (8^n_lvls - 1) / 7, same formula as hts_bin_first(n_lvls)
+ for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) // end is decremented first (so it is evidently exclusive on entry); each pass shifts 3 more bits and lowers t by 8^l
+ if (beg>>s == end>>s) return t + (beg>>s); // beg and end map to the same index at this shift
+ return 0; // no level from n_lvls down to 1 matched: return bin 0
+}
+
+static inline int hts_bin_bot(int bin, int n_lvls)
+{
+ int l, b;
+ for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); // compute the level of bin
+ return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; // bin's index within level l, shifted left 3 bits per level remaining below l
+}
+
+/**************
+ * Endianness *
+ **************/
+
+static inline int ed_is_big(void)
+{
+ const long probe = 1; /* the first byte in memory is 0 only when the low-order byte is not stored first */
+ return *(const char *)&probe == 0;
+}
+static inline uint16_t ed_swap_2(uint16_t v)
+{
+ return (uint16_t)(((v & 0x00FFU) << 8) | (v >> 8)); /* exchange the low and high bytes */
+}
+static inline void *ed_swap_2p(void *x)
+{
+ uint16_t *const slot = (uint16_t*)x; /* x is accessed in place as a uint16_t */
+ return (*slot = ed_swap_2(*slot), x);
+}
+static inline uint32_t ed_swap_4(uint32_t v)
+{
+ const uint32_t halves = (v << 16) | (v >> 16); /* exchange the two 16-bit halves */
+ return ((halves << 8) & 0xFF00FF00U) | ((halves >> 8) & 0x00FF00FFU); /* then exchange the bytes inside each half */
+}
+static inline void *ed_swap_4p(void *x)
+{
+ uint32_t *const slot = (uint32_t*)x; /* x is accessed in place as a uint32_t */
+ return (*slot = ed_swap_4(*slot), x);
+}
+static inline uint64_t ed_swap_8(uint64_t v)
+{
+ v = (v << 32) | (v >> 32); /* exchange the 32-bit halves */
+ v = ((v << 16) & 0xFFFF0000FFFF0000LLU) | ((v >> 16) & 0x0000FFFF0000FFFFLLU); /* exchange 16-bit pairs */
+ return ((v << 8) & 0xFF00FF00FF00FF00LLU) | ((v >> 8) & 0x00FF00FF00FF00FFLLU); /* exchange adjacent bytes */
+}
+static inline void *ed_swap_8p(void *x)
+{
+ uint64_t *const slot = (uint64_t*)x; /* x is accessed in place as a uint64_t */
+ return (*slot = ed_swap_8(*slot), x);
+}
+
+#endif
diff --git a/hts_defs.h b/hts_defs.h
new file mode 100644
index 0000000..f0cab80
--- /dev/null
+++ b/hts_defs.h
@@ -0,0 +1,55 @@
+/* hts_defs.h -- Miscellaneous definitions.
+
+ Copyright (C) 2013-2015 Genome Research Ltd.
+
+ Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef HTSLIB_HTS_DEFS_H
+#define HTSLIB_HTS_DEFS_H
+
+#if (defined __clang__ && __clang_major__ >= 2) || (defined __GNUC__ && __GNUC__ >= 3)
+#define HTS_NORETURN __attribute__ ((__noreturn__)) /* marks a function that never returns to its caller */
+#else
+#define HTS_NORETURN
+#endif
+
+#if (defined __clang__ && __clang_major__ >= 3) || \
+ (defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__==4 && __GNUC_MINOR__ >= 5)))
+#define HTS_RESULT_USED __attribute__ ((__warn_unused_result__))
+#else
+#define HTS_RESULT_USED
+#endif
+
+#if defined __clang__ || \
+ (defined __GNUC__ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95)))
+#define HTS_UNUSED __attribute__ ((__unused__))
+#else
+#define HTS_UNUSED
+#endif
+
+#if (defined __clang__ && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 1))) || \
+ (defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)))
+#define HTS_DEPRECATED(x) __attribute__ ((__deprecated__(x)))
+#else
+#define HTS_DEPRECATED(x)
+#endif
+
+#endif
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..5e55088
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,619 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2013-05-02 (0.2.8):
+
+ * Use quadratic probing. When the capacity is power of 2, stepping function
+ i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+ hashing on cache performance and is more robust than linear probing.
+
+ In theory, double hashing should be more robust than quadratic probing.
+ However, my implementation is probably not for large hash tables, because
+ the second hash function is closely tied to the first hash function,
+ which reduce the effectiveness of double hashing.
+
+ Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+ 2011-12-29 (0.2.7):
+
+ * Minor code clean up; no actual effect.
+
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+ - http://code.google.com/p/ulib/
+ - http://nothings.org/computer/judy/
+
+ * Allow to optionally use linear probing which usually has better
+ performance for random input. Double hashing is still the default as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow to declare global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) /* each bucket i has a 2-bit field, 16 buckets per flag word; the upper bit (value 2) means "empty" */
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) /* the lower bit (value 1) means "deleted" */
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) /* nonzero if bucket i is empty or deleted */
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) /* clear the "deleted" bit of bucket i */
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) /* clear the "empty" bit of bucket i */
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) /* clear both bits of bucket i */
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) /* set the "deleted" bit of bucket i */
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) /* number of flag words for m buckets: m/16, but at least 1 */
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+ typedef struct kh_##name##_s { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
+ extern kh_##name##_t *kh_init_##name(void); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ kfree((void *)h->keys); kfree(h->flags); \
+ kfree((void *)h->vals); \
+ kfree(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t k, i, last, mask, step = 0; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + (++step)) & mask; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (!new_flags) return -1; \
+ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (!new_keys) { kfree(new_flags); return -1; } \
+ h->keys = new_keys; \
+ if (kh_is_map) { \
+ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ if (!new_vals) { kfree(new_flags); return -1; } \
+ h->vals = new_vals; \
+ } \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khint_t k, i, step = 0; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ kfree(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ return 0; \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) { \
+ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+ { \
+ khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+ else { \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ i = (i + (++step)) & mask; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+  /* Seed the hash with the first character, then fold in every following
+     character as (hash << 5) - hash + c.  An empty string hashes to 0. */
+  khint_t hash = (khint_t)*s;
+  if (hash == 0) return 0;
+  while (*++s) hash = (hash << 5) - hash + (khint_t)*s;
+  return hash;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+  /* Six add/xor/shift mixing rounds applied to the key in place. */
+  key = key + ~(key << 15);
+  key = key ^ (key >> 10);
+  key = key + (key << 3);
+  key = key ^ (key >> 6);
+  key = key + ~(key << 11);
+  key = key ^ (key >> 16);
+  return key;
+}
+/*! @function
+ @abstract Integer hash function that mixes the key with __ac_Wang_hash()
+ @param key The integer key; it is cast to khint_t before mixing
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key))
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: -1 if the operation failed;
+ 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Iterate over the entries in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param kvar Variable to which key will be assigned
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (kvar) = kh_key(h,__i); \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/*! @function
+ @abstract Iterate over the values in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash set containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash set containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/pigz.c b/pigz.c
index 6605925..2a4b7ab 100644
--- a/pigz.c
+++ b/pigz.c
@@ -291,12 +291,6 @@
input buffers to about the same number.
*/
-#include <stdint.h>
-#include <inttypes.h>
-#define BLOCKSIZE 131072LU
-// extra allocated input buffer space, to simplify callback function logic
-#define SUPERSIZE 131072LU
-
#ifdef _WIN32
// stopgap non-parallel code for Windows
@@ -305,44 +299,162 @@
#include <windows.h>
#include "zlib-1.2.8/zlib.h"
+#include "pigz.h"
+
void pigz_init(uint32_t setprocs) {
return;
}
-void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
- unsigned char buf[BLOCKSIZE + SUPERSIZE];
+void parallel_compress(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
+ // minor issue: this currently writes \n instead of \r\n linebreaks.
uint32_t overflow_ct = 0;
gzFile gz_outfile = gzopen(out_fname, do_append? "ab": "wb");
+ unsigned char* write_ptr;
uint32_t last_size;
if (!gz_outfile) {
printf("\nError: Failed to open %s.\n", out_fname);
exit(2);
}
do {
- last_size = emitn(overflow_ct, buf);
- if (last_size > BLOCKSIZE) {
- overflow_ct = last_size - BLOCKSIZE;
- last_size = BLOCKSIZE;
+ last_size = emitn(overflow_ct, overflow_buf);
+ if (last_size > PIGZ_BLOCK_SIZE) {
+ overflow_ct = last_size - PIGZ_BLOCK_SIZE;
+ last_size = PIGZ_BLOCK_SIZE;
} else {
overflow_ct = 0;
}
if (last_size) {
- if (!gzwrite(gz_outfile, buf, last_size)) {
- printf("\nError: File write failure.\n");
+ if (!gzwrite(gz_outfile, overflow_buf, last_size)) {
+ fputs("\nError: File write failure.\n", stdout);
gzclose(gz_outfile);
exit(6);
}
}
if (overflow_ct) {
- memcpy(buf, &(buf[BLOCKSIZE]), overflow_ct);
+ write_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
+ while (overflow_ct > PIGZ_BLOCK_SIZE) {
+ if (!gzwrite(gz_outfile, write_ptr, PIGZ_BLOCK_SIZE)) {
+ fputs("\nError: File write failure.\n", stdout);
+ gzclose(gz_outfile);
+ exit(6);
+ }
+ write_ptr = &(write_ptr[PIGZ_BLOCK_SIZE]);
+ overflow_ct -= PIGZ_BLOCK_SIZE;
+ }
+ memcpy(overflow_buf, write_ptr, overflow_ct);
}
} while (last_size);
if (gzclose(gz_outfile) != Z_OK) {
- printf("\nError: File write failure.\n");
+ fputs("\nError: File write failure.\n", stdout);
exit(6);
}
}
+int32_t pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+  // Opens out_fname for uncompressed binary output (appending when do_append
+  // is nonzero) and attaches overflow_buf to *ps_ptr.  Returns 0 on success,
+  // 2 if the file cannot be opened.
+  const char* open_mode = do_append? "ab" : "wb";
+  FILE* new_outfile = fopen(out_fname, open_mode);
+  ps_ptr->gz_outfile = NULL;
+  ps_ptr->outfile = new_outfile;
+  if (new_outfile == NULL) {
+    printf("\nError: Failed to open %s.\n", out_fname);
+    return 2; // RET_OPEN_FAIL
+  }
+  ps_ptr->overflow_buf = overflow_buf;
+  return 0;
+}
+
+void compressed_pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+  // Opens out_fname through gzopen() (appending when do_append is nonzero)
+  // and attaches overflow_buf to *ps_ptr.  Prints an error and exits with
+  // status 2 if the file cannot be opened.
+  gzFile new_gz_outfile = gzopen(out_fname, do_append? "ab" : "wb");
+  ps_ptr->outfile = NULL;
+  ps_ptr->gz_outfile = new_gz_outfile;
+  if (!new_gz_outfile) {
+    printf("\nError: Failed to open %s.\n", out_fname);
+    exit(2);
+  }
+  ps_ptr->overflow_buf = overflow_buf;
+}
+
+int32_t flex_pzwrite_init(uint32_t output_gz, char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+  // Chooses the compressed initializer when output_gz is nonzero, otherwise
+  // the uncompressed one.  Only the uncompressed path can return nonzero.
+  if (output_gz) {
+    compressed_pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+    return 0;
+  }
+  return pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+}
+
+int32_t force_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) {
+  // Writes the bytes between overflow_buf and *writep_ptr to outfile with a
+  // single fwrite() and rewinds *writep_ptr to the buffer start.  Does
+  // nothing when the buffer is empty.  write_min is not used here.
+  unsigned char* buf_start = ps_ptr->overflow_buf;
+  unsigned char* buf_end = (unsigned char*)(*writep_ptr);
+  if (buf_start == buf_end) {
+    return 0;
+  }
+  if (!fwrite(buf_start, buf_end - buf_start, 1, ps_ptr->outfile)) {
+    return 6; // RET_WRITE_FAIL
+  }
+  *writep_ptr = (char*)buf_start;
+  return 0;
+}
+
+void force_compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) {
+  // Passes the bytes between overflow_buf and *writep_ptr to gzwrite() and
+  // rewinds *writep_ptr to the buffer start.  Does nothing when the buffer
+  // is empty; on a write failure it closes the stream and exits with
+  // status 6.  write_min is not used here.
+  unsigned char* buf_start = ps_ptr->overflow_buf;
+  unsigned char* buf_end = (unsigned char*)(*writep_ptr);
+  if (buf_start == buf_end) {
+    return;
+  }
+  if (!gzwrite(ps_ptr->gz_outfile, buf_start, buf_end - buf_start)) {
+    fputs("\nError: File write failure.\n", stdout);
+    gzclose(ps_ptr->gz_outfile);
+    exit(6);
+  }
+  *writep_ptr = (char*)buf_start;
+}
+
+int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_t sslen) {
+  // Appends the sslen bytes at ss to the output.  Each time the
+  // 2 * PIGZ_BLOCK_SIZE staging buffer fills up, the whole buffer is written
+  // out directly and filling restarts at its beginning.  Returns 6 if an
+  // uncompressed write fails; a compressed write failure exits the program.
+  unsigned char* dst = (unsigned char*)(*writep_ptr);
+  unsigned char* src = (unsigned char*)ss;
+  uint32_t room = 2 * PIGZ_BLOCK_SIZE - ((uintptr_t)(dst - ps_ptr->overflow_buf));
+  for (; sslen > room; room = 2 * PIGZ_BLOCK_SIZE) {
+    memcpy(dst, src, room);
+    if (is_uncompressed_pzwrite(ps_ptr)) {
+      if (!fwrite(ps_ptr->overflow_buf, 2 * PIGZ_BLOCK_SIZE, 1, ps_ptr->outfile)) {
+        return 6;
+      }
+    } else if (!gzwrite(ps_ptr->gz_outfile, ps_ptr->overflow_buf, 2 * PIGZ_BLOCK_SIZE)) {
+      fputs("\nError: File write failure.\n", stdout);
+      gzclose(ps_ptr->gz_outfile);
+      exit(6);
+    }
+    dst = ps_ptr->overflow_buf;
+    src = &(src[room]);
+    sslen -= room;
+  }
+  memcpy(dst, src, sslen);
+  *writep_ptr = (char*)(&(dst[sslen]));
+  return flex_pzwrite(ps_ptr, writep_ptr);
+}
+
+int32_t pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+  // Flushes whatever is still buffered, closes outfile, and clears
+  // overflow_buf.  Returns nonzero if the stream's error indicator was set
+  // or fclose() failed.
+  int32_t stream_error;
+  int32_t close_error;
+  force_pzwrite(ps_ptr, &writep, 0);
+  stream_error = ferror(ps_ptr->outfile);
+  close_error = fclose(ps_ptr->outfile);
+  ps_ptr->overflow_buf = NULL;
+  return (stream_error != 0) || (close_error != 0);
+}
+
+void compressed_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+  // Flushes whatever is still buffered, clears overflow_buf, and closes the
+  // gz stream.  Exits with status 6 if gzclose() does not return Z_OK.
+  int close_status;
+  force_compressed_pzwrite(ps_ptr, &writep, 0);
+  ps_ptr->overflow_buf = NULL;
+  close_status = gzclose(ps_ptr->gz_outfile);
+  if (close_status == Z_OK) {
+    return;
+  }
+  fputs("\nError: File write failure.\n", stdout);
+  exit(6);
+}
+
+int32_t flex_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+  // Closes the stream through whichever path (compressed or uncompressed)
+  // *ps_ptr was opened with.  Only the uncompressed path can return nonzero.
+  if (!is_uncompressed_pzwrite(ps_ptr)) {
+    compressed_pzwrite_close_null(ps_ptr, writep);
+    return 0;
+  }
+  return pzwrite_close_null(ps_ptr, writep);
+}
#else
#define VERSION "pigz 2.3\n"
@@ -399,6 +511,9 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
release(), peek_lock(), free_lock(), yarn_name */
#endif
+#include "pigz.h"
+
+
/* for local functions and globals */
#define local static
@@ -667,37 +782,6 @@ local unsigned long crc32_comb(unsigned long crc1, unsigned long crc2,
#define BASE 65521U /* largest prime smaller than 65536 */
#define LOW16 0xffff /* mask lower 16 bits */
-
-/* -- pool of spaces for buffer management -- */
-
-/* These routines manage a pool of spaces. Each pool specifies a fixed size
- buffer to be contained in each space. Each space has a use count, which
- when decremented to zero returns the space to the pool. If a space is
- requested from the pool and the pool is empty, a space is immediately
- created unless a specified limit on the number of spaces has been reached.
- Only if the limit is reached will it wait for a space to be returned to the
- pool. Each space knows what pool it belongs to, so that it can be returned.
- */
-
-/* a space (one buffer for each space) */
-struct space {
- lock *use; /* use count -- return to pool when zero */
- unsigned char *buf; /* buffer of size size */
- size_t size; /* current size of this buffer */
- size_t len; /* for application usage (initially zero) */
- struct pool *pool; /* pool to return to */
- struct space *next; /* for pool linked list */
-};
-
-/* pool of spaces (one pool for each type needed) */
-struct pool {
- lock *have; /* unused spaces available, lock for list */
- struct space *head; /* linked list of available buffers */
- size_t size; /* size of new buffers in this pool */
- int limit; /* number of new spaces allowed, or -1 */
- int made; /* number of buffers made */
-};
-
/* initialize a pool (pool structure itself provided, not allocated) -- the
limit is the maximum number of spaces in the pool, or -1 to indicate no
limit, i.e., to never wait for a buffer to return to the pool */
@@ -892,7 +976,7 @@ local void setup_jobs(void)
/* initialize buffer pools (initial size for out_pool not critical, since
buffers will be grown in size if needed -- initial size chosen to make
this unlikely -- same for lens_pool) */
- new_pool(&in_pool, g.block + SUPERSIZE, INBUFS(g.procs));
+ new_pool(&in_pool, g.block, INBUFS(g.procs));
new_pool(&out_pool, OUTPOOL(g.block), -1);
new_pool(&dict_pool, DICT, -1);
new_pool(&lens_pool, g.block >> (RSYNCBITS - 1), -1);
@@ -1192,17 +1276,24 @@ local void write_thread(void *dummy)
value calculations and one other thread for writing the output -- compress
threads will be launched and left running (waiting actually) to support
subsequent calls of parallel_compress() */
-void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*))
+void parallel_compress(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*))
{
- unsigned char overflow_buf[SUPERSIZE];
+ // overflow_buf must have size >= PIGZ_BLOCK_SIZE + maximum emission
+
+ // if overflow_ct is nonzero, this points to the first uncompressed
+ // character in overflow_buf
+ unsigned char* read_ptr = NULL;
+
uint32_t overflow_ct;
long seq; /* sequence number */
struct space *curr; /* input data to compress */
struct space *next; /* input data that follows curr */
struct space *dict; /* dictionary for next compression */
struct job *job; /* job for compress, then write */
+
int more; /* true if more input to read */
size_t len; /* for various length computations */
+ uint32_t cur_len;
g.outf = out_fname;
g.outd = open(g.outf, O_WRONLY | (do_append? O_APPEND : (O_CREAT | O_TRUNC)), 0644);
@@ -1217,14 +1308,17 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
the output of the compress threads) */
seq = 0;
next = get_space(&in_pool);
- next->len = emitn(0, next->buf);
- if (next->len > BLOCKSIZE) {
- overflow_ct = next->len - BLOCKSIZE;
- memcpy(overflow_buf, &(next->buf[BLOCKSIZE]), overflow_ct);
- next->len = BLOCKSIZE;
+ cur_len = emitn(0, overflow_buf);
+ if (cur_len > PIGZ_BLOCK_SIZE) {
+ memcpy(next->buf, overflow_buf, PIGZ_BLOCK_SIZE);
+ next->len = PIGZ_BLOCK_SIZE;
+ read_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
} else {
- overflow_ct = 0;
+ memcpy(next->buf, overflow_buf, cur_len);
+ next->len = cur_len;
}
+ overflow_ct = cur_len - next->len;
+
dict = NULL;
do {
/* create a new job */
@@ -1238,14 +1332,27 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
/* get more input if we don't already have some */
next = get_space(&in_pool);
- memcpy(next->buf, overflow_buf, overflow_ct);
- next->len = emitn(overflow_ct, next->buf);
- if (next->len > BLOCKSIZE) {
- overflow_ct = next->len - BLOCKSIZE;
- memcpy(overflow_buf, &(next->buf[BLOCKSIZE]), overflow_ct);
- next->len = BLOCKSIZE;
+ if (overflow_ct >= PIGZ_BLOCK_SIZE) {
+ // no need to call emitn(), since we still have >= 128K of text
+ // from the previous call to compress
+ memcpy(next->buf, read_ptr, PIGZ_BLOCK_SIZE);
+ next->len = PIGZ_BLOCK_SIZE;
+ read_ptr = &(read_ptr[PIGZ_BLOCK_SIZE]);
+ overflow_ct -= PIGZ_BLOCK_SIZE;
} else {
- overflow_ct = 0;
+ if (overflow_ct) {
+ memcpy(overflow_buf, read_ptr, overflow_ct);
+ }
+ cur_len = emitn(overflow_ct, overflow_buf);
+ if (cur_len > PIGZ_BLOCK_SIZE) {
+ memcpy(next->buf, overflow_buf, PIGZ_BLOCK_SIZE);
+ next->len = PIGZ_BLOCK_SIZE;
+ read_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
+ } else {
+ memcpy(next->buf, overflow_buf, cur_len);
+ next->len = cur_len;
+ }
+ overflow_ct = cur_len - next->len;
}
/* if rsyncable, generate block lengths and prepare curr for job to
@@ -1302,6 +1409,203 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
finish_jobs();
}
+
+// about time to implement this without the awkward callback interface...
+int32_t pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+  // unbuffered, and doesn't need to support Windows
+  // Opens out_fname write-only (append mode when do_append is nonzero,
+  // otherwise create/truncate) and attaches overflow_buf to *ps_ptr.
+  // Returns 0 on success, 2 if open() fails.
+  const int open_flags = O_WRONLY | (do_append? O_APPEND : (O_CREAT | O_TRUNC));
+  const int fd = open(out_fname, open_flags, 0644);
+  ps_ptr->outd = fd;
+  if (fd == -1) {
+    printf("\nError: Failed to open %s.\n", out_fname);
+    return 2; // RET_OPEN_FAIL
+  }
+  ps_ptr->overflow_buf = overflow_buf;
+  return 0;
+}
+
+void compressed_pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) { // starts a compressed output stream using the file-global pigz state (g, writeth)
+ ps_ptr->outd = -1; // -1 is what is_uncompressed_pzwrite() treats as "compressed mode"
+ g.outf = out_fname;
+ g.outd = open(g.outf, O_WRONLY | (do_append? O_APPEND : (O_CREAT | O_TRUNC)), 0644); // NOTE(review): the result is not checked here -- confirm a failed open is reported downstream
+
+ /* if first time or after an option change, setup the job lists */
+ setup_jobs();
+
+ /* start write thread */
+ writeth = launch(write_thread, NULL);
+
+ ps_ptr->overflow_buf = overflow_buf;
+ ps_ptr->next = NULL; // force_compressed_pzwrite() uses NULL to detect its first call
+}
+
+int32_t flex_pzwrite_init(uint32_t output_gz, char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+  // Chooses the compressed initializer when output_gz is nonzero, otherwise
+  // the uncompressed one.  Only the uncompressed path can return nonzero.
+  if (output_gz) {
+    compressed_pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+    return 0;
+  }
+  return pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+}
+
+int32_t force_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) {
+  // Writes every byte buffered between ps_ptr->overflow_buf and *writep_ptr
+  // to the descriptor ps_ptr->outd, then rewinds *writep_ptr to the start of
+  // the buffer.  Returns 0 on success, or 6 if write() fails or makes no
+  // progress (in which case *writep_ptr is left untouched).  write_min is
+  // not used here.
+  unsigned char* writep = (unsigned char*)(*writep_ptr);
+  unsigned char* buf = ps_ptr->overflow_buf;
+  uint32_t len = (uintptr_t)(writep - buf);
+  ssize_t ret;
+  while (len) {
+    // write() may accept fewer bytes than requested, so continue from buf
+    // (the first unwritten byte) rather than from the start of overflow_buf.
+    ret = write(ps_ptr->outd, buf, len);
+    if (ret < 1) {
+      return 6; // RET_WRITE_FAIL
+    }
+    buf += ret;
+    len -= ret;
+  }
+  *writep_ptr = (char*)(ps_ptr->overflow_buf);
+  return 0;
+}
+
+void force_compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) { // hands buffered bytes to the compress job list in pieces of at most PIGZ_BLOCK_SIZE
+ // Caller must not request a length-0 write until it's time to close the
+ // file.
+ unsigned char* writep = (unsigned char*)(*writep_ptr);
+ unsigned char* readp = ps_ptr->overflow_buf; // next buffered byte not yet copied into an input space
+ uint32_t cur_len = (uintptr_t)(writep - readp); // number of bytes still pending in overflow_buf
+
+ struct space* curr; /* input data to compress */
+ struct job *job; /* job for compress, then write */
+
+ int more; /* true if more input to read */
+ size_t len; /* for various length computations */
+ if (!ps_ptr->next) { // first call since compressed_pzwrite_init(): stage the first input block
+ ps_ptr->seq = 0;
+ ps_ptr->next = get_space(&in_pool);
+ if (cur_len > PIGZ_BLOCK_SIZE) {
+ memcpy(ps_ptr->next->buf, readp, PIGZ_BLOCK_SIZE);
+ ps_ptr->next->len = PIGZ_BLOCK_SIZE;
+ readp = &(readp[PIGZ_BLOCK_SIZE]);
+ cur_len -= PIGZ_BLOCK_SIZE;
+ } else {
+ memcpy(ps_ptr->next->buf, readp, cur_len);
+ ps_ptr->next->len = cur_len;
+ readp = writep;
+ cur_len = 0;
+ }
+ ps_ptr->dict = NULL;
+ if ((cur_len <= PIGZ_BLOCK_SIZE) && write_min) {
+ // need more input to handle dict properly
+ if (cur_len) {
+ memcpy(ps_ptr->overflow_buf, readp, cur_len); // slide the unstaged remainder to the buffer front
+ }
+ *writep_ptr = (char*)(&(ps_ptr->overflow_buf[cur_len]));
+ return;
+ }
+ }
+
+ do {
+ // create a new job
+ job = (struct job*)malloc(sizeof(struct job));
+ if (job == NULL) {
+ bail("not enough memory", "");
+ }
+ job->calc = new_lock(0);
+ curr = ps_ptr->next; // the block staged earlier becomes this job's input
+ ps_ptr->next = get_space(&in_pool); // stage the following block (it may end up empty)
+ if (cur_len > PIGZ_BLOCK_SIZE) {
+ memcpy(ps_ptr->next->buf, readp, PIGZ_BLOCK_SIZE);
+ ps_ptr->next->len = PIGZ_BLOCK_SIZE;
+ readp = &(readp[PIGZ_BLOCK_SIZE]);
+ } else {
+ memcpy(ps_ptr->next->buf, readp, cur_len);
+ ps_ptr->next->len = cur_len;
+ readp = writep;
+ }
+ job->lens = NULL;
+ job->in = curr;
+ more = (cur_len != 0); // nonzero exactly when the block just staged in ps_ptr->next is non-empty
+ job->more = more;
+ job->out = ps_ptr->dict;
+ if (more) { // set up ps_ptr->dict, which the next job will receive through job->out
+ if (curr->len >= DICT || job->out == NULL) {
+ ps_ptr->dict = curr;
+ use_space(ps_ptr->dict);
+ } else {
+ ps_ptr->dict = get_space(&dict_pool);
+ len = DICT - curr->len;
+ memcpy(ps_ptr->dict->buf, job->out->buf + (job->out->len - len), len);
+ memcpy(ps_ptr->dict->buf + len, curr->buf, curr->len);
+ ps_ptr->dict->len = DICT;
+ }
+ }
+ job->seq = ps_ptr->seq;
+ if (++(ps_ptr->seq) < 1) { // presumably guards against the sequence counter wrapping -- confirm
+ bail("input too long: ", "");
+ }
+ if (cthreads < ps_ptr->seq && cthreads < g.procs) { // start another compress thread while fewer than g.procs exist
+ (void)launch(compress_thread, NULL);
+ cthreads++;
+ }
+ possess(compress_have); // append the job to the compress list; possess/twist look like the yarn lock calls -- confirm
+ job->next = NULL;
+ *compress_tail = job;
+ compress_tail = &(job->next);
+ twist(compress_have, BY, +1);
+ cur_len = (uintptr_t)(writep - readp); // bytes left in overflow_buf after staging
+ } while ((cur_len >= write_min) && more); // with write_min == 0 this runs until an empty block has been staged
+ if (cur_len) {
+ memcpy(ps_ptr->overflow_buf, readp, cur_len); // slide the unstaged remainder to the buffer front
+ }
+ *writep_ptr = (char*)(&(ps_ptr->overflow_buf[cur_len]));
+}
+
+int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_t sslen) {
+  // Appends the sslen bytes at ss to the output stream.  overflow_buf is
+  // assumed to hold 2 * PIGZ_BLOCK_SIZE bytes; whenever it fills up, the
+  // contents are pushed out through force_pzwrite() or
+  // force_compressed_pzwrite().  *writep_ptr is updated to the new write
+  // position on success.  Returns 0 on success, or the nonzero code from
+  // force_pzwrite()/flex_pzwrite() on an uncompressed write failure.
+  unsigned char* writep = (unsigned char*)(*writep_ptr);
+  unsigned char* readp = (unsigned char*)ss;
+  uint32_t cur_write_space = 2 * PIGZ_BLOCK_SIZE - ((uintptr_t)(writep - ps_ptr->overflow_buf));
+  int32_t ii;
+  while (sslen > cur_write_space) {
+    memcpy(writep, readp, cur_write_space);
+    // The flush routines measure the pending length as writep minus
+    // overflow_buf, so writep has to point past the bytes just copied;
+    // otherwise those bytes are never written.
+    writep = &(writep[cur_write_space]);
+    readp = &(readp[cur_write_space]);
+    sslen -= cur_write_space;
+    if (is_uncompressed_pzwrite(ps_ptr)) {
+      ii = force_pzwrite(ps_ptr, (char**)(&writep), PIGZ_BLOCK_SIZE + 1);
+      if (ii) {
+        return ii;
+      }
+    } else {
+      force_compressed_pzwrite(ps_ptr, (char**)(&writep), PIGZ_BLOCK_SIZE + 1);
+    }
+    // The compressed path can leave not-yet-queued bytes at the front of
+    // the buffer, so derive the free space from where writep ended up
+    // instead of assuming the buffer is empty.
+    cur_write_space = 2 * PIGZ_BLOCK_SIZE - ((uintptr_t)(writep - ps_ptr->overflow_buf));
+  }
+  memcpy(writep, readp, sslen);
+  *writep_ptr = (char*)(&(writep[sslen]));
+  return flex_pzwrite(ps_ptr, writep_ptr);
+}
+
+int32_t pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+  // Flushes whatever is still buffered, closes the descriptor, and clears
+  // overflow_buf.  Returns nonzero if either the flush or close() failed.
+  const int32_t flush_status = force_pzwrite(ps_ptr, &writep, 0);
+  const int32_t close_status = close(ps_ptr->outd);
+  ps_ptr->overflow_buf = NULL;
+  return (flush_status != 0) || (close_status != 0);
+}
+
+void compressed_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) { // finishes a compressed stream and marks *ps_ptr as closed
+ force_compressed_pzwrite(ps_ptr, &writep, 0); // write_min == 0: queue everything that is still buffered
+ drop_space(ps_ptr->next);
+ /* wait for the write thread to complete (we leave the compress threads out
+ there and waiting in case there is another stream to compress) */
+ join(writeth);
+ writeth = NULL;
+ finish_jobs();
+ ps_ptr->overflow_buf = NULL; // a NULL overflow_buf is what the *_close_cond() helpers in pigz.h test for
+}
+
+int32_t flex_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+  // Closes the stream through whichever path (compressed or uncompressed)
+  // *ps_ptr was opened with.  Only the uncompressed path can return nonzero.
+  if (!is_uncompressed_pzwrite(ps_ptr)) {
+    compressed_pzwrite_close_null(ps_ptr, writep);
+    return 0;
+  }
+  return pzwrite_close_null(ps_ptr, writep);
+}
#endif
/* catch termination signal */
@@ -1330,45 +1634,56 @@ void pigz_init(uint32_t setprocs)
#endif
yarn_prefix = g.prog;
yarn_abort = cut_short;
- g.block = BLOCKSIZE; /* 128K */
+ g.block = PIGZ_BLOCK_SIZE; /* 128K */
g.verbosity = 1; /* normal message level */
}
#endif // _WIN32
// provide identical interface for uncompressed writing, to simplify code that
// can generate either compressed or uncompressed output
-int32_t write_uncompressed(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
- unsigned char buf[BLOCKSIZE + SUPERSIZE];
+int32_t write_uncompressed(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
uint32_t overflow_ct = 0;
// if it's potentially worth compressing, it should be text, hence mode "w"
// instead of "wb"
+ // (er, that actually does the wrong thing on Windows. Fixed in pzwrite.)
FILE* outfile = fopen(out_fname, do_append? "a" : "w");
+ unsigned char* write_ptr;
uint32_t last_size;
if (!outfile) {
printf("\nError: Failed to open %s.\n", out_fname);
return 2; // RET_OPEN_FAIL
}
do {
- last_size = emitn(overflow_ct, buf);
- if (last_size > BLOCKSIZE) {
- overflow_ct = last_size - BLOCKSIZE;
- last_size = BLOCKSIZE;
+ last_size = emitn(overflow_ct, overflow_buf);
+ if (last_size > PIGZ_BLOCK_SIZE) {
+ overflow_ct = last_size - PIGZ_BLOCK_SIZE;
+ last_size = PIGZ_BLOCK_SIZE;
} else {
overflow_ct = 0;
}
if (last_size) {
- if (!fwrite(buf, last_size, 1, outfile)) {
- printf("\nError: File write failure.\n");
+ if (!fwrite(overflow_buf, last_size, 1, outfile)) {
+ fputs("\nError: File write failure.\n", stdout);
fclose(outfile);
return 6; // RET_WRITE_FAIL
}
}
if (overflow_ct) {
- memcpy(buf, &(buf[BLOCKSIZE]), overflow_ct);
+ write_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
+ while (overflow_ct > PIGZ_BLOCK_SIZE) {
+ if (!fwrite(write_ptr, PIGZ_BLOCK_SIZE, 1, outfile)) {
+ fputs("\nError: File write failure.\n", stdout);
+ fclose(outfile);
+ return 6;
+ }
+ write_ptr = &(write_ptr[PIGZ_BLOCK_SIZE]);
+ overflow_ct -= PIGZ_BLOCK_SIZE;
+ }
+ memcpy(overflow_buf, write_ptr, overflow_ct);
}
} while (last_size);
if (fclose(outfile)) {
- printf("\nError: File write failure.\n");
+ fputs("\nError: File write failure.\n", stdout);
return 6;
}
return 0;
diff --git a/pigz.h b/pigz.h
index c77926b..37913cf 100644
--- a/pigz.h
+++ b/pigz.h
@@ -6,10 +6,154 @@
#define PIGZ_BLOCK_SIZE 131072
-void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
+#ifndef _WIN32
+/* -- pool of spaces for buffer management -- */
+
+/* These routines manage a pool of spaces. Each pool specifies a fixed size
+ buffer to be contained in each space. Each space has a use count, which
+ when decremented to zero returns the space to the pool. If a space is
+ requested from the pool and the pool is empty, a space is immediately
+ created unless a specified limit on the number of spaces has been reached.
+ Only if the limit is reached will it wait for a space to be returned to the
+ pool. Each space knows what pool it belongs to, so that it can be returned.
+ */
+
+#include "yarn.h"
+
+/* a space (one buffer for each space) */
+struct space {
+ lock *use; /* use count -- return to pool when zero */
+ unsigned char *buf; /* buffer of size size */
+ size_t size; /* current size of this buffer */
+ size_t len; /* for application usage (initially zero) */
+ struct pool *pool; /* pool to return to */
+ struct space *next; /* for pool linked list */
+};
+
+/* pool of spaces (one pool for each type needed) */
+struct pool {
+ lock *have; /* unused spaces available, lock for list */
+ struct space *head; /* linked list of available buffers */
+ size_t size; /* size of new buffers in this pool */
+ int limit; /* number of new spaces allowed, or -1 */
+ int made; /* number of buffers made */
+};
+
+// Note that this does NOT actually capture anywhere near all of pigz's state;
+// there are plenty of global variables that prevent multiple
+// parallel_compress2 instances from running concurrently. It's just the bare
+// minimum to remove parallel_compress's callback requirement.
+typedef struct {
+ unsigned char* overflow_buf; // caller-supplied staging buffer; NULL while no stream is open
+ long seq; // sequence number assigned to the next compress job
+ struct space* dict; // dictionary space passed to the next compress job (NULL for the first)
+ struct space* next; // staged input block not yet queued; NULL before the first compressed write
+ int outd; // descriptor for uncompressed writing; -1 in compressed mode
+} Pigz_state;
+
+static inline uint32_t is_uncompressed_pzwrite(Pigz_state* ps_ptr) { // nonzero when *ps_ptr writes uncompressed output
+ return ps_ptr->outd != -1; // compressed_pzwrite_init() stores -1 in outd
+}
+#else
+typedef struct {
+ unsigned char* overflow_buf; // caller-supplied staging buffer; NULL while no stream is open
+ FILE* outfile; // stdio stream for uncompressed output; NULL in compressed mode
+ gzFile gz_outfile; // zlib stream for compressed output; NULL in uncompressed mode
+} Pigz_state;
+
+static inline uint32_t is_uncompressed_pzwrite(Pigz_state* ps_ptr) { // nonzero when *ps_ptr writes uncompressed output
+ return (ps_ptr->outfile != NULL); // compressed_pzwrite_init() leaves outfile NULL
+}
+#endif // _WIN32 / NOTHREAD
+
+// This interface is obsolete; compressed_pzwrite/flex_pzwrite is far easier to
+// use.
+void parallel_compress(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
+
+
+static inline void pzwrite_init_null(Pigz_state* ps_ptr) { // marks *ps_ptr as having no open stream
+ ps_ptr->overflow_buf = NULL; // the *_close_cond() helpers skip closing when this is NULL
+}
+
+int32_t pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr);
+
+void compressed_pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr);
+
+int32_t flex_pzwrite_init(uint32_t output_gz, char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr);
+
+int32_t force_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min);
+
+void force_compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min);
+
+static inline int32_t pzwrite(Pigz_state* ps_ptr, char** writep_ptr) {
+  // Flushes the uncompressed buffer only once more than PIGZ_BLOCK_SIZE
+  // bytes are pending; otherwise returns 0 without writing.
+  const uintptr_t pending = (uintptr_t)(((unsigned char*)(*writep_ptr)) - ps_ptr->overflow_buf);
+  if (pending < PIGZ_BLOCK_SIZE + 1) {
+    return 0;
+  }
+  return force_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+}
+
+static inline void compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr) {
+  // Hands the buffer to the compressed writer only once more than
+  // PIGZ_BLOCK_SIZE bytes are pending.
+  const uintptr_t pending = (uintptr_t)(((unsigned char*)(*writep_ptr)) - ps_ptr->overflow_buf);
+  if (pending < PIGZ_BLOCK_SIZE + 1) {
+    return;
+  }
+  force_compressed_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+}
+
+static inline int32_t flex_pzwrite(Pigz_state* ps_ptr, char** writep_ptr) {
+  // Once more than PIGZ_BLOCK_SIZE bytes are pending, flushes through the
+  // uncompressed or compressed writer depending on how *ps_ptr was opened.
+  // Only the uncompressed path can return nonzero.
+  const uintptr_t pending = (uintptr_t)(((unsigned char*)(*writep_ptr)) - ps_ptr->overflow_buf);
+  if (pending < PIGZ_BLOCK_SIZE + 1) {
+    return 0;
+  }
+  if (is_uncompressed_pzwrite(ps_ptr)) {
+    return force_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+  }
+  force_compressed_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+  return 0;
+}
+
+// Assumes overflow_buf has size 2 * PIGZ_BLOCK_SIZE.
+int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_t sslen);
+
+// designed to write allele codes, which are usually length-1, but could have
+// length in the millions. Assumes overflow_buf has size 2 * PIGZ_BLOCK_SIZE.
+static inline int32_t flex_pzputs_allele(Pigz_state* ps_ptr, char** writep_ptr, char* allele_code, uint32_t allele_len) {
+  // Anything other than a single character goes through the general
+  // string writer.
+  if (allele_len != 1) {
+    return flex_pzputs_std(ps_ptr, writep_ptr, allele_code, allele_len);
+  }
+  // optimize the common case: store one byte and advance the write pointer
+  (*writep_ptr)[0] = allele_code[0];
+  ++(*writep_ptr);
+  return flex_pzwrite(ps_ptr, writep_ptr);
+}
+
+int32_t pzwrite_close_null(Pigz_state* ps_ptr, char* writep);
+
+void compressed_pzwrite_close_null(Pigz_state* ps_ptr, char* writep);
+
+int32_t flex_pzwrite_close_null(Pigz_state* ps_ptr, char* writep);
+
+static inline void pzwrite_close_cond(Pigz_state* ps_ptr, char* writep) {
+  // Closes the uncompressed stream only if one is open (overflow_buf is
+  // non-NULL).  The close result is discarded.
+  if (!ps_ptr->overflow_buf) {
+    return;
+  }
+  pzwrite_close_null(ps_ptr, writep);
+}
+
+static inline void compressed_pzwrite_close_cond(Pigz_state* ps_ptr, char* writep) {
+  // Closes the compressed stream only if one is open (overflow_buf is
+  // non-NULL).
+  if (!ps_ptr->overflow_buf) {
+    return;
+  }
+  compressed_pzwrite_close_null(ps_ptr, writep);
+}
+
+static inline void flex_pzwrite_close_cond(Pigz_state* ps_ptr, char* writep) {
+  // Closes the stream only if one is open (overflow_buf is non-NULL), using
+  // the path that matches how it was opened.  The uncompressed close result
+  // is discarded.
+  if (!ps_ptr->overflow_buf) {
+    return;
+  }
+  if (!is_uncompressed_pzwrite(ps_ptr)) {
+    compressed_pzwrite_close_null(ps_ptr, writep);
+    return;
+  }
+  pzwrite_close_null(ps_ptr, writep);
+}
+
void pigz_init(uint32_t setprocs);
-int32_t write_uncompressed(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
+int32_t write_uncompressed(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
#endif // __PIGZ_H__
diff --git a/plink.c b/plink.c
index 151e2e7..b50c0c7 100644
--- a/plink.c
+++ b/plink.c
@@ -91,7 +91,7 @@
const char ver_str[] =
#ifdef STABLE_BUILD
- "PLINK v1.90b3b"
+ "PLINK v1.90b3l"
#else
"PLINK v1.90p"
#endif
@@ -104,7 +104,7 @@ const char ver_str[] =
" 32-bit"
#endif
// include trailing space if day < 10, so character length stays the same
- " (17 Jan 2015)";
+ " (18 Apr 2015)";
const char ver_str2[] =
#ifdef STABLE_BUILD
"" // (don't want this when version number has a trailing letter)
@@ -276,18 +276,14 @@ static inline uint32_t are_marker_cms_needed(uint64_t calculation_type, char* cm
}
static inline uint32_t are_marker_alleles_needed(uint64_t calculation_type, char* freqname, Homozyg_info* homozyg_ptr, Two_col_params* a1alleles, Two_col_params* a2alleles, uint32_t ld_modifier, uint32_t snp_only, uint32_t clump_modifier, uint32_t cluster_modifier) {
- return (freqname || (calculation_type & (CALC_FREQ | CALC_HARDY | CALC_MAKE_BED | CALC_MAKE_BIM | CALC_RECODE | CALC_REGRESS_PCS | CALC_MODEL | CALC_GLM | CALC_LASSO | CALC_LIST_23_INDELS | CALC_EPI | CALC_TESTMISHAP | CALC_SCORE | CALC_MENDEL | CALC_TDT | CALC_FLIPSCAN | CALC_QFAM | CALC_HOMOG | CALC_DUPVAR | CALC_RPLUGIN)) || ((calculation_type & CALC_HOMOZYG) && (homozyg_ptr->modifier & HOMOZYG_GROUP_VERBOSE)) || ((calculation_type & CALC_LD) && (ld_modifier & LD_INPHASE)) || ((calc [...]
+ return (freqname || (calculation_type & (CALC_FREQ | CALC_HARDY | CALC_MAKE_BED | CALC_MAKE_BIM | CALC_RECODE | CALC_REGRESS_PCS | CALC_MODEL | CALC_GLM | CALC_LASSO | CALC_LIST_23_INDELS | CALC_EPI | CALC_TESTMISHAP | CALC_SCORE | CALC_MENDEL | CALC_TDT | CALC_FLIPSCAN | CALC_QFAM | CALC_HOMOG | CALC_DUPVAR | CALC_RPLUGIN | CALC_DFAM)) || ((calculation_type & CALC_HOMOZYG) && (homozyg_ptr->modifier & HOMOZYG_GROUP_VERBOSE)) || ((calculation_type & CALC_LD) && (ld_modifier & LD_INPHASE [...]
}
static inline int32_t relationship_or_ibc_req(uint64_t calculation_type) {
return (relationship_req(calculation_type) || (calculation_type & CALC_IBC));
}
-static inline int32_t distance_wt_req(uint64_t calculation_type, char* read_dists_fname, uint32_t dist_calc_type) {
- return (((calculation_type & CALC_DISTANCE) || ((!read_dists_fname) && ((calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE))))) && (!(dist_calc_type & DISTANCE_FLAT_MISSING)));
-}
-
-int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, char* famname, char* cm_map_fname, char* cm_map_chrname, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* freqname, char* read_dists_fname, char* read_dists_id_fname, char* evecname, char* mergename1, char* mergename2, char* mergename3, char* missing_mid_template, char* missing_marker_id_match, char* makephen [...]
+int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, char* famname, char* cm_map_fname, char* cm_map_chrname, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* freqname, char* distance_wts_fname, char* read_dists_fname, char* read_dists_id_fname, char* evecname, char* mergename1, char* mergename2, char* mergename3, char* missing_mid_template, char* missing_marke [...]
FILE* bedfile = NULL;
FILE* phenofile = NULL;
uintptr_t unfiltered_marker_ct = 0;
@@ -345,9 +341,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
uintptr_t* ac_excl_bitfield = NULL;
double* pheno_d = NULL;
double* orig_pheno_d = NULL;
- double* marker_weights = NULL;
- uint32_t marker_weight_sum = 0;
- uint32_t* marker_weights_i = NULL;
char* sample_ids = NULL;
uintptr_t max_sample_id_len = 4;
char* paternal_ids = NULL;
@@ -369,7 +362,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
uint64_t dists_alloc = 0;
double missing_phenod = (double)missing_pheno;
double ci_zt = 0.0;
- uint32_t wt_needed = distance_wt_req(calculation_type, read_dists_fname, dist_calc_type);
uintptr_t bed_offset = 3;
uint32_t* marker_pos = NULL;
uint32_t hh_exists = 0;
@@ -406,13 +398,10 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
int32_t* hwe_haph_allfs = NULL;
pthread_t threads[MAX_THREADS];
uint32_t* uiptr;
- double* dptr;
- double* dptr2;
double* rel_ibc;
uintptr_t uljj;
uint32_t ujj;
uint32_t ukk;
- double dxx;
char* outname_end2;
int32_t ii;
int64_t llyy;
@@ -422,8 +411,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
uint32_t sample_f_male_ct;
Pedigree_rel_info pri;
uintptr_t marker_uidx;
- uintptr_t marker_uidx_stop;
- uintptr_t marker_idx;
if ((cm_map_fname || update_cm) && (!marker_cms_needed)) {
LOGPRINTF("Error: --%s results would never be used. (Did you forget --make-bed?)\n", cm_map_fname? "cm-map" : "update-cm");
@@ -724,15 +711,17 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
} else if ((calculation_type & CALC_GLM) && (glm_modifier & GLM_LOGISTIC)) {
logprint("Error: --logistic without --all-pheno requires a case/control phenotype.\n");
goto plink_ret_INVALID_CMDLINE;
- } else if (calculation_type & (CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT)) {
+ } else if (calculation_type & (CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_DFAM)) {
if (calculation_type & CALC_CMH) {
logprint("Error: --mh and --mh2 require a case/control phenotype.\n");
} else if (calculation_type & CALC_HOMOG) {
logprint("Error: --homog requires a case/control phenotype.\n");
} else if (calculation_type & CALC_TESTMISS) {
logprint("Error: --test-missing requires a case/control phenotype.\n");
- } else {
+ } else if (calculation_type & CALC_TDT) {
logprint("Error: --tdt requires a case/control phenotype.\n");
+ } else {
+ logprint("Error: --dfam requires a case/control phenotype.\n");
}
goto plink_ret_INVALID_CMDLINE;
}
@@ -1074,6 +1063,17 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
LOGPRINTF("%d %s removed due to founder status (--filter-%s).\n", ii, species_str(ii), (filter_flags & FILTER_BINARY_FOUNDERS)? "founders" : "nonfounders");
}
+ if (thin_keep_sample_prob != 1.0) {
+ if (random_thin_samples(thin_keep_sample_prob, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct)) {
+ goto plink_ret_ALL_SAMPLES_EXCLUDED;
+ }
+ } else if (thin_keep_sample_ct) {
+ retval = random_thin_samples_ct(thin_keep_sample_ct, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ }
+
if (mind_thresh < 1.0) {
retval = mind_filter(bedfile, bed_offset, outname, outname_end, mind_thresh, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, sex_male, chrom_info_ptr, om_ip);
if (retval) {
@@ -1110,7 +1110,15 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
if (g_thread_ct > 1) {
- if ((calculation_type & (CALC_RELATIONSHIP | CALC_REL_CUTOFF | CALC_GDISTANCE_MASK | CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE | CALC_GENOME | CALC_REGRESS_REL | CALC_UNRELATED_HERITABILITY | CALC_LD | CALC_PCA | CALC_MAKE_PERM_PHENO | CALC_QFAM)) || ((calculation_type & CALC_MODEL) && (model_modifier & (MODEL_PERM | MODEL_MPERM))) || ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_PERM | GLM_MPERM))) || ((calculation_type & CALC_TESTMISS) && (testmiss_modifier & [...]
+ if ((calculation_type & (CALC_RELATIONSHIP | CALC_REL_CUTOFF | CALC_GDISTANCE_MASK | CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE | CALC_GENOME | CALC_REGRESS_REL | CALC_UNRELATED_HERITABILITY | CALC_LD | CALC_PCA | CALC_MAKE_PERM_PHENO | CALC_QFAM)) || ((calculation_type & CALC_MODEL) && (model_modifier & (MODEL_PERM | MODEL_MPERM))) || ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_PERM | GLM_MPERM))) || ((calculation_type & CALC_TESTMISS) && (testmiss_modifier & [...]
+#ifndef _WIN32
+ || ((calculation_type & CALC_FREQ) && (misc_flags & MISC_FREQ_GZ))
+ || ((calculation_type & CALC_MISSING_REPORT) && (misc_flags & MISC_MISSING_GZ))
+ || ((calculation_type & CALC_HARDY) && (hwe_modifier & HWE_GZ))
+ || ((calculation_type & CALC_HET) && (misc_flags & MISC_HET_GZ))
+ || ((calculation_type & CALC_RECODE) && (((recode_modifier & (RECODE_VCF | RECODE_BGZ)) == (RECODE_VCF | RECODE_BGZ))))
+#endif
+) {
LOGPRINTF("Using up to %u threads (change this with --threads).\n", g_thread_ct);
} else {
logprint("Using 1 thread (no multithreaded calculations invoked).\n");
@@ -1207,13 +1215,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
fill_ulong_zero(marker_reverse, uii);
if (bedfile) {
- retval = calc_freqs_and_hwe(bedfile, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, founder_info, nonfounders, (misc_flags / MISC_MAF_SUCC) & 1, set_allele_freqs, bed_offset, (hwe_thresh > 0.0) || (calculation_type & CALC_HARDY), hwe_modifier & HWE_THRESH_ALL, (pheno_nm_ct && pheno_c)? ((calculation_type / CALC [...]
+ retval = calc_freqs_and_hwe(bedfile, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, founder_info, nonfounders, (misc_flags / MISC_MAF_SUCC) & 1, set_allele_freqs, bed_offset, (hwe_thresh > 0.0) || (calculation_type & CALC_HARDY), hwe_modifier & HWE_THRESH_ALL, (pheno_nm_ct && pheno_c)? ((calculation_type / CALC [...]
if (retval) {
goto plink_ret_1;
}
if (freqname) {
- retval = read_external_freqs(freqname, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr, marker_allele_ptrs, set_allele_freqs, nchrobs, (misc_flags / MISC_MAF_SUCC) & 1, exponent, wt_needed, marker_weights);
+ retval = read_external_freqs(freqname, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr, marker_allele_ptrs, set_allele_freqs, nchrobs, (misc_flags / MISC_MAF_SUCC) & 1);
if (retval) {
goto plink_ret_1;
}
@@ -1245,24 +1253,16 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
if (misc_flags & MISC_FREQ_COUNTS) {
logprint("Note: --freq 'counts' modifier has no effect on cluster-stratified report.\n");
}
- memcpy(outname_end, ".frq.strat", 11);
- retval = write_stratified_freqs(bedfile, bed_offset, outname, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_ct, sample_f_ct, founder_info, nonfounders, sex_male, sample_f_male_ct, marker_reverse, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len);
+ retval = write_stratified_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_ct, sample_f_ct, founder_info, nonfounders, sex_male, sample_f_male_ct, marker_reverse, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len);
} else {
- if (misc_flags & MISC_FREQX) {
- memcpy(outname_end, ".frqx", 6);
- } else if (misc_flags & MISC_FREQ_COUNTS) {
- memcpy(outname_end, ".frq.count", 11);
- } else {
- memcpy(outname_end, ".frq", 5);
- }
- retval = write_freqs(outname, plink_maxsnp, unfiltered_marker_ct, marker_exclude, set_allele_freqs, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, hwe_hapl_allfs, hwe_haph_allfs, sample_f_ct, sample_f_male_ct, nonfounders, misc_flags, marker_reverse);
+ retval = write_freqs(outname, outname_end, plink_maxsnp, unfiltered_marker_ct, marker_exclude, set_allele_freqs, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, hwe_hapl_allfs, hwe_haph_allfs, sample_f_ct, sample_f_male_ct, nonfounders, misc_flags, marker_reverse);
}
if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ))))) {
goto plink_ret_1;
}
}
if (calculation_type & CALC_MISSING_REPORT) {
- retval = write_missingness_reports(bedfile, bed_offset, outname, outname_end, plink_maxfid, plink_maxiid, plink_maxsnp, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, chrom_info_ptr, om_ip, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_ct, sample_exclude, pheno_nm, sex_male, sample_male_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, hh_exists);
+ retval = write_missingness_reports(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_MISSING_GZ) & 1, plink_maxfid, plink_maxiid, plink_maxsnp, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, chrom_info_ptr, om_ip, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_ct, sample_exclude, pheno_nm, sex_male, sample_male_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, hh_exists);
if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ | CALC_MISSING_REPORT))))) {
goto plink_ret_1;
}
@@ -1327,10 +1327,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
}
-
- if (wt_needed) {
- calc_marker_weights(exponent, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, marker_weights);
- }
wkspace_reset(hwe_lls);
}
if (sip->fname) {
@@ -1372,50 +1368,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
- if (wt_needed) {
- // normalize included marker weights to add to just under 2^32. (switch to
- // 2^64 if/when 32-bit performance becomes less important than accuracy on
- // 50+ million marker sets.)
- dxx = 0.0;
- marker_uidx = 0;
- marker_idx = 0;
- do {
- marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
- marker_uidx_stop = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
- marker_idx += marker_uidx_stop - marker_uidx;
- dptr = &(marker_weights[marker_uidx]);
- dptr2 = &(marker_weights[marker_uidx_stop]);
- marker_uidx = marker_uidx_stop;
- do {
- dxx += *dptr++;
- } while (dptr < dptr2);
- } while (marker_idx < marker_ct);
- // subtract marker_ct to guard against marker_weight_sum overflow from
- // rounding
- dxx = (4294967296.0 - ((double)((intptr_t)marker_ct))) / dxx;
- if (wkspace_alloc_ui_checked(&marker_weights_i, marker_idx * sizeof(int32_t))) {
- goto plink_ret_NOMEM;
- }
- marker_uidx = 0;
- marker_idx = 0;
- uiptr = marker_weights_i;
- do {
- marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
- marker_uidx_stop = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
- marker_idx += marker_uidx_stop - marker_uidx;
- dptr = &(marker_weights[marker_uidx]);
- dptr2 = &(marker_weights[marker_uidx_stop]);
- marker_uidx = marker_uidx_stop;
- do {
- uii = (uint32_t)((*dptr++) * dxx + 0.5);
- marker_weight_sum += uii;
- *uiptr++ = uii;
- } while (dptr < dptr2);
- } while (marker_idx < marker_ct);
- wkspace_left += topsize;
- topsize = 0;
- }
-
if (relationship_or_ibc_req(calculation_type)) {
if (relip->pca_cluster_names_flattened || relip->pca_clusters_fname) {
retval = extract_clusters(unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, relip->pca_cluster_names_flattened, relip->pca_clusters_fname, &pca_sample_exclude, &pca_sample_ct);
@@ -1434,7 +1386,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
ulii = unfiltered_sample_ct - pca_sample_ct;
}
}
- retval = calc_rel(threads, parallel_idx, parallel_tot, calculation_type, relip, bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ct, unfiltered_sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? (&ulii) : (&sample_exclude_ct), sample_ids, max_sample_id_len, set_allele_freqs, &rel_ibc, chrom_info_ptr);
+ retval = calc_rel(threads, parallel_idx, parallel_tot, calculation_type, relip, bedfile, bed_offset, outname, outname_end, distance_wts_fname, (dist_calc_type & DISTANCE_WTS_NOHEADER), unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? (&ulii) : (&sample_exclude_ct), sample_ids, max_sample_id_len, set_allele_freqs, &rel_ibc, chrom_info_ptr);
if (retval) {
goto plink_ret_1;
}
@@ -1748,7 +1700,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
} else
*/
if (distance_req(calculation_type, read_dists_fname)) {
- retval = calc_distance(threads, parallel_idx, parallel_tot, bedfile, bed_offset, outname, outname_end, calculation_type, dist_calc_type, marker_exclude, marker_ct, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr, wt_needed, marker_weight_sum, marker_weights_i, exponent);
+ retval = calc_distance(threads, parallel_idx, parallel_tot, bedfile, bed_offset, outname, outname_end, read_dists_fname, distance_wts_fname, distance_exp, calculation_type, dist_calc_type, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr);
if (retval) {
goto plink_ret_1;
}
@@ -1811,7 +1763,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
if (calculation_type & CALC_HET) {
- retval = het_report(bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, (misc_flags & MISC_HET_SMALL_SAMPLE)? founder_info : NULL, chrom_info_ptr, set_allele_freqs);
+ retval = het_report(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_HET_GZ) & 1, unfiltered_marker_ct, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, (misc_flags & MISC_HET_SMALL_SAMPLE)? founder_info : NULL, chrom_info_ptr, set_allele_freqs);
if (retval) {
goto plink_ret_1;
}
@@ -1858,7 +1810,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
#endif
- if (calculation_type & (CALC_MODEL | CALC_GXE | CALC_GLM | CALC_LASSO | CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_QFAM)) {
+ if (calculation_type & (CALC_MODEL | CALC_GXE | CALC_GLM | CALC_LASSO | CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_DFAM | CALC_QFAM)) {
// can't use pheno_ctrl_ct in here since new phenotypes may be loaded, and
// we don't bother updating it...
if ((!pheno_all) && (!loop_assoc_fname)) {
@@ -2038,7 +1990,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
if ((calculation_type & CALC_TDT) && pheno_c) {
- retval = tdt(threads, bedfile, bed_offset, outname, outname_end2, ci_size, ci_zt, pfilter, output_min_p, mtest_adjust, adjust_lambda, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, mperm_save, pheno_nm, pheno_c, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, [...]
+ retval = tdt(threads, bedfile, bed_offset, outname, outname_end2, ci_size, ci_zt, pfilter, output_min_p, mtest_adjust, adjust_lambda, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, apip, mperm_save, pheno_nm, pheno_c, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal [...]
+ if (retval) {
+ goto plink_ret_1;
+ }
+ }
+ if ((calculation_type & CALC_DFAM) && pheno_c) {
+ retval = dfam(threads, bedfile, bed_offset, outname, outname_end2, pfilter, output_min_p, mtest_adjust, adjust_lambda, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, loop_assoc_fname? NULL : cluster_starts, apip, mperm_save, pheno_c, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, [...]
if (retval) {
goto plink_ret_1;
}
@@ -2936,6 +2894,7 @@ int32_t main(int32_t argc, char** argv) {
char* filtervals_flattened = NULL;
char* evecname = NULL;
char* filtername = NULL;
+ char* distance_wts_fname = NULL;
char* read_dists_fname = NULL;
char* read_dists_id_fname = NULL;
char* freqname = NULL;
@@ -3006,13 +2965,15 @@ int32_t main(int32_t argc, char** argv) {
uint64_t misc_flags = 0;
uint64_t filter_flags = 0;
double thin_keep_prob = 1.0;
+ double thin_keep_sample_prob = 1.0;
uint32_t thin_keep_ct = 0;
+ uint32_t thin_keep_sample_ct = 0;
uint32_t min_bp_space = 0;
uint32_t check_sex_f_yobs = 0;
uint32_t check_sex_m_yobs = 0;
double check_sex_fthresh = 0.2;
double check_sex_mthresh = 0.8;
- double exponent = 0.0;
+ double distance_exp = 0.0;
double min_maf = 0.0;
double max_maf = 0.5;
double geno_thresh = 1.0;
@@ -3568,7 +3529,6 @@ int32_t main(int32_t argc, char** argv) {
memcpy(flagptr, "snp", 4);
break;
} else if (!strcmp(argptr, "exponent")) {
- fputs("Note: --exponent flag has been renamed to --distance-exp.\n", stdout);
memcpy(flagptr, "distance-exp", 13);
break;
}
@@ -3644,6 +3604,9 @@ int32_t main(int32_t argc, char** argv) {
} else if (!strcmp(argptr, "max-ac")) {
memcpy(flagptr, "max-mac", 8);
break;
+ } else if (!strcmp(argptr, "max-indv")) {
+ memcpy(flagptr, "thin-indiv-count", 17);
+ break;
}
goto main_flag_copy;
case 'n':
@@ -4012,7 +3975,7 @@ int32_t main(int32_t argc, char** argv) {
case 'R':
if (*argptr2 == '\0') {
#if defined __cplusplus && !defined _WIN32
- UNSTABLE;
+ UNSTABLE("R");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 2)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -4587,7 +4550,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(cluster.cmh_mperm_val))) {
- sprintf(logbuf, "Error: Invalid --bd mperm parameter '%s'.\n", argv[cur_arg + uii]);
+ sprintf(logbuf, "Error: Invalid --bd mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
goto main_ret_INVALID_CMDLINE_WWA;
}
cluster.modifier |= CLUSTER_CMH_MPERM;
@@ -4936,7 +4899,7 @@ int32_t main(int32_t argc, char** argv) {
cluster.modifier |= CLUSTER_MISSING;
goto main_param_zero;
} else if (!memcmp(argptr2, "file", 5)) {
- UNSTABLE;
+ UNSTABLE("cfile");
if (load_rare || load_params) {
goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
}
@@ -4958,7 +4921,7 @@ int32_t main(int32_t argc, char** argv) {
memcpy(memcpya(mapname, sptr, uii), ".cnv.map", 9);
load_rare = LOAD_RARE_CNV;
} else if (!memcmp(argptr2, "nv-count", 9)) {
- UNSTABLE;
+ UNSTABLE("cnv-count");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -4968,15 +4931,15 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_intersect_filter_type = CNV_COUNT;
} else if (!memcmp(argptr2, "nv-del", 7)) {
- UNSTABLE;
+ UNSTABLE("cnv-del");
cnv_calc_type |= CNV_DEL;
goto main_param_zero;
} else if (!memcmp(argptr2, "nv-disrupt", 11)) {
- UNSTABLE;
+ UNSTABLE("cnv-disrupt");
cnv_overlap_type = CNV_DISRUPT;
goto main_param_zero;
} else if (!memcmp(argptr2, "nv-dup", 7)) {
- UNSTABLE;
+ UNSTABLE("cnv-dup");
if (cnv_calc_type & CNV_DEL) {
logprint("Error: --cnv-dup cannot be used with --cnv-del.\n");
goto main_ret_INVALID_CMDLINE_A;
@@ -4984,7 +4947,7 @@ int32_t main(int32_t argc, char** argv) {
cnv_calc_type |= CNV_DUP;
goto main_param_zero;
} else if (!memcmp(argptr2, "nv-enrichment-test", 19)) {
- UNSTABLE;
+ UNSTABLE("cnv-enrichment-test");
if (!cnv_intersect_filter_type) {
logprint("Error: --cnv-enrichment-test must be used with --cnv-count.\n");
goto main_ret_INVALID_CMDLINE_A;
@@ -5000,7 +4963,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_calc_type |= CNV_ENRICHMENT_TEST;
} else if (!memcmp(argptr2, "nv-exclude", 11)) {
- UNSTABLE;
+ UNSTABLE("cnv-exclude");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -5014,11 +4977,11 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_intersect_filter_type = CNV_EXCLUDE;
} else if (!memcmp(argptr2, "nv-exclude-off-by-1", 20)) {
- UNSTABLE;
+ UNSTABLE("cnv-exclude-off-by-1");
cnv_calc_type |= CNV_EXCLUDE_OFF_BY_1;
goto main_param_zero;
} else if (!memcmp(argptr2, "nv-freq-exclude-above", 22)) {
- UNSTABLE;
+ UNSTABLE("cnv-freq-exclude-above");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -5028,7 +4991,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_freq_type = CNV_FREQ_EXCLUDE_ABOVE;
} else if (!memcmp(argptr2, "nv-freq-exclude-below", 22)) {
- UNSTABLE;
+ UNSTABLE("cnv-freq-exclude-below");
if (cnv_freq_type) {
logprint("Error: --cnv-freq-exclude-below cannot be used with --cnv-freq-exclude-above.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5042,7 +5005,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_freq_type = CNV_FREQ_EXCLUDE_BELOW;
} else if (!memcmp(argptr2, "nv-freq-exclude-exact", 22)) {
- UNSTABLE;
+ UNSTABLE("cnv-freq-exclude-exact");
if (cnv_freq_type) {
logprint("Error: --cnv-freq-exclude-exact cannot be used with\n--cnv-freq-exclude-above/-below.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5056,7 +5019,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_freq_type = CNV_FREQ_EXCLUDE_EXACT;
} else if (!memcmp(argptr2, "nv-freq-include-exact", 22)) {
- UNSTABLE;
+ UNSTABLE("cnv-freq-include-exact");
if (cnv_freq_type) {
logprint("Error: --cnv-freq-include-exact cannot be used with\n--cnv-freq-exclude-above/-below/-exact.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5070,7 +5033,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_freq_type = CNV_FREQ_INCLUDE_EXACT;
} else if (!memcmp(argptr2, "nv-freq-method2", 16)) {
- UNSTABLE;
+ UNSTABLE("cnv-freq-method2");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -5086,7 +5049,7 @@ int32_t main(int32_t argc, char** argv) {
cnv_freq_val2 = SMALLISH_EPSILON;
}
} else if (!memcmp(argptr2, "nv-freq-overlap", 16)) {
- UNSTABLE;
+ UNSTABLE("cnv-freq-overlap");
if (!(cnv_freq_type & CNV_FREQ_FILTER)) {
logprint("Error: --cnv-freq-overlap must be used with --cnv-freq-include-exact or\n--cnv-freq-exclude-above/-below/-exact.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5108,7 +5071,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_freq_type |= CNV_FREQ_OVERLAP;
} else if (!memcmp(argptr2, "nv-indiv-perm", 14)) {
- UNSTABLE;
+ UNSTABLE("cnv-indiv-perm");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -5120,7 +5083,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_calc_type |= CNV_SAMPLE_PERM;
} else if (!memcmp(argptr2, "nv-intersect", 13)) {
- UNSTABLE;
+ UNSTABLE("cnv-intersect");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -5134,7 +5097,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_intersect_filter_type = CNV_INTERSECT;
} else if (!memcmp(argptr2, "nv-kb", 6)) {
- UNSTABLE;
+ UNSTABLE("cnv-kb");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -5144,7 +5107,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_min_seglen = (int32_t)(dxx * 1000 * (1 + SMALL_EPSILON));
} else if (!memcmp(argptr2, "nv-list", 8)) {
- UNSTABLE;
+ UNSTABLE("cnv-list");
if ((load_rare & (~LOAD_RARE_CNV)) || load_params) {
goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
}
@@ -5158,7 +5121,7 @@ int32_t main(int32_t argc, char** argv) {
strcpya(pedname, argv[cur_arg + 1]);
load_rare = LOAD_RARE_CNV;
} else if (!memcmp(argptr2, "nv-make-map", 12)) {
- UNSTABLE;
+ UNSTABLE("cnv-make-map");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-make-map cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5176,7 +5139,7 @@ int32_t main(int32_t argc, char** argv) {
cnv_calc_type |= CNV_MAKE_MAP | CNV_MAKE_MAP_LONG;
}
} else if (!memcmp(argptr2, "nv-max-kb", 10)) {
- UNSTABLE;
+ UNSTABLE("cnv-max-kb");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-max-kb cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5194,7 +5157,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
} else if (!memcmp(argptr2, "nv-max-score", 13)) {
- UNSTABLE;
+ UNSTABLE("cnv-max-score");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-max-score cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5207,7 +5170,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_WWA;
}
} else if (!memcmp(argptr2, "nv-max-sites", 13)) {
- UNSTABLE;
+ UNSTABLE("cnv-max-sites");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-max-sites cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5220,7 +5183,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_WWA;
}
} else if (!memcmp(argptr2, "nv-overlap", 11)) {
- UNSTABLE;
+ UNSTABLE("cnv-overlap");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-overlap cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5247,7 +5210,7 @@ int32_t main(int32_t argc, char** argv) {
}
}
} else if (!memcmp(argptr2, "nv-region-overlap", 18)) {
- UNSTABLE;
+ UNSTABLE("cnv-region-overlap");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-region-overlap cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5264,7 +5227,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_overlap_type = CNV_OVERLAP_REGION;
} else if (!memcmp(argptr2, "nv-score", 9)) {
- UNSTABLE;
+ UNSTABLE("cnv-score");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-score cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5281,7 +5244,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
} else if (!memcmp(argptr2, "nv-sites", 9)) {
- UNSTABLE;
+ UNSTABLE("cnv-sites");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-sites cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5298,7 +5261,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
} else if (!memcmp(argptr2, "nv-subset", 10)) {
- UNSTABLE;
+ UNSTABLE("cnv-subset");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-subset cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5314,7 +5277,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_1;
}
} else if (!memcmp(argptr2, "nv-test", 8)) {
- UNSTABLE;
+ UNSTABLE("cnv-test");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-test cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5349,7 +5312,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_calc_type |= CNV_TEST;
} else if (!memcmp(argptr2, "nv-test-1sided", 15)) {
- UNSTABLE;
+ UNSTABLE("cnv-test-1sided");
if (cnv_calc_type & CNV_TEST_FORCE_2SIDED) {
logprint("Error: --cnv-test cannot be both 1-sided and 2-sided at the same time.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5357,7 +5320,7 @@ int32_t main(int32_t argc, char** argv) {
logprint("Note: --cnv-test-1sided flag deprecated. Use '--cnv-test 1sided'.\n");
cnv_calc_type |= CNV_TEST_FORCE_1SIDED;
} else if (!memcmp(argptr2, "nv-test-2sided", 15)) {
- UNSTABLE;
+ UNSTABLE("cnv-test-2sided");
if (cnv_calc_type & CNV_TEST_FORCE_1SIDED) {
logprint("Error: --cnv-test cannot be both 1-sided and 2-sided at the same time.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5365,7 +5328,7 @@ int32_t main(int32_t argc, char** argv) {
logprint("Note: --cnv-test-2sided flag deprecated. Use '--cnv-test 2sided'.\n");
cnv_calc_type |= CNV_TEST_FORCE_2SIDED;
} else if (!memcmp(argptr2, "nv-test-region", 15)) {
- UNSTABLE;
+ UNSTABLE("cnv-test-region");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-test-region cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5381,7 +5344,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_calc_type |= CNV_TEST_REGION;
} else if (!memcmp(argptr2, "nv-test-window", 15)) {
- UNSTABLE;
+ UNSTABLE("cnv-test-window");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-test-window cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5400,7 +5363,7 @@ int32_t main(int32_t argc, char** argv) {
cnv_test_window = (int32_t)(dxx * (1 + SMALL_EPSILON));
}
} else if (!memcmp(argptr2, "nv-union-overlap", 17)) {
- UNSTABLE;
+ UNSTABLE("cnv-union-overlap");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-union-overlap cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5417,7 +5380,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_overlap_type = CNV_OVERLAP_UNION;
} else if (!memcmp(argptr2, "nv-write", 9)) {
- UNSTABLE;
+ UNSTABLE("cnv-write");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-write cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5438,7 +5401,7 @@ int32_t main(int32_t argc, char** argv) {
}
cnv_calc_type |= CNV_WRITE;
} else if (!memcmp(argptr2, "nv-write-freq", 14)) {
- UNSTABLE;
+ UNSTABLE("cnv-write-freq");
if (!(load_rare & LOAD_RARE_CNV)) {
logprint("Error: --cnv-write freq cannot be used without a .cnv fileset.\n");
goto main_ret_INVALID_CMDLINE;
@@ -5945,12 +5908,46 @@ int32_t main(int32_t argc, char** argv) {
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
- if (scan_double(argv[cur_arg + 1], &exponent)) {
+ if (scan_double(argv[cur_arg + 1], &distance_exp)) {
sprintf(logbuf, "Error: Invalid --distance-exp parameter '%s'.\n", argv[cur_arg + 1]);
goto main_ret_INVALID_CMDLINE_WW;
}
+ fputs("Note: '--distance-exp [x]' deprecated. Use '--distance-weights exp=[x]' instead.\n", stdout);
+ } else if (!memcmp(argptr2, "istance-wts", 12)) {
+ if (distance_exp != 0.0) {
+ logprint("Error: --distance-wts cannot be used with --distance-exp.\n");
+ goto main_ret_INVALID_CMDLINE;
+ } else if (calculation_type & CALC_PLINK1_DISTANCE_MATRIX) {
+ logprint("Error: --distance-wts cannot be used with --distance-matrix.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 2)) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ if ((strlen(argv[cur_arg + 1]) > 4) && (!memcmp(argv[cur_arg + 1], "exp=", 4))) {
+ if (scan_double(&(argv[cur_arg + 1][4]), &distance_exp)) {
+ sprintf(logbuf, "Error: Invalid --distance-wts exponent '%s'.\n", &(argv[cur_arg + 1][4]));
+ goto main_ret_INVALID_CMDLINE_WW;
+ }
+ } else {
+ UNSTABLE("distance-wts");
+ uii = 1;
+ if (param_ct == 2) {
+ if (!strcmp(argv[cur_arg + 1], "noheader")) {
+ uii = 2;
+ } else if (strcmp(argv[cur_arg + 2], "noheader")) {
+ sprintf(logbuf, "Error: Invalid --distance-wts parameter '%s'.\n", argv[cur_arg + 2]);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ dist_calc_type |= DISTANCE_WTS_NOHEADER;
+ }
+ retval = alloc_fname(&distance_wts_fname, argv[cur_arg + uii], argptr, 0);
+ if (retval) {
+ goto main_ret_1;
+ }
+ }
} else if (!memcmp(argptr2, "istance-matrix", 15)) {
- if (exponent != 0.0) {
+ if (distance_exp != 0.0) {
logprint("Error: --distance-matrix cannot be used with --distance-exp.\n");
goto main_ret_INVALID_CMDLINE;
}
@@ -6152,6 +6149,56 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
load_rare = LOAD_RARE_DOSAGE;
+ } else if (!memcmp(argptr2, "fam", 4)) {
+ UNSTABLE("dfam");
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 4)) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ for (uii = 1; uii <= param_ct; uii++) {
+ if (!strcmp(argv[cur_arg + uii], "no-unrelateds")) {
+ family_info.dfam_modifier |= DFAM_NO_UNRELATEDS;
+ } else if (!strcmp(argv[cur_arg + uii], "perm")) {
+ if (family_info.dfam_modifier & DFAM_MPERM) {
+ logprint("Error: --dfam 'mperm' and 'perm' cannot be used together.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
+ family_info.dfam_modifier |= DFAM_PERM;
+ } else if (!strcmp(argv[cur_arg + uii], "perm-count")) {
+ family_info.dfam_modifier |= DFAM_PERM_COUNT;
+ } else if ((strlen(argv[cur_arg + uii]) > 6) && (!memcmp(argv[cur_arg + uii], "mperm=", 6))) {
+ if (family_info.dfam_modifier & DFAM_PERM) {
+ logprint("Error: --dfam 'mperm' and 'perm' cannot be used together.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ } else if (family_info.dfam_modifier & DFAM_MPERM) {
+ logprint("Error: Duplicate --dfam 'mperm' modifier.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
+ if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &family_info.dfam_mperm_val)) {
+ sprintf(logbuf, "Error: Invalid --dfam mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ family_info.dfam_modifier |= DFAM_MPERM;
+ } else if (!strcmp(argv[cur_arg + uii], "set-test")) {
+ family_info.dfam_modifier |= DFAM_SET_TEST;
+ } else if (!strcmp(argv[cur_arg + uii], "mperm")) {
+ logprint("Error: Improper --dfam mperm syntax. (Use '--dfam mperm=[value]'.)\n");
+ goto main_ret_INVALID_CMDLINE;
+ } else {
+ sprintf(logbuf, "Error: Invalid --dfam parameter '%s'.\n", argv[cur_arg + uii]);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ }
+ calculation_type |= CALC_DFAM;
+ } else if (!memcmp(argptr2, "fam-no-unrelateds", 18)) {
+ // keep this undocumented flag since it makes DFAM correspond to the
+ // original sib-TDT.
+ if (!(calculation_type & CALC_DFAM)) {
+ logprint("Error: --dfam-no-unrelateds must be used with --dfam.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
+ family_info.dfam_modifier |= DFAM_NO_UNRELATEDS;
+ logprint("Note: --dfam-no-unrelateds flag deprecated. Use '--dfam no-unrelateds'.\n");
+ goto main_param_zero;
} else if (!memcmp(argptr2, "prime", 6)) {
logprint("Note: --dprime flag deprecated. Use e.g. '--r2 dprime'.\n");
ld_info.modifier |= LD_DPRIME;
@@ -6374,19 +6421,24 @@ int32_t main(int32_t argc, char** argv) {
filter_flags |= FILTER_FAM_REQ | FILTER_BINARY_NONFOUNDERS;
goto main_param_zero;
} else if (!memcmp(argptr2, "req", 4)) {
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
goto main_ret_INVALID_CMDLINE_2A;
}
- if (param_ct) {
- if (strcmp(argv[cur_arg + 1], "counts")) {
- sprintf(logbuf, "Error: Invalid --freq parameter '%s'.\n", argv[cur_arg + 1]);
+ for (uii = 1; uii <= param_ct; uii++) {
+ if (!strcmp(argv[cur_arg + uii], "counts")) {
+ misc_flags |= MISC_FREQ_COUNTS;
+ } else if (!strcmp(argv[cur_arg + uii], "gz")) {
+ misc_flags |= MISC_FREQ_GZ;
+ } else {
+ sprintf(logbuf, "Error: Invalid --freq parameter '%s'.\n", argv[cur_arg + uii]);
goto main_ret_INVALID_CMDLINE_WWA;
}
- misc_flags |= MISC_FREQ_COUNTS;
}
calculation_type |= CALC_FREQ;
if (misc_flags & MISC_FREQ_COUNTS) {
// --keep-allele-order also set for backward compatibility
+ // placed here instead of a few lines up because '--freq --counts' is
+ // permitted
misc_flags |= MISC_KEEP_ALLELE_ORDER;
}
} else if (!memcmp(argptr2, "reqx", 5)) {
@@ -6394,9 +6446,18 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --freqx cannot be used with --freq.\n");
goto main_ret_INVALID_CMDLINE_A;
}
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ if (param_ct) {
+ if (strcmp(argv[cur_arg + 1], "gz")) {
+ sprintf(logbuf, "Error: Invalid --freqx parameter '%s'.\n", argv[cur_arg + 1]);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ misc_flags |= MISC_FREQ_GZ;
+ }
calculation_type |= CALC_FREQ;
misc_flags |= MISC_FREQX;
- goto main_param_zero;
} else if (!memcmp(argptr2, "rom", 4)) {
if (chrom_flag_present) {
logprint("Error: --from cannot be used with --autosome{-xy} or --{not-}chr.\n");
@@ -6597,6 +6658,10 @@ int32_t main(int32_t argc, char** argv) {
logprint("Note: --flip-scan-verbose flag deprecated. Use '--flip-scan verbose'.\n");
ld_info.modifier |= LD_FLIPSCAN_VERBOSE;
} else if (!memcmp(argptr2, "amily", 6)) {
+ if (calculation_type & CALC_DFAM) {
+ logprint("Error: --family cannot be used with --dfam.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
misc_flags |= MISC_FAMILY_CLUSTERS;
filter_flags |= FILTER_FAM_REQ;
goto main_param_zero;
@@ -6785,7 +6850,7 @@ int32_t main(int32_t argc, char** argv) {
mtest_adjust |= ADJUST_GC;
goto main_param_zero;
} else if (!memcmp(argptr2, "file", 5)) {
- UNSTABLE;
+ UNSTABLE("gfile");
if (load_rare || (load_params & (~LOAD_PARAMS_FAM))) {
goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
}
@@ -6978,30 +7043,36 @@ int32_t main(int32_t argc, char** argv) {
hwe_modifier |= HWE_THRESH_ALL;
goto main_param_zero;
} else if (!memcmp(argptr2, "et", 3)) {
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
goto main_ret_INVALID_CMDLINE_2A;
}
- if (param_ct) {
- if (strcmp(argv[cur_arg + 1], "small-sample")) {
- sprintf(logbuf, "Error: Invalid --het parameter '%s'.\n", argv[cur_arg + 1]);
+ for (uii = 1; uii <= param_ct; uii++) {
+ if (!strcmp(argv[cur_arg + uii], "small-sample")) {
+ misc_flags |= MISC_HET_SMALL_SAMPLE;
+ } else if (!strcmp(argv[cur_arg + uii], "gz")) {
+ misc_flags |= MISC_HET_GZ;
+ } else {
+ sprintf(logbuf, "Error: Invalid --het parameter '%s'.\n", argv[cur_arg + uii]);
goto main_ret_INVALID_CMDLINE_WWA;
}
- misc_flags |= MISC_HET_SMALL_SAMPLE;
}
calculation_type |= CALC_HET;
} else if ((!memcmp(argptr2, "ardy", 5)) || (!memcmp(argptr2, "ardy midp", 10))) {
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
goto main_ret_INVALID_CMDLINE_2A;
}
if (argptr2[4]) {
hwe_modifier |= HWE_MIDP;
}
- if (param_ct) {
- if (strcmp(argv[cur_arg + 1], "midp")) {
- sprintf(logbuf, "Error: Invalid --hardy parameter '%s'.\n", argv[cur_arg + 1]);
+ for (uii = 1; uii <= param_ct; uii++) {
+ if (!strcmp(argv[cur_arg + uii], "midp")) {
+ hwe_modifier |= HWE_MIDP;
+ } else if (!strcmp(argv[cur_arg + uii], "gz")) {
+ hwe_modifier |= HWE_GZ;
+ } else {
+ sprintf(logbuf, "Error: Invalid --hardy parameter '%s'.\n", argv[cur_arg + uii]);
goto main_ret_INVALID_CMDLINE_WWA;
}
- hwe_modifier |= HWE_MIDP;
}
calculation_type |= CALC_HARDY;
} else if (!memcmp(argptr2, "omozyg", 7)) {
@@ -7923,6 +7994,10 @@ int32_t main(int32_t argc, char** argv) {
}
}
calculation_type |= CALC_DUPVAR;
+ } else if (!memcmp(argptr2, "d-pred", 7)) {
+ logprint("Error: --ld-pred is currently under development.\n");
+ retval = RET_CALC_NOT_YET_SUPPORTED;
+ goto main_ret_1;
} else if ((!memcmp(argptr2, "ookup", 6)) ||
(!memcmp(argptr2, "ookup-list", 11)) ||
(!memcmp(argptr2, "ookup-gene", 11)) ||
@@ -8099,6 +8174,10 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --make-grm-bin cannot be used with --make-grm-gz.\n");
goto main_ret_INVALID_CMDLINE_A;
}
+ if (distance_exp != 0.0) {
+ logprint("Error: '--distance-wts exp=[x]' cannot be used with --make-grm-gz.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -8136,6 +8215,10 @@ int32_t main(int32_t argc, char** argv) {
}
calculation_type |= CALC_RELATIONSHIP;
} else if (!memcmp(argptr2, "ake-grm-bin", 12)) {
+ if (distance_exp != 0.0) {
+ logprint("Error: '--distance-wts exp=[x]' cannot be used with --make-grm-bin.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -8160,6 +8243,10 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --make-rel cannot be used with --make-grm-gz/--make-grm-bin.\n");
goto main_ret_INVALID_CMDLINE_A;
}
+ if (distance_exp != 0.0) {
+ logprint("Error: '--distance-wts exp=[x]' cannot be used with --make-rel.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 3)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -8538,6 +8625,9 @@ int32_t main(int32_t argc, char** argv) {
} else if (glm_modifier & (GLM_PERM | GLM_MPERM)) {
sprintf(logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_PERM)? "" : "m");
goto main_ret_INVALID_CMDLINE_2A;
+ } else if (family_info.dfam_modifier & (DFAM_PERM | DFAM_MPERM)) {
+ sprintf(logbuf, "Error: --mperm cannot be used with --dfam %sperm.\n", (family_info.dfam_modifier & DFAM_PERM)? "" : "m");
+ goto main_ret_INVALID_CMDLINE_2A;
} else if (cluster.modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_MPERM)) {
sprintf(logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (cluster.modifier & CLUSTER_CMH_BD)? "bd" : "mh", (cluster.modifier & CLUSTER_CMH_PERM)? "" : "m");
goto main_ret_INVALID_CMDLINE_2A;
@@ -8581,6 +8671,8 @@ int32_t main(int32_t argc, char** argv) {
testmiss_modifier |= TESTMISS_MPERM;
family_info.tdt_mperm_val = mperm_val;
family_info.tdt_modifier |= TDT_MPERM;
+ family_info.dfam_mperm_val = mperm_val;
+ family_info.dfam_modifier |= DFAM_MPERM;
family_info.qfam_mperm_val = mperm_val;
family_info.qfam_modifier |= QFAM_MPERM;
cluster.cmh_mperm_val = mperm_val;
@@ -8763,8 +8855,17 @@ int32_t main(int32_t argc, char** argv) {
}
filter_flags |= FILTER_FAM_REQ | FILTER_MAKE_FOUNDERS;
} else if (!memcmp(argptr2, "issing", 7)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ if (param_ct) {
+ if (strcmp(argv[cur_arg + 1], "gz")) {
+ sprintf(logbuf, "Error: Invalid --missing parameter '%s'.\n", argv[cur_arg + 1]);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ misc_flags |= MISC_MISSING_GZ;
+ }
calculation_type |= CALC_MISSING_REPORT;
- goto main_param_zero;
} else if (!memcmp(argptr2, "h", 2)) {
if (calculation_type & CALC_CMH) {
logprint("Error: --mh is redundant with --bd.\n");
@@ -8789,7 +8890,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(cluster.cmh_mperm_val))) {
- sprintf(logbuf, "Error: Invalid --mh mperm parameter '%s'.\n", argv[cur_arg + uii]);
+ sprintf(logbuf, "Error: Invalid --mh mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
goto main_ret_INVALID_CMDLINE_WWA;
}
cluster.modifier |= CLUSTER_CMH_MPERM;
@@ -9079,7 +9180,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_NOMEM;
}
} else if (!memcmp(argptr2, "ac", 3)) {
- UNSTABLE;
+ UNSTABLE("mac");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -9088,7 +9189,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_WWA;
}
} else if (!memcmp(argptr2, "ax-mac", 7)) {
- UNSTABLE;
+ UNSTABLE("max-mac");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -9322,7 +9423,11 @@ int32_t main(int32_t argc, char** argv) {
}
} else if (!memcmp(argptr2, "xford-single-chr", 17)) {
if (!(load_params & LOAD_PARAMS_OXGEN)) {
- logprint("Error: --oxford-single-chr must be used with .gen input.\n");
+ if (load_params & LOAD_PARAMS_OXBGEN) {
+ logprint("Error: --oxford-single-chr must be used with .gen input. (Single-chromosome\n.bgen files do not require this, since they still contain chromosome codes.)\n");
+ } else {
+ logprint("Error: --oxford-single-chr must be used with .gen input.\n");
+ }
goto main_ret_INVALID_CMDLINE_A;
}
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
@@ -9472,14 +9577,19 @@ int32_t main(int32_t argc, char** argv) {
ppc_gap = (int32_t)(dxx * (1 + SMALL_EPSILON));
}
} else if (!memcmp(argptr2, "erm", 4)) {
- if ((model_modifier & MODEL_MPERM) && (calculation_type & CALC_MODEL)) {
- sprintf(logbuf, "Error: --perm cannot be used with --%s mperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model");
- goto main_ret_INVALID_CMDLINE_2A;
+ if (model_modifier & MODEL_MPERM) {
+ if (calculation_type & CALC_MODEL) {
+ sprintf(logbuf, "Error: --perm cannot be used with --%s mperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model");
+ goto main_ret_INVALID_CMDLINE_2A;
+ } else {
+ logprint("Error: --perm cannot be used with --mperm.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
} else if ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_MPERM | GLM_NO_SNP))) {
sprintf(logbuf, "Error: --perm cannot be used with --%s %s.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_MPERM)? "mperm" : "no-snp");
goto main_ret_INVALID_CMDLINE_2A;
- } else if (model_modifier & MODEL_MPERM) {
- logprint("Error: --perm cannot be used with --mperm.\n");
+ } else if (family_info.dfam_modifier & DFAM_MPERM) {
+ logprint("Error: --perm cannot be used with --dfam mperm.\n");
goto main_ret_INVALID_CMDLINE_A;
} else if (calculation_type & CALC_CMH) {
if (cluster.modifier & CLUSTER_CMH_MPERM) {
@@ -9494,6 +9604,7 @@ int32_t main(int32_t argc, char** argv) {
glm_modifier |= GLM_PERM;
testmiss_modifier |= TESTMISS_PERM;
family_info.tdt_modifier |= TDT_PERM;
+ family_info.dfam_modifier |= DFAM_PERM;
family_info.qfam_modifier |= QFAM_PERM;
cluster.modifier |= CLUSTER_CMH_PERM;
logprint("Note: --perm flag deprecated. Use e.g. '--model perm'.\n");
@@ -9502,6 +9613,8 @@ int32_t main(int32_t argc, char** argv) {
model_modifier |= MODEL_PERM_COUNT;
glm_modifier |= GLM_PERM_COUNT;
testmiss_modifier |= TESTMISS_PERM_COUNT;
+ family_info.tdt_modifier |= TDT_PERM_COUNT;
+ family_info.dfam_modifier |= DFAM_PERM_COUNT;
family_info.qfam_modifier |= QFAM_PERM_COUNT;
cluster.modifier |= CLUSTER_CMH_PERM_COUNT;
logprint("Note: --perm-count flag deprecated. Use e.g. '--model perm-count'.\n");
@@ -9526,7 +9639,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
pfilter = dxx;
- } else if (!memcmp(argptr2, "erm-batch-size", 1)) {
+ } else if (!memcmp(argptr2, "erm-batch-size", 15)) {
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -9815,11 +9928,13 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: Only one QFAM test can be run at a time.\n");
goto main_ret_INVALID_CMDLINE_A;
}
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 3)) {
goto main_ret_INVALID_CMDLINE_2A;
}
for (uii = 1; uii <= param_ct; uii++) {
- if (!strcmp(argv[cur_arg + uii], "perm")) {
+ if (!strcmp(argv[cur_arg + uii], "emp-se")) {
+ family_info.qfam_modifier |= QFAM_EMP_SE;
+ } else if (!strcmp(argv[cur_arg + uii], "perm")) {
if (family_info.qfam_modifier & QFAM_MPERM) {
sprintf(logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
goto main_ret_INVALID_CMDLINE_2A;
@@ -9834,7 +9949,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_2;
}
if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(family_info.qfam_mperm_val))) {
- sprintf(logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, argv[cur_arg + uii]);
+ sprintf(logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, &(argv[cur_arg + uii][6]));
goto main_ret_INVALID_CMDLINE_WWA;
}
family_info.qfam_modifier |= QFAM_MPERM;
@@ -10201,6 +10316,8 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
recode_modifier |= RECODE_DELIMX;
+ } else if (!strcmp(argv[cur_arg + uii], "bgz")) {
+ recode_modifier |= RECODE_BGZ;
} else if (!strcmp(argv[cur_arg + uii], "beagle")) {
if (recode_type_set(&recode_modifier, RECODE_BEAGLE)) {
goto main_ret_INVALID_CMDLINE_A;
@@ -10295,6 +10412,10 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --recode 'include-alt' modifier must be used with 'A' or 'AD'.\n");
goto main_ret_INVALID_CMDLINE_A;
}
+ if ((recode_modifier & RECODE_BGZ) && (!(recode_modifier & RECODE_VCF))) {
+ logprint("Error: --recode 'bgz' modifier must be used with VCF output.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
calculation_type |= CALC_RECODE;
} else if (!memcmp(argptr2, "ecode-whap", 11)) {
logprint("Error: --recode-whap flag retired since WHAP is no longer supported.\n");
@@ -10854,6 +10975,12 @@ int32_t main(int32_t argc, char** argv) {
}
glm_modifier |= GLM_SET_TEST;
}
+ if (calculation_type & CALC_TDT) {
+ family_info.tdt_modifier |= TDT_SET_TEST;
+ }
+ if (calculation_type & CALC_DFAM) {
+ family_info.dfam_modifier |= DFAM_SET_TEST;
+ }
if ((calculation_type & CALC_CMH) && (!(cluster.modifier & CLUSTER_CMH2))) {
cluster.modifier |= CLUSTER_CMH_SET_TEST;
}
@@ -11399,6 +11526,35 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_WWA;
}
filter_flags |= FILTER_BIM_REQ | FILTER_DOSAGEMAP | FILTER_NOCNV;
+ } else if (!memcmp(argptr2, "hin-indiv", 10)) {
+ UNSTABLE("thin-indiv");
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ if (scan_double(argv[cur_arg + 1], &thin_keep_sample_prob)) {
+ sprintf(logbuf, "Error: Invalid --thin-indiv %s retention probability '%s'.\n", g_species_singular, argv[cur_arg + 1]);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ if (thin_keep_sample_prob < (0.5 / 4294967296.0)) {
+ LOGPRINTF("Error: --thin-indiv %s retention probability too small.\n", g_species_singular);
+ goto main_ret_INVALID_CMDLINE_A;
+ } else if (thin_keep_sample_prob >= (4294967295.5 / 4294967296.0)) {
+ LOGPRINTF("Error: --thin-indiv %s retention probability too large.\n", g_species_singular);
+ goto main_ret_INVALID_CMDLINE_A;
+ }
+ } else if (!memcmp(argptr2, "hin-indiv-count", 16)) {
+ UNSTABLE("thin-indiv-count");
+ if (thin_keep_sample_prob != 1.0) {
+ logprint("Error: --thin-indiv cannot be used with --thin-indiv-count.\n");
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ if (scan_posint_defcap(argv[cur_arg + 1], &thin_keep_sample_ct)) {
+ sprintf(logbuf, "Error: Invalid --thin-indiv-count parameter '%s'.\n", argv[cur_arg + 1]);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
} else if (!memcmp(argptr2, "ests", 5)) {
if (!(calculation_type & CALC_GLM)) {
logprint("Error: --tests must be used with --linear or --logistic.\n");
@@ -11460,7 +11616,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &testmiss_mperm_val)) {
- sprintf(logbuf, "Error: Invalid --test-missing mperm parameter '%s'.\n", argv[cur_arg + uii]);
+ sprintf(logbuf, "Error: Invalid --test-missing mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
goto main_ret_INVALID_CMDLINE_WWA;
}
testmiss_modifier |= TESTMISS_MPERM;
@@ -11481,7 +11637,7 @@ int32_t main(int32_t argc, char** argv) {
calculation_type |= CALC_TESTMISHAP;
goto main_param_zero;
} else if (!memcmp(argptr2, "dt", 3)) {
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 4)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 5)) {
goto main_ret_INVALID_CMDLINE_2A;
}
for (uii = 1; uii <= param_ct; uii++) {
@@ -11515,6 +11671,8 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
family_info.tdt_modifier |= TDT_PERM;
+ } else if (!strcmp(argv[cur_arg + uii], "perm-count")) {
+ family_info.tdt_modifier |= TDT_PERM_COUNT;
} else if ((strlen(argv[cur_arg + uii]) > 6) && (!memcmp(argv[cur_arg + uii], "mperm=", 6))) {
if (family_info.tdt_modifier & TDT_PERM) {
logprint("Error: --tdt 'mperm' and 'perm' cannot be used together.\n");
@@ -11524,7 +11682,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &family_info.tdt_mperm_val)) {
- sprintf(logbuf, "Error: Invalid --tdt mperm parameter '%s'.\n", argv[cur_arg + uii]);
+ sprintf(logbuf, "Error: Invalid --tdt mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
goto main_ret_INVALID_CMDLINE_WWA;
}
family_info.tdt_modifier |= TDT_MPERM;
@@ -11634,7 +11792,7 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --unrelated-heritability requires " PROG_NAME_CAPS " to be built with LAPACK.\n");
goto main_ret_INVALID_CMDLINE;
#else
- UNSTABLE;
+ UNSTABLE("unrelated-heritability");
if (rel_info.modifier & REL_CALC_COV) {
logprint("Error: --unrelated-heritability flag cannot coexist with a covariance\nmatrix calculation.\n");
goto main_ret_INVALID_CMDLINE_A;
@@ -11760,7 +11918,7 @@ int32_t main(int32_t argc, char** argv) {
if (retval) {
goto main_ret_1;
}
- filter_flags |= FILTER_BIM_REQ;
+ filter_flags |= FILTER_FAM_REQ;
} else if (!memcmp(argptr2, "pdate-map", 10)) {
if (cnv_calc_type & CNV_MAKE_MAP) {
logprint("--update-map cannot be used with --cnv-make-map.\n");
@@ -12315,8 +12473,8 @@ int32_t main(int32_t argc, char** argv) {
calculation_type |= CALC_PLINK1_IBS_MATRIX;
}
if (calculation_type & CALC_PLINK1_IBS_MATRIX) {
- if (exponent != 0.0) {
- logprint("Error: --ibs-matrix cannot be used with --distance-exp.\n");
+ if (distance_wts_fname || (distance_exp != 0.0)) {
+ logprint("Error: --ibs-matrix cannot be used with --distance-wts.\n");
goto main_ret_INVALID_CMDLINE;
}
if (dist_calc_type & DISTANCE_IBS) {
@@ -12336,6 +12494,10 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
}
+ if (distance_wts_fname && (!(calculation_type & (CALC_DISTANCE | CALC_RELATIONSHIP)))) {
+ logprint("Error: --distance-wts must be used with --distance, --make-rel, --make-grm-bin,\nor --make-grm-gz.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
if ((parallel_tot > 1) && (!(calculation_type & (CALC_LD | CALC_DISTANCE | CALC_GENOME | CALC_RELATIONSHIP)))) {
if ((!(calculation_type & CALC_EPI)) || (!(epi_info.modifier & (EPI_FAST | EPI_REG)))) {
logprint("Error: --parallel only affects --r/--r2, --distance, --genome, --make-rel,\n--make-grm-gz/--make-grm-bin, and --epistasis/--fast-epistasis.\n");
@@ -12367,8 +12529,11 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
}
- if ((family_info.mendel_modifier & (MENDEL_DUOS | MENDEL_MULTIGEN)) && (!(calculation_type & CALC_MENDEL)) && (!(family_info.mendel_modifier & MENDEL_FILTER)) && (!(misc_flags & MISC_SET_ME_MISSING))) {
- logprint("Error: --mendel-duos/--mendel-multigen must be used with\n--me/--mendel/--set-me-missing.\n");
+ if ((family_info.mendel_modifier & MENDEL_DUOS) && (!(calculation_type & CALC_MENDEL)) && (!(family_info.mendel_modifier & MENDEL_FILTER)) && (!(misc_flags & MISC_SET_ME_MISSING))) {
+ logprint("Error: --mendel-duos must be used with --me/--mendel/--set-me-missing.\n");
+ goto main_ret_INVALID_CMDLINE;
+ } else if ((family_info.mendel_modifier & MENDEL_MULTIGEN) && (!(calculation_type & (CALC_MENDEL | CALC_TDT | CALC_DFAM | CALC_QFAM))) && (!(family_info.mendel_modifier & MENDEL_FILTER)) && (!(misc_flags & MISC_SET_ME_MISSING))) {
+ logprint("Error: --mendel-multigen must be used with --me, --mendel, --set-me-missing, or\nan association test which checks for Mendel errors.\n");
goto main_ret_INVALID_CMDLINE;
}
if (flip_subset_fname && (load_rare || (calculation_type != CALC_MAKE_BED) || (min_maf != 0.0) || (max_maf != 0.5) || (hwe_thresh != 0.0))) {
@@ -12519,8 +12684,18 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_1;
uii = 1;
}
+ if (family_info.dfam_modifier & DFAM_SET_TEST) {
+ if (!(family_info.dfam_modifier & (DFAM_PERM | DFAM_MPERM))) {
+ logprint("Error: --dfam set-test requires permutation.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
+ logprint("Error: --dfam set-test is currently under development.\n");
+ retval = RET_CALC_NOT_YET_SUPPORTED;
+ goto main_ret_1;
+ uii = 1;
+ }
if (cluster.modifier & CLUSTER_CMH_SET_TEST) {
- if (!(family_info.tdt_modifier & (TDT_PERM | TDT_MPERM))) {
+ if (!(cluster.modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_MPERM))) {
logprint("Error: --mh/--bd set-test requires permutation.\n");
goto main_ret_INVALID_CMDLINE_A;
}
@@ -12727,6 +12902,10 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --gen/--bgen cannot be used without --data or --sample.\n");
goto main_ret_INVALID_CMDLINE_A;
}
+ if ((merge_type & MERGE_EQUAL_POS) && (!(calculation_type & CALC_MERGE))) {
+ logprint("Error: --merge-equal-pos must be used with --merge/--bmerge/--merge-list.\n(Note that you are permitted to merge a fileset with itself.)\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
// short batch job?
uii = 0;
if ((!calculation_type) && (!(load_rare & (LOAD_RARE_LGEN | LOAD_RARE_DUMMY | LOAD_RARE_SIMULATE | LOAD_RARE_TRANSPOSE_MASK | LOAD_RARE_23 | LOAD_RARE_CNV | LOAD_RARE_VCF | LOAD_RARE_BCF)))) {
@@ -12860,6 +13039,7 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --dosage cannot be used with other PLINK computations.\n");
goto main_ret_INVALID_CMDLINE;
}
+ pigz_init(g_thread_ct);
retval = plink1_dosage(&dosage_info, famname, mapname, outname, outname_end, phenoname, extractname, excludename, keepname, removename, keepfamname, removefamname, filtername, makepheno_str, phenoname_str, covar_fname, qual_filter, update_map, update_name, update_ids_fname, update_parents_fname, update_sex_fname, filtervals_flattened, filter_attrib_fname, filter_attrib_liststr, filter_attrib_sample_fname, filter_attrib_sample_liststr, qual_min_thresh, qual_max_thresh, thin_keep_prob, [...]
// unconditional; note that plink1_dosage() currently doesn't even bother
// to pop stuff off the stack when it's done
@@ -12956,7 +13136,7 @@ int32_t main(int32_t argc, char** argv) {
} else if (!rel_info.ibc_type) {
rel_info.ibc_type = 1;
}
- retval = plink(outname, outname_end, pedname, mapname, famname, cm_map_fname, cm_map_chrname, phenoname, extractname, excludename, keepname, removename, keepfamname, removefamname, filtername, freqname, read_dists_fname, read_dists_id_fname, evecname, mergename1, mergename2, mergename3, missing_mid_template, missing_marker_id_match, makepheno_str, phenoname_str, a1alleles, a2alleles, recode_allele_name, covar_fname, update_alleles_fname, read_genome_fname, qual_filter, update_chr, up [...]
+ retval = plink(outname, outname_end, pedname, mapname, famname, cm_map_fname, cm_map_chrname, phenoname, extractname, excludename, keepname, removename, keepfamname, removefamname, filtername, freqname, distance_wts_fname, read_dists_fname, read_dists_id_fname, evecname, mergename1, mergename2, mergename3, missing_mid_template, missing_marker_id_match, makepheno_str, phenoname_str, a1alleles, a2alleles, recode_allele_name, covar_fname, update_alleles_fname, read_genome_fname, qual_fi [...]
}
while (0) {
main_ret_NOMEM:
@@ -13004,7 +13184,11 @@ int32_t main(int32_t argc, char** argv) {
#ifdef STABLE_BUILD
break;
main_unstable_disabled:
- logprint("Error: This flag's implementation is unfinished or unstable. If you wish to\ntest it, use the latest development build.\n");
+ // see the UNSTABLE macro in plink_common.h
+ memcpy(logbuf, "Error: --", 9);
+ strcpy(sptr, " is either unfinished or not yet well-tested. If you wish to help with testing, use the latest development build.\n");
+ wordwrap(logbuf, 0);
+ logprintb();
retval = RET_CALC_NOT_YET_SUPPORTED;
#endif
}
@@ -13026,6 +13210,7 @@ int32_t main(int32_t argc, char** argv) {
free_cond(filtervals_flattened);
free_cond(evecname);
free_cond(filtername);
+ free_cond(distance_wts_fname);
free_cond(read_dists_fname);
free_cond(read_dists_id_fname);
free_cond(freqname);
diff --git a/plink_assoc.c b/plink_assoc.c
index 9ef4a8e..dded1e3 100644
--- a/plink_assoc.c
+++ b/plink_assoc.c
@@ -810,7 +810,7 @@ void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_
memcpy(perm_vecst, wbuf, 16);
perm_vecst = &(perm_vecst[4]);
transpose_perms_loop_start:
- fill_ulong_zero((uintptr_t*)wbuf, 2);
+ fill_uint_zero(wbuf, 4);
wshift = 0;
}
wbptr = wbuf;
@@ -864,7 +864,7 @@ void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno
memcpy(perm_vecst, wbuf, 16);
perm_vecst = &(perm_vecst[4]);
transpose_perm1s_loop_start:
- fill_ulong_zero((uintptr_t*)wbuf, 2);
+ fill_uint_zero(wbuf, 2);
wshift = 0;
}
wbptr = wbuf;
@@ -6421,7 +6421,7 @@ THREAD_RET_TYPE model_set_best_thread(void* arg) {
}
}
-int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* outname_end2, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_inf [...]
+int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* outname_end2, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_inf [...]
// Could reuse more of the code in model_assoc() since there's considerable
// overlap, but there are enough differences between the regular and set
// permutation tests that separating this out and doing a fair bit of
@@ -8347,7 +8347,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
}
}
} else {
- retval = model_assoc_set_test(threads, bedfile, bed_offset, outname, outname_end, outname_end2, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, pheno_c, founder_pnm, gender_req, ld_ignore_x, hh_exists, perm_batch_size, sip, loadbuf_raw);
+ retval = model_assoc_set_test(threads, bedfile, bed_offset, outname, outname_end, outname_end2, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, founder_pnm, gender_req, ld_ignore_x, hh_exists, perm_batch_size, sip, loadbuf_raw);
if (retval) {
goto model_assoc_ret_1;
}
diff --git a/plink_calc.c b/plink_calc.c
index ef4e2a9..547c404 100644
--- a/plink_calc.c
+++ b/plink_calc.c
@@ -117,7 +117,7 @@ void rel_cleanup(Rel_info* relip) {
free_cond(relip->pca_clusters_fname);
}
-void update_rel_ibc(double* rel_ibc, uintptr_t* geno, double* set_allele_freqs, int32_t ibc_type, uint32_t sample_ct, uint32_t window_size) {
+void update_rel_ibc(double* rel_ibc, uintptr_t* geno, double* set_allele_freqs, double* main_weights, int32_t ibc_type, uint32_t sample_ct, uint32_t window_size) {
// first calculate weight array, then loop
uint32_t uii;
uint32_t ujj;
@@ -199,134 +199,10 @@ void update_rel_ibc(double* rel_ibc, uintptr_t* geno, double* set_allele_freqs,
}
}
}
- }
- for (ukk = 0; ukk < (BITCT * 5) / 32; ukk++) {
- wtptr = &(wtarr[16 * ukk]);
-#ifdef __LP64__
- if ((ukk == 2) || (ukk == 7)) {
- for (uii = 0; uii < 8; uii++) {
- twt = wtptr[uii + 8];
- for (ujj = 0; ujj < 8; ujj++) {
- *wptr++ = twt + wtptr[ujj];
- }
- wptr = &(wptr[8]);
- }
- } else {
- for (uii = 0; uii < 8; uii++) {
- twt = wtptr[uii + 8];
- for (ujj = 0; ujj < 8; ujj++) {
- *wptr++ = twt + wtptr[ujj];
- }
- }
- }
-#else
- if (ukk == 2) {
- for (uii = 0; uii < 8; uii++) {
- twt = wtptr[uii + 8];
- for (ujj = 0; ujj < 8; ujj++) {
- *wptr++ = twt + wtptr[ujj];
- }
- wptr = &(wptr[8]);
- }
- } else {
- for (uii = 0; uii < 8; uii++) {
- twt = wtptr[uii + 8];
- for (ujj = 0; ujj < 8; ujj++) {
- *wptr++ = twt + wtptr[ujj];
- }
- }
- }
-#endif
- }
- for (umm = 0; umm < sample_ct; umm++) {
- ulii = *geno++;
-#ifdef __LP64__
- *rel_ibc += weights9[ulii >> 57] + weights8[(ulii >> 51) & 63] + weights7[(ulii >> 44) & 127] + weights6[(ulii >> 38) & 63] + weights5[(ulii >> 32) & 63] + weights4[(ulii >> 25) & 63] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
-#else
- *rel_ibc += weights4[ulii >> 25] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
-#endif
- rel_ibc++;
- }
-}
-
-void update_rel_f_ibc(float* rel_ibc, uintptr_t* geno, float* set_allele_freqs, int32_t ibc_type, uint32_t sample_ct, uint32_t window_size) {
- // first calculate weight array, then loop
- uint32_t uii;
- uint32_t ujj;
- uint32_t ukk;
- uint32_t umm;
- float twt;
- float* wtptr;
- float mult = 1.0;
- uintptr_t ulii;
- float weights[BITCT * 12];
- float* weights1 = &(weights[64]);
- float* weights2 = &(weights[128]);
- float* weights3 = &(weights[256]);
- float* weights4 = &(weights[320]);
-#ifdef __LP64__
- float* weights5 = &(weights[384]);
- float* weights6 = &(weights[448]);
- float* weights7 = &(weights[512]);
- float* weights8 = &(weights[640]);
- float* weights9 = &(weights[704]);
-#endif
- float wtarr[BITCT2 * 5];
- float *wptr = weights;
- fill_float_zero(wtarr, BITCT2 * 5);
- for (uii = 0; uii < window_size; uii += 1) {
- if ((set_allele_freqs[uii] != 0.0) && (set_allele_freqs[uii] < (1.0 - EPSILON))) {
- if (ibc_type) {
- if (ibc_type == 2) {
- wtarr[uii * 8] = 2;
- wtarr[uii * 8 + 2] = 2.0 - 1.0 / (2 * set_allele_freqs[uii] * (1.0 - set_allele_freqs[uii]));
- wtarr[uii * 8 + 3] = 2;
- } else {
- twt = 2 * set_allele_freqs[uii];
- if (ibc_type == 1) {
- mult = 1 / (twt * (1.0 - set_allele_freqs[uii]));
- }
- wtarr[uii * 8] = twt * twt * mult;
- wtarr[uii * 8 + 2] = (1.0 - twt) * (1.0 - twt) * mult;
- wtarr[uii * 8 + 3] = (2.0 - twt) * (2.0 - twt) * mult;
- }
- } else {
- twt = 1.0 - set_allele_freqs[uii];
- mult = 1 / (set_allele_freqs[uii] * twt);
- wtarr[uii * 8] = 1.0 + set_allele_freqs[uii] * set_allele_freqs[uii] * mult;
- wtarr[uii * 8 + 3] = 1.0 + twt * twt * mult;
- }
- } else {
- if (ibc_type) {
- if (ibc_type == -1) {
- twt = 2 * set_allele_freqs[uii];
- wtarr[uii * 8] = twt * twt;
- wtarr[uii * 8 + 2] = (1.0 - twt) * (1.0 - twt);
- wtarr[uii * 8 + 3] = (2.0 - twt) * (2.0 - twt);
- } else if (ibc_type == 1) {
- wtarr[uii * 8 + 2] = INFINITY;
- if (set_allele_freqs[uii] == 0.0) {
- wtarr[uii * 8] = 0;
- wtarr[uii * 8 + 3] = INFINITY;
- } else {
- wtarr[uii * 8] = INFINITY;
- wtarr[uii * 8 + 3] = 0;
- }
- } else {
- // need to set to 1 instead of 2 for agreement with GCTA
- wtarr[uii * 8] = 1;
- wtarr[uii * 8 + 2] = -INFINITY;
- wtarr[uii * 8 + 3] = 1;
- }
- } else {
- if (set_allele_freqs[uii] == 0.0) {
- wtarr[uii * 8] = 1;
- wtarr[uii * 8 + 3] = INFINITY;
- } else {
- wtarr[uii * 8] = INFINITY;
- wtarr[uii * 8 + 3] = 1;
- }
- }
+ if (main_weights) {
+ wtarr[uii * 8] *= main_weights[uii];
+ wtarr[uii * 8 + 2] *= main_weights[uii];
+ wtarr[uii * 8 + 3] *= main_weights[uii];
}
}
for (ukk = 0; ukk < (BITCT * 5) / 32; ukk++) {
@@ -378,7 +254,7 @@ void update_rel_f_ibc(float* rel_ibc, uintptr_t* geno, float* set_allele_freqs,
}
}
-void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
+void fill_subset_weights(double* subset_weights, double* main_weights) {
uint32_t uii;
uint32_t ujj;
uint32_t ukk;
@@ -390,7 +266,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
#ifdef __LP64__
double twt[5];
double twtf;
- __m128d* wpairs = (__m128d*)weights;
+ __m128d* swpairs = (__m128d*)subset_weights;
__m128d vpen;
__m128d vfinal1;
__m128d vfinal2;
@@ -399,9 +275,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
uint32_t uqq;
double twt[7];
#endif
- for (uii = 0; uii < MULTIPLEX_DIST_EXP / 2; uii++) {
- wtarr[uii] = pow(2 * set_allele_freqs[uii] * (1.0 - set_allele_freqs[uii]), -exponent);
- }
+ memcpy(wtarr, main_weights, (MULTIPLEX_DIST_EXP / 2) * sizeof(double));
for (uoo = 0; uoo < 2; uoo++) {
wt = &(wtarr[7 * uoo]);
#ifdef __LP64__
@@ -438,19 +312,19 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
#ifdef __LP64__
twtf = twt[4];
vpen = _mm_set1_pd(twtf);
- *wpairs++ = _mm_add_pd(vpen, vfinal1);
- *wpairs++ = _mm_add_pd(vpen, vfinal2);
+ *swpairs++ = _mm_add_pd(vpen, vfinal1);
+ *swpairs++ = _mm_add_pd(vpen, vfinal2);
twtf += wt[1];
vpen = _mm_set1_pd(twtf);
- *wpairs++ = _mm_add_pd(vpen, vfinal1);
- *wpairs++ = _mm_add_pd(vpen, vfinal2);
- *wpairs = *(wpairs - 2);
- wpairs++;
- *wpairs = *(wpairs - 2);
- wpairs++;
+ *swpairs++ = _mm_add_pd(vpen, vfinal1);
+ *swpairs++ = _mm_add_pd(vpen, vfinal2);
+ *swpairs = *(swpairs - 2);
+ swpairs++;
+ *swpairs = *(swpairs - 2);
+ swpairs++;
vpen = _mm_set1_pd(twtf + wt[1]);
- *wpairs++ = _mm_add_pd(vpen, vfinal1);
- *wpairs++ = _mm_add_pd(vpen, vfinal2);
+ *swpairs++ = _mm_add_pd(vpen, vfinal1);
+ *swpairs++ = _mm_add_pd(vpen, vfinal2);
#else
twt[5] = twt[4];
for (upp = 0; upp < 4; upp++) {
@@ -462,7 +336,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
if (uqq & 1) {
twt[6] += wt[0];
}
- *weights++ = twt[6];
+ *subset_weights++ = twt[6];
}
}
#endif
@@ -499,19 +373,19 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
}
twtf = twt[3];
vpen = _mm_set1_pd(twtf);
- *wpairs++ = _mm_add_pd(vpen, vfinal1);
- *wpairs++ = _mm_add_pd(vpen, vfinal2);
+ *swpairs++ = _mm_add_pd(vpen, vfinal1);
+ *swpairs++ = _mm_add_pd(vpen, vfinal2);
twtf += wt[1];
vpen = _mm_set1_pd(twtf);
- *wpairs++ = _mm_add_pd(vpen, vfinal1);
- *wpairs++ = _mm_add_pd(vpen, vfinal2);
- *wpairs = *(wpairs - 2);
- wpairs++;
- *wpairs = *(wpairs - 2);
- wpairs++;
+ *swpairs++ = _mm_add_pd(vpen, vfinal1);
+ *swpairs++ = _mm_add_pd(vpen, vfinal2);
+ *swpairs = *(swpairs - 2);
+ swpairs++;
+ *swpairs = *(swpairs - 2);
+ swpairs++;
vpen = _mm_set1_pd(twtf + wt[1]);
- *wpairs++ = _mm_add_pd(vpen, vfinal1);
- *wpairs++ = _mm_add_pd(vpen, vfinal2);
+ *swpairs++ = _mm_add_pd(vpen, vfinal1);
+ *swpairs++ = _mm_add_pd(vpen, vfinal2);
}
}
}
@@ -520,7 +394,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
#endif
}
-void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std) {
+void fill_subset_weights_r(double* subset_weights, double* set_allele_freqs, double* main_weights, uint32_t var_std) {
uint32_t uii;
uint32_t ujj;
uint32_t ukk;
@@ -528,7 +402,7 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
uint32_t unn;
// 20 markers to process in quintuplets, for 64-bit; 10, for 32-bit.
// Each quintuplet of markers requires 40 wtarr entries, and induces
- // 2^15 writes to weights[].
+ // 2^15 writes to subset_weights[].
double wtarr_raw[BITCT2 * 5 + 1];
double* wtarr = wtarr_raw;
double twt;
@@ -542,7 +416,7 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
double mult = 1.0;
double aux;
#ifdef __LP64__
- __m128d* wpairs = (__m128d*)weights;
+ __m128d* swpairs = (__m128d*)subset_weights;
__m128d vpen;
__m128d vfinal1;
__m128d vfinal2;
@@ -609,6 +483,11 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
wtarr[uii * 8 + 6] = 0;
}
}
+ if (main_weights) {
+ for (ujj = 0; ujj < 7; ujj++) {
+ wtarr[uii * 8 + ujj] *= main_weights[uii];
+ }
+ }
wtarr[uii * 8 + 7] = 0;
}
for (unn = 0; unn < BITCT / 16; unn++) {
@@ -629,133 +508,13 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
twt4 = twt3 + wtptr[umm + 8];
#ifdef __LP64__
vpen = _mm_set1_pd(twt4);
- *wpairs++ = _mm_add_pd(vpen, vfinal1);
- *wpairs++ = _mm_add_pd(vpen, vfinal2);
- *wpairs++ = _mm_add_pd(vpen, vfinal3);
- *wpairs++ = _mm_add_pd(vpen, vfinal4);
+ *swpairs++ = _mm_add_pd(vpen, vfinal1);
+ *swpairs++ = _mm_add_pd(vpen, vfinal2);
+ *swpairs++ = _mm_add_pd(vpen, vfinal3);
+ *swpairs++ = _mm_add_pd(vpen, vfinal4);
#else
for (uoo = 0; uoo < 8; uoo++) {
- *weights++ = twt4 + wtptr[uoo];
- }
-#endif
- }
- }
- }
- }
- }
-}
-
-void fill_weights_r_f(float* weights_f, float* set_allele_freqs_f, uint32_t var_std) {
- uint32_t uii;
- uint32_t ujj;
- uint32_t ukk;
- uint32_t umm;
- uint32_t unn;
- // 20 markers to process in quintuplets, for 64-bit; 10, for 32-bit.
- // Each quintuplet of markers requires 40 wtarr entries, and induces
- // 2^15 writes to weights_f[].
- float wtarr_raw[BITCT2 * 5 + 3];
- float* wtarr = wtarr_raw;
- float twt;
- float twt2;
- float twt3;
- float twt4;
- float* wtptr;
- float mean;
- float mean_m1;
- float mean_m2;
- float mult = 1.0;
- float aux;
-#ifdef __LP64__
- __m128* wquads = (__m128*)weights_f;
- __m128 vpen;
- __m128 vfinal1;
- __m128 vfinal2;
-#else
- uint32_t uoo;
-#endif
- uii = (((uintptr_t)wtarr) & 15);
- if (uii) {
- // force 16-byte alignment; can't do this at compile-time since stack
- // pointer has no 16-byte align guarantee.
- // yes, this assumes floats are 4 bytes.
- wtarr = &(wtarr[4 - (uii / 4)]);
- }
- for (uii = 0; uii < MULTIPLEX_REL / 3; uii += 1) {
- if (((set_allele_freqs_f[uii] != 0.0) && (set_allele_freqs_f[uii] < (1.0 - EPSILON))) || (!var_std)) {
- if (set_allele_freqs_f[uii] < 0.5) {
- mean = 2 * set_allele_freqs_f[uii];
- mean_m1 = mean - 1.0;
- mean_m2 = mean - 2.0;
- if (var_std) {
- mult = 1 / (mean * (1.0 - set_allele_freqs_f[uii]));
- }
- aux = mean * mult;
- wtarr[uii * 8] = mean * aux;
- wtarr[uii * 8 + 1] = 0;
- wtarr[uii * 8 + 2] = mean_m1 * aux;
- wtarr[uii * 8 + 3] = mean_m2 * aux;
- wtarr[uii * 8 + 4] = mean_m1 * mean_m1 * mult;
- wtarr[uii * 8 + 5] = mean_m2 * mean_m1 * mult;
- wtarr[uii * 8 + 6] = mean_m2 * mean_m2 * mult;
- } else {
- mean = 2 * (1.0 - set_allele_freqs_f[uii]);
- mean_m1 = mean - 1.0;
- mean_m2 = mean - 2.0;
- if (var_std) {
- mult = 1 / (mean * set_allele_freqs_f[uii]);
- }
- aux = mean_m2 * mult;
- wtarr[uii * 8] = mean_m2 * aux;
- wtarr[uii * 8 + 1] = 0;
- wtarr[uii * 8 + 2] = mean_m1 * aux;
- wtarr[uii * 8 + 3] = mean * aux;
- wtarr[uii * 8 + 4] = mean_m1 * mean_m1 * mult;
- wtarr[uii * 8 + 5] = mean_m1 * mean * mult;
- wtarr[uii * 8 + 6] = mean * mean * mult;
- }
- } else {
- if (set_allele_freqs_f[uii] == 0.0) {
- wtarr[uii * 8] = 0;
- wtarr[uii * 8 + 1] = 0;
- wtarr[uii * 8 + 2] = -1;
- wtarr[uii * 8 + 3] = -2;
- wtarr[uii * 8 + 4] = INFINITY;
- wtarr[uii * 8 + 5] = INFINITY;
- wtarr[uii * 8 + 6] = INFINITY;
- } else {
- wtarr[uii * 8] = INFINITY;
- wtarr[uii * 8 + 1] = 0;
- wtarr[uii * 8 + 2] = INFINITY;
- wtarr[uii * 8 + 3] = -2;
- wtarr[uii * 8 + 4] = INFINITY;
- wtarr[uii * 8 + 5] = -1;
- wtarr[uii * 8 + 6] = 0;
- }
- }
- wtarr[uii * 8 + 7] = 0;
- }
- for (unn = 0; unn < BITCT / 16; unn++) {
- wtptr = &(wtarr[40 * unn]);
-#ifdef __LP64__
- vfinal1 = _mm_load_ps(wtptr);
- vfinal2 = _mm_load_ps(&(wtptr[4]));
-#endif
- for (uii = 0; uii < 8; uii++) {
- twt = wtptr[uii + 32];
- for (ujj = 0; ujj < 8; ujj++) {
- twt2 = twt + wtptr[ujj + 24];
- for (ukk = 0; ukk < 8; ukk++) {
- twt3 = twt2 + wtptr[ukk + 16];
- for (umm = 0; umm < 8; umm++) {
- twt4 = twt3 + wtptr[umm + 8];
-#ifdef __LP64__
- vpen = _mm_set1_ps(twt4);
- *wquads++ = _mm_add_ps(vpen, vfinal1);
- *wquads++ = _mm_add_ps(vpen, vfinal2);
-#else
- for (uoo = 0; uoo < 8; uoo++) {
- *weights_f++ = twt4 + wtptr[uoo];
+ *subset_weights++ = twt4 + wtptr[uoo];
}
#endif
}
@@ -968,8 +727,8 @@ static int32_t* g_idists;
static uintptr_t* g_pheno_nm = NULL;
static uintptr_t* g_pheno_c = NULL;
static unsigned char* g_geno = NULL;
-static double* g_weights;
-static uint32_t* g_weights_i;
+static double* g_subset_weights;
+static uint32_t* g_subset_weights_i;
static double g_reg_tot_xy;
static double g_reg_tot_x;
static double g_reg_tot_y;
@@ -1262,7 +1021,7 @@ void incr_dists_i(uint32_t* idists, uintptr_t* geno, uintptr_t* masks, uint32_t
}
}
-void incr_wt_dist_missing(uint32_t* mtw, uint32_t* weights_i, uintptr_t* mmasks, uint32_t start_idx, uint32_t end_idx) {
+void incr_wt_dist_missing(uint32_t* mtw, uint32_t* subset_weights_i, uintptr_t* mmasks, uint32_t start_idx, uint32_t end_idx) {
uintptr_t* glptr;
uintptr_t ulii;
uintptr_t uljj;
@@ -1275,7 +1034,7 @@ void incr_wt_dist_missing(uint32_t* mtw, uint32_t* weights_i, uintptr_t* mmasks,
for (ujj = 0; ujj < uii; ujj++) {
uljj = (*glptr++) & ulii;
while (uljj) {
- mtw[ujj] += weights_i[CTZLU(uljj)];
+ mtw[ujj] += subset_weights_i[CTZLU(uljj)];
uljj &= uljj - 1;
}
}
@@ -1328,8 +1087,8 @@ THREAD_RET_TYPE calc_ibs_thread(void* arg) {
while (1) {
is_last_block = g_is_last_thread_block;
if (weighted_missing_ptr) {
- // g_weights_i moves around
- incr_wt_dist_missing(weighted_missing_ptr, g_weights_i, mmasks_ptr, ulii, end_idx);
+ // g_subset_weights_i moves around
+ incr_wt_dist_missing(weighted_missing_ptr, g_subset_weights_i, mmasks_ptr, ulii, end_idx);
}
if (flat_missing_ptr) {
incr_dists_rm(flat_missing_ptr, mmasks_ptr, ulii, end_idx);
@@ -1843,17 +1602,17 @@ THREAD_RET_TYPE calc_wdist_thread(void* arg) {
uintptr_t* geno_ptr = (uintptr_t*)g_geno;
uintptr_t* masks_ptr = g_masks;
uintptr_t* mmasks_ptr = g_mmasks;
- double* weights_ptr = g_weights;
- uint32_t* weights_i_ptr = g_weights_i;
+ double* subset_weights_ptr = g_subset_weights;
+ uint32_t* subset_weights_i_ptr = g_subset_weights_i;
uint32_t* weighted_missing_ptr = &(g_missing_tot_weights[offset]);
uint32_t end_idx = g_thread_start[tidx + 1];
uint32_t is_last_block;
while (1) {
is_last_block = g_is_last_thread_block;
- incr_dists(dists_ptr, geno_ptr, masks_ptr, weights_ptr, ulii, end_idx);
+ incr_dists(dists_ptr, geno_ptr, masks_ptr, subset_weights_ptr, ulii, end_idx);
if (is_last_block || (g_thread_spawn_ct & 1)) {
- // weights_i is stationary here
- incr_wt_dist_missing(weighted_missing_ptr, weights_i_ptr, mmasks_ptr, ulii, end_idx);
+ // subset_weights_i is stationary here
+ incr_wt_dist_missing(weighted_missing_ptr, subset_weights_i_ptr, mmasks_ptr, ulii, end_idx);
}
if ((!tidx) || is_last_block) {
THREAD_RETURN;
@@ -1914,12 +1673,12 @@ THREAD_RET_TYPE calc_rel_thread(void* arg) {
uintptr_t* masks_ptr = g_masks;
uintptr_t* mmasks_ptr = g_mmasks;
uint32_t* missing_ptr = &(g_missing_dbl_excluded[offset]);
- double* weights_ptr = g_weights;
+ double* subset_weights_ptr = g_subset_weights;
uint32_t end_idx = g_thread_start[tidx + 1];
uint32_t is_last_block;
while (1) {
is_last_block = g_is_last_thread_block;
- incr_dists_r(rel_ptr, geno_ptr, masks_ptr, (uint32_t)tidx, weights_ptr);
+ incr_dists_r(rel_ptr, geno_ptr, masks_ptr, (uint32_t)tidx, subset_weights_ptr);
if (is_last_block || ((g_thread_spawn_ct % 3) == 2)) {
incr_dists_rm(missing_ptr, mmasks_ptr, ulii, end_idx);
}
@@ -1930,45 +1689,30 @@ THREAD_RET_TYPE calc_rel_thread(void* arg) {
}
}
-void incr_dists_r_f(float* dists_f, uintptr_t* geno, uintptr_t* masks, float* weights_f, uint32_t start_idx, uint32_t end_idx) {
- uintptr_t* glptr;
- uintptr_t* maskptr;
- uintptr_t ulii;
- uintptr_t uljj;
- uintptr_t basemask;
- float* weights1 = &(weights_f[32768]);
-#ifdef __LP64__
- float* weights2 = &(weights_f[65536]);
- float* weights3 = &(weights_f[98304]);
-#endif
- uint32_t uii;
- uint32_t ujj;
- for (uii = start_idx; uii < end_idx; uii++) {
- glptr = geno;
- ulii = geno[uii];
- maskptr = masks;
- basemask = masks[uii];
- if (!basemask) {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = ((*glptr++) + ulii) | (*maskptr++);
-#ifdef __LP64__
- *dists_f += weights_f[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
-#else
- *dists_f += weights_f[(uint16_t)uljj] + weights1[uljj >> 16];
-#endif
- dists_f++;
- }
- } else {
- for (ujj = 0; ujj < uii; ujj++) {
- uljj = ((*glptr++) + ulii) | ((*maskptr++) | basemask);
-#ifdef __LP64__
- *dists_f += weights_f[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
-#else
- *dists_f += weights_f[(uint16_t)uljj] + weights1[uljj >> 16];
-#endif
- dists_f++;
- }
+THREAD_RET_TYPE calc_wt_rel_thread(void* arg) { // worker thread entry point; arg carries the thread index
+ // this needs more work (upstream note; as of this patch the body appears to match the visible part of calc_rel_thread)
+ uintptr_t tidx = (uintptr_t)arg; // thread index is passed by value through the void* argument
+ uintptr_t ulii = g_thread_start[tidx]; // first index handled by this thread
+ uintptr_t uljj = g_thread_start[0]; // first index handled by thread 0
+ uintptr_t offset = (((uint64_t)ulii) * (ulii - 1) - ((uint64_t)uljj) * (uljj - 1)) / 2; // ulii*(ulii-1)/2 minus the same term for thread 0's start; used below as this thread's starting element in both shared arrays
+ double* rel_ptr = &(g_rel_dists[offset]);
+ uintptr_t* geno_ptr = (uintptr_t*)g_geno;
+ uintptr_t* masks_ptr = g_masks;
+ uintptr_t* mmasks_ptr = g_mmasks;
+ uint32_t* missing_ptr = &(g_missing_dbl_excluded[offset]);
+ double* subset_weights_ptr = g_subset_weights;
+ uint32_t end_idx = g_thread_start[tidx + 1]; // one past the last index handled by this thread
+ uint32_t is_last_block;
+ while (1) { // one pass per block; exit conditions are checked at the bottom
+ is_last_block = g_is_last_thread_block; // sample the shared flag once, before doing this block's work
+ incr_dists_r(rel_ptr, geno_ptr, masks_ptr, (uint32_t)tidx, subset_weights_ptr);
+ if (is_last_block || ((g_thread_spawn_ct % 3) == 2)) { // missing-count update runs only on the last block or when the spawn counter is 2 mod 3
+ incr_dists_rm(missing_ptr, mmasks_ptr, ulii, end_idx);
+ }
+ if ((!tidx) || is_last_block) { // thread 0 returns after every block; other threads return only after the last block
+ THREAD_RETURN;
}
+ THREAD_BLOCK_FINISH(tidx); // NOTE(review): presumably signals completion and waits for the next block -- confirm against the macro definition
}
}
@@ -4477,6 +4221,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
int32_t write_ibs_matrix = dist_calc_type & DISTANCE_IBS;
int32_t write_1mibs_matrix = dist_calc_type & DISTANCE_1_MINUS_IBS;
int32_t retval = 0;
+ unsigned char overflow_buf[262144];
double dxx;
double dyy;
double* dist_ptr;
@@ -4824,11 +4569,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
sprintf(outname_end, ".dist.gz");
}
if (shape == DISTANCE_SQ) {
- parallel_compress(outname, 0, distance_d_write_sq_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_sq_emitn);
} else if (shape == DISTANCE_SQ0) {
- parallel_compress(outname, 0, distance_d_write_sq0_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_sq0_emitn);
} else {
- parallel_compress(outname, 0, distance_d_write_tri_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_tri_emitn);
}
} else {
if (parallel_tot > 1) {
@@ -4837,11 +4582,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
sprintf(outname_end, ".dist");
}
if (shape == DISTANCE_SQ) {
- retval = write_uncompressed(outname, 0, distance_d_write_sq_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_sq_emitn);
} else if (shape == DISTANCE_SQ0) {
- retval = write_uncompressed(outname, 0, distance_d_write_sq0_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_sq0_emitn);
} else {
- retval = write_uncompressed(outname, 0, distance_d_write_tri_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_tri_emitn);
}
if (retval) {
goto distance_d_write_ret_1;
@@ -4862,11 +4607,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
sprintf(outname_end, ".mdist.gz");
}
if (shape == DISTANCE_SQ) {
- parallel_compress(outname, 0, distance_d_write_1mibs_sq_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_1mibs_sq_emitn);
} else if (shape == DISTANCE_SQ0) {
- parallel_compress(outname, 0, distance_d_write_1mibs_sq0_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_1mibs_sq0_emitn);
} else {
- parallel_compress(outname, 0, distance_d_write_1mibs_tri_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_1mibs_tri_emitn);
}
} else {
if (parallel_tot > 1) {
@@ -4875,11 +4620,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
sprintf(outname_end, ".mdist");
}
if (shape == DISTANCE_SQ) {
- retval = write_uncompressed(outname, 0, distance_d_write_1mibs_sq_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_1mibs_sq_emitn);
} else if (shape == DISTANCE_SQ0) {
- retval = write_uncompressed(outname, 0, distance_d_write_1mibs_sq0_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_1mibs_sq0_emitn);
} else {
- retval = write_uncompressed(outname, 0, distance_d_write_1mibs_tri_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_1mibs_tri_emitn);
}
if (retval) {
goto distance_d_write_ret_1;
@@ -4902,11 +4647,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
sprintf(outname_end, ".mibs.gz");
}
if (shape == DISTANCE_SQ) {
- parallel_compress(outname, 0, distance_d_write_ibs_sq_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_ibs_sq_emitn);
} else if (shape == DISTANCE_SQ0) {
- parallel_compress(outname, 0, distance_d_write_ibs_sq0_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_ibs_sq0_emitn);
} else {
- parallel_compress(outname, 0, distance_d_write_ibs_tri_emitn);
+ parallel_compress(outname, overflow_buf, 0, distance_d_write_ibs_tri_emitn);
}
} else {
if (parallel_tot > 1) {
@@ -4915,11 +4660,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
sprintf(outname_end, ".mibs");
}
if (shape == DISTANCE_SQ) {
- retval = write_uncompressed(outname, 0, distance_d_write_ibs_sq_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_ibs_sq_emitn);
} else if (shape == DISTANCE_SQ0) {
- retval = write_uncompressed(outname, 0, distance_d_write_ibs_sq0_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_ibs_sq0_emitn);
} else {
- retval = write_uncompressed(outname, 0, distance_d_write_ibs_tri_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_ibs_tri_emitn);
}
if (retval) {
goto distance_d_write_ret_1;
@@ -5239,6 +4984,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
int32_t missing_ct_buf[BITCT];
double set_allele_freq_buf[GENOME_MULTIPLEX];
uint32_t nchrobs_buf[GENOME_MULTIPLEX];
+ unsigned char* overflow_buf;
unsigned char* gptr;
char* cptr;
uintptr_t* geno;
@@ -5337,7 +5083,8 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
wkspace_alloc_ul_checked(&masks, sample_ct * (GENOME_MULTIPLEX / 4)) ||
wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t)) ||
wkspace_alloc_c_checked(&g_cg_fam1, plink_maxfid + 1) ||
- wkspace_alloc_c_checked(&g_cg_fam2, plink_maxfid + 1)) {
+ wkspace_alloc_c_checked(&g_cg_fam2, plink_maxfid + 1) ||
+ wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
goto calc_genome_ret_NOMEM;
}
@@ -5719,14 +5466,14 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
} else {
strcpy(outname_end, ".genome.gz");
}
- parallel_compress(outname, 0, calc_genome_emitn);
+ parallel_compress(outname, overflow_buf, 0, calc_genome_emitn);
} else {
if (parallel_tot > 1) {
sprintf(outname_end, ".genome.%d", parallel_idx + 1);
} else {
strcpy(outname_end, ".genome");
}
- retval = write_uncompressed(outname, 0, calc_genome_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, calc_genome_emitn);
if (retval) {
goto calc_genome_ret_1;
}
@@ -6119,6 +5866,7 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
uint32_t rel_calc_type = relip->modifier & REL_CALC_MASK;
uintptr_t* compact_rel_table;
uintptr_t* rtptr;
+ unsigned char* overflow_buf;
char* bufptr;
uint64_t ullii;
uint64_t ulljj;
@@ -6185,7 +5933,8 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
goto rel_cutoff_batch_ret_NOMEM;
}
fill_ulong_zero(compact_rel_table, tot_words);
- if (wkspace_alloc_i_checked(&rel_ct_arr, sample_ct * sizeof(int32_t))) {
+ if (wkspace_alloc_i_checked(&rel_ct_arr, sample_ct * sizeof(int32_t)) ||
+ wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
goto rel_cutoff_batch_ret_NOMEM;
}
fill_int_zero(rel_ct_arr, sample_ct);
@@ -6539,10 +6288,10 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
if (load_grm_bin) {
if (rel_calc_type & REL_CALC_GZ) {
memcpy(outname_end, ".grm.gz", 8);
- parallel_compress(outname, 0, rel_cutoff_batch_rbin_emitn);
+ parallel_compress(outname, overflow_buf, 0, rel_cutoff_batch_rbin_emitn);
} else {
memcpy(outname_end, ".grm", 5);
- retval = write_uncompressed(outname, 0, rel_cutoff_batch_rbin_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, rel_cutoff_batch_rbin_emitn);
if (retval) {
goto rel_cutoff_batch_ret_1;
}
@@ -6550,10 +6299,10 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
} else {
if (rel_calc_type & REL_CALC_GZ) {
memcpy(outname_end, ".grm.gz", 8);
- parallel_compress(outname, 0, rel_cutoff_batch_emitn);
+ parallel_compress(outname, overflow_buf, 0, rel_cutoff_batch_emitn);
} else {
memcpy(outname_end, ".grm", 5);
- retval = write_uncompressed(outname, 0, rel_cutoff_batch_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, rel_cutoff_batch_emitn);
if (retval) {
goto rel_cutoff_batch_ret_1;
}
@@ -6879,7 +6628,197 @@ uint32_t calc_rel_grm_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
return (uintptr_t)(((unsigned char*)sptr_cur) - readbuf);
}
-int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, double* set_allele_freqs, double** rel_ibc_ptr, [...]
+// Reads the next batch of non-excluded variants from bedfile into readbuf.
+//
+// At most block_max_size variants are read, and never more than the
+// (marker_ct - *marker_idx_ptr) variants that remain.  Each variant occupies
+// unfiltered_sample_ct4 consecutive bytes of readbuf.  The file is only
+// repositioned when the current variant is flagged in marker_exclude, so the
+// caller is expected to have left the file position at variant
+// *marker_uidx_ptr on entry.
+//
+// On success, returns 0 after advancing *marker_uidx_ptr and *marker_idx_ptr
+// past the variants read and storing their count in *block_size_ptr.  On a
+// seek or read failure, returns RET_READ_FAIL and leaves all three outputs
+// untouched.
+uint32_t block_load(FILE* bedfile, int32_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct, uint32_t block_max_size, uintptr_t unfiltered_sample_ct4, unsigned char* readbuf, uintptr_t* marker_uidx_ptr, uintptr_t* marker_idx_ptr, uint32_t* block_size_ptr) {
+  const uintptr_t first_idx = *marker_idx_ptr;
+  const uintptr_t markers_left = marker_ct - first_idx;
+  uintptr_t cur_uidx = *marker_uidx_ptr;
+  unsigned char* write_pos = readbuf;
+  uint32_t loaded;
+  if (block_max_size > markers_left) {
+    block_max_size = markers_left;
+  }
+  for (loaded = 0; loaded < block_max_size; loaded++, cur_uidx++) {
+    if (IS_SET(marker_exclude, cur_uidx)) {
+      // skip the run of excluded variants, then jump the file past them
+      cur_uidx = next_unset_ul_unsafe(marker_exclude, cur_uidx);
+      if (fseeko(bedfile, bed_offset + ((uint64_t)cur_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+        return RET_READ_FAIL;
+      }
+    }
+    if (fread(write_pos, 1, unfiltered_sample_ct4, bedfile) < unfiltered_sample_ct4) {
+      return RET_READ_FAIL;
+    }
+    write_pos = &(write_pos[unfiltered_sample_ct4]);
+  }
+
+  *marker_uidx_ptr = cur_uidx;
+  *marker_idx_ptr = first_idx + loaded;
+  *block_size_ptr = loaded;
+  return 0;
+}
+
+// Copies the allele frequencies of the next batch of non-excluded variants
+// into set_allele_freq_buf.
+//
+// The batch starts at unfiltered index marker_uidx and holds at most
+// block_max_size entries, capped at the (marker_ct - marker_idx) variants
+// that remain.  When marker_reverse is non-NULL, a variant whose bit is set
+// in it gets 1.0 minus its stored frequency instead.
+void copy_set_allele_freqs(uintptr_t marker_uidx, uintptr_t* marker_exclude, uint32_t block_max_size, uintptr_t marker_idx, uint32_t marker_ct, uintptr_t* marker_reverse, double* set_allele_freqs, double* set_allele_freq_buf) {
+  const uintptr_t markers_left = marker_ct - marker_idx;
+  uint32_t buf_idx;
+  double cur_freq;
+  if (block_max_size > markers_left) {
+    block_max_size = markers_left;
+  }
+  for (buf_idx = 0; buf_idx < block_max_size; buf_idx++, marker_uidx++) {
+    next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+    cur_freq = set_allele_freqs[marker_uidx];
+    if (marker_reverse && IS_SET(marker_reverse, marker_uidx)) {
+      cur_freq = 1.0 - cur_freq;
+    }
+    set_allele_freq_buf[buf_idx] = cur_freq;
+  }
+}
+
+// Loads per-variant weights for --distance-wts.
+//
+// Every non-blank line of distance_wts_fname holds a variant ID in its first
+// column and a finite, nonnegative weight in its second.  The first non-blank
+// line is treated as a header and skipped unless noheader is nonzero.  IDs
+// that id_htable_find() does not locate are skipped silently; an ID that is
+// matched a second time is an error, even if its weight is zero.
+//
+// On success:
+// * *main_weights_ptr points to a newly allocated array containing the
+//   nonzero weights, in increasing unfiltered-index order.
+// * *marker_ct_ptr is the number of entries in that array.
+// * If that count differs from the incoming *marker_ct_ptr,
+//   *marker_exclude_ptr is rewritten from the final include mask.  When
+//   conditional_alloc_exclude is nonzero, a fresh bitfield is allocated for
+//   this first, replacing the pointer rather than overwriting the caller's
+//   array.
+//
+// Returns 0 on success, RET_NOMEM, RET_OPEN_FAIL, RET_READ_FAIL or
+// RET_INVALID_FORMAT on failure, or whatever nonzero code
+// alloc_and_populate_id_htable() reports.  Workspace memory is not rolled
+// back on the error paths.
+int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t noheader, uint32_t conditional_alloc_exclude, uintptr_t** marker_exclude_ptr, uint32_t* marker_ct_ptr, double** main_weights_ptr) {
+  FILE* infile = NULL;
+  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t line_idx = 0;
+  uintptr_t topsize = 0;
+
+  // special case: weight-0 assignment effectively doesn't exist, but we still
+  // want to check for repeated IDs there.
+  uint32_t zcount = 0;
+
+  int32_t retval = 0;
+  unsigned char* wkspace_mark;
+  uintptr_t* marker_include;
+  double* main_weights_tmp;
+  double* dptr;
+  char* bufptr;
+  uint32_t* marker_id_htable;
+  double dxx;
+  uint32_t marker_id_htable_size;
+  uint32_t marker_uidx;
+  uint32_t marker_idx;
+  uint32_t idlen;
+  uint32_t marker_ct;
+
+  // Temporaries live at the top of the workspace: a bitfield of variants
+  // seen in the file, and a weight slot per unfiltered variant (only slots
+  // whose bit is set are ever written or read).
+  marker_include = (uintptr_t*)top_alloc(&topsize, unfiltered_marker_ctl * sizeof(intptr_t));
+  if (!marker_include) {
+    goto load_distance_wts_ret_NOMEM;
+  }
+  fill_ulong_zero(marker_include, unfiltered_marker_ctl);
+  main_weights_tmp = (double*)top_alloc(&topsize, unfiltered_marker_ct * sizeof(double));
+  if (!main_weights_tmp) {
+    goto load_distance_wts_ret_NOMEM;
+  }
+  // Hide the top-of-workspace temporaries from the regular allocator while
+  // the ID hash table is built.
+  wkspace_left -= topsize;
+  wkspace_mark = wkspace_base;
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, *marker_exclude_ptr, *marker_ct_ptr, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+  wkspace_left += topsize;
+  if (retval) {
+    goto load_distance_wts_ret_1;
+  }
+  if (fopen_checked(&infile, distance_wts_fname, "r")) {
+    goto load_distance_wts_ret_OPEN_FAIL;
+  }
+  // Sentinel: fgets() only overwrites the last byte (with '\0') when a line
+  // fills the whole buffer.
+  tbuf[MAXLINELEN - 1] = ' ';
+  while (fgets(tbuf, MAXLINELEN, infile)) {
+    line_idx++;
+    if (!tbuf[MAXLINELEN - 1]) {
+      LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, distance_wts_fname);
+      goto load_distance_wts_ret_INVALID_FORMAT_2;
+    }
+    bufptr = skip_initial_spaces(tbuf);
+    if (is_eoln_kns(*bufptr)) {
+      continue;
+    }
+    if (!noheader) {
+      // first non-blank line is the header; skip it exactly once
+      noheader = 1;
+      continue;
+    }
+    // variant ID in first column, weight in second
+    idlen = strlen_se(bufptr);
+    marker_uidx = id_htable_find(bufptr, idlen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
+    if (marker_uidx == 0xffffffffU) {
+      continue;
+    }
+    if (is_set(marker_include, marker_uidx)) {
+      bufptr[idlen] = '\0';
+      LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --distance-wts file.\n", bufptr);
+      goto load_distance_wts_ret_INVALID_FORMAT_2;
+    }
+    set_bit(marker_include, marker_uidx);
+    bufptr = skip_initial_spaces(&(bufptr[idlen]));
+    if (is_eoln_kns(*bufptr)) {
+      sprintf(logbuf, "Error: Line %" PRIuPTR " of --distance-wts file has fewer tokens than expected.\n", line_idx);
+      goto load_distance_wts_ret_INVALID_FORMAT_2;
+    }
+    if (scan_double(bufptr, &dxx)) {
+      goto load_distance_wts_ret_INVALID_WEIGHT;
+    }
+    // written this way so that NaN is rejected along with negatives and +inf
+    if (!((dxx >= 0.0) && (dxx != INFINITY))) {
+      goto load_distance_wts_ret_INVALID_WEIGHT;
+    }
+    if (dxx == 0.0) {
+      zcount++;
+    }
+    main_weights_tmp[marker_uidx] = dxx;
+  }
+  if (!feof(infile)) {
+    goto load_distance_wts_ret_READ_FAIL;
+  }
+  // the ID hash table is no longer needed
+  wkspace_reset(wkspace_mark);
+  if (zcount) {
+    // Zero-weight variants were only kept in marker_include so that repeated
+    // IDs could be detected.  Drop them now: otherwise the exclusion mask
+    // derived from marker_include below would keep them while marker_ct and
+    // the packed weight array leave them out, and callers pairing the n-th
+    // unexcluded variant with the n-th weight would be misaligned.
+    // (Assumes bitfield_exclude_to_include() writes the complement of its
+    // first argument -- its definition is not in this function.)
+    for (marker_uidx = 0; marker_uidx < unfiltered_marker_ct; marker_uidx++) {
+      if (is_set(marker_include, marker_uidx) && (main_weights_tmp[marker_uidx] == 0.0)) {
+        marker_include[marker_uidx / BITCT] &= ~(((uintptr_t)1) << (marker_uidx % BITCT));
+      }
+    }
+  }
+  marker_ct = popcount_longs(marker_include, unfiltered_marker_ctl);
+  if (!marker_ct) {
+    logprint("Error: No valid nonzero entries in --distance-wts file.\n");
+    goto load_distance_wts_ret_INVALID_FORMAT;
+  }
+  // Protect the top-of-workspace temporaries again while the outputs are
+  // allocated.
+  wkspace_left -= topsize;
+  if (marker_ct != (*marker_ct_ptr)) {
+    if (conditional_alloc_exclude) {
+      if (wkspace_alloc_ul_checked(marker_exclude_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+        goto load_distance_wts_ret_NOMEM2;
+      }
+    }
+    bitfield_exclude_to_include(marker_include, *marker_exclude_ptr, unfiltered_marker_ct);
+  }
+  if (wkspace_alloc_d_checked(main_weights_ptr, marker_ct * sizeof(double))) {
+    goto load_distance_wts_ret_NOMEM2;
+  }
+  wkspace_left += topsize;
+  dptr = *main_weights_ptr;
+  *marker_ct_ptr = marker_ct;
+  // Pack the weights of the surviving variants; every bit still set in
+  // marker_include now has a nonzero weight.
+  for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+    next_set_unsafe_ck(marker_include, &marker_uidx);
+    *dptr++ = main_weights_tmp[marker_uidx];
+  }
+  while (0) {
+  load_distance_wts_ret_NOMEM2:
+    wkspace_left += topsize;
+  load_distance_wts_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
+  load_distance_wts_ret_OPEN_FAIL:
+    retval = RET_OPEN_FAIL;
+    break;
+  load_distance_wts_ret_READ_FAIL:
+    retval = RET_READ_FAIL;
+    break;
+  load_distance_wts_ret_INVALID_WEIGHT:
+    sprintf(logbuf, "Error: Invalid weight on line %" PRIuPTR " of --distance-wts file.\n", line_idx);
+  load_distance_wts_ret_INVALID_FORMAT_2:
+    logprintb();
+  load_distance_wts_ret_INVALID_FORMAT:
+    retval = RET_INVALID_FORMAT;
+    break;
+  }
+ load_distance_wts_ret_1:
+  fclose_cond(infile);
+  return retval;
+}
+
+int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* distance_wts_fname, uint32_t distance_wts_noheader, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t* marker_reverse, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
unsigned char* wkspace_mark = wkspace_base;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t sample_ct = unfiltered_sample_ct - (*sample_exclude_ct_ptr);
@@ -6887,6 +6826,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
uintptr_t marker_idx = 0;
FILE* outfile = NULL;
FILE* out_bin_nfile = NULL;
+ uintptr_t* marker_exclude = marker_exclude_orig;
uint32_t rel_calc_type = relip->modifier & REL_CALC_MASK;
int32_t ibc_type = relip->ibc_type;
int32_t retval = 0;
@@ -6899,19 +6839,21 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
double* dptr3 = NULL;
double* dptr4 = NULL;
double* rel_dists = NULL;
- uint32_t chrom_fo_idx = 0;
+ double* main_weights = NULL;
+ double* main_weights_ptr = NULL;
double* dptr2;
double set_allele_freq_buf[MULTIPLEX_DIST];
char wbuf[96];
uint64_t start_offset;
uint64_t hundredth;
+ unsigned char* overflow_buf;
char* wptr;
char* fam_id;
char* sample_id;
uintptr_t* geno;
uintptr_t* masks;
uintptr_t* mmasks;
- double* weights;
+ double* subset_weights;
double* rel_ibc;
uint32_t* mdeptr;
uint32_t* sample_missing_unwt;
@@ -6937,10 +6879,11 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
uint32_t* giptr;
uint32_t* giptr2;
uintptr_t* glptr2;
- if (is_set(chrom_info_ptr->haploid_mask, 0)) {
- logprint("Error: --make-rel/--make-grm-... cannot be used on haploid genomes.\n");
- goto calc_rel_ret_INVALID_CMDLINE;
+ if (distance_wts_fname) {
+ logprint("Error: --make-{rel,grm-gz,grm-bin} + --distance-wts is currently under\ndevelopment.\n");
+ goto calc_rel_ret_1;
}
+
// timing results on the NIH 512-core machine suggest that it's
// counterproductive to make thread count exceed about n/64
if (dist_thread_ct > sample_ct / 64) {
@@ -6990,6 +6933,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
fill_double_zero(rel_dists, llxx);
}
wkspace_mark = wkspace_base;
+ // stack allocations after this point are freed normally
if (rel_req && (!g_missing_dbl_excluded)) {
if (wkspace_alloc_ui_checked(&g_missing_dbl_excluded, llxx * sizeof(int32_t))) {
goto calc_rel_ret_NOMEM;
@@ -7003,23 +6947,27 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t)) ||
wkspace_alloc_uc_checked(&gptr, MULTIPLEX_REL * unfiltered_sample_ct4) ||
wkspace_alloc_ul_checked(&masks, sample_ct * sizeof(intptr_t)) ||
- wkspace_alloc_d_checked(&weights, 2048 * BITCT * sizeof(double))) {
+ wkspace_alloc_d_checked(&subset_weights, 2048 * BITCT * sizeof(double)) ||
+ wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
goto calc_rel_ret_NOMEM;
}
g_geno = (unsigned char*)geno;
g_masks = masks;
g_mmasks = mmasks;
- g_weights = weights;
+ g_subset_weights = subset_weights;
// Exclude markers on non-autosomal chromosomes for now.
- uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
- if (uii) {
- if (uii == marker_ct) {
- logprint("Error: No autosomal variants for relationship matrix calculation.\n");
- goto calc_rel_ret_INVALID_CMDLINE;
+ retval = conditional_allocate_non_autosomal_markers(chrom_info_ptr, unfiltered_marker_ct, marker_exclude_orig, marker_ct, 1, 1, "relationship matrix calc", &marker_exclude, &uii);
+ if (retval) {
+ goto calc_rel_ret_1;
+ }
+ marker_ct -= uii;
+
+ if (distance_wts_fname) {
+ retval = load_distance_wts(distance_wts_fname, unfiltered_marker_ct, marker_ids, max_marker_id_len, distance_wts_noheader, (marker_exclude == marker_exclude_orig), &marker_exclude, &marker_ct, &main_weights);
+ if (retval) {
+ goto calc_rel_ret_1;
}
- LOGPRINTF("Excluding %u variant%s on non-autosomes from relationship matrix calc.\n", uii, (uii == 1)? "" : "s");
- marker_ct -= uii;
}
// See comments at the beginning of this file, and those in the main
@@ -7028,7 +6976,11 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
// each marker to 3 bits and use + instead of XOR to distinguish the
// cases.
do {
- retval = block_load_autosomal(bedfile, bed_offset, marker_exclude, marker_ct, MULTIPLEX_REL, unfiltered_sample_ct4, chrom_info_ptr, set_allele_freqs, NULL, gptr, &chrom_fo_idx, &marker_uidx, &marker_idx, &cur_markers_loaded, marker_reverse, set_allele_freq_buf, NULL, NULL);
+ copy_set_allele_freqs(marker_uidx, marker_exclude, MULTIPLEX_REL, marker_idx, marker_ct, marker_reverse, set_allele_freqs, set_allele_freq_buf);
+ if (main_weights) {
+ main_weights_ptr = &(main_weights[marker_idx]);
+ }
+ retval = block_load(bedfile, bed_offset, marker_exclude, marker_ct, MULTIPLEX_REL, unfiltered_sample_ct4, gptr, &marker_uidx, &marker_idx, &cur_markers_loaded);
if (retval) {
goto calc_rel_ret_1;
}
@@ -7074,18 +7026,25 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
}
if (calculation_type & CALC_IBC) {
for (uii = 0; uii < 3; uii++) {
- update_rel_ibc(&(rel_ibc[uii * sample_ct]), geno, &(set_allele_freq_buf[win_marker_idx]), uii, sample_ct, ukk);
+ update_rel_ibc(&(rel_ibc[uii * sample_ct]), geno, &(set_allele_freq_buf[win_marker_idx]), main_weights_ptr? (&(main_weights_ptr[win_marker_idx])) : NULL, uii, sample_ct, ukk);
}
} else {
- update_rel_ibc(rel_ibc, geno, &(set_allele_freq_buf[win_marker_idx]), ibc_type, sample_ct, ukk);
+ update_rel_ibc(rel_ibc, geno, &(set_allele_freq_buf[win_marker_idx]), main_weights_ptr? (&(main_weights_ptr[win_marker_idx])) : NULL, ibc_type, sample_ct, ukk);
}
if (rel_req) {
- fill_weights_r(weights, &(set_allele_freq_buf[win_marker_idx]), (ibc_type != -1));
- if (spawn_threads2(threads, &calc_rel_thread, dist_thread_ct, ujj)) {
- goto calc_rel_ret_THREAD_CREATE_FAIL;
- }
+ fill_subset_weights_r(subset_weights, &(set_allele_freq_buf[win_marker_idx]), main_weights_ptr? (&(main_weights_ptr[win_marker_idx])) : NULL, (ibc_type != -1));
ulii = 0;
- calc_rel_thread((void*)ulii);
+ if (!main_weights_ptr) {
+ if (spawn_threads2(threads, &calc_rel_thread, dist_thread_ct, ujj)) {
+ goto calc_rel_ret_THREAD_CREATE_FAIL;
+ }
+ calc_rel_thread((void*)ulii);
+ } else {
+ if (spawn_threads2(threads, &calc_wt_rel_thread, dist_thread_ct, ujj)) {
+ goto calc_rel_ret_THREAD_CREATE_FAIL;
+ }
+ calc_wt_rel_thread((void*)ulii);
+ }
join_threads2(threads, dist_thread_ct, ujj);
}
}
@@ -7354,13 +7313,13 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
} else {
strcpy(outname_end, ".grm.gz");
}
- parallel_compress(outname, 0, calc_rel_grm_emitn);
+ parallel_compress(outname, overflow_buf, 0, calc_rel_grm_emitn);
} else {
strcpy(outname_end, ".grm");
if (parallel_tot > 1) {
sprintf(&(outname_end[4]), ".%u", parallel_idx + 1);
}
- retval = write_uncompressed(outname, 0, calc_rel_grm_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_grm_emitn);
if (retval) {
goto calc_rel_ret_1;
}
@@ -7380,9 +7339,9 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
}
if (rel_shape == REL_CALC_TRI) {
if (rel_calc_type & REL_CALC_GZ) {
- parallel_compress(outname, 0, calc_rel_tri_emitn);
+ parallel_compress(outname, overflow_buf, 0, calc_rel_tri_emitn);
} else {
- retval = write_uncompressed(outname, 0, calc_rel_tri_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_tri_emitn);
if (retval) {
goto calc_rel_ret_1;
}
@@ -7406,9 +7365,9 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
}
g_cr_min_sample = min_sample;
if (rel_calc_type & REL_CALC_GZ) {
- parallel_compress(outname, 0, calc_rel_sq0_emitn);
+ parallel_compress(outname, overflow_buf, 0, calc_rel_sq0_emitn);
} else {
- retval = write_uncompressed(outname, 0, calc_rel_sq0_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_sq0_emitn);
if (retval) {
goto calc_rel_ret_1;
}
@@ -7416,9 +7375,9 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
} else {
g_cr_min_sample = min_sample;
if (rel_calc_type & REL_CALC_GZ) {
- parallel_compress(outname, 0, calc_rel_sq_emitn);
+ parallel_compress(outname, overflow_buf, 0, calc_rel_sq_emitn);
} else {
- retval = write_uncompressed(outname, 0, calc_rel_sq_emitn);
+ retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_sq_emitn);
if (retval) {
goto calc_rel_ret_1;
}
@@ -7470,9 +7429,6 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
calc_rel_ret_WRITE_FAIL:
retval = RET_WRITE_FAIL;
break;
- calc_rel_ret_INVALID_CMDLINE:
- retval = RET_INVALID_CMDLINE;
- break;
calc_rel_ret_THREAD_CREATE_FAIL:
retval = RET_THREAD_CREATE_FAIL;
break;
@@ -7937,13 +7893,13 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
}
#endif
-int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Chrom_info* chrom_info_ptr) {
+int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Chrom_info* chrom_info_ptr) {
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t marker_uidx = 0;
uintptr_t marker_idx = 0;
- uint32_t chrom_fo_idx = 0;
uint32_t dist_thread_ct = g_thread_ct;
int32_t retval = 0;
+ uintptr_t* marker_exclude = marker_exclude_orig;
uint32_t* giptr = NULL;
unsigned char* wkspace_mark;
unsigned char* bedbuf;
@@ -7959,12 +7915,7 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
uint32_t umm;
uint32_t unn;
uintptr_t* glptr;
- uint32_t marker_ct_autosomal;
int64_t llxx;
- if (is_set(chrom_info_ptr->haploid_mask, 0)) {
- logprint("Error: '--cluster missing' cannot currently be used on haploid genomes.\n");
- goto calc_ibm_ret_INVALID_CMDLINE;
- }
g_sample_ct = sample_ct;
if (dist_thread_ct > sample_ct / 32) {
dist_thread_ct = sample_ct / 32;
@@ -7990,21 +7941,20 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
}
g_mmasks = mmasks;
fseeko(bedfile, bed_offset, SEEK_SET);
- uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
- marker_ct_autosomal = marker_ct - uii;
- if (uii) {
- LOGPRINTF("Excluding %u variant%s on non-autosomes from IBM calculation.\n", uii, (uii == 1)? "" : "s");
- }
- is_last_block = (marker_idx == marker_ct_autosomal);
- while (!is_last_block) {
- retval = block_load_autosomal(bedfile, bed_offset, marker_exclude, marker_ct_autosomal, MULTIPLEX_DIST, unfiltered_sample_ct4, chrom_info_ptr, NULL, NULL, bedbuf, &chrom_fo_idx, &marker_uidx, &marker_idx, &ujj, NULL, NULL, NULL, NULL);
+ retval = conditional_allocate_non_autosomal_markers(chrom_info_ptr, unfiltered_marker_ct, marker_exclude_orig, marker_ct, 1, 1, "IBM calculation", &marker_exclude, &uii);
+ if (retval) {
+ goto calc_ibm_ret_1;
+ }
+ marker_ct -= uii;
+ do {
+ retval = block_load(bedfile, bed_offset, marker_exclude, marker_ct, MULTIPLEX_DIST, unfiltered_sample_ct4, bedbuf, &marker_uidx, &marker_idx, &ujj);
if (retval) {
goto calc_ibm_ret_1;
}
if (ujj < MULTIPLEX_DIST) {
memset(&(bedbuf[ujj * unfiltered_sample_ct4]), 0, (MULTIPLEX_DIST - ujj) * unfiltered_sample_ct4);
}
- is_last_block = (marker_idx == marker_ct_autosomal);
+ is_last_block = (marker_idx == marker_ct);
for (ukk = 0; ukk < ujj; ukk += BITCT) {
glptr = mmasks;
giptr = sample_missing_unwt;
@@ -8044,43 +7994,43 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
printf("\r%" PRIuPTR " markers complete.", marker_idx);
fflush(stdout);
- }
+ } while (!is_last_block);
putchar('\r');
wkspace_reset(wkspace_mark);
while (0) {
calc_ibm_ret_NOMEM:
retval = RET_NOMEM;
break;
- calc_ibm_ret_INVALID_CMDLINE:
- retval = RET_INVALID_CMDLINE;
- break;
calc_ibm_ret_THREAD_CREATE_FAIL:
retval = RET_THREAD_CREATE_FAIL;
break;
}
calc_ibm_ret_1:
+ // caller will free memory if there was an error
return retval;
}
-int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t* marker_exclude, uint32_t marker_ct, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, Chrom_info* chrom_info_ptr, uint32_t wt_needed, uint32_t marker_weight_sum, uint3 [...]
+int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* read_dists_fname, char* distance_wts_fname, double distance_exp, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exc [...]
+ // if calculation_type == 0, this must perform the basic unweighted
+ // computation and not write to disk.
FILE* outfile = NULL;
FILE* outfile2 = NULL;
FILE* outfile3 = NULL;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uint64_t dists_alloc = 0;
- double marker_weight_sum_d = (double)marker_weight_sum;
+ uint32_t missing_wt_needed = ((calculation_type & CALC_DISTANCE) || ((!read_dists_fname) && (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE)))) && (!(dist_calc_type & DISTANCE_FLAT_MISSING));
uint32_t unwt_needed = 0;
- uintptr_t marker_uidx = 0;
- uintptr_t marker_idx = 0;
- uint32_t chrom_fo_idx = 0;
+ uint32_t marker_weight_sum = 0;
int32_t retval = 0;
- uint32_t exp0 = (exponent == 0.0);
+ uintptr_t* marker_exclude = marker_exclude_orig;
+ uint32_t* dist_missing_wts_i = NULL;
uint32_t* sample_missing = NULL;
uint32_t* sample_missing_unwt = NULL;
uint32_t* giptr = NULL;
uint32_t* giptr2 = NULL;
char* writebuf = NULL;
- double* weights = NULL;
+ double* main_weights = NULL;
+ double* subset_weights = NULL;
uint32_t dist_thread_ct = g_thread_ct;
double set_allele_freq_buf[MULTIPLEX_DIST];
uint32_t wtbuf[MULTIPLEX_DIST];
@@ -8093,6 +8043,8 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
unsigned char* gptr;
uintptr_t sample_uidx;
uintptr_t sample_idx;
+ uintptr_t marker_uidx;
+ uintptr_t marker_idx;
uintptr_t ulii;
uintptr_t uljj;
uintptr_t ulkk;
@@ -8109,17 +8061,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
uintptr_t* glptr;
uintptr_t* glptr2;
uintptr_t* glptr3;
+ double* dist_missing_wts;
double* dptr2;
+ double marker_weight_sum_d;
double dxx;
double dyy;
- uint32_t marker_ct_autosomal;
uint32_t multiplex;
- uint32_t chrom_end;
int64_t llxx;
- if (is_set(chrom_info_ptr->haploid_mask, 0)) {
- logprint("Error: --distance/--ibs-matrix/--distance-matrix cannot be used on haploid\ngenomes.\n");
- goto calc_distance_ret_INVALID_CMDLINE;
- }
g_sample_ct = sample_ct;
if (dist_thread_ct > sample_ct / 32) {
dist_thread_ct = sample_ct / 32;
@@ -8154,8 +8102,9 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
if (wkspace_alloc_d_checked(&g_dists, dists_alloc + CACHELINE)) {
goto calc_distance_ret_NOMEM;
}
+ // stack allocations before this point must be freed by the caller.
wkspace_mark = wkspace_base;
- if (wt_needed) {
+ if (missing_wt_needed) {
if (wkspace_alloc_ui_checked(&g_missing_tot_weights, llxx * sizeof(int32_t)) ||
wkspace_alloc_ui_checked(&sample_missing, sample_ct * sizeof(int32_t))) {
goto calc_distance_ret_NOMEM;
@@ -8167,12 +8116,31 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
g_missing_tot_weights = NULL;
}
- if (exp0) {
+ ujj = distance_wts_fname || (distance_exp != 0.0); // special weights?
+ if (!ujj) {
g_idists = (int32_t*)((char*)wkspace_mark - CACHEALIGN(llxx * sizeof(int32_t)));
fill_int_zero(g_idists, llxx);
- masks = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
} else {
fill_double_zero(g_dists, llxx);
+ }
+
+ retval = conditional_allocate_non_autosomal_markers(chrom_info_ptr, unfiltered_marker_ct, marker_exclude_orig, marker_ct, 1, 1, "distance matrix calc", &marker_exclude, &uii);
+ if (retval) {
+ goto calc_distance_ret_1;
+ }
+ marker_ct -= uii;
+
+ if (distance_wts_fname) {
+ retval = load_distance_wts(distance_wts_fname, unfiltered_marker_ct, marker_ids, max_marker_id_len, dist_calc_type & DISTANCE_WTS_NOHEADER, (marker_exclude == marker_exclude_orig), &marker_exclude, &marker_ct, &main_weights);
+ if (retval) {
+ goto calc_distance_ret_1;
+ }
+ }
+
+ // stack allocations past this point are freed BEFORE results are written.
+ if (!ujj) {
+ masks = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
+ } else {
masks = (uintptr_t*)wkspace_alloc(sample_ct * sizeof(intptr_t));
}
if (!masks) {
@@ -8182,7 +8150,77 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
goto calc_distance_ret_NOMEM;
}
- if (exp0) {
+ // Load or compute nonuniform marker weighting scheme.
+ if (distance_exp != 0.0) {
+ if (wkspace_alloc_d_checked(&main_weights, marker_ct * sizeof(double))) {
+ goto calc_distance_ret_NOMEM;
+ }
+ for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+ next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+ dxx = set_allele_freqs[marker_uidx];
+ dyy = 2 * dxx * (1.0 - dxx);
+ if (dyy != 0.0) {
+ dyy = pow(dyy, -distance_exp);
+ }
+ main_weights[marker_idx] = dyy;
+ }
+ }
+ // Now compute missing observation weights. (Note that these are usually not
+ // the same as the raw marker weights: for instance, a missing observation at
+ // a MAF-0 marker has no weight at all.)
+ if (missing_wt_needed) {
+ // hack: overwrite dist_missing_wts while populating dist_missing_wts_i.
+ // CACHELINE padding added to reduce risk of an aliasing problem.
+ if (wkspace_alloc_ui_checked(&dist_missing_wts_i, CACHELINE) ||
+ wkspace_alloc_d_checked(&dist_missing_wts, marker_ct * sizeof(double))) {
+ goto calc_distance_ret_NOMEM;
+ }
+ dyy = 0.0; // raw weight sum
+ for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+ next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+ // assume HWE, compute expected contribution to distance statistic:
+ // expected minor allele obs: 2 * maf
+ // P(0 copies) = (1 - maf) * (1 - maf)
+ // P(1 copy) = 2 * maf * (1 - maf)
+ // P(2 copies) = maf * maf
+ // frequency of distance-1 pairs:
+ // freq[0-1 pair] + freq[1-2 pair]
+ // = 2 * (1 - maf) * (1 - maf) * 2 * maf * (1 - maf)
+ // + 2 * 2 * maf * (1 - maf) * maf * maf
+ // = 4 * maf * (1 - maf) * (maf * maf + (1 - maf) * (1 - maf))
+ // = 4 * maf * (1 - maf) * (2 * maf * maf - 2 * maf + 1)
+ // frequency of distance-2 pairs:
+ // 2 * (1 - maf) * (1 - maf) * maf * maf
+ // expected distance:
+ // 4 * maf * (1 - maf) * (2 * maf * maf - 2 * maf + 1
+ // + maf * (1 - maf))
+ // = 4 * maf * (1 - maf) * (maf * maf - maf + 1)
+ // constant factor doesn't matter here
+ dxx = set_allele_freqs[marker_uidx];
+ if ((dxx != 0.0) && (dxx != 1.0)) {
+ dxx = dxx * (1.0 - dxx) * (dxx * dxx - dxx + 1);
+ if (main_weights) {
+ dxx *= main_weights[marker_idx];
+ }
+ }
+ dist_missing_wts[marker_idx] = dxx;
+ dyy += dxx;
+ }
+
+ // now normalize to sum to just under 2^32. (switch to 2^64 if/when 32-bit
+ // performance becomes less important than accuracy on 50+ million marker
+ // sets.)
+ // subtract marker_ct to guard against rounding-driven overflow
+ dyy = (4294967296.0 - ((double)((intptr_t)marker_ct))) / dyy;
+ for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
+ uii = (uint32_t)(dist_missing_wts[marker_idx] * dyy + 0.5);
+ marker_weight_sum += uii;
+ dist_missing_wts_i[marker_idx] = uii;
+ }
+ }
+ marker_weight_sum_d = (double)marker_weight_sum;
+
+ if (!main_weights) {
multiplex = MULTIPLEX_DIST;
geno = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
} else {
@@ -8199,27 +8237,23 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
if (wkspace_alloc_uc_checked(&bedbuf, multiplex * unfiltered_sample_ct4)) {
goto calc_distance_ret_NOMEM;
}
- if (!exp0) {
+ if (main_weights) {
#ifdef __LP64__
- if (wkspace_alloc_d_checked(&weights, 45056 * sizeof(double))) {
+ if (wkspace_alloc_d_checked(&subset_weights, 45056 * sizeof(double))) {
goto calc_distance_ret_NOMEM;
}
#else
- if (wkspace_alloc_d_checked(&weights, 32768 * sizeof(double))) {
+ if (wkspace_alloc_d_checked(&subset_weights, 32768 * sizeof(double))) {
goto calc_distance_ret_NOMEM;
}
- g_weights_i = wtbuf;
+ g_subset_weights_i = wtbuf;
#endif
- g_weights = weights;
+ g_subset_weights = subset_weights;
}
fseeko(bedfile, bed_offset, SEEK_SET);
- uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
- marker_ct_autosomal = marker_ct - uii;
- if (uii) {
- LOGPRINTF("Excluding %u variant%s on non-autosomes from distance matrix calc.\n", uii, (uii == 1)? "" : "s");
- }
- is_last_block = (marker_idx == marker_ct_autosomal);
- while (!is_last_block) {
+ marker_uidx = 0;
+ marker_idx = 0;
+ do {
for (ujj = 0; ujj < multiplex; ujj++) {
set_allele_freq_buf[ujj] = 0.5;
}
@@ -8268,13 +8302,21 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
// See the comments at the beginning of this file for discussion of
// the zero exponent special case.
- retval = block_load_autosomal(bedfile, bed_offset, marker_exclude, marker_ct_autosomal, multiplex, unfiltered_sample_ct4, chrom_info_ptr, set_allele_freqs, marker_weights_i, bedbuf, &chrom_fo_idx, &marker_uidx, &marker_idx, &ujj, NULL, set_allele_freq_buf, NULL, wt_needed? wtbuf : NULL);
+ copy_set_allele_freqs(marker_uidx, marker_exclude, multiplex, marker_idx, marker_ct, NULL, set_allele_freqs, set_allele_freq_buf);
+ if (missing_wt_needed) {
+ uii = marker_ct - marker_idx;
+ if (uii > multiplex) {
+ uii = multiplex;
+ }
+ memcpy(wtbuf, &(dist_missing_wts_i[marker_idx]), uii * sizeof(int32_t));
+ }
+ retval = block_load(bedfile, bed_offset, marker_exclude, marker_ct, multiplex, unfiltered_sample_ct4, bedbuf, &marker_uidx, &marker_idx, &ujj);
if (retval) {
goto calc_distance_ret_1;
}
if (ujj < multiplex) {
memset(&(bedbuf[ujj * unfiltered_sample_ct4]), 0, (multiplex - ujj) * unfiltered_sample_ct4);
- if (exp0) {
+ if (!main_weights) {
fill_ulong_zero(geno, sample_ct * (MULTIPLEX_2DIST / BITCT));
fill_ulong_zero(masks, sample_ct * (MULTIPLEX_2DIST / BITCT));
} else {
@@ -8282,13 +8324,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
fill_ulong_zero(masks, sample_ct);
}
}
- is_last_block = (marker_idx == marker_ct_autosomal);
- if (exp0) {
+ is_last_block = (marker_idx == marker_ct);
+ if (!main_weights) {
for (ukk = 0; ukk < ujj; ukk += BITCT) {
glptr = &(geno[ukk / BITCT2]);
glptr2 = &(masks[ukk / BITCT2]);
glptr3 = mmasks;
- if (wt_needed) {
+ if (missing_wt_needed) {
giptr = sample_missing;
}
if (unwt_needed) {
@@ -8305,7 +8347,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
ulii |= uljj << (umm * 2);
if (uljj == 1) {
ulkk |= ONELU << umm;
- if (wt_needed) {
+ if (missing_wt_needed) {
*giptr += wtbuf[umm + ukk];
}
if (unwt_needed) {
@@ -8328,7 +8370,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
ulii |= uljj << (umm * 2);
if (uljj == 1) {
ulkk |= ONELU << umm;
- if (wt_needed) {
+ if (missing_wt_needed) {
*giptr += wtbuf[umm + ukk + BITCT2];
}
if (unwt_needed) {
@@ -8343,7 +8385,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
*glptr3++ |= ulkk << BITCT2;
glptr = &(glptr[(MULTIPLEX_2DIST / BITCT) - 1]);
glptr2 = &(glptr2[(MULTIPLEX_2DIST / BITCT) - 1]);
- if (wt_needed) {
+ if (missing_wt_needed) {
giptr++;
}
if (unwt_needed) {
@@ -8352,8 +8394,8 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
}
}
- if (wt_needed) {
- g_weights_i = &(wtbuf[ukk]);
+ if (missing_wt_needed) {
+ g_subset_weights_i = &(wtbuf[ukk]);
}
uii = is_last_block && (ukk + BITCT >= ujj);
if (spawn_threads2(threads, &calc_ibs_thread, dist_thread_ct, uii)) {
@@ -8400,7 +8442,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
giptr3++;
}
}
- fill_weights(weights, &(set_allele_freq_buf[ukk]), exponent);
+ fill_subset_weights(subset_weights, &(main_weights[ukk]));
uii = is_last_block && (ukk + (MULTIPLEX_DIST_EXP / 3) >= ujj);
if (spawn_threads2(threads, &calc_wdist_thread, dist_thread_ct, uii)) {
goto calc_distance_ret_THREAD_CREATE_FAIL;
@@ -8412,7 +8454,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
}
printf("\r%" PRIuPTR " markers complete.", marker_idx);
fflush(stdout);
- }
+ } while (!is_last_block);
putchar('\r');
logprint("Distance matrix calculation complete.\n");
wkspace_reset(masks);
@@ -8432,7 +8474,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
// parallel_tot must be 1 for --distance-matrix
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
giptr2 = sample_missing_unwt;
- uii = marker_ct_autosomal - giptr2[sample_idx];
+ uii = marker_ct - giptr2[sample_idx];
wptr = writebuf;
for (ujj = 0; ujj < sample_idx; ujj++) {
wptr = double_g_writex(wptr, ((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++))), ' ');
@@ -8482,7 +8524,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
pct = 1;
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
giptr2 = sample_missing_unwt;
- uii = marker_ct_autosomal - giptr2[sample_idx];
+ uii = marker_ct - giptr2[sample_idx];
wptr = writebuf;
for (ujj = 0; ujj < sample_idx; ujj++) {
wptr = double_g_writex(wptr, 1.0 - (((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++)))), ' ');
@@ -8514,12 +8556,12 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
}
outname_end[5] = '\0';
LOGPRINTFWW("IBS matrix written to %s , and IDs to %s.id .\n", outname, outname);
- }
+ } while (!is_last_block);
tstc = g_thread_start[dist_thread_ct];
- if (wt_needed) {
+ if (missing_wt_needed) {
giptr = g_missing_tot_weights;
dptr2 = g_dists;
- if (exp0) {
+ if (!main_weights) {
iptr = g_idists;
for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
giptr2 = sample_missing;
@@ -8541,13 +8583,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
} else if (dist_calc_type & DISTANCE_FLAT_MISSING) {
dptr2 = g_dists;
giptr = g_missing_dbl_excluded;
- if (exp0) {
+ if (!main_weights) {
iptr = g_idists;
if (dist_calc_type & DISTANCE_CLUSTER) {
// save as IBS
for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
giptr2 = sample_missing_unwt;
- uii = marker_ct_autosomal - giptr2[sample_idx];
+ uii = marker_ct - giptr2[sample_idx];
for (ujj = 0; ujj < sample_idx; ujj++) {
*dptr2++ = 1.0 - (((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++))));
}
@@ -8555,18 +8597,18 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
} else {
for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
giptr2 = sample_missing_unwt;
- uii = marker_ct_autosomal - giptr2[sample_idx];
+ uii = marker_ct - giptr2[sample_idx];
for (ujj = 0; ujj < sample_idx; ujj++) {
- *dptr2++ = (((double)marker_ct_autosomal) / (uii - (*giptr2++) + (*giptr++))) * (*iptr++);
+ *dptr2++ = (((double)marker_ct) / (uii - (*giptr2++) + (*giptr++))) * (*iptr++);
}
}
}
} else {
for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
giptr2 = sample_missing_unwt;
- uii = marker_ct_autosomal - giptr2[sample_idx];
+ uii = marker_ct - giptr2[sample_idx];
for (ujj = 0; ujj < sample_idx; ujj++) {
- *dptr2 *= ((double)marker_ct_autosomal) / (uii - (*giptr2++) + (*giptr++));
+ *dptr2 *= ((double)marker_ct) / (uii - (*giptr2++) + (*giptr++));
dptr2++;
}
}
@@ -8574,18 +8616,16 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
}
if (calculation_type & (CALC_DISTANCE | CALC_IBS_TEST)) {
- if ((exponent == 0.0) || (!(dist_calc_type & (DISTANCE_IBS | DISTANCE_1_MINUS_IBS)))) {
- g_half_marker_ct_recip = 0.5 / (double)marker_ct_autosomal;
+ if ((distance_exp == 0.0) || (!(dist_calc_type & (DISTANCE_IBS | DISTANCE_1_MINUS_IBS)))) {
+ g_half_marker_ct_recip = 0.5 / (double)marker_ct;
} else {
dyy = 0.0;
marker_uidx = 0;
- chrom_fo_idx = 0xffffffffU;
- chrom_end = 0;
- for (marker_idx = 0; marker_idx < marker_ct_autosomal; marker_uidx++, marker_idx++) {
- marker_uidx = next_autosomal_unsafe(marker_exclude, marker_uidx, chrom_info_ptr, &chrom_end, &chrom_fo_idx);
+ for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+ next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
dxx = set_allele_freqs[marker_uidx];
if ((dxx > 0.0) && (dxx < 1.0)) {
- dyy += pow(2 * dxx * (1.0 - dxx), -exponent);
+ dyy += pow(2 * dxx * (1.0 - dxx), -distance_exp);
} else {
dyy += 1.0;
}
@@ -8618,9 +8658,6 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
calc_distance_ret_WRITE_FAIL:
retval = RET_WRITE_FAIL;
break;
- calc_distance_ret_INVALID_CMDLINE:
- retval = RET_INVALID_CMDLINE;
- break;
calc_distance_ret_THREAD_CREATE_FAIL:
retval = RET_THREAD_CREATE_FAIL;
break;
@@ -8919,7 +8956,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
// calculate entire distance matrix, or use already-calculated matrix in
// memory
if (!g_dists) {
- retval = calc_distance(threads, 0, 1, bedfile, bed_offset, outname, outname_end, 0, DISTANCE_FLAT_MISSING | DISTANCE_CLUSTER, marker_exclude, marker_ct, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr, 0, 0, NULL, 0.0);
+ retval = calc_distance(threads, 0, 1, bedfile, bed_offset, outname, outname_end, NULL, NULL, 0.0, 0, DISTANCE_FLAT_MISSING | DISTANCE_CLUSTER, unfiltered_marker_ct, marker_exclude, marker_ct, NULL, 0, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr);
if (retval) {
goto calc_cluster_neighbor_ret_1;
}
@@ -9057,7 +9094,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
}
if (cluster_missing || ibm_constraint) {
if (!g_missing_dbl_excluded) {
- retval = calc_ibm(threads, bedfile, bed_offset, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, chrom_info_ptr);
+ retval = calc_ibm(threads, bedfile, bed_offset, unfiltered_marker_ct, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, chrom_info_ptr);
if (retval) {
goto calc_cluster_neighbor_ret_1;
}
diff --git a/plink_calc.h b/plink_calc.h
index 1dc3abb..6e86302 100644
--- a/plink_calc.h
+++ b/plink_calc.h
@@ -61,13 +61,13 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, char* outname_end, Rel_info* relip);
-int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, double* set_allele_freqs, double** rel_ibc_ptr, [...]
+int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* distance_wts_fname, uint32_t distance_wts_noheader, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t* marker_reverse, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
#ifndef NOLAPACK
int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint64_t calculation_type, Rel_info* relip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pca_sample_exclude, uintptr_t pca_sample_ct, char* sample_ids, uintptr_t max_sample_id_len, double* set_allele_freq [...]
#endif
-int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t* marker_exclude, uint32_t marker_ct, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, Chrom_info* chrom_info_ptr, uint32_t wt_needed, uint32_t marker_weight_sum, uint3 [...]
+int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* read_dists_fname, char* distance_wts_fname, double distance_exp, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exc [...]
int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, char* read_dists_fname, char* read_dists_id_fname, char* read_genome_fname, char* outname, char* ou [...]
diff --git a/plink_cluster.c b/plink_cluster.c
index 7d23997..305e817 100644
--- a/plink_cluster.c
+++ b/plink_cluster.c
@@ -600,6 +600,11 @@ void fill_unfiltered_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t
}
int32_t fill_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* sample_to_cluster, uint32_t* late_clidx_to_sample_uidx) {
+ // If late_clidx_to_sample_uidx is not NULL, all samples not in a loaded
+ // cluster are given their own cluster, and late_clidx_to_sample_uidx is
+ // filled with the cluster index -> sample uidx mapping.
+ // (Yes, this is a strange interface; it may be switched to filtered sample
+ // indexes later.)
unsigned char* wkspace_mark = wkspace_base;
uint32_t* cluster_map_pos = cluster_map;
int32_t retval = 0;
@@ -619,7 +624,7 @@ int32_t fill_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t* sample
sample_to_cluster[uidx_to_idx[*cluster_map_pos]] = cluster_idx;
} while (++cluster_map_pos < cluster_end_ptr);
}
- if (cluster_starts[cluster_ct] < sample_ct) {
+ if (late_clidx_to_sample_uidx && (cluster_starts[cluster_ct] < sample_ct)) {
sample_uidx = 0;
for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx);
diff --git a/plink_common.c b/plink_common.c
index 5a6680c..7d15185 100644
--- a/plink_common.c
+++ b/plink_common.c
@@ -568,6 +568,25 @@ int32_t read_tokens(FILE* infile, char* buf, uintptr_t half_bufsize, uintptr_t t
}
}
+int32_t gzputs_w4(gzFile gz_outfile, const char* ss) {
+  // gzFile counterpart of fputs_w4(): writes ss right-aligned in a
+  // 4-character column by emitting (4 - length) leading spaces.  Strings of
+  // length 4 or more are written without padding.
+  // ss must contain at least one character, since ss[1] is inspected without
+  // looking at ss[0] first.
+  // Returns -1 if a padding write fails; otherwise returns the result of the
+  // final gzputc()/gzputs() call that writes ss itself.
+  if (!ss[1]) {
+    // single character: three spaces of padding
+    if (gzputs(gz_outfile, "   ") == -1) {
+      return -1;
+    }
+    return gzputc(gz_outfile, ss[0]);
+  }
+  if (!ss[2]) {
+    // two characters: two spaces of padding
+    if (gzputs(gz_outfile, "  ") == -1) {
+      return -1;
+    }
+  } else if (!ss[3]) {
+    // three characters: one space of padding
+    if (gzputc(gz_outfile, ' ') == -1) {
+      return -1;
+    }
+  }
+  return gzputs(gz_outfile, ss);
+}
+
int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr) {
char* lptr;
do {
@@ -7633,6 +7652,54 @@ uint32_t count_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t* mark
return ct;
}
+int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr) {
+  // Counts the remaining variants on chrX (only if count_x is set), chrY
+  // (always), and chrMT (only if count_mt is set), and stores the total in
+  // *newly_excluded_ct_ptr.  If bit 0 of haploid_mask is set, every variant is
+  // counted as excluded instead.
+  // - Total equals marker_ct: logs an error, returns RET_INVALID_CMDLINE.
+  // - Total is zero: returns 0 without touching *marker_exclude_ptr.
+  // - Otherwise: allocates a workspace copy of marker_exclude_orig with the
+  //   counted chromosomes' ranges set, and returns 0 (RET_NOMEM if the
+  //   allocation fails).
+  const uintptr_t exclude_byte_ct = ((unfiltered_marker_ct + (BITCT - 1)) / BITCT) * sizeof(intptr_t);
+  int32_t skip_codes[3];
+  uint32_t skip_enabled[3];
+  uint32_t skip_cts[3];
+  uint32_t excluded_ct;
+  uint32_t chrom_idx;
+  uint32_t uii;
+  // slot 0 = X, slot 1 = Y, slot 2 = MT
+  skip_codes[0] = chrom_info_ptr->x_code;
+  skip_codes[1] = chrom_info_ptr->y_code;
+  skip_codes[2] = chrom_info_ptr->mt_code;
+  skip_enabled[0] = count_x;
+  skip_enabled[1] = 1;
+  skip_enabled[2] = count_mt;
+  skip_cts[0] = 0;
+  skip_cts[1] = 0;
+  skip_cts[2] = 0;
+  if (is_set(chrom_info_ptr->haploid_mask, 0)) {
+    excluded_ct = marker_ct;
+  } else {
+    excluded_ct = 0;
+    for (uii = 0; uii < 3; uii++) {
+      // a code of -1 means the chromosome is not defined
+      if (skip_enabled[uii] && (skip_codes[uii] != -1)) {
+        skip_cts[uii] = count_chrom_markers(chrom_info_ptr, skip_codes[uii], marker_exclude_orig);
+      }
+      excluded_ct += skip_cts[uii];
+    }
+  }
+  *newly_excluded_ct_ptr = excluded_ct;
+  if (excluded_ct) {
+    LOGPRINTF("Excluding %u variant%s on non-autosomes from %s.\n", excluded_ct, (excluded_ct == 1)? "" : "s", calc_descrip);
+  }
+  if (excluded_ct == marker_ct) {
+    logprint("Error: No variants remaining.\n");
+    return RET_INVALID_CMDLINE;
+  }
+  if (!excluded_ct) {
+    return 0;
+  }
+  if (wkspace_alloc_ul_checked(marker_exclude_ptr, exclude_byte_ct)) {
+    return RET_NOMEM;
+  }
+  memcpy(*marker_exclude_ptr, marker_exclude_orig, exclude_byte_ct);
+  for (uii = 0; uii < 3; uii++) {
+    if (skip_cts[uii]) {
+      chrom_idx = (uint32_t)skip_codes[uii];
+      fill_bits(*marker_exclude_ptr, chrom_info_ptr->chrom_start[chrom_idx], chrom_info_ptr->chrom_end[chrom_idx] - chrom_info_ptr->chrom_start[chrom_idx]);
+    }
+  }
+  return 0;
+}
+
uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr) {
uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
uint32_t max_chrom_size = 0;
@@ -7683,21 +7750,6 @@ void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_
*unk_ct_ptr = unk_ct;
}
-double calc_wt_mean_maf(double exponent, double maf) {
- // assume Hardy-Weinberg equilibrium
- // homozygote frequencies: maf^2, (1-maf)^2
- // heterozygote frequency: 2maf(1-maf)
- double ll_freq = maf * maf;
- double lh_freq = 2 * maf * (1.0 - maf);
- double hh_freq = (1.0 - maf) * (1.0 - maf);
- double weight;
- if (lh_freq == 0.0) {
- return 0.0;
- }
- weight = pow(lh_freq, -exponent);
- return (lh_freq * (ll_freq + lh_freq) + 2 * ll_freq * hh_freq) * weight;
-}
-
void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct) {
uintptr_t sample_bidx = 0;
unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
@@ -7920,77 +7972,6 @@ uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sa
}
}
-uint32_t block_load_autosomal(FILE* bedfile, int32_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct_autosomal, uint32_t block_max_size, uintptr_t unfiltered_sample_ct4, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_weights, unsigned char* readbuf, uint32_t* chrom_fo_idx_ptr, uintptr_t* marker_uidx_ptr, uintptr_t* marker_idx_ptr, uint32_t* block_size_ptr, uintptr_t* marker_reverse, double* set_allele_freq_buf, float* set_allele_freq_buf_fl, uint32_t* wtbuf) {
- uintptr_t marker_uidx = *marker_uidx_ptr;
- uintptr_t marker_idx = *marker_idx_ptr;
- uint32_t chrom_fo_idx = *chrom_fo_idx_ptr;
- uint32_t chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
- uint32_t markers_read = 0;
- uint32_t autosome_ct = chrom_info_ptr->autosome_ct;
- uint32_t xy_code = (uint32_t)chrom_info_ptr->xy_code;
- uint32_t max_code = chrom_info_ptr->max_code;
- uint32_t cur_chrom;
- uint32_t is_x;
- uint32_t is_y;
- uint32_t is_mt;
- uint32_t is_haploid;
-
- if (block_max_size > marker_ct_autosomal - marker_idx) {
- block_max_size = marker_ct_autosomal - marker_idx;
- }
- while (markers_read < block_max_size) {
- if (IS_SET(marker_exclude, marker_uidx)) {
- marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
- if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
- return RET_READ_FAIL;
- }
- }
- if (marker_uidx >= chrom_end) {
- while (1) {
- chrom_fo_idx++;
- refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
- cur_chrom = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
- if ((cur_chrom <= autosome_ct) || (cur_chrom == xy_code) || (cur_chrom > max_code)) {
- // for now, unplaced chromosomes are all "autosomal"
- break;
- }
- marker_uidx = next_unset_ul_unsafe(marker_exclude, chrom_end);
- if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
- return RET_READ_FAIL;
- }
- }
- }
- if (fread(&(readbuf[markers_read * unfiltered_sample_ct4]), 1, unfiltered_sample_ct4, bedfile) < unfiltered_sample_ct4) {
- return RET_READ_FAIL;
- }
- if (set_allele_freq_buf) {
- if ((!marker_reverse) || (!IS_SET(marker_reverse, marker_uidx))) {
- set_allele_freq_buf[markers_read] = set_allele_freqs[marker_uidx];
- } else {
- set_allele_freq_buf[markers_read] = 1.0 - set_allele_freqs[marker_uidx];
- }
- } else if (set_allele_freq_buf_fl) {
- if (!IS_SET(marker_reverse, marker_uidx)) {
- set_allele_freq_buf_fl[markers_read] = (float)set_allele_freqs[marker_uidx];
- } else {
- set_allele_freq_buf_fl[markers_read] = 1.0 - ((float)set_allele_freqs[marker_uidx]);
- }
- }
- if (wtbuf) {
- wtbuf[markers_read] = marker_weights[marker_idx];
- }
- markers_read++;
- marker_idx++;
- marker_uidx++;
- }
-
- *chrom_fo_idx_ptr = chrom_fo_idx;
- *marker_uidx_ptr = marker_uidx;
- *marker_idx_ptr = marker_idx;
- *block_size_ptr = markers_read;
- return 0;
-}
-
void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, uintptr_t* old_include) {
uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t ulii;
diff --git a/plink_common.h b/plink_common.h
index f8848e1..315f56e 100644
--- a/plink_common.h
+++ b/plink_common.h
@@ -20,12 +20,6 @@
#define PROG_NAME_STR "plink"
#define PROG_NAME_CAPS "PLINK"
-#ifdef STABLE_BUILD
- #define UNSTABLE goto main_unstable_disabled
-#else
- #define UNSTABLE
-#endif
-
#ifdef _WIN32
// needed for MEMORYSTATUSEX
#ifndef _WIN64
@@ -45,6 +39,7 @@
#define pthread_t HANDLE
#define THREAD_RET_TYPE unsigned __stdcall
#define THREAD_RETURN return 0
+ #define EOLN_STR "\r\n"
#else
#include <pthread.h>
#define THREAD_RET_TYPE void*
@@ -54,6 +49,7 @@
#define PRId64 "lld"
#endif
#endif
+ #define EOLN_STR "\n"
#endif
#ifdef __APPLE__
@@ -223,6 +219,9 @@
#define MISC_SPLIT_MERGE_NOFAIL 0x400000000LLU
#define MISC_REAL_REF_ALLELES 0x800000000LLU
#define MISC_RPLUGIN_DEBUG 0x1000000000LLU
+#define MISC_MISSING_GZ 0x2000000000LLU
+#define MISC_FREQ_GZ 0x4000000000LLU
+#define MISC_HET_GZ 0x8000000000LLU
// assume for now that .bed must always be accompanied by both .bim and .fam
#define FILTER_ALL_REQ 1LLU
@@ -305,9 +304,10 @@
#define CALC_WRITE_VAR_RANGES 0x40000000000000LLU
#define CALC_DUPVAR 0x80000000000000LLU
#define CALC_RPLUGIN 0x100000000000000LLU
+#define CALC_DFAM 0x200000000000000LLU
#define CALC_ONLY_BIM (CALC_WRITE_SET | CALC_WRITE_SNPLIST | CALC_WRITE_VAR_RANGES | CALC_LIST_23_INDELS | CALC_MAKE_BIM | CALC_DUPVAR)
#define CALC_ONLY_FAM (CALC_MAKE_PERM_PHENO | CALC_WRITE_COVAR | CALC_MAKE_FAM)
-// only room for 7 more basic commands before we need to switch from a single
+// only room for 6 more basic commands before we need to switch from a single
// uint64_t to uintptr_t*/is_set()/etc.
// necessary to patch heterozygous haploids/female Y chromosome genotypes
@@ -370,6 +370,7 @@
#define DISTANCE_TYPEMASK 0xe0
#define DISTANCE_FLAT_MISSING 0x100
#define DISTANCE_CLUSTER 0x200
+#define DISTANCE_WTS_NOHEADER 0x400
#define RECODE_01 1
#define RECODE_12 2
@@ -400,6 +401,7 @@
#define RECODE_FID 0x2000000
#define RECODE_IID 0x4000000
#define RECODE_INCLUDE_ALT 0x8000000
+#define RECODE_BGZ 0x10000000
#define GENOME_OUTPUT_GZ 1
#define GENOME_REL_CHECK 2
@@ -433,6 +435,7 @@
#define HWE_MIDP 1
#define HWE_THRESH_MIDP 2
#define HWE_THRESH_ALL 4
+#define HWE_GZ 8
#define MENDEL_FILTER 1
#define MENDEL_FILTER_VAR_FIRST 2
@@ -802,6 +805,12 @@ void wordwrap(char* ss, uint32_t suffix_len);
// 5 = length of "done." suffix, which is commonly used
#define LOGPRINTFWW5(...) sprintf(logbuf, __VA_ARGS__); wordwrap(logbuf, 5); logprintb();
+#ifdef STABLE_BUILD
+  // In stable builds, UNSTABLE(val) copies val into logbuf starting at offset
+  // 9 (saving the end pointer in sptr) and jumps to main_unstable_disabled.
+  // The expansion is wrapped in do { } while (0) so that "UNSTABLE(x);" is a
+  // single statement; without it, an unbraced "if (cond) UNSTABLE(x);" would
+  // execute the goto unconditionally.
+  #define UNSTABLE(val) do { sptr = strcpya(&(logbuf[9]), val); goto main_unstable_disabled; } while (0)
+#else
+  // Otherwise the macro expands to nothing.
+  #define UNSTABLE(val)
+#endif
+
int32_t fopen_checked(FILE** target_ptr, const char* fname, const char* mode);
static inline int32_t putc_checked(int32_t ii, FILE* outfile) {
@@ -843,12 +852,51 @@ static inline int32_t fclose_null(FILE** fptr_ptr) {
int32_t gzopen_checked(gzFile* target_ptr, const char* fname, const char* mode);
+static inline int32_t gzclose_null(gzFile* gzf_ptr) {
+  // Closes *gzf_ptr and resets it to NULL.  Returns 0 if gzclose() reported
+  // Z_OK, 1 otherwise.
+  const int32_t close_result = gzclose(*gzf_ptr);
+  *gzf_ptr = NULL;
+  return (close_result == Z_OK)? 0 : 1;
+}
+
static inline void gzclose_cond(gzFile gz_infile) {
if (gz_infile) {
gzclose(gz_infile);
}
}
+static inline int32_t flexwrite_checked(const void* buf, size_t len, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+  // Writes len bytes from buf to outfile (output_gz == 0) or to gz_outfile
+  // (output_gz != 0).  Returns nonzero on failure.
+  if (!output_gz) {
+    return fwrite_checked(buf, len, outfile);
+  }
+  // gzwrite() returns 0 on error, but also for a zero-length request, so an
+  // empty write must not be reported as a failure.
+  return len && (!gzwrite(gz_outfile, buf, len));
+}
+
+static inline int32_t flexputc_checked(int32_t ii, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+  // Writes the single character ii to gz_outfile when output_gz is set, and
+  // to outfile otherwise.  Returns nonzero on failure.
+  if (output_gz) {
+    return (gzputc(gz_outfile, ii) == -1);
+  }
+  putc(ii, outfile);
+  return ferror(outfile);
+}
+
+static inline int32_t flexputs_checked(const char* ss, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+  // Writes the null-terminated string ss to gz_outfile when output_gz is set,
+  // and to outfile otherwise.  Returns nonzero on failure.
+  if (output_gz) {
+    return (gzputs(gz_outfile, ss) == -1);
+  }
+  return fputs_checked(ss, outfile);
+}
+
+static inline int32_t flexclose_null(uint32_t output_gz, FILE** fptr_ptr, gzFile* gzf_ptr) {
+  // Closes whichever handle is in use -- *gzf_ptr when output_gz is set,
+  // *fptr_ptr otherwise -- through the matching *_null() helper, and passes
+  // that helper's return value through.
+  if (output_gz) {
+    return gzclose_null(gzf_ptr);
+  }
+  return fclose_null(fptr_ptr);
+}
+
static inline int32_t bed_suffix_conflict(uint64_t calculation_type, uint32_t recode_modifier) {
return (calculation_type & CALC_MAKE_BED) || ((calculation_type & CALC_RECODE) && (recode_modifier & (RECODE_LGEN | RECODE_LGEN_REF | RECODE_RLIST)));
}
@@ -1101,6 +1149,17 @@ static inline char* strcpyax(char* target, const void* source, char extra_char)
return &(target[slen + 1]);
}
+static inline void append_binary_eoln(char** target_ptr) {
+  // Stores a line terminator at *target_ptr ("\r\n" when _WIN32 is defined,
+  // "\n" otherwise) and advances *target_ptr past it.  No null terminator is
+  // written.
+  char* write_pos = *target_ptr;
+#ifdef _WIN32
+  *write_pos++ = '\r';
+#endif
+  *write_pos++ = '\n';
+  *target_ptr = write_pos;
+}
+
static inline void fputs_w4(char* ss, FILE* outfile) {
// for efficient handling of width-4 allele columns; don't want to call
// strlen() since that's redundant with fputs
@@ -1118,6 +1177,8 @@ static inline void fputs_w4(char* ss, FILE* outfile) {
}
}
+int32_t gzputs_w4(gzFile gz_outfile, const char* ss);
+
int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr);
int32_t get_next_noncomment_excl(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr, uintptr_t* marker_exclude, uintptr_t* marker_uidx_ptr);
@@ -1553,15 +1614,15 @@ static inline void prev_unset_unsafe_ck(uintptr_t* bit_arr, uint32_t* loc_ptr) {
// These functions seem to optimize better than memset(arr, 0, x) under gcc.
static inline void fill_long_zero(intptr_t* larr, size_t size) {
- intptr_t* lptr = &(larr[size]);
- while (larr < lptr) {
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
*larr++ = 0;
}
}
static inline void fill_ulong_zero(uintptr_t* ularr, size_t size) {
- uintptr_t* ulptr = &(ularr[size]);
- while (ularr < ulptr) {
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
*ularr++ = 0;
}
}
@@ -1577,15 +1638,15 @@ static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
#endif
static inline void fill_long_one(intptr_t* larr, size_t size) {
- intptr_t* lptr = &(larr[size]);
- while (larr < lptr) {
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
*larr++ = -1;
}
}
static inline void fill_ulong_one(uintptr_t* ularr, size_t size) {
- uintptr_t* ulptr = &(ularr[size]);
- while (ularr < ulptr) {
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
*ularr++ = ~ZEROLU;
}
}
@@ -1601,59 +1662,43 @@ static inline void fill_ull_one(uint64_t* ullarr, size_t size) {
#endif
static inline void fill_int_zero(int32_t* iarr, size_t size) {
-#ifdef __LP64__
- fill_long_zero((intptr_t*)iarr, size >> 1);
- if (size & 1) {
- iarr[size - 1] = 0;
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
+ *iarr++ = 0;
}
-#else
- fill_long_zero((intptr_t*)iarr, size);
-#endif
}
static inline void fill_int_one(int32_t* iarr, size_t size) {
-#ifdef __LP64__
- fill_long_one((intptr_t*)iarr, size >> 1);
- if (size & 1) {
- iarr[size - 1] = -1;
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
+ *iarr++ = -1;
}
-#else
- fill_long_one((intptr_t*)iarr, size);
-#endif
}
static inline void fill_uint_zero(uint32_t* uiarr, size_t size) {
-#ifdef __LP64__
- fill_long_zero((intptr_t*)uiarr, size >> 1);
- if (size & 1) {
- uiarr[size - 1] = 0;
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
+ *uiarr++ = 0;
}
-#else
- fill_long_zero((intptr_t*)uiarr, size);
-#endif
}
static inline void fill_uint_one(uint32_t* uiarr, size_t size) {
-#ifdef __LP64__
- fill_ulong_one((uintptr_t*)uiarr, size >> 1);
- if (size & 1) {
- uiarr[size - 1] = ~0U;
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
+ *uiarr++ = ~0U;
}
-#else
- fill_ulong_one((uintptr_t*)uiarr, size);
-#endif
}
static inline void fill_float_zero(float* farr, size_t size) {
- float* fptr = &(farr[size]);
- while (farr < fptr) {
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
*farr++ = 0.0;
}
}
static inline void fill_double_zero(double* darr, size_t size) {
- double* dptr = &(darr[size]);
- while (darr < dptr) {
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
*darr++ = 0.0;
}
}
@@ -1861,6 +1906,9 @@ static inline int32_t chrom_exists(Chrom_info* chrom_info_ptr, uint32_t chrom_id
int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int32_t* chrom_idx_ptr, uintptr_t line_idx, const char* file_descrip);
+// no need for this; code is simpler if we just create a copy of marker_exclude
+// with all non-autosomal loci removed
+/*
static inline uintptr_t next_autosomal_unsafe(uintptr_t* marker_exclude, uintptr_t marker_uidx, Chrom_info* chrom_info_ptr, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr) {
// assumes we are at an autosomal marker if marker_uidx < *chrom_end_ptr
next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
@@ -1881,6 +1929,7 @@ static inline uintptr_t next_autosomal_unsafe(uintptr_t* marker_exclude, uintptr
marker_uidx = next_unset_ul_unsafe(marker_exclude, *chrom_end_ptr);
}
}
+*/
void refresh_chrom_info(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, uint32_t* is_mt_ptr, uint32_t* is_haploid_ptr);
@@ -2097,12 +2146,12 @@ static inline uint32_t count_chrom_markers(Chrom_info* chrom_info_ptr, uint32_t
uint32_t count_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt);
+int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr);
+
uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr);
void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uint32_t* male_ct_ptr, uint32_t* female_ct_ptr, uint32_t* unk_ct_ptr);
-double calc_wt_mean_maf(double exponent, double maf);
-
void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct);
void collapse_copy_2bitarr(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t* sample_exclude);
@@ -2141,8 +2190,6 @@ uint32_t load_and_collapse_incl(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfil
uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* casebuf, uintptr_t* ctrlbuf, uintptr_t* pheno_nm, uintptr_t* pheno_c);
-uint32_t block_load_autosomal(FILE* bedfile, int32_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct_autosomal, uint32_t block_max_size, uintptr_t unfiltered_sample_ct4, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_weights, unsigned char* readbuf, uint32_t* chrom_fo_idx_ptr, uintptr_t* marker_uidx_ptr, uintptr_t* marker_idx_ptr, uint32_t* block_size_ptr, uintptr_t* marker_reverse, double* set_allele_freq_buf, float* set_allele_freq_buf_fl, uint32_t* wtbuf);
-
void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, uintptr_t* old_include);
void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_vec, uintptr_t* exclude_arr);
diff --git a/plink_data.c b/plink_data.c
index 228a7d7..13d1f44 100644
--- a/plink_data.c
+++ b/plink_data.c
@@ -14,6 +14,7 @@
#include <sys/types.h>
#include "plink_family.h"
#include "plink_set.h"
+#include "bgzf.h"
#define PHENO_EPSILON 0.000030517578125
@@ -566,7 +567,10 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
uint32_t cur_pos;
char cc;
fill_ulong_zero(loaded_chrom_mask, CHROM_MASK_WORDS);
- fill_ulong_zero((uintptr_t*)(&insert_buf), 4);
+ insert_buf[0] = NULL;
+ insert_buf[1] = NULL;
+ insert_buf[2] = NULL;
+ insert_buf[3] = NULL;
if (sf_ct) {
sf_start_idxs = (uint32_t*)malloc((MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t));
if (!sf_start_idxs) {
@@ -584,7 +588,11 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
}
}
fill_uint_zero(missing_template_seg_len, 5);
- fill_ulong_zero((uintptr_t*)(&missing_template_seg), 5);
+ missing_template_seg[0] = NULL;
+ missing_template_seg[1] = NULL;
+ missing_template_seg[2] = NULL;
+ missing_template_seg[3] = NULL;
+ missing_template_seg[4] = NULL;
if (missing_mid_template) {
if (!missing_marker_id_match) {
missing_marker_id_match = &(g_one_char_strs[92]); // '.'
@@ -1045,6 +1053,11 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
if (snps_only) {
max_marker_allele_len = 2;
}
+ if (max_marker_allele_len > 500000000) {
+ // guard against overflows
+ logprint("Error: Alleles are limited to 500 million characters.\n");
+ goto load_bim_ret_INVALID_FORMAT;
+ }
*max_marker_allele_len_ptr = max_marker_allele_len;
marker_allele_ptrs = (char**)wkspace_alloc(unfiltered_marker_ct * 2 * sizeof(intptr_t));
if (!marker_allele_ptrs) {
@@ -4026,6 +4039,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
uint32_t shiftval;
uint32_t bgen_compressed;
uint32_t bgen_multichar_alleles;
+ uint32_t identical_alleles;
uint32_t uii;
uint32_t ujj;
uint32_t ukk;
@@ -4415,20 +4429,34 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
if (putc_checked(' ', outfile_bim)) {
goto oxford_to_bed_ret_WRITE_FAIL;
}
- bufptr = bufptr2;
- bufptr2 = next_token_mult(bufptr, 2);
- if (no_more_tokens_kns(bufptr2)) {
+ bufptr = next_token(bufptr2);
+ bufptr3 = next_token(bufptr);
+ if (no_more_tokens_kns(bufptr3)) {
goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
}
- bufptr2 = token_endnn(bufptr2);
- fwrite(bufptr, 1, bufptr2 - bufptr, outfile_bim);
+ // bufptr2 = pos
+ // bufptr = allele 1
+ // bufptr3 = allele 2
+ bufptr4 = token_endnn(bufptr3);
+ uii = (uintptr_t)(bufptr4 - bufptr3);
+ identical_alleles = (strlen_se(bufptr) == uii) && (!memcmp(bufptr, bufptr3, uii));
+ if (identical_alleles) {
+ // we treat identical A1 and A2 as a special case, since naive handling
+ // prevents e.g. later data merge.
+ // maybe add a warning?
+ fwrite(bufptr2, 1, strlen_se(bufptr2), outfile_bim);
+ fputs(" 0 ", outfile_bim);
+ fwrite(bufptr3, 1, bufptr4 - bufptr3, outfile_bim);
+ } else {
+ fwrite(bufptr2, 1, bufptr4 - bufptr2, outfile_bim);
+ }
if (putc_checked('\n', outfile_bim)) {
goto oxford_to_bed_ret_WRITE_FAIL;
}
cur_word = 0;
shiftval = 0;
ulptr = writebuf;
- bufptr = skip_initial_spaces(&(bufptr2[1]));
+ bufptr = skip_initial_spaces(&(bufptr4[1]));
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
if (is_eoln_kns(*bufptr)) {
goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
@@ -4555,6 +4583,16 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
if (shiftval) {
*ulptr++ = cur_word;
}
+ if (identical_alleles) {
+ // keep missing calls, but convert hom/het A1 to hom A2.
+ for (ulptr = writebuf; ulptr < (&(writebuf[sample_ctl2])); ulptr++) {
+ ulii = *ulptr;
+ *ulptr = ((~ulii) << 1) | ulii | FIVEMASK;
+ }
+ if (sample_ct % 4) {
+ writebuf[sample_ctl2 - 1] &= (ONELU << (2 * (sample_ct % BITCT2))) - ONELU;
+ }
+ }
if (fwrite_checked(writebuf, sample_ct4, outfile)) {
goto oxford_to_bed_ret_WRITE_FAIL;
}
@@ -4580,9 +4618,9 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
}
loadbuf = (char*)wkspace_base;
loadbuf_size = wkspace_left;
- if (loadbuf_size > MAXLINEBUFLEN / 2) {
+ if (loadbuf_size > MAXLINEBUFLEN) {
// halve the limit since there are two alleles
- loadbuf_size = MAXLINEBUFLEN / 2;
+ loadbuf_size = MAXLINEBUFLEN;
} else if (loadbuf_size < 3 * 65536) {
goto oxford_to_bed_ret_NOMEM;
}
@@ -4738,8 +4776,8 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
fwrite(bufptr, 1, usjj, outfile_bim);
bufptr = uint32_writex(&(tbuf[3]), uint_arr[0], ' ');
fwrite(tbuf, 1, bufptr - tbuf, outfile_bim);
- if (uint_arr[1] >= loadbuf_size) {
- if (loadbuf_size < MAXLINEBUFLEN / 2) {
+ if (uint_arr[1] >= loadbuf_size / 2) {
+ if (loadbuf_size < MAXLINEBUFLEN) {
goto oxford_to_bed_ret_NOMEM;
}
logprint("Error: Excessively long allele in .bgen file.\n");
@@ -4749,25 +4787,31 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
goto oxford_to_bed_ret_READ_FAIL;
}
loadbuf[uint_arr[1]] = ' ';
- if (fwrite_checked(loadbuf, uint_arr[1] + 1, outfile_bim)) {
- goto oxford_to_bed_ret_WRITE_FAIL;
- }
if (fread(&uii, 1, 4, infile) < 4) {
goto oxford_to_bed_ret_READ_FAIL;
}
- if (uii >= loadbuf_size) {
- if (loadbuf_size < MAXLINEBUFLEN / 2) {
+ if (uii >= loadbuf_size / 2) {
+ if (loadbuf_size < MAXLINEBUFLEN) {
goto oxford_to_bed_ret_NOMEM;
}
logprint("Error: Excessively long allele in .bgen file.\n");
goto oxford_to_bed_ret_INVALID_FORMAT;
}
- if (fread(loadbuf, 1, uii, infile) < uii) {
+ bufptr = &(loadbuf[uint_arr[1] + 1]);
+ if (fread(bufptr, 1, uii, infile) < uii) {
goto oxford_to_bed_ret_READ_FAIL;
}
- loadbuf[uii] = '\n';
- if (fwrite_checked(loadbuf, uii + 1, outfile_bim)) {
- goto oxford_to_bed_ret_WRITE_FAIL;
+ bufptr[uii] = '\n';
+ identical_alleles = (uii == uint_arr[1]) && (!memcmp(loadbuf, bufptr, uii));
+ if (!identical_alleles) {
+ if (fwrite_checked(loadbuf, uint_arr[1] + uii + 2, outfile_bim)) {
+ goto oxford_to_bed_ret_WRITE_FAIL;
+ }
+ } else {
+ fputs("0 ", outfile_bim);
+ if (fwrite_checked(bufptr, uii + 1, outfile_bim)) {
+ goto oxford_to_bed_ret_WRITE_FAIL;
+ }
}
} else {
uii = 0;
@@ -4834,7 +4878,12 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
fwrite(&(loadbuf[uii + 2]), 1, ukk, outfile_bim);
memcpy(&ujj, &(loadbuf[2 * uii + 3]), 4);
bufptr = uint32_writex(&(tbuf[3]), ujj, ' ');
- *bufptr++ = loadbuf[2 * uii + 7];
+ identical_alleles = (loadbuf[2 * uii + 7] == loadbuf[2 * uii + 8]);
+ if (!identical_alleles) {
+ *bufptr++ = loadbuf[2 * uii + 7];
+ } else {
+ *bufptr++ = '0';
+ }
*bufptr++ = ' ';
*bufptr++ = loadbuf[2 * uii + 8];
*bufptr++ = '\n';
@@ -4942,6 +4991,15 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
if (shiftval) {
*ulptr++ = cur_word;
}
+ if (identical_alleles) {
+ for (ulptr = writebuf; ulptr < (&(writebuf[sample_ctl2])); ulptr++) {
+ ulii = *ulptr;
+ *ulptr = ((~ulii) << 1) | ulii | FIVEMASK;
+ }
+ if (sample_ct % 4) {
+ writebuf[sample_ctl2 - 1] &= (ONELU << (2 * (sample_ct % BITCT2))) - ONELU;
+ }
+ }
if (fwrite_checked(writebuf, sample_ct4, outfile)) {
goto oxford_to_bed_ret_WRITE_FAIL;
}
@@ -5583,17 +5641,19 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
ukk = map_reverse[umm++];
if ((ukk >= marker_start) && (ukk < marker_end)) {
ucc = 1;
- if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk + 1])) {
- if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
- ucc = 3;
- } else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk])) {
- ucc = 2;
- }
- } else if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk])) {
- if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
- ucc = 0;
- } else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk + 1])) {
- ucc = 2;
+ if ((*aptr1 != missing_geno) || (alen1 != 1)) {
+ if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk + 1])) {
+ if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+ ucc = 3;
+ } else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk])) {
+ ucc = 2;
+ }
+ } else if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk])) {
+ if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+ ucc = 0;
+ } else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk + 1])) {
+ ucc = 2;
+ }
}
}
wbufptr[(ukk - marker_start) * sample_ct4] |= ucc << ii_shift;
@@ -5616,17 +5676,19 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
continue;
}
ucc = 1;
- if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx + 1])) {
- if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
- ucc = 3;
- } else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx])) {
- ucc = 2;
- }
- } else if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx])) {
- if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
- ucc = 0;
- } else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx + 1])) {
- ucc = 2;
+ if ((*aptr1 != missing_geno) || (alen1 != 1)) {
+ if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx + 1])) {
+ if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+ ucc = 3;
+ } else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx])) {
+ ucc = 2;
+ }
+ } else if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx])) {
+ if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+ ucc = 0;
+ } else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx + 1])) {
+ ucc = 2;
+ }
}
}
*wbufptr |= ucc << ii_shift;
@@ -7260,7 +7322,10 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
goto transposed_to_bed_ret_WRITE_FAIL;
}
cptr2 = cptr4;
- fill_ulong_zero((uintptr_t*)alleles, 4);
+ alleles[0] = NULL;
+ alleles[1] = NULL;
+ alleles[2] = NULL;
+ alleles[3] = NULL;
fill_uint_zero(allele_cts, 4);
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
cptr2 = skip_initial_spaces(cptr2);
@@ -7806,8 +7871,11 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
}
}
if (memchr(&(bufptr3[1]), (unsigned char)id_delim, (uintptr_t)(bufptr2 - &(bufptr3[1])))) {
- sprintf(logbuf, "Error: Multiple instances of '%c' in sample ID.\n", id_delim);
- goto vcf_sample_line_ret_INVALID_FORMAT_2;
+ LOGPRINTF("Error: Multiple instances of '%c' in sample ID.\n", id_delim);
+ if (id_delim == '_') {
+ logprint("If you do not want '_' to be treated as a FID/IID delimiter, use --double-id or\n--const-fid to choose a different method of converting VCF sample IDs to PLINK\nIDs, or --id-delim to change the FID/IID delimiter.\n");
+ }
+ goto vcf_sample_line_ret_INVALID_FORMAT;
}
wptr = memcpyax(tbuf, bufptr, (uintptr_t)(bufptr3 - bufptr), '\t');
bufptr3++;
@@ -8928,6 +8996,12 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
goto bcf_to_bed_ret_INVALID_FORMAT_2;
}
+ if (((unsigned char)(tbuf[4])) > 2) {
+ // defend against 0x82-0x87 being given a meaning in 8-bit int vectors,
+ // etc.
+ LOGPREPRINTFWW("Error: %s appears to be formatted as BCFv2.%u; this PLINK build only supports v2.0-2.2. You may need to obtain an updated version of PLINK.\n", bcfname, ((unsigned char)(tbuf[4])));
+ goto bcf_to_bed_ret_INVALID_FORMAT_2;
+ }
if (gzread(gz_infile, &header_size, 4) < 4) {
goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
}
@@ -9341,11 +9415,12 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
ucptr = (unsigned char*)loadbuf;
if (ujj == 2) {
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, ucptr++) {
- ulii = *ucptr++;
+ // discard all phase bits for now
+ // missing = 0x80 or 0x81
+ ulii = (*ucptr++) & 0x7e;
if (ulii) {
- // discard all phase bits for now
ulii = ((ulii / 2) - 1) * sample_ctv2;
- uljj = *ucptr;
+ uljj = (*ucptr) & 0x7e;
if (uljj) {
set_bit(&(base_bitfields[ulii]), sample_idx * 2);
base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9357,7 +9432,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
} else if (ujj == 1) {
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
- ulii = *ucptr++;
+ ulii = (*ucptr++) & 0x7e;
if (ulii) {
set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
}
@@ -9367,10 +9442,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
if (ucptr[2]) {
ucptr = &(ucptr[ujj]);
} else {
- ulii = *ucptr++;
+ ulii = (*ucptr++) & 0x7e;
if (ulii) {
ulii = ((ulii / 2) - 1) * sample_ctv2;
- uljj = *ucptr;
+ uljj = (*ucptr) & 0x7e;
if (uljj) {
set_bit(&(base_bitfields[ulii]), sample_idx * 2);
base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9387,10 +9462,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
// bleah, this should totally use templates instead of cut-and-paste
if (ujj == 2) {
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, ui16ptr++) {
- ulii = *ui16ptr++;
+ ulii = (*ui16ptr++) & 0x7ffe;
if (ulii) {
ulii = ((ulii / 2) - 1) * sample_ctv2;
- uljj = *ui16ptr;
+ uljj = (*ui16ptr) & 0x7ffe;
if (uljj) {
set_bit(&(base_bitfields[ulii]), sample_idx * 2);
base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9401,7 +9476,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
} else if (ujj == 1) {
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
- ulii = *ui16ptr++;
+ ulii = (*ui16ptr++) & 0x7ffe;
if (ulii) {
set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
}
@@ -9411,10 +9486,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
if (ui16ptr[2]) {
ui16ptr = &(ui16ptr[ujj]);
} else {
- ulii = *ui16ptr++;
+ ulii = (*ui16ptr++) & 0x7ffe;
if (ulii) {
ulii = ((ulii / 2) - 1) * sample_ctv2;
- uljj = *ui16ptr;
+ uljj = (*ui16ptr) & 0x7ffe;
if (uljj) {
set_bit(&(base_bitfields[ulii]), sample_idx * 2);
base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9430,10 +9505,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
uiptr = (uint32_t*)loadbuf;
if (ujj == 2) {
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, uiptr++) {
- ulii = *uiptr++;
+ ulii = (*uiptr++) & 0x7ffffffe;
if (ulii) {
ulii = ((ulii / 2) - 1) * sample_ctv2;
- uljj = *uiptr;
+ uljj = (*uiptr) & 0x7ffffffe;
if (uljj) {
set_bit(&(base_bitfields[ulii]), sample_idx * 2);
base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9444,7 +9519,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
} else if (ujj == 1) {
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
- ulii = *uiptr++;
+ ulii = (*uiptr++) & 0x7ffffffe;
if (ulii) {
set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
}
@@ -9454,10 +9529,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
if (uiptr[2]) {
uiptr = &(uiptr[ujj]);
} else {
- ulii = *uiptr++;
+ ulii = (*uiptr++) & 0x7ffffffe;
if (ulii) {
ulii = ((ulii / 2) - 1) * sample_ctv2;
- uljj = *uiptr;
+ uljj = (*uiptr) & 0x7ffffffe;
if (uljj) {
set_bit(&(base_bitfields[ulii]), sample_idx * 2);
base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -11545,9 +11620,34 @@ uint32_t valid_vcf_allele_code(const char* allele_code) {
return 1;
}
+int32_t flexbwrite_checked(const void* buf, size_t len, uint32_t output_bgz, FILE* outfile, BGZF* bgz_outfile) { // writes len bytes from buf to outfile or bgz_outfile, chosen by output_bgz; callers treat a nonzero return as a write failure
+ if (!output_bgz) { // output_bgz == 0: plain FILE* path, bgz_outfile is not touched
+ return fwrite_checked(buf, len, outfile);
+ } else { // output_bgz != 0: BGZF path, outfile is not touched
+ return (bgzf_write(bgz_outfile, buf, len) < 0); // 1 iff bgzf_write() returned a negative value, else 0
+ }
+}
+
+int32_t flexbputs_checked(const char* buf, uint32_t output_bgz, FILE* outfile, BGZF* bgz_outfile) { // writes the null-terminated string buf to outfile or bgz_outfile, chosen by output_bgz; callers treat a nonzero return as a write failure
+ if (!output_bgz) { // output_bgz == 0: plain FILE* path, bgz_outfile is not touched
+ return fputs_checked(buf, outfile);
+ } else { // output_bgz != 0: BGZF path, outfile is not touched
+ return (bgzf_write(bgz_outfile, buf, strlen(buf)) < 0); // strlen(buf) bytes are passed, so the terminator is not written; 1 iff bgzf_write() returned a negative value
+ }
+}
+
+int32_t flexbputc_checked(unsigned char ucc, uint32_t output_bgz, FILE* outfile, BGZF* bgz_outfile) { // writes the single byte ucc to outfile or bgz_outfile, chosen by output_bgz; callers treat a nonzero return as a write failure
+ if (!output_bgz) { // output_bgz == 0: plain FILE* path, bgz_outfile is not touched
+ return putc_checked(ucc, outfile);
+ } else { // output_bgz != 0: BGZF path, outfile is not touched
+ return (bgzf_write(bgz_outfile, &ucc, 1) < 0); // passes the address of the by-value parameter as a 1-byte buffer; 1 iff bgzf_write() returned a negative value
+ }
+}
+
int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* recode_allele_name, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* marker_ids, uintptr_t max_marker_id_len, double* marker_cms, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uint32_t* marker_pos, uintptr_t* marker_reverse, char* sample_ids, [...]
FILE* outfile = NULL;
FILE* outfile2 = NULL;
+ BGZF* bgz_outfile = NULL;
uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
@@ -11569,6 +11669,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
uint32_t vcf_not_fid = (recode_modifier & RECODE_VCF) && (!(recode_modifier & RECODE_FID));
uint32_t vcf_not_iid = (recode_modifier & RECODE_VCF) && (!(recode_modifier & RECODE_IID));
uint32_t vcf_two_ids = vcf_not_fid && vcf_not_iid;
+ uint32_t output_bgz = (recode_modifier / RECODE_BGZ) & 1;
uint32_t recode_012 = recode_modifier & (RECODE_01 | RECODE_12);
uint32_t set_hh_missing = (misc_flags / MISC_SET_HH_MISSING) & 1;
uint32_t real_ref_alleles = (misc_flags / MISC_REAL_REF_ALLELES) & 1;
@@ -12290,21 +12391,32 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
}
}
} else if (recode_modifier & RECODE_VCF) {
- strcpy(outname_end, ".vcf");
- if (fopen_checked(&outfile, outname, "w")) {
- goto recode_ret_OPEN_FAIL;
- }
- if (fputs_checked(
-"##fileformat=VCFv4.2\n"
-"##fileDate=", outfile)) {
- goto recode_ret_WRITE_FAIL;
+ if (!output_bgz) {
+ memcpy(outname_end, ".vcf", 5);
+ if (fopen_checked(&outfile, outname, "w")) {
+ goto recode_ret_OPEN_FAIL;
+ }
+ } else {
+ memcpy(outname_end, ".vcf.gz", 7);
+ bgz_outfile = bgzf_open(outname, "w");
+ if (!bgz_outfile) {
+ goto recode_ret_OPEN_FAIL;
+ }
+#ifndef _WIN32
+ if (g_thread_ct > 1) {
+ bgzf_mt(bgz_outfile, g_thread_ct, 128);
+ }
+#endif
}
+ wbufptr = memcpya(tbuf, "##fileformat=VCFv4.2\n##fileDate=", 32);
time(&rawtime);
loctime = localtime(&rawtime);
- strftime(tbuf, MAXLINELEN, "%Y%m%d", loctime);
- fputs(tbuf, outfile);
- fputs("\n##source=PLINKv1.90\n", outfile);
+ wbufptr += strftime(wbufptr, MAXLINELEN, "%Y%m%d", loctime);
+ wbufptr = memcpya(wbufptr, "\n##source=PLINKv1.90\n", 21);
uii = 0; // '0' written already?
+ if (flexbwrite_checked(tbuf, wbufptr - tbuf, output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
memcpy(tbuf, "##contig=<ID=", 13);
for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
@@ -12332,15 +12444,21 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
}
}
cptr = memcpya(cptr, ">\n", 2);
- fwrite(tbuf, 1, cptr - tbuf, outfile);
+ if (flexbwrite_checked(tbuf, cptr - tbuf, output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
}
if (!real_ref_alleles) {
- fputs("##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\"\n", outfile);
+ if (flexbputs_checked("##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">\n", output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
}
- fputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", outfile);
// todo: include PEDIGREE in header, and make --vcf be able to read it?
- // Can't find a specification for how this should be done...
- fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", outfile);
+ if (flexbputs_checked(
+"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
chrom_fo_idx = 0;
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
@@ -12350,9 +12468,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
ulii = strlen_se(cptr);
- putc('\t', outfile);
+ if (flexbputc_checked('\t', output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
if (vcf_not_iid) {
- fwrite(cptr, 1, ulii, outfile);
+ if (flexbwrite_checked(cptr, ulii, output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
if (vcf_two_ids) {
if (!shiftval) {
if (strchr(cptr, '_')) {
@@ -12360,14 +12482,18 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
logprint("Warning: Underscore(s) present in sample IDs.\n");
}
}
- putc('_', outfile);
+ if (flexbputc_checked('_', output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
}
}
if (vcf_not_fid) {
- fputs(&(cptr[ulii + 1]), outfile);
+ if (flexbputs_checked(&(cptr[ulii + 1]), output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
}
}
- LOGPRINTFWW5("--recode vcf%s to %s ... ", vcf_not_iid? (vcf_not_fid? "" : "-fid") : "-iid", outname);
+ LOGPRINTFWW5("--recode vcf%s%s to %s ... ", vcf_not_iid? (vcf_not_fid? "" : "-fid") : "-iid", output_bgz? " bgz" : "", outname);
fputs("0%", stdout);
fflush(stdout);
tbuf[0] = '\n';
@@ -12399,19 +12525,25 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
*wbufptr++ = '\t';
wbufptr = uint32_writex(wbufptr, marker_pos[marker_uidx], '\t');
wbufptr = strcpyax(wbufptr, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
- if (fwrite_checked(tbuf, wbufptr - tbuf, outfile)) {
+ if (flexbwrite_checked(tbuf, wbufptr - tbuf, output_bgz, outfile, bgz_outfile)) {
goto recode_ret_WRITE_FAIL;
}
cptr = mk_allele_ptrs[2 * marker_uidx + 1];
if (cptr == missing_geno_ptr) {
- putc('N', outfile);
+ if (flexbputc_checked('N', output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
} else {
if ((!invalid_allele_code_seen) && (!valid_vcf_allele_code(cptr))) {
invalid_allele_code_seen = 1;
}
- fputs(cptr, outfile);
+ if (flexbputs_checked(cptr, output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
+ }
+ if (flexbputc_checked('\t', output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
}
- putc('\t', outfile);
if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
goto recode_ret_READ_FAIL;
@@ -12428,19 +12560,22 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
// if ALT allele is not actually present in immediate dataset, VCF
// spec actually requires '.'
if (!is_monomorphic_a2(loadbuf_collapsed, sample_ct)) {
- fputs(cptr, outfile);
+ if (flexbputs_checked(cptr, output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
} else {
- putc('.', outfile);
+ if (flexbputc_checked('.', output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
}
} else {
- putc('.', outfile);
+ if (flexbputc_checked('.', output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
}
- if (!real_ref_alleles) {
- fputs("\t.\t.\tPR\tGT", outfile);
- } else {
- fputs("\t.\t.\t.\tGT", outfile);
+ if (flexbputs_checked(real_ref_alleles? "\t.\t.\t.\tGT" : "\t.\t.\tPR\tGT", output_bgz, outfile, bgz_outfile)) {
+ goto recode_ret_WRITE_FAIL;
}
-
wbufptr = writebuf;
ulptr = loadbuf_collapsed;
ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
@@ -12482,7 +12617,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
shiftmax = sample_ct % BITCT2;
}
}
- if (fwrite_checked(writebuf, wbufptr - writebuf, outfile)) {
+ if (flexbwrite_checked(writebuf, wbufptr - writebuf, output_bgz, outfile, bgz_outfile)) {
goto recode_ret_WRITE_FAIL;
}
}
@@ -12494,9 +12629,16 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
fflush(stdout);
}
}
- if (putc_checked('\n', outfile)) {
+ if (flexbputc_checked('\n', output_bgz, outfile, bgz_outfile)) {
goto recode_ret_WRITE_FAIL;
}
+ if (output_bgz) {
+ if (bgzf_close(bgz_outfile)) {
+ bgz_outfile = NULL;
+ goto recode_ret_WRITE_FAIL;
+ }
+ bgz_outfile = NULL;
+ }
} else if (recode_modifier & RECODE_OXFORD) {
memcpy(outname_end, ".gen", 5);
if (fopen_checked(&outfile, outname, "w")) {
@@ -13795,6 +13937,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
wkspace_reset(wkspace_mark);
fclose_cond(outfile2);
fclose_cond(outfile);
+ if (bgz_outfile) {
+ bgzf_close(bgz_outfile);
+ }
return retval;
}
@@ -14575,15 +14720,18 @@ static inline uint32_t merge_post_msort_update_maps(char* marker_ids, uintptr_t
uint32_t chrom_idx;
uint32_t chrom_read_end_idx;
int64_t llxx;
+ uint32_t unplaced;
uint32_t prev_bp;
uint32_t cur_bp;
uint32_t presort_idx;
for (chrom_idx = 0; chrom_idx < chrom_ct; chrom_idx++) {
- if (!IS_SET(chrom_mask, chrom_id[chrom_idx])) {
+ unplaced = chrom_id[chrom_idx]; // initially chromosome code
+ if (!IS_SET(chrom_mask, unplaced)) {
read_pos = chrom_start[chrom_idx + 1];
chrom_start[chrom_idx + 1] = write_pos;
continue;
}
+ unplaced = (unplaced == 0) || (chrom_info_ptr->zero_extra_chroms && (unplaced > chrom_info_ptr->max_code));
chrom_read_end_idx = chrom_start[chrom_idx + 1];
// ll_buf has base-pair positions in high 32 bits, and pre-sort indices in
// low 32 bits.
@@ -14596,21 +14744,19 @@ static inline uint32_t merge_post_msort_update_maps(char* marker_ids, uintptr_t
llxx = ll_buf[read_pos];
presort_idx = (uint32_t)llxx;
cur_bp = (uint32_t)(llxx >> 32);
- if (prev_bp == cur_bp) {
+ // do not merge chr 0 (unplaced).
+ if ((prev_bp == cur_bp) && (!unplaced)) {
if (merge_equal_pos && merge_alleles(marker_allele_ptrs, ((uint32_t)ll_buf[read_pos - 1]), presort_idx)) {
LOGPRINTFWW("Error: --merge-equal-pos failure. Variants '%s' and '%s' have the same position, but do not share the same alleles.\n", &(marker_ids[max_marker_id_len * presort_idx]), &(marker_ids[max_marker_id_len * ((uint32_t)ll_buf[read_pos - 1])]));
return 1;
}
- if (prev_bp) {
- // no warning if prev_bp is 0
- LOGPREPRINTFWW("Warning: Variants '%s' and '%s' have the same position.\n", &(marker_ids[max_marker_id_len * presort_idx]), &(marker_ids[max_marker_id_len * ((uint32_t)ll_buf[read_pos - 1])]));
- if (position_warning_ct < 3) {
- logprintb();
- } else {
- logstr(logbuf);
- }
- position_warning_ct++;
+ LOGPREPRINTFWW("Warning: Variants '%s' and '%s' have the same position.\n", &(marker_ids[max_marker_id_len * presort_idx]), &(marker_ids[max_marker_id_len * ((uint32_t)ll_buf[read_pos - 1])]));
+ if (position_warning_ct < 3) {
+ logprintb();
+ } else {
+ logstr(logbuf);
}
+ position_warning_ct++;
if (merge_equal_pos) {
marker_map[presort_idx] = write_pos - 1;
continue;
@@ -15357,6 +15503,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
uint32_t orig_idx = 0;
uint32_t cur_marker_ct = 0;
uint32_t tot_marker_ct = 0;
+ int32_t retval = 0;
uint32_t* map_reverse = NULL;
uintptr_t* reversed = NULL;
char* bim_loadbuf = NULL;
@@ -15410,7 +15557,6 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
unsigned char* ubufptr;
char cc;
unsigned char ucc;
- int32_t retval;
if (wkspace_alloc_ui_checked(&chrom_start, (MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t)) ||
wkspace_alloc_ui_checked(&chrom_id, MAX_POSSIBLE_CHROM * sizeof(int32_t))) {
goto merge_datasets_ret_NOMEM;
@@ -15573,7 +15719,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
LOGPRINTFWW("%u %s loaded from %s.\n", max_cur_sample_ct, species_str(max_cur_sample_ct), mergelist_fam[0]);
LOGPRINTFWW("%u %s to be merged from %s.\n", cur_sample_ct, species_str(cur_sample_ct), mergelist_fam[1]);
uii = ullxx - max_cur_sample_ct;
- LOGPRINTF("Of these, %u are new, while %u are present in the base dataset.\n", uii, cur_sample_ct - uii);
+ LOGPRINTF("Of these, %u %s new, while %u %s present in the base dataset.\n", uii, (uii == 1)? "is" : "are", cur_sample_ct - uii, (cur_sample_ct - uii == 1)? "is" : "are");
}
if (cur_sample_ct > max_cur_sample_ct) {
max_cur_sample_ct = cur_sample_ct;
@@ -15781,12 +15927,16 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
}
if (!merge_list) {
if (!mlpos) {
- uii = cur_marker_ct;
+ uii = ullxx;
} else {
LOGPRINTFWW("%u marker%s loaded from %s.\n", uii, (uii == 1)? "" : "s", mergelist_bim[0]);
LOGPRINTFWW("%u marker%s to be merged from %s.\n", cur_marker_ct, (cur_marker_ct == 1)? "" : "s", mergelist_bim[1]);
+ // bugfix: don't underflow when a single file has duplicate IDs (e.g.
+ // '.').
+ // Merging should fail anyway in that case, but we should not embarrass
+ // ourselves by printing inaccurate numbers here.
uii = ullxx - uii;
- LOGPRINTF("Of these, %u are new, while %u are present in the base dataset.\n", uii, cur_marker_ct - uii);
+ LOGPRINTF("Of these, %u %s new, while %u %s present in the base dataset.\n", uii, (uii == 1)? "is" : "are", cur_marker_ct - uii, (cur_marker_ct - uii == 1)? "is" : "are");
}
}
if (!mergelist_fam[mlpos]) {
diff --git a/plink_dosage.c b/plink_dosage.c
index b1d9041..4bf4505 100644
--- a/plink_dosage.c
+++ b/plink_dosage.c
@@ -7,6 +7,7 @@
#include "plink_glm.h"
#include "plink_matrix.h"
#include "plink_misc.h"
+#include "pigz.h"
void dosage_init(Dosage_info* doip) {
doip->fname = NULL;
@@ -422,6 +423,10 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
score_qrange_bounds[2 * ulii + 1] = ubound;
ulii++;
}
+ if (ulii != qrange_ct) {
+ // catches /dev/stdin redirection
+ goto dosage_load_score_files_ret_READ_FAIL;
+ }
if (fclose_null(&infile)) {
goto dosage_load_score_files_ret_READ_FAIL;
}
@@ -461,9 +466,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
// there's no long-term maintenance problem
FILE* phenofile = NULL;
FILE* infile = NULL;
- FILE* outfile = NULL;
+ FILE* profile_outfile = NULL;
gzFile* gz_infiles = NULL;
- gzFile gz_outfile = NULL;
char* marker_ids = NULL;
char* sample_ids = NULL;
char* paternal_ids = NULL;
@@ -478,6 +482,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
char** score_allele_codes = NULL;
char* a1_ptr = NULL;
char* a2_ptr = NULL;
+ char* pzwritep = NULL;
uintptr_t* marker_exclude = NULL;
uintptr_t* sample_exclude = NULL;
uintptr_t* sex_nm = NULL;
@@ -606,7 +611,9 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
__CLPK_integer dgels_lwork;
#endif
char missing_pheno_str[32];
+ Pigz_state ps;
unsigned char* wkspace_mark;
+ unsigned char* overflow_buf;
char* fnames;
char* loadbuf;
char* bufptr;
@@ -663,6 +670,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
uint32_t uii;
uint32_t ukk;
int32_t ii;
+ pzwrite_init_null(&ps);
if (load_map) {
retval = load_bim(mapname, &map_cols, &unfiltered_marker_ct, &marker_exclude_ct, &max_marker_id_len, &marker_exclude, NULL, NULL, NULL, &ulii, &marker_ids, NULL, 0, NULL, chrom_info_ptr, NULL, &marker_pos, misc_flags, filter_flags, marker_pos_start, marker_pos_end, snp_window_size, markername_from, markername_to, markername_snp, snps_range_list_ptr, &map_is_unsorted, do_glm || min_bp_space || (misc_flags & (MISC_EXTRACT_RANGE | MISC_EXCLUDE_RANGE)), 0, 0, NULL, ".map file", NULL);
if (retval) {
@@ -796,7 +804,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
if (excludename) {
if (!(misc_flags & MISC_EXCLUDE_RANGE)) {
- retval = extract_exclude_flag_norange(extractname, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+ retval = extract_exclude_flag_norange(excludename, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -1330,7 +1338,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
memcpy(fnames, doip->fname, uii);
infile_ct = 1;
}
- if (wkspace_alloc_ui_checked(&file_icts, max_batch_size * sizeof(int32_t)) ||
+ if (wkspace_alloc_uc_checked(&overflow_buf, 2 * PIGZ_BLOCK_SIZE) ||
+ wkspace_alloc_ui_checked(&file_icts, max_batch_size * sizeof(int32_t)) ||
wkspace_alloc_ul_checked(&line_idx_arr, max_batch_size * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&batch_samples, sample_ctl * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&cur_samples, sample_ctl * sizeof(intptr_t)) ||
@@ -1463,7 +1472,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
bufptr = memcpya(bufptr, " A1 A2 FRQ INFO ", 28);
bufptr = memcpya(bufptr, pheno_c? " OR" : "BETA", 4);
- bufptr = memcpya(bufptr, " SE P\n", 17);
+ bufptr = memcpya(bufptr, " SE P", 16);
+ append_binary_eoln(&bufptr);
bufptr2 = memcpyb(outname_end, ".assoc.dosage", 14);
} else if (count_occur) {
// could just use a uint32_t array if .map provided
@@ -1485,56 +1495,29 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
if (output_gz) {
memcpy(bufptr2, ".gz", 4);
- if (gzopen_checked(&gz_outfile, outname, "wb")) {
- goto plink1_dosage_ret_OPEN_FAIL;
- }
+ }
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+ goto plink1_dosage_ret_OPEN_FAIL;
+ }
+ pzwritep = (char*)overflow_buf;
+
+ if (!do_score) {
if (do_glm) {
- if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- if (gzputs(gz_outfile, "SNP A1 A2 ") == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
+ pzwritep = memcpya(pzwritep, tbuf, bufptr - tbuf);
+ } else if (!count_occur) {
+ pzwritep = memcpya(pzwritep, "SNP A1 A2 ", 10);
for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
next_unset_unsafe_ck(sample_exclude, &sample_uidx);
bufptr = &(sample_ids[sample_uidx * max_sample_id_len]);
bufptr2 = strchr(bufptr, '\t');
- *bufptr2 = ' ';
- if (gzputs(gz_outfile, bufptr) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (gzputc(gz_outfile, ' ') == -1) {
+ pzwritep = memcpya(pzwritep, bufptr, bufptr2 - bufptr);
+ *pzwritep++ = ' ';
+ pzwritep = strcpyax(pzwritep, &(bufptr2[1]), ' ');
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto plink1_dosage_ret_WRITE_FAIL;
}
- *bufptr2 = '\t';
- }
- if (gzputc(gz_outfile, '\n') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- }
- } else if (!do_score) {
- if (fopen_checked(&outfile, outname, "w")) {
- goto plink1_dosage_ret_OPEN_FAIL;
- }
- if (do_glm) {
- if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else if (!count_occur) {
- fputs("SNP A1 A2 ", outfile);
- for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
- next_unset_unsafe_ck(sample_exclude, &sample_uidx);
- bufptr = &(sample_ids[sample_uidx * max_sample_id_len]);
- bufptr2 = strchr(bufptr, '\t');
- *bufptr2 = ' ';
- fputs(bufptr, outfile);
- putc(' ', outfile);
- *bufptr2 = '\t';
- }
- if (putc_checked('\n', outfile)) {
- goto plink1_dosage_ret_WRITE_FAIL;
}
+ append_binary_eoln(&pzwritep);
}
}
wkspace_mark = wkspace_base;
@@ -1923,92 +1906,52 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
#endif
if (load_map) {
- bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_idx)));
- *bufptr++ = ' ';
- bufptr = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, bufptr);
- bufptr = memseta(bufptr, 32, 2);
- bufptr = uint32_writew10(bufptr, marker_pos[marker_idx]);
+ pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_idx)));
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, pzwritep);
+ pzwritep = memseta(pzwritep, 32, 2);
+ pzwritep = uint32_writew10(pzwritep, marker_pos[marker_idx]);
} else {
- tbuf[0] = ' ';
- bufptr = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, &(tbuf[1]));
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, pzwritep);
}
- *bufptr++ = ' ';
- *bufptr = '\0';
- if (output_gz) {
- if (gzputs(gz_outfile, tbuf) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (a1_len < 3) {
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (a1_len == 1) {
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- }
- }
- if (gzputs(gz_outfile, a1_ptr) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (a2_len < 3) {
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (a2_len == 1) {
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- }
- }
- if (gzputs(gz_outfile, a2_ptr) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- fputs(tbuf, outfile);
- if (a1_len < 3) {
- putc(' ', outfile);
- if (a1_len == 1) {
- putc(' ', outfile);
- }
+ *pzwritep++ = ' ';
+ if (a1_len < 3) {
+ *pzwritep++ = ' ';
+ if (a1_len == 1) {
+ *pzwritep++ = ' ';
}
- fputs(a1_ptr, outfile);
- putc(' ', outfile);
- if (a2_len < 3) {
- putc(' ', outfile);
- if (a2_len == 1) {
- putc(' ', outfile);
- }
+ }
+ if (flex_pzputs_allele(&ps, &pzwritep, a1_ptr, a1_len)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
+ }
+ *pzwritep++ = ' ';
+ if (a2_len < 3) {
+ *pzwritep++ = ' ';
+ if (a2_len == 1) {
+ *pzwritep++ = ' ';
}
- fputs(a2_ptr, outfile);
- }
- bufptr = tbuf;
- *bufptr++ = ' ';
- bufptr = double_f_writew74(bufptr, dzz);
- *bufptr++ = ' ';
- bufptr = double_f_writew74(bufptr, rsq);
- *bufptr++ = ' ';
+ }
+ if (flex_pzputs_allele(&ps, &pzwritep, a2_ptr, a2_len)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
+ }
+ *pzwritep++ = ' ';
+ pzwritep = double_f_writew74(pzwritep, dzz);
+ *pzwritep++ = ' ';
+ pzwritep = double_f_writew74(pzwritep, rsq);
+ *pzwritep++ = ' ';
if (is_valid) {
- bufptr = double_f_writew74(bufptr, pheno_c? exp(beta * 0.5) : (beta * 0.5));
- *bufptr++ = ' ';
- bufptr = double_f_writew74(bufptr, se * 0.5);
- *bufptr++ = ' ';
- bufptr = double_g_writewx4(bufptr, MAXV(pval, output_min_p), 7);
- bufptr = memcpya(bufptr, "\n", 2);
+ pzwritep = double_f_writew74(pzwritep, pheno_c? exp(beta * 0.5) : (beta * 0.5));
+ *pzwritep++ = ' ';
+ pzwritep = double_f_writew74(pzwritep, se * 0.5);
+ *pzwritep++ = ' ';
+ pzwritep = double_g_writewx4(pzwritep, MAXV(pval, output_min_p), 7);
} else {
- bufptr = memcpya(bufptr, " NA NA NA\n", 25);
+ pzwritep = memcpya(pzwritep, " NA NA NA", 23);
}
- if (output_gz) {
- if (gzputs(gz_outfile, tbuf) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- if (fputs_checked(tbuf, outfile)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
}
} else if (do_score) {
sample_valid_ct = popcount_longs(cur_samples, sample_ctl);
@@ -2055,32 +1998,17 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
} while (++qrange_idx < qrange_ct);
} else if (!count_occur) {
- if (output_gz) {
- if (gzputs(gz_outfile, cur_marker_id_buf) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (gzputs(gz_outfile, a1_ptr) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (gzputs(gz_outfile, a2_ptr) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- if (gzputc(gz_outfile, ' ') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- fputs(cur_marker_id_buf, outfile);
- putc(' ', outfile);
- fputs(a1_ptr, outfile);
- putc(' ', outfile);
- fputs(a2_ptr, outfile);
- putc(' ', outfile);
+ pzwritep = strcpyax(pzwritep, cur_marker_id_buf, ' ');
+ if (flex_pzputs_allele(&ps, &pzwritep, a1_ptr, a1_len)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
+ }
+ *pzwritep++ = ' ';
+ if (flex_pzputs_allele(&ps, &pzwritep, a2_ptr, a2_len)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
+ }
+ *pzwritep++ = ' ';
+ if (flex_pzwrite(&ps, &pzwritep)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
}
ulii = 0;
// could make output format independent of input format (other than
@@ -2091,94 +2019,67 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
sample_idx = 0;
do {
ulii += MAXLINELEN / 16;
- bufptr = tbuf;
if (ulii > sample_ct) {
ulii = sample_ct;
}
for (; sample_idx < ulii; sample_idx++) {
if (!is_set(cur_samples, sample_idx)) {
- bufptr = memcpyl3a(bufptr, "NA ");
+ pzwritep = memcpyl3a(pzwritep, "NA ");
} else {
- bufptr = double_g_writex(bufptr, 2 * cur_dosages[sample_idx], ' ');
+ pzwritep = double_g_writex(pzwritep, 2 * cur_dosages[sample_idx], ' ');
}
}
- if (output_gz) {
- if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
+ if (flex_pzwrite(&ps, &pzwritep)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
}
} while (ulii < sample_ct);
} else if (format_val == 2) {
sample_idx = 0;
do {
ulii += MAXLINELEN / 32;
- bufptr = tbuf;
if (ulii > sample_ct) {
ulii = sample_ct;
}
for (; sample_idx < ulii; sample_idx++) {
if (!is_set(cur_samples, sample_idx)) {
- bufptr = memcpya(bufptr, "NA NA ", 6);
+ pzwritep = memcpya(pzwritep, "NA NA ", 6);
} else {
- bufptr = double_g_writex(bufptr, cur_dosages[sample_idx], ' ');
- bufptr = double_g_writex(bufptr, cur_dosages2[sample_idx], ' ');
+ pzwritep = double_g_writex(pzwritep, cur_dosages[sample_idx], ' ');
+ pzwritep = double_g_writex(pzwritep, cur_dosages2[sample_idx], ' ');
}
}
- if (output_gz) {
- if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
+ if (flex_pzwrite(&ps, &pzwritep)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
}
} while (ulii < sample_ct);
} else {
sample_idx = 0;
do {
ulii += MAXLINELEN / 48;
- bufptr = tbuf;
if (ulii > sample_ct) {
ulii = sample_ct;
}
for (; sample_idx < ulii; sample_idx++) {
if (!is_set(cur_samples, sample_idx)) {
- bufptr = memcpya(bufptr, "NA NA NA ", 9);
+ pzwritep = memcpya(pzwritep, "NA NA NA ", 9);
} else {
dxx = cur_dosages[sample_idx];
- bufptr = double_g_writex(bufptr, dxx, ' ');
+ pzwritep = double_g_writex(pzwritep, dxx, ' ');
dyy = cur_dosages2[sample_idx];
- bufptr = double_g_writex(bufptr, dyy, ' ');
+ pzwritep = double_g_writex(pzwritep, dyy, ' ');
dxx = 1.0 - dxx - dyy;
if (fabs(dxx) < SMALL_EPSILON) {
dxx = 0.0;
}
- bufptr = double_g_writex(bufptr, dxx, ' ');
+ pzwritep = double_g_writex(pzwritep, dxx, ' ');
}
}
- if (output_gz) {
- if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
+ if (flex_pzwrite(&ps, &pzwritep)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
}
} while (ulii < sample_ct);
}
- if (output_gz) {
- if (gzputc(gz_outfile, '\n') == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- putc('\n', outfile);
- }
+ append_binary_eoln(&pzwritep);
}
}
if (a1_ptr) {
@@ -2211,11 +2112,13 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
} else {
memcpy(outname_end, ".profile", 9);
}
- if (fopen_checked(&outfile, outname, "w")) {
+ // this is not affected by 'gz' in PLINK 1.07; retain that for backward
+ // compatibility.
+ if (fopen_checked(&profile_outfile, outname, "w")) {
goto plink1_dosage_ret_OPEN_FAIL;
}
sprintf(tbuf, "%%%us %%%us PHENO%s %s\n", plink_maxfid, plink_maxiid, dosage_score_cnt? " CNT" : "", score_report_average? " SCORE" : "SCORESUM");
- fprintf(outfile, tbuf, "FID", "IID");
+ fprintf(profile_outfile, tbuf, "FID", "IID");
uii = score_range_obs_cts[qrange_idx];
uiptr = &(score_miss_cts[sample_ct * qrange_idx]);
dxx = score_bases[qrange_idx];
@@ -2254,11 +2157,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
bufptr = width_force(8, bufptr, double_g_write(bufptr, dyy));
*bufptr++ = '\n';
- if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+ if (fwrite_checked(tbuf, bufptr - tbuf, profile_outfile)) {
goto plink1_dosage_ret_WRITE_FAIL;
}
}
- if (fclose_null(&outfile)) {
+ if (fclose_null(&profile_outfile)) {
goto plink1_dosage_ret_WRITE_FAIL;
}
LOGPRINTFWW("--score: Results written to %s .\n", outname);
@@ -2290,27 +2193,16 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
for (ulii = 0; ulii < distinct_id_ct; ulii++) {
bufptr2 = &(bufptr[ulii * max_occur_id_len]);
slen = strlen(bufptr2);
- bufptr3 = memcpyax(tbuf, bufptr2, slen, ' ');
- bufptr3 = uint32_write(bufptr3, *((uint32_t*)(&(bufptr2[slen + 1]))));
- memcpy(bufptr3, "\n", 2);
- if (output_gz) {
- if (gzputs(gz_outfile, tbuf) == -1) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- } else {
- fputs(tbuf, outfile);
+ pzwritep = memcpyax(pzwritep, bufptr2, slen, ' ');
+ pzwritep = uint32_write(pzwritep, *((uint32_t*)(&(bufptr2[slen + 1]))));
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
}
}
}
- if (output_gz) {
- if (gzclose(gz_outfile) != Z_OK) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
- gz_outfile = NULL;
- } else {
- if (fclose_null(&outfile)) {
- goto plink1_dosage_ret_WRITE_FAIL;
- }
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
+ goto plink1_dosage_ret_WRITE_FAIL;
}
LOGPRINTFWW("--%sdosage%s: Results saved to %s .\n", (do_glm || count_occur)? "" : "write-", count_occur? " occur" : "", outname);
if (count_occur) {
@@ -2369,8 +2261,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
free_cond(pheno_d);
fclose_cond(phenofile);
fclose_cond(infile);
- fclose_cond(outfile);
- gzclose_cond(gz_outfile);
+ fclose_cond(profile_outfile);
+ flex_pzwrite_close_cond(&ps, pzwritep);
if (a1_ptr && a1_ptr[1]) {
free(a1_ptr);
}
diff --git a/plink_family.c b/plink_family.c
index eb764d3..fe36eda 100644
--- a/plink_family.c
+++ b/plink_family.c
@@ -1,6 +1,7 @@
#include "plink_common.h"
#include "plink_assoc.h"
+#include "plink_cluster.h"
#include "plink_family.h"
#include "plink_stats.h"
@@ -11,6 +12,8 @@ void family_init(Family_info* fam_ip) {
fam_ip->mendel_modifier = 0;
fam_ip->tdt_modifier = 0;
fam_ip->tdt_mperm_val = 0;
+ fam_ip->dfam_modifier = 0;
+ fam_ip->dfam_mperm_val = 0;
fam_ip->qfam_modifier = 0;
fam_ip->qfam_mperm_val = 0;
}
@@ -60,6 +63,9 @@ const uint32_t mendel_error_table_x[] =
0x5000001, 0, 0x2010101, 0};
int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* founder_info, uintptr_t* sex_nm, uintptr_t* sex_male, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, char** fids_ptr, uintptr_t* max_fid_len_ptr, char** iids_ptr, uintptr_t* max_iid_len_ptr, uint64_t** family_list_ptr, uint32_t* family_ct_ptr, uint64_t** trio_list_ptr [...]
+ // This mirrors linkRelateds() in genedrop.cpp, and parseTrios() in trio.cpp,
+ // in PLINK 1.07.
+ //
// family_list has paternal indices in low 32 bits, maternal indices in high
// 32, sorted in child ID order.
// trio_list has child IDs in low 32 bits, family_list indices in high 32
@@ -78,7 +84,8 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
// PLINK 1.07 enforces <= 1 father and <= 1 mother per sample (and ambiguous
// sex parents are not permitted), but the IDs CAN be reversed in the .fam
// with no adverse consequences. For backward compatibility, we replicate
- // this. (Todo: report a warning exactly once when this happens.)
+ // this. (Possible todo: report a warning exactly once when this happens.)
+ // It won't be replicated in PLINK 2.0.
unsigned char* wkspace_mark = wkspace_base;
uint64_t* edge_list = NULL;
uint32_t* toposort_queue = NULL;
@@ -1065,18 +1072,31 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
for (uii = 0; uii < family_ct; uii++) {
family_code = family_list[uii];
ujj = (uint32_t)family_code; // paternal uidx
+ ukk = (uint32_t)(family_code >> 32); // maternal uidx
if (ujj < unfiltered_sample_ct) {
// bleah, fids[] isn't in right order for this lookup
cptr = &(sample_ids[ujj * max_sample_id_len]);
wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, tbuf);
} else {
- wptr = memseta(tbuf, 32, plink_maxfid - 1);
- *wptr++ = '0';
+ cptr = &(sample_ids[ukk * max_sample_id_len]);
+ wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, tbuf);
+ // wptr = memseta(tbuf, 32, plink_maxfid - 1);
+ // *wptr++ = '0';
}
*wptr++ = ' ';
- wptr = fw_strcpy(plink_maxiid, &(iids[ujj * max_iid_len]), wptr);
+ if (ujj != unfiltered_sample_ct) {
+ wptr = fw_strcpy(plink_maxiid, &(iids[ujj * max_iid_len]), wptr);
+ } else {
+ wptr = memseta(wptr, 32, plink_maxiid - 1);
+ *wptr++ = '0';
+ }
*wptr++ = ' ';
- wptr = fw_strcpy(plink_maxiid, &(iids[((uintptr_t)(family_code >> 32)) * max_iid_len]), wptr);
+ if (ukk != unfiltered_sample_ct) {
+ wptr = fw_strcpy(plink_maxiid, &(iids[ukk * max_iid_len]), wptr);
+ } else {
+ wptr = memseta(wptr, 32, plink_maxiid - 1);
+ *wptr++ = '0';
+ }
*wptr++ = ' ';
wptr = uint32_writew6x(wptr, child_cts[uii], ' ');
if (family_error_cts[uii * 3] < 10000) {
@@ -1704,7 +1724,7 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
return 0;
}
-int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct_ax, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_male_include2, uint32_t* trio_nuclear_lookup, uint32_t family_ct, uint32_t [...]
+int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct_ax, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_male_include2, uint32_t* trio_nuclear_lookup, uint32_t family_ct, Aperm_in [...]
FILE* outfile = NULL;
uint64_t mendel_error_ct = 0;
double pat_a2transmit_recip = 0.0;
@@ -1918,7 +1938,7 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
if (pct < 100) {
printf("\b\b%" PRIuPTR "%%", pct);
fflush(stdout);
- pct_thresh = ((++pct) * ((uint64_t)markers_done)) / 100;
+ pct_thresh = ((++pct) * ((uint64_t)marker_ct_ax)) / 100;
}
}
if (++marker_uidx == chrom_end) {
@@ -1956,7 +1976,7 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
return retval;
}
-int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
+int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
unsigned char* wkspace_mark = wkspace_base;
FILE* outfile = NULL;
char* textbuf = tbuf;
@@ -1977,6 +1997,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
uint32_t is_exact = fam_ip->tdt_modifier & TDT_EXACT;
uint32_t is_midp = fam_ip->tdt_modifier & TDT_MIDP;
uint32_t poo_test = fam_ip->tdt_modifier & TDT_POO;
+ // uint32_t perm_count = fam_ip->tdt_modifier & TDT_PERM_COUNT;
uint32_t case_trio_ct = 0;
uint32_t is_discordant = 0;
uint32_t discord_exists = 0;
@@ -2182,7 +2203,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
}
}
if (poo_test) {
- retval = tdt_poo(threads, bedfile, bed_offset, outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_male_include2, trio_nuclear_lookup, family_ct, mperm_save, sample_ids, max_sample_id_len, chrom_info_ptr, hh_exists, fam_ip, loadbuf, workbuf, textbuf, orig_chisq, trio_error_lookup, trio_ct);
+ retval = tdt_poo(threads, bedfile, bed_offset, outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_male_include2, trio_nuclear_lookup, family_ct, apip, mperm_save, sample_ids, max_sample_id_len, chrom_info_ptr, hh_exists, fam_ip, loadbuf, workbuf, textbuf, orig_chisq, trio_error_lookup, trio_ct);
if (retval) {
goto tdt_ret_1;
}
@@ -2445,7 +2466,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
if (pct < 100) {
printf("\b\b%" PRIuPTR "%%", pct);
fflush(stdout);
- pct_thresh = ((++pct) * ((uint64_t)markers_done)) / 100;
+ pct_thresh = ((++pct) * ((uint64_t)marker_ct)) / 100;
}
}
if (++marker_uidx == chrom_end) {
@@ -2507,14 +2528,17 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
return retval;
}
-int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* founder_info, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t max_fid_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uint64_t* family_list, uint64_t* trio_list, uint32_t family_ct, uintptr_t trio_ct, uint32_t test_type, uintptr_t** lm_eligible_ptr, uintptr_t** lm_withi [...]
- // on top of get_trios_and_families()'s return values, we need the following
- // information for the main qfam() loop:
+int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pheno_nm, uintptr_t* founder_info, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t max_fid_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uint64_t* family_list, uint64_t* trio_list, uint32_t family_ct, uintptr_t trio_ct, uint32_t test_type, uintptr_t** lm_eligible_ptr, uintptr_t** lm_within2_founder_ptr, u [...]
+ // On top of get_trios_and_families()'s return values, we need the following
+ // information for the main dfam() and qfam() loops:
// 1. sample idx -> family/sibship idx array
// 2. fs_starts[]/fs_contents[] arrays describing family/sibship idx ->
// sample idxs mapping.
- // we may as well sort size-1 sibships/singleton founders to the end; this
+ // We may as well sort size-1 sibships/singleton founders to the end; this
// lets us get away with a smaller fs_starts[] array and a faster loop.
+ // There is also some qfam-specific initialization here (e.g. a divorcee with
+ // children from two different spouses may be excluded from the linear
+ // model). test_type is zero for dfam and nonzero for qfam.
uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
uintptr_t max_merged_id_len = max_fid_len + max_paternal_id_len + max_maternal_id_len + sizeof(int32_t);
@@ -2522,6 +2546,7 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
uintptr_t topsize = 0;
uintptr_t* tmp_within2_founder = NULL;
uintptr_t* lm_within2_founder = NULL;
+ uintptr_t* lm_eligible = NULL;
uint32_t is_within2 = (test_type == QFAM_WITHIN2);
uint32_t family_idx = 0;
uint32_t fssc_idx = 0;
@@ -2530,7 +2555,6 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
char* bufptr;
char* bufptr2;
char* bufptr3;
- uintptr_t* lm_eligible;
uintptr_t* not_in_family;
uintptr_t* ulptr;
uintptr_t* ulptr2;
@@ -2550,20 +2574,34 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
uint32_t ujj;
uint32_t ukk;
uint32_t umm;
- if (is_within2) {
- if (wkspace_alloc_ul_checked(&lm_within2_founder, sample_ctl * sizeof(intptr_t))) {
- goto get_sibship_info_ret_NOMEM2;
+ if (test_type) {
+ if (is_within2) {
+ if (wkspace_alloc_ul_checked(&lm_within2_founder, sample_ctl * sizeof(intptr_t))) {
+ goto get_sibship_info_ret_NOMEM2;
+ }
+ }
+ if (wkspace_alloc_ul_checked(&lm_eligible, sample_ctl * sizeof(intptr_t))) {
+ goto get_sibship_info_ret_NOMEM;
}
}
- if (wkspace_alloc_ul_checked(&lm_eligible, sample_ctl * sizeof(intptr_t)) ||
- // shrink later
- wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
- goto get_sibship_info_ret_NOMEM;
- }
- // this is the equivalent of PLINK 1.07's family pointers
- sample_to_fss_idx = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
- if (!sample_to_fss_idx) {
- goto get_sibship_info_ret_NOMEM;
+ if (test_type) {
+ // shrink later
+ if (wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
+ goto get_sibship_info_ret_NOMEM;
+ }
+ // this is the equivalent of PLINK 1.07's family pointers
+ sample_to_fss_idx = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
+ if (!sample_to_fss_idx) {
+ goto get_sibship_info_ret_NOMEM;
+ }
+ } else {
+ if (wkspace_alloc_ui_checked(&sample_to_fss_idx, sample_ct * sizeof(int32_t))) {
+ goto get_sibship_info_ret_NOMEM;
+ }
+ // shrink later
+ if (wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
+ goto get_sibship_info_ret_NOMEM;
+ }
}
topsize_bak = topsize;
not_in_family = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctl * sizeof(intptr_t));
@@ -2605,13 +2643,15 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
if (family_ct) {
while (1) {
ullii = family_list[family_idx];
+ // uii, ukk = unfiltered idxs of parents
+ // ujj, umm = filtered idxs
uii = (uint32_t)ullii;
ujj = sample_uidx_to_idx[uii];
fss_contents[fssc_idx++] = ujj;
ukk = (uint32_t)(ullii >> 32);
umm = sample_uidx_to_idx[ukk];
if (is_within2) {
- if (is_set(pheno_nm, uii) && is_set(pheno_nm, ukk) && (pheno_d[uii] != pheno_d[ukk])) {
+ if (is_set(pheno_nm, uii) && is_set(pheno_nm, ukk)) {
set_bit(tmp_within2_founder, uii);
set_bit(tmp_within2_founder, ukk);
}
@@ -2661,10 +2701,12 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
collapse_copy_bitarr(unfiltered_sample_ct, tmp_within2_founder, sample_exclude, sample_ct, lm_within2_founder);
}
bitfield_andnot_reversed_args(ulptr, pheno_nm, unfiltered_sample_ctl);
- if (test_type == QFAM_WITHIN1) {
- bitfield_andnot(ulptr, founder_info, unfiltered_sample_ctl);
+ if (test_type) {
+ if (test_type == QFAM_WITHIN1) {
+ bitfield_andnot(ulptr, founder_info, unfiltered_sample_ctl);
+ }
+ collapse_copy_bitarr(unfiltered_sample_ct, ulptr, sample_exclude, sample_ct, lm_eligible);
}
- collapse_copy_bitarr(unfiltered_sample_ct, ulptr, sample_exclude, sample_ct, lm_eligible);
topsize = ulii;
memcpy(ulptr, not_in_family, unfiltered_sample_ctl * sizeof(intptr_t));
@@ -2738,33 +2780,38 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
*fs_ct_ptr = family_idx;
fs_starts[family_idx] = fssc_idx;
wkspace_shrink_top(fs_starts, (family_idx + 1) * sizeof(int32_t));
- // now iterate through not_in_family
- ulii = popcount_longs(not_in_family, unfiltered_sample_ctl);
- for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
- next_set_ul_unsafe_ck(not_in_family, &sample_uidx);
- ujj = sample_uidx_to_idx[sample_uidx];
- fss_contents[fssc_idx++] = ujj;
- sample_to_fss_idx[ujj] = family_idx + sample_idx;
- }
- *singleton_ct_ptr = ulii;
- // finally, collapse sample_to_fss_idx to sample_lm_to_fss_idx
- topsize = topsize_bak;
- wkspace_left -= topsize;
- ulii = popcount_longs(lm_eligible, sample_ctl);
- if (wkspace_alloc_ui_checked(&sample_lm_to_fss_idx, ulii * sizeof(int32_t))) {
- goto get_sibship_info_ret_NOMEM2;
- }
- wkspace_left += topsize;
- for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
- next_set_ul_unsafe_ck(lm_eligible, &sample_uidx);
- sample_lm_to_fss_idx[sample_idx] = sample_to_fss_idx[sample_uidx];
+ if (test_type) {
+ // for qfam, save singletons, and collapse sample_to_fss_idx to
+ // sample_lm_to_fss_idx
+ ulii = popcount_longs(not_in_family, unfiltered_sample_ctl);
+ for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
+ next_set_ul_unsafe_ck(not_in_family, &sample_uidx);
+ ujj = sample_uidx_to_idx[sample_uidx];
+ fss_contents[fssc_idx++] = ujj;
+ sample_to_fss_idx[ujj] = family_idx + sample_idx;
+ }
+ *singleton_ct_ptr = ulii;
+ topsize = topsize_bak;
+ wkspace_left -= topsize;
+ ulii = popcount_longs(lm_eligible, sample_ctl);
+ if (wkspace_alloc_ui_checked(&sample_lm_to_fss_idx, ulii * sizeof(int32_t))) {
+ goto get_sibship_info_ret_NOMEM2;
+ }
+ wkspace_left += topsize;
+ for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
+ next_set_ul_unsafe_ck(lm_eligible, &sample_uidx);
+ sample_lm_to_fss_idx[sample_idx] = sample_to_fss_idx[sample_uidx];
+ }
+ *lm_eligible_ptr = lm_eligible;
+ *lm_within2_founder_ptr = lm_within2_founder;
+ *sample_lm_to_fss_idx_ptr = sample_lm_to_fss_idx;
+ *lm_ct_ptr = ulii;
+ } else {
+ // return sample_to_fss_idx in place of sample_lm_to_fss_idx
+ *sample_lm_to_fss_idx_ptr = sample_to_fss_idx;
}
- *lm_eligible_ptr = lm_eligible;
- *lm_within2_founder_ptr = lm_within2_founder;
*fs_starts_ptr = fs_starts;
*fss_contents_ptr = fss_contents;
- *sample_lm_to_fss_idx_ptr = sample_lm_to_fss_idx;
- *lm_ct_ptr = ulii;
// topsize = 0;
while (0) {
@@ -2777,6 +2824,969 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
return retval;
}
+// multithread globals
+/* NOTE(review): disabled; within this section only the commented-out dfam() body refers to any of these.
+static double* g_maxt_extreme_stat;
+static double* g_maxt_thread_results;
+static double* g_mperm_save_all;
+static uintptr_t* g_pheno_c;
+*/
+
+static uintptr_t* g_loadbuf; // NOTE(review): the commented-out dfam() body sizes this as MODEL_BLOCKSIZE genotype rows; qfam usage is not visible here
+static uintptr_t* g_lm_eligible; // presumably holds get_sibship_info()'s lm_eligible output - confirm
+static uintptr_t* g_lm_within2_founder; // presumably holds get_sibship_info()'s lm_within2_founder output - confirm
+static uintptr_t* g_qfam_flip;
+static uintptr_t* g_nm_fss;
+static uintptr_t* g_nm_lm;
+static uint32_t* g_qfam_permute;
+static uint32_t* g_permute_edit;
+static uint32_t* g_perm_2success_ct;
+static uint32_t* g_perm_attempt_ct; // dfam() draft: one entry per marker, initialised to apip->max
+static uint32_t* g_fs_starts; // presumably get_sibship_info()'s fs_starts output - confirm
+static uint32_t* g_fss_contents; // presumably get_sibship_info()'s fss_contents output - confirm
+static uint32_t* g_sample_lm_to_fss_idx; // presumably get_sibship_info()'s sample_lm_to_fss_idx output - confirm
+static unsigned char* g_perm_adapt_stop; // dfam() draft: one byte per marker, zero-initialised; markers with a nonzero byte are skipped when loading
+static uint32_t g_adapt_m_table[MODEL_BLOCKSIZE]; // dfam() draft stores marker_idx2 for each marker of the current block
+static double* g_orig_stat; // dfam() draft: one double per marker
+static double* g_pheno_d2;
+static double* g_qfam_b; // qfam_thread() works on the slice starting at tidx * CACHEALIGN32_DBL(fss_ct)
+static double* g_qfam_w; // qfam_thread() works on the slice starting at tidx * CACHEALIGN32_DBL(lm_ct)
+static double* g_beta_sum; // NOTE(review): no use of g_beta_sum/g_beta_ssq/g_beta_fail_cts is visible in this section - document once the consumer is confirmed
+static double* g_beta_ssq;
+static uint32_t* g_beta_fail_cts;
+static uintptr_t g_cur_perm_ct;
+static double g_qt_sum_all;
+static double g_qt_ssq_all;
+static uint32_t g_test_type;
+static uint32_t g_qfam_thread_ct; // read by qfam_thread()
+static uint32_t g_fs_ct;
+static uint32_t g_singleton_ct;
+static uint32_t g_lm_ct;
+static uint32_t g_family_ct;
+static uint32_t g_block_size;
+static uint32_t g_perms_done;
+static uint32_t g_first_adapt_check; // dfam() draft: the larger of apip->min and apip->init_interval (the latter truncated to int32_t)
+static double g_adaptive_intercept; // dfam() draft: apip->init_interval
+static double g_adaptive_slope; // dfam() draft: apip->interval_slope
+static double g_aperm_alpha; // dfam() draft: apip->alpha
+static double g_adaptive_ci_zt; // dfam() draft: ltqnorm(1 - apip->beta / (2 * marker_ct))
+
+// Parental A1 allele count, looked up by the dfam() draft as [4 * paternal_geno + maternal_geno]; 0 unless a parent is het and neither is missing.
+// Tried encoding this in a single 32-bit integer, but that appears to be slower.
+const uint8_t dfam_allele_ct_table[] =
+{0, 0, 3, 0, // father 0 (hom A1): only a het mother (2) counts, 2 + 1 = 3
+ 0, 0, 0, 0, // father 1 (missing): always 0
+ 3, 0, 2, 1, // father 2 (het): mother hom A1 -> 3, missing -> 0, het -> 2, hom A2 -> 1
+ 0, 0, 1, 0}; // father 3 (hom A2): only a het mother (2) counts, 0 + 1 = 1
+
+void dfam_sibship_calc(uint32_t cur_case_ct, uint32_t case_hom_a1_ct, uint32_t case_het_ct, uint32_t cur_ctrl_ct, uint32_t ctrl_hom_a1_ct, uint32_t ctrl_het_ct, uint32_t* total_a1_count_ptr, double* numer_ptr, double* denom_ptr, double* total_expected_ptr) { // Adds one sibship's case A1-allele statistics to the caller's running totals; all four pointers are in/out accumulators.
+ if (!cur_ctrl_ct) { // no controls were counted: leave every accumulator untouched
+ return;
+ }
+ uint32_t hom_a1_ct = case_hom_a1_ct + ctrl_hom_a1_ct; // cases + controls
+ uint32_t het_ct = case_het_ct + ctrl_het_ct; // cases + controls
+ uint32_t total_ct = cur_case_ct + cur_ctrl_ct;
+ uint32_t case_a1_ct = 2 * case_hom_a1_ct + case_het_ct; // two A1 alleles per hom A1 case, one per het case
+ *total_a1_count_ptr += case_a1_ct; // observed count is added before the identical-genotype shortcut below
+ if (((!hom_a1_ct) && (!het_ct)) || (het_ct == total_ct) || (hom_a1_ct == total_ct)) { // every counted sibling has the same genotype (no hom A1 and no het, all het, or all hom A1)
+ *total_expected_ptr += (double)((int32_t)case_a1_ct); // expected := observed; numer and denom receive nothing
+ return;
+ }
+ double hom_a1_ctd = (double)((int32_t)hom_a1_ct); // NOTE(review): the uint32 -> int32 -> double casts look like a conversion-speed idiom; assumes counts < 2^31 - confirm
+ double het_ctd = (double)((int32_t)het_ct);
+ double case_ctd = (double)((int32_t)cur_case_ct);
+ double ctrl_ctd = (double)((int32_t)cur_ctrl_ct);
+ double total_ctd = (double)((int32_t)total_ct);
+ double total_ct_recip = 1.0 / total_ctd;
+ double case_proportion = case_ctd * total_ct_recip; // fraction of counted siblings that are cases
+ double case_expected_hom_a1 = case_proportion * hom_a1_ctd;
+ double case_expected_het = case_proportion * het_ctd;
+ double case_ctrl_div_xxxm1 = case_proportion * ctrl_ctd / (total_ctd * (total_ctd - 1)); // = case_ct * ctrl_ct / (total_ct^2 * (total_ct - 1)), the factor shared by the three terms below
+ double case_var_hom_a1 = case_ctrl_div_xxxm1 * hom_a1_ctd * (total_ctd - hom_a1_ctd);
+ double case_var_het = case_ctrl_div_xxxm1 * het_ctd * (total_ctd - het_ctd);
+ double case_neg_covar = case_ctrl_div_xxxm1 * hom_a1_ctd * het_ctd; // NOTE(review): named "neg" covariance yet added (not subtracted) in case_var_a1_ct below; the formula summary in the dfam() comment says the same - confirm against PLINK 1.07 dfam.cpp
+ double case_expected_a1_ct = 2 * case_expected_hom_a1 + case_expected_het; // expected case A1 alleles
+ double case_var_a1_ct = 4 * (case_var_hom_a1 + case_neg_covar) + case_var_het;
+ *numer_ptr += (double)((int32_t)case_a1_ct) - case_expected_a1_ct; // observed minus expected
+ *denom_ptr += case_var_a1_ct;
+ *total_expected_ptr += case_expected_a1_ct;
+}
+
+int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, [...]
+ logprint("Error: --dfam is currently under development.\n");
+ return RET_CALC_NOT_YET_SUPPORTED;
+ /*
+ unsigned char* wkspace_mark = wkspace_base;
+ FILE* outfile = NULL;
+ FILE* outfile_msa = NULL;
+ char* textbuf = tbuf;
+ uintptr_t marker_ct_orig_autosomal = marker_ct_orig;
+ uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+ uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
+ uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+ uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+ uintptr_t unfiltered_sample_ctp1l2 = 1 + (unfiltered_sample_ct / BITCT2);
+ uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
+ uintptr_t* marker_exclude_orig_autosomal = marker_exclude_orig;
+ uintptr_t* founder_pnm = NULL;
+ double* orig_chisq = NULL;
+ uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
+ uint32_t multigen = (fam_ip->mendel_modifier / MENDEL_MULTIGEN) & 1;
+ uint32_t is_set_test = fam_ip->dfam_modifier & DFAM_SET_TEST;
+ uint32_t perm_adapt_nst = (fam_ip->dfam_modifier & DFAM_PERM) && (!is_set_test);
+ uint32_t perm_maxt_nst = (fam_ip->dfam_modifier & DFAM_MPERM) && (!is_set_test);
+ uint32_t do_perms = fam_ip->dfam_modifier & (DFAM_PERM | DFAM_MPERM);
+ uint32_t do_perms_nst = do_perms && (!is_set_test);
+ uint32_t perm_count = fam_ip->dfam_modifier & DFAM_PERM_COUNT;
+ uint32_t fill_orig_chisq = do_perms || mtest_adjust;
+ uint32_t no_unrelateds = (fam_ip->dfam_modifier & DFAM_NO_UNRELATEDS) || (within_cmdflag && (!cluster_ct));
+ uint32_t family_all_case_children_ct = 0;
+ uint32_t family_mixed_ct = 0;
+ uint32_t sibship_mixed_ct = 0;
+ uint32_t unrelated_cluster_ct = 0;
+ uint32_t pct = 0;
+ uint32_t max_thread_ct = g_thread_ct;
+ uint32_t perm_pass_idx = 0;
+ uint32_t perms_total = 0;
+ uint32_t perms_done = 0;
+ int32_t retval = 0;
+ uintptr_t* pheno_nm;
+ uintptr_t* dfam_pheno_c;
+ uintptr_t* loadbuf_raw;
+ uintptr_t* loadbuf_ptr;
+ uintptr_t* workbuf;
+ uintptr_t* marker_exclude;
+ uintptr_t* dfam_sample_exclude;
+ double* maxt_extreme_stat = NULL;
+ uint32_t mu_table[MODEL_BLOCKSIZE];
+ char* outname_end2;
+ char* wptr_start;
+ char* wptr;
+ uint64_t* family_list;
+ uint64_t* trio_list;
+ uint32_t* trio_error_lookup;
+ uint32_t* fs_starts;
+ uint32_t* fss_contents;
+ uint32_t* sample_to_fss_idx;
+ uint32_t* dfam_iteration_order;
+ uint32_t* idx_to_uidx;
+ uint32_t* uidx_to_idx;
+ uint32_t* sample_to_cluster;
+ uint32_t* cluster_ctrl_case_cts;
+ uint32_t* cluster_write_idxs;
+ uint32_t* cur_dfam_ptr;
+ uintptr_t marker_ct;
+ uintptr_t marker_uidx; // loading
+ uintptr_t marker_uidx2; // writing
+ uintptr_t trio_ct;
+ uintptr_t max_fid_len;
+ uintptr_t ulii;
+ double numer;
+ double denom;
+ double total_expected;
+ double case_proportion;
+ double case_expected_a1_ct;
+ double case_var_a1_ct;
+ double dxx;
+ uint32_t family_ct;
+ uint32_t fs_ct;
+ uint32_t sample_uidx;
+ uint32_t sample_idx;
+ uint32_t fs_idx;
+ uint32_t fssc_start;
+ uint32_t fssc_end;
+ uint32_t fssc_idx;
+ uint32_t unrelated_cluster_idx;
+ uint32_t write_idx;
+ uint32_t cur_ctrl_ct;
+ uint32_t cur_case_ct;
+ uint32_t dfam_sample_ct;
+ uint32_t dfam_sample_ctl2;
+ uint32_t chrom_fo_idx;
+ uint32_t chrom_idx;
+ uint32_t chrom_end;
+ uint32_t block_size;
+ uint32_t block_end;
+ uint32_t marker_bidx;
+ uint32_t marker_unstopped_ct;
+ uint32_t loop_end;
+ uint32_t marker_idx;
+ uint32_t marker_idx2;
+ uint32_t paternal_id;
+ uint32_t maternal_id;
+ uint32_t paternal_geno;
+ uint32_t maternal_geno;
+ uint32_t sibling_ct;
+ uint32_t parental_a1_ct;
+ uint32_t sib_idx;
+ uint32_t cur_geno;
+ uint32_t case_a1_ct;
+ uint32_t quad_denom;
+ uint32_t total_count;
+ uint32_t twice_total_expected;
+ uint32_t case_hom_a1_ct;
+ uint32_t case_het_ct;
+ uint32_t ctrl_hom_a1_ct;
+ uint32_t ctrl_het_ct;
+ uint32_t hom_a1_ct;
+ uint32_t het_ct;
+ uint32_t uii;
+ uint32_t ujj;
+ int32_t twice_numer;
+ uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude_orig, 1, 1);
+ if (uii) {
+ LOGPRINTF("Excluding %u X/MT/haploid variant%s from DFAM test.\n", uii, (uii == 1)? "" : "s");
+ if (uii == marker_ct_orig_autosomal) {
+ logprint("Error: No variants remaining for DFAM analysis.\n");
+ goto dfam_ret_INVALID_CMDLINE;
+ }
+ marker_ct_orig_autosomal -= uii;
+ if (wkspace_alloc_ul_checked(&marker_exclude_orig_autosomal, unfiltered_marker_ctl * sizeof(intptr_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ memcpy(marker_exclude_orig_autosomal, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
+ for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
+ chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ if (is_set(chrom_info_ptr->haploid_mask, chrom_idx) || ((int32_t)chrom_idx == chrom_info_ptr->mt_code)) {
+ uii = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx];
+ fill_bits(marker_exclude_orig_autosomal, uii, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - uii);
+ }
+ }
+ } else if (is_set(chrom_info_ptr->haploid_mask, 0)) {
+ logprint("Error: DFAM test does not support haploid data.\n");
+ goto dfam_ret_INVALID_CMDLINE;
+ }
+ uii = popcount_longs_exclude(pheno_c, sample_exclude, unfiltered_sample_ct);
+ if (!uii) {
+ logprint("Error: DFAM test requires at least one case.\n");
+ goto dfam_ret_INVALID_CMDLINE;
+ }
+ marker_exclude = marker_exclude_orig_autosomal;
+ marker_ct = marker_ct_orig_autosomal;
+
+ // PLINK 1.07 treats missing phenotypes as controls here
+ if (wkspace_alloc_ul_checked(&pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ bitfield_exclude_to_include(sample_exclude, pheno_nm, unfiltered_sample_ct);
+ if (is_set_test) {
+ if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
+ bitfield_and(founder_pnm, founder_info, unfiltered_sample_ctl);
+ if (extract_set_union_unfiltered(sip, NULL, unfiltered_marker_ct, marker_exclude_orig_autosomal, &marker_exclude, &marker_ct)) {
+ goto dfam_ret_NOMEM;
+ }
+ }
+
+ // no --mendel-duos support for now
+ retval = get_trios_and_families(unfiltered_sample_ct, sample_exclude, sample_ct, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, NULL, &max_fid_len, NULL, NULL, &family_list, &family_ct, &trio_list, &trio_ct, &trio_error_lookup, 0, multigen);
+ if (retval) {
+ goto dfam_ret_1;
+ }
+#ifdef __LP64__
+ if ((12 * sample_ct + 2 * family_ct) > 0xffffffffLLU) {
+ logprint("Error: Too many samples and families for DFAM test.\n");
+ goto dfam_ret_INVALID_CMDLINE;
+ }
+#endif
+ if (get_sibship_info(unfiltered_sample_ct, sample_exclude, sample_ct, pheno_nm, founder_info, sample_ids, max_sample_id_len, max_fid_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, family_list, trio_list, family_ct, trio_ct, 0, NULL, NULL, &fs_starts, &fss_contents, &sample_to_fss_idx, &fs_ct, NULL, NULL)) {
+ goto dfam_ret_NOMEM;
+ }
+ // Prepare final family, sibship, and unrelated cluster data structures.
+ // * Families with at least one affected child are processed using regular
+ // TDT logic when possible; however, when both parents have homozygous
+ // calls, or they aren't both genotyped, we fall back on sibship logic.
+ // (Families with no affected children are entirely excluded from the
+ // test.)
+ // * Only sibships with at least one affected child and one unaffected child
+ // are considered. (I.e. the sibship fallback never applies to families
+ // with only affected children.)
+ // * Only unrelated clusters with at least one affected and one unaffected
+ // member are considered.
+ // The data structures are optimized for the permutation test, since the
+ // computation is nearly I/O-bound without it. Phenotypes are permuted
+ // within each sibship/unrelated cluster, while transmitted alleles are
+ // permuted in case-containing families.
+ if (wkspace_alloc_ul_checked(&dfam_sample_exclude, unfiltered_sample_ctl * sizeof(intptr_t)) ||
+ // shrink this later
+ wkspace_alloc_ui_checked(&dfam_iteration_order, (sample_ct + (sample_ct / 2)) * sizeof(int32_t)) ||
+ wkspace_alloc_ui_checked(&idx_to_uidx, sample_ct * sizeof(int32_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ fill_all_bits(dfam_sample_exclude, unfiltered_sample_ct);
+ fill_idx_to_uidx(sample_exclude, unfiltered_sample_ct, sample_ct, idx_to_uidx);
+ cur_dfam_ptr = dfam_iteration_order;
+ for (fs_idx = 0; fs_idx < family_ct; fs_idx++) {
+ // Scan for families with only case children.
+ fssc_start = fs_starts[fs_idx] + 2;
+ fssc_end = fs_starts[fs_idx + 1];
+ cur_case_ct = 0;
+ for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+ cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
+ }
+ if (cur_case_ct == fssc_end - fssc_start) {
+ family_all_case_children_ct++;
+ // Could point to fss_contents, but I assume it's a better idea to
+ // optimize the inner loop for data locality and linear access.
+ // These family entries are temporarily stored as:
+ // [0-1]: parent uidxs
+ // [2]: number of children
+ // [3...]: child uidxs
+ // We collapse the indexes again later.
+ sample_uidx = idx_to_uidx[fss_contents[fssc_start - 2]];
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ *cur_dfam_ptr++ = sample_uidx;
+
+ sample_uidx = idx_to_uidx[fss_contents[fssc_start - 1]];
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ *cur_dfam_ptr++ = sample_uidx;
+
+ *cur_dfam_ptr++ = cur_case_ct;
+ for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+ sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ *cur_dfam_ptr++ = sample_uidx;
+ }
+ }
+ }
+ for (fs_idx = 0; fs_idx < family_ct; fs_idx++) {
+ // Scan for families with at least one case and one control child.
+ fssc_start = fs_starts[fs_idx] + 2;
+ fssc_end = fs_starts[fs_idx + 1];
+ cur_case_ct = 0;
+ for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+ cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
+ }
+ if (cur_case_ct && (cur_case_ct != fssc_end - fssc_start)) {
+ family_mixed_ct++;
+ sample_uidx = idx_to_uidx[fss_contents[fssc_start - 2]];
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ *cur_dfam_ptr++ = sample_uidx;
+
+ sample_uidx = idx_to_uidx[fss_contents[fssc_start - 1]];
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ *cur_dfam_ptr++ = sample_uidx;
+
+ *cur_dfam_ptr++ = fssc_end - fssc_start;
+ for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+ sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ *cur_dfam_ptr++ = sample_uidx;
+ }
+ }
+ }
+ for (; fs_idx < fs_ct; fs_idx++) {
+ // Scan for sibships with at least one case and one control.
+ fssc_start = fs_starts[fs_idx];
+ fssc_end = fs_starts[fs_idx + 1];
+ cur_case_ct = 0;
+ for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+ cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
+ }
+ if (cur_case_ct && (cur_case_ct != fssc_end - fssc_start)) {
+ sibship_mixed_ct++;
+ // [0]: sibling ct
+ // [1...]: member uidxs
+ *cur_dfam_ptr++ = fssc_end - fssc_start;
+ for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+ sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ *cur_dfam_ptr++ = sample_uidx;
+ }
+ }
+ }
+ if (!no_unrelateds) {
+ if (wkspace_alloc_ui_checked(&sample_to_cluster, sample_ct * sizeof(int32_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ // --within on an empty file actually causes --dfam to behave differently
+ // than no --within at all in PLINK 1.07. Replicate this for now.
+ if (within_cmdflag) {
+ if (fill_sample_to_cluster(unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, cluster_starts, sample_to_cluster, NULL)) {
+ goto dfam_ret_NOMEM;
+ }
+ } else {
+ // Start everyone in the same cluster.
+ fill_uint_zero(sample_to_cluster, sample_ct);
+ cluster_ct = 1;
+ }
+ for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
+ // Remove families and sibships.
+ if (sample_to_fss_idx[sample_idx] != 0xffffffffU) {
+ sample_to_cluster[sample_idx] = 0xffffffffU;
+ }
+ }
+
+ if (wkspace_alloc_ui_checked(&cluster_ctrl_case_cts, cluster_ct * 2 * sizeof(int32_t)) ||
+ wkspace_alloc_ui_checked(&cluster_write_idxs, cluster_ct * sizeof(int32_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ fill_uint_zero(cluster_ctrl_case_cts, 2 * cluster_ct);
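+ for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) { // NOTE(review): unlike the later loop over sample_to_cluster, this one never calls next_unset_unsafe_ck(sample_exclude, &sample_uidx), so sample_uidx stops matching sample_idx once any sample is excluded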
+ unrelated_cluster_idx = sample_to_cluster[sample_idx];
+ if (unrelated_cluster_idx != 0xffffffffU) {
+ cluster_ctrl_case_cts[2 * unrelated_cluster_idx + is_set(pheno_c, sample_uidx)] += 1;
+ }
+ }
+ // Construct reduced clusters -> samples map.
+ write_idx = 0;
+ for (unrelated_cluster_idx = 0; unrelated_cluster_idx < cluster_ct; unrelated_cluster_idx++) {
+ cur_ctrl_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx];
+ cur_case_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx + 1];
+ if (cur_ctrl_ct && cur_case_ct) {
+ unrelated_cluster_ct++;
+ cur_dfam_ptr[write_idx++] = cur_ctrl_ct + cur_case_ct;
+ cluster_write_idxs[unrelated_cluster_idx] = write_idx;
+ write_idx += cur_ctrl_ct + cur_case_ct;
+ }
+ }
+ for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
+ next_unset_unsafe_ck(sample_exclude, &sample_uidx);
+ unrelated_cluster_idx = sample_to_cluster[sample_idx];
+ if (unrelated_cluster_idx != 0xffffffffU) {
+ cur_ctrl_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx];
+ cur_case_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx + 1];
+ if (cur_ctrl_ct && cur_case_ct) {
+ uii = cluster_write_idxs[unrelated_cluster_idx];
+ cur_dfam_ptr[uii] = sample_uidx;
+ clear_bit(dfam_sample_exclude, sample_uidx);
+ cluster_write_idxs[unrelated_cluster_idx] = uii + 1;
+ }
+ }
+ }
+ cur_dfam_ptr = &(cur_dfam_ptr[write_idx]);
+ }
+ wkspace_reset((unsigned char*)idx_to_uidx);
+ wkspace_shrink_top(dfam_iteration_order, (cur_dfam_ptr - dfam_iteration_order) * sizeof(int32_t));
+ if (do_perms) {
+ logprint("Error: --dfam permutation tests are currently under development.\n");
+ retval = RET_CALC_NOT_YET_SUPPORTED;
+ goto dfam_ret_1;
+ }
+ if (mtest_adjust || do_perms) {
+ if (wkspace_alloc_d_checked(&orig_chisq, marker_ct * sizeof(double))) {
+ goto dfam_ret_NOMEM;
+ }
+ }
+ dfam_sample_ct = unfiltered_sample_ct - popcount_longs(dfam_sample_exclude, unfiltered_sample_ctl);
+ dfam_sample_ctl2 = (dfam_sample_ct + (BITCT2 - 1)) / BITCT2;
+ if (wkspace_alloc_ui_checked(&uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ fill_uidx_to_idx(dfam_sample_exclude, unfiltered_sample_ct, dfam_sample_ct, uidx_to_idx);
+ cur_dfam_ptr = dfam_iteration_order;
+ uii = family_all_case_children_ct + family_mixed_ct;
+ for (fs_idx = 0; fs_idx < uii; fs_idx++) {
+ *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ cur_dfam_ptr++;
+ *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ cur_dfam_ptr++;
+ sibling_ct = *cur_dfam_ptr++;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ cur_dfam_ptr++;
+ }
+ }
+ uii = sibship_mixed_ct + unrelated_cluster_ct;
+ for (fs_idx = 0; fs_idx < uii; fs_idx++) {
+ sibling_ct = *cur_dfam_ptr++;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ cur_dfam_ptr++;
+ }
+ }
+ // DEBUG
+ printf("%u %u %u %u\n", family_all_case_children_ct, family_mixed_ct, sibship_mixed_ct, unrelated_cluster_ct);
+ wkspace_reset((unsigned char*)uidx_to_idx);
+ if (wkspace_alloc_ul_checked(&dfam_pheno_c, dfam_sample_ctl2 * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&workbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * dfam_sample_ctl2 * sizeof(intptr_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ collapse_copy_bitarr(sample_ct, pheno_c, dfam_sample_exclude, dfam_sample_ct, dfam_pheno_c);
+ g_pheno_c = dfam_pheno_c;
+ loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
+ workbuf[unfiltered_sample_ctp1l2 - 1] = 0;
+ for (ulii = 1; ulii <= MODEL_BLOCKSIZE; ulii++) {
+ // defensive
+ g_loadbuf[dfam_sample_ctl2 * ulii - 1] = 0;
+ }
+ // no X/haploid/MT, so no haploid filters
+
+ if (fill_orig_chisq) {
+ if (wkspace_alloc_d_checked(&g_orig_stat, marker_ct * sizeof(double))) {
+ goto dfam_ret_NOMEM;
+ }
+ }
+
+ ulii = 2 * max_marker_allele_len + plink_maxsnp + MAX_ID_LEN + 256;
+ if (ulii > MAXLINELEN) {
+ if (wkspace_alloc_c_checked(&textbuf, ulii)) {
+ goto dfam_ret_NOMEM;
+ }
+ }
+
+ // permutation test boilerplate mostly copied from qassoc() in plink_assoc.c,
+ // since it's also restricted to autosomes
+ g_mperm_save_all = NULL;
+ if (perm_maxt_nst) {
+ perms_total = fam_ip->dfam_mperm_val;
+ if (wkspace_alloc_d_checked(&maxt_extreme_stat, perms_total * sizeof(double))) {
+ goto dfam_ret_NOMEM;
+ }
+ g_maxt_extreme_stat = maxt_extreme_stat;
+ fill_double_zero(maxt_extreme_stat, perms_total);
+ if (mperm_save & MPERM_DUMP_ALL) {
+ memcpy(outname_end, ".mperm.dump.all", 16);
+ if (fopen_checked(&outfile_msa, outname, "w")) {
+ goto dfam_ret_OPEN_FAIL;
+ }
+ if (putc_checked('0', outfile_msa)) {
+ goto dfam_ret_WRITE_FAIL;
+ }
+ LOGPRINTF("Dumping all permutation chi-square values to %s .\n", outname);
+ }
+ } else {
+ mperm_save = 0;
+ if (perm_adapt_nst) {
+ g_aperm_alpha = apip->alpha;
+ perms_total = apip->max;
+ if (wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_ct * sizeof(int32_t)) ||
+ wkspace_alloc_uc_checked(&g_perm_adapt_stop, marker_ct)) {
+ goto dfam_ret_NOMEM;
+ }
+ ujj = apip->max;
+ for (uii = 0; uii < marker_ct; uii++) {
+ g_perm_attempt_ct[uii] = ujj;
+ }
+ fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
+ g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
+ if (apip->min < apip->init_interval) {
+ g_first_adapt_check = (int32_t)(apip->init_interval);
+ } else {
+ g_first_adapt_check = apip->min;
+ }
+ g_adaptive_intercept = apip->init_interval;
+ g_adaptive_slope = apip->interval_slope;
+ }
+ }
+
+ outname_end2 = memcpyb(outname_end, ".dfam", 6);
+ if (fopen_checked(&outfile, outname, "w")) {
+ goto dfam_ret_OPEN_FAIL;
+ }
+ LOGPRINTFWW5("Writing --dfam results to %s ... ", outname);
+ fflush(stdout);
+ sprintf(textbuf, " CHR %%%us A1 A2 OBS EXP CHISQ P \n", plink_maxsnp);
+ fprintf(outfile, textbuf, "SNP");
+ loop_end = marker_ct / 100;
+ marker_unstopped_ct = marker_ct;
+
+ if (do_perms) {
+ if (fam_ip->dfam_modifier & DFAM_PERM) {
+ if (perm_batch_size > apip->max) {
+ perm_batch_size = apip->max;
+ }
+ } else {
+ if (perm_batch_size > fam_ip->dfam_mperm_val) {
+ perm_batch_size = fam_ip->dfam_mperm_val;
+ }
+ }
+ }
+
+ fputs("0%", stdout);
+ fflush(stdout);
+ // ----- begin main loop -----
+ dfam_more_perms:
+ if (do_perms_nst) {
+ if (!perm_pass_idx) {
+ // ...
+ }
+ }
+ chrom_fo_idx = 0xffffffffU;
+ marker_uidx = next_unset_unsafe(marker_exclude, 0);
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto dfam_ret_READ_FAIL;
+ }
+ marker_idx = 0;
+ marker_idx2 = 0;
+ chrom_end = 0;
+ do {
+ // since X/haploid/MT is not supported, ignore chromosome boundaries in
+ // this loop
+ block_size = 0;
+ block_end = marker_unstopped_ct - marker_idx;
+ if (block_end > MODEL_BLOCKSIZE) {
+ block_end = MODEL_BLOCKSIZE;
+ }
+ do {
+ if (perm_adapt_nst && g_perm_adapt_stop[marker_idx2]) {
+ do {
+ marker_uidx++;
+ next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+ marker_idx2++;
+ } while (g_perm_adapt_stop[marker_idx2]);
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto dfam_ret_READ_FAIL;
+ }
+ }
+ if (load_raw2(bedfile, loadbuf_raw, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+ goto dfam_ret_READ_FAIL;
+ }
+ if (IS_SET(marker_reverse, marker_uidx)) {
+ reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
+ }
+ erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, trio_error_lookup, trio_ct, multigen);
+ collapse_copy_2bitarr(loadbuf_raw, &(g_loadbuf[block_size * dfam_sample_ctl2]), unfiltered_sample_ct, dfam_sample_ct, dfam_sample_exclude);
+ if (perm_adapt_nst) {
+ g_adapt_m_table[block_size] = marker_idx2++;
+ }
+ mu_table[block_size++] = marker_uidx;
+ if (marker_idx + block_size == marker_unstopped_ct) {
+ break;
+ }
+ marker_uidx++;
+ if (IS_SET(marker_exclude, marker_uidx)) {
+ marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto dfam_ret_READ_FAIL;
+ }
+ }
+ } while (block_size < block_end);
+ if (!perm_pass_idx) {
+ // Calculate original chi-square values and write to disk:
+ // 1. Iterate through nuclear families with only case children. If neither
+ // parent is heterozygous, either parent has a missing call, or
+ // all children have missing calls, skip. Otherwise,
+ // twice_numer += 2 * [A1 allele count among kids] -
+ // ([# of kids] * [parental A1 allele ct])
+ // quad_denom += # of het parents
+ // total_count += [A1 allele count among kids]
+ // twice_total_expected += [# of kids] * [parental A1 allele ct]
+ // 2. Iterate through nuclear families with at least one case and at
+ // least one control child. If all case children have missing calls,
+ // skip. Otherwise, if neither parent is heterozygous, or either
+ // parent has a missing call, handle the children as in step 3.
+ // Otherwise,
+ // twice_numer += 2 * [A1 allele count among case kids] -
+ // ([# of case kids] * [parental A1 ct])
+ // quad_denom += # of het parents
+ // total_count += [A1 allele count among case kids]
+ // twice_total_expected += [# of case kids] * [parental A1 ct]
+ // 3. Iterate through sibships. If all case siblings, or all control
+ // siblings, have missing genotypes, skip. Otherwise (see lines
+ // 420-456 of PLINK 1.07 dfam.cpp),
+ // case_expected_hom_a1 := [case sib ct] * [sib hom A1 ct] /
+ // [sib ct]
+ // case_expected_het := [case sib ct] * [sib het ct] / [sib ct]
+ // case_var_hom_a1 := ([case sib ct] * [ctrl sib ct] *
+ // [sib hom A1 ct] * [sib non-hom-A1]) /
+ // ([sib ct] * [sib ct] * ([sib ct] - 1))
+ // case_var_het := ([case sib ct] * [ctrl sib ct] *
+ // [sib het ct] * [sib non-het]) /
+ // ([sib ct] * [sib ct] * ([sib ct] - 1))
+ // case_neg_covar (between case hom a1 and case het cts) :=
+ // ([case sib ct] * [ctrl sib ct] * [sib hom A1 ct] * [sib het ct]) /
+ // ([sib ct] * [sib ct] * ([sib ct] - 1))
+ // case_expected_a1_ct := 2 * case_expected_hom_a1 +
+ // case_expected_het
+ // case_var_a1_ct := 4 * case_var_hom_a1 + case_var_het +
+ // 4 * case_neg_covar
+ // numer += case_a1_ct - case_expected_a1_ct
+ // denom += case_var_a1_ct
+ // total_count += case_a1_ct
+ // total_expected += case_expected_a1_ct
+ // Shortcut when all genotypes are identical (this is common):
+ // total_count += case_a1_ct
+ // total_expected += case_a1_ct
+ // We could entirely skip this instead, but that would lead to a
+ // different output file than 1.07.
+ // 4. Iterate through clusters of unrelateds. If all genotypes are
+ // missing or identical, skip. Otherwise (see lines 557-571 of
+ // dfam.cpp),
+ // case_expected_a1_ct := [case ct] * [A1 ct] / [cluster size]
+ // case_var_a1_ct := ([case ct] * [ctrl ct] *
+ // [A1 ct] * [A2 ct]) /
+ // (([clst size]^2) * ([clst size] - 1))
+ // numer += case_a1_ct - case_expected_a1_ct
+ // denom += case_var_a1_ct
+ // total_count += case_a1_ct
+ // total_expected += case_expected_a1_ct
+ for (marker_bidx = 0; marker_bidx < block_size; marker_bidx++) {
+ marker_uidx2 = mu_table[marker_bidx];
+ // marker_idx_to_uidx[marker_idx + marker_bidx] = marker_uidx2;
+ loadbuf_ptr = &(g_loadbuf[marker_bidx * dfam_sample_ctl2]);
+ cur_dfam_ptr = dfam_iteration_order;
+ twice_numer = 0;
+ quad_denom = 0;
+ total_count = 0;
+ numer = 0.0;
+ denom = 0.0;
+ twice_total_expected = 0;
+ total_expected = 0;
+ for (fs_idx = 0; fs_idx < family_all_case_children_ct; fs_idx++) {
+ paternal_id = *cur_dfam_ptr++;
+ maternal_id = *cur_dfam_ptr++;
+ sibling_ct = *cur_dfam_ptr++;
+ paternal_geno = (loadbuf_ptr[paternal_id / BITCT2] >> (2 * (paternal_id % BITCT2))) & 3;
+ maternal_geno = (loadbuf_ptr[maternal_id / BITCT2] >> (2 * (maternal_id % BITCT2))) & 3;
+ parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
+ if (!parental_a1_ct) {
+ cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct]);
+ continue;
+ }
+ cur_case_ct = 0;
+ case_a1_ct = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = *cur_dfam_ptr++;
+ cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ if (cur_geno == 1) {
+ continue;
+ }
+ cur_case_ct++;
+ case_a1_ct += (4 - cur_geno) / 2;
+ }
+ if (cur_case_ct) {
+ twice_numer += (int32_t)(2 * case_a1_ct) - (int32_t)(cur_case_ct * parental_a1_ct);
+ quad_denom += 2 - (parental_a1_ct & 1);
+ total_count += case_a1_ct;
+ twice_total_expected += cur_case_ct * parental_a1_ct;
+ }
+ }
+ for (fs_idx = 0; fs_idx < family_mixed_ct; fs_idx++) {
+ paternal_id = *cur_dfam_ptr++;
+ maternal_id = *cur_dfam_ptr++;
+ sibling_ct = *cur_dfam_ptr++;
+ paternal_geno = (loadbuf_ptr[paternal_id / BITCT2] >> (2 * (paternal_id % BITCT2))) & 3;
+ maternal_geno = (loadbuf_ptr[maternal_id / BITCT2] >> (2 * (maternal_id % BITCT2))) & 3;
+ parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
+ cur_case_ct = 0;
+ cur_ctrl_ct = 0;
+ case_hom_a1_ct = 0;
+ case_het_ct = 0;
+ ctrl_hom_a1_ct = 0;
+ ctrl_het_ct = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = *cur_dfam_ptr++;
+ cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ if (cur_geno == 1) {
+ continue;
+ }
+ if (IS_SET(dfam_pheno_c, sample_idx)) {
+ cur_case_ct++;
+ if (cur_geno != 3) {
+ if (cur_geno == 2) {
+ case_het_ct++;
+ } else {
+ case_hom_a1_ct++;
+ }
+ }
+ } else {
+ cur_ctrl_ct++;
+ if (cur_geno != 3) {
+ if (cur_geno == 2) {
+ ctrl_het_ct++;
+ } else {
+ ctrl_hom_a1_ct++;
+ }
+ }
+ }
+ }
+ if (!cur_case_ct) {
+ continue;
+ }
+ if (!parental_a1_ct) {
+ dfam_sibship_calc(cur_case_ct, case_hom_a1_ct, case_het_ct, cur_ctrl_ct, ctrl_hom_a1_ct, ctrl_het_ct, &total_count, &numer, &denom, &total_expected);
+ } else {
+ case_a1_ct = 2 * case_hom_a1_ct + case_het_ct;
+ twice_numer += (int32_t)(2 * case_a1_ct) - (int32_t)(cur_case_ct * parental_a1_ct);
+ quad_denom += 2 - (parental_a1_ct & 1);
+ total_count += case_a1_ct;
+ twice_total_expected += cur_case_ct * parental_a1_ct;
+ }
+ }
+ numer += 0.5 * ((double)twice_numer);
+ denom += 0.25 * ((double)quad_denom);
+ total_expected += 0.5 * ((double)twice_total_expected);
+ for (fs_idx = 0; fs_idx < sibship_mixed_ct; fs_idx++) {
+ sibling_ct = *cur_dfam_ptr++;
+ cur_case_ct = 0;
+ cur_ctrl_ct = 0;
+ case_hom_a1_ct = 0;
+ case_het_ct = 0;
+ ctrl_hom_a1_ct = 0;
+ ctrl_het_ct = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = *cur_dfam_ptr++;
+ cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ if (cur_geno == 1) {
+ continue;
+ }
+ if (IS_SET(dfam_pheno_c, sample_idx)) {
+ cur_case_ct++;
+ if (cur_geno != 3) {
+ if (cur_geno == 2) {
+ case_het_ct++;
+ } else {
+ case_hom_a1_ct++;
+ }
+ }
+ } else {
+ cur_ctrl_ct++;
+ if (cur_geno != 3) {
+ if (cur_geno == 2) {
+ ctrl_het_ct++;
+ } else {
+ ctrl_hom_a1_ct++;
+ }
+ }
+ }
+ }
+ if (!cur_case_ct) {
+ continue;
+ }
+ dfam_sibship_calc(cur_case_ct, case_hom_a1_ct, case_het_ct, cur_ctrl_ct, ctrl_hom_a1_ct, ctrl_het_ct, &total_count, &numer, &denom, &total_expected);
+ }
+ for (unrelated_cluster_idx = 0; unrelated_cluster_idx < unrelated_cluster_ct; unrelated_cluster_idx++) {
+ sibling_ct = *cur_dfam_ptr++; // not actually siblings
+ cur_case_ct = 0;
+ cur_ctrl_ct = 0;
+ case_hom_a1_ct = 0;
+ case_het_ct = 0;
+ ctrl_hom_a1_ct = 0;
+ ctrl_het_ct = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = *cur_dfam_ptr++;
+ cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ if (cur_geno == 1) {
+ continue;
+ }
+ if (IS_SET(dfam_pheno_c, sample_idx)) {
+ cur_case_ct++;
+ if (cur_geno != 3) {
+ if (cur_geno == 2) {
+ case_het_ct++;
+ } else {
+ case_hom_a1_ct++;
+ }
+ }
+ } else {
+ cur_ctrl_ct++;
+ if (cur_geno != 3) {
+ if (cur_geno == 2) {
+ ctrl_het_ct++;
+ } else {
+ ctrl_hom_a1_ct++;
+ }
+ }
+ }
+ }
+ case_a1_ct = 2 * case_hom_a1_ct + case_het_ct;
+ hom_a1_ct = case_hom_a1_ct + ctrl_hom_a1_ct;
+ het_ct = case_het_ct + ctrl_het_ct;
+ uii = cur_case_ct + cur_ctrl_ct;
+ if ((uii <= 1) || ((!hom_a1_ct) && (!het_ct)) || (hom_a1_ct == uii) || (het_ct == uii)) {
+ continue;
+ }
+ total_count += case_a1_ct;
+ if ((!cur_case_ct) || (!cur_ctrl_ct)) {
+ total_expected += (double)((int32_t)case_a1_ct);
+ continue;
+ }
+ dxx = ((double)((int32_t)uii));
+ case_proportion = ((double)((int32_t)cur_case_ct)) / dxx;
+ ujj = 2 * hom_a1_ct + het_ct;
+ case_expected_a1_ct = case_proportion * ((double)((int32_t)ujj));
+ case_var_a1_ct = case_expected_a1_ct * cur_ctrl_ct * ((double)((int32_t)(2 * uii - ujj))) / (dxx * (dxx - 1));
+ numer += case_a1_ct - case_expected_a1_ct;
+ denom += case_var_a1_ct;
+ total_expected += case_expected_a1_ct;
+ }
+ printf("%g %g %u %g\n", numer, denom, total_count, total_expected);
+ if (marker_bidx == 2) {
+ exit(1);
+ }
+ }
+ }
+ marker_idx += block_size;
+ if ((!perm_pass_idx) && (marker_idx >= loop_end)) {
+ if (marker_idx < marker_unstopped_ct) {
+ if (pct >= 10) {
+ putchar('\b');
+ }
+ pct = (marker_idx * 100LLU) / marker_unstopped_ct;
+ printf("\b\b%u%%", pct);
+ fflush(stdout);
+ loop_end = (((uint64_t)pct + 1LLU) * marker_unstopped_ct) / 100;
+ }
+ }
+ } while (marker_idx < marker_unstopped_ct);
+ if (!perm_pass_idx) {
+ if (pct >= 10) {
+ putchar('\b');
+ }
+ fputs("\b\b", stdout);
+ logprint("done.\n");
+ if (do_perms_nst) {
+ // wkspace_reset();
+ }
+ if (fclose_null(&outfile)) {
+ goto dfam_ret_WRITE_FAIL;
+ }
+ if (!is_set_test) {
+ if (wkspace_alloc_ui_checked(&idx_to_uidx, marker_ct * sizeof(int32_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, idx_to_uidx);
+ retval = multcomp(outname, outname_end, idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, orig_chisq, pfilter, output_min_p, mtest_adjust, 0, adjust_lambda, NULL, NULL);
+ if (retval) {
+ goto dfam_ret_1;
+ }
+ wkspace_reset(idx_to_uidx);
+ // if (mperm_save & MPERM_DUMP_ALL) { ...
+ } else {
+ // retval = dfam_set_test(threads, bedfile, bed_offset, outname, outname_end, ...);
+ if (retval) {
+ goto dfam_ret_1;
+ }
+ }
+ }
+ if (do_perms_nst) {
+ // if (mperm_save & MPERM_DUMP_ALL) { ...
+ // wkspace_reset();
+ if (perms_done < perms_total) {
+ }
+ }
+ // ...
+
+ while (0) {
+ dfam_ret_NOMEM:
+ retval = RET_NOMEM;
+ break;
+ dfam_ret_OPEN_FAIL:
+ retval = RET_OPEN_FAIL;
+ break;
+ dfam_ret_READ_FAIL:
+ retval = RET_READ_FAIL;
+ break;
+ dfam_ret_WRITE_FAIL:
+ retval = RET_WRITE_FAIL;
+ break;
+ dfam_ret_INVALID_CMDLINE:
+ retval = RET_INVALID_CMDLINE;
+ break;
+ }
+ dfam_ret_1:
+ wkspace_reset(wkspace_mark);
+ fclose_cond(outfile);
+ fclose_cond(outfile_msa);
+ return retval;
+ */
+}
+
void uint32_permute(uint32_t* perm_arr, uint32_t* precomputed_mods, sfmt_t* sfmtp, uint32_t ct) {
// Sets perm_arr[0..(ct-1)] to a random permutation of 0..(ct-1). Assumes
// ct >= 2.
@@ -2871,6 +3881,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
} else {
clear_bit(nm_fss, cur_idx);
}
+ cur_start = cur_end;
}
for (; cur_idx < fss_ct; cur_idx++) {
sample_uidx = *fss_ptr++;
@@ -3021,43 +4032,6 @@ static inline uint32_t qfam_regress(uint32_t test_type, uint32_t nind, uint32_t
return 0;
}
-// multithread globals
-static uintptr_t* g_loadbuf;
-static uintptr_t* g_lm_eligible;
-static uintptr_t* g_lm_within2_founder;
-static uintptr_t* g_qfam_flip;
-static uintptr_t* g_nm_fss;
-static uintptr_t* g_nm_lm;
-static uint32_t* g_qfam_permute;
-static uint32_t* g_permute_edit;
-static uint32_t* g_perm_2success_ct;
-static uint32_t* g_perm_attempt_ct;
-static uint32_t* g_fs_starts;
-static uint32_t* g_fss_contents;
-static uint32_t* g_sample_lm_to_fss_idx;
-static unsigned char* g_perm_adapt_stop;
-static uint32_t g_adapt_m_table[MODEL_BLOCKSIZE];
-static double* g_orig_stat;
-static double* g_pheno_d2;
-static double* g_qfam_b;
-static double* g_qfam_w;
-static uintptr_t g_cur_perm_ct;
-static double g_qt_sum_all;
-static double g_qt_ssq_all;
-static uint32_t g_test_type;
-static uint32_t g_qfam_thread_ct;
-static uint32_t g_fs_ct;
-static uint32_t g_singleton_ct;
-static uint32_t g_lm_ct;
-static uint32_t g_family_ct;
-static uint32_t g_block_size;
-static uint32_t g_perms_done;
-static uint32_t g_first_adapt_check;
-static double g_adaptive_intercept;
-static double g_adaptive_slope;
-static double g_aperm_alpha;
-static double g_adaptive_ci_zt;
-
THREAD_RET_TYPE qfam_thread(void* arg) {
uintptr_t tidx = (uintptr_t)arg;
uint32_t qfam_thread_ct = g_qfam_thread_ct;
@@ -3077,6 +4051,8 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
double* qfam_b = &(g_qfam_b[tidx * CACHEALIGN32_DBL(fss_ct)]);
double* qfam_w = &(g_qfam_w[tidx * CACHEALIGN32_DBL(lm_ct)]);
double* pheno_d2 = g_pheno_d2;
+ double* beta_sum = g_beta_sum;
+ double* beta_ssq = g_beta_ssq;
uint32_t* qfam_permute = only_within? NULL : g_qfam_permute;
uint32_t* permute_edit_buf = only_within? NULL : (&(g_permute_edit[tidx * CACHEALIGN32_INT32(fss_ct)]));
uint32_t* perm_2success_ct = g_perm_2success_ct;
@@ -3085,6 +4061,7 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
uint32_t* fss_contents = g_fss_contents;
uint32_t* sample_lm_to_fss_idx = g_sample_lm_to_fss_idx;
uint32_t* perm_ptr = NULL;
+ uint32_t* beta_fail_cts = g_beta_fail_cts;
uintptr_t cur_perm_ct = g_cur_perm_ct;
uintptr_t sample_ct = g_sample_ct;
uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
@@ -3111,6 +4088,8 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
double qt_ssq;
double nind_recip;
double beta;
+ double cur_beta_sum;
+ double cur_beta_ssq;
double tstat;
double pval;
double dxx;
@@ -3122,6 +4101,7 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
uint32_t success_2start;
uint32_t success_2incr;
uint32_t next_adapt_check;
+ uint32_t cur_beta_fail_cts;
uint32_t cur_fss_ct;
uint32_t nind;
uint32_t orig_fss_idx;
@@ -3159,6 +4139,9 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
if (only_within) {
flip_precalc(lm_ct, qfam_w, pheno_d2, nm_lm, &geno_sum, &geno_ssq, &qt_g_prod);
}
+ cur_beta_sum = 0.0;
+ cur_beta_ssq = 0.0;
+ cur_beta_fail_cts = 0;
for (pidx = 0; pidx < cur_perm_ct;) {
if (!only_within) {
@@ -3189,6 +4172,8 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
}
}
if (!qfam_regress(test_type, nind, lm_ct, sample_lm_to_fss_idx, nm_lm, pheno_d2, qfam_b, qfam_w, perm_ptr, &(qfam_flip[pidx * flip_ctl]), nind_recip, qt_sum, qt_ssq, geno_sum, geno_ssq, qt_g_prod, &beta, &tstat)) {
+ cur_beta_sum += beta;
+ cur_beta_ssq += beta * beta;
tstat = fabs(tstat);
if (tstat > stat_high) {
success_2incr += 2;
@@ -3198,6 +4183,7 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
} else {
// conservative handling of permutation regression failure
success_2incr += 2;
+ cur_beta_fail_cts++;
}
if (++pidx == next_adapt_check - pidx_offset) {
// won't ever get here with fixed number of permutations
@@ -3217,6 +4203,13 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
}
}
perm_2success_ct[marker_idx] += success_2incr;
+ if (beta_sum) {
+ beta_sum[marker_idx] += cur_beta_sum;
+ beta_ssq[marker_idx] += cur_beta_ssq;
+ if (cur_beta_fail_cts) {
+ beta_fail_cts[marker_idx] += cur_beta_fail_cts;
+ }
+ }
}
qfam_thread_skip_all:
if ((!tidx) || g_is_last_thread_block) {
@@ -3241,6 +4234,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
double geno_sum = 0.0;
double geno_ssq = 0.0;
double qt_g_prod = 0.0;
+ double* orig_beta = NULL;
char* chrom_name_ptr = NULL;
uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
uint32_t test_type = fam_ip->qfam_modifier & QFAM_TEST;
@@ -3248,6 +4242,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uint32_t multigen = (fam_ip->mendel_modifier / MENDEL_MULTIGEN) & 1;
uint32_t only_within = (test_type & (QFAM_WITHIN1 | QFAM_WITHIN2))? 1 : 0;
uint32_t perm_count = fam_ip->qfam_modifier & QFAM_PERM_COUNT;
+ uint32_t emp_se = fam_ip->qfam_modifier & QFAM_EMP_SE;
uint32_t perms_done = 0;
uint32_t chrom_idx = 0;
uint32_t qfam_thread_ct = g_thread_ct;
@@ -3363,7 +4358,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
goto qfam_ret_INVALID_CMDLINE;
}
#endif
- if (get_sibship_info(unfiltered_sample_ct, sample_exclude, sample_ct, pheno_nm, pheno_d, founder_info, sample_ids, max_sample_id_len, max_fid_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, family_list, trio_list, family_ct, trio_ct, test_type, &lm_eligible, &lm_within2_founder, &fs_starts, &fss_contents, &sample_lm_to_fss_idx, &fs_ct, &lm_ct, &singleton_ct)) {
+ if (get_sibship_info(unfiltered_sample_ct, sample_exclude, sample_ct, pheno_nm, founder_info, sample_ids, max_sample_id_len, max_fid_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, family_list, trio_list, family_ct, trio_ct, test_type, &lm_eligible, &lm_within2_founder, &fs_starts, &fss_contents, &sample_lm_to_fss_idx, &fs_ct, &lm_ct, &singleton_ct)) {
goto qfam_ret_NOMEM;
}
fss_ct = fs_ct + singleton_ct;
@@ -3446,6 +4441,21 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
goto qfam_ret_NOMEM;
}
}
+ if (emp_se) {
+ if (wkspace_alloc_d_checked(&orig_beta, marker_ct * sizeof(double)) ||
+ wkspace_alloc_d_checked(&g_beta_sum, marker_ct * sizeof(double)) ||
+ wkspace_alloc_d_checked(&g_beta_ssq, marker_ct * sizeof(double)) ||
+ wkspace_alloc_ui_checked(&g_beta_fail_cts, marker_ct * sizeof(double))) {
+ goto qfam_ret_NOMEM;
+ }
+ fill_double_zero(g_beta_sum, marker_ct);
+ fill_double_zero(g_beta_ssq, marker_ct);
+ fill_uint_zero(g_beta_fail_cts, marker_ct);
+ } else {
+ g_beta_sum = NULL;
+ g_beta_ssq = NULL;
+ g_beta_fail_cts = NULL;
+ }
if (wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * sample_ctl2 * sizeof(intptr_t)) ||
wkspace_alloc_d_checked(&g_orig_stat, marker_ct * sizeof(double)) ||
wkspace_alloc_ul_checked(&g_qfam_flip, perm_batch_size * flip_ctl * sizeof(intptr_t)) ||
@@ -3629,6 +4639,9 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
// do not apply --output-min-p since only the empirical p-value is
// supposed to be postprocessed here, not this one
bufptr = double_g_writewx4x(bufptr, calc_tprob(tstat, nind - 2), 12, '\n');
+ if (emp_se) {
+ orig_beta[marker_idx_base + block_idx] = beta;
+ }
*orig_stat_ptr++ = fabs(tstat);
} else {
bufptr = memcpya(bufptr, " NA NA NA\n", 37);
@@ -3688,7 +4701,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
if (fopen_checked(&outfile, outname, "w")) {
goto qfam_ret_OPEN_FAIL;
}
- sprintf(tbuf, " CHR %%%us EMP1 NP \n", plink_maxsnp);
+ sprintf(tbuf, emp_se? " CHR %%%us BETA EMP_BETA EMP_SE EMP1 NP \n" : " CHR %%%us EMP1 NP \n", plink_maxsnp);
fprintf(outfile, tbuf, "SNP");
chrom_fo_idx = 0xffffffffU;
chrom_end = 0;
@@ -3711,6 +4724,9 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
*bufptr++ = ' ';
if (g_orig_stat[marker_idx] == -9) {
+ if (emp_se) {
+ bufptr = memcpya(bufptr, " NA NA NA ", 39);
+ }
bufptr = memcpya(bufptr, " NA NA\n", 26);
} else {
uii = g_perm_2success_ct[marker_idx];
@@ -3719,6 +4735,18 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
} else {
ujj = perms_total;
}
+ if (emp_se) {
+ bufptr = double_g_writewx4x(bufptr, orig_beta[marker_idx], 12, ' ');
+ ukk = ujj - g_beta_fail_cts[marker_idx];
+ if (ukk <= 1) {
+ bufptr = memcpya(bufptr, " NA ", 13);
+ } else {
+ dxx = g_beta_sum[marker_idx] / ((double)((int32_t)ukk));
+ bufptr = double_g_writewx4x(bufptr, dxx, 12, ' ');
+ dxx = sqrt((g_beta_ssq[marker_idx] - g_beta_sum[marker_idx] * dxx) / ((double)((int32_t)(ukk - 1))));
+ bufptr = double_g_writewx4x(bufptr, dxx, 12, ' ');
+ }
+ }
if (!perm_count) {
dxx = ((double)(uii + 2)) / ((double)(2 * (ujj + 1)));
} else {
diff --git a/plink_family.h b/plink_family.h
index 2434199..1558046 100644
--- a/plink_family.h
+++ b/plink_family.h
@@ -1,16 +1,25 @@
#ifndef __PLINK_FAMILY_H__
#define __PLINK_FAMILY_H__
+#include "plink_set.h"
+
#define TDT_EXACT 1
#define TDT_MIDP 2
#define TDT_POO 4
#define TDT_PERM 8
#define TDT_MPERM 0x10
-#define TDT_PARENPERM1 0x20
-#define TDT_PARENPERM2 0x40
-#define TDT_POOPERM_PAT 0x80
-#define TDT_POOPERM_MAT 0x100
-#define TDT_SET_TEST 0x200
+#define TDT_PERM_COUNT 0x20
+#define TDT_PARENPERM1 0x40
+#define TDT_PARENPERM2 0x80
+#define TDT_POOPERM_PAT 0x100
+#define TDT_POOPERM_MAT 0x200
+#define TDT_SET_TEST 0x400
+
+#define DFAM_NO_UNRELATEDS 1
+#define DFAM_PERM 2
+#define DFAM_MPERM 4
+#define DFAM_PERM_COUNT 8
+#define DFAM_SET_TEST 0x10
#define QFAM_WITHIN1 1
#define QFAM_WITHIN2 2
@@ -20,6 +29,7 @@
#define QFAM_PERM 0x10
#define QFAM_MPERM 0x20
#define QFAM_PERM_COUNT 0x40
+#define QFAM_EMP_SE 0x80
extern const uint32_t mendel_error_table[];
extern const uint32_t mendel_error_table_x[];
@@ -31,6 +41,8 @@ typedef struct {
uint32_t mendel_modifier;
uint32_t tdt_modifier;
uint32_t tdt_mperm_val;
+ uint32_t dfam_modifier;
+ uint32_t dfam_mperm_val;
uint32_t qfam_modifier;
uint32_t qfam_mperm_val;
} Family_info;
@@ -70,7 +82,9 @@ typedef struct {
int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfiltered_sample_ct, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uintptr_t* founder_info);
-int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
+int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
+
+int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintp [...]
int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Aperm_info* apip, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* founder_info, u [...]
diff --git a/plink_filter.c b/plink_filter.c
index bcd2310..b17c998 100644
--- a/plink_filter.c
+++ b/plink_filter.c
@@ -3,6 +3,8 @@
#include "plink_filter.h"
#include "plink_stats.h"
+#include "pigz.h"
+
void oblig_missing_init(Oblig_missing_info* om_ip) {
om_ip->cluster_ct = 0;
om_ip->entry_ct = 0;
@@ -1037,6 +1039,72 @@ int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marke
return retval;
}
+uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr) { // --thin-indiv: gives every not-yet-excluded sample one random 32-bit draw and marks it in sample_exclude when the draw is >= a threshold derived from thin_keep_prob; returns 1 if that removed every sample, 0 otherwise
+ uint32_t sample_ct = unfiltered_sample_ct - *sample_exclude_ct_ptr; // number of samples still included on entry
+ uint32_t sample_uidx = 0;
+ uint32_t samples_done = 0; // included samples already visited; drives loop termination
+ uint32_t removed_ct = 0;
+ uint32_t uint32_thresh = (uint32_t)(thin_keep_prob * 4294967296.0 + 0.5); // keep probability scaled by 2^32 and rounded; NOTE(review): a probability close enough to 1.0 would not fit in uint32_t -- presumably the caller bounds it, confirm
+ uint32_t sample_uidx_stop;
+ while (samples_done < sample_ct) {
+ sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx); // presumably the next clear (still included) bit at or after sample_uidx -- confirm against helper
+ sample_uidx_stop = next_set(sample_exclude, sample_uidx, unfiltered_sample_ct); // presumably the end of this run of included samples, capped at unfiltered_sample_ct -- confirm
+ samples_done += sample_uidx_stop - sample_uidx; // the whole run is counted up front; the do-loop below then visits each index in it
+ do {
+ if(sfmt_genrand_uint32(&sfmt) >= uint32_thresh) { // draw at or above the threshold => this sample is removed
+ SET_BIT(sample_exclude, sample_uidx);
+ removed_ct++;
+ }
+ } while (++sample_uidx < sample_uidx_stop);
+ }
+ if (sample_ct == removed_ct) { // nothing left: report an error instead of continuing with an empty sample set
+ LOGPRINTF("Error: All %s removed by --thin-indiv. Try a higher probability.\n", g_species_plural);
+ return 1; // note: *sample_exclude_ct_ptr is not updated on this path even though sample_exclude bits were set
+ }
+ LOGPRINTF("--thin-indiv: %u %s removed (%u remaining).\n", removed_ct, (removed_ct==1)? g_species_singular : g_species_plural, sample_ct - removed_ct);
+ *sample_exclude_ct_ptr += removed_ct; // keep the caller's exclusion count in step with the bits set above
+ return 0;
+}
+
+int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr) { // --thin-indiv-count: marks sample_ct - thin_keep_ct of the still-included samples in sample_exclude so that thin_keep_ct remain; returns 0, RET_NOMEM or RET_INVALID_CMDLINE
+ unsigned char* wkspace_mark = wkspace_base; // saved so the workspace can be reset to this point on every exit path
+ uint32_t sample_ct = unfiltered_sample_ct - *sample_exclude_ct_ptr; // number of samples still included on entry
+ uint32_t sample_uidx = 0;
+ uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT; // sample_ct divided by BITCT, rounded up (size of perm_buf in intptr_t units)
+ int32_t retval = 0;
+ uintptr_t* perm_buf;
+ uint32_t sample_idx;
+ if (thin_keep_ct > sample_ct) { // cannot keep more samples than remain
+ LOGPRINTF("Error: --thin-indiv-count parameter exceeds number of remaining %s.\n", g_species_plural);
+ goto random_thin_samples_ct_ret_INVALID_CMDLINE;
+ }
+ if (wkspace_alloc_ul_checked(&perm_buf, sample_ctl * sizeof(intptr_t))) {
+ goto random_thin_samples_ct_ret_NOMEM;
+ }
+
+ generate_perm1_interleaved(sample_ct, sample_ct - thin_keep_ct, 0, 1, perm_buf); // presumably sets exactly (sample_ct - thin_keep_ct) randomly chosen bits among the first sample_ct bits of perm_buf -- TODO confirm against helper
+ sample_uidx = 0; // redundant with the initializer above
+ for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) { // sample_idx counts included samples; sample_uidx indexes the unfiltered sample array
+ next_unset_unsafe_ck(sample_exclude, &sample_uidx); // presumably advances sample_uidx to the next still-included sample -- confirm
+ if (is_set(perm_buf, sample_idx)) { // a set bit in perm_buf selects this sample for removal
+ set_bit(sample_exclude, sample_uidx);
+ }
+ }
+ LOGPRINTF("--thin-indiv-count: %u %s removed (%u remaining).\n", sample_ct - thin_keep_ct, (sample_ct - thin_keep_ct == 1)? g_species_singular : g_species_plural, thin_keep_ct);
+ *sample_exclude_ct_ptr = unfiltered_sample_ct - thin_keep_ct; // exclusion count is set so that thin_keep_ct samples are counted as remaining
+ while(0) { // never entered by fall-through; the labels inside are reached only via the gotos above
+ random_thin_samples_ct_ret_NOMEM:
+ retval = RET_NOMEM;
+ break;
+ random_thin_samples_ct_ret_INVALID_CMDLINE:
+ retval = RET_INVALID_CMDLINE;
+ break;
+ }
+ wkspace_reset(wkspace_mark); // releases perm_buf (and anything else allocated after the mark) on success and failure alike
+ return retval;
+}
+
+
int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip) {
// 1. load and validate cluster file
// 2. load marker file, sort by uidx
@@ -2109,7 +2177,7 @@ static inline void haploid_single_marker_freqs(uintptr_t unfiltered_sample_ct, u
*hethap_incr_ptr = hethap_incr;
}
-int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
+int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
FILE* hhfile = NULL;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + BITCT - 1) / BITCT;
@@ -2170,7 +2238,6 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
uintptr_t* geno_excl_bitfield = NULL;
uintptr_t* ac_excl_bitfield = NULL;
uint64_t* om_entry_ptr = NULL;
- double* marker_weights = NULL;
uint32_t sample_nonmale_ct = 0;
uint32_t sample_f_nonmale_ct = 0;
uint32_t sample_f_ctl_nonmale_ct = 0;
@@ -2202,20 +2269,6 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
uint32_t ujj;
double maf;
double cur_genotyping_rate;
- if (wt_needed) {
- // this is a pretty ugly hack... but no worse than what preceded it, I
- // suppose
- marker_weights = (double*)top_alloc(topsize_ptr, CACHEALIGN(unfiltered_marker_ct * sizeof(double)));
- if (!marker_weights) {
- goto calc_freqs_and_hwe_ret_NOMEM;
- }
- wkspace_left -= *topsize_ptr;
- *marker_weights_ptr = marker_weights;
- for (marker_uidx = 0; marker_uidx < unfiltered_marker_ct; marker_uidx++) {
- marker_weights[marker_uidx] = -1.0;
- }
- }
-
if (!hwe_needed) {
*hwe_lls_ptr = (int32_t*)wkspace_base;
} else {
@@ -2629,9 +2682,6 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
maf = ((double)ujj) / ((double)uii);
}
set_allele_freqs[marker_uidx] = maf;
- if (wt_needed) {
- marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, maf);
- }
}
nonmissing_rate_tot += cur_genotyping_rate;
if (geno_excl_bitfield && (cur_genotyping_rate < geno_thresh)) {
@@ -2678,9 +2728,8 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
return retval;
}
-int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_male, uint32_t sampl [...]
+int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_ [...]
unsigned char* wkspace_mark = wkspace_base;
- FILE* outfile = NULL;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t unfiltered_sample_ct2l = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
uintptr_t unfiltered_sample_ctv2 = (unfiltered_sample_ct2l + 1) & (~1);
@@ -2688,6 +2737,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
uintptr_t* sample_male_include2 = NULL;
uint64_t* om_entry_ptr = NULL;
uintptr_t* cur_omidxs = NULL;
+ char* pzwritep = NULL;
uint32_t* sample_to_cluster = NULL;
uint32_t* missing_ct_by_cluster = NULL;
uint32_t* oblig_missing_ct_by_cluster = NULL;
@@ -2705,6 +2755,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
uint32_t om_cluster_ct = 0;
uint32_t om_cluster_ctl = 0;
int32_t retval = 0;
+ Pigz_state ps;
uintptr_t* loadbuf;
uintptr_t* sample_include2;
uintptr_t* cur_nm;
@@ -2712,7 +2763,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
uintptr_t* lptr2;
uint32_t* missing_cts;
uint32_t* cur_cluster_sizes;
- char* wptr;
+ unsigned char* overflow_buf;
char* cptr;
char* cptr2;
uintptr_t marker_ct_nony;
@@ -2734,7 +2785,9 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
uint32_t ukk;
uint32_t umm;
uint32_t unn;
- if (wkspace_alloc_ui_checked(&missing_cts, unfiltered_sample_ct * sizeof(int32_t)) ||
+ pzwrite_init_null(&ps);
+ if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + MAXLINELEN) ||
+ wkspace_alloc_ui_checked(&missing_cts, unfiltered_sample_ct * sizeof(int32_t)) ||
wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&sample_include2, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&sample_male_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
@@ -2753,10 +2806,12 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
if (fseeko(bedfile, bed_offset, SEEK_SET)) {
goto write_missingness_reports_ret_READ_FAIL;
}
- memcpy(outname_end, ".lmiss", 7);
- if (fopen_checked(&outfile, outname, "w")) {
- goto write_missingness_reports_ret_WRITE_FAIL;
+ memcpy(outname_end, output_gz? ".lmiss.gz" : ".lmiss", output_gz? 10 : 7);
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+ goto write_missingness_reports_ret_OPEN_FAIL;
}
+ pzwritep = (char*)overflow_buf;
+
if (om_ip->entry_ct) {
om_entry_ptr = om_ip->entries;
om_cluster_ref_cts = om_ip->cluster_ref_cts;
@@ -2789,7 +2844,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
}
ujj = unfiltered_sample_ct2l * BITCT2;
if (!cluster_ct) {
- sprintf(tbuf, " CHR %%%us N_MISS N_GENO F_MISS\n", plink_maxsnp);
+ sprintf(tbuf, " CHR %%%us N_MISS N_GENO F_MISS" EOLN_STR, plink_maxsnp);
} else {
if (wkspace_alloc_ui_checked(&sample_to_cluster, unfiltered_sample_ct * sizeof(int32_t)) ||
wkspace_alloc_ui_checked(&missing_ct_by_cluster, cluster_ct * sizeof(int32_t)) ||
@@ -2815,9 +2870,10 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
}
}
}
- sprintf(tbuf, " CHR %%%us CLST N_MISS N_CLST N_GENO F_MISS\n", plink_maxsnp);
+ sprintf(tbuf, " CHR %%%us CLST N_MISS N_CLST N_GENO F_MISS" EOLN_STR, plink_maxsnp);
}
- fprintf(outfile, tbuf, "SNP");
+
+ pzwritep += sprintf(pzwritep, tbuf, "SNP");
for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
@@ -2879,10 +2935,12 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
ulii &= ulii - 1;
}
}
- wptr = uint32_writew8x(cptr2, ukk - oblig_ct, ' ');
- wptr = uint32_writew8x(wptr, cur_tot - oblig_ct, ' ');
- wptr = double_g_writewx4x(wptr, ((double)((int32_t)(ukk - oblig_ct))) / ((double)((int32_t)(cur_tot - oblig_ct))), 8, '\n');
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ pzwritep = memcpya(pzwritep, tbuf, cptr2 - tbuf);
+ pzwritep = uint32_writew8x(pzwritep, ukk - oblig_ct, ' ');
+ pzwritep = uint32_writew8x(pzwritep, cur_tot - oblig_ct, ' ');
+ pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)(ukk - oblig_ct))) / ((double)((int32_t)(cur_tot - oblig_ct))), 8);
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto write_missingness_reports_ret_WRITE_FAIL;
}
} else {
@@ -2926,16 +2984,18 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
}
}
for (clidx = 0; clidx < cluster_ct; clidx++) {
- wptr = fw_strcpy(10, &(cluster_ids[clidx * max_cluster_id_len]), cptr2);
- *wptr++ = ' ';
+ pzwritep = memcpya(pzwritep, tbuf, cptr2 - tbuf);
+ pzwritep = fw_strcpy(10, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+ *pzwritep++ = ' ';
uii = missing_ct_by_cluster[clidx];
- wptr = uint32_writew8x(wptr, uii, ' ');
+ pzwritep = uint32_writew8x(pzwritep, uii, ' ');
umm = cur_cluster_sizes[clidx];
- wptr = uint32_writew8x(wptr, umm, ' ');
+ pzwritep = uint32_writew8x(pzwritep, umm, ' ');
umm -= oblig_missing_ct_by_cluster[clidx];
- wptr = uint32_writew8x(wptr, umm, ' ');
- wptr = double_g_writewx4x(wptr, ((double)((int32_t)uii)) / ((double)((int32_t)umm)), 8, '\n');
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ pzwritep = uint32_writew8x(pzwritep, umm, ' ');
+ pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)uii)) / ((double)((int32_t)umm)), 8);
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto write_missingness_reports_ret_WRITE_FAIL;
}
}
@@ -2952,15 +3012,16 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
} while (marker_uidx < chrom_end);
}
}
- if (fclose_null(&outfile)) {
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
goto write_missingness_reports_ret_WRITE_FAIL;
}
outname_end[1] = 'i';
- if (fopen_checked(&outfile, outname, "w")) {
- goto write_missingness_reports_ret_WRITE_FAIL;
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+ goto write_missingness_reports_ret_OPEN_FAIL;
}
- sprintf(tbuf, "%%%us %%%us MISS_PHENO N_MISS N_GENO F_MISS\n", plink_maxfid, plink_maxiid);
- fprintf(outfile, tbuf, "FID", "IID");
+ pzwritep = (char*)overflow_buf;
+ sprintf(tbuf, "%%%us %%%us MISS_PHENO N_MISS N_GENO F_MISS" EOLN_STR, plink_maxfid, plink_maxiid);
+ pzwritep += sprintf(pzwritep, tbuf, "FID", "IID");
do {
sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx);
sample_uidx_stop = next_set(sample_exclude, sample_uidx, unfiltered_sample_ct);
@@ -2969,12 +3030,12 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
cptr2 = (char*)memchr(cptr, '\t', max_sample_id_len);
slen = (uintptr_t)(cptr2 - cptr);
- wptr = memseta(tbuf, 32, plink_maxfid - slen);
- wptr = memcpyax(wptr, cptr, slen, ' ');
- wptr = fw_strcpy(plink_maxiid, &(cptr2[1]), wptr);
- wptr = memseta(wptr, 32, 10);
- *wptr++ = 'Y' - (is_set(pheno_nm, sample_uidx) * 11);
- *wptr++ = ' ';
+ pzwritep = memseta(pzwritep, 32, plink_maxfid - slen);
+ pzwritep = memcpyax(pzwritep, cptr, slen, ' ');
+ pzwritep = fw_strcpy(plink_maxiid, &(cptr2[1]), pzwritep);
+ pzwritep = memseta(pzwritep, 32, 10);
+ *pzwritep++ = 'Y' - (is_set(pheno_nm, sample_uidx) * 11);
+ *pzwritep++ = ' ';
uii = missing_cts[sample_uidx];
ukk = is_set(sex_male, sample_uidx);
ujj = marker_ct_nony + (ukk * marker_ct_y);
@@ -2986,23 +3047,28 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
ujj -= umm;
}
}
- wptr = uint32_writew8x(wptr, uii, ' ');
- wptr = uint32_writew8x(wptr, ujj, ' ');
- wptr = double_g_writewx4x(wptr, ((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 8, '\n');
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ pzwritep = uint32_writew8x(pzwritep, uii, ' ');
+ pzwritep = uint32_writew8x(pzwritep, ujj, ' ');
+ pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 8);
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto write_missingness_reports_ret_WRITE_FAIL;
}
} while (++sample_uidx < sample_uidx_stop);
} while (sample_idx < sample_ct);
- if (fclose_null(&outfile)) {
+
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
goto write_missingness_reports_ret_WRITE_FAIL;
}
*outname_end = '\0';
- LOGPRINTFWW("--missing: Sample missing data report written to %s.imiss, and variant-based %smissing data report written to %s.lmiss.\n", outname, cluster_ct? "cluster-stratified " : "", outname);
+ LOGPRINTFWW("--missing: Sample missing data report written to %s.imiss%s, and variant-based %smissing data report written to %s.lmiss%s.\n", outname, output_gz? ".gz" : "", cluster_ct? "cluster-stratified " : "", outname, output_gz? ".gz" : "");
while (0) {
write_missingness_reports_ret_NOMEM:
retval = RET_NOMEM;
break;
+ write_missingness_reports_ret_OPEN_FAIL:
+ retval = RET_OPEN_FAIL;
+ break;
write_missingness_reports_ret_READ_FAIL:
retval = RET_READ_FAIL;
break;
@@ -3011,45 +3077,53 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
break;
}
wkspace_reset(wkspace_mark);
- fclose_cond(outfile);
+ flex_pzwrite_close_cond(&ps, pzwritep);
return retval;
}
-int32_t hardy_report_write_line(FILE* outfile, char* prefix_buf, uint32_t prefix_len, uint32_t reverse, uint32_t ll_ct, uint32_t lh_ct, uint32_t hh_ct, char* midbuf_ptr, double pval, double output_min_p) {
+int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* prefix_buf, uint32_t prefix_len, uint32_t reverse, uint32_t ll_ct, uint32_t lh_ct, uint32_t hh_ct, char* midbuf_ptr, double pval, double output_min_p) { // Appends one report line at *pzwritep_ptr and passes it to flex_pzwrite(); returns 1 on write failure, otherwise 0 with *pzwritep_ptr advanced. NOTE(review): midbuf_ptr is no longer referenced by this version of the body
+ char* pzwritep = *pzwritep_ptr; // local write cursor; stored back only on success
char wbuf[48]; // scratch for the "count/count/count" genotype text
char* cptr;
uint32_t denom;
double drecip;
double minor_freq;
- fwrite(prefix_buf, 1, prefix_len, outfile);
+ pzwritep = memcpya(pzwritep, prefix_buf, prefix_len); // line starts with the caller-prepared prefix
if (reverse) { // reverse swaps the order in which the ll and hh counts are printed
cptr = uint32_write(uint32_writex(uint32_writex(wbuf, hh_ct, '/'), lh_ct, '/'), ll_ct);
} else {
cptr = uint32_write(uint32_writex(uint32_writex(wbuf, ll_ct, '/'), lh_ct, '/'), hh_ct);
}
- cptr = fw_strcpyn(20, cptr - wbuf, wbuf, midbuf_ptr);
- *cptr++ = ' ';
+ pzwritep = fw_strcpyn(20, cptr - wbuf, wbuf, pzwritep); // presumably copies the genotype text into a fixed-width (20) field -- confirm against helper
+ *pzwritep++ = ' ';
denom = (ll_ct + lh_ct + hh_ct) * 2; // twice the total genotype count
if (denom) {
drecip = 1.0 / ((double)denom);
minor_freq = (2 * ll_ct + lh_ct) * drecip;
- cptr = double_g_writewx4x(double_g_writewx4x(double_g_writewx4x(cptr, (lh_ct * 2) * drecip, 8, ' '), minor_freq * (2 * hh_ct + lh_ct) * drecip * 2, 8, ' '), MAXV(pval, output_min_p), 12, '\n');
+ pzwritep = double_g_writewx4(double_g_writewx4x(double_g_writewx4x(pzwritep, (lh_ct * 2) * drecip, 8, ' '), minor_freq * (2 * hh_ct + lh_ct) * drecip * 2, 8, ' '), MAXV(pval, output_min_p), 12); // three values in order: lh fraction, 2 * minor_freq * other-allele frequency, and MAXV(pval, output_min_p); line terminator is appended separately below
} else {
- cptr = memcpya(cptr, " nan nan NA\n", 31);
+ pzwritep = memcpya(pzwritep, " nan nan NA", 30); // all counts zero: placeholder text instead of dividing by zero
}
- return fwrite_checked(midbuf_ptr, (cptr - midbuf_ptr), outfile);
+ append_binary_eoln(&pzwritep); // line terminator now comes from this helper rather than a literal '\n'
+ if (flex_pzwrite(ps_ptr, &pzwritep)) {
+ return 1; // *pzwritep_ptr is left unchanged on failure
+ }
+ *pzwritep_ptr = pzwritep; // hand the (possibly moved) cursor back to the caller
+ return 0;
}
int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t nonfounders, int32_t* hwe_ll_cases, int32_t* hwe_lh_cases, int32_t* hwe_hh_cases, int [...]
- FILE* outfile = NULL;
unsigned char* wkspace_mark = wkspace_base;
+ char* pzwritep = NULL;
uintptr_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
uintptr_t marker_uidx = 0;
uintptr_t marker_idx = 0;
uint32_t hwe_midp = hwe_modifier & HWE_MIDP;
+ uint32_t output_gz = (hwe_modifier / HWE_GZ) & 1;
int32_t retval = 0;
uint32_t skip_chrom = 0;
uint32_t pct = 0;
+ Pigz_state ps;
uint32_t prefix_len;
uint32_t loop_end;
uint32_t uii;
@@ -3062,6 +3136,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
uint32_t chrom_end;
uint32_t reverse;
double* p_values;
+ unsigned char* overflow_buf;
char* writebuf;
char* cptr0;
char* cptr;
@@ -3069,13 +3144,15 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
char* cptr3;
char* cptr4;
char* cptr5;
+ pzwrite_init_null(&ps);
if (pheno_nm_ct) {
report_type = pheno_c? 0 : 1;
} else {
report_type = 2;
}
uii = report_type? 1 : 3;
- if (wkspace_alloc_d_checked(&p_values, uii * marker_ct * sizeof(double)) ||
+ if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN) ||
+ wkspace_alloc_d_checked(&p_values, uii * marker_ct * sizeof(double)) ||
wkspace_alloc_c_checked(&writebuf, 2 * max_marker_allele_len + MAXLINELEN)) {
goto hardy_report_ret_NOMEM;
}
@@ -3097,16 +3174,18 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
marker_uidx = 0;
marker_idx = 0;
- memcpy(outname_end, ".hwe", 5);
- if (fopen_checked(&outfile, outname, "w")) {
+ memcpy(outname_end, output_gz? ".hwe.gz" : ".hwe", output_gz? 8 : 5);
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
goto hardy_report_ret_OPEN_FAIL;
}
+ pzwritep = (char*)overflow_buf;
+
LOGPRINTFWW5("--hardy: Writing Hardy-Weinberg report (%s) to %s ... ", nonfounders? "all samples" : "founders only", outname);
fputs("0%", stdout);
fflush(stdout);
- sprintf(writebuf, " CHR %%%us TEST A1 A2 GENO O(HET) E(HET) P \n", plink_maxsnp);
- fprintf(outfile, writebuf, "SNP");
-
+ sprintf(writebuf, " CHR %%%us TEST A1 A2 GENO O(HET) E(HET) P " EOLN_STR, plink_maxsnp);
+ pzwritep += sprintf(pzwritep, writebuf, "SNP");
+
chrom_fo_idx = 0;
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
skip_chrom = (is_haploid && (!is_x)) || is_mt;
@@ -3152,7 +3231,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
cptr5 = fw_strcpy(4, cptr4, &(cptr5[1]));
*cptr5 = ' ';
prefix_len = 1 + (cptr5 - writebuf);
- if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[marker_idx], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[marker_idx], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
}
@@ -3195,17 +3274,17 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
cptr5 = fw_strcpy(4, cptr4, &(cptr5[1]));
*cptr5 = ' ';
prefix_len = 1 + (cptr5 - writebuf);
- if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[3 * marker_idx], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[3 * marker_idx], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
memcpy(&(cptr0[7 + plink_maxsnp]), "FF", 2);
- if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_ll_cases[marker_uidx], hwe_lh_cases[marker_uidx], hwe_hh_cases[marker_uidx], cptr2, p_values[3 * marker_idx + 1], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_cases[marker_uidx], hwe_lh_cases[marker_uidx], hwe_hh_cases[marker_uidx], cptr2, p_values[3 * marker_idx + 1], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
memcpy(&(cptr0[4 + plink_maxsnp]), "UN", 2);
- if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_lls[marker_uidx], hwe_lhs[marker_uidx], hwe_hhs[marker_uidx], cptr2, p_values[3 * marker_idx + 2], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_lls[marker_uidx], hwe_lhs[marker_uidx], hwe_hhs[marker_uidx], cptr2, p_values[3 * marker_idx + 2], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
}
@@ -3232,7 +3311,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
retval = RET_WRITE_FAIL;
break;
}
- fclose_cond(outfile);
+ flex_pzwrite_close_cond(&ps, pzwritep);
wkspace_reset(wkspace_mark);
return retval;
}
diff --git a/plink_filter.h b/plink_filter.h
index 9c79b7b..9c5194d 100644
--- a/plink_filter.h
+++ b/plink_filter.h
@@ -31,6 +31,10 @@ uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_
int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr);
+uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr);
+
+int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr);
+
int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip);
int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t sorted_ids_len, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* filtervals_flattened, uint32_t mfilter_col);
@@ -39,9 +43,9 @@ void filter_samples_bitfields(uintptr_t unfiltered_sample_ct, uintptr_t* sample_
int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double mind_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip);
-int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
+int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
-int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_male, uint32_t sampl [...]
+int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_ [...]
int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t nonfounders, int32_t* hwe_ll_cases, int32_t* hwe_lh_cases, int32_t* hwe_hh_cases, int [...]
diff --git a/plink_glm.c b/plink_glm.c
index 7d93bdb..27759b0 100644
--- a/plink_glm.c
+++ b/plink_glm.c
@@ -3953,12 +3953,13 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
}
}
}
+ np_base = param_ct_max - np_diploid;
if (!sex_covar_everywhere) {
np_sex = popcount_bit_idx(active_params, sex_start_idx, param_raw_ct_max);
- }
- np_base = param_ct_max - np_diploid - np_sex;
- if (!np_sex) {
- variation_in_sex = 0;
+ np_base -= np_sex;
+ if (!np_sex) {
+ variation_in_sex = 0;
+ }
}
} else {
fill_all_bits(active_params, param_raw_ct_max);
@@ -3967,13 +3968,6 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
np_diploid = np_diploid_raw;
np_sex = np_sex_raw;
}
- if (sample_valid_ct <= param_ct_max) {
- logprint("Warning: Skipping --linear since # variables >= # samples.\n");
- if (pheno_nm_ct > param_ct_max) {
- logprint("(Check your covariates--all samples with at least one missing covariate are\nexcluded from this analysis.)\n");
- }
- goto glm_common_init_ret_1;
- }
// parameter sequence:
// 1. intercept
// 2. allelic dosage
@@ -4698,6 +4692,13 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
if (retval) {
goto glm_linear_assoc_ret_1;
}
+ if (sample_valid_ct <= param_ct_max) {
+ logprint("Warning: Skipping --linear since # variables >= # samples.\n");
+ if (pheno_nm_ct > param_ct_max) {
+ logprint("(Check your covariates--all samples with at least one missing covariate are\nexcluded from this analysis.)\n");
+ }
+ goto glm_linear_assoc_ret_1;
+ }
sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
final_mask = get_final_mask(sample_valid_ct);
param_ctx_max_m1 = param_ctx_max - 1;
@@ -4714,6 +4715,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
// use this array to track regression failures even in max(T) case
fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
} else {
+ g_perm_adapt_stop = NULL;
ulii = (marker_initial_ct + (BITCT - 1)) / BITCT;
if (wkspace_alloc_ul_checked(&regression_skip, ulii * sizeof(intptr_t))) {
goto glm_linear_assoc_ret_NOMEM;
@@ -5219,12 +5221,15 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
} else if ((!g_min_ploidy_1) || (!genotypic_or_hethom)) {
cur_param_ct = np_base + np_diploid;
if (constraint_ct_max) {
- cur_constraint_ct = popcount_bit_idx(g_joint_test_params, 0, constraint_ct_max - np_sex);
+ // bugfix: this incorrectly had constraint_ct_max as last parameter
+ cur_constraint_ct = popcount_bit_idx(g_joint_test_params, 0, cur_param_ct);
} else {
cur_constraint_ct = 0;
}
cur_param_names = param_names;
} else {
+ // er, is this still reachable with forced --xchr-model 0? should it
+ // be reachable?
cur_param_ct = np_base;
cur_constraint_ct = 0;
if (constraint_ct_max) {
@@ -5451,7 +5456,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
goto glm_linear_assoc_ret_WRITE_FAIL;
}
}
- } else if (orig_pvals) {
+ } else if (orig_pvals && constraint_ct_max) {
orig_pvals[marker_idx3] = -9;
}
} else {
@@ -6195,6 +6200,10 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
uint32_t ujj;
uint32_t ukk;
numbuf[0] = ' ';
+ if (pheno_nm_ct < 2) {
+ logprint("Warning: Skipping --logistic since less than two phenotypes are present.\n");
+ goto glm_logistic_assoc_ret_1;
+ }
if ((chrom_info_ptr->mt_code != -1) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->mt_code)) {
hh_or_mt_exists |= NXMHH_EXISTS;
}
@@ -6221,6 +6230,13 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
if (retval) {
goto glm_logistic_assoc_ret_1;
}
+ if (sample_valid_ct <= param_ct_max) {
+ logprint("Warning: Skipping --logistic since # variables >= # samples.\n");
+ if (pheno_nm_ct > param_ct_max) {
+ logprint("(Check your covariates--all samples with at least one missing covariate are\nexcluded from this analysis.)\n");
+ }
+ goto glm_logistic_assoc_ret_1;
+ }
sample_valid_cta4 = (sample_valid_ct + 3) & (~3);
sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
final_mask = get_final_mask(sample_valid_ct);
@@ -6239,6 +6255,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
// use this array to track regression failures even in max(T) case
fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
} else {
+ g_perm_adapt_stop = NULL;
ulii = (marker_initial_ct + (BITCT - 1)) / BITCT;
if (wkspace_alloc_ul_checked(&regression_skip, ulii * sizeof(intptr_t))) {
goto glm_logistic_assoc_ret_NOMEM;
@@ -6867,7 +6884,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
goto glm_logistic_assoc_ret_WRITE_FAIL;
}
}
- } else if (orig_pvals) {
+ } else if (orig_pvals && constraint_ct_max) {
orig_pvals[marker_idx3] = -9;
}
} else {
diff --git a/plink_help.c b/plink_help.c
index 62e26f3..dc1aaf7 100644
--- a/plink_help.c
+++ b/plink_help.c
@@ -382,7 +382,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" --recode <01 | 12> <23 | A{-transpose} | AD | beagle{-nomap} | bimbam{-1chr}\n"
" | compound-genotypes | fastphase{-1chr} | HV{-1chr} | lgen{-ref} |\n"
" list | oxford | rlist | structure | transpose | vcf | vcf-fid |\n"
-" vcf-iid> <tab | tabx | spacex> <include-alt>\n"
+" vcf-iid> <tab | tabx | spacex | bgz> <include-alt>\n"
" Create a new text fileset with all filters applied. By default, the\n"
" fileset consists of a .ped and a .map file, readable with --file.\n"
" * The '12' modifier causes A1 (usually minor) alleles to be coded as '1'\n"
@@ -422,6 +422,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" 'vcf-fid' and 'vcf-iid' cause family IDs or within-family IDs\n"
" respectively to be used for the sample IDs in the last header row, while\n"
" 'vcf' merges both IDs and puts an underscore between them.\n"
+" If the 'bgz' modifier is added, the VCF file is block-gzipped.\n"
" The A2 allele is saved as the reference and normally flagged as not based\n"
" on a real reference genome ('PR' INFO field value). When it is important\n"
" for reference alleles to be correct, you'll also want to include\n"
@@ -498,8 +499,8 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" from the report.\n\n"
);
help_print("freq\tfreqx\tfrqx\tcounts", &help_ctrl, 1,
-" --freq <counts>\n"
-" --freqx\n"
+" --freq <counts> <gz>\n"
+" --freqx <gz>\n"
" --freq generates a basic allele frequency (or count, if the 'counts'\n"
" modifier is present) report. This can be combined with --within/--family\n"
" to produce a cluster-stratified allele frequency/count report instead.\n"
@@ -507,16 +508,17 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" with --read-freq.\n\n"
);
help_print("missing", &help_ctrl, 1,
-" --missing\n"
+" --missing <gz>\n"
" Generate sample- and variant-based missing data reports. If clusters are\n"
-" defined, the variant-based report is cluster-stratified.\n\n"
+" defined, the variant-based report is cluster-stratified. 'gz' causes the\n"
+" output files to be gzipped.\n\n"
);
help_print("test-mishap", &help_ctrl, 1,
" --test-mishap\n"
" Check for association between missing calls and flanking haplotypes.\n\n"
);
help_print("hardy\thardy2", &help_ctrl, 1,
-" --hardy <midp>\n"
+" --hardy <midp> <gz>\n"
" Generate a Hardy-Weinberg exact test p-value report. (This does NOT\n"
" simultaneously filter on the p-value any more; use --hwe for that.) With\n"
" the 'midp' modifier, the test applies the mid-p adjustment described in\n"
@@ -528,7 +530,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" Generate a Mendel error report.\n\n"
);
help_print("het\tibc", &help_ctrl, 1,
-" --het <small-sample>\n"
+" --het <small-sample> <gz>\n"
" --ibc\n"
" Estimate inbreeding coefficients. --het reports method-of-moments\n"
" estimates, while --ibc calculates all three values described in Yang J, Lee\n"
@@ -1071,7 +1073,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
#endif
#endif
help_print("tdt\tpoo\tperm\tmperm\tparentdt1\tparentdt2\tpat\tmat\tset-test", &help_ctrl, 1,
-" --tdt <exact | exact-midp | poo> <perm | mperm=[value]>\n"
+" --tdt <exact | exact-midp | poo> <perm | mperm=[value]> <perm-count>\n"
" <parentdt1 | parentdt2 | pat | mat> <set-test>\n"
" Report transmission disequilibrium test statistics, given case/control\n"
" phenotypes and pedigree information.\n"
@@ -1093,17 +1095,27 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" parent-of-origin test Z score; 'pat'/'mat' cause paternal or maternal TDT\n"
" chi-square statistics, respectively, to be considered instead.\n\n"
);
+#ifndef STABLE_BUILD
+ help_print("dfam", &help_ctrl, 1,
+" --dfam <no-unrelateds> <perm | mperm=[value]> <perm-count> <set-test>\n"
+" Sib-TDT-based association test. By default, clusters of unrelated\n"
+" individuals are included in the test; the 'no-unrelateds' modifier removes\n"
+" this component, leaving the original sib-TDT.\n\n"
+ );
+#endif
help_print("qfam\tqfam-between\tqfam-parents\tqfam-total", &help_ctrl, 1,
-" --qfam <perm | mperm=[value]> <perm-count>\n"
-" --qfam-parents <perm | mperm=[value]> <perm-count>\n"
-" --qfam-between <perm | mperm=[value]> <perm-count>\n"
-" --qfam-total <perm | mperm=[value]> <perm-count>\n"
+" --qfam <perm | mperm=[value]> <perm-count> <emp-se>\n"
+" --qfam-parents <perm | mperm=[value]> <perm-count> <emp-se>\n"
+" --qfam-between <perm | mperm=[value]> <perm-count> <emp-se>\n"
+" --qfam-total <perm | mperm=[value]> <perm-count> <emp-se>\n"
" QFAM family-based association test for quantitative traits.\n"
" * A Mendel error check is performed before the main tests; offending\n"
" genotypes are treated as missing by this analysis.\n"
" * This procedure requires permutation. 'perm' and 'perm-count' have the\n"
" usual meanings. However, 'mperm=[value]' just specifies a fixed number\n"
-" of permutations; the method does not support a proper max(T) test.\n\n"
+" of permutations; the method does not support a proper max(T) test.\n"
+" * The 'emp-se' modifier adds BETA and EMP_SE (empirical standard error for\n"
+" beta) fields to the .perm output file.\n\n"
);
help_print("annotate", &help_ctrl, 1,
" --annotate [PLINK report] <attrib=[file]> <ranges=[file]> <filter=[file]>\n"
@@ -1240,10 +1252,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
#if defined __cplusplus && !defined _WIN32 && !defined STABLE_BUILD
help_print("R\tR-debug", &help_ctrl, 1,
" --R [R script file] <debug>\n"
-" Connect to a Rserve background process, and execute the Rplink function\n"
-" defined in the input file. (Unless the 'debug' modifier is present; in\n"
-" that case, the R commands that PLINK would have tried to execute are logged\n"
-" to a file.)\n\n"
+" Connect to a Rserve (preferably version 1.7 or later) background process,\n"
+" and execute the Rplink function defined in the input file. (Unless the\n"
+" 'debug' modifier is present; in that case, the R commands that PLINK would\n"
+" have tried to execute are logged to a file.)\n\n"
);
#endif
/*
@@ -1408,7 +1420,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" using the values in the main input fileset.\n"
" --all-pheno : For basic association tests, loop through all phenotypes\n"
" in --pheno file.\n"
-" --mpheno [col] : Specify phenotype column number in --pheno file.\n"
+" --mpheno [n] : Load phenotype from column (n+2) in --pheno file.\n"
" --pheno-name [c] : If --pheno file has a header row, use column with the\n"
" given name.\n"
" --pheno-merge : When the main input fileset contains an phenotype value\n"
@@ -1551,6 +1563,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" --bp-space [bps] : Remove variants so that each pair is no closer than the\n"
" given bp distance. (Equivalent to VCFtools --thin.)\n"
);
+ help_print("thin-indiv\tthin-indiv-count\tmax-indv", &help_ctrl, 0,
+" --thin-indiv [p] : Randomly remove samples, retaining with prob. p.\n"
+" --thin-indiv-count [n] : Randomly remove samples until n of them remain.\n"
+ );
help_print("filter\tmfilter", &help_ctrl, 0,
" --filter [f] [val(s)...] : Exclude all samples without a 3rd column entry in\n"
" the given file matching one of the given\n"
@@ -1777,8 +1793,9 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" 7 = report mismatching nonmissing calls without merging\n"
);
help_print("merge\tbmerge\tmerge-list\tmerge-mode\tmerge-equal-pos", &help_ctrl, 0,
-" --merge-equal-pos : Merge variants with different names but identical\n"
-" positions.\n"
+" --merge-equal-pos : With --merge/--bmerge/--merge-list, merge variants with\n"
+" different names but identical positions. (Exception:\n"
+" same-position chromosome code 0 variants aren't merged.)\n"
);
help_print("mendel-duos\tmendel-multigen\tme\tmendel\ttdt\tset-me-missing", &help_ctrl, 0,
" --mendel-duos : Make Mendel error checks consider samples with only one\n"
@@ -1820,12 +1837,17 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" informative pairs] ratios to be larger than this\n"
" value (default 0.95).\n"
);
- help_print("distance-exp\texponent\tdistance", &help_ctrl, 0,
-" --distance-exp [x] : When computing genomic distances, assign each variant a\n"
-" weight of (2q(1-q))^{-x}, where q is the inferred MAF.\n"
-" (Use --read-freq if you want to explicitly specify some\n"
-" or all of the MAFs.)\n"
+ help_print("distance-wts\tdistance-exp\texponent\tdistance", &help_ctrl, 0,
+" --distance-wts exp=[x] : When computing genomic distances, assign each\n"
+" variant a weight of (2q(1-q))^{-x}, where q\n"
+" is the loaded or inferred MAF.\n"
);
+#ifndef STABLE_BUILD
+ help_print("distance-wts\tdistance\tmake-grm-gz\tmake-grm-bin", &help_ctrl, 0,
+" --distance-wts [f] <noheader> : When computing genomic distances, assign each\n"
+" variant the weight specified in the file.\n"
+ );
+#endif
help_print("read-dists\tload-dists\tibs-test\tgroupdist\tregress-distance\tcluster\tneighbour\tneighbor", &help_ctrl, 0,
" --read-dists [dist file] {id file} : Load a triangular binary distance matrix\n"
" instead of recalculating from scratch.\n"
@@ -2148,7 +2170,11 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
#endif
if (!param_ct) {
fputs(
-"\nFor further documentation and support, consult the main webpage\n"
+"\nPrimary methods paper:\n"
+"Chang CC, Chow CC, Tellier LCAM, Vattikuti S, Purcell SM, Lee JJ (2015)\n"
+"Second-generation PLINK: rising to the challenge of larger and richer datasets.\n"
+"GigaScience, 4.\n\n"
+"For further documentation and support, consult the main webpage\n"
"(https://www.cog-genomics.org/plink2 ) and/or the mailing list\n"
"(https://groups.google.com/d/forum/plink2-users ).\n"
, stdout);
diff --git a/plink_ld.c b/plink_ld.c
index 2c22be4..fb97c82 100644
--- a/plink_ld.c
+++ b/plink_ld.c
@@ -2156,11 +2156,11 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
int32_t retval = 0;
unsigned char* wkspace_mark2;
uintptr_t* ulptr;
+ unsigned char* overflow_buf;
uint64_t tests_completed;
uintptr_t thread_workload;
uintptr_t cur_idx2_block_size;
uintptr_t marker_idx2_end;
- uintptr_t marker_uidx1_tmp;
uintptr_t block_idx1;
uintptr_t marker_uidx2;
uintptr_t marker_idx2;
@@ -2173,6 +2173,9 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
uint32_t chrom_end;
uint32_t is_last_block;
+ if (wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+ goto ld_report_matrix_ret_NOMEM;
+ }
if (output_single_prec) {
// force divisibility by 16 instead (cacheline = 64 bytes, float = 4)
marker_ctm8 = (marker_ctm8 + 8) & (~15);
@@ -2322,26 +2325,26 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
}
}
g_ld_idx1_block_size = idx1_block_size;
- marker_uidx1_tmp = marker_uidx1;
+ // marker_uidx1_tmp = marker_uidx1;
if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
goto ld_report_matrix_ret_READ_FAIL;
}
chrom_end = 0;
- for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1_tmp++, block_idx1++) {
- if (IS_SET(marker_exclude, marker_uidx1_tmp)) {
- marker_uidx1_tmp = next_unset_ul_unsafe(marker_exclude, marker_uidx1_tmp);
- if (fseeko(bedfile, bed_offset + (marker_uidx1_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+ for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1++, block_idx1++) {
+ if (IS_SET(marker_exclude, marker_uidx1)) {
+ marker_uidx1 = next_unset_ul_unsafe(marker_exclude, marker_uidx1);
+ if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
goto ld_report_matrix_ret_READ_FAIL;
}
}
- if (marker_uidx1_tmp >= chrom_end) {
- chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx1_tmp);
+ if (marker_uidx1 >= chrom_end) {
+ chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx1);
chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
}
- if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno1[block_idx1 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp))) {
+ if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno1[block_idx1 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1))) {
goto ld_report_matrix_ret_READ_FAIL;
}
if (is_haploid && hh_exists) {
@@ -2477,9 +2480,9 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
g_ld_idx2_block_size = marker_idx1 + 1;
}
if (output_gz) {
- parallel_compress(outname, not_first_write, ld_matrix_emitn);
+ parallel_compress(outname, overflow_buf, not_first_write, ld_matrix_emitn);
} else {
- write_uncompressed(outname, not_first_write, ld_matrix_emitn);
+ write_uncompressed(outname, overflow_buf, not_first_write, ld_matrix_emitn);
}
not_first_write = 1;
}
@@ -3048,7 +3051,7 @@ static void two_locus_count_table_zmiss1(uintptr_t* lptr1, uintptr_t* lptr2, uin
counts_3x3[1] = popcount_longs_intersect(lptr1, &(lptr2[sample_ctv3]), sample_ctv3);
if (!is_zmiss2) {
counts_3x3[2] = popcount_longs_intersect(lptr1, &(lptr2[2 * sample_ctv3]), sample_ctv3);
- counts_3x3[5] = popcount_longs_intersect(&(lptr1[2 * sample_ctv3]), &(lptr2[2 * sample_ctv3]), sample_ctv3);
+ counts_3x3[5] = popcount_longs_intersect(&(lptr1[sample_ctv3]), &(lptr2[2 * sample_ctv3]), sample_ctv3);
}
lptr1 = &(lptr1[sample_ctv3]);
counts_3x3[3] = popcount_longs_intersect(lptr1, lptr2, sample_ctv3);
@@ -5182,6 +5185,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
uintptr_t* dummy_nm;
uintptr_t* ulptr;
uint32_t* uiptr;
+ unsigned char* overflow_buf;
unsigned char* wkspace_mark2;
uintptr_t thread_workload;
uintptr_t idx1_block_size;
@@ -5202,7 +5206,8 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
uint32_t cur_marker_pos;
uint32_t is_last_block;
uint32_t uii;
- if (wkspace_alloc_ul_checked(&loadbuf, founder_ctl * 2 * sizeof(intptr_t)) ||
+ if (wkspace_alloc_uc_checked(&overflow_buf, 262144) ||
+ wkspace_alloc_ul_checked(&loadbuf, founder_ctl * 2 * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&dummy_nm, founder_ctl * sizeof(intptr_t))) {
goto ld_report_dprime_ret_NOMEM;
}
@@ -5488,9 +5493,9 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
g_ld_idx2_block_start = 0;
g_ld_block_idx2 = 0;
if (output_gz) {
- parallel_compress(outname, not_first_write, ld_regular_emitn);
+ parallel_compress(outname, overflow_buf, not_first_write, ld_regular_emitn);
} else {
- write_uncompressed(outname, not_first_write, ld_regular_emitn);
+ write_uncompressed(outname, overflow_buf, not_first_write, ld_regular_emitn);
}
not_first_write = 1;
g_ld_is_first_block = 0;
@@ -5573,6 +5578,7 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
uint32_t chrom_last = 0;
int32_t retval = 0;
unsigned char* wkspace_mark2;
+ unsigned char* overflow_buf;
uint32_t* id_map;
char* sorted_ids;
char* bufptr;
@@ -5608,6 +5614,9 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
uint32_t is_last_block;
uint32_t uii;
int32_t ii;
+ if (wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+ goto ld_report_regular_ret_NOMEM;
+ }
if (idx1_subset) {
if (wkspace_alloc_ul_checked(&marker_exclude_idx1, unfiltered_marker_ctl * sizeof(intptr_t))) {
goto ld_report_regular_ret_NOMEM;
@@ -5980,9 +5989,9 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
g_ld_idx2_block_start = 0;
g_ld_block_idx2 = 0;
if (output_gz) {
- parallel_compress(outname, not_first_write, ld_regular_emitn);
+ parallel_compress(outname, overflow_buf, not_first_write, ld_regular_emitn);
} else {
- write_uncompressed(outname, not_first_write, ld_regular_emitn);
+ write_uncompressed(outname, overflow_buf, not_first_write, ld_regular_emitn);
}
not_first_write = 1;
g_ld_is_first_block = 0;
diff --git a/plink_misc.c b/plink_misc.c
index 70a6fe4..9ec831d 100644
--- a/plink_misc.c
+++ b/plink_misc.c
@@ -3,6 +3,8 @@
#include "plink_misc.h"
#include "plink_stats.h"
+#include "pigz.h"
+
void misc_init(Score_info* sc_ip) {
sc_ip->fname = NULL;
sc_ip->range_fname = NULL;
@@ -1712,20 +1714,6 @@ uint32_t calc_plink_maxsnp(uint32_t unfiltered_marker_ct, uintptr_t* marker_excl
return plink_maxsnp;
}
-double calc_wt_mean(double exponent, int32_t lhi, int32_t lli, int32_t hhi) {
- double lcount = (double)lli + ((double)lhi * 0.5);
- int64_t tot = lhi + lli + hhi;
- double dtot = (double)tot;
- int64_t subcount = lli; // avoid 32-bit integer overflow
- double weight;
- if ((!lhi) && ((!lli) || (!hhi))) {
- return 0.0;
- }
- weight = pow(2 * lcount * (dtot - lcount) / (dtot * dtot), -exponent);
- subcount = lhi * (subcount + hhi) + 2 * subcount * hhi;
- return (subcount * weight * 2) / (double)(tot * tot);
-}
-
// aptr1 = minor, aptr2 = major
int32_t load_one_freq(uint32_t alen1, const char* aptr1, uint32_t alen2, const char* aptr2, double maf, double* set_allele_freq_ptr, char** mastrs_ptr, char missing_geno) {
uint32_t malen0 = strlen(mastrs_ptr[0]);
@@ -1848,7 +1836,7 @@ uint32_t get_freq_file_type(char* bufptr) {
return 0;
}
-int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ, double exponent, uint32_t wt_needed, double* marker_weights) {
+int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ) {
unsigned char* wkspace_mark = wkspace_base;
FILE* freqfile = NULL;
uintptr_t line_idx = 0;
@@ -1994,9 +1982,6 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
if (retval) {
goto read_external_freqs_ret_ALLELE_MISMATCH;
}
- if (wt_needed) {
- marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, set_allele_freqs[marker_uidx]);
- }
}
}
}
@@ -2080,13 +2065,6 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
if (retval) {
goto read_external_freqs_ret_ALLELE_MISMATCH;
}
- if (wt_needed) {
- if (c_hap_a1 || c_hap_a2) {
- marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, set_allele_freqs[marker_uidx]);
- } else {
- marker_weights[marker_uidx] = calc_wt_mean(exponent, c_het, c_hom_a1, c_hom_a2);
- }
- }
}
}
}
@@ -2134,9 +2112,6 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
if (retval) {
goto read_external_freqs_ret_ALLELE_MISMATCH;
}
- if (wt_needed) {
- marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, set_allele_freqs[marker_uidx]);
- }
} else {
// if there aren't exactly 3 columns, this isn't a GCTA .freq file
bufptr = next_token(bufptr);
@@ -2356,9 +2331,10 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
return retval;
}
-int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t sample_f_male_ct, uintptr_t* marker_reve [...]
+int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t s [...]
unsigned char* wkspace_mark = wkspace_base;
- FILE* outfile = NULL;
+ char* writebuf = tbuf;
+ char* pzwritep = NULL;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
uint32_t* cur_cluster_map = cluster_map;
@@ -2371,10 +2347,12 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
uint32_t cslen = 10;
int32_t retval = 0;
uint32_t cur_cts[4];
+ Pigz_state ps;
uintptr_t* readbuf;
uint32_t* uiptr;
uint32_t* uiptr2;
uint32_t* uiptr3;
+ unsigned char* overflow_buf;
char* csptr;
char* col_2_start;
char* wptr_start;
@@ -2391,9 +2369,17 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
uint32_t a1_obs;
uint32_t tot_obs;
uint32_t uii;
- if (wkspace_alloc_ul_checked(&readbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+ pzwrite_init_null(&ps);
+ uii = 2 * max_marker_allele_len + max_marker_id_len + max_cluster_id_len + 256;
+ if (wkspace_alloc_uc_checked(&overflow_buf, uii + PIGZ_BLOCK_SIZE) ||
+ wkspace_alloc_ul_checked(&readbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
goto write_stratified_freqs_ret_NOMEM;
}
+ if (uii > MAXLINELEN) {
+ if (wkspace_alloc_c_checked(&writebuf, uii)) {
+ goto write_stratified_freqs_ret_NOMEM;
+ }
+ }
if ((sample_ct > sample_f_ct) && (!nonfounders)) {
if (wkspace_alloc_ui_checked(&cur_cluster_starts, (cluster_ct + 1) * sizeof(int32_t)) ||
wkspace_alloc_ui_checked(&cur_cluster_map, sample_f_ct * sizeof(int32_t))) {
@@ -2453,11 +2439,13 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
cluster_starts_male[clidx + 1] = clmpos;
}
}
- if (fopen_checked(&outfile, outname, "w")) {
+ memcpy(outname_end, output_gz? ".frq.strat.gz" : ".frq.strat", output_gz? 14 : 11);
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
goto write_stratified_freqs_ret_OPEN_FAIL;
}
- sprintf(tbuf, " CHR %%%ds CLST A1 A2 MAF MAC NCHROBS\n", plink_maxsnp);
- fprintf(outfile, tbuf, "SNP");
+ pzwritep = (char*)overflow_buf;
+ sprintf(tbuf, " CHR %%%us CLST A1 A2 MAF MAC NCHROBS" EOLN_STR, plink_maxsnp);
+ pzwritep += sprintf(pzwritep, tbuf, "SNP");
if (wkspace_alloc_c_checked(&csptr, 2 * max_marker_allele_len + 16)) {
goto write_stratified_freqs_ret_NOMEM;
}
@@ -2477,7 +2465,7 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
goto write_stratified_freqs_ret_READ_FAIL;
}
- col_2_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+ col_2_start = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_idx));
*col_2_start++ = ' ';
do {
sptr = &(marker_ids[marker_uidx * max_marker_id_len]);
@@ -2502,8 +2490,9 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
uiptr = cluster_map_nonmale;
uiptr2 = cluster_map_male;
for (clidx = 0; clidx < cluster_ct; clidx++) {
- wptr = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), wptr_start);
- wptr = memcpyax(wptr, csptr, cslen, ' ');
+ pzwritep = memcpya(pzwritep, writebuf, wptr_start - writebuf);
+ pzwritep = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+ pzwritep = memcpya(pzwritep, csptr, cslen);
fill_uint_zero(cur_cts, 4);
uiptr3 = &(cluster_map_nonmale[cluster_starts_nonmale[clidx + 1]]);
while (uiptr < uiptr3) {
@@ -2521,22 +2510,24 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
a1_obs += cur_cts[0];
tot_obs += cur_cts[0] + cur_cts[3];
if (tot_obs) {
- wptr = double_g_writewx4x(wptr, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
- wptr = uint32_writew6x(wptr, a1_obs, ' ');
- wptr = uint32_writew8(wptr, tot_obs);
- wptr = memcpya(wptr, " \n", 2);
+ pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
+ pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
+ pzwritep = uint32_writew8(pzwritep, tot_obs);
+ *pzwritep++ = ' ';
} else {
- wptr = memcpya(wptr, " 0 0 0 \n", 26);
+ pzwritep = memcpya(pzwritep, " 0 0 0 ", 25);
}
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto write_stratified_freqs_ret_WRITE_FAIL;
}
}
} else if (is_y) {
uiptr = cluster_map_male;
for (clidx = 0; clidx < cluster_ct; clidx++) {
- wptr = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), wptr_start);
- wptr = memcpyax(wptr, csptr, cslen, ' ');
+ pzwritep = memcpya(pzwritep, writebuf, wptr_start - writebuf);
+ pzwritep = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+ pzwritep = memcpya(pzwritep, csptr, cslen);
fill_uint_zero(cur_cts, 4);
uiptr2 = &(cluster_map_male[cluster_starts_male[clidx + 1]]);
while (uiptr < uiptr2) {
@@ -2551,22 +2542,24 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
tot_obs = 2 * (cur_cts[0] + cur_cts[2] + cur_cts[3]);
}
if (tot_obs) {
- wptr = double_g_writewx4x(wptr, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
- wptr = uint32_writew6x(wptr, a1_obs, ' ');
- wptr = uint32_writew8(wptr, tot_obs);
- wptr = memcpya(wptr, " \n", 2);
+ pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
+ pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
+ pzwritep = uint32_writew8(pzwritep, tot_obs);
+ *pzwritep++ = ' ';
} else {
- wptr = memcpya(wptr, " 0 0 0 \n", 26);
+ pzwritep = memcpya(pzwritep, " 0 0 0 ", 25);
}
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto write_stratified_freqs_ret_WRITE_FAIL;
}
}
} else {
uiptr = cur_cluster_map;
for (clidx = 0; clidx < cluster_ct; clidx++) {
- wptr = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), wptr_start);
- wptr = memcpyax(wptr, csptr, cslen, ' ');
+ pzwritep = memcpya(pzwritep, writebuf, wptr_start - writebuf);
+ pzwritep = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+ pzwritep = memcpya(pzwritep, csptr, cslen);
fill_uint_zero(cur_cts, 4);
uiptr2 = &(cur_cluster_map[cur_cluster_starts[clidx + 1]]);
while (uiptr < uiptr2) {
@@ -2581,14 +2574,15 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
tot_obs = 2 * (cur_cts[0] + cur_cts[2] + cur_cts[3]);
}
if (tot_obs) {
- wptr = double_g_writewx4x(wptr, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
- wptr = uint32_writew6x(wptr, a1_obs, ' ');
- wptr = uint32_writew8(wptr, tot_obs);
- wptr = memcpya(wptr, " \n", 2);
+ pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
+ pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
+ pzwritep = uint32_writew8(pzwritep, tot_obs);
+ *pzwritep++ = ' ';
} else {
- wptr = memcpya(wptr, " 0 0 0 \n", 26);
+ pzwritep = memcpya(pzwritep, " 0 0 0 ", 25);
}
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto write_stratified_freqs_ret_WRITE_FAIL;
}
}
@@ -2602,7 +2596,7 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
}
} while (marker_uidx < chrom_end);
}
- if (fclose_null(&outfile)) {
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
goto write_stratified_freqs_ret_WRITE_FAIL;
}
LOGPRINTFWW("--freq: Cluster-stratified allele frequencies (%s) written to %s .\n", nonfounders? "all samples" : "founders only", outname);
@@ -2621,18 +2615,22 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
break;
}
wkspace_reset(wkspace_mark);
- fclose_cond(outfile);
+ flex_pzwrite_close_cond(&ps, pzwritep);
return retval;
}
-int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* marker_reverse) {
- FILE* outfile = NULL;
+int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
+ unsigned char* wkspace_mark = wkspace_base;
+ char* pzwritep = NULL;
uint32_t reverse = 0;
uint32_t freq_counts = (misc_flags / MISC_FREQ_COUNTS) & 1;
uint32_t freqx = (misc_flags / MISC_FREQX) & 1;
+ uint32_t output_gz = (misc_flags / MISC_FREQ_GZ) & 1;
uint32_t maf_succ = (misc_flags / MISC_MAF_SUCC) & 1;
int32_t chrom_code_end = chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct;
int32_t retval = 0;
+ Pigz_state ps;
+ unsigned char* overflow_buf;
char* minor_ptr;
char* major_ptr;
char* bufptr;
@@ -2644,32 +2642,33 @@ int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_m
uint32_t missing_ct;
int32_t chrom_idx;
uint32_t uii;
- if (fopen_checked(&outfile, outname, "w")) {
- goto write_freqs_ret_OPEN_FAIL;
+ pzwrite_init_null(&ps);
+ if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN)) {
+ goto write_freqs_ret_NOMEM;
}
+
+ bufptr = memcpya(outname_end, ".frq", 4);
if (freqx) {
- if (fputs_checked("CHR\tSNP\tA1\tA2\tC(HOM A1)\tC(HET)\tC(HOM A2)\tC(HAP A1)\tC(HAP A2)\tC(MISSING)\n", outfile)) {
- goto write_freqs_ret_WRITE_FAIL;
- }
- } else if (plink_maxsnp < 5) {
- if (freq_counts) {
- if (fputs_checked(" CHR SNP A1 A2 C1 C2 G0\n", outfile)) {
- goto write_freqs_ret_WRITE_FAIL;
- }
- } else {
- if (fputs_checked(" CHR SNP A1 A2 MAF NCHROBS\n", outfile)) {
- goto write_freqs_ret_WRITE_FAIL;
- }
- }
+ *bufptr++ = 'x';
} else if (freq_counts) {
- sprintf(tbuf, " CHR %%%us A1 A2 C1 C2 G0\n", plink_maxsnp);
- fprintf(outfile, tbuf, "SNP");
+ bufptr = memcpya(bufptr, ".counts", 7);
+ }
+ if (!output_gz) {
+ *bufptr = '\0';
} else {
- sprintf(tbuf, " CHR %%%us A1 A2 MAF NCHROBS\n", plink_maxsnp);
- fprintf(outfile, tbuf, "SNP");
+ memcpy(bufptr, ".gz", 4);
}
- if (ferror(outfile)) {
- goto write_freqs_ret_WRITE_FAIL;
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+ goto write_freqs_ret_OPEN_FAIL;
+ }
+ pzwritep = (char*)overflow_buf;
+ if (freqx) {
+ pzwritep = strcpya(pzwritep, "CHR\tSNP\tA1\tA2\tC(HOM A1)\tC(HET)\tC(HOM A2)\tC(HAP A1)\tC(HAP A2)\tC(MISSING)" EOLN_STR);
+ } else if (plink_maxsnp < 5) {
+ pzwritep = strcpya(pzwritep, freq_counts? (" CHR SNP A1 A2 C1 C2 G0" EOLN_STR) : (" CHR SNP A1 A2 MAF NCHROBS" EOLN_STR));
+ } else {
+ sprintf(tbuf, freq_counts? (" CHR %%%us A1 A2 C1 C2 G0" EOLN_STR) : (" CHR %%%us A1 A2 MAF NCHROBS" EOLN_STR), plink_maxsnp);
+ pzwritep += sprintf(pzwritep, tbuf, "SNP");
}
for (chrom_idx = 0; chrom_idx < chrom_code_end; chrom_idx++) {
if (!chrom_exists(chrom_info_ptr, chrom_idx)) {
@@ -2697,67 +2696,63 @@ int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_m
missing_ct = sample_f_ct - (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]);
}
if (freqx) {
- bufptr = chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx));
- *bufptr++ = '\t';
- bufptr = strcpyax(bufptr, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
- fwrite(tbuf, 1, bufptr - tbuf, outfile);
- fputs(minor_ptr, outfile);
- putc('\t', outfile);
- fputs(major_ptr, outfile);
- tbuf[0] = '\t';
- bufptr = uint32_writex(&(tbuf[1]), reverse? hh_cts[marker_uidx] : ll_cts[marker_uidx], '\t');
- bufptr = uint32_writex(bufptr, lh_cts[marker_uidx], '\t');
- bufptr = uint32_writex(bufptr, reverse? ll_cts[marker_uidx] : hh_cts[marker_uidx], '\t');
- bufptr = uint32_writex(bufptr, reverse? haph_cts[marker_uidx] : hapl_cts[marker_uidx], '\t');
- bufptr = uint32_writex(bufptr, reverse? hapl_cts[marker_uidx] : haph_cts[marker_uidx], '\t');
- bufptr = uint32_writex(bufptr, missing_ct, '\n');
- fwrite(tbuf, 1, bufptr - tbuf, outfile);
+ pzwritep = chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx));
+ *pzwritep++ = '\t';
+ pzwritep = strcpyax(pzwritep, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
+ pzwritep = strcpyax(pzwritep, minor_ptr, '\t');
+ pzwritep = strcpyax(pzwritep, major_ptr, '\t');
+ pzwritep = uint32_writex(pzwritep, reverse? hh_cts[marker_uidx] : ll_cts[marker_uidx], '\t');
+ pzwritep = uint32_writex(pzwritep, lh_cts[marker_uidx], '\t');
+ pzwritep = uint32_writex(pzwritep, reverse? ll_cts[marker_uidx] : hh_cts[marker_uidx], '\t');
+ pzwritep = uint32_writex(pzwritep, reverse? haph_cts[marker_uidx] : hapl_cts[marker_uidx], '\t');
+ pzwritep = uint32_writex(pzwritep, reverse? hapl_cts[marker_uidx] : haph_cts[marker_uidx], '\t');
+ pzwritep = uint32_write(pzwritep, missing_ct);
} else {
- bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
- *bufptr++ = ' ';
- bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
- *bufptr++ = ' ';
- fwrite(tbuf, 1, bufptr - tbuf, outfile);
- fputs_w4(minor_ptr, outfile);
- putc(' ', outfile);
- fputs_w4(major_ptr, outfile);
- tbuf[0] = ' ';
- bufptr = uint32_writew6x(&(tbuf[1]), 2 * ll_cts[marker_uidx] + lh_cts[marker_uidx] + hapl_cts[marker_uidx], ' ');
- bufptr = uint32_writew6x(bufptr, 2 * hh_cts[marker_uidx] + lh_cts[marker_uidx] + haph_cts[marker_uidx], ' ');
- bufptr = uint32_writew6x(bufptr, missing_ct, '\n');
- fwrite(tbuf, 1, bufptr - tbuf, outfile);
+ pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(4, minor_ptr, pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(4, major_ptr, pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = uint32_writew6x(pzwritep, 2 * ll_cts[marker_uidx] + lh_cts[marker_uidx] + hapl_cts[marker_uidx], ' ');
+ pzwritep = uint32_writew6x(pzwritep, 2 * hh_cts[marker_uidx] + lh_cts[marker_uidx] + haph_cts[marker_uidx], ' ');
+ pzwritep = uint32_writew6(pzwritep, missing_ct);
}
} else {
- bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
- *bufptr++ = ' ';
- bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
- *bufptr++ = ' ';
- fwrite(tbuf, 1, bufptr - tbuf, outfile);
- fputs_w4(minor_ptr, outfile);
- putc(' ', outfile);
- fputs_w4(major_ptr, outfile);
- tbuf[0] = ' ';
+ pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(4, minor_ptr, pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(4, major_ptr, pzwritep);
+ *pzwritep++ = ' ';
uii = 2 * (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]) + hapl_cts[marker_uidx] + haph_cts[marker_uidx];
if (maf_succ || uii || (set_allele_freqs[marker_uidx] != 0.5)) {
- bufptr = double_g_writewx4(&(tbuf[1]), 1.0 - set_allele_freqs[marker_uidx], 12);
+ pzwritep = double_g_writewx4(pzwritep, 1.0 - set_allele_freqs[marker_uidx], 12);
} else {
- bufptr = memcpya(&(tbuf[1]), " NA", 12);
+ pzwritep = memcpya(pzwritep, " NA", 12);
}
- *bufptr++ = ' ';
- bufptr = uint32_writew8x(bufptr, uii, '\n');
- fwrite(tbuf, 1, bufptr - tbuf, outfile);
+ *pzwritep++ = ' ';
+ pzwritep = uint32_writew8(pzwritep, uii);
}
- if (ferror(outfile)) {
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto write_freqs_ret_WRITE_FAIL;
}
marker_uidx = next_unset(marker_exclude, marker_uidx + 1, chrom_end);
}
}
- if (fclose_null(&outfile)) {
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
goto write_freqs_ret_WRITE_FAIL;
}
LOGPRINTFWW("--freq%s: Allele frequencies (%s) written to %s .\n", freqx? "x" : "", nonfounders? "all samples" : "founders only", outname);
while (0) {
+ write_freqs_ret_NOMEM:
+ retval = RET_NOMEM;
+ break;
write_freqs_ret_OPEN_FAIL:
retval = RET_OPEN_FAIL;
break;
@@ -2765,26 +2760,11 @@ int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_m
retval = RET_WRITE_FAIL;
break;
}
- fclose_cond(outfile);
+ flex_pzwrite_close_cond(&ps, pzwritep);
+ wkspace_reset(wkspace_mark);
return retval;
}
-void calc_marker_weights(double exponent, uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t marker_ct, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, double* marker_weights) {
- uint32_t marker_uidx = 0;
- uint32_t markers_done = 0;
- uint32_t marker_uidx_stop;
- do {
- marker_uidx = next_unset_unsafe(marker_exclude, marker_uidx);
- marker_uidx_stop = next_set(marker_exclude, marker_uidx, unfiltered_marker_ct);
- markers_done += marker_uidx_stop - marker_uidx;
- do {
- if (marker_weights[marker_uidx] < 0.0) {
- marker_weights[marker_uidx] = calc_wt_mean(exponent, lh_cts[marker_uidx], ll_cts[marker_uidx], hh_cts[marker_uidx]);
- }
- } while (++marker_uidx < marker_uidx_stop);
- } while (markers_done < marker_ct);
-}
-
int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* sex_nm, uintptr_t* sex_male, uint64_t misc_flags, double check_sex_fthresh, double check_sex_mthresh, uint32_t max_f_yobs, uint32_t min_m_yobs, Chrom_info* chrom [...]
unsigned char* wkspace_mark = wkspace_base;
FILE* outfile = NULL;
@@ -3497,12 +3477,12 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
return retval;
}
-int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs) {
+int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs) {
// Same F coefficient computation as sexcheck().
unsigned char* wkspace_mark = wkspace_base;
- FILE* outfile = NULL;
uintptr_t* loadbuf_f = NULL;
uintptr_t* founder_vec11 = NULL;
+ char* pzwritep = NULL;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
@@ -3515,15 +3495,16 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
uint32_t chrom_fo_idx = 0xffffffffU; // deliberate overflow
uint32_t chrom_end = 0;
int32_t retval = 0;
+ Pigz_state ps;
uintptr_t* loadbuf_raw;
uintptr_t* loadbuf;
uintptr_t* lptr;
uint32_t* het_cts;
uint32_t* missing_cts;
double* nei_offsets;
+ unsigned char* overflow_buf;
char* fid_ptr;
char* iid_ptr;
- char* wptr;
double dpp;
double dtot;
double cur_nei;
@@ -3539,11 +3520,13 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
uintptr_t cur_word;
uintptr_t ulii;
uint32_t obs_ct;
+ pzwrite_init_null(&ps);
if (is_set(chrom_info_ptr->haploid_mask, 0)) {
logprint("Error: --het cannot be used on haploid genomes.\n");
goto het_report_ret_INVALID_CMDLINE;
}
- if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+ if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + MAXLINELEN) ||
+ wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&loadbuf, sample_ctl2 * sizeof(intptr_t)) ||
wkspace_alloc_ui_checked(&het_cts, sample_ct * sizeof(int32_t)) ||
wkspace_alloc_ui_checked(&missing_cts, sample_ct * sizeof(int32_t)) ||
@@ -3659,39 +3642,41 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
if (!marker_ct) {
goto het_report_ret_INVALID_CMDLINE;
}
- memcpy(outname_end, ".het", 5);
- if (fopen_checked(&outfile, outname, "w")) {
+ memcpy(outname_end, output_gz? ".het.gz" : ".het", output_gz? 8 : 5);
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
goto het_report_ret_OPEN_FAIL;
}
+ pzwritep = (char*)overflow_buf;
sprintf(tbuf, "%%%us %%%us O(HOM) E(HOM) N(NM) F\n", plink_maxfid, plink_maxiid);
- fprintf(outfile, tbuf, "FID", "IID");
+ pzwritep += sprintf(pzwritep, tbuf, "FID", "IID");
sample_uidx = 0;
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, sample_uidx++) {
next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
fid_ptr = &(sample_ids[sample_uidx * max_sample_id_len]);
iid_ptr = (char*)memchr(fid_ptr, '\t', max_sample_id_len);
- wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(iid_ptr - fid_ptr), fid_ptr, tbuf);
- *wptr++ = ' ';
- wptr = fw_strcpy(plink_maxiid, &(iid_ptr[1]), wptr);
- wptr = memseta(wptr, 32, 3);
+ pzwritep = fw_strcpyn(plink_maxfid, (uintptr_t)(iid_ptr - fid_ptr), fid_ptr, pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(plink_maxiid, &(iid_ptr[1]), pzwritep);
+ pzwritep = memseta(pzwritep, 32, 3);
obs_ct = marker_ct - missing_cts[sample_idx];
if (obs_ct) {
- wptr = uint32_writew10x(wptr, obs_ct - het_cts[sample_idx], ' ');
+ pzwritep = uint32_writew10x(pzwritep, obs_ct - het_cts[sample_idx], ' ');
dee = nei_sum - nei_offsets[sample_idx];
- wptr = double_g_writewx4(wptr, dee, 12);
- wptr = memseta(wptr, 32, 3);
- wptr = uint32_writew10x(wptr, obs_ct, ' ');
+ pzwritep = double_g_writewx4(pzwritep, dee, 12);
+ pzwritep = memseta(pzwritep, 32, 3);
+ pzwritep = uint32_writew10x(pzwritep, obs_ct, ' ');
dtot = (double)((int32_t)obs_ct) - dee;
dff = (dtot - ((double)((int32_t)(het_cts[sample_idx])))) / dtot;
- wptr = double_g_writewx4x(wptr, dff, 12, '\n');
+ pzwritep = double_g_writewx4(pzwritep, dff, 12);
} else {
- wptr = memcpya(wptr, " 0 0 0 nan\n", 50);
+ pzwritep = memcpya(pzwritep, " 0 0 0 nan", 49);
}
- if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto het_report_ret_WRITE_FAIL;
}
}
- if (fclose_null(&outfile)) {
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
goto het_report_ret_WRITE_FAIL;
}
LOGPRINTFWW("--het%s: %" PRIuPTR " variant%s scanned, report written to %s .\n", loadbuf_f? " small-sample" : "", marker_ct, (marker_ct == 1)? "" : "s", outname);
@@ -3714,7 +3699,7 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
break;
}
wkspace_reset(wkspace_mark);
- fclose_cond(outfile);
+ flex_pzwrite_close_cond(&ps, pzwritep);
return retval;
}
@@ -3852,7 +3837,7 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
loop_end = marker_ct / 100;
for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
if (IS_SET(marker_exclude, marker_uidx)) {
- marker_uidx = next_set_ul_unsafe(marker_exclude, marker_uidx);
+ marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
seek_flag = 1;
}
if (marker_uidx >= chrom_end) {
@@ -4842,11 +4827,14 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
}
}
#ifdef __cplusplus
- std::sort(parse_table, &(parse_table[token_ct]));
+ // suppress bogus gcc 4.4 warning, this is not performance-critical
+ qsort((int32_t*)parse_table, token_ct, sizeof(int32_t), intcmp);
+ // std::sort(parse_table, &(parse_table[token_ct]));
#else
qsort((int32_t*)parse_table, token_ct, sizeof(int32_t), intcmp);
#endif
- if (!weighted_z) {
+ // bugfix: this caused a segfault in no-map case
+ if ((!weighted_z) && (token_ct > 5)) {
token_ct -= 2;
}
col_skips[0] = parse_table[0] >> 4;
@@ -5541,7 +5529,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
if (!no_allele) {
fputs(" A1 A2", outfile);
}
- fputs(" N P P(R) OR OR(R) Q I", outfile);
+ fputs(output_beta? " N P P(R) BETA BETA(R) Q I" : " N P P(R) OR OR(R) Q I", outfile);
if (weighted_z) {
fputs(" WEIGHTED_Z P(WZ)", outfile);
}
diff --git a/plink_misc.h b/plink_misc.h
index b33926c..5bbdfd0 100644
--- a/plink_misc.h
+++ b/plink_misc.h
@@ -63,15 +63,13 @@ void calc_plink_maxfid(uint32_t unfiltered_sample_ct, uintptr_t* sample_exclude,
uint32_t calc_plink_maxsnp(uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len);
-int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ, double exponent, uint32_t wt_needed, double* marker_weights);
+int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ);
int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char** marker_allele_ptrs, uintptr_t* max_marker_allele_len_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uint32_t is_a2);
-int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t sample_f_male_ct, uintptr_t* marker_reve [...]
+int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t s [...]
-int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* marker_reverse);
-
-void calc_marker_weights(double exponent, uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t marker_ct, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, double* marker_weights);
+int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* sex_nm, uintptr_t* sex_male, uint64_t misc_flags, double check_sex_fthresh, double check_sex_mthresh, uint32_t max_f_yobs, uint32_t min_m_yobs, Chrom_info* chrom [...]
@@ -81,7 +79,7 @@ int32_t write_var_ranges(char* outname, char* outname_end, uintptr_t unfiltered_
int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_modifier, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs);
-int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs);
+int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs);
int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts);
diff --git a/plink_set.c b/plink_set.c
index e8aea8d..25fc84b 100644
--- a/plink_set.c
+++ b/plink_set.c
@@ -1929,6 +1929,7 @@ void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct,
unpack_set_unfiltered_late_start:
range_end = *uiptr++;
if (range_end == marker_ct) {
+ last_uidx = unfiltered_marker_ct;
break;
}
last_uidx = jump_forward_unset_unsafe(marker_exclude, marker_uidx + 1, range_end - range_start);
diff --git a/yarn.h b/yarn.h
index 436a675..63acf76 100644
--- a/yarn.h
+++ b/yarn.h
@@ -109,6 +109,9 @@
handler will exit (set to NULL by default for no action)
*/
+#ifndef __YARN_H__
+#define __YARN_H__
+
extern const char *yarn_prefix;
extern void (*yarn_abort)(int);
@@ -132,3 +135,5 @@ enum wait_op {
void wait_for(lock *, enum wait_op, long);
long peek_lock(lock *);
void free_lock(lock *);
+
+#endif // __YARN_H__
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git
More information about the debian-med-commit mailing list