[med-svn] [Git][med-team/plink2][master] 3 commits: New upstream version 2.00~a3-210701+dfsg
Dylan Aïssi (@daissi)
gitlab at salsa.debian.org
Fri Jul 23 20:53:57 BST 2021
Dylan Aïssi pushed to branch master at Debian Med / plink2
Commits:
dd37e023 by Dylan Aïssi at 2021-07-23T21:49:18+02:00
New upstream version 2.00~a3-210701+dfsg
- - - - -
52ff17d8 by Dylan Aïssi at 2021-07-23T21:49:21+02:00
Update upstream source from tag 'upstream/2.00_a3-210701+dfsg'
Update to upstream version '2.00~a3-210701+dfsg'
with Debian dir 1a532024f15d7baeaed7a5c3387acd594edc4880
- - - - -
d2773ed4 by Dylan Aïssi at 2021-07-23T21:50:27+02:00
Bump d/changelog
- - - - -
27 changed files:
- debian/changelog
- include/pgenlib_misc.cc
- include/pgenlib_misc.h
- include/pgenlib_read.cc
- include/pgenlib_read.h
- include/plink2_string.cc
- include/plink2_string.h
- include/plink2_text.cc
- include/plink2_text.h
- pgen_compress.cc
- plink2.cc
- plink2_cmdline.cc
- plink2_cmdline.h
- plink2_common.h
- plink2_data.cc
- plink2_data.h
- plink2_export.cc
- plink2_fasta.cc
- plink2_glm.cc
- plink2_help.cc
- plink2_import.cc
- plink2_merge.cc
- plink2_merge.h
- plink2_misc.cc
- plink2_psam.cc
- plink2_pvar.cc
- plink2_pvar.h
Changes:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,9 @@
+plink2 (2.00~a3-210701+dfsg-1) UNRELEASED; urgency=medium
+
+ * New upstream version
+
+ -- Dylan Aïssi <daissi at debian.org> Fri, 23 Jul 2021 21:49:33 +0200
+
plink2 (2.00~a3-210203+dfsg-1) unstable; urgency=medium
* New upstream version
=====================================
include/pgenlib_misc.cc
=====================================
@@ -122,7 +122,7 @@ void CopyGenomatchSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t
}
// Variant of ExpandBytearr() which is based off a target 2-bit value instead
-// of single expand_mask bits. expand_size must be the number of instance of
+// of single expand_mask bits. expand_size must be the number of instances of
// the target value in genovec.
void ExpandBytearrFromGenoarr(const void* __restrict compact_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t genoword_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target) {
const uint32_t expand_sizex_m1 = expand_size + read_start_bit - 1;
@@ -726,6 +726,23 @@ void GenoarrCountFreqsUnsafe(const uintptr_t* genoarr, uint32_t sample_ct, STD_A
genocounts[3] = bothset_ct;
}
+uintptr_t MostCommonGenoUnsafe(const uintptr_t* genoarr, uint32_t sample_ct) {
+ STD_ARRAY_DECL(uint32_t, 4, genocounts);
+ GenoarrCountFreqsUnsafe(genoarr, sample_ct, genocounts);
+ uint32_t most_common_geno_ct = genocounts[0];
+ if (most_common_geno_ct * 2 >= sample_ct) {
+ return 0;
+ }
+ uintptr_t most_common_geno = 0;
+ for (uintptr_t cur_geno = 1; cur_geno != 4; ++cur_geno) {
+ if (most_common_geno_ct < genocounts[cur_geno]) {
+ most_common_geno = cur_geno;
+ most_common_geno_ct = genocounts[cur_geno];
+ }
+ }
+ return most_common_geno;
+}
+
// geno_vvec now allowed to be unaligned.
void CountSubset3FreqVec6(const VecW* __restrict geno_vvec, const VecW* __restrict interleaved_mask_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp) {
assert(!(vec_ct % 6));
=====================================
include/pgenlib_misc.h
=====================================
@@ -232,6 +232,10 @@ void Count3FreqVec6(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict
// vector-alignment preferred.
void GenoarrCountFreqsUnsafe(const uintptr_t* genoarr, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
+// GenoarrCountsFreqsUnsafe() wrapper that returns most common genotype,
+// breaking ties in favor of the lower value.
+uintptr_t MostCommonGenoUnsafe(const uintptr_t* genoarr, uint32_t sample_ct);
+
// geno_vvec now allowed to be unaligned.
void CountSubset3FreqVec6(const VecW* __restrict geno_vvec, const VecW* __restrict interleaved_mask_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp);
=====================================
include/pgenlib_read.cc
=====================================
@@ -2696,8 +2696,7 @@ void CopyAndSubsetDifflist(const uintptr_t* __restrict sample_include, const uin
// Populates pgrp->ldbase_genovec or
// pgrp->ldbase_{raregeno,difflist_sample_ids,difflist_len}, depending on
// storage type.
-// Currently just called by ReadDifflistOrGenovecSubsetUnsafe(), which isn't
-// exploited by plink2 yet.
+// Currently just called by ReadDifflistOrGenovecSubsetUnsafe().
PglErr LdLoadMinimalSubsetIfNecessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp) {
if (!LdLoadNecessary(vidx, pgrp)) {
return kPglRetSuccess;
=====================================
include/pgenlib_read.h
=====================================
@@ -277,6 +277,14 @@ HEADER_INLINE unsigned char* PgrGetVrtypes(PgenReader* pgr_ptr) {
return pgrp->fi.vrtypes;
}
+HEADER_INLINE uint32_t PgrGetVrtype(const PgenReader* pgr_ptr, uint32_t vidx) {
+ const PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
+ if (pgrp->fi.vrtypes) {
+ return pgrp->fi.vrtypes[vidx];
+ }
+ return pgrp->fi.const_vrtype;
+}
+
HEADER_INLINE uintptr_t* PgrGetNonrefFlags(PgenReader* pgr_ptr) {
PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
return pgrp->fi.nonref_flags;
=====================================
include/plink2_string.cc
=====================================
@@ -219,52 +219,41 @@ void WordWrap(uint32_t suffix_len, char* strbuf) {
}
-static const uint32_t kPow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};
+// This implementation is from Kendall Willets. See
+// https://lemire.me/blog/2021/06/03/computing-the-number-of-digits-of-an-integer-even-faster/
#ifdef USE_AVX2
-static const unsigned char kLzUintSlenBase[] =
-{9, 9, 9, 8,
- 8, 8, 7, 7,
- 7, 6, 6, 6,
- 6, 5, 5, 5,
- 4, 4, 4, 3,
- 3, 3, 3, 2,
- 2, 2, 1, 1,
- 1, 0, 0, 0,
- 1}; // UintSlen(0) needs to be 1, not zero
+static const uint64_t kLzcntUintSlenTable[] =
+ {42949672960LL, 42949672960LL, 41949672960LL, 41949672960LL, 41949672960LL,
+ 38554705664LL, 38554705664LL, 38554705664LL, 34349738368LL, 34349738368LL,
+ 34349738368LL, 34349738368LL, 30063771072LL, 30063771072LL, 30063771072LL,
+ 25769703776LL, 25769703776LL, 25769703776LL, 21474826480LL, 21474826480LL,
+ 21474826480LL, 21474826480LL, 17179868184LL, 17179868184LL, 17179868184LL,
+ 12884901788LL, 12884901788LL, 12884901788LL, 8589934582LL, 8589934582LL,
+ 8589934582LL, 4294967296LL};
uint32_t UintSlen(uint32_t num) {
const uint32_t lz_ct = _lzcnt_u32(num);
- const uint32_t slen_base = kLzUintSlenBase[lz_ct];
- return slen_base + (num >= kPow10[slen_base]);
+ return (num + kLzcntUintSlenTable[lz_ct]) >> 32;
}
#else
-// could also use something like ((32 - lz_ct) * 77) >> 8, since 77/256 is a
-// sufficiently good approximation of ln(2)/ln(10), but that's a bit slower and
-// this table doesn't take much space
-//
-// bugfix (29 Mar 2018): this table was totally wrong, ugh
-static const unsigned char kBsrUintSlenBase[] =
-{1, 1, 1, 1,
- 2, 2, 2, 3,
- 3, 3, 4, 4,
- 4, 4, 5, 5,
- 5, 6, 6, 6,
- 7, 7, 7, 7,
- 8, 8, 8, 9,
- 9, 9, 9, 9};
+static const uint64_t kBsrUintSlenTable[] =
+ {4294967296LL, 8589934582LL, 8589934582LL, 8589934582LL, 12884901788LL,
+ 12884901788LL, 12884901788LL, 17179868184LL, 17179868184LL, 17179868184LL,
+ 21474826480LL, 21474826480LL, 21474826480LL, 21474826480LL, 25769703776LL,
+ 25769703776LL, 25769703776LL, 30063771072LL, 30063771072LL, 30063771072LL,
+ 34349738368LL, 34349738368LL, 34349738368LL, 34349738368LL, 38554705664LL,
+ 38554705664LL, 38554705664LL, 41949672960LL, 41949672960LL, 41949672960LL,
+ 42949672960LL, 42949672960LL};
uint32_t UintSlen(uint32_t num) {
// tried divide-by-10 and divide-by-100 loops, they were slower
// also tried a hardcoded binary tree, it was better but still slower
// bsru32(0) is undefined
- if (num < 10) {
- return 1;
- }
+ num |= 1;
const uint32_t top_bit_pos = bsru32(num);
- const uint32_t slen_base = kBsrUintSlenBase[top_bit_pos];
- return slen_base + (num >= kPow10[slen_base]);
+ return (num + kBsrUintSlenTable[top_bit_pos]) >> 32;
}
#endif
=====================================
include/plink2_string.h
=====================================
@@ -596,6 +596,11 @@ HEADER_INLINE void StrptrArrNsort(uintptr_t ct, const char** strptr_arr) {
std::sort(R_CAST(StrNsortDeref*, strptr_arr), &(R_CAST(StrNsortDeref*, strptr_arr)[ct]));
}
+// Considered adding stable-sort variants of StrptrArrSort, but looks like it's
+// usually better to define a more sophisticated comparator that removes the
+// need for the explicit stable-sort. (Note that std::stable_sort can throw
+// std::bad_alloc, etc.)
+
// need to expose these for plink2_cmdline bigstack-allocating
// SortStrboxIndexed()'s use
void SortStrbox32bFinish(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, Strbuf28Ui* filled_wkspace, char* sorted_strbox, uint32_t* id_map);
=====================================
include/plink2_text.cc
=====================================
@@ -98,6 +98,7 @@ BoolErr GzRawInit(const void* buf, uint32_t nbytes, GzRawDecompressStream* gzp)
return 1;
}
gzp->ds_initialized = 1;
+ gzp->eof = 0;
return 0;
}
@@ -316,45 +317,68 @@ BoolErr IsPathologicallyLongLineOrToken(const char* line_start, const char* load
const char kShortErrRfileTruncatedGz[] = "GzRawStreamRead: gzipped file appears to be truncated";
PglErr GzRawStreamRead(char* dst_end, FILE* ff, GzRawDecompressStream* gzp, char** dst_iterp, const char** errmsgp) {
- z_stream* dsp = &gzp->ds;
- if ((!dsp->avail_in) && feof_unlocked(ff)) {
+ if (gzp->eof) {
+ // (!dsp->avail_in) && feof_unlocked(ff)) {
return kPglRetSuccess;
}
+ z_stream* dsp = &gzp->ds;
char* dst_iter = *dst_iterp;
do {
- int zerr = Z_OK;
if (dsp->avail_in) { // can be zero after TextRewind()
- dsp->next_out = R_CAST(unsigned char*, dst_iter);
- dsp->avail_out = dst_end - dst_iter;
- zerr = inflate(dsp, Z_SYNC_FLUSH);
- if (unlikely((zerr < 0) || (zerr == Z_NEED_DICT))) {
- if (dsp->msg) {
- *errmsgp = dsp->msg;
- } else {
- *errmsgp = zError(zerr);
+ while (1) {
+ dsp->next_out = R_CAST(unsigned char*, dst_iter);
+ dsp->avail_out = dst_end - dst_iter;
+ int zerr = inflate(dsp, Z_SYNC_FLUSH);
+ if (unlikely((zerr < 0) || (zerr == Z_NEED_DICT))) {
+ if (dsp->msg) {
+ *errmsgp = dsp->msg;
+ } else {
+ *errmsgp = zError(zerr);
+ }
+ return kPglRetDecompressFail;
+ }
+ dst_iter = R_CAST(char*, dsp->next_out);
+ if (zerr != Z_STREAM_END) {
+ break;
+ }
+ // bugfix (25 May 2021): may need to open a new stream, or skip
+ // trailing garbage
+ const uint32_t trailing_byte_ct = dsp->avail_in;
+ if (trailing_byte_ct < 2) {
+ if (trailing_byte_ct) {
+ gzp->in[0] = dsp->next_in[0];
+ }
+ const uint32_t nbytes = fread_unlocked(&(gzp->in[trailing_byte_ct]), 1, kDecompressChunkSize - trailing_byte_ct, ff);
+ dsp->next_in = gzp->in;
+ dsp->avail_in = trailing_byte_ct + nbytes;
}
- return kPglRetDecompressFail;
+ if ((dsp->avail_in < 2) || (dsp->next_in[0] != 31) || (dsp->next_in[1] != 139)) {
+ // EOF or trailing garbage
+ gzp->eof = 1;
+ *dst_iterp = dst_iter;
+ return kPglRetSuccess;
+ }
+#ifdef NDEBUG
+ inflateReset(dsp);
+#else
+ const int errcode = inflateReset(dsp);
+ assert(errcode == Z_OK);
+#endif
}
- dst_iter = R_CAST(char*, dsp->next_out);
if (dsp->avail_in) {
- assert(dst_iter == dst_end);
break;
}
}
const uint32_t nbytes = fread_unlocked(gzp->in, 1, kDecompressChunkSize, ff);
dsp->next_in = gzp->in;
dsp->avail_in = nbytes;
- if (!nbytes) {
+ if (unlikely(!nbytes)) {
if (unlikely(!feof_unlocked(ff))) {
*errmsgp = strerror(errno);
return kPglRetReadFail;
}
- if (unlikely(zerr == Z_OK)) {
- *errmsgp = kShortErrRfileTruncatedGz;
- return kPglRetDecompressFail;
- }
- // Normal EOF.
- break;
+ *errmsgp = kShortErrRfileTruncatedGz;
+ return kPglRetDecompressFail;
}
} while (dst_iter != dst_end);
*dst_iterp = dst_iter;
@@ -391,12 +415,12 @@ PglErr ZstRawStreamRead(char* dst_end, FILE* ff, ZstRawDecompressStream* zstp, c
return kPglRetReadFail;
}
zstp->ib.pos = 0;
- zstp->ib.size = nbytes + n_inbytes;
- if (!nbytes) {
- if (unlikely(n_inbytes)) {
- *errmsgp = kShortErrZstdPrefixUnknown;
- return kPglRetDecompressFail;
- }
+ const uint32_t new_size = nbytes + n_inbytes;
+ zstp->ib.size = new_size;
+ // bugfix (28 Feb 2021): Possible for nbytes to be zero at the same time
+ // n_inbytes is positive when e.g. the input .zst file is the concatenation
+ // of multiple ordinary .zst files, which is permitted by the spec.
+ if (!new_size) {
break;
}
}
@@ -678,6 +702,7 @@ void TextFileRewind(textFILE* txf_ptr) {
if (basep->file_type != kFileUncompressed) {
if (basep->file_type == kFileGzip) {
txfp->rds.gz.ds.avail_in = 0;
+ txfp->rds.gz.eof = 0;
#ifdef NDEBUG
inflateReset(&txfp->rds.gz.ds);
#else
@@ -1171,6 +1196,7 @@ THREAD_FUNC_DECL TextStreamThread(void* raw_arg) {
if (file_type != kFileUncompressed) {
if (file_type == kFileGzip) {
rdsp->gz.ds.avail_in = 0;
+ rdsp->gz.eof = 0;
#ifdef NDEBUG
inflateReset(&rdsp->gz.ds);
#else
@@ -1257,6 +1283,7 @@ THREAD_FUNC_DECL TextStreamThread(void* raw_arg) {
case kFileGzip:
{
GzRawDecompressStream* gzp = &rdsp->gz;
+ gzp->eof = 0;
z_stream* dsp = &gzp->ds;
#ifdef NDEBUG
inflateReset(dsp);
=====================================
include/plink2_text.h
=====================================
@@ -150,6 +150,7 @@ typedef struct GzRawDecompressStreamStruct {
unsigned char* in;
z_stream ds;
uint32_t ds_initialized;
+ uint32_t eof;
} GzRawDecompressStream;
// BgzfRawDecompressStream declared in plink2_bgzf.h.
=====================================
pgen_compress.cc
=====================================
@@ -30,9 +30,9 @@ int32_t main(int32_t argc, char** argv) {
if ((argc < 3) || (argc > 5)) {
fputs(
"Usage:\n"
-"pgen_compress [input .bed or .pgen] [output filename] {sample_ct}\n"
+"pgen_compress <input .bed or .pgen> <output filename> [sample_ct]\n"
" (sample_ct is required when loading a .bed file)\n"
-"pgen_compress -u [input .pgen] [output .bed]\n"
+"pgen_compress -u <input .pgen> <output .bed>\n"
, stdout);
goto main_ret_INVALID_CMDLINE;
}
=====================================
plink2.cc
=====================================
@@ -71,7 +71,7 @@ static const char ver_str[] = "PLINK v2.00a3"
#ifdef USE_MKL
" Intel"
#endif
- " (3 Feb 2021)";
+ " (1 Jul 2021)";
static const char ver_str2[] =
// include leading space if day < 10, so character length stays the same
" "
@@ -999,9 +999,17 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
goto Plink2Core_ret_NOMEM;
}
const uint32_t nonref_flags_already_loaded = (nonref_flags != nullptr);
- if ((!nonref_flags) && ((header_ctrl & 192) == 192)) {
- if (unlikely(bigstack_alloc_w(raw_variant_ctl, &nonref_flags))) {
- goto Plink2Core_ret_NOMEM;
+ if (!nonref_flags_already_loaded) {
+ const uint32_t nonref_flags_status_shifted = header_ctrl & 192;
+ if (nonref_flags_status_shifted == 192) {
+ if (unlikely(bigstack_alloc_w(raw_variant_ctl, &nonref_flags))) {
+ goto Plink2Core_ret_NOMEM;
+ }
+ } else if ((pgfi.const_vrtype != kPglVrtypePlink1) && (!nonref_flags_status_shifted)) {
+ // This should never happen with a chain of plink2-generated .pgen
+ // files, but it's permitted by the specification and makes future
+ // merge unsafe.
+ logerrputs("Warning: .pgen file indicates that provisional-REF information is stored in the\ncompanion .pvar, but that .pvar lacks either an INFO/PR header or an INFO\ncolumn. Assuming all REF alleles are correct.\n");
}
}
pgfi.nonref_flags = nonref_flags;
@@ -2884,8 +2892,6 @@ uint32_t CmdlineSingleChr(const ChrInfo* cip, MiscFlags misc_flags) {
}
void GetExportfTargets(const char* const* argvk, uint32_t param_ct, ExportfFlags* exportf_flags_ptr, IdpasteFlags* exportf_id_paste_ptr, uint64_t* format_param_idxs_ptr) {
- // does not error out if no format present, since this is needed for --recode
- // translation
// supports multiple formats
uint64_t format_param_idxs = 0;
for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
@@ -2984,7 +2990,9 @@ void GetExportfTargets(const char* const* argvk, uint32_t param_ct, ExportfFlags
}
case 'o':
if (!strcmp(cur_modif2, "xford")) {
- cur_format = kfExportfOxGen;
+ cur_format = kfExportfOxGenV1;
+ } else if (!strcmp(cur_modif2, "xford-v2")) {
+ cur_format = kfExportfOxGenV2;
}
break;
case 'p':
@@ -3014,7 +3022,7 @@ void GetExportfTargets(const char* const* argvk, uint32_t param_ct, ExportfFlags
} else if (!strcmp(&(cur_modif2[2]), "-4.2")) {
cur_format = kfExportfVcf42;
} else if ((!strcmp(&(cur_modif2[2]), "-fid")) || (!strcmp(&(cur_modif2[2]), "-iid"))) {
- snprintf(g_logbuf, kLogbufSize, "Note: --export 'v%s' modifier is deprecated. Use 'vcf' + 'id-paste=%s'.\n", cur_modif2, &(cur_modif2[3]));
+ logprintf("Note: --export 'v%s' modifier is deprecated. Use 'vcf' + 'id-paste=%s'.\n", cur_modif2, &(cur_modif2[3]));
cur_format = kfExportfVcf43;
*exportf_id_paste_ptr = (cur_modif2[3] == 'f')? kfIdpasteFid : kfIdpasteIid;
}
@@ -3443,21 +3451,9 @@ int main(int argc, char** argv) {
break;
case 'r':
if (strequal_k(flagname_p, "recode", flag_slen)) {
- // special case: translate to "export ped" if no format specified
- const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
- if (unlikely(param_ct > 4)) {
- fputs("Error: --recode accepts at most 4 arguments.\n", stderr);
- goto main_ret_INVALID_CMDLINE;
- }
- ExportfFlags dummy;
- IdpasteFlags dummy2;
- uint64_t format_param_idxs;
- GetExportfTargets(&(argvk[arg_idx]), param_ct, &dummy, &dummy2, &format_param_idxs);
- if (!format_param_idxs) {
- snprintf(flagname_write_iter, kMaxFlagBlen, "export ped");
- } else {
- snprintf(flagname_write_iter, kMaxFlagBlen, "export");
- }
+ // don't bother translating no-modifier --recode to "--export ped",
+ // just let it fail
+ snprintf(flagname_write_iter, kMaxFlagBlen, "export");
} else if (strequal_k(flagname_p, "remove-founders", flag_slen)) {
snprintf(flagname_write_iter, kMaxFlagBlen, "keep-founders");
} else if (strequal_k(flagname_p, "remove-nonfounders", flag_slen)) {
@@ -4658,21 +4654,16 @@ int main(int argc, char** argv) {
goto main_ret_1;
}
pc.filter_flags |= kfFilterPvarReq;
- } else if (strequal_k_unsafe(flagname_p2, "xport") || strequal_k_unsafe(flagname_p2, "xport ped")) {
+ } else if (strequal_k_unsafe(flagname_p2, "xport")) {
// todo: determine actual limit
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 0, 50))) {
goto main_ret_INVALID_CMDLINE_2A;
}
uint64_t format_param_idxs = 0;
- if (!flagname_p2[5]) {
- GetExportfTargets(&(argvk[arg_idx]), param_ct, &pc.exportf_info.flags, &pc.exportf_info.idpaste_flags, &format_param_idxs);
- if (unlikely(!format_param_idxs)) {
- logerrputs("Error: --export requires at least one output format. (Did you forget 'ped' or\n'vcf'?)\n");
- goto main_ret_INVALID_CMDLINE_A;
- }
- logputsb();
- } else {
- pc.exportf_info.flags = kfExportfPed;
+ GetExportfTargets(&(argvk[arg_idx]), param_ct, &pc.exportf_info.flags, &pc.exportf_info.idpaste_flags, &format_param_idxs);
+ if (unlikely(!format_param_idxs)) {
+ logerrputs("Error: --export requires at least one output format. (Did you forget 'ped' or\n'vcf'?)\n");
+ goto main_ret_INVALID_CMDLINE_A;
}
// can't have e.g. bgen-1.1 and bgen-1.2 simultaneously, since they
// have the same extension and different content.
@@ -4689,6 +4680,10 @@ int main(int argc, char** argv) {
logerrputs("Error: 'A' and 'AD' formats cannot be exported simultaneously.\n");
goto main_ret_INVALID_CMDLINE;
}
+ if (unlikely((pc.exportf_info.flags & (kfExportfOxGenV1 | kfExportfOxGenV2)) == (kfExportfOxGenV1 | kfExportfOxGenV2))) {
+ logerrputs("Error: 'oxford' and 'oxford-v2' formats cannot be exported simultaneously.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
// could use AdvBoundedTo0Bit()...
if ((format_param_idxs >> param_idx) & 1) {
@@ -5799,7 +5794,7 @@ int main(int argc, char** argv) {
logerrputs("Error: Multiple --het cols= modifiers.\n");
goto main_ret_INVALID_CMDLINE;
}
- reterr = ParseColDescriptor(&(cur_modif[5]), "maybefid\0fid\0maybesid\0sid\0hom\0het\0nobs\0f\0", "het", kfHardyColChrom, kfHardyColDefault, 1, &pc.hardy_flags);
+ reterr = ParseColDescriptor(&(cur_modif[5]), "maybefid\0fid\0maybesid\0sid\0hom\0het\0nobs\0f\0", "het", kfHetColMaybefid, kfHetColDefault, 1, &pc.het_flags);
if (unlikely(reterr)) {
goto main_ret_1;
}
@@ -7801,7 +7796,7 @@ int main(int argc, char** argv) {
const char* cur_modif = argvk[arg_idx + 1];
const uint32_t cur_modif_slen = strlen(cur_modif);
if (strequal_k(cur_modif, "erase", cur_modif_slen)) {
- if (unlikely(pmerge_info.merge_info_mode != kMergeQicModeErase)) {
+ if (unlikely(pmerge_info.merge_info_mode != kMergeInfoCmModeErase)) {
logerrputs("Error: \"--merge-xheader-mode erase\" requires \"--merge-info-mode erase\".\n");
goto main_ret_INVALID_CMDLINE_A;
}
@@ -7815,34 +7810,24 @@ int main(int argc, char** argv) {
} else if (unlikely(strequal_k_unsafe(flagname_p2, "erge-equal-pos"))) {
logerrputs("Error: --merge-equal-pos has been retired. Use e.g. --set-all-var-ids before\nmerging instead.\n");
goto main_ret_INVALID_CMDLINE_A;
- } else if (strequal_k_unsafe(flagname_p2, "erge-qual-mode") ||
- strequal_k_unsafe(flagname_p2, "erge-info-mode") ||
- strequal_k_unsafe(flagname_p2, "erge-cm-mode")) {
+ } else if (strequal_k_unsafe(flagname_p2, "erge-qual-mode")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
goto main_ret_INVALID_CMDLINE_2A;
}
const char* cur_modif = argvk[arg_idx + 1];
const uint32_t cur_modif_slen = strlen(cur_modif);
- MergeQicMode mode;
if (strequal_k(cur_modif, "erase", cur_modif_slen)) {
- mode = kMergeQicModeErase;
+ pmerge_info.merge_qual_mode = kMergeQualModeErase;
} else if (strequal_k(cur_modif, "nm-match", cur_modif_slen)) {
- mode = kMergeQicModeNmMatch;
+ pmerge_info.merge_qual_mode = kMergeQualModeNmMatch;
} else if (strequal_k(cur_modif, "nm-first", cur_modif_slen)) {
- mode = kMergeQicModeNmFirst;
- } else if (likely(strequal_k(cur_modif, "first", cur_modif_slen))) {
- mode = kMergeQicModeFirst;
- } else {
- snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s argument '%s'.\n", flagname_p, cur_modif);
+ pmerge_info.merge_qual_mode = kMergeQualModeNmFirst;
+ } else if (strequal_k(cur_modif, "first", cur_modif_slen)) {
+ pmerge_info.merge_qual_mode = kMergeQualModeFirst;
+ } else if (unlikely(!strequal_k(cur_modif, "min", cur_modif_slen))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-qual-mode argument '%s'.\n", cur_modif);
goto main_ret_INVALID_CMDLINE_WWA;
}
- if (flagname_p2[5] == 'q') {
- pmerge_info.merge_qual_mode = mode;
- } else if (flagname_p2[5] == 'i') {
- pmerge_info.merge_info_mode = mode;
- } else {
- pmerge_info.merge_cm_mode = mode;
- }
} else if (strequal_k_unsafe(flagname_p2, "erge-filter-mode")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
goto main_ret_INVALID_CMDLINE_2A;
@@ -7861,6 +7846,31 @@ int main(int argc, char** argv) {
snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-filter-mode argument '%s'.\n", cur_modif);
goto main_ret_INVALID_CMDLINE_WWA;
}
+ } else if (strequal_k_unsafe(flagname_p2, "erge-info-mode") ||
+ strequal_k_unsafe(flagname_p2, "erge-cm-mode")) {
+ if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ const char* cur_modif = argvk[arg_idx + 1];
+ const uint32_t cur_modif_slen = strlen(cur_modif);
+ MergeInfoCmMode mode;
+ if (strequal_k(cur_modif, "erase", cur_modif_slen)) {
+ mode = kMergeInfoCmModeErase;
+ } else if (strequal_k(cur_modif, "nm-match", cur_modif_slen)) {
+ mode = kMergeInfoCmModeNmMatch;
+ } else if (strequal_k(cur_modif, "nm-first", cur_modif_slen)) {
+ mode = kMergeInfoCmModeNmFirst;
+ } else if (likely(strequal_k(cur_modif, "first", cur_modif_slen))) {
+ mode = kMergeInfoCmModeFirst;
+ } else {
+ snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s argument '%s'.\n", flagname_p, cur_modif);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ if (flagname_p2[5] == 'i') {
+ pmerge_info.merge_info_mode = mode;
+ } else {
+ pmerge_info.merge_cm_mode = mode;
+ }
} else if (strequal_k_unsafe(flagname_p2, "erge-info-sort") ||
strequal_k_unsafe(flagname_p2, "erge-pheno-sort")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
@@ -8546,6 +8556,48 @@ int main(int argc, char** argv) {
goto main_ret_1;
}
pc.command_flags1 |= kfCommand1Pmerge;
+ } else if (strequal_k_unsafe(flagname_p2, "merge-list-dir")) {
+ if (unlikely((!(pc.command_flags1 & kfCommand1Pmerge)) || (!pmerge_info.list_fname))) {
+ logerrputs("Error: --pmerge-list-dir must be used with --pmerge-list.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
+ if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ const char* dir_str = argvk[arg_idx + 1];
+ const uint32_t dir_slen = strlen(dir_str);
+ uint32_t append_path_sep = 0;
+#ifdef _WIN32
+ const char path_sep = '\\';
+ if ((dir_str[dir_slen - 1] != '/') && (dir_str[dir_slen - 1] != '\\')) {
+ append_path_sep = 1;
+ }
+#else
+ const char path_sep = '/';
+ if (dir_str[dir_slen - 1] != '/') {
+ append_path_sep = 1;
+ }
+#endif
+ const uint32_t alloc_blen = dir_slen + 1 + append_path_sep;
+ if (unlikely(alloc_blen > (kPglFnamesize - 9))) {
+ logerrputs("Error: --pmerge-list-dir argument too long.\n");
+ goto main_ret_OPEN_FAIL;
+ }
+ if (unlikely(pgl_malloc(alloc_blen, &pmerge_info.list_base_dir))) {
+ goto main_ret_NOMEM;
+ }
+ char* write_iter = memcpya(pmerge_info.list_base_dir, dir_str, dir_slen);
+ if (append_path_sep) {
+ *write_iter++ = path_sep;
+ }
+ *write_iter = '\0';
+ } else if (strequal_k_unsafe(flagname_p2, "merge-output-vzs")) {
+ if (unlikely(!(pc.command_flags1 & kfCommand1Pmerge))) {
+ logerrputs("Error: --pmerge-output-vzs must be used with --pmerge or --pmerge-list.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
+ pmerge_info.flags |= kfPmergeOutputVzs;
+ goto main_param_zero;
} else if (strequal_k_unsafe(flagname_p2, "gen-diff")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 7))) {
goto main_ret_INVALID_CMDLINE_2A;
@@ -10565,7 +10617,7 @@ int main(int argc, char** argv) {
const uint32_t no_samples_ok = !((pc.dependency_flags & (kfFilterAllReq | kfFilterPsamReq)) || (pc.command_flags1 & kfCommand1Pmerge));
const uint32_t is_vcf = (xload / kfXloadVcf) & 1;
pvar_is_compressed = ((import_flags & (kfImportKeepAutoconv | kfImportKeepAutoconvVzs)) != kfImportKeepAutoconv);
- if (no_samples_ok && is_vcf && (!(import_flags & kfImportKeepAutoconv)) && pc.command_flags1) {
+ if (no_samples_ok && is_vcf && (!(import_flags & (kfImportKeepAutoconv | kfImportVcfRefNMissing))) && pc.command_flags1) {
// special case: just treat the VCF as a .pvar file
strcpy(pvarname, pgenname);
pgenname[0] = '\0';
@@ -10609,7 +10661,6 @@ int main(int argc, char** argv) {
goto main_ret_1;
}
- // todo: we have to skip this when merging is involved
pc.hard_call_thresh = UINT32_MAX;
pc.dosage_erase_thresh = 0;
@@ -10676,19 +10727,11 @@ int main(int argc, char** argv) {
}
if (pc.command_flags1 & kfCommand1Pmerge) {
- const uint32_t zst_level = g_zst_level;
char* merge_outname_end = outname_end;
- if (pc.command_flags1 == kfCommand1Pmerge) {
- import_flags |= kfImportKeepAutoconv;
- } else {
- if (make_plink2_flags & (kfMakePgen | kfMakePvar | kfMakePsam)) {
- merge_outname_end = strcpya_k(merge_outname_end, "-merge");
- }
- if (!(import_flags & kfImportKeepAutoconv)) {
- g_zst_level = 1;
- }
+ if (make_plink2_flags & (kfMakePgen | kfMakePvar | kfMakePsam)) {
+ merge_outname_end = strcpya_k(merge_outname_end, "-merge");
}
- reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, import_flags, pc.sample_sort_mode, pc.fam_cols, pc.missing_pheno, pc.max_thread_ct, pc.sort_vars_mode, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
+ reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, pc.sample_sort_mode, pc.fam_cols, pc.missing_pheno, pc.max_thread_ct, pc.sort_vars_mode, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
if (unlikely(reterr)) {
goto main_ret_1;
}
@@ -10702,10 +10745,9 @@ int main(int argc, char** argv) {
// otherwise error out (since --real-ref-alleles does not make sense on
// the merged .pgen).
pc.misc_flags &= ~kfMiscRealRefAlleles;
- g_zst_level = zst_level;
}
- if ((pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar | kfCommand1WriteSamples))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (pc.sort_vars_mode > kSortNone))) {
+ if ((pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar | kfCommand1WriteSamples))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (pc.sort_vars_mode <= kSortNone))) {
// split-chromosome prohibited for all commands unless explicitly
// permitted here
pc.dependency_flags |= kfFilterNoSplitChr;
=====================================
plink2_cmdline.cc
=====================================
@@ -746,6 +746,15 @@ BoolErr bigstack_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
return 0;
}
+BoolErr bigstack_calloc_cpp(uintptr_t ct, char**** cpp_arr_ptr) {
+ *cpp_arr_ptr = S_CAST(char***, bigstack_alloc(ct * sizeof(intptr_t)));
+ if (unlikely(!(*cpp_arr_ptr))) {
+ return 1;
+ }
+ ZeroPtrArr(ct, *cpp_arr_ptr);
+ return 0;
+}
+
BoolErr bigstack_end_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
*uc_arr_ptr = S_CAST(unsigned char*, bigstack_end_alloc(ct));
if (unlikely(!(*uc_arr_ptr))) {
@@ -1098,7 +1107,6 @@ void BitvecInvertCopy(const uintptr_t* __restrict source_bitvec, uintptr_t word_
#endif
}
-/*
void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
// main_bitvec := main_bitvec XOR arg_bitvec
#ifdef __LP64__
@@ -1134,7 +1142,6 @@ void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintpt
}
#endif
}
-*/
void BitvecInvertAndMask(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
// main_bitvec := (~main_bitvec) AND include_bitvec
=====================================
plink2_cmdline.h
=====================================
@@ -604,6 +604,11 @@ HEADER_INLINE BoolErr bigstack_alloc_wpp(uintptr_t ct, uintptr_t**** wpp_arr_ptr
return !(*wpp_arr_ptr);
}
+HEADER_INLINE BoolErr bigstack_alloc_cpp(uintptr_t ct, char**** cpp_arr_ptr) {
+ *cpp_arr_ptr = S_CAST(char***, bigstack_alloc(ct * sizeof(intptr_t)));
+ return !(*cpp_arr_ptr);
+}
+
HEADER_INLINE BoolErr bigstack_alloc_u32pp(uintptr_t ct, uint32_t**** u32pp_arr_ptr) {
*u32pp_arr_ptr = S_CAST(uint32_t***, bigstack_alloc(ct * sizeof(intptr_t)));
return !(*u32pp_arr_ptr);
@@ -636,6 +641,8 @@ BoolErr bigstack_calloc_cp(uintptr_t ct, char*** cp_arr_ptr);
BoolErr bigstack_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr);
+BoolErr bigstack_calloc_cpp(uintptr_t ct, char**** cpp_arr_ptr);
+
HEADER_INLINE BoolErr bigstack_calloc_c(uintptr_t ct, char** c_arr_ptr) {
return bigstack_calloc_uc(ct, R_CAST(unsigned char**, c_arr_ptr));
}
@@ -1177,7 +1184,7 @@ HEADER_INLINE void AlignedBitarrInvertCopy(const uintptr_t* __restrict source_bi
}
}
-// void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);
+void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);
void BitvecInvertAndMask(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
=====================================
plink2_common.h
=====================================
@@ -171,19 +171,21 @@ FLAGSET64_DEF_START()
kfExportfLgenRef = (1 << 25),
kfExportfList = (1 << 26),
kfExportfRlist = (1 << 27),
- kfExportfOxGen = (1 << 28),
- kfExportfPed = (1 << 29),
- kfExportfCompound = (1 << 30),
- kfExportfStructure = (1U << 31),
- kfExportfTranspose = (1LLU << 32),
- kfExportfVcf42 = (1LLU << 33),
- kfExportfVcf43 = (1LLU << 34),
+ kfExportfOxGenV1 = (1 << 28),
+ kfExportfOxGenV2 = (1 << 29),
+ kfExportfOxGen = kfExportfOxGenV1 | kfExportfOxGenV2,
+ kfExportfPed = (1 << 30),
+ kfExportfCompound = (1U << 31),
+ kfExportfStructure = (1LLU << 32),
+ kfExportfTranspose = (1LLU << 33),
+ kfExportfVcf42 = (1LLU << 34),
+ kfExportfVcf43 = (1LLU << 35),
kfExportfVcf = kfExportfVcf42 | kfExportfVcf43,
kfExportfTypemask = (2LLU * kfExportfVcf43) - kfExportf23,
- kfExportfIncludeAlt = (1LLU << 35),
- kfExportfBgz = (1LLU << 36),
- kfExportfOmitNonmaleY = (1LLU << 37),
- kfExportfSampleV2 = (1LLU << 38)
+ kfExportfIncludeAlt = (1LLU << 36),
+ kfExportfBgz = (1LLU << 37),
+ kfExportfOmitNonmaleY = (1LLU << 38),
+ kfExportfSampleV2 = (1LLU << 39)
FLAGSET64_DEF_END(ExportfFlags);
FLAGSET_DEF_START()
@@ -389,7 +391,7 @@ void PopulateRescaledDosageF(const uintptr_t* genoarr, const uintptr_t* dosage_p
// assumes trailing bits of genoarr are zeroed out
HEADER_INLINE uint32_t AtLeastOneHetUnsafe(const uintptr_t* genoarr, uint32_t sample_ct) {
- const uint32_t sample_ctl2 = DivUp(sample_ct, kBitsPerWordD2);
+ const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
for (uint32_t uii = 0; uii != sample_ctl2; ++uii) {
const uintptr_t geno_word = genoarr[uii];
if (Word01(geno_word)) {
=====================================
plink2_data.cc
=====================================
@@ -174,7 +174,7 @@ void PvarInfoWrite(uint32_t info_pr_flag_present, uint32_t is_pr, char* info_tok
uint32_t info_token_slen = info_token_end - info_token;
char* info_token_pr = nullptr;
if (info_pr_flag_present) {
- info_token_pr = PrInInfoToken(info_token_slen, info_token);
+ info_token_pr = InfoPrStart(info_token_slen, info_token);
}
char* write_iter = *write_iter_ptr;
if (is_pr || (!info_token_pr)) {
@@ -553,14 +553,20 @@ PglErr WritePvar(const char* outname, const uintptr_t* variant_include, const Ch
const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
const uint32_t all_nonref = (nonref_flags_storage == 2);
uint32_t write_info_pr = all_nonref;
- uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
+ if (pvar_info_reload && (!(pvar_psam_flags & kfPvarColXinfo))) {
+ // bugfix (6 Feb 2021): don't write INFO column when INFO column
+ // explicitly excluded here, but pvar_info_reload is needed for some
+ // other reason
+ pvar_info_reload = nullptr;
+ }
+ const uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
if (write_info && nonref_flags) {
write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
}
write_info_pr = write_info_pr && write_info;
if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
logputs("\n");
- logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
+ logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n\"provisional reference\" flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
goto WritePvar_ret_INCONSISTENT_INPUT;
}
@@ -1184,9 +1190,9 @@ PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const Sam
/*
#ifdef NO_UNALIGNED
-# error "Unaligned accesses in BitvecResort()."
+# error "Unaligned accesses in BitvecPermute()."
#endif
-void BitvecResort(const uintptr_t* bitvec, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, unsigned char* writebuf) {
+void BitvecPermute(const uintptr_t* bitvec, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, unsigned char* writebuf) {
const uint32_t sample_ctl_m1 = BitCtToWordCt(sample_ct) - 1;
uint32_t widx = 0;
uint32_t cur_word_entry_ct = kBitsPerWord;
@@ -1210,37 +1216,111 @@ void BitvecResort(const uintptr_t* bitvec, const uint32_t* new_sample_idx_to_old
}
*/
+// Now assumes trailing bits of genovec have been zeroed out.
+// writebuf need not be word-aligned.
#ifdef NO_UNALIGNED
-# error "Unaligned accesses in GenovecResort()."
+# error "Unaligned accesses in GenovecPermute()."
#endif
-void GenovecResort(const uintptr_t* genovec, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, void* writebuf) {
- // writebuf need not be word-aligned
+void GenovecPermute(const uintptr_t* genovec, const uint32_t* old_sample_idx_to_new, uint32_t sample_ct, void* writebuf) {
+ const uintptr_t most_common_geno_word = MostCommonGenoUnsafe(genovec, sample_ct) * kMask5555;
const uint32_t sample_ctl2_m1 = NypCtToWordCt(sample_ct) - 1;
- const uint32_t* new_sample_idx_to_old_iter = new_sample_idx_to_old;
uintptr_t* writebuf_walias = S_CAST(uintptr_t*, writebuf);
for (uint32_t widx = 0; widx != sample_ctl2_m1; ++widx) {
- uintptr_t cur_word = 0;
- // this is noticeably better than the ascending loop
- for (uint32_t uii = kBitsPerWordD2 - 1; ; --uii) {
- cur_word |= GetNyparrEntry(genovec, new_sample_idx_to_old_iter[uii]);
- if (!uii) {
- break;
+ writebuf_walias[widx] = most_common_geno_word;
+ }
+ const uint32_t trailing_bit_ct = 2 * ModNz(sample_ct, kBitsPerWordD2);
+ SubwordStore(bzhi_max(most_common_geno_word, trailing_bit_ct), DivUp(trailing_bit_ct, CHAR_BIT), &(writebuf_walias[sample_ctl2_m1]));
+
+ unsigned char* writebuf_b = S_CAST(unsigned char*, writebuf);
+ for (uint32_t widx = 0; ; ++widx) {
+ uintptr_t geno_word_xor;
+ if (widx >= sample_ctl2_m1) {
+ if (widx > sample_ctl2_m1) {
+ return;
}
- cur_word = cur_word << 2;
+ geno_word_xor = bzhi_max(most_common_geno_word ^ genovec[widx], trailing_bit_ct);
+ } else {
+ geno_word_xor = most_common_geno_word ^ genovec[widx];
}
- writebuf_walias[widx] = cur_word;
- new_sample_idx_to_old_iter = &(new_sample_idx_to_old_iter[kBitsPerWordD2]);
+ if (!geno_word_xor) {
+ continue;
+ }
+ const uint32_t* cur_old_sample_idx_to_new = &(old_sample_idx_to_new[widx * kBitsPerWordD2]);
+ do {
+ const uint32_t bit_read_shift_ct = ctzw(geno_word_xor) & (kBitsPerWord - 2);
+ const uintptr_t cur_geno_xor = (geno_word_xor >> bit_read_shift_ct) & 3;
+ const uint32_t new_sample_idx = cur_old_sample_idx_to_new[bit_read_shift_ct / 2];
+ const uint32_t new_byte_idx = new_sample_idx / 4;
+ const uint32_t bit_write_shift_ct = 2 * (new_sample_idx % 4);
+ // Value has been preset to most_common_geno, so if we xor it with
+ // (most_common_geno ^ actual_geno), the result is actual_geno.
+ writebuf_b[new_byte_idx] ^= cur_geno_xor << bit_write_shift_ct;
+ geno_word_xor &= (~(3 * k1LU)) << bit_read_shift_ct;
+ } while (geno_word_xor);
}
- const uint32_t cur_word_entry_ct = ModNz(sample_ct, kBitsPerWordD2);
- uintptr_t cur_word = 0;
- for (uint32_t uii = cur_word_entry_ct - 1; ; --uii) {
- cur_word |= GetNyparrEntry(genovec, new_sample_idx_to_old_iter[uii]);
- if (!uii) {
- break;
+}
+
+void GenovecPermuteSubset(const uintptr_t* genovec, const uintptr_t* sample_include, const uintptr_t* sample_include_interleaved_vec, const uint32_t* old_sample_idx_to_new, uint32_t raw_sample_ct, uint32_t sample_ct, void* writebuf) {
+ if (sample_ct == raw_sample_ct) {
+ GenovecPermute(genovec, old_sample_idx_to_new, sample_ct, writebuf);
+ return;
+ }
+ STD_ARRAY_DECL(uint32_t, 4, genocounts);
+ GenoarrCountSubsetFreqs(genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
+ uintptr_t most_common_geno = 0;
+ uint32_t most_common_geno_ct = genocounts[0];
+ if (most_common_geno_ct * 2 < sample_ct) {
+ for (uintptr_t cur_geno = 1; cur_geno != 4; ++cur_geno) {
+ if (most_common_geno_ct < genocounts[cur_geno]) {
+ most_common_geno = cur_geno;
+ most_common_geno_ct = genocounts[cur_geno];
+ }
}
- cur_word = cur_word << 2;
}
- SubwordStore(cur_word, NypCtToByteCt(cur_word_entry_ct), &(writebuf_walias[sample_ctl2_m1]));
+ const uintptr_t most_common_geno_word = most_common_geno * kMask5555;
+ const uint32_t sample_ctl2_m1 = NypCtToWordCt(sample_ct) - 1;
+ uintptr_t* writebuf_walias = S_CAST(uintptr_t*, writebuf);
+ for (uint32_t widx = 0; widx != sample_ctl2_m1; ++widx) {
+ writebuf_walias[widx] = most_common_geno_word;
+ }
+ const uint32_t write_trailing_bit_ct = 2 * ModNz(sample_ct, kBitsPerWordD2);
+ SubwordStore(bzhi_max(most_common_geno_word, write_trailing_bit_ct), DivUp(write_trailing_bit_ct, CHAR_BIT), &(writebuf_walias[sample_ctl2_m1]));
+ if (most_common_geno_ct == sample_ct) {
+ return;
+ }
+
+ const uint32_t raw_sample_ctl2_m1 = NypCtToWordCt(raw_sample_ct) - 1;
+ const Halfword* sample_include_alias = R_CAST(const Halfword*, sample_include);
+ unsigned char* writebuf_b = S_CAST(unsigned char*, writebuf);
+ for (uint32_t widx = 0; ; ++widx) {
+ uintptr_t geno_word_xor;
+ if (widx >= raw_sample_ctl2_m1) {
+ if (widx > raw_sample_ctl2_m1) {
+ return;
+ }
+ geno_word_xor = bzhi_max(most_common_geno_word ^ genovec[widx], 2 * ModNz(raw_sample_ct, kBitsPerWordD2));
+ } else {
+ geno_word_xor = most_common_geno_word ^ genovec[widx];
+ }
+ if (!geno_word_xor) {
+ continue;
+ }
+ geno_word_xor &= UnpackHalfwordToWord(sample_include_alias[widx]) * 3;
+ if (!geno_word_xor) {
+ continue;
+ }
+ const uint32_t* cur_old_sample_idx_to_new = &(old_sample_idx_to_new[widx * kBitsPerWordD2]);
+ do {
+ const uint32_t bit_read_shift_ct = ctzw(geno_word_xor) & (kBitsPerWord - 2);
+ const uintptr_t cur_geno_xor = (geno_word_xor >> bit_read_shift_ct) & 3;
+ const uint32_t new_sample_idx = cur_old_sample_idx_to_new[bit_read_shift_ct / 2];
+ // assert(new_sample_idx != UINT32_MAX);
+ const uint32_t new_byte_idx = new_sample_idx / 4;
+ const uint32_t bit_write_shift_ct = 2 * (new_sample_idx % 4);
+ writebuf_b[new_byte_idx] ^= cur_geno_xor << bit_write_shift_ct;
+ geno_word_xor &= (~(3 * k1LU)) << bit_read_shift_ct;
+ } while (geno_word_xor);
+ }
}
// Revised phaseraw:
@@ -1292,7 +1372,7 @@ void UnpackHphaseSubset(const uintptr_t* __restrict all_hets, const uintptr_t* _
}
}
-void UnpackAndResortHphase(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, const uintptr_t* sample_include, const uint32_t* old_sample_idx_to_new, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
+void UnpackAndPermuteHphase(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, const uintptr_t* sample_include, const uint32_t* old_sample_idx_to_new, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
const uintptr_t* aux2a_iter = &(phaseraw[8 / kBytesPerWord]);
const uint32_t* old_sample_idx_to_new_iter = old_sample_idx_to_new;
const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
@@ -1509,44 +1589,122 @@ void CopyDosage(const uintptr_t* __restrict read_dosagepresent, const Dosage* re
memcpy(write_dosagevals, read_dosagevals, dosage_ct * sizeof(Dosage));
}
-uint32_t CopyAndResort8bit(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict new_sample_idx_to_old, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals, uint32_t* __restrict cumulative_popcount_buf) {
- const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
- FillCumulativePopcounts(src_subset, raw_sample_ctl, cumulative_popcount_buf);
+uint32_t CopyAndPermute8bit(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict old_sample_idx_to_new, uint32_t sample_ct, uint32_t val_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals) {
const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
ZeroWArr(sample_ctl, dst_subset);
const unsigned char* src_vals_uc = S_CAST(const unsigned char*, src_vals);
unsigned char* dst_vals_uc = S_CAST(unsigned char*, dst_vals);
- unsigned char* dst_vals_iter = dst_vals_uc;
- // Tried word-based loop, was significantly worse
- for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
- const uint32_t old_sample_idx = new_sample_idx_to_old[new_sample_idx];
- if (IsSet(src_subset, old_sample_idx)) {
+
+ uintptr_t cur_bits = src_subset[0];
+ uint32_t new_val_ct;
+ if (!sample_include) {
+ uintptr_t old_sample_idx_base = 0;
+ for (uint32_t old_val_idx = 0; old_val_idx != val_ct; ++old_val_idx) {
+ const uint32_t old_sample_idx = BitIter1(src_subset, &old_sample_idx_base, &cur_bits);
+ const uint32_t new_sample_idx = old_sample_idx_to_new[old_sample_idx];
+ SetBit(new_sample_idx, dst_subset);
+ dst_vals_uc[new_sample_idx] = src_vals_uc[old_val_idx];
+ }
+ new_val_ct = val_ct;
+ } else {
+ uintptr_t widx = 0;
+ new_val_ct = 0;
+ for (uint32_t old_val_idx = 0; old_val_idx != val_ct; ++old_val_idx) {
+ const uintptr_t old_sample_idx_lowbit = BitIter1y(src_subset, &widx, &cur_bits);
+ if (!(sample_include[widx] & old_sample_idx_lowbit)) {
+ continue;
+ }
+ const uint32_t old_sample_idx = widx * kBitsPerWord + ctzw(old_sample_idx_lowbit);
+ const uint32_t new_sample_idx = old_sample_idx_to_new[old_sample_idx];
SetBit(new_sample_idx, dst_subset);
- const uint32_t old_dosagevals_idx = RawToSubsettedPos(src_subset, cumulative_popcount_buf, old_sample_idx);
- *dst_vals_iter++ = src_vals_uc[old_dosagevals_idx];
+ dst_vals_uc[new_sample_idx] = src_vals_uc[old_val_idx];
+ ++new_val_ct;
}
}
- return dst_vals_iter - dst_vals_uc;
+ uintptr_t new_sample_idx_base = 0;
+ cur_bits = dst_subset[0];
+ for (uint32_t new_val_idx = 0; new_val_idx != new_val_ct; ++new_val_idx) {
+ const uint32_t new_sample_idx = BitIter1(dst_subset, &new_sample_idx_base, &cur_bits);
+ dst_vals_uc[new_val_idx] = dst_vals_uc[new_sample_idx];
+ }
+ return new_val_ct;
}
-uint32_t CopyAndResort16bit(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict new_sample_idx_to_old, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals, uint32_t* __restrict cumulative_popcount_buf) {
- const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
- FillCumulativePopcounts(src_subset, raw_sample_ctl, cumulative_popcount_buf);
+uint32_t CopyAndPermute16bit(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict old_sample_idx_to_new, uint32_t sample_ct, uint32_t val_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals) {
const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
ZeroWArr(sample_ctl, dst_subset);
const uint16_t* src_vals_u16 = S_CAST(const uint16_t*, src_vals);
uint16_t* dst_vals_u16 = S_CAST(uint16_t*, dst_vals);
- uint16_t* dst_vals_iter = dst_vals_u16;
- // Tried word-based loop, was significantly worse
- for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
- const uint32_t old_sample_idx = new_sample_idx_to_old[new_sample_idx];
- if (IsSet(src_subset, old_sample_idx)) {
+
+ uintptr_t cur_bits = src_subset[0];
+ uint32_t new_val_ct;
+ if (!sample_include) {
+ uintptr_t old_sample_idx_base = 0;
+ for (uint32_t old_val_idx = 0; old_val_idx != val_ct; ++old_val_idx) {
+ const uint32_t old_sample_idx = BitIter1(src_subset, &old_sample_idx_base, &cur_bits);
+ const uint32_t new_sample_idx = old_sample_idx_to_new[old_sample_idx];
+ SetBit(new_sample_idx, dst_subset);
+ dst_vals_u16[new_sample_idx] = src_vals_u16[old_val_idx];
+ }
+ new_val_ct = val_ct;
+ } else {
+ uintptr_t widx = 0;
+ new_val_ct = 0;
+ for (uint32_t old_val_idx = 0; old_val_idx != val_ct; ++old_val_idx) {
+ const uintptr_t old_sample_idx_lowbit = BitIter1y(src_subset, &widx, &cur_bits);
+ if (!(sample_include[widx] & old_sample_idx_lowbit)) {
+ continue;
+ }
+ const uint32_t old_sample_idx = widx * kBitsPerWord + ctzw(old_sample_idx_lowbit);
+ const uint32_t new_sample_idx = old_sample_idx_to_new[old_sample_idx];
SetBit(new_sample_idx, dst_subset);
- const uint32_t old_dosagevals_idx = RawToSubsettedPos(src_subset, cumulative_popcount_buf, old_sample_idx);
- *dst_vals_iter++ = src_vals_u16[old_dosagevals_idx];
+ dst_vals_u16[new_sample_idx] = src_vals_u16[old_val_idx];
+ ++new_val_ct;
}
}
- return dst_vals_iter - dst_vals_u16;
+ uintptr_t new_sample_idx_base = 0;
+ cur_bits = dst_subset[0];
+ for (uint32_t new_val_idx = 0; new_val_idx != new_val_ct; ++new_val_idx) {
+ const uint32_t new_sample_idx = BitIter1(dst_subset, &new_sample_idx_base, &cur_bits);
+ dst_vals_u16[new_val_idx] = dst_vals_u16[new_sample_idx];
+ }
+ return new_val_ct;
+}
+
+uint32_t Dense8bitToSparse(const uintptr_t* __restrict set, uint32_t sample_ctl, void* __restrict vals) {
+ unsigned char* vals_uc = S_CAST(unsigned char*, vals);
+ uint32_t write_idx = 0;
+ for (uint32_t widx = 0; widx != sample_ctl; ++widx) {
+ uintptr_t set_word = set[widx];
+ if (!set_word) {
+ continue;
+ }
+ unsigned char* cur_vals_read = &(vals_uc[widx * kBitsPerWord]);
+ do {
+ const uint32_t sample_idx_lowbits = ctzw(set_word);
+ vals_uc[write_idx++] = cur_vals_read[sample_idx_lowbits];
+ set_word &= set_word - 1;
+ } while (set_word);
+ }
+ return write_idx;
+}
+
+uint32_t Dense16bitToSparse(const uintptr_t* __restrict set, uint32_t sample_ctl, void* __restrict vals) {
+ uint16_t* vals_u16 = S_CAST(uint16_t*, vals);
+ uint32_t write_idx = 0;
+ for (uint32_t widx = 0; widx != sample_ctl; ++widx) {
+ uintptr_t set_word = set[widx];
+ if (!set_word) {
+ continue;
+ }
+ uint16_t* cur_vals_read = &(vals_u16[widx * kBitsPerWord]);
+ do {
+ const uint32_t sample_idx_lowbits = ctzw(set_word);
+ vals_u16[write_idx++] = cur_vals_read[sample_idx_lowbits];
+ set_word &= set_word - 1;
+ } while (set_word);
+ }
+ return write_idx;
}
// Requires trailing bits of genovec to be zeroed out.
@@ -3517,13 +3675,13 @@ BoolErr FillInfoVtypeNum(const char* num_str_start, int32_t* info_vtype_num_ptr)
} else if (first_num_char == 'R') {
*info_vtype_num_ptr = kInfoVtypeR;
} else if (likely(first_num_char == 'G')) {
- *info_vtype_num_ptr = kInfoVtypeG;
+ *info_vtype_num_ptr = kInfoVtypeUnknown;
} else {
return 1;
}
} else {
uint32_t val;
- if (unlikely(ScanmovPosintCapped(UINT32_MAX, &num_str_start, &val) || (num_str_start[0] != ','))) {
+ if (unlikely(ScanmovPosintCapped(0x3fffffff, &num_str_start, &val) || (num_str_start[0] != ','))) {
return 1;
}
*info_vtype_num_ptr = val;
@@ -3681,14 +3839,17 @@ PglErr WritePvarSplit(const char* outname, const uintptr_t* variant_include, con
const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
const uint32_t all_nonref = (nonref_flags_storage == 2);
uint32_t write_info_pr = all_nonref;
- uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
+ if (pvar_info_reload && (!(pvar_psam_flags & kfPvarColXinfo))) {
+ pvar_info_reload = nullptr;
+ }
+ const uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
if (write_info && nonref_flags) {
write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
}
write_info_pr = write_info_pr && write_info;
if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
logputs("\n");
- logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
+ logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n\"provisional reference\" flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
goto WritePvarSplit_ret_INCONSISTENT_INPUT;
}
@@ -4041,24 +4202,6 @@ PglErr WritePvarSplit(const char* outname, const uintptr_t* variant_include, con
const char* cur_info_cur = info_curs[kpos];
const char* subtoken_end = AdvToDelimOrEnd(cur_info_cur, cur_info_end, ',');
- if (knum == kInfoVtypeG) {
- if (unlikely(subtoken_end == cur_info_end)) {
- snprintf(g_logbuf, kLogbufSize, "Error: Too few values for INFO key '%s', variant ID '%s'.\n", cur_key_str, orig_variant_id);
- goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
- }
- cswritep = memcpya(cswritep, cur_info_cur, 1 + S_CAST(uintptr_t, subtoken_end - cur_info_cur));
- cur_info_cur = subtoken_end;
- const uint32_t skip_ct = alt_allele_idx - 1;
- if (skip_ct) {
- cur_info_cur = AdvToNthDelimChecked(&(cur_info_cur[1]), cur_info_end, skip_ct, ',');
- if (unlikely(!subtoken_end)) {
- snprintf(g_logbuf, kLogbufSize, "Error: Too few values for INFO key '%s', variant ID '%s'.\n", cur_key_str, orig_variant_id);
- goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
- }
- }
- ++cur_info_cur;
- subtoken_end = AdvToDelimOrEnd(cur_info_cur, cur_info_end, ',');
- }
if (unlikely((subtoken_end == cur_info_end) != is_last_allele)) {
snprintf(g_logbuf, kLogbufSize, "Error: Wrong number of values for INFO key '%s', variant ID '%s'.\n", cur_key_str, orig_variant_id);
goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
@@ -4337,14 +4480,17 @@ PglErr WritePvarJoin(const char* outname, const uintptr_t* variant_include, cons
const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
const uint32_t all_nonref = (nonref_flags_storage == 2);
uint32_t write_info_pr = all_nonref;
- uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
+ if (pvar_info_reload && (!(pvar_psam_flags & kfPvarColXinfo))) {
+ pvar_info_reload = nullptr;
+ }
+ const uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
if (write_info && nonref_flags) {
write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
}
write_info_pr = write_info_pr && write_info;
if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
logputs("\n");
- logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
+ logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n\"provisional reference\" flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
goto WritePvarJoin_ret_INCONSISTENT_INPUT;
}
@@ -4736,7 +4882,8 @@ typedef struct MakeBedlikeCtxStruct {
const uintptr_t* variant_include;
uint32_t* sample_include_cumulative_popcounts;
- const uint32_t* collapsed_sort_map;
+ // unlike MakePgenCtx, old_sample_idx is subsetted here
+ const uint32_t* old_sample_idx_to_new;
PgenReader** pgr_ptrs;
@@ -4775,7 +4922,7 @@ THREAD_FUNC_DECL MakeBedlikeThread(void* raw_arg) {
PgrSetSampleSubsetIndex(ctx->sample_include_cumulative_popcounts, pgrp, &pssi);
const uintptr_t* sex_male_collapsed_interleaved = mcp->sex_male_collapsed_interleaved;
const uintptr_t* sex_female_collapsed_interleaved = mcp->sex_female_collapsed_interleaved;
- const uint32_t* collapsed_sort_map = ctx->collapsed_sort_map;
+ const uint32_t* old_sample_idx_to_new = ctx->old_sample_idx_to_new;
const Plink2WriteFlags plink2_write_flags = mcp->plink2_write_flags;
const uint32_t set_hh_missing = plink2_write_flags & kfPlink2WriteSetHhMissing;
const uint32_t set_mixed_mt_missing = plink2_write_flags & kfPlink2WriteSetMixedMtMissing;
@@ -4878,12 +5025,11 @@ THREAD_FUNC_DECL MakeBedlikeThread(void* raw_arg) {
if (write_plink1) {
PgrPlink2ToPlink1InplaceUnsafe(sample_ct, genovec);
}
- // trailing bytes don't matter, but trailing bits of last byte may
ZeroTrailingNyps(sample_ct, genovec);
- if (!collapsed_sort_map) {
+ if (!old_sample_idx_to_new) {
writebuf_iter = memcpyua(writebuf_iter, genovec, sample_ct4);
} else {
- GenovecResort(genovec, collapsed_sort_map, sample_ct, writebuf_iter);
+ GenovecPermute(genovec, old_sample_idx_to_new, sample_ct, writebuf_iter);
writebuf_iter = &(writebuf_iter[sample_ct4]);
}
}
@@ -4957,7 +5103,7 @@ PglErr MakeBedlikeMain(const uintptr_t* sample_include, const uint32_t* new_samp
// tried more threads, pointless since this is too I/O-bound
// (exception: reordering samples)
uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
- ctx.collapsed_sort_map = new_sample_idx_to_old;
+ ctx.old_sample_idx_to_new = nullptr;
if (!new_sample_idx_to_old) {
// Without BMI2 instructions, subsetting is most expensive with
// sample_ct near 2/3 of raw_sample_ct; up to ~7 compute threads are
@@ -4977,15 +5123,22 @@ PglErr MakeBedlikeMain(const uintptr_t* sample_include, const uint32_t* new_samp
if (calc_thread_max < calc_thread_ct) {
calc_thread_ct = calc_thread_max;
}
- } else if (sample_ct < raw_sample_ct) {
- uint32_t* new_collapsed_sort_map;
- if (unlikely(bigstack_alloc_u32(sample_ct, &new_collapsed_sort_map))) {
+ } else {
+ uint32_t* old_sample_idx_to_new;
+ if (unlikely(bigstack_alloc_u32(raw_sample_ct, &old_sample_idx_to_new))) {
goto MakeBedlikeMain_ret_NOMEM;
}
- // bugfix (26 Mar 2018): forgot to initialize this
- memcpy(new_collapsed_sort_map, new_sample_idx_to_old, sample_ct * sizeof(int32_t));
- UidxsToIdxs(sample_include, ctx.sample_include_cumulative_popcounts, sample_ct, new_collapsed_sort_map);
- ctx.collapsed_sort_map = new_collapsed_sort_map;
+ if (sample_ct == raw_sample_ct) {
+ for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
+ old_sample_idx_to_new[new_sample_idx_to_old[new_sample_idx]] = new_sample_idx;
+ }
+ } else {
+ for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
+ const uint32_t old_sample_uidx = new_sample_idx_to_old[new_sample_idx];
+ old_sample_idx_to_new[RawToSubsettedPos(sample_include, ctx.sample_include_cumulative_popcounts, old_sample_uidx)] = new_sample_idx;
+ }
+ }
+ ctx.old_sample_idx_to_new = old_sample_idx_to_new;
}
if (make_plink2_flags & kfMakeBed) {
@@ -5122,8 +5275,8 @@ PglErr MakeBedlikeMain(const uintptr_t* sample_include, const uint32_t* new_samp
typedef struct MakePgenCtxStruct {
MakeCommon* mcp;
- const uint32_t* new_sample_idx_to_old;
uint32_t* old_sample_idx_to_new;
+ uintptr_t* sample_include_interleaved_vec;
// combine existing chr_mask/xymt_codes/haploid_mask/chr_idx_to_foidx with
// new collapsed chromosome boundary table
uint32_t* write_chr_fo_vidx_start;
@@ -5150,7 +5303,6 @@ typedef struct MakePgenCtxStruct {
Dosage** thread_write_dosagevals;
uintptr_t** thread_write_dphasepresents;
SDosage** thread_write_dphasedeltas;
- uint32_t** thread_cumulative_popcount_bufs;
PglErr write_reterr;
int32_t write_errno;
} MakePgenCtx;
@@ -5168,7 +5320,6 @@ THREAD_FUNC_DECL MakePgenThread(void* raw_arg) {
const uintptr_t tidx = arg->tidx;
MakePgenCtx* ctx = S_CAST(MakePgenCtx*, arg->sharedp->context);
- const uint32_t* new_sample_idx_to_old = ctx->new_sample_idx_to_old;
const uint32_t* old_sample_idx_to_new = ctx->old_sample_idx_to_new;
const MakeCommon* mcp = ctx->mcp;
const ChrInfo* cip = mcp->cip;
@@ -5176,6 +5327,7 @@ THREAD_FUNC_DECL MakePgenThread(void* raw_arg) {
const uintptr_t* write_allele_idx_offsets = ctx->write_allele_idx_offsets;
const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select_iter) = mcp->refalt1_select;
const uintptr_t* sample_include = mcp->sample_include;
+ const uintptr_t* sample_include_interleaved_vec = ctx->sample_include_interleaved_vec;
const uintptr_t* sex_male_collapsed = ctx->sex_male_collapsed;
@@ -5216,7 +5368,7 @@ THREAD_FUNC_DECL MakePgenThread(void* raw_arg) {
}
uintptr_t* write_genovec = nullptr;
// assumes sample_include == nullptr if sample_ct == raw_sample_ct
- if (new_sample_idx_to_old || sample_include) {
+ if (old_sample_idx_to_new || sample_include) {
write_genovec = ctx->thread_write_genovecs[tidx];
write_genovec[sample_ctl2 - 1] = 0;
}
@@ -5242,7 +5394,6 @@ THREAD_FUNC_DECL MakePgenThread(void* raw_arg) {
uintptr_t* write_dphasepresent = nullptr;
SDosage* write_dphasedeltas = nullptr;
SDosage* tmp_dphasedeltas = nullptr;
- uint32_t* cumulative_popcount_buf = nullptr;
if (ctx->thread_write_dosagepresents) {
write_dosagepresent = ctx->thread_write_dosagepresents[tidx];
write_dosagevals = ctx->thread_write_dosagevals[tidx];
@@ -5252,9 +5403,6 @@ THREAD_FUNC_DECL MakePgenThread(void* raw_arg) {
tmp_dphasedeltas = &(write_dphasedeltas[RoundUpPow2(sample_ct, kCacheline / 2)]);
}
}
- if ((ctx->thread_write_mhc || ctx->thread_write_dosagepresents) && new_sample_idx_to_old) {
- cumulative_popcount_buf = ctx->thread_cumulative_popcount_bufs[tidx];
- }
uint32_t variant_idx_offset = 0;
uint32_t allele_ct = 2;
uint32_t parity = 0;
@@ -5369,21 +5517,22 @@ THREAD_FUNC_DECL MakePgenThread(void* raw_arg) {
uint32_t write_rare10_ct = 0;
uint32_t write_dosage_ct = 0;
uint32_t write_dphase_ct = 0;
- if (new_sample_idx_to_old) {
- GenovecResort(loadbuf_iter, new_sample_idx_to_old, sample_ct, write_genovec);
+ if (old_sample_idx_to_new) {
+ ZeroTrailingNyps(raw_sample_ct, loadbuf_iter);
+ GenovecPermuteSubset(loadbuf_iter, sample_include, sample_include_interleaved_vec, old_sample_idx_to_new, raw_sample_ct, sample_ct, write_genovec);
if (read_rare01_ct) {
- write_rare01_ct = CopyAndResort8bit(read_patch_01_set, read_patch_01_vals, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_patch_01_set, write_patch_01_vals, cumulative_popcount_buf);
+ write_rare01_ct = CopyAndPermute8bit(sample_include, read_patch_01_set, read_patch_01_vals, old_sample_idx_to_new, raw_sample_ct, read_rare01_ct, write_patch_01_set, write_patch_01_vals);
}
if (read_rare10_ct) {
- write_rare10_ct = CopyAndResort16bit(read_patch_10_set, read_patch_10_vals, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_patch_10_set, write_patch_10_vals, cumulative_popcount_buf);
+ write_rare10_ct = CopyAndPermute16bit(sample_include, read_patch_10_set, read_patch_10_vals, old_sample_idx_to_new, raw_sample_ct, read_rare10_ct, write_patch_10_set, write_patch_10_vals);
}
if (is_hphase) {
- UnpackAndResortHphase(all_hets, cur_phaseraw, sample_include, old_sample_idx_to_new, raw_sample_ct, sample_ct, &cur_write_phasepresent, write_phaseinfo);
+ UnpackAndPermuteHphase(all_hets, cur_phaseraw, sample_include, old_sample_idx_to_new, raw_sample_ct, sample_ct, &cur_write_phasepresent, write_phaseinfo);
}
if (is_dosage) {
- write_dosage_ct = CopyAndResort16bit(cur_dosagepresent, cur_dosagevals, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_dosagepresent, write_dosagevals, cumulative_popcount_buf);
+ write_dosage_ct = CopyAndPermute16bit(sample_include, cur_dosagepresent, cur_dosagevals, old_sample_idx_to_new, raw_sample_ct, read_dosage_ct, write_dosagepresent, write_dosagevals);
if (is_dphase) {
- write_dphase_ct = CopyAndResort16bit(cur_dphasepresent, cur_dphasedelta, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_dphasepresent, write_dphasedeltas, cumulative_popcount_buf);
+ write_dphase_ct = CopyAndPermute16bit(sample_include, cur_dphasepresent, cur_dphasedelta, old_sample_idx_to_new, raw_sample_ct, read_dphase_ct, write_dphasepresent, write_dphasedeltas);
}
}
} else if (sample_include) {
@@ -5865,7 +6014,6 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
mcp->sample_include = subsetting_required? sample_include : nullptr;
- ctx.new_sample_idx_to_old = new_sample_idx_to_old;
ctx.sex_male_collapsed = sex_male_collapsed;
ctx.sex_female_collapsed = sex_female_collapsed;
ctx.write_reterr = kPglRetSuccess;
@@ -6017,15 +6165,25 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
ctx.thread_write_genovecs = nullptr;
+ ctx.old_sample_idx_to_new = nullptr;
+ ctx.sample_include_interleaved_vec = nullptr;
uint32_t write_mhc_needed = 0;
if (new_sample_idx_to_old || subsetting_required) {
if (unlikely(bigstack_alloc_wp(1, &ctx.thread_write_genovecs))) {
goto MakePgenRobust_ret_NOMEM;
}
- if (read_phase_present && new_sample_idx_to_old) {
+ if (new_sample_idx_to_old) {
if (unlikely(bigstack_alloc_u32(raw_sample_ct, &ctx.old_sample_idx_to_new))) {
goto MakePgenRobust_ret_NOMEM;
}
+ if (subsetting_required) {
+ // SetAllU32Arr(raw_sample_ct, ctx.old_sample_idx_to_new);
+ const uint32_t raw_sample_ctv = BitCtToVecCt(raw_sample_ct);
+ if (unlikely(bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.sample_include_interleaved_vec))) {
+ goto MakePgenRobust_ret_NOMEM;
+ }
+ FillInterleavedMaskVec(sample_include, raw_sample_ctv, ctx.sample_include_interleaved_vec);
+ }
for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
ctx.old_sample_idx_to_new[new_sample_idx_to_old[new_sample_idx]] = new_sample_idx;
}
@@ -6080,12 +6238,6 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
}
}
}
- if ((write_mhc_needed || read_dosage_present) && new_sample_idx_to_old) {
- if (unlikely(bigstack_alloc_u32p(1, &ctx.thread_cumulative_popcount_bufs) ||
- bigstack_alloc_u32(raw_sample_ctl, &(ctx.thread_cumulative_popcount_bufs[0])))) {
- goto MakePgenRobust_ret_NOMEM;
- }
- }
mcp->refalt1_select = refalt1_select;
if (refalt1_select) {
if (write_allele_idx_offsets) {
@@ -6159,7 +6311,27 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
uintptr_t load_variant_vec_ct = raw_sample_ctv2;
- uint32_t loaded_vrtypes_needed = (read_gflags & kfPgenGlobalMultiallelicHardcallFound)? 1 : 0;
+ // bugfix (8 Jun 2021): forgot to include multiallelic track in
+ // load_variant_vec_ct computation
+ uint32_t loaded_vrtypes_needed = 0;
+ if (read_gflags & kfPgenGlobalMultiallelicHardcallFound) {
+ loaded_vrtypes_needed = 1;
+ // raw format:
+ // rare01_ct, padded out to a word
+ // rare10_ct, padded out to a word
+ // [round up to vector boundary, for patch_01_set]
+ // aux1a, if not mode 15:
+ // patch_01_set as bitarray, raw_sample_ctl words
+ // patch_01_vals (raw_sample_ct * sizeof(AlleleCode)), round up to
+ // word boundary
+ // [round up to vector boundary, for patch_10_set]
+ // aux1b, if not mode 15:
+ // patch_10_set as bitarray, raw_sample_ctl words
+ // patch_10_vals (raw_sample_ct * 2 * sizeof(AlleleCode)), round up
+ // to word boundary
+ const uintptr_t multiallelic_raw_vec_ct = WordCtToVecCt(2) + WordCtToVecCt(raw_sample_ctl + DivUp(raw_sample_ct * sizeof(AlleleCode), kBytesPerWord)) + WordCtToVecCt(raw_sample_ctl + DivUp(raw_sample_ct * 2 * sizeof(AlleleCode), kBytesPerWord));
+ load_variant_vec_ct += WordCtToVecCt(multiallelic_raw_vec_ct);
+ }
if (read_phase_present || read_dosage_present) {
loaded_vrtypes_needed = 1;
if (read_phase_present) {
@@ -6837,7 +7009,6 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
}
MakeCommon mc;
mc.plink2_write_flags = kfPlink2Write0;
- const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
ctx.sex_male_collapsed = nullptr; // defensive
if (make_plink2_flags & kfMakePlink2SetHhMissing) {
@@ -7024,9 +7195,9 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
if (!new_sample_idx_to_old) {
// hphase doesn't seem to affect read:write ratio much
#ifdef USE_AVX2
- const uint32_t max_calc_thread_ct = 2;
+ const uint32_t max_calc_thread_ct = 3;
#else
- const uint32_t max_calc_thread_ct = 2 + subsetting_required;
+ const uint32_t max_calc_thread_ct = 3 + subsetting_required;
#endif
if (calc_thread_ct > max_calc_thread_ct) {
calc_thread_ct = max_calc_thread_ct;
@@ -7099,7 +7270,6 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
}
}
ctx.pwcs = &(mpgwp->pwcs[0]);
- ctx.new_sample_idx_to_old = new_sample_idx_to_old;
ctx.thread_write_genovecs = nullptr;
ctx.thread_write_mhc = nullptr;
@@ -7107,15 +7277,26 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
// the I/O thread loads the next (64k * thread_ct).
uintptr_t other_per_thread_cacheline_ct = 2 * load_vblock_cacheline_ct;
+ ctx.old_sample_idx_to_new = nullptr;
+ ctx.sample_include_interleaved_vec = nullptr;
+
uint32_t write_mhc_needed = 0;
if (new_sample_idx_to_old || subsetting_required) {
if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_genovecs)) {
goto MakePlink2NoVsort_fallback;
}
- if (read_phase_present && new_sample_idx_to_old) {
- if (bigstack_alloc_u32(raw_sample_ct, &ctx.old_sample_idx_to_new)) {
+ if (new_sample_idx_to_old) {
+ if (unlikely(bigstack_alloc_u32(raw_sample_ct, &ctx.old_sample_idx_to_new))) {
goto MakePlink2NoVsort_fallback;
}
+ if (subsetting_required) {
+ // SetAllU32Arr(raw_sample_ct, ctx.old_sample_idx_to_new);
+ const uint32_t raw_sample_ctv = BitCtToVecCt(raw_sample_ct);
+ if (unlikely(bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.sample_include_interleaved_vec))) {
+ goto MakePlink2NoVsort_fallback;
+ }
+ FillInterleavedMaskVec(sample_include, raw_sample_ctv, ctx.sample_include_interleaved_vec);
+ }
for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
ctx.old_sample_idx_to_new[new_sample_idx_to_old[new_sample_idx]] = new_sample_idx;
}
@@ -7134,13 +7315,6 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
write_mhcraw_cacheline_ct = DivUp(mhcwrite_word_ct, kWordsPerCacheline);
other_per_thread_cacheline_ct += write_mhcraw_cacheline_ct;
}
- if ((write_mhc_needed || read_dosage_present) && new_sample_idx_to_old) {
- // ctx.thread_cumulative_popcount_bufs
- other_per_thread_cacheline_ct += Int32CtToCachelineCt(raw_sample_ctl);
- if (bigstack_alloc_u32p(calc_thread_ct, &ctx.thread_cumulative_popcount_bufs)) {
- goto MakePlink2NoVsort_fallback;
- }
- }
ctx.thread_write_phasepresents = nullptr;
ctx.thread_all_hets = nullptr;
ctx.thread_write_dosagepresents = nullptr;
@@ -7230,9 +7404,6 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
if (write_mhc_needed) {
ctx.thread_write_mhc[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(write_mhcraw_cacheline_ct * kCacheline));
}
- if ((write_mhc_needed || read_dosage_present) && new_sample_idx_to_old) {
- ctx.thread_cumulative_popcount_bufs[tidx] = S_CAST(uint32_t*, bigstack_alloc_raw(Int32CtToCachelineCt(raw_sample_ctl) * kCacheline));
- }
}
}
snprintf(outname_end, kMaxOutfnameExtBlen, ".pgen");
@@ -7786,14 +7957,17 @@ PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include,
const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
const uint32_t all_nonref = (nonref_flags_storage == 2);
uint32_t write_info_pr = all_nonref;
- uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
+ if (pvar_info_reload && (!(pvar_psam_flags & kfPvarColXinfo))) {
+ pvar_info_reload = nullptr;
+ }
+ const uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
if (write_info && nonref_flags) {
write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
}
write_info_pr = write_info_pr && write_info;
if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
logputs("\n");
- logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
+ logerrputs("Error: Conflicting INFO/PR definitions. Either fix all REF alleles so that the\n\"provisional reference\" flag is no longer needed, or remove/rename the other\nuse of the INFO/PR key.\n");
goto WritePvarResorted_ret_INCONSISTENT_INPUT;
}
=====================================
plink2_data.h
=====================================
@@ -71,13 +71,14 @@ CONSTI32(kMaxInfoKeySlen, kMaxIdSlen);
#define MAX_INFO_KEY_SLEN_STR MAX_ID_SLEN_STR
// We only need to distinguish between the following INFO-value-type cases:
-// Number=0 (flag), Number=<positive integer>, Number=., Number=A, Number=R,
-// and Number=G. We use negative numbers to represent the last 4 cases in
+// Number=0 (flag), Number=<positive integer>, Number=., Number=A, and
+// Number=R. (Number=G is being treated as Number=., since otherwise chrX is a
+// nightmare; see https://github.com/samtools/hts-specs/issues/272 for some
+// discussion.) We use negative numbers to represent the last 3 cases in
// InfoVtype.
CONSTI32(kInfoVtypeUnknown, -1);
CONSTI32(kInfoVtypeA, -2);
CONSTI32(kInfoVtypeR, -3);
-CONSTI32(kInfoVtypeG, -4);
// Main fixed data structure when splitting/joining INFO is a hashmap of keys.
// Behavior when splitting:
@@ -87,17 +88,15 @@ CONSTI32(kInfoVtypeG, -4);
// - Number=A and Number=R require splitting the value on ',' and verifying the
// comma count is correct, but is otherwise straightforward since alleles
// can't be permuted.
-// - Number=G requires a bit more work but isn't fundamentally different from
-// A/R.
+// - Number=G is being treated as Number=.
// When joining:
// - Field order is determined by header line order.
-// - Number=. and Number>0 just require a buffer of size ~info_reload_slen, and
-// a boolean indicating whether no mismatch has been found.
+// - Number=./G and Number>0 just require a buffer of size ~info_reload_slen,
+// and a boolean indicating whether no mismatch has been found.
// - Number=0 (Flag) requires a single boolean, we perform an or operation.
-// - Number=A/R/G are the messy ones: we need to have enough space for
+// - Number=A/R are the messy ones: we need to have enough space for
// max_write_allele_ct (or that minus 1) comma-separated values in the =A and
-// =R cases, and max_write_allele_ct * (max_write_allele_ct + 1) / 2 in the
-// diploid =G case.
+// =R cases.
// Since we permit already-multiallelic variants to be part of a join, the =G
// case may require a lot of working memory to handle. We reserve up to 1/16
// of remaining workspace memory for this when we cannot prove that we can
@@ -136,6 +135,14 @@ uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const Samp
char* AppendPhenoStr(const PhenoCol* pheno_col, const char* output_missing_pheno, uint32_t omp_slen, uint32_t sample_uidx, char* write_iter);
+uint32_t CopyAndPermute8bit(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict old_sample_idx_to_new, uint32_t sample_ct, uint32_t val_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);
+
+uint32_t CopyAndPermute16bit(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict old_sample_idx_to_new, uint32_t sample_ct, uint32_t val_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);
+
+uint32_t Dense8bitToSparse(const uintptr_t* __restrict set, uint32_t sample_ctl, void* __restrict vals);
+
+uint32_t Dense16bitToSparse(const uintptr_t* __restrict set, uint32_t sample_ctl, void* __restrict vals);
+
PglErr LoadAlleleAndGenoCounts(const uintptr_t* sample_include, const uintptr_t* founder_info, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const ChrInfo* cip, const uintptr_t* allele_idx_offsets, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t founder_ct, uint32_t male_ct, uint32_t nosex_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t first_hap_uidx, uint32_t is_minimac3_r2, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, uintptr_t* allele_presents, uint64_t* allele_ddosages, uint64_t* founder_allele_ddosages, uint32_t* variant_missing_hc_cts, uint32_t* variant_missing_dosage_cts, uint32_t* variant_hethap_cts, STD_ARRAY_PTR_DECL(uint32_t, 3, raw_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, x_male_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, x_nosex_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), double* imp_r2_vals);
void ApplyHardCallThresh(const uintptr_t* dosage_present, const Dosage* dosage_main, uint32_t dosage_ct, uint32_t hard_call_halfdist, uintptr_t* genovec);
=====================================
plink2_export.cc
=====================================
@@ -964,6 +964,7 @@ PglErr ExportOxGen(const uintptr_t* sample_include, const uint32_t* sample_inclu
}
}
const uint32_t max_chr_blen = GetMaxChrSlen(cip) + 1;
+ const uint32_t is_v2 = (exportf_flags / kfExportfOxGenV2) & 1;
// if no dosages, all genotypes are 6 bytes (missing = " 0 0 0")
// with dosages, we print up to 5 digits past the decimal point, so 7 bytes
// + space for each number, 24 bytes max
@@ -971,7 +972,7 @@ PglErr ExportOxGen(const uintptr_t* sample_include, const uint32_t* sample_inclu
char* chr_buf; // includes trailing space
char* writebuf;
if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf) ||
- bigstack_alloc_c(kMaxMediumLine + max_chr_blen + kMaxIdSlen + 16 + 2 * max_allele_slen + max_geno_slen * sample_ct, &writebuf))) {
+ bigstack_alloc_c(kMaxMediumLine + max_chr_blen + (kMaxIdSlen << is_v2) + 16 + 2 * max_allele_slen + max_geno_slen * sample_ct, &writebuf))) {
goto ExportOxGen_ret_NOMEM;
}
{
@@ -1035,7 +1036,12 @@ PglErr ExportOxGen(const uintptr_t* sample_include, const uint32_t* sample_inclu
is_y = (chr_idx == y_code);
}
write_iter = memcpya(write_iter, chr_buf, chr_blen);
- write_iter = strcpyax(write_iter, variant_ids[variant_uidx], ' ');
+ const char* variant_id = variant_ids[variant_uidx];
+ const uint32_t variant_id_slen = strlen(variant_id);
+ write_iter = memcpyax(write_iter, variant_id, variant_id_slen, ' ');
+ if (is_v2) {
+ write_iter = memcpyax(write_iter, variant_id, variant_id_slen, ' ');
+ }
write_iter = u32toa_x(variant_bps[variant_uidx], ' ', write_iter);
uintptr_t allele_idx_offset_base = variant_uidx * 2;
if (allele_idx_offsets) {
@@ -3158,12 +3164,13 @@ THREAD_FUNC_DECL ExportBgen13Thread(void* raw_arg) {
// export an ID hash table as well (since such a hash table is always
// constructed for the sake of detecting and warning about duplicate
// post-id-paste IDs).
-BoolErr ExportIdpaste(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* ftypename, uint32_t sample_ct, IdpasteFlags exportf_id_paste, char exportf_id_delim, uintptr_t* max_exported_sample_id_blen_ptr, char** exported_sample_ids_ptr) {
+BoolErr ExportIdpaste(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* ftypename, const char* reimport_flagname, uint32_t sample_ct, IdpasteFlags exportf_id_paste, char exportf_id_delim, uintptr_t* max_exported_sample_id_blen_ptr, char** exported_sample_ids_ptr) {
const uint32_t write_fid = DataFidColIsRequired(sample_include, siip, sample_ct, exportf_id_paste / kfIdpasteMaybefid);
const char* sample_ids = siip->sample_ids;
const char* sids = siip->sids;
const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
uintptr_t max_sid_blen = siip->max_sid_blen;
+ const uint32_t write_iid = (exportf_id_paste / kfIdpasteIid) & 1;
const uint32_t write_sid = DataSidColIsRequired(sample_include, sids, sample_ct, max_sid_blen, exportf_id_paste / kfIdpasteMaybesid);
if (write_sid && (!sids)) {
max_sid_blen = 2;
@@ -3194,7 +3201,7 @@ BoolErr ExportIdpaste(const uintptr_t* sample_include, const SampleIdInfo* siip,
}
exported_sample_ids_iter = memcpyax(exported_sample_ids_iter, orig_sample_id, fid_slen, id_delim);
}
- if (exportf_id_paste & kfIdpasteIid) {
+ if (write_iid) {
const char* orig_iid = &(orig_fid_end[1]);
const uint32_t iid_slen = strlen(orig_iid);
if ((!id_delim_warning) && memchr(orig_iid, id_delim, iid_slen)) {
@@ -3217,12 +3224,11 @@ BoolErr ExportIdpaste(const uintptr_t* sample_include, const SampleIdInfo* siip,
}
exported_sample_ids_iter[-1] = '\0';
}
- // todo: revise this warning condition?
- if (id_delim_warning) {
+ if (id_delim_warning && (write_fid + write_iid + write_sid >= 2)) {
if (exportf_id_delim) {
- logerrprintfww("Warning: '%c' present in original sample IDs; --export %s will not be able to reconstruct them. Consider rerunning with a different --export id-delim= value.\n", exportf_id_delim, ftypename);
+ logerrprintfww("Warning: '%c' present in original sample IDs; --%s will not be able to reconstruct them. Consider rerunning with a different --export id-delim= value.\n", exportf_id_delim, reimport_flagname);
} else {
- logerrprintfww("Warning: '_' present in original sample IDs; --export %s will not be able to reconstruct them. Consider rerunning with a suitable --export id-delim= value.\n", ftypename);
+ logerrprintfww("Warning: '_' present in original sample IDs; --%s will not be able to reconstruct them. Consider rerunning with a suitable --export id-delim= value.\n", reimport_flagname);
}
}
if (PopulateStrboxHtable(exported_sample_ids, sample_ct, max_exported_sample_id_blen, exported_id_htable_size, new_id_htable)) {
@@ -3333,7 +3339,7 @@ PglErr ExportBgen13(const char* outname, const uintptr_t* sample_include, uint32
// always save sample IDs...
char* exported_sample_ids;
uintptr_t max_exported_sample_id_blen;
- if (unlikely(ExportIdpaste(sample_include, siip, use_zstd_compression? "bgen-1.3" : "bgen-1.2", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
+ if (unlikely(ExportIdpaste(sample_include, siip, use_zstd_compression? "bgen-1.3" : "bgen-1.2", "bgen", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
goto ExportBgen13_ret_NOMEM;
}
// Compute total length of sample ID block now; this is necessary to fill
@@ -3898,7 +3904,7 @@ PglErr ExportOxSampleV2(const char* outname, const uintptr_t* sample_include, co
char* exported_sample_ids;
uintptr_t max_exported_sample_id_blen;
- if (unlikely(ExportIdpaste(sample_include, &piip->sii, "sample-v2", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
+ if (unlikely(ExportIdpaste(sample_include, &piip->sii, "sample-v2", "sample", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
goto ExportOxSampleV2_ret_NOMEM;
}
@@ -4607,7 +4613,7 @@ PglErr ExportVcf(const uintptr_t* sample_include, const uint32_t* sample_include
write_iter = strcpya_k(write_iter, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" EOLN_STR "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
char* exported_sample_ids;
uintptr_t max_exported_sample_id_blen;
- if (unlikely(ExportIdpaste(sample_include, siip, "vcf", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
+ if (unlikely(ExportIdpaste(sample_include, siip, "vcf", "vcf", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
goto ExportVcf_ret_NOMEM;
}
for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
@@ -7976,7 +7982,7 @@ PglErr ExportBcf(const uintptr_t* sample_include, const uint32_t* sample_include
char* exported_sample_ids;
uintptr_t max_exported_sample_id_blen;
- if (unlikely(ExportIdpaste(sample_include, siip, "bcf", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
+ if (unlikely(ExportIdpaste(sample_include, siip, "bcf", "bcf", sample_ct, exportf_id_paste, exportf_id_delim, &max_exported_sample_id_blen, &exported_sample_ids))) {
goto ExportBcf_ret_NOMEM;
}
for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
=====================================
plink2_fasta.cc
=====================================
@@ -542,7 +542,8 @@ PglErr ProcessFa(const uintptr_t* variant_include, const char* const* variant_id
if (ucc < 'A') {
// > = ascii 62
// ; = ascii 59
- if (ucc == ';') {
+ // bugfix (25 May 2021): newline is also valid
+ if ((ucc == ';') || ((ucc < 32) && (ucc != '\t'))) {
continue;
}
is_first_noncomment_line = 0;
=====================================
plink2_glm.cc
=====================================
@@ -4383,7 +4383,9 @@ THREAD_FUNC_DECL GlmLogisticThread(void* raw_arg) {
ZeromovFArr(nm_sample_ct_rem, &nm_predictors_pmaj_iter);
}
nm_predictors_pmaj_istart = nm_predictors_pmaj_iter;
- prev_nm = !missing_ct;
+ // bugfix (13 Apr 2021): if local covariates are present, we can't
+ // optimize as aggressively
+ prev_nm = !(missing_ct || local_covar_ct);
} else {
// bugfix (15 Aug 2018): this was not handling --parameters
// correctly when a covariate was only needed as part of an
@@ -7436,7 +7438,7 @@ THREAD_FUNC_DECL GlmLinearThread(void* raw_arg) {
}
}
nm_predictors_pmaj_istart = nm_predictors_pmaj_iter;
- prev_nm = !missing_ct;
+ prev_nm = !(missing_ct || local_covar_ct);
} else {
// bugfix (15 Aug 2018): this was not handling --parameters
// correctly when a covariate was only needed as part of an
@@ -9504,7 +9506,7 @@ THREAD_FUNC_DECL GlmLinearSubbatchThread(void* raw_arg) {
}
}
nm_predictors_pmaj_istart = nm_predictors_pmaj_iter;
- prev_nm = !missing_ct;
+ prev_nm = !(missing_ct || local_covar_ct);
} else {
// bugfix (15 Aug 2018): this was not handling --parameters
// correctly when a covariate was only needed as part of an
@@ -11140,6 +11142,8 @@ PglErr GlmMain(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, c
if (new_max_covar_name_blen < condition_blen) {
new_max_covar_name_blen = condition_blen;
}
+ // TODO: this count will be different with a multiallelic variant
+ logprintf("--glm: One --condition covariate added.\n");
} else {
if (unlikely(ii == -2)) {
logerrprintfww("Error: Duplicate --condition variant ID '%s'.\n", glm_info_ptr->condition_varname);
=====================================
plink2_help.cc
=====================================
@@ -463,8 +463,12 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" * 'rlist': .rlist + .fam + .map fileset, where the .rlist file is a\n"
" genotype-based list which omits the most common genotype for\n"
" each variant. Also supports 'omit-nonmale-y'.\n"
-" * 'oxford': Oxford-format .gen + .sample. When the 'bgz' modifier is\n"
-" present, the .gen file is block-gzipped.\n"
+" * 'oxford', 'oxford-v2': Oxford-format .gen + .sample. When the 'bgz'\n"
+" modifier is present, the .gen file is\n"
+" block-gzipped. 'oxford' requests the original\n"
+" .gen file format with 5 leading columns\n"
+" (understood by older PLINK builds); 'oxford-v2'\n"
+" requests the current 6-leading-column flavor.\n"
" * 'ped': PLINK 1 sample-major (.ped + .map), loadable with --file.\n"
" * 'compound-genotypes': Same as 'ped', except that the space between each\n"
" pair of same-variant allele codes is removed.\n"
@@ -1311,8 +1315,8 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" scoresums: Score sums.\n"
" The default is maybefid,maybesid,phenos,nallele,dosagesum,scoreavgs.\n"
" For more sophisticated polygenic risk scoring, we recommend looking at the\n"
-" LDpred (https://github.com/bvilhjal/ldpred ) and PRSice-2\n"
-" (https://www.prsice.info/ ) software packages.\n\n"
+" LDpred2 (https://privefl.github.io/bigsnpr/articles/LDpred2.html ) and\n"
+" PRSice-2 (https://www.prsice.info/ ) software packages.\n\n"
);
HelpPrint("variant-score\0vscore\0", &help_ctrl, 1,
" --variant-score <filename> ['bin' | 'bin4' | 'cols='<col set descriptor>]\n"
@@ -1715,9 +1719,9 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" Linearly transform named covariates (and quantitative phenotypes, if\n"
" --variance-standardize) to mean-zero, variance 1. If no arguments are\n"
" provided, all possible phenotypes/covariates are affected.\n"
-" This is frequently necessary to prevent multicollinearity when dealing with\n"
-" covariates where abs(mean) is much larger than abs(standard deviation),\n"
-" such as year of birth.\n"
+" This is frequently necessary to prevent the multicollinearity check from\n"
+" failing when dealing with covariates where abs(mean) is much larger than\n"
+" the standard deviation, such as year of birth.\n"
" --quantile-normalize [...] : Force named covariates and quantitative\n"
" --pheno-quantile-normalize [...] phenotypes to a N(0,1) distribution,\n"
" --covar-quantile-normalize [...] preserving only the original rank orders.\n"
@@ -2118,7 +2122,9 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" * 'file'/'f' uses the order in the given file\n"
" (named in the last argument).\n"
);
- HelpPrint("pmerge\0pmerge-list\0sample-inner-join\0variant-inner-join\0pheno-inner-join\0merge-mode\0merge-parents-mode\0merge-sex-mode\0merge-pheno-mode\0merge-xheader-mode\0merge-qual-mode\0merge-filter-mode\0merge-info-mode\0merge-cm-mode\0", &help_ctrl, 0,
+ HelpPrint("pmerge\0pmerge-list\0pmerge-list-dir\0pmerge-output-vzs\0sample-inner-join\0variant-inner-join\0pheno-inner-join\0merge-mode\0merge-parents-mode\0merge-sex-mode\0merge-pheno-mode\0merge-xheader-mode\0merge-qual-mode\0merge-filter-mode\0merge-info-mode\0merge-cm-mode\0", &help_ctrl, 0,
+" --pmerge-list-dir <dir> : Specify base dir to join to --pmerge-list entries.\n"
+" --pmerge-output-vzs : Compress the .pvar file from --pmerge[-list].\n"
" --sample-inner-join : By default, --pmerge[-list] performs an 'outer\n"
" --variant-inner-join join': the merged fileset contains the union of\n"
" --pheno-inner-join the samples in the input filesets, and ditto for\n"
@@ -2142,9 +2148,11 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" --merge-filter-mode <m> QUAL/FILTER/INFO/CM entries.\n"
" --merge-info-mode <mode> * 'erase' = remove column\n"
" --merge-cm-mode <mode> * 'nm-match' = nonmissing values must match\n"
-" * 'nm-first' = keep first nonmissing (qual/info/CM\n"
+" * 'nm-first' = keep first nonmissing (info/CM\n"
" default)\n"
" * 'first' = keep first value, even if missing\n"
+" * 'min' = keep minimum value (--merge-qual-mode\n"
+" default, not applicable to others)\n"
" * 'np-union' = keep all non-PASS values\n"
" (--merge-filter-mode default, not\n"
" applicable to others)\n"
=====================================
plink2_import.cc
=====================================
@@ -3119,7 +3119,7 @@ PglErr VcfToPgen(const char* vcfname, const char* preexisting_psamname, const ch
allele_idx_end += alt_ct + 1;
const uint32_t variant_idx_lowbits = variant_ct % kBitsPerWord;
if (info_pr_present) {
- if (PrInInfoToken(info_end - info_start, info_start)) {
+ if (PrInInfo(info_end - info_start, info_start)) {
nonref_word |= k1LU << variant_idx_lowbits;
}
if (variant_idx_lowbits == (kBitsPerWord - 1)) {
@@ -7881,65 +7881,68 @@ PglErr BcfToPgen(const char* bcfname, const char* preexisting_psamname, const ch
}
// INFO
uintptr_t info_pr_here = 0;
- uint64_t cur_info_slen_ubound = n_info - 1;
- for (uint32_t uii = 0; uii != n_info; ++uii) {
- uint32_t sidx;
- if (unlikely(ScanBcfTypedInt(&parse_iter, &sidx) || (parse_iter > shared_end) || (sidx >= fif_string_idx_end))) {
- goto BcfToPgen_ret_VREC_GENERIC;
- }
- const uint32_t key_slen = fif_slens[sidx];
- if (unlikely((!key_slen) || ScanBcfType(&parse_iter, &value_type, &value_ct) || (parse_iter > shared_end))) {
- goto BcfToPgen_ret_VREC_GENERIC;
- }
- cur_info_slen_ubound += key_slen;
- if (value_ct) {
- const uint32_t bytes_per_elem = kBcfBytesPerElem[value_type];
- const uint64_t vec_byte_ct = bytes_per_elem * S_CAST(uint64_t, value_ct);
- if (unlikely((!bytes_per_elem) || (S_CAST(uint64_t, shared_end - parse_iter) < vec_byte_ct))) {
+ if (n_info) {
+ // bugfix (1 Jul 2021): avoid underflow when n_info == 0
+ uint64_t cur_info_slen_ubound = n_info - 1;
+ for (uint32_t uii = 0; uii != n_info; ++uii) {
+ uint32_t sidx;
+ if (unlikely(ScanBcfTypedInt(&parse_iter, &sidx) || (parse_iter > shared_end) || (sidx >= fif_string_idx_end))) {
goto BcfToPgen_ret_VREC_GENERIC;
}
- if (value_type == 1) {
- // int8, max len 4 ("-127"), add 1 for comma
- cur_info_slen_ubound += 5 * S_CAST(uint64_t, value_ct);
- } else if (value_type == 2) {
- // int16, max len 6
- cur_info_slen_ubound += 7 * S_CAST(uint64_t, value_ct);
- } else if (value_type == 3) {
- // int32, max len 11
- cur_info_slen_ubound += 12 * S_CAST(uint64_t, value_ct);
- } else if (value_type == 5) {
- cur_info_slen_ubound += (kMaxFloatGSlen + 1) * S_CAST(uint64_t, value_ct);
- } else {
- // string
- cur_info_slen_ubound += value_ct + 1;
+ const uint32_t key_slen = fif_slens[sidx];
+ if (unlikely((!key_slen) || ScanBcfType(&parse_iter, &value_type, &value_ct) || (parse_iter > shared_end))) {
+ goto BcfToPgen_ret_VREC_GENERIC;
}
- parse_iter = &(parse_iter[vec_byte_ct]);
- } else {
- // tolerate value_type == value_ct == 0 "untyped-missing" special
- // case, but only if field is of type Flag. Note that this means the
- // flag IS present (and yes, bcftools emits this since it's compact).
- //
- // value_ct == 0, value_type == 7 is also valid (string, missing).
- //
- // Otherwise, value_ct == 0 should not happen.
- if (!IsSet(info_flags, sidx)) {
- if (unlikely(value_ct != 7)) {
+ cur_info_slen_ubound += key_slen;
+ if (value_ct) {
+ const uint32_t bytes_per_elem = kBcfBytesPerElem[value_type];
+ const uint64_t vec_byte_ct = bytes_per_elem * S_CAST(uint64_t, value_ct);
+ if (unlikely((!bytes_per_elem) || (S_CAST(uint64_t, shared_end - parse_iter) < vec_byte_ct))) {
goto BcfToPgen_ret_VREC_GENERIC;
}
- cur_info_slen_ubound += 2;
+ if (value_type == 1) {
+ // int8, max len 4 ("-127"), add 1 for comma
+ cur_info_slen_ubound += 5 * S_CAST(uint64_t, value_ct);
+ } else if (value_type == 2) {
+ // int16, max len 6
+ cur_info_slen_ubound += 7 * S_CAST(uint64_t, value_ct);
+ } else if (value_type == 3) {
+ // int32, max len 11
+ cur_info_slen_ubound += 12 * S_CAST(uint64_t, value_ct);
+ } else if (value_type == 5) {
+ cur_info_slen_ubound += (kMaxFloatGSlen + 1) * S_CAST(uint64_t, value_ct);
+ } else {
+ // string
+ cur_info_slen_ubound += value_ct + 1;
+ }
+ parse_iter = &(parse_iter[vec_byte_ct]);
+ } else {
+ // tolerate value_type == value_ct == 0 "untyped-missing" special
+ // case, but only if field is of type Flag. Note that this means the
+ // flag IS present (and yes, bcftools emits this since it's compact).
+ //
+ // value_ct == 0, value_type == 7 is also valid (string, missing).
+ //
+ // Otherwise, value_ct == 0 should not happen.
+ if (!IsSet(info_flags, sidx)) {
+ if (unlikely(value_ct != 7)) {
+ goto BcfToPgen_ret_VREC_GENERIC;
+ }
+ cur_info_slen_ubound += 2;
+ }
}
- }
- if (sidx == pr_sidx) {
- if (unlikely(info_pr_here)) {
- putc_unlocked('\n', stdout);
- snprintf(g_logbuf, kLogbufSize, "Error: Variant record #%" PRIuPTR " in --bcf file has multiple INFO/PR entries.\n", vrec_idx);
- goto BcfToPgen_ret_MALFORMED_INPUT_WW;
+ if (sidx == pr_sidx) {
+ if (unlikely(info_pr_here)) {
+ putc_unlocked('\n', stdout);
+ snprintf(g_logbuf, kLogbufSize, "Error: Variant record #%" PRIuPTR " in --bcf file has multiple INFO/PR entries.\n", vrec_idx);
+ goto BcfToPgen_ret_MALFORMED_INPUT_WW;
+ }
+ info_pr_here = 1;
}
- info_pr_here = 1;
}
- }
- if (cur_info_slen_ubound > filter_info_slen_ubound) {
- filter_info_slen_ubound = cur_info_slen_ubound;
+ if (cur_info_slen_ubound > filter_info_slen_ubound) {
+ filter_info_slen_ubound = cur_info_slen_ubound;
+ }
}
if (unlikely(parse_iter != shared_end)) {
goto BcfToPgen_ret_VREC_GENERIC;
@@ -9199,7 +9202,7 @@ PglErr OxSampleToPsam(const char* samplename, const char* const_fid, const char*
} else if (col_type_char == 'B') {
at_least_one_binary_pheno = 1;
cur_col_type = kOxSampleColBinary;
- } else if (col_type_char == 'P') {
+ } else if ((col_type_char == 'C') || (col_type_char == 'P')) {
cur_col_type = kOxSampleColQuantitative;
} else if (likely(col_type_char == '0')) {
++initial_skip_col_ct;
@@ -9888,13 +9891,40 @@ PglErr OxGenToPgen(const char* genname, const char* samplename, const char* cons
uintptr_t variant_skip_ct = 0;
char* line_iter = TextLineEnd(&gen_txs);
++line_idx;
+ if (unlikely(!TextGetUnsafe2(&gen_txs, &line_iter))) {
+ if (TextStreamErrcode2(&gen_txs, &reterr)) {
+ goto OxGenToPgen_ret_TSTREAM_FAIL;
+ }
+ logerrputs("Error: Empty .gen file.\n");
+ goto OxGenToPgen_ret_INCONSISTENT_INPUT;
+ }
+ uint32_t is_v2 = 0;
+ {
+ const uint32_t token_ct = CountTokens(line_iter);
+ const uint32_t expected_v2_token_ct = 3 * sample_ct + 6;
+ if (token_ct == expected_v2_token_ct) {
+ is_v2 = 1;
+ } else if (unlikely(token_ct != expected_v2_token_ct - 1)) {
+ logerrprintfww("Error: Unexpected number of columns in .gen file (%u or %u expected).\n", expected_v2_token_ct - 1, expected_v2_token_ct);
+ goto OxGenToPgen_ret_INCONSISTENT_INPUT;
+ }
+ }
+ goto OxGenToPgen_loop_start;
for (; TextGetUnsafe2(&gen_txs, &line_iter); ++line_idx) {
+ OxGenToPgen_loop_start:
+ ; // make this work with plain C, as opposed to just C++, compilers
char* chr_code_str = line_iter;
char* chr_code_end = CurTokenEnd(chr_code_str);
const char* variant_id_str = FirstNonTspace(chr_code_end);
if (unlikely(IsEolnKns(*variant_id_str))) {
goto OxGenToPgen_ret_MISSING_TOKENS;
}
+ if (is_v2) {
+ variant_id_str = FirstNonTspace(CurTokenEnd(variant_id_str));
+ if (unlikely(IsEolnKns(*variant_id_str))) {
+ goto OxGenToPgen_ret_MISSING_TOKENS;
+ }
+ }
if (!single_chr_str) {
uint32_t cur_chr_code;
@@ -10055,10 +10085,6 @@ PglErr OxGenToPgen(const char* genname, const char* samplename, const char* cons
goto OxGenToPgen_ret_WRITE_FAIL;
}
if (unlikely(!variant_ct)) {
- if (!variant_skip_ct) {
- logerrputs("Error: Empty .gen file.\n");
- goto OxGenToPgen_ret_INCONSISTENT_INPUT;
- }
logerrprintfww("Error: All %" PRIuPTR " variant%s in .gen file skipped due to chromosome filter.\n", variant_skip_ct, (variant_skip_ct == 1)? "" : "s");
goto OxGenToPgen_ret_INCONSISTENT_INPUT;
}
@@ -10138,7 +10164,7 @@ PglErr OxGenToPgen(const char* genname, const char* samplename, const char* cons
continue;
}
}
- const char* linebuf_iter = NextTokenMult(FirstNonTspace(&(chr_code_end[1])), 4);
+ const char* linebuf_iter = NextTokenMult(FirstNonTspace(&(chr_code_end[1])), 4 + is_v2);
uint32_t inner_loop_last = kBitsPerWordD2 - 1;
Dosage* dosage_main_iter = dosage_main;
for (uint32_t widx = 0; ; ++widx) {
@@ -13843,6 +13869,14 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch
}
reterr = InitTextStream(hapsname, max_line_blen, ClipU32(max_thread_ct - 1, 1, 4), &haps_txs);
if (unlikely(reterr)) {
+ if (reterr == kPglRetOpenFail) {
+ const uint32_t slen = strlen(hapsname);
+ if ((!StrEndsWith(hapsname, ".haps", slen)) &&
+ (!StrEndsWith(hapsname, ".haps.gz", slen))) {
+ logerrprintfww("Error: Failed to open %s : %s. (--haps expects a complete filename; did you forget '.haps' at the end?)\n", hapsname, strerror(errno));
+ goto OxHapslegendToPgen_ret_1;
+ }
+ }
goto OxHapslegendToPgen_ret_TSTREAM_FAIL_HAPS;
}
unsigned char* bigstack_mark2 = g_bigstack_base;
@@ -13926,7 +13960,14 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch
}
reterr = InitTextStream(legendname, max_line_blen, 1, &legend_txs);
if (unlikely(reterr)) {
- goto OxHapslegendToPgen_ret_1;
+ if (reterr == kPglRetOpenFail) {
+ const uint32_t slen = strlen(legendname);
+ if (!StrEndsWith(legendname, ".legend", slen)) {
+ logerrprintfww("Error: Failed to open %s : %s. (--legend expects a complete filename; did you forget '.legend' at the end?)\n", legendname, strerror(errno));
+ goto OxHapslegendToPgen_ret_1;
+ }
+ }
+ goto OxHapslegendToPgen_ret_TSTREAM_FAIL_LEGEND;
}
++line_idx_legend;
char* legend_line_start = TextGet(&legend_txs);
=====================================
plink2_merge.cc
=====================================
The diff for this file was not included because it is too large.
=====================================
plink2_merge.h
=====================================
@@ -36,7 +36,8 @@ FLAGSET_DEF_START()
kfPmergeSampleInnerJoin = (1 << 0),
kfPmergeVariantInnerJoin = (1 << 1),
kfPmergePhenoInnerJoin = (1 << 2),
- kfPmergeMultiallelicsAlreadyJoined = (1 << 3)
+ kfPmergeMultiallelicsAlreadyJoined = (1 << 3),
+ kfPmergeOutputVzs = (1 << 4)
FLAGSET_DEF_END(PmergeFlags);
ENUM_U31_DEF_START()
@@ -58,11 +59,12 @@ ENUM_U31_DEF_START()
ENUM_U31_DEF_END(MergeXheaderMode);
ENUM_U31_DEF_START()
- kMergeQicModeErase,
- kMergeQicModeNmMatch,
- kMergeQicModeNmFirst,
- kMergeQicModeFirst
-ENUM_U31_DEF_END(MergeQicMode);
+ kMergeQualModeErase,
+ kMergeQualModeNmMatch,
+ kMergeQualModeNmFirst,
+ kMergeQualModeFirst,
+ kMergeQualModeMin
+ENUM_U31_DEF_END(MergeQualMode);
ENUM_U31_DEF_START()
kMergeFilterModeErase,
@@ -72,6 +74,13 @@ ENUM_U31_DEF_START()
kMergeFilterModeNonpassUnion
ENUM_U31_DEF_END(MergeFilterMode);
+ENUM_U31_DEF_START()
+ kMergeInfoCmModeErase,
+ kMergeInfoCmModeNmMatch,
+ kMergeInfoCmModeNmFirst,
+ kMergeInfoCmModeFirst
+ENUM_U31_DEF_END(MergeInfoCmMode);
+
ENUM_U31_DEF_START()
kSort0,
kSortNone,
@@ -108,10 +117,10 @@ typedef struct PmergeStruct {
MergePhenoMode merge_sex_mode;
MergePhenoMode merge_pheno_mode;
MergeXheaderMode merge_xheader_mode;
- MergeQicMode merge_qual_mode;
+ MergeQualMode merge_qual_mode;
MergeFilterMode merge_filter_mode;
- MergeQicMode merge_info_mode;
- MergeQicMode merge_cm_mode;
+ MergeInfoCmMode merge_info_mode;
+ MergeInfoCmMode merge_cm_mode;
SortMode merge_pheno_sort;
SortMode merge_info_sort;
uint32_t max_allele_ct;
@@ -119,6 +128,7 @@ typedef struct PmergeStruct {
char* pvar_fname;
char* psam_fname;
char* list_fname;
+ char* list_base_dir;
} PmergeInfo;
typedef struct PgenDiffStruct {
@@ -138,7 +148,7 @@ void InitPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
void CleanupPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
-PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, ImportFlags import_flags, SortMode sample_sort_mode, FamCol fam_cols, int32_t missing_pheno, uint32_t max_thread_ct, SortMode sort_vars_mode, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
+PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, int32_t missing_pheno, uint32_t max_thread_ct, SortMode sort_vars_mode, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const PgenDiffInfo* pdip, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t max_allele_ct1, uint32_t max_allele_slen, uint32_t max_thread_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
=====================================
plink2_misc.cc
=====================================
@@ -1305,7 +1305,9 @@ PglErr Plink1ClusterImport(const char* within_fname, const char* catpheno_name,
}
char* cat_name_start = cat_name_iter;
cat_name_iter = memcpya(cat_name_iter, main_token_start, main_token_slen + 1);
- cur_cat_idx = nonnull_cat_ct + 1;
+ // bugfix (5 May 2021): dropped this increment in mid-Jan refactor
+ ++nonnull_cat_ct;
+ cur_cat_idx = nonnull_cat_ct;
cur_cat_names[cur_cat_idx] = cat_name_start;
}
// permit duplicates if category is identical
@@ -6451,11 +6453,17 @@ PglErr SampleCounts(const uintptr_t* sample_include, const SampleIdInfo* siip, c
y_variant_ct = CountChrVariantsUnsafe(variant_include, cip, y_code);
}
uintptr_t haploid_mask_lowbit = cip->haploid_mask[0] & 1;
- if (male_ct) {
- uint32_t x_code;
- if ((!haploid_mask_lowbit) && XymtExists(cip, kChrOffsetX, &x_code)) {
+ // bugfix (28 Mar 2021): if chrX is present, no autosomal diploid
+ // chromosomes are present, and there are no males, we need to set the
+ // low bit of chr_types and increment diploid_chr_type_ct.
+ uint32_t x_code;
+ if ((!haploid_mask_lowbit) && XymtExists(cip, kChrOffsetX, &x_code)) {
+ if (male_ct) {
chr_types |= 2;
++diploid_chr_type_ct;
+ } else if (!chr_types) {
+ chr_types = 1;
+ ++diploid_chr_type_ct;
}
}
if (y_variant_ct) {
=====================================
plink2_psam.cc
=====================================
@@ -236,10 +236,6 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
snprintf(g_logbuf, kLogbufSize, "Error: IID column is not first or second in %s.\n", psamname);
goto LoadPsam_ret_MALFORMED_INPUT_WW;
}
- if (unlikely((psam_cols_mask & 2) && ((col_types[1] != 1) || (col_skips[1] != 1)))) {
- snprintf(g_logbuf, kLogbufSize, "Error: SID column does not immediately follow IID column in %s.\n", psamname);
- goto LoadPsam_ret_MALFORMED_INPUT_WW;
- }
if (unlikely(in_interval)) {
snprintf(g_logbuf, kLogbufSize, "Error: --pheno-name range is inconsistent with %s.\n", psamname);
goto LoadPsam_ret_INCONSISTENT_INPUT_WW;
@@ -248,6 +244,10 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
for (rpf_col_idx = relevant_postfid_col_ct - 1; rpf_col_idx; --rpf_col_idx) {
col_skips[rpf_col_idx] -= col_skips[rpf_col_idx - 1];
}
+ if (unlikely((psam_cols_mask & 2) && ((col_types[1] != 1) || (col_skips[1] != 1)))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: SID column does not immediately follow IID column in %s.\n", psamname);
+ goto LoadPsam_ret_MALFORMED_INPUT_WW;
+ }
line_iter = AdvPastDelim(linebuf_iter, '\n');
++line_idx;
@@ -787,9 +787,10 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
if (fid_present) {
piip->sii.flags |= kfSampleIdFidPresent;
}
- // special case: if there's exactly one phenotype and it's all-missing,
- // discard it
- if ((pheno_ct == 1) && (!PopcountWords(pheno_cols[0].nonmiss, raw_sample_ctl))) {
+ // special case: if there's exactly one phenotype, it has the default name,
+ // and it's all-missing, discard it. This removes forced .fam-derived and
+ // similar phenotype columns, and is unlikely to break anything else..
+ if ((pheno_ct == 1) && (!strcmp(*pheno_names_ptr, "PHENO1")) && (!PopcountWords(pheno_cols[0].nonmiss, raw_sample_ctl))) {
free(*pheno_names_ptr);
*pheno_names_ptr = nullptr;
CleanupPhenoCols(1, pheno_cols);
=====================================
plink2_pvar.cc
=====================================
@@ -450,7 +450,19 @@ void BackfillChrIdxs(const ChrInfo* cip, uint32_t chrs_encountered_m1, uint32_t
}
}
-char* PrInInfoToken(uint32_t info_slen, char* info_token) {
+uint32_t PrInInfo(uint32_t info_slen, char* info_token) {
+ if (memequal_k(info_token, "PR", 2) && ((info_slen == 2) || (info_token[2] == ';'))) {
+ return 1;
+ }
+ if (memequal_k(&(info_token[S_CAST(int32_t, info_slen) - 3]), ";PR", 3)) {
+ return 1;
+ }
+ info_token[info_slen] = '\0';
+ char* first_info_end = strchr(info_token, ';');
+ return first_info_end && (strstr(info_token, ";PR;") != nullptr);
+}
+
+char* InfoPrStart(uint32_t info_slen, char* info_token) {
if (memequal_k(info_token, "PR", 2) && ((info_slen == 2) || (info_token[2] == ';'))) {
return info_token;
}
=====================================
plink2_pvar.h
=====================================
@@ -82,10 +82,12 @@ void VaridTemplateInit(const char* varid_template_str, const char* missing_id_ma
char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, char* dst);
-// assumes info_token[-1] is safe to read
-// may set info_token[info_slen] to \0, since it needs to use strstr()
+// These functions assume info_token[-1] is safe to read
+// They may set info_token[info_slen] to \0, since they need to use strstr()
// (todo: try memmem()... except it isn't available on 32-bit mingw?)
-char* PrInInfoToken(uint32_t info_slen, char* info_token);
+uint32_t PrInInfo(uint32_t info_slen, char* info_token);
+
+char* InfoPrStart(uint32_t info_slen, char* info_token);
// cip, max_variant_id_slen, and info_reload are in/out parameters.
// Chromosome filtering is performed if cip requests it.
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/compare/cce84c08bbabda72d56f0ef78c4056523175bb52...d2773ed43c23f74d0db64432442fb669681849cf
--
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/compare/cce84c08bbabda72d56f0ef78c4056523175bb52...d2773ed43c23f74d0db64432442fb669681849cf
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210723/0fc74038/attachment-0001.htm>
More information about the debian-med-commit
mailing list