[med-svn] [Git][med-team/plink2][master] 7 commits: debian/watch: assume all upstream versions are prefixed with "2.00~a3-"
Michael R. Crusoe
gitlab at salsa.debian.org
Sun Jan 17 11:48:21 GMT 2021
Michael R. Crusoe pushed to branch master at Debian Med / plink2
Commits:
6c3e8051 by Michael R. Crusoe at 2021-01-17T10:00:13+01:00
debian/watch: assume all upstream versions are prefixed with "2.00~a3-"
- - - - -
75e93bdd by Michael R. Crusoe at 2021-01-17T10:00:43+01:00
routine-update: New upstream version
- - - - -
b3bdaa5c by Michael R. Crusoe at 2021-01-17T10:00:44+01:00
New upstream version 2.00~a3-210116+dfsg
- - - - -
4eebee14 by Michael R. Crusoe at 2021-01-17T10:00:46+01:00
Update upstream source from tag 'upstream/2.00_a3-210116+dfsg'
Update to upstream version '2.00~a3-210116+dfsg'
with Debian dir a233588a93ac5443b4ef3402db5e2134f1a40290
- - - - -
77ed2da0 by Michael R. Crusoe at 2021-01-17T10:00:51+01:00
Remove obsolete field Contact from debian/upstream/metadata (already present in machine-readable debian/copyright).
Changes-By: lintian-brush
- - - - -
471ec6e3 by Michael R. Crusoe at 2021-01-17T10:07:46+01:00
debian/patches/baseline: respect Debian's architecture baselines
- - - - -
dc83f84b by Michael R. Crusoe at 2021-01-17T12:46:39+01:00
routine-update: Ready to upload to unstable
- - - - -
21 changed files:
- Makefile
- debian/changelog
- + debian/patches/baseline
- debian/patches/series
- debian/upstream/metadata
- debian/watch
- plink2.cc
- plink2_cmdline.cc
- plink2_cmdline.h
- plink2_common.cc
- plink2_common.h
- plink2_data.cc
- plink2_data.h
- plink2_export.cc
- plink2_help.cc
- plink2_import.cc
- plink2_merge.cc
- plink2_merge.h
- plink2_misc.cc
- plink2_psam.cc
- plink2_psam.h
Changes:
=====================================
Makefile
=====================================
@@ -6,8 +6,8 @@
# Does not currently support -DCPU_CHECK_...
-BASEFLAGS=-g -DZSTD_MULTITHREAD -DSTATIC_ZSTD
-# BASEFLAGS=-g -mavx2 -mbmi -mbmi2 -mlzcnt -DZSTD_MULTITHREAD
+# BASEFLAGS=-g -DZSTD_MULTITHREAD -DSTATIC_ZSTD
+BASEFLAGS=-g -mavx2 -mbmi -mbmi2 -mlzcnt -DZSTD_MULTITHREAD
# BASEFLAGS=-g -msse4.2 -DZSTD_MULTITHREAD
include Makefile.src
=====================================
debian/changelog
=====================================
@@ -1,3 +1,14 @@
+plink2 (2.00~a3-210116+dfsg-1) unstable; urgency=medium
+
+ * Team upload.
+ * debian/watch: assume all upstream versions are prefixed with
+ "2.00~a3-"
+ * Remove obsolete field Contact from debian/upstream/metadata (already present
+ in machine-readable debian/copyright).
+ * debian/patches/baseline: respect Debian's architecture baselines
+
+ -- Michael R. Crusoe <crusoe at debian.org> Sun, 17 Jan 2021 10:00:51 +0100
+
plink2 (2.00~a3-210104+dfsg-1) unstable; urgency=medium
* Team upload.
=====================================
debian/patches/baseline
=====================================
@@ -0,0 +1,14 @@
+Author: Michael R. Crusoe <crusoe at debian.org>
+Description: respect the architecture baseline
+Forwarded: not-needed
+--- plink2.orig/Makefile
++++ plink2/Makefile
+@@ -7,7 +7,7 @@
+ # Does not currently support -DCPU_CHECK_...
+
+ # BASEFLAGS=-g -DZSTD_MULTITHREAD -DSTATIC_ZSTD
+-BASEFLAGS=-g -mavx2 -mbmi -mbmi2 -mlzcnt -DZSTD_MULTITHREAD
++BASEFLAGS=-g -DZSTD_MULTITHREAD
+ # BASEFLAGS=-g -msse4.2 -DZSTD_MULTITHREAD
+
+ include Makefile.src
=====================================
debian/patches/series
=====================================
@@ -1,2 +1,3 @@
Fix_Makefile.patch
use_packaged_libdeflate.patch
+baseline
=====================================
debian/upstream/metadata
=====================================
@@ -1,4 +1,3 @@
-Contact: Christopher Chang <chrchang at alumni.caltech.edu>
Name: plink1.9
Reference:
- Author: >
=====================================
debian/watch
=====================================
@@ -1,3 +1,3 @@
version=4
-opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g;s/(\d\S*)\-//,repack,compression=xz" \
+opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,uversionmangle=s/(\d\S+)/2.00~a3-$1/,repack,compression=xz" \
https://www.cog-genomics.org/plink/2.0/ /static/bin/plink2_src_(\d\S+)\.zip
=====================================
plink2.cc
=====================================
@@ -71,10 +71,10 @@ static const char ver_str[] = "PLINK v2.00a3"
#ifdef USE_MKL
" Intel"
#endif
- " (4 Jan 2021)";
+ " (16 Jan 2021)";
static const char ver_str2[] =
// include leading space if day < 10, so character length stays the same
- " "
+ ""
#ifndef LAPACK_ILP64
" "
#endif
@@ -286,8 +286,8 @@ typedef struct Plink2CmdlineStruct {
Command1Flags command_flags1;
PvarPsamFlags pvar_psam_flags;
- SortFlags sample_sort_flags;
- SortFlags sort_vars_flags;
+ SortMode sample_sort_mode;
+ SortMode sort_vars_mode;
GrmFlags grm_flags;
PcaFlags pca_flags;
WriteCovarFlags write_covar_flags;
@@ -1363,6 +1363,8 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
if (unlikely(reterr)) {
goto Plink2Core_ret_1;
}
+ // bugfix (16 Jan 2021)
+ pii.sii.flags |= kfSampleIdParentsPresent;
}
}
// --update-parents goes here
@@ -2502,11 +2504,11 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
BigstackReset(bigstack_mark_allele_ddosages);
uint32_t* new_sample_idx_to_old = nullptr;
- if (pcp->sample_sort_flags & (kfSortNatural | kfSortAscii | kfSortFile)) {
+ if (pcp->sample_sort_mode > kSortNone) {
if (sample_ct < 2) {
logerrputs("Warning: Skipping --sample-sort since <2 samples are present.\n");
} else {
- if (pcp->sample_sort_flags & kfSortFile) {
+ if (pcp->sample_sort_mode == kSortFile) {
reterr = SampleSortFileMap(sample_include, &pii.sii, pcp->sample_sort_fname, raw_sample_ct, sample_ct, &new_sample_idx_to_old);
if (unlikely(reterr)) {
goto Plink2Core_ret_1;
@@ -2517,7 +2519,7 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
// to spare here, so keep the code simpler for now
char* sorted_xidbox_tmp;
uintptr_t max_xid_blen;
- reterr = SortedXidboxInitAlloc(sample_include, &pii.sii, sample_ct, 0, pii.sii.sids? kfXidModeFidIidSid : kfXidModeFidIid, (pcp->sample_sort_flags == kfSortNatural), &sorted_xidbox_tmp, &new_sample_idx_to_old, &max_xid_blen);
+ reterr = SortedXidboxInitAlloc(sample_include, &pii.sii, sample_ct, 0, pii.sii.sids? kfXidModeFidIidSid : kfXidModeFidIid, (pcp->sample_sort_mode == kSortNatural), &sorted_xidbox_tmp, &new_sample_idx_to_old, &max_xid_blen);
if (unlikely(reterr)) {
goto Plink2Core_ret_1;
}
@@ -2544,8 +2546,8 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
if (pcp->command_flags1 & kfCommand1MakePlink2) {
// todo: unsorted case (--update-chr, etc.)
- if (pcp->sort_vars_flags != kfSort0) {
- reterr = MakePlink2Vsort(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, chr_idxs, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, make_plink2_flags, (pcp->sort_vars_flags == kfSortNatural), pcp->pvar_psam_flags, xheader, &simple_pgr, outname, outname_end);
+ if (pcp->sort_vars_mode > kSortNone) {
+ reterr = MakePlink2Vsort(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, chr_idxs, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, make_plink2_flags, (pcp->sort_vars_mode == kSortNatural), pcp->pvar_psam_flags, xheader, &simple_pgr, outname, outname_end);
} else {
if (vpos_sortstatus & kfUnsortedVarBp) {
logerrputs("Warning: Variants are not sorted by position. Consider rerunning with the\n--sort-vars flag added to remedy this.\n");
@@ -3530,8 +3532,8 @@ int main(int argc, char** argv) {
// uint64_t command_flags2 = 0;
pc.misc_flags = kfMisc0;
pc.pvar_psam_flags = kfPvarPsam0;
- pc.sample_sort_flags = kfSort0;
- pc.sort_vars_flags = kfSort0;
+ pc.sample_sort_mode = kSort0;
+ pc.sort_vars_mode = kSort0;
pc.grm_flags = kfGrm0;
pc.pca_flags = kfPca0;
pc.write_covar_flags = kfWriteCovar0;
@@ -5845,7 +5847,7 @@ int main(int argc, char** argv) {
case 'i':
if (strequal_k_unsafe(flagname_p2, "ndiv-sort")) {
- if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 3))) {
+ if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 2))) {
goto main_ret_INVALID_CMDLINE_2A;
}
const char* mode_str = argvk[arg_idx + 1];
@@ -5853,28 +5855,21 @@ int main(int argc, char** argv) {
const uint32_t mode_slen = strlen(mode_str);
if (strequal_k(mode_str, "0", mode_slen) ||
strequal_k(mode_str, "none", mode_slen)) {
- pc.sample_sort_flags = kfSortNone;
+ pc.sample_sort_mode = kSortNone;
} else if (((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
strequal_k(mode_str, "natural", mode_slen)) {
- pc.sample_sort_flags = kfSortNatural;
+ pc.sample_sort_mode = kSortNatural;
} else if (((mode_slen == 1) && (first_char_upcase_match == 'A')) ||
strequal_k(mode_str, "ascii", mode_slen)) {
- pc.sample_sort_flags = kfSortAscii;
+ pc.sample_sort_mode = kSortAscii;
} else if (likely(((mode_slen == 1) && (first_char_upcase_match == 'F')) ||
strequal_k(mode_str, "file", mode_slen))) {
if (unlikely(param_ct == 1)) {
snprintf(g_logbuf, kLogbufSize, "Error: Missing '--indiv-sort %s' filename.\n", mode_str);
goto main_ret_INVALID_CMDLINE_2A;
}
- pc.sample_sort_flags = kfSortFile;
- uint32_t fname_modif_idx = 2;
- if (param_ct == 3) {
- if (unlikely(CheckExtraParam(&(argvk[arg_idx]), "sid", &fname_modif_idx))) {
- goto main_ret_INVALID_CMDLINE_A;
- }
- pc.sample_sort_flags |= kfSortFileSid;
- }
- reterr = AllocFname(argvk[arg_idx + fname_modif_idx], flagname_p, 0, &pc.sample_sort_fname);
+ pc.sample_sort_mode = kSortFile;
+ reterr = AllocFname(argvk[arg_idx + 2], flagname_p, 0, &pc.sample_sort_fname);
if (unlikely(reterr)) {
goto main_ret_1;
}
@@ -5882,7 +5877,7 @@ int main(int argc, char** argv) {
snprintf(g_logbuf, kLogbufSize, "Error: '%s' is not a valid mode for --indiv-sort.\n", mode_str);
goto main_ret_INVALID_CMDLINE_WWA;
}
- if (unlikely((param_ct > 1) && (!(pc.sample_sort_flags & kfSortFile)))) {
+ if (unlikely((param_ct > 1) && (pc.sample_sort_mode != kSortFile))) {
snprintf(g_logbuf, kLogbufSize, "Error: '--indiv-sort %s' does not accept additional arguments.\n", mode_str);
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -7844,24 +7839,31 @@ int main(int argc, char** argv) {
snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-filter-mode argument '%s'.\n", cur_modif);
goto main_ret_INVALID_CMDLINE_WWA;
}
- } else if (strequal_k_unsafe(flagname_p2, "erge-info-sort")) {
+ } else if (strequal_k_unsafe(flagname_p2, "erge-info-sort") ||
+ strequal_k_unsafe(flagname_p2, "erge-pheno-sort")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
goto main_ret_INVALID_CMDLINE_2A;
}
const char* mode_str = argvk[arg_idx + 1];
const char first_char_upcase_match = mode_str[0] & 0xdf;
const uint32_t mode_slen = strlen(mode_str);
- if (strequal_k(mode_str, "0", mode_slen) ||
- strequal_k(mode_str, "none", mode_slen)) {
- pmerge_info.merge_info_sort = kfSortNone;
+ SortMode sort_mode = kSortNone;
+ if (((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
+ strequal_k(mode_str, "natural", mode_slen)) {
+ sort_mode = kSortNatural;
} else if (((mode_slen == 1) && (first_char_upcase_match == 'A')) ||
strequal_k(mode_str, "ascii", mode_slen)) {
- pmerge_info.merge_info_sort = kfSortAscii;
- } else if (unlikely(!(((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
- strequal_k(mode_str, "natural", mode_slen)))) {
- snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-info-sort argument '%s'.\n", mode_str);
+ sort_mode = kSortAscii;
+ } else if (unlikely(!(strequal_k(mode_str, "0", mode_slen) ||
+ strequal_k(mode_str, "none", mode_slen)))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s argument '%s'.\n", flagname_p, mode_str);
goto main_ret_INVALID_CMDLINE_WWA;
}
+ if (flagname_p2[5] == 'i') {
+ pmerge_info.merge_info_sort = sort_mode;
+ } else {
+ pmerge_info.merge_pheno_sort = sort_mode;
+ }
} else if (strequal_k_unsafe(flagname_p2, "erge-max-allele-ct")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
goto main_ret_INVALID_CMDLINE_2A;
@@ -8467,7 +8469,6 @@ int main(int argc, char** argv) {
}
}
pc.command_flags1 |= kfCommand1Pmerge;
- pc.dependency_flags |= kfFilterAllReq;
} else if (strequal_k_unsafe(flagname_p2, "merge-list")) {
if (pc.command_flags1 & kfCommand1Pmerge) {
logerrputs("Error: --pmerge-list cannot be used with --pmerge.\n");
@@ -8518,7 +8519,6 @@ int main(int argc, char** argv) {
goto main_ret_1;
}
pc.command_flags1 |= kfCommand1Pmerge;
- pc.dependency_flags |= kfFilterAllReq;
} else if (strequal_k_unsafe(flagname_p2, "gen-diff")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 7))) {
goto main_ret_INVALID_CMDLINE_2A;
@@ -9328,16 +9328,16 @@ int main(int argc, char** argv) {
const uint32_t mode_slen = strlen(mode_str);
if (((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
strequal_k(mode_str, "natural", mode_slen)) {
- pc.sort_vars_flags = kfSortNatural;
+ pc.sort_vars_mode = kSortNatural;
} else if (likely(((mode_slen == 1) && (first_char_upcase_match == 'A')) ||
strequal_k(mode_str, "ascii", mode_slen))) {
- pc.sort_vars_flags = kfSortAscii;
+ pc.sort_vars_mode = kSortAscii;
} else {
snprintf(g_logbuf, kLogbufSize, "Error: '%s' is not a valid mode for --sort-vars.\n", mode_str);
goto main_ret_INVALID_CMDLINE_WWA;
}
} else {
- pc.sort_vars_flags = kfSortNatural;
+ pc.sort_vars_mode = kSortNatural;
}
} else if (strequal_k_unsafe(flagname_p2, "ample-diff")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 0x7fffffff))) {
@@ -10303,7 +10303,7 @@ int main(int argc, char** argv) {
logerrputs("Error: --data/--sample cannot be used with --1.\n");
goto main_ret_INVALID_CMDLINE_A;
}
- if (unlikely((pc.sample_sort_flags != kfSort0) && (!(pc.command_flags1 & (kfCommand1MakePlink2 | kfCommand1WriteCovar | kfCommand1Pmerge))))) {
+ if (unlikely((pc.sample_sort_mode != kSort0) && (!(pc.command_flags1 & (kfCommand1MakePlink2 | kfCommand1WriteCovar | kfCommand1Pmerge))))) {
logerrputs("Error: --indiv-sort must be used with --make-[b]pgen/--make-bed/--write-covar\nor dataset merging.\n");
goto main_ret_INVALID_CMDLINE_A;
}
@@ -10332,9 +10332,6 @@ int main(int argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
}
- if ((pc.command_flags1 & kfCommand1Pmerge) && (pc.command_flags1 & (~kfCommand1Pmerge))) {
- ;;;
- }
if (pc.keep_cat_phenoname && (!pc.keep_cat_names_flattened) && (!pc.keep_cats_fname)) {
logerrputs("Error: --keep-cat-pheno must be used with --keep-cats and/or --keep-cat-names.\n");
}
@@ -10529,10 +10526,10 @@ int main(int argc, char** argv) {
// section; otherwise only do it in "--keep-autoconv vzs" case.
uint32_t pvar_is_compressed;
if (xload & (kfXloadVcf | kfXloadBcf)) {
- const uint32_t no_samples_ok = !(pc.dependency_flags & (kfFilterAllReq | kfFilterPsamReq));
+ const uint32_t no_samples_ok = !((pc.dependency_flags & (kfFilterAllReq | kfFilterPsamReq)) || (pc.command_flags1 & kfCommand1Pmerge));
const uint32_t is_vcf = (xload / kfXloadVcf) & 1;
pvar_is_compressed = ((import_flags & (kfImportKeepAutoconv | kfImportKeepAutoconvVzs)) != kfImportKeepAutoconv);
- if (no_samples_ok && is_vcf && (!(import_flags & kfImportKeepAutoconv)) && pc.command_flags1 && (!(pc.command_flags1 & kfCommand1Pmerge))) {
+ if (no_samples_ok && is_vcf && (!(import_flags & kfImportKeepAutoconv)) && pc.command_flags1) {
// special case: just treat the VCF as a .pvar file
strcpy(pvarname, pgenname);
pgenname[0] = '\0';
@@ -10608,7 +10605,12 @@ int main(int argc, char** argv) {
if ((pc.dependency_flags & kfFilterOpportunisticPgen) && (pgenname[0] != '\0')) {
pc.dependency_flags |= kfFilterAllReq;
}
- if (pc.dependency_flags & kfFilterAllReq) {
+ if (pmerge_info.list_fname) {
+ if (unlikely((!xload) && (load_params != kfLoadParamsPfileAll) && (load_params!= kfLoadParams0))) {
+ logerrputs("Error: --pmerge-list cannot be used with a proper subset of {--pgen, --pvar,\n--psam}. You must specify either none or all of those filenames.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
+ } else if ((pc.dependency_flags & kfFilterAllReq) || (pc.command_flags1 & kfCommand1Pmerge)) {
if (unlikely((!xload) && (load_params != kfLoadParamsPfileAll))) {
logerrputs("Error: A full fileset (.pgen/.bed + .pvar/.bim + .psam/.fam) is required for\nthis.\n");
goto main_ret_INVALID_CMDLINE_A;
@@ -10642,7 +10644,7 @@ int main(int argc, char** argv) {
if (make_plink2_flags & (kfMakePgen | kfMakePvar | kfMakePsam)) {
merge_outname_end = strcpya_k(merge_outname_end, "-merge");
}
- reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, pc.sample_sort_flags, pc.fam_cols, pc.max_thread_ct, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
+ reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, pc.sample_sort_mode, pc.fam_cols, pc.max_thread_ct, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
if (unlikely(reterr)) {
goto main_ret_1;
}
@@ -10652,7 +10654,7 @@ int main(int argc, char** argv) {
}
}
- if ((pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar | kfCommand1WriteSamples))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (pc.sort_vars_flags == kfSort0))) {
+ if ((pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar | kfCommand1WriteSamples))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (pc.sort_vars_mode > kSortNone))) {
// split-chromosome prohibited for all commands unless explicitly
// permitted here
pc.dependency_flags |= kfFilterNoSplitChr;
=====================================
plink2_cmdline.cc
=====================================
@@ -1393,33 +1393,6 @@ BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t**
return 0;
}
-uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
- // may want subset_mask parameter later
- SetAllU32Arr(str_htable_size, str_htable);
- const char* strbox_iter = strbox;
- for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
- const uint32_t slen = strlen(strbox_iter);
- // previously used quadratic probing, but turns out that that isn't
- // meaningfully better than linear probing.
- for (uint32_t hashval = Hashceil(strbox_iter, slen, str_htable_size); ; ) {
- const uint32_t cur_htable_entry = str_htable[hashval];
- if (cur_htable_entry == UINT32_MAX) {
- str_htable[hashval] = str_idx;
- break;
- }
- if (memequal(strbox_iter, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
- // guaranteed to be positive
- return str_idx;
- }
- if (++hashval == str_htable_size) {
- hashval = 0;
- }
- }
- strbox_iter = &(strbox_iter[max_str_blen]);
- }
- return 0;
-}
-
// could merge this with non-subset case, but this isn't much code
/*
uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
@@ -1476,6 +1449,38 @@ uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const
}
}
+uint32_t IdHtableAdd(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+ for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+ const uint32_t cur_htable_entry = id_htable[hashval];
+ if (cur_htable_entry == UINT32_MAX) {
+ id_htable[hashval] = value;
+ return UINT32_MAX;
+ }
+ if (memequal(cur_id, item_ids[cur_htable_entry], cur_id_slen + 1)) {
+ return cur_htable_entry;
+ }
+ if (++hashval == id_htable_size) {
+ hashval = 0;
+ }
+ }
+}
+
+uint32_t IdHtableAddNnt(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+ for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+ const uint32_t cur_htable_entry = id_htable[hashval];
+ if (cur_htable_entry == UINT32_MAX) {
+ id_htable[hashval] = value;
+ return UINT32_MAX;
+ }
+ if (strequal_unsafe(item_ids[cur_htable_entry], cur_id, cur_id_slen)) {
+ return cur_htable_entry;
+ }
+ if (++hashval == id_htable_size) {
+ hashval = 0;
+ }
+ }
+}
+
// assumes cur_id_slen < max_str_blen.
// requires cur_id to be null-terminated.
uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size) {
@@ -1491,6 +1496,37 @@ uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t
}
}
+uint32_t StrboxHtableAdd(const char* cur_id, const char* strbox, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+ for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+ const uint32_t cur_htable_entry = id_htable[hashval];
+ if (cur_htable_entry == UINT32_MAX) {
+ id_htable[hashval] = value;
+ return UINT32_MAX;
+ }
+ if (memequal(cur_id, &(strbox[cur_htable_entry * max_str_blen]), cur_id_slen + 1)) {
+ return cur_htable_entry;
+ }
+ if (++hashval == id_htable_size) {
+ hashval = 0;
+ }
+ }
+}
+
+uint32_t PopulateStrboxHtable(const char* strbox, uint32_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
+ // may want subset_mask parameter later
+ SetAllU32Arr(str_htable_size, str_htable);
+ const char* strbox_iter = strbox;
+ for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
+ const uint32_t slen = strlen(strbox_iter);
+ if (StrboxHtableAdd(strbox_iter, strbox, max_str_blen, slen, str_htable_size, str_idx, str_htable) != UINT32_MAX) {
+ // guaranteed to be positive
+ return str_idx;
+ }
+ strbox_iter = &(strbox_iter[max_str_blen]);
+ }
+ return 0;
+}
+
uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen) {
// assumes duplicate variant IDs are flagged, but full variant_uidx linked
// lists are not stored
@@ -3071,7 +3107,6 @@ PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* cons
return PopulateIdHtableMt(g_bigstack_end, subset_mask, item_ids, item_ct, store_all_dups, id_htable_size, max_thread_ct, &g_bigstack_base, *id_htable_ptr, dup_ct_ptr);
}
-
uint32_t Edit1Match(const char* s1, const char* s2, uint32_t len1, uint32_t len2) {
// Permit one difference of the following forms (Damerau-Levenshtein distance
// 1):
=====================================
plink2_cmdline.h
=====================================
@@ -957,10 +957,19 @@ HEADER_INLINE BoolErr arena_alloc_cp(unsigned char* arena_top, uintptr_t ct, uns
return !(*cp_arr_ptr);
}
+HEADER_INLINE BoolErr arena_alloc_kcp(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, const char*** kcp_arr_ptr) {
+ *kcp_arr_ptr = S_CAST(const char**, arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr));
+ return !(*kcp_arr_ptr);
+}
+
HEADER_INLINE void ArenaBaseSet(const void* unaligned_base, unsigned char** arena_bottom_ptr) {
*arena_bottom_ptr = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, unaligned_base), kCacheline));
}
+HEADER_INLINE void ArenaEndSet(const void* unaligned_end, unsigned char** arena_top_ptr) {
+ *arena_top_ptr = R_CAST(unsigned char*, RoundDownPow2(R_CAST(uintptr_t, unaligned_end), kEndAllocAlign));
+}
+
HEADER_INLINE void* arena_end_alloc_raw(uintptr_t size, unsigned char** arena_top_ptr) {
assert(!(size % kEndAllocAlign));
unsigned char* alloc_ptr = *arena_top_ptr;
@@ -1028,6 +1037,11 @@ HEADER_INLINE BoolErr arena_end_alloc_cp(unsigned char* arena_bottom, uintptr_t
return !(*cp_arr_ptr);
}
+HEADER_INLINE BoolErr arena_end_alloc_kcp(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, const char*** kcp_arr_ptr) {
+ *kcp_arr_ptr = S_CAST(const char**, arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr));
+ return !(*kcp_arr_ptr);
+}
+
BoolErr PushLlStr(const char* str, LlStr** ll_stack_ptr);
@@ -1337,6 +1351,29 @@ HEADER_INLINE uint32_t Hashceil(const char* idstr, uint32_t idlen, uint32_t htab
return (S_CAST(uint64_t, Hash32(idstr, idlen)) * htable_size) >> 32;
}
+// In most cases, plink2 represents an array of strings in one of the following
+// two ways:
+// (const char* strbox, uint32_t str_ct, uintptr_t max_str_blen):
+// null-terminated string #x starts at &(strbox[x * max_str_blen)), where
+// x is a 0-based index.
+// (const char* const* item_ids, uint32_t str_ct):
+// null-terminated string #x starts at item_ids[x].
+// When we need to perform string -> string-index lookups into the array, we
+// construct a hashmap as follows:
+// - Allocate a uint32_t array of htable_size >= 2 * str_ct, and initialize all
+// entries to UINT32_MAX to mark them empty.
+// - Iterate through the array, computing hashval := Hashceil(str, strlen(str),
+// htable_size) for each string, and then setting htable[hashval] :=
+// string-index whenever that htable entry is empty. When there is a
+// conflict, use linear probing (increment hashval until an empty table cell
+// is found, wrapping around from (htable_size - 1) to 0).
+// - PLINK 1.9 used quadratic probing, but that's been scrapped since
+// benchmark results suggest that it has no meaningful advantage. We
+// aren't dealing with adversarial input...
+// The "StrboxHtable" functions below work with the const char* strbox
+// string-array representation, while "IdHtable" functions work with the const
+// char* const* item_ids representation.
+
// uintptr_t geqprime(uintptr_t floor);
// assumes ceil is odd and greater than 4. Returns the first prime <= ceil.
@@ -1358,10 +1395,6 @@ HEADER_INLINE uint32_t GetHtableFastSize(uint32_t item_ct) {
BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr);
-// useful for duplicate detection: returns 0 on no duplicates, a positive index
-// of a duplicate pair if they're present
-uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
-
// returned index in duplicate-pair case is unfiltered
// uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
@@ -1372,10 +1405,41 @@ uint32_t IdHtableFind(const char* cur_id, const char* const* item_ids, const uin
// null-terminated any more.
uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size);
+HEADER_INLINE void HtableAddNondup(const char* cur_id, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+ for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+ const uint32_t cur_htable_entry = id_htable[hashval];
+ if (cur_htable_entry == UINT32_MAX) {
+ id_htable[hashval] = value;
+ return;
+ }
+ if (++hashval == id_htable_size) {
+ hashval = 0;
+ }
+ }
+}
+
+// Assumes cur_id is null-terminated.
+// item_ids overread must be ok.
+// Returns string-index if cur_id is already in the table, UINT32_MAX if it was
+// added.
+uint32_t IdHtableAdd(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable);
+
+// Does not require cur_id to be null-terminated.
+uint32_t IdHtableAddNnt(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable);
+
// assumes cur_id_slen < max_str_blen.
// requires cur_id to be null-terminated.
uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size);
+// Assumes cur_id is null-terminated.
+// Returns string-index if cur_id is already in the table, UINT32_MAX if it was
+// added.
+uint32_t StrboxHtableAdd(const char* cur_id, const char* strbox, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable);
+
+// useful for duplicate detection: returns 0 on no duplicates, a positive index
+// of a duplicate pair if they're present
+uint32_t PopulateStrboxHtable(const char* strbox, uint32_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
+
// last variant_ids entry must be at least kMaxIdBlen bytes before end of
// bigstack
uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen);
@@ -1703,7 +1767,6 @@ PglErr CheckIdUniqueness(unsigned char* arena_bottom, unsigned char* arena_top,
// not doing that again).
PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uintptr_t fast_size_min_extra_bytes, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr, uint32_t* dup_ct_ptr);
-
typedef struct HelpCtrlStruct {
NONCOPYABLE(HelpCtrlStruct);
uint32_t iters_left;
=====================================
plink2_common.cc
=====================================
@@ -1794,15 +1794,8 @@ PglErr TryToAddChrName(const char* chr_name, const char* file_descrip, uintptr_t
*chr_idx_ptr = chr_code_end;
cip->name_ct = name_ct + 1;
uint32_t* id_htable = cip->nonstd_id_htable;
- for (uint32_t hashval = Hashceil(chr_name, name_slen, kChrHtableSize); ; ) {
- if (id_htable[hashval] == UINT32_MAX) {
- id_htable[hashval] = chr_code_end;
- return kPglRetSuccess;
- }
- if (++hashval == kChrHtableSize) {
- hashval = 0;
- }
- }
+ HtableAddNondup(chr_name, name_slen, kChrHtableSize, chr_code_end, id_htable);
+ return kPglRetSuccess;
}
@@ -3037,6 +3030,115 @@ void ExpandMhc(uint32_t sample_ct, uintptr_t* mhc, uintptr_t** patch_01_set_ptr,
*patch_10_vals_ptr = R_CAST(AlleleCode*, &(patch_10_set[sample_ctl]));
}
+// Returns 2 if resize needed (str_ct == strset_table_size / 2), 1 if OOM
+uint32_t StrsetAdd(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_bottom_ptr) {
+ for (uint32_t hashval = Hashceil(src, slen, strset_table_size); ; ) {
+ char* strset_entry = strset[hashval];
+ if (!strset_entry) {
+ const uint32_t str_ct = 1 + (*str_ctp);
+ if (unlikely(str_ct * 2 > strset_table_size)) {
+ return 2;
+ }
+ *str_ctp = str_ct;
+ return S_CAST(uint32_t, StoreStringAtBase(arena_top, src, slen, arena_bottom_ptr, &(strset[hashval])));
+ }
+ if (strequal_unsafe(strset_entry, src, slen)) {
+ return 0;
+ }
+ if (++hashval == strset_table_size) {
+ hashval = 0;
+ }
+ }
+}
+
+uint32_t StrsetAddEnd(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_top_ptr) {
+ for (uint32_t hashval = Hashceil(src, slen, strset_table_size); ; ) {
+ char* strset_entry = strset[hashval];
+ if (!strset_entry) {
+ const uint32_t str_ct = 1 + (*str_ctp);
+ if (unlikely(str_ct * 2 > strset_table_size)) {
+ return 2;
+ }
+ *str_ctp = str_ct;
+ return S_CAST(uint32_t, StoreStringAtEnd(arena_bottom, src, slen, arena_top_ptr, &(strset[hashval])));
+ }
+ if (strequal_unsafe(strset_entry, src, slen)) {
+ return 0;
+ }
+ if (++hashval == strset_table_size) {
+ hashval = 0;
+ }
+ }
+}
+
+void RepopulateStrset(char* str_iter, uint32_t str_ct, uint32_t strset_size, char** strset) {
+ ZeroPtrArr(strset_size, strset);
+ for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
+ char* str_end = strnul(str_iter);
+ const uint32_t slen = str_end - str_iter;
+ for (uint32_t hashval = Hashceil(str_iter, slen, strset_size); ; ) {
+ char* strset_entry = strset[hashval];
+ if (!strset_entry) {
+ strset[hashval] = str_iter;
+ break;
+ }
+ if (++hashval == strset_size) {
+ hashval = 0;
+ }
+ }
+ str_iter = &(str_end[1]);
+ }
+}
+
+BoolErr StrsetAddResize(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size_max, char** strset, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_bottom_ptr) {
+ uint32_t strset_table_size = *strset_table_sizep;
+ uint32_t retval = StrsetAdd(arena_top, src, slen, strset_table_size, strset, str_ctp, arena_bottom_ptr);
+ if (!retval) {
+ return 0;
+ }
+ if (unlikely((retval == 1) || (strset_table_size == strset_table_size_max))) {
+ return 1;
+ }
+ const uint32_t new_table_size = MINV(strset_table_size * 2LLU, strset_table_size_max);
+ const uint64_t bytes_needed = sizeof(intptr_t) * (new_table_size - strset_table_size);
+ if (S_CAST(uintptr_t, arena_top - (*arena_bottom_ptr)) < bytes_needed) {
+ return 1;
+ }
+ unsigned char* old_str_base = R_CAST(unsigned char*, &(strset[strset_table_size]));
+ unsigned char* new_str_base = R_CAST(unsigned char*, &(strset[new_table_size]));
+ memmove(new_str_base, old_str_base, (*arena_bottom_ptr) - old_str_base);
+ *arena_bottom_ptr += bytes_needed;
+ RepopulateStrset(R_CAST(char*, new_str_base), *str_ctp, new_table_size, strset);
+ *strset_table_sizep = new_table_size;
+ return (StrsetAdd(arena_top, src, slen, new_table_size, strset, str_ctp, arena_bottom_ptr) != 0);
+}
+
+BoolErr StrsetAddEndResize(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size_max, char*** strsetp, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_top_ptr) {
+ char** strset = *strsetp;
+ uint32_t strset_table_size = *strset_table_sizep;
+ uint32_t retval = StrsetAddEnd(arena_bottom, src, slen, strset_table_size, strset, str_ctp, arena_top_ptr);
+ if (!retval) {
+ return 0;
+ }
+ if (unlikely((retval == 1) || (strset_table_size == strset_table_size_max))) {
+ return 1;
+ }
+ const uint32_t new_table_size = MINV(strset_table_size * 2LLU, strset_table_size_max);
+ const uint64_t bytes_needed = sizeof(intptr_t) * (new_table_size - strset_table_size);
+ unsigned char* old_str_base = *arena_top_ptr;
+ if (unlikely(S_CAST(uintptr_t, old_str_base - arena_bottom) < bytes_needed)) {
+ return 1;
+ }
+ unsigned char* new_str_base = old_str_base - bytes_needed;
+ memmove(new_str_base, old_str_base, R_CAST(unsigned char*, strset) - old_str_base);
+ *arena_top_ptr = new_str_base;
+ strset -= new_table_size - strset_table_size;
+ *strsetp = strset;
+ RepopulateStrset(R_CAST(char*, new_str_base), *str_ctp, new_table_size, strset);
+ *strset_table_sizep = new_table_size;
+ return (StrsetAddEnd(arena_bottom, src, slen, new_table_size, strset, str_ctp, arena_top_ptr) != 0);
+}
+
PglErr WriteSampleIdsOverride(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct, SampleIdFlags override_flags) {
FILE* outfile = nullptr;
PglErr reterr = kPglRetSuccess;
=====================================
plink2_common.h
=====================================
@@ -226,6 +226,7 @@ FLAGSET_DEF_START()
kfPsamColAll = ((kfPsamColPhenos * 2) - kfPsamColMaybefid)
FLAGSET_DEF_END(PvarPsamFlags);
+// may want to rename FidPresent to FidMayBePresent
FLAGSET_DEF_START()
kfSampleId0,
kfSampleIdFidPresent = (1 << 0),
@@ -985,6 +986,8 @@ typedef struct PhenoColStruct {
// * When .sample non-V2 categorical variables are imported, 'C' is added in
// front of the integers.
// * category_names[1..(n-1)] is guaranteed to be in natural-sorted order.
+ // (MergePsams() has slightly different behavior since its pheno_cols
+ // instance is only used for a one-time internal write.)
const char** category_names;
uintptr_t* nonmiss; // bitvector
@@ -1109,8 +1112,25 @@ HEADER_INLINE BoolErr StoreStringAndPrecharAtEnd(unsigned char* arena_bottom, co
return 0;
}
+// Memory layout:
+// [ strset ][packed strings]
+// ^
+// |
+// arena_bottom
+// strset pointers do not remain valid after resize.
+
+// returns 1 if resize needed, 2 if OOM
+// uint32_t StrsetAdd(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_bottom_ptr);
+
+// uint32_t StrsetAddEnd(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_top_ptr);
+
+BoolErr StrsetAddResize(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size_max, char** strset, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_bottom_ptr);
+
+BoolErr StrsetAddEndResize(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size_max, char*** strsetp, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_top_ptr);
+
// These use g_textbuf.
PglErr WriteSampleIdsOverride(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct, SampleIdFlags override_flags);
+
HEADER_INLINE PglErr WriteSampleIds(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct) {
return WriteSampleIdsOverride(sample_include, siip, outname, sample_ct, siip->flags);
}
=====================================
plink2_data.cc
=====================================
@@ -928,17 +928,17 @@ uint32_t DataSidColIsRequired(const uintptr_t* sample_include, const char* sids,
return 0;
}
-uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const PedigreeIdInfo* piip, uint32_t sample_ct, uint32_t maybe_modifier) {
+uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, uint32_t sample_ct, uint32_t maybe_modifier) {
if (maybe_modifier & 2) {
return 1;
}
- if ((!(maybe_modifier & 1)) || (!(piip->sii.flags & kfSampleIdParentsPresent))) {
+ if ((!(maybe_modifier & 1)) || (!(siip->flags & kfSampleIdParentsPresent))) {
return 0;
}
- const char* paternal_ids = piip->parental_id_info.paternal_ids;
- const char* maternal_ids = piip->parental_id_info.maternal_ids;
- const uintptr_t max_paternal_id_blen = piip->parental_id_info.max_paternal_id_blen;
- const uintptr_t max_maternal_id_blen = piip->parental_id_info.max_maternal_id_blen;
+ const char* paternal_ids = parental_id_infop->paternal_ids;
+ const char* maternal_ids = parental_id_infop->maternal_ids;
+ const uintptr_t max_paternal_id_blen = parental_id_infop->max_paternal_id_blen;
+ const uintptr_t max_maternal_id_blen = parental_id_infop->max_maternal_id_blen;
uintptr_t sample_uidx_base = 0;
uintptr_t cur_bits = sample_include[0];
for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
@@ -966,29 +966,28 @@ char* AppendPhenoStr(const PhenoCol* pheno_col, const char* output_missing_pheno
return write_iter;
}
-PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, PvarPsamFlags pvar_psam_flags) {
+PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const char* output_missing_pheno, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, PvarPsamFlags pvar_psam_flags) {
FILE* outfile = nullptr;
PglErr reterr = kPglRetSuccess;
{
if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
goto WritePsam_ret_OPEN_FAIL;
}
- const char* output_missing_pheno = g_output_missing_pheno;
const uint32_t omp_slen = strlen(output_missing_pheno);
char* textbuf_flush = &(g_textbuf[kMaxMediumLine]);
- const char* sample_ids = piip->sii.sample_ids;
- const char* sids = piip->sii.sids;
- const char* paternal_ids = piip->parental_id_info.paternal_ids;
- const char* maternal_ids = piip->parental_id_info.maternal_ids;
- const uintptr_t max_sample_id_blen = piip->sii.max_sample_id_blen;
- const uintptr_t max_sid_blen = piip->sii.max_sid_blen;
- const uintptr_t max_paternal_id_blen = piip->parental_id_info.max_paternal_id_blen;
- const uintptr_t max_maternal_id_blen = piip->parental_id_info.max_maternal_id_blen;
- const uint32_t write_fid = DataFidColIsRequired(sample_include, &(piip->sii), sample_ct, pvar_psam_flags / kfPsamColMaybefid);
+ const char* sample_ids = siip->sample_ids;
+ const char* sids = siip->sids;
+ const char* paternal_ids = parental_id_infop->paternal_ids;
+ const char* maternal_ids = parental_id_infop->maternal_ids;
+ const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
+ const uintptr_t max_sid_blen = siip->max_sid_blen;
+ const uintptr_t max_paternal_id_blen = parental_id_infop->max_paternal_id_blen;
+ const uintptr_t max_maternal_id_blen = parental_id_infop->max_maternal_id_blen;
+ const uint32_t write_fid = DataFidColIsRequired(sample_include, siip, sample_ct, pvar_psam_flags / kfPsamColMaybefid);
const uint32_t write_sid = DataSidColIsRequired(sample_include, sids, sample_ct, max_sid_blen, pvar_psam_flags / kfPsamColMaybesid);
- const uint32_t write_parents = DataParentalColsAreRequired(sample_include, piip, sample_ct, pvar_psam_flags / kfPsamColMaybeparents);
+ const uint32_t write_parents = DataParentalColsAreRequired(sample_include, siip, parental_id_infop, sample_ct, pvar_psam_flags / kfPsamColMaybeparents);
const uint32_t write_sex = (pvar_psam_flags / kfPsamColSex) & 1;
const uint32_t write_empty_pheno = (pvar_psam_flags & kfPsamColPheno1) && (!pheno_ct);
const uint32_t write_phenos = (pvar_psam_flags & (kfPsamColPheno1 | kfPsamColPhenos)) && pheno_ct;
@@ -4184,6 +4183,7 @@ PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filte
// while still being small relative to L1 cache. Double table size
// whenever load factor reaches 0.25; there shouldn't be *that* many
// distinct filters.
+ // TODO: check if this should use strset instead
// possible todo: multithread this scan, merge results at the end; can also
// separate this stage from the rest of the function.
uint32_t table_size = 128;
@@ -4254,7 +4254,7 @@ PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filte
}
break;
}
- if ((!memcmp(filter_iter, cur_token_ptr, cur_id_slen)) && (!cur_token_ptr[cur_id_slen])) {
+ if (strequal_unsafe(filter_iter, cur_token_ptr, cur_id_slen)) {
break;
}
if (++hashval == table_size) {
@@ -4292,14 +4292,7 @@ PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filte
uint32_t* filter_keys_htable = *filter_keys_htable_ptr;
SetAllU32Arr(filter_keys_htable_size, filter_keys_htable);
for (uint32_t uii = 0; uii != filter_key_ct; ++uii) {
- for (uint32_t hashval = Hashceil(filter_keys[uii], strlen(filter_keys[uii]), filter_keys_htable_size); ; ) {
- if (filter_keys_htable[hashval] == UINT32_MAX) {
- filter_keys_htable[hashval] = uii;
- }
- if (++hashval == filter_keys_htable_size) {
- hashval = 0;
- }
- }
+ HtableAddNondup(filter_keys[uii], strlen(filter_keys[uii]), filter_keys_htable_size, uii, filter_keys_htable);
}
}
while (0) {
@@ -6765,7 +6758,7 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
logprintfww5("Writing %s ... ", outname);
fflush(stdout);
- reterr = WritePsam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
+ reterr = WritePsam(outname, sample_include, &(piip->sii), &(piip->parental_id_info), sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, g_output_missing_pheno, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
if (unlikely(reterr)) {
goto MakePlink2NoVsort_ret_1;
}
@@ -8218,7 +8211,7 @@ PglErr MakePlink2Vsort(const uintptr_t* sample_include, const PedigreeIdInfo* pi
snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
logprintfww5("Writing %s ... ", outname);
fflush(stdout);
- reterr = WritePsam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
+ reterr = WritePsam(outname, sample_include, &(piip->sii), &(piip->parental_id_info), sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, g_output_missing_pheno, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
if (unlikely(reterr)) {
goto MakePlink2Vsort_ret_1;
}
=====================================
plink2_data.h
=====================================
@@ -63,6 +63,8 @@ CONSTI32(kMaxInfoKeySlen, kMaxIdSlen);
PglErr WriteMapOrBim(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const double* variant_cms, uint32_t variant_ct, uint32_t max_allele_slen, char delim, uint32_t output_zst, uint32_t thread_ct);
+PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const char* output_missing_pheno, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, PvarPsamFlags pvar_psam_flags);
+
PglErr PvarInfoOpenAndReloadHeader(const char* pvar_info_reload, uint32_t max_thread_ct, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* info_col_idx_ptr);
PglErr PvarInfoReload(uint32_t info_col_idx, uint32_t variant_uidx, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* trs_variant_uidx_ptr);
@@ -83,7 +85,7 @@ uint32_t DataFidColIsRequired(const uintptr_t* sample_include, const SampleIdInf
uint32_t DataSidColIsRequired(const uintptr_t* sample_include, const char* sids, uint32_t sample_ct, uint32_t max_sid_blen, uint32_t maybe_modifier);
-uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const PedigreeIdInfo* piip, uint32_t sample_ct, uint32_t maybe_modifier);
+uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, uint32_t sample_ct, uint32_t maybe_modifier);
char* AppendPhenoStr(const PhenoCol* pheno_col, const char* output_missing_pheno, uint32_t omp_slen, uint32_t sample_uidx, char* write_iter);
=====================================
plink2_export.cc
=====================================
@@ -3845,7 +3845,7 @@ PglErr ExportOxSampleV2(const char* outname, const uintptr_t* sample_include, co
}
}
// possible todo: support column sets here
- const uint32_t write_parents = DataParentalColsAreRequired(sample_include, piip, sample_ct, 1);
+ const uint32_t write_parents = DataParentalColsAreRequired(sample_include, &(piip->sii), &(piip->parental_id_info), sample_ct, 1);
if (write_parents) {
writebuf_size += max_paternal_id_blen + max_maternal_id_blen;
}
@@ -6169,48 +6169,40 @@ PglErr ExportVcf(const uintptr_t* sample_include, const uint32_t* sample_include
}
PglErr AddToFifHtable(unsigned char* arena_bottom, const char* key, uint32_t htable_size, uint32_t key_slen, unsigned char prechar, unsigned char** arena_top_ptr, char** keys, uint32_t* htable, uint32_t* key_ct_ptr, uint32_t* cur_idx_ptr) {
- for (uint32_t hashval = Hashceil(key, key_slen, htable_size); ; ) {
- uint32_t cur_idx = htable[hashval];
- if (cur_idx == UINT32_MAX) {
- const uint32_t key_ct = *key_ct_ptr;
- if (StoreStringAndPrecharAtEnd(arena_bottom, key, prechar, key_slen, arena_top_ptr, &(keys[key_ct]))) {
- return kPglRetNomem;
- }
- *cur_idx_ptr = key_ct;
- htable[hashval] = key_ct;
- *key_ct_ptr = key_ct + 1;
- return kPglRetSuccess;
- }
- char* existing_key = keys[cur_idx];
- if (strequal_unsafe(existing_key, key, key_slen)) {
- // Only permit this if previous instance was FILTER and this is INFO, or
- // vice versa, or this is a FORMAT key.
- // INFO entries always have prechar bit 0 set and 3 unset, while FILTER
- // entries are the reverse. FORMAT keys have prechar == 0.
- const unsigned char new_prechar = S_CAST(unsigned char, existing_key[-1]) ^ prechar;
- if (unlikely(prechar && ((new_prechar & 9) != 9))) {
- char* write_iter = strcpya_k(g_logbuf, "Error: Multiple ");
- if (prechar & 8) {
- write_iter = strcpya_k(write_iter, "FILTER");
- } else {
- write_iter = strcpya_k(write_iter, "INFO");
- }
- *write_iter++ = ':';
- write_iter = memcpya(write_iter, key, key_slen);
- strcpy_k(write_iter, " header lines.\n");
- WordWrapB(0);
- logerrputsb();
- return kPglRetMalformedInput;
- }
- existing_key[-1] = new_prechar;
- // bugfix (3 Jan 2021): forgot to return this
- *cur_idx_ptr = cur_idx;
- return kPglRetSuccess;
- }
- if (++hashval == htable_size) {
- hashval = 0;
+ const uint32_t key_ct = *key_ct_ptr;
+ const uint32_t cur_idx = IdHtableAddNnt(key, TO_CONSTCPCONSTP(keys), key_slen, htable_size, key_ct, htable);
+ if (cur_idx == UINT32_MAX) {
+ if (StoreStringAndPrecharAtEnd(arena_bottom, key, prechar, key_slen, arena_top_ptr, &(keys[key_ct]))) {
+ return kPglRetNomem;
+ }
+ *cur_idx_ptr = key_ct;
+ *key_ct_ptr = key_ct + 1;
+ return kPglRetSuccess;
+ }
+ // Key already present. Only permit this if previous instance was FILTER and
+ // this is INFO, or vice versa, or this is a FORMAT key.
+ // INFO entries always have prechar bit 0 set and 3 unset, while FILTER
+ // entries are the reverse. FORMAT keys have prechar == 0.
+ char* existing_key = keys[cur_idx];
+ const unsigned char new_prechar = S_CAST(unsigned char, existing_key[-1]) ^ prechar;
+ if (unlikely(prechar && ((new_prechar & 9) != 9))) {
+ char* write_iter = strcpya_k(g_logbuf, "Error: Multiple ");
+ if (prechar & 8) {
+ write_iter = strcpya_k(write_iter, "FILTER");
+ } else {
+ write_iter = strcpya_k(write_iter, "INFO");
}
+ *write_iter++ = ':';
+ write_iter = memcpya(write_iter, key, key_slen);
+ strcpy_k(write_iter, " header lines.\n");
+ WordWrapB(0);
+ logerrputsb();
+ return kPglRetMalformedInput;
}
+ existing_key[-1] = new_prechar;
+ // bugfix (3 Jan 2021): forgot to return this
+ *cur_idx_ptr = cur_idx;
+ return kPglRetSuccess;
}
// Assumes ii is not in 0x80000000..0x80000007.
=====================================
plink2_help.cc
=====================================
@@ -2145,8 +2145,9 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" (--merge-filter-mode default, not\n"
" applicable to others)\n"
);
- HelpPrint("pmerge\0pmerge-list\0merge-info-sort\0", &help_ctrl, 0,
-" --merge-info-sort <mode> : Set sort order for INFO entries when merging.\n"
+ HelpPrint("pmerge\0pmerge-list\0merge-pheno-sort\0merge-info-sort\0", &help_ctrl, 0,
+" --merge-pheno-sort <m> : Set sort order for phenotype columns and INFO\n"
+" --merge-info-sort <mode> entries when merging.\n"
" * 'none'/'0' = keep in loaded order (default)\n"
" * 'ascii'/'a' = ASCII order\n"
" * 'natural'/'n' = natural sort\n"
=====================================
plink2_import.cc
=====================================
@@ -14811,7 +14811,7 @@ PglErr Plink1DosageToPgen(const char* dosagename, const char* famname, const cha
if (write_sid) {
write_iter = strcpya_k(write_iter, "\tSID");
}
- const uint32_t write_parents = DataParentalColsAreRequired(sample_include, &pii, sample_ct, 1);
+ const uint32_t write_parents = DataParentalColsAreRequired(sample_include, &pii.sii, &pii.parental_id_info, sample_ct, 1);
if (write_parents) {
write_iter = strcpya_k(write_iter, "\tPAT\tMAT");
}
=====================================
plink2_merge.cc
=====================================
@@ -17,6 +17,7 @@
#include "include/pgenlib_write.h"
#include "plink2_compress_stream.h"
+#include "plink2_data.h"
#include "plink2_merge.h"
#include "plink2_psam.h"
#include "plink2_pvar.h"
@@ -34,7 +35,8 @@ void InitPmerge(PmergeInfo* pmerge_info_ptr) {
pmerge_info_ptr->merge_qual_mode = kMergeQualInfoModeNmFirst;
pmerge_info_ptr->merge_filter_mode = kMergeFilterModeNmFirst;
pmerge_info_ptr->merge_info_mode = kMergeQualInfoModeNmFirst;
- pmerge_info_ptr->merge_info_sort = kfSortNone;
+ pmerge_info_ptr->merge_pheno_sort = kSortNone;
+ pmerge_info_ptr->merge_info_sort = kSortNone;
pmerge_info_ptr->max_allele_ct = 0;
pmerge_info_ptr->pgen_fname = nullptr;
pmerge_info_ptr->pvar_fname = nullptr;
@@ -63,14 +65,1157 @@ void CleanupPgenDiff(PgenDiffInfo* pgen_diff_info_ptr) {
free_cond(pgen_diff_info_ptr->psam_fname);
}
-typedef struct PmergeInputFilesetStruct {
+typedef struct PmergeInputFilesetLlStruct {
+ struct PmergeInputFilesetLlStruct* next;
char* pgen_fname;
char* pvar_fname;
char* psam_fname;
-} PmergeInputFileset;
+ uint32_t variant_ct;
+ uintptr_t grand_allele_ct;
+} PmergeInputFilesetLl;
-PglErr Pmerge(__attribute__((unused)) const PmergeInfo* pmip, __attribute__((unused)) const char* sample_sort_fname, __attribute__((unused)) MiscFlags misc_flags, __attribute__((unused)) SortFlags sample_sort_flags, __attribute__((unused)) FamCol fam_cols, __attribute__((unused)) uint32_t max_thread_ct, __attribute__((unused)) char* pgenname, __attribute__((unused)) char* psamname, __attribute__((unused)) char* pvarname, __attribute__((unused)) char* outname, __attribute__((unused)) char* outname_end, __attribute__((unused)) ChrInfo* cip) {
+// Allocates at end of bigstack.
+PmergeInputFilesetLl* AllocFilesetLlEntry(PmergeInputFilesetLl*** filesets_endpp) {
+ const uintptr_t alloc_size = RoundUpPow2(sizeof(PmergeInputFilesetLl), kEndAllocAlign);
+ if (unlikely(S_CAST(uintptr_t, g_bigstack_end - g_bigstack_base) < alloc_size)) {
+ return nullptr;
+ }
+ g_bigstack_end -= alloc_size;
+ PmergeInputFilesetLl* new_entry = R_CAST(PmergeInputFilesetLl*, g_bigstack_end);
+ new_entry->next = nullptr;
+ **filesets_endpp = new_entry;
+ *filesets_endpp = &(new_entry->next);
+ return new_entry;
+}
+
+PglErr LoadPmergeList(const char* list_fname, PmergeListMode mode, uint32_t main_fileset_present, PmergeInputFilesetLl*** filesets_endpp) {
unsigned char* bigstack_mark = g_bigstack_base;
+ uintptr_t line_idx = 0;
+ PglErr reterr = kPglRetSuccess;
+ TextStream txs;
+ PreinitTextStream(&txs);
+ {
+ reterr = InitTextStream(list_fname, kTextStreamBlenFast, 1, &txs);
+ if (unlikely(reterr)) {
+ goto LoadPmergeList_ret_TSTREAM_FAIL;
+ }
+ const char* pgen_suffix;
+ const char* pvar_suffix;
+ const char* psam_suffix;
+ switch (mode) {
+ case kPmergeListModeBfile:
+ pgen_suffix = ".bed";
+ pvar_suffix = ".bim";
+ psam_suffix = ".fam";
+ break;
+ case kPmergeListModeBpfile:
+ pgen_suffix = ".pgen";
+ pvar_suffix = ".bim";
+ psam_suffix = ".fam";
+ break;
+ case kPmergeListModePfile:
+ pgen_suffix = ".pgen";
+ pvar_suffix = ".pvar";
+ psam_suffix = ".psam";
+ break;
+ default:
+ assert(mode == kPmergeListModePfileVzs);
+ pgen_suffix = ".pgen";
+ pvar_suffix = ".pvar.zst";
+ psam_suffix = ".psam";
+ break;
+ }
+ const uint32_t pgen_blen = strlen(pgen_suffix) + 1;
+ const uint32_t pvar_blen = strlen(pvar_suffix) + 1;
+ const uint32_t psam_blen = strlen(psam_suffix) + 1;
+ const uint32_t max_single_token_slen = kPglFnamesize - 1 - MAXV(pgen_blen, pvar_blen);
+ while (1) {
+ const char* first_token_start = TextGet(&txs);
+ if (!first_token_start) {
+ break;
+ }
+ ++line_idx;
+ PmergeInputFilesetLl* cur_entry = AllocFilesetLlEntry(filesets_endpp);
+ if (unlikely(!cur_entry)) {
+ goto LoadPmergeList_ret_NOMEM;
+ }
+ const char* first_token_end = CurTokenEnd(first_token_start);
+ const uint32_t first_token_slen = first_token_end - first_token_start;
+ const char* second_token_start = FirstNonTspace(first_token_end);
+ if (IsEolnKns(*second_token_start)) {
+ // Expand single token, using the --pmerge-list mode.
+ if (unlikely(first_token_slen > max_single_token_slen)) {
+ logerrprintf("Error: Entry on line %" PRIuPTR " of --pmerge-list file is too long.\n", line_idx);
+ goto LoadPmergeList_ret_MALFORMED_INPUT;
+ }
+ char* fname_iter;
+ if (unlikely(bigstack_end_alloc_c(first_token_slen * 3 + pgen_blen + pvar_blen + psam_blen, &fname_iter))) {
+ goto LoadPmergeList_ret_NOMEM;
+ }
+ cur_entry->pgen_fname = fname_iter;
+ fname_iter = memcpya(fname_iter, first_token_start, first_token_slen);
+ fname_iter = memcpya(fname_iter, pgen_suffix, pgen_blen);
+ cur_entry->pvar_fname = fname_iter;
+ fname_iter = memcpya(fname_iter, first_token_start, first_token_slen);
+ fname_iter = memcpya(fname_iter, pvar_suffix, pvar_blen);
+ cur_entry->psam_fname = fname_iter;
+ fname_iter = memcpya(fname_iter, first_token_start, first_token_slen);
+ memcpy(fname_iter, psam_suffix, psam_blen);
+ } else {
+ const char* second_token_end = CurTokenEnd(second_token_start);
+ const char* third_token_start = FirstNonTspace(second_token_end);
+ if (unlikely(IsEolnKns(*third_token_start))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --pmerge-list has exactly two tokens; 1 or 3 expected.\n", line_idx);
+ goto LoadPmergeList_ret_MALFORMED_INPUT_WW;
+ }
+ const char* third_token_end = CurTokenEnd(third_token_start);
+ const char* fourth_token_start = FirstNonTspace(third_token_end);
+ if (unlikely(!IsEolnKns(*fourth_token_start))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --pmerge-list has more than 3 tokens.\n", line_idx);
+ goto LoadPmergeList_ret_MALFORMED_INPUT_WW;
+ }
+ const uint32_t second_token_slen = second_token_end - second_token_start;
+ const uint32_t third_token_slen = third_token_end - third_token_start;
+ if (unlikely((first_token_slen >= kPglFnamesize) ||
+ (second_token_slen >= kPglFnamesize) ||
+ (third_token_slen >= kPglFnamesize))) {
+ logerrprintf("Error: Filename on line %" PRIuPTR " of --pmerge-list file is too long.\n", line_idx);
+ goto LoadPmergeList_ret_MALFORMED_INPUT;
+ }
+ char* fname_iter;
+ if (unlikely(bigstack_end_alloc_c(first_token_slen + second_token_slen + third_token_slen + 3, &fname_iter))) {
+ goto LoadPmergeList_ret_NOMEM;
+ }
+ cur_entry->pgen_fname = fname_iter;
+ fname_iter = memcpyax(fname_iter, first_token_start, first_token_slen, '\0');
+ cur_entry->pvar_fname = fname_iter;
+ fname_iter = memcpyax(fname_iter, second_token_start, third_token_slen, '\0');
+ cur_entry->psam_fname = fname_iter;
+ memcpyx(fname_iter, third_token_start, third_token_slen, '\0');
+ }
+ }
+ if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
+ goto LoadPmergeList_ret_TSTREAM_FAIL;
+ }
+ if (main_fileset_present + line_idx < 2) {
+ logerrputs("Error: --pmerge-list requires at least two filesets to be specified.\n");
+ goto LoadPmergeList_ret_INCONSISTENT_INPUT;
+ }
+ logprintf("--pmerge-list: %" PRIuPTR " filesets specified%s.\n", main_fileset_present + line_idx, main_fileset_present? " (including main fileset)" : "");
+ }
+ while (0) {
+ LoadPmergeList_ret_NOMEM:
+ reterr = kPglRetNomem;
+ break;
+ LoadPmergeList_ret_TSTREAM_FAIL:
+ TextStreamErrPrint("--pmerge-list file", &txs);
+ break;
+ LoadPmergeList_ret_MALFORMED_INPUT_WW:
+ WordWrapB(0);
+ logerrputsb();
+ LoadPmergeList_ret_MALFORMED_INPUT:
+ reterr = kPglRetMalformedInput;
+ break;
+ LoadPmergeList_ret_INCONSISTENT_INPUT:
+ reterr = kPglRetInconsistentInput;
+ break;
+ }
+ BigstackReset(bigstack_mark);
+ return reterr;
+}
+
+// Permanent allocations are at end of bigstack.
+PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* outname, char* outname_end, PmergeInputFilesetLl* filesets, SampleIdInfo* siip, uint32_t* sample_ctp) {
+ unsigned char* bigstack_mark = g_bigstack_base;
+ unsigned char* bigstack_end_mark = g_bigstack_end;
+ uintptr_t line_idx = 0;
+ const char* cur_fname = nullptr;
+ PglErr reterr = kPglRetSuccess;
+ TextStream txs;
+ PreinitTextStream(&txs);
+ {
+ // First pass: determine sample IDs and phenotype set.
+ // Intermission: sort sample and phenotype IDs.
+ // Second pass: populate data structure, then call WritePsam().
+
+ // Allocate a single linebuf that's used in all first-pass loads, so that
+ // we can have the pheno strset grow from bigstack_end down in an
+ // unfragmented manner.
+ uintptr_t linebuf_capacity = MINV(kMaxLongLine, bigstack_left() / 4) + kDecompressChunkSize;
+ char* linebuf;
+ if (unlikely(bigstack_end_alloc_c(linebuf_capacity, &linebuf))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ // If --sample-inner-join is specified, we append raw null-terminated
+ // sample-ID strings to the bottom of bigstack while scanning the first
+ // file, construct the usual non-resizable string-hash-table data structure
+ // when we're done. Each ID then has a 0-based index, and we use
+ // sample_include/cur_sample_include to track which sample IDs are present
+ // in all files from that point on.
+ // Otherwise, we add sample IDs to the resizable sample_id_strset data
+ // structure and wait till we've processed all files before ordering the
+ // elements.
+ // Analogous thing happens for phenotype names if --pheno-inner-join is
+ // specified. Phenotype-name data structure grows down from the top of
+ // bigstack.
+ const PmergeFlags flags = pmip->flags;
+ char* first_sample_ids_start = nullptr;
+ const char** first_sample_ids = nullptr;
+ uint32_t* first_sample_ids_htable = nullptr;
+ uintptr_t* sample_include = nullptr;
+ uintptr_t* cur_sample_include = nullptr;
+ char** sample_id_strset = nullptr;
+ uint32_t first_sample_id_ct = 0;
+ uint32_t sample_id_table_size = 512;
+ if (flags & kfPmergeSampleInnerJoin) {
+ first_sample_ids_start = R_CAST(char*, g_bigstack_base);
+ // - arena_bottom tracks the current append point
+ // - first_sample_ids, first_sample_ids_htable, sample_include,
+ // cur_sample_include allocated after we're done scanning first file
+ // - sample_id_table_size set to first_sample_ids_htable size at that
+ // point
+ } else {
+ if (unlikely(bigstack_calloc_cp(sample_id_table_size, &sample_id_strset))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
+ const char** first_pheno_names = nullptr;
+ uint32_t* first_pheno_names_htable = nullptr;
+ uintptr_t* pheno_include = nullptr;
+ uintptr_t* cur_pheno_include = nullptr;
+ char** pheno_strset = nullptr;
+ uint32_t first_pheno_ct = 0;
+ uint32_t pheno_names_table_size = 512;
+ if (!(flags & kfPmergePhenoInnerJoin)) {
+ if (unlikely(bigstack_end_calloc_cp(pheno_names_table_size, &pheno_strset))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
+ unsigned char* arena_bottom = g_bigstack_base;
+ unsigned char* arena_top = g_bigstack_end;
+
+ const uint32_t decompress_thread_ct = MAXV(max_thread_ct - 1, 1);
+ PmergeInputFilesetLl* filesets_iter = filesets;
+ char* idbuf = g_textbuf;
+ uint32_t max_line_blen = 0;
+ uint32_t max_sample_id_blen_m2 = 2;
+ uintptr_t max_sid_blen = 0;
+ uintptr_t max_paternal_id_blen = 2;
+ uintptr_t max_maternal_id_blen = 2;
+ uintptr_t max_pheno_name_blen = 0;
+ uint32_t sample_ct = 0;
+ uint32_t pheno_ct = 0;
+ do {
+ cur_fname = filesets_iter->psam_fname;
+ reterr = TextStreamOpenEx(cur_fname, kMaxLongLine, linebuf_capacity, decompress_thread_ct, nullptr, linebuf, &txs);
+ if (unlikely(reterr)) {
+ goto MergePsams_ret_TSTREAM_FAIL;
+ }
+ // Worth optimizing this more than most text-reading loops, since we may
+ // be processing a LOT of files.
+ const char* line_start = TextLineEnd(&txs);
+ for (line_idx = 1; ; ++line_idx) {
+ if (unlikely(!TextGetUnsafe2K(&txs, &line_start))) {
+ if (TextStreamErrcode2(&txs, &reterr)) {
+ goto MergePsams_ret_TSTREAM_FAIL;
+ }
+ logerrprintfww("Error: No samples in %s.\n", cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ if ((line_start[0] != '#') || tokequal_k(&(line_start[1]), "FID") || tokequal_k(&(line_start[1]), "IID")) {
+ break;
+ }
+ const char* line_end = AdvPastDelim(line_start, '\n');
+ const uint32_t line_blen = line_end - line_start;
+ if (max_line_blen < line_blen) {
+ max_line_blen = line_blen;
+ }
+ line_start = line_end;
+ }
+ uint32_t sid_present = 0;
+ uint32_t postid_pat_col_idx = 0;
+ uint32_t postid_mat_col_idx = 0;
+ uint32_t fid_present;
+ if (line_start[0] == '#') {
+ const char* iid_end = &(line_start[4]);
+ fid_present = (line_start[1] == 'F');
+ if (fid_present) {
+ const char* iid_start = FirstNonTspace(iid_end);
+ iid_end = FirstPrechar(iid_start, 33);
+ if (unlikely(!strequal_k(iid_start, "IID", iid_end - iid_start))) {
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ }
+ const char* token_start = FirstNonTspace(iid_end);
+ if (tokequal_k(token_start, "SID")) {
+ sid_present = 1;
+ token_start = FirstNonTspace(&(token_start[3]));
+ }
+ const char* token_end;
+ for (uint32_t postid_col_idx = 1; !IsEolnKns(*token_start); token_start = FirstNonTspace(token_end), ++postid_col_idx) {
+ token_end = CurTokenEnd(token_start);
+ const uint32_t token_slen = token_end - token_start;
+ if (token_slen == 3) {
+ if (unlikely(memequal_k(token_start, "FID", 3))) {
+ if (fid_present) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate FID column in %s.\n", cur_fname);
+ } else {
+ snprintf(g_logbuf, kLogbufSize, "Error: Improperly positioned FID column in %s (must be first).\n", cur_fname);
+ }
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ if (unlikely(memequal_k(token_start, "IID", 3))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate IID column in %s.\n", cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ if (unlikely(memequal_k(token_start, "SID", 3))) {
+ if (sid_present) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate SID column in %s.\n", cur_fname);
+ } else {
+ snprintf(g_logbuf, kLogbufSize, "Error: Improperly positioned SID column in %s (must immediately follow IID).\n", cur_fname);
+ }
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ if (memequal_k(token_start, "PAT", 3)) {
+ if (unlikely(postid_pat_col_idx)) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate PAT column in %s.\n", cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ postid_pat_col_idx = postid_col_idx;
+ continue;
+ }
+ if (memequal_k(token_start, "MAT", 3)) {
+ if (unlikely(postid_mat_col_idx)) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate MAT column in %s.\n", cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ postid_mat_col_idx = postid_col_idx;
+ continue;
+ }
+ if (memequal_k(token_start, "SEX", 3)) {
+ continue;
+ }
+ } else if (token_slen > kMaxIdBlen) {
+ logerrputs("Error: Phenotype names are limited to " MAX_ID_SLEN_STR " characters.\n");
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ if (pheno_strset) {
+ if (token_slen >= max_pheno_name_blen) {
+ max_pheno_name_blen = token_slen + 1;
+ }
+ if (unlikely(StrsetAddEndResize(arena_bottom, token_start, token_slen, kMaxPhenoCt * 2, &pheno_strset, &pheno_names_table_size, &pheno_ct, &arena_top))) {
+ if (pheno_ct == kMaxPhenoCt) {
+ logerrputs("Error: " PROG_NAME_STR " does not support more than " MAX_PHENO_CT_STR " phenotypes.\n");
+ goto MergePsams_ret_INCONSISTENT_INPUT;
+ }
+ goto MergePsams_ret_NOMEM;
+ }
+ } else {
+ // max_pheno_name_blen calculation deferred since any name may not
+ // appear in a later file
+ if (!first_pheno_names_htable) {
+ // StoreStringAtEnd(), without dst assignment
+ if (unlikely(PtrWSubCk(arena_bottom, token_slen + 1, &arena_top))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ memcpyx(arena_top, token_start, token_slen, '\0');
+ ++first_pheno_ct;
+ } else if (pheno_ct) {
+ const uint32_t pheno_idx = IdHtableFindNnt(token_start, first_pheno_names, first_pheno_names_htable, token_slen, pheno_names_table_size);
+ if (pheno_idx != UINT32_MAX) {
+ if (unlikely(IsSet(cur_pheno_include, pheno_idx))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate phenotype name '%s' in %s.\n", first_pheno_names[pheno_idx], cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ SetBit(pheno_idx, cur_pheno_include);
+ }
+ }
+ }
+ }
+ if ((!postid_pat_col_idx) != (!postid_mat_col_idx)) {
+ snprintf(g_logbuf, kLogbufSize, "Error: %s has a '%cAT' column without a '%cAT' column; either both or neither must be present.\n", cur_fname, postid_pat_col_idx? 'P' : 'M', postid_pat_col_idx? 'M' : 'P');
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ const char* line_end = AdvPastDelim(token_start, '\n');
+ const uint32_t line_blen = line_end - line_start;
+ if (max_line_blen < line_blen) {
+ max_line_blen = line_blen;
+ }
+ line_start = line_end;
+ ++line_idx;
+ if (unlikely(!TextGetUnsafe2K(&txs, &line_start))) {
+ if (TextStreamErrcode2(&txs, &reterr)) {
+ goto MergePsams_ret_TSTREAM_FAIL;
+ }
+ logerrprintfww("Error: No samples in %s.\n", cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ } else {
+ // .fam
+ fid_present = (fam_cols / kfFamCol1) & 1;
+ if (fam_cols & kfFamCol34) {
+ postid_pat_col_idx = 1;
+ postid_mat_col_idx = 2;
+ }
+ if (fam_cols & kfFamCol6) {
+ if (pheno_strset) {
+ if (unlikely(StrsetAddEndResize(arena_bottom, "PHENO1", strlen("PHENO1"), kMaxPhenoCt * 2, &pheno_strset, &pheno_names_table_size, &pheno_ct, &arena_top))) {
+ if (pheno_ct == kMaxPhenoCt) {
+ logerrputs("Error: " PROG_NAME_STR " does not support more than " MAX_PHENO_CT_STR " phenotypes.\n");
+ goto MergePsams_ret_INCONSISTENT_INPUT;
+ }
+ goto MergePsams_ret_NOMEM;
+ }
+ if (strlen("PHENO1") >= max_pheno_name_blen) {
+ max_pheno_name_blen = strlen("PHENO1") + 1;
+ }
+ } else {
+ if (!first_pheno_names_htable) {
+ if (unlikely(PtrWSubCk(arena_bottom, strlen("PHENO1") + 1, &arena_top))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ strcpy_k(R_CAST(char*, arena_top), "PHENO1");
+ first_pheno_ct = 1;
+ } else if (pheno_ct) {
+ const uint32_t pheno_idx = IdHtableFind("PHENO1", TO_CONSTCPCONSTP(first_pheno_names), first_pheno_names_htable, strlen("PHENO1"), pheno_names_table_size);
+ if (pheno_idx != UINT32_MAX) {
+ SetBit(pheno_idx, cur_pheno_include);
+ }
+ }
+ }
+ }
+ }
+ uint32_t first_parent_postid_col_idx = postid_pat_col_idx;
+ uint32_t parent_col_skip = postid_mat_col_idx - postid_pat_col_idx;
+ uintptr_t first_parent_max_blen = max_paternal_id_blen;
+ uintptr_t second_parent_max_blen = max_maternal_id_blen;
+ if (postid_mat_col_idx < postid_pat_col_idx) {
+ first_parent_postid_col_idx = postid_mat_col_idx;
+ parent_col_skip = -parent_col_skip;
+ first_parent_max_blen = max_maternal_id_blen;
+ second_parent_max_blen = max_paternal_id_blen;
+ }
+ const char* fid_start = &(g_one_char_strs[96]);
+ uint32_t fid_slen = 1;
+ const char* sid_start = &(g_one_char_strs[96]);
+ uint32_t sid_slen = 1;
+ while (1) {
+ const char* token_start = line_start;
+ if (fid_present) {
+ fid_start = line_start;
+ const char* fid_end = CurTokenEnd(fid_start);
+ fid_slen = fid_end - fid_start;
+ token_start = FirstNonTspace(fid_end);
+ if (unlikely(IsEolnKns(*token_start))) {
+ goto MergePsams_ret_MISSING_TOKENS;
+ }
+ }
+ const char* iid_start = token_start;
+ const char* iid_end = CurTokenEnd(iid_start);
+ const char* token_end = iid_end;
+ const uint32_t iid_slen = iid_end - iid_start;
+ if (sid_present) {
+ sid_start = FirstNonTspace(iid_end);
+ token_end = CurTokenEnd(token_start);
+ sid_slen = token_end - sid_start;
+ if ((sid_slen > 1) || (sid_start[0] != '0')) {
+ if (sid_slen >= max_sid_blen) {
+ if (unlikely(sid_slen > kMaxIdBlen)) {
+ logerrputs("Error: SIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ max_sid_blen = sid_slen + 1;
+ }
+ }
+ }
+ if (fid_slen + iid_slen > max_sample_id_blen_m2) {
+ max_sample_id_blen_m2 = fid_slen + iid_slen;
+ if (unlikely(max_sample_id_blen_m2 > 2 * kMaxIdSlen)) {
+ logerrputs("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ }
+ char* id_iter = memcpyax(idbuf, fid_start, fid_slen, '\t');
+ id_iter = memcpyax(id_iter, iid_start, iid_slen, '\t');
+ id_iter = memcpya(id_iter, sid_start, sid_slen);
+ *id_iter = '\0';
+ const uint32_t id_slen = id_iter - idbuf;
+ if (sample_id_strset) {
+ if (unlikely(StrsetAddResize(arena_top, idbuf, id_slen, 2U * 0x7ffffffe, sample_id_strset, &sample_id_table_size, &sample_ct, &arena_bottom))) {
+ if (sample_ct == 0x7ffffffe) {
+ logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 2 samples.\n");
+ goto MergePsams_ret_INCONSISTENT_INPUT;
+ }
+ goto MergePsams_ret_NOMEM;
+ }
+ } else {
+ if (!first_sample_ids_htable) {
+ if (unlikely(id_slen >= S_CAST(uintptr_t, arena_top - arena_bottom))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ arena_bottom = memcpyua(arena_bottom, idbuf, id_slen + 1);
+ if (unlikely(first_sample_id_ct == 0x7ffffffe)) {
+ logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 2 samples.\n");
+ goto MergePsams_ret_INCONSISTENT_INPUT;
+ }
+ ++first_sample_id_ct;
+ } else {
+ const uint32_t sample_idx = IdHtableFind(idbuf, first_sample_ids, first_sample_ids_htable, id_slen, sample_id_table_size);
+ if (sample_idx != UINT32_MAX) {
+ if (unlikely(IsSet(cur_sample_include, sample_idx))) {
+ TabsToSpaces(idbuf);
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate sample ID \"%s\" in %s.\n", idbuf, cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ SetBit(sample_idx, cur_sample_include);
+ }
+ }
+ }
+ if (first_parent_postid_col_idx) {
+ const char* first_parent_start = NextTokenMult(token_end, first_parent_postid_col_idx);
+ if (unlikely(!first_parent_start)) {
+ goto MergePsams_ret_MISSING_TOKENS;
+ }
+ const char* first_parent_end = CurTokenEnd(first_parent_start);
+ const uintptr_t first_parent_slen = first_parent_end - first_parent_start;
+ if (first_parent_slen >= first_parent_max_blen) {
+ if (unlikely(first_parent_slen > kMaxIdSlen)) {
+ logerrputs("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ first_parent_max_blen = first_parent_slen + 1;
+ }
+ const char* second_parent_start = NextTokenMult(first_parent_end, parent_col_skip);
+ if (unlikely(!second_parent_start)) {
+ goto MergePsams_ret_MISSING_TOKENS;
+ }
+ token_end = CurTokenEnd(second_parent_start);
+ const uintptr_t second_parent_slen = token_end - second_parent_start;
+ if (second_parent_slen >= second_parent_max_blen) {
+ if (unlikely(second_parent_slen > kMaxIdSlen)) {
+ logerrputs("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+ goto MergePsams_ret_MALFORMED_INPUT;
+ }
+ second_parent_max_blen = second_parent_slen + 1;
+ }
+ }
+ const char* line_end = AdvPastDelim(token_end, '\n');
+ const uint32_t line_blen = line_end - line_start;
+ if (max_line_blen < line_blen) {
+ max_line_blen = line_blen;
+ }
+ line_start = line_end;
+ ++line_idx;
+ if (!TextGetUnsafe2K(&txs, &line_start)) {
+ break;
+ }
+ if (unlikely(line_start[0] == '#')) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx, cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ }
+ if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
+ goto MergePsams_ret_TSTREAM_FAIL;
+ }
+ if (unlikely(CleanupTextStream2(cur_fname, &txs, &reterr))) {
+ goto MergePsams_ret_1;
+ }
+ if (postid_pat_col_idx) {
+ if (postid_mat_col_idx > postid_pat_col_idx) {
+ max_paternal_id_blen = first_parent_max_blen;
+ max_maternal_id_blen = second_parent_max_blen;
+ } else {
+ max_paternal_id_blen = second_parent_max_blen;
+ max_maternal_id_blen = first_parent_max_blen;
+ }
+ }
+ if (first_sample_ids_start) {
+ const uint32_t first_sample_ctl = BitCtToWordCt(first_sample_id_ct);
+ if (filesets_iter == filesets) {
+ ArenaBaseSet(arena_bottom, &arena_bottom);
+ sample_id_table_size = GetHtableFastSize(first_sample_id_ct);
+ if (unlikely((arena_bottom > arena_top) ||
+ arena_alloc_kcp(arena_top, first_sample_id_ct, &arena_bottom, &first_sample_ids) ||
+ arena_alloc_u32(arena_top, sample_id_table_size, &arena_bottom, &first_sample_ids_htable) ||
+ arena_alloc_w(arena_top, first_sample_ctl, &arena_bottom, &sample_include) ||
+ arena_alloc_w(arena_top, first_sample_ctl, &arena_bottom, &cur_sample_include))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ SetAllU32Arr(sample_id_table_size, first_sample_ids_htable);
+ SetAllBits(first_sample_id_ct, sample_include);
+ const char* first_sample_ids_iter = first_sample_ids_start;
+ for (uint32_t sample_idx = 0; sample_idx != first_sample_id_ct; ++sample_idx) {
+ first_sample_ids[sample_idx] = first_sample_ids_iter;
+ const uint32_t slen = strlen(first_sample_ids_iter);
+ if (unlikely(IdHtableAdd(first_sample_ids_iter, first_sample_ids, slen, sample_id_table_size, sample_idx, first_sample_ids_htable) != UINT32_MAX)) {
+ char* mutable_id = K_CAST(char*, first_sample_ids_iter);
+ TabsToSpaces(mutable_id);
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate sample ID \"%s\" in %s.\n", mutable_id, cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ first_sample_ids_iter = &(first_sample_ids_iter[slen + 1]);
+ }
+ } else {
+ BitvecAnd(cur_sample_include, first_sample_ctl, sample_include);
+ if (unlikely(AllWordsAreZero(sample_include, first_sample_ctl))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: No common samples in --pmerge%s --sample-inner-join job.\n", pmip->list_fname? "-list" : "");
+ goto MergePsams_ret_MALFORMED_INPUT_2;
+ }
+ }
+ ZeroWArr(first_sample_ctl, cur_sample_include);
+ }
+ if (!pheno_strset) {
+ const uint32_t first_pheno_ctl = BitCtToWordCt(first_pheno_ct);
+ if (filesets_iter == filesets) {
+ // must handle pheno_ct == 0
+ // (only really need to set first_pheno_names_htable to non-null in
+ // that case, since that's how we currently recognize we aren't
+ // processing the first .psam file)
+ const char* pheno_names_iter = R_CAST(char*, arena_top);
+ ArenaEndSet(arena_top, &arena_top);
+ pheno_names_table_size = GetHtableFastSize(first_pheno_ct);
+ if (unlikely(arena_end_alloc_kcp(arena_bottom, first_pheno_ct, &arena_top, &first_pheno_names) ||
+ arena_end_alloc_u32(arena_bottom, pheno_names_table_size, &arena_top, &first_pheno_names_htable) ||
+ arena_end_alloc_w(arena_bottom, first_pheno_ctl, &arena_top, &pheno_include) ||
+ arena_end_alloc_w(arena_bottom, first_pheno_ctl, &arena_top, &cur_pheno_include))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ SetAllU32Arr(pheno_names_table_size, first_pheno_names_htable);
+ SetAllBits(first_pheno_ct, pheno_include);
+ ZeroWArr(first_pheno_ctl, cur_pheno_include);
+ pheno_ct = first_pheno_ct;
+ for (uint32_t pheno_idx = 0; pheno_idx != first_pheno_ct; ++pheno_idx) {
+ first_pheno_names[pheno_idx] = pheno_names_iter;
+ const uint32_t slen = strlen(pheno_names_iter);
+ if (unlikely(IdHtableAdd(pheno_names_iter, first_pheno_names, slen, pheno_names_table_size, pheno_idx, first_pheno_names_htable) != UINT32_MAX)) {
+ snprintf(g_logbuf, kLogbufSize, "Error: Duplicate phenotype name '%s' in %s.\n", pheno_names_iter, cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ pheno_names_iter = &(pheno_names_iter[slen + 1]);
+ }
+ } else if (pheno_ct) {
+ BitvecAnd(cur_pheno_include, first_pheno_ctl, pheno_include);
+ pheno_ct = PopcountWords(pheno_include, first_pheno_ctl);
+ ZeroWArr(first_pheno_ctl, cur_pheno_include);
+ }
+ }
+ filesets_iter = filesets_iter->next;
+ } while (filesets_iter);
+ // Intermission:
+ // 1. Move final set of phenotype ID strings to bottom, compute
+ // max_pheno_name_blen if --pheno-inner-join.
+ // 2. Compute max_sample_id_blen and max_sid_blen if --sample-inner-join.
+ // 3. Free all end-of-bigstack allocations.
+ // 4. Construct SampleIdInfo at end of bigstack, to be returned.
+ // 5. Construct pheno_names, hash tables.
+ char* pheno_names = nullptr;
+ uint32_t* pheno_names_htable = nullptr;
+ uint32_t* sample_id_htable;
+ uintptr_t max_sample_id_blen;
+ uint32_t sample_ctl;
+ {
+ char* pheno_names_tmp_start = R_CAST(char*, arena_bottom);
+ char* pheno_names_tmp_end;
+ if (pheno_strset) {
+ const uintptr_t byte_ct = R_CAST(unsigned char*, pheno_strset) - arena_top;
+ memmove(pheno_names_tmp_start, arena_top, byte_ct);
+ pheno_names_tmp_end = &(pheno_names_tmp_start[byte_ct]);
+ } else {
+ max_pheno_name_blen = 0;
+ char* pheno_names_write_iter = pheno_names_tmp_start;
+ if (pheno_ct) {
+ if (S_CAST(uintptr_t, first_pheno_names[0] - pheno_names_write_iter) < kMaxIdBlen) {
+ // ensure memcpy is safe
+ goto MergePsams_ret_NOMEM;
+ }
+ uintptr_t pheno_uidx_base = 0;
+ uintptr_t cur_bits = pheno_include[0];
+ for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+ const uint32_t pheno_uidx = BitIter1(pheno_include, &pheno_uidx_base, &cur_bits);
+ const char* cur_pheno_name = first_pheno_names[pheno_uidx];
+ const uint32_t blen = 1 + strlen(cur_pheno_name);
+ if (blen > max_pheno_name_blen) {
+ max_pheno_name_blen = blen;
+ }
+ pheno_names_write_iter = memcpya(pheno_names_write_iter, cur_pheno_name, blen);
+ }
+ }
+ pheno_names_tmp_end = pheno_names_write_iter;
+ }
+ BigstackBaseSet(pheno_names_tmp_end);
+
+ if (sample_id_strset) {
+ max_sample_id_blen = max_sample_id_blen_m2 + 2;
+ } else {
+ const uint32_t first_sample_ctl = BitCtToWordCt(first_sample_id_ct);
+ sample_ct = PopcountWords(sample_include, first_sample_ctl);
+ max_sample_id_blen = 4;
+ max_sid_blen = 0;
+ uintptr_t sample_uidx_base = 0;
+ uintptr_t cur_bits = sample_include[0];
+ for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+ const uint32_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
+ const char* cur_sample_id = first_sample_ids[sample_uidx];
+ const char* iid_end = AdvToDelim(AdvPastDelim(cur_sample_id, '\t'), '\t');
+ const uint32_t sample_id_blen_m1 = iid_end - cur_sample_id;
+ if (max_sample_id_blen <= sample_id_blen_m1) {
+ max_sample_id_blen = sample_id_blen_m1 + 1;
+ }
+ const char* sid_start = &(iid_end[1]);
+ const uint32_t sid_slen = strlen(sid_start);
+ if ((sid_slen > 1) || (sid_start[0] != '0')) {
+ if (max_sid_blen <= sid_slen) {
+ max_sid_blen = sid_slen + 1;
+ }
+ }
+ }
+ }
+ // defensive
+ linebuf = nullptr;
+ first_pheno_names = nullptr;
+ first_pheno_names_htable = nullptr;
+ pheno_include = nullptr;
+ cur_pheno_include = nullptr;
+ pheno_strset = nullptr;
+
+ sample_ctl = BitCtToWordCt(sample_ct);
+ siip->flags = kfSampleIdFidPresent | kfSampleIdParentsPresent;
+ if (misc_flags & kfMiscStrictSid0) {
+ // affects --indiv-sort
+ siip->flags |= kfSampleIdStrictSid0;
+ }
+ siip->max_sample_id_blen = max_sample_id_blen;
+ siip->max_sid_blen = max_sid_blen;
+ siip->sids = nullptr;
+ if (max_sid_blen) {
+ if (unlikely(bigstack_end_alloc_c(max_sid_blen * sample_ct, &(siip->sids)))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
+ // place this below optional siip->sids, so that
+ // BigstackEndSet(siip->sample_ids) is always correct on success
+ if (unlikely(bigstack_end_alloc_c(max_sample_id_blen * sample_ct, &(siip->sample_ids)))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ if (pheno_ct) {
+ pheno_names_table_size = GetHtableFastSize(pheno_ct);
+ if (unlikely(bigstack_end_alloc_c(max_pheno_name_blen * pheno_ct, &pheno_names) ||
+ bigstack_end_alloc_u32(pheno_names_table_size, &pheno_names_htable))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
+
+ char* sample_ids_write_iter = siip->sample_ids;
+ if (sample_id_strset) {
+ const char* sample_ids_read_iter = R_CAST(char*, &(sample_id_strset[sample_id_table_size]));
+ for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+ const char* iid_end = AdvToDelim(AdvPastDelim(sample_ids_read_iter, '\t'), '\t');
+ memcpyx(sample_ids_write_iter, sample_ids_read_iter, iid_end - sample_ids_read_iter, '\0');
+ sample_ids_write_iter = &(sample_ids_write_iter[max_sample_id_blen]);
+ const char* sid_end = strnul(iid_end);
+ if (max_sid_blen) {
+ const uint32_t sid_blen = sid_end - iid_end;
+ const char* sid_start = &(iid_end[1]);
+ memcpy(&(siip->sids[sample_idx * max_sid_blen]), sid_start, sid_blen);
+ }
+ sample_ids_read_iter = &(sid_end[1]);
+ }
+ } else {
+ uintptr_t sample_uidx_base = 0;
+ uintptr_t cur_bits = sample_include[0];
+ for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+ const uint32_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
+ const char* cur_sample_id = first_sample_ids[sample_uidx];
+ const char* iid_end = AdvToDelim(AdvPastDelim(cur_sample_id, '\t'), '\t');
+ memcpyx(sample_ids_write_iter, cur_sample_id, iid_end - cur_sample_id, '\0');
+ sample_ids_write_iter = &(sample_ids_write_iter[max_sample_id_blen]);
+ if (max_sid_blen) {
+ const char* sid_start = &(iid_end[1]);
+ strcpy(&(siip->sids[sample_idx * max_sid_blen]), sid_start);
+ }
+ }
+ }
+
+ // defensive
+ first_sample_ids_start = nullptr;
+ first_sample_ids = nullptr;
+ first_sample_ids_htable = nullptr;
+ sample_include = nullptr;
+ cur_sample_include = nullptr;
+ sample_id_strset = nullptr;
+ g_bigstack_base = bigstack_mark;
+
+ if (pheno_ct) {
+ const char* pheno_names_read_iter = pheno_names_tmp_start;
+ // phenotypes are stored in reverse
+ char* pheno_names_write_iter = &(pheno_names[pheno_ct * max_pheno_name_blen]);
+ for (uint32_t pheno_idx = pheno_ct; pheno_idx; ) {
+ --pheno_idx;
+ pheno_names_write_iter -= max_pheno_name_blen;
+ const uint32_t slen = strlen(pheno_names_read_iter);
+ memcpy(pheno_names_write_iter, pheno_names_read_iter, slen + 1);
+ pheno_names_read_iter = &(pheno_names_read_iter[slen + 1]);
+ }
+ const SortMode pheno_sort_mode = pmip->merge_pheno_sort;
+ if (pheno_sort_mode == kSortAscii) {
+ qsort(pheno_names, pheno_ct, max_pheno_name_blen, strcmp_overread_casted);
+ } else if (pheno_sort_mode == kSortNatural) {
+ qsort(pheno_names, pheno_ct, max_pheno_name_blen, strcmp_natural);
+ }
+
+ SetAllU32Arr(pheno_names_table_size, pheno_names_htable);
+ pheno_names_read_iter = pheno_names;
+ for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+ const uint32_t slen = strlen(pheno_names_read_iter);
+ HtableAddNondup(pheno_names_read_iter, slen, pheno_names_table_size, pheno_idx, pheno_names_htable);
+ pheno_names_read_iter = &(pheno_names_read_iter[max_pheno_name_blen]);
+ }
+ }
+
+ if (unlikely(bigstack_alloc_w(sample_ctl, &sample_include))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ SetAllBits(sample_ct, sample_include);
+ char* sample_ids = siip->sample_ids;
+ char* sids = siip->sids;
+ if (sample_sort_mode != kSortNone) {
+ if (sample_sort_mode == kSortFile) {
+ // yes, this is a bit circuitous
+ unsigned char* bigstack_end_mark2 = g_bigstack_end;
+ uint32_t* new_sample_idx_to_old;
+ char* sample_ids_tmp;
+ if (unlikely(bigstack_end_alloc_u32(sample_ct, &new_sample_idx_to_old) ||
+ bigstack_end_alloc_c(sample_ct * max_sample_id_blen, &sample_ids_tmp))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ char* sids_tmp = nullptr;
+ if (max_sid_blen) {
+ if (unlikely(bigstack_end_alloc_c(sample_ct * max_sid_blen, &sids_tmp))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
+ reterr = SampleSortFileMap(sample_include, siip, sample_sort_fname, sample_ct, sample_ct, &new_sample_idx_to_old);
+ if (unlikely(reterr)) {
+ goto MergePsams_ret_1;
+ }
+ char* new_sample_ids_iter = sample_ids_tmp;
+ char* new_sids_iter = sids_tmp;
+ for (uint32_t new_idx = 0; new_idx != sample_ct; ++new_idx) {
+ const uint32_t old_idx = new_sample_idx_to_old[new_idx];
+ strcpy(new_sample_ids_iter, &(sample_ids[old_idx * max_sample_id_blen]));
+ new_sample_ids_iter = &(new_sample_ids_iter[max_sample_id_blen]);
+ if (sids) {
+ strcpy(new_sids_iter, &(sids[old_idx * max_sid_blen]));
+ new_sids_iter = &(new_sids_iter[max_sid_blen]);
+ }
+ }
+ memcpy(sample_ids, sample_ids_tmp, sample_ct * max_sample_id_blen);
+ if (sids) {
+ memcpy(sids, sids_tmp, sample_ct * max_sid_blen);
+ }
+ BigstackEndReset(bigstack_end_mark2);
+ } else if (sample_sort_mode == kSortAscii) {
+ qsort(siip->sample_ids, sample_ct, max_sample_id_blen, strcmp_overread_casted);
+ } else {
+ // natural-sort, even if sample_sort_mode == kSort0
+ qsort(siip->sample_ids, sample_ct, max_sample_id_blen, strcmp_natural);
+ }
+ }
+
+ sample_id_table_size = GetHtableFastSize(sample_ct);
+ if (unlikely(bigstack_alloc_u32(sample_id_table_size, &sample_id_htable))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ SetAllU32Arr(sample_id_table_size, sample_id_htable);
+
+ const char* sample_ids_iter = siip->sample_ids;
+ const char* sids_iter = siip->sids;
+ for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+ uint32_t slen = strlen(sample_ids_iter);
+ const char* cur_sample_id;
+ if (!sids_iter) {
+ cur_sample_id = sample_ids_iter;
+ } else {
+ char* id_iter = memcpyax(idbuf, sample_ids_iter, slen, '\t');
+ const uint32_t sid_blen = 1 + strlen(sids_iter);
+ memcpy(id_iter, sids_iter, sid_blen);
+ slen += sid_blen;
+ sids_iter = &(sids_iter[max_sid_blen]);
+ cur_sample_id = idbuf;
+ }
+ HtableAddNondup(cur_sample_id, slen, sample_id_table_size, sample_idx, sample_id_htable);
+ }
+ }
+ linebuf_capacity = MAXV(max_line_blen, kDecompressMinBlen) + kDecompressChunkSize;
+ // max_{p,m}aternal_id_blen may be overestimated in --sample-inner-join
+ // case, but that's harmless since we free this strbox soon enough
+ ParentalIdInfo parental_id_info;
+ uintptr_t* sex_nm;
+ uintptr_t* sex_male;
+ if (unlikely(bigstack_alloc_c(sample_ct * max_paternal_id_blen, &parental_id_info.paternal_ids) ||
+ bigstack_alloc_c(sample_ct * max_maternal_id_blen, &parental_id_info.maternal_ids) ||
+ bigstack_calloc_w(sample_ctl, &sex_nm) ||
+ bigstack_calloc_w(sample_ctl, &sex_male))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ {
+ char* paternal_ids_iter = parental_id_info.paternal_ids;
+ char* maternal_ids_iter = parental_id_info.maternal_ids;
+ for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+ strcpy_k(paternal_ids_iter, "0");
+ paternal_ids_iter = &(paternal_ids_iter[max_paternal_id_blen]);
+ strcpy_k(maternal_ids_iter, "0");
+ maternal_ids_iter = &(maternal_ids_iter[max_maternal_id_blen]);
+ }
+ }
+ parental_id_info.max_paternal_id_blen = max_paternal_id_blen;
+ parental_id_info.max_maternal_id_blen = max_maternal_id_blen;
+
+ // unlike core pheno_cols, this just lives on bigstack
+ PhenoCol* pheno_cols = nullptr;
+
+ if (pheno_ct) {
+ if (unlikely(BIGSTACK_ALLOC_X(PhenoCol, pheno_ct, &pheno_cols))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+ // we don't really distinguish between case/control and quantitative
+ // phenotypes here (just use data.qt in both cases), and it should be
+ // ok to overallocate a bit in the categorical-phenotype case
+ if (unlikely(bigstack_calloc_w(sample_ctl, &(pheno_cols[pheno_idx].nonmiss)) ||
+ bigstack_alloc_d(sample_ct, &(pheno_cols[pheno_idx].data.qt)))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ // We use kPhenoDtypeCc instead of kPhenoTypeOther to indicate "type
+ // not yet known", since that lets us leave it unchanged if all values
+ // are missing.
+ pheno_cols[pheno_idx].type_code = kPhenoDtypeCc;
+ // defensive
+ pheno_cols[pheno_idx].category_names = nullptr;
+ pheno_cols[pheno_idx].nonnull_category_ct = 0;
+ }
+ }
+ unsigned char* bigstack_mark2 = g_bigstack_base;
+ const uint32_t kNonphenoColCt = 5;
+ const uint32_t max_col_ct = pheno_ct + kNonphenoColCt;
+ // note that ZeroWArr(pheno_ctl, pheno_include) is safe when pheno_ct == 0
+ const uint32_t pheno_ctl = BitCtToWordCt(pheno_ct);
+ uint32_t* col_skips;
+ uint32_t* col_types;
+ const char** token_ptrs;
+ uint32_t* token_slens;
+ if (unlikely(bigstack_alloc_u32(max_col_ct, &col_skips) ||
+ bigstack_alloc_u32(max_col_ct, &col_types) ||
+ bigstack_alloc_kcp(max_col_ct, &token_ptrs) ||
+ bigstack_alloc_u32(max_col_ct, &token_slens) ||
+ bigstack_alloc_w(pheno_ctl, &pheno_include) ||
+ bigstack_alloc_c(linebuf_capacity, &linebuf))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ // phenotype-major
+ uintptr_t* pheno_locked = nullptr;
+ if (pheno_ct && (pmip->merge_pheno_mode != kMergePhenoModeNmFirst)) {
+ // no need to check for overflow in 32-bit case since we would have
+ // already run out of memory on data.qt allocations
+ const uintptr_t bit_ct = S_CAST(uintptr_t, pheno_ct) * sample_ct;
+ if (unlikely(bigstack_calloc_w(BitCtToWordCt(bit_ct), &pheno_locked))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
+
+ if (pheno_ct) {
+ logprintf("--pmerge%s: %u sample%s and %u phenotype%s present.\n", pmip->list_fname? "-list" : "", sample_ct, (sample_ct == 1)? "" : "s", pheno_ct, (pheno_ct == 1)? "" : "s");
+ } else {
+ logprintf(g_logbuf, kLogbufSize, "--pmerge%s: %u sample%s present.\n", pmip->list_fname? "-list" : "", sample_ct, (sample_ct == 1)? "" : "s");
+ }
+ // [category_names] [category_names_htable] [strings]
+ // ^ ^ ^ ^
+ // | | | |
+ // g_bigstack_base arena_bottom arena_top g_bigstack_end
+ //
+ // category-name strings are allocated at the end of bigstack, and need to
+ // remain allocated until WritePsam() returns.
+ // category_names_htable is repositioned, resized, and rebuilt whenever
+ // category_names would otherwise overflow.
+ // No other allocations allowed past this point, until WritePsam() call.
+ arena_bottom = g_bigstack_base;
+ arena_top = g_bigstack_end;
+ const char** category_names = nullptr;
+ // uint32_t* category_names_htable = nullptr;
+ uint32_t category_names_ct = 0;
+ // category_names_capacity == category_names_htable_size / 2
+ // uint32_t category_names_htable_size = 0;
+ filesets_iter = filesets;
+ // possible todo: track seen realpaths, skip duplicates
+ do {
+ cur_fname = filesets_iter->psam_fname;
+ reterr = TextStreamOpenEx(cur_fname, kMaxLongLine, linebuf_capacity, decompress_thread_ct, nullptr, linebuf, &txs);
+ if (unlikely(reterr)) {
+ goto MergePsams_ret_TSTREAM_REWIND_FAIL;
+ }
+ const char* line_start = TextLineEnd(&txs);
+ for (line_idx = 1; ; ++line_idx) {
+ if (unlikely(!TextGetUnsafe2K(&txs, &line_start))) {
+ reterr = TextStreamRawErrcode(&txs);
+ goto MergePsams_ret_TSTREAM_REWIND_FAIL;
+ }
+ if ((line_start[0] != '#') || tokequal_k(&(line_start[1]), "FID") || tokequal_k(&(line_start[1]), "IID")) {
+ break;
+ }
+ line_start = AdvPastDelim(line_start, '\n');
+ }
+ ZeroWArr(pheno_ctl, pheno_include);
+ // uint32_t sid_present = 0;
+ // uint32_t parents_present = 0;
+ // uint32_t sex_present = 0;
+ uint32_t fid_present;
+ uint32_t relevant_postfid_col_ct;
+ col_types[0] = 0;
+ if (line_start[0] == '#') {
+ fid_present = (line_start[1] == 'F');
+ relevant_postfid_col_ct = 1 - fid_present;
+ if (!fid_present) {
+ col_skips[0] = 0;
+ }
+ const char* token_end = &(line_start[4]);
+ for (uint32_t col_idx = 1; ; ++col_idx) {
+ const char* token_start = FirstNonTspace(token_end);
+ if (IsEolnKns(*token_start)) {
+ break;
+ }
+ token_end = CurTokenEnd(token_start);
+ const uint32_t token_slen = token_end - token_start;
+ uint32_t cur_col_type = UINT32_MAX;
+ if (token_slen == 3) {
+ if (memequal_k(token_start, "IID", 3)) {
+ cur_col_type = 0;
+ } else if (memequal_k(token_start, "SID", 3)) {
+ // sid_present = 1;
+ cur_col_type = 1;
+ } else if (memequal_k(token_start, "PAT", 3)) {
+ cur_col_type = 2;
+ // parents_present = 1;
+ } else if (memequal_k(token_start, "MAT", 3)) {
+ cur_col_type = 3;
+ } else if (memequal_k(token_start, "SEX", 3)) {
+ cur_col_type = 4;
+ // sex_present = 1;
+ }
+ }
+ if (cur_col_type == UINT32_MAX) {
+ if (token_slen < max_pheno_name_blen) {
+ cur_col_type = StrboxHtableFind(token_start, pheno_names, pheno_names_htable, max_pheno_name_blen, token_slen, pheno_names_table_size);
+ }
+ if (cur_col_type == UINT32_MAX) {
+ continue;
+ }
+ SetBit(cur_col_type, pheno_include);
+ cur_col_type += kNonphenoColCt;
+ }
+ col_skips[relevant_postfid_col_ct] = col_idx;
+ col_types[relevant_postfid_col_ct++] = cur_col_type;
+ }
+ for (uint32_t rpf_col_idx = relevant_postfid_col_ct - 1; rpf_col_idx; --rpf_col_idx) {
+ col_skips[rpf_col_idx] -= col_skips[rpf_col_idx - 1];
+ }
+ line_start = AdvPastDelim(token_end, '\n');
+ ++line_idx;
+ } else {
+ // .fam
+ fid_present = (fam_cols / kfFamCol1) & 1;
+ col_skips[0] = fid_present;
+ relevant_postfid_col_ct = 1;
+ if (fam_cols & kfFamCol34) {
+ col_skips[relevant_postfid_col_ct] = 1;
+ col_types[relevant_postfid_col_ct++] = 2;
+ col_skips[relevant_postfid_col_ct] = 1;
+ col_types[relevant_postfid_col_ct++] = 3;
+ // parents_present = 1;
+ }
+ if (fam_cols & kfFamCol5) {
+ col_skips[relevant_postfid_col_ct] = 1;
+ col_types[relevant_postfid_col_ct++] = 4;
+ // sex_present = 1;
+ }
+ if (fam_cols & kfFamCol6) {
+ col_skips[relevant_postfid_col_ct] = 1;
+ const uint32_t pheno_idx = StrboxHtableFind("PHENO1", pheno_names, pheno_names_htable, max_pheno_name_blen, strlen("PHENO1"), pheno_names_table_size);
+ col_types[relevant_postfid_col_ct++] = kNonphenoColCt + pheno_idx;
+ SetBit(pheno_idx, pheno_include);
+ }
+ }
+ // TODO
+ if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
+ goto MergePsams_ret_TSTREAM_FAIL;
+ }
+ if (unlikely(CleanupTextStream2(cur_fname, &txs, &reterr))) {
+ goto MergePsams_ret_1;
+ }
+ filesets_iter = filesets_iter->next;
+ } while (filesets_iter);
+ BigstackReset(bigstack_mark2);
+ BigstackEndSet(arena_top);
+ if (pheno_ct) {
+ if (category_names_ct) {
+ const char** category_names_final = R_CAST(const char**, g_bigstack_base);
+ BigstackBaseSet(&(category_names_final[category_names_ct]));
+ memmove(category_names_final, category_names, category_names_ct * sizeof(intptr_t));
+ category_names = category_names_final;
+ }
+ for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+ PhenoCol* pheno_col = &(pheno_cols[pheno_idx]);
+ if (pheno_col->type_code == kPhenoDtypeQt) {
+ if (misc_flags & kfMiscAffection01) {
+ // TODO
+ // If all values are in {0, 1, missing}, convert to {1, 2, missing}
+ }
+ } else if (pheno_col->type_code == kPhenoDtypeCat) {
+ pheno_col->category_names = category_names;
+ }
+ }
+ ;;;
+ }
+ snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
+ reterr = WritePsam(outname, sample_include, siip, &parental_id_info, sex_nm, sex_male, pheno_cols, pheno_names, nullptr, "NA", sample_ct, pheno_ct, max_pheno_name_blen, kfPsamColDefault);
+ if (unlikely(reterr)) {
+ goto MergePsams_ret_1;
+ }
+ logprintfww("--pmerge%s: Merged .psam written to %s .\n", pmip->list_fname? "-list" : "", outname);
+ *sample_ctp = sample_ct;
+ ArenaEndSet(siip->sample_ids, &bigstack_end_mark);
+ }
+ while (0) {
+ MergePsams_ret_NOMEM:
+ reterr = kPglRetNomem;
+ break;
+ MergePsams_ret_TSTREAM_REWIND_FAIL:
+ TextStreamErrPrintRewind(cur_fname, &txs, &reterr);
+ break;
+ MergePsams_ret_TSTREAM_FAIL:
+ TextStreamErrPrint(cur_fname, &txs);
+ break;
+ MergePsams_ret_MISSING_TOKENS:
+ snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, cur_fname);
+ MergePsams_ret_MALFORMED_INPUT_WW:
+ WordWrapB(0);
+ MergePsams_ret_MALFORMED_INPUT_2:
+ logerrputsb();
+ MergePsams_ret_MALFORMED_INPUT:
+ reterr = kPglRetMalformedInput;
+ break;
+ MergePsams_ret_INCONSISTENT_INPUT:
+ reterr = kPglRetInconsistentInput;
+ break;
+ }
+ MergePsams_ret_1:
+ CleanupTextStream2(cur_fname, &txs, &reterr);
+ BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
+ return reterr;
+}
+
+PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, __attribute__((unused)) ChrInfo* cip) {
+ unsigned char* bigstack_mark = g_bigstack_base;
+ unsigned char* bigstack_end_mark = g_bigstack_end;
PglErr reterr = kPglRetSuccess;
{
// 1. Construct/load fileset list.
@@ -81,17 +1226,63 @@ PglErr Pmerge(__attribute__((unused)) const PmergeInfo* pmip, __attribute__((unu
// 4. If filesets cover disjoint positions, handle this as a concatenation
// job (or error out on --variant-inner-join).
// 5. Otherwise, perform general-purpose incremental merge.
- // const PmergeFlags flags = pmip->flags;
- ;;;
+ PmergeInputFilesetLl* filesets = nullptr;
+ PmergeInputFilesetLl** filesets_endp = &filesets;
+ if (pgenname[0]) {
+ PmergeInputFilesetLl* cur_entry = AllocFilesetLlEntry(&filesets_endp);
+ if (unlikely(!cur_entry)) {
+ goto Pmerge_ret_NOMEM;
+ }
+ const uint32_t pgen_fname_blen = strlen(pgenname) + 1;
+ const uint32_t pvar_fname_blen = strlen(pvarname) + 1;
+ const uint32_t psam_fname_blen = strlen(psamname) + 1;
+ char* fname_iter;
+ if (unlikely(bigstack_end_alloc_c(pgen_fname_blen + pvar_fname_blen + psam_fname_blen, &fname_iter))) {
+ goto Pmerge_ret_NOMEM;
+ }
+ cur_entry->pgen_fname = fname_iter;
+ fname_iter = memcpya(fname_iter, pgenname, pgen_fname_blen);
+ cur_entry->pvar_fname = fname_iter;
+ fname_iter = memcpya(fname_iter, pvarname, pvar_fname_blen);
+ cur_entry->psam_fname = fname_iter;
+ memcpy(fname_iter, psamname, psam_fname_blen);
+ }
+ if (!pmip->list_fname) {
+ PmergeInputFilesetLl* cur_entry = AllocFilesetLlEntry(&filesets_endp);
+ if (unlikely(!cur_entry)) {
+ goto Pmerge_ret_NOMEM;
+ }
+ cur_entry->pgen_fname = pmip->pgen_fname;
+ cur_entry->pvar_fname = pmip->pvar_fname;
+ cur_entry->psam_fname = pmip->psam_fname;
+ } else {
+ reterr = LoadPmergeList(pmip->list_fname, pmip->list_mode, pgenname[0] != '\0', &filesets_endp);
+ if (unlikely(reterr)) {
+ goto Pmerge_ret_1;
+ }
+ }
+
+ SampleIdInfo sii;
+ uint32_t sample_ct;
+ reterr = MergePsams(pmip, sample_sort_fname, misc_flags, sample_sort_mode, fam_cols, max_thread_ct, outname, outname_end, filesets, &sii, &sample_ct);
+ if (unlikely(reterr)) {
+ goto Pmerge_ret_1;
+ }
+
+ if (unlikely(reterr)) {
+ goto Pmerge_ret_1;
+ }
logerrputs("Error: --pmerge[-list] is under development.\n");
reterr = kPglRetNotYetSupported;
goto Pmerge_ret_1;
}
while (0) {
-
+ Pmerge_ret_NOMEM:
+ reterr = kPglRetNomem;
+ break;
}
Pmerge_ret_1:
- BigstackReset(bigstack_mark);
+ BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
return reterr;
}
@@ -925,7 +2116,7 @@ PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip,
const uint32_t cur_max_merged_alleles = 1 + cur_allele_ct1 + cur_allele_ct2 - allele1_alt_start - allele2_alt_start;
const uint32_t cur_htable_size = (cur_max_merged_alleles * 9) / 2;
SetAllU32Arr(cur_htable_size, merged_alleles_htable);
- // See e.g PopulateStrboxHtable().
+ // See e.g. PopulateStrboxHtable().
uint32_t hashval = Hashceil(merged_alleles[0], strlen(merged_alleles[0]), cur_htable_size);
merged_alleles_htable[hashval] = 0;
++merged_allele_ct;
@@ -936,27 +2127,18 @@ PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip,
}
const char* cur_allele1 = cur_allele1s[allele1_idx];
const uint32_t cur_allele1_slen = strlen(cur_allele1);
- for (hashval = Hashceil(cur_allele1, cur_allele1_slen, cur_htable_size); ; ) {
- const uint32_t cur_htable_entry = merged_alleles_htable[hashval];
- if (cur_htable_entry == UINT32_MAX) {
- if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
- logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
- reterr = kPglRetNotYetSupported;
- goto PgenDiff_ret_1;
- }
- remap1[allele1_idx] = merged_allele_ct;
- merged_alleles_htable[hashval] = merged_allele_ct;
- merged_alleles[merged_allele_ct] = cur_allele1;
- ++merged_allele_ct;
- break;
- }
- if (memequal(cur_allele1, merged_alleles[cur_htable_entry], cur_allele1_slen + 1)) {
- remap1[allele1_idx] = cur_htable_entry;
- break;
- }
- if (++hashval == cur_htable_size) {
- hashval = 0;
+ const uint32_t cur_idx = IdHtableAdd(cur_allele1, merged_alleles, cur_allele1_slen, cur_htable_size, merged_allele_ct, merged_alleles_htable);
+ if (cur_idx == UINT32_MAX) {
+ if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
+ logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
+ reterr = kPglRetNotYetSupported;
+ goto PgenDiff_ret_1;
}
+ remap1[allele1_idx] = merged_allele_ct;
+ merged_alleles[merged_allele_ct] = cur_allele1;
+ ++merged_allele_ct;
+ } else {
+ remap1[allele1_idx] = cur_idx;
}
}
for (uint32_t allele2_idx = allele2_alt_start; allele2_idx != cur_allele_ct2; ++allele2_idx) {
@@ -965,27 +2147,18 @@ PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip,
}
const char* cur_allele2 = cur_allele2s[allele2_idx];
const uint32_t cur_allele2_slen = strlen(cur_allele2);
- for (hashval = Hashceil(cur_allele2, cur_allele2_slen, cur_htable_size); ; ) {
- const uint32_t cur_htable_entry = merged_alleles_htable[hashval];
- if (cur_htable_entry == UINT32_MAX) {
- if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
- logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
- reterr = kPglRetNotYetSupported;
- goto PgenDiff_ret_1;
- }
- remap2[allele2_idx] = merged_allele_ct;
- merged_alleles_htable[hashval] = merged_allele_ct;
- merged_alleles[merged_allele_ct] = cur_allele2;
- ++merged_allele_ct;
- break;
- }
- if (memequal(cur_allele2, merged_alleles[cur_htable_entry], cur_allele2_slen + 1)) {
- remap2[allele2_idx] = cur_htable_entry;
- break;
- }
- if (++hashval == cur_htable_size) {
- hashval = 0;
+ const uint32_t cur_idx = IdHtableAdd(cur_allele2, merged_alleles, cur_allele2_slen, cur_htable_size, merged_allele_ct, merged_alleles_htable);
+ if (cur_idx == UINT32_MAX) {
+ if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
+ logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
+ reterr = kPglRetNotYetSupported;
+ goto PgenDiff_ret_1;
}
+ remap2[allele2_idx] = merged_allele_ct;
+ merged_alleles[merged_allele_ct] = cur_allele2;
+ ++merged_allele_ct;
+ } else {
+ remap2[allele2_idx] = cur_idx;
}
}
const uint32_t merged_allele_ctl = BitCtToWordCt(merged_allele_ct);
=====================================
plink2_merge.h
=====================================
@@ -71,15 +71,13 @@ ENUM_U31_DEF_START()
kMergeFilterModeNonpassUnion
ENUM_U31_DEF_END(MergeFilterMode);
-// this is a hybrid, only kfSortFileSid is actually a flag
-FLAGSET_DEF_START()
- kfSort0,
- kfSortNone = (1 << 0),
- kfSortNatural = (1 << 1),
- kfSortAscii = (1 << 2),
- kfSortFile = (1 << 3),
- kfSortFileSid = (1 << 4)
-FLAGSET_DEF_END(SortFlags);
+ENUM_U31_DEF_START()
+ kSort0,
+ kSortNone,
+ kSortNatural,
+ kSortAscii,
+ kSortFile
+ENUM_U31_DEF_END(SortMode);
// --pgen-diff based here due to --merge-mode 6/7 history
FLAGSET_DEF_START()
@@ -110,7 +108,8 @@ typedef struct PmergeStruct {
MergeQualInfoMode merge_qual_mode;
MergeFilterMode merge_filter_mode;
MergeQualInfoMode merge_info_mode;
- SortFlags merge_info_sort;
+ SortMode merge_pheno_sort;
+ SortMode merge_info_sort;
uint32_t max_allele_ct;
char* pgen_fname;
char* pvar_fname;
@@ -135,7 +134,7 @@ void InitPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
void CleanupPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
-PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortFlags sample_sort_flags, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
+PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const PgenDiffInfo* pdip, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t max_allele_ct1, uint32_t max_allele_slen, uint32_t max_thread_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
=====================================
plink2_misc.cc
=====================================
@@ -512,7 +512,7 @@ PglErr UpdateVarAlleles(const char* fname, const uintptr_t* variant_include, con
if ((old_allele2_match == UINT32_MAX) || (cur_allele_ct != 2) || (old_slen1 != 1) || (old_allele1_start[0] != '.')) {
UpdateVarAlleles_errfile:
if (!err_ct) {
- strcpy(outname_end, ".allele.no.snp");
+ strcpy_k(outname_end, ".allele.no.snp");
if (fopen_checked(outname, FOPEN_WB, &errfile)) {
goto UpdateVarAlleles_ret_OPEN_FAIL;
}
@@ -1023,7 +1023,7 @@ PglErr RecoverVarIds(const char* fname, const uintptr_t* variant_include, const
logprintf("--recover-var-ids: %" PRIuPTR " line%s scanned.\n", line_idx, (line_idx == 1)? "" : "s");
const uint32_t conflict_ct = PopcountWords(conflict_bitarr, raw_variant_ctl);
if (conflict_ct) {
- strcpy(outname_end, ".recoverid.dup");
+ strcpy_k(outname_end, ".recoverid.dup");
if (fopen_checked(outname, FOPEN_WB, &errfile)) {
goto RecoverVarIds_ret_OPEN_FAIL;
}
@@ -1290,33 +1290,20 @@ PglErr Plink1ClusterImport(const char* within_fname, const char* catpheno_name,
logerrputs("Error: Category names are limited to " MAX_ID_SLEN_STR " characters.\n");
goto Plink1ClusterImport_ret_INCONSISTENT_INPUT;
}
- uint32_t hashval = Hashceil(main_token_start, main_token_slen, cat_htable_size);
- const uint32_t main_token_blen = main_token_slen + 1;
- uint32_t cur_htable_entry;
- while (1) {
- cur_htable_entry = cat_htable[hashval];
- if (cur_htable_entry == UINT32_MAX) {
- if (unlikely(main_token_blen > S_CAST(uintptr_t, cat_name_write_max - cat_name_iter))) {
- goto Plink1ClusterImport_ret_NOMEM;
- }
- char* cat_name_start = cat_name_iter;
- cat_name_iter = memcpya(cat_name_iter, main_token_start, main_token_blen);
- cur_cat_names[++nonnull_cat_ct] = cat_name_start;
- cur_htable_entry = nonnull_cat_ct;
- cat_htable[hashval] = cur_htable_entry;
- break;
- }
- if (memequal(main_token_start, cur_cat_names[cur_htable_entry], main_token_blen)) {
- break;
- }
- if (++hashval == cat_htable_size) {
- hashval = 0;
+ uint32_t cur_cat_idx = IdHtableAdd(main_token_start, cur_cat_names, main_token_slen, cat_htable_size, nonnull_cat_ct + 1, cat_htable);
+ if (cur_cat_idx == UINT32_MAX) {
+ if (unlikely(main_token_slen >= S_CAST(uintptr_t, cat_name_write_max - cat_name_iter))) {
+ goto Plink1ClusterImport_ret_NOMEM;
}
+ char* cat_name_start = cat_name_iter;
+ cat_name_iter = memcpya(cat_name_iter, main_token_start, main_token_slen + 1);
+ cur_cat_idx = nonnull_cat_ct + 1;
+ cur_cat_names[cur_cat_idx] = cat_name_start;
}
// permit duplicates if category is identical
if (IsSet(already_seen, lb_idx)) {
const uint32_t existing_cat_idx = sorted_cat_idxs[lb_idx];
- if (unlikely(existing_cat_idx != cur_htable_entry)) {
+ if (unlikely(existing_cat_idx != cur_cat_idx)) {
idbuf[fid_slen] = ' ';
logpreprintfww("Error: Duplicate sample ID '%s' with conflicting category assignments in --within file.\n", idbuf);
goto Plink1ClusterImport_ret_MALFORMED_INPUT_2;
@@ -1325,7 +1312,7 @@ PglErr Plink1ClusterImport(const char* within_fname, const char* catpheno_name,
} else {
SetBit(lb_idx, already_seen);
for (; lb_idx != ub_idx; ++lb_idx) {
- sorted_cat_idxs[lb_idx] = cur_htable_entry;
+ sorted_cat_idxs[lb_idx] = cur_cat_idx;
}
}
goto Plink1ClusterImport_LINE_ITER_ALREADY_ADVANCED;
@@ -1936,6 +1923,10 @@ PglErr UpdateSampleIds(const char* fname, const uintptr_t* sample_include, uint3
} else {
xid_mode = old_sid_present? kfXidModeIidSid : kfXidModeIid;
}
+ if (new_fid_present) {
+ // bugfix (14 Jan 2021)
+ siip->flags |= kfSampleIdFidPresent;
+ }
const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
const uintptr_t max_sid_blen = siip->max_sid_blen;
uint32_t* xid_map = nullptr;
@@ -9007,19 +8998,9 @@ PglErr WriteCovar(const uintptr_t* sample_include, const PedigreeIdInfo* piip, c
uint32_t max_xcovar_name_blen = max_covar_name_blen;
if (write_sex) {
// add "SEX"
- for (uint32_t hashval = Hashceil("SEX", 3, covar_name_htable_size); ; ) {
- const uint32_t cur_htable_entry = covar_name_htable[hashval];
- if (cur_htable_entry == UINT32_MAX) {
- covar_name_htable[hashval] = covar_ct;
- break;
- }
- if (unlikely(strequal_k_unsafe(&(covar_names[cur_htable_entry * max_covar_name_blen]), "SEX"))) {
- logerrputs("Error: .cov file cannot have both a regular SEX column and a covariate named\n'SEX'. Exclude or rename one of these columns.\n");
- goto WriteCovar_ret_INCONSISTENT_INPUT;
- }
- if (++hashval == covar_name_htable_size) {
- hashval = 0;
- }
+ if (unlikely(StrboxHtableAdd("SEX", covar_names, max_covar_name_blen, strlen("SEX"), covar_name_htable_size, covar_ct, covar_name_htable) != UINT32_MAX)) {
+ logerrputs("Error: .cov file cannot have both a regular SEX column and a covariate named\n'SEX'. Exclude or rename one of these columns.\n");
+ goto WriteCovar_ret_INCONSISTENT_INPUT;
}
if (max_xcovar_name_blen < 4) {
max_xcovar_name_blen = 4;
@@ -9031,6 +9012,8 @@ PglErr WriteCovar(const uintptr_t* sample_include, const PedigreeIdInfo* piip, c
*write_iter++ = '\t';
const uint32_t cur_pheno_name_slen = strlen(pheno_name_iter);
if (cur_pheno_name_slen < max_xcovar_name_blen) {
+ // can't just use StrboxHtableFind() since "SEX" may not be stored
+ // in covar_names[]
const uint32_t cur_pheno_name_blen = cur_pheno_name_slen + 1;
for (uint32_t hashval = Hashceil(pheno_name_iter, cur_pheno_name_slen, covar_name_htable_size); ; ) {
const uint32_t cur_htable_idval = covar_name_htable[hashval];
=====================================
plink2_psam.cc
=====================================
@@ -94,7 +94,7 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
// parse header
// [-1] = #FID (if present, must be first column)
// [0] = IID (could also be first column)
- // [1] = SID
+ // [1] = SID (if present, must immediately follow IID)
// [2] = PAT
// [3] = MAT
// [4] = SEX
@@ -229,7 +229,15 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
}
BigstackBaseSet(ll_alloc_base);
if (unlikely(!(psam_cols_mask & 1))) {
- snprintf(g_logbuf, kLogbufSize, "Error: No IID column on line %" PRIuPTR " of %s.\n", line_idx, psamname);
+ snprintf(g_logbuf, kLogbufSize, "Error: No IID column in %s.\n", psamname);
+ goto LoadPsam_ret_MALFORMED_INPUT_WW;
+ }
+ if (unlikely(col_types[0] != 0)) {
+ snprintf(g_logbuf, kLogbufSize, "Error: IID column is not first or second in %s.\n", psamname);
+ goto LoadPsam_ret_MALFORMED_INPUT_WW;
+ }
+ if (unlikely((psam_cols_mask & 2) && ((col_types[1] != 1) || (col_skips[1] != 1)))) {
+ snprintf(g_logbuf, kLogbufSize, "Error: SID column does not immediately follow IID column in %s.\n", psamname);
goto LoadPsam_ret_MALFORMED_INPUT_WW;
}
if (unlikely(in_interval)) {
@@ -257,7 +265,7 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
col_skips = S_CAST(uint32_t*, bigstack_alloc_raw_rd(relevant_postfid_col_ct * sizeof(int32_t)));
col_types = S_CAST(uint32_t*, bigstack_alloc_raw_rd(relevant_postfid_col_ct * sizeof(int32_t)));
bigstack_mark2 = g_bigstack_base;
- col_skips[0] = fam_cols & 1; // assumes kfFamCol1 == 1
+ col_skips[0] = fid_present;
col_types[0] = 0;
// psam_cols_mask = 1; // may need this later
uint32_t rpf_col_idx = 1;
=====================================
plink2_psam.h
=====================================
@@ -35,8 +35,8 @@ namespace plink2 {
// by a leading '#'. All lines which don't start with '#FID' or '#IID' are
// currently ignored. The #FID/IID line specifies the columns in the .psam
// file; the following column headers are recognized:
-// IID (required)
-// SID
+// IID (required, must be first or second)
+// SID (must immediately follow IID if present)
// PAT
// MAT
// SEX
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/compare/8fb4d070837984696a17af2daf20a5b7ee95bc70...dc83f84b6f540ce18f6c247dc383bc3af3ac7b01
--
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/compare/8fb4d070837984696a17af2daf20a5b7ee95bc70...dc83f84b6f540ce18f6c247dc383bc3af3ac7b01
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210117/5ce2d318/attachment-0001.html>
More information about the debian-med-commit
mailing list