[med-svn] [Git][med-team/plink2][master] 7 commits: debian/watch: assume all upstream versions are prefixed with "2.00~a3-"

Michael R. Crusoe gitlab at salsa.debian.org
Sun Jan 17 11:48:21 GMT 2021



Michael R. Crusoe pushed to branch master at Debian Med / plink2


Commits:
6c3e8051 by Michael R. Crusoe at 2021-01-17T10:00:13+01:00
debian/watch: assume all upstream versions are prefixed with "2.00~a3-"

- - - - -
75e93bdd by Michael R. Crusoe at 2021-01-17T10:00:43+01:00
routine-update: New upstream version

- - - - -
b3bdaa5c by Michael R. Crusoe at 2021-01-17T10:00:44+01:00
New upstream version 2.00~a3-210116+dfsg
- - - - -
4eebee14 by Michael R. Crusoe at 2021-01-17T10:00:46+01:00
Update upstream source from tag 'upstream/2.00_a3-210116+dfsg'

Update to upstream version '2.00~a3-210116+dfsg'
with Debian dir a233588a93ac5443b4ef3402db5e2134f1a40290
- - - - -
77ed2da0 by Michael R. Crusoe at 2021-01-17T10:00:51+01:00
Remove obsolete field Contact from debian/upstream/metadata (already present in machine-readable debian/copyright).

Changes-By: lintian-brush

- - - - -
471ec6e3 by Michael R. Crusoe at 2021-01-17T10:07:46+01:00
debian/patches/baseline: respect Debian's architecture baselines

- - - - -
dc83f84b by Michael R. Crusoe at 2021-01-17T12:46:39+01:00
routine-update: Ready to upload to unstable

- - - - -


21 changed files:

- Makefile
- debian/changelog
- + debian/patches/baseline
- debian/patches/series
- debian/upstream/metadata
- debian/watch
- plink2.cc
- plink2_cmdline.cc
- plink2_cmdline.h
- plink2_common.cc
- plink2_common.h
- plink2_data.cc
- plink2_data.h
- plink2_export.cc
- plink2_help.cc
- plink2_import.cc
- plink2_merge.cc
- plink2_merge.h
- plink2_misc.cc
- plink2_psam.cc
- plink2_psam.h


Changes:

=====================================
Makefile
=====================================
@@ -6,8 +6,8 @@
 
 # Does not currently support -DCPU_CHECK_...
 
-BASEFLAGS=-g -DZSTD_MULTITHREAD -DSTATIC_ZSTD
-# BASEFLAGS=-g -mavx2 -mbmi -mbmi2 -mlzcnt -DZSTD_MULTITHREAD
+# BASEFLAGS=-g -DZSTD_MULTITHREAD -DSTATIC_ZSTD
+BASEFLAGS=-g -mavx2 -mbmi -mbmi2 -mlzcnt -DZSTD_MULTITHREAD
 # BASEFLAGS=-g -msse4.2 -DZSTD_MULTITHREAD
 
 include Makefile.src


=====================================
debian/changelog
=====================================
@@ -1,3 +1,14 @@
+plink2 (2.00~a3-210116+dfsg-1) unstable; urgency=medium
+
+  * Team upload.
+  * debian/watch: assume all upstream versions are prefixed with
+    "2.00~a3-"
+  * Remove obsolete field Contact from debian/upstream/metadata (already present
+    in machine-readable debian/copyright).
+  * debian/patches/baseline: respect Debian's architecture baselines
+
+ -- Michael R. Crusoe <crusoe at debian.org>  Sun, 17 Jan 2021 10:00:51 +0100
+
 plink2 (2.00~a3-210104+dfsg-1) unstable; urgency=medium
 
   * Team upload.


=====================================
debian/patches/baseline
=====================================
@@ -0,0 +1,14 @@
+Author: Michael R. Crusoe <crusoe at debian.org>
+Description: respect the architecture baseline
+Forwarded: not-needed
+--- plink2.orig/Makefile
++++ plink2/Makefile
+@@ -7,7 +7,7 @@
+ # Does not currently support -DCPU_CHECK_...
+ 
+ # BASEFLAGS=-g -DZSTD_MULTITHREAD -DSTATIC_ZSTD
+-BASEFLAGS=-g -mavx2 -mbmi -mbmi2 -mlzcnt -DZSTD_MULTITHREAD
++BASEFLAGS=-g -DZSTD_MULTITHREAD
+ # BASEFLAGS=-g -msse4.2 -DZSTD_MULTITHREAD
+ 
+ include Makefile.src


=====================================
debian/patches/series
=====================================
@@ -1,2 +1,3 @@
 Fix_Makefile.patch
 use_packaged_libdeflate.patch
+baseline


=====================================
debian/upstream/metadata
=====================================
@@ -1,4 +1,3 @@
-Contact: Christopher Chang <chrchang at alumni.caltech.edu>
 Name: plink1.9
 Reference:
  - Author: >


=====================================
debian/watch
=====================================
@@ -1,3 +1,3 @@
 version=4
-opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g;s/(\d\S*)\-//,repack,compression=xz" \
+opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,uversionmangle=s/(\d\S+)/2.00~a3-$1/,repack,compression=xz" \
  https://www.cog-genomics.org/plink/2.0/ /static/bin/plink2_src_(\d\S+)\.zip


=====================================
plink2.cc
=====================================
@@ -71,10 +71,10 @@ static const char ver_str[] = "PLINK v2.00a3"
 #ifdef USE_MKL
   " Intel"
 #endif
-  " (4 Jan 2021)";
+  " (16 Jan 2021)";
 static const char ver_str2[] =
   // include leading space if day < 10, so character length stays the same
-  " "
+  ""
 #ifndef LAPACK_ILP64
   "  "
 #endif
@@ -286,8 +286,8 @@ typedef struct Plink2CmdlineStruct {
 
   Command1Flags command_flags1;
   PvarPsamFlags pvar_psam_flags;
-  SortFlags sample_sort_flags;
-  SortFlags sort_vars_flags;
+  SortMode sample_sort_mode;
+  SortMode sort_vars_mode;
   GrmFlags grm_flags;
   PcaFlags pca_flags;
   WriteCovarFlags write_covar_flags;
@@ -1363,6 +1363,8 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
             if (unlikely(reterr)) {
               goto Plink2Core_ret_1;
             }
+            // bugfix (16 Jan 2021)
+            pii.sii.flags |= kfSampleIdParentsPresent;
           }
         }
         // --update-parents goes here
@@ -2502,11 +2504,11 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
         BigstackReset(bigstack_mark_allele_ddosages);
 
         uint32_t* new_sample_idx_to_old = nullptr;
-        if (pcp->sample_sort_flags & (kfSortNatural | kfSortAscii | kfSortFile)) {
+        if (pcp->sample_sort_mode > kSortNone) {
           if (sample_ct < 2) {
             logerrputs("Warning: Skipping --sample-sort since <2 samples are present.\n");
           } else {
-            if (pcp->sample_sort_flags & kfSortFile) {
+            if (pcp->sample_sort_mode == kSortFile) {
               reterr = SampleSortFileMap(sample_include, &pii.sii, pcp->sample_sort_fname, raw_sample_ct, sample_ct, &new_sample_idx_to_old);
               if (unlikely(reterr)) {
                 goto Plink2Core_ret_1;
@@ -2517,7 +2519,7 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
               // to spare here, so keep the code simpler for now
               char* sorted_xidbox_tmp;
               uintptr_t max_xid_blen;
-              reterr = SortedXidboxInitAlloc(sample_include, &pii.sii, sample_ct, 0, pii.sii.sids? kfXidModeFidIidSid : kfXidModeFidIid, (pcp->sample_sort_flags == kfSortNatural), &sorted_xidbox_tmp, &new_sample_idx_to_old, &max_xid_blen);
+              reterr = SortedXidboxInitAlloc(sample_include, &pii.sii, sample_ct, 0, pii.sii.sids? kfXidModeFidIidSid : kfXidModeFidIid, (pcp->sample_sort_mode == kSortNatural), &sorted_xidbox_tmp, &new_sample_idx_to_old, &max_xid_blen);
               if (unlikely(reterr)) {
                 goto Plink2Core_ret_1;
               }
@@ -2544,8 +2546,8 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
 
         if (pcp->command_flags1 & kfCommand1MakePlink2) {
           // todo: unsorted case (--update-chr, etc.)
-          if (pcp->sort_vars_flags != kfSort0) {
-            reterr = MakePlink2Vsort(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, chr_idxs, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, make_plink2_flags, (pcp->sort_vars_flags == kfSortNatural), pcp->pvar_psam_flags, xheader, &simple_pgr, outname, outname_end);
+          if (pcp->sort_vars_mode > kSortNone) {
+            reterr = MakePlink2Vsort(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, chr_idxs, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, make_plink2_flags, (pcp->sort_vars_mode == kSortNatural), pcp->pvar_psam_flags, xheader, &simple_pgr, outname, outname_end);
           } else {
             if (vpos_sortstatus & kfUnsortedVarBp) {
               logerrputs("Warning: Variants are not sorted by position.  Consider rerunning with the\n--sort-vars flag added to remedy this.\n");
@@ -3530,8 +3532,8 @@ int main(int argc, char** argv) {
     // uint64_t command_flags2 = 0;
     pc.misc_flags = kfMisc0;
     pc.pvar_psam_flags = kfPvarPsam0;
-    pc.sample_sort_flags = kfSort0;
-    pc.sort_vars_flags = kfSort0;
+    pc.sample_sort_mode = kSort0;
+    pc.sort_vars_mode = kSort0;
     pc.grm_flags = kfGrm0;
     pc.pca_flags = kfPca0;
     pc.write_covar_flags = kfWriteCovar0;
@@ -5845,7 +5847,7 @@ int main(int argc, char** argv) {
 
       case 'i':
         if (strequal_k_unsafe(flagname_p2, "ndiv-sort")) {
-          if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 3))) {
+          if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 2))) {
             goto main_ret_INVALID_CMDLINE_2A;
           }
           const char* mode_str = argvk[arg_idx + 1];
@@ -5853,28 +5855,21 @@ int main(int argc, char** argv) {
           const uint32_t mode_slen = strlen(mode_str);
           if (strequal_k(mode_str, "0", mode_slen) ||
               strequal_k(mode_str, "none", mode_slen)) {
-            pc.sample_sort_flags = kfSortNone;
+            pc.sample_sort_mode = kSortNone;
           } else if (((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
                      strequal_k(mode_str, "natural", mode_slen)) {
-            pc.sample_sort_flags = kfSortNatural;
+            pc.sample_sort_mode = kSortNatural;
           } else if (((mode_slen == 1) && (first_char_upcase_match == 'A')) ||
                      strequal_k(mode_str, "ascii", mode_slen)) {
-            pc.sample_sort_flags = kfSortAscii;
+            pc.sample_sort_mode = kSortAscii;
           } else if (likely(((mode_slen == 1) && (first_char_upcase_match == 'F')) ||
                             strequal_k(mode_str, "file", mode_slen))) {
             if (unlikely(param_ct == 1)) {
               snprintf(g_logbuf, kLogbufSize, "Error: Missing '--indiv-sort %s' filename.\n", mode_str);
               goto main_ret_INVALID_CMDLINE_2A;
             }
-            pc.sample_sort_flags = kfSortFile;
-            uint32_t fname_modif_idx = 2;
-            if (param_ct == 3) {
-              if (unlikely(CheckExtraParam(&(argvk[arg_idx]), "sid", &fname_modif_idx))) {
-                goto main_ret_INVALID_CMDLINE_A;
-              }
-              pc.sample_sort_flags |= kfSortFileSid;
-            }
-            reterr = AllocFname(argvk[arg_idx + fname_modif_idx], flagname_p, 0, &pc.sample_sort_fname);
+            pc.sample_sort_mode = kSortFile;
+            reterr = AllocFname(argvk[arg_idx + 2], flagname_p, 0, &pc.sample_sort_fname);
             if (unlikely(reterr)) {
               goto main_ret_1;
             }
@@ -5882,7 +5877,7 @@ int main(int argc, char** argv) {
             snprintf(g_logbuf, kLogbufSize, "Error: '%s' is not a valid mode for --indiv-sort.\n", mode_str);
             goto main_ret_INVALID_CMDLINE_WWA;
           }
-          if (unlikely((param_ct > 1) && (!(pc.sample_sort_flags & kfSortFile)))) {
+          if (unlikely((param_ct > 1) && (pc.sample_sort_mode != kSortFile))) {
             snprintf(g_logbuf, kLogbufSize, "Error: '--indiv-sort %s' does not accept additional arguments.\n", mode_str);
             goto main_ret_INVALID_CMDLINE_2A;
           }
@@ -7844,24 +7839,31 @@ int main(int argc, char** argv) {
             snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-filter-mode argument '%s'.\n", cur_modif);
             goto main_ret_INVALID_CMDLINE_WWA;
           }
-        } else if (strequal_k_unsafe(flagname_p2, "erge-info-sort")) {
+        } else if (strequal_k_unsafe(flagname_p2, "erge-info-sort") ||
+                   strequal_k_unsafe(flagname_p2, "erge-pheno-sort")) {
           if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
             goto main_ret_INVALID_CMDLINE_2A;
           }
           const char* mode_str = argvk[arg_idx + 1];
           const char first_char_upcase_match = mode_str[0] & 0xdf;
           const uint32_t mode_slen = strlen(mode_str);
-          if (strequal_k(mode_str, "0", mode_slen) ||
-              strequal_k(mode_str, "none", mode_slen)) {
-            pmerge_info.merge_info_sort = kfSortNone;
+          SortMode sort_mode = kSortNone;
+          if (((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
+              strequal_k(mode_str, "natural", mode_slen)) {
+            sort_mode = kSortNatural;
           } else if (((mode_slen == 1) && (first_char_upcase_match == 'A')) ||
                      strequal_k(mode_str, "ascii", mode_slen)) {
-            pmerge_info.merge_info_sort = kfSortAscii;
-          } else if (unlikely(!(((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
-                                strequal_k(mode_str, "natural", mode_slen)))) {
-            snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-info-sort argument '%s'.\n", mode_str);
+            sort_mode = kSortAscii;
+          } else if (unlikely(!(strequal_k(mode_str, "0", mode_slen) ||
+                                strequal_k(mode_str, "none", mode_slen)))) {
+            snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s argument '%s'.\n", flagname_p, mode_str);
             goto main_ret_INVALID_CMDLINE_WWA;
           }
+          if (flagname_p2[5] == 'i') {
+            pmerge_info.merge_info_sort = sort_mode;
+          } else {
+            pmerge_info.merge_pheno_sort = sort_mode;
+          }
         } else if (strequal_k_unsafe(flagname_p2, "erge-max-allele-ct")) {
           if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
             goto main_ret_INVALID_CMDLINE_2A;
@@ -8467,7 +8469,6 @@ int main(int argc, char** argv) {
             }
           }
           pc.command_flags1 |= kfCommand1Pmerge;
-          pc.dependency_flags |= kfFilterAllReq;
         } else if (strequal_k_unsafe(flagname_p2, "merge-list")) {
           if (pc.command_flags1 & kfCommand1Pmerge) {
             logerrputs("Error: --pmerge-list cannot be used with --pmerge.\n");
@@ -8518,7 +8519,6 @@ int main(int argc, char** argv) {
             goto main_ret_1;
           }
           pc.command_flags1 |= kfCommand1Pmerge;
-          pc.dependency_flags |= kfFilterAllReq;
         } else if (strequal_k_unsafe(flagname_p2, "gen-diff")) {
           if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 7))) {
             goto main_ret_INVALID_CMDLINE_2A;
@@ -9328,16 +9328,16 @@ int main(int argc, char** argv) {
             const uint32_t mode_slen = strlen(mode_str);
             if (((mode_slen == 1) && (first_char_upcase_match == 'N')) ||
                 strequal_k(mode_str, "natural", mode_slen)) {
-              pc.sort_vars_flags = kfSortNatural;
+              pc.sort_vars_mode = kSortNatural;
             } else if (likely(((mode_slen == 1) && (first_char_upcase_match == 'A')) ||
                               strequal_k(mode_str, "ascii", mode_slen))) {
-              pc.sort_vars_flags = kfSortAscii;
+              pc.sort_vars_mode = kSortAscii;
             } else {
               snprintf(g_logbuf, kLogbufSize, "Error: '%s' is not a valid mode for --sort-vars.\n", mode_str);
               goto main_ret_INVALID_CMDLINE_WWA;
             }
           } else {
-            pc.sort_vars_flags = kfSortNatural;
+            pc.sort_vars_mode = kSortNatural;
           }
         } else if (strequal_k_unsafe(flagname_p2, "ample-diff")) {
           if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 0x7fffffff))) {
@@ -10303,7 +10303,7 @@ int main(int argc, char** argv) {
       logerrputs("Error: --data/--sample cannot be used with --1.\n");
       goto main_ret_INVALID_CMDLINE_A;
     }
-    if (unlikely((pc.sample_sort_flags != kfSort0) && (!(pc.command_flags1 & (kfCommand1MakePlink2 | kfCommand1WriteCovar | kfCommand1Pmerge))))) {
+    if (unlikely((pc.sample_sort_mode != kSort0) && (!(pc.command_flags1 & (kfCommand1MakePlink2 | kfCommand1WriteCovar | kfCommand1Pmerge))))) {
       logerrputs("Error: --indiv-sort must be used with --make-[b]pgen/--make-bed/--write-covar\nor dataset merging.\n");
       goto main_ret_INVALID_CMDLINE_A;
     }
@@ -10332,9 +10332,6 @@ int main(int argc, char** argv) {
         goto main_ret_INVALID_CMDLINE;
       }
     }
-    if ((pc.command_flags1 & kfCommand1Pmerge) && (pc.command_flags1 & (~kfCommand1Pmerge))) {
-      ;;;
-    }
     if (pc.keep_cat_phenoname && (!pc.keep_cat_names_flattened) && (!pc.keep_cats_fname)) {
       logerrputs("Error: --keep-cat-pheno must be used with --keep-cats and/or --keep-cat-names.\n");
     }
@@ -10529,10 +10526,10 @@ int main(int argc, char** argv) {
         // section; otherwise only do it in "--keep-autoconv vzs" case.
         uint32_t pvar_is_compressed;
         if (xload & (kfXloadVcf | kfXloadBcf)) {
-          const uint32_t no_samples_ok = !(pc.dependency_flags & (kfFilterAllReq | kfFilterPsamReq));
+          const uint32_t no_samples_ok = !((pc.dependency_flags & (kfFilterAllReq | kfFilterPsamReq)) || (pc.command_flags1 & kfCommand1Pmerge));
           const uint32_t is_vcf = (xload / kfXloadVcf) & 1;
           pvar_is_compressed = ((import_flags & (kfImportKeepAutoconv | kfImportKeepAutoconvVzs)) != kfImportKeepAutoconv);
-          if (no_samples_ok && is_vcf && (!(import_flags & kfImportKeepAutoconv)) && pc.command_flags1 && (!(pc.command_flags1 & kfCommand1Pmerge))) {
+          if (no_samples_ok && is_vcf && (!(import_flags & kfImportKeepAutoconv)) && pc.command_flags1) {
             // special case: just treat the VCF as a .pvar file
             strcpy(pvarname, pgenname);
             pgenname[0] = '\0';
@@ -10608,7 +10605,12 @@ int main(int argc, char** argv) {
       if ((pc.dependency_flags & kfFilterOpportunisticPgen) && (pgenname[0] != '\0')) {
         pc.dependency_flags |= kfFilterAllReq;
       }
-      if (pc.dependency_flags & kfFilterAllReq) {
+      if (pmerge_info.list_fname) {
+        if (unlikely((!xload) && (load_params != kfLoadParamsPfileAll) && (load_params!= kfLoadParams0))) {
+          logerrputs("Error: --pmerge-list cannot be used with a proper subset of {--pgen, --pvar,\n--psam}.  You must specify either none or all of those filenames.\n");
+          goto main_ret_INVALID_CMDLINE_A;
+        }
+      } else if ((pc.dependency_flags & kfFilterAllReq) || (pc.command_flags1 & kfCommand1Pmerge)) {
         if (unlikely((!xload) && (load_params != kfLoadParamsPfileAll))) {
           logerrputs("Error: A full fileset (.pgen/.bed + .pvar/.bim + .psam/.fam) is required for\nthis.\n");
           goto main_ret_INVALID_CMDLINE_A;
@@ -10642,7 +10644,7 @@ int main(int argc, char** argv) {
         if (make_plink2_flags & (kfMakePgen | kfMakePvar | kfMakePsam)) {
           merge_outname_end = strcpya_k(merge_outname_end, "-merge");
         }
-        reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, pc.sample_sort_flags, pc.fam_cols, pc.max_thread_ct, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
+        reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, pc.sample_sort_mode, pc.fam_cols, pc.max_thread_ct, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
         if (unlikely(reterr)) {
           goto main_ret_1;
         }
@@ -10652,7 +10654,7 @@ int main(int argc, char** argv) {
         }
       }
 
-      if ((pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar | kfCommand1WriteSamples))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (pc.sort_vars_flags == kfSort0))) {
+      if ((pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar | kfCommand1WriteSamples))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (pc.sort_vars_mode > kSortNone))) {
         // split-chromosome prohibited for all commands unless explicitly
         // permitted here
         pc.dependency_flags |= kfFilterNoSplitChr;


=====================================
plink2_cmdline.cc
=====================================
@@ -1393,33 +1393,6 @@ BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t**
   return 0;
 }
 
-uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
-  // may want subset_mask parameter later
-  SetAllU32Arr(str_htable_size, str_htable);
-  const char* strbox_iter = strbox;
-  for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
-    const uint32_t slen = strlen(strbox_iter);
-    // previously used quadratic probing, but turns out that that isn't
-    // meaningfully better than linear probing.
-    for (uint32_t hashval = Hashceil(strbox_iter, slen, str_htable_size); ; ) {
-      const uint32_t cur_htable_entry = str_htable[hashval];
-      if (cur_htable_entry == UINT32_MAX) {
-        str_htable[hashval] = str_idx;
-        break;
-      }
-      if (memequal(strbox_iter, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
-        // guaranteed to be positive
-        return str_idx;
-      }
-      if (++hashval == str_htable_size) {
-        hashval = 0;
-      }
-    }
-    strbox_iter = &(strbox_iter[max_str_blen]);
-  }
-  return 0;
-}
-
 // could merge this with non-subset case, but this isn't much code
 /*
 uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
@@ -1476,6 +1449,38 @@ uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const
   }
 }
 
+uint32_t IdHtableAdd(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+  for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+    const uint32_t cur_htable_entry = id_htable[hashval];
+    if (cur_htable_entry == UINT32_MAX) {
+      id_htable[hashval] = value;
+      return UINT32_MAX;
+    }
+    if (memequal(cur_id, item_ids[cur_htable_entry], cur_id_slen + 1)) {
+      return cur_htable_entry;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
+uint32_t IdHtableAddNnt(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+  for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+    const uint32_t cur_htable_entry = id_htable[hashval];
+    if (cur_htable_entry == UINT32_MAX) {
+      id_htable[hashval] = value;
+      return UINT32_MAX;
+    }
+    if (strequal_unsafe(item_ids[cur_htable_entry], cur_id, cur_id_slen)) {
+      return cur_htable_entry;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
 // assumes cur_id_slen < max_str_blen.
 // requires cur_id to be null-terminated.
 uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size) {
@@ -1491,6 +1496,37 @@ uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t
   }
 }
 
+uint32_t StrboxHtableAdd(const char* cur_id, const char* strbox, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+  for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+    const uint32_t cur_htable_entry = id_htable[hashval];
+    if (cur_htable_entry == UINT32_MAX) {
+      id_htable[hashval] = value;
+      return UINT32_MAX;
+    }
+    if (memequal(cur_id, &(strbox[cur_htable_entry * max_str_blen]), cur_id_slen + 1)) {
+      return cur_htable_entry;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
+uint32_t PopulateStrboxHtable(const char* strbox, uint32_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
+  // may want subset_mask parameter later
+  SetAllU32Arr(str_htable_size, str_htable);
+  const char* strbox_iter = strbox;
+  for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
+    const uint32_t slen = strlen(strbox_iter);
+    if (StrboxHtableAdd(strbox_iter, strbox, max_str_blen, slen, str_htable_size, str_idx, str_htable) != UINT32_MAX) {
+      // guaranteed to be positive
+      return str_idx;
+    }
+    strbox_iter = &(strbox_iter[max_str_blen]);
+  }
+  return 0;
+}
+
 uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen) {
   // assumes duplicate variant IDs are flagged, but full variant_uidx linked
   // lists are not stored
@@ -3071,7 +3107,6 @@ PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* cons
   return PopulateIdHtableMt(g_bigstack_end, subset_mask, item_ids, item_ct, store_all_dups, id_htable_size, max_thread_ct, &g_bigstack_base, *id_htable_ptr, dup_ct_ptr);
 }
 
-
 uint32_t Edit1Match(const char* s1, const char* s2, uint32_t len1, uint32_t len2) {
   // Permit one difference of the following forms (Damerau-Levenshtein distance
   // 1):


=====================================
plink2_cmdline.h
=====================================
@@ -957,10 +957,19 @@ HEADER_INLINE BoolErr arena_alloc_cp(unsigned char* arena_top, uintptr_t ct, uns
   return !(*cp_arr_ptr);
 }
 
+HEADER_INLINE BoolErr arena_alloc_kcp(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, const char*** kcp_arr_ptr) {
+  *kcp_arr_ptr = S_CAST(const char**, arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr));
+  return !(*kcp_arr_ptr);
+}
+
 HEADER_INLINE void ArenaBaseSet(const void* unaligned_base, unsigned char** arena_bottom_ptr) {
   *arena_bottom_ptr = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, unaligned_base), kCacheline));
 }
 
+HEADER_INLINE void ArenaEndSet(const void* unaligned_end, unsigned char** arena_top_ptr) {
+  *arena_top_ptr = R_CAST(unsigned char*, RoundDownPow2(R_CAST(uintptr_t, unaligned_end), kEndAllocAlign));
+}
+
 HEADER_INLINE void* arena_end_alloc_raw(uintptr_t size, unsigned char** arena_top_ptr) {
   assert(!(size % kEndAllocAlign));
   unsigned char* alloc_ptr = *arena_top_ptr;
@@ -1028,6 +1037,11 @@ HEADER_INLINE BoolErr arena_end_alloc_cp(unsigned char* arena_bottom, uintptr_t
   return !(*cp_arr_ptr);
 }
 
+HEADER_INLINE BoolErr arena_end_alloc_kcp(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, const char*** kcp_arr_ptr) {
+  *kcp_arr_ptr = S_CAST(const char**, arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr));
+  return !(*kcp_arr_ptr);
+}
+
 
 BoolErr PushLlStr(const char* str, LlStr** ll_stack_ptr);
 
@@ -1337,6 +1351,29 @@ HEADER_INLINE uint32_t Hashceil(const char* idstr, uint32_t idlen, uint32_t htab
   return (S_CAST(uint64_t, Hash32(idstr, idlen)) * htable_size) >> 32;
 }
 
+// In most cases, plink2 represents an array of strings in one of the following
+// two ways:
+//   (const char* strbox, uint32_t str_ct, uintptr_t max_str_blen):
+//     null-terminated string #x starts at &(strbox[x * max_str_blen)), where
+//     x is a 0-based index.
+//   (const char* const* item_ids, uint32_t str_ct):
+//     null-terminated string #x starts at item_ids[x].
+// When we need to perform string -> string-index lookups into the array, we
+// construct a hashmap as follows:
+// - Allocate a uint32_t array of htable_size >= 2 * str_ct, and initialize all
+//   entries to UINT32_MAX to mark them empty.
+// - Iterate through the array, computing hashval := Hashceil(str, strlen(str),
+//   htable_size) for each string, and then setting htable[hashval] :=
+//   string-index whenever that htable entry is empty.  When there is a
+//   conflict, use linear probing (increment hashval until an empty table cell
+//   is found, wrapping around from (htable_size - 1) to 0).
+//   - PLINK 1.9 used quadratic probing, but that's been scrapped since
+//     benchmark results suggest that it has no meaningful advantage.  We
+//     aren't dealing with adversarial input...
+// The "StrboxHtable" functions below work with the const char* strbox
+// string-array representation, while "IdHtable" functions work with the const
+// char* const* item_ids representation.
+
 // uintptr_t geqprime(uintptr_t floor);
 
 // assumes ceil is odd and greater than 4.  Returns the first prime <= ceil.
@@ -1358,10 +1395,6 @@ HEADER_INLINE uint32_t GetHtableFastSize(uint32_t item_ct) {
 
 BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr);
 
-// useful for duplicate detection: returns 0 on no duplicates, a positive index
-// of a duplicate pair if they're present
-uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
-
 // returned index in duplicate-pair case is unfiltered
 // uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
 
@@ -1372,10 +1405,41 @@ uint32_t IdHtableFind(const char* cur_id, const char* const* item_ids, const uin
 // null-terminated any more.
 uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size);
 
+HEADER_INLINE void HtableAddNondup(const char* cur_id, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
+  for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+    const uint32_t cur_htable_entry = id_htable[hashval];
+    if (cur_htable_entry == UINT32_MAX) {
+      id_htable[hashval] = value;
+      return;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
+// Assumes cur_id is null-terminated.
+// item_ids overread must be ok.
+// Returns string-index if cur_id is already in the table, UINT32_MAX if it was
+// added.
+uint32_t IdHtableAdd(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable);
+
+// Does not require cur_id to be null-terminated.
+uint32_t IdHtableAddNnt(const char* cur_id, const char* const* item_ids, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable);
+
 // assumes cur_id_slen < max_str_blen.
 // requires cur_id to be null-terminated.
 uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size);
 
+// Assumes cur_id is null-terminated.
+// Returns string-index if cur_id is already in the table, UINT32_MAX if it was
+// added.
+uint32_t StrboxHtableAdd(const char* cur_id, const char* strbox, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable);
+
+// useful for duplicate detection: returns 0 on no duplicates, a positive index
+// of a duplicate pair if they're present
+uint32_t PopulateStrboxHtable(const char* strbox, uint32_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
+
 // last variant_ids entry must be at least kMaxIdBlen bytes before end of
 // bigstack
 uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen);
@@ -1703,7 +1767,6 @@ PglErr CheckIdUniqueness(unsigned char* arena_bottom, unsigned char* arena_top,
 // not doing that again).
 PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uintptr_t fast_size_min_extra_bytes, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr, uint32_t* dup_ct_ptr);
 
-
 typedef struct HelpCtrlStruct {
   NONCOPYABLE(HelpCtrlStruct);
   uint32_t iters_left;


=====================================
plink2_common.cc
=====================================
@@ -1794,15 +1794,8 @@ PglErr TryToAddChrName(const char* chr_name, const char* file_descrip, uintptr_t
   *chr_idx_ptr = chr_code_end;
   cip->name_ct = name_ct + 1;
   uint32_t* id_htable = cip->nonstd_id_htable;
-  for (uint32_t hashval = Hashceil(chr_name, name_slen, kChrHtableSize); ; ) {
-    if (id_htable[hashval] == UINT32_MAX) {
-      id_htable[hashval] = chr_code_end;
-      return kPglRetSuccess;
-    }
-    if (++hashval == kChrHtableSize) {
-      hashval = 0;
-    }
-  }
+  HtableAddNondup(chr_name, name_slen, kChrHtableSize, chr_code_end, id_htable);
+  return kPglRetSuccess;
 }
 
 
@@ -3037,6 +3030,115 @@ void ExpandMhc(uint32_t sample_ct, uintptr_t* mhc, uintptr_t** patch_01_set_ptr,
   *patch_10_vals_ptr = R_CAST(AlleleCode*, &(patch_10_set[sample_ctl]));
 }
 
+// Returns 2 if resize needed (str_ct == strset_table_size / 2), 1 if OOM
+uint32_t StrsetAdd(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_bottom_ptr) {
+  for (uint32_t hashval = Hashceil(src, slen, strset_table_size); ; ) {
+    char* strset_entry = strset[hashval];
+    if (!strset_entry) {
+      const uint32_t str_ct = 1 + (*str_ctp);
+      if (unlikely(str_ct * 2 > strset_table_size)) {
+        return 2;
+      }
+      *str_ctp = str_ct;
+      return S_CAST(uint32_t, StoreStringAtBase(arena_top, src, slen, arena_bottom_ptr, &(strset[hashval])));
+    }
+    if (strequal_unsafe(strset_entry, src, slen)) {
+      return 0;
+    }
+    if (++hashval == strset_table_size) {
+      hashval = 0;
+    }
+  }
+}
+
+uint32_t StrsetAddEnd(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_top_ptr) {
+  for (uint32_t hashval = Hashceil(src, slen, strset_table_size); ; ) {
+    char* strset_entry = strset[hashval];
+    if (!strset_entry) {
+      const uint32_t str_ct = 1 + (*str_ctp);
+      if (unlikely(str_ct * 2 > strset_table_size)) {
+        return 2;
+      }
+      *str_ctp = str_ct;
+      return S_CAST(uint32_t, StoreStringAtEnd(arena_bottom, src, slen, arena_top_ptr, &(strset[hashval])));
+    }
+    if (strequal_unsafe(strset_entry, src, slen)) {
+      return 0;
+    }
+    if (++hashval == strset_table_size) {
+      hashval = 0;
+    }
+  }
+}
+
+void RepopulateStrset(char* str_iter, uint32_t str_ct, uint32_t strset_size, char** strset) {
+  ZeroPtrArr(strset_size, strset);
+  for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
+    char* str_end = strnul(str_iter);
+    const uint32_t slen = str_end - str_iter;
+    for (uint32_t hashval = Hashceil(str_iter, slen, strset_size); ; ) {
+      char* strset_entry = strset[hashval];
+      if (!strset_entry) {
+        strset[hashval] = str_iter;
+        break;
+      }
+      if (++hashval == strset_size) {
+        hashval = 0;
+      }
+    }
+    str_iter = &(str_end[1]);
+  }
+}
+
+BoolErr StrsetAddResize(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size_max, char** strset, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_bottom_ptr) {
+  uint32_t strset_table_size = *strset_table_sizep;
+  uint32_t retval = StrsetAdd(arena_top, src, slen, strset_table_size, strset, str_ctp, arena_bottom_ptr);
+  if (!retval) {
+    return 0;
+  }
+  if (unlikely((retval == 1) || (strset_table_size == strset_table_size_max))) {
+    return 1;
+  }
+  const uint32_t new_table_size = MINV(strset_table_size * 2LLU, strset_table_size_max);
+  const uint64_t bytes_needed = sizeof(intptr_t) * (new_table_size - strset_table_size);
+  if (S_CAST(uintptr_t, arena_top - (*arena_bottom_ptr)) < bytes_needed) {
+    return 1;
+  }
+  unsigned char* old_str_base = R_CAST(unsigned char*, &(strset[strset_table_size]));
+  unsigned char* new_str_base = R_CAST(unsigned char*, &(strset[new_table_size]));
+  memmove(new_str_base, old_str_base, (*arena_bottom_ptr) - old_str_base);
+  *arena_bottom_ptr += bytes_needed;
+  RepopulateStrset(R_CAST(char*, new_str_base), *str_ctp, new_table_size, strset);
+  *strset_table_sizep = new_table_size;
+  return (StrsetAdd(arena_top, src, slen, new_table_size, strset, str_ctp, arena_bottom_ptr) != 0);
+}
+
+BoolErr StrsetAddEndResize(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size_max, char*** strsetp, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_top_ptr) {
+  char** strset = *strsetp;
+  uint32_t strset_table_size = *strset_table_sizep;
+  uint32_t retval = StrsetAddEnd(arena_bottom, src, slen, strset_table_size, strset, str_ctp, arena_top_ptr);
+  if (!retval) {
+    return 0;
+  }
+  if (unlikely((retval == 1) || (strset_table_size == strset_table_size_max))) {
+    return 1;
+  }
+  const uint32_t new_table_size = MINV(strset_table_size * 2LLU, strset_table_size_max);
+  const uint64_t bytes_needed = sizeof(intptr_t) * (new_table_size - strset_table_size);
+  unsigned char* old_str_base = *arena_top_ptr;
+  if (unlikely(S_CAST(uintptr_t, old_str_base - arena_bottom) < bytes_needed)) {
+    return 1;
+  }
+  unsigned char* new_str_base = old_str_base - bytes_needed;
+  memmove(new_str_base, old_str_base, R_CAST(unsigned char*, strset) - old_str_base);
+  *arena_top_ptr = new_str_base;
+  strset -= new_table_size - strset_table_size;
+  *strsetp = strset;
+  RepopulateStrset(R_CAST(char*, new_str_base), *str_ctp, new_table_size, strset);
+  *strset_table_sizep = new_table_size;
+  return (StrsetAddEnd(arena_bottom, src, slen, new_table_size, strset, str_ctp, arena_top_ptr) != 0);
+}
+
 PglErr WriteSampleIdsOverride(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct, SampleIdFlags override_flags) {
   FILE* outfile = nullptr;
   PglErr reterr = kPglRetSuccess;


=====================================
plink2_common.h
=====================================
@@ -226,6 +226,7 @@ FLAGSET_DEF_START()
   kfPsamColAll = ((kfPsamColPhenos * 2) - kfPsamColMaybefid)
 FLAGSET_DEF_END(PvarPsamFlags);
 
+// may want to rename FidPresent to FidMayBePresent
 FLAGSET_DEF_START()
   kfSampleId0,
   kfSampleIdFidPresent = (1 << 0),
@@ -985,6 +986,8 @@ typedef struct PhenoColStruct {
   // * When .sample non-V2 categorical variables are imported, 'C' is added in
   //   front of the integers.
   // * category_names[1..(n-1)] is guaranteed to be in natural-sorted order.
+  // (MergePsams() has slightly different behavior since its pheno_cols
+  // instance is only used for a one-time internal write.)
   const char** category_names;
 
   uintptr_t* nonmiss;  // bitvector
@@ -1109,8 +1112,25 @@ HEADER_INLINE BoolErr StoreStringAndPrecharAtEnd(unsigned char* arena_bottom, co
   return 0;
 }
 
+// Memory layout:
+//   [       strset       ][packed strings]
+//                                         ^
+//                                         |
+//                                   arena_bottom
+// strset pointers do not remain valid after resize.
+
+// returns 1 if resize needed, 2 if OOM
+// uint32_t StrsetAdd(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_bottom_ptr);
+
+// uint32_t StrsetAddEnd(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size, char** strset, uint32_t* str_ctp, unsigned char** arena_top_ptr);
+
+BoolErr StrsetAddResize(unsigned char* arena_top, const char* src, uint32_t slen, uint32_t strset_table_size_max, char** strset, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_bottom_ptr);
+
+BoolErr StrsetAddEndResize(unsigned char* arena_bottom, const char* src, uint32_t slen, uint32_t strset_table_size_max, char*** strsetp, uint32_t* strset_table_sizep, uint32_t* str_ctp, unsigned char** arena_top_ptr);
+
 // These use g_textbuf.
 PglErr WriteSampleIdsOverride(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct, SampleIdFlags override_flags);
+
 HEADER_INLINE PglErr WriteSampleIds(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct) {
   return WriteSampleIdsOverride(sample_include, siip, outname, sample_ct, siip->flags);
 }


=====================================
plink2_data.cc
=====================================
@@ -928,17 +928,17 @@ uint32_t DataSidColIsRequired(const uintptr_t* sample_include, const char* sids,
   return 0;
 }
 
-uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const PedigreeIdInfo* piip, uint32_t sample_ct, uint32_t maybe_modifier) {
+uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, uint32_t sample_ct, uint32_t maybe_modifier) {
   if (maybe_modifier & 2) {
     return 1;
   }
-  if ((!(maybe_modifier & 1)) || (!(piip->sii.flags & kfSampleIdParentsPresent))) {
+  if ((!(maybe_modifier & 1)) || (!(siip->flags & kfSampleIdParentsPresent))) {
     return 0;
   }
-  const char* paternal_ids = piip->parental_id_info.paternal_ids;
-  const char* maternal_ids = piip->parental_id_info.maternal_ids;
-  const uintptr_t max_paternal_id_blen = piip->parental_id_info.max_paternal_id_blen;
-  const uintptr_t max_maternal_id_blen = piip->parental_id_info.max_maternal_id_blen;
+  const char* paternal_ids = parental_id_infop->paternal_ids;
+  const char* maternal_ids = parental_id_infop->maternal_ids;
+  const uintptr_t max_paternal_id_blen = parental_id_infop->max_paternal_id_blen;
+  const uintptr_t max_maternal_id_blen = parental_id_infop->max_maternal_id_blen;
   uintptr_t sample_uidx_base = 0;
   uintptr_t cur_bits = sample_include[0];
   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
@@ -966,29 +966,28 @@ char* AppendPhenoStr(const PhenoCol* pheno_col, const char* output_missing_pheno
   return write_iter;
 }
 
-PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, PvarPsamFlags pvar_psam_flags) {
+PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const char* output_missing_pheno, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, PvarPsamFlags pvar_psam_flags) {
   FILE* outfile = nullptr;
   PglErr reterr = kPglRetSuccess;
   {
     if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
       goto WritePsam_ret_OPEN_FAIL;
     }
-    const char* output_missing_pheno = g_output_missing_pheno;
     const uint32_t omp_slen = strlen(output_missing_pheno);
 
     char* textbuf_flush = &(g_textbuf[kMaxMediumLine]);
 
-    const char* sample_ids = piip->sii.sample_ids;
-    const char* sids = piip->sii.sids;
-    const char* paternal_ids = piip->parental_id_info.paternal_ids;
-    const char* maternal_ids = piip->parental_id_info.maternal_ids;
-    const uintptr_t max_sample_id_blen = piip->sii.max_sample_id_blen;
-    const uintptr_t max_sid_blen = piip->sii.max_sid_blen;
-    const uintptr_t max_paternal_id_blen = piip->parental_id_info.max_paternal_id_blen;
-    const uintptr_t max_maternal_id_blen = piip->parental_id_info.max_maternal_id_blen;
-    const uint32_t write_fid = DataFidColIsRequired(sample_include, &(piip->sii), sample_ct, pvar_psam_flags / kfPsamColMaybefid);
+    const char* sample_ids = siip->sample_ids;
+    const char* sids = siip->sids;
+    const char* paternal_ids = parental_id_infop->paternal_ids;
+    const char* maternal_ids = parental_id_infop->maternal_ids;
+    const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
+    const uintptr_t max_sid_blen = siip->max_sid_blen;
+    const uintptr_t max_paternal_id_blen = parental_id_infop->max_paternal_id_blen;
+    const uintptr_t max_maternal_id_blen = parental_id_infop->max_maternal_id_blen;
+    const uint32_t write_fid = DataFidColIsRequired(sample_include, siip, sample_ct, pvar_psam_flags / kfPsamColMaybefid);
     const uint32_t write_sid = DataSidColIsRequired(sample_include, sids, sample_ct, max_sid_blen, pvar_psam_flags / kfPsamColMaybesid);
-    const uint32_t write_parents = DataParentalColsAreRequired(sample_include, piip, sample_ct, pvar_psam_flags / kfPsamColMaybeparents);
+    const uint32_t write_parents = DataParentalColsAreRequired(sample_include, siip, parental_id_infop, sample_ct, pvar_psam_flags / kfPsamColMaybeparents);
     const uint32_t write_sex = (pvar_psam_flags / kfPsamColSex) & 1;
     const uint32_t write_empty_pheno = (pvar_psam_flags & kfPsamColPheno1) && (!pheno_ct);
     const uint32_t write_phenos = (pvar_psam_flags & (kfPsamColPheno1 | kfPsamColPhenos)) && pheno_ct;
@@ -4184,6 +4183,7 @@ PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filte
     // while still being small relative to L1 cache.  Double table size
     // whenever load factor reaches 0.25; there shouldn't be *that* many
     // distinct filters.
+    // TODO: check if this should use strset instead
     // possible todo: multithread this scan, merge results at the end; can also
     // separate this stage from the rest of the function.
     uint32_t table_size = 128;
@@ -4254,7 +4254,7 @@ PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filte
               }
               break;
             }
-            if ((!memcmp(filter_iter, cur_token_ptr, cur_id_slen)) && (!cur_token_ptr[cur_id_slen])) {
+            if (strequal_unsafe(filter_iter, cur_token_ptr, cur_id_slen)) {
               break;
             }
             if (++hashval == table_size) {
@@ -4292,14 +4292,7 @@ PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filte
     uint32_t* filter_keys_htable = *filter_keys_htable_ptr;
     SetAllU32Arr(filter_keys_htable_size, filter_keys_htable);
     for (uint32_t uii = 0; uii != filter_key_ct; ++uii) {
-      for (uint32_t hashval = Hashceil(filter_keys[uii], strlen(filter_keys[uii]), filter_keys_htable_size); ; ) {
-        if (filter_keys_htable[hashval] == UINT32_MAX) {
-          filter_keys_htable[hashval] = uii;
-        }
-        if (++hashval == filter_keys_htable_size) {
-          hashval = 0;
-        }
-      }
+      HtableAddNondup(filter_keys[uii], strlen(filter_keys[uii]), filter_keys_htable_size, uii, filter_keys_htable);
     }
   }
   while (0) {
@@ -6765,7 +6758,7 @@ PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo*
       snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
       logprintfww5("Writing %s ... ", outname);
       fflush(stdout);
-      reterr = WritePsam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
+      reterr = WritePsam(outname, sample_include, &(piip->sii), &(piip->parental_id_info), sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, g_output_missing_pheno, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
       if (unlikely(reterr)) {
         goto MakePlink2NoVsort_ret_1;
       }
@@ -8218,7 +8211,7 @@ PglErr MakePlink2Vsort(const uintptr_t* sample_include, const PedigreeIdInfo* pi
       snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
       logprintfww5("Writing %s ... ", outname);
       fflush(stdout);
-      reterr = WritePsam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
+      reterr = WritePsam(outname, sample_include, &(piip->sii), &(piip->parental_id_info), sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, g_output_missing_pheno, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
       if (unlikely(reterr)) {
         goto MakePlink2Vsort_ret_1;
       }


=====================================
plink2_data.h
=====================================
@@ -63,6 +63,8 @@ CONSTI32(kMaxInfoKeySlen, kMaxIdSlen);
 
 PglErr WriteMapOrBim(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const double* variant_cms, uint32_t variant_ct, uint32_t max_allele_slen, char delim, uint32_t output_zst, uint32_t thread_ct);
 
+PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const char* output_missing_pheno, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, PvarPsamFlags pvar_psam_flags);
+
 PglErr PvarInfoOpenAndReloadHeader(const char* pvar_info_reload, uint32_t max_thread_ct, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* info_col_idx_ptr);
 
 PglErr PvarInfoReload(uint32_t info_col_idx, uint32_t variant_uidx, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* trs_variant_uidx_ptr);
@@ -83,7 +85,7 @@ uint32_t DataFidColIsRequired(const uintptr_t* sample_include, const SampleIdInf
 
 uint32_t DataSidColIsRequired(const uintptr_t* sample_include, const char* sids, uint32_t sample_ct, uint32_t max_sid_blen, uint32_t maybe_modifier);
 
-uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const PedigreeIdInfo* piip, uint32_t sample_ct, uint32_t maybe_modifier);
+uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const SampleIdInfo* siip, const ParentalIdInfo* parental_id_infop, uint32_t sample_ct, uint32_t maybe_modifier);
 
 char* AppendPhenoStr(const PhenoCol* pheno_col, const char* output_missing_pheno, uint32_t omp_slen, uint32_t sample_uidx, char* write_iter);
 


=====================================
plink2_export.cc
=====================================
@@ -3845,7 +3845,7 @@ PglErr ExportOxSampleV2(const char* outname, const uintptr_t* sample_include, co
       }
     }
     // possible todo: support column sets here
-    const uint32_t write_parents = DataParentalColsAreRequired(sample_include, piip, sample_ct, 1);
+    const uint32_t write_parents = DataParentalColsAreRequired(sample_include, &(piip->sii), &(piip->parental_id_info), sample_ct, 1);
     if (write_parents) {
       writebuf_size += max_paternal_id_blen + max_maternal_id_blen;
     }
@@ -6169,48 +6169,40 @@ PglErr ExportVcf(const uintptr_t* sample_include, const uint32_t* sample_include
 }
 
 PglErr AddToFifHtable(unsigned char* arena_bottom, const char* key, uint32_t htable_size, uint32_t key_slen, unsigned char prechar, unsigned char** arena_top_ptr, char** keys, uint32_t* htable, uint32_t* key_ct_ptr, uint32_t* cur_idx_ptr) {
-  for (uint32_t hashval = Hashceil(key, key_slen, htable_size); ; ) {
-    uint32_t cur_idx = htable[hashval];
-    if (cur_idx == UINT32_MAX) {
-      const uint32_t key_ct = *key_ct_ptr;
-      if (StoreStringAndPrecharAtEnd(arena_bottom, key, prechar, key_slen, arena_top_ptr, &(keys[key_ct]))) {
-        return kPglRetNomem;
-      }
-      *cur_idx_ptr = key_ct;
-      htable[hashval] = key_ct;
-      *key_ct_ptr = key_ct + 1;
-      return kPglRetSuccess;
-    }
-    char* existing_key = keys[cur_idx];
-    if (strequal_unsafe(existing_key, key, key_slen)) {
-      // Only permit this if previous instance was FILTER and this is INFO, or
-      // vice versa, or this is a FORMAT key.
-      // INFO entries always have prechar bit 0 set and 3 unset, while FILTER
-      // entries are the reverse.  FORMAT keys have prechar == 0.
-      const unsigned char new_prechar = S_CAST(unsigned char, existing_key[-1]) ^ prechar;
-      if (unlikely(prechar && ((new_prechar & 9) != 9))) {
-        char* write_iter = strcpya_k(g_logbuf, "Error: Multiple ");
-        if (prechar & 8) {
-          write_iter = strcpya_k(write_iter, "FILTER");
-        } else {
-          write_iter = strcpya_k(write_iter, "INFO");
-        }
-        *write_iter++ = ':';
-        write_iter = memcpya(write_iter, key, key_slen);
-        strcpy_k(write_iter, " header lines.\n");
-        WordWrapB(0);
-        logerrputsb();
-        return kPglRetMalformedInput;
-      }
-      existing_key[-1] = new_prechar;
-      // bugfix (3 Jan 2021): forgot to return this
-      *cur_idx_ptr = cur_idx;
-      return kPglRetSuccess;
-    }
-    if (++hashval == htable_size) {
-      hashval = 0;
+  const uint32_t key_ct = *key_ct_ptr;
+  const uint32_t cur_idx = IdHtableAddNnt(key, TO_CONSTCPCONSTP(keys), key_slen, htable_size, key_ct, htable);
+  if (cur_idx == UINT32_MAX) {
+    if (StoreStringAndPrecharAtEnd(arena_bottom, key, prechar, key_slen, arena_top_ptr, &(keys[key_ct]))) {
+      return kPglRetNomem;
+    }
+    *cur_idx_ptr = key_ct;
+    *key_ct_ptr = key_ct + 1;
+    return kPglRetSuccess;
+  }
+  // Key already present.  Only permit this if previous instance was FILTER and
+  // this is INFO, or vice versa, or this is a FORMAT key.
+  // INFO entries always have prechar bit 0 set and 3 unset, while FILTER
+  // entries are the reverse.  FORMAT keys have prechar == 0.
+  char* existing_key = keys[cur_idx];
+  const unsigned char new_prechar = S_CAST(unsigned char, existing_key[-1]) ^ prechar;
+  if (unlikely(prechar && ((new_prechar & 9) != 9))) {
+    char* write_iter = strcpya_k(g_logbuf, "Error: Multiple ");
+    if (prechar & 8) {
+      write_iter = strcpya_k(write_iter, "FILTER");
+    } else {
+      write_iter = strcpya_k(write_iter, "INFO");
     }
+    *write_iter++ = ':';
+    write_iter = memcpya(write_iter, key, key_slen);
+    strcpy_k(write_iter, " header lines.\n");
+    WordWrapB(0);
+    logerrputsb();
+    return kPglRetMalformedInput;
   }
+  existing_key[-1] = new_prechar;
+  // bugfix (3 Jan 2021): forgot to return this
+  *cur_idx_ptr = cur_idx;
+  return kPglRetSuccess;
 }
 
 // Assumes ii is not in 0x80000000..0x80000007.


=====================================
plink2_help.cc
=====================================
@@ -2145,8 +2145,9 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
 "                                            (--merge-filter-mode default, not\n"
 "                                            applicable to others)\n"
               );
-    HelpPrint("pmerge\0pmerge-list\0merge-info-sort\0", &help_ctrl, 0,
-"  --merge-info-sort <mode> : Set sort order for INFO entries when merging.\n"
+    HelpPrint("pmerge\0pmerge-list\0merge-pheno-sort\0merge-info-sort\0", &help_ctrl, 0,
+"  --merge-pheno-sort <m>   : Set sort order for phenotype columns and INFO\n"
+"  --merge-info-sort <mode>   entries when merging.\n"
 "                             * 'none'/'0' = keep in loaded order (default)\n"
 "                             * 'ascii'/'a' = ASCII order\n"
 "                             * 'natural'/'n' = natural sort\n"


=====================================
plink2_import.cc
=====================================
@@ -14811,7 +14811,7 @@ PglErr Plink1DosageToPgen(const char* dosagename, const char* famname, const cha
     if (write_sid) {
       write_iter = strcpya_k(write_iter, "\tSID");
     }
-    const uint32_t write_parents = DataParentalColsAreRequired(sample_include, &pii, sample_ct, 1);
+    const uint32_t write_parents = DataParentalColsAreRequired(sample_include, &pii.sii, &pii.parental_id_info, sample_ct, 1);
     if (write_parents) {
       write_iter = strcpya_k(write_iter, "\tPAT\tMAT");
     }


=====================================
plink2_merge.cc
=====================================
@@ -17,6 +17,7 @@
 
 #include "include/pgenlib_write.h"
 #include "plink2_compress_stream.h"
+#include "plink2_data.h"
 #include "plink2_merge.h"
 #include "plink2_psam.h"
 #include "plink2_pvar.h"
@@ -34,7 +35,8 @@ void InitPmerge(PmergeInfo* pmerge_info_ptr) {
   pmerge_info_ptr->merge_qual_mode = kMergeQualInfoModeNmFirst;
   pmerge_info_ptr->merge_filter_mode = kMergeFilterModeNmFirst;
   pmerge_info_ptr->merge_info_mode = kMergeQualInfoModeNmFirst;
-  pmerge_info_ptr->merge_info_sort = kfSortNone;
+  pmerge_info_ptr->merge_pheno_sort = kSortNone;
+  pmerge_info_ptr->merge_info_sort = kSortNone;
   pmerge_info_ptr->max_allele_ct = 0;
   pmerge_info_ptr->pgen_fname = nullptr;
   pmerge_info_ptr->pvar_fname = nullptr;
@@ -63,14 +65,1157 @@ void CleanupPgenDiff(PgenDiffInfo* pgen_diff_info_ptr) {
   free_cond(pgen_diff_info_ptr->psam_fname);
 }
 
-typedef struct PmergeInputFilesetStruct {
+typedef struct PmergeInputFilesetLlStruct {
+  struct PmergeInputFilesetLlStruct* next;
   char* pgen_fname;
   char* pvar_fname;
   char* psam_fname;
-} PmergeInputFileset;
+  uint32_t variant_ct;
+  uintptr_t grand_allele_ct;
+} PmergeInputFilesetLl;
 
-PglErr Pmerge(__attribute__((unused)) const PmergeInfo* pmip, __attribute__((unused)) const char* sample_sort_fname, __attribute__((unused)) MiscFlags misc_flags, __attribute__((unused)) SortFlags sample_sort_flags, __attribute__((unused)) FamCol fam_cols, __attribute__((unused)) uint32_t max_thread_ct, __attribute__((unused)) char* pgenname, __attribute__((unused)) char* psamname, __attribute__((unused)) char* pvarname, __attribute__((unused)) char* outname, __attribute__((unused)) char* outname_end, __attribute__((unused)) ChrInfo* cip) {
+// Allocates at end of bigstack.
+PmergeInputFilesetLl* AllocFilesetLlEntry(PmergeInputFilesetLl*** filesets_endpp) {
+  const uintptr_t alloc_size = RoundUpPow2(sizeof(PmergeInputFilesetLl), kEndAllocAlign);
+  if (unlikely(S_CAST(uintptr_t, g_bigstack_end - g_bigstack_base) < alloc_size)) {
+    return nullptr;
+  }
+  g_bigstack_end -= alloc_size;
+  PmergeInputFilesetLl* new_entry = R_CAST(PmergeInputFilesetLl*, g_bigstack_end);
+  new_entry->next = nullptr;
+  **filesets_endpp = new_entry;
+  *filesets_endpp = &(new_entry->next);
+  return new_entry;
+}
+
+PglErr LoadPmergeList(const char* list_fname, PmergeListMode mode, uint32_t main_fileset_present, PmergeInputFilesetLl*** filesets_endpp) {
   unsigned char* bigstack_mark = g_bigstack_base;
+  uintptr_t line_idx = 0;
+  PglErr reterr = kPglRetSuccess;
+  TextStream txs;
+  PreinitTextStream(&txs);
+  {
+    reterr = InitTextStream(list_fname, kTextStreamBlenFast, 1, &txs);
+    if (unlikely(reterr)) {
+      goto LoadPmergeList_ret_TSTREAM_FAIL;
+    }
+    const char* pgen_suffix;
+    const char* pvar_suffix;
+    const char* psam_suffix;
+    switch (mode) {
+    case kPmergeListModeBfile:
+      pgen_suffix = ".bed";
+      pvar_suffix = ".bim";
+      psam_suffix = ".fam";
+      break;
+    case kPmergeListModeBpfile:
+      pgen_suffix = ".pgen";
+      pvar_suffix = ".bim";
+      psam_suffix = ".fam";
+      break;
+    case kPmergeListModePfile:
+      pgen_suffix = ".pgen";
+      pvar_suffix = ".pvar";
+      psam_suffix = ".psam";
+      break;
+    default:
+      assert(mode == kPmergeListModePfileVzs);
+      pgen_suffix = ".pgen";
+      pvar_suffix = ".pvar.zst";
+      psam_suffix = ".psam";
+      break;
+    }
+    const uint32_t pgen_blen = strlen(pgen_suffix) + 1;
+    const uint32_t pvar_blen = strlen(pvar_suffix) + 1;
+    const uint32_t psam_blen = strlen(psam_suffix) + 1;
+    const uint32_t max_single_token_slen = kPglFnamesize - 1 - MAXV(pgen_blen, pvar_blen);
+    while (1) {
+      const char* first_token_start = TextGet(&txs);
+      if (!first_token_start) {
+        break;
+      }
+      ++line_idx;
+      PmergeInputFilesetLl* cur_entry = AllocFilesetLlEntry(filesets_endpp);
+      if (unlikely(!cur_entry)) {
+        goto LoadPmergeList_ret_NOMEM;
+      }
+      const char* first_token_end = CurTokenEnd(first_token_start);
+      const uint32_t first_token_slen = first_token_end - first_token_start;
+      const char* second_token_start = FirstNonTspace(first_token_end);
+      if (IsEolnKns(*second_token_start)) {
+        // Expand single token, using the --pmerge-list mode.
+        if (unlikely(first_token_slen > max_single_token_slen)) {
+          logerrprintf("Error: Entry on line %" PRIuPTR " of --pmerge-list file is too long.\n", line_idx);
+          goto LoadPmergeList_ret_MALFORMED_INPUT;
+        }
+        char* fname_iter;
+        if (unlikely(bigstack_end_alloc_c(first_token_slen * 3 + pgen_blen + pvar_blen + psam_blen, &fname_iter))) {
+          goto LoadPmergeList_ret_NOMEM;
+        }
+        cur_entry->pgen_fname = fname_iter;
+        fname_iter = memcpya(fname_iter, first_token_start, first_token_slen);
+        fname_iter = memcpya(fname_iter, pgen_suffix, pgen_blen);
+        cur_entry->pvar_fname = fname_iter;
+        fname_iter = memcpya(fname_iter, first_token_start, first_token_slen);
+        fname_iter = memcpya(fname_iter, pvar_suffix, pvar_blen);
+        cur_entry->psam_fname = fname_iter;
+        fname_iter = memcpya(fname_iter, first_token_start, first_token_slen);
+        memcpy(fname_iter, psam_suffix, psam_blen);
+      } else {
+        const char* second_token_end = CurTokenEnd(second_token_start);
+        const char* third_token_start = FirstNonTspace(second_token_end);
+        if (unlikely(IsEolnKns(*third_token_start))) {
+          snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --pmerge-list has exactly two tokens; 1 or 3 expected.\n", line_idx);
+          goto LoadPmergeList_ret_MALFORMED_INPUT_WW;
+        }
+        const char* third_token_end = CurTokenEnd(third_token_start);
+        const char* fourth_token_start = FirstNonTspace(third_token_end);
+        if (unlikely(!IsEolnKns(*fourth_token_start))) {
+          snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --pmerge-list has more than 3 tokens.\n", line_idx);
+          goto LoadPmergeList_ret_MALFORMED_INPUT_WW;
+        }
+        const uint32_t second_token_slen = second_token_end - second_token_start;
+        const uint32_t third_token_slen = third_token_end - third_token_start;
+        if (unlikely((first_token_slen >= kPglFnamesize) ||
+                     (second_token_slen >= kPglFnamesize) ||
+                     (third_token_slen >= kPglFnamesize))) {
+          logerrprintf("Error: Filename on line %" PRIuPTR " of --pmerge-list file is too long.\n", line_idx);
+          goto LoadPmergeList_ret_MALFORMED_INPUT;
+        }
+        char* fname_iter;
+        if (unlikely(bigstack_end_alloc_c(first_token_slen + second_token_slen + third_token_slen + 3, &fname_iter))) {
+          goto LoadPmergeList_ret_NOMEM;
+        }
+        cur_entry->pgen_fname = fname_iter;
+        fname_iter = memcpyax(fname_iter, first_token_start, first_token_slen, '\0');
+        cur_entry->pvar_fname = fname_iter;
+        fname_iter = memcpyax(fname_iter, second_token_start, third_token_slen, '\0');
+        cur_entry->psam_fname = fname_iter;
+        memcpyx(fname_iter, third_token_start, third_token_slen, '\0');
+      }
+    }
+    if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
+      goto LoadPmergeList_ret_TSTREAM_FAIL;
+    }
+    if (main_fileset_present + line_idx < 2) {
+      logerrputs("Error: --pmerge-list requires at least two filesets to be specified.\n");
+      goto LoadPmergeList_ret_INCONSISTENT_INPUT;
+    }
+    logprintf("--pmerge-list: %" PRIuPTR " filesets specified%s.\n", main_fileset_present + line_idx, main_fileset_present? " (including main fileset)" : "");
+  }
+  while (0) {
+  LoadPmergeList_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  LoadPmergeList_ret_TSTREAM_FAIL:
+    TextStreamErrPrint("--pmerge-list file", &txs);
+    break;
+  LoadPmergeList_ret_MALFORMED_INPUT_WW:
+    WordWrapB(0);
+    logerrputsb();
+  LoadPmergeList_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  LoadPmergeList_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  BigstackReset(bigstack_mark);
+  return reterr;
+}
+
+// Permanent allocations are at end of bigstack.
+PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* outname, char* outname_end, PmergeInputFilesetLl* filesets, SampleIdInfo* siip, uint32_t* sample_ctp) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  uintptr_t line_idx = 0;
+  const char* cur_fname = nullptr;
+  PglErr reterr = kPglRetSuccess;
+  TextStream txs;
+  PreinitTextStream(&txs);
+  {
+    // First pass: determine sample IDs and phenotype set.
+    // Intermission: sort sample and phenotype IDs.
+    // Second pass: populate data structure, then call WritePsam().
+
+    // Allocate a single linebuf that's used in all first-pass loads, so that
+    // we can have the pheno strset grow from bigstack_end down in an
+    // unfragmented manner.
+    uintptr_t linebuf_capacity = MINV(kMaxLongLine, bigstack_left() / 4) + kDecompressChunkSize;
+    char* linebuf;
+    if (unlikely(bigstack_end_alloc_c(linebuf_capacity, &linebuf))) {
+      goto MergePsams_ret_NOMEM;
+    }
+    // If --sample-inner-join is specified, we append raw null-terminated
+    // sample-ID strings to the bottom of bigstack while scanning the first
+    // file, construct the usual non-resizable string-hash-table data structure
+    // when we're done.  Each ID then has a 0-based index, and we use
+    // sample_include/cur_sample_include to track which sample IDs are present
+    // in all files from that point on.
+    // Otherwise, we add sample IDs to the resizable sample_id_strset data
+    // structure and wait till we've processed all files before ordering the
+    // elements.
+    // Analogous thing happens for phenotype names if --pheno-inner-join is
+    // specified.  Phenotype-name data structure grows down from the top of
+    // bigstack.
+    const PmergeFlags flags = pmip->flags;
+    char* first_sample_ids_start = nullptr;
+    const char** first_sample_ids = nullptr;
+    uint32_t* first_sample_ids_htable = nullptr;
+    uintptr_t* sample_include = nullptr;
+    uintptr_t* cur_sample_include = nullptr;
+    char** sample_id_strset = nullptr;
+    uint32_t first_sample_id_ct = 0;
+    uint32_t sample_id_table_size = 512;
+    if (flags & kfPmergeSampleInnerJoin) {
+      first_sample_ids_start = R_CAST(char*, g_bigstack_base);
+      // - arena_bottom tracks the current append point
+      // - first_sample_ids, first_sample_ids_htable, sample_include,
+      //   cur_sample_include allocated after we're done scanning first file
+      // - sample_id_table_size set to first_sample_ids_htable size at that
+      //   point
+    } else {
+      if (unlikely(bigstack_calloc_cp(sample_id_table_size, &sample_id_strset))) {
+        goto MergePsams_ret_NOMEM;
+      }
+    }
+    const char** first_pheno_names = nullptr;
+    uint32_t* first_pheno_names_htable = nullptr;
+    uintptr_t* pheno_include = nullptr;
+    uintptr_t* cur_pheno_include = nullptr;
+    char** pheno_strset = nullptr;
+    uint32_t first_pheno_ct = 0;
+    uint32_t pheno_names_table_size = 512;
+    if (!(flags & kfPmergePhenoInnerJoin)) {
+      if (unlikely(bigstack_end_calloc_cp(pheno_names_table_size, &pheno_strset))) {
+        goto MergePsams_ret_NOMEM;
+      }
+    }
+    unsigned char* arena_bottom = g_bigstack_base;
+    unsigned char* arena_top = g_bigstack_end;
+
+    const uint32_t decompress_thread_ct = MAXV(max_thread_ct - 1, 1);
+    PmergeInputFilesetLl* filesets_iter = filesets;
+    char* idbuf = g_textbuf;
+    uint32_t max_line_blen = 0;
+    uint32_t max_sample_id_blen_m2 = 2;
+    uintptr_t max_sid_blen = 0;
+    uintptr_t max_paternal_id_blen = 2;
+    uintptr_t max_maternal_id_blen = 2;
+    uintptr_t max_pheno_name_blen = 0;
+    uint32_t sample_ct = 0;
+    uint32_t pheno_ct = 0;
+    do {
+      cur_fname = filesets_iter->psam_fname;
+      reterr = TextStreamOpenEx(cur_fname, kMaxLongLine, linebuf_capacity, decompress_thread_ct, nullptr, linebuf, &txs);
+      if (unlikely(reterr)) {
+        goto MergePsams_ret_TSTREAM_FAIL;
+      }
+      // Worth optimizing this more than most text-reading loops, since we may
+      // be processing a LOT of files.
+      const char* line_start = TextLineEnd(&txs);
+      for (line_idx = 1; ; ++line_idx) {
+        if (unlikely(!TextGetUnsafe2K(&txs, &line_start))) {
+          if (TextStreamErrcode2(&txs, &reterr)) {
+            goto MergePsams_ret_TSTREAM_FAIL;
+          }
+          logerrprintfww("Error: No samples in %s.\n", cur_fname);
+          goto MergePsams_ret_MALFORMED_INPUT;
+        }
+        if ((line_start[0] != '#') || tokequal_k(&(line_start[1]), "FID") || tokequal_k(&(line_start[1]), "IID")) {
+          break;
+        }
+        const char* line_end = AdvPastDelim(line_start, '\n');
+        const uint32_t line_blen = line_end - line_start;
+        if (max_line_blen < line_blen) {
+          max_line_blen = line_blen;
+        }
+        line_start = line_end;
+      }
+      uint32_t sid_present = 0;
+      uint32_t postid_pat_col_idx = 0;
+      uint32_t postid_mat_col_idx = 0;
+      uint32_t fid_present;
+      if (line_start[0] == '#') {
+        const char* iid_end = &(line_start[4]);
+        fid_present = (line_start[1] == 'F');
+        if (fid_present) {
+          const char* iid_start = FirstNonTspace(iid_end);
+          iid_end = FirstPrechar(iid_start, 33);
+          if (unlikely(!strequal_k(iid_start, "IID", iid_end - iid_start))) {
+            goto MergePsams_ret_MALFORMED_INPUT;
+          }
+        }
+        const char* token_start = FirstNonTspace(iid_end);
+        if (tokequal_k(token_start, "SID")) {
+          sid_present = 1;
+          token_start = FirstNonTspace(&(token_start[3]));
+        }
+        const char* token_end;
+        for (uint32_t postid_col_idx = 1; !IsEolnKns(*token_start); token_start = FirstNonTspace(token_end), ++postid_col_idx) {
+          token_end = CurTokenEnd(token_start);
+          const uint32_t token_slen = token_end - token_start;
+          if (token_slen == 3) {
+            if (unlikely(memequal_k(token_start, "FID", 3))) {
+              if (fid_present) {
+                snprintf(g_logbuf, kLogbufSize, "Error: Duplicate FID column in %s.\n", cur_fname);
+              } else {
+                snprintf(g_logbuf, kLogbufSize, "Error: Improperly positioned FID column in %s (must be first).\n", cur_fname);
+              }
+              goto MergePsams_ret_MALFORMED_INPUT_WW;
+            }
+            if (unlikely(memequal_k(token_start, "IID", 3))) {
+              snprintf(g_logbuf, kLogbufSize, "Error: Duplicate IID column in %s.\n", cur_fname);
+              goto MergePsams_ret_MALFORMED_INPUT_WW;
+            }
+            if (unlikely(memequal_k(token_start, "SID", 3))) {
+              if (sid_present) {
+                snprintf(g_logbuf, kLogbufSize, "Error: Duplicate SID column in %s.\n", cur_fname);
+              } else {
+                snprintf(g_logbuf, kLogbufSize, "Error: Improperly positioned SID column in %s (must immediately follow IID).\n", cur_fname);
+              }
+              goto MergePsams_ret_MALFORMED_INPUT_WW;
+            }
+            if (memequal_k(token_start, "PAT", 3)) {
+              if (unlikely(postid_pat_col_idx)) {
+                snprintf(g_logbuf, kLogbufSize, "Error: Duplicate PAT column in %s.\n", cur_fname);
+                goto MergePsams_ret_MALFORMED_INPUT_WW;
+              }
+              postid_pat_col_idx = postid_col_idx;
+              continue;
+            }
+            if (memequal_k(token_start, "MAT", 3)) {
+              if (unlikely(postid_mat_col_idx)) {
+                snprintf(g_logbuf, kLogbufSize, "Error: Duplicate MAT column in %s.\n", cur_fname);
+                goto MergePsams_ret_MALFORMED_INPUT_WW;
+              }
+              postid_mat_col_idx = postid_col_idx;
+              continue;
+            }
+            if (memequal_k(token_start, "SEX", 3)) {
+              continue;
+            }
+          } else if (token_slen > kMaxIdBlen) {
+            logerrputs("Error: Phenotype names are limited to " MAX_ID_SLEN_STR " characters.\n");
+            goto MergePsams_ret_MALFORMED_INPUT;
+          }
+          if (pheno_strset) {
+            if (token_slen >= max_pheno_name_blen) {
+              max_pheno_name_blen = token_slen + 1;
+            }
+            if (unlikely(StrsetAddEndResize(arena_bottom, token_start, token_slen, kMaxPhenoCt * 2, &pheno_strset, &pheno_names_table_size, &pheno_ct, &arena_top))) {
+              if (pheno_ct == kMaxPhenoCt) {
+                logerrputs("Error: " PROG_NAME_STR " does not support more than " MAX_PHENO_CT_STR " phenotypes.\n");
+                goto MergePsams_ret_INCONSISTENT_INPUT;
+              }
+              goto MergePsams_ret_NOMEM;
+            }
+          } else {
+            // max_pheno_name_blen calculation deferred since any name may not
+            // appear in a later file
+            if (!first_pheno_names_htable) {
+              // StoreStringAtEnd(), without dst assignment
+              if (unlikely(PtrWSubCk(arena_bottom, token_slen + 1, &arena_top))) {
+                goto MergePsams_ret_NOMEM;
+              }
+              memcpyx(arena_top, token_start, token_slen, '\0');
+              ++first_pheno_ct;
+            } else if (pheno_ct) {
+              const uint32_t pheno_idx = IdHtableFindNnt(token_start, first_pheno_names, first_pheno_names_htable, token_slen, pheno_names_table_size);
+              if (pheno_idx != UINT32_MAX) {
+                if (unlikely(IsSet(cur_pheno_include, pheno_idx))) {
+                  snprintf(g_logbuf, kLogbufSize, "Error: Duplicate phenotype name '%s' in %s.\n", first_pheno_names[pheno_idx], cur_fname);
+                  goto MergePsams_ret_MALFORMED_INPUT_WW;
+                }
+                SetBit(pheno_idx, cur_pheno_include);
+              }
+            }
+          }
+        }
+        if ((!postid_pat_col_idx) != (!postid_mat_col_idx)) {
+          snprintf(g_logbuf, kLogbufSize, "Error: %s has a '%cAT' column without a '%cAT' column; either both or neither must be present.\n", cur_fname, postid_pat_col_idx? 'P' : 'M', postid_pat_col_idx? 'M' : 'P');
+          goto MergePsams_ret_MALFORMED_INPUT_WW;
+        }
+        const char* line_end = AdvPastDelim(token_start, '\n');
+        const uint32_t line_blen = line_end - line_start;
+        if (max_line_blen < line_blen) {
+          max_line_blen = line_blen;
+        }
+        line_start = line_end;
+        ++line_idx;
+        if (unlikely(!TextGetUnsafe2K(&txs, &line_start))) {
+          if (TextStreamErrcode2(&txs, &reterr)) {
+            goto MergePsams_ret_TSTREAM_FAIL;
+          }
+          logerrprintfww("Error: No samples in %s.\n", cur_fname);
+          goto MergePsams_ret_MALFORMED_INPUT;
+        }
+      } else {
+        // .fam
+        fid_present = (fam_cols / kfFamCol1) & 1;
+        if (fam_cols & kfFamCol34) {
+          postid_pat_col_idx = 1;
+          postid_mat_col_idx = 2;
+        }
+        if (fam_cols & kfFamCol6) {
+          if (pheno_strset) {
+            if (unlikely(StrsetAddEndResize(arena_bottom, "PHENO1", strlen("PHENO1"), kMaxPhenoCt * 2, &pheno_strset, &pheno_names_table_size, &pheno_ct, &arena_top))) {
+              if (pheno_ct == kMaxPhenoCt) {
+                logerrputs("Error: " PROG_NAME_STR " does not support more than " MAX_PHENO_CT_STR " phenotypes.\n");
+                goto MergePsams_ret_INCONSISTENT_INPUT;
+              }
+              goto MergePsams_ret_NOMEM;
+            }
+            if (strlen("PHENO1") >= max_pheno_name_blen) {
+              max_pheno_name_blen = strlen("PHENO1") + 1;
+            }
+          } else {
+            if (!first_pheno_names_htable) {
+              if (unlikely(PtrWSubCk(arena_bottom, strlen("PHENO1") + 1, &arena_top))) {
+                goto MergePsams_ret_NOMEM;
+              }
+              strcpy_k(R_CAST(char*, arena_top), "PHENO1");
+              first_pheno_ct = 1;
+            } else if (pheno_ct) {
+              const uint32_t pheno_idx = IdHtableFind("PHENO1", TO_CONSTCPCONSTP(first_pheno_names), first_pheno_names_htable, strlen("PHENO1"), pheno_names_table_size);
+              if (pheno_idx != UINT32_MAX) {
+                SetBit(pheno_idx, cur_pheno_include);
+              }
+            }
+          }
+        }
+      }
+      uint32_t first_parent_postid_col_idx = postid_pat_col_idx;
+      uint32_t parent_col_skip = postid_mat_col_idx - postid_pat_col_idx;
+      uintptr_t first_parent_max_blen = max_paternal_id_blen;
+      uintptr_t second_parent_max_blen = max_maternal_id_blen;
+      if (postid_mat_col_idx < postid_pat_col_idx) {
+        first_parent_postid_col_idx = postid_mat_col_idx;
+        parent_col_skip = -parent_col_skip;
+        first_parent_max_blen = max_maternal_id_blen;
+        second_parent_max_blen = max_paternal_id_blen;
+      }
+      const char* fid_start = &(g_one_char_strs[96]);
+      uint32_t fid_slen = 1;
+      const char* sid_start = &(g_one_char_strs[96]);
+      uint32_t sid_slen = 1;
+      while (1) {
+        const char* token_start = line_start;
+        if (fid_present) {
+          fid_start = line_start;
+          const char* fid_end = CurTokenEnd(fid_start);
+          fid_slen = fid_end - fid_start;
+          token_start = FirstNonTspace(fid_end);
+          if (unlikely(IsEolnKns(*token_start))) {
+            goto MergePsams_ret_MISSING_TOKENS;
+          }
+        }
+        const char* iid_start = token_start;
+        const char* iid_end = CurTokenEnd(iid_start);
+        const char* token_end = iid_end;
+        const uint32_t iid_slen = iid_end - iid_start;
+        if (sid_present) {
+          sid_start = FirstNonTspace(iid_end);
+          token_end = CurTokenEnd(token_start);
+          sid_slen = token_end - sid_start;
+          if ((sid_slen > 1) || (sid_start[0] != '0')) {
+            if (sid_slen >= max_sid_blen) {
+              if (unlikely(sid_slen > kMaxIdBlen)) {
+                logerrputs("Error: SIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+                goto MergePsams_ret_MALFORMED_INPUT;
+              }
+              max_sid_blen = sid_slen + 1;
+            }
+          }
+        }
+        if (fid_slen + iid_slen > max_sample_id_blen_m2) {
+          max_sample_id_blen_m2 = fid_slen + iid_slen;
+          if (unlikely(max_sample_id_blen_m2 > 2 * kMaxIdSlen)) {
+            logerrputs("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+            goto MergePsams_ret_MALFORMED_INPUT;
+          }
+        }
+        char* id_iter = memcpyax(idbuf, fid_start, fid_slen, '\t');
+        id_iter = memcpyax(id_iter, iid_start, iid_slen, '\t');
+        id_iter = memcpya(id_iter, sid_start, sid_slen);
+        *id_iter = '\0';
+        const uint32_t id_slen = id_iter - idbuf;
+        if (sample_id_strset) {
+          if (unlikely(StrsetAddResize(arena_top, idbuf, id_slen, 2U * 0x7ffffffe, sample_id_strset, &sample_id_table_size, &sample_ct, &arena_bottom))) {
+            if (sample_ct == 0x7ffffffe) {
+              logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 2 samples.\n");
+              goto MergePsams_ret_INCONSISTENT_INPUT;
+            }
+            goto MergePsams_ret_NOMEM;
+          }
+        } else {
+          if (!first_sample_ids_htable) {
+            if (unlikely(id_slen >= S_CAST(uintptr_t, arena_top - arena_bottom))) {
+              goto MergePsams_ret_NOMEM;
+            }
+            arena_bottom = memcpyua(arena_bottom, idbuf, id_slen + 1);
+            if (unlikely(first_sample_id_ct == 0x7ffffffe)) {
+              logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 2 samples.\n");
+              goto MergePsams_ret_INCONSISTENT_INPUT;
+            }
+            ++first_sample_id_ct;
+          } else {
+            const uint32_t sample_idx = IdHtableFind(idbuf, first_sample_ids, first_sample_ids_htable, id_slen, sample_id_table_size);
+            if (sample_idx != UINT32_MAX) {
+              if (unlikely(IsSet(cur_sample_include, sample_idx))) {
+                TabsToSpaces(idbuf);
+                snprintf(g_logbuf, kLogbufSize, "Error: Duplicate sample ID \"%s\" in %s.\n", idbuf, cur_fname);
+                goto MergePsams_ret_MALFORMED_INPUT_WW;
+              }
+              SetBit(sample_idx, cur_sample_include);
+            }
+          }
+        }
+        if (first_parent_postid_col_idx) {
+          const char* first_parent_start = NextTokenMult(token_end, first_parent_postid_col_idx);
+          if (unlikely(!first_parent_start)) {
+            goto MergePsams_ret_MISSING_TOKENS;
+          }
+          const char* first_parent_end = CurTokenEnd(first_parent_start);
+          const uintptr_t first_parent_slen = first_parent_end - first_parent_start;
+          if (first_parent_slen >= first_parent_max_blen) {
+            if (unlikely(first_parent_slen > kMaxIdSlen)) {
+              logerrputs("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+              goto MergePsams_ret_MALFORMED_INPUT;
+            }
+            first_parent_max_blen = first_parent_slen + 1;
+          }
+          const char* second_parent_start = NextTokenMult(first_parent_end, parent_col_skip);
+          if (unlikely(!second_parent_start)) {
+            goto MergePsams_ret_MISSING_TOKENS;
+          }
+          token_end = CurTokenEnd(second_parent_start);
+          const uintptr_t second_parent_slen = token_end - second_parent_start;
+          if (second_parent_slen >= second_parent_max_blen) {
+            if (unlikely(second_parent_slen > kMaxIdSlen)) {
+              logerrputs("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+              goto MergePsams_ret_MALFORMED_INPUT;
+            }
+            second_parent_max_blen = second_parent_slen + 1;
+          }
+        }
+        const char* line_end = AdvPastDelim(token_end, '\n');
+        const uint32_t line_blen = line_end - line_start;
+        if (max_line_blen < line_blen) {
+          max_line_blen = line_blen;
+        }
+        line_start = line_end;
+        ++line_idx;
+        if (!TextGetUnsafe2K(&txs, &line_start)) {
+          break;
+        }
+        if (unlikely(line_start[0] == '#')) {
+          snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx, cur_fname);
+          goto MergePsams_ret_MALFORMED_INPUT_WW;
+        }
+      }
+      if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
+        goto MergePsams_ret_TSTREAM_FAIL;
+      }
+      if (unlikely(CleanupTextStream2(cur_fname, &txs, &reterr))) {
+        goto MergePsams_ret_1;
+      }
+      if (postid_pat_col_idx) {
+        if (postid_mat_col_idx > postid_pat_col_idx) {
+          max_paternal_id_blen = first_parent_max_blen;
+          max_maternal_id_blen = second_parent_max_blen;
+        } else {
+          max_paternal_id_blen = second_parent_max_blen;
+          max_maternal_id_blen = first_parent_max_blen;
+        }
+      }
+      if (first_sample_ids_start) {
+        const uint32_t first_sample_ctl = BitCtToWordCt(first_sample_id_ct);
+        if (filesets_iter == filesets) {
+          ArenaBaseSet(arena_bottom, &arena_bottom);
+          sample_id_table_size = GetHtableFastSize(first_sample_id_ct);
+          if (unlikely((arena_bottom > arena_top) ||
+                       arena_alloc_kcp(arena_top, first_sample_id_ct, &arena_bottom, &first_sample_ids) ||
+                       arena_alloc_u32(arena_top, sample_id_table_size, &arena_bottom, &first_sample_ids_htable) ||
+                       arena_alloc_w(arena_top, first_sample_ctl, &arena_bottom, &sample_include) ||
+                       arena_alloc_w(arena_top, first_sample_ctl, &arena_bottom, &cur_sample_include))) {
+            goto MergePsams_ret_NOMEM;
+          }
+          SetAllU32Arr(sample_id_table_size, first_sample_ids_htable);
+          SetAllBits(first_sample_id_ct, sample_include);
+          const char* first_sample_ids_iter = first_sample_ids_start;
+          for (uint32_t sample_idx = 0; sample_idx != first_sample_id_ct; ++sample_idx) {
+            first_sample_ids[sample_idx] = first_sample_ids_iter;
+            const uint32_t slen = strlen(first_sample_ids_iter);
+            if (unlikely(IdHtableAdd(first_sample_ids_iter, first_sample_ids, slen, sample_id_table_size, sample_idx, first_sample_ids_htable) != UINT32_MAX)) {
+              char* mutable_id = K_CAST(char*, first_sample_ids_iter);
+              TabsToSpaces(mutable_id);
+              snprintf(g_logbuf, kLogbufSize, "Error: Duplicate sample ID \"%s\" in %s.\n", mutable_id, cur_fname);
+              goto MergePsams_ret_MALFORMED_INPUT_WW;
+            }
+            first_sample_ids_iter = &(first_sample_ids_iter[slen + 1]);
+          }
+        } else {
+          BitvecAnd(cur_sample_include, first_sample_ctl, sample_include);
+          if (unlikely(AllWordsAreZero(sample_include, first_sample_ctl))) {
+            snprintf(g_logbuf, kLogbufSize, "Error: No common samples in --pmerge%s --sample-inner-join job.\n", pmip->list_fname? "-list" : "");
+            goto MergePsams_ret_MALFORMED_INPUT_2;
+          }
+        }
+        ZeroWArr(first_sample_ctl, cur_sample_include);
+      }
+      if (!pheno_strset) {
+        const uint32_t first_pheno_ctl = BitCtToWordCt(first_pheno_ct);
+        if (filesets_iter == filesets) {
+          // must handle pheno_ct == 0
+          // (only really need to set first_pheno_names_htable to non-null in
+          // that case, since that's how we currently recognize we aren't
+          // processing the first .psam file)
+          const char* pheno_names_iter = R_CAST(char*, arena_top);
+          ArenaEndSet(arena_top, &arena_top);
+          pheno_names_table_size = GetHtableFastSize(first_pheno_ct);
+          if (unlikely(arena_end_alloc_kcp(arena_bottom, first_pheno_ct, &arena_top, &first_pheno_names) ||
+                       arena_end_alloc_u32(arena_bottom, pheno_names_table_size, &arena_top, &first_pheno_names_htable) ||
+                       arena_end_alloc_w(arena_bottom, first_pheno_ctl, &arena_top, &pheno_include) ||
+                       arena_end_alloc_w(arena_bottom, first_pheno_ctl, &arena_top, &cur_pheno_include))) {
+            goto MergePsams_ret_NOMEM;
+          }
+          SetAllU32Arr(pheno_names_table_size, first_pheno_names_htable);
+          SetAllBits(first_pheno_ct, pheno_include);
+          ZeroWArr(first_pheno_ctl, cur_pheno_include);
+          pheno_ct = first_pheno_ct;
+          for (uint32_t pheno_idx = 0; pheno_idx != first_pheno_ct; ++pheno_idx) {
+            first_pheno_names[pheno_idx] = pheno_names_iter;
+            const uint32_t slen = strlen(pheno_names_iter);
+            if (unlikely(IdHtableAdd(pheno_names_iter, first_pheno_names, slen, pheno_names_table_size, pheno_idx, first_pheno_names_htable) != UINT32_MAX)) {
+              snprintf(g_logbuf, kLogbufSize, "Error: Duplicate phenotype name '%s' in %s.\n", pheno_names_iter, cur_fname);
+              goto MergePsams_ret_MALFORMED_INPUT_WW;
+            }
+            pheno_names_iter = &(pheno_names_iter[slen + 1]);
+          }
+        } else if (pheno_ct) {
+          BitvecAnd(cur_pheno_include, first_pheno_ctl, pheno_include);
+          pheno_ct = PopcountWords(pheno_include, first_pheno_ctl);
+          ZeroWArr(first_pheno_ctl, cur_pheno_include);
+        }
+      }
+      filesets_iter = filesets_iter->next;
+    } while (filesets_iter);
+    // Intermission:
+    // 1. Move final set of phenotype ID strings to bottom, compute
+    //    max_pheno_name_blen if --pheno-inner-join.
+    // 2. Compute max_sample_id_blen and max_sid_blen if --sample-inner-join.
+    // 3. Free all end-of-bigstack allocations.
+    // 4. Construct SampleIdInfo at end of bigstack, to be returned.
+    // 5. Construct pheno_names, hash tables.
+    char* pheno_names = nullptr;
+    uint32_t* pheno_names_htable = nullptr;
+    uint32_t* sample_id_htable;
+    uintptr_t max_sample_id_blen;
+    uint32_t sample_ctl;
+    {
+      char* pheno_names_tmp_start = R_CAST(char*, arena_bottom);
+      char* pheno_names_tmp_end;
+      if (pheno_strset) {
+        const uintptr_t byte_ct = R_CAST(unsigned char*, pheno_strset) - arena_top;
+        memmove(pheno_names_tmp_start, arena_top, byte_ct);
+        pheno_names_tmp_end = &(pheno_names_tmp_start[byte_ct]);
+      } else {
+        max_pheno_name_blen = 0;
+        char* pheno_names_write_iter = pheno_names_tmp_start;
+        if (pheno_ct) {
+          if (S_CAST(uintptr_t, first_pheno_names[0] - pheno_names_write_iter) < kMaxIdBlen) {
+            // ensure memcpy is safe
+            goto MergePsams_ret_NOMEM;
+          }
+          uintptr_t pheno_uidx_base = 0;
+          uintptr_t cur_bits = pheno_include[0];
+          for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+            const uint32_t pheno_uidx = BitIter1(pheno_include, &pheno_uidx_base, &cur_bits);
+            const char* cur_pheno_name = first_pheno_names[pheno_uidx];
+            const uint32_t blen = 1 + strlen(cur_pheno_name);
+            if (blen > max_pheno_name_blen) {
+              max_pheno_name_blen = blen;
+            }
+            pheno_names_write_iter = memcpya(pheno_names_write_iter, cur_pheno_name, blen);
+          }
+        }
+        pheno_names_tmp_end = pheno_names_write_iter;
+      }
+      BigstackBaseSet(pheno_names_tmp_end);
+
+      if (sample_id_strset) {
+        max_sample_id_blen = max_sample_id_blen_m2 + 2;
+      } else {
+        const uint32_t first_sample_ctl = BitCtToWordCt(first_sample_id_ct);
+        sample_ct = PopcountWords(sample_include, first_sample_ctl);
+        max_sample_id_blen = 4;
+        max_sid_blen = 0;
+        uintptr_t sample_uidx_base = 0;
+        uintptr_t cur_bits = sample_include[0];
+        for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+          const uint32_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
+          const char* cur_sample_id = first_sample_ids[sample_uidx];
+          const char* iid_end = AdvToDelim(AdvPastDelim(cur_sample_id, '\t'), '\t');
+          const uint32_t sample_id_blen_m1 = iid_end - cur_sample_id;
+          if (max_sample_id_blen <= sample_id_blen_m1) {
+            max_sample_id_blen = sample_id_blen_m1 + 1;
+          }
+          const char* sid_start = &(iid_end[1]);
+          const uint32_t sid_slen = strlen(sid_start);
+          if ((sid_slen > 1) || (sid_start[0] != '0')) {
+            if (max_sid_blen <= sid_slen) {
+              max_sid_blen = sid_slen + 1;
+            }
+          }
+        }
+      }
+      // defensive
+      linebuf = nullptr;
+      first_pheno_names = nullptr;
+      first_pheno_names_htable = nullptr;
+      pheno_include = nullptr;
+      cur_pheno_include = nullptr;
+      pheno_strset = nullptr;
+
+      sample_ctl = BitCtToWordCt(sample_ct);
+      siip->flags = kfSampleIdFidPresent | kfSampleIdParentsPresent;
+      if (misc_flags & kfMiscStrictSid0) {
+        // affects --indiv-sort
+        siip->flags |= kfSampleIdStrictSid0;
+      }
+      siip->max_sample_id_blen = max_sample_id_blen;
+      siip->max_sid_blen = max_sid_blen;
+      siip->sids = nullptr;
+      if (max_sid_blen) {
+        if (unlikely(bigstack_end_alloc_c(max_sid_blen * sample_ct, &(siip->sids)))) {
+          goto MergePsams_ret_NOMEM;
+        }
+      }
+      // place this below optional siip->sids, so that
+      // BigstackEndSet(siip->sample_ids) is always correct on success
+      if (unlikely(bigstack_end_alloc_c(max_sample_id_blen * sample_ct, &(siip->sample_ids)))) {
+        goto MergePsams_ret_NOMEM;
+      }
+      if (pheno_ct) {
+        pheno_names_table_size = GetHtableFastSize(pheno_ct);
+        if (unlikely(bigstack_end_alloc_c(max_pheno_name_blen * pheno_ct, &pheno_names) ||
+                     bigstack_end_alloc_u32(pheno_names_table_size, &pheno_names_htable))) {
+          goto MergePsams_ret_NOMEM;
+        }
+      }
+
+      char* sample_ids_write_iter = siip->sample_ids;
+      if (sample_id_strset) {
+        const char* sample_ids_read_iter = R_CAST(char*, &(sample_id_strset[sample_id_table_size]));
+        for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+          const char* iid_end = AdvToDelim(AdvPastDelim(sample_ids_read_iter, '\t'), '\t');
+          memcpyx(sample_ids_write_iter, sample_ids_read_iter, iid_end - sample_ids_read_iter, '\0');
+          sample_ids_write_iter = &(sample_ids_write_iter[max_sample_id_blen]);
+          const char* sid_end = strnul(iid_end);
+          if (max_sid_blen) {
+            const uint32_t sid_blen = sid_end - iid_end;
+            const char* sid_start = &(iid_end[1]);
+            memcpy(&(siip->sids[sample_idx * max_sid_blen]), sid_start, sid_blen);
+          }
+          sample_ids_read_iter = &(sid_end[1]);
+        }
+      } else {
+        uintptr_t sample_uidx_base = 0;
+        uintptr_t cur_bits = sample_include[0];
+        for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+          const uint32_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
+          const char* cur_sample_id = first_sample_ids[sample_uidx];
+          const char* iid_end = AdvToDelim(AdvPastDelim(cur_sample_id, '\t'), '\t');
+          memcpyx(sample_ids_write_iter, cur_sample_id, iid_end - cur_sample_id, '\0');
+          sample_ids_write_iter = &(sample_ids_write_iter[max_sample_id_blen]);
+          if (max_sid_blen) {
+            const char* sid_start = &(iid_end[1]);
+            strcpy(&(siip->sids[sample_idx * max_sid_blen]), sid_start);
+          }
+        }
+      }
+
+      // defensive
+      first_sample_ids_start = nullptr;
+      first_sample_ids = nullptr;
+      first_sample_ids_htable = nullptr;
+      sample_include = nullptr;
+      cur_sample_include = nullptr;
+      sample_id_strset = nullptr;
+      g_bigstack_base = bigstack_mark;
+
+      if (pheno_ct) {
+        const char* pheno_names_read_iter = pheno_names_tmp_start;
+        // phenotypes are stored in reverse
+        char* pheno_names_write_iter = &(pheno_names[pheno_ct * max_pheno_name_blen]);
+        for (uint32_t pheno_idx = pheno_ct; pheno_idx; ) {
+          --pheno_idx;
+          pheno_names_write_iter -= max_pheno_name_blen;
+          const uint32_t slen = strlen(pheno_names_read_iter);
+          memcpy(pheno_names_write_iter, pheno_names_read_iter, slen + 1);
+          pheno_names_read_iter = &(pheno_names_read_iter[slen + 1]);
+        }
+        const SortMode pheno_sort_mode = pmip->merge_pheno_sort;
+        if (pheno_sort_mode == kSortAscii) {
+          qsort(pheno_names, pheno_ct, max_pheno_name_blen, strcmp_overread_casted);
+        } else if (pheno_sort_mode == kSortNatural) {
+          qsort(pheno_names, pheno_ct, max_pheno_name_blen, strcmp_natural);
+        }
+
+        SetAllU32Arr(pheno_names_table_size, pheno_names_htable);
+        pheno_names_read_iter = pheno_names;
+        for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+          const uint32_t slen = strlen(pheno_names_read_iter);
+          HtableAddNondup(pheno_names_read_iter, slen, pheno_names_table_size, pheno_idx, pheno_names_htable);
+          pheno_names_read_iter = &(pheno_names_read_iter[max_pheno_name_blen]);
+        }
+      }
+
+      if (unlikely(bigstack_alloc_w(sample_ctl, &sample_include))) {
+        goto MergePsams_ret_NOMEM;
+      }
+      SetAllBits(sample_ct, sample_include);
+      char* sample_ids = siip->sample_ids;
+      char* sids = siip->sids;
+      if (sample_sort_mode != kSortNone) {
+        if (sample_sort_mode == kSortFile) {
+          // yes, this is a bit circuitous
+          unsigned char* bigstack_end_mark2 = g_bigstack_end;
+          uint32_t* new_sample_idx_to_old;
+          char* sample_ids_tmp;
+          if (unlikely(bigstack_end_alloc_u32(sample_ct, &new_sample_idx_to_old) ||
+                       bigstack_end_alloc_c(sample_ct * max_sample_id_blen, &sample_ids_tmp))) {
+            goto MergePsams_ret_NOMEM;
+          }
+          char* sids_tmp = nullptr;
+          if (max_sid_blen) {
+            if (unlikely(bigstack_end_alloc_c(sample_ct * max_sid_blen, &sids_tmp))) {
+              goto MergePsams_ret_NOMEM;
+            }
+          }
+          reterr = SampleSortFileMap(sample_include, siip, sample_sort_fname, sample_ct, sample_ct, &new_sample_idx_to_old);
+          if (unlikely(reterr)) {
+            goto MergePsams_ret_1;
+          }
+          char* new_sample_ids_iter = sample_ids_tmp;
+          char* new_sids_iter = sids_tmp;
+          for (uint32_t new_idx = 0; new_idx != sample_ct; ++new_idx) {
+            const uint32_t old_idx = new_sample_idx_to_old[new_idx];
+            strcpy(new_sample_ids_iter, &(sample_ids[old_idx * max_sample_id_blen]));
+            new_sample_ids_iter = &(new_sample_ids_iter[max_sample_id_blen]);
+            if (sids) {
+              strcpy(new_sids_iter, &(sids[old_idx * max_sid_blen]));
+              new_sids_iter = &(new_sids_iter[max_sid_blen]);
+            }
+          }
+          memcpy(sample_ids, sample_ids_tmp, sample_ct * max_sample_id_blen);
+          if (sids) {
+            memcpy(sids, sids_tmp, sample_ct * max_sid_blen);
+          }
+          BigstackEndReset(bigstack_end_mark2);
+        } else if (sample_sort_mode == kSortAscii) {
+          qsort(siip->sample_ids, sample_ct, max_sample_id_blen, strcmp_overread_casted);
+        } else {
+          // natural-sort, even if sample_sort_mode == kSort0
+          qsort(siip->sample_ids, sample_ct, max_sample_id_blen, strcmp_natural);
+        }
+      }
+
+      sample_id_table_size = GetHtableFastSize(sample_ct);
+      if (unlikely(bigstack_alloc_u32(sample_id_table_size, &sample_id_htable))) {
+        goto MergePsams_ret_NOMEM;
+      }
+      SetAllU32Arr(sample_id_table_size, sample_id_htable);
+
+      const char* sample_ids_iter = siip->sample_ids;
+      const char* sids_iter = siip->sids;
+      for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+        uint32_t slen = strlen(sample_ids_iter);
+        const char* cur_sample_id;
+        if (!sids_iter) {
+          cur_sample_id = sample_ids_iter;
+        } else {
+          char* id_iter = memcpyax(idbuf, sample_ids_iter, slen, '\t');
+          const uint32_t sid_blen = 1 + strlen(sids_iter);
+          memcpy(id_iter, sids_iter, sid_blen);
+          slen += sid_blen;
+          sids_iter = &(sids_iter[max_sid_blen]);
+          cur_sample_id = idbuf;
+        }
+        HtableAddNondup(cur_sample_id, slen, sample_id_table_size, sample_idx, sample_id_htable);
+      }
+    }
+    linebuf_capacity = MAXV(max_line_blen, kDecompressMinBlen) + kDecompressChunkSize;
+    // max_{p,m}aternal_id_blen may be overestimated in --sample-inner-join
+    // case, but that's harmless since we free this strbox soon enough
+    ParentalIdInfo parental_id_info;
+    uintptr_t* sex_nm;
+    uintptr_t* sex_male;
+    if (unlikely(bigstack_alloc_c(sample_ct * max_paternal_id_blen, &parental_id_info.paternal_ids) ||
+                 bigstack_alloc_c(sample_ct * max_maternal_id_blen, &parental_id_info.maternal_ids) ||
+                 bigstack_calloc_w(sample_ctl, &sex_nm) ||
+                 bigstack_calloc_w(sample_ctl, &sex_male))) {
+      goto MergePsams_ret_NOMEM;
+    }
+    {
+      char* paternal_ids_iter = parental_id_info.paternal_ids;
+      char* maternal_ids_iter = parental_id_info.maternal_ids;
+      for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
+        strcpy_k(paternal_ids_iter, "0");
+        paternal_ids_iter = &(paternal_ids_iter[max_paternal_id_blen]);
+        strcpy_k(maternal_ids_iter, "0");
+        maternal_ids_iter = &(maternal_ids_iter[max_maternal_id_blen]);
+      }
+    }
+    parental_id_info.max_paternal_id_blen = max_paternal_id_blen;
+    parental_id_info.max_maternal_id_blen = max_maternal_id_blen;
+
+    // unlike core pheno_cols, this just lives on bigstack
+    PhenoCol* pheno_cols = nullptr;
+
+    if (pheno_ct) {
+      if (unlikely(BIGSTACK_ALLOC_X(PhenoCol, pheno_ct, &pheno_cols))) {
+        goto MergePsams_ret_NOMEM;
+      }
+      for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+        // we don't really distinguish between case/control and quantitative
+        // phenotypes here (just use data.qt in both cases), and it should be
+        // ok to overallocate a bit in the categorical-phenotype case
+        if (unlikely(bigstack_calloc_w(sample_ctl, &(pheno_cols[pheno_idx].nonmiss)) ||
+                     bigstack_alloc_d(sample_ct, &(pheno_cols[pheno_idx].data.qt)))) {
+          goto MergePsams_ret_NOMEM;
+        }
+        // We use kPhenoDtypeCc instead of kPhenoTypeOther to indicate "type
+        // not yet known", since that lets us leave it unchanged if all values
+        // are missing.
+        pheno_cols[pheno_idx].type_code = kPhenoDtypeCc;
+        // defensive
+        pheno_cols[pheno_idx].category_names = nullptr;
+        pheno_cols[pheno_idx].nonnull_category_ct = 0;
+      }
+    }
+    unsigned char* bigstack_mark2 = g_bigstack_base;
+    const uint32_t kNonphenoColCt = 5;
+    const uint32_t max_col_ct = pheno_ct + kNonphenoColCt;
+    // note that ZeroWArr(pheno_ctl, pheno_include) is safe when pheno_ct == 0
+    const uint32_t pheno_ctl = BitCtToWordCt(pheno_ct);
+    uint32_t* col_skips;
+    uint32_t* col_types;
+    const char** token_ptrs;
+    uint32_t* token_slens;
+    if (unlikely(bigstack_alloc_u32(max_col_ct, &col_skips) ||
+                 bigstack_alloc_u32(max_col_ct, &col_types) ||
+                 bigstack_alloc_kcp(max_col_ct, &token_ptrs) ||
+                 bigstack_alloc_u32(max_col_ct, &token_slens) ||
+                 bigstack_alloc_w(pheno_ctl, &pheno_include) ||
+                 bigstack_alloc_c(linebuf_capacity, &linebuf))) {
+      goto MergePsams_ret_NOMEM;
+    }
+    // phenotype-major
+    uintptr_t* pheno_locked = nullptr;
+    if (pheno_ct && (pmip->merge_pheno_mode != kMergePhenoModeNmFirst)) {
+      // no need to check for overflow in 32-bit case since we would have
+      // already run out of memory on data.qt allocations
+      const uintptr_t bit_ct = S_CAST(uintptr_t, pheno_ct) * sample_ct;
+      if (unlikely(bigstack_calloc_w(BitCtToWordCt(bit_ct), &pheno_locked))) {
+        goto MergePsams_ret_NOMEM;
+      }
+    }
+
+    if (pheno_ct) {
+      logprintf("--pmerge%s: %u sample%s and %u phenotype%s present.\n", pmip->list_fname? "-list" : "", sample_ct, (sample_ct == 1)? "" : "s", pheno_ct, (pheno_ct == 1)? "" : "s");
+    } else {
+      logprintf(g_logbuf, kLogbufSize, "--pmerge%s: %u sample%s present.\n", pmip->list_fname? "-list" : "", sample_ct, (sample_ct == 1)? "" : "s");
+    }
+    //  [category_names] [category_names_htable]            [strings]
+    // ^                                        ^          ^         ^
+    // |                                        |          |         |
+    // g_bigstack_base                    arena_bottom arena_top g_bigstack_end
+    //
+    // category-name strings are allocated at the end of bigstack, and need to
+    // remain allocated until WritePsam() returns.
+    // category_names_htable is repositioned, resized, and rebuilt whenever
+    // category_names would otherwise overflow.
+    // No other allocations allowed past this point, until WritePsam() call.
+    arena_bottom = g_bigstack_base;
+    arena_top = g_bigstack_end;
+    const char** category_names = nullptr;
+    // uint32_t* category_names_htable = nullptr;
+    uint32_t category_names_ct = 0;
+    // category_names_capacity == category_names_htable_size / 2
+    // uint32_t category_names_htable_size = 0;
+    filesets_iter = filesets;
+    // possible todo: track seen realpaths, skip duplicates
+    do {
+      cur_fname = filesets_iter->psam_fname;
+      reterr = TextStreamOpenEx(cur_fname, kMaxLongLine, linebuf_capacity, decompress_thread_ct, nullptr, linebuf, &txs);
+      if (unlikely(reterr)) {
+        goto MergePsams_ret_TSTREAM_REWIND_FAIL;
+      }
+      const char* line_start = TextLineEnd(&txs);
+      for (line_idx = 1; ; ++line_idx) {
+        if (unlikely(!TextGetUnsafe2K(&txs, &line_start))) {
+          reterr = TextStreamRawErrcode(&txs);
+          goto MergePsams_ret_TSTREAM_REWIND_FAIL;
+        }
+        if ((line_start[0] != '#') || tokequal_k(&(line_start[1]), "FID") || tokequal_k(&(line_start[1]), "IID")) {
+          break;
+        }
+        line_start = AdvPastDelim(line_start, '\n');
+      }
+      ZeroWArr(pheno_ctl, pheno_include);
+      // uint32_t sid_present = 0;
+      // uint32_t parents_present = 0;
+      // uint32_t sex_present = 0;
+      uint32_t fid_present;
+      uint32_t relevant_postfid_col_ct;
+      col_types[0] = 0;
+      if (line_start[0] == '#') {
+        fid_present = (line_start[1] == 'F');
+        relevant_postfid_col_ct = 1 - fid_present;
+        if (!fid_present) {
+          col_skips[0] = 0;
+        }
+        const char* token_end = &(line_start[4]);
+        for (uint32_t col_idx = 1; ; ++col_idx) {
+          const char* token_start = FirstNonTspace(token_end);
+          if (IsEolnKns(*token_start)) {
+            break;
+          }
+          token_end = CurTokenEnd(token_start);
+          const uint32_t token_slen = token_end - token_start;
+          uint32_t cur_col_type = UINT32_MAX;
+          if (token_slen == 3) {
+            if (memequal_k(token_start, "IID", 3)) {
+              cur_col_type = 0;
+            } else if (memequal_k(token_start, "SID", 3)) {
+              // sid_present = 1;
+              cur_col_type = 1;
+            } else if (memequal_k(token_start, "PAT", 3)) {
+              cur_col_type = 2;
+              // parents_present = 1;
+            } else if (memequal_k(token_start, "MAT", 3)) {
+              cur_col_type = 3;
+            } else if (memequal_k(token_start, "SEX", 3)) {
+              cur_col_type = 4;
+              // sex_present = 1;
+            }
+          }
+          if (cur_col_type == UINT32_MAX) {
+            if (token_slen < max_pheno_name_blen) {
+              cur_col_type = StrboxHtableFind(token_start, pheno_names, pheno_names_htable, max_pheno_name_blen, token_slen, pheno_names_table_size);
+            }
+            if (cur_col_type == UINT32_MAX) {
+              continue;
+            }
+            SetBit(cur_col_type, pheno_include);
+            cur_col_type += kNonphenoColCt;
+          }
+          col_skips[relevant_postfid_col_ct] = col_idx;
+          col_types[relevant_postfid_col_ct++] = cur_col_type;
+        }
+        for (uint32_t rpf_col_idx = relevant_postfid_col_ct - 1; rpf_col_idx; --rpf_col_idx) {
+          col_skips[rpf_col_idx] -= col_skips[rpf_col_idx - 1];
+        }
+        line_start = AdvPastDelim(token_end, '\n');
+        ++line_idx;
+      } else {
+        // .fam
+        fid_present = (fam_cols / kfFamCol1) & 1;
+        col_skips[0] = fid_present;
+        relevant_postfid_col_ct = 1;
+        if (fam_cols & kfFamCol34) {
+          col_skips[relevant_postfid_col_ct] = 1;
+          col_types[relevant_postfid_col_ct++] = 2;
+          col_skips[relevant_postfid_col_ct] = 1;
+          col_types[relevant_postfid_col_ct++] = 3;
+          // parents_present = 1;
+        }
+        if (fam_cols & kfFamCol5) {
+          col_skips[relevant_postfid_col_ct] = 1;
+          col_types[relevant_postfid_col_ct++] = 4;
+          // sex_present = 1;
+        }
+        if (fam_cols & kfFamCol6) {
+          col_skips[relevant_postfid_col_ct] = 1;
+          const uint32_t pheno_idx = StrboxHtableFind("PHENO1", pheno_names, pheno_names_htable, max_pheno_name_blen, strlen("PHENO1"), pheno_names_table_size);
+          col_types[relevant_postfid_col_ct++] = kNonphenoColCt + pheno_idx;
+          SetBit(pheno_idx, pheno_include);
+        }
+      }
+      // TODO
+      if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
+        goto MergePsams_ret_TSTREAM_FAIL;
+      }
+      if (unlikely(CleanupTextStream2(cur_fname, &txs, &reterr))) {
+        goto MergePsams_ret_1;
+      }
+      filesets_iter = filesets_iter->next;
+    } while (filesets_iter);
+    BigstackReset(bigstack_mark2);
+    BigstackEndSet(arena_top);
+    if (pheno_ct) {
+      if (category_names_ct) {
+        const char** category_names_final = R_CAST(const char**, g_bigstack_base);
+        BigstackBaseSet(&(category_names_final[category_names_ct]));
+        memmove(category_names_final, category_names, category_names_ct * sizeof(intptr_t));
+        category_names = category_names_final;
+      }
+      for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+        PhenoCol* pheno_col = &(pheno_cols[pheno_idx]);
+        if (pheno_col->type_code == kPhenoDtypeQt) {
+          if (misc_flags & kfMiscAffection01) {
+            // TODO
+            // If all values are in {0, 1, missing}, convert to {1, 2, missing}
+          }
+        } else if (pheno_col->type_code == kPhenoDtypeCat) {
+          pheno_col->category_names = category_names;
+        }
+      }
+      ;;;
+    }
+    snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
+    reterr = WritePsam(outname, sample_include, siip, &parental_id_info, sex_nm, sex_male, pheno_cols, pheno_names, nullptr, "NA", sample_ct, pheno_ct, max_pheno_name_blen, kfPsamColDefault);
+    if (unlikely(reterr)) {
+      goto MergePsams_ret_1;
+    }
+    logprintfww("--pmerge%s: Merged .psam written to %s .\n", pmip->list_fname? "-list" : "", outname);
+    *sample_ctp = sample_ct;
+    ArenaEndSet(siip->sample_ids, &bigstack_end_mark);
+  }
+  while (0) {
+  MergePsams_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  MergePsams_ret_TSTREAM_REWIND_FAIL:
+    TextStreamErrPrintRewind(cur_fname, &txs, &reterr);
+    break;
+  MergePsams_ret_TSTREAM_FAIL:
+    TextStreamErrPrint(cur_fname, &txs);
+    break;
+  MergePsams_ret_MISSING_TOKENS:
+    snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, cur_fname);
+  MergePsams_ret_MALFORMED_INPUT_WW:
+    WordWrapB(0);
+  MergePsams_ret_MALFORMED_INPUT_2:
+    logerrputsb();
+  MergePsams_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  MergePsams_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ MergePsams_ret_1:
+  CleanupTextStream2(cur_fname, &txs, &reterr);
+  BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, __attribute__((unused)) ChrInfo* cip) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
   PglErr reterr = kPglRetSuccess;
   {
     // 1. Construct/load fileset list.
@@ -81,17 +1226,63 @@ PglErr Pmerge(__attribute__((unused)) const PmergeInfo* pmip, __attribute__((unu
     // 4. If filesets cover disjoint positions, handle this as a concatenation
     //    job (or error out on --variant-inner-join).
     // 5. Otherwise, perform general-purpose incremental merge.
-    // const PmergeFlags flags = pmip->flags;
-    ;;;
+    PmergeInputFilesetLl* filesets = nullptr;
+    PmergeInputFilesetLl** filesets_endp = &filesets;
+    if (pgenname[0]) {
+      PmergeInputFilesetLl* cur_entry = AllocFilesetLlEntry(&filesets_endp);
+      if (unlikely(!cur_entry)) {
+        goto Pmerge_ret_NOMEM;
+      }
+      const uint32_t pgen_fname_blen = strlen(pgenname) + 1;
+      const uint32_t pvar_fname_blen = strlen(pvarname) + 1;
+      const uint32_t psam_fname_blen = strlen(psamname) + 1;
+      char* fname_iter;
+      if (unlikely(bigstack_end_alloc_c(pgen_fname_blen + pvar_fname_blen + psam_fname_blen, &fname_iter))) {
+        goto Pmerge_ret_NOMEM;
+      }
+      cur_entry->pgen_fname = fname_iter;
+      fname_iter = memcpya(fname_iter, pgenname, pgen_fname_blen);
+      cur_entry->pvar_fname = fname_iter;
+      fname_iter = memcpya(fname_iter, pvarname, pvar_fname_blen);
+      cur_entry->psam_fname = fname_iter;
+      memcpy(fname_iter, psamname, psam_fname_blen);
+    }
+    if (!pmip->list_fname) {
+      PmergeInputFilesetLl* cur_entry = AllocFilesetLlEntry(&filesets_endp);
+      if (unlikely(!cur_entry)) {
+        goto Pmerge_ret_NOMEM;
+      }
+      cur_entry->pgen_fname = pmip->pgen_fname;
+      cur_entry->pvar_fname = pmip->pvar_fname;
+      cur_entry->psam_fname = pmip->psam_fname;
+    } else {
+      reterr = LoadPmergeList(pmip->list_fname, pmip->list_mode, pgenname[0] != '\0', &filesets_endp);
+      if (unlikely(reterr)) {
+        goto Pmerge_ret_1;
+      }
+    }
+
+    SampleIdInfo sii;
+    uint32_t sample_ct;
+    reterr = MergePsams(pmip, sample_sort_fname, misc_flags, sample_sort_mode, fam_cols, max_thread_ct, outname, outname_end, filesets, &sii, &sample_ct);
+    if (unlikely(reterr)) {
+      goto Pmerge_ret_1;
+    }
+
+    if (unlikely(reterr)) {
+      goto Pmerge_ret_1;
+    }
     logerrputs("Error: --pmerge[-list] is under development.\n");
     reterr = kPglRetNotYetSupported;
     goto Pmerge_ret_1;
   }
   while (0) {
-
+  Pmerge_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
   }
  Pmerge_ret_1:
-  BigstackReset(bigstack_mark);
+  BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
   return reterr;
 }
 
@@ -925,7 +2116,7 @@ PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip,
         const uint32_t cur_max_merged_alleles = 1 + cur_allele_ct1 + cur_allele_ct2 - allele1_alt_start - allele2_alt_start;
         const uint32_t cur_htable_size = (cur_max_merged_alleles * 9) / 2;
         SetAllU32Arr(cur_htable_size, merged_alleles_htable);
-        // See e.g PopulateStrboxHtable().
+        // See e.g. PopulateStrboxHtable().
         uint32_t hashval = Hashceil(merged_alleles[0], strlen(merged_alleles[0]), cur_htable_size);
         merged_alleles_htable[hashval] = 0;
         ++merged_allele_ct;
@@ -936,27 +2127,18 @@ PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip,
           }
           const char* cur_allele1 = cur_allele1s[allele1_idx];
           const uint32_t cur_allele1_slen = strlen(cur_allele1);
-          for (hashval = Hashceil(cur_allele1, cur_allele1_slen, cur_htable_size); ; ) {
-            const uint32_t cur_htable_entry = merged_alleles_htable[hashval];
-            if (cur_htable_entry == UINT32_MAX) {
-              if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
-                logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
-                reterr = kPglRetNotYetSupported;
-                goto PgenDiff_ret_1;
-              }
-              remap1[allele1_idx] = merged_allele_ct;
-              merged_alleles_htable[hashval] = merged_allele_ct;
-              merged_alleles[merged_allele_ct] = cur_allele1;
-              ++merged_allele_ct;
-              break;
-            }
-            if (memequal(cur_allele1, merged_alleles[cur_htable_entry], cur_allele1_slen + 1)) {
-              remap1[allele1_idx] = cur_htable_entry;
-              break;
-            }
-            if (++hashval == cur_htable_size) {
-              hashval = 0;
+          const uint32_t cur_idx = IdHtableAdd(cur_allele1, merged_alleles, cur_allele1_slen, cur_htable_size, merged_allele_ct, merged_alleles_htable);
+          if (cur_idx == UINT32_MAX) {
+            if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
+              logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
+              reterr = kPglRetNotYetSupported;
+              goto PgenDiff_ret_1;
             }
+            remap1[allele1_idx] = merged_allele_ct;
+            merged_alleles[merged_allele_ct] = cur_allele1;
+            ++merged_allele_ct;
+          } else {
+            remap1[allele1_idx] = cur_idx;
           }
         }
         for (uint32_t allele2_idx = allele2_alt_start; allele2_idx != cur_allele_ct2; ++allele2_idx) {
@@ -965,27 +2147,18 @@ PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip,
           }
           const char* cur_allele2 = cur_allele2s[allele2_idx];
           const uint32_t cur_allele2_slen = strlen(cur_allele2);
-          for (hashval = Hashceil(cur_allele2, cur_allele2_slen, cur_htable_size); ; ) {
-            const uint32_t cur_htable_entry = merged_alleles_htable[hashval];
-            if (cur_htable_entry == UINT32_MAX) {
-              if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
-                logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
-                reterr = kPglRetNotYetSupported;
-                goto PgenDiff_ret_1;
-              }
-              remap2[allele2_idx] = merged_allele_ct;
-              merged_alleles_htable[hashval] = merged_allele_ct;
-              merged_alleles[merged_allele_ct] = cur_allele2;
-              ++merged_allele_ct;
-              break;
-            }
-            if (memequal(cur_allele2, merged_alleles[cur_htable_entry], cur_allele2_slen + 1)) {
-              remap2[allele2_idx] = cur_htable_entry;
-              break;
-            }
-            if (++hashval == cur_htable_size) {
-              hashval = 0;
+          const uint32_t cur_idx = IdHtableAdd(cur_allele2, merged_alleles, cur_allele2_slen, cur_htable_size, merged_allele_ct, merged_alleles_htable);
+          if (cur_idx == UINT32_MAX) {
+            if (unlikely(merged_allele_ct == kPglMaxAlleleCt)) {
+              logerrprintfww("Error: Too many alleles across --pgen-diff and main filesets for variant '%s' at position %s:%u. (This " PROG_NAME_STR " build is limited to " PGL_MAX_ALLELE_CT_STR ".)\n", variant_ids[variant_uidx], chr_buf, cur_included_bp);
+              reterr = kPglRetNotYetSupported;
+              goto PgenDiff_ret_1;
             }
+            remap2[allele2_idx] = merged_allele_ct;
+            merged_alleles[merged_allele_ct] = cur_allele2;
+            ++merged_allele_ct;
+          } else {
+            remap2[allele2_idx] = cur_idx;
           }
         }
         const uint32_t merged_allele_ctl = BitCtToWordCt(merged_allele_ct);


=====================================
plink2_merge.h
=====================================
@@ -71,15 +71,13 @@ ENUM_U31_DEF_START()
   kMergeFilterModeNonpassUnion
 ENUM_U31_DEF_END(MergeFilterMode);
 
-// this is a hybrid, only kfSortFileSid is actually a flag
-FLAGSET_DEF_START()
-  kfSort0,
-  kfSortNone = (1 << 0),
-  kfSortNatural = (1 << 1),
-  kfSortAscii = (1 << 2),
-  kfSortFile = (1 << 3),
-  kfSortFileSid = (1 << 4)
-FLAGSET_DEF_END(SortFlags);
+ENUM_U31_DEF_START()
+  kSort0,
+  kSortNone,
+  kSortNatural,
+  kSortAscii,
+  kSortFile
+ENUM_U31_DEF_END(SortMode);
 
 // --pgen-diff based here due to --merge-mode 6/7 history
 FLAGSET_DEF_START()
@@ -110,7 +108,8 @@ typedef struct PmergeStruct {
   MergeQualInfoMode merge_qual_mode;
   MergeFilterMode merge_filter_mode;
   MergeQualInfoMode merge_info_mode;
-  SortFlags merge_info_sort;
+  SortMode merge_pheno_sort;
+  SortMode merge_info_sort;
   uint32_t max_allele_ct;
   char* pgen_fname;
   char* pvar_fname;
@@ -135,7 +134,7 @@ void InitPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
 
 void CleanupPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
 
-PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortFlags sample_sort_flags, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
+PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
 
 PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const PgenDiffInfo* pdip, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t max_allele_ct1, uint32_t max_allele_slen, uint32_t max_thread_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
 


=====================================
plink2_misc.cc
=====================================
@@ -512,7 +512,7 @@ PglErr UpdateVarAlleles(const char* fname, const uintptr_t* variant_include, con
         if ((old_allele2_match == UINT32_MAX) || (cur_allele_ct != 2) || (old_slen1 != 1) || (old_allele1_start[0] != '.')) {
         UpdateVarAlleles_errfile:
           if (!err_ct) {
-            strcpy(outname_end, ".allele.no.snp");
+            strcpy_k(outname_end, ".allele.no.snp");
             if (fopen_checked(outname, FOPEN_WB, &errfile)) {
               goto UpdateVarAlleles_ret_OPEN_FAIL;
             }
@@ -1023,7 +1023,7 @@ PglErr RecoverVarIds(const char* fname, const uintptr_t* variant_include, const
     logprintf("--recover-var-ids: %" PRIuPTR " line%s scanned.\n", line_idx, (line_idx == 1)? "" : "s");
     const uint32_t conflict_ct = PopcountWords(conflict_bitarr, raw_variant_ctl);
     if (conflict_ct) {
-      strcpy(outname_end, ".recoverid.dup");
+      strcpy_k(outname_end, ".recoverid.dup");
       if (fopen_checked(outname, FOPEN_WB, &errfile)) {
         goto RecoverVarIds_ret_OPEN_FAIL;
       }
@@ -1290,33 +1290,20 @@ PglErr Plink1ClusterImport(const char* within_fname, const char* catpheno_name,
           logerrputs("Error: Category names are limited to " MAX_ID_SLEN_STR " characters.\n");
           goto Plink1ClusterImport_ret_INCONSISTENT_INPUT;
         }
-        uint32_t hashval = Hashceil(main_token_start, main_token_slen, cat_htable_size);
-        const uint32_t main_token_blen = main_token_slen + 1;
-        uint32_t cur_htable_entry;
-        while (1) {
-          cur_htable_entry = cat_htable[hashval];
-          if (cur_htable_entry == UINT32_MAX) {
-            if (unlikely(main_token_blen > S_CAST(uintptr_t, cat_name_write_max - cat_name_iter))) {
-              goto Plink1ClusterImport_ret_NOMEM;
-            }
-            char* cat_name_start = cat_name_iter;
-            cat_name_iter = memcpya(cat_name_iter, main_token_start, main_token_blen);
-            cur_cat_names[++nonnull_cat_ct] = cat_name_start;
-            cur_htable_entry = nonnull_cat_ct;
-            cat_htable[hashval] = cur_htable_entry;
-            break;
-          }
-          if (memequal(main_token_start, cur_cat_names[cur_htable_entry], main_token_blen)) {
-            break;
-          }
-          if (++hashval == cat_htable_size) {
-            hashval = 0;
+        uint32_t cur_cat_idx = IdHtableAdd(main_token_start, cur_cat_names, main_token_slen, cat_htable_size, nonnull_cat_ct + 1, cat_htable);
+        if (cur_cat_idx == UINT32_MAX) {
+          if (unlikely(main_token_slen >= S_CAST(uintptr_t, cat_name_write_max - cat_name_iter))) {
+            goto Plink1ClusterImport_ret_NOMEM;
           }
+          char* cat_name_start = cat_name_iter;
+          cat_name_iter = memcpya(cat_name_iter, main_token_start, main_token_slen + 1);
+          cur_cat_idx = nonnull_cat_ct + 1;
+          cur_cat_names[cur_cat_idx] = cat_name_start;
         }
         // permit duplicates if category is identical
         if (IsSet(already_seen, lb_idx)) {
           const uint32_t existing_cat_idx = sorted_cat_idxs[lb_idx];
-          if (unlikely(existing_cat_idx != cur_htable_entry)) {
+          if (unlikely(existing_cat_idx != cur_cat_idx)) {
             idbuf[fid_slen] = ' ';
             logpreprintfww("Error: Duplicate sample ID '%s' with conflicting category assignments in --within file.\n", idbuf);
             goto Plink1ClusterImport_ret_MALFORMED_INPUT_2;
@@ -1325,7 +1312,7 @@ PglErr Plink1ClusterImport(const char* within_fname, const char* catpheno_name,
         } else {
           SetBit(lb_idx, already_seen);
           for (; lb_idx != ub_idx; ++lb_idx) {
-            sorted_cat_idxs[lb_idx] = cur_htable_entry;
+            sorted_cat_idxs[lb_idx] = cur_cat_idx;
           }
         }
         goto Plink1ClusterImport_LINE_ITER_ALREADY_ADVANCED;
@@ -1936,6 +1923,10 @@ PglErr UpdateSampleIds(const char* fname, const uintptr_t* sample_include, uint3
     } else {
       xid_mode = old_sid_present? kfXidModeIidSid : kfXidModeIid;
     }
+    if (new_fid_present) {
+      // bugfix (14 Jan 2021)
+      siip->flags |= kfSampleIdFidPresent;
+    }
     const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
     const uintptr_t max_sid_blen = siip->max_sid_blen;
     uint32_t* xid_map = nullptr;
@@ -9007,19 +8998,9 @@ PglErr WriteCovar(const uintptr_t* sample_include, const PedigreeIdInfo* piip, c
       uint32_t max_xcovar_name_blen = max_covar_name_blen;
       if (write_sex) {
         // add "SEX"
-        for (uint32_t hashval = Hashceil("SEX", 3, covar_name_htable_size); ; ) {
-          const uint32_t cur_htable_entry = covar_name_htable[hashval];
-          if (cur_htable_entry == UINT32_MAX) {
-            covar_name_htable[hashval] = covar_ct;
-            break;
-          }
-          if (unlikely(strequal_k_unsafe(&(covar_names[cur_htable_entry * max_covar_name_blen]), "SEX"))) {
-            logerrputs("Error: .cov file cannot have both a regular SEX column and a covariate named\n'SEX'.  Exclude or rename one of these columns.\n");
-            goto WriteCovar_ret_INCONSISTENT_INPUT;
-          }
-          if (++hashval == covar_name_htable_size) {
-            hashval = 0;
-          }
+        if (unlikely(StrboxHtableAdd("SEX", covar_names, max_covar_name_blen, strlen("SEX"), covar_name_htable_size, covar_ct, covar_name_htable) != UINT32_MAX)) {
+          logerrputs("Error: .cov file cannot have both a regular SEX column and a covariate named\n'SEX'.  Exclude or rename one of these columns.\n");
+          goto WriteCovar_ret_INCONSISTENT_INPUT;
         }
         if (max_xcovar_name_blen < 4) {
           max_xcovar_name_blen = 4;
@@ -9031,6 +9012,8 @@ PglErr WriteCovar(const uintptr_t* sample_include, const PedigreeIdInfo* piip, c
           *write_iter++ = '\t';
           const uint32_t cur_pheno_name_slen = strlen(pheno_name_iter);
           if (cur_pheno_name_slen < max_xcovar_name_blen) {
+            // can't just use StrboxHtableFind() since "SEX" may not be stored
+            // in covar_names[]
             const uint32_t cur_pheno_name_blen = cur_pheno_name_slen + 1;
             for (uint32_t hashval = Hashceil(pheno_name_iter, cur_pheno_name_slen, covar_name_htable_size); ; ) {
               const uint32_t cur_htable_idval = covar_name_htable[hashval];


=====================================
plink2_psam.cc
=====================================
@@ -94,7 +94,7 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
       // parse header
       // [-1] = #FID (if present, must be first column)
       // [0] = IID (could also be first column)
-      // [1] = SID
+      // [1] = SID (if present, must immediately follow IID)
       // [2] = PAT
       // [3] = MAT
       // [4] = SEX
@@ -229,7 +229,15 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
       }
       BigstackBaseSet(ll_alloc_base);
       if (unlikely(!(psam_cols_mask & 1))) {
-        snprintf(g_logbuf, kLogbufSize, "Error: No IID column on line %" PRIuPTR " of %s.\n", line_idx, psamname);
+        snprintf(g_logbuf, kLogbufSize, "Error: No IID column in %s.\n", psamname);
+        goto LoadPsam_ret_MALFORMED_INPUT_WW;
+      }
+      if (unlikely(col_types[0] != 0)) {
+        snprintf(g_logbuf, kLogbufSize, "Error: IID column is not first or second in %s.\n", psamname);
+        goto LoadPsam_ret_MALFORMED_INPUT_WW;
+      }
+      if (unlikely((psam_cols_mask & 2) && ((col_types[1] != 1) || (col_skips[1] != 1)))) {
+        snprintf(g_logbuf, kLogbufSize, "Error: SID column does not immediately follow IID column in %s.\n", psamname);
         goto LoadPsam_ret_MALFORMED_INPUT_WW;
       }
       if (unlikely(in_interval)) {
@@ -257,7 +265,7 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
       col_skips = S_CAST(uint32_t*, bigstack_alloc_raw_rd(relevant_postfid_col_ct * sizeof(int32_t)));
       col_types = S_CAST(uint32_t*, bigstack_alloc_raw_rd(relevant_postfid_col_ct * sizeof(int32_t)));
       bigstack_mark2 = g_bigstack_base;
-      col_skips[0] = fam_cols & 1;  // assumes kfFamCol1 == 1
+      col_skips[0] = fid_present;
       col_types[0] = 0;
       // psam_cols_mask = 1;  // may need this later
       uint32_t rpf_col_idx = 1;


=====================================
plink2_psam.h
=====================================
@@ -35,8 +35,8 @@ namespace plink2 {
 // by a leading '#'.  All lines which don't start with '#FID' or '#IID' are
 // currently ignored.  The #FID/IID line specifies the columns in the .psam
 // file; the following column headers are recognized:
-//   IID (required)
-//   SID
+//   IID (required, must be first or second)
+//   SID (must immediately follow IID if present)
 //   PAT
 //   MAT
 //   SEX



View it on GitLab: https://salsa.debian.org/med-team/plink2/-/compare/8fb4d070837984696a17af2daf20a5b7ee95bc70...dc83f84b6f540ce18f6c247dc383bc3af3ac7b01

-- 
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/compare/8fb4d070837984696a17af2daf20a5b7ee95bc70...dc83f84b6f540ce18f6c247dc383bc3af3ac7b01
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210117/5ce2d318/attachment-0001.html>


More information about the debian-med-commit mailing list