[med-svn] [Git][med-team/plink2][upstream] New upstream version 2.00~a3-210118+dfsg
Michael R. Crusoe
gitlab at salsa.debian.org
Thu Jan 21 12:44:55 GMT 2021
Michael R. Crusoe pushed to branch upstream at Debian Med / plink2
Commits:
71771f5c by Michael R. Crusoe at 2021-01-21T12:50:11+01:00
New upstream version 2.00~a3-210118+dfsg
- - - - -
14 changed files:
- plink2.cc
- plink2_cmdline.cc
- plink2_cmdline.h
- plink2_common.cc
- plink2_common.h
- plink2_data.cc
- plink2_data.h
- plink2_export.cc
- plink2_help.cc
- plink2_import.h
- plink2_merge.cc
- plink2_merge.h
- plink2_psam.cc
- plink2_pvar.cc
Changes:
=====================================
plink2.cc
=====================================
@@ -71,7 +71,7 @@ static const char ver_str[] = "PLINK v2.00a3"
#ifdef USE_MKL
" Intel"
#endif
- " (16 Jan 2021)";
+ " (18 Jan 2021)";
static const char ver_str2[] =
// include leading space if day < 10, so character length stays the same
""
@@ -7765,20 +7765,34 @@ int main(int argc, char** argv) {
snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-mode argument '%s'.\n", cur_modif);
goto main_ret_INVALID_CMDLINE_WWA;
}
- } else if (strequal_k_unsafe(flagname_p2, "erge-pheno-mode")) {
+ } else if (strequal_k_unsafe(flagname_p2, "erge-parents-mode") ||
+ strequal_k_unsafe(flagname_p2, "erge-sex-mode") ||
+ strequal_k_unsafe(flagname_p2, "erge-pheno-mode")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
goto main_ret_INVALID_CMDLINE_2A;
}
const char* cur_modif = argvk[arg_idx + 1];
const uint32_t cur_modif_slen = strlen(cur_modif);
- if (strequal_k(cur_modif, "nm-first", cur_modif_slen) || strequal_k(cur_modif, "2", cur_modif_slen)) {
- pmerge_info.merge_pheno_mode = kMergePhenoModeNmFirst;
- } else if (strequal_k(cur_modif, "first", cur_modif_slen) || strequal_k(cur_modif, "4", cur_modif_slen)) {
- pmerge_info.merge_pheno_mode = kMergePhenoModeFirst;
- } else if (unlikely(!(strequal_k(cur_modif, "nm-match", cur_modif_slen) || strequal_k(cur_modif, "1", cur_modif_slen)))) {
- snprintf(g_logbuf, kLogbufSize, "Error: Invalid --merge-pheno-mode argument '%s'.\n", cur_modif);
+ MergePhenoMode mode;
+ if (strequal_k(cur_modif, "nm-match", cur_modif_slen) || strequal_k(cur_modif, "1", cur_modif_slen)) {
+ mode = kMergePhenoModeNmMatch;
+ } else if (strequal_k(cur_modif, "nm-first", cur_modif_slen) || strequal_k(cur_modif, "2", cur_modif_slen)) {
+ mode = kMergePhenoModeNmFirst;
+ } else if (likely(strequal_k(cur_modif, "first", cur_modif_slen) || strequal_k(cur_modif, "4", cur_modif_slen))) {
+ mode = kMergePhenoModeFirst;
+ } else {
+ snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s argument '%s'.\n", flagname_p, cur_modif);
goto main_ret_INVALID_CMDLINE_WWA;
}
+ if (flagname_p2[5] == 'p') {
+ if (flagname_p2[6] == 'a') {
+ pmerge_info.merge_parents_mode = mode;
+ } else {
+ pmerge_info.merge_pheno_mode = mode;
+ }
+ } else {
+ pmerge_info.merge_sex_mode = mode;
+ }
} else if (strequal_k_unsafe(flagname_p2, "erge-xheader-mode")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
goto main_ret_INVALID_CMDLINE_2A;
@@ -7786,6 +7800,10 @@ int main(int argc, char** argv) {
const char* cur_modif = argvk[arg_idx + 1];
const uint32_t cur_modif_slen = strlen(cur_modif);
if (strequal_k(cur_modif, "erase", cur_modif_slen)) {
+ if (unlikely(pmerge_info.merge_info_mode != kMergeQualInfoModeErase)) {
+ logerrputs("Error: \"--merge-xheader-mode erase\" requires \"--merge-info-mode erase\".\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
pmerge_info.merge_xheader_mode = kMergeXheaderModeErase;
} else if (strequal_k(cur_modif, "match", cur_modif_slen)) {
pmerge_info.merge_xheader_mode = kMergeXheaderModeMatch;
@@ -8617,6 +8635,9 @@ int main(int argc, char** argv) {
}
pc.command_flags1 |= kfCommand1PgenDiff;
pc.dependency_flags |= kfFilterAllReq;
+ } else if (strequal_k_unsafe(flagname_p2, "heno-inner-join")) {
+ pmerge_info.flags |= kfPmergePhenoInnerJoin;
+ goto main_param_zero;
} else if (likely(strequal_k_unsafe(flagname_p2, "heno-quantile-normalize"))) {
if (param_ct) {
reterr = AllocAndFlatten(&(argvk[arg_idx + 1]), param_ct, 0x7fffffff, &pc.quantnorm_flattened);
@@ -10644,7 +10665,11 @@ int main(int argc, char** argv) {
if (make_plink2_flags & (kfMakePgen | kfMakePvar | kfMakePsam)) {
merge_outname_end = strcpya_k(merge_outname_end, "-merge");
}
- reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, pc.sample_sort_mode, pc.fam_cols, pc.max_thread_ct, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
+ const uint32_t zst_level = g_zst_level;
+ if (!(import_flags & kfImportKeepAutoconv)) {
+ g_zst_level = 1;
+ }
+ reterr = Pmerge(&pmerge_info, pc.sample_sort_fname, pc.misc_flags, import_flags, pc.sample_sort_mode, pc.fam_cols, pc.missing_pheno, pc.max_thread_ct, pgenname, psamname, pvarname, outname, merge_outname_end, &chr_info);
if (unlikely(reterr)) {
goto main_ret_1;
}
@@ -10652,6 +10677,7 @@ int main(int argc, char** argv) {
if (!pc.command_flags1) {
goto main_ret_1;
}
+ g_zst_level = zst_level;
}
if ((pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar | kfCommand1WriteSamples))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (pc.sort_vars_mode > kSortNone))) {
=====================================
plink2_cmdline.cc
=====================================
@@ -1496,6 +1496,18 @@ uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t
}
}
+uint32_t StrboxHtableFindNnt(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size) {
+ for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
+ const uint32_t cur_htable_idval = id_htable[hashval];
+ if ((cur_htable_idval == UINT32_MAX) || strequal_unsafe(&(strbox[cur_htable_idval * max_str_blen]), cur_id, cur_id_slen)) {
+ return cur_htable_idval;
+ }
+ if (++hashval == id_htable_size) {
+ hashval = 0;
+ }
+ }
+}
+
uint32_t StrboxHtableAdd(const char* cur_id, const char* strbox, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t value, uint32_t* id_htable) {
for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
const uint32_t cur_htable_entry = id_htable[hashval];
=====================================
plink2_cmdline.h
=====================================
@@ -1431,6 +1431,8 @@ uint32_t IdHtableAddNnt(const char* cur_id, const char* const* item_ids, uint32_
// requires cur_id to be null-terminated.
uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size);
+uint32_t StrboxHtableFindNnt(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size);
+
// Assumes cur_id is null-terminated.
// Returns string-index if cur_id is already in the table, UINT32_MAX if it was
// added.
=====================================
plink2_common.cc
=====================================
@@ -1220,6 +1220,25 @@ PglErr LoadXidHeaderPair(const char* flag_name, uint32_t sid_over_fid, uintptr_t
return kPglRetSuccess;
}
+// accept 'M'/'F'/'m'/'f' since that's more readable without being any less
+// efficient
+const unsigned char g_char_to_sex[256] =
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
const char g_xymt_log_names[kChrOffsetCt][5] = {"chrX", "chrY", "XY", "chrM", "PAR1", "PAR2"};
=====================================
plink2_common.h
=====================================
@@ -87,7 +87,7 @@ CONSTI32(kMaxPhenoCt, 524287);
// maximum number of usable cluster computers, this is arbitrary though it
// shouldn't be larger than 2^32 - 1
-// (actually, there's an overflow danger: [work units] * parallel_idx may not
+// (actually, there's an overflow danger: <work units> * parallel_idx may not
// fit in a uint64 if parallel_tot is too high.)
CONSTI32(kParallelMax, 32768);
@@ -572,6 +572,12 @@ PglErr OpenAndLoadXidHeader(const char* fname, const char* flag_name, XidHeaderF
// header line expected to start with FID1, ID1, or IID1
PglErr LoadXidHeaderPair(const char* flag_name, uint32_t sid_over_fid, uintptr_t* line_idx_ptr, TextStream* txsp, XidMode* xid_mode_ptr, char** line_startp, char** line_iterp);
+extern const unsigned char g_char_to_sex[256];
+
+HEADER_INLINE uint32_t CharToSex(char cc) {
+ return g_char_to_sex[S_CAST(unsigned char, cc)];
+}
+
// note that this is no longer divisible by 64
CONSTI32(kMaxContigs, 65274);
=====================================
plink2_data.cc
=====================================
@@ -237,7 +237,7 @@ PglErr PvarInfoReloadAndWrite(uint32_t info_pr_flag_present, uint32_t info_col_i
}
void AppendChrsetLine(const ChrInfo* cip, char** write_iter_ptr) {
- char* write_iter = strcpya_k(*write_iter_ptr, "##chrSet=<");
+ char* write_iter = strcpya_k(*write_iter_ptr, "##chrSet=<ID=1,");
if (!(cip->haploid_mask[0] & 1)) {
write_iter = strcpya_k(write_iter, "autosomePairCt=");
write_iter = u32toa(cip->autosome_ct, write_iter);
=====================================
plink2_data.h
=====================================
@@ -58,6 +58,14 @@ FLAGSET_DEF_START()
kfMakePgenFillMissingFromDosage = (1 << 22)
FLAGSET_DEF_END(MakePlink2Flags);
+FLAGSET_DEF_START()
+ kfImport0,
+ kfImportKeepAutoconv = (1 << 0),
+ kfImportKeepAutoconvVzs = (1 << 1),
+ kfImportDoubleId = (1 << 2),
+ kfImportVcfRequireGt = (1 << 3)
+FLAGSET_DEF_END(ImportFlags);
+
CONSTI32(kMaxInfoKeySlen, kMaxIdSlen);
#define MAX_INFO_KEY_SLEN_STR MAX_ID_SLEN_STR
=====================================
plink2_export.cc
=====================================
@@ -4020,6 +4020,23 @@ uint32_t ValidVcfContigName(const char* name_start, const char* name_end, uint32
return 1;
}
+uint32_t ValidVcfHeaderLine(const char* line_start, uint32_t line_slen, uint32_t v43) {
+ const char* first_eq = S_CAST(const char*, memchr(line_start, '=', line_slen));
+ if (!first_eq) {
+ logerrprintf("Error: Header line in .pvar file does not conform to the VCFv4.%c specification\n(no '=').\n", '2' + v43);
+ return 0;
+ }
+ if (v43) {
+ // If header line starts with "##key=<", that must be followed by
+ // "ID=" to conform to the VCFv4.3 (but not 4.2) specification.
+ if ((first_eq[1] == '<') && (!memequal_k(&(first_eq[2]), "ID=", 3))) {
+ logerrprintf("Error: Header line in .pvar file does not conform to the VCFv4.%c specification\n(value starts with '<', but that is not followed by an ID). This is permitted\nin VCFv4.2, so you may want to export that format instead.\n");
+ return 0;
+ }
+ }
+ return 1;
+}
+
uint32_t ValidVcfAlleleCode(const char* allele_code_iter) {
// returns 1 if probably valid (angle-bracket case is not exhaustively
// checked), 0 if definitely not
@@ -4490,6 +4507,9 @@ PglErr ExportVcf(const uintptr_t* sample_include, const uint32_t* sample_include
goto ExportVcf_ret_WRITE_FAIL;
}
} else {
+ if (unlikely(!ValidVcfHeaderLine(xheader_iter, slen, v43))) {
+ goto ExportVcf_ret_INCONSISTENT_INPUT;
+ }
if (unlikely(BgzfWrite(xheader_iter, slen, &bgzf))) {
goto ExportVcf_ret_WRITE_FAIL;
}
@@ -7700,7 +7720,11 @@ PglErr ExportBcf(const uintptr_t* sample_include, const uint32_t* sample_include
const uint32_t is_info_line = StrStartsWithUnsafe(line_main, "INFO=<ID=");
if ((!is_filter_line) && (!is_info_line)) {
if (!StrStartsWithUnsafe(line_main, "contig=<ID=")) {
- write_iter = memcpya(write_iter, line_start, line_end - line_start);
+ const uint32_t slen = line_end - line_start;
+ if (unlikely(!ValidVcfHeaderLine(line_start, slen, v43))) {
+ goto ExportBcf_ret_INCONSISTENT_INPUT;
+ }
+ write_iter = memcpya(write_iter, line_start, slen);
continue;
}
char* contig_name_start = &(line_main[11]);
=====================================
plink2_help.cc
=====================================
@@ -2123,8 +2123,9 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" --{sample,variant,pheno}-inner-join specifies an\n"
" intersection instead.\n"
" --merge-mode <mode> : Set --pmerge[-list] conflict resolution mode for\n"
-" --merge-pheno-mode <m> genotypes and phenotypes.\n"
-" * 'nm-match'/'1' = nonmissing values must match\n"
+" --merge-parents-mode <m> genotypes/dosages, parents, sexes, and phenotypes,\n"
+" --merge-sex-mode <mode> respectively.\n"
+" --merge-pheno-mode <m> * 'nm-match'/'1' = nonmissing values must match\n"
" (default)\n"
" * 'nm-first'/'2' = keep first nonmissing value\n"
" * 'first'/'4' = keep first value, even if missing\n"
=====================================
plink2_import.h
=====================================
@@ -25,14 +25,6 @@
namespace plink2 {
#endif
-FLAGSET_DEF_START()
- kfImport0,
- kfImportKeepAutoconv = (1 << 0),
- kfImportKeepAutoconvVzs = (1 << 1),
- kfImportDoubleId = (1 << 2),
- kfImportVcfRequireGt = (1 << 3)
-FLAGSET_DEF_END(ImportFlags);
-
ENUM_U31_DEF_START()
kVcfHalfCallReference,
kVcfHalfCallHaploid,
=====================================
plink2_merge.cc
=====================================
@@ -30,6 +30,8 @@ void InitPmerge(PmergeInfo* pmerge_info_ptr) {
pmerge_info_ptr->flags = kfPmerge0;
pmerge_info_ptr->list_mode = kPmergeListModePfile;
pmerge_info_ptr->merge_mode = kMergeModeNmMatch;
+ pmerge_info_ptr->merge_parents_mode = kMergePhenoModeNmMatch;
+ pmerge_info_ptr->merge_sex_mode = kMergePhenoModeNmMatch;
pmerge_info_ptr->merge_pheno_mode = kMergePhenoModeNmMatch;
pmerge_info_ptr->merge_xheader_mode = kMergeXheaderModeFirst;
pmerge_info_ptr->merge_qual_mode = kMergeQualInfoModeNmFirst;
@@ -70,8 +72,17 @@ typedef struct PmergeInputFilesetLlStruct {
char* pgen_fname;
char* pvar_fname;
char* psam_fname;
+ char* pgen_locked_fname;
+ uint32_t sample_ct;
uint32_t variant_ct;
+ uint32_t max_line_blen;
+ uint32_t max_single_pos_ct;
+ uintptr_t max_single_pos_blen;
uintptr_t grand_allele_ct;
+ ChrIdx first_chr_idx;
+ ChrIdx last_chr_idx;
+ uint32_t first_pos;
+ uint32_t last_pos;
} PmergeInputFilesetLl;
// Allocates at end of bigstack.
@@ -161,6 +172,7 @@ PglErr LoadPmergeList(const char* list_fname, PmergeListMode mode, uint32_t main
cur_entry->psam_fname = fname_iter;
fname_iter = memcpya(fname_iter, first_token_start, first_token_slen);
memcpy(fname_iter, psam_suffix, psam_blen);
+ cur_entry->pgen_locked_fname = nullptr;
} else {
const char* second_token_end = CurTokenEnd(second_token_start);
const char* third_token_start = FirstNonTspace(second_token_end);
@@ -192,6 +204,7 @@ PglErr LoadPmergeList(const char* list_fname, PmergeListMode mode, uint32_t main
fname_iter = memcpyax(fname_iter, second_token_start, third_token_slen, '\0');
cur_entry->psam_fname = fname_iter;
memcpyx(fname_iter, third_token_start, third_token_slen, '\0');
+ cur_entry->pgen_locked_fname = nullptr;
}
}
if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
@@ -225,7 +238,7 @@ PglErr LoadPmergeList(const char* list_fname, PmergeListMode mode, uint32_t main
}
// Permanent allocations are at end of bigstack.
-PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* outname, char* outname_end, PmergeInputFilesetLl* filesets, SampleIdInfo* siip, uint32_t* sample_ctp) {
+PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, int32_t missing_pheno, uint32_t max_thread_ct, char* outname, char* outname_end, PmergeInputFilesetLl* filesets, SampleIdInfo* siip, uint32_t* sample_ctp) {
unsigned char* bigstack_mark = g_bigstack_base;
unsigned char* bigstack_end_mark = g_bigstack_end;
uintptr_t line_idx = 0;
@@ -295,6 +308,7 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
unsigned char* arena_top = g_bigstack_end;
const uint32_t decompress_thread_ct = MAXV(max_thread_ct - 1, 1);
+ // possible todo: track seen realpaths, skip duplicates
PmergeInputFilesetLl* filesets_iter = filesets;
char* idbuf = g_textbuf;
uint32_t max_line_blen = 0;
@@ -944,6 +958,7 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
cur_sample_id = idbuf;
}
HtableAddNondup(cur_sample_id, slen, sample_id_table_size, sample_idx, sample_id_htable);
+ sample_ids_iter = &(sample_ids_iter[max_sample_id_blen]);
}
}
linebuf_capacity = MAXV(max_line_blen, kDecompressMinBlen) + kDecompressChunkSize;
@@ -990,7 +1005,8 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
// not yet known", since that lets us leave it unchanged if all values
// are missing.
pheno_cols[pheno_idx].type_code = kPhenoDtypeCc;
- // defensive
+ // defensive, nonnull_category_ct is ignored by WritePsam() except in
+ // the 'SEX' phenotype case which can't happen here
pheno_cols[pheno_idx].category_names = nullptr;
pheno_cols[pheno_idx].nonnull_category_ct = 0;
}
@@ -1000,6 +1016,7 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
const uint32_t max_col_ct = pheno_ct + kNonphenoColCt;
// note that ZeroWArr(pheno_ctl, pheno_include) is safe when pheno_ct == 0
const uint32_t pheno_ctl = BitCtToWordCt(pheno_ct);
+ const double missing_phenod = missing_pheno? S_CAST(double, missing_pheno) : HUGE_VAL;
uint32_t* col_skips;
uint32_t* col_types;
const char** token_ptrs;
@@ -1012,9 +1029,25 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
bigstack_alloc_c(linebuf_capacity, &linebuf))) {
goto MergePsams_ret_NOMEM;
}
+ // sample-major, PAT before MAT
+ const MergePhenoMode merge_parents_mode = pmip->merge_parents_mode;
+ uintptr_t* parents_locked = nullptr;
+ if (merge_parents_mode != kMergePhenoModeNmFirst) {
+ if (unlikely(bigstack_calloc_w(BitCtToWordCt(2 * sample_ct), &parents_locked))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
+ const MergePhenoMode merge_sex_mode = pmip->merge_sex_mode;
+ uintptr_t* sex_locked = nullptr;
+ if (merge_sex_mode != kMergePhenoModeNmFirst) {
+ if (unlikely(bigstack_calloc_w(sample_ctl, &sex_locked))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
// phenotype-major
+ const MergePhenoMode merge_pheno_mode = pmip->merge_pheno_mode;
uintptr_t* pheno_locked = nullptr;
- if (pheno_ct && (pmip->merge_pheno_mode != kMergePhenoModeNmFirst)) {
+ if (pheno_ct && (merge_pheno_mode != kMergePhenoModeNmFirst)) {
// no need to check for overflow in 32-bit case since we would have
// already run out of memory on data.qt allocations
const uintptr_t bit_ct = S_CAST(uintptr_t, pheno_ct) * sample_ct;
@@ -1022,6 +1055,26 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
goto MergePsams_ret_NOMEM;
}
}
+ // When --1 isn't specified, we need to be careful when interpreting
+ // zeroes: if the phenotype winds up being quantitative, zero is a regular
+ // value, but if it's binary, it's a missing value.
+ // Except in the trivial "--merge-pheno-mode first" case, we treat it as a
+ // missing value during the main pass, but set the relevant pheno_zero_seen
+ // bit if that phenotype cell isn't locked yet.
+ // We then correct the final phenotype values at the end, when we know
+ // whether each column will appear to be binary or quantitative in the
+ // merged .psam.
+ // Note that it is possible for a quantitative column to degrade to a
+ // binary column due to e.g. conflicts knocking out all values not in {-9,
+ // 0, 1, 2}. There isn't much we can do about that here, though.
+ const uint32_t affection_01 = (misc_flags / kfMiscAffection01) & 1;
+ uintptr_t* pheno_zero_seen = nullptr;
+ if (pheno_ct && (!affection_01) && (merge_pheno_mode != kMergePhenoModeFirst)) {
+ const uintptr_t bit_ct = S_CAST(uintptr_t, pheno_ct) * sample_ct;
+ if (unlikely(bigstack_calloc_w(BitCtToWordCt(bit_ct), &pheno_zero_seen))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ }
if (pheno_ct) {
logprintf("--pmerge%s: %u sample%s and %u phenotype%s present.\n", pmip->list_fname? "-list" : "", sample_ct, (sample_ct == 1)? "" : "s", pheno_ct, (pheno_ct == 1)? "" : "s");
@@ -1038,15 +1091,30 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
// category_names_htable is repositioned, resized, and rebuilt whenever
// category_names would otherwise overflow.
// No other allocations allowed past this point, until WritePsam() call.
- arena_bottom = g_bigstack_base;
- arena_top = g_bigstack_end;
const char** category_names = nullptr;
- // uint32_t* category_names_htable = nullptr;
+ uint32_t* category_names_htable = nullptr;
uint32_t category_names_ct = 0;
// category_names_capacity == category_names_htable_size / 2
- // uint32_t category_names_htable_size = 0;
+ uint32_t category_names_htable_size = 0;
+ arena_top = g_bigstack_end;
+ if (pheno_ct) {
+ category_names_htable_size = 512;
+ if (unlikely(bigstack_alloc_kcp(category_names_htable_size / 2, &category_names) ||
+ bigstack_alloc_u32(category_names_htable_size, &category_names_htable))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ SetAllU32Arr(category_names_htable_size, category_names_htable);
+ // copy into arena so overread is safe
+ const uint32_t missing_catname_slen = strlen(g_missing_catname);
+ if (unlikely(StoreStringAtEndK(g_bigstack_base, g_missing_catname, missing_catname_slen, &arena_top, &(category_names[0])))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ const uint32_t hashval = Hashceil(g_missing_catname, missing_catname_slen, category_names_htable_size);
+ category_names_htable[0] = hashval;
+ category_names_ct = 1;
+ }
+ arena_bottom = g_bigstack_base;
filesets_iter = filesets;
- // possible todo: track seen realpaths, skip duplicates
do {
cur_fname = filesets_iter->psam_fname;
reterr = TextStreamOpenEx(cur_fname, kMaxLongLine, linebuf_capacity, decompress_thread_ct, nullptr, linebuf, &txs);
@@ -1065,15 +1133,14 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
line_start = AdvPastDelim(line_start, '\n');
}
ZeroWArr(pheno_ctl, pheno_include);
- // uint32_t sid_present = 0;
- // uint32_t parents_present = 0;
- // uint32_t sex_present = 0;
+ uint32_t sid_present = 0;
+ uint32_t parents_present = 0;
+ uint32_t sex_present = 0;
+ uint32_t relevant_postfid_col_ct = 1;
uint32_t fid_present;
- uint32_t relevant_postfid_col_ct;
col_types[0] = 0;
if (line_start[0] == '#') {
fid_present = (line_start[1] == 'F');
- relevant_postfid_col_ct = 1 - fid_present;
if (!fid_present) {
col_skips[0] = 0;
}
@@ -1090,21 +1157,21 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
if (memequal_k(token_start, "IID", 3)) {
cur_col_type = 0;
} else if (memequal_k(token_start, "SID", 3)) {
- // sid_present = 1;
+ sid_present = 1;
cur_col_type = 1;
} else if (memequal_k(token_start, "PAT", 3)) {
cur_col_type = 2;
- // parents_present = 1;
+ parents_present = 1;
} else if (memequal_k(token_start, "MAT", 3)) {
cur_col_type = 3;
} else if (memequal_k(token_start, "SEX", 3)) {
cur_col_type = 4;
- // sex_present = 1;
+ sex_present = 1;
}
}
if (cur_col_type == UINT32_MAX) {
if (token_slen < max_pheno_name_blen) {
- cur_col_type = StrboxHtableFind(token_start, pheno_names, pheno_names_htable, max_pheno_name_blen, token_slen, pheno_names_table_size);
+ cur_col_type = StrboxHtableFindNnt(token_start, pheno_names, pheno_names_htable, max_pheno_name_blen, token_slen, pheno_names_table_size);
}
if (cur_col_type == UINT32_MAX) {
continue;
@@ -1124,18 +1191,17 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
// .fam
fid_present = (fam_cols / kfFamCol1) & 1;
col_skips[0] = fid_present;
- relevant_postfid_col_ct = 1;
if (fam_cols & kfFamCol34) {
col_skips[relevant_postfid_col_ct] = 1;
col_types[relevant_postfid_col_ct++] = 2;
col_skips[relevant_postfid_col_ct] = 1;
col_types[relevant_postfid_col_ct++] = 3;
- // parents_present = 1;
+ parents_present = 1;
}
if (fam_cols & kfFamCol5) {
col_skips[relevant_postfid_col_ct] = 1;
col_types[relevant_postfid_col_ct++] = 4;
- // sex_present = 1;
+ sex_present = 1;
}
if (fam_cols & kfFamCol6) {
col_skips[relevant_postfid_col_ct] = 1;
@@ -1144,36 +1210,397 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
SetBit(pheno_idx, pheno_include);
}
}
- // TODO
+
+ const uint32_t cur_pheno_ct = PopcountWords(pheno_include, pheno_ctl);
+ const char* sample_ids = siip->sample_ids;
+ const char* sids = siip->sids;
+ uint32_t cur_sample_ct = 0;
+ uint32_t sid_slen = 1;
+ const char* line_iter;
+ for (; TextGetUnsafe2K(&txs, &line_start); line_start = AdvPastDelim(line_iter, '\n'), ++line_idx) {
+ line_iter = TokenLexK0(line_start, col_types, col_skips, relevant_postfid_col_ct, token_ptrs, token_slens);
+ if (unlikely(!line_iter)) {
+ goto MergePsams_ret_MISSING_TOKENS;
+ }
+ char* id_iter = idbuf;
+ if (fid_present) {
+ const char* fid_end = CurTokenEnd(line_start);
+ id_iter = memcpya(id_iter, line_start, fid_end - line_start);
+ } else {
+ *id_iter++ = '0';
+ }
+ *id_iter++ = '\t';
+ id_iter = memcpya(id_iter, token_ptrs[0], token_slens[0]);
+ uint32_t sample_idx;
+ if (!sids) {
+ *id_iter = '\0';
+ const uint32_t id_slen = id_iter - idbuf;
+ sample_idx = StrboxHtableFind(idbuf, sample_ids, sample_id_htable, max_sample_id_blen, id_slen, sample_id_table_size);
+ } else {
+ *id_iter++ = '\t';
+ char* sid_start = id_iter;
+ if (sid_present) {
+ sid_slen = token_slens[1];
+ id_iter = memcpya(id_iter, token_ptrs[1], sid_slen);
+ } else {
+ *id_iter++ = '0';
+ }
+ *id_iter = '\0';
+ // probably want to move this to a plink2_common function
+ uint32_t hashval = Hashceil(idbuf, id_iter - idbuf, sample_id_table_size);
+ const uint32_t fid_iid_blen = sid_start - idbuf;
+ sid_start[-1] = '\0';
+ while (1) {
+ sample_idx = sample_id_htable[hashval];
+ if (sample_idx == UINT32_MAX) {
+ break;
+ }
+ if (memequal(idbuf, &(sample_ids[sample_idx * max_sample_id_blen]), fid_iid_blen) && memequal(sid_start, &(sids[sample_idx * max_sid_blen]), sid_slen + 1)) {
+ break;
+ }
+ if (++hashval == sample_id_table_size) {
+ hashval = 0;
+ }
+ }
+ }
+ if (sample_idx == UINT32_MAX) {
+ continue;
+ }
+ ++cur_sample_ct;
+ if (parents_present) {
+ char* dad_id_dst = &(parental_id_info.paternal_ids[sample_idx * max_paternal_id_blen]);
+ char* mom_id_dst = &(parental_id_info.maternal_ids[sample_idx * max_maternal_id_blen]);
+ if (parents_locked) {
+ if (!IsSet(parents_locked, sample_idx * 2)) {
+ const char* dad_id = token_ptrs[2];
+ const uint32_t dad_slen = token_slens[2];
+ if (merge_parents_mode == kMergePhenoModeNmMatch) {
+ if (((dad_slen != 1) || (dad_id[0] != '0')) && (!strequal_unsafe(dad_id_dst, dad_id, dad_slen))) {
+ if (memequal_k(dad_id_dst, "0", 2)) {
+ memcpyx(dad_id_dst, dad_id, dad_slen, '\0');
+ } else {
+ strcpy_k(dad_id_dst, "0");
+ SetBit(sample_idx * 2, parents_locked);
+ }
+ }
+ } else {
+ memcpyx(dad_id_dst, dad_id, dad_slen, '\0');
+ SetBit(sample_idx * 2, parents_locked);
+ }
+ }
+ if (!IsSet(parents_locked, sample_idx * 2 + 1)) {
+ const char* mom_id = token_ptrs[3];
+ const uint32_t mom_slen = token_slens[3];
+ if (merge_parents_mode == kMergePhenoModeNmMatch) {
+ if (((mom_slen != 1) || (mom_id[0] != '0')) && (!strequal_unsafe(mom_id_dst, mom_id, mom_slen))) {
+ if (memequal_k(mom_id_dst, "0", 2)) {
+ memcpyx(mom_id_dst, mom_id, mom_slen, '\0');
+ } else {
+ strcpy_k(mom_id_dst, "0");
+ SetBit(sample_idx * 2, parents_locked);
+ }
+ }
+ } else {
+ memcpyx(mom_id_dst, mom_id, mom_slen, '\0');
+ SetBit(sample_idx * 2 + 1, parents_locked);
+ }
+ }
+ } else {
+ // nm-first, skip if not missing
+ if (memequal_k(dad_id_dst, "0", 2)) {
+ memcpyx(dad_id_dst, token_ptrs[2], token_slens[2], '\0');
+ }
+ if (memequal_k(mom_id_dst, "0", 2)) {
+ memcpyx(mom_id_dst, token_ptrs[3], token_slens[3], '\0');
+ }
+ }
+ }
+ if (sex_present) {
+ if (!sex_locked) {
+ if (!IsSet(sex_nm, sample_idx)) {
+ // nm-first and previously missing
+ const uint32_t cur_sex_code = (token_slens[4] == 1)? CharToSex(token_ptrs[4][0]) : 0;
+ if (cur_sex_code) {
+ SetBit(sample_idx, sex_nm);
+ AssignBit(sample_idx, cur_sex_code & 1, sex_male);
+ }
+ }
+ } else {
+ if (!IsSet(sex_locked, sample_idx)) {
+ const uint32_t cur_sex_code = (token_slens[4] == 1)? CharToSex(token_ptrs[4][0]) : 0;
+ if (merge_sex_mode == kMergePhenoModeNmMatch) {
+ if (cur_sex_code) {
+ if (!IsSet(sex_nm, sample_idx)) {
+ SetBit(sample_idx, sex_nm);
+ AssignBit(sample_idx, cur_sex_code & 1, sex_male);
+ } else {
+ if (IsSet(sex_male, sample_idx) != (cur_sex_code & 1)) {
+ ClearBit(sample_idx, sex_nm);
+ ClearBit(sample_idx, sex_male);
+ SetBit(sample_idx, sex_locked);
+ }
+ }
+ }
+ } else {
+ // first
+ AssignBit(sample_idx, (cur_sex_code + 1) / 2, sex_nm);
+ AssignBit(sample_idx, cur_sex_code & 1, sex_male);
+ SetBit(sample_idx, sex_locked);
+ }
+ }
+ }
+ }
+ if (cur_pheno_ct) {
+ uintptr_t pheno_uidx_base = 0;
+ uintptr_t cur_bits = pheno_include[0];
+ for (uint32_t cur_pheno_idx = 0; cur_pheno_idx != cur_pheno_ct; ++cur_pheno_idx) {
+ const uintptr_t pheno_uidx = BitIter1(pheno_include, &pheno_uidx_base, &cur_bits);
+ PhenoCol* cur_pheno_col = &(pheno_cols[pheno_uidx]);
+ if (((!pheno_locked) && IsSet(cur_pheno_col->nonmiss, sample_idx)) || (pheno_locked && IsSet(pheno_locked, sample_idx + pheno_uidx * sample_ct))) {
+ continue;
+ }
+ const uint32_t col_type_idx = pheno_uidx + kNonphenoColCt;
+ const char* cur_phenostr = token_ptrs[col_type_idx];
+ double dxx;
+ const char* cur_phenostr_end = ScanadvDouble(cur_phenostr, &dxx);
+ if (!cur_phenostr_end) {
+ const uint32_t slen = token_slens[col_type_idx];
+ if (IsNanStr(cur_phenostr, slen)) {
+ dxx = missing_phenod;
+ } else {
+ // categorical case
+ if (cur_pheno_col->type_code != kPhenoDtypeCat) {
+ if (unlikely(cur_pheno_col->type_code == kPhenoDtypeQt)) {
+ snprintf(g_logbuf, kLogbufSize, "Error: '%s' entry on line %" PRIuPTR " of %s is categorical, while an earlier entry is numeric/'NA'.\n", &(pheno_names[pheno_uidx * max_pheno_name_blen]), line_idx, cur_fname);
+ goto MergePsams_ret_INCONSISTENT_INPUT_WW;
+ }
+ cur_pheno_col->type_code = kPhenoDtypeCat;
+ ZeroU32Arr(sample_ct, cur_pheno_col->data.cat);
+ }
+ uint32_t catname_idx = IdHtableFindNnt(cur_phenostr, category_names, category_names_htable, slen, category_names_htable_size);
+ if (catname_idx == UINT32_MAX) {
+ if (category_names_ct * 2 == category_names_htable_size) {
+ // resize
+ if (unlikely(S_CAST(uintptr_t, arena_top - arena_bottom) < category_names_ct * (2 * sizeof(int32_t) + sizeof(intptr_t)))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ uint32_t next_capacity = category_names_ct * 2;
+ if (next_capacity >= 0x80000000U) {
+ if (unlikely(next_capacity != 0x80000000U)) {
+ logerrputs("Error: This implementation of --pmerge[-list] is limited to ~2^31 distinct\ncategory names.\n");
+ reterr = kPglRetNotYetSupported;
+ goto MergePsams_ret_1;
+ }
+ next_capacity = 0x7fffffff;
+ }
+ category_names_htable = R_CAST(uint32_t*, &(category_names[next_capacity]));
+ category_names_htable_size = 2 * next_capacity;
+ SetAllU32Arr(category_names_htable_size, category_names_htable);
+ arena_bottom = R_CAST(unsigned char*, &(category_names_htable[category_names_htable_size]));
+ for (uint32_t category_idx = 0; category_idx != category_names_ct; ++category_idx) {
+ const char* cur_cat_name = category_names[category_idx];
+ HtableAddNondup(cur_cat_name, strlen(cur_cat_name), category_names_htable_size, category_idx, category_names_htable);
+ }
+ }
+ catname_idx = category_names_ct;
+ if (unlikely(StoreStringAtEndK(arena_bottom, cur_phenostr, slen, &arena_top, &(category_names[catname_idx])))) {
+ goto MergePsams_ret_NOMEM;
+ }
+ ++category_names_ct;
+ HtableAddNondup(cur_phenostr, slen, category_names_htable_size, catname_idx, category_names_htable);
+ }
+ if (!pheno_locked) {
+ // nm-first and previously missing
+ if (catname_idx) {
+ SetBit(sample_idx, cur_pheno_col->nonmiss);
+ cur_pheno_col->data.cat[sample_idx] = catname_idx;
+ }
+ } else {
+ if (merge_pheno_mode == kMergePhenoModeNmMatch) {
+ if (catname_idx) {
+ const uint32_t prev_catname_idx = cur_pheno_col->data.cat[sample_idx];
+ if (prev_catname_idx != catname_idx) {
+ if (!prev_catname_idx) {
+ SetBit(sample_idx, cur_pheno_col->nonmiss);
+ cur_pheno_col->data.cat[sample_idx] = catname_idx;
+ } else {
+ ClearBit(sample_idx, cur_pheno_col->nonmiss);
+ cur_pheno_col->data.cat[sample_idx] = 0;
+ SetBit(sample_idx + pheno_uidx * sample_ct, pheno_locked);
+ }
+ }
+ }
+ } else {
+ // first
+ if (catname_idx) {
+ SetBit(sample_idx, cur_pheno_col->nonmiss);
+ cur_pheno_col->data.cat[sample_idx] = catname_idx;
+ }
+ SetBit(sample_idx + pheno_uidx * sample_ct, pheno_locked);
+ }
+ }
+ continue;
+ }
+ } else {
+ if (unlikely(!IsSpaceOrEoln(*cur_phenostr_end))) {
+ cur_phenostr_end = CurTokenEnd(cur_phenostr_end);
+ *K_CAST(char*, cur_phenostr_end) = '\0';
+ snprintf(g_logbuf, kLogbufSize, "Error: Invalid numeric token '%s' on line %" PRIuPTR " of %s.\n", cur_phenostr, line_idx, cur_fname);
+ goto MergePsams_ret_MALFORMED_INPUT_WW;
+ }
+ }
+ if (unlikely(cur_pheno_col->type_code == kPhenoDtypeCat)) {
+ snprintf(g_logbuf, kLogbufSize, "Error: '%s' entry on line %" PRIuPTR " of %s is numeric/'NA', while an earlier entry is categorical.\n", &(pheno_names[pheno_uidx * max_pheno_name_blen]), line_idx, cur_fname);
+ goto MergePsams_ret_INCONSISTENT_INPUT_WW;
+ }
+ cur_pheno_col->type_code = kPhenoDtypeQt;
+ if (!pheno_locked) {
+ // nm-first
+ if (dxx != missing_phenod) {
+ if ((dxx == 0.0) && pheno_zero_seen) {
+ SetBit(sample_idx + pheno_uidx * sample_ct, pheno_zero_seen);
+ } else {
+ cur_pheno_col->data.qt[sample_idx] = dxx;
+ SetBit(sample_idx, cur_pheno_col->nonmiss);
+ }
+ }
+ } else {
+ if (merge_pheno_mode == kMergePhenoModeNmMatch) {
+ if (dxx != missing_phenod) {
+ if ((dxx == 0.0) && pheno_zero_seen) {
+ SetBit(sample_idx + pheno_uidx * sample_ct, pheno_zero_seen);
+ } else if (!IsSet(cur_pheno_col->nonmiss, sample_idx)) {
+ SetBit(sample_idx, cur_pheno_col->nonmiss);
+ cur_pheno_col->data.qt[sample_idx] = dxx;
+ } else {
+ if (dxx != cur_pheno_col->data.qt[sample_idx]) {
+ ClearBit(sample_idx, cur_pheno_col->nonmiss);
+ SetBit(sample_idx + pheno_uidx * sample_ct, pheno_locked);
+ }
+ }
+ }
+ } else {
+ if (dxx != missing_phenod) {
+ SetBit(sample_idx, cur_pheno_col->nonmiss);
+ cur_pheno_col->data.qt[sample_idx] = dxx;
+ }
+ SetBit(sample_idx + pheno_uidx * sample_ct, pheno_locked);
+ }
+ }
+ }
+ }
+ }
if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
goto MergePsams_ret_TSTREAM_FAIL;
}
if (unlikely(CleanupTextStream2(cur_fname, &txs, &reterr))) {
goto MergePsams_ret_1;
}
+ filesets_iter->sample_ct = cur_sample_ct;
filesets_iter = filesets_iter->next;
} while (filesets_iter);
BigstackReset(bigstack_mark2);
BigstackEndSet(arena_top);
if (pheno_ct) {
- if (category_names_ct) {
+ if (category_names) {
const char** category_names_final = R_CAST(const char**, g_bigstack_base);
BigstackBaseSet(&(category_names_final[category_names_ct]));
memmove(category_names_final, category_names, category_names_ct * sizeof(intptr_t));
category_names = category_names_final;
}
- for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
+ for (uintptr_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
PhenoCol* pheno_col = &(pheno_cols[pheno_idx]);
if (pheno_col->type_code == kPhenoDtypeQt) {
- if (misc_flags & kfMiscAffection01) {
- // TODO
- // If all values are in {0, 1, missing}, convert to {1, 2, missing}
+ uintptr_t* nonmiss = pheno_col->nonmiss;
+ double* qt = pheno_col->data.qt;
+ const uint32_t sample_nm_ct = PopcountWords(nonmiss, sample_ctl);
+ uintptr_t sample_uidx_base = 0;
+ uintptr_t cur_bits = nonmiss[0];
+ uint32_t sample_idx = 0;
+ if (!affection_01) {
+ if (merge_pheno_mode == kMergePhenoModeFirst) {
+ // 1. Check for values outside {0, 1, 2, missing}.
+ // 2. If none exist, convert zeroes to missing.
+ for (; sample_idx != sample_nm_ct; ++sample_idx) {
+ const uint32_t sample_uidx = BitIter1(nonmiss, &sample_uidx_base, &cur_bits);
+ const double dxx = qt[sample_uidx];
+ if ((dxx != 0.0) && (dxx != 1.0) && (dxx != 2.0)) {
+ break;
+ }
+ }
+ if (!(sample_idx != sample_nm_ct)) {
+ // Could use explicit word-based iteration instead.
+ sample_uidx_base = 0;
+ cur_bits = nonmiss[0];
+ for (sample_idx = 0; sample_idx != sample_nm_ct; ++sample_idx) {
+ const uint32_t sample_uidx = BitIter1(nonmiss, &sample_uidx_base, &cur_bits);
+ if (qt[sample_uidx] == 0.0) {
+ ClearBit(sample_uidx, nonmiss);
+ }
+ }
+ }
+ } else {
+ // 1. Check for values outside {1, 2, missing}.
+ // 2. If at least one exists, patch in zeroes. (In nm-match
+ // case, when a nonmissing value is in the cell but a zero was
+ // also observed, convert to missing.)
+ for (; sample_idx != sample_nm_ct; ++sample_idx) {
+ const uint32_t sample_uidx = BitIter1(nonmiss, &sample_uidx_base, &cur_bits);
+ const double dxx = qt[sample_uidx];
+ if ((dxx != 1.0) && (dxx != 2.0)) {
+ break;
+ }
+ }
+ if (sample_idx != sample_nm_ct) {
+ const uintptr_t bit_offset = pheno_idx * sample_ct;
+ const uint32_t zero_ct = PopcountBitRange(pheno_zero_seen, bit_offset, bit_offset + sample_ct);
+ if (zero_ct) {
+ uintptr_t zero_seen_uidx_base;
+ BitIter1Start(pheno_zero_seen, bit_offset, &zero_seen_uidx_base, &cur_bits);
+ if (merge_pheno_mode == kMergePhenoModeNmMatch) {
+ for (uint32_t zero_idx = 0; zero_idx != zero_ct; ++zero_idx) {
+ const uintptr_t sample_uidx = BitIter1(pheno_zero_seen, &zero_seen_uidx_base, &cur_bits) - bit_offset;
+ if (IsSet(nonmiss, sample_uidx)) {
+ ClearBit(sample_uidx, nonmiss);
+ } else {
+ SetBit(sample_uidx, nonmiss);
+ qt[sample_uidx] = 0.0;
+ }
+ }
+ } else {
+ // nm-first
+ for (uint32_t zero_idx = 0; zero_idx != zero_ct; ++zero_idx) {
+ const uintptr_t sample_uidx = BitIter1(pheno_zero_seen, &zero_seen_uidx_base, &cur_bits) - bit_offset;
+ SetBit(sample_uidx, nonmiss);
+ qt[sample_uidx] = 0.0;
+ }
+ }
+ }
+ }
+ }
+ } else {
+ // 1. Check for values outside {0, 1, missing}.
+ // 2. If none exist, convert to {1, 2, missing}.
+ for (; sample_idx != sample_nm_ct; ++sample_idx) {
+ const uint32_t sample_uidx = BitIter1(nonmiss, &sample_uidx_base, &cur_bits);
+ const double dxx = qt[sample_uidx];
+ if ((dxx != 0.0) && (dxx != 1.0)) {
+ break;
+ }
+ }
+ if (!(sample_idx != sample_nm_ct)) {
+ sample_uidx_base = 0;
+ cur_bits = nonmiss[0];
+ for (sample_idx = 0; sample_idx != sample_nm_ct; ++sample_idx) {
+ const uint32_t sample_uidx = BitIter1(nonmiss, &sample_uidx_base, &cur_bits);
+ qt[sample_uidx] += 1.0;
+ }
+ }
}
} else if (pheno_col->type_code == kPhenoDtypeCat) {
pheno_col->category_names = category_names;
}
}
- ;;;
}
snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
reterr = WritePsam(outname, sample_include, siip, &parental_id_info, sex_nm, sex_male, pheno_cols, pheno_names, nullptr, "NA", sample_ct, pheno_ct, max_pheno_name_blen, kfPsamColDefault);
@@ -1203,6 +1630,9 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
MergePsams_ret_MALFORMED_INPUT:
reterr = kPglRetMalformedInput;
break;
+ MergePsams_ret_INCONSISTENT_INPUT_WW:
+ WordWrapB(0);
+ logerrputsb();
MergePsams_ret_INCONSISTENT_INPUT:
reterr = kPglRetInconsistentInput;
break;
@@ -1213,14 +1643,76 @@ PglErr MergePsams(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFla
return reterr;
}
-PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, __attribute__((unused)) ChrInfo* cip) {
+// supports topological sort of chromosome order
+typedef struct ChrGraphEdgeLlStruct {
+ struct ChrGraphEdgeLlStruct* next;
+ ChrIdx dst_idx;
+} ChrGraphEdgeLl;
+
+/*
+// cip->chr_file_order is filled with the final chromosome sort order.
+// info_keys, pointed-to InfoVtype entries, and info_keys_htable are allocated
+// at the end of bigstack.
+PglErr ScanPvarsAndMergeHeader(const PmergeInfo* pmip, ImportFlags import_flags, uint32_t max_thread_ct, char* outname, char* outname_end, PmergeInputFilesetLl* filesets, ChrInfo* cip, const char* const** info_keys_ptr, uint32_t* info_key_ctp, uint32_t** info_keys_htablep, uint32_t* info_keys_htable_sizep) {
+ unsigned char* bigstack_mark = g_bigstack_base;
+ unsigned char* bigstack_end_mark = g_bigstack_end;
+ const char* cur_fname = nullptr;
+ PglErr reterr = kPglRetSuccess;
+ char* cswritep = nullptr;
+ uintptr_t line_idx = 0;
+ CompressStreamState css;
+ PreinitCstream(&css);
+ TextStream txs;
+ PreinitTextStream(&txs);
+ {
+ const uintptr_t linebuf_capacity = MINV(kMaxLongLine, bigstack_left() / 4) + kDecompressChunkSize;
+ char* linebuf;
+ if (unlikely(bigstack_alloc_c(linebuf_capacity, &linebuf))) {
+ goto ScanPvarsAndMergeHeader_ret_NOMEM;
+ }
+
+ const MergeXheaderMode merge_xheader_mode = pmip->merge_xheader_mode;
+ // Each entry is an xheader key, followed by a null terminator, followed by
+ // the null-terminated remainder of the header line. The latter includes
+ // the original punctuation character ('=', ',', or '>') ending the key,
+ // but not the final newline.
+ // Header line classes defined in the VCF v4.3 specification, and the
+ // chrSet class defined in the .pvar specification, are recognized. I.e.
+ // when the header line starts with "##INFO=", "##FILTER=", "##FORMAT=",
+ // "##ALT=", "##contig=", "##META=", "##SAMPLE=", or "##PEDIGREE=", the key
+ // includes everything up to (and not including) the ',' ending the ID,
+ // skipping the initial "##". Otherwise, the key includes everything up to
+ // and not including the '=' (also skipping the initial "##").
+ const char** xheader_entries = nullptr;
+ uint32_t* xheader_entries_htable = nullptr;
+ uint32_t xheader_entry_ct = 0;
+ // xheader_entry_capacity == xheader_entry_htable_size / 2
+ uint32_t xheader_entry_htable_size = 0;
+ if (merge_xheader_mode != kMergeXheaderModeErase) {
+ xheader_entry_htable_size = 2048;
+ if (unlikely(bigstack_alloc_kcp())) {
+ }
+ }
+ }
+ while (0) {
+ ScanPvarsAndMergeHeader_ret_NOMEM:
+ reterr = kPglRetNomem;
+ break;
+ }
+ CleanupTextStream2(cur_fname, &txs, &reterr);
+ CswriteCloseCond(&css, cswritep);
+ BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
+ return reterr;
+}
+*/
+
+PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, __attribute__((unused)) ImportFlags import_flags, SortMode sample_sort_mode, FamCol fam_cols, int32_t missing_pheno, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, __attribute__((unused)) ChrInfo* cip) {
unsigned char* bigstack_mark = g_bigstack_base;
unsigned char* bigstack_end_mark = g_bigstack_end;
PglErr reterr = kPglRetSuccess;
{
// 1. Construct/load fileset list.
- // 2. Merge .psam files. Simplify merge-mode to 'first' if each sample
- // appears in only one input fileset.
+ // 2. Merge .psam files.
// 3. Global .pvar scan, to determine merged header, chromosome sort order,
// and track first/last position in each fileset.
// 4. If filesets cover disjoint positions, handle this as a concatenation
@@ -1246,6 +1738,7 @@ PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags m
fname_iter = memcpya(fname_iter, pvarname, pvar_fname_blen);
cur_entry->psam_fname = fname_iter;
memcpy(fname_iter, psamname, psam_fname_blen);
+ cur_entry->pgen_locked_fname = nullptr;
}
if (!pmip->list_fname) {
PmergeInputFilesetLl* cur_entry = AllocFilesetLlEntry(&filesets_endp);
@@ -1255,6 +1748,7 @@ PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags m
cur_entry->pgen_fname = pmip->pgen_fname;
cur_entry->pvar_fname = pmip->pvar_fname;
cur_entry->psam_fname = pmip->psam_fname;
+ cur_entry->pgen_locked_fname = nullptr;
} else {
reterr = LoadPmergeList(pmip->list_fname, pmip->list_mode, pgenname[0] != '\0', &filesets_endp);
if (unlikely(reterr)) {
@@ -1264,14 +1758,20 @@ PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags m
SampleIdInfo sii;
uint32_t sample_ct;
- reterr = MergePsams(pmip, sample_sort_fname, misc_flags, sample_sort_mode, fam_cols, max_thread_ct, outname, outname_end, filesets, &sii, &sample_ct);
+ reterr = MergePsams(pmip, sample_sort_fname, misc_flags, sample_sort_mode, fam_cols, missing_pheno, max_thread_ct, outname, outname_end, filesets, &sii, &sample_ct);
if (unlikely(reterr)) {
goto Pmerge_ret_1;
}
-
+ /*
+ const char* const* info_keys = nullptr;
+ uint32_t* info_keys_htable = nullptr;
+ uint32_t info_key_ct = 0;
+ uint32_t info_keys_htable_size = 0;
+ reterr = ScanPvarsAndMergeHeader(pmip, import_flags, max_thread_ct, outname, outname_end, filesets, cip, &info_keys, &info_key_ct, &info_keys_htable, &info_keys_htable_size);
if (unlikely(reterr)) {
goto Pmerge_ret_1;
}
+ */
logerrputs("Error: --pmerge[-list] is under development.\n");
reterr = kPglRetNotYetSupported;
goto Pmerge_ret_1;
@@ -1428,17 +1928,8 @@ PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip,
goto PgenDiff_ret_PSAM_MISSING_TOKENS;
}
const uint32_t token_slen = strlen_se(token_iter);
- uint32_t cur_sex_code = 0;
- if (token_slen == 1) {
- const unsigned char sex_ucc = token_iter[0];
- const unsigned char sex_ucc_upcase = sex_ucc & 0xdfU;
- if ((sex_ucc == '1') || (sex_ucc_upcase == 'M')) {
- cur_sex_code = 1;
- } else if ((sex_ucc == '2') || (sex_ucc_upcase == 'F')) {
- cur_sex_code = 2;
- }
- }
- if (!cur_sex_code) {
+ const uint32_t cur_sex_code = CharToSex(token_iter[0]);
+ if ((token_slen != 1) || (!cur_sex_code)) {
snprintf(g_logbuf, kLogbufSize, "Error: Missing sex code on line %" PRIuPTR " of --pgen-diff .psam file; this is prohibited when chrX or chrY is in the comparison.\n", psam_line_idx);
goto PgenDiff_ret_INCONSISTENT_INPUT_WW;
}
=====================================
plink2_merge.h
=====================================
@@ -103,6 +103,8 @@ typedef struct PmergeStruct {
PmergeFlags flags;
PmergeListMode list_mode;
MergeMode merge_mode;
+ MergePhenoMode merge_parents_mode;
+ MergePhenoMode merge_sex_mode;
MergePhenoMode merge_pheno_mode;
MergeXheaderMode merge_xheader_mode;
MergeQualInfoMode merge_qual_mode;
@@ -134,7 +136,7 @@ void InitPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
void CleanupPgenDiff(PgenDiffInfo* pgen_diff_info_ptr);
-PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, SortMode sample_sort_mode, FamCol fam_cols, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
+PglErr Pmerge(const PmergeInfo* pmip, const char* sample_sort_fname, MiscFlags misc_flags, ImportFlags import_flags, SortMode sample_sort_mode, FamCol fam_cols, int32_t missing_pheno, uint32_t max_thread_ct, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, ChrInfo* cip);
PglErr PgenDiff(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const PgenDiffInfo* pdip, uint32_t raw_sample_ct, uint32_t orig_sample_ct, uint32_t raw_variant_ct, uint32_t max_allele_ct1, uint32_t max_allele_slen, uint32_t max_thread_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
=====================================
plink2_psam.cc
=====================================
@@ -85,11 +85,11 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
uint32_t psam_cols_mask = 0;
LlStr* pheno_names_reverse_ll = nullptr;
uintptr_t max_pheno_name_blen = 0;
- uint32_t relevant_postfid_col_ct = 0;
g_bigstack_end -= kMaxIdSlen;
unsigned char* tmp_bigstack_end = BigstackEndRoundedDown();
unsigned char* bigstack_mark2;
uint32_t fid_present = 1;
+ uint32_t relevant_postfid_col_ct;
if (line_iter[0] == '#') {
// parse header
// [-1] = #FID (if present, must be first column)
@@ -259,7 +259,7 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
pheno_ct = (fam_cols & kfFamCol6) && pheno_ct_max;
fid_present = (fam_cols / kfFamCol1) & 1;
- relevant_postfid_col_ct = fid_present + ((fam_cols / (kfFamCol34 / 2)) & 2) + ((fam_cols / kfFamCol5) & 1) + pheno_ct;
+ relevant_postfid_col_ct = 1 + ((fam_cols / (kfFamCol34 / 2)) & 2) + ((fam_cols / kfFamCol5) & 1) + pheno_ct;
// these small allocations can't fail, since kMaxMediumLine <
// linebuf_size <= 1/3 of remaining space
col_skips = S_CAST(uint32_t*, bigstack_alloc_raw_rd(relevant_postfid_col_ct * sizeof(int32_t)));
@@ -364,6 +364,7 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
CatnameLl2** catname_htable = nullptr;
CatnameLl2** pheno_catname_last = nullptr;
uintptr_t* total_catname_blens = nullptr;
+ uint32_t fid_slen = 1;
for (; TextGetUnsafe2K(&psam_txs, &line_iter); line_iter = AdvPastDelim(line_iter, '\n'), ++line_idx) {
if (unlikely(line_iter[0] == '#')) {
snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx, psamname);
@@ -374,21 +375,12 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
goto LoadPsam_ret_MALFORMED_INPUT;
}
const char* line_start = line_iter;
- const char* iid_ptr;
- uint32_t iid_slen;
- if (relevant_postfid_col_ct) {
- // bugfix (3 Jul 2018): didn't handle no-other-columns case correctly
- line_iter = TokenLexK0(line_start, col_types, col_skips, relevant_postfid_col_ct, token_ptrs, token_slens);
- if (unlikely(!line_iter)) {
- goto LoadPsam_ret_MISSING_TOKENS;
- }
- iid_ptr = token_ptrs[0];
- iid_slen = token_slens[0];
- } else {
- iid_ptr = line_start;
- iid_slen = CurTokenEnd(line_start) - line_start;
+ line_iter = TokenLexK0(line_start, col_types, col_skips, relevant_postfid_col_ct, token_ptrs, token_slens);
+ if (unlikely(!line_iter)) {
+ goto LoadPsam_ret_MISSING_TOKENS;
}
- uint32_t fid_slen = 2;
+ const char* iid_ptr = token_ptrs[0];
+ const uint32_t iid_slen = token_slens[0];
if (fid_present) {
fid_slen = CurTokenEnd(line_start) - line_start;
}
@@ -494,17 +486,9 @@ PglErr LoadPsam(const char* psamname, const RangeList* pheno_range_list_ptr, Fam
}
new_psam_info->maternal_id_slen = maternal_id_slen;
uint32_t cur_sex_code = 0;
- // accept 'M'/'F'/'m'/'f' since that's more readable without being any
- // less efficient
// don't accept "male"/"female", that's overkill
if (sex_present && (token_slens[4] == 1)) {
- const unsigned char sex_ucc = token_ptrs[4][0];
- const unsigned char sex_ucc_upcase = sex_ucc & 0xdfU;
- if ((sex_ucc == '1') || (sex_ucc_upcase == 'M')) {
- cur_sex_code = 1;
- } else if ((sex_ucc == '2') || (sex_ucc_upcase == 'F')) {
- cur_sex_code = 2;
- }
+ cur_sex_code = CharToSex(token_ptrs[4][0]);
}
new_psam_info->sex_code = cur_sex_code;
// phenotypes
=====================================
plink2_pvar.cc
=====================================
@@ -59,6 +59,16 @@ PglErr ReadChrsetHeaderLine(const char* chrset_iter, const char* file_descrip, M
for (uint32_t uii = 0; uii != kChrOffsetCt; ++uii) {
cip->xymt_codes[uii] = UINT32_MAXM1;
}
+ if (StrStartsWithUnsafe(chrset_iter, "ID=")) {
+ // Need this field to conform to VCFv4.3 specification.
+ // Just ignore the ID value.
+ chrset_iter = strchrnul_n(&(chrset_iter[strlen("ID=")]), ',');
+ if (*chrset_iter != ',') {
+ snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s does not have expected ##chrSet format.\n", line_idx, file_descrip);
+ goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
+ }
+ ++chrset_iter;
+ }
if (StrStartsWithUnsafe(chrset_iter, "haploidAutosomeCt=")) {
uint32_t explicit_haploid_ct;
if (unlikely(ScanPosintCapped(&(chrset_iter[strlen("haploidAutosomeCt=")]), kMaxChrTextnum, &explicit_haploid_ct))) {
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/commit/71771f5c613e0273d97e3c9631556e19b48c7d7b
--
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/commit/71771f5c613e0273d97e3c9631556e19b48c7d7b
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210121/16ade098/attachment-0001.html>
More information about the debian-med-commit
mailing list