[med-svn] [plink1.9] 01/03: Imported Upstream version 1.90~b3.28-151216
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Sun Dec 20 13:51:34 UTC 2015
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository plink1.9.
commit 597ab217cb3659a3acadf6664e7f03164cc23dcd
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Fri Dec 18 19:55:23 2015 +0100
Imported Upstream version 1.90~b3.28-151216
---
Makefile | 2 +-
plink.c | 1020 +++++++++++++++++++---------------
plink_assoc.c | 742 +++----------------------
plink_assoc.h | 10 -
plink_calc.c | 33 +-
plink_cluster.c | 15 +-
plink_cluster.h | 2 +-
plink_common.c | 175 +++---
plink_common.h | 90 +--
plink_data.c | 1654 +++++++++++++++++++++++++++++++++----------------------
plink_data.h | 8 +-
plink_dosage.c | 90 +--
plink_dosage.h | 7 +-
plink_family.c | 1270 +++++++++++++++++++++++++++++++++++++++---
plink_family.h | 2 +-
plink_filter.c | 237 ++++----
plink_filter.h | 24 +-
plink_glm.c | 111 ++--
plink_help.c | 39 +-
plink_lasso.c | 1 +
plink_ld.c | 104 ++--
plink_misc.c | 244 +++++---
plink_misc.h | 1 +
plink_perm.c | 333 +++++++++++
plink_perm.h | 196 +++++++
plink_set.c | 85 ++-
plink_set.h | 4 +-
27 files changed, 4186 insertions(+), 2313 deletions(-)
diff --git a/Makefile b/Makefile
index c7ef5ee..e204486 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ ifdef NO_LAPACK
BLASFLAGS=
endif
-SRC = plink.c plink_assoc.c plink_calc.c plink_cluster.c plink_cnv.c plink_common.c plink_data.c plink_dosage.c plink_family.c plink_filter.c plink_glm.c plink_help.c plink_homozyg.c plink_lasso.c plink_ld.c plink_matrix.c plink_misc.c plink_rserve.c plink_set.c plink_stats.c SFMT.c dcdflib.c pigz.c yarn.c Rconnection.cc hfile.c bgzf.c
+SRC = plink.c plink_assoc.c plink_calc.c plink_cluster.c plink_cnv.c plink_common.c plink_data.c plink_dosage.c plink_family.c plink_filter.c plink_glm.c plink_help.c plink_homozyg.c plink_lasso.c plink_ld.c plink_matrix.c plink_misc.c plink_perm.c plink_rserve.c plink_set.c plink_stats.c SFMT.c dcdflib.c pigz.c yarn.c Rconnection.cc hfile.c bgzf.c
# In the event that you are still concurrently using PLINK 1.07, we suggest
# renaming that binary to "plink107" and "plink1". (Previously,
diff --git a/plink.c b/plink.c
index 2a006fa..de1f23c 100644
--- a/plink.c
+++ b/plink.c
@@ -91,7 +91,7 @@
const char ver_str[] =
#ifdef STABLE_BUILD
- "PLINK v1.90b3w"
+ "PLINK v1.90b3.28"
#else
"PLINK v1.90p"
#endif
@@ -103,19 +103,19 @@ const char ver_str[] =
#else
" 32-bit"
#endif
- " (3 Sep 2015)";
+ " (16 Dec 2015)";
const char ver_str2[] =
// include leading space if day < 10, so character length stays the same
- " "
+ ""
#ifdef STABLE_BUILD
"" // (don't want this when version number has a trailing letter)
#else
- " " // (don't want this when version number has e.g. "b3" before "p")
+ " " // (don't want this when version number has e.g. "b3" before "p")
#endif
#ifndef NOLAPACK
" "
#endif
- " https://www.cog-genomics.org/plink2\n"
+ " https://www.cog-genomics.org/plink2\n"
"(C) 2005-2015 Shaun Purcell, Christopher Chang GNU General Public License v3\n";
const char errstr_append[] = "For more information, try '" PROG_NAME_STR " --help [flag name]' or '" PROG_NAME_STR " --help | more'.\n";
#ifdef STABLE_BUILD
@@ -236,7 +236,7 @@ void calc_marker_reverse_bin(uintptr_t* marker_reverse, uintptr_t* marker_exclud
uint32_t markers_done = 0;
uint32_t marker_uidx_stop;
double dxx;
- do {
+ while (markers_done < marker_ct) {
marker_uidx = next_unset_unsafe(marker_exclude, marker_uidx);
marker_uidx_stop = next_set(marker_exclude, marker_uidx, unfiltered_marker_ct);
markers_done += marker_uidx_stop - marker_uidx;
@@ -247,7 +247,7 @@ void calc_marker_reverse_bin(uintptr_t* marker_reverse, uintptr_t* marker_exclud
set_allele_freqs[marker_uidx] = 1.0 - dxx;
}
}
- } while (markers_done < marker_ct);
+ }
}
void swap_reversed_marker_alleles(uintptr_t unfiltered_marker_ct, uintptr_t* marker_reverse, char** marker_allele_ptrs) {
@@ -288,7 +288,7 @@ static inline int32_t relationship_or_ibc_req(uint64_t calculation_type) {
return (relationship_req(calculation_type) || (calculation_type & CALC_IBC));
}
-int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, char* famname, char* cm_map_fname, char* cm_map_chrname, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* freqname, char* distance_wts_fname, char* read_dists_fname, char* read_dists_id_fname, char* evecname, char* mergename1, char* mergename2, char* mergename3, char* missing_mid_template, char* missing_marke [...]
+int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, char* famname, char* cm_map_fname, char* cm_map_chrname, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* freqname, char* distance_wts_fname, char* read_dists_fname, char* read_dists_id_fname, char* evecname, char* mergename1, char* mergename2, char* mergename3, char* missing_mid_template, char* missing_marke [...]
FILE* bedfile = NULL;
FILE* phenofile = NULL;
uintptr_t unfiltered_marker_ct = 0;
@@ -320,6 +320,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
uint32_t uii = 0;
int64_t llxx = 0;
uint32_t nonfounders = (misc_flags / MISC_NONFOUNDERS) & 1;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
uint32_t pheno_all = pheno_modifier & PHENO_ALL;
char* marker_ids = NULL;
uint32_t* marker_id_htable = NULL;
@@ -386,6 +388,9 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
uint32_t plink_maxfid = 0;
uint32_t plink_maxiid = 0;
uint32_t max_bim_linelen = 0;
+ uint32_t sample_male_ct = 0;
+ uint32_t sample_f_ct = 0;
+ uint32_t sample_f_male_ct = 0;
unsigned char* wkspace_mark2 = NULL;
unsigned char* wkspace_mark_precluster = NULL;
unsigned char* wkspace_mark_postcluster = NULL;
@@ -411,9 +416,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
int32_t ii;
int64_t llyy;
int64_t llzz;
- uint32_t sample_male_ct;
- uint32_t sample_f_ct;
- uint32_t sample_f_male_ct;
Pedigree_rel_info pri;
uintptr_t marker_uidx;
@@ -533,9 +535,9 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
goto plink_ret_1;
}
if (ulii > 80) {
- // only warn on long new marker ID, since if there's a long old marker ID
- // and no long new one, it's reasonable to infer that the user is fixing
- // the problem, so we shouldn't spam them.
+ // only warn on long new marker ID, since if there's a long old marker
+ // ID and no long new one, it's reasonable to infer that the user is
+ // fixing the problem, so we shouldn't spam them.
logerrprint("Warning: Unusually long new variant ID(s) in --update-name file. Double-check\nyour file and command-line parameters, and consider changing your naming\nscheme if you encounter memory problems.\n");
}
if (ulii > max_marker_id_len) {
@@ -573,7 +575,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
- retval = load_fam(famname, fam_cols, uii, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, &unfiltered_sample_ct, &sample_ids, &max_sample_id_len, &paternal_ids, &max_paternal_id_len, &maternal_ids, &max_maternal_id_len, &sex_nm, &sex_male, &affection, &pheno_nm, &pheno_c, &pheno_d, &founder_info, &sample_exclude);
+ retval = load_fam(famname, fam_cols, uii, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, &unfiltered_sample_ct, &sample_ids, &max_sample_id_len, &paternal_ids, &max_paternal_id_len, &maternal_ids, &max_maternal_id_len, &sex_nm, &sex_male, &affection, &pheno_nm, &pheno_c, &pheno_d, &founder_info, &sample_exclude, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
@@ -615,131 +617,133 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
} else {
LOGPRINTF("%" PRIuPTR " %s (%d male%s, %d female%s) loaded from .fam.\n", unfiltered_sample_ct, species_str(unfiltered_sample_ct), uii, (uii == 1)? "" : "s", ujj, (ujj == 1)? "" : "s");
}
- uii = popcount_longs(pheno_nm, unfiltered_sample_ctl);
- if (uii) {
- LOGPRINTF("%u phenotype value%s loaded from .fam.\n", uii, (uii == 1)? "" : "s");
- }
-
- if (phenoname && fopen_checked(&phenofile, phenoname, "r")) {
- goto plink_ret_OPEN_FAIL;
- }
+ if (unfiltered_sample_ct) {
+ uii = popcount_longs(pheno_nm, unfiltered_sample_ctl);
+ if (uii) {
+ LOGPRINTF("%u phenotype value%s loaded from .fam.\n", uii, (uii == 1)? "" : "s");
+ }
- if (phenofile || update_ids_fname || update_parents_fname || update_sex_fname || (filter_flags & FILTER_TAIL_PHENO)) {
- wkspace_mark = wkspace_base;
- retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
- if (retval) {
- goto plink_ret_1;
+ if (phenoname && fopen_checked(&phenofile, phenoname, "r")) {
+ goto plink_ret_OPEN_FAIL;
}
- if (makepheno_str) {
- retval = makepheno_load(phenofile, makepheno_str, unfiltered_sample_ct, cptr, max_sample_id_len, uiptr, pheno_nm, &pheno_c);
+ if (phenofile || update_ids_fname || update_parents_fname || update_sex_fname || (filter_flags & FILTER_TAIL_PHENO)) {
+ wkspace_mark = wkspace_base;
+ retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
if (retval) {
goto plink_ret_1;
}
- } else if (phenofile) {
- retval = load_pheno(phenofile, unfiltered_sample_ct, 0, cptr, max_sample_id_len, uiptr, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, mpheno_col, phenoname_str, pheno_nm, &pheno_c, &pheno_d, NULL, 0);
- if (retval) {
- if (retval == LOAD_PHENO_LAST_COL) {
- logerrprintb();
- retval = RET_INVALID_FORMAT;
- wkspace_reset(wkspace_mark);
+
+ if (makepheno_str) {
+ retval = makepheno_load(phenofile, makepheno_str, unfiltered_sample_ct, cptr, max_sample_id_len, uiptr, pheno_nm, &pheno_c);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ } else if (phenofile) {
+ retval = load_pheno(phenofile, unfiltered_sample_ct, 0, cptr, max_sample_id_len, uiptr, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, mpheno_col, phenoname_str, pheno_nm, &pheno_c, &pheno_d, NULL, 0);
+ if (retval) {
+ if (retval == LOAD_PHENO_LAST_COL) {
+ logerrprintb();
+ retval = RET_INVALID_FORMAT;
+ wkspace_reset(wkspace_mark);
+ }
+ goto plink_ret_1;
}
- goto plink_ret_1;
}
- }
- if (filter_flags & FILTER_TAIL_PHENO) {
- retval = convert_tail_pheno(unfiltered_sample_ct, pheno_nm, &pheno_c, &pheno_d, tail_bottom, tail_top, missing_phenod);
- if (retval) {
- goto plink_ret_1;
+ if (filter_flags & FILTER_TAIL_PHENO) {
+ retval = convert_tail_pheno(unfiltered_sample_ct, pheno_nm, &pheno_c, &pheno_d, tail_bottom, tail_top, missing_phenod);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
+ wkspace_reset(wkspace_mark);
}
- wkspace_reset(wkspace_mark);
- }
- if (pheno_c) {
- /*
- if (calculation_type & (CALC_REGRESS_PCS | CALC_REGRESS_PCS_DISTANCE)) {
- sprintf(logbuf, "Error: --regress-pcs%s requires a scalar phenotype.\n", (calculation_type & CALC_REGRESS_PCS_DISTANCE)? "-distance" : "");
- goto plink_ret_INVALID_CMDLINE_2;
- */
- if (calculation_type & (CALC_REGRESS_REL | CALC_REGRESS_DISTANCE | CALC_UNRELATED_HERITABILITY | CALC_GXE)) {
- if (calculation_type & CALC_REGRESS_REL) {
- logerrprint("Error: --regress-rel calculation requires a scalar phenotype.\n");
- } else if (calculation_type & CALC_REGRESS_DISTANCE) {
- logerrprint("Error: --regress-distance calculation requires a scalar phenotype.\n");
- } else if (calculation_type & CALC_UNRELATED_HERITABILITY) {
- logerrprint("Error: --unrelated-heritability requires a scalar phenotype.\n");
- } else if (calculation_type & CALC_GXE) {
- logerrprint("Error: --gxe requires a scalar phenotype.\n");
- }
- goto plink_ret_INVALID_CMDLINE;
- }
- } else {
- if (calculation_type & CALC_CLUSTER) {
- if (cluster_ptr->modifier & CLUSTER_CC) {
- logerrprint("Error: --cc requires a case/control phenotype.\n");
- goto plink_ret_INVALID_CMDLINE;
- } else if ((cluster_ptr->max_cases != 0xffffffffU) || (cluster_ptr->max_ctrls != 0xffffffffU)) {
- logerrprint("Error: --mcc requires a case/control phenotype.\n");
+ if (pheno_c) {
+ /*
+ if (calculation_type & (CALC_REGRESS_PCS | CALC_REGRESS_PCS_DISTANCE)) {
+ sprintf(logbuf, "Error: --regress-pcs%s requires a scalar phenotype.\n", (calculation_type & CALC_REGRESS_PCS_DISTANCE)? "-distance" : "");
+ goto plink_ret_INVALID_CMDLINE_2;
+ */
+ if (calculation_type & (CALC_REGRESS_REL | CALC_REGRESS_DISTANCE | CALC_UNRELATED_HERITABILITY | CALC_GXE)) {
+ if (calculation_type & CALC_REGRESS_REL) {
+ logerrprint("Error: --regress-rel calculation requires a scalar phenotype.\n");
+ } else if (calculation_type & CALC_REGRESS_DISTANCE) {
+ logerrprint("Error: --regress-distance calculation requires a scalar phenotype.\n");
+ } else if (calculation_type & CALC_UNRELATED_HERITABILITY) {
+ logerrprint("Error: --unrelated-heritability requires a scalar phenotype.\n");
+ } else if (calculation_type & CALC_GXE) {
+ logerrprint("Error: --gxe requires a scalar phenotype.\n");
+ }
goto plink_ret_INVALID_CMDLINE;
}
- } else if ((calculation_type & CALC_EPI) && (epi_ip->modifier & EPI_FAST)) {
- logerrprint("Error: --fast-epistasis requires a case/control phenotype.\n");
- goto plink_ret_INVALID_CMDLINE;
- } else if (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_FLIPSCAN)) {
- if (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST)) {
- logerrprint("Error: --ibs-test and --groupdist calculations require a case/control\nphenotype.\n");
- } else if (calculation_type & CALC_FLIPSCAN) {
- logerrprint("Error: --flip-scan requires a case/control phenotype.\n");
- }
- goto plink_ret_INVALID_CMDLINE;
- } else if ((calculation_type & CALC_RECODE) && (recode_modifier & (RECODE_HV | RECODE_HV_1CHR))) {
- logerrprint("Error: --recode HV{-1chr} requires a case/control phenotype.\n");
- goto plink_ret_INVALID_CMDLINE;
- } else if ((calculation_type & CALC_FST) && (misc_flags & MISC_FST_CC)) {
- logerrprint("Error: '--fst case-control' requires a case/control phenotype.\n");
- goto plink_ret_INVALID_CMDLINE;
- } else if ((calculation_type & CALC_FREQ) && (misc_flags & MISC_FREQ_CC)) {
- logerrprint("Error: '--freq case-control' requires a case/control phenotype.\n");
- goto plink_ret_INVALID_CMDLINE;
- }
- }
-
- if (!pheno_all) {
- if (loop_assoc_fname || (!pheno_d)) {
- if ((calculation_type & CALC_GLM) && (!(glm_modifier & GLM_LOGISTIC))) {
- logerrprint("Error: --linear without --all-pheno requires a scalar phenotype.\n");
+ } else {
+ if (calculation_type & CALC_CLUSTER) {
+ if (cluster_ptr->modifier & CLUSTER_CC) {
+ logerrprint("Error: --cc requires a case/control phenotype.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ } else if ((cluster_ptr->max_cases != 0xffffffffU) || (cluster_ptr->max_ctrls != 0xffffffffU)) {
+ logerrprint("Error: --mcc requires a case/control phenotype.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ }
+ } else if ((calculation_type & CALC_EPI) && (epi_ip->modifier & EPI_FAST)) {
+ logerrprint("Error: --fast-epistasis requires a case/control phenotype.\n");
goto plink_ret_INVALID_CMDLINE;
- } else if (calculation_type & CALC_QFAM) {
- logerrprint("Error: QFAM test requires a scalar phenotype.\n");
+ } else if (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_FLIPSCAN)) {
+ if (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST)) {
+ logerrprint("Error: --ibs-test and --groupdist calculations require a case/control\nphenotype.\n");
+ } else if (calculation_type & CALC_FLIPSCAN) {
+ logerrprint("Error: --flip-scan requires a case/control phenotype.\n");
+ }
goto plink_ret_INVALID_CMDLINE;
- }
- } else if (!pheno_c) {
- if ((calculation_type & CALC_MODEL) && (!(model_modifier & MODEL_ASSOC))) {
- logerrprint("Error: --model requires a case/control phenotype.\n");
+ } else if ((calculation_type & CALC_RECODE) && (recode_modifier & (RECODE_HV | RECODE_HV_1CHR))) {
+ logerrprint("Error: --recode HV{-1chr} requires a case/control phenotype.\n");
goto plink_ret_INVALID_CMDLINE;
- } else if ((calculation_type & CALC_GLM) && (glm_modifier & GLM_LOGISTIC)) {
- logerrprint("Error: --logistic without --all-pheno requires a case/control phenotype.\n");
+ } else if ((calculation_type & CALC_FST) && (misc_flags & MISC_FST_CC)) {
+ logerrprint("Error: '--fst case-control' requires a case/control phenotype.\n");
goto plink_ret_INVALID_CMDLINE;
- } else if (calculation_type & (CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_DFAM)) {
- if (calculation_type & CALC_CMH) {
- logerrprint("Error: --mh and --mh2 require a case/control phenotype.\n");
- } else if (calculation_type & CALC_HOMOG) {
- logerrprint("Error: --homog requires a case/control phenotype.\n");
- } else if (calculation_type & CALC_TESTMISS) {
- logerrprint("Error: --test-missing requires a case/control phenotype.\n");
- } else if (calculation_type & CALC_TDT) {
- logerrprint("Error: --tdt requires a case/control phenotype.\n");
- } else {
- logerrprint("Error: --dfam requires a case/control phenotype.\n");
- }
+ } else if ((calculation_type & CALC_FREQ) && (misc_flags & MISC_FREQ_CC)) {
+ logerrprint("Error: '--freq case-control' requires a case/control phenotype.\n");
goto plink_ret_INVALID_CMDLINE;
}
}
+
+ if (!pheno_all) {
+ if (loop_assoc_fname || (!pheno_d)) {
+ if ((calculation_type & CALC_GLM) && (!(glm_modifier & GLM_LOGISTIC))) {
+ logerrprint("Error: --linear without --all-pheno requires a scalar phenotype.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ } else if (calculation_type & CALC_QFAM) {
+ logerrprint("Error: QFAM test requires a scalar phenotype.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ }
+ } else if (!pheno_c) {
+ if ((calculation_type & CALC_MODEL) && (!(model_modifier & MODEL_ASSOC))) {
+ logerrprint("Error: --model requires a case/control phenotype.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ } else if ((calculation_type & CALC_GLM) && (glm_modifier & GLM_LOGISTIC)) {
+ logerrprint("Error: --logistic without --all-pheno requires a case/control phenotype.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ } else if (calculation_type & (CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_DFAM)) {
+ if (calculation_type & CALC_CMH) {
+ logerrprint("Error: --mh and --mh2 require a case/control phenotype.\n");
+ } else if (calculation_type & CALC_HOMOG) {
+ logerrprint("Error: --homog requires a case/control phenotype.\n");
+ } else if (calculation_type & CALC_TESTMISS) {
+ logerrprint("Error: --test-missing requires a case/control phenotype.\n");
+ } else if (calculation_type & CALC_TDT) {
+ logerrprint("Error: --tdt requires a case/control phenotype.\n");
+ } else {
+ logerrprint("Error: --dfam requires a case/control phenotype.\n");
+ }
+ goto plink_ret_INVALID_CMDLINE;
+ }
+ }
+ }
}
}
- if (cm_map_fname) {
+ if (cm_map_fname && unfiltered_marker_ct) {
// need sorted bps, but not marker IDs
if (map_is_unsorted & UNSORTED_BP) {
logerrprint("Error: --cm-map requires a sorted .bim file. Retry this command after using\n--make-bed to sort your data.\n");
@@ -751,112 +755,120 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
- uii = update_cm || update_map || update_name || (marker_alleles_needed && (update_alleles_fname || (flip_fname && (!flip_subset_fname)))) || filter_attrib_fname || qual_filter;
- if (uii || extractname || excludename) {
- // only permit duplicate marker IDs for --extract/--exclude
- wkspace_mark = wkspace_base;
- retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, !uii, &marker_id_htable, &marker_id_htable_size);
- if (retval) {
- goto plink_ret_1;
- }
- if (update_cm) {
- retval = update_marker_cms(update_cm, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_cms);
- if (retval) {
- goto plink_ret_1;
- }
- }
- if (update_map) {
- retval = update_marker_pos(update_map, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, marker_pos, &map_is_unsorted, chrom_info_ptr);
- } else if (update_name) {
- retval = update_marker_names(update_name, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct);
+ if (unfiltered_marker_ct != marker_exclude_ct) {
+ uii = update_cm || update_map || update_name || (marker_alleles_needed && (update_alleles_fname || (flip_fname && (!flip_subset_fname)))) || filter_attrib_fname || qual_filter;
+ if (uii || extractname || excludename) {
+ // only permit duplicate marker IDs for --extract/--exclude
+ wkspace_mark = wkspace_base;
+ retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, !uii, &marker_id_htable, &marker_id_htable_size);
if (retval) {
goto plink_ret_1;
}
- if (update_alleles_fname || (marker_alleles_needed && flip_fname && (!flip_subset_fname)) || extractname || excludename) {
- wkspace_reset(wkspace_mark);
- retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
- }
- }
- if (marker_alleles_needed) {
- if (update_alleles_fname) {
- retval = update_marker_alleles(update_alleles_fname, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, marker_allele_ptrs, &max_marker_allele_len, outname, outname_end);
- if (retval) {
- goto plink_ret_1;
- }
- }
- if (flip_fname && (!flip_subset_fname)) {
- retval = flip_strand(flip_fname, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, marker_allele_ptrs);
- if (retval) {
+ if (update_cm) {
+ retval = update_marker_cms(update_cm, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_cms);
+ if (retval) {
goto plink_ret_1;
- }
+ }
}
- }
- if (extractname) {
- if (!(misc_flags & MISC_EXTRACT_RANGE)) {
- retval = extract_exclude_flag_norange(extractname, marker_id_htable, marker_id_htable_size, 0, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+ if (update_map) {
+ retval = update_marker_pos(update_map, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, marker_pos, &map_is_unsorted, chrom_info_ptr);
if (retval) {
goto plink_ret_1;
}
- } else {
- if (map_is_unsorted & UNSORTED_BP) {
- logerrprint("Error: '--extract range' requires a sorted .bim. Retry this command after\nusing --make-bed to sort your data.\n");
- goto plink_ret_INVALID_CMDLINE;
- }
- retval = extract_exclude_range(extractname, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0, chrom_info_ptr);
+ } else if (update_name) {
+ retval = update_marker_names(update_name, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct);
if (retval) {
goto plink_ret_1;
}
- uljj = unfiltered_marker_ct - marker_exclude_ct;
- LOGPRINTF("--extract range: %" PRIuPTR " variant%s remaining.\n", uljj, (uljj == 1)? "" : "s");
+ if (update_alleles_fname || (marker_alleles_needed && flip_fname && (!flip_subset_fname)) || extractname || excludename) {
+ wkspace_reset(wkspace_mark);
+ retval = alloc_and_populate_id_htable(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ }
}
- }
- if (excludename) {
- if (!(misc_flags & MISC_EXCLUDE_RANGE)) {
- retval = extract_exclude_flag_norange(excludename, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+ if (marker_alleles_needed) {
+ if (update_alleles_fname) {
+ retval = update_marker_alleles(update_alleles_fname, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, marker_allele_ptrs, &max_marker_allele_len, outname, outname_end);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ }
+ if (flip_fname && (!flip_subset_fname)) {
+ retval = flip_strand(flip_fname, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, marker_allele_ptrs);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ }
+ }
+ if (extractname) {
+ if (!(misc_flags & MISC_EXTRACT_RANGE)) {
+ retval = extract_exclude_flag_norange(extractname, marker_id_htable, marker_id_htable_size, 0, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, allow_no_variants);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ } else {
+ if (map_is_unsorted & UNSORTED_BP) {
+ logerrprint("Error: '--extract range' requires a sorted .bim. Retry this command after\nusing --make-bed to sort your data.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ }
+ retval = extract_exclude_range(extractname, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0, allow_no_variants, chrom_info_ptr);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ uljj = unfiltered_marker_ct - marker_exclude_ct;
+ LOGPRINTF("--extract range: %" PRIuPTR " variant%s remaining.\n", uljj, (uljj == 1)? "" : "s");
+ }
+ }
+ if (excludename) {
+ if (!(misc_flags & MISC_EXCLUDE_RANGE)) {
+ retval = extract_exclude_flag_norange(excludename, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, allow_no_variants);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ } else {
+ if (map_is_unsorted & UNSORTED_BP) {
+ logerrprint("Error: '--exclude range' requires a sorted .bim. Retry this command after\nusing --make-bed to sort your data.\n");
+ goto plink_ret_INVALID_CMDLINE;
+ }
+ retval = extract_exclude_range(excludename, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 1, allow_no_variants, chrom_info_ptr);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ uljj = unfiltered_marker_ct - marker_exclude_ct;
+ LOGPRINTF("--exclude range: %" PRIuPTR " variant%s remaining.\n", uljj, (uljj == 1)? "" : "s");
+ }
+ }
+ if (filter_attrib_fname) {
+ retval = filter_attrib(filter_attrib_fname, filter_attrib_liststr, marker_id_htable, marker_id_htable_size, allow_no_variants, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
if (retval) {
goto plink_ret_1;
}
- } else {
- if (map_is_unsorted & UNSORTED_BP) {
- logerrprint("Error: '--exclude range' requires a sorted .bim. Retry this command after\nusing --make-bed to sort your data.\n");
- goto plink_ret_INVALID_CMDLINE;
- }
- retval = extract_exclude_range(excludename, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 1, chrom_info_ptr);
+ }
+ if (qual_filter) {
+ retval = filter_qual_scores(qual_filter, qual_min_thresh, qual_max_thresh, marker_id_htable, marker_id_htable_size, allow_no_variants, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
if (retval) {
goto plink_ret_1;
}
- uljj = unfiltered_marker_ct - marker_exclude_ct;
- LOGPRINTF("--exclude range: %" PRIuPTR " variant%s remaining.\n", uljj, (uljj == 1)? "" : "s");
}
+ wkspace_reset(wkspace_mark);
}
- if (filter_attrib_fname) {
- retval = filter_attrib(filter_attrib_fname, filter_attrib_liststr, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
- if (retval) {
- goto plink_ret_1;
- }
+
+ if (allelexxxx) {
+ allelexxxx_recode(allelexxxx, marker_allele_ptrs, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct);
}
- if (qual_filter) {
- retval = filter_qual_scores(qual_filter, qual_min_thresh, qual_max_thresh, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+
+ if (thin_keep_prob != 1.0) {
+ if (random_thin_markers(thin_keep_prob, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, allow_no_variants)) {
+ goto plink_ret_ALL_MARKERS_EXCLUDED;
+ }
+ } else if (thin_keep_ct != 0xffffffffU) {
+ retval = random_thin_markers_ct(thin_keep_ct, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
if (retval) {
- goto plink_ret_1;
+ goto plink_ret_1;
}
}
- wkspace_reset(wkspace_mark);
- }
-
- if (allelexxxx) {
- allelexxxx_recode(allelexxxx, marker_allele_ptrs, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct);
- }
-
- if (thin_keep_prob != 1.0) {
- if (random_thin_markers(thin_keep_prob, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct)) {
- goto plink_ret_ALL_MARKERS_EXCLUDED;
- }
- } else if (thin_keep_ct) {
- retval = random_thin_markers_ct(thin_keep_ct, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
- if (retval) {
- goto plink_ret_1;
- }
}
if (bedfile) {
@@ -888,13 +900,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
bed_offset = 2;
} else {
// pre-v0.99, sample-major, no header bytes
- llyy = llzz;
- if (llxx != llyy) {
+ if (llxx != llzz) {
// probably not PLINK-format at all, so give this error instead of
// "invalid file size"
logerrprint("Error: Invalid header bytes in .bed file.\n");
goto plink_ret_INVALID_FORMAT;
}
+ llyy = llzz;
bed_offset = 2;
}
if (llxx != llyy) {
@@ -924,63 +936,63 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
- if (update_ids_fname || update_parents_fname || update_sex_fname || keepname || keepfamname || removename || removefamname || filter_attrib_sample_fname || om_ip->marker_fname || filtername) {
+ if (unfiltered_sample_ct && (update_ids_fname || update_parents_fname || update_sex_fname || keepname || keepfamname || removename || removefamname || filter_attrib_sample_fname || om_ip->marker_fname || filtername)) {
wkspace_mark = wkspace_base;
- retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+ retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
if (retval) {
goto plink_ret_1;
}
- ulii = unfiltered_sample_ct - sample_exclude_ct;
if (update_ids_fname) {
- retval = update_sample_ids(update_ids_fname, cptr, ulii, max_sample_id_len, uiptr, sample_ids);
+ retval = update_sample_ids(update_ids_fname, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, sample_ids);
if (retval) {
goto plink_ret_1;
}
wkspace_reset(wkspace_base);
- retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
+ retval = sort_item_ids(&cptr, &uiptr, unfiltered_sample_ct, sample_exclude, 0, sample_ids, max_sample_id_len, 0, 0, strcmp_deref);
if (retval) {
goto plink_ret_1;
}
} else {
if (update_parents_fname) {
- retval = update_sample_parents(update_parents_fname, cptr, ulii, max_sample_id_len, uiptr, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, founder_info);
+ retval = update_sample_parents(update_parents_fname, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, founder_info);
if (retval) {
goto plink_ret_1;
}
}
if (update_sex_fname) {
- retval = update_sample_sexes(update_sex_fname, update_sex_col, cptr, ulii, max_sample_id_len, uiptr, sex_nm, sex_male);
+ retval = update_sample_sexes(update_sex_fname, update_sex_col, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, sex_nm, sex_male);
if (retval) {
goto plink_ret_1;
}
}
}
+ // sample_exclude_ct assumed to be 0 before this point
if (keepfamname) {
- retval = keep_or_remove(keepfamname, cptr, ulii, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 2);
+ retval = keep_or_remove(keepfamname, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 2, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
}
if (keepname) {
- retval = keep_or_remove(keepname, cptr, ulii, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 0);
+ retval = keep_or_remove(keepname, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 0, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
}
if (removefamname) {
- retval = keep_or_remove(removefamname, cptr, ulii, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 3);
+ retval = keep_or_remove(removefamname, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 3, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
}
if (removename) {
- retval = keep_or_remove(removename, cptr, ulii, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 1);
+ retval = keep_or_remove(removename, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 1, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
}
if (filter_attrib_sample_fname) {
- retval = filter_attrib_sample(filter_attrib_sample_fname, filter_attrib_sample_liststr, cptr, ulii, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct);
+ retval = filter_attrib_sample(filter_attrib_sample_fname, filter_attrib_sample_liststr, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, unfiltered_sample_ct, allow_no_samples, sample_exclude, &sample_exclude_ct);
if (retval) {
goto plink_ret_1;
}
@@ -988,16 +1000,23 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
if (om_ip->marker_fname) {
// would rather do this with pre-sorted markers, but that might break
// order-of-operations assumptions in existing pipelines
- retval = load_oblig_missing(bedfile, bed_offset, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, cptr, ulii, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, sex_male, chrom_info_ptr, om_ip);
- if (retval) {
- goto plink_ret_1;
+ if ((unfiltered_sample_ct == sample_exclude_ct) || (unfiltered_marker_ct == marker_exclude_ct)) {
+ // don't need this if everything that refers to om_ip is skipped
+ oblig_missing_cleanup(om_ip);
+ om_ip->cluster_ct = 0;
+ om_ip->entry_ct = 0;
+ } else {
+ retval = load_oblig_missing(bedfile, bed_offset, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, unfiltered_sample_ct, sex_male, chrom_info_ptr, om_ip);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
}
if (filtername) {
if (!mfilter_col) {
mfilter_col = 1;
}
- retval = filter_samples_file(filtername, cptr, ulii, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, filtervals_flattened, mfilter_col);
+ retval = filter_samples_file(filtername, cptr, unfiltered_sample_ct, max_sample_id_len, uiptr, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, filtervals_flattened, mfilter_col, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
@@ -1005,7 +1024,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
wkspace_reset(wkspace_mark);
}
- if (famname[0]) {
+ if (famname[0] && (unfiltered_sample_ct != sample_exclude_ct)) {
if (gender_unk_ct && (!(sex_missing_pheno & ALLOW_NO_SEX))) {
uii = popcount_longs_exclude(pheno_nm, sex_nm, unfiltered_sample_ctl);
if (uii) {
@@ -1028,11 +1047,11 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
bitfield_ornot(sample_exclude, pheno_nm, unfiltered_sample_ctl);
zero_trailing_bits(sample_exclude, unfiltered_sample_ct);
sample_exclude_ct = popcount_longs(sample_exclude, unfiltered_sample_ctl);
- if (sample_exclude_ct == unfiltered_sample_ct) {
+ if ((sample_exclude_ct == unfiltered_sample_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: All %s removed by --prune.\n", g_species_plural);
goto plink_ret_ALL_SAMPLES_EXCLUDED;
}
- LOGPRINTF("--prune: %" PRIuPTR " %s remaining.\n", unfiltered_sample_ct - sample_exclude_ct, species_str(unfiltered_sample_ct == sample_exclude_ct + 1));
+ LOGPRINTF("--prune: %" PRIuPTR " %s remaining.\n", unfiltered_sample_ct - sample_exclude_ct, species_str(unfiltered_sample_ct - sample_exclude_ct));
}
if (filter_flags & (FILTER_BINARY_CASES | FILTER_BINARY_CONTROLS)) {
@@ -1045,7 +1064,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
// fcc == 2: exclude all ones in pheno_c
// -> flip on fcc == 1
filter_samples_bitfields(unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, pheno_c, (filter_flags / FILTER_BINARY_CASES) & 1, pheno_nm);
- if (sample_exclude_ct == unfiltered_sample_ct) {
+ if ((sample_exclude_ct == unfiltered_sample_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: All %s removed due to case/control status (--filter-%s).\n", g_species_plural, (filter_flags & FILTER_BINARY_CASES)? "cases" : "controls");
goto plink_ret_ALL_SAMPLES_EXCLUDED;
}
@@ -1055,7 +1074,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
if (filter_flags & (FILTER_BINARY_FEMALES | FILTER_BINARY_MALES)) {
ii = sample_exclude_ct;
filter_samples_bitfields(unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sex_male, (filter_flags / FILTER_BINARY_MALES) & 1, sex_nm);
- if (sample_exclude_ct == unfiltered_sample_ct) {
+ if ((sample_exclude_ct == unfiltered_sample_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: All %s removed due to gender filter (--filter-%s).\n", g_species_plural, (filter_flags & FILTER_BINARY_MALES)? "males" : "females");
goto plink_ret_ALL_SAMPLES_EXCLUDED;
}
@@ -1065,7 +1084,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
if (filter_flags & (FILTER_BINARY_FOUNDERS | FILTER_BINARY_NONFOUNDERS)) {
ii = sample_exclude_ct;
filter_samples_bitfields(unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, founder_info, (filter_flags / FILTER_BINARY_FOUNDERS) & 1, NULL);
- if (sample_exclude_ct == unfiltered_sample_ct) {
+ if ((sample_exclude_ct == unfiltered_sample_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: All %s removed due to founder status (--filter-%s).\n", g_species_plural, (filter_flags & FILTER_BINARY_FOUNDERS)? "founders" : "nonfounders");
goto plink_ret_ALL_SAMPLES_EXCLUDED;
}
@@ -1074,10 +1093,10 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
if (thin_keep_sample_prob != 1.0) {
- if (random_thin_samples(thin_keep_sample_prob, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct)) {
+ if (random_thin_samples(thin_keep_sample_prob, unfiltered_sample_ct, allow_no_samples, sample_exclude, &sample_exclude_ct)) {
goto plink_ret_ALL_SAMPLES_EXCLUDED;
}
- } else if (thin_keep_sample_ct) {
+ } else if (thin_keep_sample_ct != 0xffffffffU) {
retval = random_thin_samples_ct(thin_keep_sample_ct, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct);
if (retval) {
goto plink_ret_1;
@@ -1085,7 +1104,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
if (mind_thresh < 1.0) {
- retval = mind_filter(bedfile, bed_offset, outname, outname_end, mind_thresh, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, sex_male, chrom_info_ptr, om_ip);
+ retval = mind_filter(bedfile, bed_offset, outname, outname_end, mind_thresh, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, sex_male, chrom_info_ptr, om_ip, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
@@ -1094,27 +1113,25 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
// could save off wkspace_mark here and free immediately after
// load_clusters(), if clusters are *only* used for filtering. But not a
// big deal.
- retval = load_clusters(cluster_ptr->fname, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, mwithin_col, (misc_flags / MISC_LOAD_CLUSTER_KEEP_NA) & 1, &cluster_ct, &cluster_map, &cluster_starts, &cluster_ids, &max_cluster_id_len, cluster_ptr->keep_fname, cluster_ptr->keep_flattened, cluster_ptr->remove_fname, cluster_ptr->remove_flattened);
+ retval = load_clusters(cluster_ptr->fname, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, mwithin_col, (misc_flags / MISC_LOAD_CLUSTER_KEEP_NA) & 1, &cluster_ct, &cluster_map, &cluster_starts, &cluster_ids, &max_cluster_id_len, cluster_ptr->keep_fname, cluster_ptr->keep_flattened, cluster_ptr->remove_fname, cluster_ptr->remove_flattened, allow_no_samples);
if (retval) {
goto plink_ret_1;
}
}
sample_ct = unfiltered_sample_ct - sample_exclude_ct;
- if (!sample_ct) {
- // defensive; currently shouldn't happen since we're actually checking at
- // every filter
+ if ((!sample_ct) && (!allow_no_samples)) {
+ // defensive; currently shouldn't happen since we're checking at every
+ // filter
LOGERRPRINTF("Error: No %s pass QC.\n", g_species_plural);
goto plink_ret_ALL_SAMPLES_EXCLUDED;
}
- if ((sample_ct == 1) && (relationship_or_ibc_req(calculation_type) || distance_req(calculation_type, read_dists_fname) || (calculation_type & (CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
- sprintf(logbuf, "Error: More than 1 %s required for pairwise analysis.\n", g_species_singular);
+ if ((sample_ct < 2) && (relationship_or_ibc_req(calculation_type) || distance_req(calculation_type, read_dists_fname) || (calculation_type & (CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
+ sprintf(logbuf, "Error: At least 2 %s required for pairwise analysis.\n", g_species_plural);
goto plink_ret_INVALID_CMDLINE_2;
}
- // er, this needs to check marker_ct instead of sample_ct for --r/--r2,
- // --fast-epistasis
- if ((parallel_tot > 1) && (parallel_tot > sample_ct / 2)) {
+ if ((parallel_tot > 1) && (calculation_type & (CALC_DISTANCE | CALC_GENOME | CALC_RELATIONSHIP)) && (parallel_tot > sample_ct / 2)) {
sprintf(logbuf, "Error: Too many --parallel jobs (maximum %" PRIuPTR "/2 = %" PRIuPTR ").\n", sample_ct, sample_ct / 2);
goto plink_ret_INVALID_CMDLINE_2;
}
@@ -1123,7 +1140,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
// Multithreaded BLAS/LAPACK call? If yes, and either user requested <=
// [0.5 * nprocs] threads or nprocs is unknown, warn that BLAS/LAPACK
// multithreading is not under PLINK's control.
- uii = ((!known_procs) || (known_procs * 2 >= g_thread_ct)) && ((calculation_type & (CALC_LASSO | CALC_PCA | CALC_UNRELATED_HERITABILITY)) || ((calculation_type & CALC_GLM) && pheno_d) || cluster_ptr->mds_dim_ct || ((calculation_type & CALC_LD_PRUNE) && (!(ldip->modifier & (LD_PRUNE_PAIRWISE | LD_PRUNE_PAIRPHASE)))));
+ uii = ((!known_procs) || (known_procs >= 2 * ((int32_t)g_thread_ct))) && ((calculation_type & (CALC_LASSO | CALC_PCA | CALC_UNRELATED_HERITABILITY)) || ((calculation_type & CALC_GLM) && pheno_d) || cluster_ptr->mds_dim_ct || ((calculation_type & CALC_LD_PRUNE) && (!(ldip->modifier & (LD_PRUNE_PAIRWISE | LD_PRUNE_PAIRPHASE)))));
#endif
if (g_thread_ct > 1) {
if ((calculation_type & (CALC_RELATIONSHIP | CALC_REL_CUTOFF | CALC_GDISTANCE_MASK | CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE | CALC_GENOME | CALC_REGRESS_REL | CALC_UNRELATED_HERITABILITY | CALC_LD | CALC_PCA | CALC_MAKE_PERM_PHENO | CALC_QFAM)) || ((calculation_type & CALC_MODEL) && (model_modifier & (MODEL_PERM | MODEL_MPERM))) || ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_PERM | GLM_MPERM))) || ((calculation_type & CALC_TESTMISS) && (testmiss_modifier & [...]
@@ -1231,40 +1248,36 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
- if (bimname[0]) {
- if (unfiltered_marker_ct == marker_exclude_ct) {
- // defensive
- logerrprint("Error: No variants remaining.\n");
- goto plink_ret_ALL_MARKERS_EXCLUDED;
- }
+ if (bimname[0] && (unfiltered_marker_ct != marker_exclude_ct)) {
plink_maxsnp = calc_plink_maxsnp(unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len);
uii = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
if (wkspace_alloc_ul_checked(&marker_reverse, uii * sizeof(intptr_t))) {
goto plink_ret_NOMEM;
}
fill_ulong_zero(marker_reverse, uii);
- if (bedfile) {
+ if (bedfile && sample_ct) {
retval = calc_freqs_and_hwe(bedfile, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, founder_info, nonfounders, (misc_flags / MISC_MAF_SUCC) & 1, set_allele_freqs, bed_offset, (hwe_thresh > 0.0) || (calculation_type & CALC_HARDY), hwe_modifier & HWE_THRESH_ALL, (pheno_nm_ct && pheno_c)? ((calculation_type / CALC [...]
if (retval) {
goto plink_ret_1;
}
-
- if (freqname) {
- retval = read_external_freqs(freqname, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr, marker_allele_ptrs, set_allele_freqs, nchrobs, (misc_flags / MISC_MAF_SUCC) & 1);
- if (retval) {
- goto plink_ret_1;
- }
- }
-
- if (!(misc_flags & MISC_KEEP_ALLELE_ORDER)) {
- // after this, set_allele_freqs[] has A2 freqs
- calc_marker_reverse_bin(marker_reverse, marker_exclude, unfiltered_marker_ct, unfiltered_marker_ct - marker_exclude_ct, set_allele_freqs);
- }
} else {
for (marker_uidx = 0; marker_uidx < unfiltered_marker_ct; marker_uidx++) {
set_allele_freqs[marker_uidx] = 1.0;
}
}
+
+ if (freqname) {
+ retval = read_external_freqs(freqname, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr, marker_allele_ptrs, set_allele_freqs, nchrobs, (misc_flags / MISC_MAF_SUCC) & 1);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ }
+
+ if (!(misc_flags & MISC_KEEP_ALLELE_ORDER)) {
+ // after this, set_allele_freqs[] has A2 freqs
+ calc_marker_reverse_bin(marker_reverse, marker_exclude, unfiltered_marker_ct, unfiltered_marker_ct - marker_exclude_ct, set_allele_freqs);
+ }
+
if (a1alleles || a2alleles) {
retval = load_ax_alleles(a1alleles? a1alleles : a2alleles, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_allele_ptrs, &max_marker_allele_len, marker_reverse, marker_ids, max_marker_id_len, set_allele_freqs, a2alleles? 1 : 0);
if (retval) {
@@ -1277,58 +1290,62 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
// contrary to the PLINK 1.07 flowchart, --freq effectively resolves before
// --geno.
- if (calculation_type & CALC_FREQ) {
- if (cluster_ct && (!(misc_flags & MISC_FREQX))) {
- if (misc_flags & MISC_FREQ_COUNTS) {
- logprint("Note: --freq 'counts' modifier has no effect on cluster-stratified report.\n");
+ if (sample_ct) {
+ if (calculation_type & CALC_FREQ) {
+ if (cluster_ct && (!(misc_flags & MISC_FREQX))) {
+ if (misc_flags & MISC_FREQ_COUNTS) {
+ logprint("Note: --freq 'counts' modifier has no effect on cluster-stratified report.\n");
+ }
+ retval = write_stratified_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_ct, sample_f_ct, founder_info, nonfounders, sex_male, sample_f_male_ct, marker_reverse, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len);
+ } else if (misc_flags & MISC_FREQ_CC) {
+ retval = write_cc_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, founder_info, nonfounders, sex_male, marker_reverse, pheno_nm, pheno_c);
+ } else {
+ retval = write_freqs(outname, outname_end, plink_maxsnp, unfiltered_marker_ct, marker_exclude, set_allele_freqs, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, hwe_hapl_allfs, hwe_haph_allfs, sample_f_ct, sample_f_male_ct, nonfounders, misc_flags, marker_reverse);
+ }
+ if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ))))) {
+ goto plink_ret_1;
}
- retval = write_stratified_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_ct, sample_f_ct, founder_info, nonfounders, sex_male, sample_f_male_ct, marker_reverse, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len);
- } else if (misc_flags & MISC_FREQ_CC) {
- retval = write_cc_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, founder_info, nonfounders, sex_male, marker_reverse, pheno_nm, pheno_c);
- } else {
- retval = write_freqs(outname, outname_end, plink_maxsnp, unfiltered_marker_ct, marker_exclude, set_allele_freqs, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, hwe_hapl_allfs, hwe_haph_allfs, sample_f_ct, sample_f_male_ct, nonfounders, misc_flags, marker_reverse);
- }
- if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ))))) {
- goto plink_ret_1;
}
- }
- if (calculation_type & CALC_MISSING_REPORT) {
- retval = write_missingness_reports(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_MISSING_GZ) & 1, plink_maxfid, plink_maxiid, plink_maxsnp, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, chrom_info_ptr, om_ip, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_ct, sample_exclude, pheno_nm, sex_male, sample_male_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, hh_exists);
- if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ | CALC_MISSING_REPORT))))) {
- goto plink_ret_1;
+ if (calculation_type & CALC_MISSING_REPORT) {
+ retval = write_missingness_reports(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_MISSING_GZ) & 1, plink_maxfid, plink_maxiid, plink_maxsnp, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, chrom_info_ptr, om_ip, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_ct, sample_exclude, pheno_nm, sex_male, sample_male_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, hh_exists);
+ if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ | CALC_MISSING_REPORT))))) {
+ goto plink_ret_1;
+ }
}
- }
- if (geno_excl_bitfield) {
- ulii = marker_exclude_ct;
- uljj = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
- bitfield_or(marker_exclude, geno_excl_bitfield, uljj);
- marker_exclude_ct = popcount_longs(marker_exclude, uljj);
- if (marker_exclude_ct == unfiltered_marker_ct) {
- logerrprint("Error: All variants excluded due to missing genotype data (--geno).\n");
- goto plink_ret_ALL_MARKERS_EXCLUDED;
+ if (geno_excl_bitfield) {
+ ulii = marker_exclude_ct;
+ uljj = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+ bitfield_or(marker_exclude, geno_excl_bitfield, uljj);
+ marker_exclude_ct = popcount_longs(marker_exclude, uljj);
+ if ((marker_exclude_ct == unfiltered_marker_ct) && (!allow_no_variants)) {
+ logerrprint("Error: All variants excluded due to missing genotype data (--geno).\n");
+ goto plink_ret_ALL_MARKERS_EXCLUDED;
+ }
+ ulii = marker_exclude_ct - ulii;
+ LOGPRINTF("%" PRIuPTR " variant%s removed due to missing genotype data (--geno).\n", ulii, (ulii == 1)? "" : "s");
}
- ulii = marker_exclude_ct - ulii;
- LOGPRINTF("%" PRIuPTR " variant%s removed due to missing genotype data (--geno).\n", ulii, (ulii == 1)? "" : "s");
}
oblig_missing_cleanup(om_ip);
- if (calculation_type & CALC_HARDY) {
- retval = hardy_report(outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, hwe_lls, hwe_lhs, hwe_hhs, hwe_modifier, nonfounders, hwe_ll_cases, hwe_lh_cases, hwe_hh_cases, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, pheno_nm_ct, pheno_c, chrom_info_ptr);
- if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ | CALC_HARDY))))) {
- goto plink_ret_1;
+ if (sample_ct) {
+ if (calculation_type & CALC_HARDY) {
+ retval = hardy_report(outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, hwe_lls, hwe_lhs, hwe_hhs, hwe_modifier, nonfounders, hwe_ll_cases, hwe_lh_cases, hwe_hh_cases, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, pheno_nm_ct, pheno_c, chrom_info_ptr);
+ if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ | CALC_HARDY))))) {
+ goto plink_ret_1;
+ }
}
- }
- if (hwe_thresh > 0.0) {
- if (enforce_hwe_threshold(hwe_thresh, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, hwe_lls, hwe_lhs, hwe_hhs, hwe_modifier, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, chrom_info_ptr)) {
- goto plink_ret_ALL_MARKERS_EXCLUDED;
+ if (hwe_thresh > 0.0) {
+ if (enforce_hwe_threshold(hwe_thresh, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, hwe_lls, hwe_lhs, hwe_hhs, hwe_modifier, allow_no_variants, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, chrom_info_ptr)) {
+ goto plink_ret_ALL_MARKERS_EXCLUDED;
+ }
}
}
if ((min_maf != 0.0) || (max_maf != 0.5) || ac_excl_bitfield) {
- if (enforce_minor_allele_thresholds(min_maf, max_maf, unfiltered_marker_ct, marker_exclude, ac_excl_bitfield, &marker_exclude_ct, set_allele_freqs)) {
+ if (enforce_minor_allele_thresholds(min_maf, max_maf, unfiltered_marker_ct, marker_exclude, ac_excl_bitfield, &marker_exclude_ct, set_allele_freqs, allow_no_variants)) {
goto plink_ret_ALL_MARKERS_EXCLUDED;
}
}
- if (min_bp_space) {
+ if (min_bp_space && (unfiltered_marker_ct > marker_exclude_ct)) {
if (map_is_unsorted & UNSORTED_BP) {
logerrprint("Error: --bp-space requires a sorted .bim file. Retry this command after using\n--make-bed to sort your data.\n");
goto plink_ret_INVALID_FORMAT;
@@ -1336,9 +1353,9 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
enforce_min_bp_space(min_bp_space, unfiltered_marker_ct, marker_exclude, marker_pos, &marker_exclude_ct, chrom_info_ptr);
}
- if (bedfile) {
+ if (bedfile && sample_ct && (unfiltered_marker_ct > marker_exclude_ct)) {
if ((calculation_type & CALC_MENDEL) || (fam_ip->mendel_modifier & MENDEL_FILTER)) {
- retval = mendel_error_scan(fam_ip, bedfile, bed_offset, outname, outname_end, plink_maxfid, plink_maxiid, plink_maxsnp, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, marker_reverse, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, hh_exists, chrom_info_ptr, (calc [...]
+ retval = mendel_error_scan(fam_ip, bedfile, bed_offset, outname, outname_end, plink_maxfid, plink_maxiid, plink_maxsnp, allow_no_variants, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, marker_reverse, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, hh_exists, ch [...]
if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ | CALC_MISSING_REPORT | CALC_MENDEL))))) {
goto plink_ret_1;
}
@@ -1365,25 +1382,33 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
logerrprint("Error: --set/--make-set requires a sorted .bim file. Retry this command after\nusing --make-bed to sort your data.\n");
goto plink_ret_INVALID_FORMAT;
}
- retval = define_sets(sip, unfiltered_marker_ct, marker_exclude, marker_pos, &marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr);
+ retval = define_sets(sip, unfiltered_marker_ct, marker_exclude, marker_pos, &marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr, allow_no_variants);
if (retval) {
goto plink_ret_1;
}
}
marker_ct = unfiltered_marker_ct - marker_exclude_ct;
- if (!marker_ct) {
+ if ((!marker_ct) && (!allow_no_variants)) {
// defensive
logerrprint("Error: All variants fail QC.\n");
goto plink_ret_ALL_MARKERS_EXCLUDED;
}
+ // could add --parallel + --r/--r2/--{fast-}epistasis check here. (this
+ // currently happens downstream)
+
if (bedfile) {
LOGPRINTFWW("%" PRIuPTR " variant%s and %" PRIuPTR " %s pass filters and QC%s.\n", marker_ct, (marker_ct == 1)? "" : "s", sample_ct, species_str(sample_ct), (calculation_type & CALC_REL_CUTOFF)? " (before --rel-cutoff)": "");
} else {
LOGPRINTFWW("%" PRIuPTR " variant%s filters and QC.\n", marker_ct, (marker_ct == 1)? " passes" : "s pass");
}
+ } else if (!allow_no_variants) {
+ // defensive
+ logerrprint("Error: No variants remaining.\n");
+ goto plink_ret_ALL_MARKERS_EXCLUDED;
}
- if (famname[0]) {
+
+ if (famname[0] && sample_ct) {
if (!pheno_nm_ct) {
logprint("Note: No phenotypes present.\n");
} else if (pheno_c) {
@@ -1399,134 +1424,141 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
}
}
- if (relationship_or_ibc_req(calculation_type)) {
- if (relip->pca_cluster_names_flattened || relip->pca_clusters_fname) {
- retval = extract_clusters(unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, relip->pca_cluster_names_flattened, relip->pca_clusters_fname, &pca_sample_exclude, &pca_sample_ct);
- if (retval) {
- goto plink_ret_1;
- }
- if (pca_sample_ct < 2) {
- logerrprint("Error: Too few samples specified by --pca-cluster-names/--pca-clusters.\n");
- goto plink_ret_1;
- }
- if (pca_sample_ct == sample_ct) {
- logerrprint("Warning: --pca-cluster-names/--pca-clusters has no effect since all samples are\nin the named clusters.\n");
- pca_sample_exclude = NULL;
- } else {
- LOGPRINTF("--pca-cluster-names/--pca-clusters: %" PRIuPTR " samples specified.\n", pca_sample_ct);
- ulii = unfiltered_sample_ct - pca_sample_ct;
- }
- }
- retval = calc_rel(threads, parallel_idx, parallel_tot, calculation_type, relip, bedfile, bed_offset, outname, outname_end, distance_wts_fname, (dist_calc_type & DISTANCE_WTS_NOHEADER), unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? (&ulii) : (&sample_exclude_ct), sample_ids, max_sample_id_len, set_allele_freqs, &rel_ibc, chrom_info_ptr);
- if (retval) {
- goto plink_ret_1;
- }
- if ((!pca_sample_exclude) && (sample_ct != unfiltered_sample_ct + sample_exclude_ct)) {
- sample_ct = unfiltered_sample_ct - sample_exclude_ct;
- if ((sample_ct < 2) && (distance_req(calculation_type, read_dists_fname) || (calculation_type & (CALC_REGRESS_REL | CALC_PCA | CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
- // pathological case
- sprintf(logbuf, "Error: Too many %s pruned for additional pairwise analysis steps.\n", g_species_plural);
- goto plink_ret_INVALID_CMDLINE_2;
+ if (sample_ct) {
+ if (marker_ct) {
+ if (relationship_or_ibc_req(calculation_type)) {
+ if (relip->pca_cluster_names_flattened || relip->pca_clusters_fname) {
+ retval = extract_clusters(unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, relip->pca_cluster_names_flattened, relip->pca_clusters_fname, &pca_sample_exclude, &pca_sample_ct);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ if (pca_sample_ct < 2) {
+ logerrprint("Error: Too few samples specified by --pca-cluster-names/--pca-clusters.\n");
+ goto plink_ret_1;
+ }
+ if (pca_sample_ct == sample_ct) {
+ logerrprint("Warning: --pca-cluster-names/--pca-clusters has no effect since all samples are\nin the named clusters.\n");
+ pca_sample_exclude = NULL;
+ } else {
+ LOGPRINTF("--pca-cluster-names/--pca-clusters: %" PRIuPTR " samples specified.\n", pca_sample_ct);
+ ulii = unfiltered_sample_ct - pca_sample_ct;
+ }
+ }
+ retval = calc_rel(threads, parallel_idx, parallel_tot, calculation_type, relip, bedfile, bed_offset, outname, outname_end, distance_wts_fname, (dist_calc_type & DISTANCE_WTS_NOHEADER), unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? (&ulii) : (&sample_exclude_ct), sample_ids, max_sample_id_len, set_allele_freqs, &rel_ibc, chrom_info_ptr);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ if ((!pca_sample_exclude) && (sample_ct != unfiltered_sample_ct - sample_exclude_ct)) { // cached count disagrees with the exclusion count handed to calc_rel() by pointer
+ sample_ct = unfiltered_sample_ct - sample_exclude_ct; // resync the cached sample count
+ if ((sample_ct < 2) && (distance_req(calculation_type, read_dists_fname) || (calculation_type & (CALC_REGRESS_REL | CALC_PCA | CALC_GENOME | CALC_CLUSTER | CALC_NEIGHBOR)))) {
+ // pathological case: fewer than two samples remain for a pairwise analysis
+ sprintf(logbuf, "Error: Too many %s pruned for additional pairwise analysis steps.\n", g_species_plural);
+ goto plink_ret_INVALID_CMDLINE_2;
+ }
+ }
+ if (calculation_type & CALC_REL_CUTOFF) {
+ // ugh, probably better to just stop supporting this
+ bitfield_andnot(founder_info, sample_exclude, unfiltered_sample_ctl);
+ bitfield_andnot(sex_nm, sample_exclude, unfiltered_sample_ctl);
+ bitfield_and(sex_male, sex_nm, unfiltered_sample_ctl);
+ if (pheno_nm_ct) {
+ bitfield_andnot(pheno_nm, sample_exclude, unfiltered_sample_ctl);
+ pheno_nm_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
+ if (pheno_c) {
+ bitfield_and(pheno_c, pheno_nm, unfiltered_sample_ctl);
+ pheno_ctrl_ct = pheno_nm_ct - popcount_longs(pheno_c, unfiltered_sample_ctl);
+ }
+ }
+ }
+
+ if (calculation_type & CALC_REGRESS_REL) {
+ retval = regress_rel_main(unfiltered_sample_ct, sample_exclude, sample_ct, relip, threads, pheno_d);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ }
+#ifndef NOLAPACK
+ if (calculation_type & CALC_PCA) {
+ retval = calc_pca(bedfile, bed_offset, outname, outname_end, calculation_type, relip, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? pca_sample_ct : sample_ct, sample_ids, max_sample_id_len, set_allele_freqs, chrom_info_ptr, rel_ibc);
+ } else if (calculation_type & CALC_UNRELATED_HERITABILITY) {
+ if (sample_ct != pheno_nm_ct) {
+ logerrprint("Error: --unrelated-heritability requires phenotype data for all samples.\n(--prune should help.)\n");
+ goto plink_ret_INVALID_CMDLINE;
+ }
+ retval = calc_unrelated_herit(calculation_type, relip, unfiltered_sample_ct, sample_exclude, sample_ct, pheno_d, rel_ibc);
+ }
+#endif
+ wkspace_reset(g_sample_missing_unwt);
+ if (retval) {
+ goto plink_ret_1;
+ }
+ g_sample_missing_unwt = NULL;
+ g_missing_dbl_excluded = NULL;
}
- }
- if (calculation_type & CALC_REL_CUTOFF) {
- // ugh, probably better to just stop supporting this
- bitfield_andnot(founder_info, sample_exclude, unfiltered_sample_ctl);
- bitfield_andnot(sex_nm, sample_exclude, unfiltered_sample_ctl);
- bitfield_and(sex_male, sex_nm, unfiltered_sample_ctl);
- if (pheno_nm_ct) {
- bitfield_andnot(pheno_nm, sample_exclude, unfiltered_sample_ctl);
- pheno_nm_ct = popcount_longs(pheno_nm, unfiltered_sample_ctl);
- if (pheno_c) {
- bitfield_and(pheno_c, pheno_nm, unfiltered_sample_ctl);
- pheno_ctrl_ct = pheno_nm_ct - popcount_longs(pheno_c, unfiltered_sample_ctl);
+
+ if (calculation_type & CALC_SEXCHECK) {
+ retval = sexcheck(bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, sex_nm, sex_male, misc_flags, check_sex_fthresh, check_sex_mthresh, check_sex_f_yobs, check_sex_m_yobs, chrom_info_ptr, set_allele_freqs, &gender_unk_ct);
+ if (retval) {
+ goto plink_ret_1;
}
}
}
- if (calculation_type & CALC_REGRESS_REL) {
- retval = regress_rel_main(unfiltered_sample_ct, sample_exclude, sample_ct, relip, threads, pheno_d);
+ if (calculation_type & CALC_MAKE_PERM_PHENO) {
+ retval = make_perm_pheno(threads, outname, outname_end, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, pheno_nm_ct, pheno_nm, pheno_c, pheno_d, output_missing_pheno, permphe_ct);
if (retval) {
goto plink_ret_1;
}
}
-#ifndef NOLAPACK
- if (calculation_type & CALC_PCA) {
- retval = calc_pca(bedfile, bed_offset, outname, outname_end, calculation_type, relip, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? pca_sample_ct : sample_ct, sample_ids, max_sample_id_len, set_allele_freqs, chrom_info_ptr, rel_ibc);
- } else if (calculation_type & CALC_UNRELATED_HERITABILITY) {
- if (sample_ct != pheno_nm_ct) {
- logerrprint("Error: --unrelated-heritability requires phenotype data for all samples.\n(--prune should help.)\n");
- goto plink_ret_INVALID_CMDLINE;
- }
- retval = calc_unrelated_herit(calculation_type, relip, unfiltered_sample_ct, sample_exclude, sample_ct, pheno_d, rel_ibc);
- }
-#endif
- wkspace_reset(g_sample_missing_unwt);
- if (retval) {
- goto plink_ret_1;
- }
- g_sample_missing_unwt = NULL;
- g_missing_dbl_excluded = NULL;
- }
- if (calculation_type & CALC_SEXCHECK) {
- retval = sexcheck(bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, sex_nm, sex_male, misc_flags, check_sex_fthresh, check_sex_mthresh, check_sex_f_yobs, check_sex_m_yobs, chrom_info_ptr, set_allele_freqs, &gender_unk_ct);
- if (retval) {
- goto plink_ret_1;
- }
- }
-
- if (calculation_type & CALC_MAKE_PERM_PHENO) {
- retval = make_perm_pheno(threads, outname, outname_end, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, pheno_nm_ct, pheno_nm, pheno_c, pheno_d, output_missing_pheno, permphe_ct);
- if (retval) {
- goto plink_ret_1;
+ if ((calculation_type & CALC_GENOME) || genome_skip_write) {
+ // er, this probably should be moved inside calc_genome(), since we're
+ // using get_trios_and_families() instead of pri elsewhere
+ retval = populate_pedigree_rel_info(&pri, unfiltered_sample_ct, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, founder_info);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
}
- if ((calculation_type & CALC_GENOME) || genome_skip_write) {
- // er, this probably should be moved inside calc_genome(), since we're
- // using get_trios_and_families() instead of pri elsewhere
- retval = populate_pedigree_rel_info(&pri, unfiltered_sample_ct, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, founder_info);
- if (retval) {
- goto plink_ret_1;
+ if (marker_ct) {
+ if (calculation_type & CALC_WRITE_SET) {
+ retval = write_set(sip, outname, outname_end, marker_ct, unfiltered_marker_ct, marker_exclude, marker_ids, max_marker_id_len, marker_pos, chrom_info_ptr);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
- }
- if (calculation_type & CALC_WRITE_SET) {
- retval = write_set(sip, outname, outname_end, marker_ct, unfiltered_marker_ct, marker_exclude, marker_ids, max_marker_id_len, marker_pos, chrom_info_ptr);
- if (retval) {
- goto plink_ret_1;
+ if (calculation_type & CALC_WRITE_SNPLIST) {
+ retval = write_snplist(outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, NULL, 0);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
- }
- if (calculation_type & CALC_WRITE_SNPLIST) {
- retval = write_snplist(outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, NULL, 0);
- if (retval) {
- goto plink_ret_1;
- }
- }
- if (calculation_type & CALC_WRITE_VAR_RANGES) {
- retval = write_var_ranges(outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, write_var_range_ct);
- if (retval) {
- goto plink_ret_1;
+ if (calculation_type & CALC_WRITE_VAR_RANGES) {
+ retval = write_var_ranges(outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, write_var_range_ct);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
- }
- if (calculation_type & CALC_LIST_23_INDELS) {
- retval = write_snplist(outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_allele_ptrs, 1);
- if (retval) {
- goto plink_ret_1;
+ if (calculation_type & CALC_LIST_23_INDELS) {
+ retval = write_snplist(outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_allele_ptrs, 1);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
- }
- if (calculation_type & CALC_DUPVAR) {
- if (map_is_unsorted & UNSORTED_BP) {
- logerrprint("Error: --list-duplicate-vars requires a sorted .bim file. Retry this command\nafter using --make-bed to sort your data.\n");
- goto plink_ret_INVALID_FORMAT;
- }
- retval = list_duplicate_vars(outname, outname_end, dupvar_modifier, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_pos, chrom_info_ptr, marker_allele_ptrs);
- if (retval) {
- goto plink_ret_1;
+ if (calculation_type & CALC_DUPVAR) {
+ if (map_is_unsorted & UNSORTED_BP) {
+ logerrprint("Error: --list-duplicate-vars requires a sorted .bim file. Retry this command\nafter using --make-bed to sort your data.\n");
+ goto plink_ret_INVALID_FORMAT;
+ }
+ retval = list_duplicate_vars(outname, outname_end, dupvar_modifier, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_pos, chrom_info_ptr, marker_allele_ptrs);
+ if (retval) {
+ goto plink_ret_1;
+ }
}
}
@@ -1538,7 +1570,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
memcpy(pheno_nm_datagen, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
bitfield_and(pheno_nm_datagen, sex_nm, unfiltered_sample_ctl);
}
- if (covar_fname && (calculation_type & (CALC_WRITE_COVAR | CALC_MAKE_BED | CALC_MAKE_FAM | CALC_RECODE))) {
+ if (covar_ct && (calculation_type & (CALC_WRITE_COVAR | CALC_MAKE_BED | CALC_MAKE_FAM | CALC_RECODE)) && sample_ct) {
retval = write_covars(outname, outname_end, write_covar_modifier, write_covar_dummy_max_categories, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, sex_nm, sex_male, pheno_nm_datagen? pheno_nm_datagen : pheno_nm, pheno_c, pheno_d, missing_phenod, output_missing_pheno, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d);
if (retval) {
goto plink_ret_1;
@@ -1559,6 +1591,10 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
aligned_free_cond_null(&pheno_nm_datagen);
}
+ if ((!marker_ct) || (!sample_ct)) {
+ goto plink_ret_1;
+ }
+
if ((calculation_type & CALC_EPI) && epi_ip->twolocus_mkr1) {
retval = twolocus(epi_ip, bedfile, bed_offset, marker_ct, unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, chrom_info_ptr, unfiltered_sample_ct, sample_exclude, sample_ct, pheno_nm, pheno_nm_ct, pheno_ctrl_ct, pheno_c, sex_male, outname, outname_end, hh_exists);
if (retval) {
@@ -1851,7 +1887,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
uii = 0; // phenotype/cluster number
*outname_end = '.';
if (loop_assoc_fname) {
- retval = load_clusters(loop_assoc_fname, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, mwithin_col, (misc_flags / MISC_LOAD_CLUSTER_KEEP_NA) & 1, &cluster_ct, &cluster_map, &cluster_starts, &cluster_ids, &max_cluster_id_len, NULL, NULL, NULL, NULL);
+ retval = load_clusters(loop_assoc_fname, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, mwithin_col, (misc_flags / MISC_LOAD_CLUSTER_KEEP_NA) & 1, &cluster_ct, &cluster_map, &cluster_starts, &cluster_ids, &max_cluster_id_len, NULL, NULL, NULL, NULL, 0);
if (retval) {
goto plink_ret_1;
}
@@ -1914,6 +1950,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
retval = load_pheno(phenofile, unfiltered_sample_ct, sample_exclude_ct, cptr, max_sample_id_len, uiptr, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, uii, NULL, pheno_nm, &pheno_c, &pheno_d, &(outname_end[1]), (uintptr_t)((&(outname[FNAMESIZE - 32])) - outname_end));
if (retval == LOAD_PHENO_LAST_COL) {
wkspace_reset(wkspace_mark);
+ retval = 0; // exit code bugfix
break;
} else if (retval) {
goto plink_ret_1;
@@ -2112,8 +2149,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
return retval;
}
-// output-missing-phenotype + terminating null, or 'recode 01 fastphase-1chr'
-#define MAX_FLAG_LEN 25
+// meta-analysis-report-dups + terminating null, or 'recode 01 fastphase-1chr'
+#define MAX_FLAG_LEN 26
static inline int32_t is_flag(char* param) {
unsigned char ucc = param[1];
@@ -3216,8 +3253,8 @@ int32_t main(int32_t argc, char** argv) {
uint64_t filter_flags = 0;
double thin_keep_prob = 1.0;
double thin_keep_sample_prob = 1.0;
- uint32_t thin_keep_ct = 0;
- uint32_t thin_keep_sample_ct = 0;
+ uint32_t thin_keep_ct = 0xffffffffU;
+ uint32_t thin_keep_sample_ct = 0xffffffffU;
uint32_t min_bp_space = 0;
uint32_t check_sex_f_yobs = 0;
uint32_t check_sex_m_yobs = 0;
@@ -3339,7 +3376,11 @@ int32_t main(int32_t argc, char** argv) {
double lasso_minlambda = -1;
uint32_t testmiss_modifier = 0;
uint32_t testmiss_mperm_val = 0;
+
+ // this default limit plays well with e.g. fbstring small-string optimization
uint32_t new_id_max_allele_len = 23;
+
+ uint32_t aperm_present = 0;
char* segment_spanning_fname = NULL;
char* missing_code = NULL;
char range_delim = '-';
@@ -3372,6 +3413,7 @@ int32_t main(int32_t argc, char** argv) {
time_t rawtime;
char* argptr;
char* sptr;
+ const char* csptr;
int32_t ii;
int32_t jj;
int32_t kk;
@@ -3400,7 +3442,7 @@ int32_t main(int32_t argc, char** argv) {
char* flagptr;
double dxx;
char cc;
- uint32_t known_procs;
+ int32_t known_procs;
uint32_t uii;
uint32_t ujj;
uint32_t ukk;
@@ -3995,7 +4037,7 @@ int32_t main(int32_t argc, char** argv) {
ii = sysconf(_SC_NPROCESSORS_ONLN);
if (ii == -1) {
g_thread_ct = 1;
- known_procs = 0;
+ known_procs = -1;
} else {
g_thread_ct = ii;
known_procs = ii;
@@ -4244,6 +4286,18 @@ int32_t main(int32_t argc, char** argv) {
} else if (!memcmp(argptr2, "llow-no-sex", 12)) {
sex_missing_pheno |= ALLOW_NO_SEX;
goto main_param_zero;
+ } else if (!memcmp(argptr2, "llow-no-samples", 16)) {
+ UNSTABLE("allow-no-samples");
+ misc_flags |= MISC_ALLOW_NO_SAMPLES;
+ goto main_param_zero;
+ } else if (!memcmp(argptr2, "llow-no-vars", 13)) {
+ UNSTABLE("allow-no-vars");
+ misc_flags |= MISC_ALLOW_NO_VARS;
+ goto main_param_zero;
+ } else if (!memcmp(argptr2, "llow-no-covars", 15)) {
+ UNSTABLE("allow-no-covars");
+ covar_modifier |= COVAR_ALLOW_NONE;
+ goto main_param_zero;
} else if (!memcmp(argptr2, "ll", 3)) {
logprint("Note: --all flag has no effect.\n");
goto main_param_zero;
@@ -4429,6 +4483,7 @@ int32_t main(int32_t argc, char** argv) {
}
}
}
+ aperm_present = 1;
} else if (!memcmp(argptr2, "1-allele", 9)) {
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 4)) {
goto main_ret_INVALID_CMDLINE_2A;
@@ -4592,20 +4647,20 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_2A;
}
if (param_ct) {
- sptr = argv[cur_arg + 1];
- if (strlen(sptr) > (FNAMESIZE - 5)) {
+ csptr = argv[cur_arg + 1];
+ if (strlen(csptr) > (FNAMESIZE - 5)) {
logerrprint("Error: --bfile parameter too long.\n");
goto main_ret_OPEN_FAIL;
}
} else {
- sptr = (char*)PROG_NAME_STR;
+ csptr = PROG_NAME_STR;
}
if (!(load_params & LOAD_PARAMS_BED)) {
- memcpy(strcpya(pedname, sptr), ".bed", 5);
+ memcpy(strcpya(pedname, csptr), ".bed", 5);
load_params |= LOAD_PARAMS_BED;
}
- memcpy(strcpya(mapname, sptr), ".bim", 5);
- memcpy(strcpya(famname, sptr), ".fam", 5);
+ memcpy(strcpya(mapname, csptr), ".bim", 5);
+ memcpy(strcpya(famname, csptr), ".fam", 5);
load_params |= LOAD_PARAMS_BIM | LOAD_PARAMS_FAM;
} else if (!memcmp(argptr2, "ed", 3)) {
if (load_rare) {
@@ -5976,20 +6031,20 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_2A;
}
if (param_ct) {
- sptr = argv[cur_arg + 1];
- if (strlen(sptr) > (FNAMESIZE - 8)) {
+ csptr = argv[cur_arg + 1];
+ if (strlen(csptr) > (FNAMESIZE - 8)) {
logerrprint("Error: --data parameter too long.\n");
goto main_ret_OPEN_FAIL;
}
} else {
- sptr = (char*)PROG_NAME_STR;
+ csptr = PROG_NAME_STR;
}
if (!(load_params & LOAD_PARAMS_OXBGEN)) {
- memcpy(strcpya(pedname, sptr), ".gen", 5);
+ memcpy(strcpya(pedname, csptr), ".gen", 5);
load_params |= LOAD_PARAMS_OXGEN;
}
// cheating: this is of course more like a .fam file
- memcpy(strcpya(mapname, sptr), ".sample", 8);
+ memcpy(strcpya(mapname, csptr), ".sample", 8);
load_params |= LOAD_PARAMS_OXSAMPLE;
} else if (!memcmp(argptr2, "ecompress", 10)) {
logerrprint("Error: --decompress flag retired. Use e.g. 'gunzip [filename]'.\n");
@@ -6133,11 +6188,11 @@ int32_t main(int32_t argc, char** argv) {
if (enforce_param_ct_range(param_ct, argv[cur_arg], 2, 6)) {
goto main_ret_INVALID_CMDLINE_2A;
}
- if (scan_posint_defcap(argv[cur_arg + 1], &dummy_sample_ct)) {
+ if (scan_uint_defcap(argv[cur_arg + 1], &dummy_sample_ct) || ((!dummy_sample_ct) && (!(misc_flags & MISC_ALLOW_NO_SAMPLES)))) {
logerrprint("Error: Invalid --dummy sample count.\n");
goto main_ret_INVALID_CMDLINE_A;
}
- if (scan_posint_defcap(argv[cur_arg + 2], &dummy_marker_ct)) {
+ if (scan_uint_defcap(argv[cur_arg + 2], &dummy_marker_ct) || ((!dummy_marker_ct) && (!(misc_flags & MISC_ALLOW_NO_VARS)))) {
logerrprint("Error: Invalid --dummy variant count.\n");
goto main_ret_INVALID_CMDLINE_A;
}
@@ -6233,6 +6288,9 @@ int32_t main(int32_t argc, char** argv) {
} else if (condition_mname || condition_fname) {
logerrprint("Error: --dosage does not support --condition/--condition-list.\n");
goto main_ret_INVALID_CMDLINE_A;
+ } else if (misc_flags & (MISC_ALLOW_NO_SAMPLES | MISC_ALLOW_NO_VARS)) {
+ logerrprint("Error: --dosage does not support --allow-no-samples/--allow-no-vars.\n");
+ goto main_ret_INVALID_CMDLINE_A;
}
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 13)) {
goto main_ret_INVALID_CMDLINE_2A;
@@ -6529,16 +6587,16 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_2A;
}
if (param_ct) {
- sptr = argv[cur_arg + 1];
- if (strlen(sptr) > (FNAMESIZE - 5)) {
+ csptr = argv[cur_arg + 1];
+ if (strlen(csptr) > (FNAMESIZE - 5)) {
logerrprint("Error: --file parameter too long.\n");
goto main_ret_OPEN_FAIL;
}
} else {
- sptr = (char*)PROG_NAME_STR;
+ csptr = PROG_NAME_STR;
}
- memcpy(strcpya(pedname, sptr), ".ped", 5);
- memcpy(strcpya(mapname, sptr), ".map", 5);
+ memcpy(strcpya(pedname, csptr), ".ped", 5);
+ memcpy(strcpya(mapname, csptr), ".map", 5);
} else if (!memcmp(argptr2, "am", 3)) {
if (load_params & (LOAD_PARAMS_TEXT_ALL | LOAD_PARAMS_OX_ALL)) {
goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
@@ -7838,22 +7896,44 @@ int32_t main(int32_t argc, char** argv) {
case 'l':
if (!memcmp(argptr2, "file", 5)) {
- if (load_rare || load_params) {
+ if (load_rare || (load_params & (~LOAD_PARAMS_FAM))) {
goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
}
if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
if (param_ct) {
- if (strlen(argv[cur_arg + 1]) > FNAMESIZE - 6) {
+ csptr = argv[cur_arg + 1];
+ if (strlen(csptr) > FNAMESIZE - 6) {
logerrprint("Error: --lfile filename prefix too long.\n");
goto main_ret_OPEN_FAIL;
}
- strcpy(pedname, argv[cur_arg + 1]);
} else {
- memcpy(pedname, PROG_NAME_STR, 6);
+ csptr = PROG_NAME_STR;
+ }
+ memcpy(strcpya(pedname, csptr), ".lgen", 6);
+ memcpy(strcpya(mapname, csptr), ".map", 5);
+ if (!famname[0]) {
+ memcpy(strcpya(famname, csptr), ".fam", 5);
}
load_rare = LOAD_RARE_LGEN;
+ } else if (!memcmp(argptr2, "gen", 4)) {
+ if ((load_rare & (~LOAD_RARE_LGEN)) || (load_params & (~LOAD_PARAMS_FAM))) {
+ goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+ }
+ if ((load_params != LOAD_PARAMS_FAM) && (!load_rare)) {
+ logerrprint("Error: --lgen must be used with --fam or --lfile.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ if (strlen(argv[cur_arg + 1]) > (FNAMESIZE - 1)) {
+ logerrprint("Error: --lgen parameter too long.\n");
+ goto main_ret_OPEN_FAIL;
+ }
+ strcpy(pedname, argv[cur_arg + 1]);
+ load_rare = LOAD_RARE_LGEN;
} else if (!memcmp(argptr2, "oop-assoc", 10)) {
if (pheno_modifier & PHENO_ALL) {
logerrprint("Error: --loop-assoc cannot be used with --all-pheno.\n");
@@ -8015,10 +8095,6 @@ int32_t main(int32_t argc, char** argv) {
}
glm_modifier |= GLM_STANDARD_BETA;
} else if (!strcmp(argv[cur_arg + uii], "intercept")) {
- if (glm_modifier & GLM_LOGISTIC) {
- logerrprint("Error: --logistic does not currently have a 'intercept' modifier. (Did you\nmean --linear or 'beta'?)\n");
- goto main_ret_INVALID_CMDLINE_A;
- }
glm_modifier |= GLM_INTERCEPT;
} else if (!strcmp(argv[cur_arg + uii], "beta")) {
glm_modifier |= GLM_BETA;
@@ -8200,7 +8276,7 @@ int32_t main(int32_t argc, char** argv) {
case 'm':
if (!memcmp(argptr2, "ap", 3)) {
- if (((load_params & (LOAD_PARAMS_BFILE_ALL | LOAD_PARAMS_OX_ALL)) || (load_rare & (~(LOAD_RARE_CNV | LOAD_RARE_GVAR)))) && ((load_rare != LOAD_RARE_DOSAGE) || (load_params != LOAD_PARAMS_FAM))) {
+ if (((load_params & (LOAD_PARAMS_BFILE_ALL | LOAD_PARAMS_OX_ALL)) || (load_rare & (~(LOAD_RARE_CNV | LOAD_RARE_GVAR)))) && ((load_rare != LOAD_RARE_DOSAGE) || (load_params != LOAD_PARAMS_FAM)) && (load_rare != LOAD_RARE_LGEN)) {
goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
}
load_params |= LOAD_PARAMS_MAP;
@@ -9379,6 +9455,14 @@ int32_t main(int32_t argc, char** argv) {
if (retval) {
goto main_ret_NOMEM;
}
+ } else if (!memcmp(argptr2, "eta-analysis-report-dups", 25)) {
+ if (!metaanal_fnames) {
+ logerrprint("Error: --meta-analysis-report-dups must be used with --meta-analysis.\n");
+ // bail out after reporting the error instead of falling through and setting the flag anyway
+ goto main_ret_INVALID_CMDLINE;
+ }
+ metaanal_flags |= METAANAL_REPORT_DUPS;
+ goto main_param_zero;
} else if (!memcmp(argptr2, "ac", 3)) {
UNSTABLE("mac");
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
@@ -9536,6 +9618,13 @@ int32_t main(int32_t argc, char** argv) {
logprint("Note: --nop flag deprecated. Use '--fast-epistasis nop'.\n");
epi_info.modifier |= EPI_FAST_NO_P_VALUE;
goto main_param_zero;
+ } else if (!memcmp(argptr2, "o-const-covar", 14)) {
+ if (!covar_fname) {
+ logerrprint("Error: --no-const-covar must be used with --covar.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
+ covar_modifier |= COVAR_NO_CONST;
+ goto main_param_zero;
} else if (!memcmp(argptr2, "oweb", 5)) {
logprint("Note: --noweb has no effect since no web check is implemented yet.\n");
goto main_param_zero;
@@ -11157,7 +11246,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_param_zero;
} else if (!memcmp(argptr2, "tandard-beta", 13)) {
if (((!(calculation_type & CALC_GLM)) || (glm_modifier & GLM_LOGISTIC)) && (!(dosage_info.modifier & DOSAGE_GLM))) {
- logerrprint("Error: --standard-beta must be used wtih --linear or --dosage.\n");
+ logerrprint("Error: --standard-beta must be used with --linear or --dosage.\n");
goto main_ret_INVALID_CMDLINE_A;
}
logprint("Note: --standard-beta flag deprecated. Use e.g. '--linear standard-beta'.\n");
@@ -11391,7 +11480,7 @@ int32_t main(int32_t argc, char** argv) {
}
dosage_info.modifier += (DOSAGE_SCORE - DOSAGE_GLM);
}
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 8)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 9)) {
goto main_ret_INVALID_CMDLINE_2A;
}
retval = alloc_fname(&score_info.fname, argv[cur_arg + 1], argptr, 0);
@@ -11434,6 +11523,8 @@ int32_t main(int32_t argc, char** argv) {
dosage_info.modifier |= DOSAGE_SCORE_NOSUM;
} else if (!strcmp(argv[cur_arg + uii], "include-cnt")) {
dosage_info.modifier |= DOSAGE_SCORE_CNT;
+ } else if (!strcmp(argv[cur_arg + uii], "double-dosage")) {
+ dosage_info.modifier |= DOSAGE_SCORE_DOUBLE;
} else if (ujj == 3) {
logerrprint("Error: --score takes at most three numeric parameters.\n");
goto main_ret_INVALID_CMDLINE_A;
@@ -11553,6 +11644,9 @@ int32_t main(int32_t argc, char** argv) {
if (g_thread_ct > MAX_THREADS) {
LOGPRINTF("Note: Reducing --threads parameter to %u. (If this is not large enough,\nrecompile with a larger MAX_THREADS setting.)\n", MAX_THREADS);
g_thread_ct = MAX_THREADS;
+ } else if (known_procs == -1) {
+ // trigger BLAS/LAPACK warning
+ known_procs = 0;
}
} else if (!memcmp(argptr2, "ab", 3)) {
logprint("Note: --tab flag deprecated. Use '--recode tab ...'.\n");
@@ -11737,7 +11831,7 @@ int32_t main(int32_t argc, char** argv) {
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
- if (scan_posint_defcap(argv[cur_arg + 1], &thin_keep_ct)) {
+ if (scan_uint_defcap(argv[cur_arg + 1], &thin_keep_ct) || ((!thin_keep_ct) && (!(misc_flags & MISC_ALLOW_NO_VARS)))) {
sprintf(logbuf, "Error: Invalid --thin-count parameter '%s'.\n", argv[cur_arg + 1]);
goto main_ret_INVALID_CMDLINE_WWA;
}
@@ -11767,7 +11861,7 @@ int32_t main(int32_t argc, char** argv) {
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
- if (scan_posint_defcap(argv[cur_arg + 1], &thin_keep_sample_ct)) {
+ if (scan_uint_defcap(argv[cur_arg + 1], &thin_keep_sample_ct) || ((!thin_keep_sample_ct) && (!(misc_flags & MISC_ALLOW_NO_SAMPLES)))) {
sprintf(logbuf, "Error: Invalid --thin-indiv-count parameter '%s'.\n", argv[cur_arg + 1]);
goto main_ret_INVALID_CMDLINE_WWA;
}
@@ -12377,6 +12471,13 @@ int32_t main(int32_t argc, char** argv) {
sprintf(logbuf, "Error: '%s' is not a valid mode for --vcf-half-call.\n", argv[cur_arg + 1]);
goto main_ret_INVALID_CMDLINE_WWA;
}
+ } else if (!memcmp(argptr2, "cf-require-gt", 14)) {
+ if (!(load_rare & (LOAD_RARE_VCF | LOAD_RARE_BCF))) {
+ logerrprint("Error: --vcf-require-gt must be used with --vcf/--bcf.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
+ misc_flags |= MISC_VCF_REQUIRE_GT;
+ goto main_param_zero;
} else {
goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
}
@@ -13054,6 +13155,12 @@ int32_t main(int32_t argc, char** argv) {
if ((calculation_type & CALC_TDT) && ((family_info.tdt_modifier & (TDT_MPERM | TDT_SET_TEST)) == TDT_MPERM)) {
uii++;
}
+ if ((calculation_type & CALC_DFAM) && ((family_info.dfam_modifier & (DFAM_MPERM | DFAM_SET_TEST)) == DFAM_MPERM)) {
+ uii++;
+ }
+
+ // no qfam since that's a nonstandard permutation test
+
if ((calculation_type & CALC_CMH) && ((cluster.modifier & (CLUSTER_CMH_MPERM | CLUSTER_CMH_SET_TEST)) == CLUSTER_CMH_MPERM)) {
uii++;
}
@@ -13078,6 +13185,21 @@ int32_t main(int32_t argc, char** argv) {
model_modifier |= MODEL_PERM;
}
}
+ if (aperm_present && (calculation_type & (CALC_MODEL | CALC_GLM | CALC_TESTMISS | CALC_TDT | CALC_DFAM | CALC_QFAM | CALC_CMH)) &&
+ (!(model_modifier & MODEL_PERM)) &&
+ (!(glm_modifier & GLM_PERM)) &&
+ (!(testmiss_modifier & TESTMISS_PERM)) &&
+ (!(family_info.tdt_modifier & TDT_PERM)) &&
+ (!(family_info.dfam_modifier & DFAM_PERM)) &&
+ (!(family_info.qfam_modifier & QFAM_PERM)) &&
+ (!(cluster.modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_PERM_BD)))) {
+ // --aperm was given together with at least one association analysis
+ // command that supports adaptive permutation testing, yet none of the
+ // *_PERM modifiers checked above is set, so no adaptive permutation test
+ // will actually run and the user is likely to be confused. Warn rather
+ // than error: a sophisticated user may pair --script with --aperm defaults.
+ logerrprint("Warning: --aperm only controls the settings for adaptive permutation tests; it\ndoes not cause such a test to be performed. (Did you forget to add the 'perm'\nmodifier to an association analysis flag?)\n");
+ }
if ((mtest_adjust & (ADJUST_LAMBDA + 1)) == ADJUST_LAMBDA) {
logerrprint("Error: --lambda must be used with --adjust.\n");
goto main_ret_INVALID_CMDLINE_A;
@@ -13113,6 +13235,10 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
+ if ((load_rare == LOAD_RARE_LGEN) && (!mapname[0])) {
+ logerrprint("Error: --lgen must be used with --lfile or --map.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
uii = load_params & LOAD_PARAMS_OX_ALL;
if ((uii == LOAD_PARAMS_OXGEN) || (uii == LOAD_PARAMS_OXBGEN)) {
logerrprint("Error: --gen/--bgen cannot be used without --data or --sample.\n");
@@ -13309,7 +13435,7 @@ int32_t main(int32_t argc, char** argv) {
}
uii = (sptr - outname);
if (load_rare == LOAD_RARE_LGEN) {
- retval = lgen_to_bed(pedname, outname, sptr, missing_pheno, misc_flags, lgen_modifier, lgen_reference_fname, &chrom_info);
+ retval = lgen_to_bed(pedname, mapname, famname, outname, sptr, missing_pheno, misc_flags, lgen_modifier, lgen_reference_fname, &chrom_info);
} else if (load_rare & LOAD_RARE_TRANSPOSE_MASK) {
retval = transposed_to_bed(pedname, famname, outname, sptr, misc_flags, &chrom_info);
} else if (load_rare & LOAD_RARE_VCF) {
@@ -13317,10 +13443,12 @@ int32_t main(int32_t argc, char** argv) {
} else if (load_rare & LOAD_RARE_BCF) {
retval = bcf_to_bed(pedname, outname, sptr, missing_pheno, misc_flags, const_fid, id_delim, vcf_idspace_to, vcf_min_qual, vcf_filter_exceptions_flattened, &chrom_info);
} else if (load_rare == LOAD_RARE_23) {
- retval = bed_from_23(pedname, outname, sptr, modifier_23, fid_23, iid_23, (pheno_23 == HUGE_DOUBLE)? ((double)missing_pheno) : pheno_23, paternal_id_23, maternal_id_23, &chrom_info);
+ retval = bed_from_23(pedname, outname, sptr, modifier_23, fid_23, iid_23, (pheno_23 == HUGE_DOUBLE)? ((double)missing_pheno) : pheno_23, misc_flags, paternal_id_23, maternal_id_23, &chrom_info);
} else if (load_rare & LOAD_RARE_DUMMY) {
retval = generate_dummy(outname, sptr, dummy_flags, dummy_marker_ct, dummy_sample_ct, dummy_missing_geno, dummy_missing_pheno, missing_pheno);
} else if (load_rare & LOAD_RARE_SIMULATE) {
+ // no need to support zero samples/variants here since --dummy takes
+ // care of generating those test cases
retval = simulate_dataset(outname, sptr, simulate_flags, simulate_fname, simulate_cases, simulate_controls, simulate_prevalence, simulate_qt_samples, simulate_missing, simulate_label);
free(simulate_fname);
simulate_fname = NULL;
diff --git a/plink_assoc.c b/plink_assoc.c
index c597a3f..14030dc 100644
--- a/plink_assoc.c
+++ b/plink_assoc.c
@@ -4,6 +4,7 @@
#include "plink_cluster.h"
#include "plink_ld.h"
#include "plink_matrix.h"
+#include "plink_perm.h"
#include "plink_stats.h"
void aperm_init(Aperm_info* apip) {
@@ -562,335 +563,6 @@ int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintp
return retval;
}
-void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
- // Assumes tot_quotient is 2^32 / tot_ct, and
- // totq_magic/totq_preshift/totq_postshift/totq_incr have been precomputed
- // from magic_num().
- uint32_t num_set = 0;
- uint32_t upper_bound = tot_ct * tot_quotient - 1;
- uintptr_t widx;
- uintptr_t wcomp;
- uintptr_t pv_val;
- uint32_t urand;
- uint32_t uii;
- if (set_ct * 2 < tot_ct) {
- fill_ulong_zero(perm_vec, 2 * ((tot_ct + (BITCT - 1)) / BITCT));
- for (; num_set < set_ct; num_set++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
- widx = uii / BITCT2;
- wcomp = ONELU << (2 * (uii % BITCT2));
- pv_val = perm_vec[widx];
- } while (pv_val & wcomp);
- perm_vec[widx] = pv_val | wcomp;
- }
- } else {
- fill_vec_55(perm_vec, tot_ct);
- set_ct = tot_ct - set_ct;
- for (; num_set < set_ct; num_set++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
- widx = uii / BITCT2;
- wcomp = ONELU << (2 * (uii % BITCT2));
- pv_val = perm_vec[widx];
- } while (!(pv_val & wcomp));
- perm_vec[widx] = pv_val - wcomp;
- }
- }
-}
-
-void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
- // generate_cc_perm_vec() variant which uses 1-bit packing instead of 2.
- uint32_t num_set = 0;
- uint32_t upper_bound = tot_ct * tot_quotient - 1;
- uintptr_t widx;
- uintptr_t wcomp;
- uintptr_t pv_val;
- uint32_t urand;
- uint32_t uii;
- if (set_ct * 2 < tot_ct) {
- fill_ulong_zero(perm_vec, (tot_ct + (BITCT - 1)) / BITCT);
- for (; num_set < set_ct; num_set++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
- widx = uii / BITCT;
- wcomp = ONELU << (uii % BITCT);
- pv_val = perm_vec[widx];
- } while (pv_val & wcomp);
- perm_vec[widx] = pv_val | wcomp;
- }
- } else {
- fill_all_bits(perm_vec, tot_ct);
- set_ct = tot_ct - set_ct;
- for (; num_set < set_ct; num_set++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
- widx = uii / BITCT;
- wcomp = ONELU << (uii % BITCT);
- pv_val = perm_vec[widx];
- } while (!(pv_val & wcomp));
- perm_vec[widx] = pv_val - wcomp;
- }
- }
-}
-
-void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
- uint32_t tot_ctl2 = 2 * ((tot_ct + (BITCT - 1)) / BITCT);
- uint32_t cluster_idx;
- uint32_t target_ct;
- uint32_t cluster_end;
- uint32_t* map_ptr;
- uint32_t num_swapped;
- uint32_t cluster_size;
- uint32_t upper_bound;
- uint64_t totq_magic;
- uint32_t totq_preshift;
- uint32_t totq_postshift;
- uint32_t totq_incr;
- uintptr_t widx;
- uintptr_t wcomp;
- uintptr_t pv_val;
- uint32_t urand;
- uint32_t uii;
- memcpy(perm_vec, preimage, tot_ctl2 * sizeof(intptr_t));
- for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
- target_ct = cluster_case_cts[cluster_idx];
- cluster_end = cluster_starts[cluster_idx + 1];
- cluster_size = cluster_end - cluster_starts[cluster_idx];
- if (target_ct && (target_ct != cluster_size)) {
- upper_bound = cluster_size * tot_quotients[cluster_idx] - 1;
- totq_magic = totq_magics[cluster_idx];
- totq_preshift = totq_preshifts[cluster_idx];
- totq_postshift = totq_postshifts[cluster_idx];
- totq_incr = totq_incrs[cluster_idx];
- map_ptr = &(cluster_map[cluster_starts[cluster_idx]]);
- if (target_ct * 2 < cluster_size) {
- for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
- widx = uii / BITCT2;
- wcomp = ONELU << (2 * (uii % BITCT2));
- pv_val = perm_vec[widx];
- } while (pv_val & wcomp);
- perm_vec[widx] = pv_val | wcomp;
- }
- } else {
- target_ct = cluster_size - target_ct;
- for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
- widx = uii / BITCT2;
- wcomp = ONELU << (2 * (uii % BITCT2));
- pv_val = perm_vec[widx];
- } while (!(pv_val & wcomp));
- perm_vec[widx] = pv_val - wcomp;
- }
- }
- }
- }
-}
-
-void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
- uint32_t tot_ctl = (tot_ct + (BITCT - 1)) / BITCT;
- uint32_t cluster_idx;
- uint32_t target_ct;
- uint32_t cluster_end;
- uint32_t cluster_size;
- uint32_t* map_ptr;
- uint32_t num_swapped;
- uint32_t upper_bound;
- uint64_t totq_magic;
- uint32_t totq_preshift;
- uint32_t totq_postshift;
- uint32_t totq_incr;
- uintptr_t widx;
- uintptr_t wcomp;
- uintptr_t pv_val;
- uint32_t urand;
- uint32_t uii;
- memcpy(perm_vec, preimage, tot_ctl * sizeof(intptr_t));
- for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
- target_ct = cluster_case_cts[cluster_idx];
- cluster_end = cluster_starts[cluster_idx + 1];
- cluster_size = cluster_end - cluster_starts[cluster_idx];
- if (target_ct && (target_ct != cluster_size)) {
- upper_bound = cluster_size * tot_quotients[cluster_idx] - 1;
- totq_magic = totq_magics[cluster_idx];
- totq_preshift = totq_preshifts[cluster_idx];
- totq_postshift = totq_postshifts[cluster_idx];
- totq_incr = totq_incrs[cluster_idx];
- map_ptr = &(cluster_map[cluster_starts[cluster_idx]]);
- if (target_ct * 2 < cluster_size) {
- for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
- widx = uii / BITCT;
- wcomp = ONELU << (uii % BITCT);
- pv_val = perm_vec[widx];
- } while (pv_val & wcomp);
- perm_vec[widx] = pv_val | wcomp;
- }
- } else {
- target_ct = cluster_size - target_ct;
- for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
- do {
- do {
- urand = sfmt_genrand_uint32(sfmtp);
- } while (urand > upper_bound);
- uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
- widx = uii / BITCT;
- wcomp = ONELU << (uii % BITCT);
- pv_val = perm_vec[widx];
- } while (!(pv_val & wcomp));
- perm_vec[widx] = pv_val - wcomp;
- }
- }
- }
- }
-}
-
-void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
- // Transpose permutations so PRESTO/PERMORY-style genotype indexing can work.
- //
- // We used a 32-ply interleaved format, to allow counts up to the uint32_t
- // limit without giving up highly parallel adds in the calc_git() inner loop.
- // The index order used here is:
- // 64-bit build:
- // first 16 bytes: 0 32 64 96 16 48 80 112 4 36 68 100 20 52 84 116
- // 8 40 72 104 24 56 88 120 12 44 76 108 28 60 92 124 1...
- // next 16 bytes: 128 160 192...
- //
- // 32-bit build:
- // first 4 bytes: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18...
- // next 4 bytes: 32 40 48...
- uintptr_t sample_idx = 0;
- uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT);
-#ifdef __LP64__
- uint32_t wbuf[4];
- uint32_t* wbptr;
-#else
- uint32_t wval;
-#endif
- uint32_t rshift;
- uint32_t wshift;
- uintptr_t* pvptr;
- uintptr_t perm_idx;
- for (; sample_idx < pheno_nm_ct; sample_idx++) {
- perm_idx = 0;
- pvptr = &(perm_vecs[sample_idx / BITCT2]);
- rshift = 2 * (sample_idx % BITCT2);
- goto transpose_perms_loop_start;
-#ifdef __LP64__
- do {
- if (!(perm_idx % 4)) {
- if (perm_idx % 128) {
- wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1);
- } else {
- memcpy(perm_vecst, wbuf, 16);
- perm_vecst = &(perm_vecst[4]);
- transpose_perms_loop_start:
- fill_uint_zero(wbuf, 4);
- wshift = 0;
- }
- wbptr = wbuf;
- }
- *wbptr |= ((pvptr[perm_idx * pheno_nm_ctl2] >> rshift) & 1) << wshift;
- wbptr++;
- } while (++perm_idx < perm_vec_ct);
- memcpy(perm_vecst, wbuf, 16);
- perm_vecst = &(perm_vecst[4]);
-#else
- do {
- if (perm_idx % 32) {
- wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3);
- } else {
- *perm_vecst++ = wval;
- transpose_perms_loop_start:
- wval = 0;
- wshift = 0;
- }
- wval |= ((pvptr[perm_idx * pheno_nm_ctl2] >> rshift) & 1) << wshift;
- } while (++perm_idx < perm_vec_ct);
- *perm_vecst++ = wval;
-#endif
- }
-}
-
-void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
- uintptr_t sample_idx = 0;
- uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT;
-#ifdef __LP64__
- uint32_t wbuf[4];
- uint32_t* wbptr;
-#else
- uint32_t wval;
-#endif
- uint32_t rshift;
- uint32_t wshift;
- uintptr_t* pvptr;
- uintptr_t perm_idx;
- for (; sample_idx < pheno_nm_ct; sample_idx++) {
- perm_idx = 0;
- pvptr = &(perm_vecs[sample_idx / BITCT]);
- rshift = sample_idx % BITCT;
- goto transpose_perm1s_loop_start;
-#ifdef __LP64__
- do {
- if (!(perm_idx % 4)) {
- if (perm_idx % 128) {
- wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1);
- } else {
- memcpy(perm_vecst, wbuf, 16);
- perm_vecst = &(perm_vecst[4]);
- transpose_perm1s_loop_start:
- fill_uint_zero(wbuf, 2);
- wshift = 0;
- }
- wbptr = wbuf;
- }
- *wbptr |= ((pvptr[perm_idx * pheno_nm_ctl] >> rshift) & 1) << wshift;
- wbptr++;
- } while (++perm_idx < perm_vec_ct);
- memcpy(perm_vecst, wbuf, 16);
- perm_vecst = &(perm_vecst[4]);
-#else
- do {
- if (perm_idx % 32) {
- wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3);
- } else {
- *perm_vecst++ = wval;
- transpose_perm1s_loop_start:
- wval = 0;
- wshift = 0;
- }
- wval |= ((pvptr[perm_idx * pheno_nm_ctl] >> rshift) & 1) << wshift;
- } while (++perm_idx < perm_vec_ct);
- *perm_vecst++ = wval;
-#endif
- }
-}
-
char* model_assoc_tna(uint32_t model_fisher, char* wptr) {
// write terminal NAs to buffer
if (model_fisher) {
@@ -924,33 +596,19 @@ void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict_
uint32_t perm_ct128x4 = perm_ct128 * 4;
uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
uint32_t perm_ct16x4 = 4 * perm_ct16;
- const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
- const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU};
__m128i* permsv = (__m128i*)perm_vecst;
__m128i* gitv[9];
- __m128i* __restrict__ git_merge4; // no conflicts, please
- __m128i* __restrict__ git_merge8;
- __m128i* __restrict__ git_write;
- __m128i* __restrict__ perm_ptr;
- __m128i loader;
#else
uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
uint32_t perm_ct32x4 = perm_ct32 * 4;
uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
uint32_t perm_ct16x16 = 16 * perm_ct16;
- uint32_t* permsv = perm_vecst;
- uint32_t* gitv[9];
- uint32_t* git_merge4;
- uint32_t* git_merge8;
- uint32_t* git_write;
- uint32_t* perm_ptr;
- uintptr_t loader;
+ uintptr_t* permsv = (uintptr_t*)perm_vecst;
+ uintptr_t* gitv[9];
#endif
uint32_t cur_cts[3];
uintptr_t ulii;
- uint32_t pbidx;
uint32_t uii;
uint32_t ujj;
uint32_t ukk;
@@ -967,15 +625,15 @@ void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict_
gitv[7] = &(((__m128i*)results_bufs)[perm_ct16x4]);
gitv[8] = (__m128i*)results_bufs;
#else
- gitv[0] = thread_wkspace;
- gitv[1] = &(thread_wkspace[perm_ct32x4]);
- gitv[2] = &(thread_wkspace[2 * perm_ct32x4]);
- gitv[3] = &(thread_wkspace[3 * perm_ct32x4]);
- gitv[4] = &(thread_wkspace[3 * perm_ct32x4 + 2 * perm_ct8]);
- gitv[5] = &(thread_wkspace[3 * perm_ct32x4 + 4 * perm_ct8]);
- gitv[6] = &(results_bufs[2 * perm_ct16x16]);
- gitv[7] = &(results_bufs[perm_ct16x16]);
- gitv[8] = results_bufs;
+ gitv[0] = (uintptr_t*)thread_wkspace;
+ gitv[1] = (uintptr_t*)(&(thread_wkspace[perm_ct32x4]));
+ gitv[2] = (uintptr_t*)(&(thread_wkspace[2 * perm_ct32x4]));
+ gitv[3] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4]));
+ gitv[4] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4 + 2 * perm_ct8]));
+ gitv[5] = (uintptr_t*)(&(thread_wkspace[3 * perm_ct32x4 + 4 * perm_ct8]));
+ gitv[6] = (uintptr_t*)(&(results_bufs[2 * perm_ct16x16]));
+ gitv[7] = (uintptr_t*)(&(results_bufs[perm_ct16x16]));
+ gitv[8] = (uintptr_t*)results_bufs;
#endif
cur_cts[0] = 0;
cur_cts[1] = 0;
@@ -991,77 +649,22 @@ void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict_
while (ulii) {
ujj = CTZLU(ulii) & (BITCT - 2); // get pos of next non-[hom A2] sample
sample_type = ((ulii >> ujj) & 3) - 1;
- git_merge4 = gitv[sample_type];
-#ifdef __LP64__
- perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
- for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
- loader = *perm_ptr++;
- git_merge4[0] = _mm_add_epi64(git_merge4[0], _mm_and_si128(loader, m1x4));
- git_merge4[1] = _mm_add_epi64(git_merge4[1], _mm_and_si128(_mm_srli_epi64(loader, 1), m1x4));
- git_merge4[2] = _mm_add_epi64(git_merge4[2], _mm_and_si128(_mm_srli_epi64(loader, 2), m1x4));
- git_merge4[3] = _mm_add_epi64(git_merge4[3], _mm_and_si128(_mm_srli_epi64(loader, 3), m1x4));
- git_merge4 = &(git_merge4[4]);
- }
ukk = cur_cts[sample_type] + 1;
cur_cts[sample_type] = ukk;
+#ifdef __LP64__
+ unroll_incr_1_4(&(permsv[(ujj / 2) * perm_ct128]), gitv[sample_type], perm_ct128);
if (!(ukk % 15)) {
- git_merge4 = gitv[sample_type];
- git_merge8 = gitv[sample_type + 3];
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *git_merge4;
- git_merge8[0] = _mm_add_epi64(git_merge8[0], _mm_and_si128(loader, m4));
- git_merge8[1] = _mm_add_epi64(git_merge8[1], _mm_and_si128(_mm_srli_epi64(loader, 4), m4));
- git_merge8 = &(git_merge8[2]);
- *git_merge4++ = _mm_setzero_si128();
- }
+ unroll_zero_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct32);
if (!(ukk % 255)) {
- git_merge8 = gitv[sample_type + 3];
- git_write = gitv[sample_type + 6];
- for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
- loader = *git_merge8;
- git_write[0] = _mm_add_epi64(git_write[0], _mm_and_si128(loader, m8x32));
- git_write[1] = _mm_add_epi64(git_write[1], _mm_and_si128(_mm_srli_epi64(loader, 8), m8x32));
- git_write[2] = _mm_add_epi64(git_write[2], _mm_and_si128(_mm_srli_epi64(loader, 16), m8x32));
- git_write[3] = _mm_add_epi64(git_write[3], _mm_and_si128(_mm_srli_epi64(loader, 24), m8x32));
- git_write = &(git_write[4]);
- *git_merge8++ = _mm_setzero_si128();
- }
+ unroll_zero_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct16);
}
}
#else
- perm_ptr = &(permsv[(ujj / 2) * perm_ct32]);
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *perm_ptr++;
- git_merge4[0] += loader & 0x11111111;
- git_merge4[1] += (loader >> 1) & 0x11111111;
- git_merge4[2] += (loader >> 2) & 0x11111111;
- git_merge4[3] += (loader >> 3) & 0x11111111;
- git_merge4 = &(git_merge4[4]);
- }
- ukk = cur_cts[sample_type] + 1;
- cur_cts[sample_type] = ukk;
+ unroll_incr_1_4(&(permsv[(ujj / 2) * perm_ct32]), gitv[sample_type], perm_ct32);
if (!(ukk % 15)) {
- git_merge4 = gitv[sample_type];
- git_merge8 = gitv[sample_type + 3];
- for (pbidx = 0; pbidx < perm_ct8; pbidx++) {
- loader = *git_merge4;
- git_merge8[0] += loader & 0x0f0f0f0f;
- git_merge8[1] += (loader >> 4) & 0x0f0f0f0f;
- git_merge8 = &(git_merge8[2]);
- *git_merge4++ = 0;
- }
+ unroll_zero_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct8);
if (!(ukk % 255)) {
- git_merge8 = gitv[sample_type + 3];
- git_write = gitv[sample_type + 6];
- for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
- loader = *git_merge8;
- git_write[0] += loader & 0x000000ff;
- git_write[1] += (loader >> 8) & 0x000000ff;
- git_write[2] += (loader >> 16) & 0x000000ff;
- git_write[3] += loader >> 24;
- git_write = &(git_write[4]);
- *git_merge8++ = 0;
- }
+ unroll_zero_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct4);
}
}
#endif
@@ -1077,49 +680,17 @@ void calc_git(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __restrict_
uii = cur_cts[sample_type];
#ifdef __LP64__
if (uii % 15) {
- git_merge4 = gitv[sample_type];
- git_merge8 = gitv[sample_type + 3];
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *git_merge4++;
- git_merge8[0] = _mm_add_epi64(git_merge8[0], _mm_and_si128(loader, m4));
- git_merge8[1] = _mm_add_epi64(git_merge8[1], _mm_and_si128(_mm_srli_epi64(loader, 4), m4));
- git_merge8 = &(git_merge8[2]);
- }
+ unroll_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct32);
}
if (uii % 255) {
- git_merge8 = gitv[sample_type + 3];
- git_write = gitv[sample_type + 6];
- for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
- loader = *git_merge8++;
- git_write[0] = _mm_add_epi64(git_write[0], _mm_and_si128(loader, m8x32));
- git_write[1] = _mm_add_epi64(git_write[1], _mm_and_si128(_mm_srli_epi64(loader, 8), m8x32));
- git_write[2] = _mm_add_epi64(git_write[2], _mm_and_si128(_mm_srli_epi64(loader, 16), m8x32));
- git_write[3] = _mm_add_epi64(git_write[3], _mm_and_si128(_mm_srli_epi64(loader, 24), m8x32));
- git_write = &(git_write[4]);
- }
+ unroll_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct16);
}
#else
if (uii % 15) {
- git_merge4 = gitv[sample_type];
- git_merge8 = gitv[sample_type + 3];
- for (pbidx = 0; pbidx < perm_ct8; pbidx++) {
- loader = *git_merge4++;
- git_merge8[0] += loader & 0x0f0f0f0f;
- git_merge8[1] += (loader >> 4) & 0x0f0f0f0f;
- git_merge8 = &(git_merge8[2]);
- }
+ unroll_incr_4_8(gitv[sample_type], gitv[sample_type + 3], perm_ct8);
}
if (uii % 255) {
- git_merge8 = gitv[sample_type + 3];
- git_write = gitv[sample_type + 6];
- for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
- loader = *git_merge8++;
- git_write[0] += loader & 0x000000ff;
- git_write[1] += (loader >> 8) & 0x000000ff;
- git_write[2] += (loader >> 16) & 0x000000ff;
- git_write[3] += loader >> 24;
- git_write = &(git_write[4]);
- }
+ unroll_incr_8_32(gitv[sample_type + 3], gitv[sample_type + 6], perm_ct4);
}
#endif
}
@@ -1621,20 +1192,6 @@ uintptr_t qrem_cost2(uintptr_t sample_ctl2, uintptr_t* loadbuf1, uintptr_t* load
}
#ifdef __LP64__
-static inline void calc_rem_merge4_one(uint32_t perm_ct128, __m128i* __restrict__ perm_ptr, __m128i* __restrict__ rem_merge4) {
- const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
- __m128i loader;
- uint32_t pbidx;
- for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
- loader = *perm_ptr++;
- rem_merge4[0] = _mm_add_epi64(rem_merge4[0], _mm_and_si128(loader, m1x4));
- rem_merge4[1] = _mm_add_epi64(rem_merge4[1], _mm_and_si128(_mm_srli_epi64(loader, 1), m1x4));
- rem_merge4[2] = _mm_add_epi64(rem_merge4[2], _mm_and_si128(_mm_srli_epi64(loader, 2), m1x4));
- rem_merge4[3] = _mm_add_epi64(rem_merge4[3], _mm_and_si128(_mm_srli_epi64(loader, 3), m1x4));
- rem_merge4 = &(rem_merge4[4]);
- }
-}
-
static inline void calc_rem_merge4_two(uint32_t perm_ct128, __m128i* __restrict__ perm_ptr, __m128i* __restrict__ rem_merge4a, __m128i* __restrict__ rem_merge4b) {
const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
__m128i loader;
@@ -1645,13 +1202,16 @@ static inline void calc_rem_merge4_two(uint32_t perm_ct128, __m128i* __restrict_
loader2 = _mm_and_si128(loader, m1x4);
rem_merge4a[0] = _mm_add_epi64(rem_merge4a[0], loader2);
rem_merge4b[0] = _mm_add_epi64(rem_merge4b[0], loader2);
- loader2 = _mm_and_si128(_mm_srli_epi64(loader, 1), m1x4);
+ loader = _mm_srli_epi64(loader, 1);
+ loader2 = _mm_and_si128(loader, m1x4);
rem_merge4a[1] = _mm_add_epi64(rem_merge4a[1], loader2);
rem_merge4b[1] = _mm_add_epi64(rem_merge4b[1], loader2);
- loader2 = _mm_and_si128(_mm_srli_epi64(loader, 2), m1x4);
+ loader = _mm_srli_epi64(loader, 1);
+ loader2 = _mm_and_si128(loader, m1x4);
rem_merge4a[2] = _mm_add_epi64(rem_merge4a[2], loader2);
rem_merge4b[2] = _mm_add_epi64(rem_merge4b[2], loader2);
- loader2 = _mm_and_si128(_mm_srli_epi64(loader, 3), m1x4);
+ loader = _mm_srli_epi64(loader, 1);
+ loader2 = _mm_and_si128(loader, m1x4);
rem_merge4a[3] = _mm_add_epi64(rem_merge4a[3], loader2);
rem_merge4b[3] = _mm_add_epi64(rem_merge4b[3], loader2);
rem_merge4a = &(rem_merge4a[4]);
@@ -1659,34 +1219,6 @@ static inline void calc_rem_merge4_two(uint32_t perm_ct128, __m128i* __restrict_
}
}
-static inline void calc_rem_merge8(uint32_t perm_ct32, __m128i* __restrict__ rem_merge4, __m128i* __restrict__ rem_merge8) {
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
- __m128i loader;
- uint32_t pbidx;
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *rem_merge4;
- rem_merge8[0] = _mm_add_epi64(rem_merge8[0], _mm_and_si128(loader, m4));
- rem_merge8[1] = _mm_add_epi64(rem_merge8[1], _mm_and_si128(_mm_srli_epi64(loader, 4), m4));
- rem_merge8 = &(rem_merge8[2]);
- *rem_merge4++ = _mm_setzero_si128();
- }
-}
-
-static inline void calc_rem_merge32_plus(uint32_t perm_ct16, __m128i* __restrict__ rem_merge8, __m128i* rem_write) {
- const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU};
- __m128i loader;
- uint32_t pbidx;
- for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
- loader = *rem_merge8;
- rem_write[0] = _mm_add_epi64(rem_write[0], _mm_and_si128(loader, m8x32));
- rem_write[1] = _mm_add_epi64(rem_write[1], _mm_and_si128(_mm_srli_epi64(loader, 8), m8x32));
- rem_write[2] = _mm_add_epi64(rem_write[2], _mm_and_si128(_mm_srli_epi64(loader, 16), m8x32));
- rem_write[3] = _mm_add_epi64(rem_write[3], _mm_and_si128(_mm_srli_epi64(loader, 24), m8x32));
- rem_write = &(rem_write[4]);
- *rem_merge8++ = _mm_setzero_si128();
- }
-}
-
static inline void calc_rem_merge32_minus(uint32_t perm_ct16, __m128i* __restrict__ rem_merge8, __m128i* rem_write) {
// temporary integer underflow is possible here, but by the end of the
// calculation it should be reversed
@@ -1696,27 +1228,17 @@ static inline void calc_rem_merge32_minus(uint32_t perm_ct16, __m128i* __restric
for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
loader = *rem_merge8;
rem_write[0] = _mm_sub_epi64(rem_write[0], _mm_and_si128(loader, m8x32));
- rem_write[1] = _mm_sub_epi64(rem_write[1], _mm_and_si128(_mm_srli_epi64(loader, 8), m8x32));
- rem_write[2] = _mm_sub_epi64(rem_write[2], _mm_and_si128(_mm_srli_epi64(loader, 16), m8x32));
- rem_write[3] = _mm_sub_epi64(rem_write[3], _mm_and_si128(_mm_srli_epi64(loader, 24), m8x32));
+ loader = _mm_srli_epi64(loader, 8);
+ rem_write[1] = _mm_sub_epi64(rem_write[1], _mm_and_si128(loader, m8x32));
+ loader = _mm_srli_epi64(loader, 8);
+ rem_write[2] = _mm_sub_epi64(rem_write[2], _mm_and_si128(loader, m8x32));
+ loader = _mm_srli_epi64(loader, 8);
+ rem_write[3] = _mm_sub_epi64(rem_write[3], _mm_and_si128(loader, m8x32));
rem_write = &(rem_write[4]);
*rem_merge8++ = _mm_setzero_si128();
}
}
#else
-static inline void calc_rem_merge4_one(uint32_t perm_ct32, uintptr_t* __restrict__ perm_ptr, uintptr_t* __restrict__ rem_merge4) {
- uintptr_t loader;
- uint32_t pbidx;
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *perm_ptr++;
- rem_merge4[0] += loader & 0x11111111;
- rem_merge4[1] += (loader >> 1) & 0x11111111;
- rem_merge4[2] += (loader >> 2) & 0x11111111;
- rem_merge4[3] += (loader >> 3) & 0x11111111;
- rem_merge4 = &(rem_merge4[4]);
- }
-}
-
static inline void calc_rem_merge4_two(uint32_t perm_ct32, uintptr_t* __restrict__ perm_ptr, uintptr_t* __restrict__ rem_merge4a, uintptr_t* __restrict__ rem_merge4b) {
uintptr_t loader;
uintptr_t loader2;
@@ -1740,41 +1262,18 @@ static inline void calc_rem_merge4_two(uint32_t perm_ct32, uintptr_t* __restrict
}
}
-static inline void calc_rem_merge8(uint32_t perm_ct8, uintptr_t* __restrict__ rem_merge4, uintptr_t* __restrict__ rem_merge8) {
- uintptr_t loader;
- uint32_t pbidx;
- for (pbidx = 0; pbidx < perm_ct8; pbidx++) {
- loader = *rem_merge4;
- rem_merge8[0] += loader & 0x0f0f0f0f;
- rem_merge8[1] += (loader >> 4) & 0x0f0f0f0f;
- rem_merge8 = &(rem_merge8[2]);
- *rem_merge4++ = 0;
- }
-}
-
-static inline void calc_rem_merge32_plus(uint32_t perm_ct4, uintptr_t* __restrict__ rem_merge8, uintptr_t* __restrict__ rem_write) {
- uintptr_t loader;
- uint32_t pbidx;
- for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
- loader = *rem_merge8;
- rem_write[0] += loader & 0x000000ff;
- rem_write[1] += (loader >> 8) & 0x000000ff;
- rem_write[2] += (loader >> 16) & 0x000000ff;
- rem_write[3] += loader >> 24;
- rem_write = &(rem_write[4]);
- *rem_merge8++ = 0;
- }
-}
-
static inline void calc_rem_merge32_minus(uint32_t perm_ct4, uintptr_t* __restrict__ rem_merge8, uintptr_t* __restrict__ rem_write) {
uintptr_t loader;
uint32_t pbidx;
for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
loader = *rem_merge8;
- rem_write[0] -= loader & 0x000000ff;
- rem_write[1] -= (loader >> 8) & 0x000000ff;
- rem_write[2] -= (loader >> 16) & 0x000000ff;
- rem_write[3] -= loader >> 24;
+ rem_write[0] -= (uint8_t)loader;
+ loader >>= 8;
+ rem_write[1] -= (uint8_t)loader;
+ loader >>= 8;
+ rem_write[2] -= (uint8_t)loader;
+ loader >>= 8;
+ rem_write[3] -= loader;
rem_write = &(rem_write[4]);
*rem_merge8++ = 0;
}
@@ -1863,13 +1362,13 @@ void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, u
#ifdef __LP64__
perm_ptr = &(permsv[(ujj / 2) * perm_ct128]);
if (!idx2) {
- calc_rem_merge4_one(perm_ct128, perm_ptr, remv[idx1]);
+ unroll_incr_1_4(perm_ptr, remv[idx1], perm_ct128);
} else {
calc_rem_merge4_two(perm_ct128, perm_ptr, remv[idx1], remv[idx2]);
ukk = cur_cts[idx2] + 1;
cur_cts[idx2] = ukk;
if (!(ukk % 15)) {
- calc_rem_merge8(perm_ct32, remv[idx2], remv[idx2 + 6]);
+ unroll_zero_incr_4_8(remv[idx2], remv[idx2 + 6], perm_ct32);
if (!(ukk % 255)) {
calc_rem_merge32_minus(perm_ct16, remv[idx2 + 6], remv[(idx2 / 2) + 12]);
}
@@ -1878,10 +1377,10 @@ void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, u
ukk = cur_cts[idx1] + 1;
cur_cts[idx1] = ukk;
if (!(ukk % 15)) {
- calc_rem_merge8(perm_ct32, remv[idx1], remv[idx1 + 6]);
+ unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct32);
if (!(ukk % 255)) {
if (!(idx1 & 1)) {
- calc_rem_merge32_plus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
+ unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct16);
} else {
calc_rem_merge32_minus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
}
@@ -1890,13 +1389,13 @@ void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, u
#else
perm_ptr = &(permsv[(ujj / 2) * perm_ct32]);
if (!idx2) {
- calc_rem_merge4_one(perm_ct32, perm_ptr, remv[idx1]);
+ unroll_incr_1_4(perm_ptr, remv[idx1], perm_ct32);
} else {
calc_rem_merge4_two(perm_ct32, perm_ptr, remv[idx1], remv[idx2]);
ukk = cur_cts[idx2] + 1;
cur_cts[idx2] = ukk;
if (!(ukk % 15)) {
- calc_rem_merge8(perm_ct8, remv[idx2], remv[idx2 + 6]);
+ unroll_zero_incr_4_8(remv[idx2], remv[idx2 + 6], perm_ct8);
if (!(ukk % 255)) {
calc_rem_merge32_minus(perm_ct4, remv[idx2 + 6], remv[(idx2 / 2) + 12]);
}
@@ -1905,10 +1404,10 @@ void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, u
ukk = cur_cts[idx1] + 1;
cur_cts[idx1] = ukk;
if (!(ukk % 15)) {
- calc_rem_merge8(perm_ct8, remv[idx1], remv[idx1 + 6]);
+ unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct8);
if (!(ukk % 255)) {
if (!(idx1 & 1)) {
- calc_rem_merge32_plus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
+ unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct4);
} else {
calc_rem_merge32_minus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
}
@@ -1927,22 +1426,23 @@ void calc_rem(uint32_t pheno_nm_ct, uintptr_t perm_vec_ct, uintptr_t* loadbuf, u
uii = cur_cts[idx1];
#ifdef __LP64__
if (uii % 15) {
- calc_rem_merge8(perm_ct32, remv[idx1], remv[idx1 + 6]);
+ // todo: check if zeroing needed
+ unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct32);
}
if (uii % 255) {
if (!(idx1 & 1)) {
- calc_rem_merge32_plus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
+ unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct16);
} else {
calc_rem_merge32_minus(perm_ct16, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
}
}
#else
if (uii % 15) {
- calc_rem_merge8(perm_ct8, remv[idx1], remv[idx1 + 6]);
+ unroll_zero_incr_4_8(remv[idx1], remv[idx1 + 6], perm_ct8);
}
if (uii % 255) {
if (!(idx1 & 1)) {
- calc_rem_merge32_plus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
+ unroll_zero_incr_8_32(remv[idx1 + 6], remv[(idx1 / 2) + 12], perm_ct4);
} else {
calc_rem_merge32_minus(perm_ct4, remv[idx1 + 6], remv[(idx1 / 2) + 12]);
}
@@ -10438,32 +9938,18 @@ void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __r
uint32_t perm_ct128 = (perm_vec_ct + 127) / 128;
uint32_t perm_ct128x4 = perm_ct128 * 4;
uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
- const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
- const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
- const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU};
__m128i* permsv = (__m128i*)perm_vecst;
__m128i* gitv[3];
- __m128i* __restrict__ git_merge4;
- __m128i* __restrict__ git_merge8;
- __m128i* __restrict__ git_write;
- __m128i* __restrict__ perm_ptr;
- __m128i loader;
#else
uint32_t perm_ct32 = (perm_vec_ct + 31) / 32;
uint32_t perm_ct32x4 = perm_ct32 * 4;
uint32_t perm_ct8 = (perm_vec_ct + 7) / 8;
uint32_t perm_ct4 = (perm_vec_ct + 3) / 4;
- uint32_t* permsv = perm_vecst;
- uint32_t* gitv[3];
- uint32_t* git_merge4;
- uint32_t* git_merge8;
- uint32_t* git_write;
- uint32_t* perm_ptr;
- uintptr_t loader;
+ uintptr_t* permsv = (uintptr_t*)perm_vecst;
+ uintptr_t* gitv[3];
#endif
uint32_t cur_ct;
uintptr_t ulii;
- uint32_t pbidx;
uint32_t uii;
uint32_t ujj;
#ifdef __LP64__
@@ -10472,9 +9958,9 @@ void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __r
gitv[1] = &(((__m128i*)thread_wkspace)[9 * perm_ct128x4]);
gitv[2] = (__m128i*)thread_wkspace;
#else
- gitv[0] = &(thread_wkspace[8 * perm_ct32x4]);
- gitv[1] = &(thread_wkspace[9 * perm_ct32x4]);
- gitv[2] = thread_wkspace;
+ gitv[0] = (uintptr_t*)(&(thread_wkspace[8 * perm_ct32x4]));
+ gitv[1] = (uintptr_t*)(&(thread_wkspace[9 * perm_ct32x4]));
+ gitv[2] = (uintptr_t*)thread_wkspace;
#endif
cur_ct = 0;
for (uii = 0; uii < pheno_nm_ctl; uii++) {
@@ -10487,75 +9973,21 @@ void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __r
}
while (ulii) {
ujj = CTZLU(ulii);
- git_merge4 = gitv[0];
-#ifdef __LP64__
- perm_ptr = &(permsv[ujj * perm_ct128]);
- for (pbidx = 0; pbidx < perm_ct128; pbidx++) {
- loader = *perm_ptr++;
- git_merge4[0] = _mm_add_epi64(git_merge4[0], _mm_and_si128(loader, m1x4));
- git_merge4[1] = _mm_add_epi64(git_merge4[1], _mm_and_si128(_mm_srli_epi64(loader, 1), m1x4));
- git_merge4[2] = _mm_add_epi64(git_merge4[2], _mm_and_si128(_mm_srli_epi64(loader, 2), m1x4));
- git_merge4[3] = _mm_add_epi64(git_merge4[3], _mm_and_si128(_mm_srli_epi64(loader, 3), m1x4));
- git_merge4 = &(git_merge4[4]);
- }
cur_ct++;
+#ifdef __LP64__
+ unroll_incr_1_4(&(permsv[ujj * perm_ct128]), gitv[0], perm_ct128);
if (!(cur_ct % 15)) {
- git_merge4 = gitv[0];
- git_merge8 = gitv[1];
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *git_merge4;
- git_merge8[0] = _mm_add_epi64(git_merge8[0], _mm_and_si128(loader, m4));
- git_merge8[1] = _mm_add_epi64(git_merge8[1], _mm_and_si128(_mm_srli_epi64(loader, 4), m4));
- git_merge8 = &(git_merge8[2]);
- *git_merge4++ = _mm_setzero_si128();
- }
+ unroll_zero_incr_4_8(gitv[0], gitv[1], perm_ct32);
if (!(cur_ct % 255)) {
- git_merge8 = gitv[1];
- git_write = gitv[2];
- for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
- loader = *git_merge8;
- git_write[0] = _mm_add_epi64(git_write[0], _mm_and_si128(loader, m8x32));
- git_write[1] = _mm_add_epi64(git_write[1], _mm_and_si128(_mm_srli_epi64(loader, 8), m8x32));
- git_write[2] = _mm_add_epi64(git_write[2], _mm_and_si128(_mm_srli_epi64(loader, 16), m8x32));
- git_write[3] = _mm_add_epi64(git_write[3], _mm_and_si128(_mm_srli_epi64(loader, 24), m8x32));
- git_write = &(git_write[4]);
- *git_merge8++ = _mm_setzero_si128();
- }
+ unroll_zero_incr_8_32(gitv[1], gitv[2], perm_ct16);
}
}
#else
- perm_ptr = &(permsv[ujj * perm_ct32]);
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *perm_ptr++;
- git_merge4[0] += loader & 0x11111111;
- git_merge4[1] += (loader >> 1) & 0x11111111;
- git_merge4[2] += (loader >> 2) & 0x11111111;
- git_merge4[3] += (loader >> 3) & 0x11111111;
- git_merge4 = &(git_merge4[4]);
- }
- cur_ct++;
+ unroll_incr_1_4(&(permsv[ujj * perm_ct32]), gitv[0], perm_ct32);
if (!(cur_ct % 15)) {
- git_merge4 = gitv[0];
- git_merge8 = gitv[1];
- for (pbidx = 0; pbidx < perm_ct8; pbidx++) {
- loader = *git_merge4;
- git_merge8[0] += loader & 0x0f0f0f0f;
- git_merge8[1] += (loader >> 4) & 0x0f0f0f0f;
- git_merge8 = &(git_merge8[2]);
- *git_merge4++ = 0;
- }
+ unroll_zero_incr_4_8(gitv[0], gitv[1], perm_ct8);
if (!(cur_ct % 255)) {
- git_merge8 = gitv[1];
- git_write = gitv[2];
- for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
- loader = *git_merge8;
- git_write[0] += loader & 0x000000ff;
- git_write[1] += (loader >> 8) & 0x000000ff;
- git_write[2] += (loader >> 16) & 0x000000ff;
- git_write[3] += loader >> 24;
- git_write = &(git_write[4]);
- *git_merge8++ = 0;
- }
+ unroll_zero_incr_8_32(gitv[1], gitv[2], perm_ct4);
}
}
#endif
@@ -10569,49 +10001,17 @@ void calc_git_missing(uint32_t pheno_nm_ct, uint32_t perm_vec_ct, uintptr_t* __r
}
#ifdef __LP64__
if (cur_ct % 15) {
- git_merge4 = gitv[0];
- git_merge8 = gitv[1];
- for (pbidx = 0; pbidx < perm_ct32; pbidx++) {
- loader = *git_merge4++;
- git_merge8[0] = _mm_add_epi64(git_merge8[0], _mm_and_si128(loader, m4));
- git_merge8[1] = _mm_add_epi64(git_merge8[1], _mm_and_si128(_mm_srli_epi64(loader, 4), m4));
- git_merge8 = &(git_merge8[2]);
- }
+ unroll_incr_4_8(gitv[0], gitv[1], perm_ct32);
}
if (cur_ct % 255) {
- git_merge8 = gitv[1];
- git_write = gitv[2];
- for (pbidx = 0; pbidx < perm_ct16; pbidx++) {
- loader = *git_merge8++;
- git_write[0] = _mm_add_epi64(git_write[0], _mm_and_si128(loader, m8x32));
- git_write[1] = _mm_add_epi64(git_write[1], _mm_and_si128(_mm_srli_epi64(loader, 8), m8x32));
- git_write[2] = _mm_add_epi64(git_write[2], _mm_and_si128(_mm_srli_epi64(loader, 16), m8x32));
- git_write[3] = _mm_add_epi64(git_write[3], _mm_and_si128(_mm_srli_epi64(loader, 24), m8x32));
- git_write = &(git_write[4]);
- }
+ unroll_incr_8_32(gitv[1], gitv[2], perm_ct16);
}
#else
if (cur_ct % 15) {
- git_merge4 = gitv[0];
- git_merge8 = gitv[1];
- for (pbidx = 0; pbidx < perm_ct8; pbidx++) {
- loader = *git_merge4++;
- git_merge8[0] += loader & 0x0f0f0f0f;
- git_merge8[1] += (loader >> 4) & 0x0f0f0f0f;
- git_merge8 = &(git_merge8[2]);
- }
+ unroll_incr_4_8(gitv[0], gitv[1], perm_ct8);
}
if (cur_ct % 255) {
- git_merge8 = gitv[1];
- git_write = gitv[2];
- for (pbidx = 0; pbidx < perm_ct4; pbidx++) {
- loader = *git_merge8++;
- git_write[0] += loader & 0x000000ff;
- git_write[1] += (loader >> 8) & 0x000000ff;
- git_write[2] += (loader >> 16) & 0x000000ff;
- git_write[3] += loader >> 24;
- git_write = &(git_write[4]);
- }
+ unroll_incr_8_32(gitv[1], gitv[2], perm_ct4);
}
#endif
}
diff --git a/plink_assoc.h b/plink_assoc.h
index 476e667..97aeae3 100644
--- a/plink_assoc.h
+++ b/plink_assoc.h
@@ -12,16 +12,6 @@ void aperm_init(Aperm_info* apip);
int32_t multcomp(char* outname, char* outname_end, uint32_t* marker_uidxs, uintptr_t chi_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, Chrom_info* chrom_info_ptr, double* chi, double pfilter, double output_min_p, uint32_t mtest_adjust, uint32_t skip_gc, double adjust_lambda, uint32_t* tcnt, double* pvals);
-void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp);
-
-// void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp);
-
-void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp);
-
-// void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp);
-
-void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst);
-
int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_cell_ct, uint32_t model_mperm_val, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marke [...]
int32_t qassoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom [...]
diff --git a/plink_calc.c b/plink_calc.c
index 48e81dc..3595665 100644
--- a/plink_calc.c
+++ b/plink_calc.c
@@ -1496,19 +1496,20 @@ void incr_dists_rm_inv(uint32_t* idists, uintptr_t* mmasks, uintptr_t sample_ct_
uintptr_t uljj;
uint32_t uii;
uint32_t ujj;
+ uint32_t ukk;
for (uii = start_idx; uii < end_idx; uii++) {
ulii = mmasks[uii];
+ ukk = sample_ct_m1 - uii;
if (ulii) {
glptr = &(mmasks[uii + 1]);
- // ujj is deliberately biased down by 1
- for (ujj = uii; ujj < sample_ct_m1; ujj++) {
+ for (ujj = 0; ujj < ukk; ujj++) {
uljj = (*glptr++) & ulii;
if (uljj) {
idists[ujj] += popcount_long(uljj);
}
}
}
- idists = &(idists[sample_ct_m1 - uii - 1]);
+ idists = &(idists[ukk]);
}
}
@@ -1520,8 +1521,8 @@ THREAD_RET_TYPE calc_genome_thread(void* arg) {
uintptr_t uljj = g_thread_start[0];
// this is different from the regular offset because incr_dists_rm_inv() has
// custom arithmetic
- uintptr_t offsetm = ((uint64_t)sample_ct) * (ulii - uljj) - ((((uint64_t)(ulii + 1)) * (ulii + 2) - ((uint64_t)(uljj + 1)) * (uljj + 2)) / 2);
- uintptr_t offset = (((uint64_t)sample_ct) * (ulii - uljj) - ((((uint64_t)ulii) * (ulii + 1) - ((uint64_t)uljj) * (uljj + 1)) / 2)) * 5;
+ uintptr_t offsetm = ((uint64_t)sample_ct) * (ulii - uljj) - ((((uint64_t)ulii) * (ulii + 1) - ((uint64_t)uljj) * (uljj + 1)) / 2);
+ uintptr_t offset = offsetm * 5;
uint32_t* missing_ptr = &(g_missing_dbl_excluded[offsetm]);
uint32_t* genome_main_ptr = &(g_genome_main[offset]);
uintptr_t* geno_ptr = (uintptr_t*)g_geno;
@@ -4251,7 +4252,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
fputs("Writing...", stdout);
fflush(stdout);
if (!bin4) {
- if (fwrite_checkedz(dists, sample_idx_ct * sizeof(double), *outfile_ptr)) {
+ if (fwrite_checked(dists, sample_idx_ct * sizeof(double), *outfile_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
} else {
@@ -4322,12 +4323,12 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
dxx = 0.0;
dist_ptr = dists;
for (ii = first_sample_idx; ii < end_sample_idx; ii++) {
- if (fwrite_checkedz(dist_ptr, ii * sizeof(double), *outfile_ptr)) {
+ if (fwrite_checked(dist_ptr, ii * sizeof(double), *outfile_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
dist_ptr = &(dist_ptr[(uint32_t)ii]);
if (shape == DISTANCE_SQ0) {
- if (fwrite_checkedz(membuf, (sample_ct - ii) * sizeof(double), *outfile_ptr)) {
+ if (fwrite_checked(membuf, (sample_ct - ii) * sizeof(double), *outfile_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
} else {
@@ -4364,7 +4365,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
}
}
if (shape == DISTANCE_SQ0) {
- if (fwrite_checkedz(membuf, (sample_ct - ii) * sizeof(double), *outfile2_ptr)) {
+ if (fwrite_checked(membuf, (sample_ct - ii) * sizeof(double), *outfile2_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
} else {
@@ -4402,7 +4403,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
}
}
if (shape == DISTANCE_SQ0) {
- if (fwrite_checkedz(membuf, (sample_ct - ii) * sizeof(double), *outfile3_ptr)) {
+ if (fwrite_checked(membuf, (sample_ct - ii) * sizeof(double), *outfile3_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
} else {
@@ -4442,7 +4443,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
}
dist_ptr = &(dist_ptr[(uint32_t)ii]);
if (shape == DISTANCE_SQ0) {
- if (fwrite_checkedz(membuf, (sample_ct - ii) * sizeof(float), *outfile_ptr)) {
+ if (fwrite_checked(membuf, (sample_ct - ii) * sizeof(float), *outfile_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
} else {
@@ -4478,7 +4479,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
}
}
if (shape == DISTANCE_SQ0) {
- if (fwrite_checkedz(membuf, (sample_ct - ii) * sizeof(float), *outfile2_ptr)) {
+ if (fwrite_checked(membuf, (sample_ct - ii) * sizeof(float), *outfile2_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
} else {
@@ -4514,7 +4515,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
}
}
if (shape == DISTANCE_SQ0) {
- if (fwrite_checkedz(membuf, (sample_ct - ii) * sizeof(float), *outfile3_ptr)) {
+ if (fwrite_checked(membuf, (sample_ct - ii) * sizeof(float), *outfile3_ptr)) {
goto distance_d_write_ret_WRITE_FAIL;
}
} else {
@@ -7166,7 +7167,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
goto calc_rel_ret_OPEN_FAIL;
}
for (sample_idx = min_sample; sample_idx < max_parallel_sample; sample_idx++) {
- if (fwrite_checkedz(&(rel_dists[((int64_t)sample_idx * (sample_idx - 1)) / 2 - start_offset]), sample_idx * sizeof(double), outfile)) {
+ if (fwrite_checked(&(rel_dists[((int64_t)sample_idx * (sample_idx - 1)) / 2 - start_offset]), sample_idx * sizeof(double), outfile)) {
goto calc_rel_ret_WRITE_FAIL;
}
if (fwrite_checked(dptr2++, sizeof(double), outfile)) {
@@ -7180,7 +7181,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
}
} else {
if (rel_shape == REL_CALC_SQ0) {
- if (fwrite_checkedz(geno, (sample_ct - sample_idx - 1) * sizeof(double), outfile)) {
+ if (fwrite_checked(geno, (sample_ct - sample_idx - 1) * sizeof(double), outfile)) {
goto calc_rel_ret_WRITE_FAIL;
}
} else {
@@ -7276,7 +7277,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
}
} else {
if (rel_shape == REL_CALC_SQ0) {
- if (fwrite_checkedz(geno, (sample_ct - sample_idx - 1) * sizeof(float), outfile)) {
+ if (fwrite_checked(geno, (sample_ct - sample_idx - 1) * sizeof(float), outfile)) {
goto calc_rel_ret_WRITE_FAIL;
}
} else {
diff --git a/plink_cluster.c b/plink_cluster.c
index d1cf1d8..f0741be 100644
--- a/plink_cluster.c
+++ b/plink_cluster.c
@@ -42,7 +42,7 @@ void cluster_cleanup(Cluster_info* cluster_ptr) {
free_cond(cluster_ptr->zerofname);
}
-int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uint32_t mwithin_col, uint32_t keep_na, uintptr_t* cluster_ct_ptr, uint32_t** cluster_map_ptr, uint32_t** cluster_starts_ptr, char** cluster_ids_ptr, uintptr_t* max_cluster_id_len_ptr, char* keep_fname, char* keep_flattened, char* remove_fname, char* remove_flattened) {
+int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uint32_t mwithin_col, uint32_t keep_na, uintptr_t* cluster_ct_ptr, uint32_t** cluster_map_ptr, uint32_t** cluster_starts_ptr, char** cluster_ids_ptr, uintptr_t* max_cluster_id_len_ptr, char* keep_fname, char* keep_flattened, char* remove_fname, char* remove_flattened, uint32_t allow_no_samples) {
unsigned char* wkspace_mark = wkspace_base;
FILE* infile = NULL;
uintptr_t* sample_exclude_new = NULL;
@@ -196,7 +196,6 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
}
uii = next_set(already_seen, 0, cluster_kr_ct);
if (uii < cluster_kr_ct) {
- read_idx = uii + 1;
for (read_idx = uii + 1; read_idx < cluster_kr_ct; read_idx++) {
if (!IS_SET(already_seen, read_idx)) {
strcpy(&(sorted_keep_ids[uii * max_cluster_kr_len]), &(sorted_keep_ids[read_idx * max_cluster_kr_len]));
@@ -455,11 +454,15 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
LOGPRINTF("--within: %" PRIuPTR " cluster%s loaded, covering a total of %" PRIuPTR " %s.\n", cluster_ct, (cluster_ct == 1)? "" : "s", assigned_ct, species_str(assigned_ct));
} else {
if (sorted_keep_ids) {
- logerrprint("Error: No samples named in --within file remain in the current analysis, so\n--keep-clusters/--keep-cluster-names excludes everyone.\n");
- goto load_clusters_ret_INVALID_FORMAT;
+ if (!allow_no_samples) {
+ logerrprint("Error: No samples named in --within file remain in the current analysis, so\n--keep-clusters/--keep-cluster-names excludes everyone.\n");
+ goto load_clusters_ret_INVALID_FORMAT;
+ }
}
logerrprint("Warning: No samples named in --within file remain in the current analysis.\n");
- goto load_clusters_ret_1;
+ if (!sorted_keep_ids) {
+ goto load_clusters_ret_1;
+ }
}
} else {
// --family
@@ -496,7 +499,7 @@ int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sa
}
assigned_ct++;
}
- if (!assigned_ct) {
+ if ((!assigned_ct) && (!allow_no_samples)) {
logerrprint("Error: --keep-clusters/--keep-cluster-names excludes everyone.\n");
goto load_clusters_ret_INVALID_FORMAT;
}
diff --git a/plink_cluster.h b/plink_cluster.h
index 6046e6f..40ecdbd 100644
--- a/plink_cluster.h
+++ b/plink_cluster.h
@@ -45,7 +45,7 @@ void cluster_init(Cluster_info* cluster_ptr);
void cluster_cleanup(Cluster_info* cluster_ptr);
-int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uint32_t mwithin_col, uint32_t keep_na, uintptr_t* cluster_ct_ptr, uint32_t** cluster_map_ptr, uint32_t** cluster_starts_ptr, char** cluster_ids_ptr, uintptr_t* max_cluster_id_len_ptr, char* keep_fname, char* keep_flattened, char* remove_fname, char* remove_flattened);
+int32_t load_clusters(char* fname, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uint32_t mwithin_col, uint32_t keep_na, uintptr_t* cluster_ct_ptr, uint32_t** cluster_map_ptr, uint32_t** cluster_starts_ptr, char** cluster_ids_ptr, uintptr_t* max_cluster_id_len_ptr, char* keep_fname, char* keep_flattened, char* remove_fname, char* remove_flattened, uint32_t allow_no_samples);
void fill_unfiltered_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* sample_to_cluster);
diff --git a/plink_common.c b/plink_common.c
index f27d740..9bbc9f9 100644
--- a/plink_common.c
+++ b/plink_common.c
@@ -3306,6 +3306,7 @@ void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* pre_shiftp, uint32_t
}
void fill_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len) {
+ // requires bit_arr to be nonempty
uintptr_t maj_start = loc_start / BITCT;
uintptr_t maj_end = (loc_start + len) / BITCT;
uintptr_t minor;
@@ -3322,6 +3323,7 @@ void fill_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len) {
}
void clear_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len) {
+ // requires bit_arr to be nonempty
uintptr_t maj_start = loc_start / BITCT;
uintptr_t maj_end = (loc_start + len) / BITCT;
uintptr_t minor;
@@ -3667,7 +3669,7 @@ uintptr_t geqprime(uintptr_t floor) {
return floor;
}
-int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t* id_htable, uint32_t id_htable_size) {
+int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t* id_htable, uint32_t id_htable_size) {
// While unique IDs are normally assumed (and enforced) here, --extract and
// --exclude are an exception, since we want to be able to e.g. exclude all
// variants named '.'. Since there could be millions of them, ordinary
@@ -3691,7 +3693,7 @@ int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uint
uint32_t hash_result;
uint32_t cur_dup;
fill_uint_one(id_htable, id_htable_size);
- if (!allow_dups) {
+ if (!store_dups) {
for (; item_idx < item_ct; item_uidx++, item_idx++) {
next_unset_ul_unsafe_ck(exclude_arr, &item_uidx);
sptr = &(item_ids[item_uidx * max_id_len]);
@@ -3704,6 +3706,8 @@ int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uint
id_htable[hashval] = item_uidx;
break;
} else if (!memcmp(sptr, &(item_ids[hash_result * max_id_len]), slen + 1)) {
+ // could add an allow_dups parameter which controls whether this is
+ // an error
LOGERRPRINTFWW("Error: Duplicate ID '%s'.\n", sptr);
return RET_INVALID_FORMAT;
}
@@ -3781,7 +3785,7 @@ int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uint
}
uint32_t id_htable_find(const char* id_buf, uintptr_t cur_id_len, const uint32_t* id_htable, uint32_t id_htable_size, const char* item_ids, uintptr_t max_id_len) {
- // assumes no duplicate entries
+ // assumes no duplicate entries, and nonzero id_htable_size
// returns 0xffffffffU on failure
if (cur_id_len >= max_id_len) {
return 0xffffffffU;
@@ -3814,26 +3818,26 @@ void fill_idx_to_uidx(uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uint
uint32_t* idx_to_uidx_end = &(idx_to_uidx[item_ct]);
uint32_t item_uidx = 0;
uint32_t item_uidx_stop;
- do {
+ while (idx_to_uidx < idx_to_uidx_end) {
item_uidx = next_unset_unsafe(exclude_arr, item_uidx);
item_uidx_stop = next_set(exclude_arr, item_uidx, unfiltered_item_ct);
do {
*idx_to_uidx++ = item_uidx++;
} while (item_uidx < item_uidx_stop);
- } while (idx_to_uidx < idx_to_uidx_end);
+ }
}
void fill_idx_to_uidx_incl(uintptr_t* include_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
uint32_t* idx_to_uidx_end = &(idx_to_uidx[item_ct]);
uint32_t item_uidx = 0;
uint32_t item_uidx_stop;
- do {
+ while (idx_to_uidx < idx_to_uidx_end) {
item_uidx = next_set_unsafe(include_arr, item_uidx);
item_uidx_stop = next_unset(include_arr, item_uidx, unfiltered_item_ct);
do {
*idx_to_uidx++ = item_uidx++;
} while (item_uidx < item_uidx_stop);
- } while (idx_to_uidx < idx_to_uidx_end);
+ }
}
void fill_uidx_to_idx(uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
@@ -3841,7 +3845,7 @@ void fill_uidx_to_idx(uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint3
uint32_t item_idx = 0;
uint32_t* uidx_to_idx_ptr;
uint32_t* uidx_to_idx_stop;
- do {
+ while (item_idx < item_ct) {
item_uidx = next_unset_unsafe(exclude_arr, item_uidx);
uidx_to_idx_ptr = &(uidx_to_idx[item_uidx]);
item_uidx = next_set(exclude_arr, item_uidx, unfiltered_item_ct);
@@ -3849,7 +3853,7 @@ void fill_uidx_to_idx(uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint3
do {
*uidx_to_idx_ptr++ = item_idx++;
} while (uidx_to_idx_ptr < uidx_to_idx_stop);
- } while (item_idx < item_ct);
+ }
}
void fill_uidx_to_idx_incl(uintptr_t* include_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
@@ -3857,7 +3861,7 @@ void fill_uidx_to_idx_incl(uintptr_t* include_arr, uint32_t unfiltered_item_ct,
uint32_t item_idx = 0;
uint32_t* uidx_to_idx_ptr;
uint32_t* uidx_to_idx_stop;
- do {
+ while (item_idx < item_ct) {
item_uidx = next_set_unsafe(include_arr, item_uidx);
uidx_to_idx_ptr = &(uidx_to_idx[item_uidx]);
item_uidx = next_unset(include_arr, item_uidx, unfiltered_item_ct);
@@ -3865,7 +3869,7 @@ void fill_uidx_to_idx_incl(uintptr_t* include_arr, uint32_t unfiltered_item_ct,
do {
*uidx_to_idx_ptr++ = item_idx++;
} while (uidx_to_idx_ptr < uidx_to_idx_stop);
- } while (item_idx < item_ct);
+ }
}
void fill_midx_to_idx(uintptr_t* exclude_arr_orig, uintptr_t* exclude_arr, uint32_t item_ct, uint32_t* midx_to_idx) {
@@ -3885,28 +3889,40 @@ void fill_midx_to_idx(uintptr_t* exclude_arr_orig, uintptr_t* exclude_arr, uint3
}
void fill_vec_55(uintptr_t* vec, uint32_t ct) {
- uint32_t ctl = 2 * ((ct + (BITCT - 1)) / BITCT);
uint32_t rem = ct & (BITCT - 1);
- uintptr_t* second_to_last = &(vec[ctl - 2]);
#ifdef __LP64__
const __m128i m1 = {FIVEMASK, FIVEMASK};
__m128i* vecp = (__m128i*)vec;
- __m128i* vec_end = (__m128i*)(&(vec[ctl]));
- do {
+ __m128i* vec_end = (__m128i*)(&(vec[2 * (ct / BITCT)]));
+ uintptr_t* second_to_last;
+ while (vecp < vec_end) {
*vecp++ = m1;
- } while (vecp < vec_end);
+ }
+ if (rem) {
+ second_to_last = (uintptr_t*)vecp;
+ if (rem > BITCT2) {
+ second_to_last[0] = FIVEMASK;
+ second_to_last[1] = FIVEMASK >> ((BITCT - rem) * 2);
+ } else {
+ second_to_last[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
+ second_to_last[1] = 0;
+ }
+ }
#else
- uintptr_t* vec_end = &(vec[ctl]);
- do {
+ uintptr_t* vec_end = &(vec[2 * (ct / BITCT)]);
+ while (vec < vec_end) {
*vec++ = FIVEMASK;
- } while (vec < vec_end);
-#endif
- if (rem > BITCT2) {
- second_to_last[1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
- } else if (rem) {
- *second_to_last &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
- second_to_last[1] = 0;
}
+ if (rem) {
+ if (rem > BITCT2) {
+ vec[0] = FIVEMASK;
+ vec[1] = FIVEMASK >> ((BITCT - rem) * 2);
+ } else {
+ vec[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
+ vec[1] = 0;
+ }
+ }
+#endif
}
void vec_collapse_init(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uintptr_t* filter_bitarr, uint32_t filtered_ct, uintptr_t* output_vec) {
@@ -3918,7 +3934,7 @@ void vec_collapse_init(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uin
uint32_t write_bit = 0;
uint32_t item_idx = 0;
uint32_t item_uidx_stop;
- do {
+ while (item_idx < filtered_ct) {
item_uidx = next_set_unsafe(filter_bitarr, item_uidx);
item_uidx_stop = next_unset(filter_bitarr, item_uidx, unfiltered_ct);
item_idx += item_uidx_stop - item_uidx;
@@ -3930,7 +3946,7 @@ void vec_collapse_init(uintptr_t* unfiltered_bitarr, uint32_t unfiltered_ct, uin
write_bit = 0;
}
} while (++item_uidx < item_uidx_stop);
- } while (item_idx < filtered_ct);
+ }
if (write_bit) {
*output_vec++ = cur_write;
}
@@ -3945,7 +3961,7 @@ void vec_collapse_init_exclude(uintptr_t* unfiltered_bitarr, uint32_t unfiltered
uint32_t write_bit = 0;
uint32_t item_idx = 0;
uint32_t item_uidx_stop;
- do {
+ while (item_idx < filtered_ct) {
item_uidx = next_unset_unsafe(filter_exclude_bitarr, item_uidx);
item_uidx_stop = next_set(filter_exclude_bitarr, item_uidx, unfiltered_ct);
item_idx += item_uidx_stop - item_uidx;
@@ -3957,7 +3973,7 @@ void vec_collapse_init_exclude(uintptr_t* unfiltered_bitarr, uint32_t unfiltered
write_bit = 0;
}
} while (++item_uidx < item_uidx_stop);
- } while (item_idx < filtered_ct);
+ }
if (write_bit) {
*output_vec++ = cur_write;
}
@@ -4405,6 +4421,7 @@ int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int3
}
void refresh_chrom_info(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, uint32_t* is_mt_ptr, uint32_t* is_haploid_ptr) {
+ // assumes marker_uidx < unfiltered_marker_ct
int32_t chrom_idx;
*chrom_end_ptr = chrom_info_ptr->chrom_file_order_marker_idx[(*chrom_fo_idx_ptr) + 1];
while (marker_uidx >= (*chrom_end_ptr)) {
@@ -5237,9 +5254,9 @@ void bitfield_and(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
}
#else
uintptr_t* vec_end = &(vv[word_ct]);
- do {
+ while (vv < vec_end) {
*vv++ &= *include_vec++;
- } while (vv < vec_end);
+ }
#endif
}
@@ -5261,15 +5278,16 @@ void bitfield_andnot(uintptr_t* vv, uintptr_t* exclude_vec, uintptr_t word_ct) {
}
#else
uintptr_t* vec_end = &(vv[word_ct]);
- do {
+ while (vv < vec_end) {
*vv++ &= ~(*exclude_vec++);
- } while (vv < vec_end);
+ }
#endif
}
void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintptr_t word_ct) {
// vv := (~vv) AND include_vec
// on 64-bit systems, assumes vv and exclude_vec are 16-byte aligned
+ // assumes word_ct is nonzero
#ifdef __LP64__
__m128i* vv128 = (__m128i*)vv;
__m128i* iv128 = (__m128i*)include_vec;
@@ -5284,10 +5302,10 @@ void bitfield_andnot_reversed_args(uintptr_t* vv, uintptr_t* include_vec, uintpt
}
#else
uintptr_t* vec_end = &(vv[word_ct]);
- do {
+ while (vv < vec_end) {
*vv = (~(*vv)) & (*include_vec++);
vv++;
- } while (vv < vec_end);
+ }
#endif
}
@@ -5308,9 +5326,9 @@ void bitfield_or(uintptr_t* vv, uintptr_t* or_vec, uintptr_t word_ct) {
}
#else
uintptr_t* vec_end = &(vv[word_ct]);
- do {
+ while (vv < vec_end) {
*vv++ |= *or_vec++;
- } while (vv < vec_end);
+ }
#endif
}
@@ -5336,9 +5354,9 @@ void bitfield_ornot(uintptr_t* vv, uintptr_t* inverted_or_vec, uintptr_t word_ct
}
#else
uintptr_t* vec_end = &(vv[word_ct]);
- do {
+ while (vv < vec_end) {
*vv++ |= ~(*inverted_or_vec++);
- } while (vv < vec_end);
+ }
#endif
}
@@ -5359,9 +5377,9 @@ void bitfield_xor(uintptr_t* bit_arr, uintptr_t* xor_arr, uintptr_t word_ct) {
}
#else
uintptr_t* bit_arr_end = &(bit_arr[word_ct]);
- do {
+ while (bit_arr < bit_arr_end) {
*bit_arr++ ^= *xor_arr++;
- } while (bit_arr < bit_arr_end);
+ }
#endif
}
@@ -7519,6 +7537,7 @@ uintptr_t count_01(uintptr_t* lptr, uintptr_t word_ct) {
void fill_all_bits(uintptr_t* bit_arr, uintptr_t ct) {
// leaves bits beyond the end unset
+ // ok for ct == 0
uintptr_t quotient = ct / BITCT;
uintptr_t remainder = ct % BITCT;
fill_ulong_one(bit_arr, quotient);
@@ -7781,6 +7800,7 @@ uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclud
}
void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uint32_t* male_ct_ptr, uint32_t* female_ct_ptr, uint32_t* unk_ct_ptr) {
+ // unfiltered_sample_ct can be zero
uint32_t male_ct = 0;
uint32_t female_ct = 0;
uint32_t unk_ct = 0;
@@ -7810,6 +7830,7 @@ void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_
}
void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct) {
+ // unfiltered_sample_ct can be zero
uintptr_t sample_bidx = 0;
unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
unsigned char ucc;
@@ -7892,7 +7913,7 @@ void collapse_copy_2bitarr(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfil
sample_idx += sample_uidx_stop - sample_uidx;
do {
// er, this can totally be sped up
- cur_write |= (((rawbuf[sample_uidx / BITCT2] >> ((sample_uidx % BITCT2) * 2)) & 3) << (ii_rem * 2));
+ cur_write |= EXTRACT_2BIT_GENO(rawbuf, sample_uidx) << (ii_rem * 2);
if (++ii_rem == BITCT2) {
*mainbuf++ = cur_write;
cur_write = 0;
@@ -7906,6 +7927,7 @@ void collapse_copy_2bitarr(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfil
}
uint32_t load_and_collapse(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* mainbuf, uint32_t sample_ct, uintptr_t* sample_exclude, uintptr_t final_mask, uint32_t do_reverse) {
+ // assumes unfiltered_sample_ct is positive
uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
if (unfiltered_sample_ct == sample_ct) {
rawbuf = mainbuf;
@@ -7942,7 +7964,7 @@ void collapse_copy_2bitarr_incl(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t
sample_uidx_stop = next_unset(sample_include, sample_uidx, unfiltered_sample_ct);
sample_idx += sample_uidx_stop - sample_uidx;
do {
- cur_write |= (((rawbuf[sample_uidx / BITCT2] >> ((sample_uidx % BITCT2) * 2)) & 3) << (ii_rem * 2));
+ cur_write |= EXTRACT_2BIT_GENO(rawbuf, sample_uidx) << (ii_rem * 2);
if (++ii_rem == BITCT2) {
*mainbuf++ = cur_write;
cur_write = 0;
@@ -8032,13 +8054,14 @@ uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sa
}
void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, uintptr_t* old_include) {
+ // allows unfiltered_sample_ct == 0
uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t ulii;
uintptr_t uljj;
uintptr_t ulkk;
uintptr_t ulmm;
uint32_t bit_idx;
- do {
+ while (unfiltered_sample_ctl) {
ulii = ~(*old_include++);
ulkk = FIVEMASK;
ulmm = FIVEMASK;
@@ -8066,7 +8089,8 @@ void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, u
}
*new_include2++ = ulkk;
*new_include2++ = ulmm;
- } while (--unfiltered_sample_ctl);
+ --unfiltered_sample_ctl;
+ }
ulii = unfiltered_sample_ct & (BITCT - 1);
if (ulii) {
new_include2--;
@@ -8080,13 +8104,14 @@ void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, u
}
void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_vec, uintptr_t* exclude_arr) {
+ // allows unfiltered_sample_ct == 0
uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t ulii;
uintptr_t uljj;
uintptr_t ulkk;
uintptr_t ulmm;
uint32_t bit_idx;
- do {
+ while (unfiltered_sample_ctl) {
ulii = *exclude_arr++;
ulkk = FIVEMASK;
ulmm = FIVEMASK;
@@ -8114,7 +8139,8 @@ void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_v
}
*include_vec++ = ulkk;
*include_vec++ = ulmm;
- } while (--unfiltered_sample_ctl);
+ --unfiltered_sample_ctl;
+ }
ulii = unfiltered_sample_ct & (BITCT - 1);
if (ulii) {
include_vec--;
@@ -8131,31 +8157,44 @@ void vec_init_invert(uintptr_t entry_ct, uintptr_t* target_arr, uintptr_t* sourc
// Initializes a half-bitfield as the inverse of another. Assumes target_arr
// and source_arr are doubleword-aligned.
uint32_t vec_wsize = 2 * ((entry_ct + (BITCT - 1)) / BITCT);
- uintptr_t* second_to_last = &(target_arr[vec_wsize - 2]);
uint32_t rem = entry_ct & (BITCT - 1);
#ifdef __LP64__
const __m128i m1 = {FIVEMASK, FIVEMASK};
__m128i* tptr = (__m128i*)target_arr;
__m128i* sptr = (__m128i*)source_arr;
__m128i* tptr_end = (__m128i*)(&(target_arr[vec_wsize]));
- do {
+ uintptr_t* second_to_last;
+ while (tptr < tptr_end) {
*tptr++ = _mm_andnot_si128(*sptr++, m1);
- } while (tptr < tptr_end);
+ }
+ if (rem) {
+ second_to_last = &(((uintptr_t*)tptr_end)[-2]);
+ if (rem > BITCT2) {
+ second_to_last[1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
+ } else {
+ *second_to_last &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
+ second_to_last[1] = 0;
+ }
+ }
#else
uintptr_t* tptr_end = &(target_arr[vec_wsize]);
- do {
+ while (target_arr < tptr_end) {
*target_arr++ = FIVEMASK & (~(*source_arr++));
- } while (target_arr < tptr_end);
-#endif
- if (rem > BITCT2) {
- second_to_last[1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
- } else if (rem) {
- *second_to_last &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
- second_to_last[1] = 0;
}
+ if (rem) {
+ if (rem > BITCT2) {
+ target_arr[-1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
+ } else {
+ target_arr[-2] &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
+ target_arr[-1] = 0;
+ }
+ }
+
+#endif
}
void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr) {
+ // assumes word_ct is positive
// target_arr := source_arr ANDNOT exclude_arr
// may write an extra word
#ifdef __LP64__
@@ -8175,13 +8214,14 @@ void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* s
}
void vec_include_mask_in(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr) {
+ // allows unfiltered_sample_ct == 0
uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t ulii;
uintptr_t uljj;
uintptr_t ulkk;
uintptr_t ulmm;
uint32_t bit_idx;
- do {
+ while (unfiltered_sample_ctl) {
ulii = ~(*mask_arr++);
ulkk = *include_arr;
ulmm = include_arr[1];
@@ -8209,10 +8249,12 @@ void vec_include_mask_in(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr,
}
*include_arr++ = ulkk;
*include_arr++ = ulmm;
- } while (--unfiltered_sample_ctl);
+ --unfiltered_sample_ctl;
+ }
}
void vec_include_mask_out(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr) {
+ // assumes unfiltered_sample_ct is positive
uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t ulii;
uintptr_t uljj;
@@ -8251,6 +8293,7 @@ void vec_include_mask_out(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr
}
void vec_include_mask_out_intersect(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr, uintptr_t* mask2_arr) {
+ // assumes unfiltered_sample_ct is positive
uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t ulii;
uintptr_t uljj;
@@ -8289,6 +8332,7 @@ void vec_include_mask_out_intersect(uintptr_t unfiltered_sample_ct, uintptr_t* i
}
void vec_init_01(uintptr_t unfiltered_sample_ct, uintptr_t* data_ptr, uintptr_t* result_ptr) {
+ // assumes unfiltered_sample_ct is positive
// initializes result_ptr bits 01 iff data_ptr bits are 01
#ifdef __LP64__
const __m128i m1 = {FIVEMASK, FIVEMASK};
@@ -8338,6 +8382,7 @@ void vec_invert(uintptr_t unfiltered_sample_ct, uintptr_t* vec2) {
}
void vec_datamask(uintptr_t unfiltered_sample_ct, uint32_t matchval, uintptr_t* data_ptr, uintptr_t* mask_ptr, uintptr_t* result_ptr) {
+ // assumes unfiltered_sample_ct is positive
// vec_ptr assumed to be standard 00/01 bit vector
// sets result_vec bits to 01 iff data_ptr bits are equal to matchval and
// vec_ptr bit is set, 00 otherwise.
@@ -8424,6 +8469,7 @@ void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
*/
void rotate_plink1_to_plink2_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct) {
+ // assumes positive word_ct
uintptr_t* loadbuf_end = &(loadbuf[word_ct]);
uintptr_t ulii;
uintptr_t uljj;
@@ -9149,14 +9195,14 @@ char* alloc_and_init_collapsed_arr(char* item_arr, uintptr_t item_len, uintptr_t
}
wptr = new_arr;
wptr_end = &(new_arr[filtered_ct * item_len]);
- do {
+ while (wptr < wptr_end) {
item_uidx = next_unset_ul_unsafe(exclude_arr, item_uidx);
item_uidx_stop = next_set_ul(exclude_arr, item_uidx, unfiltered_ct);
delta = item_uidx_stop - item_uidx;
memcpy(wptr, &(item_arr[item_uidx * item_len]), delta * item_len);
wptr = &(wptr[delta * item_len]);
item_uidx = item_uidx_stop;
- } while (wptr < wptr_end);
+ }
return new_arr;
}
@@ -9418,9 +9464,12 @@ uint32_t collapse_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max
// Collapses array of sorted IDs to remove duplicates, and writes
// pre-collapse positions to id_starts (so e.g. duplication count of any
// sample ID can be determined via subtraction) if it isn't NULL.
- // Assumes id_ct is positive. Returns id_ct of collapsed array.
+ // Returns id_ct of collapsed array.
uintptr_t read_idx;
uintptr_t write_idx;
+ if (!id_ct) {
+ return 0;
+ }
if (id_starts) {
id_starts[0] = 0;
for (read_idx = 1; read_idx < id_ct; read_idx++) {
diff --git a/plink_common.h b/plink_common.h
index e1b6e94..c9794f4 100644
--- a/plink_common.h
+++ b/plink_common.h
@@ -83,7 +83,18 @@
#include <algorithm>
#endif
+// It would be useful to disable compilation on big-endian platforms, but I
+// don't see a decent portable way to do this (see e.g. discussion at
+// http://esr.ibiblio.org/?p=5095 ).
+
#ifdef __LP64__
+ #ifndef __SSE2__
+ // It's obviously possible to support this by writing 64-bit non-SSE2 code
+ // shadowing each SSE2 intrinsic, but this almost certainly isn't worth the
+ // development/testing effort until regular PLINK 2.0 development is
+ // complete. No researcher has ever asked me for this feature.
+ #error "64-bit builds currently require SSE2. Try producing a 32-bit build instead."
+ #endif
#include <emmintrin.h>
#define FIVEMASK 0x5555555555555555LLU
typedef union {
@@ -223,6 +234,9 @@
#define MISC_MISSING_GZ 0x4000000000LLU
#define MISC_FREQ_GZ 0x8000000000LLU
#define MISC_HET_GZ 0x10000000000LLU
+#define MISC_ALLOW_NO_SAMPLES 0x20000000000LLU
+#define MISC_ALLOW_NO_VARS 0x40000000000LLU
+#define MISC_VCF_REQUIRE_GT 0x80000000000LLU
// assume for now that .bed must always be accompanied by both .bim and .fam
#define FILTER_ALL_REQ 1LLU
@@ -357,6 +371,8 @@
#define COVAR_KEEP_PHENO_ON_MISSING_COV 1
#define COVAR_NAME 2
#define COVAR_NUMBER 4
+#define COVAR_NO_CONST 8
+#define COVAR_ALLOW_NONE 0x10
#define DISTANCE_SQ 1
#define DISTANCE_SQ0 2
@@ -566,25 +582,34 @@
#define WKSPACE_MIN_MB 64
#define WKSPACE_DEFAULT_MB 2048
+#ifdef __LP64__
+ #define BITCT 64
+#else
+ #define BITCT 32
+#endif
+
+#define BITCT2 (BITCT / 2)
+#define BYTECT (BITCT / 8)
+
#define CACHELINE 64 // assumed number of bytes per cache line, for alignment
#define CACHELINE_INT32 (CACHELINE / 4)
#define CACHELINE_INT64 (CACHELINE / 8)
#define CACHELINE_WORD (CACHELINE / BYTECT)
#define CACHELINE_DBL (CACHELINE / 8)
-#define CACHEALIGN(val) ((val + (CACHELINE - 1)) & (~(CACHELINE - ONELU)))
-#define CACHEALIGN_INT32(val) ((val + (CACHELINE_INT32 - 1)) & (~(CACHELINE_INT32 - ONELU)))
-#define CACHEALIGN_WORD(val) ((val + (CACHELINE_WORD - 1)) & (~(CACHELINE_WORD - ONELU)))
-#define CACHEALIGN_DBL(val) ((val + (CACHELINE_DBL - 1)) & (~(CACHELINE_DBL - ONELU)))
+#define CACHEALIGN(val) (((val) + (CACHELINE - 1)) & (~(CACHELINE - ONELU)))
+#define CACHEALIGN_INT32(val) (((val) + (CACHELINE_INT32 - 1)) & (~(CACHELINE_INT32 - ONELU)))
+#define CACHEALIGN_WORD(val) (((val) + (CACHELINE_WORD - 1)) & (~(CACHELINE_WORD - ONELU)))
+#define CACHEALIGN_DBL(val) (((val) + (CACHELINE_DBL - 1)) & (~(CACHELINE_DBL - ONELU)))
// 32-bit instead of word-length bitwise not here, when val can be assumed to
// be 32-bit.
// (note that the sizeof operator "returns" an uintptr_t, not a uint32_t; hence
// the lack of sizeof in the CACHELINE_INT32, etc. definitions.)
-#define CACHEALIGN32(val) ((val + (CACHELINE - 1)) & (~(CACHELINE - 1)))
-#define CACHEALIGN32_INT32(val) ((val + (CACHELINE_INT32 - 1)) & (~(CACHELINE_INT32 - 1)))
-#define CACHEALIGN32_WORD(val) ((val + (CACHELINE_WORD - 1)) & (~(CACHELINE_WORD - 1)))
-#define CACHEALIGN32_DBL(val) ((val + (CACHELINE_DBL - 1)) & (~(CACHELINE_DBL - 1)))
+#define CACHEALIGN32(val) (((val) + (CACHELINE - 1)) & (~(CACHELINE - 1)))
+#define CACHEALIGN32_INT32(val) (((val) + (CACHELINE_INT32 - 1)) & (~(CACHELINE_INT32 - 1)))
+#define CACHEALIGN32_WORD(val) (((val) + (CACHELINE_WORD - 1)) & (~(CACHELINE_WORD - 1)))
+#define CACHEALIGN32_DBL(val) (((val) + (CACHELINE_DBL - 1)) & (~(CACHELINE_DBL - 1)))
#define MAXV(aa, bb) (((bb) > (aa))? (bb) : (aa))
#define MINV(aa, bb) (((aa) > (bb))? (bb) : (aa))
@@ -601,14 +626,9 @@
#define MAX_THREADS_P1 513
#endif
-#ifdef __LP64__
- #define BITCT 64
-#else
- #define BITCT 32
-#endif
-
-#define BITCT2 (BITCT / 2)
-#define BYTECT (BITCT / 8)
+// defined as a macro since type of idx can vary; might want a debug
+// compilation mode which performs type-checking, though
+#define EXTRACT_2BIT_GENO(ulptr, idx) (((ulptr)[(idx) / BITCT2] >> (2 * ((idx) % BITCT2))) & 3)
// generic maximum line length. .ped/.vcf/etc. lines can of course be longer
#define MAXLINELEN 131072
@@ -713,7 +733,7 @@ static inline const char* cond_replace(const char* ss, const char* match_str, co
uint32_t aligned_malloc(uintptr_t** aligned_pp, uintptr_t size);
-void aligned_free(uintptr_t* aligned_ptr);
+void aligned_free(uintptr_t* aligned_pp);
static inline void aligned_free_cond(uintptr_t* aligned_ptr) {
if (aligned_ptr) {
@@ -837,13 +857,6 @@ static inline int32_t fputs_checked(const char* ss, FILE* outfile) {
int32_t fwrite_checked(const void* buf, size_t len, FILE* outfile);
-static inline int32_t fwrite_checkedz(const void* buf, size_t len, FILE* outfile) {
- if (len) {
- return fwrite_checked(buf, len, outfile);
- }
- return ferror(outfile);
-}
-
static inline int32_t fread_checked(char* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
*bytes_read_ptr = fread(buf, 1, len, infile);
return ferror(infile);
@@ -1475,9 +1488,9 @@ static inline uint32_t tri_coord_no_diag_32(uint32_t small_coord, uint32_t big_c
}
// let the compiler worry about the second argument's bit width here
-#define SET_BIT(aa, bb) (aa[(bb) / BITCT] |= ONELU << ((bb) % BITCT))
+#define SET_BIT(aa, bb) ((aa)[(bb) / BITCT] |= ONELU << ((bb) % BITCT))
-#define SET_BIT_DBL(aa, bb) (aa[bb / BITCT2] |= ONELU << (2 * (bb % BITCT2)))
+#define SET_BIT_DBL(aa, bb) ((aa)[(bb) / BITCT2] |= ONELU << (2 * ((bb) % BITCT2)))
static inline void set_bit(uintptr_t* bit_arr, uint32_t loc) {
bit_arr[loc / BITCT] |= (ONELU << (loc % BITCT));
@@ -1491,9 +1504,9 @@ void fill_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len);
void clear_bits(uintptr_t* bit_arr, uintptr_t loc_start, uintptr_t len);
-#define CLEAR_BIT(aa, bb) (aa[bb / BITCT] &= ~(ONELU << (bb % BITCT)))
+#define CLEAR_BIT(aa, bb) ((aa)[(bb) / BITCT] &= ~(ONELU << ((bb) % BITCT)))
-#define CLEAR_BIT_DBL(aa, bb) (aa[bb / BITCT2] &= ~(ONELU << (2 * (bb % BITCT2))))
+#define CLEAR_BIT_DBL(aa, bb) ((aa)[(bb) / BITCT2] &= ~(ONELU << (2 * ((bb) % BITCT2))))
static inline void clear_bit(uintptr_t* bit_arr, uint32_t loc) {
bit_arr[loc / BITCT] &= ~(ONELU << (loc % BITCT));
@@ -1503,20 +1516,20 @@ static inline void clear_bit_ul(uintptr_t* bit_arr, uintptr_t loc) {
bit_arr[loc / BITCT] &= ~(ONELU << (loc % BITCT));
}
-#define IS_SET(aa, bb) ((aa[bb / BITCT] >> (bb % BITCT)) & 1)
+#define IS_SET(aa, bb) (((aa)[(bb) / BITCT] >> ((bb) % BITCT)) & 1)
-#define IS_SET_DBL(aa, bb) ((aa[bb / BITCT2] >> (2 * (bb % BITCT2))) & 1)
+#define IS_SET_DBL(aa, bb) (((aa)[(bb) / BITCT2] >> (2 * ((bb) % BITCT2))) & 1)
// use this instead of IS_SET() for signed 32-bit integers
-static inline uint32_t is_set(uintptr_t* exclude_arr, uint32_t loc) {
+static inline uint32_t is_set(const uintptr_t* exclude_arr, uint32_t loc) {
return (exclude_arr[loc / BITCT] >> (loc % BITCT)) & 1;
}
-static inline uint32_t is_set_ul(uintptr_t* exclude_arr, uintptr_t loc) {
+static inline uint32_t is_set_ul(const uintptr_t* exclude_arr, uintptr_t loc) {
return (exclude_arr[loc / BITCT] >> (loc % BITCT)) & 1;
}
-#define IS_NONNULL_AND_SET(aa, bb) (aa && IS_SET(aa, bb))
+#define IS_NONNULL_AND_SET(aa, bb) ((aa) && IS_SET(aa, bb))
uint32_t next_unset_unsafe(uintptr_t* bit_arr, uint32_t loc);
@@ -1643,6 +1656,13 @@ static inline void fill_ulong_zero(uintptr_t* ularr, size_t size) {
static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
fill_ulong_zero((uintptr_t*)ullarr, size);
}
+
+static inline void fill_v128_zero(__m128i* v128arr, size_t size) {
+ size_t ulii;
+ for (ulii = 0; ulii < size; ulii++) {
+ *v128arr++ = _mm_setzero_si128();
+ }
+}
#else
static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
fill_ulong_zero((uintptr_t*)ullarr, size * 2);
@@ -1727,7 +1747,7 @@ static inline uint32_t get_id_htable_size(uintptr_t item_ct) {
return (item_ct < 32761)? 65521 : geqprime(item_ct * 2 + 1);
}
-int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t* id_htable, uint32_t id_htable_size);
+int32_t populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t* id_htable, uint32_t id_htable_size);
static inline int32_t alloc_and_populate_id_htable(uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t** id_htable_ptr, uint32_t* id_htable_size_ptr) {
uint32_t id_htable_size = get_id_htable_size(item_ct);
@@ -1795,7 +1815,7 @@ void get_set_wrange_align(uintptr_t* bitfield, uintptr_t word_ct, uintptr_t* fir
// Maximum accepted chromosome index is this minus 1. Currently cannot exceed
// 2^14 due to SMALL_INTERVAL_BITS setting in plink_cnv.c...
-#define MAX_POSSIBLE_CHROM 5120
+#define MAX_POSSIBLE_CHROM 64000
// ...unless this is uncommented (it removes the entire CNV module).
// #define HIGH_MAX_CHROM
diff --git a/plink_data.c b/plink_data.c
index fe07a9c..6bcc946 100644
--- a/plink_data.c
+++ b/plink_data.c
@@ -54,17 +54,17 @@ int32_t sort_item_ids_nx(char** sorted_ids_ptr, uint32_t** id_map_ptr, uintptr_t
return 0;
}
-int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintptr_t unfiltered_marker_ct, uintptr_t sample_ct, uint64_t fsize) {
- // See below for old mmap() code. Turns out this is more portable without
- // being noticeably slower.
+int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintptr_t unfiltered_marker_ct, uintptr_t unfiltered_sample_ct, uint64_t fsize) {
+ // previously used mmap(); turns out this is more portable without being
+ // noticeably slower.
unsigned char* wkspace_mark = wkspace_base;
FILE* infile = NULL;
FILE* outfile = NULL;
uintptr_t unfiltered_marker_ct4 = (unfiltered_marker_ct + 3) / 4;
uintptr_t unfiltered_marker_ctl2 = (unfiltered_marker_ct + (BITCT2 - 1)) / BITCT2;
- uintptr_t unfiltered_sample_ct4 = (sample_ct + 3) / 4;
+ uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t marker_idx_end = 0;
- uint32_t bed_offset = fsize - sample_ct * ((uint64_t)unfiltered_marker_ct4);
+ uint32_t bed_offset = fsize - unfiltered_sample_ct * ((uint64_t)unfiltered_marker_ct4);
int32_t retval = 0;
uintptr_t* loadbuf;
uintptr_t* lptr;
@@ -81,80 +81,82 @@ int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintp
uintptr_t cur_word1;
uintptr_t cur_word2;
uintptr_t cur_word3;
- // could make this allocation a bit smaller in multipass case, but whatever
- if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_marker_ctl2 * 4 * sizeof(intptr_t))) {
- goto sample_major_to_snp_major_ret_NOMEM;
- }
- if (wkspace_left < unfiltered_sample_ct4) {
- goto sample_major_to_snp_major_ret_NOMEM;
- }
- writebuf = (unsigned char*)wkspace_base;
- write_marker_ct = BITCT2 * (wkspace_left / (unfiltered_sample_ct4 * BITCT2));
- loadbuf[unfiltered_marker_ctl2 - 1] = 0;
- loadbuf[2 * unfiltered_marker_ctl2 - 1] = 0;
- loadbuf[3 * unfiltered_marker_ctl2 - 1] = 0;
- loadbuf[4 * unfiltered_marker_ctl2 - 1] = 0;
- if (fopen_checked(&infile, sample_major_fname, "rb")) {
- goto sample_major_to_snp_major_ret_OPEN_FAIL;
- }
if (fopen_checked(&outfile, outname, "wb")) {
goto sample_major_to_snp_major_ret_OPEN_FAIL;
}
if (fwrite_checked("l\x1b\x01", 3, outfile)) {
goto sample_major_to_snp_major_ret_WRITE_FAIL;
}
- do {
- marker_idx_base = marker_idx_end;
- marker_idx_end += write_marker_ct;
- if (marker_idx_end > unfiltered_marker_ct) {
- marker_idx_end = unfiltered_marker_ct;
- }
- if (fseeko(infile, bed_offset, SEEK_SET)) {
- goto sample_major_to_snp_major_ret_READ_FAIL;
- }
- for (sample_idx_end = 0; sample_idx_end < sample_ct;) {
- sample_idx_base = sample_idx_end;
- sample_idx_end = sample_idx_base + 4;
- if (sample_idx_end > sample_ct) {
- fill_ulong_zero(&(loadbuf[(sample_ct % 4) * unfiltered_marker_ctl2]), (4 - (sample_ct % 4)) * unfiltered_marker_ctl2);
- sample_idx_end = sample_ct;
- }
- lptr = loadbuf;
- for (sample_idx = sample_idx_base; sample_idx < sample_idx_end; sample_idx++) {
- if (load_raw(infile, lptr, unfiltered_marker_ct4)) {
- goto sample_major_to_snp_major_ret_READ_FAIL;
- }
- lptr = &(lptr[unfiltered_marker_ctl2]);
- }
- lptr = &(loadbuf[marker_idx_base / BITCT2]);
- for (marker_idx_block_end = marker_idx_base; marker_idx_block_end < marker_idx_end; lptr++) {
- marker_idx = marker_idx_block_end;
- cur_word0 = *lptr;
- cur_word1 = lptr[unfiltered_marker_ctl2];
- cur_word2 = lptr[2 * unfiltered_marker_ctl2];
- cur_word3 = lptr[3 * unfiltered_marker_ctl2];
- marker_idx_block_end = marker_idx + BITCT2;
- if (marker_idx_block_end > marker_idx_end) {
- marker_idx_block_end = marker_idx_end;
- }
- ucptr = &(writebuf[(marker_idx - marker_idx_base) * unfiltered_sample_ct4 + (sample_idx_base / 4)]);
- while (1) {
- *ucptr = (unsigned char)((cur_word0 & 3) | ((cur_word1 & 3) << 2) | ((cur_word2 & 3) << 4) | ((cur_word3 & 3) << 6));
- if (++marker_idx == marker_idx_block_end) {
- break;
+ if (unfiltered_marker_ct && unfiltered_sample_ct) {
+ // could make this allocation a bit smaller in multipass case, but whatever
+ if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_marker_ctl2 * 4 * sizeof(intptr_t))) {
+ goto sample_major_to_snp_major_ret_NOMEM;
+ }
+ if (wkspace_left < unfiltered_sample_ct4) {
+ goto sample_major_to_snp_major_ret_NOMEM;
+ }
+ writebuf = (unsigned char*)wkspace_base;
+ write_marker_ct = BITCT2 * (wkspace_left / (unfiltered_sample_ct4 * BITCT2));
+ if (fopen_checked(&infile, sample_major_fname, "rb")) {
+ goto sample_major_to_snp_major_ret_OPEN_FAIL;
+ }
+ loadbuf[unfiltered_marker_ctl2 - 1] = 0;
+ loadbuf[2 * unfiltered_marker_ctl2 - 1] = 0;
+ loadbuf[3 * unfiltered_marker_ctl2 - 1] = 0;
+ loadbuf[4 * unfiltered_marker_ctl2 - 1] = 0;
+ do {
+ marker_idx_base = marker_idx_end;
+ marker_idx_end += write_marker_ct;
+ if (marker_idx_end > unfiltered_marker_ct) {
+ marker_idx_end = unfiltered_marker_ct;
+ }
+ if (fseeko(infile, bed_offset, SEEK_SET)) {
+ goto sample_major_to_snp_major_ret_READ_FAIL;
+ }
+ for (sample_idx_end = 0; sample_idx_end < unfiltered_sample_ct;) {
+ sample_idx_base = sample_idx_end;
+ sample_idx_end = sample_idx_base + 4;
+ if (sample_idx_end > unfiltered_sample_ct) {
+ fill_ulong_zero(&(loadbuf[(unfiltered_sample_ct % 4) * unfiltered_marker_ctl2]), (4 - (unfiltered_sample_ct % 4)) * unfiltered_marker_ctl2);
+ sample_idx_end = unfiltered_sample_ct;
+ }
+ lptr = loadbuf;
+ for (sample_idx = sample_idx_base; sample_idx < sample_idx_end; sample_idx++) {
+ if (load_raw(infile, lptr, unfiltered_marker_ct4)) {
+ goto sample_major_to_snp_major_ret_READ_FAIL;
+ }
+ lptr = &(lptr[unfiltered_marker_ctl2]);
+ }
+ lptr = &(loadbuf[marker_idx_base / BITCT2]);
+ for (marker_idx_block_end = marker_idx_base; marker_idx_block_end < marker_idx_end; lptr++) {
+ marker_idx = marker_idx_block_end;
+ cur_word0 = *lptr;
+ cur_word1 = lptr[unfiltered_marker_ctl2];
+ cur_word2 = lptr[2 * unfiltered_marker_ctl2];
+ cur_word3 = lptr[3 * unfiltered_marker_ctl2];
+ marker_idx_block_end = marker_idx + BITCT2;
+ if (marker_idx_block_end > marker_idx_end) {
+ marker_idx_block_end = marker_idx_end;
+ }
+ ucptr = &(writebuf[(marker_idx - marker_idx_base) * unfiltered_sample_ct4 + (sample_idx_base / 4)]);
+ while (1) {
+ *ucptr = (unsigned char)((cur_word0 & 3) | ((cur_word1 & 3) << 2) | ((cur_word2 & 3) << 4) | ((cur_word3 & 3) << 6));
+ if (++marker_idx == marker_idx_block_end) {
+ break;
+ }
+ cur_word0 >>= 2;
+ cur_word1 >>= 2;
+ cur_word2 >>= 2;
+ cur_word3 >>= 2;
+ ucptr = &(ucptr[unfiltered_sample_ct4]);
}
- cur_word0 >>= 2;
- cur_word1 >>= 2;
- cur_word2 >>= 2;
- cur_word3 >>= 2;
- ucptr = &(ucptr[unfiltered_sample_ct4]);
}
}
- }
- if (fwrite_checked(writebuf, (marker_idx_end - marker_idx_base) * unfiltered_sample_ct4, outfile)) {
- goto sample_major_to_snp_major_ret_WRITE_FAIL;
- }
- } while (marker_idx_end < unfiltered_marker_ct);
+ if (fwrite_checked(writebuf, (marker_idx_end - marker_idx_base) * unfiltered_sample_ct4, outfile)) {
+ goto sample_major_to_snp_major_ret_WRITE_FAIL;
+ }
+ } while (marker_idx_end < unfiltered_marker_ct);
+ }
if (fclose_null(&outfile)) {
goto sample_major_to_snp_major_ret_WRITE_FAIL;
}
@@ -208,7 +210,8 @@ uint32_t chrom_error(const char* extension, Chrom_info* chrom_info_ptr, char* ch
return 1;
}
-int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_marker_ct_ptr, uintptr_t* marker_exclude_ct_ptr, uintptr_t* max_marker_id_len_ptr, uintptr_t** marker_exclude_ptr, char** marker_ids_ptr, Chrom_info* chrom_info_ptr, uint32_t** marker_pos_ptr, uint32_t* map_is_unsorted_ptr, uint32_t allow_extra_chroms) {
+int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_marker_ct_ptr, uintptr_t* marker_exclude_ct_ptr, uintptr_t* max_marker_id_len_ptr, uintptr_t** marker_exclude_ptr, char** marker_ids_ptr, Chrom_info* chrom_info_ptr, uint32_t** marker_pos_ptr, uint32_t* map_is_unsorted_ptr, uint32_t allow_extra_chroms, uint32_t allow_no_vars) {
+ // currently only used by lgen_to_bed()
// todo: some cleanup
uintptr_t marker_exclude_ct = *marker_exclude_ct_ptr;
uintptr_t max_marker_id_len = 0;
@@ -267,7 +270,7 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
if (!feof(*mapfile_ptr)) {
goto load_map_ret_READ_FAIL;
}
- if (!unfiltered_marker_ct) {
+ if ((!unfiltered_marker_ct) && (!allow_no_vars)) {
logerrprint("Error: No variants in .map file.\n");
goto load_map_ret_INVALID_FORMAT;
}
@@ -367,14 +370,16 @@ int32_t load_map(FILE** mapfile_ptr, char* mapname, uint32_t* map_cols_ptr, uint
}
}
}
- chrom_info_ptr->chrom_end[last_chrom] = marker_uidx;
chrom_info_ptr->chrom_ct = ++chroms_encountered_m1;
- chrom_info_ptr->chrom_file_order_marker_idx[chroms_encountered_m1] = marker_uidx;
*marker_exclude_ct_ptr = marker_exclude_ct;
- if (*marker_exclude_ct_ptr == unfiltered_marker_ct) {
- logerrprint("Error: All variants excluded from .map file.\n");
- goto load_map_ret_ALL_MARKERS_EXCLUDED;
+ if (unfiltered_marker_ct) {
+ chrom_info_ptr->chrom_end[last_chrom] = marker_uidx;
+ if (marker_exclude_ct == unfiltered_marker_ct) {
+ logerrprint("Error: All variants excluded from .map file.\n");
+ goto load_map_ret_ALL_MARKERS_EXCLUDED;
+ }
}
+ chrom_info_ptr->chrom_file_order_marker_idx[chroms_encountered_m1] = marker_uidx;
while (0) {
load_map_ret_NOMEM:
retval = RET_NOMEM;
@@ -509,6 +514,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
int32_t prev_chrom = -1;
uint32_t last_pos = 0;
uint32_t allow_extra_chroms = (misc_flags / MISC_ALLOW_EXTRA_CHROMS) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
uint32_t exclude_snp = (filter_flags / FILTER_EXCLUDE_MARKERNAME_SNP) & 1;
uint32_t snps_only = (filter_flags / FILTER_SNPS_ONLY) & 1;
uint32_t snps_only_no_di = (misc_flags / MISC_SNPS_ONLY_NO_DI) & 1;
@@ -948,7 +954,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
if (!feof(bimfile)) {
goto load_bim_ret_READ_FAIL;
}
- if (!unfiltered_marker_ct) {
+ if ((!unfiltered_marker_ct) && (!allow_no_variants)) {
sprintf(logbuf, "Error: No variants in %s.\n", ftype_str);
goto load_bim_ret_INVALID_FORMAT_2;
} else if (unfiltered_marker_ct > 2147483645) {
@@ -1281,7 +1287,7 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
}
}
}
- if (unfiltered_marker_ct == marker_exclude_ct) {
+ if ((unfiltered_marker_ct == marker_exclude_ct) && (!allow_no_variants)) {
logerrprint("Error: All variants excluded.\n");
goto load_bim_ret_ALL_MARKERS_EXCLUDED;
}
@@ -1357,6 +1363,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
// similar to load_clusters() in plink_cluster.c
// sex_nm and sex_male should be NULL unless sex is supposed to be added as
// an extra covariate
+ // covar_range_list_ptr is NULL iff --gxe was specified
unsigned char* wkspace_mark = wkspace_base;
unsigned char* wkspace_mark2 = NULL;
FILE* covar_file = NULL;
@@ -1471,15 +1478,41 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
goto load_covars_ret_NOMEM;
}
loadbuf = (char*)wkspace_base;
- retval = open_and_load_to_first_token(&covar_file, covar_fname, loadbuf_size, '\0', "--covar file", loadbuf, &bufptr, &line_idx);
- if (retval) {
- goto load_covars_ret_1;
+ // was using open_and_load_to_first_token(), but we now don't want to
+ // automatically print an error message on an empty file.
+ loadbuf[loadbuf_size - 1] = ' ';
+ if (fopen_checked(&covar_file, covar_fname, "r")) {
+ goto load_covars_ret_OPEN_FAIL;
}
+ line_idx = 0;
+ do {
+ if (!fgets(loadbuf, loadbuf_size, covar_file)) {
+ if (!feof(covar_file)) {
+ goto load_covars_ret_READ_FAIL;
+ }
+ strcpy(tbuf, "Empty --covar file.\n");
+ goto load_covars_none;
+ }
+ line_idx++;
+ if (!(loadbuf[loadbuf_size - 1])) {
+ if (loadbuf_size == MAXLINEBUFLEN) {
+ LOGERRPRINTF("Error: Line %" PRIuPTR " of --covar file is pathologically long.\n", line_idx);
+ goto load_covars_ret_INVALID_FORMAT;
+ } else {
+ goto load_covars_ret_NOMEM;
+ }
+ }
+ bufptr = skip_initial_spaces(loadbuf);
+ } while (is_eoln_kns(*bufptr));
covar_raw_ct = count_tokens(bufptr);
- if ((covar_raw_ct < 3) || (covar_raw_ct < 2 + gxe_mcovar)) {
+ if ((covar_raw_ct < 2) || (covar_raw_ct < 2 + gxe_mcovar)) {
goto load_covars_ret_MISSING_TOKENS;
}
covar_raw_ct -= 2;
+ if ((!covar_raw_ct) && (!sex_nm)) {
+ strcpy(tbuf, "No covariate columns in --covar file.\n");
+ goto load_covars_none;
+ }
covar_raw_ctl = (covar_raw_ct + (BITCT - 1)) / BITCT;
covars_active = (uintptr_t*)top_alloc(&topsize, covar_raw_ctl * sizeof(intptr_t));
@@ -1488,13 +1521,13 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
header_absent = (strcmp_se(bufptr, "FID", 3) || strcmp_se(bufptr2, "IID", 3));
bufptr = next_token(bufptr2);
- if (covar_modifier & (COVAR_NAME | COVAR_NUMBER)) {
+ if ((covar_modifier & (COVAR_NAME | COVAR_NUMBER)) && covar_raw_ct) {
fill_ulong_zero(covars_active, covar_raw_ctl);
if (covar_modifier & COVAR_NUMBER) {
if (numeric_range_list_to_bitfield(covar_range_list_ptr, covar_raw_ct, covars_active, 1, 0)) {
goto load_covars_ret_MISSING_TOKENS;
}
- } else {
+ } else if (covar_modifier & COVAR_NAME) {
if (header_absent) {
logerrprint("Error: --covar file doesn't have a header line for --covar-name.\n");
goto load_covars_ret_INVALID_FORMAT;
@@ -1512,17 +1545,22 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
fill_all_bits(covars_active, covar_raw_ct);
covar_ct = covar_raw_ct;
} else {
+ // --gxe only
fill_ulong_zero(covars_active, covar_raw_ctl);
covar_ct = 0;
}
covar_ctx = covar_ct + (sex_nm? 1 : 0);
- min_covar_col_ct = last_set_bit(covars_active, covar_raw_ctl) + 1;
+ if ((!covar_ctx) && (!gxe_mcovar)) {
+ strcpy(tbuf, "No --covar values loaded.\n");
+ goto load_covars_none;
+ }
+ min_covar_col_ct = covar_ct? (last_set_bit(covars_active, covar_raw_ctl) + 1) : 0;
if (min_covar_col_ct < gxe_mcovar) {
min_covar_col_ct = gxe_mcovar;
}
if (header_absent) {
max_covar_name_len = 4 + intlen(min_covar_col_ct);
- } else {
+ } else if (min_covar_col_ct) {
uii = 0;
while (1) {
bufptr2 = token_endnn(bufptr);
@@ -1593,14 +1631,13 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
rewind(covar_file);
if (header_absent) {
if (covar_range_list_ptr) {
- covar_uidx = 0;
- for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
+ for (covar_uidx = 0, covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
covar_uidx = next_set_ul_unsafe(covars_active, covar_uidx);
uint32_writex(memcpyl3a(&(covar_names[covar_idx * max_covar_name_len]), "COV"), ++covar_uidx, '\0');
}
}
line_idx = 0;
- } else {
+ } else if (covar_ct) {
covar_idx = 0;
retval = load_to_first_token(covar_file, loadbuf_size, '\0', "--covar file", loadbuf, &bufptr, &line_idx);
if (retval) {
@@ -1648,11 +1685,13 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
set_bit(already_seen, ii);
sample_idx = id_map[(uint32_t)ii];
bufptr = bufptr2;
- if (min_covar_col_ct > 1) {
- bufptr = next_token_mult(bufptr, min_covar_col_ct - 1);
- }
- if (no_more_tokens_kns(bufptr)) {
- goto load_covars_ret_MISSING_TOKENS;
+ if (min_covar_col_ct) {
+ if (min_covar_col_ct > 1) {
+ bufptr = next_token_mult(bufptr, min_covar_col_ct - 1);
+ }
+ if (no_more_tokens_kns(bufptr)) {
+ goto load_covars_ret_MISSING_TOKENS;
+ }
}
if (covar_range_list_ptr) {
dptr = &(covar_d[sample_idx * covar_ctx]);
@@ -1696,6 +1735,7 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
*dptr++ = (double)((int32_t)is_set(sex_male, sample_uidx));
} else {
covar_missing = 1;
+ *dptr++ = missing_phenod;
}
}
if (!covar_missing) {
@@ -1715,12 +1755,8 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
if (!feof(covar_file)) {
goto load_covars_ret_READ_FAIL;
}
- if (loaded_sample_ct == missing_cov_ct) {
- logerrprint("Error: No --covar values loaded.\n");
- goto load_covars_ret_INVALID_FORMAT;
- }
if (covar_range_list_ptr) {
- if ((covar_ct < covar_raw_ct - 1) || ((covar_ct == covar_raw_ct - 1) && ((!gxe_mcovar) || is_set(covars_active, gxe_mcovar - 1)))) {
+ if ((covar_ct + 1 < covar_raw_ct) || ((covar_ct + 1 == covar_raw_ct) && ((!gxe_mcovar) || is_set(covars_active, gxe_mcovar - 1)))) {
if (gxe_mcovar && (!is_set(covars_active, gxe_mcovar - 1))) {
sprintf(logbuf, "--covar: 1 C/C cov. loaded for --gxe, %" PRIuPTR "/%" PRIuPTR " for other operations.\n", covar_ct, covar_raw_ct);
} else {
@@ -1741,6 +1777,78 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
LOGPRINTF("%" PRIuPTR " %s %s not seen in the covariate file.\n", ulii, species_str(ulii), (ulii == 1)? "was" : "were");
}
+ if (covar_modifier & COVAR_NO_CONST) {
+ if (gxe_mcovar) {
+ uii = popcount_longs(gxe_covar_c, sample_ctl);
+ if ((!uii) || (uii == popcount_longs(gxe_covar_nm, sample_ctl))) {
+ logerrprint("Error: --gxe covariate is constant and --no-const-covar was specified.\n");
+ goto load_covars_ret_INVALID_FORMAT;
+ }
+ }
+ if (covar_range_list_ptr) {
+ // redefinition
+ covar_raw_ctl = (covar_ctx + BITCT - 1) / BITCT;
+ if (wkspace_alloc_ul_checked(&already_seen, covar_raw_ctl * sizeof(intptr_t))) {
+ goto load_covars_ret_NOMEM;
+ }
+ // is covariate nonconstant?
+ fill_ulong_zero(already_seen, covar_raw_ctl);
+ for (covar_idx = 0; covar_idx < covar_ctx; covar_idx++) {
+ dptr = &(covar_d[covar_idx]);
+ dxx = missing_phenod;
+ for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
+ if (dptr[sample_idx * covar_ctx] != missing_phenod) {
+ dxx = dptr[sample_idx * covar_ctx];
+ break;
+ }
+ }
+ for (; sample_idx < sample_ct; sample_idx++) {
+ if ((dptr[sample_idx * covar_ctx] != missing_phenod) && (dptr[sample_idx * covar_ctx] != dxx)) {
+ break;
+ }
+ }
+ if (sample_idx < sample_ct) {
+ SET_BIT(already_seen, covar_idx);
+ }
+ }
+ uii = popcount_longs(already_seen, covar_raw_ctl);
+ if (!uii) {
+ strcpy(tbuf, "All covariates are constant.\n");
+ goto load_covars_none;
+ } else if (uii < covar_ctx) {
+ LOGPRINTF("--no-const-covar: %" PRIuPTR " constant covariate%s excluded.\n", covar_ctx - uii, (covar_ctx - uii == 1)? "" : "s");
+ *covar_ctx_ptr = uii;
+ dptr = covar_d;
+ for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
+ uii = 0;
+ for (covar_idx = 0; covar_idx < covar_ctx; covar_idx++) {
+ if (IS_SET(already_seen, covar_idx)) {
+ dxx = covar_d[sample_idx * covar_ctx + covar_idx];
+ if (dxx == missing_phenod) {
+ uii = 1;
+ }
+ *dptr++ = dxx;
+ }
+ }
+ if (!uii) {
+ // if this sample had some missing covariate values, but all those
+ // covariates were excluded by --no-const-covar, set covar_nm bit
+ SET_BIT(covar_nm, sample_idx);
+ }
+ }
+ covar_idx = next_unset_unsafe(already_seen, 0);
+ uii = covar_idx;
+ for (; covar_idx < covar_ctx; covar_idx++) {
+ if (IS_SET(already_seen, covar_idx)) {
+ strcpy(&(covar_names[uii * max_covar_name_len]), &(covar_names[covar_idx * max_covar_name_len]));
+ uii++;
+ }
+ }
+ // don't worry about memory overallocation for now
+ }
+ }
+ }
+
wkspace_reset(wkspace_mark2);
while (0) {
load_covars_ret_NOMEM2:
@@ -1748,6 +1856,9 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
load_covars_ret_NOMEM:
retval = RET_NOMEM;
break;
+ load_covars_ret_OPEN_FAIL:
+ retval = RET_OPEN_FAIL;
+ break;
load_covars_ret_READ_FAIL:
retval = RET_READ_FAIL;
break;
@@ -1758,6 +1869,21 @@ int32_t load_covars(char* covar_fname, uintptr_t unfiltered_sample_ct, uintptr_t
load_covars_ret_INVALID_FORMAT:
retval = RET_INVALID_FORMAT;
break;
+ load_covars_none:
+ if (covar_modifier & COVAR_ALLOW_NONE) {
+ *covar_ctx_ptr = 0;
+ *covar_names_ptr = NULL;
+ *max_covar_name_len_ptr = 1;
+ *covar_nm_ptr = NULL;
+ *covar_d_ptr = NULL;
+ // --gxe not possible
+ wkspace_reset(wkspace_mark);
+ logerrprint("Warning: ");
+ } else {
+ retval = RET_INVALID_FORMAT;
+ logerrprint("Error: ");
+ }
+ logerrprint(tbuf);
}
load_covars_ret_1:
if (retval) {
@@ -2491,8 +2617,8 @@ int32_t load_bim_split_chrom(char* bimname, uintptr_t* marker_exclude, uintptr_t
FILE* infile = NULL;
char* loadbuf = tbuf;
uint32_t marker_uidx = 0xffffffffU; // deliberate overflow
- uintptr_t marker_idx = 0;
int32_t retval = 0;
+ uintptr_t marker_idx;
char* bufptr;
uint64_t chrom_idx;
if (max_bim_linelen > MAXLINELEN) {
@@ -2503,7 +2629,7 @@ int32_t load_bim_split_chrom(char* bimname, uintptr_t* marker_exclude, uintptr_t
if (fopen_checked(&infile, bimname, "r")) {
goto load_bim_split_chrom_ret_OPEN_FAIL;
}
- do {
+ for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
load_bim_split_chrom_reread:
if (!fgets(loadbuf, max_bim_linelen, infile)) {
goto load_bim_split_chrom_ret_READ_FAIL;
@@ -2519,7 +2645,7 @@ int32_t load_bim_split_chrom(char* bimname, uintptr_t* marker_exclude, uintptr_t
// already validated
chrom_idx = ((uint32_t)get_chrom_code(chrom_info_ptr, bufptr));
ll_buf[marker_idx] = (int64_t)((chrom_idx << 32) | ((uint64_t)marker_idx));
- } while ((++marker_idx) < marker_ct);
+ }
while (0) {
load_bim_split_chrom_ret_NOMEM:
retval = RET_NOMEM;
@@ -2703,6 +2829,11 @@ void sort_marker_chrom_pos(int64_t* ll_buf, uintptr_t marker_ct, uint32_t* pos_b
uint32_t uii;
uint32_t cur_chrom;
uint32_t chrom_ct;
+ if (!marker_ct) {
+ chrom_start[0] = 0;
+ *chrom_ct_ptr = 0;
+ return;
+ }
#ifdef __cplusplus
std::sort(ll_buf, &(ll_buf[marker_ct]));
#else
@@ -2766,15 +2897,15 @@ int32_t sort_and_write_bim(uint32_t* map_reverse, uint32_t map_cols, char* outna
uint32_t chrom_ct;
uint32_t uii;
uint32_t ujj;
- // There can be a LOT of markers (some 1000 Genomes files we've been offered
- // have ~40 million), so speeding up the sorting step over just calling
- // qsort_ext() may not be a complete waste of effort.
+ // There can be a LOT of markers (1000 Genomes files can have ~40-80
+ // million), so speeding up the sorting step over just calling qsort_ext()
+ // may not be a complete waste of effort.
// Strategy:
// 1. fill ll_buf with chromosome idx in high-order bits, original position
- // in low-order.
+ // in low-order.
// 2. std::sort() ll_buf, read off chromosome boundaries
// 3. then replace high-order bits in ll_buf with marker positions, and
- // std::sort() each chromosome separately.
+ // std::sort() each chromosome separately.
// Would be even faster if this was performed in a single sort, in the
// super-common case where all three numbers can be squeezed together in 64
// bits. But we care most about performance when this can't be done, so I
@@ -2997,8 +3128,8 @@ int32_t flip_subset_init(char* flip_fname, char* flip_subset_fname, uintptr_t un
}
a1ptr = marker_allele_ptrs[2 * marker_uidx];
a2ptr = marker_allele_ptrs[2 * marker_uidx + 1];
- ucc = a1ptr[0];
- if (a1ptr[1] || a2ptr[1] || (ucc < 'A') || (ucc > 'T') || (reverse_complements[ucc - 'A'] != a2ptr[0])) {
+ ucc = ((unsigned char)a1ptr[0]) - 'A';
+ if (a1ptr[1] || a2ptr[1] || (ucc > 19) || (reverse_complements[ucc] != a2ptr[0])) {
sprintf(logbuf, "Error: Invalid alleles (not reverse complement single bases) on line\n%" PRIuPTR " of --flip file.\n", line_idx);
goto flip_subset_init_ret_INVALID_FORMAT_2;
}
@@ -3139,7 +3270,7 @@ int32_t make_bed_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32_t unfilter
do {
sample_uidx2 = sample_sort_map[sample_uidx++];
} while (IS_SET(sample_exclude, sample_uidx2));
- cur_word |= (((loadbuf[sample_uidx2 / BITCT2] >> ((sample_uidx2 % BITCT2) * 2)) & 3) << (ii_rem * 2));
+ cur_word |= EXTRACT_2BIT_GENO(loadbuf, sample_uidx2) << (ii_rem * 2);
if (++ii_rem == BITCT2) {
*writeptr++ = cur_word;
cur_word = 0;
@@ -3186,7 +3317,7 @@ int32_t make_bed_me_missing_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32
do {
sample_uidx2 = sample_sort_map[sample_uidx++];
} while (IS_SET(sample_exclude, sample_uidx2));
- cur_word |= (((loadbuf[sample_uidx2 / BITCT2] >> ((sample_uidx2 % BITCT2) * 2)) & 3) << (ii_rem * 2));
+ cur_word |= EXTRACT_2BIT_GENO(loadbuf, sample_uidx2) << (ii_rem * 2);
if (++ii_rem == BITCT2) {
*writeptr++ = cur_word;
cur_word = 0;
@@ -3252,22 +3383,22 @@ void reverse_subset(uintptr_t* writebuf, uintptr_t* subset_vec2, uintptr_t word_
__m128i* wvec_end = (__m128i*)(&(writebuf[word_ct]));
__m128i vii;
__m128i vjj;
- do {
+ while (wvec < wvec_end) {
vii = *wvec;
vjj = _mm_andnot_si128(_mm_xor_si128(vii, _mm_srli_epi64(vii, 1)), *svec++);
vjj = _mm_or_si128(vjj, _mm_slli_epi64(vjj, 1));
*wvec++ = _mm_xor_si128(vii, vjj);
- } while (wvec < wvec_end);
+ }
#else
uintptr_t* writebuf_end = &(writebuf[word_ct]);
uintptr_t ulii;
uintptr_t uljj;
- do {
+ while (writebuf < writebuf_end) {
ulii = *writebuf;
uljj = (*subset_vec2++) & (~(ulii ^ (ulii >> 1)));
uljj *= 3;
*writebuf++ = ulii ^ uljj;
- } while (writebuf < writebuf_end);
+ }
#endif
}
@@ -3326,7 +3457,7 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
uint32_t family_ct = 0;
uint32_t set_hh_missing = (misc_flags / MISC_SET_HH_MISSING) & 1;
- uint32_t set_me_missing = (misc_flags / MISC_SET_ME_MISSING) & 1;
+ uint32_t set_me_missing = ((misc_flags / MISC_SET_ME_MISSING) & 1) && sample_ct;
uint32_t fill_missing_a2 = (misc_flags / MISC_FILL_MISSING_A2) & 1;
uint32_t mendel_include_duos = (mendel_modifier / MENDEL_DUOS) & 1;
uint32_t mendel_multigen = (mendel_modifier / MENDEL_MULTIGEN) & 1;
@@ -3465,59 +3596,61 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
*outname_end = '\0';
LOGPRINTFWW5("--make-bed to %s.bed + %s.bim + %s.fam ... ", outname, outname, outname);
fputs("0%", stdout);
- loop_end = marker_ct / 100;
- markers_done = 0;
- for (pass_idx = 0; pass_idx < pass_ct; pass_idx++) {
- pass_start = pass_idx * pass_size;
- pass_end = (pass_idx + 1) * pass_size;
- if (pass_idx + 1 == pass_ct) {
- pass_end = marker_ct;
- }
- seek_needed = 1;
- for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
- if (IS_SET(marker_exclude, marker_uidx)) {
- marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
- seek_needed = 1;
- }
- if ((map_reverse[marker_uidx] < pass_start) || (map_reverse[marker_uidx] >= pass_end)) {
- seek_needed = 1;
- continue;
- }
- writebuf_ptr = &(writebuf[sample_ctv2 * (map_reverse[marker_uidx] - pass_start)]);
- if (seek_needed) {
- if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
- goto make_bed_ret_READ_FAIL;
+ if (sample_ct) {
+ loop_end = marker_ct / 100;
+ markers_done = 0;
+ for (pass_idx = 0; pass_idx < pass_ct; pass_idx++) {
+ pass_start = pass_idx * pass_size;
+ pass_end = (pass_idx + 1) * pass_size;
+ if (pass_idx + 1 == pass_ct) {
+ pass_end = marker_ct;
+ }
+ seek_needed = 1;
+ for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+ if (IS_SET(marker_exclude, marker_uidx)) {
+ marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
+ seek_needed = 1;
}
- seek_needed = 0;
- }
- retval = make_bed_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, IS_SET(marker_reverse, marker_uidx), writebuf_ptr);
- if (retval) {
- goto make_bed_ret_1;
- }
- if (zcdefs) {
- zeropatch(sample_ctv2, cluster_ct, cluster_zc_masks, zcdefs, patchbuf, marker_idx, writebuf_ptr);
- }
- if (flip_subset_markers && is_set(flip_subset_markers, marker_uidx)) {
- reverse_subset(writebuf_ptr, flip_subset_vec2, sample_ctv2);
- }
- if (markers_done >= loop_end) {
- if (pct > 10) {
- putchar('\b');
+ if ((map_reverse[marker_uidx] < pass_start) || (map_reverse[marker_uidx] >= pass_end)) {
+ seek_needed = 1;
+ continue;
+ }
+ writebuf_ptr = &(writebuf[sample_ctv2 * (map_reverse[marker_uidx] - pass_start)]);
+ if (seek_needed) {
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto make_bed_ret_READ_FAIL;
+ }
+ seek_needed = 0;
+ }
+ retval = make_bed_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, IS_SET(marker_reverse, marker_uidx), writebuf_ptr);
+ if (retval) {
+ goto make_bed_ret_1;
+ }
+ if (zcdefs) {
+ zeropatch(sample_ctv2, cluster_ct, cluster_zc_masks, zcdefs, patchbuf, marker_idx, writebuf_ptr);
+ }
+ if (flip_subset_markers && is_set(flip_subset_markers, marker_uidx)) {
+ reverse_subset(writebuf_ptr, flip_subset_vec2, sample_ctv2);
+ }
+ if (markers_done >= loop_end) {
+ if (pct > 10) {
+ putchar('\b');
+ }
+ pct = (markers_done * 100LLU) / marker_ct;
+ printf("\b\b%u%%", pct);
+ fflush(stdout);
+ pct++;
+ loop_end = (pct * ((uint64_t)marker_ct)) / 100;
}
- pct = (markers_done * 100LLU) / marker_ct;
- printf("\b\b%u%%", pct);
- fflush(stdout);
- pct++;
- loop_end = (pct * ((uint64_t)marker_ct)) / 100;
+ markers_done++;
}
- markers_done++;
- }
- writebuf_ptr = writebuf;
- for (marker_idx = pass_start; marker_idx < pass_end; marker_idx++) {
- if (fwrite_checked(writebuf_ptr, sample_ct4, bedoutfile)) {
- goto make_bed_ret_WRITE_FAIL;
+ writebuf_ptr = writebuf;
+ for (marker_idx = pass_start; marker_idx < pass_end; marker_idx++) {
+ if (fwrite_checked(writebuf_ptr, sample_ct4, bedoutfile)) {
+ goto make_bed_ret_WRITE_FAIL;
+ }
+ writebuf_ptr = &(writebuf_ptr[sample_ctv2]);
}
- writebuf_ptr = &(writebuf_ptr[sample_ctv2]);
}
}
} else {
@@ -3701,7 +3834,7 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
return retval;
}
-int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32_t missing_pheno, uint32_t affection_01, uintptr_t* unfiltered_sample_ct_ptr, char** sample_ids_ptr, uintptr_t* max_sample_id_len_ptr, char** paternal_ids_ptr, uintptr_t* max_paternal_id_len_ptr, char** maternal_ids_ptr, uintptr_t* max_maternal_id_len_ptr, uintptr_t** sex_nm_ptr, uintptr_t** sex_male_ptr, uint32_t* affection_ptr, uintptr_t** pheno_nm_ptr, uintptr_t** pheno_c_ptr, double** pheno_d_ptr, uintptr [...]
+int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32_t missing_pheno, uint32_t affection_01, uintptr_t* unfiltered_sample_ct_ptr, char** sample_ids_ptr, uintptr_t* max_sample_id_len_ptr, char** paternal_ids_ptr, uintptr_t* max_paternal_id_len_ptr, char** maternal_ids_ptr, uintptr_t* max_maternal_id_len_ptr, uintptr_t** sex_nm_ptr, uintptr_t** sex_male_ptr, uint32_t* affection_ptr, uintptr_t** pheno_nm_ptr, uintptr_t** pheno_c_ptr, double** pheno_d_ptr, uintptr [...]
unsigned char* wkspace_mark = wkspace_base;
double missing_phenod = (double)missing_pheno;
uintptr_t* pheno_c = NULL;
@@ -3747,8 +3880,9 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
goto load_fam_ret_OPEN_FAIL;
}
// ----- .fam read, first pass -----
- // count number of people, determine maximum person/father/mother ID lengths,
- // affection status, verify all floating point phenotype values are valid
+ // count number of samples, determine maximum person/father/mother ID
+ // lengths, affection status, verify all floating point phenotype values are
+ // valid
while (fgets(loadbuf, loadbuf_size, famfile)) {
line_idx++;
if (!loadbuf[loadbuf_size - 1]) {
@@ -3811,7 +3945,7 @@ int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32
if (ferror(famfile)) {
goto load_fam_ret_READ_FAIL;
}
- if (!unfiltered_sample_ct) {
+ if ((!unfiltered_sample_ct) && (!allow_no_samples)) {
logerrprint("Error: Nobody in .fam file.\n");
goto load_fam_ret_INVALID_FORMAT;
}
@@ -4006,6 +4140,8 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
uint32_t snpid_chr = (misc_flags / MISC_OXFORD_SNPID_CHR) & 1;
uint32_t allow_extra_chroms = (misc_flags / MISC_ALLOW_EXTRA_CHROMS) & 1;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
uint32_t sample_ct = 0;
uint32_t col_ct = 3;
uint32_t is_binary_pheno = 0;
@@ -4338,7 +4474,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
}
sample_ct++;
}
- if (!sample_ct) {
+ if ((!sample_ct) && (!allow_no_samples)) {
logerrprint("Error: No samples in .sample file.\n");
goto oxford_to_bed_ret_INVALID_FORMAT;
}
@@ -4458,156 +4594,158 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
if (putc_checked('\n', outfile_bim)) {
goto oxford_to_bed_ret_WRITE_FAIL;
}
- cur_word = 0;
- shiftval = 0;
- ulptr = writebuf;
- bufptr = skip_initial_spaces(&(bufptr4[1]));
- for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
- if (is_eoln_kns(*bufptr)) {
- goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
- }
- // fast handling of common cases
- cc = bufptr[1];
- if ((cc == ' ') || (cc == '\t')) {
- cc = bufptr[3];
- cc2 = bufptr[5];
- if (((cc == ' ') || (cc == '\t')) && ((cc2 == ' ') || (cc2 == '\t'))) {
- cc = *bufptr;
- if (cc == '0') {
- bufptr2 = &(bufptr[2]);
- cc = *bufptr2;
- cc2 = bufptr2[2];
+ if (sample_ct) {
+ cur_word = 0;
+ shiftval = 0;
+ ulptr = writebuf;
+ bufptr = skip_initial_spaces(bufptr4);
+ for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
+ if (is_eoln_kns(*bufptr)) {
+ goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
+ }
+ // fast handling of common cases
+ cc = bufptr[1];
+ if ((cc == ' ') || (cc == '\t')) {
+ cc = bufptr[3];
+ cc2 = bufptr[5];
+ if (((cc == ' ') || (cc == '\t')) && ((cc2 == ' ') || (cc2 == '\t'))) {
+ cc = *bufptr;
if (cc == '0') {
- if (cc2 == '1') {
- ulii = 3;
- } else if (cc2 == '0') {
- ulii = 1;
+ bufptr2 = &(bufptr[2]);
+ cc = *bufptr2;
+ cc2 = bufptr2[2];
+ if (cc == '0') {
+ if (cc2 == '1') {
+ ulii = 3;
+ } else if (cc2 == '0') {
+ ulii = 1;
+ } else {
+ // could be a space...
+ goto oxford_to_bed_full_parse_2;
+ }
+ } else if ((cc == '1') && (cc2 == '0')) {
+ ulii = 2;
} else {
- // could be a space...
goto oxford_to_bed_full_parse_2;
}
- } else if ((cc == '1') && (cc2 == '0')) {
- ulii = 2;
+ } else if ((cc == '1') && (bufptr[2] == '0') && (bufptr[4] == '0')) {
+ ulii = 0;
} else {
- goto oxford_to_bed_full_parse_2;
+ goto oxford_to_bed_full_parse;
}
- } else if ((cc == '1') && (bufptr[2] == '0') && (bufptr[4] == '0')) {
- ulii = 0;
+ bufptr = &(bufptr[6]);
} else {
goto oxford_to_bed_full_parse;
}
- bufptr = &(bufptr[6]);
} else {
- goto oxford_to_bed_full_parse;
- }
- } else {
- // okay, gotta do things the slow way
- oxford_to_bed_full_parse:
- bufptr2 = token_endnn(bufptr);
- oxford_to_bed_full_parse_2:
- bufptr2 = skip_initial_spaces(bufptr2);
- if (is_eoln_kns(*bufptr2)) {
- goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
- }
- bufptr3 = token_endnn(bufptr2);
- dzz = strtod(bufptr3, &bufptr4);
- if (!is_randomized) {
- if (dzz >= hard_call_floor) {
- ulii = 3;
- } else {
- if (bufptr3 == bufptr4) {
- goto oxford_to_bed_ret_INVALID_DOSAGE;
- }
- dyy = strtod(bufptr2, &bufptr3);
- if (dyy >= hard_call_floor) {
- ulii = 2;
+ // okay, gotta do things the slow way
+ oxford_to_bed_full_parse:
+ bufptr2 = token_endnn(bufptr);
+ oxford_to_bed_full_parse_2:
+ bufptr2 = skip_initial_spaces(bufptr2);
+ if (is_eoln_kns(*bufptr2)) {
+ goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
+ }
+ bufptr3 = token_endnn(bufptr2);
+ dzz = strtod(bufptr3, &bufptr4);
+ if (!is_randomized) {
+ if (dzz >= hard_call_floor) {
+ ulii = 3;
} else {
- if (bufptr2 == bufptr3) {
+ if (bufptr3 == bufptr4) {
goto oxford_to_bed_ret_INVALID_DOSAGE;
}
- dxx = strtod(bufptr, &bufptr2);
- if (dxx >= hard_call_floor) {
- ulii = 0;
+ dyy = strtod(bufptr2, &bufptr3);
+ if (dyy >= hard_call_floor) {
+ ulii = 2;
} else {
- if (bufptr == bufptr2) {
+ if (bufptr2 == bufptr3) {
goto oxford_to_bed_ret_INVALID_DOSAGE;
}
- ulii = 1;
+ dxx = strtod(bufptr, &bufptr2);
+ if (dxx >= hard_call_floor) {
+ ulii = 0;
+ } else {
+ if (bufptr == bufptr2) {
+ goto oxford_to_bed_ret_INVALID_DOSAGE;
+ }
+ ulii = 1;
+ }
}
}
- }
- } else {
- drand = rand_unif();
- if (drand < dzz) {
- ulii = 3;
} else {
- if (bufptr3 == bufptr4) {
- goto oxford_to_bed_ret_INVALID_DOSAGE;
- }
- dyy = strtod(bufptr2, &bufptr3) + dzz;
- if (drand < dyy) {
- ulii = 2;
+ drand = rand_unif();
+ if (drand < dzz) {
+ ulii = 3;
} else {
- if (bufptr2 == bufptr3) {
+ if (bufptr3 == bufptr4) {
goto oxford_to_bed_ret_INVALID_DOSAGE;
}
- dxx = strtod(bufptr, &bufptr2) + dyy;
+ dyy = strtod(bufptr2, &bufptr3) + dzz;
if (drand < dyy) {
- ulii = 0;
- } else if (dxx < 1 - D_EPSILON) {
- ulii = 1;
+ ulii = 2;
} else {
- // fully called genotype probabilities may add up to less
- // than one due to rounding error. If this appears to have
- // happened, do NOT make a missing call; instead rescale
- // everything to add to one and reinterpret the random
- // number. (D_EPSILON is currently set to make 4 decimal
- // place precision safe to use.)
- drand *= dxx;
- if (drand < dzz) {
- ulii = 3;
- } else if (drand < dyy) {
- ulii = 2;
- } else {
+ if (bufptr2 == bufptr3) {
+ goto oxford_to_bed_ret_INVALID_DOSAGE;
+ }
+ dxx = strtod(bufptr, &bufptr2) + dyy;
+ if (drand < dyy) {
ulii = 0;
+ } else if (dxx < 1 - D_EPSILON) {
+ ulii = 1;
+ } else {
+ // fully called genotype probabilities may add up to less
+ // than one due to rounding error. If this appears to have
+ // happened, do NOT make a missing call; instead rescale
+ // everything to add to one and reinterpret the random
+ // number. (D_EPSILON is currently set to make 4 decimal
+ // place precision safe to use.)
+ drand *= dxx;
+ if (drand < dzz) {
+ ulii = 3;
+ } else if (drand < dyy) {
+ ulii = 2;
+ } else {
+ ulii = 0;
+ }
}
}
}
}
+ bufptr = skip_initial_spaces(bufptr4);
+ }
+ cur_word |= ulii << shiftval;
+ shiftval += 2;
+ if (shiftval == BITCT) {
+ *ulptr++ = cur_word;
+ cur_word = 0;
+ shiftval = 0;
}
- bufptr = skip_initial_spaces(bufptr4);
}
- cur_word |= ulii << shiftval;
- shiftval += 2;
- if (shiftval == BITCT) {
+ if (shiftval) {
*ulptr++ = cur_word;
- cur_word = 0;
- shiftval = 0;
}
- }
- if (shiftval) {
- *ulptr++ = cur_word;
- }
- if (identical_alleles) {
- // keep missing calls, but convert hom/het A1 to hom A2.
- for (ulptr = writebuf; ulptr < (&(writebuf[sample_ctl2])); ulptr++) {
- ulii = *ulptr;
- *ulptr = ((~ulii) << 1) | ulii | FIVEMASK;
+ if (identical_alleles) {
+ // keep missing calls, but convert hom/het A1 to hom A2.
+ for (ulptr = writebuf; ulptr < (&(writebuf[sample_ctl2])); ulptr++) {
+ ulii = *ulptr;
+ *ulptr = ((~ulii) << 1) | ulii | FIVEMASK;
+ }
+ if (sample_ct % 4) {
+ writebuf[sample_ctl2 - 1] &= (ONELU << (2 * (sample_ct % BITCT2))) - ONELU;
+ }
}
- if (sample_ct % 4) {
- writebuf[sample_ctl2 - 1] &= (ONELU << (2 * (sample_ct % BITCT2))) - ONELU;
+ if (fwrite_checked(writebuf, sample_ct4, outfile)) {
+ goto oxford_to_bed_ret_WRITE_FAIL;
}
}
- if (fwrite_checked(writebuf, sample_ct4, outfile)) {
- goto oxford_to_bed_ret_WRITE_FAIL;
- }
marker_ct++;
if (!(marker_ct % 1000)) {
printf("\r--data: %uk variants converted.", marker_ct / 1000);
fflush(stdout);
}
}
- if (!marker_ct) {
+ if ((!marker_ct) && (!allow_no_variants)) {
logerrprint("Error: Empty .gen file.\n");
goto oxford_to_bed_ret_INVALID_FORMAT;
}
@@ -4623,7 +4761,6 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
loadbuf = (char*)wkspace_base;
loadbuf_size = wkspace_left;
if (loadbuf_size > MAXLINEBUFLEN) {
- // halve the limit since there are two alleles
loadbuf_size = MAXLINEBUFLEN;
} else if (loadbuf_size < 3 * 65536) {
goto oxford_to_bed_ret_NOMEM;
@@ -4636,8 +4773,8 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
goto oxford_to_bed_ret_INVALID_FORMAT;
}
raw_marker_ct = uint_arr[2];
- if (!raw_marker_ct) {
- logerrprint("Error: .bgen file contains no markers.\n");
+ if ((!raw_marker_ct) && (!allow_no_variants)) {
+ logerrprint("Error: .bgen file contains no variants.\n");
goto oxford_to_bed_ret_INVALID_FORMAT;
}
if (uint_arr[3] != sample_ct) {
@@ -4687,6 +4824,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
goto oxford_to_bed_ret_INVALID_FORMAT;
}
if (bgen_multichar_alleles) {
+ // v1.1
if (fread(&usii, 1, 2, infile) < 2) {
goto oxford_to_bed_ret_READ_FAIL;
}
@@ -4791,6 +4929,9 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
fwrite(bufptr, 1, usjj, outfile_bim);
bufptr = uint32_writex(&(tbuf[3]), uint_arr[0], ' ');
fwrite(tbuf, 1, bufptr - tbuf, outfile_bim);
+
+ // halve the limit since there are two alleles
+ // (may want to enforce NON_WKSPACE_MIN allele length limit?)
if (uint_arr[1] >= loadbuf_size / 2) {
if (loadbuf_size < MAXLINEBUFLEN) {
goto oxford_to_bed_ret_NOMEM;
@@ -4829,6 +4970,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
}
}
} else {
+ // v1.0
uii = 0;
if (fread(&uii, 1, 1, infile) < 1) {
goto oxford_to_bed_ret_READ_FAIL;
@@ -4842,8 +4984,10 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
ii = ((unsigned char)(loadbuf[2 * uii + 2]));
if (ii > 24) {
if (ii == 255) {
+ // unknown
ii = 0;
} else if (ii > 252) {
+ // XY or MT
ii = ii - 228;
} else {
logerrprint("Error: Invalid chromosome code in BGEN v1.0 file.\n");
@@ -4911,7 +5055,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
goto oxford_to_bed_ret_READ_FAIL;
}
if (uii > loadbuf_size) {
- if (loadbuf_size < MAXLINEBUFLEN / 2) {
+ if (loadbuf_size < MAXLINEBUFLEN) {
goto oxford_to_bed_ret_NOMEM;
}
logerrprint("Error: Excessively long compressed SNP block in .bgen file.\n");
@@ -5097,7 +5241,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
}
// side effect: initializes tbuf to first nonempty line of .map/.bim
-int32_t check_cm_col(FILE* bimfile, char* tbuf, uint32_t is_binary, uint32_t bufsize, uint32_t* gd_col_ptr, uintptr_t* line_idx_ptr) {
+int32_t check_cm_col(FILE* bimfile, char* tbuf, uint32_t is_binary, uint32_t allow_no_variants, uint32_t bufsize, uint32_t* cm_col_exists_ptr, uintptr_t* line_idx_ptr) {
uintptr_t line_idx = 0;
char* bufptr;
while (fgets(tbuf, bufsize, bimfile)) {
@@ -5112,14 +5256,14 @@ int32_t check_cm_col(FILE* bimfile, char* tbuf, uint32_t is_binary, uint32_t buf
return -1;
}
if (no_more_tokens_kns(next_token(bufptr))) {
- *gd_col_ptr = 0;
+ *cm_col_exists_ptr = 0;
} else {
- *gd_col_ptr = 1;
+ *cm_col_exists_ptr = 1;
}
return 0;
}
*line_idx_ptr = 0;
- return -1;
+ return allow_no_variants? 0 : -1;
}
int32_t incr_text_allele0(char cc, char* marker_alleles, uint32_t* marker_allele_cts) {
@@ -5242,19 +5386,20 @@ char* get_llstr(Ll_str* llptr, uint32_t allele_idx) {
static inline char* write_token_nt(char* read_ptr, FILE* outfile) {
// assumes read_ptr is at the beginning of an item to write
+ // nt = "no tab"
uint32_t slen = strlen_se(read_ptr);
fwrite(read_ptr, 1, slen, outfile);
- return skip_initial_spaces(&(read_ptr[slen + 1]));
+ return skip_initial_spaces(&(read_ptr[slen]));
}
static inline char* write_token(char* read_ptr, FILE* outfile) {
uint32_t slen = strlen_se(read_ptr);
fwrite(read_ptr, 1, slen, outfile);
putc('\t', outfile);
- return skip_initial_spaces(&(read_ptr[slen + 1]));
+ return skip_initial_spaces(&(read_ptr[slen]));
}
-int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char* outname, char* outname_end, FILE** mapfile_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_alleles_f, uint32_t map_is_unsorted, uint32_t fam_cols, uint32_t ped_col_skip_iid, uint32_t ped_col_skip, uint32_t gd_col, uint32_t* map_reverse, int64_t ped_size, char* missing_pheno_str) {
+int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char* outname, char* outname_end, FILE** mapfile_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_alleles_f, uint32_t map_is_unsorted, uint32_t fam_cols, uint32_t ped_col_skip_iid, uint32_t ped_col_skip, uint32_t cm_col_exists, uint32_t* map_reverse, int64_t ped_size, char* missing_pheno_str) {
// maintain allele counts and linked lists of observed alleles at FAR end of
// wkspace.
int32_t retval = 0;
@@ -5410,7 +5555,7 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
}
if ((*aptr1 == missing_geno) && (alen1 == 1)) {
if ((alen2 != 1) || (*aptr2 != missing_geno)) {
- goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_4;
+ goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_4;
}
marker_idx++;
continue;
@@ -5451,10 +5596,7 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
}
putchar('\r');
logprint(".ped scan complete (for binary autoconversion).\n");
- if (!sample_ct) {
- sprintf(logbuf, "Error: No %s in .ped file.\n", g_species_plural);
- goto ped_to_bed_multichar_allele_ret_INVALID_FORMAT_2;
- }
+ // sample_ct == 0 impossible
if (fclose_null(outfile_ptr)) {
goto ped_to_bed_multichar_allele_ret_WRITE_FAIL;
}
@@ -5535,7 +5677,7 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
putc('\t', outfile);
bufptr = skip_initial_spaces(&(bufptr[uii + 1]));
bufptr = write_token(bufptr, outfile);
- if (gd_col) {
+ if (cm_col_exists) {
ucc = (unsigned char)(*bufptr);
// should be good enough at detecting nonnumeric values...
if (((ucc >= '0') && (ucc <= '9')) || (ucc == '-') || (ucc == '+')) {
@@ -5788,13 +5930,15 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
uintptr_t marker_ct = 0;
uintptr_t sample_ct = 0;
uint32_t allow_extra_chroms = (misc_flags / MISC_ALLOW_EXTRA_CHROMS) & 1;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
uint32_t map_is_unsorted = 0;
int32_t last_chrom = 0;
uint32_t last_mpos = 0;
uint32_t ped_buflen = 1;
int32_t retval = 0;
- uint32_t ped_col_skip_iid = 1 + 2 * ((fam_cols & FAM_COL_34) / FAM_COL_34) + ((fam_cols & FAM_COL_5) / FAM_COL_5) + ((fam_cols & FAM_COL_6) / FAM_COL_6);
- uint32_t ped_col_skip = ped_col_skip_iid + ((fam_cols & FAM_COL_1) / FAM_COL_1);
+ uint32_t ped_col_skip_iid_m1 = ((fam_cols & FAM_COL_34) / (FAM_COL_34 / 2)) + ((fam_cols & FAM_COL_5) / FAM_COL_5) + ((fam_cols & FAM_COL_6) / FAM_COL_6);
+ uint32_t ped_col_skip = ped_col_skip_iid_m1 + 1 + ((fam_cols & FAM_COL_1) / FAM_COL_1);
uint32_t last_pass = 0;
int64_t* line_starts = NULL;
@@ -5814,7 +5958,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
uintptr_t line_idx;
uintptr_t sample_idx;
uintptr_t ulii;
- uint32_t cm_col;
+ uint32_t cm_col_exists;
uint32_t markers_per_pass;
uint32_t marker_start;
uint32_t marker_end;
@@ -5855,7 +5999,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
goto ped_to_bed_ret_OPEN_FAIL;
}
tbuf[MAXLINELEN - 6] = ' ';
- if (check_cm_col(mapfile, tbuf, 0, MAXLINELEN - 5, &cm_col, &line_idx)) {
+ if (check_cm_col(mapfile, tbuf, 0, allow_no_variants, MAXLINELEN - 5, &cm_col_exists, &line_idx)) {
if (line_idx) {
goto ped_to_bed_ret_MISSING_TOKENS_MAP;
} else {
@@ -5863,6 +6007,10 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
goto ped_to_bed_ret_INVALID_FORMAT;
}
}
+ if (!line_idx) {
+ // no variants
+ goto ped_to_bed_empty_map_with_allow_no_vars;
+ }
line_idx--;
do {
line_idx++;
@@ -5875,7 +6023,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
continue;
}
col2_ptr = next_token(col1_ptr);
- bufptr = next_token_mult(col2_ptr, 1 + cm_col);
+ bufptr = next_token_mult(col2_ptr, 1 + cm_col_exists);
if (no_more_tokens_kns(bufptr)) {
goto ped_to_bed_ret_MISSING_TOKENS_MAP;
}
@@ -5931,18 +6079,19 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
goto ped_to_bed_ret_READ_FAIL;
}
marker_ct = unfiltered_marker_ct - marker_exclude_ct;
- if (!marker_ct) {
+ if ((!marker_ct) && (!allow_no_variants)) {
logprint("Error: No variants in current analysis.\n");
goto ped_to_bed_ret_ALL_MARKERS_EXCLUDED;
}
+ ped_to_bed_empty_map_with_allow_no_vars:
marker_exclude = (uintptr_t*)wkspace_alloc(((unfiltered_marker_ct + (BITCT - 1)) / BITCT) * sizeof(intptr_t));
if (map_is_unsorted) {
- retval = load_sort_and_write_map(&map_reverse, mapfile, 3 + cm_col, outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, max_marker_id_len, 1, chrom_info_ptr);
+ retval = load_sort_and_write_map(&map_reverse, mapfile, 3 + cm_col_exists, outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, max_marker_id_len, 1, chrom_info_ptr);
if (retval) {
goto ped_to_bed_ret_1;
}
- cm_col = 1;
+ cm_col_exists = 1;
fclose_null(&mapfile);
}
// provisionally assume max_marker_allele_len == 1
@@ -6005,10 +6154,11 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
} else {
col2_ptr = col1_ptr;
}
- bufptr = next_token_mult(col2_ptr, ped_col_skip_iid);
+ bufptr = next_token_multz(col2_ptr, ped_col_skip_iid_m1);
if (no_more_tokens_kns(bufptr)) {
goto ped_to_bed_ret_MISSING_TOKENS_PED;
}
+ bufptr = token_endnn(bufptr);
if ((bufptr - col1_ptr) > (MAXLINELEN / 2) - 4) {
logprint("\n");
sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file has a pathologically long token.\n", line_idx);
@@ -6040,10 +6190,11 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
goto ped_to_bed_ret_WRITE_FAIL;
}
marker_idx = 0;
+ bufptr = skip_initial_spaces(bufptr);
for (marker_uidx = 0; marker_uidx < unfiltered_marker_ct; marker_uidx++) {
cc = *bufptr++;
if (!cc) {
- goto ped_to_bed_ret_MISSING_TOKENS_PED;
+ goto ped_to_bed_ret_MISSING_TOKENS_PED;
}
bufptr = skip_initial_spaces(bufptr);
cc2 = *bufptr++;
@@ -6077,6 +6228,10 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
// either multi-character alleles, or invalid format. Restart scan.
putchar('\r');
logstr("\n");
+ if (!marker_ct) {
+ sprintf(logbuf, "Error: Line %" PRIuPTR " of .ped file has more tokens than expected.\n", line_idx);
+ goto ped_to_bed_ret_INVALID_FORMAT_2;
+ }
logprint("Possibly irregular .ped line. Restarting scan, assuming multichar alleles.\n");
is_single_char_alleles = 0;
break;
@@ -6100,7 +6255,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
if (!feof(pedfile)) {
goto ped_to_bed_ret_READ_FAIL;
}
- if (!sample_ct) {
+ if ((!sample_ct) && (!allow_no_samples)) {
logprint("\n");
sprintf(logbuf, "Error: No %s in .ped file.\n", g_species_plural);
goto ped_to_bed_ret_INVALID_FORMAT_2;
@@ -6185,7 +6340,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
} else {
bufptr = write_token(bufptr, outfile);
bufptr = write_token(bufptr, outfile);
- if (cm_col) {
+ if (cm_col_exists) {
ucc = (unsigned char)(*bufptr);
if (((ucc >= '0') && (ucc <= '9')) || (ucc == '-') || (ucc == '+')) {
bufptr = write_token_nt(bufptr, outfile);
@@ -6221,7 +6376,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
if (wkspace_left >= marker_ct * sample_ct4) {
markers_per_pass = marker_ct;
sprintf(logbuf, "Performing single-pass .bed write (%" PRIuPTR " variant%s, %" PRIuPTR " %s).\n", marker_ct, (marker_ct == 1)? "" : "s", sample_ct, species_str(sample_ct));
- pass_ct = 1;
+ pass_ct = (marker_ct * sample_ct4)? 1 : 0;
} else {
if (!map_is_unsorted) {
if (wkspace_alloc_ll_checked(&line_starts, sample_ct * sizeof(int64_t))) {
@@ -6371,7 +6526,7 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
}
}
} else {
- retval = ped_to_bed_multichar_allele(&pedfile, &outfile, outname, outname_end, &mapfile, unfiltered_marker_ct, marker_exclude, marker_ct, marker_alleles_f, map_is_unsorted, fam_cols, ped_col_skip_iid, ped_col_skip, cm_col, map_reverse, ped_size, missing_pheno_str);
+ retval = ped_to_bed_multichar_allele(&pedfile, &outfile, outname, outname_end, &mapfile, unfiltered_marker_ct, marker_exclude, marker_ct, marker_alleles_f, map_is_unsorted, fam_cols, ped_col_skip_iid_m1 + 1, ped_col_skip, cm_col_exists, map_reverse, ped_size, missing_pheno_str);
if (retval) {
goto ped_to_bed_ret_1;
}
@@ -6422,13 +6577,14 @@ int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_en
return retval;
}
-int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, uint32_t lgen_modifier, char* lgen_reference_fname, Chrom_info* chrom_info_ptr) {
+int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, uint32_t lgen_modifier, char* lgen_reference_fname, Chrom_info* chrom_info_ptr) {
unsigned char* wkspace_mark = wkspace_base;
FILE* infile = NULL;
FILE* outfile = NULL;
- char* name_end = (char*)memchr(lgen_namebuf, 0, FNAMESIZE);
uint32_t lgen_allele_count = lgen_modifier & LGEN_ALLELE_COUNT;
uint32_t allow_extra_chroms = (misc_flags / MISC_ALLOW_EXTRA_CHROMS) & 1;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_vars = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
uint32_t affection_01 = (misc_flags / MISC_AFFECTION_01) & 1;
uint32_t map_cols = 3;
uintptr_t* marker_exclude = NULL;
@@ -6496,8 +6652,7 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
goto lgen_to_bed_ret_INVALID_CMDLINE;
}
- memcpy(name_end, ".map", 5);
- retval = load_map(&infile, lgen_namebuf, &map_cols, &unfiltered_marker_ct, &marker_exclude_ct, &max_marker_id_len, &marker_exclude, &marker_ids, chrom_info_ptr, &marker_pos, &map_is_unsorted, allow_extra_chroms);
+ retval = load_map(&infile, mapname, &map_cols, &unfiltered_marker_ct, &marker_exclude_ct, &max_marker_id_len, &marker_exclude, &marker_ids, chrom_info_ptr, &marker_pos, &map_is_unsorted, allow_extra_chroms, allow_no_vars);
if (retval) {
goto lgen_to_bed_ret_1;
}
@@ -6520,16 +6675,17 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
if (wkspace_alloc_ui_checked(&sample_id_map, unfiltered_marker_ct * sizeof(int32_t))) {
goto lgen_to_bed_ret_NOMEM;
}
- fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, sample_id_map);
- for (uii = 0; uii < marker_ct; uii++) {
- marker_id_map[uii] = sample_id_map[marker_id_map[uii]];
+ if (marker_ct) {
+ fill_uidx_to_idx(marker_exclude, unfiltered_marker_ct, marker_ct, sample_id_map);
+ for (uii = 0; uii < marker_ct; uii++) {
+ marker_id_map[uii] = sample_id_map[marker_id_map[uii]];
+ }
}
fclose_null(&infile);
memcpy(marker_ids, sorted_marker_ids, marker_ct * max_marker_id_len);
wkspace_reset(sorted_marker_ids);
- memcpy(name_end, ".fam", 5);
- retval = load_fam(lgen_namebuf, FAM_COL_13456, 1, missing_pheno, affection_01, &sample_ct, &sample_ids, &max_sample_id_len, &paternal_ids, &max_paternal_id_len, &maternal_ids, &max_maternal_id_len, &sex_nm, &sex_male, &affection, &pheno_nm, &pheno_c, &pheno_d, &founder_info, &sample_exclude);
+ retval = load_fam(famname, FAM_COL_13456, 1, missing_pheno, affection_01, &sample_ct, &sample_ids, &max_sample_id_len, &paternal_ids, &max_paternal_id_len, &maternal_ids, &max_maternal_id_len, &sex_nm, &sex_male, &affection, &pheno_nm, &pheno_c, &pheno_d, &founder_info, &sample_exclude, allow_no_samples);
if (retval) {
goto lgen_to_bed_ret_1;
}
@@ -6644,8 +6800,7 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
if (fwrite_checked("l\x1b\x01", 3, outfile)) {
goto lgen_to_bed_ret_WRITE_FAIL;
}
- memcpy(name_end, ".lgen", 6);
- if (fopen_checked(&infile, lgen_namebuf, "r")) {
+ if (fopen_checked(&infile, lgenname, "r")) {
goto lgen_to_bed_ret_OPEN_FAIL;
}
if (fseeko(infile, 0, SEEK_END)) {
@@ -6830,14 +6985,11 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
if (ii != -1) {
marker_idx = marker_id_map[(uint32_t)ii];
a1len = strlen_se(a1ptr);
- ucc = (unsigned char)(*a1ptr);
- if ((a1len != 1) || (ucc < 48) || (ucc > 50)) {
+ uii = ((uint32_t)((unsigned char)(*a1ptr))) - 48;
+ if ((a1len != 1) || (uii > 2)) {
uii = 1;
- } else {
- uii = ucc - 48;
- if (uii) {
- uii++;
- }
+ } else if (uii) {
+ uii++;
}
ulii = marker_idx * sample_ct4 + (sample_idx / 4);
ujj = (sample_idx % 4) * 2;
@@ -6887,8 +7039,7 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
goto lgen_to_bed_ret_OPEN_FAIL;
}
} else {
- memcpy(name_end, ".map", 5);
- if (fopen_checked(&infile, lgen_namebuf, "r")) {
+ if (fopen_checked(&infile, mapname, "r")) {
goto lgen_to_bed_ret_OPEN_FAIL;
}
}
@@ -6941,13 +7092,12 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
if (fclose_null(&outfile)) {
goto lgen_to_bed_ret_WRITE_FAIL;
}
- memcpy(name_end, ".fam", 5);
memcpy(outname_end, ".fam", 5);
#ifdef _WIN32
- uii = GetFullPathName(lgen_namebuf, FNAMESIZE, tbuf, NULL);
+ uii = GetFullPathName(famname, FNAMESIZE, tbuf, NULL);
if ((!uii) || (uii > FNAMESIZE))
#else
- if (!realpath(lgen_namebuf, tbuf))
+ if (!realpath(famname, tbuf))
#endif
{
LOGERRPRINTFWW("Error: Failed to open %s.\n", outname);
@@ -6961,7 +7111,7 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
if (!(cptr && (!strcmp(tbuf, &(tbuf[FNAMESIZE + 64])))))
#endif
{
- if (fopen_checked(&infile, lgen_namebuf, "r")) {
+ if (fopen_checked(&infile, famname, "r")) {
goto lgen_to_bed_ret_OPEN_FAIL;
}
if (fopen_checked(&outfile, outname, "w")) {
@@ -7022,7 +7172,8 @@ int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_
retval = RET_INVALID_FORMAT;
break;
lgen_to_bed_ret_NOT_BIALLELIC:
- LOGERRPRINTFWW("Error: Variant '%s' in .lgen file has 3+ different alleles.\n", id_buf);
+ *cptr4 = '\0';
+ LOGERRPRINTFWW("Error: Variant '%s' in .lgen file has 3+ different alleles.\n", cptr3);
retval = RET_INVALID_FORMAT;
break;
lgen_to_bed_ret_INVALID_CMDLINE:
@@ -7094,6 +7245,8 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
uintptr_t sample_ct = 0;
uintptr_t line_idx = 0;
uint32_t no_extra_cols = 1;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
int32_t retval = 0;
uint32_t pct = 0;
uint32_t map_is_unsorted = 0;
@@ -7189,7 +7342,7 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
if (!feof(infile)) {
goto transposed_to_bed_ret_READ_FAIL;
}
- if (!sample_ct) {
+ if ((!sample_ct) && (!allow_no_samples)) {
sprintf(logbuf, "Error: No %s in .tfam file.\n", g_species_plural);
goto transposed_to_bed_ret_INVALID_FORMAT_2R;
}
@@ -7222,9 +7375,6 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
goto transposed_to_bed_ret_WRITE_FAIL;
}
- // given e.g. 6MB indels in real datasets, there's legitimate reason for a
- // .tped line to be even longer than 2GB, so we use ftoken_...() over
- // fgets().
if (fopen_checked(&infile, tpedname, "r")) {
goto transposed_to_bed_ret_OPEN_FAIL;
}
@@ -7246,7 +7396,9 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
break;
}
// assume first four fields are within MAXLINELEN characters, but after
- // that, anything goes
+ // that, anything goes. given e.g. 6MB indels in real datasets, there's
+ // legitimate reason for a .tped line to be even longer than 2GB, so we use
+ // a custom loading loop.
cptr = skip_initial_spaces(tbuf);
if (is_eoln_kns(*cptr)) {
if (!tbuf[MAXLINELEN - 1]) {
@@ -7355,13 +7507,13 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
}
axptr = cptr2;
axlen = strlen_se(cptr2);
+ if (!axlen) {
+ goto transposed_to_bed_ret_MISSING_TOKENS;
+ }
cptr2 = &(axptr[axlen]);
// only way for this to happen if it isn't at end of buffer is if we're
// at EOF, which is an error anyway
if (!(*cptr2)) {
- if (!axlen) {
- goto transposed_to_bed_ret_MISSING_TOKENS;
- }
cptr3 = memcpya(allele_buf, axptr, axlen);
axptr = allele_buf;
do {
@@ -7605,6 +7757,11 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
if (fclose_null(&outfile)) {
goto transposed_to_bed_ret_WRITE_FAIL;
}
+ if ((!marker_ct) && (!allow_no_variants)) {
+ fputs("\b\b\b\b\b \r", stdout);
+ logerrprint("Error: Empty .tped file.\n");
+ goto transposed_to_bed_ret_INVALID_FORMAT;
+ }
chrom_info_ptr->zero_extra_chroms = 0;
if (map_is_unsorted) {
@@ -7846,7 +8003,7 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
} while (bufptr2);
}
}
- do {
+ while (((unsigned char)bufptr[0]) >= ' ') {
sample_ct++;
bufptr2 = strchr(bufptr, '\t');
if (bufptr2) {
@@ -7914,14 +8071,10 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
break;
}
bufptr = &(bufptr2[1]);
- } while (((unsigned char)bufptr[0]) > ' ');
+ }
if (fclose_null(&outfile)) {
goto vcf_sample_line_ret_WRITE_FAIL;
}
- if (!sample_ct) {
- sprintf(logbuf, "Error: No samples in .%ccf file.\n", flag_char);
- goto vcf_sample_line_ret_INVALID_FORMAT_2;
- }
*sample_ct_ptr = sample_ct;
while (0) {
vcf_sample_line_ret_OPEN_FAIL:
@@ -7997,14 +8150,18 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
uintptr_t fexcept_ct = 0;
uintptr_t max_fexcept_len = 5;
uintptr_t sample_ct = 0;
+ uintptr_t marker_skip_ct = 0;
+ uintptr_t missing_gt_ct = 0;
uint32_t double_id = (misc_flags / MISC_DOUBLE_ID) & 1;
uint32_t check_qual = (vcf_min_qual != -1);
uint32_t allow_extra_chroms = (misc_flags / MISC_ALLOW_EXTRA_CHROMS) & 1;
uint32_t biallelic_only = (misc_flags / MISC_BIALLELIC_ONLY) & 1;
uint32_t biallelic_strict = (misc_flags / MISC_BIALLELIC_ONLY_STRICT) & 1;
uint32_t skip3_list = (misc_flags / MISC_BIALLELIC_ONLY_LIST) & 1;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
+ uint32_t require_gt = (misc_flags / MISC_VCF_REQUIRE_GT) & 1;
uint32_t marker_ct = 0;
- uint32_t marker_skip_ct = 0;
uint32_t gq_field_pos = 0;
uint32_t gp_field_pos = 0;
uint32_t vcf_half_call_explicit_error = (vcf_half_call == VCF_HALF_CALL_ERROR);
@@ -8119,13 +8276,23 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
goto vcf_to_bed_ret_INVALID_FORMAT;
}
bufptr = &(bufptr[38]);
- if (memcmp(bufptr, "\tFORMAT\t", 8) || (((unsigned char)bufptr[8]) <= ' ')) {
- logerrprint("Error: No genotype data in .vcf file.\n");
- goto vcf_to_bed_ret_INVALID_FORMAT;
+ if (!memcmp(bufptr, "\tFORMAT\t", 8)) {
+ retval = vcf_sample_line(outname, outname_end, missing_pheno, &(bufptr[8]), const_fid, double_id, id_delim, vcf_idspace_to, 'v', &sample_ct);
+ if (retval) {
+ goto vcf_to_bed_ret_1;
+ }
+ } else if (allow_no_samples) {
+ memcpy(outname_end, ".fam", 5);
+ if (fopen_checked(&outfile, outname, "w")) {
+ goto vcf_to_bed_ret_OPEN_FAIL;
+ }
+ if (fclose_null(&outfile)) {
+ goto vcf_to_bed_ret_WRITE_FAIL;
+ }
}
- retval = vcf_sample_line(outname, outname_end, missing_pheno, &(bufptr[8]), const_fid, double_id, id_delim, vcf_idspace_to, 'v', &sample_ct);
- if (retval) {
- goto vcf_to_bed_ret_1;
+ if ((!sample_ct) && (!allow_no_samples)) {
+ logerrprint("Error: No samples in .vcf file.\n");
+ goto vcf_to_bed_ret_INVALID_FORMAT;
}
sample_ct4 = (sample_ct + 3) / 4;
sample_ctl2 = (sample_ct + BITCT2 - 1) / BITCT2;
@@ -8292,6 +8459,10 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
}
bufptr2[-1] = '\t';
}
+ if (!sample_ct) {
+ alt_allele_idx = 1;
+ goto vcf_to_bed_skip_genotype_write;
+ }
bufptr = bufptr2;
bufptr2 = strchr(bufptr, '\t');
if (!bufptr2) {
@@ -8303,8 +8474,16 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
goto vcf_to_bed_ret_MISSING_TOKENS;
}
if (memcmp(bufptr, "GT", 2)) {
- marker_skip_ct++;
- continue;
+ // We previously always skipped this case, but that's inconsistent with
+ // how we now handle zero-sample VCFs.
+ if (require_gt) {
+ marker_skip_ct++;
+ continue;
+ }
+ fill_vec_55(base_bitfields, sample_ct);
+ missing_gt_ct++;
+ alt_allele_idx = 1;
+ goto vcf_to_bed_genotype_write;
}
bufptr2++;
if (vcf_min_gq != -1) {
@@ -8711,9 +8890,11 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
*ref_ptr++ = (uljj & ulii) | (((~ulkk) >> 1) & FIVEMASK);
}
ref_ptr[-1] &= final_mask;
+ vcf_to_bed_genotype_write:
if (fwrite_checked(base_bitfields, sample_ct4, outfile)) {
goto vcf_to_bed_ret_WRITE_FAIL;
}
+ vcf_to_bed_skip_genotype_write:
chrom_ptr[chrom_len] = '\0';
fputs(chrom_ptr, bimfile);
putc('\t', bimfile);
@@ -8775,10 +8956,23 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
}
}
putchar('\r');
+ if ((!marker_ct) && (!allow_no_variants)) {
+ if (marker_skip_ct) {
+ logerrprint("Error: All variants in VCF skipped.\n");
+ retval = RET_ALL_MARKERS_EXCLUDED;
+ goto vcf_to_bed_ret_1;
+ } else {
+ logerrprint("Error: No variants in VCF file.\n");
+ goto vcf_to_bed_ret_INVALID_FORMAT;
+ }
+ }
*outname_end = '\0';
LOGPRINTFWW("--vcf: %s.bed + %s.bim + %s.fam written.\n", outname, outname, outname);
if (marker_skip_ct) {
- LOGPRINTF("(%u variant%s skipped.)\n", marker_skip_ct, (marker_skip_ct == 1)? "" : "s");
+ LOGPRINTF("(%" PRIuPTR " variant%s skipped.)\n", marker_skip_ct, (marker_skip_ct == 1)? "" : "s");
+ }
+ if (missing_gt_ct) {
+ LOGERRPRINTF("Warning: %" PRIuPTR " variant record%s had no GT field.\n", missing_gt_ct, (missing_gt_ct == 1)? "" : "s");
}
while (0) {
vcf_to_bed_ret_NOMEM:
@@ -8834,44 +9028,47 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
return retval;
}
-int32_t read_bcf_typed_integer(gzFile gz_infile, uint32_t* int_ptr) {
+int32_t read_bcf_typed_nonnegative_integer(gzFile gz_infile, uint32_t* int_ptr) {
+ // errors out on missing and negative values.
int32_t retval = 0;
int32_t ii = gzgetc(gz_infile);
uint32_t uii;
if (ii == -1) {
- goto read_bcf_typed_integer_ret_READ_OR_FORMAT_FAIL;
+ goto read_bcf_typed_nonnegative_integer_ret_READ_OR_FORMAT_FAIL;
}
if (ii == 0x11) {
ii = gzgetc(gz_infile);
- if (ii == -1) {
- goto read_bcf_typed_integer_ret_READ_OR_FORMAT_FAIL;
- } else if (((uint32_t)ii) > 127) {
- goto read_bcf_typed_integer_ret_INVALID_FORMAT_GENERIC;
+ if (((uint32_t)ii) > 127) {
+ if (ii == -1) {
+ goto read_bcf_typed_nonnegative_integer_ret_READ_OR_FORMAT_FAIL;
+ }
+ goto read_bcf_typed_nonnegative_integer_ret_INVALID_FORMAT_GENERIC;
}
*int_ptr = (uint32_t)ii;
} else if (ii == 0x12) {
uii = gzgetc(gz_infile);
ii = gzgetc(gz_infile);
- if (ii == -1) {
- goto read_bcf_typed_integer_ret_READ_OR_FORMAT_FAIL;
- } else if (((uint32_t)ii) > 127) {
- goto read_bcf_typed_integer_ret_INVALID_FORMAT_GENERIC;
+ if (((uint32_t)ii) > 127) {
+ if (ii == -1) {
+ goto read_bcf_typed_nonnegative_integer_ret_READ_OR_FORMAT_FAIL;
+ }
+ goto read_bcf_typed_nonnegative_integer_ret_INVALID_FORMAT_GENERIC;
}
*int_ptr = uii | (((uint32_t)ii) << 8);
} else if (ii == 0x13) {
if (gzread(gz_infile, int_ptr, 4) < 4) {
- goto read_bcf_typed_integer_ret_READ_OR_FORMAT_FAIL;
+ goto read_bcf_typed_nonnegative_integer_ret_READ_OR_FORMAT_FAIL;
}
} else {
- goto read_bcf_typed_integer_ret_INVALID_FORMAT_GENERIC;
+ goto read_bcf_typed_nonnegative_integer_ret_INVALID_FORMAT_GENERIC;
}
while (0) {
- read_bcf_typed_integer_ret_READ_OR_FORMAT_FAIL:
+ read_bcf_typed_nonnegative_integer_ret_READ_OR_FORMAT_FAIL:
if (!gzeof(gz_infile)) {
retval = RET_READ_FAIL;
break;
}
- read_bcf_typed_integer_ret_INVALID_FORMAT_GENERIC:
+ read_bcf_typed_nonnegative_integer_ret_INVALID_FORMAT_GENERIC:
logerrprint("Error: Improperly formatted .bcf file.\n");
retval = RET_INVALID_FORMAT;
break;
@@ -8888,7 +9085,7 @@ int32_t read_bcf_typed_string(gzFile gz_infile, char* readbuf, uint32_t maxlen,
goto read_bcf_typed_string_ret_READ_OR_FORMAT_FAIL;
}
if (((uint32_t)ii) == 0xf7) {
- retval = read_bcf_typed_integer(gz_infile, &slen);
+ retval = read_bcf_typed_nonnegative_integer(gz_infile, &slen);
if (retval) {
goto read_bcf_typed_string_ret_1;
}
@@ -8940,6 +9137,8 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
uintptr_t max_contig_len = 0;
uintptr_t max_fexcept_len = 0;
uintptr_t fexcept_ct = 0;
+ uintptr_t marker_skip_ct = 0;
+ uintptr_t missing_gt_ct = 0;
uintptr_t topsize = 0;
uint32_t double_id = (misc_flags / MISC_DOUBLE_ID) & 1;
uint32_t check_qual = (vcf_min_qual != -1);
@@ -8948,11 +9147,13 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
uint32_t biallelic_strict = (misc_flags / MISC_BIALLELIC_ONLY_STRICT) & 1;
uint32_t skip3_list = (misc_flags / MISC_BIALLELIC_ONLY_LIST) & 1;
uint32_t vcf_filter = (misc_flags / MISC_VCF_FILTER) & 1;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
+ uint32_t require_gt = (misc_flags / MISC_VCF_REQUIRE_GT) & 1;
+ uint32_t sample_ct = 0;
uint32_t stringdict_ct = 1;
uint32_t gt_idx = 0;
uint32_t marker_ct = 0;
- uint32_t marker_skip_ct = 0;
- uint32_t sample_ct = 0;
uint32_t umm = 0;
int32_t retval = 0;
float vcf_min_qualf = vcf_min_qual;
@@ -9015,7 +9216,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
if (memcmp(tbuf, "BCF\4", 4)) {
LOGPREPRINTFWW("Error: %s is not a BCF2 file.\n", bcfname);
} else {
- LOGPREPRINTFWW("Error: %s appears to be a BCF1 file; --bcf only supports BCF2. Use 'bcftools view' to convert to a readable VCF.\n", bcfname);
+ LOGPREPRINTFWW("Error: %s appears to be a BCF1 file; --bcf only supports BCF2. Use 'bcftools view' to convert it to a PLINK-readable VCF.\n", bcfname);
}
goto bcf_to_bed_ret_INVALID_FORMAT_2;
}
@@ -9028,8 +9229,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
if (gzread(gz_infile, &header_size, 4) < 4) {
goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
}
- // must have at least fileformat, GT, and one contig
- if (header_size < 96) {
+ // must have at least fileformat, and first eight fields of #CHROM line. GT
+ // not required with --allow-no-samples, contig not required with
+ // --allow-no-vars.
+ if (header_size < 59) {
goto bcf_to_bed_ret_INVALID_FORMAT_GENERIC;
}
if (vcf_filter_exceptions_flattened) {
@@ -9140,23 +9343,39 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
goto bcf_to_bed_ret_INVALID_FORMAT_GENERIC;
}
}
- if (!gt_idx) {
- logerrprint("Error: No GT field in .bcf header.\n");
- goto bcf_to_bed_ret_INVALID_FORMAT;
- }
- if (!contig_ct) {
+ if ((!allow_no_variants) && (!contig_ct)) {
logerrprint("Error: No contig fields in .bcf header.\n");
goto bcf_to_bed_ret_INVALID_FORMAT;
}
- if (memcmp(linebuf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", 46)) {
+ if (memcmp(linebuf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", 38)) {
goto bcf_to_bed_ret_INVALID_FORMAT_GENERIC;
}
- *linebuf_end = '\0';
- retval = vcf_sample_line(outname, outname_end, missing_pheno, &(linebuf[46]), const_fid, double_id, id_delim, vcf_idspace_to, 'b', &ulii);
- if (retval) {
- goto bcf_to_bed_ret_1;
+ if (!memcmp(&(linebuf[38]), "\tFORMAT\t", 8)) {
+ *linebuf_end = '\0';
+ retval = vcf_sample_line(outname, outname_end, missing_pheno, &(linebuf[46]), const_fid, double_id, id_delim, vcf_idspace_to, 'b', &ulii);
+ if (retval) {
+ goto bcf_to_bed_ret_1;
+ }
+ if (ulii >= 0x1000000) {
+ // variant records only have 24 bits allocated for n_sample
+ logerrprint("Error: .bcf file contains >= 2^24 sample IDs.\n");
+ goto bcf_to_bed_ret_INVALID_FORMAT;
+ }
+ sample_ct = ulii;
+ } else if (allow_no_samples) {
+ gt_idx = 0;
+ memcpy(outname_end, ".fam", 5);
+ if (fopen_checked(&outfile, outname, "w")) {
+ goto bcf_to_bed_ret_OPEN_FAIL;
+ }
+ if (fclose_null(&outfile)) {
+ goto bcf_to_bed_ret_WRITE_FAIL;
+ }
+ }
+ if ((!sample_ct) && (!allow_no_samples)) {
+ logerrprint("Error: No samples in .bcf file.\n");
+ goto bcf_to_bed_ret_INVALID_FORMAT;
}
- sample_ct = ulii;
sample_ct4 = (sample_ct + 3) / 4;
sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
@@ -9169,7 +9388,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
fill_ulong_zero(contig_bitfield, ulii);
ulii = contig_ct;
- do {
+ while (ulii) {
ulii--;
ii = get_chrom_code(chrom_info_ptr, contig_list->ss);
if (ii < 0) {
@@ -9186,7 +9405,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
strcpy(&(contigdict[ulii * max_contig_len]), contig_list->ss);
}
contig_list = contig_list->next;
- } while (ulii);
+ }
if (vcf_filter) {
uii = (stringdict_ct + (BITCT - 1)) / BITCT;
if (wkspace_alloc_ul_checked(&fexcept_bitfield, uii * sizeof(intptr_t))) {
@@ -9232,6 +9451,18 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
if (fwrite_checked("l\x1b\x01", 3, outfile)) {
goto bcf_to_bed_ret_WRITE_FAIL;
}
+ if ((!gt_idx) && require_gt) {
+ if (!allow_no_variants) {
+ logerrprint("Error: .bcf header doesn't define FORMAT:GT.\n");
+ retval = RET_ALL_MARKERS_EXCLUDED;
+ goto bcf_to_bed_ret_1;
+ }
+ logerrprint("Warning: Skipping all variants since .bcf header doesn't define FORMAT:GT.\n");
+ goto bcf_to_bed_skip_all_variants;
+ }
+ // possible todo: optimize other no-GT cases. e.g. if no sample information
+ // is needed, don't write the .bed or .fam.
+
memcpyl3(tbuf2, "\t0\t");
while (1) {
lastloc = gztell(gz_infile) + 8;
@@ -9258,10 +9489,6 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
goto bcf_to_bed_ret_1;
}
n_allele = bcf_var_header[6] >> 16;
- if (!n_allele) {
- // skip instead of error out on zero alleles?
- goto bcf_to_bed_marker_skip;
- }
if (biallelic_strict && (n_allele > 2)) {
goto bcf_to_bed_skip3;
}
@@ -9270,18 +9497,30 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
ujj = NON_WKSPACE_MIN; // remaining allele name buffer space
bufptr = allele_buf;
- for (uii = 0; uii < n_allele; uii++) {
- retval = read_bcf_typed_string(gz_infile, bufptr, ujj, &ukk);
- if (retval) {
- goto bcf_to_bed_ret_1;
- }
- if ((!uii) && (!ukk)) {
- // skip instead of error out on missing ref allele?
- goto bcf_to_bed_marker_skip;
+ if (n_allele) {
+ for (uii = 0; uii < n_allele; uii++) {
+ retval = read_bcf_typed_string(gz_infile, bufptr, ujj, &ukk);
+ if (retval) {
+ goto bcf_to_bed_ret_1;
+ }
+ if ((!uii) && ((!ukk) || ((ukk == 1) && (*bufptr == 'N')))) {
+ // convert ref 'N' or '.' to missing genotype. ('.' case was skipped
+ // in the past, and 'N' was not converted.)
+ allele_lens[0] = 1;
+ allele_ptrs[0] = bufptr;
+ *bufptr++ = missing_geno;
+ } else {
+ allele_lens[uii] = ukk;
+ allele_ptrs[uii] = bufptr;
+ bufptr = &(bufptr[ukk]);
+ }
}
- allele_lens[uii] = ukk;
- allele_ptrs[uii] = bufptr;
- bufptr = &(bufptr[ukk]);
+ } else {
+ // n_allele == 0 case was previously skipped, but it might have a place
+ // with --allow-no-samples.
+ allele_lens[0] = 1;
+ allele_ptrs[0] = bufptr;
+ *bufptr = missing_geno;
}
if (vcf_filter) {
ii = gzgetc(gz_infile);
@@ -9290,7 +9529,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
} else {
ujj = ((uint32_t)ii) >> 4;
if (ujj == 15) {
- retval = read_bcf_typed_integer(gz_infile, &ujj);
+ retval = read_bcf_typed_nonnegative_integer(gz_infile, &ujj);
if (retval) {
goto bcf_to_bed_ret_1;
}
@@ -9359,15 +9598,29 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
}
}
+ alt_allele_idx = 1;
+ if ((!gt_idx) || (!bcf_var_header[1])) {
+ if (require_gt) {
+ goto bcf_to_bed_marker_skip;
+ }
+ ulljj = gztell(gz_infile);
+ ullii = lastloc + bcf_var_header[0] + bcf_var_header[1];
+ if (!sample_ct) {
+ goto bcf_to_bed_skip_genotype_write;
+ }
+ missing_gt_ct++;
+ fill_vec_55(base_bitfields, sample_ct);
+ goto bcf_to_bed_genotype_write;
+ }
+
// skip INFO
ullii = lastloc + bcf_var_header[0];
if (gzseek(gz_infile, ullii, SEEK_SET) == -1) {
goto bcf_to_bed_ret_READ_FAIL;
}
-
ullii += bcf_var_header[1];
while (1) {
- retval = read_bcf_typed_integer(gz_infile, &uii);
+ retval = read_bcf_typed_nonnegative_integer(gz_infile, &uii);
if (retval) {
goto bcf_to_bed_ret_1;
}
@@ -9377,7 +9630,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
ujj = ((uint32_t)ii) >> 4;
if (ujj == 15) {
- retval = read_bcf_typed_integer(gz_infile, &ujj);
+ retval = read_bcf_typed_nonnegative_integer(gz_infile, &ujj);
if (retval) {
goto bcf_to_bed_ret_1;
}
@@ -9386,17 +9639,17 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
ukk = ((uint32_t)ii) & 0x0f;
if ((ukk == 3) || (ukk == 5)) {
umm = 4; // int32, float = 4 bytes
- } else if (ukk && (ukk > 2)) {
+ } else if ((!ukk) || (ukk > 2)) {
logerrprint("Error: Unrecognized type in .bcf file.\n");
goto bcf_to_bed_ret_INVALID_FORMAT;
} else {
umm = ukk;
}
}
- ulljj = gztell(gz_infile) + ujj * umm * sample_ct;
- // uii = format code
- // ujj = vector length
- // ukk = type code
+ ulljj = gztell(gz_infile) + ((uint64_t)ujj) * umm * sample_ct;
+ // uii = format key
+ // ujj = for GT, max ploidy
+ // ukk = integer/float/character type code
// umm = bytes per entry
if (ulljj > ullii) {
goto bcf_to_bed_ret_INVALID_FORMAT_GENERIC;
@@ -9404,17 +9657,26 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
if (uii == gt_idx) {
break;
}
+ // possible todo: --vcf-min-gq and --vcf-min-gp support
if (ujj) {
- if (gzseek(gz_infile, ujj * umm * sample_ct, SEEK_CUR) == -1) {
+ if (gzseek(gz_infile, ((uint64_t)ujj) * umm * sample_ct, SEEK_CUR) == -1) {
goto bcf_to_bed_ret_READ_FAIL;
}
if (ulljj == ullii) {
- goto bcf_to_bed_marker_skip2;
+ if (require_gt) {
+ goto bcf_to_bed_marker_skip2;
+ } else {
+ missing_gt_ct++;
+ fill_vec_55(base_bitfields, sample_ct);
+ goto bcf_to_bed_genotype_write;
+ }
}
}
}
if (!ujj) {
- goto bcf_to_bed_marker_skip;
+ // ploidy zero previously caused the variant to be skipped
+ fill_vec_55(base_bitfields, sample_ct);
+ goto bcf_to_bed_genotype_write;
}
if (ukk == 5) {
logerrprint("Error: GT field cannot contain floating point values.\n");
@@ -9426,6 +9688,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
logerrprint("Error: --bcf does not support GT vectors requiring >12 bytes per sample.\n");
goto bcf_to_bed_ret_INVALID_FORMAT;
}
+ // ujj * umm <= 12 and sample_ct < 2^24, so no uint64_t cast needed there
if ((uint32_t)((uint64_t)gzread(gz_infile, loadbuf, ujj * umm * sample_ct)) < ujj * umm * sample_ct) {
goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
}
@@ -9568,7 +9831,6 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
}
}
}
- alt_allele_idx = 1;
if (n_allele > 2) {
ulii = popcount2_longs(&(base_bitfields[sample_ctv2]), sample_ctl2);
for (ulkk = 2; ulkk < n_allele; ulkk++) {
@@ -9600,9 +9862,11 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
*ref_ptr++ = (uljj & ulii) | (((~ulkk) >> 1) & FIVEMASK);
}
ref_ptr[-1] &= final_mask;
+ bcf_to_bed_genotype_write:
if (fwrite_checked(base_bitfields, sample_ct4, outfile)) {
goto bcf_to_bed_ret_WRITE_FAIL;
}
+ bcf_to_bed_skip_genotype_write:
fputs(&(contigdict[bcf_var_header[2] * max_contig_len]), bimfile);
putc('\t', bimfile);
if (marker_id_len) {
@@ -9611,8 +9875,8 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
putc('.', bimfile);
}
// bcf2 coordinates are 0-based while vcf is 1-based... (seriously, whose
- // idea was this? this is basically a bug in the spec, but we have to play
- // along)
+ // idea was this? this is basically a bug in the spec due to how e.g.
+ // telomeres are supposed to be encoded, but we have to play along)
bufptr = uint32_writex(&(tbuf2[3]), bcf_var_header[3] + 1, '\t');
if (fwrite_checked(tbuf2, bufptr - tbuf2, bimfile)) {
goto bcf_to_bed_ret_WRITE_FAIL;
@@ -9664,9 +9928,15 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
bcf_to_bed_marker_skip2:
marker_skip_ct++;
}
- if (!marker_ct) {
- logerrprint("Error: No variants in .bcf file.\n");
- goto bcf_to_bed_ret_INVALID_FORMAT;
+ if ((!marker_ct) && (!allow_no_variants)) {
+ if (marker_skip_ct) {
+ logerrprint("Error: All variants in .bcf file skipped.\n");
+ retval = RET_ALL_MARKERS_EXCLUDED;
+ goto bcf_to_bed_ret_1;
+ } else {
+ logerrprint("Error: No variants in .bcf file.\n");
+ goto bcf_to_bed_ret_INVALID_FORMAT;
+ }
}
if (gzclose(gz_infile) != Z_OK) {
gz_infile = NULL;
@@ -9680,10 +9950,14 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
goto bcf_to_bed_ret_WRITE_FAIL;
}
putchar('\r');
+ bcf_to_bed_skip_all_variants:
*outname_end = '\0';
LOGPRINTFWW("--bcf: %s.bed + %s.bim + %s.fam written.\n", outname, outname, outname);
if (marker_skip_ct) {
- LOGPRINTF("(%u variant%s skipped.)\n", marker_skip_ct, (marker_skip_ct == 1)? "" : "s");
+ LOGPRINTF("(%" PRIuPTR " variant%s skipped.)\n", marker_skip_ct, (marker_skip_ct == 1)? "" : "s");
+ }
+ if (missing_gt_ct) {
+ LOGERRPRINTF("Warning: %" PRIuPTR " variant record%s had no GT field.\n", missing_gt_ct, (missing_gt_ct == 1)? "" : "s");
}
while (0) {
bcf_to_bed_ret_NOMEM2:
@@ -9743,7 +10017,7 @@ uint32_t write_23_cached_chrom(char* write_cache, uint32_t markers_left, char ch
return 0;
}
-int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_t modifier_23, char* fid_23, char* iid_23, double pheno_23, char* paternal_id_23, char* maternal_id_23, Chrom_info* chrom_info_ptr) {
+int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_t modifier_23, char* fid_23, char* iid_23, double pheno_23, uint64_t misc_flags, char* paternal_id_23, char* maternal_id_23, Chrom_info* chrom_info_ptr) {
unsigned char* wkspace_mark = wkspace_base;
FILE* infile_23 = NULL;
FILE* outfile_bed = NULL;
@@ -9751,6 +10025,7 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
uintptr_t line_idx = 0;
uint32_t is_male = modifier_23 & M23_MALE;
uint32_t is_female = modifier_23 & M23_FEMALE;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
uint32_t x_present = 0;
uint32_t haploid_x_present = 0;
uint32_t y_present = 0;
@@ -9921,7 +10196,7 @@ int32_t bed_from_23(char* infile_name, char* outname, char* outname_end, uint32_
if (!feof(infile_23)) {
goto bed_from_23_ret_READ_FAIL;
}
- if ((writebuf_cur == &(writebuf[3])) && (writebuf[0] == 'l')) {
+ if ((writebuf_cur == &(writebuf[3])) && (writebuf[0] == 'l') && (!allow_no_variants)) {
if (chrom_mask_23 == 0x7ffffff) {
logerrprint("Error: No --23file variants.\n");
goto bed_from_23_ret_INVALID_FORMAT;
@@ -11190,22 +11465,28 @@ int32_t simulate_dataset(char* outname, char* outname_end, uint32_t flags, char*
int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_allele_name, char*** allele_missing_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* recode_allele_reverse, char* recode_allele_extra) {
FILE* rafile = NULL;
uint32_t missing_allele = 0;
+ uint32_t marker_id_htable_size = get_id_htable_size(marker_ct);
uintptr_t rae_size = 0;
uintptr_t line_idx = 0;
- char* sorted_ids;
- uint32_t* id_map;
+ uintptr_t topsize = 0;
+ uint32_t* marker_id_htable;
char* bufptr;
char* bufptr2;
int32_t retval;
uint32_t slen;
uint32_t alen;
- int32_t ii;
uintptr_t marker_uidx;
if (fopen_checked(&rafile, recode_allele_name, "r")) {
goto recode_allele_load_ret_OPEN_FAIL;
}
- retval = sort_item_ids(&sorted_ids, &id_map, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_ct, marker_ids, max_marker_id_len, 0, 0, strcmp_deref);
+ marker_id_htable = (uint32_t*)top_alloc(&topsize, marker_id_htable_size * sizeof(int32_t));
+ if (!marker_id_htable) {
+ goto recode_allele_load_ret_NOMEM;
+ }
+ wkspace_left -= topsize;
+ retval = populate_id_htable(unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, 0, marker_id_htable, marker_id_htable_size);
if (retval) {
+ wkspace_left += topsize;
goto recode_allele_load_ret_1;
}
loadbuf[loadbuf_size - 1] = ' ';
@@ -11213,7 +11494,7 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
line_idx++;
if (!loadbuf[loadbuf_size - 1]) {
sprintf(logbuf, "Error: Line %" PRIuPTR " of --recode-allele file is pathologically long.\n", line_idx);
- goto recode_allele_load_ret_INVALID_FORMAT_2;
+ goto recode_allele_load_ret_INVALID_FORMAT_3;
}
bufptr = skip_initial_spaces(loadbuf);
if (is_eoln_kns(*bufptr)) {
@@ -11223,12 +11504,11 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
bufptr2 = skip_initial_spaces(&(bufptr[slen]));
if (is_eoln_kns(*bufptr2)) {
sprintf(logbuf, "Error: Line %" PRIuPTR " of --recode-allele file has fewer tokens than expected.\n", line_idx);
- goto recode_allele_load_ret_INVALID_FORMAT_2;
+ goto recode_allele_load_ret_INVALID_FORMAT_3;
}
alen = strlen_se(bufptr2);
- ii = bsearch_str(bufptr, slen, sorted_ids, max_marker_id_len, marker_ct);
- if (ii != -1) {
- marker_uidx = id_map[(uint32_t)ii];
+ marker_uidx = id_htable_find(bufptr, slen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
+ if (marker_uidx != 0xffffffffU) {
bufptr2[alen++] = '\0';
if (!strcmp(bufptr2, marker_allele_ptrs[2 * marker_uidx])) {
CLEAR_BIT(recode_allele_reverse, marker_uidx);
@@ -11236,7 +11516,7 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
SET_BIT(recode_allele_reverse, marker_uidx);
} else {
if (rae_size + alen > wkspace_left) {
- goto recode_allele_load_ret_NOMEM;
+ goto recode_allele_load_ret_NOMEM2;
}
missing_allele = 1;
(*allele_missing_ptr)[marker_uidx] = &(recode_allele_extra[rae_size]);
@@ -11245,10 +11525,13 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
}
}
}
+ wkspace_left += topsize;
if (!feof(rafile)) {
goto recode_allele_load_ret_READ_FAIL;
}
while (0) {
+ recode_allele_load_ret_NOMEM2:
+ wkspace_left += topsize;
recode_allele_load_ret_NOMEM:
retval = RET_NOMEM;
break;
@@ -11258,7 +11541,8 @@ int32_t recode_allele_load(char* loadbuf, uintptr_t loadbuf_size, char* recode_a
recode_allele_load_ret_READ_FAIL:
retval = RET_READ_FAIL;
break;
- recode_allele_load_ret_INVALID_FORMAT_2:
+ recode_allele_load_ret_INVALID_FORMAT_3:
+ wkspace_left += topsize;
logerrprintb();
retval = RET_INVALID_FORMAT;
}
@@ -11592,9 +11876,13 @@ uint32_t write_ped_lines(FILE* outfile, unsigned char* loadbuf, uintptr_t* marke
}
bufptr = &(bufptr[unfiltered_sample_ct4]);
}
- wbufptr[-1] = '\n';
- if (fwrite_checked(writebuf, wbufptr - writebuf, outfile)) {
- return 1;
+ if (marker_ct) {
+ wbufptr[-1] = '\n';
+ if (fwrite_checked(writebuf, wbufptr - writebuf, outfile)) {
+ return 1;
+ }
+ } else {
+ putc('\n', outfile);
}
}
}
@@ -11623,7 +11911,7 @@ uint32_t valid_vcf_allele_code(const char* allele_code) {
// returns 1 if probably valid (angle-bracket case is not exhaustively
// checked), 0 if definitely not
uint32_t uii = (unsigned char)(*allele_code);
- if (uii == '<') {
+ if ((uii == '<') || ((uii == '*') && (!allele_code[1]))) {
return 1;
}
do {
@@ -12047,8 +12335,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
goto recode_ret_NOMEM;
}
} else if (recode_modifier & RECODE_STRUCTURE) {
- sample_uidx = 0;
- for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
+ for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
aptr = (char*)memchr(cptr, '\t', max_sample_id_len);
@@ -12063,8 +12350,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (wkspace_alloc_c_checked(&writebuf3, max_fid_len * sample_ct)) {
goto recode_ret_NOMEM;
}
- sample_uidx = 0;
- for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
+ for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
aptr = (char*)memchr(cptr, '\t', max_fid_len);
@@ -12079,7 +12365,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
break;
}
}
- fid_ct = ulii;
+ fid_ct = MINV(ulii, sample_ct);
while (++ulii < sample_ct) {
if (strcmp(&(writebuf3[(fid_ct - 1) * max_fid_len]), &(writebuf3[ulii * max_fid_len]))) {
strcpy(&(writebuf3[fid_ct * max_fid_len]), &(writebuf3[ulii * max_fid_len]));
@@ -12175,7 +12461,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (wkspace_alloc_c_checked(&writebuf, max_chrom_size * ulii)) {
goto recode_ret_NOMEM;
}
- if (recode_modifier & RECODE_COMPOUND) {
+ if ((recode_modifier & RECODE_COMPOUND) && max_chrom_size) {
memset(writebuf, delimiter, max_chrom_size * 3 - 1);
writebuf[max_chrom_size * 3 - 1] = '\n';
}
@@ -12250,8 +12536,16 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
}
loadbuf = wkspace_base;
chrom_fo_idx = 0;
- refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
- chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ if (unfiltered_marker_ct) {
+ refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
+ chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ } else {
+ chrom_end = 0;
+ is_x = 0;
+ is_y = 0;
+ is_mt = 0;
+ is_haploid = 0;
+ }
if (recode_modifier & RECODE_TRANSPOSE) {
strcpy(outname_end, ".tped");
if (fopen_checked(&outfile, outname, "w")) {
@@ -12297,30 +12591,32 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
goto recode_ret_WRITE_FAIL;
}
- if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
- goto recode_ret_READ_FAIL;
- }
- if (is_haploid && set_hh_missing) {
- haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
- }
- init_recode_cmax0(mk_allele_ptrs[2 * marker_uidx], mk_allele_ptrs[2 * marker_uidx + 1], cur_mk_allelesx, cmalen, delimiter, delim2);
- ulptr = loadbuf_collapsed;
- ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
- shiftmax = BITCT2;
- while (1) {
- while (ulptr < ulptr_end) {
- cur_word = *ulptr++;
- for (shiftval = 0; shiftval < shiftmax; shiftval++) {
- ulii = cur_word & 3;
- fwrite(cur_mk_allelesx[ulii], 1, cmalen[ulii], outfile);
- cur_word >>= 2;
- }
+ if (sample_ct) {
+ if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+ goto recode_ret_READ_FAIL;
}
- if (ulptr == loadbuf_collapsed_end) {
- break;
+ if (is_haploid && set_hh_missing) {
+ haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
+ init_recode_cmax0(mk_allele_ptrs[2 * marker_uidx], mk_allele_ptrs[2 * marker_uidx + 1], cur_mk_allelesx, cmalen, delimiter, delim2);
+ ulptr = loadbuf_collapsed;
+ ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
+ shiftmax = BITCT2;
+ while (1) {
+ while (ulptr < ulptr_end) {
+ cur_word = *ulptr++;
+ for (shiftval = 0; shiftval < shiftmax; shiftval++) {
+ ulii = cur_word & 3;
+ fwrite(cur_mk_allelesx[ulii], 1, cmalen[ulii], outfile);
+ cur_word >>= 2;
+ }
+ }
+ if (ulptr == loadbuf_collapsed_end) {
+ break;
+ }
+ ulptr_end++;
+ shiftmax = sample_ct % BITCT2;
}
- ulptr_end++;
- shiftmax = sample_ct % BITCT2;
}
if (putc_checked('\n', outfile)) {
goto recode_ret_WRITE_FAIL;
@@ -12402,40 +12698,42 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
putc(delimiter, outfile);
}
fputs(mk_allele_ptrs[2 * marker_uidx + 1 - uii], outfile);
- if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, uii ^ IS_SET(marker_reverse, marker_uidx))) {
- goto recode_ret_READ_FAIL;
- }
- if (is_haploid && set_hh_missing) {
- haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
- }
- ulptr = loadbuf_collapsed;
- ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
- shiftmax = BITCT2;
wbufptr = writebuf;
- if (allele_missing && allele_missing[marker_uidx]) {
- // all 0s and NAs
- memcpy(cur_dosage_chars, "0N00", 4);
- } else {
- memcpy(cur_dosage_chars, "2N10", 4);
- }
- while (1) {
- while (ulptr < ulptr_end) {
- cur_word = *ulptr++;
- for (shiftval = 0; shiftval < shiftmax; shiftval++) {
- ulii = cur_word & 3;
- *wbufptr++ = delimiter;
- *wbufptr++ = cur_dosage_chars[ulii];
- if (ulii == 1) {
- *wbufptr++ = 'A';
+ if (sample_ct) {
+ if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, uii ^ IS_SET(marker_reverse, marker_uidx))) {
+ goto recode_ret_READ_FAIL;
+ }
+ if (is_haploid && set_hh_missing) {
+ haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
+ ulptr = loadbuf_collapsed;
+ ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
+ shiftmax = BITCT2;
+ if (allele_missing && allele_missing[marker_uidx]) {
+ // all 0s and NAs
+ memcpy(cur_dosage_chars, "0N00", 4);
+ } else {
+ memcpy(cur_dosage_chars, "2N10", 4);
+ }
+ while (1) {
+ while (ulptr < ulptr_end) {
+ cur_word = *ulptr++;
+ for (shiftval = 0; shiftval < shiftmax; shiftval++) {
+ ulii = cur_word & 3;
+ *wbufptr++ = delimiter;
+ *wbufptr++ = cur_dosage_chars[ulii];
+ if (ulii == 1) {
+ *wbufptr++ = 'A';
+ }
+ cur_word >>= 2;
}
- cur_word >>= 2;
}
+ if (ulptr == loadbuf_collapsed_end) {
+ break;
+ }
+ ulptr_end++;
+ shiftmax = sample_ct % BITCT2;
}
- if (ulptr == loadbuf_collapsed_end) {
- break;
- }
- ulptr_end++;
- shiftmax = sample_ct % BITCT2;
}
*wbufptr++ = '\n';
if (fwrite_checked(writebuf, wbufptr - writebuf, outfile)) {
@@ -12520,11 +12818,12 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
goto recode_ret_WRITE_FAIL;
}
chrom_fo_idx = 0;
- refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
- chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
- sample_uidx = 0;
+ if (unfiltered_marker_ct) {
+ refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
+ chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ }
shiftval = 0; // repurposed: underscore seen in ID?
- for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
+ for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
ulii = strlen_se(cptr);
@@ -12605,11 +12904,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
goto recode_ret_WRITE_FAIL;
}
- if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
- goto recode_ret_READ_FAIL;
- }
- if (is_haploid && set_hh_missing) {
- haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ if (sample_ct) {
+ if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+ goto recode_ret_READ_FAIL;
+ }
+ if (is_haploid && set_hh_missing) {
+ haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
}
cptr = mk_allele_ptrs[2 * marker_uidx];
@@ -12731,33 +13032,35 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
pzwritep = uint32_writex(pzwritep, marker_pos[marker_uidx], ' ');
pzwritep = strcpyax(pzwritep, mk_allele_ptrs[2 * marker_uidx], ' ');
pzwritep = strcpya(pzwritep, mk_allele_ptrs[2 * marker_uidx + 1]);
- if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
- goto recode_ret_READ_FAIL;
- }
- if (is_haploid && set_hh_missing) {
- haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
- }
- ulptr = loadbuf_collapsed;
- ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
- sample_idx = 0;
- sample_uidx = BITCT2; // repurposed as stop value
- while (1) {
- while (ulptr < ulptr_end) {
- cur_word = *ulptr++;
- for (; sample_idx < sample_uidx; sample_idx++, cur_word >>= 2) {
- ulii = cur_word & 3;
- if (ulii == 1) {
- missing_cts[sample_idx] += 1;
+ if (sample_ct) {
+ if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+ goto recode_ret_READ_FAIL;
+ }
+ if (is_haploid && set_hh_missing) {
+ haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
+ ulptr = loadbuf_collapsed;
+ ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
+ sample_idx = 0;
+ sample_uidx = BITCT2; // repurposed as stop value
+ while (1) {
+ while (ulptr < ulptr_end) {
+ cur_word = *ulptr++;
+ for (; sample_idx < sample_uidx; sample_idx++, cur_word >>= 2) {
+ ulii = cur_word & 3;
+ if (ulii == 1) {
+ missing_cts[sample_idx] += 1;
+ }
+ pzwritep = memcpya(pzwritep, &(cur_mk_allelesx_buf[ulii * 8]), 6);
}
- pzwritep = memcpya(pzwritep, &(cur_mk_allelesx_buf[ulii * 8]), 6);
+ sample_uidx += BITCT2;
}
- sample_uidx += BITCT2;
- }
- if (ulptr == loadbuf_collapsed_end) {
- break;
+ if (ulptr == loadbuf_collapsed_end) {
+ break;
+ }
+ ulptr_end++;
+ sample_uidx = sample_ct;
}
- ulptr_end++;
- sample_uidx = sample_ct;
}
append_binary_eoln(&pzwritep);
if (flex_pzwrite(&ps, &pzwritep)) {
@@ -12896,6 +13199,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
autosomal_marker_ct -= count_chrom_markers(chrom_info_ptr, chrom_info_ptr->xy_code, marker_exclude);
}
if (!autosomal_marker_ct) {
+ // could allow this?
logerrprint("Error: No autosomal variants for --recode beagle.\n");
goto recode_ret_ALL_MARKERS_EXCLUDED;
}
@@ -13022,6 +13326,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
}
} else if (recode_modifier & (RECODE_BIMBAM | RECODE_BIMBAM_1CHR)) {
if (recode_modifier & RECODE_BIMBAM_1CHR) {
+ if (!marker_ct) {
+ logerrprint("Error: No variants for --recode bimbam-1chr.\n");
+ goto recode_ret_ALL_MARKERS_EXCLUDED;
+ }
ii = single_chrom_start(chrom_info_ptr, unfiltered_marker_ct, marker_exclude);
if (ii == -1) {
logerrprint("Error: --recode bimbam-1chr requires a single-chromosome dataset. Did you mean\n'--recode bimbam'? (Note the lack of a dash in the middle.)\n");
@@ -13119,8 +13427,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
marker_uidx = 0;
marker_idx = 0;
chrom_fo_idx = 0;
- refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
- chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ if (unfiltered_marker_ct) {
+ refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
+ chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ }
writebuf2[0] = ',';
memcpy(&(writebuf2[4]), ",??", 4);
writebuf2[8] = ',';
@@ -13139,44 +13449,46 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
}
- if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
- goto recode_ret_READ_FAIL;
- }
- if (is_haploid && set_hh_missing) {
- haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
- }
- ucc = mk_allele_ptrs[2 * marker_uidx][0];
- ucc2 = mk_allele_ptrs[2 * marker_uidx + 1][0];
- writebuf2[1] = ucc;
- writebuf2[2] = ucc;
- writebuf2[9] = ucc;
- writebuf2[10] = ucc2;
- writebuf2[13] = ucc2;
- writebuf2[14] = ucc2;
if (fputs_checked(&(marker_ids[marker_uidx * max_marker_id_len]), outfile)) {
goto recode_ret_WRITE_FAIL;
}
- wbufptr = writebuf;
- ulptr = loadbuf_collapsed;
- ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
- shiftmax = BITCT2;
- while (1) {
- while (ulptr < ulptr_end) {
- cur_word = *ulptr++;
- for (shiftval = 0; shiftval < shiftmax; shiftval++) {
- ulii = cur_word & 3;
- wbufptr = memcpyl3a(wbufptr, &(writebuf2[4 * ulii]));
- cur_word >>= 2;
+ if (sample_ct) {
+ if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+ goto recode_ret_READ_FAIL;
+ }
+ if (is_haploid && set_hh_missing) {
+ haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
+ ucc = mk_allele_ptrs[2 * marker_uidx][0];
+ ucc2 = mk_allele_ptrs[2 * marker_uidx + 1][0];
+ writebuf2[1] = ucc;
+ writebuf2[2] = ucc;
+ writebuf2[9] = ucc;
+ writebuf2[10] = ucc2;
+ writebuf2[13] = ucc2;
+ writebuf2[14] = ucc2;
+ wbufptr = writebuf;
+ ulptr = loadbuf_collapsed;
+ ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
+ shiftmax = BITCT2;
+ while (1) {
+ while (ulptr < ulptr_end) {
+ cur_word = *ulptr++;
+ for (shiftval = 0; shiftval < shiftmax; shiftval++) {
+ ulii = cur_word & 3;
+ wbufptr = memcpyl3a(wbufptr, &(writebuf2[4 * ulii]));
+ cur_word >>= 2;
+ }
}
+ if (ulptr == loadbuf_collapsed_end) {
+ break;
+ }
+ ulptr_end++;
+ shiftmax = sample_ct % BITCT2;
}
- if (ulptr == loadbuf_collapsed_end) {
- break;
+ if (fwrite_checked(writebuf, 3 * sample_ct, outfile)) {
+ goto recode_ret_WRITE_FAIL;
}
- ulptr_end++;
- shiftmax = sample_ct % BITCT2;
- }
- if (fwrite_checked(writebuf, 3 * sample_ct, outfile)) {
- goto recode_ret_WRITE_FAIL;
}
putc('\n', outfile);
}
@@ -13189,6 +13501,14 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
}
}
} else if (recode_modifier & (RECODE_FASTPHASE | RECODE_FASTPHASE_1CHR)) {
+ if (!marker_ct) {
+ // why bother
+ logerrprint("Error: No variants for --recode fastphase{-1chr}.\n");
+ goto recode_ret_ALL_MARKERS_EXCLUDED;
+ } else if (!sample_ct) {
+ logerrprint("Error: No samples for --recode fastphase{-1chr}.\n");
+ goto recode_ret_ALL_SAMPLES_EXCLUDED;
+ }
if (recode_modifier & RECODE_FASTPHASE) {
memcpy(outname_end, ".chr-*", 7);
} else {
@@ -13347,11 +13667,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
chrom_fo_idx++;
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
}
- if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
- goto recode_ret_READ_FAIL;
- }
- if (is_haploid && set_hh_missing) {
- haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ if (sample_ct) {
+ if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+ goto recode_ret_READ_FAIL;
+ }
+ if (is_haploid && set_hh_missing) {
+ haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
}
wbufptr = &(marker_ids[marker_uidx * max_marker_id_len]);
cptr = strcpya(&(writebuf[1]), wbufptr);
@@ -13483,7 +13805,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (recode_load_to(loadbuf, bedfile, bed_offset, unfiltered_marker_ct, 0, marker_ct, marker_exclude, recode_allele_reverse, &marker_uidx, unfiltered_sample_ct)) {
goto recode_ret_READ_FAIL;
}
- if (set_hh_missing) {
+ if (set_hh_missing && marker_ct) {
haploid_fix_multiple(marker_exclude, 0, marker_ct, chrom_info_ptr, hh_exists, sample_include2, sample_male_include2, unfiltered_sample_ct, unfiltered_sample_ct4, loadbuf);
}
fputs("0%", stdout);
@@ -13496,50 +13818,54 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (recode_write_first_cols(outfile, sample_uidx, delimiter, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, sex_nm, sex_male, pheno_nm, pheno_c, pheno_d, output_missing_pheno)) {
goto recode_ret_WRITE_FAIL;
}
- bufptr = &(loadbuf[sample_uidx / 4]);
- wbufptr = writebuf;
- shiftval = (sample_uidx % 4) * 2;
- marker_uidx = 0;
- marker_idx = 0;
- if (recode_modifier & RECODE_A) {
- do {
- marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
- ulii = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
- marker_idx += ulii - marker_uidx;
+ if (marker_ct) {
+ bufptr = &(loadbuf[sample_uidx / 4]);
+ wbufptr = writebuf;
+ shiftval = (sample_uidx % 4) * 2;
+ marker_uidx = 0;
+ marker_idx = 0;
+ if (recode_modifier & RECODE_A) {
do {
- ucc = ((*bufptr) >> shiftval) & 3;
- if (allele_missing && allele_missing[marker_uidx]) {
- *wbufptr++ = "0N00"[ucc];
- } else {
- *wbufptr++ = "2N10"[ucc];
- }
- if (ucc == 1) {
- *wbufptr++ = 'A';
- }
- *wbufptr++ = delimiter;
- bufptr = &(bufptr[unfiltered_sample_ct4]);
- } while (++marker_uidx < ulii);
- } while (marker_idx < marker_ct);
- } else {
- do {
- marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
- ulii = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
- marker_idx += ulii - marker_uidx;
+ marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
+ ulii = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
+ marker_idx += ulii - marker_uidx;
+ do {
+ ucc = ((*bufptr) >> shiftval) & 3;
+ if (allele_missing && allele_missing[marker_uidx]) {
+ *wbufptr++ = "0N00"[ucc];
+ } else {
+ *wbufptr++ = "2N10"[ucc];
+ }
+ if (ucc == 1) {
+ *wbufptr++ = 'A';
+ }
+ *wbufptr++ = delimiter;
+ bufptr = &(bufptr[unfiltered_sample_ct4]);
+ } while (++marker_uidx < ulii);
+ } while (marker_idx < marker_ct);
+ } else {
do {
- ucc = ((*bufptr) >> shiftval) & 3;
- if (ucc != 1) {
- wbufptr = memcpya(wbufptr, &(writebuf2[4 * ((allele_missing && allele_missing[marker_uidx])? 3 : ucc)]), 4);
- } else {
- wbufptr = memcpya(wbufptr, &(writebuf2[16]), 6);
- }
- bufptr = &(bufptr[unfiltered_sample_ct4]);
- } while (++marker_uidx < ulii);
- } while (marker_idx < marker_ct);
- }
- wbufptr[-1] = '\n';
- ulii = (uintptr_t)(wbufptr - writebuf);
- if (fwrite_checked(writebuf, ulii, outfile)) {
- goto recode_ret_WRITE_FAIL;
+ marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
+ ulii = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
+ marker_idx += ulii - marker_uidx;
+ do {
+ ucc = ((*bufptr) >> shiftval) & 3;
+ if (ucc != 1) {
+ wbufptr = memcpya(wbufptr, &(writebuf2[4 * ((allele_missing && allele_missing[marker_uidx])? 3 : ucc)]), 4);
+ } else {
+ wbufptr = memcpya(wbufptr, &(writebuf2[16]), 6);
+ }
+ bufptr = &(bufptr[unfiltered_sample_ct4]);
+ } while (++marker_uidx < ulii);
+ } while (marker_idx < marker_ct);
+ }
+ wbufptr[-1] = '\n';
+ ulii = (uintptr_t)(wbufptr - writebuf);
+ if (fwrite_checked(writebuf, ulii, outfile)) {
+ goto recode_ret_WRITE_FAIL;
+ }
+ } else {
+ putc('\n', outfile);
}
}
if (pct < 100) {
@@ -13610,11 +13936,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
}
chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
}
- if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, cur_sample_ct, cur_sample_exclude, cur_final_mask, IS_SET(marker_reverse, marker_uidx))) {
- goto recode_ret_READ_FAIL;
- }
- if (is_haploid && set_hh_missing) {
- haploid_fix(hh_exists, cur_sample_include2, cur_sample_male_include2, cur_sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ if (unfiltered_sample_ct) {
+ if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, cur_sample_ct, cur_sample_exclude, cur_final_mask, IS_SET(marker_reverse, marker_uidx))) {
+ goto recode_ret_READ_FAIL;
+ }
+ if (is_haploid && set_hh_missing) {
+ haploid_fix(hh_exists, cur_sample_include2, cur_sample_male_include2, cur_sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
}
init_recode_cmax(mk_allele_ptrs[2 * marker_uidx], mk_allele_ptrs[2 * marker_uidx + 1], cur_mk_allelesx, cmalen, '\0', delimiter);
cmalen[0] -= 1;
@@ -13741,6 +14069,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (wkspace_left < ((uint64_t)unfiltered_sample_ct4) * max_chrom_size) {
goto recode_ret_NO_MULTIPASS_YET;
}
+ if (!marker_ct) {
+ logerrprint("Error: No variants for --recode HV{-1chr}.\n");
+ goto recode_ret_ALL_MARKERS_EXCLUDED;
+ }
if (recode_modifier & RECODE_HV) {
memcpy(outname_end, ".chr-", 5);
sprintf(logbuf, "--recode HV to %s*.ped + .info... ", outname);
@@ -13779,7 +14111,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (recode_load_to(loadbuf, bedfile, bed_offset, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1], 0, ulii, marker_exclude, marker_reverse, &marker_uidx, unfiltered_sample_ct)) {
goto recode_ret_READ_FAIL;
}
- if (set_hh_missing) {
+ if (set_hh_missing && marker_ct) {
haploid_fix_multiple(marker_exclude, marker_uidx_start, ulii, chrom_info_ptr, hh_exists, sample_include2, sample_male_include2, unfiltered_sample_ct, unfiltered_sample_ct4, loadbuf);
}
sample_uidx = 0;
@@ -13852,7 +14184,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (recode_load_to(loadbuf, bedfile, bed_offset, unfiltered_marker_ct, 0, marker_ct, marker_exclude, marker_reverse, &marker_uidx, unfiltered_sample_ct)) {
goto recode_ret_READ_FAIL;
}
- if (set_hh_missing) {
+ if (set_hh_missing && marker_ct) {
haploid_fix_multiple(marker_exclude, 0, marker_ct, chrom_info_ptr, hh_exists, sample_include2, sample_male_include2, unfiltered_sample_ct, unfiltered_sample_ct4, loadbuf);
}
sample_uidx = 0;
@@ -13912,7 +14244,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (recode_load_to(loadbuf, bedfile, bed_offset, unfiltered_marker_ct, 0, marker_ct, marker_exclude, marker_reverse, &marker_uidx, unfiltered_sample_ct)) {
goto recode_ret_READ_FAIL;
}
- if (set_hh_missing) {
+ if (set_hh_missing && marker_ct) {
haploid_fix_multiple(marker_exclude, 0, marker_ct, chrom_info_ptr, hh_exists, sample_include2, sample_male_include2, unfiltered_sample_ct, unfiltered_sample_ct4, loadbuf);
}
sample_uidx = 0;
@@ -13967,7 +14299,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (!(recode_modifier & (RECODE_BEAGLE | RECODE_FASTPHASE | RECODE_FASTPHASE_1CHR | RECODE_HV))) {
logprint("done.\n");
if (invalid_allele_code_seen) {
- logerrprint("Warning: At least one VCF allele code violates the official specification;\nother tools may not accept the file. (Valid codes must either start with a\n'<', only contain characters in {A,C,G,T,N,a,c,g,t,n}, or represent a\nbreakend.)\n");
+ logerrprint("Warning: At least one VCF allele code violates the official specification;\nother tools may not accept the file. (Valid codes must either start with a\n'<', only contain characters in {A,C,G,T,N,a,c,g,t,n}, be an isolated '*', or\nrepresent a breakend.)\n");
}
} else {
fputs("done.\n", stdout);
@@ -14007,6 +14339,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
recode_ret_ALL_MARKERS_EXCLUDED:
retval = RET_ALL_MARKERS_EXCLUDED;
break;
+ recode_ret_ALL_SAMPLES_EXCLUDED:
+ retval = RET_ALL_SAMPLES_EXCLUDED;
+ break;
}
recode_ret_1:
wkspace_reset(wkspace_mark);
@@ -14171,7 +14506,7 @@ static inline Ll_entry2* top_alloc_ll2(uintptr_t* topsize_ptr, uint32_t size) {
return (Ll_entry2*)top_alloc(topsize_ptr, size + sizeof(Ll_entry2));
}
-int32_t merge_fam_id_scan(char* bedname, char* famname, uintptr_t* max_sample_id_len_ptr, uint32_t* max_sample_full_len_ptr, uint32_t* is_dichot_pheno_ptr, Ll_entry** htable, uintptr_t* topsize_ptr, uint64_t* tot_sample_ct_ptr, uint32_t* ped_buflen_ptr, uint32_t* cur_sample_ct_ptr, uint32_t* orig_idx_ptr) {
+int32_t merge_fam_id_scan(char* bedname, char* famname, uint32_t allow_no_samples, uintptr_t* max_sample_id_len_ptr, uint32_t* max_sample_full_len_ptr, uint32_t* is_dichot_pheno_ptr, Ll_entry** htable, uintptr_t* topsize_ptr, uint64_t* tot_sample_ct_ptr, uint32_t* ped_buflen_ptr, uint32_t* cur_sample_ct_ptr, uint32_t* orig_idx_ptr) {
uint64_t tot_sample_ct = *tot_sample_ct_ptr;
uintptr_t max_sample_id_len = *max_sample_id_len_ptr;
uintptr_t topsize = *topsize_ptr;
@@ -14335,7 +14670,7 @@ int32_t merge_fam_id_scan(char* bedname, char* famname, uintptr_t* max_sample_id
if (!feof(infile)) {
goto merge_fam_id_scan_ret_READ_FAIL;
}
- if (!cur_sample_ct) {
+ if ((!cur_sample_ct) && (!allow_no_samples)) {
LOGPREPRINTFWW("Error: No %s in %s.\n", g_species_plural, famname);
goto merge_fam_id_scan_ret_INVALID_FORMAT_2;
}
@@ -14377,7 +14712,7 @@ int32_t merge_sample_sortf(char* sample_sort_fname, char* sample_fids, uintptr_t
return retval;
}
-int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uintptr_t* max_marker_id_len_ptr, Ll_entry2** htable2, uintptr_t* topsize_ptr, uint32_t* max_bim_linelen_ptr, uint64_t* tot_marker_ct_ptr, uint32_t* cur_marker_ct_ptr, uint64_t* position_warning_ct_ptr, Ll_str** non_biallelics_ptr, uint32_t allow_extra_chroms, Chrom_info* chrom_info_ptr) {
+int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uint32_t allow_no_variants, uintptr_t* max_marker_id_len_ptr, Ll_entry2** htable2, uintptr_t* topsize_ptr, uint32_t* max_bim_linelen_ptr, uint64_t* tot_marker_ct_ptr, uint32_t* cur_marker_ct_ptr, uint64_t* position_warning_ct_ptr, Ll_str** non_biallelics_ptr, uint32_t allow_extra_chroms, Chrom_info* chrom_info_ptr) {
unsigned char* wkspace_mark = wkspace_base;
uintptr_t max_marker_id_len = *max_marker_id_len_ptr;
uintptr_t topsize = *topsize_ptr;
@@ -14404,7 +14739,7 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uintptr_t* max_marker_
Ll_str* ll_string_new;
int64_t llxx;
uintptr_t line_idx;
- uint32_t cm_col;
+ uint32_t cm_col_exists;
uint32_t allele_ct;
uint32_t name_match;
uint32_t uii;
@@ -14426,9 +14761,14 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uintptr_t* max_marker_
}
loadbuf = (char*)wkspace_alloc(loadbuf_size);
loadbuf[loadbuf_size - 1] = ' ';
- if (check_cm_col(infile, loadbuf, is_binary, loadbuf_size, &cm_col, &line_idx)) {
+ if (check_cm_col(infile, loadbuf, is_binary, allow_no_variants, loadbuf_size, &cm_col_exists, &line_idx)) {
goto merge_bim_scan_ret_MISSING_TOKENS;
}
+ if (!line_idx) {
+ // no variants
+ *cur_marker_ct_ptr = 0;
+ goto merge_bim_scan_ret_1;
+ }
line_idx--;
do {
line_idx++;
@@ -14466,7 +14806,7 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uintptr_t* max_marker_
if (no_more_tokens_kns(bufptr2)) {
goto merge_bim_scan_ret_MISSING_TOKENS;
}
- if (cm_col) {
+ if (cm_col_exists) {
if (scan_double(bufptr2, &cm)) {
cm = 0;
}
@@ -14945,7 +15285,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
uintptr_t uljj = 0;
uintptr_t* mbufptr2;
uintptr_t* rbufptr;
- uint32_t cm_col;
+ uint32_t cm_col_exists;
char* aptr1;
char* aptr2;
char* bufptr;
@@ -15015,9 +15355,12 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
if (fopen_checked(&infile2, bimname, "r")) {
goto merge_main_ret_OPEN_FAIL;
}
- if (check_cm_col(infile2, bim_loadbuf, is_binary, max_bim_linelen, &cm_col, &ulii)) {
+ if (check_cm_col(infile2, bim_loadbuf, is_binary, 1, max_bim_linelen, &cm_col_exists, &ulii)) {
goto merge_main_ret_READ_FAIL;
}
+ if (!ulii) {
+ bim_loadbuf[0] = '\0';
+ }
if (fopen_checked(&bedfile, bedname, is_binary? "rb" : "r")) {
goto merge_main_ret_OPEN_FAIL;
}
@@ -15045,7 +15388,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
}
++marker_in_idx;
bufptr = next_token(bufptr);
- bufptr2 = next_token_mult(bufptr, 1 + cm_col);
+ bufptr2 = next_token_mult(bufptr, 1 + cm_col_exists);
if (!bufptr2) {
goto merge_main_ret_READ_FAIL;
}
@@ -15086,6 +15429,9 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
bufptr5 = marker_allele_ptrs[((uint32_t)ii) * 2 + 1];
last_marker_in_idx = marker_in_idx;
+ if (!cur_sample_ct) {
+ continue;
+ }
if (load_raw(bedfile, readbuf_w, cur_sample_ct4)) {
goto merge_main_ret_READ_FAIL;
}
@@ -15298,6 +15644,7 @@ int32_t merge_main(char* bedname, char* bimname, char* famname, char* bim_loadbu
if (is_eoln_or_comment(cc)) {
continue;
}
+ // only possible to get here if sample_ct and marker_ct are positive
bufptr2 = token_endnn(bufptr);
uii = (bufptr2 - bufptr);
bufptr3 = skip_initial_spaces(bufptr2);
@@ -15566,6 +15913,8 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
uint32_t merge_mode = merge_type & MERGE_MODE_MASK;
uint32_t merge_nsort = ((!sample_sort) || (sample_sort == SAMPLE_SORT_NATURAL))? 1 : 0;
uint32_t merge_equal_pos = (merge_type & MERGE_EQUAL_POS)? 1 : 0;
+ uint32_t allow_no_samples = (misc_flags / MISC_ALLOW_NO_SAMPLES) & 1;
+ uint32_t allow_no_variants = (misc_flags / MISC_ALLOW_NO_VARS) & 1;
Ll_entry** htable = (Ll_entry**)(&(wkspace_base[wkspace_left - HASHMEM_S]));
Ll_entry2** htable2 = (Ll_entry2**)(&(wkspace_base[wkspace_left - HASHMEM]));
Ll_str* non_biallelics = NULL;
@@ -15793,13 +16142,13 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
ullxx = 0;
mlpos = 0;
for (mlpos = 0; mlpos < merge_ct; mlpos++) {
- retval = merge_fam_id_scan(mergelist_bed[mlpos], mergelist_fam[mlpos], &max_sample_id_len, &max_sample_full_len, &is_dichot_pheno, htable, &topsize, &ullxx, &ped_buflen, &cur_sample_ct, &orig_idx);
+ retval = merge_fam_id_scan(mergelist_bed[mlpos], mergelist_fam[mlpos], allow_no_samples, &max_sample_id_len, &max_sample_full_len, &is_dichot_pheno, htable, &topsize, &ullxx, &ped_buflen, &cur_sample_ct, &orig_idx);
if (retval) {
goto merge_datasets_ret_1;
}
if ((!merge_list) && mlpos) {
LOGPRINTFWW("%u %s loaded from %s.\n", max_cur_sample_ct, species_str(max_cur_sample_ct), mergelist_fam[0]);
- LOGPRINTFWW("%u %s to be merged from %s.\n", cur_sample_ct, species_str(cur_sample_ct), mergelist_fam[1]);
+ LOGPRINTFWW("%u %s to be merged from %s.\n", cur_sample_ct, species_str(cur_sample_ct), (merge_type & MERGE_BINARY)? mergelist_fam[1] : mergelist_bed[1]);
uii = ullxx - max_cur_sample_ct;
LOGPRINTF("Of these, %u %s new, while %u %s present in the base dataset.\n", uii, (uii == 1)? "is" : "are", cur_sample_ct - uii, (cur_sample_ct - uii == 1)? "is" : "are");
}
@@ -16018,7 +16367,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
ullxx = 0;
for (mlpos = 0; mlpos < merge_ct; ++mlpos) {
- retval = merge_bim_scan(mergelist_bim[mlpos], (mergelist_fam[mlpos])? 1 : 0, &max_marker_id_len, htable2, &topsize, &max_bim_linelen, &ullxx, &cur_marker_ct, &position_warning_ct, &non_biallelics, allow_extra_chroms, chrom_info_ptr);
+ retval = merge_bim_scan(mergelist_bim[mlpos], (mergelist_fam[mlpos])? 1 : 0, allow_no_variants, &max_marker_id_len, htable2, &topsize, &max_bim_linelen, &ullxx, &cur_marker_ct, &position_warning_ct, &non_biallelics, allow_extra_chroms, chrom_info_ptr);
if (retval) {
goto merge_datasets_ret_1;
}
@@ -16148,7 +16497,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
if (merge_post_msort_update_maps(marker_ids, max_marker_id_len, marker_map, marker_cms, marker_cms_tmp, pos_buf, ll_buf, chrom_start, chrom_id, chrom_ct, &dedup_marker_ct, merge_equal_pos, marker_allele_ptrs, chrom_info_ptr)) {
goto merge_datasets_ret_INVALID_FORMAT;
}
- if (!dedup_marker_ct) {
+ if ((!dedup_marker_ct) && (!allow_no_variants)) {
logerrprint("Error: No variants in merged file.\n");
goto merge_datasets_ret_INVALID_FORMAT;
}
@@ -16173,27 +16522,36 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
} else {
ulii = ped_buflen;
}
- // don't need to enforce >= 3 since wkspace_alloc guarantees >= 64
- if (wkspace_alloc_uc_checked(&readbuf, ulii)) {
+ if (wkspace_alloc_uc_checked(&readbuf, MAXV(ulii, 3))) {
goto merge_datasets_ret_NOMEM;
}
if (merge_must_track_write(merge_mode)) {
ulii = (tot_sample_ct + (BITCT - 1)) / BITCT;
- markers_per_pass = wkspace_left / (3 * sizeof(intptr_t) * ulii);
- if (markers_per_pass > dedup_marker_ct) {
+ if (ulii) {
+ markers_per_pass = wkspace_left / (3 * sizeof(intptr_t) * ulii);
+ if (markers_per_pass > dedup_marker_ct) {
+ markers_per_pass = dedup_marker_ct;
+ }
+ } else {
markers_per_pass = dedup_marker_ct;
}
markbuf = (uintptr_t*)wkspace_alloc(markers_per_pass * ulii * sizeof(intptr_t));
- } else {
+ } else if (tot_sample_ct4) {
markers_per_pass = wkspace_left / tot_sample_ct4;
if (markers_per_pass > dedup_marker_ct) {
markers_per_pass = dedup_marker_ct;
}
+ } else {
+ markers_per_pass = dedup_marker_ct;
}
- if (!markers_per_pass) {
- goto merge_datasets_ret_NOMEM;
+ if (dedup_marker_ct) {
+ if (!markers_per_pass) {
+ goto merge_datasets_ret_NOMEM;
+ }
+ pass_ct = 1 + ((dedup_marker_ct - 1) / markers_per_pass);
+ } else {
+ pass_ct = 0;
}
- pass_ct = 1 + ((dedup_marker_ct - 1) / markers_per_pass);
writebuf = wkspace_base;
pcptr = (uintptr_t*)wkspace_base;
diff --git a/plink_data.h b/plink_data.h
index 94ae693..0dd15ac 100644
--- a/plink_data.h
+++ b/plink_data.h
@@ -1,7 +1,7 @@
#ifndef __PLINK_DATA_H__
#define __PLINK_DATA_H__
-int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintptr_t unfiltered_marker_ct, uintptr_t sample_ct, uint64_t fsize);
+int32_t sample_major_to_snp_major(char* sample_major_fname, char* outname, uintptr_t unfiltered_marker_ct, uintptr_t unfiltered_sample_ct, uint64_t fsize);
int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_marker_ct_ptr, uintptr_t* marker_exclude_ct_ptr, uintptr_t* max_marker_id_len_ptr, uintptr_t** marker_exclude_ptr, double** set_allele_freqs_ptr, uint32_t** nchrobs_ptr, char*** marker_allele_pp, uintptr_t* max_marker_allele_len_ptr, char** marker_ids_ptr, char* missing_mid_template, uint32_t new_id_max_allele_len, const char* missing_marker_id_match, Chrom_info* chrom_info_ptr, double** marker_cms_ptr, uint32_ [...]
@@ -11,13 +11,13 @@ int32_t write_covars(char* outname, char* outname_end, uint32_t write_covar_modi
int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t map_cols, char* outname, char* outname_end, uint64_t calculation_type, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, double* marker_cms, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_i [...]
-int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32_t missing_pheno, uint32_t affection_01, uintptr_t* unfiltered_sample_ct_ptr, char** sample_ids_ptr, uintptr_t* max_sample_id_len_ptr, char** paternal_ids_ptr, uintptr_t* max_paternal_id_len_ptr, char** maternal_ids_ptr, uintptr_t* max_maternal_id_len_ptr, uintptr_t** sex_nm_ptr, uintptr_t** sex_male_ptr, uint32_t* affection_ptr, uintptr_t** pheno_nm_ptr, uintptr_t** pheno_c_ptr, double** pheno_d_ptr, uintptr [...]
+int32_t load_fam(char* famname, uint32_t fam_cols, uint32_t tmp_fam_col_6, int32_t missing_pheno, uint32_t affection_01, uintptr_t* unfiltered_sample_ct_ptr, char** sample_ids_ptr, uintptr_t* max_sample_id_len_ptr, char** paternal_ids_ptr, uintptr_t* max_paternal_id_len_ptr, char** maternal_ids_ptr, uintptr_t* max_maternal_id_len_ptr, uintptr_t** sex_nm_ptr, uintptr_t** sex_male_ptr, uint32_t* affection_ptr, uintptr_t** pheno_nm_ptr, uintptr_t** pheno_c_ptr, double** pheno_d_ptr, uintptr [...]
int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outname_end, char* single_chr, char* pheno_name, double hard_call_threshold, char* missing_code, int32_t missing_pheno, uint64_t misc_flags, uint32_t is_bgen, Chrom_info* chrom_info_ptr);
int32_t ped_to_bed(char* pedname, char* mapname, char* outname, char* outname_end, uint32_t fam_cols, uint64_t misc_flags, int32_t missing_pheno, Chrom_info* chrom_info_ptr);
-int32_t lgen_to_bed(char* lgen_namebuf, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, uint32_t lgen_modifier, char* lgen_reference_fname, Chrom_info* chrom_info_ptr);
+int32_t lgen_to_bed(char* lgenname, char* mapname, char* famname, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, uint32_t lgen_modifier, char* lgen_reference_fname, Chrom_info* chrom_info_ptr);
int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* outname_end, uint64_t misc_flags, Chrom_info* chrom_info_ptr);
@@ -25,7 +25,7 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t missing_pheno, uint64_t misc_flags, char* const_fid, char id_delim, char vcf_idspace_to, double vcf_min_qual, char* vcf_filter_exceptions_flattened, Chrom_info* chrom_info_ptr);
-int32_t bed_from_23(char* fname, char* outname, char* outname_end, uint32_t modifier_23, char* fid_23, char* iid_23, double pheno_23, char* paternal_id_23, char* maternal_id_23, Chrom_info* chrom_info_ptr);
+int32_t bed_from_23(char* fname, char* outname, char* outname_end, uint32_t modifier_23, char* fid_23, char* iid_23, double pheno_23, uint64_t misc_flags, char* paternal_id_23, char* maternal_id_23, Chrom_info* chrom_info_ptr);
int32_t generate_dummy(char* outname, char* outname_end, uint32_t flags, uintptr_t marker_ct, uintptr_t sample_ct, double geno_mrate, double pheno_mrate, int32_t missing_pheno);
diff --git a/plink_dosage.c b/plink_dosage.c
index 602a504..71b0c3a 100644
--- a/plink_dosage.c
+++ b/plink_dosage.c
@@ -24,7 +24,7 @@ void dosage_cleanup(Dosage_info* doip) {
#define DOSAGE_EPSILON 0.000244140625
-int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_end, uintptr_t* score_marker_ct_ptr, uintptr_t* max_score_marker_id_len_ptr, char** score_marker_ids_ptr, char*** score_allele_codes_ptr, double** score_effect_sizes_ptr, uintptr_t** score_qrange_key_exists_ptr, double** score_qrange_keys_ptr, uintptr_t* qrange_ct_ptr, uintptr_t* max_qrange_name_len_ptr, char** score_qrange_names_ptr, double** score_qrange_bounds_ptr) {
+int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_end, uint32_t double_dosage, uintptr_t* score_marker_ct_ptr, uintptr_t* max_score_marker_id_len_ptr, char** score_marker_ids_ptr, char*** score_allele_codes_ptr, double** score_effect_sizes_ptr, uintptr_t** score_qrange_key_exists_ptr, double** score_qrange_keys_ptr, uintptr_t* qrange_ct_ptr, uintptr_t* max_qrange_name_len_ptr, char** score_qrange_names_ptr, double** score_qrange_bounds_ptr) {
// We don't necessarily have the whole variant ID list in advance, so it
// makes sense to deviate a bit from score_report().
//
@@ -268,6 +268,9 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
// guaranteed to succeed unless the user is overwriting the file between
// load passes, which we won't bother defending against
marker_idx = (uint32_t)bsearch_str(bufptr_arr[varid_idx], strlen_se(bufptr_arr[varid_idx]), score_marker_ids, max_score_marker_id_len, score_marker_ct);
+ if (double_dosage) {
+ dxx *= 2;
+ }
score_effect_sizes[marker_idx] = dxx;
slen = strlen_se(bufptr_arr[allele_idx]);
if (slen == 1) {
@@ -461,7 +464,7 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
return retval;
}
-int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* outname, char* outname_end, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* makepheno_str, char* phenoname_str, char* covar_fname, Two_col_params* qual_filter, Two_col_params* update_map, Two_col_params* update_name, char* update_ids_fname, char* update_parents_fname, char* update_sex_fname, char* filtervals_ [...]
+int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* outname, char* outname_end, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* makepheno_str, char* phenoname_str, char* covar_fname, Two_col_params* qual_filter, Two_col_params* update_map, Two_col_params* update_name, char* update_ids_fname, char* update_parents_fname, char* update_sex_fname, char* filtervals_ [...]
// sucks to duplicate so much, but this code will be thrown out later so
// there's no long-term maintenance problem
FILE* phenofile = NULL;
@@ -701,7 +704,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
goto plink1_dosage_ret_1;
}
}
- retval = load_fam(famname, fam_cols, uii, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, &unfiltered_sample_ct, &sample_ids, &max_sample_id_len, &paternal_ids, &max_paternal_id_len, &maternal_ids, &max_maternal_id_len, &sex_nm, &sex_male, &affection, &pheno_nm, &pheno_c, &pheno_d, &founder_info, &sample_exclude);
+ retval = load_fam(famname, fam_cols, uii, missing_pheno, (misc_flags / MISC_AFFECTION_01) & 1, &unfiltered_sample_ct, &sample_ids, &max_sample_id_len, &paternal_ids, &max_paternal_id_len, &maternal_ids, &max_maternal_id_len, &sex_nm, &sex_male, &affection, &pheno_nm, &pheno_c, &pheno_d, &founder_info, &sample_exclude, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -786,7 +789,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
if (extractname) {
if (!(misc_flags & MISC_EXTRACT_RANGE)) {
- retval = extract_exclude_flag_norange(extractname, marker_id_htable, marker_id_htable_size, 0, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+ retval = extract_exclude_flag_norange(extractname, marker_id_htable, marker_id_htable_size, 0, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -795,7 +798,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
logerrprint("Error: '--extract range' requires a sorted .bim. Retry this command after\nusing --make-bed to sort your data.\n");
goto plink1_dosage_ret_INVALID_CMDLINE;
}
- retval = extract_exclude_range(extractname, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0, chrom_info_ptr);
+ retval = extract_exclude_range(extractname, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0, 0, chrom_info_ptr);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -805,7 +808,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
if (excludename) {
if (!(misc_flags & MISC_EXCLUDE_RANGE)) {
- retval = extract_exclude_flag_norange(excludename, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+ retval = extract_exclude_flag_norange(excludename, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -814,7 +817,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
logerrprint("Error: '--exclude range' requires a sorted .bim. Retry this command after\nusing --make-bed to sort your data.\n");
goto plink1_dosage_ret_INVALID_CMDLINE;
}
- retval = extract_exclude_range(excludename, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 1, chrom_info_ptr);
+ retval = extract_exclude_range(excludename, marker_pos, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 1, 0, chrom_info_ptr);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -823,13 +826,13 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
}
if (filter_attrib_fname) {
- retval = filter_attrib(filter_attrib_fname, filter_attrib_liststr, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+ retval = filter_attrib(filter_attrib_fname, filter_attrib_liststr, marker_id_htable, marker_id_htable_size, 0, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
if (retval) {
goto plink1_dosage_ret_1;
}
}
if (qual_filter) {
- retval = filter_qual_scores(qual_filter, qual_min_thresh, qual_max_thresh, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+ retval = filter_qual_scores(qual_filter, qual_min_thresh, qual_max_thresh, marker_id_htable, marker_id_htable_size, 0, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -837,10 +840,10 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
wkspace_reset(wkspace_mark);
}
if (thin_keep_prob != 1.0) {
- if (random_thin_markers(thin_keep_prob, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct)) {
+ if (random_thin_markers(thin_keep_prob, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct, 0)) {
goto plink1_dosage_ret_ALL_MARKERS_EXCLUDED;
}
- } else if (thin_keep_ct) {
+ } else if (thin_keep_ct != 0xffffffffU) {
retval = random_thin_markers_ct(thin_keep_ct, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
if (retval) {
goto plink1_dosage_ret_1;
@@ -874,31 +877,31 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
}
if (keepfamname) {
- retval = keep_or_remove(keepfamname, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 2);
+ retval = keep_or_remove(keepfamname, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 2, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
}
if (keepname) {
- retval = keep_or_remove(keepname, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 0);
+ retval = keep_or_remove(keepname, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 0, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
}
if (removefamname) {
- retval = keep_or_remove(removefamname, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 3);
+ retval = keep_or_remove(removefamname, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 3, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
}
if (removename) {
- retval = keep_or_remove(removename, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 1);
+ retval = keep_or_remove(removename, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, 1, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
}
if (filter_attrib_sample_fname) {
- retval = filter_attrib_sample(filter_attrib_sample_fname, filter_attrib_sample_liststr, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct);
+ retval = filter_attrib_sample(filter_attrib_sample_fname, filter_attrib_sample_liststr, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, 0, sample_exclude, &sample_exclude_ct);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -907,7 +910,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
if (!mfilter_col) {
mfilter_col = 1;
}
- retval = filter_samples_file(filtername, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, filtervals_flattened, mfilter_col);
+ retval = filter_samples_file(filtername, sorted_sample_ids, ulii, max_sample_id_len, sample_id_map, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, filtervals_flattened, mfilter_col, 0);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -971,9 +974,13 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
LOGPRINTF("%d %s removed due to founder status (--filter-%s).\n", ii, species_str(ii), (filter_flags & FILTER_BINARY_FOUNDERS)? "founders" : "nonfounders");
}
if (cluster_ptr->fname || (misc_flags & MISC_FAMILY_CLUSTERS)) {
- retval = load_clusters(cluster_ptr->fname, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, mwithin_col, (misc_flags / MISC_LOAD_CLUSTER_KEEP_NA) & 1, &cluster_ct, &cluster_map, &cluster_starts, &cluster_ids, &max_cluster_id_len, cluster_ptr->keep_fname, cluster_ptr->keep_flattened, cluster_ptr->remove_fname, cluster_ptr->remove_flattened);
- if (retval) {
- goto plink1_dosage_ret_1;
+ if (cluster_ptr->keep_fname || cluster_ptr->keep_flattened || cluster_ptr->remove_fname || cluster_ptr->remove_flattened) {
+ retval = load_clusters(cluster_ptr->fname, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, mwithin_col, (misc_flags / MISC_LOAD_CLUSTER_KEEP_NA) & 1, &cluster_ct, &cluster_map, &cluster_starts, &cluster_ids, &max_cluster_id_len, cluster_ptr->keep_fname, cluster_ptr->keep_flattened, cluster_ptr->remove_fname, cluster_ptr->remove_flattened, 0);
+ if (retval) {
+ goto plink1_dosage_ret_1;
+ }
+ } else {
+ logerrprint("Warning: Ignoring --within/--family since it has no effect. (PLINK 1.07's\nundocumented Huber-White standard error computation is currently disabled.)\n");
}
}
sample_ct = unfiltered_sample_ct - sample_exclude_ct;
@@ -998,7 +1005,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
logprint("Using 1 thread.\n");
}
#ifndef NOLAPACK
- if (uii && ((!known_procs) || (known_procs * 2 >= g_thread_ct))) {
+ if (uii && ((!known_procs) || (known_procs >= 2 * ((int32_t)g_thread_ct)))) {
logerrprint("Warning: This run includes BLAS/LAPACK linear algebra operations which\ncurrently disregard the --threads limit. If this is problematic, you may want\nto recompile against single-threaded BLAS/LAPACK.\n");
}
#endif
@@ -1102,7 +1109,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
#endif
}
if (do_score) {
- retval = dosage_load_score_files(sc_ip, outname, outname_end, &score_marker_ct, &max_score_marker_id_len, &score_marker_ids, &score_allele_codes, &score_effect_sizes, &score_qrange_key_exists, &score_qrange_keys, &qrange_ct, &max_qrange_name_len, &score_qrange_names, &score_qrange_bounds);
+ retval = dosage_load_score_files(sc_ip, outname, outname_end, (doip->modifier & DOSAGE_SCORE_DOUBLE), &score_marker_ct, &max_score_marker_id_len, &score_marker_ids, &score_allele_codes, &score_effect_sizes, &score_qrange_key_exists, &score_qrange_keys, &qrange_ct, &max_qrange_name_len, &score_qrange_names, &score_qrange_bounds);
if (retval) {
goto plink1_dosage_ret_1;
}
@@ -1516,15 +1523,15 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
}
bufptr2 = memcpyb(outname_end, ".out.dosage", 12);
}
- if (output_gz) {
- memcpy(bufptr2, ".gz", 4);
- }
- if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
- goto plink1_dosage_ret_OPEN_FAIL;
- }
- pzwritep = (char*)overflow_buf;
if (!do_score) {
+ if (output_gz) {
+ memcpy(bufptr2, ".gz", 4);
+ }
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+ goto plink1_dosage_ret_OPEN_FAIL;
+ }
+ pzwritep = (char*)overflow_buf;
if (do_glm) {
pzwritep = memcpya(pzwritep, tbuf, bufptr - tbuf);
} else if (!count_occur) {
@@ -1922,19 +1929,26 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
dxx = dzz * (1.0 - dzz); // now dxx = theoretical var
dyy = 2 * dyy * sample_valid_ct_recip; // and dyy = empirical
rsq = (dxx > 0.0)? (dyy / dxx) : 0.0;
+ if ((dxx >= 0.0098999999999999) && (rsq >= 0.1) && (rsq <= 2.0)) {
#ifndef NOLAPACK
- if (pheno_d) {
- is_valid = glm_linear_dosage(sample_ct, cur_samples, sample_valid_ct, pheno_nm_collapsed, pheno_d_collapsed, perm_fails, covar_ct, covar_d, cur_dosages, pheno_d2, covars_cov_major_buf, covars_sample_major_buf, param_2d_buf, mi_buf, param_2d_buf2, regression_results, dgels_a, dgels_b, dgels_work, dgels_lwork, standard_beta, glm_vif_thresh, &beta, &se, &pval);
- if (is_valid == 2) {
- // NOMEM special case
- goto plink1_dosage_ret_NOMEM;
- }
- } else {
+ if (pheno_d) {
+ is_valid = glm_linear_dosage(sample_ct, cur_samples, sample_valid_ct, pheno_nm_collapsed, pheno_d_collapsed, perm_fails, covar_ct, covar_d, cur_dosages, pheno_d2, covars_cov_major_buf, covars_sample_major_buf, param_2d_buf, mi_buf, param_2d_buf2, regression_results, dgels_a, dgels_b, dgels_work, dgels_lwork, standard_beta, glm_vif_thresh, &beta, &se, &pval);
+ if (is_valid == 2) {
+ // NOMEM special case
+ goto plink1_dosage_ret_NOMEM;
+ }
+ } else {
#endif
- is_valid = glm_logistic_dosage(sample_ct, cur_samples, sample_valid_ct, pheno_nm_collapsed, pheno_c_collapsed, perm_vec, perm_fails, covar_ct, covar_f, cur_dosages, coef_f, pp_f, pheno_buf_f, covars_cov_major_f_buf, param_1d_buf_f, param_1d_buf2_f, param_2d_buf_f, param_2d_buf2_f, regression_results_f, sample_1d_buf_f, &beta, &se, &pval);
+ is_valid = glm_logistic_dosage(sample_ct, cur_samples, sample_valid_ct, pheno_nm_collapsed, pheno_c_collapsed, perm_vec, perm_fails, covar_ct, covar_f, cur_dosages, coef_f, pp_f, pheno_buf_f, covars_cov_major_f_buf, param_1d_buf_f, param_1d_buf2_f, param_2d_buf_f, param_2d_buf2_f, regression_results_f, sample_1d_buf_f, &beta, &se, &pval);
#ifndef NOLAPACK
- }
+ }
#endif
+ } else {
+ is_valid = 0;
+ if (rsq > 2.0) {
+ rsq = 2.0;
+ }
+ }
if (load_map) {
pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_idx)));
*pzwritep++ = ' ';
diff --git a/plink_dosage.h b/plink_dosage.h
index 7530915..5df6173 100644
--- a/plink_dosage.h
+++ b/plink_dosage.h
@@ -15,8 +15,9 @@
#define DOSAGE_SCORE 0x100
#define DOSAGE_SCORE_NOSUM 0x200
#define DOSAGE_SCORE_CNT 0x400
-#define DOSAGE_SEX 0x800
-#define DOSAGE_FREQ_CC 0x1000
+#define DOSAGE_SCORE_DOUBLE 0x800
+#define DOSAGE_SEX 0x1000
+#define DOSAGE_FREQ_CC 0x2000
typedef struct {
char* fname;
@@ -31,6 +32,6 @@ void dosage_init(Dosage_info* doip);
void dosage_cleanup(Dosage_info* doip);
-int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* outname, char* outname_end, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* makepheno_str, char* phenoname_str, char* covar_fname, Two_col_params* qual_filter, Two_col_params* update_map, Two_col_params* update_name, char* update_ids_fname, char* update_parents_fname, char* update_sex_fname, char* filtervals_ [...]
+int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* outname, char* outname_end, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* makepheno_str, char* phenoname_str, char* covar_fname, Two_col_params* qual_filter, Two_col_params* update_map, Two_col_params* update_name, char* update_ids_fname, char* update_parents_fname, char* update_sex_fname, char* filtervals_ [...]
#endif
diff --git a/plink_family.c b/plink_family.c
index 53806a2..12cee9e 100644
--- a/plink_family.c
+++ b/plink_family.c
@@ -3,6 +3,7 @@
#include "plink_assoc.h"
#include "plink_cluster.h"
#include "plink_family.h"
+#include "plink_perm.h"
#include "plink_stats.h"
void family_init(Family_info* fam_ip) {
@@ -482,10 +483,10 @@ uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf,
uii = *uiptr++;
ujj = *uiptr++;
ukk = *uiptr++;
- umm = (workbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3;
- unn = (workbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3;
+ umm = EXTRACT_2BIT_GENO(workbuf, uii);
+ unn = EXTRACT_2BIT_GENO(workbuf, ukk);
if ((!is_x) || (!is_set(sex_male, uii))) {
- umm = mendel_error_table[umm | (((workbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3) << 2) | (unn << 4)];
+ umm = mendel_error_table[umm | (EXTRACT_2BIT_GENO(workbuf, ujj) << 2) | (unn << 4)];
} else {
umm = mendel_error_table_male_x[umm | (unn << 2)];
}
@@ -684,7 +685,7 @@ void fill_mendel_errstr(uint32_t error_code, char** allele_ptrs, uint32_t* alens
*len_ptr = (uintptr_t)(wptr - wbuf);
}
-int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
+int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uint32_t allow_no_variants, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_ [...]
unsigned char* wkspace_mark = wkspace_base;
FILE* outfile = NULL;
FILE* outfile_l = NULL;
@@ -887,10 +888,10 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
uii = *uiptr++;
ujj = *uiptr++;
ukk = *uiptr++;
- umm = (loadbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3;
- unn = (loadbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3;
+ umm = EXTRACT_2BIT_GENO(loadbuf, uii);
+ unn = EXTRACT_2BIT_GENO(loadbuf, ukk);
if ((!is_x) || (!is_set(sex_male, uii))) {
- umm = mendel_error_table[umm | (((loadbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3) << 2) | (unn << 4)];
+ umm = mendel_error_table[umm | (EXTRACT_2BIT_GENO(loadbuf, ujj) << 2) | (unn << 4)];
} else {
umm = mendel_error_table_male_x[umm | (unn << 2)];
}
@@ -924,7 +925,7 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
ujj = *uiptr++;
ukk = *uiptr++;
trio_idx = *uiptr++;
- uljj = ((loadbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3) | (((loadbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3) << 2);
+ uljj = EXTRACT_2BIT_GENO(loadbuf, ujj) | (EXTRACT_2BIT_GENO(loadbuf, ukk) << 2);
umm = uii / BITCT2;
ujj = 2 * (uii % BITCT2);
ulii = (loadbuf[umm] >> ujj) & 3;
@@ -1179,7 +1180,7 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
}
if (fam_ip->mendel_modifier & MENDEL_FILTER) {
*marker_exclude_ct_ptr += new_marker_exclude_ct;
- if (unfiltered_marker_ct == *marker_exclude_ct_ptr) {
+ if ((unfiltered_marker_ct == *marker_exclude_ct_ptr) && (!allow_no_variants)) {
logerrprint("Error: All variants excluded by --me.\n");
goto mendel_error_scan_ret_ALL_MARKERS_EXCLUDED;
}
@@ -1847,8 +1848,8 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
uii = *lookup_ptr++;
ujj = *lookup_ptr++;
cur_child_ct = *lookup_ptr++;
- ulii = (loadbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3;
- uljj = (loadbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3;
+ ulii = EXTRACT_2BIT_GENO(loadbuf, uii);
+ uljj = EXTRACT_2BIT_GENO(loadbuf, ujj);
ukk = ulii | (uljj << 2);
if ((0x4d04 >> ukk) & 1) {
// 1+ het parents, no missing
@@ -1857,7 +1858,7 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
poo_table_ptr = &(poo_table[4 * ukk]);
for (child_idx = 0; child_idx < cur_child_ct; child_idx++) {
ukk = *lookup_ptr++;
- poo_acc += poo_table_ptr[(loadbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3];
+ poo_acc += poo_table_ptr[EXTRACT_2BIT_GENO(loadbuf, ukk)];
if (++poo_acc_ct == 127) {
// accumulator about to overflow, unpack it
poo_obs_pat_x2 += (unsigned char)poo_acc;
@@ -2296,8 +2297,8 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
uii = *lookup_ptr++;
ujj = *lookup_ptr++;
cur_child_ct = *lookup_ptr++;
- ulii = (loadbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3;
- uljj = (loadbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3;
+ ulii = EXTRACT_2BIT_GENO(loadbuf, uii);
+ uljj = EXTRACT_2BIT_GENO(loadbuf, ujj);
ukk = ulii | (uljj << 2);
if (cur_child_ct & 0x80000000U) {
// discordant
@@ -2326,7 +2327,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
tdt_table_ptr = &(tdt_table[4 * (ulii ^ uljj)]);
for (child_idx = 0; child_idx < cur_child_ct; child_idx++) {
ukk = *lookup_ptr++;
- umm = tdt_table_ptr[(loadbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3];
+ umm = tdt_table_ptr[EXTRACT_2BIT_GENO(loadbuf, ukk)];
tdt_obs_ct += (uint16_t)umm;
tdt_a1_trans_ct += umm >> 16;
}
@@ -2866,12 +2867,27 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
// multithread globals
static double* g_maxt_extreme_stat;
-// static double* g_maxt_thread_results;
+static double* g_maxt_thread_results;
static double* g_mperm_save_all;
static uintptr_t* g_pheno_c;
-// static uintptr_t* g_dfam_flipa;
-// static uintptr_t* g_dfam_perm_vecs;
-// static uintptr_t* g_dfam_perm_vecst;
+static uintptr_t* g_dfam_flipa;
+#ifdef __LP64__
+static uintptr_t* g_dfam_flipa_shuffled;
+#endif
+static uintptr_t* g_dfam_perm_vecs;
+static uintptr_t* g_dfam_perm_vecst; // sample-major, shuffled
+static double* g_dfam_numers;
+static double* g_dfam_denoms;
+static uintptr_t* g_dfam_acc;
+static int32_t* g_dfam_twice_numers;
+static uint32_t* g_dfam_total_counts;
+static uint32_t* g_dfam_iteration_order;
+static uintptr_t g_perm_vec_ct;
+static uint32_t g_dfam_family_all_case_children_ct;
+static uint32_t g_dfam_family_mixed_ct;
+static uint32_t g_dfam_sibship_mixed_ct;
+static uint32_t g_dfam_unrelated_cluster_ct;
+static uint32_t g_dfam_sample_ct;
static uintptr_t* g_loadbuf;
static uintptr_t* g_lm_eligible;
@@ -2899,7 +2915,7 @@ static uintptr_t g_cur_perm_ct;
static double g_qt_sum_all;
static double g_qt_ssq_all;
static uint32_t g_test_type;
-static uint32_t g_qfam_thread_ct;
+static uint32_t g_xfam_thread_ct;
static uint32_t g_fs_ct;
static uint32_t g_singleton_ct;
static uint32_t g_lm_ct;
@@ -2921,6 +2937,832 @@ const uint8_t dfam_allele_ct_table[] =
3, 0, 2, 1,
0, 0, 1, 0};
+void dfam_sibship_or_unrelated_perm_calc(uintptr_t* loadbuf_ptr, const uint32_t* cur_dfam_ptr, const uintptr_t* perm_vecst, const uintptr_t* orig_pheno_c, uint32_t sibling_ct, uint32_t is_unrelated_calc, uintptr_t perm_vec_ct,
+#ifdef __LP64__
+  __m128i* acc4, __m128i* acc8,
+#else
+  uintptr_t* acc4, uintptr_t* acc8,
+#endif
+  uint32_t* cur_case_a1_cts, uint32_t* cur_case_missing_cts, int32_t* twice_numers, double* numers, double* denoms, uint32_t* total_counts) {
+  // Per-permutation DFAM contribution of one sibship (is_unrelated_calc == 0) or one cluster of unrelated samples (nonzero): adds each permutation's case A1 allele count to total_counts[], and its (observed - expected) and variance terms to numers[] and denoms[].
+  // cur_dfam_ptr[0..sibling_ct-1] are the member sample indices; perm_vecst holds perm_vec_wcta words of permuted case bits per sample; acc4, acc8, cur_case_a1_cts and cur_case_missing_cts are caller-supplied scratch space; twice_numers is not referenced in this function.
+  // Genotype codes as used below: 0 = hom A1, 1 = missing, 2 = het, 3 = no A1 copies.  Most families/sibships should have 7 or fewer children, so it makes sense
+  // to use 4-bit accumulators in the inner loop (similar to calc_git() in plink_assoc.c).
+  uintptr_t perm_vec_ct128 = (perm_vec_ct + 127) / 128;
+  const uintptr_t perm_vec_wcta = perm_vec_ct128 * (128 / BITCT);
+  uint32_t cur_genotype_cts[4];
+#ifdef __LP64__
+  const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
+  const __m128i m1x4ls1 = {0x2222222222222222LLU, 0x2222222222222222LLU};
+  uintptr_t acc4_word_ct = perm_vec_ct128 * 8;
+  // uintptr_t acc8_word_ct = perm_vec_ct128 * 16;
+  uintptr_t acc4_vec_ct = perm_vec_ct128 * 4;
+  uintptr_t acc8_vec_ct = acc4_word_ct; // 8 vectors per 128 permutations, numerically equal to acc4_word_ct
+  const __m128i* pheno_perm_ptr;
+  __m128i* acc4_ptr;
+  __m128i loader;
+#else
+  uintptr_t acc4_word_ct = perm_vec_ct128 * 16;
+  uintptr_t acc8_word_ct = perm_vec_ct128 * 32;
+  uintptr_t perm_vec_wct = (perm_vec_ct + (BITCT - 1)) / BITCT;
+  const uintptr_t* pheno_perm_ptr;
+  uintptr_t* acc4_ptr;
+  uintptr_t loader;
+#endif
+  uint32_t case_ct_base = 0;
+  uintptr_t perm_idx;
+  double total_ctd;
+  double total_ct_recip;
+  double xxm1_recip;
+  double hom_a1_ctd;
+  double het_ctd;
+  double case_ctd;
+  double ctrl_ctd;
+  double case_proportion;
+  double case_expected_hom_a1;
+  double case_expected_het;
+  double case_ctrl_div_xxxm1;
+  double case_var_hom_a1;
+  double case_var_het;
+  double case_neg_covar;
+  double case_expected_a1_ct;
+  double case_var_a1_ct;
+  double case_a1_ctd;
+  double dbl_total_ctd;
+  uint32_t sib_idx;
+  uint32_t sample_idx;
+  uint32_t cur_geno;
+  uint32_t geno_match;
+  uint32_t cur_case_ct;
+  uint32_t case_missing_ct;
+  uint32_t case_a1_ct;
+  uint32_t total_ct;
+  uint32_t cur_ctrl_ct;
+  uint32_t max_incr4;
+  uint32_t max_incr8;
+  uint32_t uii;
+  // first check if all genotypes are identical (also tallies genotype counts and the unpermuted case count)
+  fill_uint_zero(cur_genotype_cts, 4);
+  for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+    sample_idx = cur_dfam_ptr[sib_idx];
+    cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+    cur_genotype_cts[cur_geno] += 1;
+    case_ct_base += IS_SET(orig_pheno_c, sample_idx);
+  }
+  cur_geno = 4; // 4 = no genotype seen yet
+  for (geno_match = 0; geno_match < 4; geno_match++) {
+    if (cur_genotype_cts[geno_match]) {
+      if (cur_geno != 4) {
+        break; // second distinct genotype found; geno_match now holds it
+      }
+      cur_geno = geno_match;
+    }
+  }
+  if (geno_match == 4) {
+    if ((!is_unrelated_calc) && (!(cur_geno % 2))) {
+      if (!cur_geno) {
+        uii = cur_genotype_cts[0] * 2;
+      } else {
+        uii = cur_genotype_cts[0]; // NOTE(review): only reached with cur_geno == 2 (all het), where cur_genotype_cts[0] is 0 -- confirm nothing should be added
+      }
+      for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) {
+        total_counts[perm_idx] += uii;
+      }
+    }
+    return;
+  }
+
+#ifdef __LP64__
+  fill_v128_zero(acc4, acc4_vec_ct);
+  fill_v128_zero(acc8, acc8_vec_ct);
+#else
+  fill_ulong_zero(acc4, acc4_word_ct);
+  fill_ulong_zero(acc8, acc8_word_ct);
+#endif
+  fill_uint_zero(cur_case_a1_cts, perm_vec_ct);
+  max_incr4 = 0; // upper bound on any 4-bit lane of acc4
+  max_incr8 = 0; // upper bound on any 8-bit lane of acc8
+  for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+    sample_idx = cur_dfam_ptr[sib_idx];
+    cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+    if (cur_geno & 1) {
+      continue; // missing (1) or no A1 copies (3): contributes nothing
+    }
+    uii = 2 - (cur_geno / 2); // A1 allele count: 2 for genotype 0, 1 for genotype 2
+#ifdef __LP64__
+    if (max_incr4 + uii > 15) {
+      unroll_zero_incr_4_8(acc4, acc8, acc4_vec_ct);
+      max_incr8 += max_incr4;
+      if (max_incr8 > 240) {
+        unroll_zero_incr_8_32(acc8, (__m128i*)cur_case_a1_cts, acc8_vec_ct);
+        max_incr8 = 0;
+      }
+      max_incr4 = 0;
+    }
+    max_incr4 += uii;
+    pheno_perm_ptr = (const __m128i*)(&(perm_vecst[sample_idx * perm_vec_wcta]));
+    if (cur_geno) {
+      unroll_incr_1_4(pheno_perm_ptr, acc4, perm_vec_ct128);
+    } else {
+      // add 2 whenever this sample x permutation is a case
+      acc4_ptr = acc4;
+      for (uii = 0; uii < perm_vec_ct128; uii++) { // one pass per input vector; each pass fills 4 acc4 vectors, so acc4_vec_ct passes would overrun both buffers
+        loader = *pheno_perm_ptr++;
+        acc4_ptr[0] = _mm_add_epi64(acc4_ptr[0], _mm_slli_epi64(_mm_and_si128(loader, m1x4), 1));
+        acc4_ptr[1] = _mm_add_epi64(acc4_ptr[1], _mm_and_si128(loader, m1x4ls1));
+        loader = _mm_srli_epi64(loader, 1);
+        acc4_ptr[2] = _mm_add_epi64(acc4_ptr[2], _mm_and_si128(loader, m1x4ls1));
+        loader = _mm_srli_epi64(loader, 1);
+        acc4_ptr[3] = _mm_add_epi64(acc4_ptr[3], _mm_and_si128(loader, m1x4ls1));
+        acc4_ptr = &(acc4_ptr[4]);
+      }
+    }
+#else
+    if (max_incr4 + uii > 15) {
+      unroll_zero_incr_4_8(acc4, acc8, acc4_word_ct);
+      max_incr8 += max_incr4;
+      if (max_incr8 > 240) {
+        unroll_zero_incr_8_32(acc8, (uintptr_t*)cur_case_a1_cts, acc8_word_ct);
+        max_incr8 = 0;
+      }
+      max_incr4 = 0;
+    }
+    max_incr4 += uii;
+    pheno_perm_ptr = &(perm_vecst[sample_idx * perm_vec_wcta]);
+    if (cur_geno) {
+      unroll_incr_1_4(pheno_perm_ptr, acc4, perm_vec_wct);
+    } else {
+      acc4_ptr = acc4;
+      for (uii = 0; uii < perm_vec_wct; uii++) {
+        loader = *pheno_perm_ptr++;
+        acc4_ptr[0] += (loader & 0x11111111U) << 1;
+        acc4_ptr[1] += loader & 0x22222222U;
+        acc4_ptr[2] += (loader >> 1) & 0x22222222U;
+        acc4_ptr[3] += (loader >> 2) & 0x22222222U;
+        acc4_ptr = &(acc4_ptr[4]);
+      }
+    }
+#endif
+  }
+#ifdef __LP64__
+  unroll_incr_4_8(acc4, acc8, acc4_vec_ct);
+  unroll_incr_8_32(acc8, (__m128i*)cur_case_a1_cts, acc8_vec_ct);
+#else
+  unroll_incr_4_8(acc4, acc8, acc4_word_ct);
+  unroll_incr_8_32(acc8, (uintptr_t*)cur_case_a1_cts, acc8_word_ct);
+#endif
+
+  if (!cur_genotype_cts[1]) {
+    // optimize the common no-missing-genotypes case
+    total_ctd = (double)((int32_t)sibling_ct);
+    total_ct_recip = 1.0 / total_ctd;
+    case_ctd = (double)((int32_t)case_ct_base);
+    case_proportion = case_ctd * total_ct_recip;
+    cur_ctrl_ct = sibling_ct - case_ct_base;
+    ctrl_ctd = (double)((int32_t)cur_ctrl_ct);
+    if (!is_unrelated_calc) {
+      // actually ctrl_ct/(x(x-1)), not 1/(x(x-1))
+      xxm1_recip = ctrl_ctd * total_ct_recip / ((double)((int32_t)(sibling_ct - 1)));
+      hom_a1_ctd = (double)((int32_t)cur_genotype_cts[0]);
+      het_ctd = (double)((int32_t)cur_genotype_cts[2]);
+      case_expected_hom_a1 = case_proportion * hom_a1_ctd;
+      case_expected_het = case_proportion * het_ctd;
+      case_ctrl_div_xxxm1 = case_proportion * xxm1_recip;
+      case_var_hom_a1 = case_ctrl_div_xxxm1 * hom_a1_ctd * (total_ctd - hom_a1_ctd);
+      case_var_het = case_ctrl_div_xxxm1 * het_ctd * (total_ctd - het_ctd);
+      case_neg_covar = case_ctrl_div_xxxm1 * het_ctd;
+      case_expected_a1_ct = 2 * case_expected_hom_a1 + case_expected_het;
+      case_var_a1_ct = 4 * (case_var_hom_a1 + case_neg_covar) + case_var_het;
+      for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) { // expectation and variance are permutation-invariant here; only case_a1_ct varies
+        case_a1_ct = cur_case_a1_cts[perm_idx];
+        total_counts[perm_idx] += case_a1_ct;
+        numers[perm_idx] += (double)((int32_t)case_a1_ct) - case_expected_a1_ct;
+        denoms[perm_idx] += case_var_a1_ct;
+      }
+    } else {
+      // actually ctrl_ct/(x(2x-1)), not 1/(x(x-1))
+      xxm1_recip = ctrl_ctd * total_ct_recip / ((double)((int32_t)(2 * sibling_ct - 1)));
+      dbl_total_ctd = 2 * total_ctd;
+      for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) {
+        case_a1_ct = cur_case_a1_cts[perm_idx];
+        case_a1_ctd = (double)((int32_t)case_a1_ct);
+        case_expected_a1_ct = case_proportion * case_a1_ctd;
+        case_var_a1_ct = case_expected_a1_ct * (dbl_total_ctd - case_a1_ctd) * xxm1_recip;
+        total_counts[perm_idx] += case_a1_ct;
+        numers[perm_idx] += case_a1_ctd - case_expected_a1_ct;
+        denoms[perm_idx] += case_var_a1_ct;
+      }
+    }
+    return;
+  }
+
+#ifdef __LP64__
+  fill_v128_zero(acc4, acc4_vec_ct);
+  fill_v128_zero(acc8, acc8_vec_ct);
+#else
+  fill_ulong_zero(acc4, acc4_word_ct);
+  fill_ulong_zero(acc8, acc8_word_ct);
+#endif
+  fill_uint_zero(cur_case_missing_cts, perm_vec_ct);
+  uii = 0; // NOTE(review): never incremented below, so both flushes run after every sample and the trailing flush is skipped; correct but probably not the intended batching
+  for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+    sample_idx = cur_dfam_ptr[sib_idx];
+    cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+    if (cur_geno != 1) { // tally missing genotypes only; geno_match equals 1 only when a hom-A1 genotype is also present
+      continue;
+    }
+
+#ifdef __LP64__
+    pheno_perm_ptr = (const __m128i*)(&(perm_vecst[sample_idx * perm_vec_wcta]));
+    unroll_incr_1_4(pheno_perm_ptr, acc4, perm_vec_ct128);
+    if (!(uii % 15)) {
+      unroll_zero_incr_4_8(acc4, acc8, acc4_vec_ct);
+      if (!(uii % 255)) {
+        unroll_zero_incr_8_32(acc8, (__m128i*)cur_case_missing_cts, acc8_vec_ct);
+      }
+    }
+#else
+    pheno_perm_ptr = &(perm_vecst[sample_idx * perm_vec_wcta]);
+    unroll_incr_1_4(pheno_perm_ptr, acc4, perm_vec_wct);
+    if (!(uii % 15)) {
+      unroll_zero_incr_4_8(acc4, acc8, acc4_word_ct);
+      if (!(uii % 255)) {
+        unroll_zero_incr_8_32(acc8, (uintptr_t*)cur_case_missing_cts, acc8_word_ct);
+      }
+    }
+#endif
+  }
+  if (uii % 255) {
+#ifdef __LP64__
+    if (uii % 15) {
+      unroll_incr_4_8(acc4, acc8, acc4_vec_ct);
+    }
+    unroll_incr_8_32(acc8, (__m128i*)cur_case_missing_cts, acc8_vec_ct);
+#else
+    if (uii % 15) {
+      unroll_incr_4_8(acc4, acc8, acc4_word_ct);
+    }
+    unroll_incr_8_32(acc8, (uintptr_t*)cur_case_missing_cts, acc8_word_ct);
+#endif
+  }
+
+  total_ct = sibling_ct - cur_genotype_cts[1]; // members with a nonmissing genotype
+  total_ctd = (double)((int32_t)total_ct);
+  total_ct_recip = 1.0 / total_ctd;
+  if (!is_unrelated_calc) {
+    xxm1_recip = total_ct_recip / ((double)((int32_t)(total_ct - 1)));
+    hom_a1_ctd = (double)((int32_t)cur_genotype_cts[0]);
+    het_ctd = (double)((int32_t)cur_genotype_cts[2]);
+    for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) {
+      case_missing_ct = cur_case_missing_cts[perm_idx];
+      cur_case_ct = case_ct_base - case_missing_ct;
+      cur_ctrl_ct = total_ct - cur_case_ct;
+      if ((!cur_case_ct) || (!cur_ctrl_ct)) {
+        continue; // nonmissing members are all cases or all controls under this permutation
+      }
+      case_ctd = (double)((int32_t)cur_case_ct);
+      ctrl_ctd = (double)((int32_t)cur_ctrl_ct);
+      case_a1_ct = cur_case_a1_cts[perm_idx];
+      case_proportion = case_ctd * total_ct_recip;
+      case_expected_hom_a1 = case_proportion * hom_a1_ctd;
+      case_expected_het = case_proportion * het_ctd;
+      case_ctrl_div_xxxm1 = case_proportion * ctrl_ctd * xxm1_recip;
+      case_var_hom_a1 = case_ctrl_div_xxxm1 * hom_a1_ctd * (total_ctd - hom_a1_ctd);
+      case_var_het = case_ctrl_div_xxxm1 * het_ctd * (total_ctd - het_ctd);
+      case_neg_covar = case_ctrl_div_xxxm1 * het_ctd;
+      case_expected_a1_ct = 2 * case_expected_hom_a1 + case_expected_het;
+      case_var_a1_ct = 4 * (case_var_hom_a1 + case_neg_covar) + case_var_het;
+      total_counts[perm_idx] += case_a1_ct;
+      numers[perm_idx] += (double)((int32_t)case_a1_ct) - case_expected_a1_ct;
+      denoms[perm_idx] += case_var_a1_ct;
+    }
+  } else {
+    // actually 1/(x(2x-1)), not 1/(x(x-1))
+    xxm1_recip = total_ct_recip / ((double)((int32_t)(2 * total_ct - 1)));
+    dbl_total_ctd = 2 * total_ctd;
+
+    for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) {
+      case_missing_ct = cur_case_missing_cts[perm_idx];
+      cur_case_ct = case_ct_base - case_missing_ct;
+      cur_ctrl_ct = total_ct - cur_case_ct;
+      if ((!cur_case_ct) || (!cur_ctrl_ct)) {
+        continue; // nonmissing members are all cases or all controls under this permutation
+      }
+      case_proportion = ((double)((int32_t)cur_case_ct)) * total_ct_recip;
+      case_a1_ct = cur_case_a1_cts[perm_idx];
+      case_a1_ctd = (double)((int32_t)case_a1_ct);
+      case_expected_a1_ct = case_proportion * case_a1_ctd;
+      case_var_a1_ct = case_expected_a1_ct * (dbl_total_ctd - case_a1_ctd) * ((double)((int32_t)cur_ctrl_ct)) * xxm1_recip;
+      total_counts[perm_idx] += case_a1_ct;
+      numers[perm_idx] += case_a1_ctd - case_expected_a1_ct;
+      denoms[perm_idx] += case_var_a1_ct;
+    }
+  }
+}
+
+THREAD_RET_TYPE dfam_perm_thread(void* arg) {
+ uintptr_t tidx = (uintptr_t)arg;
+ uintptr_t perm_vec_ct = g_perm_vec_ct;
+ uintptr_t perm_vec_ct128 = (perm_vec_ct + 127) / 128;
+ uintptr_t perm_vec_cta128 = perm_vec_ct128 * 128;
+ uintptr_t perm_vec_ctcl8m = CACHEALIGN32_DBL(perm_vec_ct);
+ uint32_t dfam_thread_ct = g_xfam_thread_ct;
+ uint32_t pidx_offset = g_perms_done;
+ uint32_t first_adapt_check = g_first_adapt_check;
+ uint32_t family_all_case_children_ct = g_dfam_family_all_case_children_ct;
+ uint32_t family_mixed_ct = g_dfam_family_mixed_ct;
+ uint32_t sibship_mixed_ct = g_dfam_sibship_mixed_ct;
+ uint32_t unrelated_cluster_ct = g_dfam_unrelated_cluster_ct;
+ uint32_t dfam_sample_ct = g_dfam_sample_ct;
+ uint32_t dfam_sample_ctl2 = (dfam_sample_ct + (BITCT2 - 1)) / BITCT2;
+ const uintptr_t perm_vec_wcta = perm_vec_ct128 * (128 / BITCT);
+ const uintptr_t* flipa = g_dfam_flipa;
+ const uintptr_t* perm_vecst = g_dfam_perm_vecst;
+ const uintptr_t* orig_pheno_c = g_pheno_c;
+ int32_t* __restrict__ twice_numers = &(g_dfam_twice_numers[tidx * perm_vec_cta128]);
+ uint32_t* __restrict__ total_counts = &(g_dfam_total_counts[tidx * perm_vec_cta128]);
+ uint32_t* __restrict__ perm_attempt_ct = g_perm_attempt_ct;
+ uint32_t* __restrict__ perm_2success_ct = g_perm_2success_ct;
+ double* __restrict__ mperm_save_all = g_mperm_save_all;
+ double* msa_ptr = NULL;
+ double* numers = &(g_dfam_numers[tidx * perm_vec_cta128]);
+ double* denoms = &(g_dfam_denoms[tidx * perm_vec_cta128]);
+ const uint32_t* dfam_iteration_order = g_dfam_iteration_order;
+ unsigned char* perm_adapt_stop = NULL;
+ double adaptive_intercept = 0.0;
+ double adaptive_slope = 0.0;
+ double adaptive_ci_zt = 0.0;
+ double aperm_alpha = 0.0;
+ double* maxt_results = NULL;
+ uint32_t perm_adapt = g_test_type;
+ uint32_t next_adapt_check = 0;
+ uint32_t cur_case_a1_ct_flip[2];
+#ifdef __LP64__
+ const __m128i m1x8 = {0x0101010101010101LLU, 0x0101010101010101LLU};
+ const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU};
+ const __m128i m1x4ls1 = {0x2222222222222222LLU, 0x2222222222222222LLU};
+ __m128i diff_vec;
+ __m128i incr8;
+ __m128i loader;
+ // acc8 (8-bit accumulator) requires (perm_vec_ct + 7) / 8 words; this is
+ // 16-byte aligned when perm_vec_ct is divisible by 16
+ // acc4 requires (perm_vec_ct + 15) / 16 words
+ // sum reduces to (perm_vec_ct128 * 184): 1 acc4 (8 words), 3 acc8s (16 each) and 2 acc32s (64 each)
+ const uintptr_t acc_thread_offset = perm_vec_ct128 * 184;
+ const uintptr_t acc4_word_ct = perm_vec_ct128 * 8;
+ const uintptr_t acc8_word_ct = perm_vec_ct128 * 16;
+ const uintptr_t acc4_vec_ct = perm_vec_ct128 * 4;
+ const uintptr_t acc8_vec_ct = acc4_word_ct;
+ __m128i* acc4 = (__m128i*)(&(g_dfam_acc[tidx * acc_thread_offset]));
+ __m128i* acc8 = (__m128i*)(&(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct]));
+ __m128i* case_a1_ct_acc8 = (__m128i*)(&(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct + acc8_word_ct]));
+ // __m128i* cur_case_ct_acc8 = (__m128i*)(&(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct + 2 * acc8_word_ct]));
+
+ uint32_t* cur_case_a1_cts = (uint32_t*)(&(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct + 3 * acc8_word_ct]));
+ uint32_t* cur_case_missing_cts = (uint32_t*)(&(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct + 7 * acc8_word_ct]));
+
+ const uintptr_t* flipa_shuffled = g_dfam_flipa_shuffled;
+ const __m128i* pheno_perm_ptr;
+ const __m128i* flipa_perm_ptr;
+ __m128i* acc4_ptr;
+ __m128i* acc8_ptr;
+ uintptr_t vidx;
+#else
+ const uintptr_t perm_vec_wct = (perm_vec_ct + (BITCT - 1)) / BITCT;
+ // acc8 requires (perm_vec_ct + 3) / 4 words
+ // acc4 requires (perm_vec_ct + 7) / 8 words
+ // sum reduces to perm_vec_ct128 * 304 since we also have 2 acc32s
+ const uintptr_t acc_thread_offset = perm_vec_ct128 * 304;
+ const uintptr_t acc4_word_ct = perm_vec_ct128 * 16;
+ const uintptr_t acc8_word_ct = perm_vec_ct128 * 32;
+ uintptr_t* acc4 = &(g_dfam_acc[tidx * acc_thread_offset]);
+ uintptr_t* acc8 = &(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct]);
+ uint32_t* cur_case_a1_cts = (uint32_t*)(&(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct + acc8_word_ct]));
+ uint32_t* cur_case_missing_cts = (uint32_t*)(&(g_dfam_acc[tidx * acc_thread_offset + acc4_word_ct + 5 * acc8_word_ct]));
+ const uintptr_t* pheno_perm_ptr;
+ uintptr_t* acc4_ptr;
+ uintptr_t loader;
+ uintptr_t widx;
+#endif
+ uintptr_t perm_idx;
+ const uintptr_t* cur_flipa;
+ double* orig_chisq;
+ const uint32_t* cur_dfam_ptr;
+ uintptr_t* loadbuf_ptr;
+ double chisq_high;
+ double chisq_low;
+ double chisq;
+ double pval;
+ double dxx;
+ double dyy;
+ double dzz;
+ uint32_t marker_bidx;
+ uint32_t marker_bceil;
+ uint32_t marker_idx;
+ uint32_t sample_idx;
+ uint32_t fs_idx;
+ uint32_t unrelated_cluster_idx;
+ uint32_t quad_denom;
+ uint32_t twice_numer_subtract;
+ uint32_t paternal_id;
+ uint32_t maternal_id;
+ uint32_t sibling_ct;
+ uint32_t paternal_geno;
+ uint32_t maternal_geno;
+ uint32_t parental_a1_ct;
+ uint32_t sib_idx;
+ uint32_t nonmissing_sib_ct;
+ uint32_t cur_geno;
+ uint32_t is_flipped;
+ uint32_t max_incr4;
+ uint32_t max_incr8;
+ uint32_t cur_max_incr;
+ uint32_t orig_case_ct;
+ uint32_t success_2start;
+ uint32_t success_2incr;
+ uint32_t uii;
+ uint32_t ujj;
+ if (perm_adapt) {
+ perm_adapt_stop = g_perm_adapt_stop;
+ adaptive_intercept = g_adaptive_intercept;
+ adaptive_slope = g_adaptive_slope;
+ adaptive_ci_zt = g_adaptive_ci_zt;
+ aperm_alpha = g_aperm_alpha;
+ } else {
+ maxt_results = &(g_maxt_thread_results[perm_vec_ctcl8m * tidx]);
+ }
+ while (1) {
+ if (g_block_size <= dfam_thread_ct) {
+ if (g_block_size <= tidx) {
+ goto dfam_perm_thread_skip_all;
+ }
+ marker_bidx = tidx;
+ marker_bceil = tidx + 1;
+ } else {
+ marker_bidx = (((uint64_t)tidx) * g_block_size) / dfam_thread_ct;
+ marker_bceil = (((uint64_t)tidx + 1) * g_block_size) / dfam_thread_ct;
+ }
+ orig_chisq = g_orig_stat;
+ for (; marker_bidx < marker_bceil; marker_bidx++) {
+ marker_idx = g_adapt_m_table[marker_bidx];
+ loadbuf_ptr = &(g_loadbuf[marker_bidx * dfam_sample_ctl2]);
+ if (perm_adapt) {
+ next_adapt_check = first_adapt_check;
+ } else if (mperm_save_all) {
+ msa_ptr = &(mperm_save_all[marker_idx * perm_vec_ct]);
+ }
+ quad_denom = 0;
+ twice_numer_subtract = 0;
+ fill_uint_zero(total_counts, perm_vec_ct);
+ fill_double_zero(numers, perm_vec_ct);
+ fill_double_zero(denoms, perm_vec_ct);
+
+ cur_dfam_ptr = dfam_iteration_order;
+ success_2start = perm_2success_ct[marker_idx];
+ success_2incr = 0;
+ chisq_high = orig_chisq[marker_idx] + EPSILON;
+ chisq_low = orig_chisq[marker_idx] - EPSILON;
+#ifdef __LP64__
+ fill_v128_zero(case_a1_ct_acc8, acc8_vec_ct);
+ max_incr4 = 0;
+ max_incr8 = 0;
+#endif
+ for (fs_idx = 0; fs_idx < family_all_case_children_ct; fs_idx++, cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct])) {
+ paternal_id = *cur_dfam_ptr++;
+ maternal_id = *cur_dfam_ptr++;
+ sibling_ct = *cur_dfam_ptr++;
+ paternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, paternal_id);
+ maternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, maternal_id);
+ parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
+ // skip if parent has missing genotype, or neither parent is het
+ if (!parental_a1_ct) {
+ continue;
+ }
+
+ for (sib_idx = 0, nonmissing_sib_ct = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = cur_dfam_ptr[sib_idx];
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+ nonmissing_sib_ct += (cur_geno != 1);
+ }
+ // skip if all children have missing genotypes
+ if (!nonmissing_sib_ct) {
+ continue;
+ }
+
+ cur_case_a1_ct_flip[0] = 0;
+ cur_case_a1_ct_flip[1] = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = cur_dfam_ptr[sib_idx];
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+ if (cur_geno == 1) {
+ continue;
+ }
+ cur_case_a1_ct_flip[0] += (4 - cur_geno) / 2;
+ }
+ cur_case_a1_ct_flip[1] = nonmissing_sib_ct * parental_a1_ct - cur_case_a1_ct_flip[0];
+ quad_denom += (2 - (parental_a1_ct & 1)) * nonmissing_sib_ct;
+ twice_numer_subtract += nonmissing_sib_ct * parental_a1_ct;
+
+#ifdef __LP64__
+ cur_max_incr = MAXV(cur_case_a1_ct_flip[0], cur_case_a1_ct_flip[1]);
+ max_incr8 += cur_max_incr;
+ // also tried 16-bit accumulators, but that has ~50% greater runtime on
+ // typical datasets
+ if (max_incr8 >= 256) {
+ if (max_incr4) {
+ loader = _mm_set1_epi8(max_incr4);
+ acc8_ptr = case_a1_ct_acc8;
+ for (vidx = 0; vidx < acc8_vec_ct; vidx++) {
+ *acc8_ptr = _mm_add_epi8(*acc8_ptr, loader);
+ acc8_ptr++;
+ }
+ }
+ unroll_zero_incr_8_32(case_a1_ct_acc8, (__m128i*)total_counts, acc8_vec_ct);
+ max_incr8 = cur_max_incr;
+ max_incr4 = 0;
+ }
+ if (cur_max_incr < 256) {
+ max_incr4 += cur_case_a1_ct_flip[0];
+ diff_vec = _mm_set1_epi8((uint8_t)(cur_case_a1_ct_flip[1] - cur_case_a1_ct_flip[0]));
+ acc8_ptr = case_a1_ct_acc8;
+ flipa_perm_ptr = (__m128i*)(&(flipa_shuffled[fs_idx * perm_vec_wcta]));
+ for (vidx = 0; vidx < perm_vec_ct128; vidx++) {
+ loader = *flipa_perm_ptr++;
+ for (uii = 0; uii < 8; uii++) {
+ // set incr8 to (cur_case_a1_ct_flip[1] - cur_case_a1_ct_flip[0])
+ // where (specially permuted) flipA is set, zero when it is not
+ incr8 = _mm_and_si128(_mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(loader, m1x8)), diff_vec);
+ *acc8_ptr = _mm_add_epi8(*acc8_ptr, incr8);
+ acc8_ptr++;
+ loader = _mm_srli_epi64(loader, 1);
+ }
+ }
+ } else {
+ cur_flipa = &(flipa[fs_idx * perm_vec_wcta]);
+ for (uii = 0; uii < perm_vec_ct; uii++) {
+ is_flipped = IS_SET(cur_flipa, uii);
+ total_counts[uii] += cur_case_a1_ct_flip[is_flipped];
+ }
+ max_incr8 = 0;
+ }
+#else
+ cur_flipa = &(flipa[fs_idx * perm_vec_wcta]);
+ for (uii = 0; uii < perm_vec_ct; uii++) {
+ is_flipped = IS_SET(cur_flipa, uii);
+ total_counts[uii] += cur_case_a1_ct_flip[is_flipped];
+ }
+#endif
+ }
+ for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) {
+ twice_numers[perm_idx] = 2 * total_counts[perm_idx] - twice_numer_subtract;
+ }
+ for (fs_idx = 0; fs_idx < family_mixed_ct; fs_idx++, cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct])) {
+ paternal_id = *cur_dfam_ptr++;
+ maternal_id = *cur_dfam_ptr++;
+ sibling_ct = *cur_dfam_ptr++;
+ paternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, paternal_id);
+ maternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, maternal_id);
+ parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
+ if (!parental_a1_ct) {
+ dfam_sibship_or_unrelated_perm_calc(loadbuf_ptr, cur_dfam_ptr, perm_vecst, orig_pheno_c, sibling_ct, 0, perm_vec_ct, acc4, acc8, cur_case_a1_cts, cur_case_missing_cts, twice_numers, numers, denoms, total_counts);
+ } else {
+ for (sib_idx = 0, nonmissing_sib_ct = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = cur_dfam_ptr[sib_idx];
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+ nonmissing_sib_ct += (cur_geno != 1);
+ }
+ // skip if all children have missing genotypes
+ if (!nonmissing_sib_ct) {
+ continue;
+ }
+
+ quad_denom += (2 - (parental_a1_ct & 1)) * nonmissing_sib_ct;
+ cur_flipa = &(flipa[fs_idx * perm_vec_wcta]);
+ fill_uint_zero(cur_case_a1_cts, perm_vec_ct);
+#ifdef __LP64__
+ fill_v128_zero(acc4, acc4_vec_ct);
+ fill_v128_zero(acc8, acc8_vec_ct);
+#else
+ fill_ulong_zero(acc4, acc4_word_ct);
+ fill_ulong_zero(acc8, acc8_word_ct);
+#endif
+ // compute (unflipped) case_a1_ct for each permutation
+ max_incr4 = 0; // maximum possible value in acc4
+ max_incr8 = 0; // maximum possible value in acc8
+ orig_case_ct = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = cur_dfam_ptr[sib_idx];
+ orig_case_ct += IS_SET(orig_pheno_c, sample_idx);
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+ // nothing to do here when cur_geno == 3, since a1_ct is zero
+ if (cur_geno & 1) {
+ continue;
+ }
+ cur_max_incr = (4 - cur_geno) / 2;
+#ifdef __LP64__
+ if (max_incr4 + cur_max_incr > 15) {
+ unroll_zero_incr_4_8(acc4, acc8, acc4_vec_ct);
+ max_incr8 += max_incr4;
+ if (max_incr8 > 240) {
+ unroll_zero_incr_8_32(acc8, (__m128i*)cur_case_a1_cts, acc8_vec_ct);
+ max_incr8 = 0;
+ }
+ max_incr4 = 0;
+ }
+ max_incr4 += cur_max_incr;
+
+ pheno_perm_ptr = (const __m128i*)(&(perm_vecst[sample_idx * perm_vec_wcta]));
+ if (cur_max_incr == 1) {
+ unroll_incr_1_4(pheno_perm_ptr, acc4, perm_vec_ct128);
+ } else {
+ // add 2 whenever this sample is a case
+ acc4_ptr = acc4;
+ for (vidx = 0; vidx < acc4_vec_ct; vidx++) {
+ loader = *pheno_perm_ptr++;
+ acc4_ptr[0] = _mm_add_epi64(acc4_ptr[0], _mm_slli_epi64(_mm_and_si128(loader, m1x4), 1));
+ acc4_ptr[1] = _mm_add_epi64(acc4_ptr[1], _mm_and_si128(loader, m1x4ls1));
+ loader = _mm_srli_epi64(loader, 1);
+ acc4_ptr[2] = _mm_add_epi64(acc4_ptr[2], _mm_and_si128(loader, m1x4ls1));
+ loader = _mm_srli_epi64(loader, 1);
+ acc4_ptr[3] = _mm_add_epi64(acc4_ptr[3], _mm_and_si128(loader, m1x4ls1));
+ acc4_ptr = &(acc4_ptr[4]);
+ }
+ }
+#else
+ if (max_incr4 + cur_max_incr > 15) {
+ unroll_zero_incr_4_8(acc4, acc8, acc4_word_ct);
+ max_incr8 += max_incr4;
+ if (max_incr8 > 240) {
+ unroll_zero_incr_8_32(acc8, (uintptr_t*)cur_case_a1_cts, acc8_word_ct);
+ max_incr8 = 0;
+ }
+ max_incr4 = 0;
+ }
+ max_incr4 += cur_max_incr;
+
+ pheno_perm_ptr = &(perm_vecst[sample_idx * perm_vec_wcta]);
+ if (cur_max_incr == 1) {
+ unroll_incr_1_4(pheno_perm_ptr, acc4, perm_vec_wct);
+ } else {
+ acc4_ptr = acc4;
+ for (widx = 0; widx < perm_vec_wct; widx++) {
+ loader = *pheno_perm_ptr++;
+ acc4_ptr[0] += (loader & 0x11111111U) << 1;
+ acc4_ptr[1] += loader & 0x22222222U;
+ acc4_ptr[2] += (loader >> 1) & 0x22222222U;
+ acc4_ptr[3] += (loader >> 2) & 0x22222222U;
+ acc4_ptr = &(acc4_ptr[4]);
+ }
+ }
+#endif
+ }
+#ifdef __LP64__
+ // max_incr4 guaranteed to be nonzero unless no child had any A1
+ // alleles
+ if (max_incr4) {
+ unroll_incr_4_8(acc4, acc8, acc4_vec_ct);
+ unroll_incr_8_32(acc8, (__m128i*)cur_case_a1_cts, acc8_vec_ct);
+ }
+#else
+ if (max_incr4) {
+ unroll_incr_4_8(acc4, acc8, acc4_word_ct);
+ unroll_incr_8_32(acc8, (uintptr_t*)cur_case_a1_cts, acc8_word_ct);
+ }
+#endif
+ if (nonmissing_sib_ct == sibling_ct) {
+ cur_flipa = &(flipa[fs_idx * perm_vec_wcta]);
+ cur_max_incr = orig_case_ct * parental_a1_ct;
+ for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) {
+ uii = cur_case_a1_cts[perm_idx];
+ if (IS_SET(cur_flipa, perm_idx)) {
+ uii = cur_max_incr - uii;
+ }
+ total_counts[perm_idx] += uii;
+ twice_numers[perm_idx] += (int32_t)(2 * uii) - (int32_t)cur_max_incr;
+ }
+ } else {
+ // cur_case_ct also varies; need to compute case_missing_ct for
+ // each permutation, and twice_numers/total_counts updates are more
+ // complex.
+ // (technically could separate out >50% missingness as a special
+ // case, but we focus our attention on the far more common sparse
+ // missingness scenario.)
+ cur_max_incr = 0;
+ fill_uint_zero(cur_case_missing_cts, perm_vec_ct);
+#ifdef __LP64__
+ fill_v128_zero(acc4, acc4_vec_ct);
+ fill_v128_zero(acc8, acc8_vec_ct);
+#else
+ fill_ulong_zero(acc4, acc4_word_ct);
+ fill_ulong_zero(acc8, acc8_word_ct);
+#endif
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = cur_dfam_ptr[sib_idx];
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
+ if (cur_geno != 1) {
+ continue;
+ }
+#ifdef __LP64__
+ if (!(cur_max_incr % 15)) {
+ unroll_zero_incr_4_8(acc4, acc8, acc4_vec_ct);
+ if (!(cur_max_incr % 255)) {
+ unroll_zero_incr_8_32(acc8, (__m128i*)cur_case_missing_cts, acc8_vec_ct);
+ }
+ }
+ unroll_incr_1_4((__m128i*)(&(perm_vecst[sample_idx * perm_vec_wcta])), acc4, perm_vec_ct128);
+#else
+ if (!(cur_max_incr % 15)) {
+ unroll_zero_incr_4_8(acc4, acc8, acc4_word_ct);
+ if (!(cur_max_incr % 255)) {
+ unroll_zero_incr_8_32(acc8, (uintptr_t*)cur_case_missing_cts, acc8_word_ct);
+ }
+ }
+ unroll_incr_1_4(&(perm_vecst[sample_idx * perm_vec_wcta]), acc4, perm_vec_wct);
+#endif
+ cur_max_incr++;
+ }
+#ifdef __LP64__
+ unroll_incr_4_8(acc4, acc8, acc4_vec_ct);
+ unroll_incr_8_32(acc8, (__m128i*)cur_case_missing_cts, acc8_vec_ct);
+#else
+ unroll_incr_4_8(acc4, acc8, acc4_word_ct);
+ unroll_incr_8_32(acc8, (uintptr_t*)cur_case_missing_cts, acc8_word_ct);
+#endif
+ for (perm_idx = 0; perm_idx < perm_vec_ct; perm_idx++) {
+ uii = cur_case_a1_cts[perm_idx];
+ ujj = (orig_case_ct - cur_case_missing_cts[perm_idx]) * parental_a1_ct;
+ twice_numers[perm_idx] += (int32_t)(2 * uii) - ((int32_t)ujj);
+ total_counts[perm_idx] += uii;
+ }
+ }
+ }
+ }
+ for (fs_idx = 0; fs_idx < sibship_mixed_ct; fs_idx++, cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct])) {
+ sibling_ct = *cur_dfam_ptr++;
+ dfam_sibship_or_unrelated_perm_calc(loadbuf_ptr, cur_dfam_ptr, perm_vecst, orig_pheno_c, sibling_ct, 0, perm_vec_ct, acc4, acc8, cur_case_a1_cts, cur_case_missing_cts, twice_numers, numers, denoms, total_counts);
+ }
+ for (unrelated_cluster_idx = 0; unrelated_cluster_idx < unrelated_cluster_ct; unrelated_cluster_idx++, cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct])) {
+ sibling_ct = *cur_dfam_ptr++;
+ // call sibling permutation routine with unrelated bool set (most of
+ // the code should be identical so this should be one function)
+ dfam_sibship_or_unrelated_perm_calc(loadbuf_ptr, cur_dfam_ptr, perm_vecst, orig_pheno_c, sibling_ct, 1, perm_vec_ct, acc4, acc8, cur_case_a1_cts, cur_case_missing_cts, twice_numers, numers, denoms, total_counts);
+ }
+ if (perm_adapt) {
+ for (perm_idx = 0; perm_idx < perm_vec_ct;) {
+ // now harvest the chi-square values, check adaptive termination
+ // condition, etc.
+ dxx = numers[perm_idx] + ((double)((int32_t)twice_numers[perm_idx])) * 0.5;
+ dyy = denoms[perm_idx] + ((double)((int32_t)quad_denom)) * 0.25;
+ chisq = dxx * dxx / dyy;
+ if (chisq > chisq_high) {
+ success_2incr += 2;
+ } else if (chisq > chisq_low) {
+ success_2incr++;
+ }
+ if (++perm_idx == next_adapt_check - pidx_offset) {
+ uii = success_2start + success_2incr;
+ if (uii) {
+ pval = ((double)((int32_t)uii + 2)) / ((double)(2 * ((int32_t)next_adapt_check + 1)));
+ dxx = adaptive_ci_zt * sqrt(pval * (1 - pval) / ((int32_t)next_adapt_check));
+ dyy = pval - dxx; // lower bound
+ dzz = pval + dxx; // upper bound
+ if ((dyy > aperm_alpha) || (dzz < aperm_alpha)) {
+ perm_adapt_stop[marker_idx] = 1;
+ perm_attempt_ct[marker_idx] = next_adapt_check;
+ break;
+ }
+ }
+ next_adapt_check += (int32_t)(adaptive_intercept + ((int32_t)next_adapt_check) * adaptive_slope);
+ }
+ }
+ } else {
+ for (perm_idx = 0; perm_idx < perm_vec_ct;) {
+ dxx = numers[perm_idx] + ((double)((int32_t)twice_numers[perm_idx])) * 0.5;
+ dyy = denoms[perm_idx] + ((double)((int32_t)quad_denom)) * 0.25;
+ chisq = dxx * dxx / dyy;
+ if (chisq > chisq_high) {
+ success_2incr += 2;
+ } else if (chisq > chisq_low) {
+ success_2incr++;
+ }
+ if (maxt_results[perm_idx] < chisq) {
+ maxt_results[perm_idx] = chisq;
+ }
+ if (msa_ptr) {
+ *msa_ptr++ = chisq;
+ }
+ }
+ }
+ perm_2success_ct[marker_idx] += success_2incr;
+ }
+ dfam_perm_thread_skip_all:
+ if ((!tidx) || g_is_last_thread_block) {
+ THREAD_RETURN;
+ }
+ THREAD_BLOCK_FINISH(tidx);
+ }
+}
+
void dfam_sibship_calc(uint32_t cur_case_ct, uint32_t case_hom_a1_ct, uint32_t case_het_ct, uint32_t cur_ctrl_ct, uint32_t ctrl_hom_a1_ct, uint32_t ctrl_het_ct, uint32_t* total_a1_count_ptr, double* numer_ptr, double* denom_ptr, double* total_expected_ptr) {
if (!cur_ctrl_ct) {
return;
@@ -2954,6 +3796,35 @@ void dfam_sibship_calc(uint32_t cur_case_ct, uint32_t case_hom_a1_ct, uint32_t c
*total_expected_ptr += case_expected_a1_ct;
}
+#ifdef __LP64__
+void dfam_flipa_shuffle(uintptr_t* perms, uintptr_t* shuffled_perms, uint32_t perm_ct) {
+ // Writes a bit-reordered copy of perms into shuffled_perms, two words (one 128-flag group, given 64-bit words under __LP64__) per iteration, for ceil(perm_ct / 128) groups.  Assuming IS_SET tests a single bit: output word 0, bits 0..31 take input bits 0 16 32 48 64 80 96 112 4 20 36 52 68 84 100 116 8 24 40 56 72 88 104 120 12 28 44 60 76 92 108 124 (output bit offset1 + 8 * offset8 <- input bit 16 * offset1 + 4 * offset8);
+ // output word 0, bits 32..63 take those input positions + 1 (1 17 ...); output word 1 takes positions + 2 in its low 32 bits and positions + 3 in its high 32 bits.  Both output words are zeroed first, so shuffled_perms need not be pre-initialized.
+ uint32_t vct = (perm_ct + 127) / 128;
+ uint32_t vidx;
+ uint32_t offset1;
+ uint32_t offset8;
+ uint32_t read_offset;
+ uint32_t write_offset;
+ for (vidx = 0; vidx < vct; ++vidx) {
+ shuffled_perms[0] = 0;
+ shuffled_perms[1] = 0;
+ for (offset1 = 0; offset1 < 8; offset1++) {
+ for (offset8 = 0; offset8 < 4; offset8++) {
+ read_offset = offset1 * 16 + offset8 * 4;
+ write_offset = offset1 + offset8 * 8;
+ shuffled_perms[0] |= IS_SET(perms, read_offset) << write_offset;
+ shuffled_perms[0] |= IS_SET(perms, read_offset + 1) << (write_offset + 32);
+ shuffled_perms[1] |= IS_SET(perms, read_offset + 2) << write_offset;
+ shuffled_perms[1] |= IS_SET(perms, read_offset + 3) << (write_offset + 32);
+ }
+ }
+ perms = &(perms[2]);
+ shuffled_perms = &(shuffled_perms[2]);
+ }
+}
+#endif
+
int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, [...]
unsigned char* wkspace_mark = wkspace_base;
FILE* outfile = NULL;
@@ -2966,9 +3837,24 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
uintptr_t unfiltered_sample_ctp1l2 = 1 + (unfiltered_sample_ct / BITCT2);
uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
+ uintptr_t perm_vec_ct128 = 0;
+ uintptr_t perm_vec_cta128 = 0;
+ uintptr_t perm_vec_wct = 0;
+ uintptr_t perm_vec_wcta = 0;
+ uintptr_t perm_vec_ctcl8m = 0;
uintptr_t* marker_exclude_orig_autosomal = marker_exclude_orig;
uintptr_t* founder_pnm = NULL;
+ uintptr_t* perm_preimage = NULL;
double* orig_chisq = NULL;
+ double* maxt_extreme_stat = NULL;
+ uint32_t* dfam_cluster_map = NULL;
+ uint32_t* dfam_cluster_starts = NULL;
+ uint32_t* dfam_cluster_case_cts = NULL;
+ uint32_t* dfam_tot_quotients = NULL;
+ uint64_t* dfam_totq_magics = NULL;
+ uint32_t* dfam_totq_preshifts = NULL;
+ uint32_t* dfam_totq_postshifts = NULL;
+ uint32_t* dfam_totq_incrs = NULL;
uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
uint32_t multigen = (fam_ip->mendel_modifier / MENDEL_MULTIGEN) & 1;
uint32_t is_set_test = fam_ip->dfam_modifier & DFAM_SET_TEST;
@@ -2976,7 +3862,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uint32_t perm_maxt_nst = (fam_ip->dfam_modifier & DFAM_MPERM) && (!is_set_test);
uint32_t do_perms = fam_ip->dfam_modifier & (DFAM_PERM | DFAM_MPERM);
uint32_t do_perms_nst = do_perms && (!is_set_test);
- // uint32_t perm_count = fam_ip->dfam_modifier & DFAM_PERM_COUNT;
+ uint32_t perm_count = fam_ip->dfam_modifier & DFAM_PERM_COUNT;
uint32_t fill_orig_chisq = do_perms || mtest_adjust;
uint32_t no_unrelateds = (fam_ip->dfam_modifier & DFAM_NO_UNRELATEDS) || (within_cmdflag && (!cluster_ct));
uint32_t family_all_case_children_ct = 0;
@@ -2984,10 +3870,10 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uint32_t sibship_mixed_ct = 0;
uint32_t unrelated_cluster_ct = 0;
uint32_t pct = 0;
- // uint32_t max_thread_ct = g_thread_ct;
+ uint32_t max_thread_ct = MINV(g_thread_ct, MODEL_BLOCKSIZE);
uint32_t perm_pass_idx = 0;
uint32_t perms_total = 0;
- uint32_t perms_done = 0;
+ uint32_t dfam_cluster_map_size = 0;
int32_t retval = 0;
uintptr_t* pheno_nm;
uintptr_t* dfam_pheno_c;
@@ -2997,10 +3883,10 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uintptr_t* marker_exclude;
uintptr_t* dfam_sample_exclude;
uintptr_t* size_one_sibships;
- double* maxt_extreme_stat = NULL;
uint32_t mu_table[MODEL_BLOCKSIZE];
- // char* outname_end2;
+ char* outname_end2;
char* wptr;
+ char* wptr_start;
uint64_t* family_list;
uint64_t* trio_list;
uint32_t* trio_error_lookup;
@@ -3009,11 +3895,12 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uint32_t* sample_to_fss_idx;
uint32_t* dfam_iteration_order;
uint32_t* idx_to_uidx;
- uint32_t* uidx_to_idx;
+ uint32_t* sample_uidx_to_idx;
uint32_t* sample_to_cluster;
uint32_t* cluster_ctrl_case_cts;
uint32_t* cluster_write_idxs;
uint32_t* cur_dfam_ptr;
+ uint32_t* dfam_mixed_start;
uintptr_t marker_ct;
uintptr_t marker_uidx; // loading
uintptr_t marker_uidx2; // writing
@@ -3029,6 +3916,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
double chisq;
double pval;
double dxx;
+ double dyy;
uint32_t family_ct;
uint32_t fs_ct;
uint32_t sample_uidx;
@@ -3042,8 +3930,10 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uint32_t cur_ctrl_ct;
uint32_t cur_case_ct;
uint32_t dfam_sample_ct;
+ uint32_t dfam_sample_ctl;
uint32_t dfam_sample_ctl2;
uint32_t chrom_fo_idx;
+ uint32_t chrom_end;
uint32_t chrom_idx;
uint32_t block_size;
uint32_t block_end;
@@ -3070,6 +3960,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
uint32_t ctrl_het_ct;
uint32_t hom_a1_ct;
uint32_t het_ct;
+ uint32_t dfam_cluster_ct;
uint32_t uii;
uint32_t ujj;
int32_t twice_numer;
@@ -3191,6 +4082,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
}
}
+ dfam_mixed_start = cur_dfam_ptr;
for (fs_idx = 0; fs_idx < family_ct; fs_idx++) {
// Scan for families with at least one case and one control child.
fssc_start = fs_starts[fs_idx] + 2;
@@ -3199,7 +4091,8 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
}
- if (cur_case_ct && (cur_case_ct != fssc_end - fssc_start)) {
+ sibling_ct = fssc_end - fssc_start;
+ if (cur_case_ct && (cur_case_ct != sibling_ct)) {
family_mixed_ct++;
sample_uidx = idx_to_uidx[fss_contents[fssc_start - 2]];
clear_bit(dfam_sample_exclude, sample_uidx);
@@ -3209,7 +4102,8 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
clear_bit(dfam_sample_exclude, sample_uidx);
*cur_dfam_ptr++ = sample_uidx;
- *cur_dfam_ptr++ = fssc_end - fssc_start;
+ dfam_cluster_map_size += sibling_ct;
+ *cur_dfam_ptr++ = sibling_ct;
for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
clear_bit(dfam_sample_exclude, sample_uidx);
@@ -3225,11 +4119,13 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
}
- if (cur_case_ct && (cur_case_ct != fssc_end - fssc_start)) {
+ sibling_ct = fssc_end - fssc_start;
+ if (cur_case_ct && (cur_case_ct != sibling_ct)) {
sibship_mixed_ct++;
// [0]: sibling ct
// [1...]: member uidxs
- *cur_dfam_ptr++ = fssc_end - fssc_start;
+ dfam_cluster_map_size += sibling_ct;
+ *cur_dfam_ptr++ = sibling_ct;
for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
clear_bit(dfam_sample_exclude, sample_uidx);
@@ -3237,6 +4133,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
}
}
+ dfam_cluster_map_size = ((uintptr_t)(cur_dfam_ptr - dfam_mixed_start)) - 3 * family_mixed_ct - sibship_mixed_ct;
if (!no_unrelateds) {
if (wkspace_alloc_ui_checked(&sample_to_cluster, sample_ct * sizeof(int32_t))) {
goto dfam_ret_NOMEM;
@@ -3278,11 +4175,13 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
cur_case_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx + 1];
if (cur_ctrl_ct && cur_case_ct) {
unrelated_cluster_ct++;
- cur_dfam_ptr[write_idx++] = cur_ctrl_ct + cur_case_ct;
+ uii = cur_ctrl_ct + cur_case_ct;
+ cur_dfam_ptr[write_idx++] = uii;
cluster_write_idxs[unrelated_cluster_idx] = write_idx;
- write_idx += cur_ctrl_ct + cur_case_ct;
+ write_idx += uii;
}
}
+ dfam_cluster_map_size += write_idx - unrelated_cluster_ct;
for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
next_unset_unsafe_ck(sample_exclude, &sample_uidx);
unrelated_cluster_idx = sample_to_cluster[sample_idx];
@@ -3301,27 +4200,23 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
wkspace_reset((unsigned char*)idx_to_uidx);
wkspace_shrink_top(dfam_iteration_order, (cur_dfam_ptr - dfam_iteration_order) * sizeof(int32_t));
- if (do_perms) {
- logerrprint("Error: --dfam permutation tests are currently under development.\n");
- retval = RET_CALC_NOT_YET_SUPPORTED;
- goto dfam_ret_1;
- }
dfam_sample_ct = unfiltered_sample_ct - popcount_longs(dfam_sample_exclude, unfiltered_sample_ctl);
+ dfam_sample_ctl = (dfam_sample_ct + (BITCT - 1)) / BITCT;
dfam_sample_ctl2 = (dfam_sample_ct + (BITCT2 - 1)) / BITCT2;
- if (wkspace_alloc_ui_checked(&uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+ if (wkspace_alloc_ui_checked(&sample_uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
goto dfam_ret_NOMEM;
}
- fill_uidx_to_idx(dfam_sample_exclude, unfiltered_sample_ct, dfam_sample_ct, uidx_to_idx);
+ fill_uidx_to_idx(dfam_sample_exclude, unfiltered_sample_ct, dfam_sample_ct, sample_uidx_to_idx);
cur_dfam_ptr = dfam_iteration_order;
uii = family_all_case_children_ct + family_mixed_ct;
for (fs_idx = 0; fs_idx < uii; fs_idx++) {
- *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ *cur_dfam_ptr = sample_uidx_to_idx[*cur_dfam_ptr];
cur_dfam_ptr++;
- *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ *cur_dfam_ptr = sample_uidx_to_idx[*cur_dfam_ptr];
cur_dfam_ptr++;
sibling_ct = *cur_dfam_ptr++;
for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
- *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ *cur_dfam_ptr = sample_uidx_to_idx[*cur_dfam_ptr];
cur_dfam_ptr++;
}
}
@@ -3329,13 +4224,11 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
for (fs_idx = 0; fs_idx < uii; fs_idx++) {
sibling_ct = *cur_dfam_ptr++;
for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
- *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+ *cur_dfam_ptr = sample_uidx_to_idx[*cur_dfam_ptr];
cur_dfam_ptr++;
}
}
- // DEBUG
- // printf("*** %u %u %u %u\n", family_all_case_children_ct, family_mixed_ct, sibship_mixed_ct, unrelated_cluster_ct);
- wkspace_reset((unsigned char*)uidx_to_idx);
+ wkspace_reset((unsigned char*)sample_uidx_to_idx);
if (wkspace_alloc_ul_checked(&dfam_pheno_c, dfam_sample_ctl2 * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&workbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t)) ||
@@ -3344,6 +4237,13 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
collapse_copy_bitarr(sample_ct, pheno_c, dfam_sample_exclude, dfam_sample_ct, dfam_pheno_c);
g_pheno_c = dfam_pheno_c;
+ g_dfam_iteration_order = dfam_iteration_order;
+ g_dfam_family_all_case_children_ct = family_all_case_children_ct;
+ g_dfam_family_mixed_ct = family_mixed_ct;
+ g_dfam_sample_ct = dfam_sample_ct;
+ g_dfam_sibship_mixed_ct = sibship_mixed_ct;
+ g_dfam_unrelated_cluster_ct = unrelated_cluster_ct;
+ g_test_type = perm_adapt_nst;
loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
workbuf[unfiltered_sample_ctp1l2 - 1] = 0;
for (ulii = 1; ulii <= MODEL_BLOCKSIZE; ulii++) {
@@ -3359,6 +4259,67 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
g_orig_stat = orig_chisq;
}
+ dfam_cluster_ct = family_mixed_ct + sibship_mixed_ct + unrelated_cluster_ct;
+ if (do_perms_nst) {
+ logerrprint("Error: --dfam permutation tests are currently under development.\n");
+ retval = RET_CALC_NOT_YET_SUPPORTED;
+ goto dfam_ret_1;
+ if (wkspace_alloc_ui_checked(&dfam_cluster_map, dfam_cluster_map_size * sizeof(int32_t)) ||
+ wkspace_alloc_ui_checked(&dfam_cluster_starts, (dfam_cluster_ct + 1) * sizeof(int32_t)) ||
+ wkspace_alloc_ui_checked(&dfam_cluster_case_cts, dfam_cluster_ct * sizeof(int32_t)) ||
+ wkspace_alloc_ul_checked(&perm_preimage, dfam_sample_ctl * sizeof(intptr_t))) {
+ goto dfam_ret_NOMEM;
+ }
+ fill_ulong_zero(perm_preimage, dfam_sample_ctl);
+ cur_dfam_ptr = dfam_mixed_start;
+ write_idx = 0;
+ for (uii = 0; uii < family_mixed_ct; uii++) {
+ dfam_cluster_starts[uii] = write_idx;
+ cur_dfam_ptr = &(cur_dfam_ptr[2]);
+ sibling_ct = *cur_dfam_ptr++;
+ cur_case_ct = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = cur_dfam_ptr[sib_idx];
+ dfam_cluster_map[write_idx++] = sample_idx;
+ cur_case_ct += IS_SET(dfam_pheno_c, sample_idx);
+ }
+ if (cur_case_ct * 2 >= sibling_ct) {
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ SET_BIT(perm_preimage, cur_dfam_ptr[sib_idx]);
+ }
+ }
+ cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct]);
+ dfam_cluster_case_cts[uii] = cur_case_ct;
+ }
+ for (; uii < dfam_cluster_ct; uii++) {
+ dfam_cluster_starts[uii] = write_idx;
+ sibling_ct = *cur_dfam_ptr++;
+ cur_case_ct = 0;
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ sample_idx = cur_dfam_ptr[sib_idx];
+ dfam_cluster_map[write_idx++] = sample_idx;
+ cur_case_ct += IS_SET(dfam_pheno_c, sample_idx);
+ }
+ if (cur_case_ct * 2 >= sibling_ct) {
+ for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+ SET_BIT(perm_preimage, cur_dfam_ptr[sib_idx]);
+ }
+ }
+ cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct]);
+ dfam_cluster_case_cts[uii] = cur_case_ct;
+ }
+ if (write_idx != dfam_cluster_map_size) {
+ logerrprint("assert failure: write_idx != dfam_cluster_map_size\n");
+ exit(1);
+ }
+ dfam_cluster_starts[dfam_cluster_ct] = write_idx;
+
+ retval = cluster_alloc_and_populate_magic_nums(dfam_cluster_ct, dfam_cluster_map, dfam_cluster_starts, &dfam_tot_quotients, &dfam_totq_magics, &dfam_totq_preshifts, &dfam_totq_postshifts, &dfam_totq_incrs);
+ if (retval) {
+ goto dfam_ret_1;
+ }
+ }
+
ulii = 2 * max_marker_allele_len + plink_maxsnp + MAX_ID_LEN + 256;
if (ulii > MAXLINELEN) {
if (wkspace_alloc_c_checked(&textbuf, ulii)) {
@@ -3370,6 +4331,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
// since it's also restricted to autosomes
g_perms_done = 0;
g_mperm_save_all = NULL;
+ g_dfam_perm_vecs = NULL;
if (perm_maxt_nst) {
perms_total = fam_ip->dfam_mperm_val;
if (wkspace_alloc_d_checked(&maxt_extreme_stat, perms_total * sizeof(double))) {
@@ -3412,8 +4374,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
}
- memcpy(outname_end, ".dfam", 6);
- // outname_end2 = memcpyb(outname_end, ".dfam", 6);
+ outname_end2 = memcpyb(outname_end, ".dfam", 6);
if (fopen_checked(&outfile, outname, "w")) {
goto dfam_ret_OPEN_FAIL;
}
@@ -3439,28 +4400,64 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
fputs("0%", stdout);
fflush(stdout);
// ----- begin main loop -----
- // dfam_more_perms:
+ dfam_more_perms:
if (do_perms_nst) {
- logerrprint("Error: --dfam permutation tests are currently under development.\n");
- retval = RET_CALC_NOT_YET_SUPPORTED;
- goto dfam_ret_1;
- /*
if (perm_adapt_nst && perm_pass_idx) {
while (g_first_adapt_check <= g_perms_done) {
// APERM_MAX prevents infinite loop here
g_first_adapt_check += (int32_t)(apip->init_interval + ((int32_t)g_first_adapt_check) * apip->interval_slope);
}
}
- // g_perm_vec_ct memory allocation dependencies:
- // ;;;
+ // todo: check whether larger batches make sense
g_perm_vec_ct = perm_batch_size;
if (g_perm_vec_ct > perms_total - g_perms_done) {
g_perm_vec_ct = perms_total - g_perms_done;
}
- if (wkspace_alloc_ul_checked(&g_dfam_perm_vecs, g_perm_vec_ct * sample_ctv2 * sizeof(intptr_t))) {
+ perm_vec_ct128 = (g_perm_vec_ct + 127) / 128;
+ perm_vec_cta128 = perm_vec_ct128 * 128;
+ perm_vec_wct = (g_perm_vec_ct + (BITCT - 1)) / BITCT;
+ perm_vec_wcta = perm_vec_ct128 * (128 / BITCT);
+ perm_vec_ctcl8m = CACHEALIGN32_DBL(g_perm_vec_ct);
+
+ if (wkspace_alloc_ul_checked(&g_dfam_perm_vecs, g_perm_vec_ct * dfam_sample_ctl * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&g_dfam_perm_vecst, dfam_sample_ct * perm_vec_wcta * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&g_dfam_flipa, family_ct * perm_vec_wct * sizeof(intptr_t)) ||
+#ifdef __LP64__
+ wkspace_alloc_ul_checked(&g_dfam_flipa_shuffled, family_all_case_children_ct * perm_vec_wcta * sizeof(intptr_t)) ||
+#endif
+ wkspace_alloc_i_checked(&g_dfam_twice_numers, max_thread_ct * perm_vec_cta128 * sizeof(int32_t)) ||
+ wkspace_alloc_ui_checked(&g_dfam_total_counts, max_thread_ct * perm_vec_cta128 * sizeof(int32_t)) ||
+ wkspace_alloc_d_checked(&g_dfam_numers, max_thread_ct * perm_vec_cta128 * sizeof(double)) ||
+ wkspace_alloc_d_checked(&g_dfam_denoms, max_thread_ct * perm_vec_cta128 * sizeof(double))
+ ) {
goto dfam_ret_NOMEM;
}
+ // initialize phenotype and flipa permutations.
+ // don't bother multithreading for now
+ for (ulii = 0; ulii < g_perm_vec_ct; ulii++) {
+ generate_cc_cluster_perm1(dfam_sample_ct, perm_preimage, dfam_cluster_ct, dfam_cluster_map, dfam_cluster_starts, dfam_cluster_case_cts, dfam_tot_quotients, dfam_totq_magics, dfam_totq_preshifts, dfam_totq_postshifts, dfam_totq_incrs, &(g_dfam_perm_vecs[ulii * dfam_sample_ctl]), &sfmt);
+ }
+ transpose_perm1s(g_dfam_perm_vecs, g_perm_vec_ct, sample_ct, (uint32_t*)g_dfam_perm_vecst);
+ /*
+ for () {
+ }
*/
+
+#ifdef __LP64__
+ for (fs_idx = 0; fs_idx < family_all_case_children_ct; fs_idx++) {
+ dfam_flipa_shuffle(&(g_dfam_flipa[fs_idx * perm_vec_wcta]), &(g_dfam_flipa_shuffled[fs_idx * perm_vec_wcta]), g_perm_vec_ct);
+ }
+#endif
+ if (perm_maxt_nst) {
+ if (wkspace_alloc_d_checked(&g_maxt_thread_results, max_thread_ct * perm_vec_ctcl8m * sizeof(double))) {
+ goto dfam_ret_NOMEM;
+ }
+ if (mperm_save & MPERM_DUMP_ALL) {
+ if (wkspace_alloc_d_checked(&g_mperm_save_all, marker_ct * g_perm_vec_ct * sizeof(double))) {
+ goto dfam_ret_NOMEM;
+ }
+ }
+ }
}
chrom_fo_idx = 0xffffffffU;
marker_uidx = next_unset_unsafe(marker_exclude, 0);
@@ -3496,7 +4493,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, sex_male, trio_error_lookup, trio_ct, 0, multigen);
collapse_copy_2bitarr(loadbuf_raw, &(g_loadbuf[block_size * dfam_sample_ctl2]), unfiltered_sample_ct, dfam_sample_ct, dfam_sample_exclude);
- if (perm_adapt_nst) {
+ if (do_perms_nst) {
g_adapt_m_table[block_size] = marker_idx2++;
}
mu_table[block_size++] = marker_uidx;
@@ -3586,8 +4583,8 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
paternal_id = *cur_dfam_ptr++;
maternal_id = *cur_dfam_ptr++;
sibling_ct = *cur_dfam_ptr++;
- paternal_geno = (loadbuf_ptr[paternal_id / BITCT2] >> (2 * (paternal_id % BITCT2))) & 3;
- maternal_geno = (loadbuf_ptr[maternal_id / BITCT2] >> (2 * (maternal_id % BITCT2))) & 3;
+ paternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, paternal_id);
+ maternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, maternal_id);
parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
if (!parental_a1_ct) {
cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct]);
@@ -3597,7 +4594,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
case_a1_ct = 0;
for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
sample_idx = *cur_dfam_ptr++;
- cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
if (cur_geno == 1) {
continue;
}
@@ -3615,8 +4612,8 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
paternal_id = *cur_dfam_ptr++;
maternal_id = *cur_dfam_ptr++;
sibling_ct = *cur_dfam_ptr++;
- paternal_geno = (loadbuf_ptr[paternal_id / BITCT2] >> (2 * (paternal_id % BITCT2))) & 3;
- maternal_geno = (loadbuf_ptr[maternal_id / BITCT2] >> (2 * (maternal_id % BITCT2))) & 3;
+ paternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, paternal_id);
+ maternal_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, maternal_id);
parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
cur_case_ct = 0;
cur_ctrl_ct = 0;
@@ -3626,7 +4623,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
ctrl_het_ct = 0;
for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
sample_idx = *cur_dfam_ptr++;
- cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
if (cur_geno == 1) {
continue;
}
@@ -3676,7 +4673,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
ctrl_het_ct = 0;
for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
sample_idx = *cur_dfam_ptr++;
- cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
if (cur_geno == 1) {
continue;
}
@@ -3715,7 +4712,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
ctrl_het_ct = 0;
for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
sample_idx = *cur_dfam_ptr++;
- cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+ cur_geno = EXTRACT_2BIT_GENO(loadbuf_ptr, sample_idx);
if (cur_geno == 1) {
continue;
}
@@ -3789,6 +4786,11 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
}
}
+ if (do_perms_nst) {
+ // g_xfam_thread_ct = ;;; // f(block size)
+ // ...
+ g_perms_done += g_perm_vec_ct;
+ }
marker_idx += block_size;
if ((!perm_pass_idx) && (marker_idx >= loop_end)) {
if (marker_idx < marker_unstopped_ct) {
@@ -3815,6 +4817,9 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
goto dfam_ret_WRITE_FAIL;
}
if (!is_set_test) {
+ if (do_perms_nst) {
+ wkspace_reset(g_dfam_perm_vecs);
+ }
if (mtest_adjust) {
if (wkspace_alloc_ui_checked(&idx_to_uidx, marker_ct * sizeof(int32_t))) {
goto dfam_ret_NOMEM;
@@ -3836,9 +4841,117 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
}
if (do_perms_nst) {
// if (mperm_save & MPERM_DUMP_ALL) { ...
- // wkspace_reset();
- if (perms_done < perms_total) {
+ wkspace_reset(g_dfam_perm_vecs);
+ if (g_perms_done < perms_total) {
+ if (perm_adapt_nst) {
+ marker_unstopped_ct = marker_ct - popcount_longs((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
+ if (!marker_unstopped_ct) {
+ goto dfam_adapt_perm_count;
+ }
+ }
+ printf("\r%u permutation%s complete.", g_perms_done, (g_perms_done != 1)? "s" : "");
+ fflush(stdout);
+ perm_pass_idx++;
+ goto dfam_more_perms;
}
+ if (perm_adapt_nst) {
+ dfam_adapt_perm_count:
+ g_perms_done = 0;
+ for (uii = 0; uii < marker_ct; uii++) {
+ if (g_perm_attempt_ct[uii] > g_perms_done) {
+ g_perms_done = g_perm_attempt_ct[uii];
+ if (g_perms_done == perms_total) {
+ break;
+ }
+ }
+ }
+ }
+ putchar('\r');
+ LOGPRINTF("%u %s permutation%s complete.\n", g_perms_done, perm_maxt_nst? "max(T)" : "adaptive", (g_perms_done != 1)? "s" : "");
+ if (perm_adapt_nst) {
+ memcpy(outname_end2, ".perm", 6);
+ } else {
+ if (mperm_save & MPERM_DUMP_BEST) {
+ memcpy(outname_end, ".mperm.dump.best", 17);
+ // ...
+ memcpy(outname_end, ".qassoc", 7);
+ }
+ memcpy(outname_end2, ".mperm", 7);
+ }
+ if (fopen_checked(&outfile, outname, "w")) {
+ goto dfam_ret_OPEN_FAIL;
+ }
+ if (perm_adapt_nst) {
+ sprintf(tbuf, " CHR %%%us CHISQ_TDT EMP1 NP \n", plink_maxsnp);
+ } else {
+ sprintf(tbuf, " CHR %%%us CHISQ_TDT EMP1 EMP2 \n", plink_maxsnp);
+#ifdef __cplusplus
+ std::sort(g_maxt_extreme_stat, &(g_maxt_extreme_stat[perms_total]));
+#else
+ qsort(g_maxt_extreme_stat, perms_total, sizeof(double), double_cmp);
+#endif
+ }
+ fprintf(outfile, tbuf, "SNP");
+ chrom_fo_idx = 0xffffffffU;
+ marker_uidx = next_unset_unsafe(marker_exclude, 0);
+ marker_idx = 0;
+ dyy = 1.0 / ((double)((int32_t)perms_total + 1));
+ dxx = 0.5 * dyy;
+ while (1) {
+ do {
+ chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[(++chrom_fo_idx) + 1U];
+ } while (marker_uidx >= chrom_end);
+ uii = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ wptr_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, uii));
+ *wptr_start++ = ' ';
+ wptr_start[plink_maxsnp] = ' ';
+ for (; marker_uidx < chrom_end;) {
+ if (perm_adapt_nst) {
+ pval = ((double)(g_perm_2success_ct[marker_idx] + 2)) / ((double)(2 * (g_perm_attempt_ct[marker_idx + 1])));
+ } else {
+ pval = ((double)(g_perm_2success_ct[marker_idx + 2])) * dxx;
+ }
+ if (pval <= pfilter) {
+ fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), wptr_start);
+ wptr = &(wptr_start[1 + plink_maxsnp]);
+ if (perm_adapt_nst && (!g_perm_attempt_ct[marker_idx])) {
+ // invalid
+ wptr = memcpya(wptr, " NA NA NA", 38);
+ } else {
+ wptr = double_g_writewx4x(wptr, orig_chisq[marker_idx], 12, ' ');
+ if (!perm_count) {
+ wptr = double_g_writewx4(wptr, pval, 12);
+ } else {
+ wptr = double_g_writewx4(wptr, ((double)g_perm_2success_ct[marker_idx]) * 0.5, 12);
+ }
+ *wptr++ = ' ';
+ if (perm_adapt_nst) {
+ wptr = memseta(wptr, 32, 2);
+ wptr = uint32_writew10(wptr, g_perm_attempt_ct[marker_idx]);
+ } else {
+ // ...
+ if (!perm_count) {
+ } else {
+ }
+ }
+ *wptr++ = '\n';
+ if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ goto dfam_ret_WRITE_FAIL;
+ }
+ }
+ if (++marker_idx == marker_ct) {
+ goto dfam_loop_end;
+ }
+ marker_uidx++;
+ next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+ }
+ }
+ }
+ dfam_loop_end:
+ if (fclose_null(&outfile)) {
+ goto dfam_ret_WRITE_FAIL;
+ }
+ LOGPRINTFWW("Permutation test report written to %s .\n", outname);
}
// ...
@@ -3858,6 +4971,11 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
dfam_ret_INVALID_CMDLINE:
retval = RET_INVALID_CMDLINE;
break;
+ /*
+ dfam_ret_THREAD_CREATE_FAIL:
+ retval = RET_THREAD_CREATE_FAIL;
+ break;
+ */
}
dfam_ret_1:
wkspace_reset(wkspace_mark);
@@ -3912,8 +5030,8 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
cur_end = *fs_starts_ptr++;
sample_uidx = fss_contents[cur_start];
uii = fss_contents[cur_start + 1];
- ulii = (loadbuf[sample_uidx / BITCT2] >> (2 * (sample_uidx % BITCT2))) & 3;
- uljj = (loadbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3;
+ ulii = EXTRACT_2BIT_GENO(loadbuf, sample_uidx);
+ uljj = EXTRACT_2BIT_GENO(loadbuf, uii);
if ((ulii != 1) && (uljj != 1)) {
// both parents nonmissing
qfam_b[cur_idx] = 0.5 * (double)(4 - ((intptr_t)((ulii + (ulii == 0)) + (uljj + (uljj == 0)))));
@@ -3925,7 +5043,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
uljj = 0;
do {
sample_uidx = *fss_ptr++;
- ulii = (loadbuf[sample_uidx / BITCT2] >> (2 * (sample_uidx % BITCT2))) & 3;
+ ulii = EXTRACT_2BIT_GENO(loadbuf, sample_uidx);
if (ulii != 1) {
uljj += ulii + (ulii == 0);
} else {
@@ -3949,7 +5067,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
uljj = 0;
do {
sample_uidx = *fss_ptr++;
- ulii = (loadbuf[sample_uidx / BITCT2] >> (2 * (sample_uidx % BITCT2))) & 3;
+ ulii = EXTRACT_2BIT_GENO(loadbuf, sample_uidx);
if (ulii != 1) {
uljj += ulii + (ulii == 0);
} else {
@@ -3966,7 +5084,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
for (; cur_idx < fss_ct; cur_idx++) {
// singletons
sample_uidx = *fss_ptr++;
- ulii = (loadbuf[sample_uidx / BITCT2] >> (2 * (sample_uidx % BITCT2))) & 3;
+ ulii = EXTRACT_2BIT_GENO(loadbuf, sample_uidx);
if (ulii != 1) {
qfam_b[cur_idx] = (double)(2 - (intptr_t)(ulii + (ulii == 0)));
} else {
@@ -3976,7 +5094,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
fill_all_bits(nm_lm, lm_ct);
for (sample_uidx = 0, sample_idx = 0; sample_idx < lm_ct; sample_uidx++, sample_idx++) {
next_set_unsafe_ck(lm_eligible, &sample_uidx);
- ulii = (loadbuf[sample_uidx / BITCT2] >> (2 * (sample_uidx % BITCT2))) & 3;
+ ulii = EXTRACT_2BIT_GENO(loadbuf, sample_uidx);
if (ulii != 1) {
fss_idx = sample_lm_to_fss_idx[sample_idx];
if (!is_set(nm_fss, fss_idx)) {
@@ -3990,7 +5108,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
// assert: fss_contents[uii + 1] == sample_uidx
uii = fss_contents[uii];
}
- if (((loadbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3) == 1) {
+ if (EXTRACT_2BIT_GENO(loadbuf, uii) == 1) {
goto qfam_compute_bw_skip;
}
}
@@ -4115,7 +5233,7 @@ static inline uint32_t qfam_regress(uint32_t test_type, uint32_t nind, uint32_t
THREAD_RET_TYPE qfam_thread(void* arg) {
uintptr_t tidx = (uintptr_t)arg;
- uint32_t qfam_thread_ct = g_qfam_thread_ct;
+ uint32_t qfam_thread_ct = g_xfam_thread_ct;
uint32_t fs_ct = g_fs_ct;
uint32_t lm_ct = g_lm_ct;
uint32_t singleton_ct = g_singleton_ct;
@@ -4460,7 +5578,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
g_fs_ct = fs_ct;
g_singleton_ct = singleton_ct;
g_lm_ct = lm_ct;
- g_qfam_thread_ct = qfam_thread_ct;
+ g_xfam_thread_ct = qfam_thread_ct;
fss_ctl = (fss_ct + BITCT - 1) / BITCT;
lm_ctl = (lm_ct + BITCT - 1) / BITCT;
flip_ctl = only_within? lm_ctl : fss_ctl;
diff --git a/plink_family.h b/plink_family.h
index 5c6826d..4223c6b 100644
--- a/plink_family.h
+++ b/plink_family.h
@@ -53,7 +53,7 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf, uintptr_t* workbuf, uintptr_t* sex_male, uint32_t* trio_lookup, uint32_t trio_ct, uint32_t is_x, uint32_t multigen);
-int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
+int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uint32_t allow_no_variants, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_ [...]
typedef struct {
char* family_ids;
diff --git a/plink_filter.c b/plink_filter.c
index 8b66155..baf4eb0 100644
--- a/plink_filter.c
+++ b/plink_filter.c
@@ -45,7 +45,7 @@ const char* keep_or_remove_flag_str(uint32_t flags) {
return NULL;
}
-int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr, uint32_t flags) {
+int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr, uint32_t flags, uint32_t allow_no_samples) {
FILE* infile = NULL;
unsigned char* wkspace_mark = wkspace_base;
uintptr_t* exclude_arr_new = NULL;
@@ -143,7 +143,7 @@ int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_ct, u
}
memcpy(exclude_arr, exclude_arr_new, unfiltered_ctl * sizeof(intptr_t));
*exclude_ct_ptr = popcount_longs(exclude_arr, unfiltered_ctl);
- if (*exclude_ct_ptr == unfiltered_ct) {
+ if ((*exclude_ct_ptr == unfiltered_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: No %s remaining after --%s.\n", g_species_plural, keep_or_remove_flag_str(flags));
goto keep_or_remove_ret_ALL_SAMPLES_EXCLUDED;
}
@@ -235,7 +235,7 @@ void extract_exclude_process_token(const char* tok_start, const uint32_t* marker
}
}
-int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t do_exclude, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr) {
+int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t do_exclude, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t allow_no_variants) {
unsigned char* wkspace_mark = wkspace_base;
FILE* infile = NULL;
uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
@@ -313,7 +313,7 @@ int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, ui
zero_trailing_bits(marker_exclude, unfiltered_marker_ct);
}
*marker_exclude_ct_ptr = popcount_longs(marker_exclude, unfiltered_marker_ctl);
- if (*marker_exclude_ct_ptr == unfiltered_marker_ct) {
+ if ((*marker_exclude_ct_ptr == unfiltered_marker_ct) && (!allow_no_variants)) {
LOGERRPRINTF("Error: No variants remaining after --%s.\n", do_exclude? "exclude" : "extract");
goto extract_exclude_flag_norange_ret_ALL_MARKERS_EXCLUDED;
}
@@ -347,7 +347,7 @@ int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, ui
return retval;
}
-int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uint32_t id_htable_size, char* item_ids, uintptr_t max_id_len, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr) {
+int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uint32_t id_htable_size, uint32_t allow_no_variants, char* item_ids, uintptr_t max_id_len, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr) {
gzFile gz_infile = NULL;
unsigned char* wkspace_mark = wkspace_base;
uintptr_t include_ct = 0;
@@ -548,7 +548,7 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
clear_bit(exclude_arr_new, item_uidx);
include_ct++;
}
- if (!include_ct) {
+ if ((!include_ct) && (!allow_no_variants)) {
logerrprint("Error: No variants remaining after --attrib.\n");
retval = RET_ALL_MARKERS_EXCLUDED;
goto filter_attrib_ret_1;
@@ -582,7 +582,7 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
return retval;
}
-int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids, uintptr_t sorted_ids_ct, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr) {
+int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids, uintptr_t sorted_ids_ct, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uint32_t allow_no_samples, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr) {
// re-merge this with filter_attrib() after making sample ID lookup
// hash-based
gzFile gz_infile = NULL;
@@ -612,7 +612,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
uint32_t unfiltered_idx;
uint32_t pos_match_needed;
int32_t sorted_idx;
-
+
if (wkspace_alloc_ul_checked(&exclude_arr_new, unfiltered_ctl * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&already_seen, unfiltered_ctl * sizeof(intptr_t)) ||
wkspace_alloc_c_checked(&id_buf, max_id_len)) {
@@ -761,6 +761,10 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
}
set_bit(already_seen, sorted_idx);
unfiltered_idx = id_map[(uint32_t)sorted_idx];
+ if (is_set(exclude_arr, unfiltered_idx)) {
+ // bugfix: don't proceed here
+ continue;
+ }
pos_match_needed = pos_match_ct;
while (!is_eoln_kns(*cond_ptr)) {
bufptr2 = cond_ptr;
@@ -784,7 +788,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
clear_bit(exclude_arr_new, unfiltered_idx);
include_ct++;
}
- if (!include_ct) {
+ if ((!include_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: No %s remaining after --attrib-indiv.\n", g_species_plural);
retval = RET_ALL_SAMPLES_EXCLUDED;
goto filter_attrib_sample_ret_1;
@@ -792,6 +796,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
LOGPRINTF("--attrib-indiv: %" PRIuPTR " %s remaining.\n", include_ct, species_str(include_ct));
memcpy(exclude_arr, exclude_arr_new, unfiltered_ctl * sizeof(intptr_t));
*exclude_ct_ptr = unfiltered_ct - include_ct;
+
while (0) {
filter_attrib_sample_ret_NOMEM:
retval = RET_NOMEM;
@@ -818,7 +823,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
return retval;
}
-int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh, double qual_max_thresh, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr) {
+int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh, double qual_max_thresh, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t allow_no_variants, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr) {
unsigned char* wkspace_mark = wkspace_base;
FILE* infile = NULL;
uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
@@ -916,6 +921,11 @@ int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh,
}
*marker_exclude_ct_ptr = popcount_longs(marker_exclude, unfiltered_marker_ctl);
marker_ct = unfiltered_marker_ct - *marker_exclude_ct_ptr;
+ if ((!marker_ct) && (!allow_no_variants)) {
+ logerrprint("Error: No variants remaining after --qual-scores.\n");
+ retval = RET_ALL_MARKERS_EXCLUDED;
+ goto filter_qual_scores_ret_1;
+ }
if (miss_ct) {
sprintf(logbuf, "--qual-scores: %" PRIuPTR " variant%s remaining, %" PRIuPTR " ID%s missing.\n", marker_ct, (marker_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
} else {
@@ -942,7 +952,7 @@ int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh,
return retval;
}
-uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr) {
+uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t allow_no_variants) {
uint32_t marker_ct = unfiltered_marker_ct - *marker_exclude_ct_ptr;
uint32_t marker_uidx = 0;
uint32_t markers_done = 0;
@@ -960,7 +970,7 @@ uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_
}
} while (++marker_uidx < marker_uidx_stop);
}
- if (marker_ct == removed_ct) {
+ if ((marker_ct == removed_ct) && (!allow_no_variants)) {
logerrprint("Error: All variants removed by --thin. Try a higher probability.\n");
return 1;
}
@@ -981,17 +991,23 @@ int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marke
LOGERRPRINTF("Error: --thin-count parameter exceeds number of remaining variants.\n");
goto random_thin_markers_ct_ret_INVALID_CMDLINE;
}
- if (wkspace_alloc_ul_checked(&perm_buf, marker_ctl * sizeof(intptr_t))) {
- goto random_thin_markers_ct_ret_NOMEM;
- }
- // no actual interleaving here, but may as well use this function
- generate_perm1_interleaved(marker_ct, marker_ct - thin_keep_ct, 0, 1, perm_buf);
- marker_uidx = 0;
- for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
- next_unset_unsafe_ck(marker_exclude, &marker_uidx);
- if (is_set(perm_buf, marker_idx)) {
- set_bit(marker_exclude, marker_uidx);
+ if (marker_ct > 1) {
+ if (wkspace_alloc_ul_checked(&perm_buf, marker_ctl * sizeof(intptr_t))) {
+ goto random_thin_markers_ct_ret_NOMEM;
+ }
+ // no actual interleaving here, but may as well use this function
+ // note that this requires marker_ct >= 2
+ generate_perm1_interleaved(marker_ct, marker_ct - thin_keep_ct, 0, 1, perm_buf);
+ marker_uidx = 0;
+ for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+ next_unset_unsafe_ck(marker_exclude, &marker_uidx);
+ if (is_set(perm_buf, marker_idx)) {
+ set_bit(marker_exclude, marker_uidx);
+ }
}
+ } else if ((!thin_keep_ct) && marker_ct) {
+ marker_uidx = next_unset_unsafe(marker_exclude, 0);
+ set_bit(marker_exclude, marker_uidx);
}
LOGPRINTF("--thin-count: %u variant%s removed (%u remaining).\n", marker_ct - thin_keep_ct, (marker_ct - thin_keep_ct == 1)? "" : "s", thin_keep_ct);
*marker_exclude_ct_ptr = unfiltered_marker_ct - thin_keep_ct;
@@ -1007,7 +1023,7 @@ int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marke
return retval;
}
-uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr) {
+uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uint32_t allow_no_samples, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr) {
uint32_t sample_ct = unfiltered_sample_ct - *sample_exclude_ct_ptr;
uint32_t sample_uidx = 0;
uint32_t samples_done = 0;
@@ -1025,7 +1041,7 @@ uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_
}
} while (++sample_uidx < sample_uidx_stop);
}
- if (sample_ct == removed_ct) {
+ if ((sample_ct == removed_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: All %s removed by --thin-indiv. Try a higher probability.\n", g_species_plural);
return 1;
}
@@ -1073,7 +1089,7 @@ int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sampl
}
-int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip) {
+int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip) {
// 1. load and validate cluster file
// 2. load marker file, sort by uidx
// 3. check for early exit (no clusters and/or no .zero entries)
@@ -1378,7 +1394,7 @@ int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfilt
return retval;
}
-int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t sorted_ids_len, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* filtervals_flattened, uint32_t mfilter_col) {
+int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t sorted_ids_len, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* filtervals_flattened, uint32_t mfilter_col, uint32_t allow_no_samples) {
FILE* infile = NULL;
unsigned char* wkspace_mark = wkspace_base;
uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
@@ -1446,7 +1462,7 @@ int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t
if (!feof(infile)) {
goto filter_samples_file_ret_READ_FAIL;
}
- if (!include_ct) {
+ if ((!include_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: All %s excluded by --filter.\n", g_species_plural);
goto filter_samples_file_ret_ALL_SAMPLES_EXCLUDED;
}
@@ -1510,7 +1526,12 @@ void filter_samples_bitfields(uintptr_t unfiltered_sample_ct, uintptr_t* sample_
*sample_exclude_ct_ptr = popcount_longs(sample_exclude, unfiltered_sample_ctl);
}
-int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double mind_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip) {
+int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double mind_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, uint32_t allow_no_samples) {
+ uint32_t sample_exclude_ct = *sample_exclude_ct_ptr;
+ uint32_t sample_ct = unfiltered_sample_ct - sample_exclude_ct;
+ if (!sample_ct) {
+ return 0;
+ }
unsigned char* wkspace_mark = wkspace_base;
FILE* outfile = NULL;
uint32_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
@@ -1523,8 +1544,6 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
uintptr_t y_end = 0;
uintptr_t* sample_male_include2 = NULL;
uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
- uint32_t sample_exclude_ct = *sample_exclude_ct_ptr;
- uint32_t sample_ct = unfiltered_sample_ct - sample_exclude_ct;
uint32_t sample_uidx = 0;
uint32_t sample_idx = 0;
uint32_t removed_ct = 0;
@@ -1664,7 +1683,7 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
}
}
*sample_exclude_ct_ptr += removed_ct;
- if (*sample_exclude_ct_ptr == unfiltered_sample_ct) {
+ if ((*sample_exclude_ct_ptr == unfiltered_sample_ct) && (!allow_no_samples)) {
LOGERRPRINTF("Error: All %s removed due to missing genotype data (--mind).\n", g_species_plural);
LOGPRINTFWW("IDs written to %s .\n", outname);
goto mind_filter_ret_ALL_SAMPLES_EXCLUDED;
@@ -2846,23 +2865,23 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx], chrom_end);
- is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
- is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
- is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
- if (!is_y) {
- cur_nm = sample_include2;
- cur_tot = sample_ct;
- cur_cluster_sizes = cluster_sizes;
- om_ycorr = 0;
- } else {
- cur_nm = sample_male_include2;
- cur_tot = sample_male_ct;
- cur_cluster_sizes = cluster_sizes_y;
- om_ycorr = om_cluster_ct;
- }
- cptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
- *cptr++ = ' ';
if (marker_uidx < chrom_end) {
+ is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
+ is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
+ is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
+ if (!is_y) {
+ cur_nm = sample_include2;
+ cur_tot = sample_ct;
+ cur_cluster_sizes = cluster_sizes;
+ om_ycorr = 0;
+ } else {
+ cur_nm = sample_male_include2;
+ cur_tot = sample_male_ct;
+ cur_cluster_sizes = cluster_sizes_y;
+ om_ycorr = om_cluster_ct;
+ }
+ cptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+ *cptr++ = ' ';
if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
goto write_missingness_reports_ret_READ_FAIL;
}
@@ -3049,7 +3068,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
return retval;
}
-int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* prefix_buf, uint32_t prefix_len, uint32_t reverse, uint32_t ll_ct, uint32_t lh_ct, uint32_t hh_ct, char* midbuf_ptr, double pval, double output_min_p) {
+int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* prefix_buf, uint32_t prefix_len, uint32_t reverse, uint32_t ll_ct, uint32_t lh_ct, uint32_t hh_ct, uint32_t hwe_midp, uint32_t is_mt, char* midbuf_ptr, double pval, double output_min_p) {
char* pzwritep = *pzwritep_ptr;
char wbuf[48];
char* cptr;
@@ -3065,12 +3084,13 @@ int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* p
pzwritep = fw_strcpyn(20, cptr - wbuf, wbuf, pzwritep);
*pzwritep++ = ' ';
denom = (ll_ct + lh_ct + hh_ct) * 2;
- if (denom) {
+ if (denom && (!is_mt)) {
drecip = 1.0 / ((double)denom);
minor_freq = (2 * ll_ct + lh_ct) * drecip;
pzwritep = double_g_writewx4(double_g_writewx4x(double_g_writewx4x(pzwritep, (lh_ct * 2) * drecip, 8, ' '), minor_freq * (2 * hh_ct + lh_ct) * drecip * 2, 8, ' '), MAXV(pval, output_min_p), 12);
} else {
- pzwritep = memcpya(pzwritep, "     nan      nan           NA", 30);
+ pzwritep = memcpya(pzwritep, "     nan      nan          ", 27); // two 8-wide nan fields with separators, then padding up to the last 3 characters of the 12-wide P column
+ pzwritep = memcpyl3a(pzwritep, hwe_midp? "0.5" : "  1"); // NOTE(review): memcpyl3a presumably copies exactly 3 bytes, so both literals must be 3 characters -- confirm
}
append_binary_eoln(&pzwritep);
if (flex_pzwrite(ps_ptr, &pzwritep)) {
@@ -3089,7 +3109,6 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
uint32_t hwe_midp = hwe_modifier & HWE_MIDP;
uint32_t output_gz = (hwe_modifier / HWE_GZ) & 1;
int32_t retval = 0;
- uint32_t skip_chrom = 0;
uint32_t pct = 0;
Pigz_state ps;
uint32_t prefix_len;
@@ -3156,7 +3175,6 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
chrom_fo_idx = 0;
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
- skip_chrom = (is_haploid && (!is_x)) || is_mt;
cptr0 = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]));
*cptr0++ = ' ';
cptr = &(cptr0[10 + plink_maxsnp]);
@@ -3175,7 +3193,6 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
if (marker_uidx >= chrom_end) {
chrom_fo_idx++;
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
- skip_chrom = (is_haploid && (!is_x)) || is_mt;
cptr0 = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]));
*cptr0++ = ' ';
cptr = &(cptr0[10 + plink_maxsnp]);
@@ -3187,9 +3204,6 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
}
cptr2 = &(cptr[18 + 2 * max_marker_allele_len]);
}
- if (skip_chrom) {
- continue;
- }
fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), cptr0);
reverse = IS_SET(marker_reverse, marker_uidx);
cptr3 = marker_allele_ptrs[2 * marker_uidx];
@@ -3199,7 +3213,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
cptr5 = fw_strcpy(4, cptr4, &(cptr5[1]));
*cptr5 = ' ';
prefix_len = 1 + (cptr5 - writebuf);
- if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[marker_idx], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], hwe_midp, is_mt, cptr2, p_values[marker_idx], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
}
@@ -3221,7 +3235,6 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
if (marker_uidx >= chrom_end) {
chrom_fo_idx++;
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
- skip_chrom = (is_haploid && (!is_x)) || is_mt;
cptr0 = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_info_ptr->chrom_file_order[chrom_fo_idx]));
*cptr0++ = ' ';
memset(&(cptr0[plink_maxsnp]), 32, 20);
@@ -3229,9 +3242,6 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
cptr2 = &(cptr[18 + 2 * max_marker_allele_len]);
prefix_len = 10 + ((uintptr_t)(cptr - writebuf));
}
- if (skip_chrom) {
- continue;
- }
fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), cptr0);
memcpy(&(cptr0[4 + plink_maxsnp]), " ALL", 5);
reverse = IS_SET(marker_reverse, marker_uidx);
@@ -3242,17 +3252,17 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
cptr5 = fw_strcpy(4, cptr4, &(cptr5[1]));
*cptr5 = ' ';
prefix_len = 1 + (cptr5 - writebuf);
- if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[3 * marker_idx], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], hwe_midp, is_mt, cptr2, p_values[3 * marker_idx], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
memcpy(&(cptr0[7 + plink_maxsnp]), "FF", 2);
- if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_cases[marker_uidx], hwe_lh_cases[marker_uidx], hwe_hh_cases[marker_uidx], cptr2, p_values[3 * marker_idx + 1], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_cases[marker_uidx], hwe_lh_cases[marker_uidx], hwe_hh_cases[marker_uidx], hwe_midp, is_mt, cptr2, p_values[3 * marker_idx + 1], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
memcpy(&(cptr0[4 + plink_maxsnp]), "UN", 2);
- if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_lls[marker_uidx], hwe_lhs[marker_uidx], hwe_hhs[marker_uidx], cptr2, p_values[3 * marker_idx + 2], output_min_p)) {
+ if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_lls[marker_uidx], hwe_lhs[marker_uidx], hwe_hhs[marker_uidx], hwe_midp, is_mt, cptr2, p_values[3 * marker_idx + 2], output_min_p)) {
goto hardy_report_ret_WRITE_FAIL;
}
}
@@ -3284,70 +3294,75 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
return retval;
}
-uint32_t enforce_hwe_threshold(double hwe_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, int32_t* hwe_ll_allfs, int32_t* hwe_lh_allfs, int32_t* hwe_hh_allfs, Chrom_info* chrom_info_ptr) {
+uint32_t enforce_hwe_threshold(double hwe_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t allow_no_variants, int32_t* hwe_ll_allfs, int32_t* hwe_lh_allfs, int32_t* hwe_hh_allfs, Chrom_info* chrom_info_ptr) {
uint32_t marker_ct = unfiltered_marker_ct - *marker_exclude_ct_ptr;
uint32_t marker_uidx = 0;
uint32_t removed_ct = 0;
uint32_t hwe_all = hwe_modifier & HWE_THRESH_ALL;
uint32_t hwe_thresh_midp = hwe_modifier & HWE_THRESH_MIDP;
- uint32_t min_obs = 0xffffffffU;
+ uint32_t min_obs_nonx = 0xffffffffU;
+ uint32_t min_obs_x = 0xffffffffU;
uint32_t max_obs = 0;
- int32_t mt_code = chrom_info_ptr->mt_code;
- uint32_t mt_start = 0;
- uint32_t mt_end = 0;
- uint32_t markers_done;
+ uint32_t chrom_fo_idx;
+ uint32_t chrom_idx;
+ uint32_t chrom_end;
uint32_t cur_obs;
+ uint32_t cur_min_obs;
+ int32_t is_x;
+ int32_t test_failed;
+ if (chrom_info_ptr->haploid_mask[0] & 1) {
+ logerrprint("Warning: --hwe has no effect since entire genome is haploid.\n");
+ return 0;
+ }
hwe_thresh *= 1 + SMALL_EPSILON;
if (hwe_all) {
hwe_lhs = hwe_lh_allfs;
hwe_lls = hwe_ll_allfs;
hwe_hhs = hwe_hh_allfs;
}
- if ((mt_code != -1) && is_set(chrom_info_ptr->chrom_mask, mt_code)) {
- mt_start = chrom_info_ptr->chrom_start[(uint32_t)mt_code];
- mt_end = chrom_info_ptr->chrom_end[(uint32_t)mt_code];
- }
- if (hwe_thresh_midp) {
- for (markers_done = 0; markers_done < marker_ct; marker_uidx++, markers_done++) {
- next_unset_unsafe_ck(marker_exclude, &marker_uidx);
- if ((marker_uidx < mt_end) && (marker_uidx >= mt_start)) {
- continue;
- }
- if (SNPHWE_midp_t(hwe_lhs[marker_uidx], hwe_lls[marker_uidx], hwe_hhs[marker_uidx], hwe_thresh)) {
- SET_BIT(marker_exclude, marker_uidx);
- removed_ct++;
- }
- cur_obs = hwe_lhs[marker_uidx] + hwe_lls[marker_uidx] + hwe_hhs[marker_uidx];
- if (cur_obs < min_obs) {
- min_obs = cur_obs;
- }
- if (cur_obs > max_obs) {
- max_obs = cur_obs;
- }
- }
- } else {
- for (markers_done = 0; markers_done < marker_ct; marker_uidx++, markers_done++) {
- next_unset_unsafe_ck(marker_exclude, &marker_uidx);
- if ((marker_uidx < mt_end) && (marker_uidx >= mt_start)) {
- continue;
- }
- if (SNPHWE_t(hwe_lhs[marker_uidx], hwe_lls[marker_uidx], hwe_hhs[marker_uidx], hwe_thresh)) {
- SET_BIT(marker_exclude, marker_uidx);
- removed_ct++;
- }
- cur_obs = hwe_lhs[marker_uidx] + hwe_lls[marker_uidx] + hwe_hhs[marker_uidx];
- if (cur_obs < min_obs) {
- min_obs = cur_obs;
+ for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
+ chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+ chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
+ marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx], chrom_end);
+ if (marker_uidx < chrom_end) {
+ is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
+ if ((((int32_t)chrom_idx) == chrom_info_ptr->mt_code) || (is_set(chrom_info_ptr->haploid_mask, chrom_idx) && (!is_x))) {
+ continue;
}
- if (cur_obs > max_obs) {
- max_obs = cur_obs;
+ // okay if min_obs_x is an underestimate
+ cur_min_obs = min_obs_nonx;
+ do {
+ if (hwe_thresh_midp) {
+ test_failed = SNPHWE_midp_t(hwe_lhs[marker_uidx], hwe_lls[marker_uidx], hwe_hhs[marker_uidx], hwe_thresh);
+ } else {
+ test_failed = SNPHWE_t(hwe_lhs[marker_uidx], hwe_lls[marker_uidx], hwe_hhs[marker_uidx], hwe_thresh);
+ }
+ if (test_failed) {
+ SET_BIT(marker_exclude, marker_uidx);
+ removed_ct++;
+ }
+ cur_obs = hwe_lhs[marker_uidx] + hwe_lls[marker_uidx] + hwe_hhs[marker_uidx];
+ if (cur_obs < cur_min_obs) {
+ cur_min_obs = cur_obs;
+ }
+ if (cur_obs > max_obs) {
+ max_obs = cur_obs;
+ }
+ marker_uidx = next_unset(marker_exclude, marker_uidx + 1, chrom_end);
+ } while (marker_uidx < chrom_end);
+ if (is_x) {
+ min_obs_x = cur_min_obs;
+ } else {
+ min_obs_nonx = cur_min_obs;
}
}
}
- if (((uint64_t)max_obs) * 9 > ((uint64_t)min_obs) * 10) {
+ if (((uint64_t)max_obs) * 9 > ((uint64_t)min_obs_nonx) * 10) {
logerrprint("Warning: --hwe observation counts vary by more than 10%. Consider using\n--geno, and/or applying different p-value thresholds to distinct subsets of\nyour data.\n");
+ } else if (((uint64_t)max_obs) * 9 > ((uint64_t)min_obs_x) * 10) {
+ logerrprint("Warning: --hwe observation counts vary by more than 10%, due to the X\nchromosome. You may want to use a less stringent --hwe p-value threshold for X\nchromosome variants.\n");
}
- if (marker_ct == removed_ct) {
+ if ((marker_ct == removed_ct) && (!allow_no_variants)) {
logerrprint("Error: All variants removed due to Hardy-Weinberg exact test (--hwe).\n");
return 1;
}
@@ -3356,7 +3371,7 @@ uint32_t enforce_hwe_threshold(double hwe_thresh, uintptr_t unfiltered_marker_ct
return 0;
}
-uint32_t enforce_minor_allele_thresholds(double min_maf, double max_maf, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* ac_excl_bitfield, uintptr_t* marker_exclude_ct_ptr, double* set_allele_freqs) {
+uint32_t enforce_minor_allele_thresholds(double min_maf, double max_maf, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* ac_excl_bitfield, uintptr_t* marker_exclude_ct_ptr, double* set_allele_freqs, uint32_t allow_no_variants) {
uint32_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
uint32_t marker_ct = unfiltered_marker_ct - *marker_exclude_ct_ptr;
uint32_t marker_uidx = 0;
@@ -3383,7 +3398,7 @@ uint32_t enforce_minor_allele_thresholds(double min_maf, double max_maf, uintptr
bitfield_or(marker_exclude, ac_excl_bitfield, unfiltered_marker_ctl);
}
removed_ct = popcount_longs(marker_exclude, unfiltered_marker_ctl) - (*marker_exclude_ct_ptr);
- if (marker_ct == removed_ct) {
+ if ((marker_ct == removed_ct) && (!allow_no_variants)) {
logerrprint("Error: All variants removed due to minor allele threshold(s)\n(--maf/--max-maf/--mac/--max-mac).\n");
return 1;
}
diff --git a/plink_filter.h b/plink_filter.h
index 9c5194d..7ac2d96 100644
--- a/plink_filter.h
+++ b/plink_filter.h
@@ -17,31 +17,31 @@ void oblig_missing_init(Oblig_missing_info* om_ip);
void oblig_missing_cleanup(Oblig_missing_info* om_ip);
-int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_len, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr, uint32_t flags);
+int32_t keep_or_remove(char* fname, char* sorted_ids, uintptr_t sorted_ids_len, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr, uint32_t flags, uint32_t allow_no_samples);
-int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t do_exclude, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr);
+int32_t extract_exclude_flag_norange(char* fname, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t do_exclude, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t allow_no_variants);
-int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uint32_t id_htable_size, char* item_ids, uintptr_t max_id_len, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr);
+int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uint32_t id_htable_size, uint32_t allow_no_variants, char* item_ids, uintptr_t max_id_len, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr);
-int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids, uintptr_t sorted_ids_ct, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr);
+int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids, uintptr_t sorted_ids_ct, uintptr_t max_id_len, uint32_t* id_map, uintptr_t unfiltered_ct, uint32_t allow_no_samples, uintptr_t* exclude_arr, uintptr_t* exclude_ct_ptr);
-int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh, double qual_max_thresh, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr);
+int32_t filter_qual_scores(Two_col_params* qual_filter, double qual_min_thresh, double qual_max_thresh, uint32_t* marker_id_htable, uint32_t marker_id_htable_size, uint32_t allow_no_variants, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr);
-uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr);
+uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t allow_no_variants);
int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr);
-uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr);
+uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uint32_t allow_no_samples, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr);
int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr);
-int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip);
+int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip);
-int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t sorted_ids_len, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* filtervals_flattened, uint32_t mfilter_col);
+int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t sorted_ids_len, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* filtervals_flattened, uint32_t mfilter_col, uint32_t allow_no_samples);
void filter_samples_bitfields(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, uintptr_t* orfield, int32_t orfield_flip, uintptr_t* ornot);
-int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double mind_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip);
+int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double mind_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, uint32_t allow_no_samples);
int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
@@ -49,9 +49,9 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t nonfounders, int32_t* hwe_ll_cases, int32_t* hwe_lh_cases, int32_t* hwe_hh_cases, int [...]
-uint32_t enforce_hwe_threshold(double hwe_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, int32_t* hwe_ll_allfs, int32_t* hwe_lh_allfs, int32_t* hwe_hh_allfs, Chrom_info* chrom_info_ptr);
+uint32_t enforce_hwe_threshold(double hwe_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t allow_no_variants, int32_t* hwe_ll_allfs, int32_t* hwe_lh_allfs, int32_t* hwe_hh_allfs, Chrom_info* chrom_info_ptr);
-uint32_t enforce_minor_allele_thresholds(double min_maf, double max_maf, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* ac_excl_bitfield, uintptr_t* marker_exclude_ct_ptr, double* set_allele_freqs);
+uint32_t enforce_minor_allele_thresholds(double min_maf, double max_maf, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* ac_excl_bitfield, uintptr_t* marker_exclude_ct_ptr, double* set_allele_freqs, uint32_t allow_no_variants);
void enforce_min_bp_space(int32_t min_bp_space, uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* marker_pos, uintptr_t* marker_exclude_ct_ptr, Chrom_info* chrom_info_ptr);
diff --git a/plink_glm.c b/plink_glm.c
index ebf3e32..d71c59a 100644
--- a/plink_glm.c
+++ b/plink_glm.c
@@ -4,6 +4,7 @@
#include "plink_cluster.h"
#include "plink_ld.h"
#include "plink_matrix.h"
+#include "plink_perm.h"
#include "plink_set.h"
#include "plink_stats.h"
@@ -1839,7 +1840,7 @@ uint32_t logistic_regression(uint32_t sample_ct, uint32_t param_ct, float* vv, f
}
}
-uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sample_valid_ct, uint32_t missing_ct, uintptr_t* loadbuf, float* covars_cov_major, uintptr_t* perm_vecs, float* coef, float* pp, float* sample_1d_buf, float* pheno_buf, float* param_1d_buf, float* param_1d_buf2, float* param_2d_buf, float* param_2d_buf2, float* logistic_results, uintptr_t constraint_ct, double* constraints_con_major, double* param_1d_dbuf, double* param_2d_dbuf, double* param_2d_dbuf2, double* [...]
+uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sample_valid_ct, uint32_t missing_ct, uint32_t skip_intercept, uintptr_t* loadbuf, float* covars_cov_major, uintptr_t* perm_vecs, float* coef, float* pp, float* sample_1d_buf, float* pheno_buf, float* param_1d_buf, float* param_1d_buf2, float* param_2d_buf, float* param_2d_buf2, float* logistic_results, uintptr_t constraint_ct, double* constraints_con_major, double* param_1d_dbuf, double* param_2d_dbuf, double* [...]
// Similar to logistic.cpp fitLM(), but incorporates changes from the
// postprocessed TopCoder contest code.
// * coef is now assumed to be initialized with a good starting point for
@@ -1851,10 +1852,10 @@ uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sa
// Returns number of regression failures.
uintptr_t param_cta4 = (param_ct + 3) & (~3);
uintptr_t param_ct_p1 = param_ct + 1;
- uintptr_t param_ct_m1 = param_ct - 1;
+ uintptr_t param_ct_msi = param_ct - skip_intercept;
uintptr_t joint_test_requested = (constraints_con_major? 1 : 0);
uintptr_t param_ctx = param_ct + joint_test_requested;
- uintptr_t param_ctx_m1 = param_ctx - 1;
+ uintptr_t param_ctx_msi = param_ctx - skip_intercept;
uintptr_t sample_validx_ctv2 = 2 * ((sample_valid_ct + missing_ct + (BITCT - 1)) / BITCT);
uintptr_t perm_fail_ct = 0;
uintptr_t cur_word = 0;
@@ -1927,8 +1928,8 @@ uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sa
}
}
}
- fptr = &(logistic_results[perm_idx * param_ctx_m1]);
- for (param_idx = 1; param_idx < param_ct; param_idx++) {
+ fptr = &(logistic_results[perm_idx * param_ctx_msi]);
+ for (param_idx = skip_intercept; param_idx < param_ct; param_idx++) {
*fptr++ = param_2d_buf[param_idx * param_ct_p1];
}
if (joint_test_requested) {
@@ -1954,11 +1955,11 @@ uint32_t glm_logistic(uintptr_t cur_batch_size, uintptr_t param_ct, uintptr_t sa
}
if (0) {
glm_logistic_fail:
- fill_float_zero(&(logistic_results[perm_idx * param_ctx_m1]), param_ct_m1);
+ fill_float_zero(&(logistic_results[perm_idx * param_ctx_msi]), param_ct_msi);
SET_BIT(perm_fails, perm_idx);
perm_fail_ct++;
if (joint_test_requested) {
- logistic_results[perm_idx * param_ctx_m1 + param_ct_m1] = -9;
+ logistic_results[perm_idx * param_ctx_msi + param_ct_msi] = -9;
}
}
coef = &(coef[param_cta4]);
@@ -3093,6 +3094,7 @@ THREAD_RET_TYPE glm_logistic_adapt_thread(void* arg) {
double adaptive_ci_zt = g_adaptive_ci_zt;
double aperm_alpha = g_aperm_alpha;
uintptr_t cur_param_ct = g_cur_param_ct;
+ uintptr_t cur_param_cta4 = (cur_param_ct + 3) & (~3);
uintptr_t cur_constraint_ct = g_cur_constraint_ct;
uint32_t coding_flags = g_coding_flags;
uint32_t glm_xchr_model = g_glm_xchr_model;
@@ -3167,12 +3169,12 @@ THREAD_RET_TYPE glm_logistic_adapt_thread(void* arg) {
success_2incr = 0;
cur_fail_ct = 0;
// todo: try better starting position
- fill_float_zero(coef, ((cur_param_ct + 3) & (~3)) * perm_vec_ct);
- glm_logistic(perm_vec_ct, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, loadbuf_ptr, cur_covars_cov_major, perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, cur_constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails);
+ fill_float_zero(coef, cur_param_cta4 * perm_vec_ct);
+ glm_logistic(perm_vec_ct, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, 1, loadbuf_ptr, cur_covars_cov_major, perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, cur_constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails);
for (pidx = 0; pidx < perm_vec_ct;) {
if (!IS_SET(perm_fails, pidx)) {
if (!joint_test_params) {
- dxx = (double)coef[pidx * cur_param_ct + 1];
+ dxx = (double)coef[pidx * cur_param_cta4 + 1]; // bugfix, forgot a4
dxx *= dxx;
dxx /= (double)regression_results[pidx * param_ctx_m1];
if (dxx > stat_high) {
@@ -3491,7 +3493,7 @@ THREAD_RET_TYPE glm_logistic_maxt_thread(void* arg) {
success_2incr = 0;
// todo: try better starting position
fill_float_zero(coef, cur_param_cta4 * perm_vec_ct);
- perm_fail_ct = glm_logistic(perm_vec_ct, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, loadbuf_ptr, cur_covars_cov_major, perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, cur_constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails);
+ perm_fail_ct = glm_logistic(perm_vec_ct, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, 1, loadbuf_ptr, cur_covars_cov_major, perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, cur_constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails);
for (pidx = 0; pidx < perm_vec_ct; pidx++) {
if (!IS_SET(perm_fails, pidx)) {
if (!joint_test_params) {
@@ -3693,7 +3695,7 @@ THREAD_RET_TYPE glm_logistic_set_thread(void* arg) {
cur_sample_valid_ct = sample_valid_ct - cur_missing_ct;
// todo: try better starting position
fill_float_zero(coef, cur_param_cta4 * perm_vec_ct);
- glm_logistic(perm_vec_ct, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, loadbuf_ptr, cur_covars_cov_major, perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, perm_fails);
+ glm_logistic(perm_vec_ct, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, 1, loadbuf_ptr, cur_covars_cov_major, perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, perm_fails);
for (pidx = 0; pidx < perm_vec_ct; pidx++) {
if (!IS_SET(perm_fails, pidx)) {
dxx = (double)coef[pidx * cur_param_cta4 + 1];
@@ -4560,8 +4562,8 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
// changed.
uint32_t max_thread_ct = g_thread_ct;
uint32_t hh_or_mt_exists = hh_exists;
+ uint32_t report_intercept = glm_modifier & GLM_INTERCEPT;
int32_t retval = 0;
- uint32_t linear_intercept = glm_modifier & GLM_INTERCEPT;
char dgels_trans = 'N';
__CLPK_integer dgels_m = 0;
__CLPK_integer dgels_n = 0;
@@ -5416,7 +5418,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
}
}
}
- if (linear_intercept) {
+ if (report_intercept) {
dxx = g_linear_mt[0].dgels_b[0];
wptr = memcpya(wptr_start2, " INTERCEPT ", 11);
wptr = uint32_writew8x(wptr, (uint32_t)cur_sample_valid_ct, ' ');
@@ -5424,7 +5426,6 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
if (display_ci) {
// okay, this should be made more maintainable...
se = sqrt(g_linear_mt[0].param_2d_buf2[0]);
- zval = dxx / se;
dyy = ci_zt * se;
wptr = double_g_writewx4x(wptr, se, 8, ' ');
wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
@@ -6103,6 +6104,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
uint32_t pct = 0;
uint32_t max_thread_ct = g_thread_ct;
uint32_t hh_or_mt_exists = hh_exists;
+ uint32_t skip_intercept = !(glm_modifier & GLM_INTERCEPT);
int32_t retval = 0;
double* constraints_con_major = NULL;
double* orig_pvals = NULL;
@@ -6152,7 +6154,6 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
uintptr_t sample_idx;
uintptr_t param_ctx_max;
uintptr_t param_ctl_max;
- uintptr_t param_ctx_max_m1;
uintptr_t condition_list_start_idx;
uintptr_t covar_start_idx;
uintptr_t interaction_start_idx;
@@ -6240,7 +6241,6 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
sample_valid_cta4 = (sample_valid_ct + 3) & (~3);
sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
final_mask = get_final_mask(sample_valid_ct);
- param_ctx_max_m1 = param_ctx_max - 1;
param_ct_maxa4 = (param_ct_max + 3) & (~3);
if (wkspace_alloc_d_checked(&g_orig_stats, marker_initial_ct * sizeof(double)) ||
 wkspace_alloc_c_checked(&param_names, param_ctx_max * max_param_name_len) ||
@@ -6549,6 +6549,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
// (no need to worry about 1D 16-byte alignment requirements since
// wkspace_alloc actually forces 64-byte alignment, and allocation sizes
// are automatically rounded up)
+ uii = (tidx || (orig_perm_batch_size > 1) || skip_intercept)? 1 : 0;
if (wkspace_alloc_f_checked(&(g_logistic_mt[tidx].cur_covars_cov_major), param_ct_max * sample_valid_cta4 * sizeof(float)) ||
wkspace_alloc_f_checked(&(g_logistic_mt[tidx].coef), param_ct_maxa4 * orig_perm_batch_size * sizeof(float)) ||
wkspace_alloc_f_checked(&(g_logistic_mt[tidx].pp), sample_valid_cta4 * sizeof(float)) ||
@@ -6558,7 +6559,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
wkspace_alloc_f_checked(&(g_logistic_mt[tidx].param_1d_buf2), param_ct_max * sizeof(float)) ||
wkspace_alloc_f_checked(&(g_logistic_mt[tidx].param_2d_buf), param_ct_max * param_ct_maxa4 * sizeof(float)) ||
wkspace_alloc_f_checked(&(g_logistic_mt[tidx].param_2d_buf2), param_ct_max * param_ct_maxa4 * sizeof(float)) ||
- wkspace_alloc_f_checked(&(g_logistic_mt[tidx].regression_results), orig_perm_batch_size * param_ctx_max_m1 * sizeof(float)) ||
+ wkspace_alloc_f_checked(&(g_logistic_mt[tidx].regression_results), orig_perm_batch_size * (param_ctx_max - uii) * sizeof(float)) ||
wkspace_alloc_ul_checked(&(g_logistic_mt[tidx].perm_fails), ulii * sizeof(intptr_t))) {
goto glm_logistic_assoc_ret_NOMEM;
}
@@ -6811,7 +6812,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
if (cur_sample_valid_ct > cur_param_ct) {
// todo: try better starting position
fill_float_zero(g_logistic_mt[0].coef, (cur_param_ct + 3) & (~3));
- regression_fail = glm_logistic(1, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, loadbuf_ptr, g_logistic_mt[0].cur_covars_cov_major, pheno_c_collapsed, g_logistic_mt[0].coef, g_logistic_mt[0].pp, g_logistic_mt[0].sample_1d_buf, g_logistic_mt[0].pheno_buf, g_logistic_mt[0].param_1d_buf, g_logistic_mt[0].param_1d_buf2, g_logistic_mt[0].param_2d_buf, g_logistic_mt[0].param_2d_buf2, g_logistic_mt[0].regression_results, cur_constraint_ct, constraints_con_major, g_logistic_mt[0].param_1 [...]
+ regression_fail = glm_logistic(1, cur_param_ct, cur_sample_valid_ct, cur_missing_ct, skip_intercept, loadbuf_ptr, g_logistic_mt[0].cur_covars_cov_major, pheno_c_collapsed, g_logistic_mt[0].coef, g_logistic_mt[0].pp, g_logistic_mt[0].sample_1d_buf, g_logistic_mt[0].pheno_buf, g_logistic_mt[0].param_1d_buf, g_logistic_mt[0].param_1d_buf2, g_logistic_mt[0].param_2d_buf, g_logistic_mt[0].param_2d_buf2, g_logistic_mt[0].regression_results, cur_constraint_ct, constraints_con_major, g_logist [...]
} else {
regression_fail = 1;
}
@@ -6825,7 +6826,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
if (!regression_fail) {
for (param_idx = 1; param_idx < cur_param_ct; param_idx++) {
dxx = (double)g_logistic_mt[0].coef[param_idx];
- se = sqrt((double)g_logistic_mt[0].regression_results[param_idx - 1]);
+ se = sqrt((double)g_logistic_mt[0].regression_results[param_idx - skip_intercept]);
zval = dxx / se;
pval = chiprob_p(zval * zval, 1);
if (param_idx == 1) {
@@ -6863,8 +6864,30 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
}
}
}
+ if (!skip_intercept) {
+ dxx = (double)g_logistic_mt[0].coef[0];
+ wptr = memcpya(wptr_start2, " INTERCEPT ", 11);
+ wptr = uint32_writew8x(wptr, (uint32_t)cur_sample_valid_ct, ' ');
+ wptr = double_g_writewx4x(wptr, dxx, 10, ' ');
+ if (display_ci) {
+ se = sqrt((double)g_logistic_mt[0].regression_results[0]);
+ dyy = ci_zt * se;
+ wptr = double_g_writewx4x(wptr, se, 8, ' ');
+ if (report_odds) {
+ wptr = double_g_writewx4x(wptr, exp(dxx - dyy), 8, ' ');
+ wptr = double_g_writewx4x(wptr, exp(dxx + dyy), 8, ' ');
+ } else {
+ wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
+ wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+ }
+ }
+ wptr = memcpya(wptr, " NA NA\n", 26);
+ if (fwrite_checked(writebuf, wptr - writebuf, outfile)) {
+ goto glm_logistic_assoc_ret_WRITE_FAIL;
+ }
+ }
if (cur_constraint_ct) {
- dxx = (double)g_logistic_mt[0].regression_results[cur_param_ct - 1];
+ dxx = (double)g_logistic_mt[0].regression_results[cur_param_ct - skip_intercept];
*orig_stats_ptr = dxx;
pval = chiprob_p(dxx, cur_constraint_ct);
if (orig_pvals) {
@@ -7238,7 +7261,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
uint32_t perm_fail_ct = 0;
uint32_t max_thread_ct = g_thread_ct;
int32_t retval = 0;
- uint32_t linear_intercept = glm_modifier & GLM_INTERCEPT;
+ uint32_t report_intercept = glm_modifier & GLM_INTERCEPT;
char dgels_trans = 'N';
__CLPK_integer dgels_m = 0;
__CLPK_integer dgels_n = 0;
@@ -7831,13 +7854,12 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
}
}
}
- if (linear_intercept) {
+ if (report_intercept) {
wptr = memcpya(tbuf, " INTERCEPT ", 11);
wptr = uint32_writew8x(wptr, (uint32_t)sample_valid_ct, ' ');
wptr = double_g_writewx4x(wptr, dgels_b[0], 10, ' ');
if (display_ci) {
se = sqrt(param_2d_buf2[0]);
- zval = dgels_b[0] / se;
dyy = ci_zt * se;
wptr = double_g_writewx4x(wptr, se, 8, ' ');
wptr = double_g_writewx4x(wptr, dgels_b[0] - dyy, 8, ' ');
@@ -8049,7 +8071,7 @@ int32_t glm_linear_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
wptr = tbuf;
}
}
- if (fwrite_checkedz(tbuf, wptr - tbuf, outfile)) {
+ if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
goto glm_linear_nosnp_ret_WRITE_FAIL;
}
if (fclose_null(&outfile)) {
@@ -8121,6 +8143,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
uint32_t perm_fail_total = 0;
uint32_t joint_perm_fail_extra = 0;
uint32_t max_thread_ct = g_thread_ct;
+ uint32_t skip_intercept = !(glm_modifier & GLM_INTERCEPT);
int32_t retval = 0;
uintptr_t* loadbuf_raw = NULL;
uintptr_t* loadbuf_collapsed = NULL;
@@ -8503,6 +8526,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
mperm_save = 0;
}
ulii = (perm_batch_size + (BITCT - 1)) / BITCT;
+ uii = ((perm_batch_size > 1) || skip_intercept)? 1 : 0;
if (wkspace_alloc_f_checked(&coef, param_cta4 * perm_batch_size * sizeof(float)) ||
wkspace_alloc_f_checked(&pp, sample_valid_cta4 * sizeof(float)) ||
wkspace_alloc_f_checked(&sample_1d_buf, sample_valid_ct * sizeof(float)) ||
@@ -8511,7 +8535,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 wkspace_alloc_f_checked(&param_1d_buf2, param_ct * sizeof(float)) ||
 wkspace_alloc_f_checked(&param_2d_buf, param_ct * param_cta4 * sizeof(float)) ||
 wkspace_alloc_f_checked(&param_2d_buf2, param_ct * param_cta4 * sizeof(float)) ||
- wkspace_alloc_f_checked(&regression_results, perm_batch_size * (param_ctx - 1) * sizeof(float)) ||
+ wkspace_alloc_f_checked(&regression_results, perm_batch_size * (param_ctx - uii) * sizeof(float)) ||
wkspace_alloc_ul_checked(&perm_fails, ulii * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&g_perm_vecs, perm_batch_size * sample_valid_ctv2 * sizeof(intptr_t))) {
goto glm_logistic_nosnp_ret_NOMEM;
@@ -8567,11 +8591,11 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
}
fill_float_zero(coef, param_cta4);
- if (glm_logistic(1, param_ct, sample_valid_ct, 0, NULL, covars_cov_major, g_perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails)) {
+ if (glm_logistic(1, param_ct, sample_valid_ct, 0, skip_intercept, NULL, covars_cov_major, g_perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails)) {
logerrprint("Warning: Skipping --logistic no-snp due to multicollinearity.\n");
goto glm_logistic_nosnp_ret_1;
}
- if (constraint_ct && (regression_results[param_ct - 1] == -9)) {
+ if (constraint_ct && (regression_results[param_ct - skip_intercept] == -9)) {
logerrprint("Warning: Ignoring --tests due to regression failure.\n");
constraint_ct = 0;
}
@@ -8607,7 +8631,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
for (param_idx = 1; param_idx < param_ct; param_idx++) {
dxx = (double)coef[param_idx];
- se = sqrt((double)regression_results[param_idx - 1]);
+ se = sqrt((double)regression_results[param_idx - skip_intercept]);
zval = dxx / se;
orig_stats[param_idx - 1] = zval * zval;
pval = chiprob_p(zval * zval, 1);
@@ -8634,8 +8658,29 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
}
}
}
+ if (!skip_intercept) {
+ dxx = (double)coef[0];
+ wptr = memcpya(tbuf, " INTERCEPT ", 11);
+ wptr = uint32_writew8x(wptr, (uint32_t)sample_valid_ct, ' ');
+ wptr = double_g_writewx4x(wptr, report_odds? exp(dxx) : dxx, 10, ' ');
+ if (display_ci) {
+ se = sqrt((double)regression_results[0]);
+ dyy = ci_zt * se;
+ wptr = double_g_writewx4x(wptr, se, 8, ' ');
+ if (report_odds) {
+ wptr = double_g_writewx4x(wptr, exp(dxx - dyy), 8, ' ');
+ wptr = double_g_writewx4x(wptr, exp(dxx + dyy), 8, ' ');
+ } else {
+ wptr = double_g_writewx4x(wptr, dxx - dyy, 8, ' ');
+ wptr = double_g_writewx4x(wptr, dxx + dyy, 8, ' ');
+ }
+ }
+ if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ goto glm_logistic_nosnp_ret_WRITE_FAIL;
+ }
+ }
if (constraint_ct) {
- dxx = (double)regression_results[param_ct - 1];
+ dxx = (double)regression_results[param_ct - skip_intercept];
orig_stats[param_ct - 1] = dxx;
pval = chiprob_p(dxx, constraint_ct);
if ((pfilter == 2.0) || ((pval <= pfilter) && (pval >= 0.0))) {
@@ -8685,7 +8730,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
}
join_threads(threads, g_assoc_thread_ct);
fill_float_zero(coef, cur_batch_size * param_cta4);
- perm_fail_total += glm_logistic(cur_batch_size, param_ct, sample_valid_ct, 0, NULL, covars_cov_major, g_perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails);
+ perm_fail_total += glm_logistic(cur_batch_size, param_ct, sample_valid_ct, 0, 1, NULL, covars_cov_major, g_perm_vecs, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, constraint_ct, constraints_con_major, param_1d_dbuf, param_2d_dbuf, param_2d_dbuf2, param_df_dbuf, df_df_dbuf, mi_buf, df_dbuf, perm_fails);
ulii = param_ct - 1;
uljj = param_ctx - 1;
for (perm_idx = 0; perm_idx < cur_batch_size; perm_idx++) {
@@ -8823,7 +8868,7 @@ int32_t glm_logistic_nosnp(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
wptr = tbuf;
}
}
- if (fwrite_checkedz(tbuf, wptr - tbuf, outfile)) {
+ if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
goto glm_logistic_nosnp_ret_WRITE_FAIL;
}
if (fclose_null(&outfile)) {
@@ -9026,7 +9071,7 @@ uint32_t glm_logistic_dosage(uintptr_t sample_ct, uintptr_t* cur_samples, uintpt
covar_f++;
}
fill_float_zero(coef, param_cta4);
- if (glm_logistic(1, param_ct, sample_valid_ct, 0, NULL, covars_cov_major, perm_vec, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, perm_fails) || perm_fails[0]) {
+ if (glm_logistic(1, param_ct, sample_valid_ct, 0, 1, NULL, covars_cov_major, perm_vec, coef, pp, sample_1d_buf, pheno_buf, param_1d_buf, param_1d_buf2, param_2d_buf, param_2d_buf2, regression_results, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, perm_fails) || perm_fails[0]) {
return 0;
}
dxx = (double)coef[1];
diff --git a/plink_help.c b/plink_help.c
index ec8daa0..a172b8c 100644
--- a/plink_help.c
+++ b/plink_help.c
@@ -303,9 +303,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
);
help_print("lfile\treference\tallele-count", &help_ctrl, 1,
" --lfile {prefix} : Specify .lgen + .map + .fam (long-format fileset) prefix.\n"
-" --reference [fn] : Specify default allele file accompanying --lfile input.\n"
-" --allele-count : When used with --lfile + --reference, specifies that the\n"
-" .lgen file contains reference allele counts.\n\n"
+" --lgen [fname] : Specify full name of .lgen file.\n"
+" --reference [fn] : Specify default allele file accompanying .lgen input.\n"
+" --allele-count : When used with --lfile/--lgen + --reference, specifies\n"
+" that the .lgen file contains reference allele counts.\n\n"
);
help_print("vcf\tbcf", &help_ctrl, 1,
" --vcf [filename] : Specify full name of .vcf or .vcf.gz file.\n"
@@ -962,7 +963,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
*/
" --logistic <perm | mperm=[value]> <perm-count> <set-test>\n"
" <genotypic | hethom | dominant | recessive | no-snp> <hide-covar>\n"
-" <sex | no-x-sex> <interaction> <beta>\n"
+" <sex | no-x-sex> <interaction> <beta> <intercept>\n"
" Multi-covariate association analysis on a quantitative (--linear) or\n"
" case/control (--logistic) phenotype. Normally used with --covar.\n"
" * 'perm' normally causes an adaptive permutation test to be performed on\n"
@@ -991,11 +992,11 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" * 'interaction' adds genotype x covariate interactions to the model. This\n"
" cannot be used with the usual permutation tests; use --tests to define\n"
" the permutation test statistic instead.\n"
+" * 'intercept' causes intercepts to be included in the main report.\n"
" * For logistic regressions, the 'beta' modifier causes regression\n"
" coefficients instead of odds ratios to be reported.\n"
" * With --linear, the 'standard-beta' modifier standardizes the phenotype\n"
-" and all predictors to zero mean and unit variance before regression, and\n"
-" the 'intercept' modifier adds intercepts to the main report.\n\n"
+" and all predictors to zero mean and unit variance before regression.\n\n"
);
help_print("dosage\twrite-dosage", &help_ctrl, 1,
" --dosage [allele dosage file] <noheader> <skip0=[i]> <skip1=[j]> <skip2=[k]>\n"
@@ -1236,7 +1237,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
);
help_print("score\tscore-no-mean-imputation", &help_ctrl, 1,
" --score [filename] {i} {j} {k} <header> <sum | no-sum>\n"
-" <no-mean-imputation | center> <include-cnt>\n"
+" <no-mean-imputation | center> <include-cnt> <double-dosage>\n"
" Apply a linear scoring system to each sample.\n"
" The input file should have one line per scored variant. Variant IDs are\n"
" read from column #i, allele codes are read from column #j, and scores are\n"
@@ -1257,7 +1258,8 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" mean zero.\n"
" * This command can be used with dosage data. By default, the 'CNT' column\n"
" is omitted from the output file in this case; use 'include-cnt' to keep\n"
-" it.\n\n"
+" it. Also, note that scores are multiplied by 0..1 dosages, not 0..2\n"
+" diploid allele counts, unless the 'double-dosage' modifier is present.\n\n"
);
#if defined __cplusplus && !defined _WIN32 && !defined STABLE_BUILD
help_print("R\tR-debug", &help_ctrl, 1,
@@ -1360,10 +1362,11 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
help_print("vcf\tbcf\tid-delim\tvcf-idspace-to", &help_ctrl, 0,
" --vcf-idspace-to [c] : Convert spaces in sample IDs to the given character.\n"
);
- help_print("vcf\tbcf\tbiallelic-only\tvcf-min-qual\tvcf-filter\tvcf-half-call\tvcf-min-gq\tvcf-min-gp", &help_ctrl, 0,
+ help_print("vcf\tbcf\tbiallelic-only\tvcf-min-qual\tvcf-filter\tvcf-half-call\tvcf-min-gq\tvcf-min-gp\tvcf-require-gt", &help_ctrl, 0,
" --biallelic-only <strict> <list> : Skip VCF variants with 2+ alt. alleles.\n"
" --vcf-min-qual [val] : Skip VCF variants with low/missing QUAL.\n"
" --vcf-filter {exception(s)...} : Skip variants which have FILTER failures.\n"
+" --vcf-require-gt : Skip variants with no GT field.\n"
" --vcf-min-gq [val] : No-call a genotype when GQ is below the\n"
" given threshold.\n"
" --vcf-min-gp [val] : No-call a genotype when 0-1 scaled GP is\n"
@@ -1423,8 +1426,14 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" one chromosome, include a '@' in the first\n"
" parameter where the chrom. number belongs,\n"
" e.g. 'genetic_map_chr@_combined_b37.txt'.\n"
-" --zero-cms : Zero out centimorgan positions.\n"
+" --zero-cms : Zero out centimorgan positions.\n"
);
+#ifndef STABLE_BUILD
+ help_print("allow-no-samples\tallow-no-vars", &help_ctrl, 0,
+" --allow-no-samples : Allow the input fileset to contain no samples.\n"
+" --allow-no-vars : Allow the input fileset to contain no variants.\n"
+ );
+#endif
help_print("pheno\tall-pheno\tmpheno\tpheno-name\tpheno-merge", &help_ctrl, 0,
" --pheno [fname] : Load phenotype data from the specified file, instead of\n"
" using the values in the main input fileset.\n"
@@ -1462,12 +1471,16 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" unspecified, it is equal to Lt; otherwise,\n"
" in-between phenotype values are set to missing.\n"
);
- help_print("covar\tcovar-name\tcovar-number", &help_ctrl, 0,
+ help_print("covar\tcovar-name\tcovar-number\tno-const-covar\tallow-no-covars", &help_ctrl, 0,
" --covar [filename] <keep-pheno-on-missing-cov> : Specify covariate file.\n"
" --covar-name [...] : Specify covariate(s) in --covar file by name.\n"
" Separate multiple names with spaces or commas, and\n"
" use dashes to designate ranges.\n"
" --covar-number [...] : Specify covariate(s) in --covar file by index.\n"
+" --no-const-covar : Exclude constant covariates.\n"
+#ifndef STABLE_BUILD
+" --allow-no-covars : Allow no covariates to be loaded from --covar file.\n"
+#endif
);
help_print("within\tmwithin\tfamily", &help_ctrl, 0,
" --within [f] <keep-NA> : Specify initial cluster assignments.\n"
@@ -2049,6 +2062,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" size should be\n"
" 4 / (1/[# cases] + 1/[# controls]).\n"
);
+ help_print("meta-analysis-report-dups\tmeta-analysis", &help_ctrl, 0,
+" --meta-analysis-report-dups : When a variant appears multiple times in\n"
+" in the same file, report that.\n"
+ );
help_print("gene-list-border\tgene-report\tgene-subset\tgene-list\tgene-report-snp-field", &help_ctrl, 0,
" --gene-list-border [kbs] : Extend --gene-report regions by given # of kbs.\n"
" --gene-subset [filename] : Specify gene name subset for --gene-report.\n"
diff --git a/plink_lasso.c b/plink_lasso.c
index 3908a86..98ed6f0 100644
--- a/plink_lasso.c
+++ b/plink_lasso.c
@@ -947,6 +947,7 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
}
}
ullii += CACHEALIGN(((uint64_t)uii) * sample_valid_ct * sizeof(double));
+ // if (0) {
if (ullii <= wkspace_left) {
retval = lasso_bigmem(bedfile, bed_offset, marker_exclude, marker_ct, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, pheno_nm2, lasso_h2, lasso_minlambda, select_covars, select_covars_bitfield, pheno_d_collapsed, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d, hh_or_mt_exists, sample_valid_ct, sample_include2, sample_male_include2, loadbuf_raw, loadbuf_collapsed, rand_matrix, misc_arr, residuals, polymorphic_markers, &polymorphic_marker_ct, &iter_tot, &xhat);
} else {
diff --git a/plink_ld.c b/plink_ld.c
index a382a7d..3d78800 100644
--- a/plink_ld.c
+++ b/plink_ld.c
@@ -638,21 +638,27 @@ void ld_prune_start_chrom(uint32_t ld_window_kb, uint32_t* cur_chrom_ptr, uint32
uint32_t uii = 0;
uint32_t window_size;
live_indices[0] = window_unfiltered_start;
+ next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
if (ld_window_kb) {
- window_size = 0;
- while ((window_unfiltered_start + window_size < chrom_end) && (marker_pos[window_unfiltered_start + window_size] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
+ window_size = 1;
+ uii = window_unfiltered_end;
+ while ((uii < chrom_end) && (marker_pos[uii] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
window_size++;
+ uii++;
+ next_unset_ck(marker_exclude, &uii, chrom_end);
}
+ uii = 0;
} else {
window_size = ld_window_size;
}
- for (uii = 1; uii < window_size; window_unfiltered_end++, uii++) {
- next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+ for (uii = 1; uii < window_size; uii++) {
if (window_unfiltered_end == chrom_end) {
break;
}
start_arr[uii - 1] = window_unfiltered_end;
live_indices[uii] = window_unfiltered_end;
+ window_unfiltered_end++;
+ next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
}
*cur_window_size_ptr = uii;
start_arr[uii - 1] = window_unfiltered_end;
@@ -817,6 +823,7 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
uint32_t bsearch_max;
uint32_t bsearch_cur;
double prune_ld_thresh;
+
if (founder_ct < 2) {
LOGERRPRINTF("Warning: Skipping --indep%s since there are less than two founders.\n(--make-founders may come in handy here.)\n", pairwise? "-pairwise" : "");
goto ld_prune_ret_1;
@@ -1029,6 +1036,9 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
at_least_one_prune = 1;
cur_exclude_ct++;
// remove marker with lower MAF
+ // could cache MAFs of all current-window variants, but
+ // get_maf() is too cheap for this to make a noticeable
+ // difference
if (get_maf(set_allele_freqs[live_indices[uii]]) < get_maf(set_allele_freqs[live_indices[ujj]])) {
SET_BIT(pruned_arr, live_indices[uii]);
} else {
@@ -1173,16 +1183,11 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
}
}
for (uii = 0; uii < ld_window_incr; uii++) {
- while (IS_SET(marker_exclude, window_unfiltered_start)) {
- if (window_unfiltered_start == chrom_end) {
- break;
- }
- window_unfiltered_start++;
- }
if (window_unfiltered_start == chrom_end) {
break;
}
window_unfiltered_start++;
+ next_unset_ck(marker_exclude, &window_unfiltered_start, chrom_end);
}
if (window_unfiltered_start == chrom_end) {
break;
@@ -1194,6 +1199,11 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
pct_thresh = chrom_info_ptr->chrom_start[cur_chrom] + (((uint64_t)pct * (chrom_end - chrom_info_ptr->chrom_start[cur_chrom])) / 100);
}
ujj = 0;
+
+ if (window_unfiltered_end < window_unfiltered_start) {
+ window_unfiltered_end = window_unfiltered_start;
+ }
+
// copy back previously loaded/computed results
while (live_indices[ujj] < window_unfiltered_start) {
ujj++;
@@ -1230,15 +1240,17 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
cur_window_size = uii;
if (window_is_kb) {
ujj = 0;
- while ((window_unfiltered_end + ujj < chrom_end) && (marker_pos[window_unfiltered_end + ujj] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
+ ukk = window_unfiltered_end;
+ while ((ukk < chrom_end) && (marker_pos[ukk] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
ujj++;
+ ukk++;
+ next_unset_ck(marker_exclude, &ukk, chrom_end);
}
} else {
ujj = ld_window_incr;
}
old_window_size = cur_window_size;
- for (uii = 0; uii < ujj; window_unfiltered_end++, uii++) {
- next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+ for (uii = 0; uii < ujj; uii++) {
if (window_unfiltered_end == chrom_end) {
break;
}
@@ -1260,6 +1272,8 @@ int32_t ld_prune(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uintptr_t m
cur_exclude_ct++;
}
cur_window_size++;
+ window_unfiltered_end++;
+ next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
}
if (cur_window_size > prev_end) {
start_arr[cur_window_size] = window_unfiltered_end;
@@ -2091,17 +2105,17 @@ uint32_t ld_matrix_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
goto ld_matrix_emitn_ret;
}
}
- if (is_square0) {
- while (marker_idx < marker_ct) {
- ulii = (((uintptr_t)(readbuf_end - sptr_cur)) + 1) / 2;
- if (ulii <= marker_ct - marker_idx) {
- sptr_cur = memcpya(sptr_cur, tbuf, ulii * 2);
- marker_idx += ulii;
- goto ld_matrix_emitn_ret;
- } else {
- sptr_cur = memcpya(sptr_cur, tbuf, (marker_ct - marker_idx) * 2);
- marker_idx = marker_ct;
- }
+ if (is_square0 && (marker_idx < marker_ct)) {
+ ulii = (((uintptr_t)(readbuf_end - sptr_cur)) + 1) / 2;
+ // bugfix: can't be <= since tab delimiter wouldn't be handled correctly
+ // on subsequent pass
+ if (ulii < marker_ct - marker_idx) {
+ sptr_cur = memcpya(sptr_cur, tbuf, ulii * 2);
+ marker_idx += ulii;
+ goto ld_matrix_emitn_ret;
+ } else {
+ sptr_cur = memcpya(sptr_cur, tbuf, (marker_ct - marker_idx) * 2);
+ marker_idx = marker_ct;
}
}
if (delimiter == '\t') {
@@ -3824,6 +3838,8 @@ THREAD_RET_TYPE fast_epi_thread(void* arg) {
chisq2_ptr = &(best_chisq2[block_idx2]);
for (; block_idx2 < cur_idx2_block_size; block_idx2++, chisq2_ptr++, cur_geno2 = &(cur_geno2[tot_ctsplit])) {
cur_tot2 = &(tot2[block_idx2 * tot_stride]);
+ // this operation isn't extracting a 2-bit genotype, so don't use the
+ // macro
cur_zmiss2 = (zmiss2[block_idx2 / BITCT2] >> (2 * (block_idx2 % BITCT2))) & 3;
cur_zmiss2_tmp = cur_zmiss2 & 1;
if (nm_case_fixed) {
@@ -9973,11 +9989,14 @@ int32_t epistasis_report(pthread_t* threads, Epi_info* epi_ip, FILE* bedfile, ui
*wptr_start++ = ' ';
marker_uidx2 = next_unset_ul_unsafe(marker_exclude2, marker_uidx_base);
for (chrom_fo_idx2 = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx2); chrom_fo_idx2 < chrom_ct; chrom_fo_idx2++) {
- chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
chrom_end2 = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx2 + 1];
+ if (marker_uidx2 >= chrom_end2) {
+ continue;
+ }
+ chrom_idx2 = chrom_info_ptr->chrom_file_order[chrom_fo_idx2];
wptr_start2 = width_force(4, wptr_start, chrom_name_write(wptr_start, chrom_info_ptr, chrom_idx2));
*wptr_start2++ = ' ';
- for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, &marker_uidx2, chrom_end2), marker_idx2++, dptr++) {
+ for (; marker_uidx2 < chrom_end2; next_unset_ul_ck(marker_exclude2, &marker_uidx2, unfiltered_marker_ct), marker_idx2++, dptr++) {
if (marker_idx2 == ujj) {
marker_idx2 = g_epi_geno1_offsets[2 * block_idx1 + 1];
if (marker_idx2 == marker_ct2) {
@@ -10217,7 +10236,7 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
uintptr_t founder_ct = popcount_longs(founder_info, unfiltered_sample_ctl2 / 2);
uintptr_t founder_ctl = (founder_ct + BITCT - 1) / BITCT;
uintptr_t founder_ctv3 = 2 * ((founder_ct + (2 * BITCT - 1)) / (2 * BITCT));
- // no actual case/control split here, but keep the variable name the same to
+ // no actual case/control split here, but keep the variables the same to
// minimize divergence from ld_report_dprime()
uintptr_t founder_ctsplit = 3 * founder_ctv3;
uintptr_t final_mask = get_final_mask(founder_ct);
@@ -10334,7 +10353,8 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
loadbuf[founder_ctl * 2 - 2] = 0;
loadbuf[founder_ctl * 2 - 1] = 0;
fill_all_bits(dummy_nm, founder_ct);
- for (ulii = 1; ulii <= window_max; ulii++) {
+ // bugfix: this loop must start at 0, not 1
+ for (ulii = 0; ulii < window_max; ulii++) {
geno[ulii * founder_ctsplit + founder_ctv3 - 1] = 0;
geno[ulii * founder_ctsplit + 2 * founder_ctv3 - 1] = 0;
geno[ulii * founder_ctsplit + founder_ctsplit - 1] = 0;
@@ -10471,16 +10491,11 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
} while (at_least_one_prune);
}
for (uii = 0; uii < ld_window_incr; uii++) {
- while (IS_SET(marker_exclude, window_unfiltered_start)) {
- if (window_unfiltered_start == chrom_end) {
- break;
- }
- window_unfiltered_start++;
- }
if (window_unfiltered_start == chrom_end) {
break;
}
window_unfiltered_start++;
+ next_unset_ck(marker_exclude, &window_unfiltered_start, chrom_end);
}
if (window_unfiltered_start == chrom_end) {
break;
@@ -10492,6 +10507,9 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
pct_thresh = chrom_info_ptr->chrom_start[cur_chrom] + (((uint64_t)pct * (chrom_end - chrom_info_ptr->chrom_start[cur_chrom])) / 100);
}
uljj = 0;
+ if (window_unfiltered_end < window_unfiltered_start) {
+ window_unfiltered_end = window_unfiltered_start;
+ }
// copy back previously loaded/computed results
while (live_indices[uljj] < window_unfiltered_start) {
uljj++;
@@ -10507,21 +10525,31 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
live_indices[ulii] = live_indices[uljj];
start_arr[ulii] = start_arr[uljj];
memcpy(&(cur_tots[ulii * 3]), &(cur_tots[uljj * 3]), 3 * sizeof(int32_t));
+ // bugfix: forgot to update zmiss
+ if (IS_SET(zmiss, uljj)) {
+ SET_BIT(zmiss, ulii);
+ } else {
+ CLEAR_BIT(zmiss, ulii);
+ }
ulii++;
}
+ clear_bits(zmiss, ulii, window_max);
prev_end = ulii;
cur_window_size = ulii;
if (window_is_kb) {
uljj = 0;
- while ((window_unfiltered_end + uljj < chrom_end) && (marker_pos[window_unfiltered_end + uljj] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
+ ulkk = window_unfiltered_end;
+ while ((window_unfiltered_end < chrom_end) && (marker_pos[window_unfiltered_end] <= marker_pos[window_unfiltered_start] + (1000 * ld_window_size))) {
uljj++;
+ window_unfiltered_end++;
+ next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
}
+ window_unfiltered_end = ulkk;
} else {
uljj = ld_window_incr;
}
- for (ulii = 0; ulii < uljj; window_unfiltered_end++, ulii++) {
- next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
+ for (ulii = 0; ulii < uljj; ulii++) {
if (window_unfiltered_end == chrom_end) {
break;
}
@@ -10550,6 +10578,8 @@ int32_t indep_pairphase(Ld_info* ldip, FILE* bedfile, uintptr_t bed_offset, uint
SET_BIT(zmiss, cur_window_size);
}
cur_window_size++;
+ window_unfiltered_end++;
+ next_unset_ck(marker_exclude, &window_unfiltered_end, chrom_end);
}
if (cur_window_size > prev_end) {
start_arr[cur_window_size] = window_unfiltered_end;
diff --git a/plink_misc.c b/plink_misc.c
index 20d9791..3955967 100644
--- a/plink_misc.c
+++ b/plink_misc.c
@@ -44,7 +44,7 @@ int32_t make_founders(uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, char*
}
bitfield_exclude_to_include(sample_exclude, nf_bitarr, unfiltered_sample_ct);
bitfield_andnot(nf_bitarr, founder_info, unfiltered_sample_ctl);
- sample_uidx = next_set(nf_bitarr, 0, unfiltered_sample_ct);
+ sample_uidx = unfiltered_sample_ct? next_set(nf_bitarr, 0, unfiltered_sample_ct) : 0;
if (sample_uidx == unfiltered_sample_ct) {
logprint("Note: Skipping --make-founders since there are no nonfounders.\n");
goto make_founders_ret_1;
@@ -1660,7 +1660,7 @@ void calc_plink_maxfid(uint32_t unfiltered_sample_ct, uintptr_t* sample_exclude,
// imitate PLINK 1.07 behavior (see Plink::prettyPrintLengths() in
// helper.cpp), to simplify testing and avoid randomly breaking existing
// scripts
- do {
+ while (samples_done < sample_ct) {
sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx);
sample_uidx_stop = next_set(sample_exclude, sample_uidx, unfiltered_sample_ct);
samples_done += sample_uidx_stop - sample_uidx;
@@ -1679,7 +1679,7 @@ void calc_plink_maxfid(uint32_t unfiltered_sample_ct, uintptr_t* sample_exclude,
}
cptr = &(cptr[max_sample_id_len]);
} while (cptr < cptr_end);
- } while (samples_done < sample_ct);
+ }
*plink_maxfid_ptr = plink_maxfid;
*plink_maxiid_ptr = plink_maxiid;
}
@@ -1991,8 +1991,7 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
logprint("--read-freq: .frq file loaded.\n");
}
} else if (uii == 3) {
- // changed from strcmp to avoid eoln problems
- // known --freqx format, WDIST v0.15.3 or later
+ // --freqx format
while (fgets(loadbuf, loadbuf_size, freqfile) != NULL) {
line_idx++;
if (!loadbuf[loadbuf_size - 1]) {
@@ -2332,6 +2331,7 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
}
int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t s [...]
+ // unfiltered_sample_ct == 0 ok
unsigned char* wkspace_mark = wkspace_base;
char* writebuf = tbuf;
char* pzwritep = NULL;
@@ -2620,6 +2620,7 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
}
int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uintptr_t* marker_reverse, uintptr_t* pheno_nm, uintptr_t* ph [...]
+ // unfiltered_sample_ct must be positive
unsigned char* wkspace_mark = wkspace_base;
char* pzwritep = NULL;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
@@ -2803,6 +2804,7 @@ int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
}
int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
+ // unfiltered_sample_ct == 0 ok
unsigned char* wkspace_mark = wkspace_base;
char* pzwritep = NULL;
uint32_t reverse = 0;
@@ -3282,7 +3284,7 @@ int32_t write_snplist(char* outname, char* outname_end, uintptr_t unfiltered_mar
goto write_snplist_ret_OPEN_FAIL;
}
if (!list_23_indels) {
- do {
+ while (markers_done < marker_ct) {
marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
marker_uidx_stop = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
markers_done += marker_uidx_stop - marker_uidx;
@@ -3296,7 +3298,7 @@ int32_t write_snplist(char* outname, char* outname_end, uintptr_t unfiltered_mar
}
cptr = &(cptr[max_marker_id_len]);
} while (cptr < cptr_end);
- } while (markers_done < marker_ct);
+ }
} else {
for (; markers_done < marker_ct; marker_uidx++, markers_done++) {
next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
@@ -4335,10 +4337,10 @@ int32_t score_report(Score_info* sc_ip, FILE* bedfile, uintptr_t bed_offset, uin
}
}
first_col_m1--;
+ memcpy(outname_end, ".nopred", 8); // bugfix, this was after the goto before
if (modifier & SCORE_HEADER) {
goto score_report_load_next;
}
- memcpy(outname_end, ".nopred", 8);
while (1) {
bufptr_arr[0] = next_token_multz(bufptr, first_col_m1);
bufptr_arr[1] = next_token_mult(bufptr_arr[0], col_01_delta);
@@ -5121,17 +5123,21 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
gzFile gz_infile = NULL;
FILE* infile = NULL;
FILE* outfile = NULL;
- char* sorted_extract_ids = NULL;
char* loadbuf_end = (char*)(&(wkspace_base[wkspace_left]));
char* cur_window_marker_ids = NULL;
+ char* sorted_extract_ids = NULL;
+ uintptr_t* duplicate_id_bitfield = NULL;
+ Ll_str** duplicate_id_htable = NULL;
uintptr_t header_dict_ct = 2; // 'SE', BETA/OR
uintptr_t max_header_len = 3;
uintptr_t extract_ct = 0;
uintptr_t max_extract_id_len = 0;
+ uintptr_t extract_ctl = 0;
uintptr_t final_variant_ct = 0;
uintptr_t last_var_idx = 0;
- uintptr_t rejected_ct = 0;
uintptr_t window_entry_base_cost = 2;
+ uintptr_t duplicate_id_htable_max_alloc = 0;
+ uint64_t rejected_ct = 0;
double cur_p = 0.0;
double cur_ess = 0.0;
uint32_t max_var_id_len_p1 = 0;
@@ -5142,6 +5148,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
uint32_t report_all = flags & METAANAL_REPORT_ALL;
uint32_t output_beta = flags & METAANAL_QT;
uint32_t report_study_specific = flags & METAANAL_STUDY;
+ uint32_t report_dups = flags & METAANAL_REPORT_DUPS;
uint32_t weighted_z = (flags / METAANAL_WEIGHTED_Z) & 1;
uint32_t parse_max = 3;
@@ -5152,12 +5159,11 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
uint32_t a2lenp1 = 0;
uint32_t cur_chrom = 0;
uint32_t cur_bp = 0;
- uint32_t cur_file_ct_m1 = 0;
uint32_t cur_combined_allele_len = 0;
uint32_t pass_idx = 0;
int32_t retval = 0;
char missing_geno = *g_missing_geno_ptr;
- const char problem_strings[][16] = {"BAD_CHR", "BAD_BP", "MISSING_A1", "MISSING_A2", "BAD_ES", "BAD_SE", "ALLELE_MISMATCH", "BAD_P", "BAD_ESS"};
+ const char problem_strings[][16] = {"BAD_CHR", "BAD_BP", "MISSING_A1", "MISSING_A2", "BAD_ES", "BAD_SE", "ALLELE_MISMATCH", "BAD_P", "BAD_ESS", "DUPLICATE"};
// [0] = SNP
// [1] = BETA/OR
@@ -5185,11 +5191,13 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
uintptr_t variants_remaining;
uintptr_t cur_var_idx;
uintptr_t first_var_idx;
+ uintptr_t htable_write_limit;
uintptr_t ulii;
Ll_str** htable;
Ll_str** ll_pptr;
Ll_str* ll_ptr;
Ll_str* htable_write;
+ Ll_str* duplicate_id_htable_write;
unsigned char* wkspace_mark2;
char* sorted_header_dict;
char* master_var_list;
@@ -5231,6 +5239,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
uint32_t file_ct64;
uint32_t file_idx;
uint32_t cur_file_ct;
+ uint32_t cur_file_ct_m1;
uint32_t fname_len;
uint32_t token_ct;
uint32_t seq_idx;
@@ -5292,7 +5301,6 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
wkspace_alloc_ui_checked(&header_id_map, header_dict_ct * sizeof(int32_t))) {
goto meta_analysis_ret_NOMEM;
}
- wkspace_mark2 = wkspace_base;
ulii = 0; // write position
if (snpfield_search_order) {
bufptr = snpfield_search_order;
@@ -5386,23 +5394,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
goto meta_analysis_ret_INVALID_CMDLINE;
}
- // 2. Allocate space for initial hash table.
- // Saving memory is pretty important here, so we use the following packing in
- // the ss field (W = byte width required to save numbers up to file_ct, and
- // M = 1 iff 'no-map' was not specified):
- // [W]: number of files this variant appears in, little-endian
- // [W+1]..[W+5], if M==1: chromosome byte followed by bp coordinate int; may
- // need to widen chromosome byte later
- // [W+5M+1]: null-terminated variant ID. Followed by null-terminated A1/A2
- // if 'no-allele' not specified
- htable = (Ll_str**)wkspace_alloc(HASHMEM);
- if (!htable) {
- goto meta_analysis_ret_NOMEM;
- }
- for (uii = 0; uii < HASHSIZE; uii++) {
- htable[uii] = NULL;
- }
- // 3. If --extract specified, load and sort permitted variant list.
+ // 2. If --extract specified, load and sort permitted variant list.
if (extractname) {
if (fopen_checked(&infile, extractname, "rb")) {
goto meta_analysis_ret_OPEN_FAIL;
@@ -5423,7 +5415,12 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
goto meta_analysis_ret_NOMEM;
}
rewind(infile);
- // todo: switch to hash table to avoid sort
+ // Considered switching to a hash table, but decided against it for now
+ // since it's less memory-efficient (in the usual case of similar-length
+ // IDs), especially when lots of duplicate IDs are present. Might be worth
+ // revisiting this decision in the future, though, since there are
+ // reasonable use cases involving 40-80 million line --extract files, and
+ // skipping the sort step there is a big win.
retval = read_tokens(infile, tbuf, MAXLINELEN, extract_ct, max_extract_id_len, sorted_extract_ids);
if (retval) {
goto meta_analysis_ret_1;
@@ -5437,7 +5434,32 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
extract_ct = ulii;
wkspace_shrink_top(sorted_extract_ids, extract_ct * max_extract_id_len);
}
+ extract_ctl = (extract_ct + BITCT - 1) / BITCT;
+ if (wkspace_alloc_ul_checked(&duplicate_id_bitfield, extract_ctl * sizeof(intptr_t))) {
+ goto meta_analysis_ret_NOMEM;
+ }
+ } else {
+ duplicate_id_htable = (Ll_str**)wkspace_alloc(HASHMEM);
}
+
+ // 3. Allocate space for initial hash table.
+ // Saving memory is pretty important here, so we use the following packing in
+ // the ss field (W = byte width required to save numbers up to file_ct, and
+ // M = 1 iff 'no-map' was not specified):
+ // [W]: number of files this variant appears in minus 1, little-endian
+ // [W+1]..[W+5], if M==1: chromosome byte followed by bp coordinate int; may
+ // need to widen chromosome byte later
+ // [W+5M+1]: null-terminated variant ID. Followed by null-terminated A1/A2
+ // if 'no-allele' not specified
+ wkspace_mark2 = wkspace_base;
+ htable = (Ll_str**)wkspace_alloc(HASHMEM);
+ if (!htable) {
+ goto meta_analysis_ret_NOMEM;
+ }
+ for (uii = 0; uii < HASHSIZE; uii++) {
+ htable[uii] = NULL;
+ }
+
// 4. Initial scan: save all potentially valid variant IDs (and accompanying
// allele codes/chr/pos, if present) in the hash table, and produce .prob
// file. Also determine maximum line length, for use in later passes.
@@ -5461,17 +5483,24 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
htable_write = (Ll_str*)wkspace_base;
loadbuf_end[-1] = ' ';
for (file_idx = 0; file_idx < file_ct; file_idx++) {
+ if (sorted_extract_ids) {
+ fill_ulong_zero(duplicate_id_bitfield, extract_ctl);
+ } else {
+ for (uii = 0; uii < HASHSIZE; uii++) {
+ duplicate_id_htable[uii] = NULL;
+ }
+ }
fname_len = strlen(fname_ptr);
- // divide by two and subtract 16 to prevent overlap between loadbuf and new
- // hash table entry.
- loadbuf_size = (((uintptr_t)(loadbuf_end - ((char*)htable_write))) / 2);
- if (loadbuf_size > MAXLINEBUFLEN + 16) {
- loadbuf_size = MAXLINEBUFLEN + 16;
- } else if (loadbuf_size <= MAXLINELEN + 16) {
+ // prevent overlap between loadbuf and new hash table entries.
+ loadbuf_size = (((uintptr_t)(loadbuf_end - ((char*)htable_write))) / 4);
+ if (loadbuf_size > MAXLINEBUFLEN) {
+ loadbuf_size = MAXLINEBUFLEN;
+ } else if (loadbuf_size <= MAXLINELEN) {
goto meta_analysis_ret_NOMEM;
}
- loadbuf_size -= 16;
loadbuf = &(loadbuf_end[-((intptr_t)loadbuf_size)]);
+ duplicate_id_htable_write = (Ll_str*)loadbuf;
+ htable_write_limit = ((uintptr_t)loadbuf) - loadbuf_size - 16;
token_ct = parse_max;
retval = meta_analysis_open_and_read_header(fname_ptr, loadbuf, loadbuf_size, sorted_header_dict, header_id_map, header_dict_ct, max_header_len, weighted_z, &token_ct, &gz_infile, col_skips, col_sequence, &line_idx, &line_max);
if (retval) {
@@ -5494,6 +5523,10 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
}
bufptr = skip_initial_spaces(loadbuf);
if (is_eoln_kns(*bufptr)) {
+ slen = strlen(bufptr) + ((uintptr_t)(bufptr - loadbuf));
+ if (slen >= line_max) {
+ line_max = slen + 1;
+ }
continue;
}
bufptr = next_token_multz(bufptr, col_skips[0]);
@@ -5517,20 +5550,55 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
}
bufptr = token_ptrs[0];
var_id_len = strlen_se(bufptr);
- if (sorted_extract_ids && (bsearch_str(bufptr, var_id_len, sorted_extract_ids, max_extract_id_len, extract_ct) == -1)) {
- continue;
- }
if (var_id_len > MAX_ID_LEN) {
sprintf(logbuf, "Error: Line %" PRIuPTR " of %s has an excessively long variant ID.\n", line_idx, fname_ptr);
goto meta_analysis_ret_INVALID_FORMAT_WW;
}
+ bufptr[var_id_len] = '\0';
+ uii = hashval2(bufptr, var_id_len++);
+ // var_id_len now includes null-terminator
+ if (sorted_extract_ids) {
+ ii = bsearch_str(bufptr, var_id_len - 1, sorted_extract_ids, max_extract_id_len, extract_ct);
+ if (ii == -1) {
+ continue;
+ }
+ if (is_set(duplicate_id_bitfield, ii)) {
+ problem_mask = 0x200;
+ goto meta_analysis_report_error;
+ }
+ set_bit(duplicate_id_bitfield, ii);
+ } else {
+ ll_pptr = &(duplicate_id_htable[uii]);
+ while (1) {
+ ll_ptr = *ll_pptr;
+ if ((!ll_ptr) || (!strcmp(bufptr, ll_ptr->ss))) {
+ break;
+ }
+ ll_pptr = &(ll_ptr->next);
+ }
+ if (ll_ptr) {
+ problem_mask = 0x200;
+ goto meta_analysis_report_error;
+ }
+ // word-align for now
+ // note that it is NOT safe to use uii here.
+ ulii = sizeof(intptr_t) + ((var_id_len + BYTECT - 1) & (~(BYTECT - 1)));
+ if (((uintptr_t)htable_write) + ulii > ((uintptr_t)duplicate_id_htable_write)) {
+ goto meta_analysis_ret_NOMEM;
+ }
+ duplicate_id_htable_write = (Ll_str*)(((uintptr_t)duplicate_id_htable_write) - ulii);
+ *ll_pptr = duplicate_id_htable_write;
+ duplicate_id_htable_write->next = NULL;
+ memcpy(duplicate_id_htable_write->ss, bufptr, var_id_len);
+ }
+ ll_pptr = &(htable[uii]);
// validate
problem_mask = 0;
if (use_map) {
ii = get_chrom_code(chrom_info_ptr, token_ptrs[5]);
if (ii < 0) {
- problem_mask |= 1;
+ problem_mask = 1;
} else {
cur_chrom = (uint32_t)ii;
if (!is_set(chrom_info_ptr->chrom_mask, cur_chrom)) {
@@ -5574,12 +5642,8 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
problem_mask |= 0x100;
}
}
- // check hash table
+ // check main hash table
bufptr = token_ptrs[0];
- bufptr[var_id_len] = '\0';
- uii = hashval2(bufptr, var_id_len++);
- // var_id_len now includes null-terminator
- ll_pptr = &(htable[uii]);
while (1) {
ll_ptr = *ll_pptr;
if ((!ll_ptr) || (!strcmp(bufptr, &(ll_ptr->ss[slen_base])))) {
@@ -5622,15 +5686,9 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
final_variant_ct++;
}
htable_write = (Ll_str*)((((uintptr_t)wptr) + sizeof(uintptr_t) - 1) & (~(sizeof(uintptr_t) - ONELU)));
- // now shrink loadbuf if necessary
- loadbuf_size = (((uintptr_t)(loadbuf_end - ((char*)htable_write))) / 2);
- if (loadbuf_size > MAXLINEBUFLEN + 16) {
- loadbuf_size = MAXLINEBUFLEN + 16;
- } else if (loadbuf_size <= MAXLINELEN + 16) {
+ if ((((uintptr_t)htable_write) > ((uintptr_t)duplicate_id_htable_write)) || (((uintptr_t)htable_write) > htable_write_limit)) {
goto meta_analysis_ret_NOMEM;
}
- loadbuf_size -= 16;
- loadbuf = &(loadbuf_end[-((intptr_t)loadbuf_size)]);
} else {
if ((token_ct - 2 * weighted_z < 6) || meta_analysis_allelic_match(&(ll_ptr->ss[slen_base + var_id_len]), token_ptrs, token_ct, a1lenp1, a2lenp1)) {
if (problem_mask) {
@@ -5647,6 +5705,9 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
} else {
problem_mask |= 0x40;
meta_analysis_report_error:
+ if ((problem_mask == 0x200) && (!report_dups)) {
+ continue;
+ }
if (!outfile) {
memcpy(outname_end, ".prob", 6);
if (fopen_checked(&outfile, outname, "w")) {
@@ -5671,13 +5732,19 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
goto meta_analysis_ret_READ_FAIL;
}
gz_infile = NULL;
+ if (!sorted_extract_ids) {
+ ulii = ((uintptr_t)loadbuf) - ((uintptr_t)duplicate_id_htable_write);
+ if (ulii > duplicate_id_htable_max_alloc) {
+ duplicate_id_htable_max_alloc = ulii;
+ }
+ }
fname_ptr = &(fname_ptr[fname_len + 1]);
}
if (outfile) {
if (fclose_null(&outfile)) {
goto meta_analysis_ret_WRITE_FAIL;
}
- LOGPRINTFWW("--meta-analysis: %" PRIuPTR " problematic line%s; see %s .\n", rejected_ct, (rejected_ct == 1)? "" : "s", outname);
+ LOGPRINTFWW("--meta-analysis: %" PRIu64 " problematic line%s; see %s .\n", rejected_ct, (rejected_ct == 1)? "" : "s", outname);
}
// 5. Determine final set of variants, and sort them (by chromosome, then
@@ -5709,6 +5776,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
// sequentially
ll_ptr = (Ll_str*)wkspace_base;
for (master_var_idx = 0; master_var_idx < final_variant_ct;) {
+ cur_file_ct_m1 = 0; // clear high bits
memcpy(&cur_file_ct_m1, ll_ptr->ss, file_ct_byte_width);
if (report_all || cur_file_ct_m1) {
wptr = &(master_var_list[master_var_idx * master_var_entry_len]);
@@ -5744,8 +5812,11 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
ll_ptr = (Ll_str*)((((uintptr_t)bufptr) + sizeof(uintptr_t) - 1) & (~(sizeof(uintptr_t) - ONELU)));
}
qsort(master_var_list, final_variant_ct, master_var_entry_len, strcmp_natural);
- // don't need sorted_extract_ids anymore
+ // don't need htable anymore
wkspace_reset(wkspace_mark2);
+ if (!sorted_extract_ids) {
+ wkspace_alloc(duplicate_id_htable_max_alloc);
+ }
total_data_slots = (wkspace_left - topsize) / sizeof(uintptr_t);
// 6. Remaining load passes: determine how many remaining variants' worth of
@@ -5815,6 +5886,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
bufptr2 = &(bufptr[cur_variant_ct * master_var_entry_len]);
bufptr2 = (char*)memchr(bufptr2, 0, master_var_entry_len);
bufptr2++;
+ cur_file_ct_m1 = 0;
memcpy(&cur_file_ct_m1, bufptr2, file_ct_byte_width);
cur_data_slots = 0;
if (report_study_specific) {
@@ -5825,6 +5897,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
#endif
}
if (!no_allele) {
+ cur_combined_allele_len = 0;
memcpy(&cur_combined_allele_len, &(bufptr2[file_ct_byte_width]), combined_allele_len_byte_width);
cur_data_slots += (8 / BYTECT) * ((cur_combined_allele_len + 7) / 8);
}
@@ -5871,6 +5944,14 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
}
fname_ptr = input_fnames;
for (file_idx = 0; file_idx < file_ct; file_idx++) {
+ if (sorted_extract_ids) {
+ fill_ulong_zero(duplicate_id_bitfield, extract_ctl);
+ } else {
+ for (uii = 0; uii < HASHSIZE; uii++) {
+ duplicate_id_htable[uii] = NULL;
+ }
+ }
+ duplicate_id_htable_write = (Ll_str*)wkspace_mark2;
fname_len = strlen(fname_ptr);
token_ct = parse_max;
retval = meta_analysis_open_and_read_header(fname_ptr, loadbuf, loadbuf_size, sorted_header_dict, header_id_map, header_dict_ct, max_header_len, weighted_z, &token_ct, &gz_infile, col_skips, col_sequence, NULL, NULL);
@@ -5898,6 +5979,40 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
if (!bufptr) {
continue;
}
+ bufptr = token_ptrs[0];
+ var_id_len = strlen_se(bufptr);
+ if (var_id_len >= max_var_id_len_p1) {
+ continue;
+ }
+ bufptr[var_id_len] = '\0';
+ if (sorted_extract_ids) {
+ ii = bsearch_str(bufptr, var_id_len, sorted_extract_ids, max_extract_id_len, extract_ct);
+ if (ii == -1) {
+ continue;
+ }
+ if (is_set(duplicate_id_bitfield, ii)) {
+ continue;
+ }
+ set_bit(duplicate_id_bitfield, ii);
+ } else {
+ uii = hashval2(bufptr, var_id_len);
+ ll_pptr = &(duplicate_id_htable[uii]);
+ while (1) {
+ ll_ptr = *ll_pptr;
+ if ((!ll_ptr) || (!strcmp(bufptr, ll_ptr->ss))) {
+ break;
+ }
+ ll_pptr = &(ll_ptr->next);
+ }
+ if (ll_ptr) {
+ continue;
+ }
+ *ll_pptr = duplicate_id_htable_write;
+ duplicate_id_htable_write->next = NULL;
+ memcpy(duplicate_id_htable_write->ss, bufptr, var_id_len + 1);
+ ulii = sizeof(intptr_t) + ((var_id_len + BYTECT) & (~(BYTECT - 1)));
+ duplicate_id_htable_write = (Ll_str*)(((uintptr_t)duplicate_id_htable_write) + ulii);
+ }
if (use_map) {
ii = get_chrom_code(chrom_info_ptr, token_ptrs[5]);
if (ii < 0) {
@@ -5947,17 +6062,15 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
continue;
}
}
- bufptr = token_ptrs[0];
- var_id_len = strlen_se(bufptr);
- if (var_id_len >= max_var_id_len_p1) {
- continue;
- }
+ bufptr = token_ptrs[0];
if (use_map) {
ii = bsearch_str(bufptr, var_id_len, cur_window_marker_ids, max_var_id_len_p5, cur_variant_ct);
if (ii == -1) {
continue;
}
- cur_var_idx = 0; // clear high bits
+#ifdef __LP64__
+ cur_var_idx = 0; // clear high 32 bits
+#endif
memcpy(&cur_var_idx, &(cur_window_marker_ids[(((uint32_t)ii) * max_var_id_len_p5) + max_var_id_len_p1]), 4);
} else {
bufptr[var_id_len] = '\0';
@@ -6177,7 +6290,12 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
for (file_idx = 0; file_idx < file_ct; file_idx++) {
if (is_set(ulptr, file_idx)) {
uii++;
- double_f_writew74x(&(tbuf[1]), exp(cur_data_ptr[((int32_t)(uii + weighted_z)) * (-2)]), '\0');
+ dxx = cur_data_ptr[((int32_t)(uii + weighted_z)) * (-2)];
+ if (!output_beta) {
+ // finish fixing PLINK 1.07 bug
+ dxx = exp(dxx);
+ }
+ double_f_writew74x(&(tbuf[1]), dxx, '\0');
fputs(tbuf, outfile);
} else {
fputs(" NA", outfile);
diff --git a/plink_misc.h b/plink_misc.h
index 799ee73..41a49de 100644
--- a/plink_misc.h
+++ b/plink_misc.h
@@ -14,6 +14,7 @@
#define METAANAL_LOGSCALE 0x10
#define METAANAL_QT 0x20
#define METAANAL_WEIGHTED_Z 0x40
+#define METAANAL_REPORT_DUPS 0x80
typedef struct {
char* fname;
diff --git a/plink_perm.c b/plink_perm.c
new file mode 100644
index 0000000..20320f4
--- /dev/null
+++ b/plink_perm.c
@@ -0,0 +1,333 @@
+#include "plink_common.h"
+
+void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
+  // Writes a random 2-bit-packed vector to perm_vec in which set_ct of the
+  // tot_ct entries end up with their low bit set.  Assumes tot_quotient is
+  // 2^32 / tot_ct, and totq_* have been precomputed from magic_num().
+  uint32_t num_set = 0;
+  uint32_t upper_bound = tot_ct * tot_quotient - 1; // draws above this value are rejected and redrawn
+  uintptr_t widx;
+  uintptr_t wcomp;
+  uintptr_t pv_val;
+  uint32_t urand;
+  uint32_t uii;
+  if (set_ct * 2 < tot_ct) { // fewer than half set: start all-clear and set set_ct entries
+    fill_ulong_zero(perm_vec, 2 * ((tot_ct + (BITCT - 1)) / BITCT));
+    for (; num_set < set_ct; num_set++) {
+      do {
+        do {
+          urand = sfmt_genrand_uint32(sfmtp);
+        } while (urand > upper_bound);
+        uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift; // presumably urand / tot_quotient, i.e. an index below tot_ct -- confirm against magic_num()
+        widx = uii / BITCT2; // word holding entry uii
+        wcomp = ONELU << (2 * (uii % BITCT2)); // low bit of entry uii's 2-bit slot
+        pv_val = perm_vec[widx];
+      } while (pv_val & wcomp); // entry already set: draw again
+      perm_vec[widx] = pv_val | wcomp;
+    }
+  } else { // at least half set: start from fill_vec_55() and clear the complement
+    fill_vec_55(perm_vec, tot_ct); // NOTE(review): presumably sets the low bit of each of the first tot_ct entries -- confirm
+    set_ct = tot_ct - set_ct; // number of entries to clear
+    for (; num_set < set_ct; num_set++) {
+      do {
+        do {
+          urand = sfmt_genrand_uint32(sfmtp);
+        } while (urand > upper_bound);
+        uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
+        widx = uii / BITCT2;
+        wcomp = ONELU << (2 * (uii % BITCT2));
+        pv_val = perm_vec[widx];
+      } while (!(pv_val & wcomp)); // entry already cleared: draw again
+      perm_vec[widx] = pv_val - wcomp; // bit is known to be set, so subtraction clears it
+    }
+  }
+}
+
+void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp) {
+  // generate_cc_perm_vec() variant which uses 1-bit packing instead of 2.
+  uint32_t num_set = 0;
+  uint32_t upper_bound = tot_ct * tot_quotient - 1; // draws above this value are rejected and redrawn
+  uintptr_t widx;
+  uintptr_t wcomp;
+  uintptr_t pv_val;
+  uint32_t urand;
+  uint32_t uii;
+  if (set_ct * 2 < tot_ct) { // fewer than half set: start all-clear and set set_ct bits
+    fill_ulong_zero(perm_vec, (tot_ct + (BITCT - 1)) / BITCT);
+    for (; num_set < set_ct; num_set++) {
+      do {
+        do {
+          urand = sfmt_genrand_uint32(sfmtp);
+        } while (urand > upper_bound);
+        uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift; // presumably urand / tot_quotient, i.e. an index below tot_ct -- confirm against magic_num()
+        widx = uii / BITCT; // word holding bit uii
+        wcomp = ONELU << (uii % BITCT); // mask for bit uii within that word
+        pv_val = perm_vec[widx];
+      } while (pv_val & wcomp); // bit already set: draw again
+      perm_vec[widx] = pv_val | wcomp;
+    }
+  } else { // at least half set: start from fill_all_bits() and clear the complement
+    fill_all_bits(perm_vec, tot_ct); // NOTE(review): presumably sets the first tot_ct bits -- confirm
+    set_ct = tot_ct - set_ct; // number of bits to clear
+    for (; num_set < set_ct; num_set++) {
+      do {
+        do {
+          urand = sfmt_genrand_uint32(sfmtp);
+        } while (urand > upper_bound);
+        uii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
+        widx = uii / BITCT;
+        wcomp = ONELU << (uii % BITCT);
+        pv_val = perm_vec[widx];
+      } while (!(pv_val & wcomp)); // bit already cleared: draw again
+      perm_vec[widx] = pv_val - wcomp; // bit is known to be set, so subtraction clears it
+    }
+  }
+}
+
+void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
+  uint32_t tot_ctl2 = 2 * ((tot_ct + (BITCT - 1)) / BITCT); // number of words copied from preimage to perm_vec
+  uint32_t cluster_idx;
+  uint32_t target_ct;
+  uint32_t cluster_end;
+  uint32_t* map_ptr;
+  uint32_t num_swapped;
+  uint32_t cluster_size;
+  uint32_t upper_bound;
+  uint64_t totq_magic;
+  uint32_t totq_preshift;
+  uint32_t totq_postshift;
+  uint32_t totq_incr;
+  uintptr_t widx;
+  uintptr_t wcomp;
+  uintptr_t pv_val;
+  uint32_t urand;
+  uint32_t uii;
+  memcpy(perm_vec, preimage, tot_ctl2 * sizeof(intptr_t)); // start from preimage; only clusters handled below are modified
+  for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
+    target_ct = cluster_case_cts[cluster_idx];
+    cluster_end = cluster_starts[cluster_idx + 1]; // so cluster_starts is read up to index cluster_ct
+    cluster_size = cluster_end - cluster_starts[cluster_idx];
+    if (target_ct && (target_ct != cluster_size)) { // nothing is drawn for clusters with 0 or cluster_size cases
+      upper_bound = cluster_size * tot_quotients[cluster_idx] - 1; // draws above this value are rejected and redrawn
+      totq_magic = totq_magics[cluster_idx];
+      totq_preshift = totq_preshifts[cluster_idx];
+      totq_postshift = totq_postshifts[cluster_idx];
+      totq_incr = totq_incrs[cluster_idx];
+      map_ptr = &(cluster_map[cluster_starts[cluster_idx]]); // this cluster's slice of cluster_map
+      if (target_ct * 2 < cluster_size) { // NOTE(review): this branch only sets bits, so it assumes preimage has them clear here -- confirm at caller
+        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
+          do {
+            do {
+              urand = sfmt_genrand_uint32(sfmtp);
+            } while (urand > upper_bound);
+            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)]; // within-cluster draw mapped to an entry index via cluster_map
+            widx = uii / BITCT2;
+            wcomp = ONELU << (2 * (uii % BITCT2)); // low bit of entry uii's 2-bit slot
+            pv_val = perm_vec[widx];
+          } while (pv_val & wcomp); // entry already set: draw again
+          perm_vec[widx] = pv_val | wcomp;
+        }
+      } else { // NOTE(review): this branch only clears bits, so it assumes preimage has them set here -- confirm at caller
+        target_ct = cluster_size - target_ct; // number of entries to clear
+        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
+          do {
+            do {
+              urand = sfmt_genrand_uint32(sfmtp);
+            } while (urand > upper_bound);
+            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
+            widx = uii / BITCT2;
+            wcomp = ONELU << (2 * (uii % BITCT2));
+            pv_val = perm_vec[widx];
+          } while (!(pv_val & wcomp)); // entry already cleared: draw again
+          perm_vec[widx] = pv_val - wcomp; // bit is known to be set, so subtraction clears it
+        }
+      }
+    }
+  }
+}
+
+void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp) {
+  uint32_t tot_ctl = (tot_ct + (BITCT - 1)) / BITCT; // number of words copied from preimage to perm_vec
+  uint32_t cluster_idx;
+  uint32_t target_ct;
+  uint32_t cluster_end;
+  uint32_t cluster_size;
+  uint32_t* map_ptr;
+  uint32_t num_swapped;
+  uint32_t upper_bound;
+  uint64_t totq_magic;
+  uint32_t totq_preshift;
+  uint32_t totq_postshift;
+  uint32_t totq_incr;
+  uintptr_t widx;
+  uintptr_t wcomp;
+  uintptr_t pv_val;
+  uint32_t urand;
+  uint32_t uii;
+  memcpy(perm_vec, preimage, tot_ctl * sizeof(intptr_t)); // start from preimage; only clusters handled below are modified
+  for (cluster_idx = 0; cluster_idx < cluster_ct; cluster_idx++) {
+    target_ct = cluster_case_cts[cluster_idx];
+    cluster_end = cluster_starts[cluster_idx + 1]; // so cluster_starts is read up to index cluster_ct
+    cluster_size = cluster_end - cluster_starts[cluster_idx];
+    if (target_ct && (target_ct != cluster_size)) { // nothing is drawn for clusters with 0 or cluster_size cases
+      upper_bound = cluster_size * tot_quotients[cluster_idx] - 1; // draws above this value are rejected and redrawn
+      totq_magic = totq_magics[cluster_idx];
+      totq_preshift = totq_preshifts[cluster_idx];
+      totq_postshift = totq_postshifts[cluster_idx];
+      totq_incr = totq_incrs[cluster_idx];
+      map_ptr = &(cluster_map[cluster_starts[cluster_idx]]); // this cluster's slice of cluster_map
+      if (target_ct * 2 < cluster_size) { // NOTE(review): this branch only sets bits, so it assumes preimage has them clear here -- confirm at caller
+        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
+          do {
+            do {
+              urand = sfmt_genrand_uint32(sfmtp);
+            } while (urand > upper_bound);
+            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)]; // within-cluster draw mapped to a bit index via cluster_map
+            widx = uii / BITCT;
+            wcomp = ONELU << (uii % BITCT); // mask for bit uii within its word
+            pv_val = perm_vec[widx];
+          } while (pv_val & wcomp); // bit already set: draw again
+          perm_vec[widx] = pv_val | wcomp;
+        }
+      } else { // NOTE(review): this branch only clears bits, so it assumes preimage has them set here -- confirm at caller
+        target_ct = cluster_size - target_ct; // number of bits to clear
+        for (num_swapped = 0; num_swapped < target_ct; num_swapped++) {
+          do {
+            do {
+              urand = sfmt_genrand_uint32(sfmtp);
+            } while (urand > upper_bound);
+            uii = map_ptr[(uint32_t)((totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift)];
+            widx = uii / BITCT;
+            wcomp = ONELU << (uii % BITCT);
+            pv_val = perm_vec[widx];
+          } while (!(pv_val & wcomp)); // bit already cleared: draw again
+          perm_vec[widx] = pv_val - wcomp; // bit is known to be set, so subtraction clears it
+        }
+      }
+    }
+  }
+}
+
+void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) {
+  // Transpose permutations so PRESTO/PERMORY-style genotype indexing can work.
+  //
+  // We use a 32-ply interleaved format, to allow counts up to the uint32_t
+  // limit without giving up highly parallel adds in the calc_git() inner loop
+  // (performed with a combination of unroll_incr_1_4, unroll_incr_4_8, and
+  // unroll_incr_8_32).  The index order is:
+  // 64-bit build:
+  //   first 16 bytes: 0 32 64 96 16 48 80 112 4 36 68 100 20 52 84 116
+  //     8 40 72 104 24 56 88 120 12 44 76 108 28 60 92 124 1...
+  //   next 16 bytes: 128 160 192...
+  //
+  // 32-bit build:
+  //   first 4 bytes: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18...
+  //   next 4 bytes: 32 40 48...
+  uintptr_t sample_idx = 0;
+  uintptr_t pheno_nm_ctl2 = 2 * ((pheno_nm_ct + (BITCT - 1)) / BITCT); // word stride between consecutive permutations in perm_vecs
+#ifdef __LP64__
+  uint32_t wbuf[4]; // 16-byte staging buffer for one output group
+  uint32_t* wbptr;
+#else
+  uint32_t wval;
+#endif
+  uint32_t rshift;
+  uint32_t wshift;
+  uintptr_t* pvptr;
+  uintptr_t perm_idx;
+  for (; sample_idx < pheno_nm_ct; sample_idx++) {
+    perm_idx = 0;
+    pvptr = &(perm_vecs[sample_idx / BITCT2]); // word holding this sample's 2-bit slot in permutation 0
+    rshift = 2 * (sample_idx % BITCT2); // position of the slot's low bit within that word
+    goto transpose_perms_loop_start; // enter the loop at the buffer reset, skipping the flush for perm_idx 0
+#ifdef __LP64__
+    do {
+      if (!(perm_idx % 4)) { // every 4 permutations: wrap back to wbuf[0] at a new bit position
+        if (perm_idx % 128) {
+          wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1); // bit position giving the interleaved order above
+        } else {
+          memcpy(perm_vecst, wbuf, 16); // 128 permutations buffered: emit them
+          perm_vecst = &(perm_vecst[4]);
+        transpose_perms_loop_start:
+          fill_uint_zero(wbuf, 4);
+          wshift = 0;
+        }
+        wbptr = wbuf;
+      }
+      *wbptr |= ((pvptr[perm_idx * pheno_nm_ctl2] >> rshift) & 1) << wshift; // low bit of this sample in permutation perm_idx
+      wbptr++;
+    } while (++perm_idx < perm_vec_ct);
+    memcpy(perm_vecst, wbuf, 16); // emit the final (possibly partial) group for this sample
+    perm_vecst = &(perm_vecst[4]);
+#else
+    do {
+      if (perm_idx % 32) {
+        wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3); // bit position giving the interleaved order above
+      } else {
+        *perm_vecst++ = wval; // 32 permutations buffered: emit them
+      transpose_perms_loop_start:
+        wval = 0;
+        wshift = 0;
+      }
+      wval |= ((pvptr[perm_idx * pheno_nm_ctl2] >> rshift) & 1) << wshift; // low bit of this sample in permutation perm_idx
+    } while (++perm_idx < perm_vec_ct);
+    *perm_vecst++ = wval; // emit the final (possibly partial) word for this sample
+#endif
+  }
+}
+
+void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst) { // same interleaved transpose as transpose_perms(), reading 1-bit-packed perm_vecs
+  uintptr_t sample_idx = 0;
+  uintptr_t pheno_nm_ctl = (pheno_nm_ct + (BITCT - 1)) / BITCT; // word stride between consecutive permutations in perm_vecs
+#ifdef __LP64__
+  uint32_t wbuf[4]; // 16-byte staging buffer for one output group
+  uint32_t* wbptr;
+#else
+  uint32_t wval;
+#endif
+  uint32_t rshift;
+  uint32_t wshift;
+  uintptr_t* pvptr;
+  uintptr_t perm_idx;
+  for (; sample_idx < pheno_nm_ct; sample_idx++) {
+    perm_idx = 0;
+    pvptr = &(perm_vecs[sample_idx / BITCT]); // word holding this sample's bit in permutation 0
+    rshift = sample_idx % BITCT; // position of that bit within the word
+    goto transpose_perm1s_loop_start; // enter the loop at the buffer reset, skipping the flush for perm_idx 0
+#ifdef __LP64__
+    do {
+      if (!(perm_idx % 4)) { // every 4 permutations: wrap back to wbuf[0] at a new bit position
+        if (perm_idx % 128) {
+          wshift = ((perm_idx & 96) >> 5) | ((perm_idx & 16) >> 2) | ((perm_idx & 12) << 1); // bit position within each buffered word
+        } else {
+          memcpy(perm_vecst, wbuf, 16); // 128 permutations buffered: emit them
+          perm_vecst = &(perm_vecst[4]);
+        transpose_perm1s_loop_start:
+          fill_uint_zero(wbuf, 4); // clear all four words: each is OR'd into below and all 16 bytes are emitted
+          wshift = 0;
+        }
+        wbptr = wbuf;
+      }
+      *wbptr |= ((pvptr[perm_idx * pheno_nm_ctl] >> rshift) & 1) << wshift; // this sample's bit in permutation perm_idx
+      wbptr++;
+    } while (++perm_idx < perm_vec_ct);
+    memcpy(perm_vecst, wbuf, 16); // emit the final (possibly partial) group for this sample
+    perm_vecst = &(perm_vecst[4]);
+#else
+    do {
+      if (perm_idx % 32) {
+        wshift = ((perm_idx & 24) >> 3) | (perm_idx & 4) | ((perm_idx & 3) << 3); // bit position within the buffered word
+      } else {
+        *perm_vecst++ = wval; // 32 permutations buffered: emit them
+      transpose_perm1s_loop_start:
+        wval = 0;
+        wshift = 0;
+      }
+      wval |= ((pvptr[perm_idx * pheno_nm_ctl] >> rshift) & 1) << wshift; // this sample's bit in permutation perm_idx
+    } while (++perm_idx < perm_vec_ct);
+    *perm_vecst++ = wval; // emit the final (possibly partial) word for this sample
+#endif
+  }
+}
+
+// todo: add multithread globals with extern linkage
diff --git a/plink_perm.h b/plink_perm.h
new file mode 100644
index 0000000..8de84e5
--- /dev/null
+++ b/plink_perm.h
@@ -0,0 +1,196 @@
+#ifndef __PLINK_PERM_H__
+#define __PLINK_PERM_H__
+// Permutation generation and interpretation code common to many association
+// tests.
+
+void generate_cc_perm_vec(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp);
+// generate_cc_perm_vec() variant which uses 1-bit packing instead of 2.
+void generate_cc_perm1(uint32_t tot_ct, uint32_t set_ct, uint32_t tot_quotient, uint64_t totq_magic, uint32_t totq_preshift, uint32_t totq_postshift, uint32_t totq_incr, uintptr_t* perm_vec, sfmt_t* sfmtp);
+// Per-cluster variant, 2-bit packing: perm_vec starts as a copy of preimage, then bits are drawn within each cluster.
+void generate_cc_cluster_perm_vec(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp);
+// generate_cc_cluster_perm_vec() variant which uses 1-bit packing instead of 2.
+void generate_cc_cluster_perm1(uint32_t tot_ct, uintptr_t* preimage, uint32_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* cluster_case_cts, uint32_t* tot_quotients, uint64_t* totq_magics, uint32_t* totq_preshifts, uint32_t* totq_postshifts, uint32_t* totq_incrs, uintptr_t* perm_vec, sfmt_t* sfmtp);
+
+// Efficient "vertical popcount" support.
+void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst);
+// transpose_perms() variant which reads 1-bit-packed perm_vecs instead of 2-bit.
+void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_nm_ct, uint32_t* perm_vecst);
+
+#ifdef __LP64__
+static inline void unroll_incr_1_4(const __m128i* acc1, __m128i* acc4, uint32_t acc1_vec_ct) { // adds every bit of acc1 into its own 4-bit field of acc4
+  const __m128i m1x4 = {0x1111111111111111LLU, 0x1111111111111111LLU}; // lowest bit of every nibble
+  __m128i loader;
+  uint32_t vidx;
+  for (vidx = 0; vidx < acc1_vec_ct; vidx++) { // each acc1 vector feeds four consecutive acc4 vectors
+    loader = *acc1++;
+    *acc4 = _mm_add_epi64(*acc4, _mm_and_si128(loader, m1x4)); // bits 0, 4, 8, ... of each 64-bit lane
+    acc4++;
+    loader = _mm_srli_epi64(loader, 1);
+    *acc4 = _mm_add_epi64(*acc4, _mm_and_si128(loader, m1x4)); // bits 1, 5, 9, ...
+    acc4++;
+    loader = _mm_srli_epi64(loader, 1);
+    *acc4 = _mm_add_epi64(*acc4, _mm_and_si128(loader, m1x4)); // bits 2, 6, 10, ...
+    acc4++;
+    loader = _mm_srli_epi64(loader, 1);
+    *acc4 = _mm_add_epi64(*acc4, _mm_and_si128(loader, m1x4)); // bits 3, 7, 11, ...
+    acc4++;
+  } // NOTE(review): nothing here stops a 4-bit field from overflowing; presumably the caller widens via unroll_*_4_8 in time
+}
+
+static inline void unroll_incr_4_8(const __m128i* acc4, __m128i* acc8, uint32_t acc4_vec_ct) { // adds every 4-bit field of acc4 into its own 8-bit field of acc8
+  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU}; // low nibble of every byte
+  __m128i loader;
+  uint32_t vidx;
+  for (vidx = 0; vidx < acc4_vec_ct; vidx++) { // each acc4 vector feeds two consecutive acc8 vectors
+    loader = *acc4++;
+    *acc8 = _mm_add_epi64(*acc8, _mm_and_si128(loader, m4)); // low nibbles
+    acc8++;
+    loader = _mm_srli_epi64(loader, 4);
+    *acc8 = _mm_add_epi64(*acc8, _mm_and_si128(loader, m4)); // high nibbles
+    acc8++;
+  }
+}
+
+static inline void unroll_zero_incr_4_8(__m128i* acc4, __m128i* acc8, uint32_t acc4_vec_ct) { // like unroll_incr_4_8(), but also zeroes acc4 as it is read
+  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU}; // low nibble of every byte
+  __m128i loader;
+  uint32_t vidx;
+  for (vidx = 0; vidx < acc4_vec_ct; vidx++) { // each acc4 vector feeds two consecutive acc8 vectors
+    loader = *acc4;
+    *acc4++ = _mm_setzero_si128(); // reset the 4-bit fields once their value is captured in loader
+    *acc8 = _mm_add_epi64(*acc8, _mm_and_si128(loader, m4)); // low nibbles
+    acc8++;
+    loader = _mm_srli_epi64(loader, 4);
+    *acc8 = _mm_add_epi64(*acc8, _mm_and_si128(loader, m4)); // high nibbles
+    acc8++;
+  }
+}
+
+static inline void unroll_incr_8_32(const __m128i* acc8, __m128i* acc32, uint32_t acc8_vec_ct) { // adds every 8-bit field of acc8 into its own 32-bit field of acc32
+  const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU}; // lowest byte of every 32-bit field
+  __m128i loader;
+  uint32_t vidx;
+  for (vidx = 0; vidx < acc8_vec_ct; vidx++) { // each acc8 vector feeds four consecutive acc32 vectors
+    loader = *acc8++;
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 0 of each 32-bit field
+    acc32++;
+    loader = _mm_srli_epi64(loader, 8);
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 1
+    acc32++;
+    loader = _mm_srli_epi64(loader, 8);
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 2
+    acc32++;
+    loader = _mm_srli_epi64(loader, 8);
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 3
+    acc32++;
+  }
+}
+
+static inline void unroll_zero_incr_8_32(__m128i* acc8, __m128i* acc32, uint32_t acc8_vec_ct) { // like unroll_incr_8_32(), but also zeroes acc8 as it is read
+  const __m128i m8x32 = {0x000000ff000000ffLLU, 0x000000ff000000ffLLU}; // lowest byte of every 32-bit field
+  __m128i loader;
+  uint32_t vidx;
+  for (vidx = 0; vidx < acc8_vec_ct; vidx++) { // each acc8 vector feeds four consecutive acc32 vectors
+    loader = *acc8;
+    *acc8++ = _mm_setzero_si128(); // reset the 8-bit fields once their value is captured in loader
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 0 of each 32-bit field
+    acc32++;
+    loader = _mm_srli_epi64(loader, 8);
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 1
+    acc32++;
+    loader = _mm_srli_epi64(loader, 8);
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 2
+    acc32++;
+    loader = _mm_srli_epi64(loader, 8);
+    *acc32 = _mm_add_epi64(*acc32, _mm_and_si128(loader, m8x32)); // byte 3
+    acc32++;
+  }
+}
+#else
+static inline void unroll_incr_1_4(const uintptr_t* acc1, uintptr_t* acc4, uint32_t acc1_word_ct) { // non-LP64 version: adds every bit of acc1 into its own 4-bit field of acc4
+  uint32_t widx;
+  uintptr_t loader;
+  for (widx = 0; widx < acc1_word_ct; widx++) { // each acc1 word feeds four consecutive acc4 words
+    loader = *acc1++;
+    *acc4 += loader & 0x11111111U; // bits 0, 4, 8, ...
+    acc4++;
+    loader >>= 1;
+    *acc4 += loader & 0x11111111U; // bits 1, 5, 9, ...
+    acc4++;
+    loader >>= 1;
+    *acc4 += loader & 0x11111111U; // bits 2, 6, 10, ...
+    acc4++;
+    loader >>= 1;
+    *acc4 += loader & 0x11111111U; // bits 3, 7, 11, ...
+    acc4++;
+  } // NOTE(review): nothing here stops a 4-bit field from overflowing; presumably the caller widens via unroll_*_4_8 in time
+}
+
+static inline void unroll_incr_4_8(const uintptr_t* acc4, uintptr_t* acc8, uint32_t acc4_word_ct) { // non-LP64 version: adds every 4-bit field of acc4 into its own 8-bit field of acc8
+  uint32_t widx;
+  uintptr_t loader;
+  for (widx = 0; widx < acc4_word_ct; widx++) { // each acc4 word feeds two consecutive acc8 words
+    loader = *acc4++;
+    *acc8 += loader & 0x0f0f0f0fU; // low nibbles
+    acc8++;
+    loader >>= 4;
+    *acc8 += loader & 0x0f0f0f0fU; // high nibbles
+    acc8++;
+  }
+}
+
+static inline void unroll_zero_incr_4_8(uintptr_t* acc4, uintptr_t* acc8, uint32_t acc4_word_ct) { // like the non-LP64 unroll_incr_4_8(), but also zeroes acc4 as it is read
+  uint32_t widx;
+  uintptr_t loader;
+  for (widx = 0; widx < acc4_word_ct; widx++) { // each acc4 word feeds two consecutive acc8 words
+    loader = *acc4;
+    *acc4++ = 0; // reset the 4-bit fields once their value is captured in loader
+    *acc8 += loader & 0x0f0f0f0fU; // low nibbles
+    acc8++;
+    loader >>= 4;
+    *acc8 += loader & 0x0f0f0f0fU; // high nibbles
+    acc8++;
+  }
+}
+
+static inline void unroll_incr_8_32(const uintptr_t* acc8, uintptr_t* acc32, uint32_t acc8_word_ct) { // non-LP64 version: adds each byte of an acc8 word into its own acc32 word
+  uint32_t widx;
+  uintptr_t loader;
+  for (widx = 0; widx < acc8_word_ct; widx++) { // each acc8 word feeds four consecutive acc32 words
+    loader = *acc8++;
+    *acc32 += (uint8_t)loader; // byte 0
+    acc32++;
+    loader >>= 8;
+    *acc32 += (uint8_t)loader; // byte 1
+    acc32++;
+    loader >>= 8;
+    *acc32 += (uint8_t)loader; // byte 2
+    acc32++;
+    loader >>= 8;
+    *acc32 += loader; // byte 3; unmasked, presumably because uintptr_t is 32 bits wide in this branch -- confirm
+    acc32++;
+  }
+}
+
+static inline void unroll_zero_incr_8_32(uintptr_t* acc8, uintptr_t* acc32, uint32_t acc8_word_ct) { // like the non-LP64 unroll_incr_8_32(), but also zeroes acc8 as it is read
+  uint32_t widx;
+  uintptr_t loader;
+  for (widx = 0; widx < acc8_word_ct; widx++) { // each acc8 word feeds four consecutive acc32 words
+    loader = *acc8;
+    *acc8++ = 0; // reset the 8-bit fields once their value is captured in loader
+    *acc32 += (uint8_t)loader; // byte 0
+    acc32++;
+    loader >>= 8;
+    *acc32 += (uint8_t)loader; // byte 1
+    acc32++;
+    loader >>= 8;
+    *acc32 += (uint8_t)loader; // byte 2
+    acc32++;
+    loader >>= 8;
+    *acc32 += loader; // byte 3; unmasked, presumably because uintptr_t is 32 bits wide in this branch -- confirm
+    acc32++;
+  }
+}
+#endif
+
+#endif // __PLINK_PERM_H__
diff --git a/plink_set.c b/plink_set.c
index 59e7ba2..014de18 100644
--- a/plink_set.c
+++ b/plink_set.c
@@ -250,7 +250,7 @@ uint32_t alloc_and_populate_nonempty_set_incl(Set_info* sip, uint32_t* nonempty_
return 0;
}
-int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_extend, uint32_t collapse_group, uint32_t fail_on_no_sets, uint32_t c_prefix, uintptr_t subset_ct, char* sorted_subset_ids, uintptr_t max_subset_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t* topsize_ptr, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_len_ptr, Make_set_range*** make_set_range_arr_ptr, uint64_t** range_sort_buf_ptr, const char* file_descrip) {
+int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_extend, uint32_t collapse_group, uint32_t fail_on_no_sets, uint32_t c_prefix, uint32_t allow_no_variants, uintptr_t subset_ct, char* sorted_subset_ids, uintptr_t max_subset_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t* topsize_ptr, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_len_ptr, Make_set_range*** make_set_range_arr_ptr, uint64_t** range_sort_buf_ptr, const ch [...]
// Called directly by extract_exclude_range(), define_sets(), and indirectly
// by annotate(), gene_report(), and clump_reports().
// Assumes topsize has not been subtracted off wkspace_left. (This remains
@@ -349,9 +349,12 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
if (!set_ct) {
if (fail_on_no_sets) {
if (marker_pos) {
- // okay, this is a kludge
- logerrprint("Error: All variants excluded by --gene{-all}, since no sets were defined from\n--make-set file.\n");
- retval = RET_ALL_MARKERS_EXCLUDED;
+ if (!allow_no_variants) {
+ // okay, this is a kludge
+ logerrprint("Error: All variants excluded by --gene{-all}, since no sets were defined from\n--make-set file.\n");
+ retval = RET_ALL_MARKERS_EXCLUDED;
+ goto load_range_list_ret_1;
+ }
} else {
if (subset_ct) {
logerrprint("Error: No --gene-subset genes present in --gene-report file.\n");
@@ -359,8 +362,8 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
logerrprint("Error: Empty --gene-report file.\n");
}
retval = RET_INVALID_FORMAT;
+ goto load_range_list_ret_1;
}
- goto load_range_list_ret_1;
}
LOGERRPRINTF("Warning: No valid ranges in %s file.\n", file_descrip);
goto load_range_list_ret_1;
@@ -539,7 +542,10 @@ int32_t load_range_list(FILE* infile, uint32_t track_set_names, uint32_t border_
return retval;
}
-int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t is_exclude, Chrom_info* chrom_info_ptr) {
+int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t is_exclude, uint32_t allow_no_variants, Chrom_info* chrom_info_ptr) {
+ if (unfiltered_marker_ct == *marker_exclude_ct_ptr) {
+ return 0;
+ }
unsigned char* wkspace_mark = wkspace_base;
uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
FILE* infile = NULL;
@@ -552,7 +558,7 @@ int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfil
if (fopen_checked(&infile, fname, "r")) {
goto extract_exclude_range_ret_OPEN_FAIL;
}
- retval = load_range_list(infile, 0, 0, 0, 0, 0, 0, NULL, 0, marker_pos, chrom_info_ptr, &topsize, NULL, NULL, NULL, &range_arr, NULL, is_exclude? "--exclude range" : "--extract range");
+ retval = load_range_list(infile, 0, 0, 0, 0, 0, allow_no_variants, 0, NULL, 0, marker_pos, chrom_info_ptr, &topsize, NULL, NULL, NULL, &range_arr, NULL, is_exclude? "--exclude range" : "--extract range");
if (retval) {
goto extract_exclude_range_ret_1;
}
@@ -580,7 +586,7 @@ int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfil
bitfield_or(marker_exclude, marker_exclude_new, unfiltered_marker_ctl);
}
*marker_exclude_ct_ptr = popcount_longs(marker_exclude, unfiltered_marker_ctl);
- if (*marker_exclude_ct_ptr == unfiltered_marker_ct) {
+ if ((*marker_exclude_ct_ptr == unfiltered_marker_ct) && (!allow_no_variants)) {
LOGERRPRINTF("Error: All variants excluded by '--%s range'.\n", is_exclude? "exclude" : "extract");
retval = RET_ALL_MARKERS_EXCLUDED;
} else if (*marker_exclude_ct_ptr == orig_marker_exclude_ct) {
@@ -745,6 +751,10 @@ uint32_t save_set_bitfield(uintptr_t* marker_bitfield_tmp, uint32_t marker_ct, u
save_set_bitfield_standard:
bound_bottom_d128 *= 128;
bound_top_d128 *= 128;
+ // bugfix
+ if (bound_top_d128 > marker_ct) {
+ bound_top_d128 = marker_ct;
+ }
(*set_range_pp)[0] = 0xffffffffU;
(*set_range_pp)[1] = bound_bottom_d128;
(*set_range_pp)[2] = bound_top_d128 - bound_bottom_d128;
@@ -940,7 +950,7 @@ uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t r
return 0;
}
-int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* marker_pos, uintptr_t* marker_exclude_ct_ptr, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr) {
+int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* marker_pos, uintptr_t* marker_exclude_ct_ptr, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, uint32_t allow_no_variants) {
FILE* infile = NULL;
uintptr_t topsize = 0;
char* sorted_marker_ids = NULL;
@@ -1004,7 +1014,11 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
}
bufptr = &(bufptr[slen + 1]);
if (!(*bufptr)) {
- goto define_sets_ret_ALL_MARKERS_EXCLUDED;
+ if (!allow_no_variants) {
+ goto define_sets_ret_ALL_MARKERS_EXCLUDED;
+ } else {
+ goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
+ }
}
}
free(sip->genekeep_flattened);
@@ -1022,8 +1036,12 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
bufptr = &(bufptr[slen]);
} while (*bufptr);
if (!genekeep_ct) {
- logerrprint("Error: All variants excluded by --gene.\n");
- goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
+ if (!allow_no_variants) {
+ logerrprint("Error: All variants excluded by --gene.\n");
+ goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
+ } else {
+ goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
+ }
}
sorted_genekeep_ids = (char*)top_alloc(&topsize, genekeep_ct * max_genekeep_len);
if (!sorted_genekeep_ids) {
@@ -1062,12 +1080,16 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
}
if (!subset_ct) {
if ((gene_all || sip->genekeep_flattened) && ((!sip->merged_set_name) || (!complement_sets))) {
- if (sip->subset_fname) {
- logerrprint("Error: All variants excluded, since --subset file is empty.\n");
+ if (!allow_no_variants) {
+ if (sip->subset_fname) {
+ logerrprint("Error: All variants excluded, since --subset file is empty.\n");
+ } else {
+ logerrprint("Error: All variants excluded, since --set-names was given no parameters.\n");
+ }
+ goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
} else {
- logerrprint("Error: All variants excluded, since --set-names was given no parameters.\n");
+ goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
}
- goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
}
if (sip->merged_set_name) {
goto define_sets_merge_nothing;
@@ -1117,7 +1139,7 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
}
// 3. load --make-set range list
if (make_set) {
- retval = load_range_list(infile, !sip->merged_set_name, sip->make_set_border, sip->modifier & SET_MAKE_COLLAPSE_GROUP, gene_all || sip->genekeep_flattened, c_prefix, subset_ct, sorted_subset_ids, max_subset_id_len, marker_pos, chrom_info_ptr, &topsize, &set_ct, &set_names, &max_set_id_len, &make_set_range_arr, &range_sort_buf, "--make-set");
+ retval = load_range_list(infile, !sip->merged_set_name, sip->make_set_border, sip->modifier & SET_MAKE_COLLAPSE_GROUP, gene_all || sip->genekeep_flattened, c_prefix, allow_no_variants, subset_ct, sorted_subset_ids, max_subset_id_len, marker_pos, chrom_info_ptr, &topsize, &set_ct, &set_names, &max_set_id_len, &make_set_range_arr, &range_sort_buf, "--make-set");
if (retval) {
goto define_sets_ret_1;
}
@@ -1251,11 +1273,16 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
}
if (!set_ct) {
if (!complement_sets) {
- logerrprint("Error: All variants excluded by --gene{-all}, since no sets were defined from\n--set file.\n");
- goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
+ if (!allow_no_variants) {
+ logerrprint("Error: All variants excluded by --gene{-all}, since no sets were defined from\n--set file.\n");
+ goto define_sets_ret_ALL_MARKERS_EXCLUDED_2;
+ } else {
+ goto define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED;
+ }
+ } else {
+ logerrprint("Warning: No sets defined from --set file.\n");
+ goto define_sets_ret_1;
}
- logerrprint("Warning: No sets defined from --set file.\n");
- goto define_sets_ret_1;
}
}
if (!complement_sets) {
@@ -1263,10 +1290,14 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
}
bitfield_or(marker_exclude, marker_exclude_new, unfiltered_marker_ctl);
marker_exclude_ct = popcount_longs(marker_exclude, unfiltered_marker_ctl);
+ *marker_exclude_ct_ptr = marker_exclude_ct;
if (marker_exclude_ct == unfiltered_marker_ct) {
- goto define_sets_ret_ALL_MARKERS_EXCLUDED;
+ if (!allow_no_variants) {
+ goto define_sets_ret_ALL_MARKERS_EXCLUDED;
+ } else {
+ goto define_sets_ret_1;
+ }
}
- *marker_exclude_ct_ptr = marker_exclude_ct;
marker_ct = unfiltered_marker_ct - marker_exclude_ct;
rewind(infile);
topsize = topsize_bak;
@@ -1624,6 +1655,10 @@ int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* ma
define_sets_ret_ALL_MARKERS_EXCLUDED_2:
retval = RET_ALL_MARKERS_EXCLUDED;
break;
+ define_sets_ret_EXCLUDE_ALL_MARKERS_ALLOWED:
+ fill_all_bits(marker_exclude, unfiltered_marker_ct);
+ *marker_exclude_ct_ptr = unfiltered_marker_ct;
+ break;
define_sets_ret_INVALID_FORMAT_EXTRA_END:
logerrprint("Error: Extra 'END' token in --set file.\n");
retval = RET_INVALID_FORMAT;
@@ -2142,7 +2177,7 @@ uint32_t setdefs_compress(Set_info* sip, uintptr_t* set_incl, uintptr_t set_ct,
fill_bits(cur_bitfield, marker_midx_to_idx[range_offset + range_stop], marker_ct_orig - range_offset - range_stop);
range_end = marker_ct;
} else {
- range_end = 1 + last_set_bit(cur_bitfield, (range_offset + range_stop + (BITCT - 1)) / BITCT);
+ range_end = 1 + last_set_bit(cur_bitfield, (marker_ct + (BITCT - 1)) / BITCT);
}
if (range_start) {
range_start = marker_midx_to_idx[next_set_unsafe(read_bitfield, 0) + range_offset];
@@ -2188,7 +2223,7 @@ int32_t load_range_list_sortpos(char* fname, uint32_t border_extend, uintptr_t s
if (fopen_checked(&infile, fname, "r")) {
goto load_range_list_sortpos_ret_OPEN_FAIL;
}
- retval = load_range_list(infile, 1, border_extend, 0, 0, 0, subset_ct, sorted_subset_ids, 0, NULL, chrom_info_ptr, &topsize, &gene_ct, gene_names_ptr, &max_gene_id_len, &gene_arr, &range_sort_buf, file_descrip);
+ retval = load_range_list(infile, 1, border_extend, 0, 0, 0, 0, subset_ct, sorted_subset_ids, 0, NULL, chrom_info_ptr, &topsize, &gene_ct, gene_names_ptr, &max_gene_id_len, &gene_arr, &range_sort_buf, file_descrip);
if (retval) {
goto load_range_list_sortpos_ret_1;
}
diff --git a/plink_set.h b/plink_set.h
index 14d1b08..e56f69b 100644
--- a/plink_set.h
+++ b/plink_set.h
@@ -94,13 +94,13 @@ uint32_t setdef_iter(uint32_t* setdef, uint32_t* cur_idx_ptr, uint32_t* aux_ptr)
uint32_t alloc_and_populate_nonempty_set_incl(Set_info* sip, uint32_t* nonempty_set_ct_ptr, uintptr_t** nonempty_set_incl_ptr);
-int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t is_exclude, Chrom_info* chrom_info_ptr);
+int32_t extract_exclude_range(char* fname, uint32_t* marker_pos, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uint32_t is_exclude, uint32_t allow_no_variants, Chrom_info* chrom_info_ptr);
uint32_t save_set_bitfield(uintptr_t* marker_bitfield_tmp, uint32_t marker_ct, uint32_t range_start, uint32_t range_end, uint32_t complement_sets, uint32_t** set_range_pp);
uint32_t save_set_range(uint64_t* range_sort_buf, uint32_t marker_ct, uint32_t rsb_last_idx, uint32_t complement_sets, uint32_t** set_range_pp);
-int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* marker_pos, uintptr_t* marker_exclude_ct_ptr, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr);
+int32_t define_sets(Set_info* sip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t* marker_pos, uintptr_t* marker_exclude_ct_ptr, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, uint32_t allow_no_variants);
int32_t write_set(Set_info* sip, char* outname, char* outname_end, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git
More information about the debian-med-commit mailing list