[med-svn] [plink1.9] 01/02: Imported Upstream version 1.90~b3u-150703
Dylan Aïssi
bob.dybian-guest at moszumanska.debian.org
Fri Jul 10 19:14:09 UTC 2015
This is an automated email from the git hooks/post-receive script.
bob.dybian-guest pushed a commit to branch master
in repository plink1.9.
commit abaa4fb2bd613d1a35df5098a2a789a86b063c46
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date: Fri Jul 10 21:12:47 2015 +0200
Imported Upstream version 1.90~b3u-150703
---
pigz.c | 17 ++-
plink.c | 402 ++++++++++++++++++++++++++++++++++---------------
plink_assoc.c | 9 +-
plink_calc.c | 8 +-
plink_cluster.c | 343 +++++++++++++++++++++++++++++++++++++++---
plink_cluster.h | 19 ++-
plink_common.c | 52 ++++++-
plink_common.h | 82 +++++-----
plink_data.c | 104 ++++++++-----
plink_dosage.c | 61 +++++++-
plink_dosage.h | 1 +
plink_family.c | 90 +++++------
plink_family.h | 2 +-
plink_filter.c | 88 ++++-------
plink_help.c | 30 ++--
plink_lasso.c | 456 +++++++++++++++++++++++++++++++++++---------------------
plink_ld.c | 1 +
plink_matrix.h | 7 +
plink_misc.c | 226 ++++++++++++++++++++++++++--
plink_misc.h | 2 +
plink_stats.c | 10 +-
21 files changed, 1454 insertions(+), 556 deletions(-)
diff --git a/pigz.c b/pigz.c
index 2a4b7ab..eb15d5e 100644
--- a/pigz.c
+++ b/pigz.c
@@ -1559,10 +1559,17 @@ void force_compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t wr
int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_t sslen) {
unsigned char* writep = (unsigned char*)(*writep_ptr);
unsigned char* readp = (unsigned char*)ss;
- uint32_t cur_write_space = 2 * PIGZ_BLOCK_SIZE - ((uintptr_t)(writep - ps_ptr->overflow_buf));
+ uint32_t cur_write_pos = (uintptr_t)(writep - ps_ptr->overflow_buf);
+ uint32_t delta;
int32_t ii;
- while (sslen > cur_write_space) {
- memcpy(writep, readp, cur_write_space);
+ while ((sslen + cur_write_pos) > PIGZ_BLOCK_SIZE) {
+ if (cur_write_pos <= PIGZ_BLOCK_SIZE) {
+ delta = PIGZ_BLOCK_SIZE + 1 - cur_write_pos;
+ memcpy(writep, readp, delta);
+ writep = &(writep[delta]);
+ readp = &(readp[delta]);
+ sslen -= delta;
+ }
if (is_uncompressed_pzwrite(ps_ptr)) {
ii = force_pzwrite(ps_ptr, (char**)(&writep), PIGZ_BLOCK_SIZE + 1);
if (ii) {
@@ -1571,9 +1578,7 @@ int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_
} else {
force_compressed_pzwrite(ps_ptr, (char**)(&writep), PIGZ_BLOCK_SIZE + 1);
}
- readp = &(readp[cur_write_space]);
- sslen -= cur_write_space;
- cur_write_space = 2 * PIGZ_BLOCK_SIZE;
+ cur_write_pos = (uintptr_t)(writep - ps_ptr->overflow_buf);
}
memcpy(writep, readp, sslen);
*writep_ptr = (char*)(&(writep[sslen]));
diff --git a/plink.c b/plink.c
index b50c0c7..7e84ba2 100644
--- a/plink.c
+++ b/plink.c
@@ -91,7 +91,7 @@
const char ver_str[] =
#ifdef STABLE_BUILD
- "PLINK v1.90b3l"
+ "PLINK v1.90b3u"
#else
"PLINK v1.90p"
#endif
@@ -103,9 +103,10 @@ const char ver_str[] =
#else
" 32-bit"
#endif
- // include trailing space if day < 10, so character length stays the same
- " (18 Apr 2015)";
+ " (3 Jul 2015)";
const char ver_str2[] =
+ // include leading space if day < 10, so character length stays the same
+ " "
#ifdef STABLE_BUILD
"" // (don't want this when version number has a trailing letter)
#else
@@ -664,6 +665,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
logprint("Error: --regress-distance calculation requires a scalar phenotype.\n");
} else if (calculation_type & CALC_UNRELATED_HERITABILITY) {
logprint("Error: --unrelated-heritability requires a scalar phenotype.\n");
+ } else if (calculation_type & CALC_GXE) {
+ logprint("Error: --gxe requires a scalar phenotype.\n");
}
goto plink_ret_INVALID_CMDLINE;
}
@@ -692,6 +695,9 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
} else if ((calculation_type & CALC_FST) && (misc_flags & MISC_FST_CC)) {
logprint("Error: '--fst case-control' requires a case/control phenotype.\n");
goto plink_ret_INVALID_CMDLINE;
+ } else if ((calculation_type & CALC_FREQ) && (misc_flags & MISC_FREQ_CC)) {
+ logprint("Error: '--freq case-control' requires a case/control phenotype.\n");
+ goto plink_ret_INVALID_CMDLINE;
}
}
@@ -1254,6 +1260,8 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
logprint("Note: --freq 'counts' modifier has no effect on cluster-stratified report.\n");
}
retval = write_stratified_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_ct, sample_f_ct, founder_info, nonfounders, sex_male, sample_f_male_ct, marker_reverse, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len);
+ } else if (misc_flags & MISC_FREQ_CC) {
+ retval = write_cc_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, founder_info, nonfounders, sex_male, marker_reverse, pheno_nm, pheno_c);
} else {
retval = write_freqs(outname, outname_end, plink_maxsnp, unfiltered_marker_ct, marker_exclude, set_allele_freqs, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, hwe_hapl_allfs, hwe_haph_allfs, sample_f_ct, sample_f_male_ct, nonfounders, misc_flags, marker_reverse);
}
@@ -2395,6 +2403,218 @@ void print_ver() {
fputs(ver_str2, stdout);
}
+int32_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t* argc_ptr, uint32_t* cur_arg_ptr, char*** argv_ptr, char*** subst_argv_ptr, char** rerun_buf_ptr) {
+ // caller is responsible for freeing rerun_buf
+ char** argv = *argv_ptr;
+ FILE* rerunfile = fopen(rerun_parameter_present? argv[rerun_argv_pos + 1] : (PROG_NAME_STR ".log"), "r");
+ uintptr_t line_idx = 1;
+ char** subst_argv2 = NULL;
+ uint32_t argc = (uint32_t)(*argc_ptr);
+ uint32_t cur_arg = *cur_arg_ptr;
+ int32_t retval = 0;
+ char* rerun_buf;
+ char* rerun_start_ptr;
+ char* sptr;
+ char* argptr;
+ char* argptr2;
+ char* load_ptr;
+ uint32_t line_byte_ct;
+ uint32_t loaded_arg_ct;
+ uint32_t duplicate_ct;
+ uint32_t loaded_arg_idx;
+ uint32_t cmdline_arg_idx;
+ uint32_t new_arg_idx;
+ uint32_t slen;
+ uint32_t slen2;
+ if (!rerunfile) {
+ print_ver();
+ goto rerun_ret_OPEN_FAIL;
+ }
+ tbuf[MAXLINELEN - 1] = ' ';
+ if (!fgets(tbuf, MAXLINELEN, rerunfile)) {
+ print_ver();
+ fputs("Error: Empty log file for --rerun.\n", stdout);
+ goto rerun_ret_INVALID_FORMAT;
+ }
+ if (!tbuf[MAXLINELEN - 1]) {
+ goto rerun_ret_LONG_LINE;
+ }
+ if (!fgets(tbuf, MAXLINELEN, rerunfile)) {
+ print_ver();
+ fputs("Error: Only one line in --rerun log file.\n", stdout);
+ goto rerun_ret_INVALID_FORMAT;
+ }
+ line_idx++;
+ if (!tbuf[MAXLINELEN - 1]) {
+ goto rerun_ret_LONG_LINE;
+ }
+ if ((tbuf[0] >= '0') && (tbuf[0] <= '9')) {
+ // Old "xx arguments: --aa bb --cc --dd" format
+ fclose_null(&rerunfile);
+ if (scan_posint_capped(tbuf, &loaded_arg_ct, (MAXLINELEN / 2) / 10, (MAXLINELEN / 2) % 10)) {
+ print_ver();
+ fputs("Error: Invalid argument count on line 2 of --rerun log file.\n", stdout);
+ goto rerun_ret_INVALID_FORMAT;
+ }
+ line_byte_ct = strlen(tbuf) + 1;
+ rerun_buf = (char*)malloc(line_byte_ct);
+ if (!rerun_buf) {
+ goto rerun_ret_NOMEM;
+ }
+ *rerun_buf_ptr = rerun_buf;
+ memcpy(rerun_buf, tbuf, line_byte_ct);
+ // skip "xx arguments: ", to get to the first flag
+ rerun_start_ptr = next_token_mult(rerun_buf, 2);
+ } else {
+ // Current, and also PLINK 1.07, "Options in effect:"
+ while (memcmp(tbuf, "Options in effect:", 18) || (tbuf[18] >= ' ')) {
+ line_idx++;
+ if (!fgets(tbuf, MAXLINELEN, rerunfile)) {
+ print_ver();
+ fputs("Error: Invalid log file for --rerun.\n", stdout);
+ goto rerun_ret_INVALID_FORMAT;
+ }
+ }
+ load_ptr = tbuf;
+ loaded_arg_ct = 0;
+ // We load each of the option lines in sequence into tbuf, always
+ // overwriting the previous line's newline. (Note that tbuf[] has
+ // size > 2 * MAXLINELEN; this lets us avoid additional dynamic memory
+ // allocation as long as we impose the constraint that all lines combined
+ // add up to less than MAXLINELEN bytes.)
+ while (1) {
+ load_ptr[MAXLINELEN - 1] = ' ';
+ if (!fgets(load_ptr, MAXLINELEN, rerunfile)) {
+ break;
+ }
+ line_idx++;
+ if (!tbuf[MAXLINELEN - 1]) {
+ goto rerun_ret_LONG_LINE;
+ }
+ sptr = skip_initial_spaces(load_ptr);
+ if (is_eoln_kns(*sptr)) {
+ *load_ptr = '\0';
+ break;
+ }
+ do {
+ argptr = token_endnn(sptr);
+ loaded_arg_ct++;
+ sptr = skip_initial_spaces(argptr);
+ } while (!is_eoln_kns(*sptr));
+ load_ptr = argptr;
+ if (load_ptr >= &(tbuf[MAXLINELEN])) {
+ print_ver();
+ fputs("Error: --rerun argument sequence too long.\n", stdout);
+ goto rerun_ret_INVALID_FORMAT;
+ }
+ }
+ fclose_null(&rerunfile);
+ line_byte_ct = 1 + (uintptr_t)(load_ptr - tbuf);
+ rerun_buf = (char*)malloc(line_byte_ct);
+ if (!rerun_buf) {
+ goto rerun_ret_NOMEM;
+ }
+ rerun_buf = (char*)malloc(1 + ((uintptr_t)(load_ptr - tbuf)));
+ memcpy(rerun_buf, tbuf, line_byte_ct);
+ rerun_start_ptr = skip_initial_spaces(rerun_buf);
+ }
+ sptr = rerun_start_ptr;
+
+ // now use tbuf as a lame bitfield
+ memset(tbuf, 1, loaded_arg_ct);
+ loaded_arg_idx = 0;
+ duplicate_ct = 0;
+ do {
+ if (no_more_tokens_kns(sptr)) {
+ print_ver();
+ fputs("Error: Line 2 of --rerun log file has fewer tokens than expected.\n", stdout);
+ goto rerun_ret_INVALID_FORMAT;
+ }
+ argptr = is_flag_start(sptr);
+ if (argptr) {
+ slen = strlen_se(argptr);
+ for (cmdline_arg_idx = cur_arg; cmdline_arg_idx < argc; cmdline_arg_idx++) {
+ argptr2 = is_flag_start(argv[cmdline_arg_idx]);
+ if (argptr2) {
+ slen2 = strlen(argptr2);
+ if ((slen == slen2) && (!memcmp(argptr, argptr2, slen))) {
+ cmdline_arg_idx = 0xffffffffU;
+ break;
+ }
+ }
+ }
+ if (cmdline_arg_idx == 0xffffffffU) {
+ // matching flag, override --rerun
+ do {
+ duplicate_ct++;
+ tbuf[loaded_arg_idx++] = 0;
+ if (loaded_arg_idx == loaded_arg_ct) {
+ break;
+ }
+ sptr = next_token(sptr);
+ } while (!is_flag(sptr));
+ } else {
+ loaded_arg_idx++;
+ sptr = next_token(sptr);
+ }
+ } else {
+ loaded_arg_idx++;
+ sptr = next_token(sptr);
+ }
+ } while (loaded_arg_idx < loaded_arg_ct);
+ subst_argv2 = (char**)malloc((argc + loaded_arg_ct - duplicate_ct - rerun_parameter_present - 1 - cur_arg) * sizeof(char*));
+ if (!subst_argv2) {
+ goto rerun_ret_NOMEM;
+ }
+ new_arg_idx = 0;
+ for (cmdline_arg_idx = cur_arg; cmdline_arg_idx < rerun_argv_pos; cmdline_arg_idx++) {
+ subst_argv2[new_arg_idx++] = argv[cmdline_arg_idx];
+ }
+ sptr = rerun_start_ptr;
+ for (loaded_arg_idx = 0; loaded_arg_idx < loaded_arg_ct; loaded_arg_idx++) {
+ if (tbuf[loaded_arg_idx]) {
+ slen = strlen_se(sptr);
+ subst_argv2[new_arg_idx++] = sptr;
+ sptr[slen] = '\0';
+ if (loaded_arg_idx != loaded_arg_ct - 1) {
+ sptr = skip_initial_spaces(&(sptr[slen + 1]));
+ }
+ } else {
+ sptr = next_token(sptr);
+ }
+ }
+ for (cmdline_arg_idx = rerun_argv_pos + rerun_parameter_present + 1; cmdline_arg_idx < argc; cmdline_arg_idx++) {
+ subst_argv2[new_arg_idx++] = argv[cmdline_arg_idx];
+ }
+ *cur_arg_ptr = 0;
+ *argc_ptr = new_arg_idx;
+ if (*subst_argv_ptr) {
+ free(*subst_argv_ptr);
+ }
+ *subst_argv_ptr = subst_argv2;
+ *argv_ptr = subst_argv2;
+ subst_argv2 = NULL;
+ while (0) {
+ rerun_ret_NOMEM:
+ print_ver();
+ retval = RET_NOMEM;
+ break;
+ rerun_ret_OPEN_FAIL:
+ print_ver();
+ retval = RET_OPEN_FAIL;
+ break;
+ rerun_ret_LONG_LINE:
+ print_ver();
+ printf("Error: Line %" PRIuPTR " of --rerun log file is pathologically long.\n", line_idx);
+ rerun_ret_INVALID_FORMAT:
+ retval = RET_INVALID_FORMAT;
+ break;
+ }
+ free_cond(subst_argv2);
+ fclose_cond(rerunfile);
+ return retval;
+}
+
char extract_char_param(char* ss) {
// maps c, 'c', and "c" to c, and anything else to the null char. This is
// intended to support e.g. always using '#' to designate a # parameter
@@ -3118,7 +3338,6 @@ int32_t main(int32_t argc, char** argv) {
#endif
unsigned char* wkspace_ua = NULL;
char* bubble = NULL;
- char** subst_argv2;
uint32_t param_ct;
time_t rawtime;
char* argptr;
@@ -3155,7 +3374,6 @@ int32_t main(int32_t argc, char** argv) {
uint32_t ujj;
uint32_t ukk;
uint32_t umm;
- uint32_t unn;
intptr_t default_alloc_mb;
int64_t llxx;
Ll_str* ll_str_ptr;
@@ -3289,119 +3507,10 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
}
- if (ujj) {
- scriptfile = fopen(argv[uii + 1], "r");
- } else {
- scriptfile = fopen(PROG_NAME_STR ".log", "r");
- }
- if (!scriptfile) {
- print_ver();
- goto main_ret_OPEN_FAIL;
- }
- tbuf[MAXLINELEN - 1] = ' ';
- if (!fgets(tbuf, MAXLINELEN, scriptfile)) {
- print_ver();
- fputs("Error: Empty log file for --rerun.\n", stdout);
- goto main_ret_INVALID_FORMAT;
- }
- if (!tbuf[MAXLINELEN - 1]) {
- print_ver();
- fputs("Error: Line 1 of --rerun log file is pathologically long.\n", stdout);
- goto main_ret_INVALID_FORMAT;
- }
- if (!fgets(tbuf, MAXLINELEN, scriptfile)) {
- print_ver();
- fputs("Error: Only one line in --rerun log file.\n", stdout);
- goto main_ret_INVALID_FORMAT;
- }
- if (!tbuf[MAXLINELEN - 1]) {
- print_ver();
- fputs("Error: Line 2 of --rerun log file is pathologically long.\n", stdout);
- goto main_ret_INVALID_FORMAT;
- }
- fclose_null(&scriptfile);
- if (scan_posint_capped(tbuf, (uint32_t*)(&kk), (MAXLINELEN / 2) / 10, (MAXLINELEN / 2) % 10)) {
- print_ver();
- fputs("Error: Invalid argument count on line 2 of --rerun log file.\n", stdout);
- goto main_ret_INVALID_FORMAT;
- }
- ukk = strlen(tbuf) + 1;
- rerun_buf = (char*)malloc(ukk);
- memcpy(rerun_buf, tbuf, ukk);
-
- memset(tbuf, 1, (uint32_t)kk);
- sptr = next_token_mult(rerun_buf, 2);
- umm = 0;
- ukk = 0;
- do {
- if (no_more_tokens_kns(sptr)) {
- print_ver();
- fputs("Error: Line 2 of --rerun log file has fewer tokens than expected.\n", stdout);
- goto main_ret_INVALID_FORMAT;
- }
- argptr = is_flag_start(sptr);
- if (argptr) {
- for (unn = cur_arg; unn < (uint32_t)argc; unn++) {
- argptr2 = is_flag_start(argv[unn]);
- if (argptr2) {
- if (!strcmp(argptr, argptr2)) {
- unn = 0xffffffffU;
- break;
- }
- }
- }
- if (unn == 0xffffffffU) {
- // matching flag, override --rerun
- do {
- ukk++;
- tbuf[umm++] = 0;
- if (umm == (uint32_t)kk) {
- break;
- }
- sptr = next_token(sptr);
- } while (!is_flag(sptr));
- } else {
- umm++;
- sptr = next_token(sptr);
- }
- } else {
- umm++;
- sptr = next_token(sptr);
- }
- } while (umm < (uint32_t)kk);
- subst_argv2 = (char**)malloc((argc + kk - ukk - ujj - 1 - cur_arg) * sizeof(char*));
- if (!subst_argv2) {
- print_ver();
- goto main_ret_NOMEM;
- }
- unn = 0;
- for (umm = cur_arg; umm < uii; umm++) {
- subst_argv2[unn++] = argv[umm];
- }
- sptr = next_token_mult(rerun_buf, 2);
- for (umm = 0; umm < (uint32_t)kk; umm++) {
- if (tbuf[umm]) {
- ukk = strlen_se(sptr);
- subst_argv2[unn++] = sptr;
- sptr[ukk] = '\0';
- if (umm != ((uint32_t)kk) - 1) {
- sptr = skip_initial_spaces(&(sptr[ukk + 1]));
- }
- } else {
- sptr = next_token(sptr);
- }
- }
- for (umm = uii + ujj + 1; umm < (uint32_t)argc; umm++) {
- subst_argv2[unn++] = argv[umm];
- }
- cur_arg = 0;
- argc = unn;
- if (subst_argv) {
- free(subst_argv);
+ retval = rerun(uii, ujj, &argc, &cur_arg, &argv, &subst_argv, &rerun_buf);
+ if (retval) {
+ goto main_ret_1;
}
- subst_argv = subst_argv2;
- argv = subst_argv2;
- subst_argv2 = NULL;
}
}
if ((cur_arg < (uint32_t)argc) && (!is_flag(argv[cur_arg]))) {
@@ -3792,6 +3901,7 @@ int32_t main(int32_t argc, char** argv) {
outname[uii] = '\0';
logstr(ver_str);
+ /*
sprintf(logbuf, "\n%d argument%s:", argc + umm - cur_arg, (argc + umm - cur_arg == 1)? "" : "s");
logstr(logbuf);
for (cur_flag = 0; cur_flag < flag_ct; cur_flag++) {
@@ -3803,6 +3913,21 @@ int32_t main(int32_t argc, char** argv) {
logstr(argv[ii++]);
}
}
+ */
+ logstr("\n");
+ logprint("Options in effect:\n");
+ for (cur_flag = 0; cur_flag < flag_ct; cur_flag++) {
+ logprint(" --");
+ logprint(&(flag_buf[cur_flag * MAX_FLAG_LEN]));
+ ii = flag_map[cur_flag] + 1;
+ while ((ii < argc) && (!is_flag(argv[ii]))) {
+ logprint(" ");
+ logprint(argv[ii++]);
+ }
+ logprint("\n");
+ }
+ logprint("\n");
+
#ifdef _WIN32
windows_dw = TBUF_SIZE;
if (GetComputerName(tbuf, &windows_dw))
@@ -3810,7 +3935,7 @@ int32_t main(int32_t argc, char** argv) {
if (gethostname(tbuf, TBUF_SIZE) != -1)
#endif
{
- logstr("\nHostname: ");
+ logstr("Hostname: ");
logstr(tbuf);
}
logstr("\nWorking directory: ");
@@ -6065,7 +6190,7 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --dosage does not support --condition/--condition-list.\n");
goto main_ret_INVALID_CMDLINE_A;
}
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 12)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 13)) {
goto main_ret_INVALID_CMDLINE_2A;
}
retval = alloc_fname(&dosage_info.fname, argv[cur_arg + 1], argptr, 0);
@@ -6088,6 +6213,12 @@ int32_t main(int32_t argc, char** argv) {
dosage_info.modifier |= DOSAGE_OCCUR;
} else if (!strcmp(argv[cur_arg + uii], "sex")) {
dosage_info.modifier |= DOSAGE_SEX;
+ } else if (!strcmp(argv[cur_arg + uii], "case-control-freqs")) {
+ dosage_info.modifier |= DOSAGE_FREQ_CC;
+ } else if (!strcmp(argv[cur_arg + uii], "frq2")) {
+ // turn this into an error before official 1.90 release
+ logprint("Warning: The --dosage 'frq2' modifier has been renamed to 'case-control-freqs'.\n");
+ dosage_info.modifier |= DOSAGE_FREQ_CC;
} else if (strlen(argv[cur_arg + uii]) <= 6) {
goto main_dosage_invalid_param;
} else if (!strcmp(argv[cur_arg + uii], "sepheader")) {
@@ -6426,7 +6557,17 @@ int32_t main(int32_t argc, char** argv) {
}
for (uii = 1; uii <= param_ct; uii++) {
if (!strcmp(argv[cur_arg + uii], "counts")) {
+ if (misc_flags & MISC_FREQ_CC) {
+ logprint("Error: --freq 'counts' and 'case-control' modifiers cannot be used together.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
misc_flags |= MISC_FREQ_COUNTS;
+ } else if (!strcmp(argv[cur_arg + uii], "case-control")) {
+ if (misc_flags & MISC_FREQ_COUNTS) {
+ logprint("Error: --freq 'counts' and 'case-control' modifiers cannot be used together.\n");
+ goto main_ret_INVALID_CMDLINE;
+ }
+ misc_flags |= MISC_FREQ_CC;
} else if (!strcmp(argv[cur_arg + uii], "gz")) {
misc_flags |= MISC_FREQ_GZ;
} else {
@@ -8764,13 +8905,15 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --mds-plot must be used with --cluster.\n");
goto main_ret_INVALID_CMDLINE_A;
}
- if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 3)) {
+ if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 4)) {
goto main_ret_INVALID_CMDLINE_2A;
}
cluster.mds_dim_ct = 0;
for (uii = 1; uii <= param_ct; uii++) {
if (!strcmp(argv[cur_arg + uii], "by-cluster")) {
cluster.modifier |= CLUSTER_MDS;
+ } else if (!strcmp(argv[cur_arg + uii], "eigendecomp")) {
+ cluster.modifier |= CLUSTER_MDS_EIGENDECOMP;
} else if (!strcmp(argv[cur_arg + uii], "eigvals")) {
cluster.modifier |= CLUSTER_MDS_EIGVALS;
} else {
@@ -9532,7 +9675,7 @@ int32_t main(int32_t argc, char** argv) {
} else if ((rel_info.modifier & REL_CALC_SHAPEMASK) == REL_CALC_SQ) {
logprint("Error: --parallel cannot be used with '--make-rel square'. Use '--make-rel\nsquare0' or plain '--make-rel' instead.\n");
goto main_ret_INVALID_CMDLINE_A;
- } else if ((rel_info.modifier & (REL_CALC_BIN | REL_CALC_BIN4)) && (!(rel_info.modifier & REL_CALC_SHAPEMASK))) {
+ } else if ((rel_info.modifier & (REL_CALC_BIN | REL_CALC_BIN4)) && (!(rel_info.modifier & (REL_CALC_SHAPEMASK | REL_CALC_GRM_BIN)))) {
logprint("Error: --parallel cannot be used with plain '--make-rel bin{4}'. Use e.g.\n'--make-rel bin square0' or '--make-rel bin triangle' instead.\n");
goto main_ret_INVALID_CMDLINE_A;
} else if (calculation_type & CALC_PLINK1_DISTANCE_MATRIX) {
@@ -10318,6 +10461,8 @@ int32_t main(int32_t argc, char** argv) {
recode_modifier |= RECODE_DELIMX;
} else if (!strcmp(argv[cur_arg + uii], "bgz")) {
recode_modifier |= RECODE_BGZ;
+ } else if (!strcmp(argv[cur_arg + uii], "gen-gz")) {
+ recode_modifier |= RECODE_GEN_GZ;
} else if (!strcmp(argv[cur_arg + uii], "beagle")) {
if (recode_type_set(&recode_modifier, RECODE_BEAGLE)) {
goto main_ret_INVALID_CMDLINE_A;
@@ -10416,6 +10561,10 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --recode 'bgz' modifier must be used with VCF output.\n");
goto main_ret_INVALID_CMDLINE_A;
}
+ if ((recode_modifier & RECODE_GEN_GZ) && (!(recode_modifier & RECODE_OXFORD))) {
+ logprint("Error: --recode 'gen-gz' modifier must be used with Oxford-format output.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
calculation_type |= CALC_RECODE;
} else if (!memcmp(argptr2, "ecode-whap", 11)) {
logprint("Error: --recode-whap flag retired since WHAP is no longer supported.\n");
@@ -11400,6 +11549,10 @@ int32_t main(int32_t argc, char** argv) {
if (load_params || (load_rare & (~(LOAD_RARE_TRANSPOSE | LOAD_RARE_TFAM)))) {
goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
}
+ if (!(load_rare & (LOAD_RARE_TRANSPOSE | LOAD_RARE_TFAM))) {
+ logprint("Error: --tped must be used with --tfam or --tfile.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
goto main_ret_INVALID_CMDLINE_2A;
}
@@ -12902,6 +13055,10 @@ int32_t main(int32_t argc, char** argv) {
logprint("Error: --gen/--bgen cannot be used without --data or --sample.\n");
goto main_ret_INVALID_CMDLINE_A;
}
+ if ((load_rare & LOAD_RARE_TFAM) && (!(load_rare & (LOAD_RARE_TRANSPOSE | LOAD_RARE_TPED)))) {
+ logprint("Error: --tfam must be used with --tfile or --tped.\n");
+ goto main_ret_INVALID_CMDLINE_A;
+ }
if ((merge_type & MERGE_EQUAL_POS) && (!(calculation_type & CALC_MERGE))) {
logprint("Error: --merge-equal-pos must be used with --merge/--bmerge/--merge-list.\n(Note that you are permitted to merge a fileset with itself.)\n");
goto main_ret_INVALID_CMDLINE_A;
@@ -13148,9 +13305,6 @@ int32_t main(int32_t argc, char** argv) {
main_ret_READ_FAIL:
retval = RET_READ_FAIL;
break;
- main_ret_INVALID_FORMAT:
- retval = RET_INVALID_FORMAT;
- break;
main_ret_INVALID_CMDLINE_UNRECOGNIZED:
invalid_arg(argv[cur_arg]);
logprintb();
diff --git a/plink_assoc.c b/plink_assoc.c
index dded1e3..d766b87 100644
--- a/plink_assoc.c
+++ b/plink_assoc.c
@@ -7306,9 +7306,9 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
vec_collapse_init(sex_male, unfiltered_sample_ct, pheno_nm, pheno_nm_ct, sample_male_case_include2);
bitfield_and(sample_male_case_include2, sample_case_include2, pheno_nm_ctv2);
case_male_ct = popcount01_longs(sample_male_case_include2, pheno_nm_ctv2);
- vec_init_andnot(pheno_nm_ctv2, sample_male_ctrl_include2, sample_male_include2, sample_male_case_include2);
- vec_init_andnot(pheno_nm_ctv2, sample_nonmale_case_include2, sample_case_include2, sample_male_case_include2);
- vec_init_andnot(pheno_nm_ctv2, sample_nonmale_ctrl_include2, sample_ctrl_include2, sample_male_ctrl_include2);
+ bitfield_andnot_copy(pheno_nm_ctv2, sample_male_ctrl_include2, sample_male_include2, sample_male_case_include2);
+ bitfield_andnot_copy(pheno_nm_ctv2, sample_nonmale_case_include2, sample_case_include2, sample_male_case_include2);
+ bitfield_andnot_copy(pheno_nm_ctv2, sample_nonmale_ctrl_include2, sample_ctrl_include2, sample_male_ctrl_include2);
ctrl_male_ct = male_ct - case_male_ct;
case_nonmale_ct = case_ct - case_male_ct;
ctrl_nonmale_ct = ctrl_ct - ctrl_male_ct;
@@ -11985,8 +11985,7 @@ int32_t cluster_assoc_init(const char* flag_name, uintptr_t unfiltered_sample_ct
}
cluster_ct2++;
}
- memcpy(pheno_nm_nonmale_11, pheno_nm_11, unfiltered_sample_ctl2 * sizeof(intptr_t));
- bitfield_andnot(pheno_nm_nonmale_11, pheno_nm_male_11, unfiltered_sample_ctl2);
+ bitfield_andnot_copy(unfiltered_sample_ctl2, pheno_nm_nonmale_11, pheno_nm_11, pheno_nm_male_11);
wkspace_shrink_top(cluster_pheno_gtots, cluster_ct2 * 4 * sizeof(int32_t));
if (wkspace_alloc_ui_checked(cur_cluster_pheno_gtots_ptr, cluster_ct2 * 2 * sizeof(int32_t)) ||
wkspace_alloc_ui_checked(cluster_geno_cts_ptr, cluster_ct2 * 4 * sizeof(int32_t)) ||
diff --git a/plink_calc.c b/plink_calc.c
index 547c404..174f005 100644
--- a/plink_calc.c
+++ b/plink_calc.c
@@ -7642,8 +7642,6 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
wptr = memcpyl3a(tbuf, "CHR");
*wptr++ = delimiter;
wptr = memcpyl3a(wptr, "VAR");
- *wptr++ = delimiter;
- wptr = memcpya(wptr, "A1", 2);
for (pc_idx = 1; pc_idx <= pc_ct; pc_idx++) {
*wptr++ = delimiter;
wptr = memcpya(wptr, "PC", 2);
@@ -9666,7 +9664,11 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
cluster_dist_multiply(sample_ct, cluster_ct, cluster_starts, mds_plot_dmatrix_copy);
}
}
- retval = mds_plot(outname, outname_end, sample_exclude, sample_ct, sample_idx_to_uidx, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, cur_cluster_ct, merge_ct, sample_to_cluster, cur_cluster_remap, cp->mds_dim_ct, is_mds_cluster, cp->modifier & CLUSTER_MDS_EIGVALS, mds_plot_dmatrix_copy);
+ if (cp->modifier & CLUSTER_MDS_EIGENDECOMP) {
+ retval = mds_plot_eigendecomp(outname, outname_end, sample_exclude, sample_ct, sample_idx_to_uidx, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, cur_cluster_ct, merge_ct, sample_to_cluster, cur_cluster_remap, cp->mds_dim_ct, is_mds_cluster, cp->modifier & CLUSTER_MDS_EIGVALS, mds_plot_dmatrix_copy);
+ } else {
+ retval = mds_plot(outname, outname_end, sample_exclude, sample_ct, sample_idx_to_uidx, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, cur_cluster_ct, merge_ct, sample_to_cluster, cur_cluster_remap, cp->mds_dim_ct, is_mds_cluster, cp->modifier & CLUSTER_MDS_EIGVALS, mds_plot_dmatrix_copy);
+ }
if (retval) {
goto calc_cluster_neighbor_ret_1;
}
diff --git a/plink_cluster.c b/plink_cluster.c
index 305e817..5a036c2 100644
--- a/plink_cluster.c
+++ b/plink_cluster.c
@@ -2924,6 +2924,307 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
double grand_mean = 0.0;
uintptr_t ulii = 0;
int32_t retval = 0;
+ char jobz = 'A';
+ __CLPK_integer info = 0;
+ __CLPK_integer lwork = -1;
+ __CLPK_integer mdim;
+ __CLPK_integer* iwork;
+ double* work;
+ double optim_lwork;
+ double* main_matrix;
+ double* column_means;
+ double* sqrt_eigvals;
+ double* out_u;
+ double* out_v;
+ uint32_t* final_cluster_remap;
+ uint32_t* final_cluster_sizes;
+ double* dptr;
+ double* dptr2;
+ char* wptr;
+ char* wptr2;
+ uintptr_t sample_idx;
+ uintptr_t clidx1;
+ uintptr_t clidx2;
+ uint32_t dim_idx;
+ uint32_t uii;
+ uint32_t ujj;
+ double dxx;
+ double dyy;
+ final_cluster_remap = (uint32_t*)malloc(cur_cluster_ct * sizeof(int32_t));
+ if (!final_cluster_remap) {
+ goto mds_plot_ret_NOMEM;
+ }
+ if ((sample_ct > 5000) && (!is_mds_cluster) && (final_cluster_ct < sample_ct) && (final_cluster_ct > 1)) {
+ LOGPRINTF("Warning: Per-sample --mds-plot can be very slow with over 5000 %s.\nConsider using the 'by-cluster' modifier.\n", g_species_plural);
+ }
+ for (clidx1 = 0; clidx1 < cur_cluster_ct; clidx1++) {
+ clidx2 = cur_cluster_remap[clidx1];
+ if (clidx2 == clidx1) {
+ final_cluster_remap[clidx1] = ulii++;
+ } else {
+ final_cluster_remap[clidx1] = final_cluster_remap[clidx2];
+ }
+ }
+ if (is_mds_cluster) {
+ if (wkspace_alloc_d_checked(&main_matrix, final_cluster_ct * final_cluster_ct * sizeof(double)) ||
+ wkspace_alloc_ui_checked(&final_cluster_sizes, final_cluster_ct * sizeof(int32_t))) {
+ goto mds_plot_ret_NOMEM;
+ }
+ fill_double_zero(main_matrix, final_cluster_ct * final_cluster_ct);
+ fill_uint_zero(final_cluster_sizes, final_cluster_ct);
+ dptr = dists;
+ final_cluster_sizes[final_cluster_remap[0]] = 1;
+ for (uii = 1; uii < cur_cluster_ct; uii++) {
+ clidx1 = final_cluster_remap[uii];
+ final_cluster_sizes[clidx1] += 1;
+ dptr2 = &(main_matrix[clidx1 * final_cluster_ct]);
+ for (ujj = 0; ujj < uii; ujj++) {
+ clidx2 = final_cluster_remap[ujj];
+ if (clidx2 < clidx1) {
+ dptr2[clidx2] += (*dptr);
+ } else if (clidx1 > clidx2) {
+ main_matrix[clidx2 * final_cluster_ct + clidx1] += (*dptr);
+ }
+ dptr++;
+ }
+ }
+ for (clidx1 = 1; clidx1 < final_cluster_ct; clidx1++) {
+ dptr = &(main_matrix[clidx1 * final_cluster_ct]);
+ ulii = final_cluster_sizes[clidx1];
+ for (clidx2 = 0; clidx2 < clidx1; clidx2++) {
+ dxx = (double)((intptr_t)(ulii * final_cluster_sizes[clidx2]));
+ dptr[clidx2] /= dxx;
+ }
+ }
+ ulii = final_cluster_ct;
+ } else {
+ wkspace_reset(dists);
+ if (wkspace_alloc_d_checked(&main_matrix, sample_ct * sample_ct * sizeof(double))) {
+ goto mds_plot_ret_NOMEM;
+ }
+ // expand triangular diagonal-free matrix to bottom-left of square matrix
+ ulii = ((sample_ct - 1) * (sample_ct - 2)) >> 1;
+ for (sample_idx = sample_ct - 1; sample_idx;) {
+ memcpy(&(main_matrix[sample_idx * sample_ct]), &(main_matrix[ulii]), sample_idx * sizeof(double));
+ ulii -= (--sample_idx);
+ }
+ ulii = sample_ct + 1;
+ for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
+ main_matrix[sample_idx * ulii] = 0.0;
+ }
+ ulii = sample_ct;
+ }
+ if (wkspace_alloc_d_checked(&column_means, ulii * sizeof(double))) {
+ goto mds_plot_ret_NOMEM;
+ }
+ fill_double_zero(column_means, ulii);
+ // bottom left filled with IBS values. Now subtract them from 1 and square
+ // them, and extract column means...
+ for (clidx1 = 0; clidx1 < ulii; clidx1++) {
+ dptr = &(main_matrix[clidx1 * ulii]);
+ dptr2 = column_means;
+ dyy = 0.0;
+ for (clidx2 = 0; clidx2 < clidx1; clidx2++) {
+ dxx = 1.0 - (*dptr);
+ dxx *= dxx;
+ *dptr++ = dxx;
+ *dptr2 += dxx;
+ dptr2++;
+ dyy += dxx;
+ }
+ *dptr2 += dyy;
+ }
+
+ dxx = 1.0 / ((double)((intptr_t)ulii));
+ grand_mean = 0.0;
+ for (clidx1 = 0; clidx1 < ulii; clidx1++) {
+ column_means[clidx1] *= dxx;
+ grand_mean += column_means[clidx1];
+ }
+ grand_mean *= dxx;
+ // ...then double-center and multiply by -0.5
+ for (clidx1 = 0; clidx1 < ulii; clidx1++) {
+ dxx = column_means[clidx1];
+ dptr = &(main_matrix[clidx1 * ulii]);
+ dptr2 = column_means;
+ for (clidx2 = 0; clidx2 <= clidx1; clidx2++) {
+ *dptr = -0.5 * ((*dptr) - dxx - (*dptr2++) + grand_mean);
+ dptr++;
+ }
+ }
+ // finally, copy over top right since dgesdd does not exploit symmetry
+ for (clidx1 = 0; clidx1 < ulii; clidx1++) {
+ for (clidx2 = clidx1 + 1; clidx2 < ulii; clidx2++) {
+ main_matrix[clidx1 * ulii + clidx2] = main_matrix[clidx2 * ulii + clidx1];
+ }
+ }
+
+ if (dim_ct > ulii) {
+ dim_ct = ulii;
+ }
+
+ LOGPRINTF("Performing multidimensional scaling analysis (SVD algorithm, %u\ndimension%s)...", dim_ct, (dim_ct == 1)? "" : "s");
+ fflush(stdout);
+
+ mdim = ulii;
+ if (wkspace_alloc_d_checked(&sqrt_eigvals, ulii * sizeof(double)) ||
+ wkspace_alloc_d_checked(&out_u, ulii * ulii * sizeof(double)) ||
+ wkspace_alloc_d_checked(&out_v, ulii * ulii * sizeof(double))) {
+ goto mds_plot_ret_NOMEM;
+ }
+ // fill_double_zero(sqrt_eigvals, ulii);
+ // fill_double_zero(out_u, ulii * ulii);
+ // fill_double_zero(out_v, ulii * ulii);
+
+ iwork = (__CLPK_integer*)wkspace_alloc(8 * ulii * sizeof(__CLPK_integer));
+ if (!iwork) {
+ goto mds_plot_ret_NOMEM;
+ }
+ // fill_int_zero(iwork, 8 * mdim);
+
+ // workspace query
+ dgesdd_(&jobz, &mdim, &mdim, main_matrix, &mdim, sqrt_eigvals, out_u, &mdim, out_v, &mdim, &optim_lwork, &lwork, iwork, &info);
+ lwork = (int32_t)optim_lwork;
+ if (wkspace_alloc_d_checked(&work, lwork * sizeof(double))) {
+ goto mds_plot_ret_NOMEM;
+ }
+ // fill_double_zero(work, lwork);
+ dgesdd_(&jobz, &mdim, &mdim, main_matrix, &mdim, sqrt_eigvals, out_u, &mdim, out_v, &mdim, work, &lwork, iwork, &info);
+
+ // * sqrt_eigvals[0..(ulii-1)] contains singular values
+ // * out_u[(ii*ulii)..(ii*ulii + ulii - 1)] are eigenvectors corresponding to
+ // sqrt_eigvals[ii]. (out_v is a transposed version, signs may differ
+ // too.)
+
+ for (clidx1 = 0; clidx1 < ulii; clidx1++) {
+ if (sqrt_eigvals[clidx1] >= 0.0) {
+ sqrt_eigvals[clidx1] = sqrt(sqrt_eigvals[clidx1]);
+ } else {
+ // is this possible?
+ sqrt_eigvals[clidx1] = 0.0;
+ }
+ }
+
+ // repurpose main_matrix as mds[]
+ dptr = main_matrix;
+ for (clidx1 = 0; clidx1 < ulii; clidx1++) {
+ for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
+ *dptr++ = out_u[dim_idx * ulii + clidx1] * sqrt_eigvals[dim_idx];
+ }
+ }
+ logprint(" done.\n");
+
+ memcpy(outname_end, ".mds", 5);
+ if (fopen_checked(&outfile, outname, "w")) {
+ goto mds_plot_ret_OPEN_FAIL;
+ }
+ sprintf(tbuf, "%%%us %%%us SOL ", plink_maxfid, plink_maxiid);
+ fprintf(outfile, tbuf, "FID", "IID");
+ tbuf[22] = ' ';
+ for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
+ wptr = uint32_write(tbuf, dim_idx + 1);
+ uii = wptr - tbuf;
+ wptr2 = memseta(&(tbuf[10]), 32, 11 - uii);
+ *wptr2++ = 'C';
+ memcpy(wptr2, tbuf, uii);
+ fwrite(&(tbuf[10]), 1, 13, outfile);
+ }
+ if (putc_checked('\n', outfile)) {
+ goto mds_plot_ret_WRITE_FAIL;
+ }
+ for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
+ wptr2 = &(sample_ids[sample_idx_to_uidx[sample_idx] * max_sample_id_len]);
+ uii = strlen_se(wptr2);
+ wptr = fw_strcpyn(plink_maxfid, uii, wptr2, tbuf);
+ *wptr++ = ' ';
+ wptr = fw_strcpy(plink_maxiid, &(wptr2[uii + 1]), wptr);
+ *wptr++ = ' ';
+ if (orig_sample_to_cluster) {
+ uii = orig_sample_to_cluster[sample_idx];
+ } else {
+ uii = sample_idx;
+ }
+ uii = final_cluster_remap[uii];
+ wptr = uint32_writew6x(wptr, uii, ' ');
+ if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+ goto mds_plot_ret_WRITE_FAIL;
+ }
+ if (!is_mds_cluster) {
+ dptr = &(main_matrix[sample_idx * dim_ct]);
+ for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
+ wptr = double_g_writex(&(tbuf[11]), *(dptr++), ' ');
+ uii = wptr - (&(tbuf[11]));
+ if (uii < 13) {
+ wptr2 = &(wptr[-13]);
+ memset(wptr2, 32, 13 - uii);
+ } else {
+ wptr2 = &(tbuf[11]);
+ }
+ fwrite(wptr2, 1, wptr - wptr2, outfile);
+ }
+ } else {
+ dptr = &(main_matrix[uii * dim_ct]);
+ for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
+ wptr = double_g_writex(&(tbuf[11]), *(dptr++), ' ');
+ uii = wptr - (&(tbuf[11]));
+ if (uii < 13) {
+ wptr2 = &(wptr[-13]);
+ memset(wptr2, 32, 13 - uii);
+ } else {
+ wptr2 = &(tbuf[11]);
+ }
+ fwrite(wptr2, 1, wptr - wptr2, outfile);
+ }
+ }
+ if (putc_checked('\n', outfile)) {
+ goto mds_plot_ret_WRITE_FAIL;
+ }
+ }
+ if (fclose_null(&outfile)) {
+ goto mds_plot_ret_WRITE_FAIL;
+ }
+ if (!dump_eigvals) {
+ LOGPREPRINTFWW("MDS solution written to %s .\n", outname);
+ } else {
+ LOGPREPRINTFWW("MDS solution written to %s (eigenvalues in %s.eigvals ).\n", outname, outname);
+ memcpy(&(outname_end[4]), ".eigvals", 9);
+ if (fopen_checked(&outfile, outname, "w")) {
+ goto mds_plot_ret_OPEN_FAIL;
+ }
+ for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
+ wptr = double_g_writex(tbuf, sqrt_eigvals[dim_idx] * sqrt_eigvals[dim_idx], '\n');
+ *wptr = '\0';
+ fputs(tbuf, outfile);
+ }
+ if (fclose_null(&outfile)) {
+ goto mds_plot_ret_WRITE_FAIL;
+ }
+ }
+ logprintb();
+ while (0) {
+ mds_plot_ret_NOMEM:
+ retval = RET_NOMEM;
+ break;
+ mds_plot_ret_OPEN_FAIL:
+ retval = RET_OPEN_FAIL;
+ break;
+ mds_plot_ret_WRITE_FAIL:
+ retval = RET_WRITE_FAIL;
+ break;
+ }
+ fclose_cond(outfile);
+ free_cond(final_cluster_remap);
+ wkspace_reset(dists);
+ return retval;
+}
+
+// probably want to factor out common initialization with mds_plot, etc.
+int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample_exclude, uintptr_t sample_ct, uint32_t* sample_idx_to_uidx, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uint32_t cur_cluster_ct, uint32_t merge_ct, uint32_t* orig_sample_to_cluster, uint32_t* cur_cluster_remap, uint32_t dim_ct, uint32_t is_mds_cluster, uint32_t dump_eigvals, double* dists) {
+ FILE* outfile = NULL;
+ uintptr_t final_cluster_ct = cur_cluster_ct - merge_ct;
+ double grand_mean = 0.0;
+ uintptr_t ulii = 0;
+ int32_t retval = 0;
char jobz = 'V';
char range = 'I';
char uplo = 'U';
@@ -2963,7 +3264,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
double* sqrt_eigvals;
final_cluster_remap = (uint32_t*)malloc(cur_cluster_ct * sizeof(int32_t));
if (!final_cluster_remap) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
if ((sample_ct > 5000) && (!is_mds_cluster) && (final_cluster_ct < sample_ct) && (final_cluster_ct > 1)) {
LOGPRINTF("Warning: Per-sample --mds-plot can be very slow with over 5000 %s.\nConsider using the 'by-cluster' modifier.\n", g_species_plural);
@@ -2979,7 +3280,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
if (is_mds_cluster) {
if (wkspace_alloc_d_checked(&main_matrix, final_cluster_ct * final_cluster_ct * sizeof(double)) ||
wkspace_alloc_ui_checked(&final_cluster_sizes, final_cluster_ct * sizeof(int32_t))) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
fill_double_zero(main_matrix, final_cluster_ct * final_cluster_ct);
fill_uint_zero(final_cluster_sizes, final_cluster_ct);
@@ -3011,7 +3312,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
} else {
wkspace_reset(dists);
if (wkspace_alloc_d_checked(&main_matrix, sample_ct * sample_ct * sizeof(double))) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
// expand triangular diagonal-free matrix to square matrix
ulii = ((sample_ct - 1) * (sample_ct - 2)) >> 1;
@@ -3026,7 +3327,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
ulii = sample_ct;
}
if (wkspace_alloc_d_checked(&column_means, ulii * sizeof(double))) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
fill_double_zero(column_means, ulii);
// bottom left filled with IBS values. Now subtract them from 1 and square
@@ -3057,7 +3358,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
dxx = column_means[clidx1];
dptr = &(main_matrix[clidx1 * ulii]);
dptr2 = column_means;
- for (clidx2 = 0; clidx2 < clidx1; clidx2++) {
+ for (clidx2 = 0; clidx2 <= clidx1; clidx2++) {
*dptr = -0.5 * ((*dptr) - dxx - (*dptr2++) + grand_mean);
dptr++;
}
@@ -3067,7 +3368,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
dim_ct = ulii;
}
- LOGPRINTF("Performing multidimensional scaling analysis (%u dimension%s)...", dim_ct, (dim_ct == 1)? "" : "s");
+ LOGPRINTF("Performing multidimensional scaling analysis (eigendecomposition algorithm,\n%u dimension%s)...", dim_ct, (dim_ct == 1)? "" : "s");
fflush(stdout);
// no need to fill upper right
@@ -3079,13 +3380,13 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
i1 = i2 + 1 - dim_ct;
if (wkspace_alloc_d_checked(&out_w, dim_ct * sizeof(double)) ||
wkspace_alloc_d_checked(&out_z, dim_ct * ulii * sizeof(double))) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
fill_double_zero(out_w, dim_ct);
fill_double_zero(out_z, dim_ct * ulii);
isuppz = (__CLPK_integer*)wkspace_alloc(2 * dim_ct * sizeof(__CLPK_integer));
if (!isuppz) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
fill_int_zero((int32_t*)isuppz, 2 * dim_ct * (sizeof(__CLPK_integer) / sizeof(int32_t)));
ldz = mdim;
@@ -3093,12 +3394,12 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
dsyevr_(&jobz, &range, &uplo, &mdim, main_matrix, &mdim, &nz, &nz, &i1, &i2, &zz, &out_m, out_w, out_z, &ldz, isuppz, &optim_lwork, &lwork, &optim_liwork, &liwork, &info);
lwork = (int32_t)optim_lwork;
if (wkspace_alloc_d_checked(&work, lwork * sizeof(double))) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
liwork = optim_liwork;
iwork = (__CLPK_integer*)wkspace_alloc(liwork * sizeof(__CLPK_integer));
if (!iwork) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
fill_double_zero(work, lwork);
fill_int_zero((int32_t*)iwork, liwork * (sizeof(__CLPK_integer) / sizeof(int32_t)));
@@ -3109,7 +3410,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
// out_w[ii]
wkspace_reset(isuppz);
if (wkspace_alloc_d_checked(&sqrt_eigvals, dim_ct * sizeof(double))) {
- goto mds_plot_ret_NOMEM;
+ goto mds_plot_eigendecomp_ret_NOMEM;
}
for (dim_idx = 0; dim_idx < dim_ct; dim_idx++) {
dxx = out_w[dim_idx];
@@ -3130,7 +3431,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
memcpy(outname_end, ".mds", 5);
if (fopen_checked(&outfile, outname, "w")) {
- goto mds_plot_ret_OPEN_FAIL;
+ goto mds_plot_eigendecomp_ret_OPEN_FAIL;
}
sprintf(tbuf, "%%%us %%%us SOL ", plink_maxfid, plink_maxiid);
fprintf(outfile, tbuf, "FID", "IID");
@@ -3144,7 +3445,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
fwrite(&(tbuf[10]), 1, 13, outfile);
}
if (putc_checked('\n', outfile)) {
- goto mds_plot_ret_WRITE_FAIL;
+ goto mds_plot_eigendecomp_ret_WRITE_FAIL;
}
for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
wptr2 = &(sample_ids[sample_idx_to_uidx[sample_idx] * max_sample_id_len]);
@@ -3161,7 +3462,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
uii = final_cluster_remap[uii];
wptr = uint32_writew6x(wptr, uii, ' ');
if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
- goto mds_plot_ret_WRITE_FAIL;
+ goto mds_plot_eigendecomp_ret_WRITE_FAIL;
}
if (!is_mds_cluster) {
dptr = &(main_matrix[(sample_idx + 1) * dim_ct]);
@@ -3191,11 +3492,11 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
}
}
if (putc_checked('\n', outfile)) {
- goto mds_plot_ret_WRITE_FAIL;
+ goto mds_plot_eigendecomp_ret_WRITE_FAIL;
}
}
if (fclose_null(&outfile)) {
- goto mds_plot_ret_WRITE_FAIL;
+ goto mds_plot_eigendecomp_ret_WRITE_FAIL;
}
if (!dump_eigvals) {
LOGPREPRINTFWW("MDS solution written to %s .\n", outname);
@@ -3203,7 +3504,7 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
LOGPREPRINTFWW("MDS solution written to %s (eigenvalues in %s.eigvals ).\n", outname, outname);
memcpy(&(outname_end[4]), ".eigvals", 9);
if (fopen_checked(&outfile, outname, "w")) {
- goto mds_plot_ret_OPEN_FAIL;
+ goto mds_plot_eigendecomp_ret_OPEN_FAIL;
}
for (dim_idx = dim_ct; dim_idx; dim_idx--) {
wptr = double_g_writex(tbuf, sqrt_eigvals[dim_idx - 1] * sqrt_eigvals[dim_idx - 1], '\n');
@@ -3211,18 +3512,18 @@ int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, ui
fputs(tbuf, outfile);
}
if (fclose_null(&outfile)) {
- goto mds_plot_ret_WRITE_FAIL;
+ goto mds_plot_eigendecomp_ret_WRITE_FAIL;
}
}
logprintb();
while (0) {
- mds_plot_ret_NOMEM:
+ mds_plot_eigendecomp_ret_NOMEM:
retval = RET_NOMEM;
break;
- mds_plot_ret_OPEN_FAIL:
+ mds_plot_eigendecomp_ret_OPEN_FAIL:
retval = RET_OPEN_FAIL;
break;
- mds_plot_ret_WRITE_FAIL:
+ mds_plot_eigendecomp_ret_WRITE_FAIL:
retval = RET_WRITE_FAIL;
break;
}
diff --git a/plink_cluster.h b/plink_cluster.h
index b58fd27..6046e6f 100644
--- a/plink_cluster.h
+++ b/plink_cluster.h
@@ -7,14 +7,15 @@
#define CLUSTER_ONLY2 8
#define CLUSTER_OLD_TIEBREAKS 0x10
#define CLUSTER_MDS 0x20
-#define CLUSTER_MDS_EIGVALS 0x40
-#define CLUSTER_CMH_BD 0x80
-#define CLUSTER_CMH_PERM 0x100
-#define CLUSTER_CMH_PERM_BD 0x200
-#define CLUSTER_CMH_MPERM 0x400
-#define CLUSTER_CMH_PERM_COUNT 0x800
-#define CLUSTER_CMH_SET_TEST 0x1000
-#define CLUSTER_CMH2 0x2000
+#define CLUSTER_MDS_EIGENDECOMP 0x40
+#define CLUSTER_MDS_EIGVALS 0x80
+#define CLUSTER_CMH_BD 0x100
+#define CLUSTER_CMH_PERM 0x200
+#define CLUSTER_CMH_PERM_BD 0x400
+#define CLUSTER_CMH_MPERM 0x800
+#define CLUSTER_CMH_PERM_COUNT 0x1000
+#define CLUSTER_CMH_SET_TEST 0x2000
+#define CLUSTER_CMH2 0x4000
typedef struct {
char* fname;
@@ -74,6 +75,8 @@ int32_t write_cluster_solution(char* outname, char* outname_end, uint32_t* orig_
#ifndef NOLAPACK
int32_t mds_plot(char* outname, char* outname_end, uintptr_t* sample_exclude, uintptr_t sample_ct, uint32_t* sample_idx_to_uidx, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uint32_t cur_cluster_ct, uint32_t merge_ct, uint32_t* orig_sample_to_cluster, uint32_t* cur_cluster_remap, uint32_t dim_ct, uint32_t is_mds_cluster, uint32_t dump_eigvals, double* dists);
+
+int32_t mds_plot_eigendecomp(char* outname, char* outname_end, uintptr_t* sample_exclude, uintptr_t sample_ct, uint32_t* sample_idx_to_uidx, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uint32_t cur_cluster_ct, uint32_t merge_ct, uint32_t* orig_sample_to_cluster, uint32_t* cur_cluster_remap, uint32_t dim_ct, uint32_t is_mds_cluster, uint32_t dump_eigvals, double* dists);
#endif
#endif // __PLINK_CLUSTER_H__
diff --git a/plink_common.c b/plink_common.c
index 7d15185..22cb061 100644
--- a/plink_common.c
+++ b/plink_common.c
@@ -4633,6 +4633,40 @@ char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id
return NULL;
}
+char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len) {
+ // extended scan_for_duplicate_ids() which also verifies that no entry in
+ // sorted_ids matches any entry in sorted_nonoverlap_ids.
+ // nonoverlap_id_ct == 0 and sorted_nonoverlap_ids == NULL ok. id_ct cannot
+ // be zero, though.
+ uintptr_t nonoverlap_id_idx = 0;
+ uintptr_t id_idx = 0;
+ char* cur_id_ptr = sorted_ids;
+ char* nonoverlap_id_ptr;
+ char* other_id_ptr;
+ int32_t ii;
+ while (1) {
+ if (nonoverlap_id_idx == nonoverlap_id_ct) {
+ return scan_for_duplicate_ids(cur_id_ptr, id_ct - id_idx, max_id_len);
+ }
+ nonoverlap_id_ptr = &(sorted_nonoverlap_ids[nonoverlap_id_idx * max_nonoverlap_id_len]);
+ ii = strcmp(cur_id_ptr, nonoverlap_id_ptr);
+ if (ii < 0) {
+ if (++id_idx == id_ct) {
+ return NULL;
+ }
+ other_id_ptr = &(cur_id_ptr[max_id_len]);
+ if (!strcmp(cur_id_ptr, other_id_ptr)) {
+ return cur_id_ptr;
+ }
+ cur_id_ptr = other_id_ptr;
+ continue;
+ } else if (!ii) {
+ return cur_id_ptr;
+ }
+ nonoverlap_id_idx++;
+ }
+}
+
int32_t is_missing_pheno_cc(char* bufptr, double missing_phenod, uint32_t affection_01) {
char* ss;
double dxx;
@@ -4770,6 +4804,14 @@ int32_t intcmp(const void* aa, const void* bb) {
return *((const int32_t*)aa) - *((const int32_t*)bb);
}
+int32_t uintcmp(const void* aa, const void* bb) {
+ if (*((const uint32_t*)aa) < *((const uint32_t*)bb)) {
+ return -1;
+ } else {
+ return (*((const uint32_t*)aa) > *((const uint32_t*)bb));
+ }
+}
+
int32_t intcmp2(const void* aa, const void* bb) {
if (*((const int32_t*)aa) < *((const int32_t*)bb)) {
return -1;
@@ -7342,6 +7384,7 @@ void vec_set_freq_x(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_v
}
void vec_set_freq_y(uintptr_t sample_ctl2, uintptr_t* lptr, uintptr_t* include_vec, uintptr_t* nonmale_vec, uint32_t* set_ctp, uint32_t* missing_ctp) {
+ // all nonmales contribute to missing_ct here
uintptr_t* lptr_end = &(lptr[sample_ctl2]);
uintptr_t loader;
uintptr_t loader2;
@@ -8096,18 +8139,19 @@ void vec_init_invert(uintptr_t entry_ct, uintptr_t* target_arr, uintptr_t* sourc
}
}
-void vec_init_andnot(uintptr_t vec_wsize, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr) {
- // initializes a half-bitfield as source_arr ANDNOT exclude_arr
+void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr) {
+ // target_arr := source_arr ANDNOT exclude_arr
+ // may write an extra word
#ifdef __LP64__
__m128i* tptr = (__m128i*)target_arr;
__m128i* sptr = (__m128i*)source_arr;
__m128i* xptr = (__m128i*)exclude_arr;
- __m128i* tptr_end = (__m128i*)(&(target_arr[vec_wsize]));
+ __m128i* tptr_end = (__m128i*)(&(target_arr[word_ct]));
do {
*tptr++ = _mm_andnot_si128(*xptr++, *sptr++);
} while (tptr < tptr_end);
#else
- uintptr_t* tptr_end = &(target_arr[vec_wsize]);
+ uintptr_t* tptr_end = &(target_arr[word_ct]);
do {
*target_arr++ = (*source_arr++) & (~(*exclude_arr++));
} while (target_arr < tptr_end);
diff --git a/plink_common.h b/plink_common.h
index 315f56e..55d31e6 100644
--- a/plink_common.h
+++ b/plink_common.h
@@ -186,42 +186,43 @@
#define MISC_NONFOUNDERS 2LLU
#define MISC_MAF_SUCC 4LLU
#define MISC_FREQ_COUNTS 8LLU
-#define MISC_FREQX 0x10LLU
-#define MISC_KEEP_ALLELE_ORDER 0x20LLU
-#define MISC_SET_HH_MISSING 0x40LLU
-#define MISC_KEEP_AUTOCONV 0x80LLU
-#define MISC_LOAD_CLUSTER_KEEP_NA 0x100LLU
-#define MISC_WRITE_CLUSTER_OMIT_UNASSIGNED 0x200LLU
-#define MISC_ALLOW_EXTRA_CHROMS 0x400LLU
-#define MISC_MAKE_FOUNDERS_REQUIRE_2_MISSING 0x800LLU
-#define MISC_MAKE_FOUNDERS_FIRST 0x1000LLU
-#define MISC_LASSO_REPORT_ZEROES 0x2000LLU
-#define MISC_LASSO_SELECT_COVARS 0x4000LLU
-#define MISC_DOUBLE_ID 0x8000LLU
-#define MISC_BIALLELIC_ONLY 0x10000LLU
-#define MISC_BIALLELIC_ONLY_STRICT 0x20000LLU
-#define MISC_BIALLELIC_ONLY_LIST 0x40000LLU
-#define MISC_VCF_FILTER 0x80000LLU
-#define MISC_GPLINK 0x100000LLU
-#define MISC_SNPS_ONLY_NO_DI 0x200000LLU
-#define MISC_IMPUTE_SEX 0x400000LLU
-#define MISC_OXFORD_SNPID_CHR 0x800000LLU
-#define MISC_EXTRACT_RANGE 0x1000000LLU
-#define MISC_EXCLUDE_RANGE 0x2000000LLU
-#define MISC_MERGEX 0x4000000LLU
-#define MISC_SET_ME_MISSING 0x8000000LLU
-#define MISC_SEXCHECK_YCOUNT 0x10000000LLU
-#define MISC_SEXCHECK_YONLY 0x20000000LLU
-#define MISC_FAMILY_CLUSTERS 0x40000000LLU
-#define MISC_FILL_MISSING_A2 0x80000000LLU
-#define MISC_HET_SMALL_SAMPLE 0x100000000LLU
-#define MISC_FST_CC 0x200000000LLU
-#define MISC_SPLIT_MERGE_NOFAIL 0x400000000LLU
-#define MISC_REAL_REF_ALLELES 0x800000000LLU
-#define MISC_RPLUGIN_DEBUG 0x1000000000LLU
-#define MISC_MISSING_GZ 0x2000000000LLU
-#define MISC_FREQ_GZ 0x4000000000LLU
-#define MISC_HET_GZ 0x8000000000LLU
+#define MISC_FREQ_CC 0x10LLU
+#define MISC_FREQX 0x20LLU
+#define MISC_KEEP_ALLELE_ORDER 0x40LLU
+#define MISC_SET_HH_MISSING 0x80LLU
+#define MISC_KEEP_AUTOCONV 0x100LLU
+#define MISC_LOAD_CLUSTER_KEEP_NA 0x200LLU
+#define MISC_WRITE_CLUSTER_OMIT_UNASSIGNED 0x400LLU
+#define MISC_ALLOW_EXTRA_CHROMS 0x800LLU
+#define MISC_MAKE_FOUNDERS_REQUIRE_2_MISSING 0x1000LLU
+#define MISC_MAKE_FOUNDERS_FIRST 0x2000LLU
+#define MISC_LASSO_REPORT_ZEROES 0x4000LLU
+#define MISC_LASSO_SELECT_COVARS 0x8000LLU
+#define MISC_DOUBLE_ID 0x10000LLU
+#define MISC_BIALLELIC_ONLY 0x20000LLU
+#define MISC_BIALLELIC_ONLY_STRICT 0x40000LLU
+#define MISC_BIALLELIC_ONLY_LIST 0x80000LLU
+#define MISC_VCF_FILTER 0x100000LLU
+#define MISC_GPLINK 0x200000LLU
+#define MISC_SNPS_ONLY_NO_DI 0x400000LLU
+#define MISC_IMPUTE_SEX 0x800000LLU
+#define MISC_OXFORD_SNPID_CHR 0x1000000LLU
+#define MISC_EXTRACT_RANGE 0x2000000LLU
+#define MISC_EXCLUDE_RANGE 0x4000000LLU
+#define MISC_MERGEX 0x8000000LLU
+#define MISC_SET_ME_MISSING 0x10000000LLU
+#define MISC_SEXCHECK_YCOUNT 0x20000000LLU
+#define MISC_SEXCHECK_YONLY 0x40000000LLU
+#define MISC_FAMILY_CLUSTERS 0x80000000LLU
+#define MISC_FILL_MISSING_A2 0x100000000LLU
+#define MISC_HET_SMALL_SAMPLE 0x200000000LLU
+#define MISC_FST_CC 0x400000000LLU
+#define MISC_SPLIT_MERGE_NOFAIL 0x800000000LLU
+#define MISC_REAL_REF_ALLELES 0x1000000000LLU
+#define MISC_RPLUGIN_DEBUG 0x2000000000LLU
+#define MISC_MISSING_GZ 0x4000000000LLU
+#define MISC_FREQ_GZ 0x8000000000LLU
+#define MISC_HET_GZ 0x10000000000LLU
// assume for now that .bed must always be accompanied by both .bim and .fam
#define FILTER_ALL_REQ 1LLU
@@ -402,6 +403,7 @@
#define RECODE_IID 0x4000000
#define RECODE_INCLUDE_ALT 0x8000000
#define RECODE_BGZ 0x10000000
+#define RECODE_GEN_GZ 0x20000000
#define GENOME_OUTPUT_GZ 1
#define GENOME_REL_CHECK 2
@@ -1949,6 +1951,8 @@ int32_t get_uidx_from_unsorted(char* idstr, uintptr_t* exclude_arr, uint32_t id_
char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len);
+char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len);
+
int32_t is_missing_pheno_cc(char* bufptr, double missing_phenod, uint32_t affection_01);
int32_t eval_affection(char* bufptr, double missing_phenod);
@@ -1971,6 +1975,8 @@ int32_t char_cmp_deref(const void* aa, const void* bb);
int32_t intcmp(const void* aa, const void* bb);
+int32_t uintcmp(const void* aa, const void* bb);
+
#ifndef __cplusplus
int32_t intcmp2(const void* aa, const void* bb);
#endif
@@ -2194,9 +2200,9 @@ void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, u
void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_vec, uintptr_t* exclude_arr);
-void vec_init_invert(uintptr_t vec_entry_ct, uintptr_t* target_arr, uintptr_t* source_arr);
+void vec_init_invert(uintptr_t entry_ct, uintptr_t* target_arr, uintptr_t* source_arr);
-void vec_init_andnot(uintptr_t vec_entry_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr);
+void bitfield_andnot_copy(uintptr_t word_ct, uintptr_t* target_arr, uintptr_t* source_arr, uintptr_t* exclude_arr);
void vec_include_mask_in(uintptr_t unfiltered_sample_ct, uintptr_t* include_arr, uintptr_t* mask_arr);
diff --git a/plink_data.c b/plink_data.c
index 13d1f44..2082535 100644
--- a/plink_data.c
+++ b/plink_data.c
@@ -14,7 +14,9 @@
#include <sys/types.h>
#include "plink_family.h"
#include "plink_set.h"
+
#include "bgzf.h"
+#include "pigz.h"
#define PHENO_EPSILON 0.000030517578125
@@ -444,6 +446,11 @@ void load_bim_sf_insert(uint32_t chrom_idx, uint32_t pos_start, uint32_t pos_end
}
if (llbuf[llidx + 1] < pos_end) {
// scan forward, attempt to collapse entries
+
+ // bugfix: if no forward entries can be collapsed, current entry must
+ // be updated
+ llbuf[llidx + 1] = pos_end;
+
old_llidx = llidx;
new_llidx = llbuf[llidx + 2];
while (new_llidx != 1) {
@@ -3153,7 +3160,7 @@ int32_t make_bed_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32_t unfilter
return 0;
}
-int32_t make_bed_me_missing_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32_t unfiltered_sample_ct, uintptr_t unfiltered_sample_ct4, uintptr_t* sample_exclude, uint32_t sample_ct, uint32_t* sample_sort_map, uintptr_t final_mask, uint32_t unfiltered_sample_ctl2m1, uint32_t is_reverse, uintptr_t* writebuf, uintptr_t* workbuf, uintptr_t* sample_raw_male_include2, uint32_t* trio_lookup, uint32_t trio_ct, uint32_t set_x_hh_missing, uint32_t multigen, uint64_t* error_ct_ptr) {
+int32_t make_bed_me_missing_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32_t unfiltered_sample_ct, uintptr_t unfiltered_sample_ct4, uintptr_t* sample_exclude, uint32_t sample_ct, uint32_t* sample_sort_map, uintptr_t final_mask, uint32_t unfiltered_sample_ctl2m1, uint32_t is_reverse, uintptr_t* writebuf, uintptr_t* workbuf, uintptr_t* sex_male, uintptr_t* sample_raw_male_include2, uint32_t* trio_lookup, uint32_t trio_ct, uint32_t set_hh_missing, uint32_t is_x, uint32_t multigen, uin [...]
// requires final_mask to be for unfiltered_sample_ct
uintptr_t* writeptr = writebuf;
uintptr_t cur_word = 0;
@@ -3167,15 +3174,13 @@ int32_t make_bed_me_missing_one_marker(FILE* bedfile, uintptr_t* loadbuf, uint32
if (load_raw2(bedfile, loadbuf, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
return RET_READ_FAIL;
}
- // do NOT treat males differently from females on Xchr if --set-hh-missing
- // not specified, since user may be procrastinating on fixing gender errors.
- if (set_x_hh_missing) {
+ if (set_hh_missing && is_x) {
hh_reset((unsigned char*)loadbuf, sample_raw_male_include2, unfiltered_sample_ct);
}
if (is_reverse) {
reverse_loadbuf((unsigned char*)loadbuf, unfiltered_sample_ct);
}
- *error_ct_ptr += erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, trio_lookup, trio_ct, multigen);
+ *error_ct_ptr += erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, sex_male, trio_lookup, trio_ct, is_x, multigen);
if (sample_sort_map) {
for (; sample_idx < sample_ct; sample_idx++) {
do {
@@ -3574,7 +3579,7 @@ int32_t make_bed(FILE* bedfile, uintptr_t bed_offset, char* bimname, uint32_t ma
haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)writebuf);
}
} else {
- retval = make_bed_me_missing_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, unfiltered_sample_ctl2m1, IS_SET(marker_reverse, marker_uidx), writebuf, workbuf, sample_raw_male_include2, trio_lookup, trio_ct, set_hh_missing && is_x, mendel_multigen, &mendel_error_ct);
+ retval = make_bed_me_missing_one_marker(bedfile, loadbuf, unfiltered_sample_ct, unfiltered_sample_ct4, sample_exclude, sample_ct, sample_sort_map, final_mask, unfiltered_sample_ctl2m1, IS_SET(marker_reverse, marker_uidx), writebuf, workbuf, sex_male, sample_raw_male_include2, trio_lookup, trio_ct, set_hh_missing, is_x, mendel_multigen, &mendel_error_ct);
}
if (retval) {
goto make_bed_ret_1;
@@ -4008,7 +4013,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
uint32_t bgen_hardthresh = 0;
uint32_t marker_ct = 0;
int32_t retval = 0;
- uint32_t uint_arr[4];
+ uint32_t uint_arr[5];
char missing_pheno_str[12];
char* bufptr;
char* bufptr2;
@@ -4610,8 +4615,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
if (fopen_checked(&infile, genname, "rb")) {
goto oxford_to_bed_ret_OPEN_FAIL;
}
- // supports BGEN v1.0 and v1.1. (online documentation seems to have
- // several errors as of this writing, ugh)
+ // supports BGEN v1.0 and v1.1.
bgen_probs = (uint16_t*)wkspace_alloc(6 * sample_ct);
if (!bgen_probs) {
goto oxford_to_bed_ret_NOMEM;
@@ -4624,7 +4628,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
} else if (loadbuf_size < 3 * 65536) {
goto oxford_to_bed_ret_NOMEM;
}
- if (fread(uint_arr, 1, 16, infile) < 16) {
+ if (fread(uint_arr, 1, 20, infile) < 20) {
goto oxford_to_bed_ret_READ_FAIL;
}
if (uint_arr[1] > uint_arr[0]) {
@@ -4640,6 +4644,10 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
logprint("Error: --bgen and --sample files contain different numbers of samples.\n");
goto oxford_to_bed_ret_INVALID_FORMAT;
}
+ if (uint_arr[4] && (uint_arr[4] != 0x6e656762)) {
+ logprint("Error: Invalid .bgen magic number.\n");
+ goto oxford_to_bed_ret_INVALID_FORMAT;
+ }
if (fseeko(infile, uint_arr[1], SEEK_SET)) {
goto oxford_to_bed_ret_READ_FAIL;
}
@@ -4647,7 +4655,14 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
goto oxford_to_bed_ret_READ_FAIL;
}
if (uii & (~5)) {
- logprint("Error: Unrecognized flags in .bgen header. (This PLINK build only supports\nBGEN v1.0 and v1.1.)\n");
+ uii = (uii >> 2) & 15;
+ if (uii == 2) {
+ logprint("Error: BGEN v1.2 input requires PLINK 2.0 (under development as of this\nwriting). Use gen-convert to downcode to BGEN v1.1 if you want to process this\ndata with PLINK 1.9.\n");
+ } else if (uii > 2) {
+ logprint("Error: Unrecognized BGEN version. Use gen-convert or a similar tool to\ndowncode to BGEN v1.1 if you want to process this data with PLINK 1.9.\n");
+ } else {
+ logprint("Error: Unrecognized flags in .bgen header. (PLINK 1.9 only supports\nBGEN v1.0 and v1.1.)\n");
+ }
goto oxford_to_bed_ret_INVALID_FORMAT;
}
if (fseeko(infile, 4 + uint_arr[0], SEEK_SET)) {
@@ -7147,9 +7162,6 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
goto transposed_to_bed_ret_NOMEM;
}
- logstr("Processing .tped file.\n");
- transposed_to_bed_print_pct(0);
- fflush(stdout);
if (fopen_checked(&infile, tfamname, "r")) {
goto transposed_to_bed_ret_OPEN_FAIL;
}
@@ -7222,6 +7234,9 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
if (fseeko(infile, 0, SEEK_END)) {
goto transposed_to_bed_ret_READ_FAIL;
}
+ logstr("Processing .tped file.\n");
+ transposed_to_bed_print_pct(0);
+ fflush(stdout);
tped_size = ftello(infile);
rewind(infile);
tped_next_thresh = tped_size / 100;
@@ -8216,7 +8231,8 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
bufptr2 = bufptr;
do {
cc = *(++bufptr);
- } while ((unsigned char)cc > ',');
+ // allow GATK 3.4 <*:DEL> symbolic allele
+ } while (((unsigned char)cc > ',') || (cc == '*'));
if (((uintptr_t)(bufptr - bufptr2) == ref_allele_len) && (!memcmp(ref_allele_ptr, bufptr2, ref_allele_len))) {
if ((alt_ct != 1) || (cc == ',')) {
sprintf(logbuf, "\nError: ALT allele duplicates REF allele on line %" PRIuPTR " of .vcf file.\n", line_idx);
@@ -8753,6 +8769,14 @@ int32_t vcf_to_bed(char* vcfname, char* outname, char* outname_end, int32_t miss
}
marker_skip_ct++;
}
+ if (fclose_null(&bimfile) || fclose_null(&outfile)) {
+ goto vcf_to_bed_ret_WRITE_FAIL;
+ }
+ if (skip3file) {
+ if (fclose_null(&skip3file)) {
+ goto vcf_to_bed_ret_WRITE_FAIL;
+ }
+ }
putchar('\r');
*outname_end = '\0';
LOGPRINTFWW("--vcf: %s.bed + %s.bim + %s.fam written.\n", outname, outname, outname);
@@ -11648,6 +11672,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
FILE* outfile = NULL;
FILE* outfile2 = NULL;
BGZF* bgz_outfile = NULL;
+ char* pzwritep = NULL;
uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
@@ -11659,6 +11684,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
char** mk_allele_ptrs = marker_allele_ptrs;
char** allele_missing = NULL;
char* recode_allele_extra = NULL;
+ unsigned char* overflow_buf = NULL;
const char* missing_geno_ptr = g_missing_geno_ptr;
char delim2 = delimiter;
uintptr_t* sample_include2 = NULL;
@@ -11670,6 +11696,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
uint32_t vcf_not_iid = (recode_modifier & RECODE_VCF) && (!(recode_modifier & RECODE_IID));
uint32_t vcf_two_ids = vcf_not_fid && vcf_not_iid;
uint32_t output_bgz = (recode_modifier / RECODE_BGZ) & 1;
+ uint32_t output_gen_gz = (recode_modifier / RECODE_GEN_GZ) & 1;
uint32_t recode_012 = recode_modifier & (RECODE_01 | RECODE_12);
uint32_t set_hh_missing = (misc_flags / MISC_SET_HH_MISSING) & 1;
uint32_t real_ref_alleles = (misc_flags / MISC_REAL_REF_ALLELES) & 1;
@@ -11700,6 +11727,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
char* cur_mk_allelesx[6];
char cur_dosage_chars[4];
uint32_t cmalen[4];
+ Pigz_state ps;
time_t rawtime;
struct tm *loctime;
uint32_t is_x;
@@ -11743,6 +11771,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
uint32_t cur_fid;
uint32_t uii;
int32_t ii;
+ pzwrite_init_null(&ps);
if (!hh_exists) {
set_hh_missing = 0;
}
@@ -11797,7 +11826,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
goto recode_ret_NOMEM;
}
} else if (recode_modifier & RECODE_OXFORD) {
- if (wkspace_alloc_c_checked(&writebuf, sample_ct * 6) ||
+ if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + sample_ct * 6 + 2 * max_marker_allele_len + MAXLINELEN) ||
wkspace_alloc_ui_checked(&missing_cts, sample_ct * sizeof(int32_t))) {
goto recode_ret_NOMEM;
}
@@ -12640,12 +12669,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
bgz_outfile = NULL;
}
} else if (recode_modifier & RECODE_OXFORD) {
- memcpy(outname_end, ".gen", 5);
- if (fopen_checked(&outfile, outname, "w")) {
+ memcpy(outname_end, output_gen_gz? ".gen.gz" : ".gen", output_gen_gz? 8 : 5);
+ if (flex_pzwrite_init(output_gen_gz, outname, overflow_buf, 0, &ps)) {
goto recode_ret_OPEN_FAIL;
}
*outname_end = '\0';
- LOGPRINTFWW5("--recode oxford to %s.gen + %s.sample ... ", outname, outname);
+ LOGPRINTFWW5("--recode oxford%s to %s.gen%s + %s.sample ... ", output_gen_gz? " gen-gz" : "", outname, output_gen_gz? ".gz" : "", outname);
+ pzwritep = (char*)overflow_buf;
fputs("0%", stdout);
fflush(stdout);
for (pct = 1; pct <= 100; pct++) {
@@ -12664,24 +12694,18 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
// not clear from documentation whether anything special should be
// done for Y/haploid chromosomes
}
- wbufptr = chrom_name_write(tbuf, chrom_info_ptr, chrom_idx);
- *wbufptr++ = ' ';
- wbufptr = strcpyax(wbufptr, &(marker_ids[marker_uidx * max_marker_id_len]), ' ');
- wbufptr = uint32_writex(wbufptr, marker_pos[marker_uidx], ' ');
- if (fwrite_checked(tbuf, wbufptr - tbuf, outfile)) {
- goto recode_ret_WRITE_FAIL;
- }
- fputs(mk_allele_ptrs[2 * marker_uidx], outfile);
- putc(' ', outfile);
- fputs(mk_allele_ptrs[2 * marker_uidx + 1], outfile);
-
+ pzwritep = chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx);
+ *pzwritep++ = ' ';
+ pzwritep = strcpyax(pzwritep, &(marker_ids[marker_uidx * max_marker_id_len]), ' ');
+ pzwritep = uint32_writex(pzwritep, marker_pos[marker_uidx], ' ');
+ pzwritep = strcpyax(pzwritep, mk_allele_ptrs[2 * marker_uidx], ' ');
+ pzwritep = strcpya(pzwritep, mk_allele_ptrs[2 * marker_uidx + 1]);
if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
goto recode_ret_READ_FAIL;
}
if (is_haploid && set_hh_missing) {
haploid_fix(hh_exists, sample_include2, sample_male_include2, sample_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
}
- wbufptr = writebuf;
ulptr = loadbuf_collapsed;
ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
sample_idx = 0;
@@ -12694,7 +12718,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (ulii == 1) {
missing_cts[sample_idx] += 1;
}
- wbufptr = memcpya(wbufptr, &(cur_mk_allelesx_buf[ulii * 8]), 6);
+ pzwritep = memcpya(pzwritep, &(cur_mk_allelesx_buf[ulii * 8]), 6);
}
sample_uidx += BITCT2;
}
@@ -12704,10 +12728,10 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
ulptr_end++;
sample_uidx = sample_ct;
}
- if (fwrite_checked(writebuf, wbufptr - writebuf, outfile)) {
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
goto recode_ret_WRITE_FAIL;
}
- putc('\n', outfile);
}
if (pct < 100) {
if (pct > 10) {
@@ -12717,7 +12741,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
fflush(stdout);
}
}
- if (fclose_null(&outfile)) {
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
goto recode_ret_WRITE_FAIL;
}
memcpy(outname_end, ".sample", 8);
@@ -13940,6 +13964,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
if (bgz_outfile) {
bgzf_close(bgz_outfile);
}
+ flex_pzwrite_close_cond(&ps, pzwritep);
return retval;
}
@@ -14377,6 +14402,7 @@ int32_t merge_bim_scan(char* bimname, uint32_t is_binary, uintptr_t* max_marker_
goto merge_bim_scan_ret_1;
}
}
+ // do not filter on chrom_mask here, since that happens later
bufptr = next_token(bufptr);
bufptr2 = token_endl(bufptr);
uii = bufptr2 - bufptr;
@@ -16045,6 +16071,9 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
}
}
sort_marker_chrom_pos(ll_buf, tot_marker_ct, pos_buf, chrom_start, chrom_id, NULL, &chrom_ct);
+ // bugfix: when chromosomes are filtered out, flag the corresponding markers
+ // in marker_map[]
+ fill_uint_one(marker_map, tot_marker_ct);
if (merge_post_msort_update_maps(marker_ids, max_marker_id_len, marker_map, marker_cms, marker_cms_tmp, pos_buf, ll_buf, chrom_start, chrom_id, chrom_ct, &dedup_marker_ct, merge_equal_pos, marker_allele_ptrs, chrom_info_ptr)) {
goto merge_datasets_ret_INVALID_FORMAT;
}
@@ -16187,7 +16216,10 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
}
uii = tot_marker_ct;
while (uii--) {
- map_reverse[marker_map[uii]] = uii;
+ ujj = marker_map[uii];
+ if (ujj != 0xffffffffU) {
+ map_reverse[ujj] = uii;
+ }
}
for (ulii = 0; ulii < chrom_ct; ulii++) {
uii = chrom_start[ulii + 1];
diff --git a/plink_dosage.c b/plink_dosage.c
index 4bf4505..861048f 100644
--- a/plink_dosage.c
+++ b/plink_dosage.c
@@ -576,6 +576,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
uint32_t score_report_average = doip->modifier & DOSAGE_SCORE_NOSUM;
uint32_t dosage_score_cnt = doip->modifier & DOSAGE_SCORE_CNT;
uint32_t sex_covar = doip->modifier & DOSAGE_SEX;
+ uint32_t freq_cc = doip->modifier & DOSAGE_FREQ_CC;
uint32_t skip0 = doip->skip0;
uint32_t skip1p1 = doip->skip1 + 1;
uint32_t skip2 = doip->skip2;
@@ -1373,6 +1374,14 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
goto plink1_dosage_ret_1;
}
}
+ if (freq_cc) {
+ if (!do_glm) {
+ logprint("Error: --dosage 'case-control-freqs' modifier can only be used in association\nanalysis mode.\n");
+ goto plink1_dosage_ret_INVALID_CMDLINE;
+ } else if (!pheno_c) {
+ logprint("Warning: '--dosage case-control-freqs' is silly with a quantitative phenotype.\n");
+ }
+ }
if (do_glm) {
if (!sample_exclude_ct) {
pheno_nm_collapsed = pheno_nm;
@@ -1470,7 +1479,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
} else {
bufptr = memcpya(tbuf, " SNP", 12);
}
- bufptr = memcpya(bufptr, " A1 A2 FRQ INFO ", 28);
+ bufptr = memcpya(bufptr, freq_cc? " A1 A2 FRQ_A FRQ_U INFO " : " A1 A2 FRQ INFO ", freq_cc? 36 : 28);
bufptr = memcpya(bufptr, pheno_c? " OR" : "BETA", 4);
bufptr = memcpya(bufptr, " SE P", 16);
append_binary_eoln(&bufptr);
@@ -1684,6 +1693,12 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
file_icts[file_idx] = read_idx - read_idx_start;
line_idx_arr[file_idx] = line_idx;
}
+ if ((batch_ct == 1) && (!noheader)) {
+ ulii = sample_ct - popcount_longs(batch_samples, sample_ctl);
+ if (ulii) {
+ LOGPRINTFWW("Warning: %" PRIuPTR " sample ID%s present in .fam file but missing from dosage file%s.\n", ulii, (ulii == 1)? "" : "s", (cur_batch_size == 1)? "" : "s");
+ }
+ }
while (1) {
read_idx_start = 0;
@@ -1936,15 +1951,45 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
goto plink1_dosage_ret_WRITE_FAIL;
}
*pzwritep++ = ' ';
- pzwritep = double_f_writew74(pzwritep, dzz);
- *pzwritep++ = ' ';
- pzwritep = double_f_writew74(pzwritep, rsq);
+ if (freq_cc && pheno_c) {
+ // compute case and control A1 frequencies
+ dxx = 0.0; // case sum
+ dyy = 0.0; // control sum
+ uii = 0;
+ for (sample_idx = 0, sample_uidx = 0; sample_idx < sample_valid_ct; sample_idx++, sample_uidx++) {
+ next_set_unsafe_ck(cur_samples, &sample_uidx);
+ dzz = cur_dosages[sample_uidx];
+ if (is_set(pheno_c_collapsed, sample_uidx)) {
+ dxx += dzz;
+ uii++; // could also use a popcount
+ } else {
+ dyy += dzz;
+ }
+ }
+ if (uii) {
+ pzwritep = double_f_writew74x(pzwritep, dxx / ((double)((int32_t)uii)), ' ');
+ } else {
+ pzwritep = memcpya(pzwritep, " NA ", 8);
+ }
+ uii = sample_valid_ct - uii;
+ if (uii) {
+ pzwritep = double_f_writew74(pzwritep, dyy / ((double)((int32_t)uii)));
+ } else {
+ pzwritep = memcpya(pzwritep, " NA", 7);
+ }
+ } else {
+ pzwritep = double_f_writew74(pzwritep, dzz);
+ // remove this kludge once scripts stop depending on it
+ if (freq_cc) {
+ *pzwritep++ = ' ';
+ pzwritep = double_f_writew74(pzwritep, dzz);
+ }
+ }
*pzwritep++ = ' ';
+ pzwritep = double_f_writew74x(pzwritep, rsq, ' ');
if (is_valid) {
- pzwritep = double_f_writew74(pzwritep, pheno_c? exp(beta * 0.5) : (beta * 0.5));
- *pzwritep++ = ' ';
- pzwritep = double_f_writew74(pzwritep, se * 0.5);
- *pzwritep++ = ' ';
+ pzwritep = double_f_writew74x(pzwritep, pheno_c? exp(beta * 0.5) : (beta * 0.5), ' ');
+ pzwritep = double_f_writew74x(pzwritep, se * 0.5, ' ');
pzwritep = double_g_writewx4(pzwritep, MAXV(pval, output_min_p), 7);
} else {
pzwritep = memcpya(pzwritep, " NA NA NA", 23);
diff --git a/plink_dosage.h b/plink_dosage.h
index 00f9802..0c6b21b 100644
--- a/plink_dosage.h
+++ b/plink_dosage.h
@@ -16,6 +16,7 @@
#define DOSAGE_SCORE_NOSUM 0x200
#define DOSAGE_SCORE_CNT 0x400
#define DOSAGE_SEX 0x800
+#define DOSAGE_FREQ_CC 0x1000
typedef struct {
char* fname;
diff --git a/plink_family.c b/plink_family.c
index fe36eda..91060b4 100644
--- a/plink_family.c
+++ b/plink_family.c
@@ -25,7 +25,7 @@ void family_init(Family_info* fam_ip) {
// bits 0-7 = child increment (always 1)
// bits 8-15 = father increment
// bits 16-23 = mother increment
-// bits 24-31 = error code
+// bits 24-31 = female error code
const uint32_t mendel_error_table[] =
{0, 0, 0x1010101, 0x8000001,
0, 0, 0, 0x7010001,
@@ -43,24 +43,14 @@ const uint32_t mendel_error_table[] =
0x4010001, 0, 0, 0,
0x4010001, 0, 0, 0,
0x5000001, 0, 0x2010101, 0};
-// necessary to check child gender when dealing with error 9/10
-const uint32_t mendel_error_table_x[] =
-{0, 0, 0x1010101, 0x8000001,
- 0, 0, 0, 0x9010001,
- 0, 0, 0, 0x7010001,
- 0x3000101, 0, 0, 0x7010001,
- 0, 0, 0, 0x6000101,
- 0, 0, 0, 0,
- 0, 0, 0, 0,
- 0x3000101, 0, 0, 0,
- 0, 0, 0, 0x6000101,
+
+// bottom 2 bits of index = child genotype
+// next 2 bits of index = maternal genotype
+const uint32_t mendel_error_table_male_x[] =
+{0, 0, 0, 0x9010001,
0, 0, 0, 0,
0, 0, 0, 0,
- 0x3000101, 0, 0, 0,
- 0x4010001, 0, 0, 0x6000101,
- 0xa010001, 0, 0, 0,
- 0x4010001, 0, 0, 0,
- 0x5000001, 0, 0x2010101, 0};
+ 0xa010001, 0, 0, 0};
int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* founder_info, uintptr_t* sex_nm, uintptr_t* sex_male, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, char** fids_ptr, uintptr_t* max_fid_len_ptr, char** iids_ptr, uintptr_t* max_iid_len_ptr, uint64_t** family_list_ptr, uint32_t* family_ct_ptr, uint64_t** trio_list_ptr [...]
// This mirrors linkRelateds() in genedrop.cpp, and parseTrios() in trio.cpp,
@@ -471,7 +461,7 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
return retval;
}
-uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf, uintptr_t* workbuf, uint32_t* trio_lookup, uint32_t trio_ct, uint32_t multigen) {
+uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf, uintptr_t* workbuf, uintptr_t* sex_male, uint32_t* trio_lookup, uint32_t trio_ct, uint32_t is_x, uint32_t multigen) {
uint32_t* uiptr = trio_lookup;
uint32_t cur_errors = 0;
uint32_t trio_idx;
@@ -492,11 +482,17 @@ uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf,
uii = *uiptr++;
ujj = *uiptr++;
ukk = *uiptr++;
- umm = mendel_error_table[((workbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3) | (((workbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3) << 2) | (((workbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3) << 4)];
+ umm = (workbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3;
+ unn = (workbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3;
+ if ((!is_x) || (!is_set(sex_male, uii))) {
+ umm = mendel_error_table[umm | (((workbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3) << 2) | (unn << 4)];
+ } else {
+ umm = mendel_error_table_male_x[umm | (unn << 2)];
+ }
if (umm) {
ulii = loadbuf[uii / BITCT2];
uljj = ONELU << (2 * (uii % BITCT2));
- loadbuf[uii / BITCT2] = (ulii & (~(3 * uljj))) | uljj;
+ loadbuf[uii / BITCT2] = (ulii & (~(3 * uljj))) | uljj;
if (umm & 0x100) {
ulii = loadbuf[ujj / BITCT2];
uljj = ONELU << (2 * (ujj % BITCT2));
@@ -525,7 +521,11 @@ uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf,
ulii = (workbuf[unn] >> uii) & 3;
uljj = ((workbuf[uoo] >> ujj) & 3) | (((workbuf[upp] >> ukk) & 3) << 2);
if (ulii != 1) {
- umm = mendel_error_table[ulii | (uljj << 2)];
+ if ((!is_x) || (!is_set(sex_male, uii))) {
+ umm = mendel_error_table[ulii | (uljj << 2)];
+ } else {
+ umm = mendel_error_table_male_x[ulii | (uljj & 12)];
+ }
if (umm) {
ulii = loadbuf[unn];
uljj = ONELU << uii;
@@ -543,10 +543,14 @@ uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf,
cur_errors++;
}
} else if (!uljj) {
+ // both parents are homozygous for the same allele, so child genotype
+ // is "known" for the purpose of checking grandchild genotypes
workbuf[unn] &= ~(ONELU << uii);
} else if (uljj == 15) {
workbuf[unn] |= (2 * ONELU) << uii;
}
+ // no need to fill "known" heterozygous genotype, since that's treated
+ // the same way as a missing genotype
}
}
return cur_errors;
@@ -711,7 +715,6 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
char* errstrs[10];
uint32_t errstr_lens[11];
uint32_t alens[2];
- const uint32_t* error_table_ptr;
char* fids;
char* iids;
char* wptr;
@@ -753,6 +756,7 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
uint32_t ujj;
uint32_t ukk;
uint32_t umm;
+ uint32_t unn;
marker_ct -= count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 0, 1);
if ((!marker_ct) || is_set(chrom_info_ptr->haploid_mask, 0)) {
logprint("Warning: Skipping --me/--mendel since there is no autosomal or Xchr data.\n");
@@ -845,11 +849,6 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
if (uii == chrom_end) {
continue;
}
- if (!is_x) {
- error_table_ptr = mendel_error_table;
- } else {
- error_table_ptr = mendel_error_table_x;
- }
if (calc_mendel) {
chrom_name_ptr = chrom_name_buf5w4write(chrom_name_buf, chrom_info_ptr, chrom_idx, &chrom_name_len);
}
@@ -885,15 +884,18 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
uii = *uiptr++;
ujj = *uiptr++;
ukk = *uiptr++;
- umm = error_table_ptr[((loadbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3) | (((loadbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3) << 2) | (((loadbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3) << 4)];
+ umm = (loadbuf[uii / BITCT2] >> (2 * (uii % BITCT2))) & 3;
+ unn = (loadbuf[ukk / BITCT2] >> (2 * (ukk % BITCT2))) & 3;
+ if ((!is_x) || (!is_set(sex_male, uii))) {
+ umm = mendel_error_table[umm | (((loadbuf[ujj / BITCT2] >> (2 * (ujj % BITCT2))) & 3) << 2) | (unn << 4)];
+ } else {
+ umm = mendel_error_table_male_x[umm | (unn << 2)];
+ }
if (umm) {
error_cts_tmp2[trio_idx] += umm & 0xffffff;
cur_error_ct++;
if (calc_mendel) {
umm >>= 24;
- if ((umm > 8) && (!is_set(sex_male, uii))) {
- umm = 34 - 3 * umm; // 9 -> 7, 10 -> 4
- }
wptr = fw_strcpy(plink_maxfid, &(fids[trio_idx * max_fid_len]), tbuf);
*wptr++ = ' ';
wptr = fw_strcpy(plink_maxiid, &(iids[uii * max_iid_len]), wptr);
@@ -924,16 +926,17 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
ujj = 2 * (uii % BITCT2);
ulii = (loadbuf[umm] >> ujj) & 3;
if (ulii != 1) {
- umm = error_table_ptr[ulii | (uljj << 2)];
+ if ((!is_x) || (!is_set(sex_male, uii))) {
+ umm = mendel_error_table[ulii | (uljj << 2)];
+ } else {
+ umm = mendel_error_table_male_x[ulii | (uljj & 12)];
+ }
if (umm) {
error_cts_tmp2[trio_idx] += umm & 0xffffff;
cur_error_ct++;
if (calc_mendel) {
set_bit(error_locs, trio_idx);
umm >>= 24;
- if ((umm > 8) && (!is_set(sex_male, uii))) {
- umm = 34 - 3 * umm;
- }
cur_errors[trio_idx] = (unsigned char)umm;
}
}
@@ -1724,7 +1727,7 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
return 0;
}
-int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct_ax, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_male_include2, uint32_t* trio_nuclear_lookup, uint32_t family_ct, Aperm_in [...]
+int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct_ax, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sex_male, uintptr_t* sample_male_include2, uint32_t* trio_nuclear_lookup, uint32_ [...]
FILE* outfile = NULL;
uint64_t mendel_error_ct = 0;
double pat_a2transmit_recip = 0.0;
@@ -1827,7 +1830,7 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
if (hh_exists && is_x) {
hh_reset((unsigned char*)loadbuf, sample_male_include2, unfiltered_sample_ct);
}
- mendel_error_ct += erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, trio_error_lookup, trio_ct, multigen);
+ mendel_error_ct += erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, sex_male, trio_error_lookup, trio_ct, is_x, multigen);
lookup_ptr = trio_nuclear_lookup;
poo_acc = 0;
poo_acc_ct = 0;
@@ -2203,7 +2206,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
}
}
if (poo_test) {
- retval = tdt_poo(threads, bedfile, bed_offset, outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_male_include2, trio_nuclear_lookup, family_ct, apip, mperm_save, sample_ids, max_sample_id_len, chrom_info_ptr, hh_exists, fam_ip, loadbuf, workbuf, textbuf, orig_chisq, trio_error_lookup, trio_ct);
+ retval = tdt_poo(threads, bedfile, bed_offset, outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sex_male, sample_male_include2, trio_nuclear_lookup, family_ct, apip, mperm_save, sample_ids, max_sample_id_len, chrom_info_ptr, hh_exists, fam_ip, loadbuf, workbuf, textbuf, orig_chisq, trio_error_lookup, trio_ct);
if (retval) {
goto tdt_ret_1;
}
@@ -2274,7 +2277,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
// c. if at least one het parent, iterate through genotyped children,
// incrementing regular TDT counts.
// mendel_error_ct += erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, trio_error_lookup, trio_ct, multigen);
- erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, trio_error_lookup, trio_ct, multigen);
+ erase_mendel_errors(unfiltered_sample_ct, loadbuf, workbuf, sex_male, trio_error_lookup, trio_ct, is_x, multigen);
lookup_ptr = trio_nuclear_lookup;
parentdt_acc = 0;
parentdt_acc_ct = 0;
@@ -2709,8 +2712,7 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
}
topsize = ulii;
- memcpy(ulptr, not_in_family, unfiltered_sample_ctl * sizeof(intptr_t));
- bitfield_andnot(ulptr, founder_info, unfiltered_sample_ctl);
+ bitfield_andnot_copy(unfiltered_sample_ctl, ulptr, not_in_family, founder_info);
cur_sample_ct = popcount_longs(ulptr, unfiltered_sample_ctl);
wkspace_left -= topsize;
if (wkspace_alloc_ui_checked(&fs_starts, (1 + family_ct + (cur_sample_ct / 2)) * sizeof(int32_t))) {
@@ -3439,7 +3441,7 @@ int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
if (IS_SET(marker_reverse, marker_uidx)) {
reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
}
- erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, trio_error_lookup, trio_ct, multigen);
+ erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, sex_male, trio_error_lookup, trio_ct, 0, multigen);
collapse_copy_2bitarr(loadbuf_raw, &(g_loadbuf[block_size * dfam_sample_ctl2]), unfiltered_sample_ct, dfam_sample_ct, dfam_sample_exclude);
if (perm_adapt_nst) {
g_adapt_m_table[block_size] = marker_idx2++;
@@ -3863,6 +3865,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
}
fss_ptr = &(fss_contents[cur_start]);
for (; cur_idx < fs_ct; cur_idx++) {
+ // sibships
cur_end = *fs_starts_ptr++;
sib_ct = cur_end - cur_start;
fss_end = &(fss_contents[cur_end]);
@@ -3884,6 +3887,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
cur_start = cur_end;
}
for (; cur_idx < fss_ct; cur_idx++) {
+ // singletons
sample_uidx = *fss_ptr++;
ulii = (loadbuf[sample_uidx / BITCT2] >> (2 * (sample_uidx % BITCT2))) & 3;
if (ulii != 1) {
@@ -4595,7 +4599,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
if (IS_SET(marker_reverse, marker_uidx)) {
reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
}
- erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, trio_error_lookup, trio_ct, multigen);
+ erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, sex_male, trio_error_lookup, trio_ct, 0, multigen);
loadbuf_ptr = &(g_loadbuf[block_idx * sample_ctl2]);
collapse_copy_2bitarr(loadbuf_raw, loadbuf_ptr, unfiltered_sample_ct, sample_ct, sample_exclude);
g_adapt_m_table[block_idx] = marker_idx;
diff --git a/plink_family.h b/plink_family.h
index 1558046..5c6826d 100644
--- a/plink_family.h
+++ b/plink_family.h
@@ -51,7 +51,7 @@ void family_init(Family_info* fam_ip);
int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* founder_info, uintptr_t* sex_nm, uintptr_t* sex_male, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, char** fids_ptr, uintptr_t* max_fid_len_ptr, char** iids_ptr, uintptr_t* max_iid_len_ptr, uint64_t** family_list_ptr, uint32_t* family_ct_ptr, uint64_t** trio_list_ptr [...]
-uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf, uintptr_t* workbuf, uint32_t* trio_lookup, uint32_t trio_ct, uint32_t multigen);
+uint32_t erase_mendel_errors(uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf, uintptr_t* workbuf, uintptr_t* sex_male, uint32_t* trio_lookup, uint32_t trio_ct, uint32_t is_x, uint32_t multigen);
int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
diff --git a/plink_filter.c b/plink_filter.c
index b17c998..793a155 100644
--- a/plink_filter.c
+++ b/plink_filter.c
@@ -352,13 +352,11 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
unsigned char* wkspace_mark = wkspace_base;
uintptr_t include_ct = 0;
uintptr_t unfiltered_ctl = (unfiltered_ct + (BITCT - 1)) / BITCT;
- uintptr_t* cur_neg_matches = NULL;
char* sorted_pos_match = NULL;
char* sorted_neg_match = NULL;
char* bufptr2 = NULL;
uint32_t pos_match_ct = 0;
uint32_t neg_match_ct = 0;
- uint32_t neg_match_ctl = 0;
uintptr_t max_pos_match_len = 0;
uintptr_t max_neg_match_len = 0;
uintptr_t line_idx = 0;
@@ -375,7 +373,6 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
uintptr_t ulii;
uint32_t item_uidx;
uint32_t pos_match_needed;
- uint32_t cur_neg_match_ct;
int32_t sorted_idx;
if (wkspace_alloc_ul_checked(&exclude_arr_new, unfiltered_ctl * sizeof(intptr_t)) ||
@@ -428,9 +425,7 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
}
}
if (neg_match_ct) {
- neg_match_ctl = (neg_match_ct + (BITCT - 1)) / BITCT;
- if (wkspace_alloc_c_checked(&sorted_neg_match, max_neg_match_len * neg_match_ct) ||
- wkspace_alloc_ul_checked(&cur_neg_matches, neg_match_ctl * sizeof(intptr_t))) {
+ if (wkspace_alloc_c_checked(&sorted_neg_match, max_neg_match_len * neg_match_ct)) {
goto filter_attrib_ret_NOMEM;
}
}
@@ -473,13 +468,15 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
}
if (neg_match_ct) {
qsort(sorted_neg_match, neg_match_ct, max_neg_match_len, strcmp_casted);
- bufptr = scan_for_duplicate_ids(sorted_neg_match, neg_match_ct, max_neg_match_len);
+ // bugfix: when multiple negative match conditions are present, presence
+ // of ANY of them is enough to disqualify the variant. So it's
+ // appropriate to ensure no duplication between positive and negative
+ // match conditions after all.
+ bufptr = scan_for_duplicate_or_overlap_ids(sorted_neg_match, neg_match_ct, max_neg_match_len, sorted_pos_match, pos_match_ct, max_pos_match_len);
if (bufptr) {
LOGPREPRINTFWW("Error: Duplicate attribute '%s' in --attrib argument.\n", bufptr);
goto filter_attrib_ret_INVALID_CMDLINE_2;
}
- // actually may make sense to have same attribute as a positive and
- // negative condition, so we don't check for that
}
}
loadbuf_size = wkspace_left;
@@ -529,10 +526,6 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
}
set_bit(already_seen, item_uidx);
pos_match_needed = pos_match_ct;
- cur_neg_match_ct = 0;
- if (neg_match_ct) {
- fill_ulong_zero(cur_neg_matches, neg_match_ctl);
- }
while (!is_eoln_kns(*cond_ptr)) {
bufptr2 = cond_ptr;
bufptr = token_endnn(cond_ptr);
@@ -540,25 +533,18 @@ int32_t filter_attrib(char* fname, char* condition_str, uint32_t* id_htable, uin
cond_ptr = skip_initial_spaces(bufptr);
if (pos_match_needed && (bsearch_str(bufptr2, ulii, sorted_pos_match, max_pos_match_len, pos_match_ct) != -1)) {
pos_match_needed = 0;
- }
- if (cur_neg_match_ct < neg_match_ct) {
+ } else if (neg_match_ct) {
sorted_idx = bsearch_str(bufptr2, ulii, sorted_neg_match, max_neg_match_len, neg_match_ct);
- if ((sorted_idx != -1) && (!is_set(cur_neg_matches, sorted_idx))) {
- cur_neg_match_ct++;
- if (cur_neg_match_ct == neg_match_ct) {
- // fail
- pos_match_needed = 1;
- break;
- }
- set_bit(cur_neg_matches, sorted_idx);
+ if (sorted_idx != -1) {
+ // fail
+ pos_match_needed = 1;
+ break;
}
}
}
if (pos_match_needed) {
continue;
}
- // full negative match causes pos_match_needed to be set, so no further
- // check required
clear_bit(exclude_arr_new, item_uidx);
include_ct++;
}
@@ -603,14 +589,12 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
unsigned char* wkspace_mark = wkspace_base;
uintptr_t include_ct = 0;
uintptr_t unfiltered_ctl = (unfiltered_ct + (BITCT - 1)) / BITCT;
- uintptr_t* cur_neg_matches = NULL;
char* sorted_pos_match = NULL;
char* sorted_neg_match = NULL;
char* id_buf = NULL;
char* bufptr2 = NULL;
uint32_t pos_match_ct = 0;
uint32_t neg_match_ct = 0;
- uint32_t neg_match_ctl = 0;
uintptr_t max_pos_match_len = 0;
uintptr_t max_neg_match_len = 0;
uintptr_t line_idx = 0;
@@ -627,7 +611,6 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
uintptr_t ulii;
uint32_t unfiltered_idx;
uint32_t pos_match_needed;
- uint32_t cur_neg_match_ct;
int32_t sorted_idx;
if (wkspace_alloc_ul_checked(&exclude_arr_new, unfiltered_ctl * sizeof(intptr_t)) ||
@@ -681,9 +664,7 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
}
}
if (neg_match_ct) {
- neg_match_ctl = (neg_match_ct + (BITCT - 1)) / BITCT;
- if (wkspace_alloc_c_checked(&sorted_neg_match, max_neg_match_len * neg_match_ct) ||
- wkspace_alloc_ul_checked(&cur_neg_matches, neg_match_ctl * sizeof(intptr_t))) {
+ if (wkspace_alloc_c_checked(&sorted_neg_match, max_neg_match_len * neg_match_ct)) {
goto filter_attrib_sample_ret_NOMEM;
}
}
@@ -726,13 +707,11 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
}
if (neg_match_ct) {
qsort(sorted_neg_match, neg_match_ct, max_neg_match_len, strcmp_casted);
- bufptr = scan_for_duplicate_ids(sorted_neg_match, neg_match_ct, max_neg_match_len);
+ bufptr = scan_for_duplicate_or_overlap_ids(sorted_neg_match, neg_match_ct, max_neg_match_len, sorted_pos_match, pos_match_ct, max_pos_match_len);
if (bufptr) {
LOGPREPRINTFWW("Error: Duplicate attribute '%s' in --attrib-indiv argument.\n", bufptr);
goto filter_attrib_sample_ret_INVALID_CMDLINE_2;
}
- // actually may make sense to have same attribute as a positive and
- // negative condition, so we don't check for that
}
}
loadbuf_size = wkspace_left;
@@ -783,10 +762,6 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
set_bit(already_seen, sorted_idx);
unfiltered_idx = id_map[(uint32_t)sorted_idx];
pos_match_needed = pos_match_ct;
- cur_neg_match_ct = 0;
- if (neg_match_ct) {
- fill_ulong_zero(cur_neg_matches, neg_match_ctl);
- }
while (!is_eoln_kns(*cond_ptr)) {
bufptr2 = cond_ptr;
bufptr = token_endnn(cond_ptr);
@@ -794,25 +769,18 @@ int32_t filter_attrib_sample(char* fname, char* condition_str, char* sorted_ids,
cond_ptr = skip_initial_spaces(bufptr);
if (pos_match_needed && (bsearch_str(bufptr2, ulii, sorted_pos_match, max_pos_match_len, pos_match_ct) != -1)) {
pos_match_needed = 0;
- }
- if (cur_neg_match_ct < neg_match_ct) {
+ } else if (neg_match_ct) {
sorted_idx = bsearch_str(bufptr2, ulii, sorted_neg_match, max_neg_match_len, neg_match_ct);
- if ((sorted_idx != -1) && (!is_set(cur_neg_matches, sorted_idx))) {
- cur_neg_match_ct++;
- if (cur_neg_match_ct == neg_match_ct) {
- // fail
- pos_match_needed = 1;
- break;
- }
- set_bit(cur_neg_matches, sorted_idx);
+ if (sorted_idx != -1) {
+ // fail
+ pos_match_needed = 1;
+ break;
}
}
}
if (pos_match_needed) {
continue;
}
- // full negative match causes pos_match_needed to be set, so no further
- // check required
clear_bit(exclude_arr_new, unfiltered_idx);
include_ct++;
}
@@ -1547,7 +1515,7 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
FILE* outfile = NULL;
uint32_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
- uintptr_t unfiltered_sample_ct2l = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+ uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
uintptr_t marker_idx = 0;
@@ -1583,23 +1551,23 @@ int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* ou
if (y_present) {
y_start = chrom_info_ptr->chrom_start[(uint32_t)y_code];
y_end = chrom_info_ptr->chrom_end[(uint32_t)y_code];
- if (wkspace_alloc_ul_checked(&sample_male_include2, unfiltered_sample_ct2l * sizeof(intptr_t))) {
+ if (wkspace_alloc_ul_checked(&sample_male_include2, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
goto mind_filter_ret_NOMEM;
}
vec_include_init(unfiltered_sample_ct, sample_male_include2, sex_male);
nony_marker_ct = marker_ct - (y_end - y_start - popcount_bit_idx(marker_exclude, y_start, y_end));
}
if (wkspace_alloc_ui_checked(&missing_cts, unfiltered_sample_ct * sizeof(int32_t)) ||
- wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ct2l * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
wkspace_alloc_ul_checked(&newly_excluded, unfiltered_sample_ctl * sizeof(int32_t))) {
goto mind_filter_ret_NOMEM;
}
- loadbuf[unfiltered_sample_ct2l - 1] = 0;
+ loadbuf[unfiltered_sample_ctl2 - 1] = 0;
fill_uint_zero(missing_cts, unfiltered_sample_ct);
if (fseeko(bedfile, bed_offset, SEEK_SET)) {
goto mind_filter_ret_READ_FAIL;
}
- ujj = unfiltered_sample_ct2l * BITCT2;
+ ujj = unfiltered_sample_ctl2 * BITCT2;
marker_uidx = 0;
for (; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
if (IS_SET(marker_exclude, marker_uidx)) {
@@ -2700,10 +2668,10 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
logprint(" done.\n");
if (hethap_ct) {
*outname_end = '\0';
- LOGPRINTFWW("Warning: %" PRIu64 " het. haploid genotype%s present (see %s.hh ).\n", hethap_ct, (hethap_ct == 1LLU)? "" : "s", outname);
+ LOGPRINTFWW("Warning: %" PRIu64 " het. haploid genotype%s present (see %s.hh ); many commands treat these as missing.\n", hethap_ct, (hethap_ct == 1LLU)? "" : "s", outname);
}
if (nonmissing_nonmale_y) {
- logprint("Warning: Nonmissing nonmale Y chromosome genotype(s) present.\n");
+ logprint("Warning: Nonmissing nonmale Y chromosome genotype(s) present; many commands\ntreat these as missing.\n");
*hh_exists_ptr |= Y_FIX_NEEDED;
}
if (nonmissing_rate_tot <= 0.9999995 * ((double)((intptr_t)nonmissing_rate_tot_max))) {
@@ -2731,8 +2699,8 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_ [...]
unsigned char* wkspace_mark = wkspace_base;
uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
- uintptr_t unfiltered_sample_ct2l = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
- uintptr_t unfiltered_sample_ctv2 = (unfiltered_sample_ct2l + 1) & (~1);
+ uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+ uintptr_t unfiltered_sample_ctv2 = (unfiltered_sample_ctl2 + 1) & (~1);
uintptr_t marker_ct_y = 0;
uintptr_t* sample_male_include2 = NULL;
uint64_t* om_entry_ptr = NULL;
@@ -2842,7 +2810,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
}
}
}
- ujj = unfiltered_sample_ct2l * BITCT2;
+ ujj = unfiltered_sample_ctl2 * BITCT2;
if (!cluster_ct) {
sprintf(tbuf, " CHR %%%us N_MISS N_GENO F_MISS" EOLN_STR, plink_maxsnp);
} else {
diff --git a/plink_help.c b/plink_help.c
index dc1aaf7..a09b2ff 100644
--- a/plink_help.c
+++ b/plink_help.c
@@ -382,7 +382,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" --recode <01 | 12> <23 | A{-transpose} | AD | beagle{-nomap} | bimbam{-1chr}\n"
" | compound-genotypes | fastphase{-1chr} | HV{-1chr} | lgen{-ref} |\n"
" list | oxford | rlist | structure | transpose | vcf | vcf-fid |\n"
-" vcf-iid> <tab | tabx | spacex | bgz> <include-alt>\n"
+" vcf-iid> <tab | tabx | spacex | bgz | gen-gz> <include-alt>\n"
" Create a new text fileset with all filters applied. By default, the\n"
" fileset consists of a .ped and a .map file, readable with --file.\n"
" * The '12' modifier causes A1 (usually minor) alleles to be coded as '1'\n"
@@ -416,6 +416,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" * The 'list' modifier creates a genotype-based list, while 'rlist' creates\n"
" a rare-genotype fileset.\n"
" * 'oxford' causes an Oxford-format .gen + .sample fileset to be generated.\n"
+" If you also include the 'gen-gz' modifier, the .gen file is gzipped.\n"
" * The 'structure' modifier causes a Structure-format file to be generated.\n"
" * 'transpose' creates a transposed text fileset (loadable with --tfile).\n"
" * 'vcf', 'vcf-fid', and 'vcf-iid' result in production of a VCFv4.2 file.\n"
@@ -499,11 +500,13 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" from the report.\n\n"
);
help_print("freq\tfreqx\tfrqx\tcounts", &help_ctrl, 1,
-" --freq <counts> <gz>\n"
+" --freq <counts | case-control> <gz>\n"
" --freqx <gz>\n"
" --freq generates a basic allele frequency (or count, if the 'counts'\n"
" modifier is present) report. This can be combined with --within/--family\n"
-" to produce a cluster-stratified allele frequency/count report instead.\n"
+" to produce a cluster-stratified allele frequency/count report instead, or\n"
+" the 'case-control' modifier to report case and control allele frequencies\n"
+" separately.\n"
" --freqx generates a more detailed genotype count report, designed for use\n"
" with --read-freq.\n\n"
);
@@ -850,8 +853,9 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
#ifndef NOLAPACK
help_print("pca\tmake-rel\tmake-grm\tmake-grm-gz\tmake-grm-bin", &help_ctrl, 1,
" --pca {count} <header> <tabs> <var-wts>\n"
-" Calculates a variance-standardized relationship matrix (use --make-rel or\n"
-" --make-grm-gz to dump it), and extracts the top 20 principal components.\n"
+" Calculates a variance-standardized relationship matrix (use\n"
+" --make-rel/--make-grm-gz/--make-grm-bin to dump it), and extracts the top\n"
+" 20 principal components.\n"
" * It is usually best to perform this calculation on a marker set in\n"
" approximate linkage equilibrium.\n"
" * You can change the number of PCs by passing a numeric parameter.\n"
@@ -993,9 +997,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
help_print("dosage\twrite-dosage", &help_ctrl, 1,
" --dosage [allele dosage file] <noheader> <skip0=[i]> <skip1=[j]> <skip2=[k]>\n"
" <dose1> <format=[m]> <Zout> <occur | standard-beta> <sex>\n"
+" <case-control-freqs>\n"
" --dosage [list file] list <sepheader | noheader> <skip0=[i]> <skip1=[j]>\n"
" <skip2=[k]> <dose1> <format=[m]> <Zout> <occur | standard-beta>\n"
-" <sex>\n"
+" <sex> <case-control-freqs>\n"
" --write-dosage\n"
" Process (possibly gzipped) text files with variant-major allelic dosage\n"
" data. This cannot be used with a regular input fileset; instead, you must\n"
@@ -1029,6 +1034,8 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" * 'Zout' causes the output file to be gzipped.\n"
" * Normally, an association analysis is performed. 'standard-beta' and\n"
" 'sex' behave as they are supposed to with --linear/--logistic.\n"
+" 'case-control-freqs' causes case and control allele frequencies to be\n"
+" reported separately.\n"
" * There are three alternate modes which cause the association analysis to\n"
" be skipped.\n"
" * 'occur' requests a simple variant occurrence report.\n"
@@ -1516,7 +1523,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" the listed attributes. If some attribute names in\n"
" the list are preceded by '-', they are treated as\n"
" 'negative match conditions' instead: variants with\n"
-" all the negative match attributes are removed.\n"
+" at least one negative match attribute are removed.\n"
" The first character in the list cannot be a '-', due\n"
" to how command-line parsing works; add a comma in\n"
" front to get around this.\n"
@@ -1758,7 +1765,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
);
help_print("a1-allele\treference-allele\tupdate-ref-allele\ta2-allele", &help_ctrl, 0,
" --a1-allele [f] {a1col} {IDcol} {skip} : Force alleles in the file to A1.\n"
-" --a2-allele [f] {a2col} {IDcol} {skip} : Force alleles in the file to A2.\n"
+" --a2-allele [filename] {a2col} {IDcol} {skip} :\n"
+" Force alleles in the file to A2. (\"--a2-allele [VCF filename] 4 3 '#'\",\n"
+" which scrapes reference allele assignments from a VCF file, is especially\n"
+" useful.)\n"
);
help_print("indiv-sort\tmerge\tbmerge\tmerge-list", &help_ctrl, 0,
" --indiv-sort [m] {f} : Specify FID/IID sort order. The following four modes\n"
@@ -1905,8 +1915,8 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
" calculated PCs.\n"
);
help_print("cluster\tmds-plot\tmds-cluster", &help_ctrl, 0,
-" --mds-plot [dims] <by-cluster> <eigvals> : Multidimensional scaling analysis.\n"
-" Requires --cluster.\n"
+" --mds-plot [dims] <by-cluster> <eigendecomp> <eigvals> :\n"
+" Multidimensional scaling analysis. Requires --cluster.\n"
);
#endif
help_print("cell\tmodel", &help_ctrl, 0,
diff --git a/plink_lasso.c b/plink_lasso.c
index f734f16..faf8b6e 100644
--- a/plink_lasso.c
+++ b/plink_lasso.c
@@ -48,6 +48,7 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
uintptr_t polymorphic_marker_ct = 0;
uintptr_t unselected_covar_ct = 0;
uintptr_t final_mask = get_final_mask(sample_valid_ct);
+ double lambda_min = lasso_minlambda;
uint32_t chrom_fo_idx = 0xffffffffU; // exploit overflow
uint32_t chrom_end = 0;
uint32_t is_x = 0;
@@ -64,7 +65,6 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
uintptr_t* active_set;
uintptr_t* ulptr;
double sige;
- double lambda_min;
double loghi;
double loglo;
double logdelta;
@@ -200,6 +200,36 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
col_ct = covar_ct + polymorphic_marker_ct;
col_ctl = (col_ct + (BITCT - 1)) / BITCT;
wkspace_shrink_top(data_arr, col_ct * sample_valid_ct * sizeof(double));
+ sige = sqrt(1.0 - lasso_h2 + 1.0 / ((double)((intptr_t)sample_valid_ct)));
+ zz = sige * sqrt_n_recip;
+ if (rand_matrix) {
+ prod_matrix = (double*)wkspace_alloc(WARM_START_ITERS * WARM_START_ITERS * sizeof(double));
+ fputs("\r--lasso: Initializing warm start matrix...", stdout);
+ fflush(stdout);
+ fill_double_zero(misc_arr, WARM_START_ITERS);
+ for (col_idx = 0; col_idx < col_ct;) {
+ ulii = col_idx + WARM_START_ITERS;
+ if (ulii > col_ct) {
+ ulii = col_ct;
+ }
+ // splitting this into square blocks reduces memory consumption without
+ // slowing things down (may even be faster due to locality).
+ col_major_matrix_multiply(WARM_START_ITERS, ulii - col_idx, sample_valid_ct, rand_matrix, &(data_arr[col_idx * sample_valid_ct]), prod_matrix);
+ dptr = prod_matrix;
+ for (; col_idx < ulii; col_idx++) {
+ for (uii = 0; uii < WARM_START_ITERS; uii++) {
+ dxx = fabs(*dptr++);
+ if (dxx > misc_arr[uii]) {
+ misc_arr[uii] = dxx;
+ }
+ }
+ }
+ }
+ lambda_min = destructive_get_dmedian(misc_arr, WARM_START_ITERS) * zz;
+ logstr("--lasso:");
+ LOGPRINTF(" using min lambda = %g.\n", lambda_min);
+ wkspace_reset(prod_matrix);
+ }
xhat = (double*)wkspace_alloc(col_ct * sizeof(double));
active_set = (uintptr_t*)wkspace_alloc(col_ctl * sizeof(intptr_t));
*xhat_ptr = xhat;
@@ -215,40 +245,16 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
lambda_max = dxx;
}
}
- sige = sqrt(1.0 - lasso_h2 + 1.0 / ((double)((intptr_t)sample_valid_ct)));
- zz = sige * sqrt_n_recip;
- if (rand_matrix) {
- prod_matrix = (double*)wkspace_alloc(col_ct * WARM_START_ITERS * sizeof(double));
- fputs("\r--lasso: Initializing warm start matrix...", stdout);
- fflush(stdout);
- col_major_matrix_multiply(WARM_START_ITERS, col_ct, sample_valid_ct, rand_matrix, data_arr, prod_matrix);
- for (ulii = 0; ulii < WARM_START_ITERS; ulii++) {
- dxx = 0.0;
- dptr = &(prod_matrix[ulii]);
- for (col_idx = 0; col_idx < col_ct; col_idx++) {
- dyy = fabs(dptr[col_idx * WARM_START_ITERS]);
- if (dyy > dxx) {
- dxx = dyy;
- }
- }
- misc_arr[ulii] = dxx * zz;
- }
- lambda_min = destructive_get_dmedian(misc_arr, WARM_START_ITERS);
- logstr("--lasso:");
- LOGPRINTF(" using min lambda = %g.\n", lambda_min);
- } else {
- lambda_min = lasso_minlambda;
- if (lasso_minlambda >= lambda_max) {
- logprint("\nError: min lambda >= max lambda.\n");
- goto lasso_bigmem_ret_INVALID_CMDLINE;
- }
+ if (lambda_min >= lambda_max) {
+ logprint("\nError: min lambda >= max lambda.\n");
+ goto lasso_bigmem_ret_INVALID_CMDLINE;
}
loghi = log(lambda_max);
loglo = log(lambda_min);
logdelta = (loghi - loglo) / (NLAMBDA - 1.0);
+ dptr = data_arr;
for (col_idx = 0; col_idx < col_ct; col_idx++) {
dxx = 0.0;
- dptr = &(data_arr[col_idx * sample_valid_ct]);
dptr2 = pheno_d_collapsed;
for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
dxx += (*dptr++) * (*dptr2++);
@@ -346,7 +352,60 @@ int32_t lasso_bigmem(FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_excl
return retval;
}
-/*
+uint32_t load_and_normalize(FILE* bedfile, uintptr_t* loadbuf_raw, uintptr_t unfiltered_sample_ct, uintptr_t* loadbuf_collapsed, uintptr_t sample_valid_ct, uintptr_t* pheno_nm2, uintptr_t final_mask, uint32_t do_reverse, uint32_t min_ploidy_1, uint32_t hh_or_mt_exists, uintptr_t* sample_include2, uintptr_t* sample_male_include2, uint32_t is_x, uint32_t is_y, double sqrt_n_recip, double* data_window_ptr) {
+ uintptr_t sample_valid_ctl2 = (sample_valid_ct + (BITCT2 - 1)) / BITCT2;
+ uintptr_t sample_idx = 0;
+ uintptr_t sample_idx_stop = BITCT2;
+ uintptr_t* ulptr_end_init = &(loadbuf_collapsed[sample_valid_ct / BITCT2]);
+ double cur_mapping[4];
+ uintptr_t* ulptr = loadbuf_collapsed;
+ uintptr_t* ulptr_end = ulptr_end_init;
+ uintptr_t cur_word;
+ uintptr_t cur_genotype;
+ double dxx;
+ double dyy;
+ uint32_t homrar_ct;
+ uint32_t missing_ct;
+ uint32_t het_ct;
+ uint32_t homset_ct;
+ uint32_t uii;
+ if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, pheno_nm2, final_mask, do_reverse)) {
+ return 2; // read failure
+ }
+ if (min_ploidy_1) {
+ haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, sample_valid_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ }
+ vec_3freq(sample_valid_ctl2, loadbuf_collapsed, sample_include2, &missing_ct, &het_ct, &homset_ct);
+ uii = sample_valid_ct - missing_ct;
+ homrar_ct = uii - het_ct - homset_ct;
+ if (((!homrar_ct) && ((!het_ct) || (!homset_ct))) || ((!het_ct) && (!homset_ct))) {
+ return 1; // not polymorphic
+ }
+ dyy = (double)(2 * homrar_ct + het_ct); // sum
+ dxx = dyy / ((double)((int32_t)uii)); // mean
+ dyy = sqrt_n_recip * sqrt(((double)((int32_t)(uii - 1))) / (4 * ((double)((int32_t)homrar_ct)) + ((double)((int32_t)het_ct)) - dyy * dxx)); // 1/(stdev * sqrt(n))
+ cur_mapping[0] = (2 - dxx) * dyy; // 2x minor allele
+ cur_mapping[1] = 0.0;
+ cur_mapping[2] = (1 - dxx) * dyy; // 1x minor allele
+ cur_mapping[3] = (-dxx) * dyy; // no copies of minor allele
+ while (1) {
+ while (ulptr < ulptr_end) {
+ cur_word = *ulptr++;
+ for (; sample_idx < sample_idx_stop; sample_idx++, cur_word >>= 2) {
+ cur_genotype = cur_word & 3;
+ *data_window_ptr++ = cur_mapping[cur_genotype];
+ }
+ sample_idx_stop += BITCT2;
+ }
+ if (sample_idx == sample_valid_ct) {
+ return 0;
+ }
+ ulptr_end++;
+ sample_idx_stop = sample_valid_ct;
+ }
+}
+
+
int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* pheno_nm2, double lasso_h2, double lasso_minlambda, uint32_t select_covars, uintptr_t* select_covars_bitfield, double* pheno_d_collapsed, uintptr_t covar_ct, char* covar_names, uintptr_t max_covar_name_len, uintptr_t* covar_nm, double* covar_d, uint32_t hh_or_mt_exi [...]
// Instead of populating and normalizing data_arr before the coordinate
// descent, we reload and renormalize the data every iteration.
@@ -358,28 +417,25 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
double sqrt_n_recip = sqrt(1.0 / ((double)((intptr_t)sample_valid_ct)));
double lambda_max = 0.0;
double err_cur = 0.0;
+ double lambda_min = lasso_minlambda;
uint64_t iter_tot = 0;
- uintptr_t sample_valid_ctl2 = (sample_valid_ct + (BITCT2 - 1)) / BITCT2;
- uintptr_t polymorphic_marker_ct = 0;
+ uintptr_t unselected_covar_ct = 0;
uintptr_t final_mask = get_final_mask(sample_valid_ct);
+ uintptr_t polymorphic_marker_ct = 0;
uint32_t chrom_fo_idx = 0xffffffffU; // exploit overflow
uint32_t chrom_end = 0;
uint32_t is_x = 0;
uint32_t is_y = 0;
uint32_t min_ploidy_1 = 0;
+ uint32_t partial_marker_idx = 0;
int32_t retval = 0;
- double cur_mapping[4];
+ double* prod_matrix = NULL;
double* data_window;
double* xhat;
- double* prod_matrix;
double* dptr;
double* dptr2;
- uintptr_t* ulptr_end_init;
- uintptr_t* ulptr_end;
uintptr_t* active_set;
- uintptr_t* ulptr;
double sige;
- double lambda_min;
double loghi;
double loglo;
double logdelta;
@@ -388,10 +444,7 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
double err_last;
double dxx;
double dyy;
- double dzz;
double zz;
- uintptr_t cur_word;
- uintptr_t cur_genotype;
uintptr_t ulii;
uintptr_t iter;
uintptr_t col_ct;
@@ -401,55 +454,79 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
uintptr_t col_to_z;
uintptr_t col_uidx;
uintptr_t covar_idx;
- uintptr_t sample_uidx;
uintptr_t sample_idx;
- uintptr_t sample_idx_stop;
uintptr_t marker_idx;
uint32_t lambi;
uint32_t marker_uidx;
- uint32_t homrar_ct;
- uint32_t missing_ct;
- uint32_t het_ct;
- uint32_t homset_ct;
+ uint32_t last_marker_uidx;
uint32_t uii;
- cur_mapping[1] = 0; // missing
if (fseeko(bedfile, bed_offset, SEEK_SET)) {
goto lasso_smallmem_ret_READ_FAIL;
}
- ulptr_end_init = &(loadbuf_collapsed[sample_valid_ct / BITCT2]);
fputs("Using memory-conserving LASSO implementation.\n", stdout);
if (covar_ct) {
- if (wkspace_alloc_d_checked()) {
+ if (wkspace_alloc_d_checked(&covar_data_arr, covar_ct * sample_valid_ct * sizeof(double))) {
goto lasso_smallmem_ret_NOMEM;
}
- dptr = covar_data_arr;
- for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
- dxx = 0; // sum
- dyy = 0; // ssq
- dptr2 = &(covar_d[covar_idx]);
- for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_valid_ct; sample_uidx++, sample_idx++) {
- next_set_ul_unsafe_ck(covar_nm, &sample_uidx);
- dzz = dptr2[sample_uidx * covar_ct];
- dxx += dzz;
- dyy += dzz * dzz;
- *dptr++ = dzz;
+ dxx = 1.0 / ((double)((intptr_t)sample_valid_ct));
+ dyy = (double)((intptr_t)(sample_valid_ct - 1));
+ if (!select_covars_bitfield) {
+ for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
+ if (transpose_covar(sample_valid_ct, covar_ct, covar_nm, &(covar_d[covar_idx]), &(covar_data_arr[covar_idx * sample_valid_ct]), sqrt_n_recip, dxx, dyy)) {
+ goto lasso_smallmem_ret_CONST_COVAR;
+ }
}
- if (dyy * ((double)sample_valid_ct) == dxx * dxx) {
- logprint("Error: --lasso covariate is constant.\n");
- goto lasso_smallmem_ret_INVALID_CMDLINE;
+ if (!select_covars) {
+ unselected_covar_ct = covar_ct;
}
- dzz = dxx / ((double)((intptr_t)sample_valid_ct));
- dyy = sqrt_n_recip * sqrt(((double)((intptr_t)(sample_valid_ct - 1))) / (dyy - dxx * dzz));
- dptr = &(data_arr[covar_idx * sample_valid_ct]);
- for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
- *dptr = ((*dptr) - dzz) * dyy;
- dptr++;
+ } else {
+ ulii = 0;
+ for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
+ if (IS_SET(select_covars_bitfield, covar_idx)) {
+ continue;
+ }
+ if (transpose_covar(sample_valid_ct, covar_ct, covar_nm, &(covar_d[covar_idx]), &(covar_data_arr[ulii * sample_valid_ct]), sqrt_n_recip, dxx, dyy)) {
+ goto lasso_smallmem_ret_CONST_COVAR;
+ }
+ ulii++;
+ }
+ unselected_covar_ct = ulii;
+ for (covar_idx = 0; covar_idx < covar_ct; covar_idx++) {
+ if (!IS_SET(select_covars_bitfield, covar_idx)) {
+ continue;
+ }
+ if (transpose_covar(sample_valid_ct, covar_ct, covar_nm, &(covar_d[covar_idx]), &(covar_data_arr[ulii * sample_valid_ct]), sqrt_n_recip, dxx, dyy)) {
+ goto lasso_smallmem_ret_CONST_COVAR;
+ }
+ ulii++;
}
}
}
- dptr = &(data_arr[covar_ct * sample_valid_ct]);
+ sige = sqrt(1.0 - lasso_h2 + 1.0 / ((double)((intptr_t)sample_valid_ct)));
+ zz = sige * sqrt_n_recip;
+ // put this on top of the permanent stack portion so we can shrink it when we
+ // know the true column count
+ if (wkspace_alloc_d_checked(&xhat, (covar_ct + marker_ct) * sizeof(double))) {
+ goto lasso_smallmem_ret_NOMEM;
+ }
+ if (rand_matrix) {
+ if (wkspace_alloc_d_checked(&data_window, sample_valid_ct * WARM_START_ITERS * sizeof(double)) ||
+ wkspace_alloc_d_checked(&prod_matrix, WARM_START_ITERS * WARM_START_ITERS * sizeof(double))) {
+ goto lasso_smallmem_ret_NOMEM;
+ }
+ fputs("\r--lasso: Initializing warm start matrix...", stdout);
+ fflush(stdout);
+ fill_double_zero(misc_arr, WARM_START_ITERS);
+ } else {
+ if (wkspace_alloc_d_checked(&data_window, sample_valid_ct * sizeof(double))) {
+ goto lasso_smallmem_ret_NOMEM;
+ }
+ }
+ dptr = data_window;
for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+ // count polymorphic markers here, compute xhat[] and lambda_max, perform
+ // rand_matrix multiply if necessary
if (IS_SET(marker_exclude, marker_uidx)) {
marker_uidx = next_unset_unsafe(marker_exclude, marker_uidx);
if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
@@ -460,109 +537,85 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
chrom_fo_idx++;
refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
}
- if (load_and_collapse_incl(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, pheno_nm2, final_mask, IS_SET(marker_reverse, marker_uidx))) {
+ uii = load_and_normalize(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, pheno_nm2, final_mask, IS_SET(marker_reverse, marker_uidx), min_ploidy_1, hh_or_mt_exists, sample_include2, sample_male_include2, is_x, is_y, sqrt_n_recip, dptr);
+ if (uii == 2) {
goto lasso_smallmem_ret_READ_FAIL;
}
- if (min_ploidy_1) {
- haploid_fix(hh_or_mt_exists, sample_include2, sample_male_include2, sample_valid_ct, is_x, is_y, (unsigned char*)loadbuf_collapsed);
+ if (uii == 1) {
+ continue;
}
- vec_3freq(sample_valid_ctl2, loadbuf_collapsed, sample_include2, &missing_ct, &het_ct, &homset_ct);
- uii = sample_valid_ct - missing_ct;
- homrar_ct = uii - het_ct - homset_ct;
- if (!(((!homrar_ct) && ((!het_ct) || (!homset_ct))) || ((!het_ct) && (!homset_ct)))) {
- // ok, not monomorphic. standardize to zero mean, unit variance
- SET_BIT(polymorphic_markers, marker_uidx);
- dyy = (double)(2 * homrar_ct + het_ct); // sum
- dxx = dyy / ((double)((int32_t)uii)); // mean
- dyy = sqrt_n_recip * sqrt(((double)((int32_t)(uii - 1))) / (4 * ((double)((int32_t)homrar_ct)) + ((double)((int32_t)het_ct)) - dyy * dxx)); // 1/(stdev * sqrt(n))
- cur_mapping[0] = (2 - dxx) * dyy; // 2x minor allele
- cur_mapping[2] = (1 - dxx) * dyy; // 1x minor allele
- cur_mapping[3] = (-dxx) * dyy; // no copies of minor allele
- sample_idx = 0;
- sample_idx_stop = BITCT2;
- ulptr = loadbuf_collapsed;
- ulptr_end = ulptr_end_init;
- while (1) {
- while (ulptr < ulptr_end) {
- cur_word = *ulptr++;
- for (; sample_idx < sample_idx_stop; sample_idx++, cur_word >>= 2) {
- cur_genotype = cur_word & 3;
- *dptr++ = cur_mapping[cur_genotype];
+ SET_BIT(polymorphic_markers, marker_uidx);
+ dxx = 0.0;
+ for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
+ dxx += dptr[sample_idx] * pheno_d_collapsed[sample_idx];
+ }
+ xhat[covar_ct + polymorphic_marker_ct + partial_marker_idx] = dxx;
+ partial_marker_idx++;
+ dxx = fabs(dxx);
+ if (dxx > lambda_max) {
+ lambda_max = dxx;
+ }
+ if (partial_marker_idx == WARM_START_ITERS) {
+ if (rand_matrix) {
+ col_major_matrix_multiply(WARM_START_ITERS, WARM_START_ITERS, sample_valid_ct, rand_matrix, data_window, prod_matrix);
+ dptr = prod_matrix;
+ for (col_idx = 0; col_idx < WARM_START_ITERS; col_idx++) {
+ for (ulii = 0; ulii < WARM_START_ITERS; ulii++) {
+ dxx = fabs(*dptr++);
+ if (dxx > misc_arr[ulii]) {
+ misc_arr[ulii] = dxx;
+ }
}
- sample_idx_stop += BITCT2;
}
- if (sample_idx == sample_valid_ct) {
- break;
- }
- ulptr_end++;
- sample_idx_stop = sample_valid_ct;
}
- polymorphic_marker_ct++;
+ polymorphic_marker_ct += WARM_START_ITERS;
+ partial_marker_idx = 0;
+ dptr = data_window;
+ } else if (!rand_matrix) {
+ dptr = data_window;
+ } else {
+ dptr = &(dptr[sample_valid_ct]);
}
}
- *polymorphic_marker_ct_ptr = polymorphic_marker_ct;
- if (!polymorphic_marker_ct) {
+ if (!(polymorphic_marker_ct + partial_marker_idx)) {
logprint("Warning: Skipping --lasso since no polymorphic markers are present.\n");
return 0;
}
- col_ct = covar_ct + polymorphic_marker_ct;
- col_ctl = (col_ct + (BITCT - 1)) / BITCT;
- wkspace_shrink_top(data_arr, col_ct * sample_valid_ct * sizeof(double));
- xhat = (double*)wkspace_alloc(col_ct * sizeof(double));
- active_set = (uintptr_t*)wkspace_alloc(col_ctl * sizeof(intptr_t));
- *xhat_ptr = xhat;
- dptr = data_arr;
- for (col_idx = 0; col_idx < col_ct; col_idx++) {
- dxx = 0;
- dptr2 = pheno_d_collapsed;
- for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
- dxx += (*dptr++) * (*dptr2++);
- }
- dxx = fabs(dxx);
- if (dxx > lambda_max) {
- lambda_max = dxx;
- }
- }
- sige = sqrt(1.0 - lasso_h2 + 1.0 / ((double)((intptr_t)sample_valid_ct)));
- zz = sige * sqrt_n_recip;
if (rand_matrix) {
- prod_matrix = (double*)wkspace_alloc(col_ct * WARM_START_ITERS * sizeof(double));
- fputs("\r--lasso: Initializing warm start matrix...", stdout);
- fflush(stdout);
- col_major_matrix_multiply(WARM_START_ITERS, col_ct, sample_valid_ct, rand_matrix, data_arr, prod_matrix);
- for (ulii = 0; ulii < WARM_START_ITERS; ulii++) {
- dxx = 0.0;
- dptr = &(prod_matrix[ulii]);
- for (col_idx = 0; col_idx < col_ct; col_idx++) {
- dyy = fabs(dptr[col_idx * WARM_START_ITERS]);
- if (dyy > dxx) {
- dxx = dyy;
+ if (partial_marker_idx) {
+ col_major_matrix_multiply(WARM_START_ITERS, partial_marker_idx, sample_valid_ct, rand_matrix, data_window, prod_matrix);
+ dptr = prod_matrix;
+ for (col_idx = 0; col_idx < partial_marker_idx; col_idx++) {
+ for (ulii = 0; ulii < WARM_START_ITERS; ulii++) {
+ dxx = fabs(*dptr++);
+ if (dxx > misc_arr[ulii]) {
+ misc_arr[ulii] = dxx;
+ }
}
}
- misc_arr[ulii] = dxx * zz;
}
- lambda_min = destructive_get_dmedian(misc_arr, WARM_START_ITERS);
+ lambda_min = destructive_get_dmedian(misc_arr, WARM_START_ITERS) * zz;
logstr("--lasso:");
LOGPRINTF(" using min lambda = %g.\n", lambda_min);
- } else {
- lambda_min = lasso_minlambda;
- if (lasso_minlambda >= lambda_max) {
- logprint("\nError: min lambda >= max lambda.\n");
- goto lasso_smallmem_ret_INVALID_CMDLINE;
- }
+ }
+ polymorphic_marker_ct += partial_marker_idx;
+ *polymorphic_marker_ct_ptr = polymorphic_marker_ct;
+ col_ct = covar_ct + polymorphic_marker_ct;
+ wkspace_reset(data_window);
+ wkspace_shrink_top(xhat, col_ct * sizeof(double));
+ col_ctl = (col_ct + (BITCT - 1)) / BITCT;
+ *xhat_ptr = xhat;
+ if (lambda_min >= lambda_max) {
+ logprint("\nError: min lambda >= max lambda.\n");
+ goto lasso_smallmem_ret_INVALID_CMDLINE;
+ }
+ if (wkspace_alloc_ul_checked(&active_set, col_ctl * sizeof(intptr_t)) ||
+ wkspace_alloc_d_checked(&data_window, sample_valid_ct * sizeof(double))) {
+ goto lasso_smallmem_ret_NOMEM;
}
loghi = log(lambda_max);
loglo = log(lambda_min);
logdelta = (loghi - loglo) / (NLAMBDA - 1.0);
- for (col_idx = 0; col_idx < col_ct; col_idx++) {
- dxx = 0.0;
- dptr = &(data_arr[col_idx * sample_valid_ct]);
- dptr2 = pheno_d_collapsed;
- for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
- dxx += (*dptr++) * (*dptr2++);
- }
- xhat[col_idx] = dxx;
- }
fputs("\r--lasso: Executing coordinate descent... ", stdout);
for (lambi = 0; lambi < NLAMBDA; lambi++) {
if (lambi > 10) {
@@ -574,31 +627,80 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
fflush(stdout);
lambda = exp(loghi - logdelta * ((double)((int32_t)lambi)));
memcpy(residuals, pheno_d_collapsed, sample_valid_ct * sizeof(double));
- for (col_idx = 0; col_idx < col_ct; col_idx++) {
+ for (col_idx = 0; col_idx < covar_ct; col_idx++) {
dptr = residuals;
- dptr2 = &(data_arr[col_idx * sample_valid_ct]);
+ dptr2 = &(covar_data_arr[col_idx * sample_valid_ct]);
dxx = -xhat[col_idx];
for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
*dptr += (*dptr2++) * dxx;
dptr++;
}
}
+ if (fseeko(bedfile, bed_offset, SEEK_SET)) {
+ goto lasso_smallmem_ret_READ_FAIL;
+ }
+ for (marker_uidx = 0, marker_idx = 0, chrom_end = 0, chrom_fo_idx = 0xffffffffU; marker_idx < polymorphic_marker_ct; marker_uidx++, marker_idx++) {
+ if (!IS_SET(polymorphic_markers, marker_uidx)) {
+ marker_uidx = next_set_unsafe(polymorphic_markers, marker_uidx);
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto lasso_smallmem_ret_READ_FAIL;
+ }
+ }
+ if (marker_uidx >= chrom_end) {
+ chrom_fo_idx++;
+ refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
+ }
+ if (load_and_normalize(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, pheno_nm2, final_mask, IS_SET(marker_reverse, marker_uidx), min_ploidy_1, hh_or_mt_exists, sample_include2, sample_male_include2, is_x, is_y, sqrt_n_recip, data_window)) {
+ goto lasso_smallmem_ret_READ_FAIL;
+ }
+ dptr = residuals;
+ dptr2 = data_window;
+ dxx = -xhat[marker_idx + covar_ct];
+ for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
+ *dptr += (*dptr2++) * dxx;
+ dptr++;
+ }
+ }
iter = 0;
fill_all_bits(active_set, col_ct);
col_nz_ct = col_ct;
while (1) {
col_uidx = 0;
col_to_z = 0;
- for (marker_idx = 0; marker_idx < col_nz_ct; marker_idx++, col_uidx++) {
- col_uidx = next_set_unsafe(active_set, col_uidx);
+ for (marker_uidx = 0, last_marker_uidx = 0xfffffffeU, chrom_end = 0, chrom_fo_idx = 0xffffffffU, marker_idx = 0; marker_idx < col_nz_ct; col_uidx++) {
+ if (col_uidx < covar_ct) {
+ if (!(IS_SET(active_set, col_uidx))) {
+ continue;
+ }
+ dptr = &(covar_data_arr[col_uidx * sample_valid_ct]);
+ } else {
+ marker_uidx = next_set_unsafe(polymorphic_markers, marker_uidx);
+ if (!(IS_SET(active_set, col_uidx))) {
+ marker_uidx++;
+ continue;
+ }
+ if (marker_uidx != last_marker_uidx + 1) {
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto lasso_smallmem_ret_READ_FAIL;
+ }
+ }
+ if (marker_uidx >= chrom_end) {
+ chrom_fo_idx++;
+ refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &uii, &min_ploidy_1);
+ }
+ if (load_and_normalize(bedfile, loadbuf_raw, unfiltered_sample_ct, loadbuf_collapsed, sample_valid_ct, pheno_nm2, final_mask, IS_SET(marker_reverse, marker_uidx), min_ploidy_1, hh_or_mt_exists, sample_include2, sample_male_include2, is_x, is_y, sqrt_n_recip, data_window)) {
+ goto lasso_smallmem_ret_READ_FAIL;
+ }
+ dptr = data_window;
+ }
+ marker_idx++;
xjold = xhat[col_uidx];
dxx = xjold;
- dptr = &(data_arr[col_uidx * sample_valid_ct]);
dptr2 = residuals;
for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
dxx += (*dptr++) * (*dptr2++);
}
- if (col_uidx >= covar_ct) {
+ if (col_uidx >= unselected_covar_ct) {
if (dxx > 0.0) {
dxx = MAXV(dxx - lambda, 0.0);
} else {
@@ -611,7 +713,13 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
col_to_z++;
}
dptr = residuals;
- dptr2 = &(data_arr[col_uidx * sample_valid_ct]);
+ if (col_uidx < covar_ct) {
+ dptr2 = &(covar_data_arr[col_uidx * sample_valid_ct]);
+ } else {
+ dptr2 = data_window;
+ last_marker_uidx = marker_uidx;
+ marker_uidx++;
+ }
dxx -= xjold;
for (sample_idx = 0; sample_idx < sample_valid_ct; sample_idx++) {
*dptr -= (*dptr2++) * dxx;
@@ -621,8 +729,7 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
col_nz_ct -= col_to_z;
err_last = err_cur;
err_cur = 0.0;
- col_uidx = 0;
- for (marker_idx = 0; marker_idx < col_nz_ct; marker_idx++, col_uidx++) {
+ for (col_uidx = 0, marker_idx = 0; marker_idx < col_nz_ct; marker_idx++, col_uidx++) {
col_uidx = next_set_unsafe(active_set, col_uidx);
err_cur += fabs(xhat[col_uidx]);
}
@@ -648,13 +755,14 @@ int32_t lasso_smallmem(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset,
lasso_smallmem_ret_READ_FAIL:
retval = RET_READ_FAIL;
break;
+ lasso_smallmem_ret_CONST_COVAR:
+ logprint("Error: --lasso covariate is constant.\n");
lasso_smallmem_ret_INVALID_CMDLINE:
retval = RET_INVALID_CMDLINE;
break;
}
return retval;
}
-*/
int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t* marker_reverse, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t pheno_nm_ct, double lasso_h2, double lasso_minlambda, Range_list* select_covars_range_list_ptr, uint64_t misc_flags, uintptr_t* pheno_nm [...]
// Coordinate descent LASSO. Based on a MATLAB script by Shashaank
@@ -819,25 +927,31 @@ int32_t lasso(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* out
// maximum size of remaining allocations, for memory hog mode (col_ct is at
// most marker_ct + covar_ct):
// 1. data_arr: col_ct * sample_valid_ct * sizeof(double)
- // 2. prod_matrix: col_ct * WARM_START_ITERS * sizeof(double)
- // 3. xhat: col_ct * sizeof(double)
- // 4. active_set: col_ctl * sizeof(intptr_t)
- ullii = CACHEALIGN(((uint64_t)uii) * sample_valid_ct * sizeof(double)) + CACHEALIGN(((uint64_t)uii) * sizeof(double)) + CACHEALIGN(((uii + 7) / 8));
+ // plus
+ // 2a. xhat: col_ct * sizeof(double)
+ // 2b. active_set: col_ctl * sizeof(intptr_t)
+ // or
+ // 3. prod_matrix: WARM_START_ITERS * WARM_START_ITERS * sizeof(double)
+ // (whichever is larger)
+ ullii = CACHEALIGN(((uint64_t)uii) * sizeof(double)) + CACHEALIGN(((uii + 7) / 8));
// assumes WARM_START_ITERS is even
if (rand_matrix) {
uljj = (sample_valid_ct * WARM_START_ITERS) - 1;
for (ulii = 0; ulii < uljj; ulii += 2) {
rand_matrix[ulii] = rand_normal(&(rand_matrix[ulii + 1]));
}
- ullii += CACHEALIGN(((uint64_t)uii) * WARM_START_ITERS * sizeof(double)) + CACHEALIGN(((uint64_t)uii) * sizeof(double)) + CACHEALIGN(((uii + 7) / 8));
+ if (ullii < CACHEALIGN(WARM_START_ITERS * WARM_START_ITERS * sizeof(double))) {
+ ullii = CACHEALIGN(WARM_START_ITERS * WARM_START_ITERS * sizeof(double));
+ }
}
+ ullii += CACHEALIGN(((uint64_t)uii) * sample_valid_ct * sizeof(double));
if (ullii <= wkspace_left) {
retval = lasso_bigmem(bedfile, bed_offset, marker_exclude, marker_ct, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, pheno_nm2, lasso_h2, lasso_minlambda, select_covars, select_covars_bitfield, pheno_d_collapsed, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d, hh_or_mt_exists, sample_valid_ct, sample_include2, sample_male_include2, loadbuf_raw, loadbuf_collapsed, rand_matrix, misc_arr, residuals, polymorphic_markers, &polymorphic_marker_ct, &iter_tot, &xhat);
} else {
- // retval = lasso_smallmem(threads, bedfile, bed_offset, marker_exclude, marker_ct, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, pheno_nm2, lasso_h2, lasso_minlambda, select_covars, select_covars_bitfield, pheno_d_collapsed, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d, hh_or_mt_exists, sample_valid_ct, sample_include2, sample_male_include2, loadbuf_raw, loadbuf_collapsed, rand_matrix, misc_arr, residuals, polymorphic_markers, &polymorphic_marker_ct, &iter_to [...]
- retval = RET_NOMEM;
+ retval = lasso_smallmem(threads, bedfile, bed_offset, marker_exclude, marker_ct, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, pheno_nm2, lasso_h2, lasso_minlambda, select_covars, select_covars_bitfield, pheno_d_collapsed, covar_ct, covar_names, max_covar_name_len, covar_nm, covar_d, hh_or_mt_exists, sample_valid_ct, sample_include2, sample_male_include2, loadbuf_raw, loadbuf_collapsed, rand_matrix, misc_arr, residuals, polymorphic_markers, &polymorphic_marker_ct, &iter_tot, &xhat);
+ // retval = RET_NOMEM;
}
- if (retval) {
+ if (retval || (!polymorphic_marker_ct)) {
goto lasso_ret_1;
}
memcpy(outname_end, ".lasso", 7);
diff --git a/plink_ld.c b/plink_ld.c
index fb97c82..97eaa9c 100644
--- a/plink_ld.c
+++ b/plink_ld.c
@@ -13135,6 +13135,7 @@ int32_t clump_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char*
}
window_data_ptr = &(window_data_ptr[founder_ctv2]);
}
+ next_unset_unsafe_ck(marker_exclude, &marker_uidx);
if (fseeko(bedfile, bed_offset + marker_uidx * ((uint64_t)unfiltered_sample_ct4), SEEK_SET)) {
goto clump_reports_ret_READ_FAIL;
}
diff --git a/plink_matrix.h b/plink_matrix.h
index 3e48035..f95bf69 100644
--- a/plink_matrix.h
+++ b/plink_matrix.h
@@ -92,6 +92,13 @@ extern "C" {
__CLPK_integer* isuppz, __CLPK_doublereal* work,
__CLPK_integer* lwork, __CLPK_integer* iwork,
__CLPK_integer* liwork, __CLPK_integer* info);
+
+ void dgesdd_(char* jobs, __CLPK_integer* m, __CLPK_integer* n,
+ __CLPK_doublereal* a, __CLPK_integer* lda, __CLPK_doublereal* s,
+ __CLPK_doublereal* u, __CLPK_integer* ldu,
+ __CLPK_doublereal* vt, __CLPK_integer* ldvt,
+ __CLPK_doublereal* work, __CLPK_integer* lwork,
+ __CLPK_integer* iwork, __CLPK_integer* info);
#endif
void xerbla_(void);
diff --git a/plink_misc.c b/plink_misc.c
index 9ec831d..14eb7e2 100644
--- a/plink_misc.c
+++ b/plink_misc.c
@@ -2619,6 +2619,189 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
return retval;
}
+int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uintptr_t* marker_reverse, uintptr_t* pheno_nm, uintptr_t* ph [...]
+ unsigned char* wkspace_mark = wkspace_base;
+ char* pzwritep = NULL;
+ uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
+ uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+ uintptr_t* loadbuf = NULL;
+ uintptr_t* case_include2 = NULL;
+ uintptr_t* ctrl_include2 = NULL;
+ uintptr_t* male_vec = NULL;
+ uintptr_t* nonmale_vec = NULL;
+ int32_t chrom_code_end = chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct;
+ int32_t retval = 0;
+ Pigz_state ps;
+ unsigned char* overflow_buf;
+ uintptr_t* ulptr;
+ char* bufptr;
+ uintptr_t chrom_end;
+ uintptr_t marker_uidx;
+ uint32_t is_x;
+ uint32_t is_y;
+ uint32_t is_haploid;
+ int32_t chrom_idx;
+ uint32_t case_ct;
+ uint32_t ctrl_ct;
+ uint32_t case_set_ct;
+ uint32_t ctrl_set_ct;
+ uint32_t case_missing_ct;
+ uint32_t ctrl_missing_ct;
+ uint32_t case_obs;
+ uint32_t ctrl_obs;
+ uint32_t uii;
+ pzwrite_init_null(&ps);
+ if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + max_marker_id_len + 256)) {
+ goto write_cc_freqs_ret_NOMEM;
+ }
+
+ if (wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&case_include2, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&ctrl_include2, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&male_vec, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+ wkspace_alloc_ul_checked(&nonmale_vec, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+ goto write_cc_freqs_ret_NOMEM;
+ }
+ loadbuf[unfiltered_sample_ctl2 - 1] = 0;
+ vec_include_init(unfiltered_sample_ct, case_include2, pheno_c);
+ vec_include_init(unfiltered_sample_ct, ctrl_include2, pheno_nm);
+ memcpy(nonmale_vec, ctrl_include2, unfiltered_sample_ctl2 * sizeof(intptr_t));
+ bitfield_andnot(ctrl_include2, case_include2, unfiltered_sample_ctl2);
+ vec_include_init(unfiltered_sample_ct, male_vec, sex_male);
+ bitfield_andnot(nonmale_vec, male_vec, unfiltered_sample_ctl2);
+ if (!nonfounders) {
+ if (wkspace_alloc_ul_checked(&ulptr, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+ goto write_cc_freqs_ret_NOMEM;
+ }
+ vec_include_init(unfiltered_sample_ct, ulptr, founder_info);
+ bitfield_and(case_include2, ulptr, unfiltered_sample_ctl2);
+ bitfield_and(ctrl_include2, ulptr, unfiltered_sample_ctl2);
+ bitfield_and(male_vec, ulptr, unfiltered_sample_ctl2);
+ bitfield_and(nonmale_vec, ulptr, unfiltered_sample_ctl2);
+ wkspace_reset(ulptr);
+ }
+ case_ct = popcount2_longs(case_include2, unfiltered_sample_ctl2);
+ ctrl_ct = popcount2_longs(ctrl_include2, unfiltered_sample_ctl2);
+
+ bufptr = memcpya(outname_end, ".frq.cc", 7);
+ if (!output_gz) {
+ *bufptr = '\0';
+ } else {
+ memcpy(bufptr, ".gz", 4);
+ }
+ if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+ goto write_cc_freqs_ret_OPEN_FAIL;
+ }
+ pzwritep = (char*)overflow_buf;
+ pzwritep = memcpya(pzwritep, " CHR ", 6);
+ if (plink_maxsnp >= 5) {
+ pzwritep = memseta(pzwritep, 32, plink_maxsnp - 4);
+ }
+ pzwritep = strcpya(pzwritep, "SNP A1 A2 MAF_A MAF_U NCHROBS_A NCHROBS_U" EOLN_STR);
+
+ for (chrom_idx = 0; chrom_idx < chrom_code_end; chrom_idx++) {
+ if (!chrom_exists(chrom_info_ptr, chrom_idx)) {
+ continue;
+ }
+ is_x = (chrom_idx == chrom_info_ptr->x_code);
+ is_y = (chrom_idx == chrom_info_ptr->y_code);
+ is_haploid = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
+ chrom_end = chrom_info_ptr->chrom_end[chrom_idx];
+ marker_uidx = next_unset_ul(marker_exclude, chrom_info_ptr->chrom_start[chrom_idx], chrom_end);
+ if (marker_uidx >= chrom_end) {
+ continue;
+ }
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto write_cc_freqs_ret_READ_FAIL;
+ }
+ do {
+ pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx));
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(4, marker_allele_ptrs[marker_uidx * 2], pzwritep);
+ *pzwritep++ = ' ';
+ pzwritep = fw_strcpy(4, marker_allele_ptrs[marker_uidx * 2 + 1], pzwritep);
+ *pzwritep++ = ' ';
+
+ if (load_raw(bedfile, loadbuf, unfiltered_sample_ct4)) {
+ goto write_cc_freqs_ret_READ_FAIL;
+ }
+ if (IS_SET(marker_reverse, marker_uidx)) {
+ reverse_loadbuf((unsigned char*)loadbuf, unfiltered_sample_ct);
+ }
+ if (is_x) {
+ vec_set_freq_x(unfiltered_sample_ctl2, loadbuf, case_include2, male_vec, &case_set_ct, &case_missing_ct);
+ vec_set_freq_x(unfiltered_sample_ctl2, loadbuf, ctrl_include2, male_vec, &ctrl_set_ct, &ctrl_missing_ct);
+ case_obs = 2 * case_ct - case_missing_ct;
+ ctrl_obs = 2 * ctrl_ct - ctrl_missing_ct;
+ } else if (!is_haploid) {
+ vec_set_freq(unfiltered_sample_ctl2, loadbuf, case_include2, &case_set_ct, &case_missing_ct);
+ vec_set_freq(unfiltered_sample_ctl2, loadbuf, ctrl_include2, &ctrl_set_ct, &ctrl_missing_ct);
+ case_obs = 2 * (case_ct - case_missing_ct);
+ ctrl_obs = 2 * (ctrl_ct - ctrl_missing_ct);
+ } else {
+ if (is_y) {
+ vec_set_freq_y(unfiltered_sample_ctl2, loadbuf, case_include2, nonmale_vec, &case_set_ct, &case_missing_ct);
+ vec_set_freq_y(unfiltered_sample_ctl2, loadbuf, ctrl_include2, nonmale_vec, &ctrl_set_ct, &ctrl_missing_ct);
+ } else {
+ vec_3freq(unfiltered_sample_ctl2, loadbuf, case_include2, &case_missing_ct, &uii, &case_set_ct);
+ case_missing_ct += uii;
+ vec_3freq(unfiltered_sample_ctl2, loadbuf, ctrl_include2, &ctrl_missing_ct, &uii, &ctrl_set_ct);
+ ctrl_missing_ct += uii;
+ }
+ case_obs = case_ct - case_missing_ct;
+ ctrl_obs = ctrl_ct - ctrl_missing_ct;
+ }
+ if (case_obs) {
+ pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)(case_obs - case_set_ct))) / ((double)case_obs), 12, ' ');
+ } else {
+ pzwritep = memcpya(pzwritep, " NA ", 11);
+ }
+ if (ctrl_obs) {
+ pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)(ctrl_obs - ctrl_set_ct))) / ((double)ctrl_obs), 12, ' ');
+ } else {
+ pzwritep = memcpya(pzwritep, " NA ", 11);
+ }
+
+ pzwritep = uint32_writew10x(pzwritep, case_obs, ' ');
+ pzwritep = uint32_writew10(pzwritep, ctrl_obs);
+ append_binary_eoln(&pzwritep);
+ if (flex_pzwrite(&ps, &pzwritep)) {
+ goto write_cc_freqs_ret_WRITE_FAIL;
+ }
+ marker_uidx++;
+ if (IS_SET(marker_exclude, marker_uidx)) {
+ marker_uidx = next_unset_ul(marker_exclude, marker_uidx, chrom_end);
+ if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+ goto write_cc_freqs_ret_WRITE_FAIL;
+ }
+ }
+ } while (marker_uidx < chrom_end);
+ }
+ if (flex_pzwrite_close_null(&ps, pzwritep)) {
+ goto write_cc_freqs_ret_WRITE_FAIL;
+ }
+ LOGPRINTFWW("--freq case-control: Phenotype-stratified allele frequencies (%s) written to %s .\n", nonfounders? "all samples" : "founders only", outname);
+ while (0) {
+ write_cc_freqs_ret_NOMEM:
+ retval = RET_NOMEM;
+ break;
+ write_cc_freqs_ret_OPEN_FAIL:
+ retval = RET_OPEN_FAIL;
+ break;
+ write_cc_freqs_ret_READ_FAIL:
+ retval = RET_READ_FAIL;
+ break;
+ write_cc_freqs_ret_WRITE_FAIL:
+ retval = RET_WRITE_FAIL;
+ break;
+ }
+ flex_pzwrite_close_cond(&ps, pzwritep);
+ wkspace_reset(wkspace_mark);
+ return retval;
+}
+
int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
unsigned char* wkspace_mark = wkspace_base;
char* pzwritep = NULL;
@@ -2664,11 +2847,16 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
pzwritep = (char*)overflow_buf;
if (freqx) {
pzwritep = strcpya(pzwritep, "CHR\tSNP\tA1\tA2\tC(HOM A1)\tC(HET)\tC(HOM A2)\tC(HAP A1)\tC(HAP A2)\tC(MISSING)" EOLN_STR);
- } else if (plink_maxsnp < 5) {
- pzwritep = strcpya(pzwritep, freq_counts? (" CHR SNP A1 A2 C1 C2 G0" EOLN_STR) : (" CHR SNP A1 A2 MAF NCHROBS" EOLN_STR));
} else {
- sprintf(tbuf, freq_counts? (" CHR %%%us A1 A2 C1 C2 G0" EOLN_STR) : (" CHR %%%us A1 A2 MAF NCHROBS" EOLN_STR), plink_maxsnp);
- pzwritep += sprintf(pzwritep, tbuf, "SNP");
+ pzwritep = memcpya(pzwritep, " CHR ", 6);
+ if (plink_maxsnp >= 5) {
+ pzwritep = memseta(pzwritep, 32, plink_maxsnp - 4);
+ }
+ if (freq_counts) {
+ pzwritep = strcpya(pzwritep, "SNP A1 A2 C1 C2 G0" EOLN_STR);
+ } else {
+ pzwritep = strcpya(pzwritep, "SNP A1 A2 MAF NCHROBS" EOLN_STR);
+ }
}
for (chrom_idx = 0; chrom_idx < chrom_code_end; chrom_idx++) {
if (!chrom_exists(chrom_info_ptr, chrom_idx)) {
@@ -2696,7 +2884,7 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
missing_ct = sample_f_ct - (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]);
}
if (freqx) {
- pzwritep = chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx));
+ pzwritep = chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx);
*pzwritep++ = '\t';
pzwritep = strcpyax(pzwritep, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
pzwritep = strcpyax(pzwritep, minor_ptr, '\t');
@@ -2708,7 +2896,7 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
pzwritep = uint32_writex(pzwritep, reverse? hapl_cts[marker_uidx] : haph_cts[marker_uidx], '\t');
pzwritep = uint32_write(pzwritep, missing_ct);
} else {
- pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
+ pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx));
*pzwritep++ = ' ';
pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
*pzwritep++ = ' ';
@@ -2721,7 +2909,7 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
pzwritep = uint32_writew6(pzwritep, missing_ct);
}
} else {
- pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
+ pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, chrom_idx));
*pzwritep++ = ' ';
pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
*pzwritep++ = ' ';
@@ -2731,12 +2919,12 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
*pzwritep++ = ' ';
uii = 2 * (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]) + hapl_cts[marker_uidx] + haph_cts[marker_uidx];
if (maf_succ || uii || (set_allele_freqs[marker_uidx] != 0.5)) {
- pzwritep = double_g_writewx4(pzwritep, 1.0 - set_allele_freqs[marker_uidx], 12);
+ pzwritep = double_g_writewx4(pzwritep, 1.0 - set_allele_freqs[marker_uidx], 12);
} else {
pzwritep = memcpya(pzwritep, " NA", 12);
}
*pzwritep++ = ' ';
- pzwritep = uint32_writew8(pzwritep, uii);
+ pzwritep = uint32_writew8(pzwritep, uii);
}
append_binary_eoln(&pzwritep);
if (flex_pzwrite(&ps, &pzwritep)) {
@@ -2748,7 +2936,7 @@ int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uin
if (flex_pzwrite_close_null(&ps, pzwritep)) {
goto write_freqs_ret_WRITE_FAIL;
}
- LOGPRINTFWW("--freq%s: Allele frequencies (%s) written to %s .\n", freqx? "x" : "", nonfounders? "all samples" : "founders only", outname);
+ LOGPRINTFWW("--freq%s%s: Allele frequencies (%s) written to %s .\n", freqx? "x" : "", freq_counts? " counts" : "", nonfounders? "all samples" : "founders only", outname);
while (0) {
write_freqs_ret_NOMEM:
retval = RET_NOMEM;
@@ -4828,10 +5016,10 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
}
#ifdef __cplusplus
// suppress bogus gcc 4.4 warning, this is not performance-critical
- qsort((int32_t*)parse_table, token_ct, sizeof(int32_t), intcmp);
+ qsort(parse_table, token_ct, sizeof(int32_t), uintcmp);
// std::sort(parse_table, &(parse_table[token_ct]));
#else
- qsort((int32_t*)parse_table, token_ct, sizeof(int32_t), intcmp);
+ qsort(parse_table, token_ct, sizeof(int32_t), uintcmp);
#endif
// bugfix: this caused a segfault in no-map case
if ((!weighted_z) && (token_ct > 5)) {
@@ -5825,7 +6013,19 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
}
bufptr2++;
if (*bufptr2) {
- fputs_w4(bufptr2, outfile);
+ // bugfix: fputs_w4 does the wrong thing for 4+ character alleles.
+ // instead, we want a leading space, then fputs_w3.
+ if (!bufptr2[1]) {
+ fputs(" ", outfile);
+ putc(bufptr2[0], outfile);
+ } else if (!bufptr2[2]) {
+ fputs(" ", outfile);
+ putc(bufptr2[0], outfile);
+ putc(bufptr2[1], outfile);
+ } else {
+ putc(' ', outfile);
+ fputs(bufptr2, outfile);
+ }
} else {
fputs(" ?", outfile);
}
diff --git a/plink_misc.h b/plink_misc.h
index 5bbdfd0..799ee73 100644
--- a/plink_misc.h
+++ b/plink_misc.h
@@ -69,6 +69,8 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t s [...]
+int32_t write_cc_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uintptr_t* marker_reverse, uintptr_t* pheno_nm, uintptr_t* pheno_c);
+
int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* sex_nm, uintptr_t* sex_male, uint64_t misc_flags, double check_sex_fthresh, double check_sex_mthresh, uint32_t max_f_yobs, uint32_t min_m_yobs, Chrom_info* chrom [...]
diff --git a/plink_stats.c b/plink_stats.c
index 963e0fc..9a91b57 100644
--- a/plink_stats.c
+++ b/plink_stats.c
@@ -326,13 +326,13 @@ int32_t SNPHWE_t(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double th
// - Sum the *relative* likelihoods of more likely smaller het counts.
// - Determine the minimum tail mass to pass the threshold.
// - The majority of the time, the tail boundary elements are enough to pass
- // the threshold; we never need to sum the remainder of the tails.
+ // the threshold; we never need to sum the remainder of the tails.
// - And in the case of disequilibrium, we will often be able to immediately
- // determine that the tail sum cannot possibly pass the threshold, just by
- // looking at the tail boundary elements and using a geometric series to
- // upper-bound the tail sums.
+ // determine that the tail sum cannot possibly pass the threshold, just by
+ // looking at the tail boundary elements and using a geometric series to
+ // upper-bound the tail sums.
// - Only when neither of these conditions hold do we start traveling down
- // the tails.
+ // the tails.
intptr_t obs_homc;
intptr_t obs_homr;
if (obs_hom1 < obs_hom2) {
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git
More information about the debian-med-commit
mailing list