[med-svn] [Git][med-team/plink2][upstream] New upstream version 2.00~a3-200321+dfsg
Dylan Aïssi
gitlab at salsa.debian.org
Fri Mar 27 07:45:04 GMT 2020
Dylan Aïssi pushed to branch upstream at Debian Med / plink2
Commits:
2aaa4872 by Dylan Aïssi at 2020-03-27T08:42:52+01:00
New upstream version 2.00~a3-200321+dfsg
- - - - -
28 changed files:
- include/pgenlib_misc.cc
- include/pgenlib_misc.h
- include/pgenlib_read.cc
- include/plink2_base.cc
- include/plink2_base.h
- include/plink2_bits.cc
- include/plink2_bits.h
- include/plink2_string.cc
- include/plink2_string.h
- pgenlibr/src/Makevars
- plink2.cc
- plink2_cmdline.cc
- plink2_cmdline.h
- plink2_common.cc
- plink2_common.h
- plink2_data.cc
- plink2_data.h
- plink2_decompress.h
- plink2_export.cc
- plink2_export.h
- plink2_filter.cc
- plink2_glm.cc
- plink2_help.cc
- plink2_import.cc
- plink2_import.h
- plink2_matrix.h
- plink2_matrix_calc.cc
- plink2_pvar.cc
Changes:
=====================================
include/pgenlib_misc.cc
=====================================
@@ -1181,12 +1181,12 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
# else
VecW tmp_lo = vecw_unpacklo16(loader0, loader1);
VecW tmp_hi = vecw_unpackhi16(loader0, loader1);
- loader0 = (tmp_lo & m16) | vecw_and_notfirst(m16, vecw_slli(tmp_hi, 16));
- loader1 = (vecw_srli(tmp_lo, 16) & m16) | (vecw_and_notfirst(m16, tmp_hi));
+ loader0 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
+ loader1 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16);
tmp_lo = vecw_unpacklo16(loader2, loader3);
tmp_hi = vecw_unpackhi16(loader2, loader3);
- loader2 = (tmp_lo & m16) | vecw_and_notfirst(m16, vecw_slli(tmp_hi, 16));
- loader3 = (vecw_srli(tmp_lo, 16) & m16) | (vecw_and_notfirst(m16, tmp_hi));
+ loader2 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
+ loader3 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16);
# endif
// (0,0) (0,1) (1,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (31,3)
// + (0,4) ... (31,7)
@@ -1232,7 +1232,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
// and target_4567 to
// _ (4, 0) _ (5, 0) _ (6, 0) _ (7, 0) _ (4, 1) _ (5, 1) _ (6, 1) ...
// This is perfectly arranged for movemask.
- VecW target_4567 = (m8 & vecw_srli(loader, 7)) | vecw_and_notfirst(m8, loader);
+ VecW target_4567 = vecw_blendv(loader, vecw_srli(loader, 7), m8);
target_iter7[vidx] = vecw_movemask(target_4567);
target_4567 = vecw_slli(target_4567, 2);
target_iter6[vidx] = vecw_movemask(target_4567);
@@ -1240,7 +1240,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
target_iter5[vidx] = vecw_movemask(target_4567);
target_4567 = vecw_slli(target_4567, 2);
target_iter4[vidx] = vecw_movemask(target_4567);
- VecW target_0123 = (vecw_slli(loader, 1) & m8) | vecw_and_notfirst(m8, vecw_slli(loader, 8));
+ VecW target_0123 = vecw_blendv(vecw_slli(loader, 8), vecw_slli(loader, 1), m8);
target_iter3[vidx] = vecw_movemask(target_0123);
target_0123 = vecw_slli(target_0123, 2);
target_iter2[vidx] = vecw_movemask(target_0123);
@@ -1260,7 +1260,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
Vec8thUint* target_iter3 = &(target_iter2[write_v8ui_stride]);
for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
const VecW loader = source_iter[vidx];
- VecW target_0123 = (vecw_slli(loader, 1) & m8) | vecw_and_notfirst(m8, vecw_slli(loader, 8));
+ VecW target_0123 = vecw_blendv(vecw_slli(loader, 8), vecw_slli(loader, 1), m8);
target_iter3[vidx] = vecw_movemask(target_0123);
target_0123 = vecw_slli(target_0123, 2);
target_iter2[vidx] = vecw_movemask(target_0123);
@@ -1420,8 +1420,9 @@ void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __
}
}
-
-/*
+// Note that once the result elements are only 4 bits wide, shuffle8 should
+// beat a 256x4 lookup table when SSE4 is available. (possible todo: see if
+// shuffle8-based loop also wins here.)
void GenoarrLookup256x1bx4(const uintptr_t* genoarr, const void* table256x1bx4, uint32_t sample_ct, void* __restrict result) {
const uint32_t* table_alias = S_CAST(const uint32_t*, table256x1bx4);
const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
@@ -1440,7 +1441,6 @@ void GenoarrLookup256x1bx4(const uintptr_t* genoarr, const void* table256x1bx4,
}
}
}
-*/
void GenoarrLookup16x4bx2(const uintptr_t* genoarr, const void* table16x4bx2, uint32_t sample_ct, void* __restrict result) {
const uint64_t* table_alias = S_CAST(const uint64_t*, table16x4bx2);
@@ -1666,6 +1666,54 @@ void InitLookup16x8bx2(void* table16x8bx2) {
}
}
+void InitLookup256x1bx4(void* table256x1bx4) {
+ unsigned char* table_iter = S_CAST(unsigned char*, table256x1bx4);
+ unsigned char vals[4];
+ vals[0] = table_iter[0];
+ vals[1] = table_iter[4];
+ vals[2] = table_iter[8];
+ vals[3] = table_iter[12];
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = vals[high_idx];
+ for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
+ const uint32_t cur_second = vals[second_idx];
+ for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
+ const uint32_t cur_third = vals[third_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = vals[low_idx];
+ *table_iter++ = cur_third;
+ *table_iter++ = cur_second;
+ *table_iter++ = cur_high;
+ }
+ }
+ }
+ }
+}
+
+void InitLookup256x2bx4(void* table256x2bx4) {
+ uint16_t* table_iter = S_CAST(uint16_t*, table256x2bx4);
+ uint16_t vals[4];
+ vals[0] = table_iter[0];
+ vals[1] = table_iter[4];
+ vals[2] = table_iter[8];
+ vals[3] = table_iter[12];
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = vals[high_idx];
+ for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
+ const uint32_t cur_second = vals[second_idx];
+ for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
+ const uint32_t cur_third = vals[third_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = vals[low_idx];
+ *table_iter++ = cur_third;
+ *table_iter++ = cur_second;
+ *table_iter++ = cur_high;
+ }
+ }
+ }
+ }
+}
+
void InitLookup256x4bx4(void* table256x4bx4) {
uint32_t* table_iter = S_CAST(uint32_t*, table256x4bx4);
uint32_t vals[4];
@@ -1798,6 +1846,158 @@ void InitPhaseLookup4b(void* table56x4bx2) {
}
}
+#ifdef __LP64__
+void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* __restrict result) {
+ const __m128i* table_alias = S_CAST(const __m128i*, table56x8bx2);
+ const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+ const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
+ const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
+ __m128i* result_iter = S_CAST(__m128i*, result);
+ uint32_t loop_len = kBitsPerWordD4;
+ uintptr_t geno_word = 0;
+ uintptr_t phasepresent_hw_shifted = 0;
+ uintptr_t phaseinfo_hw_shifted = 0;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= sample_ctl2_m1) {
+ if (widx > sample_ctl2_m1) {
+ if (sample_ct % 2) {
+ uintptr_t cur_idx = (geno_word & 3);
+ if (phasepresent_hw_shifted & 16) {
+ cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
+ }
+ memcpy(result_iter, &(table_alias[cur_idx]), 8);
+ }
+ return;
+ }
+ loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
+ }
+ geno_word = genoarr[widx];
+ phasepresent_hw_shifted = phasepresent_alias[widx];
+ if (!phasepresent_hw_shifted) {
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ _mm_storeu_si128(result_iter, table_alias[geno_word & 15]);
+ ++result_iter;
+ geno_word >>= 4;
+ }
+ } else {
+ phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
+ phaseinfo_hw_shifted = phaseinfo_alias[widx];
+ phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
+ _mm_storeu_si128(result_iter, table_alias[cur_idx]);
+ ++result_iter;
+ geno_word >>= 4;
+ phasepresent_hw_shifted >>= 2;
+ phaseinfo_hw_shifted >>= 2;
+ }
+ }
+ }
+}
+#else
+void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* __restrict result) {
+ const uint64_t* table_alias = S_CAST(const uint64_t*, table56x8bx2);
+ const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+ const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
+ const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
+ uint64_t* result_iter = S_CAST(uint64_t*, result);
+ uint32_t loop_len = kBitsPerWordD4;
+ uintptr_t geno_word = 0;
+ uintptr_t phasepresent_hw_shifted = 0;
+ uintptr_t phaseinfo_hw_shifted = 0;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= sample_ctl2_m1) {
+ if (widx > sample_ctl2_m1) {
+ if (sample_ct % 2) {
+ uintptr_t cur_idx = (geno_word & 3);
+ if (phasepresent_hw_shifted & 16) {
+ cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
+ }
+ memcpy(result_iter, &(table_alias[cur_idx * 2]), 8);
+ }
+ return;
+ }
+ loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
+ }
+ geno_word = genoarr[widx];
+ phasepresent_hw_shifted = phasepresent_alias[widx];
+ if (!phasepresent_hw_shifted) {
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ memcpy(result_iter, &(table_alias[(geno_word & 15) * 2]), 16);
+ result_iter = &(result_iter[2]);
+ geno_word >>= 4;
+ }
+ } else {
+ phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
+ phaseinfo_hw_shifted = phaseinfo_alias[widx];
+ phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
+ memcpy(result_iter, &(table_alias[cur_idx * 2]), 16);
+ geno_word >>= 4;
+ phasepresent_hw_shifted >>= 2;
+ phaseinfo_hw_shifted >>= 2;
+ }
+ }
+ }
+}
+#endif
+
+void InitPhaseLookup8b(void* table56x8bx2) {
+ uint64_t* table_iter = S_CAST(uint64_t*, table56x8bx2);
+ uint64_t vals[4];
+ vals[0] = table_iter[0];
+ table_iter[1] = vals[0];
+ vals[1] = table_iter[2];
+ table_iter[3] = vals[0];
+ vals[2] = table_iter[4];
+ table_iter[5] = vals[0];
+ vals[3] = table_iter[6];
+ table_iter[7] = vals[0];
+ table_iter = &(table_iter[8]);
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint64_t cur_high = vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [16][0]..[31][1]: bit 4 is set
+ // low bits must be 01 or 11
+ const uint64_t val_phaseinfo0 = table_iter[2];
+ table_iter[3] = vals[0];
+ const uint64_t val_phaseinfo1 = table_iter[6];
+ table_iter[7] = vals[0];
+ table_iter = &(table_iter[8]);
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint64_t cur_high = vals[high_idx];
+ table_iter[2] = val_phaseinfo0;
+ table_iter[3] = cur_high;
+ table_iter[6] = val_phaseinfo1;
+ table_iter[7] = cur_high;
+ table_iter = &(table_iter[8]);
+ }
+ // [32][0]..[39][1]: bit 5 set, bit 4 unset
+ // high bits must be 00 or 01
+ for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
+ const uint64_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ table_iter = &(table_iter[16]);
+ // [48][0]..[55][1]: bits 4 and 5 set
+ for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
+ const uint64_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
+ table_iter[2] = val_phaseinfo0;
+ table_iter[3] = cur_high;
+ table_iter[6] = val_phaseinfo1;
+ table_iter[7] = cur_high;
+ table_iter = &(table_iter[8]);
+ }
+}
+
// bits 0..3: two genotypes
// bits 4..5: two (phasepresent | sex_male) bits
// bits 1,3: unpacked phaseinfo xor
@@ -1925,6 +2125,393 @@ void GenoarrSexLookup4b(const uintptr_t* genoarr, const uintptr_t* sex_male, con
}
}
+void InitPhaseXNohhLookup8b(void* table64x8bx2) {
+ uint64_t* table_iter = S_CAST(uint64_t*, table64x8bx2);
+ uint64_t vals[4];
+ vals[0] = table_iter[0];
+ table_iter[1] = vals[0];
+ vals[1] = table_iter[2];
+ table_iter[3] = vals[0];
+ vals[2] = table_iter[4];
+ table_iter[5] = vals[0];
+ vals[3] = table_iter[6];
+ table_iter[7] = vals[0];
+ table_iter = &(table_iter[8]);
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint64_t cur_high = vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [16][0]..[31][1]: bit 4 is set
+ uint64_t male_or_phasepresent_vals[4];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ male_or_phasepresent_vals[low_idx] = *table_iter++;
+ *table_iter++ = vals[0];
+ }
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint64_t cur_high = vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = male_or_phasepresent_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [32][0]..[47][1]: bit 5 set, bit 4 unset
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint64_t cur_high = male_or_phasepresent_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [48][0]..[63][1]: bits 4 and 5 set
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint64_t cur_high = male_or_phasepresent_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = male_or_phasepresent_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+}
+
+#ifdef __LP64__
+void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result) {
+ const __m128i* table_alias = S_CAST(const __m128i*, table64x8bx2);
+ const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+ const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
+ __m128i* result_iter = S_CAST(__m128i*, result);
+ uint32_t loop_len = kBitsPerWordD4;
+ uintptr_t geno_word = 0;
+ uintptr_t male_hw_shifted = 0;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= sample_ctl2_m1) {
+ if (widx > sample_ctl2_m1) {
+ if (sample_ct % 2) {
+ uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
+ memcpy(result_iter, &(table_alias[cur_idx]), 8);
+ }
+ return;
+ }
+ loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
+ }
+ geno_word = genoarr[widx];
+ male_hw_shifted = sex_male_alias[widx];
+ male_hw_shifted <<= 4;
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ _mm_storeu_si128(result_iter, table_alias[(geno_word & 15) | (male_hw_shifted & 48)]);
+ ++result_iter;
+ geno_word >>= 4;
+ male_hw_shifted >>= 2;
+ }
+ }
+}
+#else
+void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result) {
+ const uint64_t* table_alias = S_CAST(const uint64_t*, table64x8bx2);
+ const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+ const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
+ uint64_t* result_iter = S_CAST(uint64_t*, result);
+ uint32_t loop_len = kBitsPerWordD4;
+ uintptr_t geno_word = 0;
+ uintptr_t male_hw_shifted = 0;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= sample_ctl2_m1) {
+ if (widx > sample_ctl2_m1) {
+ if (sample_ct % 2) {
+ uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
+ memcpy(result_iter, &(table_alias[cur_idx * 2]), 8);
+ }
+ return;
+ }
+ loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
+ }
+ geno_word = genoarr[widx];
+ male_hw_shifted = sex_male_alias[widx];
+ male_hw_shifted <<= 4;
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ memcpy(result_iter, &(table_alias[((geno_word & 15) | (male_hw_shifted & 48)) * 2]), 16);
+ result_iter = &(result_iter[2]);
+ geno_word >>= 4;
+ male_hw_shifted >>= 2;
+ }
+ }
+}
+#endif
+
+void VcfPhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x4bx2, uint32_t sample_ct, void* __restrict result) {
+ const uint64_t* table_alias = S_CAST(const uint64_t*, table246x4bx2);
+ const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+ const Halfword* cur_phased_alias = R_CAST(const Halfword*, cur_phased);
+ const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
+ uint64_t* result_iter = S_CAST(uint64_t*, result);
+ uint32_t loop_len = kBitsPerWordD4;
+ uintptr_t geno_word = 0;
+ uintptr_t cur_phased_hw_shifted = 0;
+ uintptr_t phaseinfo_hw_shifted = 0;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= sample_ctl2_m1) {
+ if (widx > sample_ctl2_m1) {
+ if (sample_ct % 2) {
+ uintptr_t cur_idx = (geno_word & 3) | (cur_phased_hw_shifted & 16) | (phaseinfo_hw_shifted & 64);
+ memcpy(result_iter, &(table_alias[cur_idx]), 4);
+ }
+ return;
+ }
+ loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
+ }
+ geno_word = genoarr[widx];
+ cur_phased_hw_shifted = cur_phased_alias[widx];
+ if (!cur_phased_hw_shifted) {
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ *result_iter++ = table_alias[geno_word & 15];
+ geno_word >>= 4;
+ }
+ } else {
+ cur_phased_hw_shifted = cur_phased_hw_shifted << 4;
+ phaseinfo_hw_shifted = phaseinfo_alias[widx];
+
+ // note that this must be on a separate line (or we have to static_cast)
+ phaseinfo_hw_shifted = phaseinfo_hw_shifted << 6;
+
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ const uintptr_t cur_idx = (geno_word & 15) | (cur_phased_hw_shifted & 48) | (phaseinfo_hw_shifted & 192);
+ *result_iter++ = table_alias[cur_idx];
+ geno_word >>= 4;
+ cur_phased_hw_shifted >>= 2;
+ phaseinfo_hw_shifted >>= 2;
+ }
+ }
+ }
+}
+
+void InitVcfPhaseLookup4b(void* table246x4bx2) {
+ uint32_t* table_iter = S_CAST(uint32_t*, table246x4bx2);
+ uint32_t unphased_vals[4];
+ unphased_vals[0] = table_iter[0];
+ table_iter[1] = unphased_vals[0];
+ unphased_vals[1] = table_iter[2];
+ table_iter[3] = unphased_vals[0];
+ unphased_vals[2] = table_iter[4];
+ table_iter[5] = unphased_vals[0];
+ unphased_vals[3] = table_iter[6];
+ table_iter[7] = unphased_vals[0];
+ table_iter = &(table_iter[8]);
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = unphased_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = unphased_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [16][0]..[31][1]: first entry is phased and unflipped, second is unphased
+ uint32_t phased_unflipped_vals[4];
+ phased_unflipped_vals[0] = table_iter[0];
+ table_iter[1] = unphased_vals[0];
+ phased_unflipped_vals[1] = table_iter[2];
+ table_iter[3] = unphased_vals[0];
+ phased_unflipped_vals[2] = table_iter[4];
+ table_iter[5] = unphased_vals[0];
+ phased_unflipped_vals[3] = table_iter[6];
+ table_iter[7] = unphased_vals[0];
+ table_iter = &(table_iter[8]);
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = unphased_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = phased_unflipped_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [32][0]..[63][1]: second entry is phased and unflipped
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = phased_unflipped_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = unphased_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = phased_unflipped_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = phased_unflipped_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [64][0]..[79][1] should be impossible
+ table_iter = &(table_iter[32]);
+ // [80][0]..[95][1]: first entry is phased and flipped, second is unphased
+ // genotype must be 01
+ const uint32_t phased_flipped_01 = table_iter[2];
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ table_iter[2] = phased_flipped_01;
+ table_iter[3] = unphased_vals[high_idx];
+ table_iter = &(table_iter[8]);
+ }
+ // [96][0]..[111][1] should be impossible
+ table_iter = &(table_iter[32]);
+ // [112][0]..[127][1]: first entry phased-flipped, second phased-unflipped
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ table_iter[2] = phased_flipped_01;
+ table_iter[3] = phased_unflipped_vals[high_idx];
+ table_iter = &(table_iter[8]);
+ }
+ // [128][0]..[163][1] should be impossible
+ table_iter = &(table_iter[72]);
+ // [164][0]..[167][1]: second entry phased-flipped, first entry unphased
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = unphased_vals[low_idx];
+ *table_iter++ = phased_flipped_01;
+ }
+ // [168][0]..[179][1] should be impossible
+ table_iter = &(table_iter[24]);
+ // [180][0]..[183][1]: second entry phased-flipped, first phased-unflipped
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = phased_unflipped_vals[low_idx];
+ *table_iter++ = phased_flipped_01;
+ }
+ // [184][0]..[244][1] should be impossible
+ // [245][0]..[245][1]: both phased-flipped
+ table_iter[122] = phased_flipped_01;
+ table_iter[123] = phased_flipped_01;
+}
+
+void VcfPhaseLookup2b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x2bx2, uint32_t sample_ct, void* __restrict result) {
+ const uint32_t* table_alias = S_CAST(const uint32_t*, table246x2bx2);
+ const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+ const Halfword* cur_phased_alias = R_CAST(const Halfword*, cur_phased);
+ const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
+ uint32_t* result_iter = S_CAST(uint32_t*, result);
+ uint32_t loop_len = kBitsPerWordD4;
+ uintptr_t geno_word = 0;
+ uintptr_t cur_phased_hw_shifted = 0;
+ uintptr_t phaseinfo_hw_shifted = 0;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= sample_ctl2_m1) {
+ if (widx > sample_ctl2_m1) {
+ if (sample_ct % 2) {
+ uintptr_t cur_idx = (geno_word & 3) | (cur_phased_hw_shifted & 16) | (phaseinfo_hw_shifted & 64);
+ memcpy(result_iter, &(table_alias[cur_idx]), 2);
+ }
+ return;
+ }
+ loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
+ }
+ geno_word = genoarr[widx];
+ cur_phased_hw_shifted = cur_phased_alias[widx];
+ if (!cur_phased_hw_shifted) {
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ *result_iter++ = table_alias[geno_word & 15];
+ geno_word >>= 4;
+ }
+ } else {
+ cur_phased_hw_shifted = cur_phased_hw_shifted << 4;
+ phaseinfo_hw_shifted = phaseinfo_alias[widx];
+
+ // note that this must be on a separate line (or we have to static_cast)
+ phaseinfo_hw_shifted = phaseinfo_hw_shifted << 6;
+
+ for (uint32_t uii = 0; uii != loop_len; ++uii) {
+ const uintptr_t cur_idx = (geno_word & 15) | (cur_phased_hw_shifted & 48) | (phaseinfo_hw_shifted & 192);
+ *result_iter++ = table_alias[cur_idx];
+ geno_word >>= 4;
+ cur_phased_hw_shifted >>= 2;
+ phaseinfo_hw_shifted >>= 2;
+ }
+ }
+ }
+}
+
+void InitVcfPhaseLookup2b(void* table246x2bx2) {
+ uint16_t* table_iter = S_CAST(uint16_t*, table246x2bx2);
+ uint16_t unphased_vals[4];
+ unphased_vals[0] = table_iter[0];
+ table_iter[1] = unphased_vals[0];
+ unphased_vals[1] = table_iter[2];
+ table_iter[3] = unphased_vals[0];
+ unphased_vals[2] = table_iter[4];
+ table_iter[5] = unphased_vals[0];
+ unphased_vals[3] = table_iter[6];
+ table_iter[7] = unphased_vals[0];
+ table_iter = &(table_iter[8]);
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = unphased_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = unphased_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [16][0]..[31][1]: first entry is phased and unflipped, second is unphased
+ uint16_t phased_unflipped_vals[4];
+ phased_unflipped_vals[0] = table_iter[0];
+ table_iter[1] = unphased_vals[0];
+ phased_unflipped_vals[1] = table_iter[2];
+ table_iter[3] = unphased_vals[0];
+ phased_unflipped_vals[2] = table_iter[4];
+ table_iter[5] = unphased_vals[0];
+ phased_unflipped_vals[3] = table_iter[6];
+ table_iter[7] = unphased_vals[0];
+ table_iter = &(table_iter[8]);
+ for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = unphased_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = phased_unflipped_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [32][0]..[63][1]: second entry is phased and unflipped
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = phased_unflipped_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = unphased_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ const uint32_t cur_high = phased_unflipped_vals[high_idx];
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = phased_unflipped_vals[low_idx];
+ *table_iter++ = cur_high;
+ }
+ }
+ // [64][0]..[79][1] should be impossible
+ table_iter = &(table_iter[32]);
+ // [80][0]..[95][1]: first entry is phased and flipped, second is unphased
+ // genotype must be 01
+ const uint32_t phased_flipped_01 = table_iter[2];
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ table_iter[2] = phased_flipped_01;
+ table_iter[3] = unphased_vals[high_idx];
+ table_iter = &(table_iter[8]);
+ }
+ // [96][0]..[111][1] should be impossible
+ table_iter = &(table_iter[32]);
+ // [112][0]..[127][1]: first entry phased-flipped, second phased-unflipped
+ for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
+ table_iter[2] = phased_flipped_01;
+ table_iter[3] = phased_unflipped_vals[high_idx];
+ table_iter = &(table_iter[8]);
+ }
+ // [128][0]..[163][1] should be impossible
+ table_iter = &(table_iter[72]);
+ // [164][0]..[167][1]: second entry phased-flipped, first entry unphased
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = unphased_vals[low_idx];
+ *table_iter++ = phased_flipped_01;
+ }
+ // [168][0]..[179][1] should be impossible
+ table_iter = &(table_iter[24]);
+ // [180][0]..[183][1]: second entry phased-flipped, first phased-unflipped
+ for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
+ *table_iter++ = phased_unflipped_vals[low_idx];
+ *table_iter++ = phased_flipped_01;
+ }
+ // [184][0]..[244][1] should be impossible
+ // [245][0]..[245][1]: both phased-flipped
+ table_iter[122] = phased_flipped_01;
+ table_iter[123] = phased_flipped_01;
+}
+
+
void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals) {
const uint32_t orig_subset_size = *subset_sizep;
Halfword* subset_alias = R_CAST(Halfword*, subset);
=====================================
include/pgenlib_misc.h
=====================================
@@ -79,7 +79,7 @@
// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access to this
// value. Named with all caps as a consequence.
-#define PGENLIB_INTERNAL_VERNUM 1501
+#define PGENLIB_INTERNAL_VERNUM 1504
#ifdef __cplusplus
namespace plink2 {
@@ -472,6 +472,8 @@ void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __
// so they aren't always the right choice.
// When lookup table rows are 16 bytes, they are assumed to be 16-byte aligned
// in 64-bit builds. result[] is not assumed to be aligned.
+void GenoarrLookup256x1bx4(const uintptr_t* genoarr, const void* table256x1bx4, uint32_t sample_ct, void* __restrict result);
+
void GenoarrLookup16x4bx2(const uintptr_t* genoarr, const void* table16x4bx2, uint32_t sample_ct, void* result);
void GenoarrLookup256x2bx4(const uintptr_t* genoarr, const void* table256x2bx4, uint32_t sample_ct, void* result);
@@ -515,6 +517,10 @@ void InitLookup16x4bx2(void* table16x4bx2);
void InitLookup16x8bx2(void* table16x8bx2);
+void InitLookup256x1bx4(void* table256x1bx4);
+
+void InitLookup256x2bx4(void* table256x2bx4);
+
void InitLookup256x4bx4(void* table256x4bx4);
void PhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x4bx2, uint32_t sample_ct, void* result);
@@ -522,6 +528,10 @@ void PhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, cons
// [0][0]..[3][0], [17][0], and [19][0] should contain the relevant values
void InitPhaseLookup4b(void* table56x4bx2);
+void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* result);
+
+void InitPhaseLookup8b(void* table56x8bx2);
+
// het-haploid prohibited. 64-entry table suffices: we use the same bits for
// phasepresent and sex_male since they can't be true simultaneously.
// phaseinfo is xor'd with bits 1 and 3 instead of 1 and 2.
@@ -533,6 +543,25 @@ void InitPhaseXNohhLookup4b(void* table64x4bx2);
// uses same table as PhaseXNohhLookup
void GenoarrSexLookup4b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result);
+void InitPhaseXNohhLookup8b(void* table64x8bx2);
+
+void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result);
+
+// Unlike PhaseLookup4b(), this allows the cur_phased bit to be set when the
+// genoarr entry is not 01 (het).
+void VcfPhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x4bx2, uint32_t sample_ct, void* __restrict result);
+
+// Precondition:
+// [0], [2], [4], [6] initialized with unphased entries
+// [32], [34], [36], [38] initialized with phased-unflipped entries
+// [162] initialized with phased-flipped case
+void InitVcfPhaseLookup4b(void* table246x4bx2);
+
+void VcfPhaseLookup2b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x2bx2, uint32_t sample_ct, void* __restrict result);
+
+void InitVcfPhaseLookup2b(void* table246x2bx2);
+
+
// Analogue of BitIter1x.
HEADER_INLINE uint32_t GenoIter1x(const uintptr_t* __restrict genoarr, uintptr_t match_word, uintptr_t* __restrict widxp, uintptr_t* __restrict cur_bitsp) {
uintptr_t cur_bits = *cur_bitsp;
=====================================
include/pgenlib_read.cc
=====================================
@@ -6636,39 +6636,39 @@ PglErr ParseAux2Subset(const unsigned char* fread_end, const uintptr_t* __restri
}
ExpandThenSubsetBytearr(aux2_start, all_hets, sample_include, het_ct, sample_ct, 1, phaseinfo);
}
- *phasepresent_ct_ptr = PopcountWords(phasepresent, sample_ctl);
- return kPglRetSuccess;
- }
- const uint32_t het_ctdl = het_ct / kBitsPerWord;
+ // bugfix (25 Feb 2020): forgot to mask out subsetted_10het here
+ } else {
+ const uint32_t het_ctdl = het_ct / kBitsPerWord;
- // explicit phasepresent
- const uintptr_t* aux2_first_part = R_CAST(const uintptr_t*, aux2_start);
- uintptr_t* aux2_first_part_copy = workspace_subset;
- aux2_first_part_copy[het_ctdl] = 0;
- memcpy(aux2_first_part_copy, aux2_first_part, 1 + (het_ct / CHAR_BIT));
- const uint32_t raw_phasepresent_ct = PopcountWords(aux2_first_part_copy, het_ctdl + 1) - 1;
- if (unlikely(!raw_phasepresent_ct)) {
- // there shouldn't be a hphase track at all in this case
- return kPglRetMalformedInput;
- }
- const unsigned char* aux2_second_part = &(aux2_start[1 + (het_ct / CHAR_BIT)]);
- *fread_pp = aux2_second_part;
- if (PtrAddCk(fread_end, DivUp(raw_phasepresent_ct, CHAR_BIT), fread_pp)) {
- return kPglRetMalformedInput;
- }
- if (!phaseinfo) {
- return kPglRetSuccess;
- }
- if (!sample_include) {
- ExpandBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_ctl, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
- if (!subsetted_10het) {
- *phasepresent_ct_ptr = raw_phasepresent_ct;
+ // explicit phasepresent
+ const uintptr_t* aux2_first_part = R_CAST(const uintptr_t*, aux2_start);
+ uintptr_t* aux2_first_part_copy = workspace_subset;
+ aux2_first_part_copy[het_ctdl] = 0;
+ memcpy(aux2_first_part_copy, aux2_first_part, 1 + (het_ct / CHAR_BIT));
+ const uint32_t raw_phasepresent_ct = PopcountWords(aux2_first_part_copy, het_ctdl + 1) - 1;
+ if (unlikely(!raw_phasepresent_ct)) {
+ // there shouldn't be a hphase track at all in this case
+ return kPglRetMalformedInput;
+ }
+ const unsigned char* aux2_second_part = &(aux2_start[1 + (het_ct / CHAR_BIT)]);
+ *fread_pp = aux2_second_part;
+ if (PtrAddCk(fread_end, DivUp(raw_phasepresent_ct, CHAR_BIT), fread_pp)) {
+ return kPglRetMalformedInput;
+ }
+ if (!phaseinfo) {
return kPglRetSuccess;
}
- } else {
- // could skip if intersection of phasepresent with sample_include is empty,
- // but this function call should be fast enough there anyway?
- ExpandThenSubsetBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_include, sample_ct, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
+ if (!sample_include) {
+ ExpandBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_ctl, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
+ if (!subsetted_10het) {
+ *phasepresent_ct_ptr = raw_phasepresent_ct;
+ return kPglRetSuccess;
+ }
+ } else {
+ // could skip if intersection of phasepresent with sample_include is
+ // empty, but this function call should be fast enough there anyway?
+ ExpandThenSubsetBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_include, sample_ct, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
+ }
}
if (subsetted_10het) {
BitvecInvmask(subsetted_10het, sample_ctl, phasepresent);
=====================================
include/plink2_base.cc
=====================================
@@ -173,11 +173,11 @@ BoolErr ScanIntAbsBounded(const char* str_iter, uint64_t bound, int32_t* valp) {
if (ctou32(*valp) >= 10) {
if (*valp == -3) {
sign = -1;
- } else if (*valp != -5) {
+ } else if (unlikely(*valp != -5)) {
return 1;
}
*valp = ctou32(*str_iter++) - 48;
- if (unlikely(ctou32(*valp) >= 10)) {
+ if (unlikely(*valp >= 10)) {
return 1;
}
}
@@ -260,7 +260,7 @@ BoolErr ScanIntAbsBounded32(const char* str_iter, uint32_t bound_div_10, uint32_
if (val >= 10) {
if (val == 0xfffffffdU) {
sign = -1;
- } else if (val != 0xfffffffbU) {
+ } else if (unlikely(val != 0xfffffffbU)) {
return 1;
}
val = ctou32(*str_iter++) - 48;
=====================================
include/plink2_base.h
=====================================
@@ -1019,6 +1019,10 @@ HEADER_INLINE void vecw_storeu(void* mem_addr, VecW vv) {
_mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
}
+HEADER_INLINE void vecu32_storeu(void* mem_addr, VecU32 vv) {
+ _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
+}
+
HEADER_INLINE void veci32_storeu(void* mem_addr, VecI32 vv) {
_mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
}
@@ -1059,6 +1063,18 @@ HEADER_INLINE VecUc vecuc_min(VecUc v1, VecUc v2) {
return R_CAST(VecUc, _mm256_min_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
}
+HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
+ return R_CAST(VecW, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
+}
+
+HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
+ return R_CAST(VecU32, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
+}
+
+HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
+ return R_CAST(VecU16, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
+}
+
# else // !USE_AVX2
# define VCONST_W(xx) {xx, xx}
@@ -1238,6 +1254,10 @@ HEADER_INLINE void vecw_storeu(void* mem_addr, VecW vv) {
_mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
}
+HEADER_INLINE void vecu32_storeu(void* mem_addr, VecU32 vv) {
+ _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
+}
+
HEADER_INLINE void veci32_storeu(void* mem_addr, VecI32 vv) {
_mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
}
@@ -1368,6 +1388,18 @@ HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
return _mm_extract_epi64(R_CAST(__m128i, vv), 1);
}
+
+HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
+ return R_CAST(VecW, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
+}
+
+HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
+ return R_CAST(VecU32, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
+}
+
+HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
+ return R_CAST(VecU16, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
+}
# else
HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
return R_CAST(uintptr_t, _mm_movepi64_pi64(R_CAST(__m128i, vv)));
@@ -1377,6 +1409,20 @@ HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
const __m128i v0 = _mm_srli_si128(R_CAST(__m128i, vv), 8);
return R_CAST(uintptr_t, _mm_movepi64_pi64(v0));
}
+
+// N.B. we do *not* enforce the low bits of each mask byte matching the high
+// bit.
+HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
+ return vecw_and_notfirst(mask, aa) | (mask & bb);
+}
+
+HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
+ return vecu32_and_notfirst(mask, aa) | (mask & bb);
+}
+
+HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
+ return vecu16_and_notfirst(mask, aa) | (mask & bb);
+}
# endif
HEADER_INLINE VecI16 veci16_max(VecI16 v1, VecI16 v2) {
@@ -1398,7 +1444,6 @@ HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) {
HEADER_INLINE VecUc vecuc_min(VecUc v1, VecUc v2) {
return R_CAST(VecUc, _mm_min_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
}
-
# endif // !USE_AVX2
HEADER_INLINE VecW vecw_bytesum(VecW src, VecW m0) {
=====================================
include/plink2_bits.cc
=====================================
@@ -1682,12 +1682,12 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u
# else
VecW tmp_lo = vecw_unpacklo8(loader0, loader1);
VecW tmp_hi = vecw_unpackhi8(loader0, loader1);
- loader0 = (tmp_lo & m8) | vecw_and_notfirst(m8, vecw_slli(tmp_hi, 8));
- loader1 = (vecw_srli(tmp_lo, 8) & m8) | vecw_and_notfirst(m8, tmp_hi);
+ loader0 = vecw_blendv(vecw_slli(tmp_hi, 8), tmp_lo, m8);
+ loader1 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 8), m8);
tmp_lo = vecw_unpacklo8(loader2, loader3);
tmp_hi = vecw_unpackhi8(loader2, loader3);
- loader2 = (tmp_lo & m8) | vecw_and_notfirst(m8, vecw_slli(tmp_hi, 8));
- loader3 = (vecw_srli(tmp_lo, 8) & m8) | vecw_and_notfirst(m8, tmp_hi);
+ loader2 = vecw_blendv(vecw_slli(tmp_hi, 8), tmp_lo, m8);
+ loader3 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 8), m8);
# endif
// -> (0,0) (1,0) (2,0) (3,0) (0,1) (1,1) (2,1) (3,1) (0,2) ...
const VecW lo_0123 = vecw_unpacklo16(loader0, loader1);
@@ -2086,20 +2086,20 @@ void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, u
# else
VecW tmp_lo = vecw_unpacklo8(even_nybbles0, odd_nybbles0);
VecW tmp_hi = vecw_unpackhi8(even_nybbles0, odd_nybbles0);
- even_nybbles0 = (tmp_lo & m8) | vecw_and_notfirst(m8, vecw_slli(tmp_hi, 8));
- odd_nybbles0 = (vecw_srli(tmp_lo, 8) & m8) | vecw_and_notfirst(m8, tmp_hi);
+ even_nybbles0 = vecw_blendv(vecw_slli(tmp_hi, 8), tmp_lo, m8);
+ odd_nybbles0 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 8), m8);
tmp_lo = vecw_unpacklo8(even_nybbles1, odd_nybbles1);
tmp_hi = vecw_unpackhi8(even_nybbles1, odd_nybbles1);
- even_nybbles1 = (tmp_lo & m8) | vecw_and_notfirst(m8, vecw_slli(tmp_hi, 8));
- odd_nybbles1 = (vecw_srli(tmp_lo, 8) & m8) | vecw_and_notfirst(m8, tmp_hi);
+ even_nybbles1 = vecw_blendv(vecw_slli(tmp_hi, 8), tmp_lo, m8);
+ odd_nybbles1 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 8), m8);
tmp_lo = vecw_unpacklo8(even_nybbles2, odd_nybbles2);
tmp_hi = vecw_unpackhi8(even_nybbles2, odd_nybbles2);
- even_nybbles2 = (tmp_lo & m8) | vecw_and_notfirst(m8, vecw_slli(tmp_hi, 8));
- odd_nybbles2 = (vecw_srli(tmp_lo, 8) & m8) | vecw_and_notfirst(m8, tmp_hi);
+ even_nybbles2 = vecw_blendv(vecw_slli(tmp_hi, 8), tmp_lo, m8);
+ odd_nybbles2 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 8), m8);
tmp_lo = vecw_unpacklo8(even_nybbles3, odd_nybbles3);
tmp_hi = vecw_unpackhi8(even_nybbles3, odd_nybbles3);
- even_nybbles3 = (tmp_lo & m8) | vecw_and_notfirst(m8, vecw_slli(tmp_hi, 8));
- odd_nybbles3 = (vecw_srli(tmp_lo, 8) & m8) | vecw_and_notfirst(m8, tmp_hi);
+ even_nybbles3 = vecw_blendv(vecw_slli(tmp_hi, 8), tmp_lo, m8);
+ odd_nybbles3 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 8), m8);
# endif
const VecW even_lo = vecw_unpacklo16(even_nybbles0, even_nybbles1);
=====================================
include/plink2_bits.h
=====================================
@@ -504,6 +504,8 @@ uintptr_t CountByte(const void* bytearr, unsigned char ucc, uintptr_t byte_ct);
uintptr_t CountU16(const void* u16arr, uint16_t usii, uintptr_t u16_ct);
+
+// Applies sample_include to {src_subset, src_vals}.
uint32_t Copy1bit8Subset(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uintptr_t* __restrict sample_include, uint32_t src_subset_size, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);
uint32_t Copy1bit16Subset(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uintptr_t* __restrict sample_include, uint32_t src_subset_size, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);
=====================================
include/plink2_string.cc
=====================================
@@ -1047,6 +1047,31 @@ BoolErr ScanmovUintCapped(uint64_t cap, const char** str_iterp, uint32_t* valp)
*str_iterp = str_iter;
return ScanmovUintCappedFinish(cap, str_iterp, valp);
}
+
+// 2^{-31} < floor <= 0 <= cap < 2^31
+BoolErr ScanmovIntBounded(uint64_t abs_floor, uint64_t cap, const char** str_iterp, int32_t* valp) {
+ const char* str_iter = *str_iterp;
+ *valp = ctou32(*str_iter++) - 48;
+ int32_t sign = 1;
+ if (ctou32(*valp) >= 10) {
+ if (*valp == -3) {
+ sign = -1;
+ cap = abs_floor;
+ } else if (unlikely(*valp != -5)) { // accept leading '+'
+ return 1;
+ }
+ *valp = ctou32(*str_iter++) - 48;
+ if (unlikely(*valp >= 10)) {
+ return 1;
+ }
+ }
+ *str_iterp = str_iter;
+ if (ScanmovUintCappedFinish(cap, str_iterp, R_CAST(uint32_t*, valp))) {
+ return 1;
+ }
+ *valp *= sign;
+ return 0;
+}
#else
BoolErr ScanmovPosintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, uint32_t* valp) {
const char* str_iter = *str_iterp;
@@ -1111,6 +1136,37 @@ BoolErr ScanmovUintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char
val = val * 10 + cur_digit;
}
}
+
+BoolErr ScanmovIntBounded32(uint32_t abs_floor_div_10, uint32_t abs_floor_mod_10, uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, int32_t* valp) {
+ const char* str_iter = *str_iterp;
+ uint32_t val = ctou32(*str_iter++) - 48;
+ int32_t sign = 1;
+ if (val >= 10) {
+ if (val == 0xfffffffdU) {
+ sign = -1;
+ cap_div_10 = abs_floor_div_10;
+ cap_mod_10 = abs_floor_mod_10;
+ } else if (unlikely(val != 0xfffffffbU)) {
+ return 1;
+ }
+ val = ctou32(*str_iter++) - 48;
+ if (unlikely(val >= 10)) {
+ return 1;
+ }
+ }
+ for (; ; ++str_iter) {
+ const uint32_t cur_digit = ctou32(*str_iter) - 48;
+ if (cur_digit >= 10) {
+ *valp = sign * S_CAST(int32_t, val);
+ *str_iterp = str_iter;
+ return 0;
+ }
+ if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
+ return 1;
+ }
+ val = val * 10 + cur_digit;
+ }
+}
#endif
static const double kPositivePow10[] = {1, 1.0e1, 1.0e2, 1.0e3, 1.0e4, 1.0e5, 1.0e6, 1.0e7, 1.0e8, 1.0e9, 1.0e10, 1.0e11, 1.0e12, 1.0e13, 1.0e14, 1.0e15};
@@ -3091,6 +3147,30 @@ uintptr_t ExpsearchNsortStrLb(const char* idbuf, const char* nsorted_strbox, uin
return start_idx;
}
+uint32_t IsInfStr(const char* ss, uint32_t slen, uint32_t* is_neg_ptr) {
+ const char first_char = *ss;
+ if (first_char == '-') {
+ *is_neg_ptr = 1;
+ ++ss;
+ --slen;
+ } else if (first_char == '+') {
+ ++ss;
+ --slen;
+ }
+ if (slen == 3) {
+ uint32_t four_chars;
+ memcpy(&four_chars, ss, 4); // OVERREAD
+ // assumes little-endian
+ return ((four_chars & 0xdfdfdf) == 0x464e49);
+ }
+ if (slen != 8) {
+ return 0;
+ }
+ uint64_t eight_chars;
+ memcpy(&eight_chars, ss, 8);
+ return ((eight_chars & 0xdfdfdfdfdfdfdfdfLLU) == 0x5954494e49464e49LLU);
+}
+
#ifdef __LP64__
CXXCONST_CP Memrchr(const char* str_start, char needle, uintptr_t slen) {
=====================================
include/plink2_string.h
=====================================
@@ -921,11 +921,16 @@ BoolErr ScanPosintptr(const char* str_iter, uintptr_t* valp);
BoolErr ScanmovPosintCapped(uint64_t cap, const char** str_iterp, uint32_t* valp);
BoolErr ScanmovUintCapped(uint64_t cap, const char** str_iterp, uint32_t* valp);
+
+// 2^{-31} < -abs_floor <= 0 <= cap < 2^31
+BoolErr ScanmovIntBounded(uint64_t abs_floor, uint64_t cap, const char** str_iterp, int32_t* valp);
#else
BoolErr ScanmovPosintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, uint32_t* valp);
BoolErr ScanmovUintCapped32(uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, uint32_t* valp);
+BoolErr ScanmovIntBounded32(uint32_t abs_floor_div_10, uint32_t abs_floor_mod_10, uint32_t cap_div_10, uint32_t cap_mod_10, const char** str_iterp, int32_t* valp);
+
HEADER_INLINE BoolErr ScanmovPosintCapped(uint32_t cap, const char** str_iterp, uint32_t* valp) {
return ScanmovPosintCapped32(cap / 10, cap % 10, str_iterp, valp);
}
@@ -933,6 +938,10 @@ HEADER_INLINE BoolErr ScanmovPosintCapped(uint32_t cap, const char** str_iterp,
HEADER_INLINE BoolErr ScanmovUintCapped(uint32_t cap, const char** str_iterp, uint32_t* valp) {
return ScanmovUintCapped32(cap / 10, cap % 10, str_iterp, valp);
}
+
+HEADER_INLINE BoolErr ScanmovIntBounded(uint32_t abs_floor, uint32_t cap, const char** str_iterp, int32_t* valp) {
+ return ScanmovIntBounded32(abs_floor / 10, abs_floor % 10, cap / 10, cap % 10, str_iterp, valp);
+}
#endif
HEADER_INLINE BoolErr ScanmovUintDefcap(const char** str_iterp, uint32_t* valp) {
@@ -1524,6 +1533,11 @@ HEADER_INLINE uint32_t IsNanStr(const char* ss, uint32_t slen) {
return (slen == 2) || ((ctou32(ss[2]) & 0xdf) == 78);
}
+// Matches "inf"/"infinity", any capitalization, can have sign in front.
+// Assumes is_neg zero-initialized.
+// Assumes one-char overread is ok.
+uint32_t IsInfStr(const char* ss, uint32_t slen, uint32_t* is_neg_ptr);
+
HEADER_INLINE CXXCONST_CP AdvToDelimOrEnd(const char* str_iter, const char* str_end, char delim) {
CXXCONST_CP memchr_result = S_CAST(CXXCONST_CP, memchr(str_iter, delim, str_end - str_iter));
=====================================
pgenlibr/src/Makevars
=====================================
@@ -1,5 +1,5 @@
SOURCES = $(wildcard libdeflate/lib/*.c) $(wildcard libdeflate/lib/x86/*.c)
-OBJECTS = include/plink2_base.o include/pgenlib_misc.o include/pgenlib_read.o pvar_ffi_support.o pgenlib_ffi_support.o include/plink2_bgzf.o include/plink2_string.o include/plink2_text.o include/plink2_thread.o include/plink2_zstfile.o pvar.o pgenlibr.o RcppExports.o $(SOURCES:.c=.o)
+OBJECTS = include/plink2_base.o include/plink2_bits.o include/pgenlib_misc.o include/pgenlib_read.o pvar_ffi_support.o pgenlib_ffi_support.o include/plink2_bgzf.o include/plink2_string.o include/plink2_text.o include/plink2_thread.o include/plink2_zstfile.o pvar.o pgenlibr.o RcppExports.o $(SOURCES:.c=.o)
PKG_CFLAGS = -Ilibdeflate -Ilibdeflate/common
PKG_CPPFLAGS =
PKG_LIBS = -lpthread -lzstd
=====================================
plink2.cc
=====================================
@@ -66,7 +66,7 @@ static const char ver_str[] = "PLINK v2.00a3"
#ifdef USE_MKL
" Intel"
#endif
- " (17 Feb 2020)";
+ " (21 Mar 2020)";
static const char ver_str2[] =
// include leading space if day < 10, so character length stays the same
""
@@ -2523,12 +2523,12 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
if (pcp->command_flags1 & kfCommand1MakePlink2) {
// todo: unsorted case (--update-chr, etc.)
if (pcp->sort_vars_flags != kfSort0) {
- reterr = MakePlink2Vsort(xheader, sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, chr_idxs, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, make_plink2_flags, (pcp->sort_vars_flags == kfSortNatural), pcp->pvar_psam_flags, &simple_pgr, outname, outname_end);
+ reterr = MakePlink2Vsort(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, chr_idxs, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, make_plink2_flags, (pcp->sort_vars_flags == kfSortNatural), pcp->pvar_psam_flags, xheader, &simple_pgr, outname, outname_end);
} else {
if (vpos_sortstatus & kfUnsortedVarBp) {
logerrputs("Warning: Variants are not sorted by position. Consider rerunning with the\n--sort-vars flag added to remedy this.\n");
}
- reterr = MakePlink2NoVsort(xheader, sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, pcp->varid_template_str, pcp->varid_multi_template_str, pcp->varid_multi_nonsnp_template_str, pcp->missing_varid_match, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, pcp->new_variant_id_max_allele_slen, pcp->misc_flags, make_plink2_flags, pcp->pvar_psam_flags, pgr_alloc_cacheline_ct, &pgfi, &simple_pgr, outname, outname_end);
+ reterr = MakePlink2NoVsort(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, pcp->varid_template_str, pcp->varid_multi_template_str, pcp->varid_multi_nonsnp_template_str, pcp->missing_varid_match, xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_allele_ct, max_allele_slen, max_filter_slen, info_reload_slen, vpos_sortstatus, pcp->max_thread_ct, pcp->hard_call_thresh, pcp->dosage_erase_thresh, pcp->new_variant_id_max_allele_slen, pcp->misc_flags, make_plink2_flags, pcp->pvar_psam_flags, pgr_alloc_cacheline_ct, xheader, &pgfi, &simple_pgr, outname, outname_end);
}
if (unlikely(reterr)) {
goto Plink2Core_ret_1;
@@ -2538,7 +2538,7 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
}
if (pcp->command_flags1 & kfCommand1Exportf) {
- reterr = Exportf(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, &(pcp->exportf_info), xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_variant_id_slen, max_allele_slen, max_filter_slen, info_reload_slen, pcp->max_thread_ct, make_plink2_flags, pgr_alloc_cacheline_ct, xheader, &pgfi, &simple_pgr, outname, outname_end);
+ reterr = Exportf(sample_include, &pii, sex_nm, sex_male, pheno_cols, pheno_names, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, &(pcp->exportf_info), xheader_blen, info_flags, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_variant_id_slen, max_allele_slen, max_filter_slen, info_reload_slen, vpos_sortstatus, pcp->max_thread_ct, make_plink2_flags, pgr_alloc_cacheline_ct, xheader, &pgfi, &simple_pgr, outname, outname_end);
if (unlikely(reterr)) {
goto Plink2Core_ret_1;
}
@@ -5943,8 +5943,19 @@ int main(int argc, char** argv) {
}
pc.filter_flags |= kfFilterPsamReq;
} else if (strequal_k_unsafe(flagname_p2, "eep-autoconv")) {
+ if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 0, 1))) {
+ goto main_ret_INVALID_CMDLINE_2A;
+ }
+ if (param_ct) {
+ const char* cur_modif = argvk[arg_idx + 1];
+ if (likely(!strcmp(cur_modif, "vzs"))) {
+ import_flags |= kfImportKeepAutoconvVzs;
+ } else {
+ snprintf(g_logbuf, kLogbufSize, "Error: Invalid --keep-autoconv argument '%s'.\n", cur_modif);
+ goto main_ret_INVALID_CMDLINE_WWA;
+ }
+ }
import_flags |= kfImportKeepAutoconv;
- goto main_param_zero;
} else if (strequal_k_unsafe(flagname_p2, "eep-females")) {
pc.filter_flags |= kfFilterPsamReq | kfFilterExclMales | kfFilterExclNosex;
goto main_param_zero;
@@ -6424,12 +6435,12 @@ int main(int argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
explicit_pvar_cols = 1;
- reterr = ParseColDescriptor(&(cur_modif[strlen("pvar-cols=")]), "xheader\0maybequal\0qual\0maybefilter\0filter\0maybeinfo\0info\0maybecm\0cm\0", "--make-pgen pvar-cols", kfPvarColXheader, kfPvarColDefault, 0, &pc.pvar_psam_flags);
+ reterr = ParseColDescriptor(&(cur_modif[strlen("pvar-cols=")]), "xheader\0vcfheader\0maybequal\0qual\0maybefilter\0filter\0maybeinfo\0info\0maybecm\0cm\0", "--make-pgen pvar-cols", kfPvarColXheader, kfPvarColDefault, 0, &pc.pvar_psam_flags);
if (unlikely(reterr)) {
goto main_ret_1;
}
- if (unlikely((pc.pvar_psam_flags & kfPvarColXinfo) && (!(pc.pvar_psam_flags & kfPvarColXheader)))) {
- logerrputs("Error: --make-pgen pvar-cols= expression cannot exclude xheader when info is\npresent.\n");
+ if (unlikely((pc.pvar_psam_flags & kfPvarColXinfo) && (!(pc.pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader))))) {
+ logerrputs("Error: --make-pgen pvar-cols= expression cannot exclude xheader/vcfheader when\ninfo is present.\n");
goto main_ret_INVALID_CMDLINE_A;
}
} else if (StrStartsWith(cur_modif, "format=", cur_modif_slen)) {
@@ -6585,12 +6596,12 @@ int main(int argc, char** argv) {
goto main_ret_INVALID_CMDLINE;
}
explicit_cols = 1;
- reterr = ParseColDescriptor(&(cur_modif[5]), "xheader\0maybequal\0qual\0maybefilter\0filter\0maybeinfo\0info\0maybecm\0cm\0", "make-just-pvar", kfPvarColXheader, kfPvarColDefault, 0, &pc.pvar_psam_flags);
+ reterr = ParseColDescriptor(&(cur_modif[5]), "xheader\0vcfheader\0maybequal\0qual\0maybefilter\0filter\0maybeinfo\0info\0maybecm\0cm\0", "make-just-pvar", kfPvarColXheader, kfPvarColDefault, 0, &pc.pvar_psam_flags);
if (unlikely(reterr)) {
goto main_ret_1;
}
- if (unlikely((pc.pvar_psam_flags & kfPvarColXinfo) && (!(pc.pvar_psam_flags & kfPvarColXheader)))) {
- logerrputs("Error: --make-just-pvar cols= expression cannot exclude xheader when info is\npresent.\n");
+ if (unlikely((pc.pvar_psam_flags & kfPvarColXinfo) && (!(pc.pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader))))) {
+ logerrputs("Error: --make-just-pvar cols= expression cannot exclude xheader/vcfheader when\ninfo is present.\n");
goto main_ret_INVALID_CMDLINE_A;
}
if (pc.pvar_psam_flags & kfPvarColInfo) {
@@ -9858,29 +9869,53 @@ int main(int argc, char** argv) {
const uint32_t convname_slen = convname_end - outname;
uint32_t pgen_generated = 1;
uint32_t psam_generated = 1;
+
+ // Compress by default for VCF/BCF due to potentially large INFO
+ // section; otherwise only do it in "--keep-autoconv vzs" case.
+ uint32_t pvar_is_compressed;
if (xload & (kfXloadVcf | kfXloadBcf)) {
const uint32_t no_samples_ok = !(pc.dependency_flags & (kfFilterAllReq | kfFilterPsamReq));
const uint32_t is_vcf = (xload / kfXloadVcf) & 1;
+ pvar_is_compressed = ((import_flags & (kfImportKeepAutoconv | kfImportKeepAutoconvVzs)) != kfImportKeepAutoconv);
if (no_samples_ok && is_vcf && (!(import_flags & kfImportKeepAutoconv)) && pc.command_flags1) {
// special case: just treat the VCF as a .pvar file
strcpy(pvarname, pgenname);
pgenname[0] = '\0';
goto main_reinterpret_vcf_instead_of_converting;
- } else if (is_vcf) {
+ }
+ // Default to compression level 1 for temporary .pvar files for now.
+ //
+ // Level 1 may actually be best in a much wider variety of scenarios
+ // as of this writing, but I won't try to tune any other compression
+ // defaults for now. Interestingly, the current setup is actually a
+ // bit backwards given observed zstd behavior: the long INFO fields
+ // motivating automatic compression here are best handled with level
+ // 3, while the other simpler text files generated by plink2 are
+ // likely to compress *better*, not just faster, with level 1 for
+ // some reason.
+ const uint32_t zst_level = g_zst_level;
+ if (!(import_flags & kfImportKeepAutoconv)) {
+ g_zst_level = 1;
+ }
+ if (is_vcf) {
reterr = VcfToPgen(pgenname, (load_params & kfLoadParamsPsam)? psamname : nullptr, const_fid, vcf_dosage_import_field, pc.misc_flags, import_flags, no_samples_ok, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, id_delim, idspace_to, vcf_min_gq, vcf_min_dp, vcf_max_dp, vcf_half_call, pc.fam_cols, pc.max_thread_ct, outname, convname_end, &chr_info, &pgen_generated, &psam_generated);
} else {
reterr = BcfToPgen(pgenname, (load_params & kfLoadParamsPsam)? psamname : nullptr, const_fid, vcf_dosage_import_field, pc.misc_flags, import_flags, no_samples_ok, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, id_delim, idspace_to, vcf_min_gq, vcf_min_dp, vcf_max_dp, vcf_half_call, pc.fam_cols, pc.max_thread_ct, outname, convname_end, &chr_info, &pgen_generated, &psam_generated);
}
- } else if (xload & kfXloadOxGen) {
- reterr = OxGenToPgen(pgenname, psamname, import_single_chr_str, ox_missing_code, pc.misc_flags, import_flags, oxford_import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, pc.max_thread_ct, outname, convname_end, &chr_info);
- } else if (xload & kfXloadOxBgen) {
- reterr = OxBgenToPgen(pgenname, psamname, const_fid, import_single_chr_str, ox_missing_code, pc.misc_flags, import_flags, oxford_import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, id_delim, idspace_to, pc.max_thread_ct, outname, convname_end, &chr_info);
- } else if (xload & kfXloadOxHaps) {
- reterr = OxHapslegendToPgen(pgenname, pvarname, psamname, import_single_chr_str, ox_missing_code, pc.misc_flags, import_flags, oxford_import_flags, pc.max_thread_ct, outname, convname_end, &chr_info);
- } else if (xload & kfXloadPlink1Dosage) {
- reterr = Plink1DosageToPgen(pgenname, psamname, (xload & kfXloadMap)? pvarname : nullptr, import_single_chr_str, &plink1_dosage_info, pc.misc_flags, import_flags, pc.fam_cols, pc.missing_pheno, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, pc.max_thread_ct, outname, convname_end, &chr_info);
- } else if (xload & kfXloadGenDummy) {
- reterr = GenerateDummy(&gendummy_info, pc.misc_flags, import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, pc.max_thread_ct, &main_sfmt, outname, convname_end, &chr_info);
+ g_zst_level = zst_level;
+ } else {
+ pvar_is_compressed = (import_flags / kfImportKeepAutoconvVzs) & 1;
+ if (xload & kfXloadOxGen) {
+ reterr = OxGenToPgen(pgenname, psamname, import_single_chr_str, ox_missing_code, pc.misc_flags, import_flags, oxford_import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, pc.max_thread_ct, outname, convname_end, &chr_info);
+ } else if (xload & kfXloadOxBgen) {
+ reterr = OxBgenToPgen(pgenname, psamname, const_fid, import_single_chr_str, ox_missing_code, pc.misc_flags, import_flags, oxford_import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, id_delim, idspace_to, pc.max_thread_ct, outname, convname_end, &chr_info);
+ } else if (xload & kfXloadOxHaps) {
+ reterr = OxHapslegendToPgen(pgenname, pvarname, psamname, import_single_chr_str, ox_missing_code, pc.misc_flags, import_flags, oxford_import_flags, pc.max_thread_ct, outname, convname_end, &chr_info);
+ } else if (xload & kfXloadPlink1Dosage) {
+ reterr = Plink1DosageToPgen(pgenname, psamname, (xload & kfXloadMap)? pvarname : nullptr, import_single_chr_str, &plink1_dosage_info, pc.misc_flags, import_flags, pc.fam_cols, pc.missing_pheno, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, pc.max_thread_ct, outname, convname_end, &chr_info);
+ } else if (xload & kfXloadGenDummy) {
+ reterr = GenerateDummy(&gendummy_info, pc.misc_flags, import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, pc.max_thread_ct, &main_sfmt, outname, convname_end, &chr_info);
+ }
}
if (reterr || (!pc.command_flags1)) {
goto main_ret_1;
@@ -9893,7 +9928,7 @@ int main(int argc, char** argv) {
if (pgen_generated) {
snprintf(memcpya(pgenname, outname, convname_slen), kMaxOutfnameExtBlen - 10, ".pgen");
}
- snprintf(memcpya(pvarname, outname, convname_slen), kMaxOutfnameExtBlen - 10, ".pvar");
+ snprintf(memcpya(pvarname, outname, convname_slen), kMaxOutfnameExtBlen - 10, pvar_is_compressed? ".pvar.zst" : ".pvar");
if (psam_generated) {
snprintf(memcpya(psamname, outname, convname_slen), kMaxOutfnameExtBlen - 10, ".psam");
}
=====================================
plink2_cmdline.cc
=====================================
@@ -3856,6 +3856,10 @@ PglErr CmdlineParsePhase2(const char* ver_str, const char* errstr_append, const
uint32_t arg_idx = flag_map[cur_flag_idx] + 1;
while ((arg_idx < S_CAST(uint32_t, argc)) && (!IsCmdlineFlag(argvk[arg_idx]))) {
logputs(" ");
+ // Thought about special-casing argvk[arg_idx] == " ", so that
+ // "--id-delim ' '" actually works with --rerun, but that adds too much
+ // complexity to the rereader to be worth it. Instead we just document
+ // the incompatibility.
logputs(argvk[arg_idx++]);
}
logputs("\n");
=====================================
plink2_cmdline.h
=====================================
@@ -1761,12 +1761,35 @@ typedef struct CmpExprStruct {
// arguably too small for noncopyable to be a great idea, but the next
// iteration of this struct probably won't have that issue.
NONCOPYABLE(CmpExprStruct);
- // Restrict to [pheno/covar name] [operator] [pheno val] for now. could
- // support or/and, parentheses, etc. later.
-
- // Currently stores null-terminated pheno/covar name, followed by
- // null-terminated value string. Storage format needs to be synced with
- // ValidateAndAllocCmpExpr().
+ // Restrict to <key> <operator> <value> for now; key=INFO/pheno/covar.
+ //
+ // Almost certainly want to support conjunctions/disjunctions later; but
+ // there are complications regarding the command-line interface:
+ // * Phenotype/covariate names and VCFv4.2 INFO keys can contain parenthesis
+ // and mathematical-operator characters. And we actually need to permit at
+ // least the latter, since autogenerated categorical-covariate names
+ // contain '=' for a good reason. Some sort of escaping is probably in
+ // in order, but...
+ // * Escaping is not a big deal in a shell script, but most
+ // conjunctions/disjunctions can already be emulated easily enough in a
+ // shell script by calling plink2 multiple times. While there's some
+ // execution-time improvement, the main value-add from directly supporting
+ // logical operations in --keep-if/--extract-if-info is reduced
+ // interactive-use cognitive load.
+ // * We can't treat multiple command-line arguments differently from the
+ // corresponding space-containing single command-line argument; otherwise
+ // --rerun breaks. So "multiple-argument-form supports keys with special
+ // characters but not conjunctions/disjunctions; single-argument-form is
+ // the other way around" is not an option.
+ // The least-bad solution I've thought of is to add --keep-if-file/etc. flags
+ // which read the expression from a short text file. In that case, we'd at
+ // least be able to define normal quoting and escaping rules without worrying
+ // about confusing interactions with bash. Deferring implementation for now
+ // in hopes of coming up with a better idea, but this should go in before
+ // beta testing begins.
+
+ // Currently stores null-terminated key, followed by null-terminated value
+ // string. Storage format needs to be synced with ValidateAndAllocCmpExpr().
char* pheno_name;
CmpBinaryOp binary_op;
} CmpExpr;
=====================================
plink2_common.cc
=====================================
@@ -260,6 +260,56 @@ void PopulateRescaledDosage(const uintptr_t* genoarr, const uintptr_t* dosage_pr
}
}
+static_assert(sizeof(AlleleCode) == 1, "AtLeastOneMultiallelicHet() needs to be updated.");
+uint32_t AtLeastOneMultiallelicHet(const PgenVariant* pgvp, uint32_t sample_ct) {
+ if (pgvp->patch_01_ct) {
+ return 1;
+ }
+ {
+ const uintptr_t* genoarr = pgvp->genovec;
+ const uint32_t fullvec_ct = sample_ct / kBitsPerWordD2;
+ for (uint32_t uii = 0; uii != fullvec_ct; ++uii) {
+ const uintptr_t geno_word = genoarr[uii];
+ if (Word01(geno_word)) {
+ return 1;
+ }
+ }
+ const uint32_t remainder = sample_ct % kBitsPerWordD2;
+ if (remainder) {
+ if (Word01(bzhi(genoarr[fullvec_ct], 2 * remainder))) {
+ return 1;
+ }
+ }
+ }
+ const uint32_t patch_10_ct = pgvp->patch_10_ct;
+ if (patch_10_ct) {
+ const AlleleCode* patch_10_vals = pgvp->patch_10_vals;
+#ifdef __LP64__
+ const uint32_t fullvec_ct = patch_10_ct / (kBytesPerVec / 2);
+ const VecU16* patch_10_valias = R_CAST(const VecU16*, patch_10_vals);
+ const VecU16 m8 = vecu16_set1(0xff);
+ for (uint32_t vidx = 0; vidx != fullvec_ct; ++vidx) {
+ const VecU16 vv_orig = vecu16_loadu(&(patch_10_valias[vidx]));
+ const VecU16 vv_hi = vecu16_srli(vv_orig, 8);
+ const VecU16 vv_lo = vv_orig & m8;
+ const VecU16 vv_eq = (vv_hi == vv_lo);
+ if (vecu16_movemask(vv_eq) != kVec8thUintMax) {
+ return 1;
+ }
+ }
+ uint32_t uii = fullvec_ct * (kBytesPerVec / 2);
+#else
+ uint32_t uii = 0;
+#endif
+ for (; uii != patch_10_ct; ++uii) {
+ if (patch_10_vals[2 * uii] != patch_10_vals[2 * uii + 1]) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
void SetHetMissing(uintptr_t word_ct, uintptr_t* genovec) {
// 01 -> 11, nothing else changes
#ifdef __LP64__
@@ -1401,9 +1451,8 @@ char* ChrNameStd(const ChrInfo* cip, uint32_t chr_idx, char* buf) {
char* chrtoa(const ChrInfo* cip, uint32_t chr_idx, char* buf) {
// assumes chr_idx is valid
if (!chr_idx) {
- // TODO: probably add 'chr' in front here when output encoding calls for
- // it, but wait till beta since this would technically be
- // compatibility-breaking
+ // Note that this never has a 'chr' prefix. Which is probably fine, it
+ // isn't a real chromosome.
*buf++ = '0';
return buf;
}
@@ -1581,9 +1630,12 @@ uint32_t GetChrCode(const char* chr_name, const ChrInfo* cip, uint32_t name_slen
}
uint32_t GetChrCodeCounted(const ChrInfo* cip, uint32_t name_slen, char* chr_name) {
- // when the chromosome name isn't null-terminated
- // (yeah, probably want to revise the called functions so that chr_name
- // doesn't need to be mutable here)
+ // When the chromosome name isn't null-terminated.
+ // Yeah, probably want to revise this so that chr_name doesn't need to be
+ // mutable here. However, that currently requires new substitutes for both
+ // GetChrCodeRaw AND IdHtableFind (IdHtableFindNnt doesn't work due to
+ // overread); when either of those are needed for some other reason, that's a
+ // good time to revise this function.
char* s_end = &(chr_name[name_slen]);
const char tmpc = *s_end;
*s_end = '\0';
=====================================
plink2_common.h
=====================================
@@ -201,26 +201,27 @@ FLAGSET_DEF_START()
kfPvarZs = (1 << 0),
kfPvarColXheader = (1 << 1),
- kfPvarColMaybequal = (1 << 2),
- kfPvarColQual = (1 << 3),
- kfPvarColMaybefilter = (1 << 4),
- kfPvarColFilter = (1 << 5),
- kfPvarColMaybeinfo = (1 << 6),
- kfPvarColInfo = (1 << 7),
+ kfPvarColVcfheader = (1 << 2),
+ kfPvarColMaybequal = (1 << 3),
+ kfPvarColQual = (1 << 4),
+ kfPvarColMaybefilter = (1 << 5),
+ kfPvarColFilter = (1 << 6),
+ kfPvarColMaybeinfo = (1 << 7),
+ kfPvarColInfo = (1 << 8),
kfPvarColXinfo = (kfPvarColInfo * 2) - kfPvarColMaybeinfo,
- kfPvarColMaybecm = (1 << 8),
- kfPvarColCm = (1 << 9),
+ kfPvarColMaybecm = (1 << 9),
+ kfPvarColCm = (1 << 10),
kfPvarColDefault = (kfPvarColXheader | kfPvarColMaybequal | kfPvarColMaybefilter | kfPvarColMaybeinfo | kfPvarColMaybecm),
kfPvarColAll = ((kfPvarColCm * 2) - kfPvarColXheader),
- kfPsamColMaybefid = (1 << 10),
- kfPsamColFid = (1 << 11),
- kfPsamColMaybesid = (1 << 12),
- kfPsamColSid = (1 << 13),
- kfPsamColMaybeparents = (1 << 14),
- kfPsamColParents = (1 << 15),
- kfPsamColSex = (1 << 16),
- kfPsamColPheno1 = (1 << 17),
- kfPsamColPhenos = (1 << 18),
+ kfPsamColMaybefid = (1 << 11),
+ kfPsamColFid = (1 << 12),
+ kfPsamColMaybesid = (1 << 13),
+ kfPsamColSid = (1 << 14),
+ kfPsamColMaybeparents = (1 << 15),
+ kfPsamColParents = (1 << 16),
+ kfPsamColSex = (1 << 17),
+ kfPsamColPheno1 = (1 << 18),
+ kfPsamColPhenos = (1 << 19),
kfPsamColDefault = (kfPsamColMaybefid | kfPsamColMaybesid | kfPsamColMaybeparents | kfPsamColSex | kfPsamColPhenos),
kfPsamColAll = ((kfPsamColPhenos * 2) - kfPsamColMaybefid)
FLAGSET_DEF_END(PvarPsamFlags);
@@ -382,6 +383,20 @@ void PopulateDenseDosage(const uintptr_t* genoarr, const uintptr_t* dosage_prese
void PopulateRescaledDosage(const uintptr_t* genoarr, const uintptr_t* dosage_present, const Dosage* dosage_main, double slope, double intercept, double missing_val, uint32_t sample_ct, uint32_t dosage_ct, double* expanded_dosages);
+// assumes trailing bits of genoarr are zeroed out
+HEADER_INLINE uint32_t AtLeastOneHetUnsafe(const uintptr_t* genoarr, uint32_t sample_ct) {
+ const uint32_t sample_ctl2 = DivUp(sample_ct, kBitsPerWordD2);
+ for (uint32_t uii = 0; uii != sample_ctl2; ++uii) {
+ const uintptr_t geno_word = genoarr[uii];
+ if (Word01(geno_word)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+uint32_t AtLeastOneMultiallelicHet(const PgenVariant* pgvp, uint32_t sample_ct);
+
void SetHetMissing(uintptr_t word_ct, uintptr_t* genovec);
void SetHetMissingCleardosage(uintptr_t word_ct, uintptr_t* genovec, uint32_t* write_dosage_ct_ptr, uintptr_t* dosagepresent, Dosage* dosage_main);
@@ -691,6 +706,7 @@ PglErr FinalizeChrInfo(ChrInfo* cip);
void CleanupChrInfo(ChrInfo* cip);
// assumes chr_idx is valid
+// note that chr_idx == 0 is always rendered as '0', never 'chr0'
char* chrtoa(const ChrInfo* cip, uint32_t chr_idx, char* buf);
uint32_t GetMaxChrSlen(const ChrInfo* cip);
@@ -1068,6 +1084,17 @@ HEADER_INLINE BoolErr StoreStringAtEndK(unsigned char* arena_bottom, const char*
return 0;
}
+HEADER_INLINE BoolErr StoreStringAndPrecharAtEnd(unsigned char* arena_bottom, const char* src, unsigned char prechar, uintptr_t slen, unsigned char** arena_top_ptr, char** dst) {
+ if (PtrWSubCk(arena_bottom, slen + 2, arena_top_ptr)) {
+ return 1;
+ }
+ **arena_top_ptr = prechar;
+ char* dst_write = 1 + R_CAST(char*, *arena_top_ptr);
+ memcpyx(dst_write, src, slen, '\0');
+ *dst = dst_write;
+ return 0;
+}
+
// These use g_textbuf.
PglErr WriteSampleIdsOverride(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct, SampleIdFlags override_flags);
HEADER_INLINE PglErr WriteSampleIds(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct) {
=====================================
plink2_data.cc
=====================================
@@ -20,6 +20,8 @@
#include "plink2_data.h"
#include "plink2_pvar.h"
+#include <time.h>
+
#ifdef __cplusplus
namespace plink2 {
#endif
@@ -266,7 +268,262 @@ void AppendChrsetLine(const ChrInfo* cip, char** write_iter_ptr) {
AppendBinaryEoln(write_iter_ptr);
}
-PglErr WritePvar(const char* outname, const char* xheader, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct) {
+// fileformat, fileDate, source
+void AppendVcfHeaderStart(uint32_t v43, char** cswritepp) {
+ char* cswritep = *cswritepp;
+ cswritep = strcpya_k(cswritep, "##fileformat=VCFv4.");
+ *cswritep++ = v43 + '2';
+ cswritep = strcpya_k(cswritep, EOLN_STR "##fileDate=");
+ time_t rawtime;
+ time(&rawtime);
+ const struct tm* loctime = localtime(&rawtime);
+ cswritep += strftime(cswritep, kMaxMediumLine, "%Y%m%d", loctime);
+ cswritep = strcpya_k(cswritep, EOLN_STR "##source=PLINKv2.00" EOLN_STR);
+ *cswritepp = cswritep;
+ return;
+}
+
+// Note that the order-of-operations page lists this as happening right after
+// the filtering performed by LoadPvar(). Which is effectively true, since we
+// ignore variant_include (this is safe since LoadPvar() always initializes
+// all variant_bps[] and allele_storage[] entries appropriately).
+// possible todo: ChrInfo can have a length field, which is initialized by the
+// ##contig header line when possible, but when that doesn't exist LoadPvar()
+// can conditionally detect INFO:END and take that into account. (Or a reason
+// to keep the entire info_end array in memory may emerge.)
+uint32_t ChrLenLbound(const ChrInfo* cip, const uint32_t* variant_bps, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uint32_t* new_variant_idx_to_old, uint32_t chr_fo_idx, uint32_t max_allele_slen, UnsortedVar vpos_sortstatus) {
+ const uint32_t vidx_start = cip->chr_fo_vidx_start[chr_fo_idx];
+ const uint32_t vidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+ assert(vidx_start != vidx_end);
+ if (!(vpos_sortstatus & kfUnsortedVarBp)) {
+ if (!new_variant_idx_to_old) {
+ if (max_allele_slen == 1) {
+ return variant_bps[vidx_end - 1];
+ }
+ uint32_t bp_end = 0;
+ for (uint32_t vidx = vidx_end; vidx != vidx_start; ) {
+ --vidx;
+ const uint32_t cur_bp = variant_bps[vidx];
+ if (cur_bp + max_allele_slen <= bp_end) {
+ break;
+ }
+ uintptr_t allele_idx_offset_base = vidx * 2;
+ if (allele_idx_offsets) {
+ allele_idx_offset_base = allele_idx_offsets[vidx];
+ }
+ // We only care about reference-allele length.
+ const uint32_t cur_bp_end = cur_bp + strlen(allele_storage[allele_idx_offset_base]) - 1;
+ if (cur_bp_end > bp_end) {
+ bp_end = cur_bp_end;
+ }
+ }
+ return bp_end;
+ }
+ if (max_allele_slen == 1) {
+ return variant_bps[new_variant_idx_to_old[vidx_end - 1]];
+ }
+ uint32_t bp_end = 0;
+ for (uint32_t new_vidx = vidx_end; new_vidx != vidx_start; ) {
+ --new_vidx;
+ const uint32_t old_vidx = new_variant_idx_to_old[new_vidx];
+ const uint32_t cur_bp = variant_bps[old_vidx];
+ if (cur_bp + max_allele_slen <= bp_end) {
+ break;
+ }
+ uintptr_t allele_idx_offset_base = old_vidx * 2;
+ if (allele_idx_offsets) {
+ allele_idx_offset_base = allele_idx_offsets[old_vidx];
+ }
+ const uint32_t cur_bp_end = cur_bp + strlen(allele_storage[allele_idx_offset_base]) - 1;
+ if (cur_bp_end > bp_end) {
+ bp_end = cur_bp_end;
+ }
+ }
+ return bp_end;
+ }
+ uint32_t bp_end = U32ArrMax(&(variant_bps[vidx_start]), vidx_end - vidx_start);
+ if (max_allele_slen == 1) {
+ return bp_end;
+ }
+ uint32_t min_check_bp = 0;
+ if (bp_end >= max_allele_slen) {
+ min_check_bp = bp_end + 1 - max_allele_slen;
+ }
+ for (uint32_t vidx = vidx_start; vidx != vidx_end; ++vidx) {
+ const uint32_t cur_bp = variant_bps[vidx];
+ if (cur_bp < min_check_bp) {
+ continue;
+ }
+ uintptr_t allele_idx_offset_base = vidx * 2;
+ if (allele_idx_offsets) {
+ allele_idx_offset_base = allele_idx_offsets[vidx];
+ }
+ const uint32_t cur_bp_end = cur_bp + strlen(allele_storage[allele_idx_offset_base]) - 1;
+ if (cur_bp_end > bp_end) {
+ bp_end = cur_bp_end;
+ }
+ }
+ return bp_end;
+}
+
+PglErr PvarXheaderWrite(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uint32_t* new_variant_idx_to_old, uintptr_t xheader_blen, uint32_t vcfheader, uint32_t write_filter, uint32_t write_info, uint32_t append_info_pr_header_line, uint32_t max_allele_slen, UnsortedVar vpos_sortstatus, char* xheader, CompressStreamState* css_ptr, char** cswritepp) {
+ unsigned char* bigstack_mark = g_bigstack_base;
+ PglErr reterr = kPglRetSuccess;
+ {
+ if (!vcfheader) {
+ if (write_filter && write_info) {
+ if (unlikely(CsputsStd(xheader, xheader_blen, css_ptr, cswritepp))) {
+ goto PvarXheaderWrite_ret_WRITE_FAIL;
+ }
+ } else {
+ // Filter out FILTER/INFO definitions iff the corresponding column has
+ // been removed.
+ const char* copy_start = xheader;
+ const char* xheader_end = &(xheader[xheader_blen]);
+ for (const char* xheader_iter = xheader; xheader_iter != xheader_end; ) {
+ const char* next_line_start = AdvPastDelim(xheader_iter, '\n');
+ if (((!write_filter) && StrStartsWithUnsafe(xheader_iter, "##FILTER=<ID=")) ||
+ ((!write_info) && StrStartsWithUnsafe(xheader_iter, "##INFO=<ID="))) {
+ if (copy_start != xheader_iter) {
+ if (unlikely(CsputsStd(copy_start, xheader_iter - copy_start, css_ptr, cswritepp))) {
+ goto PvarXheaderWrite_ret_WRITE_FAIL;
+ }
+ }
+ copy_start = next_line_start;
+ }
+ xheader_iter = next_line_start;
+ }
+ if (copy_start != xheader_end) {
+ if (unlikely(CsputsStd(copy_start, xheader_end - copy_start, css_ptr, cswritepp))) {
+ goto PvarXheaderWrite_ret_WRITE_FAIL;
+ }
+ }
+ }
+ } else {
+ // See the start of ExportVcf().
+ AppendVcfHeaderStart(1, cswritepp);
+ const uint32_t chr_ctl = BitCtToWordCt(cip->chr_ct);
+ uintptr_t* written_contig_header_lines;
+ if (unlikely(bigstack_calloc_w(chr_ctl, &written_contig_header_lines))) {
+ goto PvarXheaderWrite_ret_NOMEM;
+ }
+ uint32_t contig_zero_written = 0;
+ char* cswritep = *cswritepp;
+ // ExportVcf() has to perform a customized --merge-par operation, so it
+ // has special handling of chrX/PAR1/PAR2 ##contig header lines. We omit
+ // that here.
+ char* xheader_end = &(xheader[xheader_blen]);
+ for (char* line_end = xheader; line_end != xheader_end; ) {
+ char* line_start = line_end;
+ line_end = AdvPastDelim(line_start, '\n');
+ const uint32_t slen = line_end - line_start;
+ if ((slen > 14) && StrStartsWithUnsafe(line_start, "##contig=<ID=")) {
+ char* contig_name_start = &(line_start[13]);
+ char* contig_name_end = S_CAST(char*, memchr(contig_name_start, ',', slen - 14));
+ if (!contig_name_end) {
+ // if this line is technically well-formed (ends in '>'), it's
+ // useless anyway, throw it out
+ continue;
+ }
+ const uint32_t chr_idx = GetChrCodeCounted(cip, contig_name_end - contig_name_start, contig_name_start);
+ if (IsI32Neg(chr_idx) || (!IsSet(cip->chr_mask, chr_idx))) {
+ continue;
+ }
+ const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
+ if (unlikely(IsSet(written_contig_header_lines, chr_fo_idx))) {
+ logerrputs("Error: Duplicate ##contig line in .pvar file.\n");
+ goto PvarXheaderWrite_ret_MALFORMED_INPUT;
+ }
+ SetBit(chr_fo_idx, written_contig_header_lines);
+ // if --output-chr was used at some point, we need to sync the
+ // ##contig chromosome code with the code in the .pvar body.
+ char* chr_name_write_start = strcpya_k(cswritep, "##contig=<ID=");
+ char* chr_name_write_end = chrtoa(cip, chr_idx, chr_name_write_start);
+ if ((*chr_name_write_start == '0') && (chr_name_write_end == &(chr_name_write_start[1]))) {
+ // --allow-extra-chr 0 special case
+ // note that cswritep has *not* been advanced
+ contig_zero_written = 1; // technically we write this a bit later
+ continue;
+ }
+ cswritep = chr_name_write_end;
+ if (unlikely(Cswrite(css_ptr, &cswritep))) {
+ goto PvarXheaderWrite_ret_WRITE_FAIL;
+ }
+ if (unlikely(CsputsStd(contig_name_end, line_end - contig_name_end, css_ptr, &cswritep))) {
+ goto PvarXheaderWrite_ret_WRITE_FAIL;
+ }
+ } else {
+ if (!write_filter) {
+ if (StrStartsWithUnsafe(line_start, "##FILTER=<ID=")) {
+ continue;
+ }
+ }
+ if (!write_info) {
+ if (StrStartsWithUnsafe(line_start, "##INFO=<ID=")) {
+ continue;
+ }
+ }
+ if (unlikely(CsputsStd(line_start, slen, css_ptr, &cswritep))) {
+ goto PvarXheaderWrite_ret_WRITE_FAIL;
+ }
+ }
+ }
+ // fill in the missing ##contig lines
+ if (contig_zero_written) {
+ cswritep = strcpya_k(cswritep, "##contig=<ID=0,length=2147483645>" EOLN_STR);
+ }
+ for (uint32_t chr_fo_idx = 0; chr_fo_idx != cip->chr_ct; ++chr_fo_idx) {
+ if (IsSet(written_contig_header_lines, chr_fo_idx)) {
+ continue;
+ }
+ const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+ // AllBitsAreZero() doesn't do what we want in the --sort-vars case,
+ // but fortunately we don't need it there.
+ if ((!IsSet(cip->chr_mask, chr_idx)) || (variant_include && AllBitsAreZero(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1]))) {
+ continue;
+ }
+ char* chr_name_write_start = strcpya_k(cswritep, "##contig=<ID=");
+ char* chr_name_write_end = chrtoa(cip, chr_idx, chr_name_write_start);
+ if ((*chr_name_write_start == '0') && (chr_name_write_end == &(chr_name_write_start[1]))) {
+ // --allow-extra-chr 0 special case
+ if (contig_zero_written) {
+ continue;
+ }
+ contig_zero_written = 1;
+ cswritep = strcpya_k(chr_name_write_end, ",length=2147483645");
+ } else {
+ cswritep = strcpya_k(chr_name_write_end, ",length=");
+ const uint32_t pos_end = ChrLenLbound(cip, variant_bps, allele_idx_offsets, allele_storage, new_variant_idx_to_old, chr_fo_idx, max_allele_slen, vpos_sortstatus);
+ cswritep = u32toa(pos_end, cswritep);
+ }
+ *cswritep++ = '>';
+ AppendBinaryEoln(&cswritep);
+ if (unlikely(Cswrite(css_ptr, &cswritep))) {
+ goto PvarXheaderWrite_ret_WRITE_FAIL;
+ }
+ }
+ *cswritepp = cswritep;
+ }
+ if (append_info_pr_header_line) {
+ *cswritepp = strcpya_k(*cswritepp, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+ }
+ }
+ while (0) {
+ PvarXheaderWrite_ret_NOMEM:
+ reterr = kPglRetNomem;
+ break;
+ PvarXheaderWrite_ret_WRITE_FAIL:
+ reterr = kPglRetWriteFail;
+ break;
+ PvarXheaderWrite_ret_MALFORMED_INPUT:
+ reterr = kPglRetMalformedInput;
+ break;
+ }
+ BigstackReset(bigstack_mark);
+ return reterr;
+}
+
+PglErr WritePvar(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
// allele_presents must be nullptr unless we're trimming alt alleles
// split/join cases handled by WritePvarSplit() and WritePvarJoin()
unsigned char* bigstack_mark = g_bigstack_base;
@@ -316,37 +573,10 @@ PglErr WritePvar(const char* outname, const char* xheader, const uintptr_t* vari
char* pvar_info_line_iter = nullptr;
uint32_t info_col_idx = 0; // could save this during first load instead
const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
- if (pvar_psam_flags & kfPvarColXheader) {
- if (write_filter && write_info) {
- if (unlikely(CsputsStd(xheader, xheader_blen, &css, &cswritep))) {
- goto WritePvar_ret_WRITE_FAIL;
- }
- } else {
- // Filter out FILTER/INFO definitions iff the corresponding column has
- // been removed.
- const char* copy_start = xheader;
- const char* xheader_end = &(xheader[xheader_blen]);
- for (const char* xheader_iter = xheader; xheader_iter != xheader_end; ) {
- const char* next_line_start = AdvPastDelim(xheader_iter, '\n');
- if (((!write_filter) && StrStartsWithUnsafe(xheader_iter, "##FILTER=<ID=")) ||
- ((!write_info) && StrStartsWithUnsafe(xheader_iter, "##INFO=<ID="))) {
- if (copy_start != xheader_iter) {
- if (unlikely(CsputsStd(copy_start, xheader_iter - copy_start, &css, &cswritep))) {
- goto WritePvar_ret_WRITE_FAIL;
- }
- }
- copy_start = next_line_start;
- }
- xheader_iter = next_line_start;
- }
- if (copy_start != xheader_end) {
- if (unlikely(CsputsStd(copy_start, xheader_end - copy_start, &css, &cswritep))) {
- goto WritePvar_ret_WRITE_FAIL;
- }
- }
- }
- if (write_info_pr && (!info_pr_flag_present)) {
- cswritep = strcpya_k(cswritep, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+ if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
+ reterr = PvarXheaderWrite(variant_include, cip, variant_bps, allele_idx_offsets, allele_storage, nullptr, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, vpos_sortstatus, xheader, &css, &cswritep);
+ if (unlikely(reterr)) {
+ goto WritePvar_ret_1;
}
}
// bugfix (30 Jul 2017): may be necessary to reload INFO when no ## lines
@@ -2586,7 +2816,7 @@ uintptr_t InitWriteAlleleIdxOffsets(const uintptr_t* variant_include, const uint
// is printed if INFO:END isn't defined, and an error occurs if either REF
// is multi-character, or there's an INFO:END mismatch.
// - Error out if REF alleles aren't all consistent, or any ALT allele is
-// duplicated (necessary to use --pmerge to merge the latter).
+// duplicated (note that --pmerge must support the latter).
// - For joined not-entirely-SNP non-symbolic variants, the final REF is the
// longest of the original REFs; ALT alleles have bases added to the end if
// necessary. (Yes, this causes SNPs to stop being visible to a strlen ==
@@ -2675,9 +2905,10 @@ typedef struct JoinCountsStruct {
JoinVtype JoinCount(const char* const* cur_alleles, uintptr_t allele_ct, JoinCounts* jcp) {
jcp->snp_ct = 0;
jcp->symbolic_ct = 0;
+ jcp->missalt_snp_ct = 0;
+ jcp->missalt_nonsnp_ct = 0;
if (cur_alleles[0][1] == '\0') {
jcp->nonsnp_ct = 0;
- jcp->missalt_nonsnp_ct = 0;
for (uintptr_t allele_idx = 1; allele_idx != allele_ct; ++allele_idx) {
const char* cur_allele = cur_alleles[allele_idx];
if (cur_allele[0] == '<') {
@@ -2703,12 +2934,12 @@ JoinVtype JoinCount(const char* const* cur_alleles, uintptr_t allele_ct, JoinCou
}
return kJoinVtypeSnp;
}
- jcp->missalt_snp_ct = 0;
for (uint32_t allele_idx = 1; allele_idx != allele_ct; ++allele_idx) {
const char* cur_allele = cur_alleles[allele_idx];
if (cur_allele[0] == '<') {
return kJoinVtypeError;
- } else if (memequal_k(cur_allele, ".", 2)) {
+ }
+ if (memequal_k(cur_allele, ".", 2)) {
if (allele_ct == 2) {
jcp->nonsnp_ct = 0;
jcp->missalt_nonsnp_ct = 1;
@@ -3336,7 +3567,7 @@ PglErr ParseInfoHeader(const char* xheader, uintptr_t xheader_blen, const char*
return reterr;
}
-PglErr WritePvarSplit(const char* outname, const char* xheader, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, const char* missing_varid_match, const char* const* info_keys, const uint32_t* info_keys_htable, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t new_variant_id_max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t info_key_ct, uint32_t info_keys_htable_size, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct) {
+PglErr WritePvarSplit(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, const char* missing_varid_match, const char* const* info_keys, const uint32_t* info_keys_htable, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t new_variant_id_max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t info_key_ct, uint32_t info_keys_htable_size, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
unsigned char* bigstack_mark = g_bigstack_base;
char* cswritep = nullptr;
PglErr reterr = kPglRetSuccess;
@@ -3407,14 +3638,18 @@ PglErr WritePvarSplit(const char* outname, const char* xheader, const uintptr_t*
}
char* pvar_info_line_iter = nullptr;
+ uint32_t write_filter = 0;
+ if (pvar_psam_flags & kfPvarColFilter) {
+ write_filter = 1;
+ } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
+ write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
+ }
uint32_t info_col_idx = 0; // could save this during first load instead
const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
- if (pvar_psam_flags & kfPvarColXheader) {
- if (unlikely(CsputsStd(xheader, xheader_blen, &css, &cswritep))) {
- goto WritePvarSplit_ret_WRITE_FAIL;
- }
- if (write_info_pr && (!info_pr_flag_present)) {
- cswritep = strcpya_k(cswritep, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+ if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
+ reterr = PvarXheaderWrite(variant_include, cip, variant_bps, allele_idx_offsets, allele_storage, nullptr, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, vpos_sortstatus, xheader, &css, &cswritep);
+ if (unlikely(reterr)) {
+ goto WritePvarSplit_ret_1;
}
}
// could also make this an array-of-structs
@@ -3451,17 +3686,9 @@ PglErr WritePvarSplit(const char* outname, const char* xheader, const uintptr_t*
if (write_qual) {
cswritep = strcpya_k(cswritep, "\tQUAL");
}
-
- uint32_t write_filter = 0;
- if (pvar_psam_flags & kfPvarColFilter) {
- write_filter = 1;
- } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
- write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
- }
if (write_filter) {
cswritep = strcpya_k(cswritep, "\tFILTER");
}
-
if (write_info) {
cswritep = strcpya_k(cswritep, "\tINFO");
}
@@ -3986,7 +4213,7 @@ PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filte
}
/*
-PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, const char* missing_varid_match, const char* const* info_keys, const uint32_t* info_keys_htable, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t new_variant_id_max_allele_slen, uint32_t max_write_allele_ct, uint32_t max_missalt_ct, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t info_key_ct, uint32_t info_keys_htable_size, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct) {
+PglErr WritePvarJoin(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, const char* missing_varid_match, const char* const* info_keys, const uint32_t* info_keys_htable, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t new_variant_id_max_allele_slen, uint32_t max_write_allele_ct, uint32_t max_missalt_ct, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t info_key_ct, uint32_t info_keys_htable_size, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
unsigned char* bigstack_mark = g_bigstack_base;
char* cswritep = nullptr;
PglErr reterr = kPglRetSuccess;
@@ -4057,14 +4284,18 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
}
char* pvar_info_line_iter = nullptr;
+ uint32_t write_filter = 0;
+ if (pvar_psam_flags & kfPvarColFilter) {
+ write_filter = 1;
+ } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
+ write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
+ }
uint32_t info_col_idx = 0; // could save this during first load instead
const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
- if (pvar_psam_flags & kfPvarColXheader) {
- if (unlikely(CsputsStd(xheader, xheader_blen, &css, &cswritep))) {
- goto WritePvarJoin_ret_WRITE_FAIL;
- }
- if (write_info_pr && (!info_pr_flag_present)) {
- cswritep = strcpya_k(cswritep, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+ if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
+ reterr = PvarXheaderWrite(variant_include, cip, variant_bps, allele_idx_offsets, allele_storage, nullptr, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, vpos_sortstatus, xheader, &css, &cswritep);
+ if (unlikely(reterr)) {
+ goto WritePvarJoin_ret_1;
}
}
const uint32_t join_mode = (make_plink2_flags & (kfMakePlink2MSplitBase * 7));
@@ -4092,13 +4323,6 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
if (write_qual) {
cswritep = strcpya_k(cswritep, "\tQUAL");
}
-
- uint32_t write_filter = 0;
- if (pvar_psam_flags & kfPvarColFilter) {
- write_filter = 1;
- } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
- write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
- }
const char** filter_keys = nullptr;
uint32_t* filter_keys_htable = nullptr;
uintptr_t* cur_filter_keys = nullptr;
@@ -4127,8 +4351,9 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
char** info_bufs = nullptr;
const char** info_starts = nullptr;
- const char** info_ends = nullptr;
+ const char** info_ends = nullptr; // ugh, this is not related to INFO:END
const char** info_curs = nullptr;
+ uint32_t info_end_key_idx = UINT32_MAX;
if (pvar_info_reload) {
if (unlikely(
bigstack_alloc_cp(info_cache_size, &info_bufs) ||
@@ -4141,6 +4366,16 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
if (unlikely(reterr)) {
goto WritePvarJoin_ret_TSTREAM_FAIL;
}
+ info_end_key_idx = IdHtableFind("END", info_keys, info_keys_htable, strlen("END"), info_keys_htable_size);
+ if (info_end_key_idx != UINT32_MAX) {
+ const int32_t knum = const_container_of[info_keys[info_end_key_idx], InfoVtype, key)->num;
+ if ((knum != 1) && (knum != kInfoVtypeUnknown)) {
+ // TODO: verify type instead.
+ // but if number is not . or 1, this is not the INFO:END we're
+ // looking for.
+ info_end_key_idx = UINT32_MAX;
+ }
+ }
}
if (write_info) {
cswritep = strcpya_k(cswritep, "\tINFO");
@@ -4192,13 +4427,12 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
uint32_t pct = 0;
uint32_t next_print_variant_idx = variant_ct / 100;
JoinCounts jc;
- JoinCounts next_jc;
jc.snp_ct = 0;
jc.nonsnp_ct = 0;
jc.symbolic_ct = 0;
jc.missalt_snp_ct = 0;
jc.missalt_nonsnp_ct = 0;
- ;;;
+ JoinCounts next_jc = jc;
fputs("0%", stdout);
fflush(stdout);
while (1) {
@@ -4232,6 +4466,12 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
allele_ct = allele_idx_offsets[next_variant_uidx + 1] - allele_idx_offset_base;
}
const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
+ JoinVtype jvt = JoinCount(cur_alleles, allele_ct, &next_jc);
+ // previously validated
+ if ((join_mode == kfMakePlink2MJoinSnps) && ) {
+ }
+
+ // TODO
jc.snp_ct += next_jc.snp_ct;
jc.nonsnp_ct += next_jc.nonsnp_ct;
jc.symbolic_ct += next_jc.symbolic_ct;
@@ -4331,12 +4571,18 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
}
AppendBinaryEoln(&cswritep);
} else {
+ // TODO
+ next_jc.snp_ct = 0;
+ next_jc.nonsnp_ct = 0;
+ next_jc.symbolic_ct = 0;
+ next_jc.missalt_snp_ct = 0;
+ next_jc.missalt_nonsnp_ct = 0;
}
if (next_variant_idx == variant_ct) {
break;
}
this_pos_write_variant_ct = 0;
- jt = next_jt;
+ jc = next_jc;
prev_bp = cur_bp;
bp_start_variant_idx = next_variant_idx;
bp_start_variant_uidx = next_variant_uidx;
@@ -4377,6 +4623,7 @@ PglErr WritePvarJoin(const char* outname, const char* xheader, const uintptr_t*
}
}
}
+ }
}
if (unlikely(CswriteCloseNull(&css, cswritep))) {
goto WritePvarJoin_ret_WRITE_FAIL;
@@ -5605,20 +5852,24 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
// bugfix (25 Jul 2018): left expression needs ||, not &&
mcp->hard_call_halfdist = ((hard_call_thresh == UINT32_MAX) || (!read_dosage_present))? 0 : (kDosage4th - hard_call_thresh);
ctx.dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
- const uint32_t read_hphase_present = (read_gflags / kfPgenGlobalHardcallPhasePresent) & 1;
+ // bugfix/simplification (10 Mar 2020): it is possible for dosage-phase
+ // to be present in the input without hardcall-phase. Don't try to treat
+ // that differently than the usual scenario where hardcall-phase is
+ // present.
+ const uint32_t read_phase_present = !!(read_gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent));
const uint32_t read_dphase_present = (read_gflags / kfPgenGlobalDosagePhasePresent) & 1;
PgenGlobalFlags write_gflags = read_gflags;
// When --hard-call-threshold is specified, if either hphase or dphase
// values exist, the other can be generated.
- uint32_t read_or_write_hphase_present = read_hphase_present;
+ uint32_t read_or_write_phase_present = read_phase_present;
uint32_t read_or_write_dphase_present = read_dphase_present;
- if (mcp->hard_call_halfdist && (read_hphase_present || read_or_write_dphase_present)) {
- read_or_write_hphase_present = 1;
+ if (mcp->hard_call_halfdist && (read_phase_present || read_or_write_dphase_present)) {
+ read_or_write_phase_present = 1;
read_or_write_dphase_present = 1;
write_gflags |= kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent;
} else if (dosage_erase_thresh && read_dosage_present) {
// need write_phasepresent, pretty harmless to allocate write_phaseinfo
- read_or_write_hphase_present = 1;
+ read_or_write_phase_present = 1;
}
uint32_t read_or_write_dosage_present = read_dosage_present;
if (mcp->plink2_write_flags & kfPlink2WriteLateDosageErase) {
@@ -5708,7 +5959,7 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
if (unlikely(bigstack_alloc_wp(1, &ctx.thread_write_genovecs))) {
goto MakePgenRobust_ret_NOMEM;
}
- if (read_hphase_present && new_sample_idx_to_old) {
+ if (read_phase_present && new_sample_idx_to_old) {
if (unlikely(bigstack_alloc_u32(raw_sample_ct, &ctx.old_sample_idx_to_new))) {
goto MakePgenRobust_ret_NOMEM;
}
@@ -5734,7 +5985,7 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
}
ctx.thread_write_phasepresents = nullptr;
ctx.thread_all_hets = nullptr;
- if (read_or_write_hphase_present) {
+ if (read_or_write_phase_present) {
if (unlikely(
bigstack_alloc_wp(1, &ctx.thread_write_phasepresents) ||
bigstack_alloc_wp(1, &ctx.thread_write_phaseinfos) ||
@@ -5742,7 +5993,7 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
bigstack_alloc_w(sample_ctl, &(ctx.thread_write_phaseinfos[0])))) {
goto MakePgenRobust_ret_NOMEM;
}
- if (read_hphase_present) {
+ if (read_phase_present) {
if (unlikely(
bigstack_alloc_wp(1, &ctx.thread_all_hets) ||
bigstack_alloc_w(raw_sample_ctl, &(ctx.thread_all_hets[0])))) {
@@ -5834,7 +6085,7 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
bigstack_alloc_u32p(max_read_allele_ct + 1, &alt_two_sample_idx_starts))) {
goto MakePgenRobust_ret_NOMEM;
}
- if (read_hphase_present) {
+ if (read_phase_present) {
if (unlikely(
bigstack_alloc_w(raw_sample_ctl, &pgv.phasepresent) ||
bigstack_alloc_w(raw_sample_ctl, &pgv.phaseinfo) ||
@@ -5853,9 +6104,9 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
uintptr_t load_variant_vec_ct = raw_sample_ctv2;
uint32_t loaded_vrtypes_needed = (read_gflags & kfPgenGlobalMultiallelicHardcallFound)? 1 : 0;
- if (read_hphase_present || read_dosage_present) {
+ if (read_phase_present || read_dosage_present) {
loaded_vrtypes_needed = 1;
- if (read_hphase_present) {
+ if (read_phase_present) {
// phaseraw has three parts:
// 1. het_ct as uint32_t, and explicit_phasepresent_ct as uint32_t.
// 2. vec-aligned bitarray of up to (raw_sample_ct + 1) bits. first
@@ -5985,7 +6236,7 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
} else if (cur_write_allele_ct == 2) {
if (write_aidx == 1) {
// 1. read into normal, not raw representation
- if (read_hphase_present) {
+ if (read_phase_present) {
reterr = PgrGetMDp(nullptr, null_pssi, raw_sample_ct, read_variant_uidx, simple_pgrp, &pgv);
} else {
reterr = PgrGetMD(nullptr, null_pssi, raw_sample_ct, read_variant_uidx, simple_pgrp, &pgv);
@@ -6395,7 +6646,7 @@ PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sampl
}
// allele_presents should be nullptr iff trim_alts not true
-PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, __maybe_unused const char* varid_multi_template_str, __maybe_unused const char* varid_multi_nonsnp_template_str, const char* missing_varid_match, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, uint32_t new_variant_id_max_allele_slen, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end) {
+PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, __maybe_unused const char* varid_multi_template_str, __maybe_unused const char* varid_multi_nonsnp_template_str, const char* missing_varid_match, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, uint32_t new_variant_id_max_allele_slen, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uintptr_t pgr_alloc_cacheline_ct, char* xheader, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end) {
unsigned char* bigstack_mark = g_bigstack_base;
FILE* outfile = nullptr;
PglErr reterr = kPglRetSuccess;
@@ -6501,7 +6752,7 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
nonref_flags_storage = (pgfip->gflags & kfPgenGlobalAllNonref)? 2 : 1;
}
if (write_variant_ct == variant_ct) {
- reterr = WritePvar(outname, xheader, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_flags, max_thread_ct);
+ reterr = WritePvar(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, vpos_sortstatus, pvar_psam_flags, max_thread_ct, xheader);
} else {
const char* const* info_keys = nullptr;
uint32_t info_key_ct = 0;
@@ -6514,12 +6765,12 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
}
}
if (write_variant_ct > variant_ct) {
- reterr = WritePvarSplit(outname, xheader, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, varid_template_str, missing_varid_match, info_keys, info_keys_htable, raw_variant_ct, variant_ct, max_allele_slen, new_variant_id_max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, info_key_ct, info_keys_htable_size, misc_flags, make_plink2_flags, pvar_psam_flags, max_thread_ct);
+ reterr = WritePvarSplit(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, varid_template_str, missing_varid_match, info_keys, info_keys_htable, raw_variant_ct, variant_ct, max_allele_slen, new_variant_id_max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, vpos_sortstatus, info_key_ct, info_keys_htable_size, misc_flags, make_plink2_flags, pvar_psam_flags, max_thread_ct, xheader);
} else {
logerrputs("Error: Multiallelic join is under development.\n");
reterr = kPglRetNotYetSupported;
goto MakePlink2NoVsort_ret_1;
- // reterr = WritePvarJoin(outname, xheader, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, varid_template_str, missing_varid_match, info_keys, info_keys_htable, raw_variant_ct, variant_ct, max_allele_slen, new_variant_id_max_allele_slen, max_write_allele_ct, max_missalt_ct, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, info_key_ct, info_keys_htable_size, misc_flags, make_plink2_flags, pvar_psam_flags, max_thread_ct);
+ // reterr = WritePvarJoin(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, varid_template_str, missing_varid_match, info_keys, info_keys_htable, raw_variant_ct, variant_ct, max_allele_slen, new_variant_id_max_allele_slen, max_write_allele_ct, max_missalt_ct, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, vpos_sortstatus, info_key_ct, info_keys_htable_size, misc_flags, make_plink2_flags, pvar_psam_flags, max_thread_ct, xheader);
}
}
if (unlikely(reterr)) {
@@ -6620,17 +6871,17 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
const uint32_t read_dosage_present = (read_gflags / kfPgenGlobalDosagePresent) & 1;
mc.hard_call_halfdist = ((hard_call_thresh == UINT32_MAX) || (!read_dosage_present))? 0 : (kDosage4th - hard_call_thresh);
ctx.dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
- const uint32_t read_hphase_present = (read_gflags / kfPgenGlobalHardcallPhasePresent) & 1;
+ const uint32_t read_phase_present = !!(read_gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent));
const uint32_t read_dphase_present = (read_gflags / kfPgenGlobalDosagePhasePresent) & 1;
PgenGlobalFlags write_gflags = read_gflags;
- uint32_t read_or_write_hphase_present = read_hphase_present;
+ uint32_t read_or_write_phase_present = read_phase_present;
uint32_t read_or_write_dphase_present = read_dphase_present;
- if (mc.hard_call_halfdist && (read_hphase_present || read_or_write_dphase_present)) {
- read_or_write_hphase_present = 1;
+ if (mc.hard_call_halfdist && (read_phase_present || read_or_write_dphase_present)) {
+ read_or_write_phase_present = 1;
read_or_write_dphase_present = 1;
write_gflags |= kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent;
} else if (dosage_erase_thresh && read_dosage_present) {
- read_or_write_hphase_present = 1;
+ read_or_write_phase_present = 1;
}
uint32_t read_or_write_dosage_present = read_dosage_present;
if (mc.plink2_write_flags & kfPlink2WriteLateDosageErase) {
@@ -6672,7 +6923,7 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
const uintptr_t mhcraw_word_ct = RoundUpPow2(2, kWordsPerVec) + GetMhcWordCt(raw_sample_ct);
load_vblock_cacheline_ct += WordCtToCachelineCtU64(S_CAST(uint64_t, mhcraw_word_ct) * max_vblock_size);
}
- if (read_hphase_present) {
+ if (read_phase_present) {
// could make this bound tighter when lots of unphased variants are
// mixed in among the phased variants, but this isn't nearly as
// important as the analogous multiallelic optimization
@@ -6801,7 +7052,7 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_genovecs)) {
goto MakePlink2NoVsort_fallback;
}
- if (read_hphase_present && new_sample_idx_to_old) {
+ if (read_phase_present && new_sample_idx_to_old) {
if (bigstack_alloc_u32(raw_sample_ct, &ctx.old_sample_idx_to_new)) {
goto MakePlink2NoVsort_fallback;
}
@@ -6834,13 +7085,13 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
ctx.thread_all_hets = nullptr;
ctx.thread_write_dosagepresents = nullptr;
ctx.thread_write_dphasepresents = nullptr;
- if (read_or_write_hphase_present || read_or_write_dosage_present) {
- if (read_or_write_hphase_present) {
+ if (read_or_write_phase_present || read_or_write_dosage_present) {
+ if (read_or_write_phase_present) {
if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_phasepresents) ||
bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_phaseinfos)) {
goto MakePlink2NoVsort_fallback;
}
- if (read_hphase_present) {
+ if (read_phase_present) {
if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_all_hets)) {
goto MakePlink2NoVsort_fallback;
}
@@ -6869,7 +7120,7 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
// todo: multiallelic dosage
}
}
- if (read_or_write_hphase_present || read_dosage_present || (read_gflags & kfPgenGlobalMultiallelicHardcallFound)) {
+ if (read_or_write_phase_present || read_dosage_present || (read_gflags & kfPgenGlobalMultiallelicHardcallFound)) {
// ctx.loaded_vrtypes
other_per_thread_cacheline_ct += 2 * (kPglVblockSize / kCacheline);
}
@@ -6885,19 +7136,19 @@ PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, c
main_loadbufs[1] = S_CAST(uintptr_t*, bigstack_alloc_raw(load_vblock_cacheline_ct * calc_thread_ct * kCacheline));
ctx.loaded_vrtypes[0] = nullptr;
ctx.loaded_vrtypes[1] = nullptr;
- if (read_or_write_hphase_present || read_dosage_present || (read_gflags & kfPgenGlobalMultiallelicHardcallFound)) {
+ if (read_or_write_phase_present || read_dosage_present || (read_gflags & kfPgenGlobalMultiallelicHardcallFound)) {
ctx.loaded_vrtypes[0] = S_CAST(unsigned char*, bigstack_alloc_raw(kPglVblockSize * calc_thread_ct));
ctx.loaded_vrtypes[1] = S_CAST(unsigned char*, bigstack_alloc_raw(kPglVblockSize * calc_thread_ct));
}
- if (read_or_write_hphase_present || read_or_write_dosage_present) {
+ if (read_or_write_phase_present || read_or_write_dosage_present) {
const uint32_t bitvec_writebuf_byte_ct = BitCtToCachelineCt(sample_ct) * kCacheline;
const uintptr_t dosagevals_writebuf_byte_ct = DivUp(sample_ct, (kCacheline / 2)) * kCacheline;
for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
- if (read_or_write_hphase_present) {
+ if (read_or_write_phase_present) {
ctx.thread_write_phasepresents[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitvec_writebuf_byte_ct));
ctx.thread_write_phaseinfos[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitvec_writebuf_byte_ct));
- if (read_hphase_present) {
+ if (read_phase_present) {
ctx.thread_all_hets[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(BitCtToCachelineCt(raw_sample_ct) * kCacheline));
}
}
@@ -7438,7 +7689,7 @@ PglErr WritePvarResortedInterval(const ChrInfo* write_cip, const uint32_t* varia
// allocation of buffers, and generating the header line occurs directly in
// this function, while loading the next pvar_info_strs batch and writing the
// next .pvar line batch are one level down.
-PglErr WritePvarResorted(const char* outname, const char* xheader, const uintptr_t* variant_include, const ChrInfo* write_cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const uint32_t* new_variant_idx_to_old, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct) {
+PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include, const ChrInfo* write_cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const uint32_t* new_variant_idx_to_old, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
unsigned char* bigstack_mark = g_bigstack_base;
char* cswritep = nullptr;
PglErr reterr = kPglRetSuccess;
@@ -7478,13 +7729,17 @@ PglErr WritePvarResorted(const char* outname, const char* xheader, const uintptr
goto WritePvarResorted_ret_INCONSISTENT_INPUT;
}
+ uint32_t write_filter = 0;
+ if (pvar_psam_flags & kfPvarColFilter) {
+ write_filter = 1;
+ } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
+ write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
+ }
const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
- if (pvar_psam_flags & kfPvarColXheader) {
- if (unlikely(CsputsStd(xheader, xheader_blen, &css, &cswritep))) {
- goto WritePvarResorted_ret_WRITE_FAIL;
- }
- if (write_info_pr && (!info_pr_flag_present)) {
- cswritep = strcpya_k(cswritep, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+ if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
+ reterr = PvarXheaderWrite(nullptr, write_cip, variant_bps, allele_idx_offsets, allele_storage, new_variant_idx_to_old, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, kfUnsortedVar0, xheader, &css, &cswritep);
+ if (unlikely(reterr)) {
+ goto WritePvarResorted_ret_1;
}
}
if (write_cip->chrset_source) {
@@ -7502,12 +7757,6 @@ PglErr WritePvarResorted(const char* outname, const char* xheader, const uintptr
cswritep = strcpya_k(cswritep, "\tQUAL");
}
- uint32_t write_filter = 0;
- if (pvar_psam_flags & kfPvarColFilter) {
- write_filter = 1;
- } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
- write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
- }
if (write_filter) {
cswritep = strcpya_k(cswritep, "\tFILTER");
}
@@ -7637,7 +7886,7 @@ PglErr WritePvarResorted(const char* outname, const char* xheader, const uintptr
return reterr;
}
-PglErr MakePlink2Vsort(const char* xheader, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const ChrIdx* chr_idxs, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, MakePlink2Flags make_plink2_flags, uint32_t use_nsort, PvarPsamFlags pvar_psam_flags, PgenReader* simple_pgrp, char* outname, char* outname_end) {
+PglErr MakePlink2Vsort(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const ChrIdx* chr_idxs, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, MakePlink2Flags make_plink2_flags, uint32_t use_nsort, PvarPsamFlags pvar_psam_flags, char* xheader, PgenReader* simple_pgrp, char* outname, char* outname_end) {
unsigned char* bigstack_mark = g_bigstack_base;
unsigned char* bigstack_end_mark = g_bigstack_end;
PglErr reterr = kPglRetSuccess;
@@ -7843,7 +8092,7 @@ PglErr MakePlink2Vsort(const char* xheader, const uintptr_t* sample_include, con
if (!PgrGetNonrefFlags(simple_pgrp)) {
nonref_flags_storage = (PgrGetGflags(simple_pgrp) & kfPgenGlobalAllNonref)? 2 : 1;
}
- reterr = WritePvarResorted(outname, xheader, variant_include, &write_chr_info, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, PgrGetNonrefFlags(simple_pgrp), pvar_info_reload, variant_cms, new_variant_idx_to_old, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_flags, max_thread_ct);
+ reterr = WritePvarResorted(outname, variant_include, &write_chr_info, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, PgrGetNonrefFlags(simple_pgrp), pvar_info_reload, variant_cms, new_variant_idx_to_old, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_flags, max_thread_ct, xheader);
if (unlikely(reterr)) {
goto MakePlink2Vsort_ret_1;
}
=====================================
plink2_data.h
=====================================
@@ -64,10 +64,18 @@ PglErr WriteMapOrBim(const char* outname, const uintptr_t* variant_include, cons
PglErr PvarInfoOpenAndReloadHeader(const char* pvar_info_reload, uint32_t max_thread_ct, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* info_col_idx_ptr);
+PglErr PvarInfoReload(uint32_t info_col_idx, uint32_t variant_uidx, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* trs_variant_uidx_ptr);
+
PglErr PvarInfoReloadAndWrite(uint32_t info_pr_flag_present, uint32_t info_col_idx, uint32_t variant_uidx, uint32_t is_pr, TextStream* pvar_reload_txsp, char** line_iterp, char** write_iter_ptr, uint32_t* rls_variant_uidx_ptr);
void AppendChrsetLine(const ChrInfo* cip, char** write_iter_ptr);
+uint32_t ChrLenLbound(const ChrInfo* cip, const uint32_t* variant_bps, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uint32_t* new_variant_idx_to_old, uint32_t chr_fo_idx, uint32_t max_allele_slen, UnsortedVar vpos_sortstatus);
+
+// fileformat, fileDate, source
+// assumes adequate buffer space
+void AppendVcfHeaderStart(uint32_t v43, char** cswritepp);
+
PglErr WriteFam(const char* outname, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uint32_t pheno_ct, char delim);
uint32_t DataFidColIsRequired(const uintptr_t* sample_include, const SampleIdInfo* siip, uint32_t sample_ct, uint32_t maybe_modifier);
@@ -84,9 +92,9 @@ void ApplyHardCallThresh(const uintptr_t* dosage_present, const Dosage* dosage_m
uint32_t ApplyHardCallThreshPhased(const uintptr_t* dosage_present, const Dosage* dosage_main, uint32_t dosage_ct, uint32_t hard_call_halfdist, uintptr_t* genovec, uintptr_t* phasepresent, uintptr_t* phaseinfo, uintptr_t* dphase_present, SDosage* dphase_delta, SDosage* tmp_dphase_delta);
-PglErr MakePlink2NoVsort(const char* xheader, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, const char* varid_multi_template_str, const char* varid_multi_nonsnp_template_str, const char* missing_varid_match, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, uint32_t new_variant_id_max_allele_slen, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
+PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, __maybe_unused const char* varid_multi_template_str, __maybe_unused const char* varid_multi_nonsnp_template_str, const char* missing_varid_match, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, uint32_t new_variant_id_max_allele_slen, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uintptr_t pgr_alloc_cacheline_ct, char* xheader, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
-PglErr MakePlink2Vsort(const char* xheader, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const ChrIdx* chr_idxs, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, MakePlink2Flags make_plink2_flags, uint32_t use_nsort, PvarPsamFlags pvar_psam_flags, PgenReader* simple_pgrp, char* outname, char* outname_end);
+PglErr MakePlink2Vsort(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const ChrIdx* chr_idxs, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, MakePlink2Flags make_plink2_flags, uint32_t use_nsort, PvarPsamFlags pvar_psam_flags, char* xheader, PgenReader* simple_pgrp, char* outname, char* outname_end);
PglErr SampleSortFileMap(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* sample_sort_fname, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t** new_sample_idx_to_old_ptr);
=====================================
plink2_decompress.h
=====================================
@@ -88,6 +88,7 @@ HEADER_INLINE unsigned char* TextStreamMemStart(TextStream* txs_ptr) {
return R_CAST(unsigned char*, GET_PRIVATE(*txs_ptr, m).base.dst);
}
+// TODO: logputs("\n") first when necessary
void TextErrPrint(const char* file_descrip, const char* errmsg, PglErr reterr);
HEADER_INLINE void TextFileErrPrint(const char* file_descrip, const textFILE* txfp) {
=====================================
plink2_export.cc
=====================================
The diff for this file was not included because it is too large.
=====================================
plink2_export.h
=====================================
@@ -56,7 +56,7 @@ void InitExportf(ExportfInfo* exportf_info_ptr);
void CleanupExportf(ExportfInfo* exportf_info_ptr);
-PglErr Exportf(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const ExportfInfo* eip, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_variant_id_slen, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, MakePlink2Flags make_plink2_flags, uintptr_t pgr_alloc_cacheline_ct, char* xheader, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
+PglErr Exportf(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const ExportfInfo* eip, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_variant_id_slen, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t max_thread_ct, MakePlink2Flags make_plink2_flags, uintptr_t pgr_alloc_cacheline_ct, char* xheader, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end);
#ifdef __cplusplus
} // namespace plink2
=====================================
plink2_filter.cc
=====================================
@@ -4215,8 +4215,12 @@ PglErr SetRefalt1FromFile(const uintptr_t* variant_include, const char* const* v
char* write_iter = strcpya_k(g_logbuf, "Warning: ");
// strlen("--ref-allele") == 12, strlen("--alt1-allele") == 13
write_iter = memcpya(write_iter, flagstr, 12 + is_alt1);
- write_iter = strcpya_k(write_iter, " mismatch for multiallelic variant '");
- write_iter = strcpya(write_iter, variant_ids[variant_uidx]);
+ write_iter = strcpya_k(write_iter, " mismatch for multiallelic variant");
+ // If we put this all on one line, its length would be
+ // 60 + is_alt1 + variant_id_slen. Split into two if this is >79.
+ *write_iter++ = (variant_id_slen + is_alt1 < 20)? ' ' : '\n';
+ *write_iter++ = '\'';
+ write_iter = memcpya(write_iter, variant_ids[variant_uidx], variant_id_slen);
strcpy_k(write_iter, "'.\n");
if (allele_mismatch_warning_ct < 3) {
logerrputsb();
=====================================
plink2_glm.cc
=====================================
The diff for this file was not included because it is too large.
=====================================
plink2_help.cc
=====================================
@@ -132,8 +132,8 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" --bpfile <prefix> ['vzs'] : Specify .pgen + .bim[.zst] + .fam prefix.\n\n"
);
HelpPrint("vcf\0bcf\0keep-autoconv\0", &help_ctrl, 1,
-" --keep-autoconv : When importing non-PLINK-binary data, don't delete\n"
-" autogenerated binary fileset at end of run.\n\n"
+" --keep-autoconv ['vzs'] : When importing non-PLINK-binary data, don't\n"
+" delete autogenerated fileset at end of run.\n\n"
);
HelpPrint("bfile\0fam\0no-fid\0no-parents\0no-sex\0", &help_ctrl, 1,
" --no-fid : .fam file does not contain column 1 (family ID).\n"
@@ -299,7 +299,7 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" --make-bed ['vzs'] [{trim-alts | erase-alt2+}]\n"
" ['multiallelics='<split mode>]\n"
*/
-" Create a new PLINK binary fileset (--make-pgen = .pgen + .pvar[.zst] +\n"
+" Create a new PLINK 2 binary fileset (--make-pgen = .pgen + .pvar[.zst] +\n"
" .psam, --make-bpgen = .pgen + .bim[.zst] + .fam).\n"
" * Unlike the automatic text-to-binary converters (which only heed\n"
" chromosome filters), this supports all of PLINK's filtering flags.\n"
@@ -337,7 +337,7 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" * '+any': All variants with identical CHROM/POS/REF and no symbolic ALT\n"
" alleles are joined into a single multiallelic variant.\n"
" Symbolic ALT alleles are joined separately. Additional conditions apply\n"
-" to them, and PLINK usually errors out when they aren't met.\n"
+" to them, and PLINK 2 usually errors out when they aren't met.\n"
" When splitting with 'vid-dup', the new variants keep the original variant\n"
" ID. Otherwise, new variant IDs are assigned as follows:\n"
" 1. If --set-all-var-ids was specified, that template is applied.\n"
@@ -372,6 +372,7 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" except for possibly FILTER/INFO definitions when those\n"
" column(s) have been removed. Without this, only the #CHROM\n"
" header line is kept.\n"
+" vcfheader: xheader, with additions to make it a valid VCF header.\n"
" maybequal: QUAL. Omitted if all remaining values are missing.\n"
" qual: Force QUAL column to be written even when empty.\n"
" maybefilter: FILTER. Omitted if all remaining values are missing.\n"
@@ -414,8 +415,8 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" ['bits='<#>]\n"
" Create a new fileset with all filters applied. The following output\n"
" formats are supported:\n"
-" (actually, only A, AD, A-transpose, bgen-1.x, ind-major-bed, haps,\n"
-" hapslegend, oxford, and vcf are implemented for now)\n"
+" (actually, only A, AD, A-transpose, bcf, bgen-1.x, haps, hapslegend,\n"
+" ind-major-bed, oxford, and vcf are implemented for now)\n"
" * '23': 23andMe 4-column format. This can only be used on a single\n"
" sample's data (--keep may be handy), and does not support\n"
" multicharacter allele codes.\n"
@@ -466,15 +467,18 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" * 'transpose': PLINK 1 variant-major (.tped + .tfam), loadable with\n"
" --tfile.\n"
" * 'vcf', : VCF (default version 4.3). If PAR1 and PAR2 are present,\n"
-" 'vcf-4.2' they are automatically merged with chrX, with proper\n"
-" handling of chromosome codes and male ploidy.\n"
-" When the 'bgz' modifier is present, the VCF file is\n"
-" block-gzipped.\n"
+" 'vcf-4.2', they are automatically merged with chrX, with proper\n"
+" 'bcf', handling of chromosome codes and male ploidy.\n"
+" 'bcf-4.2' When the 'bgz' modifier is present, the VCF file is\n"
+" block-gzipped. (This always happens with BCF output.)\n"
" The 'id-paste' modifier controls which .psam columns are\n"
" used to construct sample IDs (choices are maybefid, fid,\n"
" iid, maybesid, and sid; default is maybefid,iid,maybesid),\n"
" while the 'id-delim' modifier sets the character between the\n"
" ID pieces (default '_').\n"
+" Genotypes are always exported. If you want to export a\n"
+" sites-only VCF instead, see --make-pgen/--make-just-pvar's\n"
+" 'vcfheader' column set.\n"
" Dosages are not exported unless the 'vcf-dosage=' modifier\n"
" is present. The following five dosage export modes are\n"
" supported:\n"
=====================================
plink2_import.cc
=====================================
The diff for this file was not included because it is too large.
=====================================
plink2_import.h
=====================================
@@ -28,8 +28,9 @@ namespace plink2 {
FLAGSET_DEF_START()
kfImport0,
kfImportKeepAutoconv = (1 << 0),
- kfImportDoubleId = (1 << 1),
- kfImportVcfRequireGt = (1 << 2)
+ kfImportKeepAutoconvVzs = (1 << 1),
+ kfImportDoubleId = (1 << 2),
+ kfImportVcfRequireGt = (1 << 3)
FLAGSET_DEF_END(ImportFlags);
ENUM_U31_DEF_START()
=====================================
plink2_matrix.h
=====================================
@@ -447,10 +447,19 @@ HEADER_INLINE void PrintSymmFmatrix(const float* matrix, uint32_t dim) {
}
}
-HEADER_INLINE void PrintMatrix(const double* matrix, uint32_t dim) {
- for (uint32_t uii = 0; uii != dim; ++uii) {
- for (uint32_t ujj = 0; ujj != dim; ++ujj) {
- printf("%g ", matrix[uii * dim + ujj]);
+HEADER_INLINE void PrintMatrix(const double* matrix, uintptr_t row_ct, uintptr_t col_ct) {
+ for (uintptr_t ulii = 0; ulii != row_ct; ++ulii) {
+ for (uintptr_t uljj = 0; uljj != col_ct; ++uljj) {
+ printf("%g ", matrix[ulii * col_ct + uljj]);
+ }
+ printf("\n");
+ }
+}
+
+HEADER_INLINE void PrintFmatrix(const float* matrix, uintptr_t row_ct, uintptr_t col_ct, uintptr_t stride) {
+ for (uintptr_t ulii = 0; ulii != row_ct; ++ulii) {
+ for (uintptr_t uljj = 0; uljj != col_ct; ++uljj) {
+ printf("%g ", S_CAST(double, matrix[ulii * stride + uljj]));
}
printf("\n");
}
=====================================
plink2_matrix_calc.cc
=====================================
@@ -442,7 +442,12 @@ PglErr KingCutoffBatch(const SampleIdInfo* siip, uint32_t raw_sample_ct, double
}
} else {
if (unlikely(fsize != (fsize_double_expected / 2))) {
- logerrprintfww("Error: Invalid --king-cutoff .bin file size (expected %" PRIu64 " or %" PRIu64 " bytes).\n", fsize_double_expected / 2, fsize_double_expected);
+ const uint64_t fsize_double_square = king_id_ct * S_CAST(uint64_t, king_id_ct) * sizeof(double);
+ if ((fsize == fsize_double_square) || (fsize == fsize_double_square / 2)) {
+ logerrputs("Error: --king-cutoff currently requires a *triangular* .bin file; the provided\nfile appears to be square.\n");
+ } else {
+ logerrprintfww("Error: Invalid --king-cutoff .bin file size (expected %" PRIu64 " or %" PRIu64 " bytes).\n", fsize_double_expected / 2, fsize_double_expected);
+ }
goto KingCutoffBatch_ret_MALFORMED_INPUT;
}
assert(king_id_ct <= ((0x7ffff000 / sizeof(float)) + 1));
=====================================
plink2_pvar.cc
=====================================
@@ -787,7 +787,8 @@ PglErr LoadPvar(const char* pvarname, const char* var_filter_exceptions_flattene
// probable todo: load INFO:END. (does this allow the CNV module to be
// unified with the rest of the program?) but this will probably wait
// until I need to analyze some sort of CNV data, and that day keeps
- // getting postponed...
+ // getting postponed... for now, the BCF exporter performs its own parsing
+ // of INFO:END so that it can fill each variant's rlen field correctly.
// possible todo: require FILTER to only contain values declared in header,
// and modify its storage accordingly? (pointless for now, but worthwhile
// to keep an eye on what typical VCF files look like.)
@@ -833,7 +834,7 @@ PglErr LoadPvar(const char* pvarname, const char* var_filter_exceptions_flattene
unsigned char* tmp_alloc_base = g_bigstack_base;
g_bigstack_base = bigstack_mark;
- char* xheader_end = ((pvar_psam_flags & kfPvarColXheader) || xheader_needed)? R_CAST(char*, bigstack_mark) : nullptr;
+ char* xheader_end = ((pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) || xheader_needed)? R_CAST(char*, bigstack_mark) : nullptr;
uint32_t chrset_present = 0;
uint32_t info_pr_present = 0;
uint32_t info_pr_nonflag_present = 0;
@@ -906,7 +907,6 @@ PglErr LoadPvar(const char* pvarname, const char* var_filter_exceptions_flattene
line_iter = line_end;
}
}
- *info_flags_ptr = S_CAST(InfoFlags, (info_pr_present * kfInfoPrFlagPresent) | (info_pr_nonflag_present * kfInfoPrNonflagPresent) | (info_nonpr_present * kfInfoNonprPresent));
line_iter = AdvPastDelim(line_iter, '\n');
}
if (xheader_end) {
@@ -927,6 +927,7 @@ PglErr LoadPvar(const char* pvarname, const char* var_filter_exceptions_flattene
uint32_t info_col_present = 0;
uint32_t cm_col_present = 0;
if (line_start[0] == '#') {
+ *info_flags_ptr = S_CAST(InfoFlags, (info_pr_present * kfInfoPrFlagPresent) | (info_pr_nonflag_present * kfInfoPrNonflagPresent) | (info_nonpr_present * kfInfoNonprPresent));
// parse header
// [-1] = #CHROM (must be first column)
// [0] = POS
@@ -1388,10 +1389,14 @@ PglErr LoadPvar(const char* pvarname, const char* var_filter_exceptions_flattene
cip->chr_file_order[++chrs_encountered_m1] = cur_chr_code;
cip->chr_fo_vidx_start[chrs_encountered_m1] = raw_variant_ct;
cip->chr_idx_to_foidx[cur_chr_code] = chrs_encountered_m1;
- last_bp = 0;
last_cm = -DBL_MAX;
}
}
+ // always need to set this, if we want to avoid chromosome-length
+ // overestimation in --sort-vars + early-variant-filter +
+ // pvar-cols=+vcfheader case.
+ last_bp = 0;
+
SetBit(cur_chr_code, loaded_chr_mask);
if (chr_output_name_buf) {
char* chr_name_end = chrtoa(cip, cur_chr_code, chr_output_name_buf);
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/commit/2aaa4872560daa25e2010f56b88dbad02cf67fc6
--
View it on GitLab: https://salsa.debian.org/med-team/plink2/-/commit/2aaa4872560daa25e2010f56b88dbad02cf67fc6
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200327/56405380/attachment-0001.html>
More information about the debian-med-commit
mailing list